blob: 167e356a6c2846b0832108acfa9db1f1ac35a1f8 [file] [log] [blame]
/**************************************************************************************************
* *
* This file is part of BLASFEO. *
* *
* BLASFEO -- BLAS For Embedded Optimization. *
* Copyright (C) 2016-2017 by Gianluca Frison. *
* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl. *
* All rights reserved. *
* *
* HPMPC is free software; you can redistribute it and/or *
* modify it under the terms of the GNU Lesser General Public *
* License as published by the Free Software Foundation; either *
* version 2.1 of the License, or (at your option) any later version. *
* *
* HPMPC is distributed in the hope that it will be useful, *
* but WITHOUT ANY WARRANTY; without even the implied warranty of *
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. *
* See the GNU Lesser General Public License for more details. *
* *
* You should have received a copy of the GNU Lesser General Public *
* License along with HPMPC; if not, write to the Free Software *
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA *
* *
* Author: Gianluca Frison, giaf (at) dtu.dk *
* gianluca.frison (at) imtek.uni-freiburg.de *
* *
**************************************************************************************************/
#include <math.h>
#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
//#if defined(TARGET_GENERIC) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
void kernel_dgemm_nt_4x4_gen_lib4(int kmax, double *alpha, double *A, double *B, double *beta, int offsetC, double *C0, int sdc, int offsetD, double *D0, int sdd, int m0, int m1, int n0, int n1)
{
const int bs = 4;
double
a_0, a_1, a_2, a_3,
b_0, b_1, b_2, b_3,
c_00=0, c_01=0, c_02=0, c_03=0,
c_10=0, c_11=0, c_12=0, c_13=0,
c_20=0, c_21=0, c_22=0, c_23=0,
c_30=0, c_31=0, c_32=0, c_33=0;
double
*C1, *D1;
int k;
for(k=0; k<kmax-3; k+=4)
{
// k = 0
a_0 = A[0];
a_1 = A[1];
a_2 = A[2];
a_3 = A[3];
b_0 = B[0];
b_1 = B[1];
b_2 = B[2];
b_3 = B[3];
c_00 += a_0 * b_0;
c_10 += a_1 * b_0;
c_20 += a_2 * b_0;
c_30 += a_3 * b_0;
c_01 += a_0 * b_1;
c_11 += a_1 * b_1;
c_21 += a_2 * b_1;
c_31 += a_3 * b_1;
c_02 += a_0 * b_2;
c_12 += a_1 * b_2;
c_22 += a_2 * b_2;
c_32 += a_3 * b_2;
c_03 += a_0 * b_3;
c_13 += a_1 * b_3;
c_23 += a_2 * b_3;
c_33 += a_3 * b_3;
// k = 1
a_0 = A[4];
a_1 = A[5];
a_2 = A[6];
a_3 = A[7];
b_0 = B[4];
b_1 = B[5];
b_2 = B[6];
b_3 = B[7];
c_00 += a_0 * b_0;
c_10 += a_1 * b_0;
c_20 += a_2 * b_0;
c_30 += a_3 * b_0;
c_01 += a_0 * b_1;
c_11 += a_1 * b_1;
c_21 += a_2 * b_1;
c_31 += a_3 * b_1;
c_02 += a_0 * b_2;
c_12 += a_1 * b_2;
c_22 += a_2 * b_2;
c_32 += a_3 * b_2;
c_03 += a_0 * b_3;
c_13 += a_1 * b_3;
c_23 += a_2 * b_3;
c_33 += a_3 * b_3;
// k = 2
a_0 = A[8];
a_1 = A[9];
a_2 = A[10];
a_3 = A[11];
b_0 = B[8];
b_1 = B[9];
b_2 = B[10];
b_3 = B[11];
c_00 += a_0 * b_0;
c_10 += a_1 * b_0;
c_20 += a_2 * b_0;
c_30 += a_3 * b_0;
c_01 += a_0 * b_1;
c_11 += a_1 * b_1;
c_21 += a_2 * b_1;
c_31 += a_3 * b_1;
c_02 += a_0 * b_2;
c_12 += a_1 * b_2;
c_22 += a_2 * b_2;
c_32 += a_3 * b_2;
c_03 += a_0 * b_3;
c_13 += a_1 * b_3;
c_23 += a_2 * b_3;
c_33 += a_3 * b_3;
// k = 3
a_0 = A[12];
a_1 = A[13];
a_2 = A[14];
a_3 = A[15];
b_0 = B[12];
b_1 = B[13];
b_2 = B[14];
b_3 = B[15];
c_00 += a_0 * b_0;
c_10 += a_1 * b_0;
c_20 += a_2 * b_0;
c_30 += a_3 * b_0;
c_01 += a_0 * b_1;
c_11 += a_1 * b_1;
c_21 += a_2 * b_1;
c_31 += a_3 * b_1;
c_02 += a_0 * b_2;
c_12 += a_1 * b_2;
c_22 += a_2 * b_2;
c_32 += a_3 * b_2;
c_03 += a_0 * b_3;
c_13 += a_1 * b_3;
c_23 += a_2 * b_3;
c_33 += a_3 * b_3;
A += 16;
B += 16;
}
for(; k<kmax; k++)
{
// k = 0
a_0 = A[0];
a_1 = A[1];
a_2 = A[2];
a_3 = A[3];
b_0 = B[0];
b_1 = B[1];
b_2 = B[2];
b_3 = B[3];
c_00 += a_0 * b_0;
c_10 += a_1 * b_0;
c_20 += a_2 * b_0;
c_30 += a_3 * b_0;
c_01 += a_0 * b_1;
c_11 += a_1 * b_1;
c_21 += a_2 * b_1;
c_31 += a_3 * b_1;
c_02 += a_0 * b_2;
c_12 += a_1 * b_2;
c_22 += a_2 * b_2;
c_32 += a_3 * b_2;
c_03 += a_0 * b_3;
c_13 += a_1 * b_3;
c_23 += a_2 * b_3;
c_33 += a_3 * b_3;
A += 4;
B += 4;
}
if(offsetC==0)
{
c_00 = beta[0]*C0[0+bs*0] + alpha[0]*c_00;
c_10 = beta[0]*C0[1+bs*0] + alpha[0]*c_10;
c_20 = beta[0]*C0[2+bs*0] + alpha[0]*c_20;
c_30 = beta[0]*C0[3+bs*0] + alpha[0]*c_30;
c_01 = beta[0]*C0[0+bs*1] + alpha[0]*c_01;
c_11 = beta[0]*C0[1+bs*1] + alpha[0]*c_11;
c_21 = beta[0]*C0[2+bs*1] + alpha[0]*c_21;
c_31 = beta[0]*C0[3+bs*1] + alpha[0]*c_31;
c_02 = beta[0]*C0[0+bs*2] + alpha[0]*c_02;
c_12 = beta[0]*C0[1+bs*2] + alpha[0]*c_12;
c_22 = beta[0]*C0[2+bs*2] + alpha[0]*c_22;
c_32 = beta[0]*C0[3+bs*2] + alpha[0]*c_32;
c_03 = beta[0]*C0[0+bs*3] + alpha[0]*c_03;
c_13 = beta[0]*C0[1+bs*3] + alpha[0]*c_13;
c_23 = beta[0]*C0[2+bs*3] + alpha[0]*c_23;
c_33 = beta[0]*C0[3+bs*3] + alpha[0]*c_33;
}
else if(offsetC==1)
{
C1 = C0 + sdc*bs;
c_00 = beta[0]*C0[1+bs*0] + alpha[0]*c_00;
c_10 = beta[0]*C0[2+bs*0] + alpha[0]*c_10;
c_20 = beta[0]*C0[3+bs*0] + alpha[0]*c_20;
c_30 = beta[0]*C1[0+bs*0] + alpha[0]*c_30;
c_01 = beta[0]*C0[1+bs*1] + alpha[0]*c_01;
c_11 = beta[0]*C0[2+bs*1] + alpha[0]*c_11;
c_21 = beta[0]*C0[3+bs*1] + alpha[0]*c_21;
c_31 = beta[0]*C1[0+bs*1] + alpha[0]*c_31;
c_02 = beta[0]*C0[1+bs*2] + alpha[0]*c_02;
c_12 = beta[0]*C0[2+bs*2] + alpha[0]*c_12;
c_22 = beta[0]*C0[3+bs*2] + alpha[0]*c_22;
c_32 = beta[0]*C1[0+bs*2] + alpha[0]*c_32;
c_03 = beta[0]*C0[1+bs*3] + alpha[0]*c_03;
c_13 = beta[0]*C0[2+bs*3] + alpha[0]*c_13;
c_23 = beta[0]*C0[3+bs*3] + alpha[0]*c_23;
c_33 = beta[0]*C1[0+bs*3] + alpha[0]*c_33;
}
else if(offsetC==2)
{
C1 = C0 + sdc*bs;
c_00 = beta[0]*C0[2+bs*0] + alpha[0]*c_00;
c_10 = beta[0]*C0[3+bs*0] + alpha[0]*c_10;
c_20 = beta[0]*C1[0+bs*0] + alpha[0]*c_20;
c_30 = beta[0]*C1[1+bs*0] + alpha[0]*c_30;
c_01 = beta[0]*C0[2+bs*1] + alpha[0]*c_01;
c_11 = beta[0]*C0[3+bs*1] + alpha[0]*c_11;
c_21 = beta[0]*C1[0+bs*1] + alpha[0]*c_21;
c_31 = beta[0]*C1[1+bs*1] + alpha[0]*c_31;
c_02 = beta[0]*C0[2+bs*2] + alpha[0]*c_02;
c_12 = beta[0]*C0[3+bs*2] + alpha[0]*c_12;
c_22 = beta[0]*C1[0+bs*2] + alpha[0]*c_22;
c_32 = beta[0]*C1[1+bs*2] + alpha[0]*c_32;
c_03 = beta[0]*C0[2+bs*3] + alpha[0]*c_03;
c_13 = beta[0]*C0[3+bs*3] + alpha[0]*c_13;
c_23 = beta[0]*C1[0+bs*3] + alpha[0]*c_23;
c_33 = beta[0]*C1[1+bs*3] + alpha[0]*c_33;
}
else //if(offsetC==3)
{
C1 = C0 + sdc*bs;
c_00 = beta[0]*C0[3+bs*0] + alpha[0]*c_00;
c_10 = beta[0]*C1[0+bs*0] + alpha[0]*c_10;
c_20 = beta[0]*C1[1+bs*0] + alpha[0]*c_20;
c_30 = beta[0]*C1[2+bs*0] + alpha[0]*c_30;
c_01 = beta[0]*C0[3+bs*1] + alpha[0]*c_01;
c_11 = beta[0]*C1[0+bs*1] + alpha[0]*c_11;
c_21 = beta[0]*C1[1+bs*1] + alpha[0]*c_21;
c_31 = beta[0]*C1[2+bs*1] + alpha[0]*c_31;
c_02 = beta[0]*C0[3+bs*2] + alpha[0]*c_02;
c_12 = beta[0]*C1[0+bs*2] + alpha[0]*c_12;
c_22 = beta[0]*C1[1+bs*2] + alpha[0]*c_22;
c_32 = beta[0]*C1[2+bs*2] + alpha[0]*c_32;
c_03 = beta[0]*C0[3+bs*3] + alpha[0]*c_03;
c_13 = beta[0]*C1[0+bs*3] + alpha[0]*c_13;
c_23 = beta[0]*C1[1+bs*3] + alpha[0]*c_23;
c_33 = beta[0]*C1[2+bs*3] + alpha[0]*c_33;
}
// shift sol for cols
if(n0>0)
{
if(n0==1)
{
c_00 = c_01;
c_10 = c_11;
c_20 = c_21;
c_30 = c_31;
c_01 = c_02;
c_11 = c_12;
c_21 = c_22;
c_31 = c_32;
c_02 = c_03;
c_12 = c_13;
c_22 = c_23;
c_32 = c_33;
D0 += 1*bs;
}
else if(n0==2)
{
c_00 = c_02;
c_10 = c_12;
c_20 = c_22;
c_30 = c_32;
c_01 = c_03;
c_11 = c_13;
c_21 = c_23;
c_31 = c_33;
D0 += 2*bs;
}
else //if(n0==3)
{
c_00 = c_03;
c_10 = c_13;
c_20 = c_23;
c_30 = c_33;
D0 += 3*bs;
}
}
int kn = n1 - n0;
if(offsetD==0)
{
if(kn<=0)
return;
if(m0<=0 & m1>0) D0[0+bs*0] = c_00;
if(m0<=1 & m1>1) D0[1+bs*0] = c_10;
if(m0<=2 & m1>2) D0[2+bs*0] = c_20;
if(m0<=3 & m1>3) D0[3+bs*0] = c_30;
if(kn<=1)
return;
if(m0<=0 & m1>0) D0[0+bs*1] = c_01;
if(m0<=1 & m1>1) D0[1+bs*1] = c_11;
if(m0<=2 & m1>2) D0[2+bs*1] = c_21;
if(m0<=3 & m1>3) D0[3+bs*1] = c_31;
if(kn<=2)
return;
if(m0<=0 & m1>0) D0[0+bs*2] = c_02;
if(m0<=1 & m1>1) D0[1+bs*2] = c_12;
if(m0<=2 & m1>2) D0[2+bs*2] = c_22;
if(m0<=3 & m1>3) D0[3+bs*2] = c_32;
if(kn<=3)
return;
if(m0<=0 & m1>0) D0[0+bs*3] = c_03;
if(m0<=1 & m1>1) D0[1+bs*3] = c_13;
if(m0<=2 & m1>2) D0[2+bs*3] = c_23;
if(m0<=3 & m1>3) D0[3+bs*3] = c_33;
}
else if(offsetD==1)
{
D1 = D0 + sdd*bs;
if(kn<=0)
return;
if(m0<=0 & m1>0) D0[1+bs*0] = c_00;
if(m0<=1 & m1>1) D0[2+bs*0] = c_10;
if(m0<=2 & m1>2) D0[3+bs*0] = c_20;
if(m0<=3 & m1>3) D1[0+bs*0] = c_30;
if(kn<=1)
return;
if(m0<=0 & m1>0) D0[1+bs*1] = c_01;
if(m0<=1 & m1>1) D0[2+bs*1] = c_11;
if(m0<=2 & m1>2) D0[3+bs*1] = c_21;
if(m0<=3 & m1>3) D1[0+bs*1] = c_31;
if(kn<=2)
return;
if(m0<=0 & m1>0) D0[1+bs*2] = c_02;
if(m0<=1 & m1>1) D0[2+bs*2] = c_12;
if(m0<=2 & m1>2) D0[3+bs*2] = c_22;
if(m0<=3 & m1>3) D1[0+bs*2] = c_32;
if(kn<=3)
return;
if(m0<=0 & m1>0) D0[1+bs*3] = c_03;
if(m0<=1 & m1>1) D0[2+bs*3] = c_13;
if(m0<=2 & m1>2) D0[3+bs*3] = c_23;
if(m0<=3 & m1>3) D1[0+bs*3] = c_33;
}
else if(offsetD==2)
{
D1 = D0 + sdd*bs;
if(kn<=0)
return;
if(m0<=0 & m1>0) D0[2+bs*0] = c_00;
if(m0<=1 & m1>1) D0[3+bs*0] = c_10;
if(m0<=2 & m1>2) D1[0+bs*0] = c_20;
if(m0<=3 & m1>3) D1[1+bs*0] = c_30;
if(kn<=1)
return;
if(m0<=0 & m1>0) D0[2+bs*1] = c_01;
if(m0<=1 & m1>1) D0[3+bs*1] = c_11;
if(m0<=2 & m1>2) D1[0+bs*1] = c_21;
if(m0<=3 & m1>3) D1[1+bs*1] = c_31;
if(kn<=2)
return;
if(m0<=0 & m1>0) D0[2+bs*2] = c_02;
if(m0<=1 & m1>1) D0[3+bs*2] = c_12;
if(m0<=2 & m1>2) D1[0+bs*2] = c_22;
if(m0<=3 & m1>3) D1[1+bs*2] = c_32;
if(kn<=3)
return;
if(m0<=0 & m1>0) D0[2+bs*3] = c_03;
if(m0<=1 & m1>1) D0[3+bs*3] = c_13;
if(m0<=2 & m1>2) D1[0+bs*3] = c_23;
if(m0<=3 & m1>3) D1[1+bs*3] = c_33;
}
else //if(offsetD==3)
{
D1 = D0 + sdd*bs;
if(kn<=0)
return;
if(m0<=0 & m1>0) D0[3+bs*0] = c_00;
if(m0<=1 & m1>1) D1[0+bs*0] = c_10;
if(m0<=2 & m1>2) D1[1+bs*0] = c_20;
if(m0<=3 & m1>3) D1[2+bs*0] = c_30;
if(kn<=1)
return;
if(m0<=0 & m1>0) D0[3+bs*1] = c_01;
if(m0<=1 & m1>1) D1[0+bs*1] = c_11;
if(m0<=2 & m1>2) D1[1+bs*1] = c_21;
if(m0<=3 & m1>3) D1[2+bs*1] = c_31;
if(kn<=2)
return;
if(m0<=0 & m1>0) D0[3+bs*2] = c_02;
if(m0<=1 & m1>1) D1[0+bs*2] = c_12;
if(m0<=2 & m1>2) D1[1+bs*2] = c_22;
if(m0<=3 & m1>3) D1[2+bs*2] = c_32;
if(kn<=3)
return;
if(m0<=0 & m1>0) D0[3+bs*3] = c_03;
if(m0<=1 & m1>1) D1[0+bs*3] = c_13;
if(m0<=2 & m1>2) D1[1+bs*3] = c_23;
if(m0<=3 & m1>3) D1[2+bs*3] = c_33;
}
return;
}
#endif
#if defined(TARGET_GENERIC) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
void kernel_dgemm_nt_4x4_vs_lib4(int kmax, double *alpha, double *A, double *B, double *beta, double *C, double *D, int km, int kn)
{
const int bs = 4;
double
a_0, a_1, a_2, a_3,
b_0, b_1, b_2, b_3,
c_00=0, c_01=0, c_02=0, c_03=0,
c_10=0, c_11=0, c_12=0, c_13=0,
c_20=0, c_21=0, c_22=0, c_23=0,
c_30=0, c_31=0, c_32=0, c_33=0;
int k;
for(k=0; k<kmax-3; k+=4)
{
// k = 0
a_0 = A[0];
a_1 = A[1];
a_2 = A[2];
a_3 = A[3];
b_0 = B[0];
b_1 = B[1];
b_2 = B[2];
b_3 = B[3];
c_00 += a_0 * b_0;
c_10 += a_1 * b_0;
c_20 += a_2 * b_0;
c_30 += a_3 * b_0;
c_01 += a_0 * b_1;
c_11 += a_1 * b_1;
c_21 += a_2 * b_1;
c_31 += a_3 * b_1;
c_02 += a_0 * b_2;
c_12 += a_1 * b_2;
c_22 += a_2 * b_2;
c_32 += a_3 * b_2;
c_03 += a_0 * b_3;
c_13 += a_1 * b_3;
c_23 += a_2 * b_3;
c_33 += a_3 * b_3;
// k = 1
a_0 = A[4];
a_1 = A[5];
a_2 = A[6];
a_3 = A[7];
b_0 = B[4];
b_1 = B[5];
b_2 = B[6];
b_3 = B[7];
c_00 += a_0 * b_0;
c_10 += a_1 * b_0;
c_20 += a_2 * b_0;
c_30 += a_3 * b_0;
c_01 += a_0 * b_1;
c_11 += a_1 * b_1;
c_21 += a_2 * b_1;
c_31 += a_3 * b_1;
c_02 += a_0 * b_2;
c_12 += a_1 * b_2;
c_22 += a_2 * b_2;
c_32 += a_3 * b_2;
c_03 += a_0 * b_3;
c_13 += a_1 * b_3;
c_23 += a_2 * b_3;
c_33 += a_3 * b_3;
// k = 2
a_0 = A[8];
a_1 = A[9];
a_2 = A[10];
a_3 = A[11];
b_0 = B[8];
b_1 = B[9];
b_2 = B[10];
b_3 = B[11];
c_00 += a_0 * b_0;
c_10 += a_1 * b_0;
c_20 += a_2 * b_0;
c_30 += a_3 * b_0;
c_01 += a_0 * b_1;
c_11 += a_1 * b_1;
c_21 += a_2 * b_1;
c_31 += a_3 * b_1;
c_02 += a_0 * b_2;
c_12 += a_1 * b_2;
c_22 += a_2 * b_2;
c_32 += a_3 * b_2;
c_03 += a_0 * b_3;
c_13 += a_1 * b_3;
c_23 += a_2 * b_3;
c_33 += a_3 * b_3;
// k = 3
a_0 = A[12];
a_1 = A[13];
a_2 = A[14];
a_3 = A[15];
b_0 = B[12];
b_1 = B[13];
b_2 = B[14];
b_3 = B[15];
c_00 += a_0 * b_0;
c_10 += a_1 * b_0;
c_20 += a_2 * b_0;
c_30 += a_3 * b_0;
c_01 += a_0 * b_1;
c_11 += a_1 * b_1;
c_21 += a_2 * b_1;
c_31 += a_3 * b_1;
c_02 += a_0 * b_2;
c_12 += a_1 * b_2;
c_22 += a_2 * b_2;
c_32 += a_3 * b_2;
c_03 += a_0 * b_3;
c_13 += a_1 * b_3;
c_23 += a_2 * b_3;
c_33 += a_3 * b_3;
A += 16;
B += 16;
}
for(; k<kmax; k++)
{
// k = 0
a_0 = A[0];
a_1 = A[1];
a_2 = A[2];
a_3 = A[3];
b_0 = B[0];
b_1 = B[1];
b_2 = B[2];
b_3 = B[3];
c_00 += a_0 * b_0;
c_10 += a_1 * b_0;
c_20 += a_2 * b_0;
c_30 += a_3 * b_0;
c_01 += a_0 * b_1;
c_11 += a_1 * b_1;
c_21 += a_2 * b_1;
c_31 += a_3 * b_1;
c_02 += a_0 * b_2;
c_12 += a_1 * b_2;
c_22 += a_2 * b_2;
c_32 += a_3 * b_2;
c_03 += a_0 * b_3;
c_13 += a_1 * b_3;
c_23 += a_2 * b_3;
c_33 += a_3 * b_3;
A += 4;
B += 4;
}
c_00 = beta[0]*C[0+bs*0] + alpha[0]*c_00;
c_10 = beta[0]*C[1+bs*0] + alpha[0]*c_10;
c_20 = beta[0]*C[2+bs*0] + alpha[0]*c_20;
c_30 = beta[0]*C[3+bs*0] + alpha[0]*c_30;
c_01 = beta[0]*C[0+bs*1] + alpha[0]*c_01;
c_11 = beta[0]*C[1+bs*1] + alpha[0]*c_11;
c_21 = beta[0]*C[2+bs*1] + alpha[0]*c_21;
c_31 = beta[0]*C[3+bs*1] + alpha[0]*c_31;
c_02 = beta[0]*C[0+bs*2] + alpha[0]*c_02;
c_12 = beta[0]*C[1+bs*2] + alpha[0]*c_12;
c_22 = beta[0]*C[2+bs*2] + alpha[0]*c_22;
c_32 = beta[0]*C[3+bs*2] + alpha[0]*c_32;
c_03 = beta[0]*C[0+bs*3] + alpha[0]*c_03;
c_13 = beta[0]*C[1+bs*3] + alpha[0]*c_13;
c_23 = beta[0]*C[2+bs*3] + alpha[0]*c_23;
c_33 = beta[0]*C[3+bs*3] + alpha[0]*c_33;
if(km>=4)
{
D[0+bs*0] = c_00;
D[1+bs*0] = c_10;
D[2+bs*0] = c_20;
D[3+bs*0] = c_30;
if(kn==1)
return;
D[0+bs*1] = c_01;
D[1+bs*1] = c_11;
D[2+bs*1] = c_21;
D[3+bs*1] = c_31;
if(kn==2)
return;
D[0+bs*2] = c_02;
D[1+bs*2] = c_12;
D[2+bs*2] = c_22;
D[3+bs*2] = c_32;
if(kn==3)
return;
D[0+bs*3] = c_03;
D[1+bs*3] = c_13;
D[2+bs*3] = c_23;
D[3+bs*3] = c_33;
}
else if(km>=3)
{
D[0+bs*0] = c_00;
D[1+bs*0] = c_10;
D[2+bs*0] = c_20;
if(kn==1)
return;
D[0+bs*1] = c_01;
D[1+bs*1] = c_11;
D[2+bs*1] = c_21;
if(kn==2)
return;
D[0+bs*2] = c_02;
D[1+bs*2] = c_12;
D[2+bs*2] = c_22;
if(kn==3)
return;
D[0+bs*3] = c_03;
D[1+bs*3] = c_13;
D[2+bs*3] = c_23;
}
else if(km>=2)
{
D[0+bs*0] = c_00;
D[1+bs*0] = c_10;
if(kn==1)
return;
D[0+bs*1] = c_01;
D[1+bs*1] = c_11;
if(kn==2)
return;
D[0+bs*2] = c_02;
D[1+bs*2] = c_12;
if(kn==3)
return;
D[0+bs*3] = c_03;
D[1+bs*3] = c_13;
}
else //if(km>=1)
{
D[0+bs*0] = c_00;
if(kn==1)
return;
D[0+bs*1] = c_01;
if(kn==2)
return;
D[0+bs*2] = c_02;
if(kn==3)
return;
D[0+bs*3] = c_03;
}
}
#endif
#if defined(TARGET_GENERIC)
void kernel_dgemm_nt_4x4_lib4(int kmax, double *alpha, double *A, double *B, double *beta, double *C, double *D)
{
kernel_dgemm_nt_4x4_vs_lib4(kmax, alpha, A, B, beta, C, D, 4, 4);
}
#endif
#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
void kernel_dgemm_nn_4x4_gen_lib4(int kmax, double *alpha, double *A, int offsetB, double *B, int sdb, double *beta, int offsetC, double *C0, int sdc, int offsetD, double *D0, int sdd, int m0, int m1, int n0, int n1)
{
const int bs = 4;
double
a_0, a_1, a_2, a_3,
b_0, b_1, b_2, b_3,
c_00=0, c_01=0, c_02=0, c_03=0,
c_10=0, c_11=0, c_12=0, c_13=0,
c_20=0, c_21=0, c_22=0, c_23=0,
c_30=0, c_31=0, c_32=0, c_33=0;
double
*C1, *D1;
int k;
k = 0;
if(offsetB!=0)
{
if(offsetB==1)
{
B += 1;
a_0 = A[0];
a_1 = A[1];
a_2 = A[2];
a_3 = A[3];
b_0 = B[0];
b_1 = B[4];
b_2 = B[8];
b_3 = B[12];
c_00 += a_0 * b_0;
c_10 += a_1 * b_0;
c_20 += a_2 * b_0;
c_30 += a_3 * b_0;
c_01 += a_0 * b_1;
c_11 += a_1 * b_1;
c_21 += a_2 * b_1;
c_31 += a_3 * b_1;
c_02 += a_0 * b_2;
c_12 += a_1 * b_2;
c_22 += a_2 * b_2;
c_32 += a_3 * b_2;
c_03 += a_0 * b_3;
c_13 += a_1 * b_3;
c_23 += a_2 * b_3;
c_33 += a_3 * b_3;
A += 4;
B += 1;
k += 1;
if(k>=kmax)
goto scale;
a_0 = A[0];
a_1 = A[1];
a_2 = A[2];
a_3 = A[3];
b_0 = B[0];
b_1 = B[4];
b_2 = B[8];
b_3 = B[12];
c_00 += a_0 * b_0;
c_10 += a_1 * b_0;
c_20 += a_2 * b_0;
c_30 += a_3 * b_0;
c_01 += a_0 * b_1;
c_11 += a_1 * b_1;
c_21 += a_2 * b_1;
c_31 += a_3 * b_1;
c_02 += a_0 * b_2;
c_12 += a_1 * b_2;
c_22 += a_2 * b_2;
c_32 += a_3 * b_2;
c_03 += a_0 * b_3;
c_13 += a_1 * b_3;
c_23 += a_2 * b_3;
c_33 += a_3 * b_3;
A += 4;
B += 1;
k += 1;
if(k>=kmax)
goto scale;
a_0 = A[0];
a_1 = A[1];
a_2 = A[2];
a_3 = A[3];
b_0 = B[0];
b_1 = B[4];
b_2 = B[8];
b_3 = B[12];
c_00 += a_0 * b_0;
c_10 += a_1 * b_0;
c_20 += a_2 * b_0;
c_30 += a_3 * b_0;
c_01 += a_0 * b_1;
c_11 += a_1 * b_1;
c_21 += a_2 * b_1;
c_31 += a_3 * b_1;
c_02 += a_0 * b_2;
c_12 += a_1 * b_2;
c_22 += a_2 * b_2;
c_32 += a_3 * b_2;
c_03 += a_0 * b_3;
c_13 += a_1 * b_3;
c_23 += a_2 * b_3;
c_33 += a_3 * b_3;
A += 4;
B += 1;
B += bs*(sdb-1);
k += 1;
}
else if(offsetB==2)
{
B += 2;
a_0 = A[0];
a_1 = A[1];
a_2 = A[2];
a_3 = A[3];
b_0 = B[0];
b_1 = B[4];
b_2 = B[8];
b_3 = B[12];
c_00 += a_0 * b_0;
c_10 += a_1 * b_0;
c_20 += a_2 * b_0;
c_30 += a_3 * b_0;
c_01 += a_0 * b_1;
c_11 += a_1 * b_1;
c_21 += a_2 * b_1;
c_31 += a_3 * b_1;
c_02 += a_0 * b_2;
c_12 += a_1 * b_2;
c_22 += a_2 * b_2;
c_32 += a_3 * b_2;
c_03 += a_0 * b_3;
c_13 += a_1 * b_3;
c_23 += a_2 * b_3;
c_33 += a_3 * b_3;
A += 4;
B += 1;
k += 1;
if(k>=kmax)
goto scale;
a_0 = A[0];
a_1 = A[1];
a_2 = A[2];
a_3 = A[3];
b_0 = B[0];
b_1 = B[4];
b_2 = B[8];
b_3 = B[12];
c_00 += a_0 * b_0;
c_10 += a_1 * b_0;
c_20 += a_2 * b_0;
c_30 += a_3 * b_0;
c_01 += a_0 * b_1;
c_11 += a_1 * b_1;
c_21 += a_2 * b_1;
c_31 += a_3 * b_1;
c_02 += a_0 * b_2;
c_12 += a_1 * b_2;
c_22 += a_2 * b_2;
c_32 += a_3 * b_2;
c_03 += a_0 * b_3;
c_13 += a_1 * b_3;
c_23 += a_2 * b_3;
c_33 += a_3 * b_3;
A += 4;
B += 1;
B += bs*(sdb-1);
k += 1;
}
else // if(offsetB==3)
{
B += 3;
a_0 = A[0];
a_1 = A[1];
a_2 = A[2];
a_3 = A[3];
b_0 = B[0];
b_1 = B[4];
b_2 = B[8];
b_3 = B[12];
c_00 += a_0 * b_0;
c_10 += a_1 * b_0;
c_20 += a_2 * b_0;
c_30 += a_3 * b_0;
c_01 += a_0 * b_1;
c_11 += a_1 * b_1;
c_21 += a_2 * b_1;
c_31 += a_3 * b_1;
c_02 += a_0 * b_2;
c_12 += a_1 * b_2;
c_22 += a_2 * b_2;
c_32 += a_3 * b_2;
c_03 += a_0 * b_3;
c_13 += a_1 * b_3;
c_23 += a_2 * b_3;
c_33 += a_3 * b_3;
A += 4;
B += 1;
B += bs*(sdb-1);
k += 1;
}
}
for(; k<kmax-3; k+=4)
{
// k = 0
a_0 = A[0];
a_1 = A[1];
a_2 = A[2];
a_3 = A[3];
b_0 = B[0];
b_1 = B[4];
b_2 = B[8];
b_3 = B[12];
c_00 += a_0 * b_0;
c_10 += a_1 * b_0;
c_20 += a_2 * b_0;
c_30 += a_3 * b_0;
c_01 += a_0 * b_1;
c_11 += a_1 * b_1;
c_21 += a_2 * b_1;
c_31 += a_3 * b_1;
c_02 += a_0 * b_2;
c_12 += a_1 * b_2;
c_22 += a_2 * b_2;
c_32 += a_3 * b_2;
c_03 += a_0 * b_3;
c_13 += a_1 * b_3;
c_23 += a_2 * b_3;
c_33 += a_3 * b_3;
// k = 1
a_0 = A[4];
a_1 = A[5];
a_2 = A[6];
a_3 = A[7];
b_0 = B[1];
b_1 = B[5];
b_2 = B[9];
b_3 = B[13];
c_00 += a_0 * b_0;
c_10 += a_1 * b_0;
c_20 += a_2 * b_0;
c_30 += a_3 * b_0;
c_01 += a_0 * b_1;
c_11 += a_1 * b_1;
c_21 += a_2 * b_1;
c_31 += a_3 * b_1;
c_02 += a_0 * b_2;
c_12 += a_1 * b_2;
c_22 += a_2 * b_2;
c_32 += a_3 * b_2;
c_03 += a_0 * b_3;
c_13 += a_1 * b_3;
c_23 += a_2 * b_3;
c_33 += a_3 * b_3;
// k = 2
a_0 = A[8];
a_1 = A[9];
a_2 = A[10];
a_3 = A[11];
b_0 = B[2];
b_1 = B[6];
b_2 = B[10];
b_3 = B[14];
c_00 += a_0 * b_0;
c_10 += a_1 * b_0;
c_20 += a_2 * b_0;
c_30 += a_3 * b_0;
c_01 += a_0 * b_1;
c_11 += a_1 * b_1;
c_21 += a_2 * b_1;
c_31 += a_3 * b_1;
c_02 += a_0 * b_2;
c_12 += a_1 * b_2;
c_22 += a_2 * b_2;
c_32 += a_3 * b_2;
c_03 += a_0 * b_3;
c_13 += a_1 * b_3;
c_23 += a_2 * b_3;
c_33 += a_3 * b_3;
// k = 3
a_0 = A[12];
a_1 = A[13];
a_2 = A[14];
a_3 = A[15];
b_0 = B[3];
b_1 = B[7];
b_2 = B[11];
b_3 = B[15];
c_00 += a_0 * b_0;
c_10 += a_1 * b_0;
c_20 += a_2 * b_0;
c_30 += a_3 * b_0;
c_01 += a_0 * b_1;
c_11 += a_1 * b_1;
c_21 += a_2 * b_1;
c_31 += a_3 * b_1;
c_02 += a_0 * b_2;
c_12 += a_1 * b_2;
c_22 += a_2 * b_2;
c_32 += a_3 * b_2;
c_03 += a_0 * b_3;
c_13 += a_1 * b_3;
c_23 += a_2 * b_3;
c_33 += a_3 * b_3;
A += 16;
B += 4*sdb;
}
for(; k<kmax; k++)
{
// k = 0
a_0 = A[0];
a_1 = A[1];
a_2 = A[2];
a_3 = A[3];
b_0 = B[0];
b_1 = B[4];
b_2 = B[8];
b_3 = B[12];
c_00 += a_0 * b_0;
c_10 += a_1 * b_0;
c_20 += a_2 * b_0;
c_30 += a_3 * b_0;
c_01 += a_0 * b_1;
c_11 += a_1 * b_1;
c_21 += a_2 * b_1;
c_31 += a_3 * b_1;
c_02 += a_0 * b_2;
c_12 += a_1 * b_2;
c_22 += a_2 * b_2;
c_32 += a_3 * b_2;
c_03 += a_0 * b_3;
c_13 += a_1 * b_3;
c_23 += a_2 * b_3;
c_33 += a_3 * b_3;
A += 4;
B += 1;
}
scale:
if(offsetC==0)
{
c_00 = beta[0]*C0[0+bs*0] + alpha[0]*c_00;
c_10 = beta[0]*C0[1+bs*0] + alpha[0]*c_10;
c_20 = beta[0]*C0[2+bs*0] + alpha[0]*c_20;
c_30 = beta[0]*C0[3+bs*0] + alpha[0]*c_30;
c_01 = beta[0]*C0[0+bs*1] + alpha[0]*c_01;
c_11 = beta[0]*C0[1+bs*1] + alpha[0]*c_11;
c_21 = beta[0]*C0[2+bs*1] + alpha[0]*c_21;
c_31 = beta[0]*C0[3+bs*1] + alpha[0]*c_31;
c_02 = beta[0]*C0[0+bs*2] + alpha[0]*c_02;
c_12 = beta[0]*C0[1+bs*2] + alpha[0]*c_12;
c_22 = beta[0]*C0[2+bs*2] + alpha[0]*c_22;
c_32 = beta[0]*C0[3+bs*2] + alpha[0]*c_32;
c_03 = beta[0]*C0[0+bs*3] + alpha[0]*c_03;
c_13 = beta[0]*C0[1+bs*3] + alpha[0]*c_13;
c_23 = beta[0]*C0[2+bs*3] + alpha[0]*c_23;
c_33 = beta[0]*C0[3+bs*3] + alpha[0]*c_33;
}
else if(offsetC==1)
{
C1 = C0 + sdc*bs;
c_00 = beta[0]*C0[1+bs*0] + alpha[0]*c_00;
c_10 = beta[0]*C0[2+bs*0] + alpha[0]*c_10;
c_20 = beta[0]*C0[3+bs*0] + alpha[0]*c_20;
c_30 = beta[0]*C1[0+bs*0] + alpha[0]*c_30;
c_01 = beta[0]*C0[1+bs*1] + alpha[0]*c_01;
c_11 = beta[0]*C0[2+bs*1] + alpha[0]*c_11;
c_21 = beta[0]*C0[3+bs*1] + alpha[0]*c_21;
c_31 = beta[0]*C1[0+bs*1] + alpha[0]*c_31;
c_02 = beta[0]*C0[1+bs*2] + alpha[0]*c_02;
c_12 = beta[0]*C0[2+bs*2] + alpha[0]*c_12;
c_22 = beta[0]*C0[3+bs*2] + alpha[0]*c_22;
c_32 = beta[0]*C1[0+bs*2] + alpha[0]*c_32;
c_03 = beta[0]*C0[1+bs*3] + alpha[0]*c_03;
c_13 = beta[0]*C0[2+bs*3] + alpha[0]*c_13;
c_23 = beta[0]*C0[3+bs*3] + alpha[0]*c_23;
c_33 = beta[0]*C1[0+bs*3] + alpha[0]*c_33;
}
else if(offsetC==2)
{
C1 = C0 + sdc*bs;
c_00 = beta[0]*C0[2+bs*0] + alpha[0]*c_00;
c_10 = beta[0]*C0[3+bs*0] + alpha[0]*c_10;
c_20 = beta[0]*C1[0+bs*0] + alpha[0]*c_20;
c_30 = beta[0]*C1[1+bs*0] + alpha[0]*c_30;
c_01 = beta[0]*C0[2+bs*1] + alpha[0]*c_01;
c_11 = beta[0]*C0[3+bs*1] + alpha[0]*c_11;
c_21 = beta[0]*C1[0+bs*1] + alpha[0]*c_21;
c_31 = beta[0]*C1[1+bs*1] + alpha[0]*c_31;
c_02 = beta[0]*C0[2+bs*2] + alpha[0]*c_02;
c_12 = beta[0]*C0[3+bs*2] + alpha[0]*c_12;
c_22 = beta[0]*C1[0+bs*2] + alpha[0]*c_22;
c_32 = beta[0]*C1[1+bs*2] + alpha[0]*c_32;
c_03 = beta[0]*C0[2+bs*3] + alpha[0]*c_03;
c_13 = beta[0]*C0[3+bs*3] + alpha[0]*c_13;
c_23 = beta[0]*C1[0+bs*3] + alpha[0]*c_23;
c_33 = beta[0]*C1[1+bs*3] + alpha[0]*c_33;
}
else //if(offsetC==3)
{
C1 = C0 + sdc*bs;
c_00 = beta[0]*C0[3+bs*0] + alpha[0]*c_00;
c_10 = beta[0]*C1[0+bs*0] + alpha[0]*c_10;
c_20 = beta[0]*C1[1+bs*0] + alpha[0]*c_20;
c_30 = beta[0]*C1[2+bs*0] + alpha[0]*c_30;
c_01 = beta[0]*C0[3+bs*1] + alpha[0]*c_01;
c_11 = beta[0]*C1[0+bs*1] + alpha[0]*c_11;
c_21 = beta[0]*C1[1+bs*1] + alpha[0]*c_21;
c_31 = beta[0]*C1[2+bs*1] + alpha[0]*c_31;
c_02 = beta[0]*C0[3+bs*2] + alpha[0]*c_02;
c_12 = beta[0]*C1[0+bs*2] + alpha[0]*c_12;
c_22 = beta[0]*C1[1+bs*2] + alpha[0]*c_22;
c_32 = beta[0]*C1[2+bs*2] + alpha[0]*c_32;
c_03 = beta[0]*C0[3+bs*3] + alpha[0]*c_03;
c_13 = beta[0]*C1[0+bs*3] + alpha[0]*c_13;
c_23 = beta[0]*C1[1+bs*3] + alpha[0]*c_23;
c_33 = beta[0]*C1[2+bs*3] + alpha[0]*c_33;
}
// shift sol for cols
if(n0>0)
{
if(n0==1)
{
c_00 = c_01;
c_10 = c_11;
c_20 = c_21;
c_30 = c_31;
c_01 = c_02;
c_11 = c_12;
c_21 = c_22;
c_31 = c_32;
c_02 = c_03;
c_12 = c_13;
c_22 = c_23;
c_32 = c_33;
D0 += 1*bs;
}
else if(n0==2)
{
c_00 = c_02;
c_10 = c_12;
c_20 = c_22;
c_30 = c_32;
c_01 = c_03;
c_11 = c_13;
c_21 = c_23;
c_31 = c_33;
D0 += 2*bs;
}
else //if(n0==3)
{
c_00 = c_03;
c_10 = c_13;
c_20 = c_23;
c_30 = c_33;
D0 += 3*bs;
}
}
int kn = n1 - n0;
if(offsetD==0)
{
if(kn<=0)
return;
if(m0<=0 & m1>0) D0[0+bs*0] = c_00;
if(m0<=1 & m1>1) D0[1+bs*0] = c_10;
if(m0<=2 & m1>2) D0[2+bs*0] = c_20;
if(m0<=3 & m1>3) D0[3+bs*0] = c_30;
if(kn<=1)
return;
if(m0<=0 & m1>0) D0[0+bs*1] = c_01;
if(m0<=1 & m1>1) D0[1+bs*1] = c_11;
if(m0<=2 & m1>2) D0[2+bs*1] = c_21;
if(m0<=3 & m1>3) D0[3+bs*1] = c_31;
if(kn<=2)
return;
if(m0<=0 & m1>0) D0[0+bs*2] = c_02;
if(m0<=1 & m1>1) D0[1+bs*2] = c_12;
if(m0<=2 & m1>2) D0[2+bs*2] = c_22;
if(m0<=3 & m1>3) D0[3+bs*2] = c_32;
if(kn<=3)
return;
if(m0<=0 & m1>0) D0[0+bs*3] = c_03;
if(m0<=1 & m1>1) D0[1+bs*3] = c_13;
if(m0<=2 & m1>2) D0[2+bs*3] = c_23;
if(m0<=3 & m1>3) D0[3+bs*3] = c_33;
}
else if(offsetD==1)
{
D1 = D0 + sdd*bs;
if(kn<=0)
return;
if(m0<=0 & m1>0) D0[1+bs*0] = c_00;
if(m0<=1 & m1>1) D0[2+bs*0] = c_10;
if(m0<=2 & m1>2) D0[3+bs*0] = c_20;
if(m0<=3 & m1>3) D1[0+bs*0] = c_30;
if(kn<=1)
return;
if(m0<=0 & m1>0) D0[1+bs*1] = c_01;
if(m0<=1 & m1>1) D0[2+bs*1] = c_11;
if(m0<=2 & m1>2) D0[3+bs*1] = c_21;
if(m0<=3 & m1>3) D1[0+bs*1] = c_31;
if(kn<=2)
return;
if(m0<=0 & m1>0) D0[1+bs*2] = c_02;
if(m0<=1 & m1>1) D0[2+bs*2] = c_12;
if(m0<=2 & m1>2) D0[3+bs*2] = c_22;
if(m0<=3 & m1>3) D1[0+bs*2] = c_32;
if(kn<=3)
return;
if(m0<=0 & m1>0) D0[1+bs*3] = c_03;
if(m0<=1 & m1>1) D0[2+bs*3] = c_13;
if(m0<=2 & m1>2) D0[3+bs*3] = c_23;
if(m0<=3 & m1>3) D1[0+bs*3] = c_33;
}
else if(offsetD==2)
{
D1 = D0 + sdd*bs;
if(kn<=0)
return;
if(m0<=0 & m1>0) D0[2+bs*0] = c_00;
if(m0<=1 & m1>1) D0[3+bs*0] = c_10;
if(m0<=2 & m1>2) D1[0+bs*0] = c_20;
if(m0<=3 & m1>3) D1[1+bs*0] = c_30;
if(kn<=1)
return;
if(m0<=0 & m1>0) D0[2+bs*1] = c_01;
if(m0<=1 & m1>1) D0[3+bs*1] = c_11;
if(m0<=2 & m1>2) D1[0+bs*1] = c_21;
if(m0<=3 & m1>3) D1[1+bs*1] = c_31;
if(kn<=2)
return;
if(m0<=0 & m1>0) D0[2+bs*2] = c_02;
if(m0<=1 & m1>1) D0[3+bs*2] = c_12;
if(m0<=2 & m1>2) D1[0+bs*2] = c_22;
if(m0<=3 & m1>3) D1[1+bs*2] = c_32;
if(kn<=3)
return;
if(m0<=0 & m1>0) D0[2+bs*3] = c_03;
if(m0<=1 & m1>1) D0[3+bs*3] = c_13;
if(m0<=2 & m1>2) D1[0+bs*3] = c_23;
if(m0<=3 & m1>3) D1[1+bs*3] = c_33;
}
else //if(offsetD==3)
{
D1 = D0 + sdd*bs;
if(kn<=0)
return;
if(m0<=0 & m1>0) D0[3+bs*0] = c_00;
if(m0<=1 & m1>1) D1[0+bs*0] = c_10;
if(m0<=2 & m1>2) D1[1+bs*0] = c_20;
if(m0<=3 & m1>3) D1[2+bs*0] = c_30;
if(kn<=1)
return;
if(m0<=0 & m1>0) D0[3+bs*1] = c_01;
if(m0<=1 & m1>1) D1[0+bs*1] = c_11;
if(m0<=2 & m1>2) D1[1+bs*1] = c_21;
if(m0<=3 & m1>3) D1[2+bs*1] = c_31;
if(kn<=2)
return;
if(m0<=0 & m1>0) D0[3+bs*2] = c_02;
if(m0<=1 & m1>1) D1[0+bs*2] = c_12;
if(m0<=2 & m1>2) D1[1+bs*2] = c_22;
if(m0<=3 & m1>3) D1[2+bs*2] = c_32;
if(kn<=3)
return;
if(m0<=0 & m1>0) D0[3+bs*3] = c_03;
if(m0<=1 & m1>1) D1[0+bs*3] = c_13;
if(m0<=2 & m1>2) D1[1+bs*3] = c_23;
if(m0<=3 & m1>3) D1[2+bs*3] = c_33;
}
return;
}
#endif
#if defined(TARGET_GENERIC) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
void kernel_dgemm_nn_4x4_lib4(int kmax, double *alpha, double *A, int offsetB, double *B, int sdb, double *beta, double *C, double *D)
{
kernel_dgemm_nn_4x4_gen_lib4(kmax, alpha, A, offsetB, B, sdb, beta, 0, C, 0, 0, D, 0, 0, 4, 0, 4);
}
#endif
#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
void kernel_dsyrk_nt_l_4x4_gen_lib4(int kmax, double *alpha, double *A, double *B, double *beta, int offsetC, double *C0, int sdc, int offsetD, double *D0, int sdd, int m0, int m1, int n0, int n1)
{
const int bs = 4;
double
a_0, a_1, a_2, a_3,
b_0, b_1, b_2, b_3,
c_00=0,
c_10=0, c_11=0,
c_20=0, c_21=0, c_22=0,
c_30=0, c_31=0, c_32=0, c_33=0;
double
*C1, *D1;
int k;
for(k=0; k<kmax-3; k+=4)
{
// k = 0
a_0 = A[0];
a_1 = A[1];
a_2 = A[2];
a_3 = A[3];
b_0 = B[0];
b_1 = B[1];
b_2 = B[2];
b_3 = B[3];
c_00 += a_0 * b_0;
c_10 += a_1 * b_0;
c_20 += a_2 * b_0;
c_30 += a_3 * b_0;
c_11 += a_1 * b_1;
c_21 += a_2 * b_1;
c_31 += a_3 * b_1;
c_22 += a_2 * b_2;
c_32 += a_3 * b_2;
c_33 += a_3 * b_3;
// k = 1
a_0 = A[4];
a_1 = A[5];
a_2 = A[6];
a_3 = A[7];
b_0 = B[4];
b_1 = B[5];
b_2 = B[6];
b_3 = B[7];
c_00 += a_0 * b_0;
c_10 += a_1 * b_0;
c_20 += a_2 * b_0;
c_30 += a_3 * b_0;
c_11 += a_1 * b_1;
c_21 += a_2 * b_1;
c_31 += a_3 * b_1;
c_22 += a_2 * b_2;
c_32 += a_3 * b_2;
c_33 += a_3 * b_3;
// k = 2
a_0 = A[8];
a_1 = A[9];
a_2 = A[10];
a_3 = A[11];
b_0 = B[8];
b_1 = B[9];
b_2 = B[10];
b_3 = B[11];
c_00 += a_0 * b_0;
c_10 += a_1 * b_0;
c_20 += a_2 * b_0;
c_30 += a_3 * b_0;
c_11 += a_1 * b_1;
c_21 += a_2 * b_1;
c_31 += a_3 * b_1;
c_22 += a_2 * b_2;
c_32 += a_3 * b_2;
c_33 += a_3 * b_3;
// k = 3
a_0 = A[12];
a_1 = A[13];
a_2 = A[14];
a_3 = A[15];
b_0 = B[12];
b_1 = B[13];
b_2 = B[14];
b_3 = B[15];
c_00 += a_0 * b_0;
c_10 += a_1 * b_0;
c_20 += a_2 * b_0;
c_30 += a_3 * b_0;
c_11 += a_1 * b_1;
c_21 += a_2 * b_1;
c_31 += a_3 * b_1;
c_22 += a_2 * b_2;
c_32 += a_3 * b_2;
c_33 += a_3 * b_3;
A += 16;
B += 16;
}
for(; k<kmax; k++)
{
// k = 0
a_0 = A[0];
a_1 = A[1];
a_2 = A[2];
a_3 = A[3];
b_0 = B[0];
b_1 = B[1];
b_2 = B[2];
b_3 = B[3];
c_00 += a_0 * b_0;
c_10 += a_1 * b_0;
c_20 += a_2 * b_0;
c_30 += a_3 * b_0;
c_11 += a_1 * b_1;
c_21 += a_2 * b_1;
c_31 += a_3 * b_1;
c_22 += a_2 * b_2;
c_32 += a_3 * b_2;
c_33 += a_3 * b_3;
A += 4;
B += 4;
}
if(offsetC==0)
{
c_00 = beta[0]*C0[0+bs*0] + alpha[0]*c_00;
c_10 = beta[0]*C0[1+bs*0] + alpha[0]*c_10;
c_20 = beta[0]*C0[2+bs*0] + alpha[0]*c_20;
c_30 = beta[0]*C0[3+bs*0] + alpha[0]*c_30;
c_11 = beta[0]*C0[1+bs*1] + alpha[0]*c_11;
c_21 = beta[0]*C0[2+bs*1] + alpha[0]*c_21;
c_31 = beta[0]*C0[3+bs*1] + alpha[0]*c_31;
c_22 = beta[0]*C0[2+bs*2] + alpha[0]*c_22;
c_32 = beta[0]*C0[3+bs*2] + alpha[0]*c_32;
c_33 = beta[0]*C0[3+bs*3] + alpha[0]*c_33;
}
else if(offsetC==1)
{
C1 = C0 + sdc*bs;
c_00 = beta[0]*C0[1+bs*0] + alpha[0]*c_00;
c_10 = beta[0]*C0[2+bs*0] + alpha[0]*c_10;
c_20 = beta[0]*C0[3+bs*0] + alpha[0]*c_20;
c_30 = beta[0]*C1[0+bs*0] + alpha[0]*c_30;
c_11 = beta[0]*C0[2+bs*1] + alpha[0]*c_11;
c_21 = beta[0]*C0[3+bs*1] + alpha[0]*c_21;
c_31 = beta[0]*C1[0+bs*1] + alpha[0]*c_31;
c_22 = beta[0]*C0[3+bs*2] + alpha[0]*c_22;
c_32 = beta[0]*C1[0+bs*2] + alpha[0]*c_32;
c_33 = beta[0]*C1[0+bs*3] + alpha[0]*c_33;
}
else if(offsetC==2)
{
C1 = C0 + sdc*bs;
c_00 = beta[0]*C0[2+bs*0] + alpha[0]*c_00;
c_10 = beta[0]*C0[3+bs*0] + alpha[0]*c_10;
c_20 = beta[0]*C1[0+bs*0] + alpha[0]*c_20;
c_30 = beta[0]*C1[1+bs*0] + alpha[0]*c_30;
c_11 = beta[0]*C0[3+bs*1] + alpha[0]*c_11;
c_21 = beta[0]*C1[0+bs*1] + alpha[0]*c_21;
c_31 = beta[0]*C1[1+bs*1] + alpha[0]*c_31;
c_22 = beta[0]*C1[0+bs*2] + alpha[0]*c_22;
c_32 = beta[0]*C1[1+bs*2] + alpha[0]*c_32;
c_33 = beta[0]*C1[1+bs*3] + alpha[0]*c_33;
}
else //if(offsetC==3)
{
C1 = C0 + sdc*bs;
c_00 = beta[0]*C0[3+bs*0] + alpha[0]*c_00;
c_10 = beta[0]*C1[0+bs*0] + alpha[0]*c_10;
c_20 = beta[0]*C1[1+bs*0] + alpha[0]*c_20;
c_30 = beta[0]*C1[2+bs*0] + alpha[0]*c_30;
c_11 = beta[0]*C1[0+bs*1] + alpha[0]*c_11;
c_21 = beta[0]*C1[1+bs*1] + alpha[0]*c_21;
c_31 = beta[0]*C1[2+bs*1] + alpha[0]*c_31;
c_22 = beta[0]*C1[1+bs*2] + alpha[0]*c_22;
c_32 = beta[0]*C1[2+bs*2] + alpha[0]*c_32;
c_33 = beta[0]*C1[2+bs*3] + alpha[0]*c_33;
}
// shift sol for cols
if(n0>0)
{
if(n0==1)
{
c_10 = c_11;
c_20 = c_21;
c_30 = c_31;
c_21 = c_22;
c_31 = c_32;
c_32 = c_33;
D0 += 1*bs;
}
else if(n0==2)
{
c_20 = c_22;
c_30 = c_32;
c_31 = c_33;
D0 += 2*bs;
}
else //if(n0==3)
{
c_30 = c_33;
D0 += 3*bs;
}
}
int kn = n1 - n0;
if(offsetD==0)
{
if(kn<=0)
return;
if(m0<=0 & m1>0) D0[0+bs*0] = c_00;
if(m0<=1 & m1>1) D0[1+bs*0] = c_10;
if(m0<=2 & m1>2) D0[2+bs*0] = c_20;
if(m0<=3 & m1>3) D0[3+bs*0] = c_30;
if(kn<=1)
return;
if(m0<=1 & m1>1) D0[1+bs*1] = c_11;
if(m0<=2 & m1>2) D0[2+bs*1] = c_21;
if(m0<=3 & m1>3) D0[3+bs*1] = c_31;
if(kn<=2)
return;
if(m0<=2 & m1>2) D0[2+bs*2] = c_22;
if(m0<=3 & m1>3) D0[3+bs*2] = c_32;
if(kn<=3)
return;
if(m0<=3 & m1>3) D0[3+bs*3] = c_33;
}
else if(offsetD==1)
{
D1 = D0 + sdd*bs;
if(kn<=0)
return;
if(m0<=0 & m1>0) D0[1+bs*0] = c_00;
if(m0<=1 & m1>1) D0[2+bs*0] = c_10;
if(m0<=2 & m1>2) D0[3+bs*0] = c_20;
if(m0<=3 & m1>3) D1[0+bs*0] = c_30;
if(kn<=1)
return;
if(m0<=1 & m1>1) D0[2+bs*1] = c_11;
if(m0<=2 & m1>2) D0[3+bs*1] = c_21;
if(m0<=3 & m1>3) D1[0+bs*1] = c_31;
if(kn<=2)
return;
if(m0<=2 & m1>2) D0[3+bs*2] = c_22;
if(m0<=3 & m1>3) D1[0+bs*2] = c_32;
if(kn<=3)
return;
if(m0<=3 & m1>3) D1[0+bs*3] = c_33;
}
else if(offsetD==2)
{
D1 = D0 + sdd*bs;
if(kn<=0)
return;
if(m0<=0 & m1>0) D0[2+bs*0] = c_00;
if(m0<=1 & m1>1) D0[3+bs*0] = c_10;
if(m0<=2 & m1>2) D1[0+bs*0] = c_20;
if(m0<=3 & m1>3) D1[1+bs*0] = c_30;
if(kn<=1)
return;
if(m0<=1 & m1>1) D0[3+bs*1] = c_11;
if(m0<=2 & m1>2) D1[0+bs*1] = c_21;
if(m0<=3 & m1>3) D1[1+bs*1] = c_31;
if(kn<=2)
return;
if(m0<=2 & m1>2) D1[0+bs*2] = c_22;
if(m0<=3 & m1>3) D1[1+bs*2] = c_32;
if(kn<=3)
return;
if(m0<=3 & m1>3) D1[1+bs*3] = c_33;
}
else //if(offsetD==3)
{
D1 = D0 + sdd*bs;
if(kn<=0)
return;
if(m0<=0 & m1>0) D0[3+bs*0] = c_00;
if(m0<=1 & m1>1) D1[0+bs*0] = c_10;
if(m0<=2 & m1>2) D1[1+bs*0] = c_20;
if(m0<=3 & m1>3) D1[2+bs*0] = c_30;
if(kn<=1)
return;
if(m0<=1 & m1>1) D1[0+bs*1] = c_11;
if(m0<=2 & m1>2) D1[1+bs*1] = c_21;
if(m0<=3 & m1>3) D1[2+bs*1] = c_31;
if(kn<=2)
return;
if(m0<=2 & m1>2) D1[1+bs*2] = c_22;
if(m0<=3 & m1>3) D1[2+bs*2] = c_32;
if(kn<=3)
return;
if(m0<=3 & m1>3) D1[2+bs*3] = c_33;
}
return;
}
#endif
#if defined(TARGET_GENERIC) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
void kernel_dsyrk_nt_l_4x4_vs_lib4(int kmax, double *alpha, double *A, double *B, double *beta, double *C, double *D, int km, int kn)
{
const int bs = 4;
double
a_0, a_1, a_2, a_3,
b_0, b_1, b_2, b_3,
c_00=0,
c_10=0, c_11=0,
c_20=0, c_21=0, c_22=0,
c_30=0, c_31=0, c_32=0, c_33=0;
int k;
for(k=0; k<kmax-3; k+=4)
{
// k = 0
a_0 = A[0];
a_1 = A[1];
a_2 = A[2];
a_3 = A[3];
b_0 = B[0];
b_1 = B[1];
b_2 = B[2];
b_3 = B[3];
c_00 += a_0 * b_0;
c_10 += a_1 * b_0;
c_20 += a_2 * b_0;
c_30 += a_3 * b_0;
c_11 += a_1 * b_1;
c_21 += a_2 * b_1;
c_31 += a_3 * b_1;
c_22 += a_2 * b_2;
c_32 += a_3 * b_2;
c_33 += a_3 * b_3;
// k = 1
a_0 = A[4];
a_1 = A[5];
a_2 = A[6];
a_3 = A[7];
b_0 = B[4];
b_1 = B[5];
b_2 = B[6];
b_3 = B[7];
c_00 += a_0 * b_0;
c_10 += a_1 * b_0;
c_20 += a_2 * b_0;
c_30 += a_3 * b_0;
c_11 += a_1 * b_1;
c_21 += a_2 * b_1;
c_31 += a_3 * b_1;
c_22 += a_2 * b_2;
c_32 += a_3 * b_2;
c_33 += a_3 * b_3;
// k = 2
a_0 = A[8];
a_1 = A[9];
a_2 = A[10];
a_3 = A[11];
b_0 = B[8];
b_1 = B[9];
b_2 = B[10];
b_3 = B[11];
c_00 += a_0 * b_0;
c_10 += a_1 * b_0;
c_20 += a_2 * b_0;
c_30 += a_3 * b_0;
c_11 += a_1 * b_1;
c_21 += a_2 * b_1;
c_31 += a_3 * b_1;
c_22 += a_2 * b_2;
c_32 += a_3 * b_2;
c_33 += a_3 * b_3;
// k = 3
a_0 = A[12];
a_1 = A[13];
a_2 = A[14];
a_3 = A[15];
b_0 = B[12];
b_1 = B[13];
b_2 = B[14];
b_3 = B[15];
c_00 += a_0 * b_0;
c_10 += a_1 * b_0;
c_20 += a_2 * b_0;
c_30 += a_3 * b_0;
c_11 += a_1 * b_1;
c_21 += a_2 * b_1;
c_31 += a_3 * b_1;
c_22 += a_2 * b_2;
c_32 += a_3 * b_2;
c_33 += a_3 * b_3;
A += 16;
B += 16;
}
for(; k<kmax; k++)
{
// k = 0
a_0 = A[0];
a_1 = A[1];
a_2 = A[2];
a_3 = A[3];
b_0 = B[0];
b_1 = B[1];
b_2 = B[2];
b_3 = B[3];
c_00 += a_0 * b_0;
c_10 += a_1 * b_0;
c_20 += a_2 * b_0;
c_30 += a_3 * b_0;
c_11 += a_1 * b_1;
c_21 += a_2 * b_1;
c_31 += a_3 * b_1;
c_22 += a_2 * b_2;
c_32 += a_3 * b_2;
c_33 += a_3 * b_3;
A += 4;
B += 4;
}
c_00 = beta[0]*C[0+bs*0] + alpha[0]*c_00;
c_10 = beta[0]*C[1+bs*0] + alpha[0]*c_10;
c_20 = beta[0]*C[2+bs*0] + alpha[0]*c_20;
c_30 = beta[0]*C[3+bs*0] + alpha[0]*c_30;
c_11 = beta[0]*C[1+bs*1] + alpha[0]*c_11;
c_21 = beta[0]*C[2+bs*1] + alpha[0]*c_21;
c_31 = beta[0]*C[3+bs*1] + alpha[0]*c_31;
c_22 = beta[0]*C[2+bs*2] + alpha[0]*c_22;
c_32 = beta[0]*C[3+bs*2] + alpha[0]*c_32;
c_33 = beta[0]*C[3+bs*3] + alpha[0]*c_33;
if(km>=4)
{
D[0+bs*0] = c_00;
D[1+bs*0] = c_10;
D[2+bs*0] = c_20;
D[3+bs*0] = c_30;
if(kn==1)
return;
D[1+bs*1] = c_11;
D[2+bs*1] = c_21;
D[3+bs*1] = c_31;
if(kn==2)
return;
D[2+bs*2] = c_22;
D[3+bs*2] = c_32;
if(kn==3)
return;
D[3+bs*3] = c_33;
}
else if(km>=3)
{
D[0+bs*0] = c_00;
D[1+bs*0] = c_10;
D[2+bs*0] = c_20;
if(kn==1)
return;
D[1+bs*1] = c_11;
D[2+bs*1] = c_21;
if(kn==2)
return;
D[2+bs*2] = c_22;
}
else if(km>=2)
{
D[0+bs*0] = c_00;
D[1+bs*0] = c_10;
if(kn==1)
return;
D[1+bs*1] = c_11;
}
else //if(km>=1)
{
D[0+bs*0] = c_00;
}
}
#endif
#if defined(TARGET_GENERIC) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
void kernel_dsyrk_nt_l_4x4_lib4(int kmax, double *alpha, double *A, double *B, double *beta, double *C, double *D)
{
kernel_dsyrk_nt_l_4x4_vs_lib4(kmax, alpha, A, B, beta, C, D, 4, 4);
}
#endif
#if defined(TARGET_GENERIC) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
void kernel_dtrmm_nt_ru_4x4_vs_lib4(int kmax, double *alpha, double *A, double *B, double *beta, double *C, double *D, int km, int kn)
{
const int bs = 4;
double
a_0, a_1, a_2, a_3,
b_0, b_1, b_2, b_3,
c_00=0, c_01=0, c_02=0, c_03=0,
c_10=0, c_11=0, c_12=0, c_13=0,
c_20=0, c_21=0, c_22=0, c_23=0,
c_30=0, c_31=0, c_32=0, c_33=0;
int k;
k = 0;
// k = 0
if(kmax>0)
{
a_0 = A[0];
a_1 = A[1];
a_2 = A[2];
a_3 = A[3];
b_0 = B[0];
c_00 += a_0 * b_0;
c_10 += a_1 * b_0;
c_20 += a_2 * b_0;
c_30 += a_3 * b_0;
A += 4;
B += 4;
k++;
}
// k = 1
if(kmax>0)
{
a_0 = A[0];
a_1 = A[1];
a_2 = A[2];
a_3 = A[3];
b_0 = B[0];
b_1 = B[1];
c_00 += a_0 * b_0;
c_10 += a_1 * b_0;
c_20 += a_2 * b_0;
c_30 += a_3 * b_0;
c_01 += a_0 * b_1;
c_11 += a_1 * b_1;
c_21 += a_2 * b_1;
c_31 += a_3 * b_1;
A += 4;
B += 4;
k++;
}
// k = 2
if(kmax>0)
{
a_0 = A[0];
a_1 = A[1];
a_2 = A[2];
a_3 = A[3];
b_0 = B[0];
b_1 = B[1];
b_2 = B[2];
c_00 += a_0 * b_0;
c_10 += a_1 * b_0;
c_20 += a_2 * b_0;
c_30 += a_3 * b_0;
c_01 += a_0 * b_1;
c_11 += a_1 * b_1;
c_21 += a_2 * b_1;
c_31 += a_3 * b_1;
c_02 += a_0 * b_2;
c_12 += a_1 * b_2;
c_22 += a_2 * b_2;
c_32 += a_3 * b_2;
A += 4;
B += 4;
k++;
}
for(; k<kmax-3; k+=4)
{
// k = 0
a_0 = A[0];
a_1 = A[1];
a_2 = A[2];
a_3 = A[3];
b_0 = B[0];
b_1 = B[1];
b_2 = B[2];
b_3 = B[3];
c_00 += a_0 * b_0;
c_10 += a_1 * b_0;
c_20 += a_2 * b_0;
c_30 += a_3 * b_0;
c_01 += a_0 * b_1;
c_11 += a_1 * b_1;
c_21 += a_2 * b_1;
c_31 += a_3 * b_1;
c_02 += a_0 * b_2;
c_12 += a_1 * b_2;
c_22 += a_2 * b_2;
c_32 += a_3 * b_2;
c_03 += a_0 * b_3;
c_13 += a_1 * b_3;
c_23 += a_2 * b_3;
c_33 += a_3 * b_3;
// k = 1
a_0 = A[4];
a_1 = A[5];
a_2 = A[6];
a_3 = A[7];
b_0 = B[4];
b_1 = B[5];
b_2 = B[6];
b_3 = B[7];
c_00 += a_0 * b_0;
c_10 += a_1 * b_0;
c_20 += a_2 * b_0;
c_30 += a_3 * b_0;
c_01 += a_0 * b_1;
c_11 += a_1 * b_1;
c_21 += a_2 * b_1;
c_31 += a_3 * b_1;
c_02 += a_0 * b_2;
c_12 += a_1 * b_2;
c_22 += a_2 * b_2;
c_32 += a_3 * b_2;
c_03 += a_0 * b_3;
c_13 += a_1 * b_3;
c_23 += a_2 * b_3;
c_33 += a_3 * b_3;
// k = 2
a_0 = A[8];
a_1 = A[9];
a_2 = A[10];
a_3 = A[11];
b_0 = B[8];
b_1 = B[9];
b_2 = B[10];
b_3 = B[11];
c_00 += a_0 * b_0;
c_10 += a_1 * b_0;
c_20 += a_2 * b_0;
c_30 += a_3 * b_0;
c_01 += a_0 * b_1;
c_11 += a_1 * b_1;
c_21 += a_2 * b_1;
c_31 += a_3 * b_1;
c_02 += a_0 * b_2;
c_12 += a_1 * b_2;
c_22 += a_2 * b_2;
c_32 += a_3 * b_2;
c_03 += a_0 * b_3;
c_13 += a_1 * b_3;
c_23 += a_2 * b_3;
c_33 += a_3 * b_3;
// k = 3
a_0 = A[12];
a_1 = A[13];
a_2 = A[14];
a_3 = A[15];
b_0 = B[12];
b_1 = B[13];
b_2 = B[14];
b_3 = B[15];
c_00 += a_0 * b_0;
c_10 += a_1 * b_0;
c_20 += a_2 * b_0;
c_30 += a_3 * b_0;
c_01 += a_0 * b_1;
c_11 += a_1 * b_1;
c_21 += a_2 * b_1;
c_31 += a_3 * b_1;
c_02 += a_0 * b_2;
c_12 += a_1 * b_2;
c_22 += a_2 * b_2;
c_32 += a_3 * b_2;
c_03 += a_0 * b_3;
c_13 += a_1 * b_3;
c_23 += a_2 * b_3;
c_33 += a_3 * b_3;
A += 16;
B += 16;
}
for(; k<kmax; k++)
{
// k = 0
a_0 = A[0];
a_1 = A[1];
a_2 = A[2];
a_3 = A[3];
b_0 = B[0];
b_1 = B[1];
b_2 = B[2];
b_3 = B[3];
c_00 += a_0 * b_0;
c_10 += a_1 * b_0;
c_20 += a_2 * b_0;
c_30 += a_3 * b_0;
c_01 += a_0 * b_1;
c_11 += a_1 * b_1;
c_21 += a_2 * b_1;
c_31 += a_3 * b_1;
c_02 += a_0 * b_2;
c_12 += a_1 * b_2;
c_22 += a_2 * b_2;
c_32 += a_3 * b_2;
c_03 += a_0 * b_3;
c_13 += a_1 * b_3;
c_23 += a_2 * b_3;
c_33 += a_3 * b_3;
A += 4;
B += 4;
}
c_00 = beta[0]*C[0+bs*0] + alpha[0]*c_00;
c_10 = beta[0]*C[1+bs*0] + alpha[0]*c_10;
c_20 = beta[0]*C[2+bs*0] + alpha[0]*c_20;
c_30 = beta[0]*C[3+bs*0] + alpha[0]*c_30;
c_01 = beta[0]*C[0+bs*1] + alpha[0]*c_01;
c_11 = beta[0]*C[1+bs*1] + alpha[0]*c_11;
c_21 = beta[0]*C[2+bs*1] + alpha[0]*c_21;
c_31 = beta[0]*C[3+bs*1] + alpha[0]*c_31;
c_02 = beta[0]*C[0+bs*2] + alpha[0]*c_02;
c_12 = beta[0]*C[1+bs*2] + alpha[0]*c_12;
c_22 = beta[0]*C[2+bs*2] + alpha[0]*c_22;
c_32 = beta[0]*C[3+bs*2] + alpha[0]*c_32;
c_03 = beta[0]*C[0+bs*3] + alpha[0]*c_03;
c_13 = beta[0]*C[1+bs*3] + alpha[0]*c_13;
c_23 = beta[0]*C[2+bs*3] + alpha[0]*c_23;
c_33 = beta[0]*C[3+bs*3] + alpha[0]*c_33;
if(km>=4)
{
D[0+bs*0] = c_00;
D[1+bs*0] = c_10;
D[2+bs*0] = c_20;
D[3+bs*0] = c_30;
if(kn==1)
return;
D[0+bs*1] = c_01;
D[1+bs*1] = c_11;
D[2+bs*1] = c_21;
D[3+bs*1] = c_31;
if(kn==2)
return;
D[0+bs*2] = c_02;
D[1+bs*2] = c_12;
D[2+bs*2] = c_22;
D[3+bs*2] = c_32;
if(kn==3)
return;
D[0+bs*3] = c_03;
D[1+bs*3] = c_13;
D[2+bs*3] = c_23;
D[3+bs*3] = c_33;
}
else if(km>=3)
{
D[0+bs*0] = c_00;
D[1+bs*0] = c_10;
D[2+bs*0] = c_20;
if(kn==1)
return;
D[0+bs*1] = c_01;
D[1+bs*1] = c_11;
D[2+bs*1] = c_21;
if(kn==2)
return;
D[0+bs*2] = c_02;
D[1+bs*2] = c_12;
D[2+bs*2] = c_22;
if(kn==3)
return;
D[0+bs*3] = c_03;
D[1+bs*3] = c_13;
D[2+bs*3] = c_23;
}
else if(km>=2)
{
D[0+bs*0] = c_00;
D[1+bs*0] = c_10;
if(kn==1)
return;
D[0+bs*1] = c_01;
D[1+bs*1] = c_11;
if(kn==2)
return;
D[0+bs*2] = c_02;
D[1+bs*2] = c_12;
if(kn==3)
return;
D[0+bs*3] = c_03;
D[1+bs*3] = c_13;
}
else //if(km>=1)
{
D[0+bs*0] = c_00;
if(kn==1)
return;
D[0+bs*1] = c_01;
if(kn==2)
return;
D[0+bs*2] = c_02;
if(kn==3)
return;
D[0+bs*3] = c_03;
}
}
#endif
#if defined(TARGET_GENERIC) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
void kernel_dtrmm_nt_ru_4x4_lib4(int k, double *alpha, double *A, double *B, double *beta, double *C, double *D)
{
kernel_dtrmm_nt_ru_4x4_vs_lib4(k, alpha, A, B, beta, C, D, 4, 4);
}
#endif
#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
void kernel_dtrmm_nn_rl_4x4_gen_lib4(int kmax, double *alpha, double *A, int offsetB, double *B, int sdb, int offsetD, double *D0, int sdd, int m0, int m1, int n0, int n1)
{
const int bs = 4;
double
a_0, a_1, a_2, a_3,
b_0, b_1, b_2, b_3,
c_00=0, c_01=0, c_02=0, c_03=0,
c_10=0, c_11=0, c_12=0, c_13=0,
c_20=0, c_21=0, c_22=0, c_23=0,
c_30=0, c_31=0, c_32=0, c_33=0;
double *D1;
int k;
B += offsetB;
k = 0;
if(offsetB==0)
{
// k = 0
a_0 = A[0];
a_1 = A[1];
a_2 = A[2];
a_3 = A[3];
b_0 = B[0];
c_00 += a_0 * b_0;
c_10 += a_1 * b_0;
c_20 += a_2 * b_0;
c_30 += a_3 * b_0;
A += 4;
B += 1;
k += 1;
if(k>=kmax)
goto store;
// k = 1
a_0 = A[0];
a_1 = A[1];
a_2 = A[2];
a_3 = A[3];
b_0 = B[0];
c_00 += a_0 * b_0;
c_10 += a_1 * b_0;
c_20 += a_2 * b_0;
c_30 += a_3 * b_0;
b_1 = B[4];
c_01 += a_0 * b_1;
c_11 += a_1 * b_1;
c_21 += a_2 * b_1;
c_31 += a_3 * b_1;
A += 4;
B += 1;
k += 1;
if(k>=kmax)
goto store;
// k = 2
a_0 = A[0];
a_1 = A[1];
a_2 = A[2];
a_3 = A[3];
b_0 = B[0];
c_00 += a_0 * b_0;
c_10 += a_1 * b_0;
c_20 += a_2 * b_0;
c_30 += a_3 * b_0;
b_1 = B[4];
c_01 += a_0 * b_1;
c_11 += a_1 * b_1;
c_21 += a_2 * b_1;
c_31 += a_3 * b_1;
b_2 = B[8];
c_02 += a_0 * b_2;
c_12 += a_1 * b_2;
c_22 += a_2 * b_2;
c_32 += a_3 * b_2;
A += 4;
B += 1;
k += 1;
if(k>=kmax)
goto store;
// k = 3
a_0 = A[0];
a_1 = A[1];
a_2 = A[2];
a_3 = A[3];
b_0 = B[0];
c_00 += a_0 * b_0;
c_10 += a_1 * b_0;
c_20 += a_2 * b_0;
c_30 += a_3 * b_0;
b_1 = B[4];
c_01 += a_0 * b_1;
c_11 += a_1 * b_1;
c_21 += a_2 * b_1;
c_31 += a_3 * b_1;
b_2 = B[8];
c_02 += a_0 * b_2;
c_12 += a_1 * b_2;
c_22 += a_2 * b_2;
c_32 += a_3 * b_2;
b_3 = B[12];
c_03 += a_0 * b_3;
c_13 += a_1 * b_3;
c_23 += a_2 * b_3;
c_33 += a_3 * b_3;
A += 4;
B += 4*sdb-3;
k += 1;
}
else if(offsetB==1)
{
// k = 0
a_0 = A[0];
a_1 = A[1];
a_2 = A[2];
a_3 = A[3];
b_0 = B[0];
c_00 += a_0 * b_0;
c_10 += a_1 * b_0;
c_20 += a_2 * b_0;
c_30 += a_3 * b_0;
A += 4;
B += 1;
k += 1;
if(k>=kmax)
goto store;
// k = 1
a_0 = A[0];
a_1 = A[1];
a_2 = A[2];
a_3 = A[3];
b_0 = B[0];
c_00 += a_0 * b_0;
c_10 += a_1 * b_0;
c_20 += a_2 * b_0;
c_30 += a_3 * b_0;
b_1 = B[4];
c_01 += a_0 * b_1;
c_11 += a_1 * b_1;
c_21 += a_2 * b_1;
c_31 += a_3 * b_1;
A += 4;
B += 1;
k += 1;
if(k>=kmax)
goto store;
// k = 2
a_0 = A[0];
a_1 = A[1];
a_2 = A[2];
a_3 = A[3];
b_0 = B[0];
c_00 += a_0 * b_0;
c_10 += a_1 * b_0;
c_20 += a_2 * b_0;
c_30 += a_3 * b_0;
b_1 = B[4];
c_01 += a_0 * b_1;
c_11 += a_1 * b_1;
c_21 += a_2 * b_1;
c_31 += a_3 * b_1;
b_2 = B[8];
c_02 += a_0 * b_2;
c_12 += a_1 * b_2;
c_22 += a_2 * b_2;
c_32 += a_3 * b_2;
A += 4;
B += 4*sdb-3;
k += 1;
}
else if(offsetB==2)
{
// k = 0
a_0 = A[0];
a_1 = A[1];
a_2 = A[2];
a_3 = A[3];
b_0 = B[0];
c_00 += a_0 * b_0;
c_10 += a_1 * b_0;
c_20 += a_2 * b_0;
c_30 += a_3 * b_0;
A += 4;
B += 1;
k += 1;
if(k>=kmax)
goto store;
// k = 1
a_0 = A[0];
a_1 = A[1];
a_2 = A[2];
a_3 = A[3];
b_0 = B[0];
c_00 += a_0 * b_0;
c_10 += a_1 * b_0;
c_20 += a_2 * b_0;
c_30 += a_3 * b_0;
b_1 = B[4];
c_01 += a_0 * b_1;
c_11 += a_1 * b_1;
c_21 += a_2 * b_1;
c_31 += a_3 * b_1;
A += 4;
B += 4*sdb-3;
k += 1;
if(k>=kmax)
goto store;
// k = 2
a_0 = A[0];
a_1 = A[1];
a_2 = A[2];
a_3 = A[3];
b_0 = B[0];
c_00 += a_0 * b_0;
c_10 += a_1 * b_0;
c_20 += a_2 * b_0;
c_30 += a_3 * b_0;
b_1 = B[4];
c_01 += a_0 * b_1;
c_11 += a_1 * b_1;
c_21 += a_2 * b_1;
c_31 += a_3 * b_1;
b_2 = B[8];
c_02 += a_0 * b_2;
c_12 += a_1 * b_2;
c_22 += a_2 * b_2;
c_32 += a_3 * b_2;
A += 4;
B += 1;
k += 1;
if(k>=kmax)
goto store;
// k = 3
a_0 = A[0];
a_1 = A[1];
a_2 = A[2];
a_3 = A[3];
b_0 = B[0];
c_00 += a_0 * b_0;
c_10 += a_1 * b_0;
c_20 += a_2 * b_0;
c_30 += a_3 * b_0;
b_1 = B[4];
c_01 += a_0 * b_1;
c_11 += a_1 * b_1;
c_21 += a_2 * b_1;
c_31 += a_3 * b_1;
b_2 = B[8];
c_02 += a_0 * b_2;
c_12 += a_1 * b_2;
c_22 += a_2 * b_2;
c_32 += a_3 * b_2;
b_3 = B[12];
c_03 += a_0 * b_3;
c_13 += a_1 * b_3;
c_23 += a_2 * b_3;
c_33 += a_3 * b_3;
A += 4;
B += 1;
k += 1;
if(k>=kmax)
goto store;
// k = 4
a_0 = A[0];
a_1 = A[1];
a_2 = A[2];
a_3 = A[3];
b_0 = B[0];
c_00 += a_0 * b_0;
c_10 += a_1 * b_0;
c_20 += a_2 * b_0;
c_30 += a_3 * b_0;
b_1 = B[4];
c_01 += a_0 * b_1;
c_11 += a_1 * b_1;
c_21 += a_2 * b_1;
c_31 += a_3 * b_1;
b_2 = B[8];
c_02 += a_0 * b_2;
c_12 += a_1 * b_2;
c_22 += a_2 * b_2;
c_32 += a_3 * b_2;
b_3 = B[12];
c_03 += a_0 * b_3;
c_13 += a_1 * b_3;
c_23 += a_2 * b_3;
c_33 += a_3 * b_3;
A += 4;
B += 1;
k += 1;
if(k>=kmax)
goto store;
// k = 5
a_0 = A[0];
a_1 = A[1];
a_2 = A[2];
a_3 = A[3];
b_0 = B[0];
c_00 += a_0 * b_0;
c_10 += a_1 * b_0;
c_20 += a_2 * b_0;
c_30 += a_3 * b_0;
b_1 = B[4];
c_01 += a_0 * b_1;
c_11 += a_1 * b_1;
c_21 += a_2 * b_1;
c_31 += a_3 * b_1;
b_2 = B[8];
c_02 += a_0 * b_2;
c_12 += a_1 * b_2;
c_22 += a_2 * b_2;
c_32 += a_3 * b_2;
b_3 = B[12];
c_03 += a_0 * b_3;
c_13 += a_1 * b_3;
c_23 += a_2 * b_3;
c_33 += a_3 * b_3;
A += 4;
B += 4*sdb-3;
k += 1;
}
else // if(offetB==3)
{
// k = 0
a_0 = A[0];
a_1 = A[1];
a_2 = A[2];
a_3 = A[3];
b_0 = B[0];
c_00 += a_0 * b_0;
c_10 += a_1 * b_0;
c_20 += a_2 * b_0;
c_30 += a_3 * b_0;
A += 4;
B += 4*sdb-3;
k += 1;
if(k>=kmax)
goto store;
// k = 1
a_0 = A[0];
a_1 = A[1];
a_2 = A[2];
a_3 = A[3];
b_0 = B[0];
c_00 += a_0 * b_0;
c_10 += a_1 * b_0;
c_20 += a_2 * b_0;
c_30 += a_3 * b_0;
b_1 = B[4];
c_01 += a_0 * b_1;
c_11 += a_1 * b_1;
c_21 += a_2 * b_1;
c_31 += a_3 * b_1;
A += 4;
B += 1;
k += 1;
if(k>=kmax)
goto store;
// k = 2
a_0 = A[0];
a_1 = A[1];
a_2 = A[2];
a_3 = A[3];
b_0 = B[0];
c_00 += a_0 * b_0;
c_10 += a_1 * b_0;
c_20 += a_2 * b_0;
c_30 += a_3 * b_0;
b_1 = B[4];
c_01 += a_0 * b_1;
c_11 += a_1 * b_1;
c_21 += a_2 * b_1;
c_31 += a_3 * b_1;
b_2 = B[8];
c_02 += a_0 * b_2;
c_12 += a_1 * b_2;
c_22 += a_2 * b_2;
c_32 += a_3 * b_2;
A += 4;
B += 1;
k += 1;
if(k>=kmax)
goto store;
// k = 3
a_0 = A[0];
a_1 = A[1];
a_2 = A[2];
a_3 = A[3];
b_0 = B[0];
c_00 += a_0 * b_0;
c_10 += a_1 * b_0;
c_20 += a_2 * b_0;
c_30 += a_3 * b_0;
b_1 = B[4];
c_01 += a_0 * b_1;
c_11 += a_1 * b_1;
c_21 += a_2 * b_1;
c_31 += a_3 * b_1;
b_2 = B[8];
c_02 += a_0 * b_2;
c_12 += a_1 * b_2;
c_22 += a_2 * b_2;
c_32 += a_3 * b_2;
b_3 = B[12];
c_03 += a_0 * b_3;
c_13 += a_1 * b_3;
c_23 += a_2 * b_3;
c_33 += a_3 * b_3;
A += 4;
B += 1;
k += 1;
if(k>=kmax)
goto store;
// k = 4
a_0 = A[0];
a_1 = A[1];
a_2 = A[2];
a_3 = A[3];
b_0 = B[0];
c_00 += a_0 * b_0;
c_10 += a_1 * b_0;
c_20 += a_2 * b_0;
c_30 += a_3 * b_0;
b_1 = B[4];
c_01 += a_0 * b_1;
c_11 += a_1 * b_1;
c_21 += a_2 * b_1;
c_31 += a_3 * b_1;
b_2 = B[8];
c_02 += a_0 * b_2;
c_12 += a_1 * b_2;
c_22 += a_2 * b_2;
c_32 += a_3 * b_2;
b_3 = B[12];
c_03 += a_0 * b_3;
c_13 += a_1 * b_3;
c_23 += a_2 * b_3;
c_33 += a_3 * b_3;
A += 4;
B += 4*sdb-3;
k += 1;
}
for(; k<kmax-3; k+=4)
{
// k = 0
a_0 = A[0];
a_1 = A[1];
a_2 = A[2];
a_3 = A[3];
b_0 = B[0];
b_1 = B[4];
b_2 = B[8];
b_3 = B[12];
c_00 += a_0 * b_0;
c_10 += a_1 * b_0;
c_20 += a_2 * b_0;
c_30 += a_3 * b_0;
c_01 += a_0 * b_1;
c_11 += a_1 * b_1;
c_21 += a_2 * b_1;
c_31 += a_3 * b_1;
c_02 += a_0 * b_2;
c_12 += a_1 * b_2;
c_22 += a_2 * b_2;
c_32 += a_3 * b_2;
c_03 += a_0 * b_3;
c_13 += a_1 * b_3;
c_23 += a_2 * b_3;
c_33 += a_3 * b_3;
// k = 1
a_0 = A[4];
a_1 = A[5];
a_2 = A[6];
a_3 = A[7];
b_0 = B[1];
b_1 = B[5];
b_2 = B[9];
b_3 = B[13];
c_00 += a_0 * b_0;
c_10 += a_1 * b_0;
c_20 += a_2 * b_0;
c_30 += a_3 * b_0;
c_01 += a_0 * b_1;
c_11 += a_1 * b_1;
c_21 += a_2 * b_1;
c_31 += a_3 * b_1;
c_02 += a_0 * b_2;
c_12 += a_1 * b_2;
c_22 += a_2 * b_2;
c_32 += a_3 * b_2;
c_03 += a_0 * b_3;
c_13 += a_1 * b_3;
c_23 += a_2 * b_3;
c_33 += a_3 * b_3;
// k = 2
a_0 = A[8];
a_1 = A[9];
a_2 = A[10];
a_3 = A[11];
b_0 = B[2];
b_1 = B[6];
b_2 = B[10];
b_3 = B[14];
c_00 += a_0 * b_0;
c_10 += a_1 * b_0;
c_20 += a_2 * b_0;
c_30 += a_3 * b_0;
c_01 += a_0 * b_1;
c_11 += a_1 * b_1;
c_21 += a_2 * b_1;
c_31 += a_3 * b_1;
c_02 += a_0 * b_2;
c_12 += a_1 * b_2;
c_22 += a_2 * b_2;
c_32 += a_3 * b_2;
c_03 += a_0 * b_3;
c_13 += a_1 * b_3;
c_23 += a_2 * b_3;
c_33 += a_3 * b_3;
// k = 3
a_0 = A[12];
a_1 = A[13];
a_2 = A[14];
a_3 = A[15];
b_0 = B[3];
b_1 = B[7];
b_2 = B[11];
b_3 = B[15];
c_00 += a_0 * b_0;
c_10 += a_1 * b_0;
c_20 += a_2 * b_0;
c_30 += a_3 * b_0;
c_01 += a_0 * b_1;
c_11 += a_1 * b_1;
c_21 += a_2 * b_1;
c_31 += a_3 * b_1;
c_02 += a_0 * b_2;
c_12 += a_1 * b_2;
c_22 += a_2 * b_2;
c_32 += a_3 * b_2;
c_03 += a_0 * b_3;
c_13 += a_1 * b_3;
c_23 += a_2 * b_3;
c_33 += a_3 * b_3;
A += 16;
B += 4*sdb;
}
for(; k<kmax; k++)
{
// k = 0
a_0 = A[0];
a_1 = A[1];
a_2 = A[2];
a_3 = A[3];
b_0 = B[0];
b_1 = B[4];
b_2 = B[8];
b_3 = B[12];
c_00 += a_0 * b_0;
c_10 += a_1 * b_0;
c_20 += a_2 * b_0;
c_30 += a_3 * b_0;
c_01 += a_0 * b_1;
c_11 += a_1 * b_1;
c_21 += a_2 * b_1;
c_31 += a_3 * b_1;
c_02 += a_0 * b_2;
c_12 += a_1 * b_2;
c_22 += a_2 * b_2;
c_32 += a_3 * b_2;
c_03 += a_0 * b_3;
c_13 += a_1 * b_3;
c_23 += a_2 * b_3;
c_33 += a_3 * b_3;
A += 4;
B += 1;
}
store:
c_00 = alpha[0]*c_00;
c_10 = alpha[0]*c_10;
c_20 = alpha[0]*c_20;
c_30 = alpha[0]*c_30;
c_01 = alpha[0]*c_01;
c_11 = alpha[0]*c_11;
c_21 = alpha[0]*c_21;
c_31 = alpha[0]*c_31;
c_02 = alpha[0]*c_02;
c_12 = alpha[0]*c_12;
c_22 = alpha[0]*c_22;
c_32 = alpha[0]*c_32;
c_03 = alpha[0]*c_03;
c_13 = alpha[0]*c_13;
c_23 = alpha[0]*c_23;
c_33 = alpha[0]*c_33;
// shift sol for cols
if(n0>0)
{
if(n0==1)
{
c_00 = c_01;
c_10 = c_11;
c_20 = c_21;
c_30 = c_31;
c_01 = c_02;
c_11 = c_12;
c_21 = c_22;
c_31 = c_32;
c_02 = c_03;
c_12 = c_13;
c_22 = c_23;
c_32 = c_33;
D0 += 1*bs;
}
else if(n0==2)
{
c_00 = c_02;
c_10 = c_12;
c_20 = c_22;
c_30 = c_32;
c_01 = c_03;
c_11 = c_13;
c_21 = c_23;
c_31 = c_33;
D0 += 2*bs;
}
else //if(n0==3)
{
c_00 = c_03;
c_10 = c_13;
c_20 = c_23;
c_30 = c_33;
D0 += 3*bs;
}
}
int kn = n1 - n0;
if(offsetD==0)
{
if(kn<=0)
return;
if(m0<=0 & m1>0) D0[0+bs*0] = c_00;
if(m0<=1 & m1>1) D0[1+bs*0] = c_10;
if(m0<=2 & m1>2) D0[2+bs*0] = c_20;
if(m0<=3 & m1>3) D0[3+bs*0] = c_30;
if(kn<=1)
return;
if(m0<=0 & m1>0) D0[0+bs*1] = c_01;
if(m0<=1 & m1>1) D0[1+bs*1] = c_11;
if(m0<=2 & m1>2) D0[2+bs*1] = c_21;
if(m0<=3 & m1>3) D0[3+bs*1] = c_31;
if(kn<=2)
return;
if(m0<=0 & m1>0) D0[0+bs*2] = c_02;
if(m0<=1 & m1>1) D0[1+bs*2] = c_12;
if(m0<=2 & m1>2) D0[2+bs*2] = c_22;
if(m0<=3 & m1>3) D0[3+bs*2] = c_32;
if(kn<=3)
return;
if(m0<=0 & m1>0) D0[0+bs*3] = c_03;
if(m0<=1 & m1>1) D0[1+bs*3] = c_13;
if(m0<=2 & m1>2) D0[2+bs*3] = c_23;
if(m0<=3 & m1>3) D0[3+bs*3] = c_33;
}
else if(offsetD==1)
{
D1 = D0 + sdd*bs;
if(kn<=0)
return;
if(m0<=0 & m1>0) D0[1+bs*0] = c_00;
if(m0<=1 & m1>1) D0[2+bs*0] = c_10;
if(m0<=2 & m1>2) D0[3+bs*0] = c_20;
if(m0<=3 & m1>3) D1[0+bs*0] = c_30;
if(kn<=1)
return;
if(m0<=0 & m1>0) D0[1+bs*1] = c_01;
if(m0<=1 & m1>1) D0[2+bs*1] = c_11;
if(m0<=2 & m1>2) D0[3+bs*1] = c_21;
if(m0<=3 & m1>3) D1[0+bs*1] = c_31;
if(kn<=2)
return;
if(m0<=0 & m1>0) D0[1+bs*2] = c_02;
if(m0<=1 & m1>1) D0[2+bs*2] = c_12;
if(m0<=2 & m1>2) D0[3+bs*2] = c_22;
if(m0<=3 & m1>3) D1[0+bs*2] = c_32;
if(kn<=3)
return;
if(m0<=0 & m1>0) D0[1+bs*3] = c_03;
if(m0<=1 & m1>1) D0[2+bs*3] = c_13;
if(m0<=2 & m1>2) D0[3+bs*3] = c_23;
if(m0<=3 & m1>3) D1[0+bs*3] = c_33;
}
else if(offsetD==2)
{
D1 = D0 + sdd*bs;
if(kn<=0)
return;
if(m0<=0 & m1>0) D0[2+bs*0] = c_00;
if(m0<=1 & m1>1) D0[3+bs*0] = c_10;
if(m0<=2 & m1>2) D1[0+bs*0] = c_20;
if(m0<=3 & m1>3) D1[1+bs*0] = c_30;
if(kn<=1)
return;
if(m0<=0 & m1>0) D0[2+bs*1] = c_01;
if(m0<=1 & m1>1) D0[3+bs*1] = c_11;
if(m0<=2 & m1>2) D1[0+bs*1] = c_21;
if(m0<=3 & m1>3) D1[1+bs*1] = c_31;
if(kn<=2)
return;
if(m0<=0 & m1>0) D0[2+bs*2] = c_02;
if(m0<=1 & m1>1) D0[3+bs*2] = c_12;
if(m0<=2 & m1>2) D1[0+bs*2] = c_22;
if(m0<=3 & m1>3) D1[1+bs*2] = c_32;
if(kn<=3)
return;
if(m0<=0 & m1>0) D0[2+bs*3] = c_03;
if(m0<=1 & m1>1) D0[3+bs*3] = c_13;
if(m0<=2 & m1>2) D1[0+bs*3] = c_23;
if(m0<=3 & m1>3) D1[1+bs*3] = c_33;
}
else //if(offsetD==3)
{
D1 = D0 + sdd*bs;
if(kn<=0)
return;
if(m0<=0 & m1>0) D0[3+bs*0] = c_00;
if(m0<=1 & m1>1) D1[0+bs*0] = c_10;
if(m0<=2 & m1>2) D1[1+bs*0] = c_20;
if(m0<=3 & m1>3) D1[2+bs*0] = c_30;
if(kn<=1)
return;
if(m0<=0 & m1>0) D0[3+bs*1] = c_01;
if(m0<=1 & m1>1) D1[0+bs*1] = c_11;
if(m0<=2 & m1>2) D1[1+bs*1] = c_21;
if(m0<=3 & m1>3) D1[2+bs*1] = c_31;
if(kn<=2)
return;
if(m0<=0 & m1>0) D0[3+bs*2] = c_02;
if(m0<=1 & m1>1) D1[0+bs*2] = c_12;
if(m0<=2 & m1>2) D1[1+bs*2] = c_22;
if(m0<=3 & m1>3) D1[2+bs*2] = c_32;
if(kn<=3)
return;
if(m0<=0 & m1>0) D0[3+bs*3] = c_03;
if(m0<=1 & m1>1) D1[0+bs*3] = c_13;
if(m0<=2 & m1>2) D1[1+bs*3] = c_23;
if(m0<=3 & m1>3) D1[2+bs*3] = c_33;
}
return;
}
#endif
#if defined(TARGET_GENERIC) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
void kernel_dtrmm_nn_rl_4x4_lib4(int kmax, double *alpha, double *A, int offsetB, double *B, int sdb, double *D)
{
kernel_dtrmm_nn_rl_4x4_gen_lib4(kmax, alpha, A, offsetB, B, sdb, 0, D, 0, 0, 4, 0, 4);
}
#endif
#if defined(TARGET_GENERIC) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
void kernel_dpotrf_nt_l_4x4_vs_lib4(int kmax, double *A, double *B, double *C, double *D, double *inv_diag_D, int km, int kn)
{
const int bs = 4;
double
a_0, a_1, a_2, a_3,
b_0, b_1, b_2, b_3,
tmp,
c_00=0, //c_01=0, c_02=0, c_03=0,
c_10=0, c_11=0, //c_12=0, c_13=0,
c_20=0, c_21=0, c_22=0, //c_23=0,
c_30=0, c_31=0, c_32=0, c_33=0;
int k;
for(k=0; k<kmax-3; k+=4)
{
// k = 0
a_0 = A[0];
a_1 = A[1];
a_2 = A[2];
a_3 = A[3];
b_0 = B[0];
b_1 = B[1];
b_2 = B[2];
b_3 = B[3];
c_00 -= a_0 * b_0;
c_10 -= a_1 * b_0;
c_20 -= a_2 * b_0;
c_30 -= a_3 * b_0;
// c_01 -= a_0 * b_1;
c_11 -= a_1 * b_1;
c_21 -= a_2 * b_1;
c_31 -= a_3 * b_1;
// c_02 -= a_0 * b_2;
// c_12 -= a_1 * b_2;
c_22 -= a_2 * b_2;
c_32 -= a_3 * b_2;
// c_03 -= a_0 * b_3;
// c_13 -= a_1 * b_3;
// c_23 -= a_2 * b_3;
c_33 -= a_3 * b_3;
// k = 1
a_0 = A[4];
a_1 = A[5];
a_2 = A[6];
a_3 = A[7];
b_0 = B[4];
b_1 = B[5];
b_2 = B[6];
b_3 = B[7];
c_00 -= a_0 * b_0;
c_10 -= a_1 * b_0;
c_20 -= a_2 * b_0;
c_30 -= a_3 * b_0;
// c_01 -= a_0 * b_1;
c_11 -= a_1 * b_1;
c_21 -= a_2 * b_1;
c_31 -= a_3 * b_1;
// c_02 -= a_0 * b_2;
// c_12 -= a_1 * b_2;
c_22 -= a_2 * b_2;
c_32 -= a_3 * b_2;
// c_03 -= a_0 * b_3;
// c_13 -= a_1 * b_3;
// c_23 -= a_2 * b_3;
c_33 -= a_3 * b_3;
// k = 2
a_0 = A[8];
a_1 = A[9];
a_2 = A[10];
a_3 = A[11];
b_0 = B[8];
b_1 = B[9];
b_2 = B[10];
b_3 = B[11];
c_00 -= a_0 * b_0;
c_10 -= a_1 * b_0;
c_20 -= a_2 * b_0;
c_30 -= a_3 * b_0;
// c_01 -= a_0 * b_1;
c_11 -= a_1 * b_1;
c_21 -= a_2 * b_1;
c_31 -= a_3 * b_1;
// c_02 -= a_0 * b_2;
// c_12 -= a_1 * b_2;
c_22 -= a_2 * b_2;
c_32 -= a_3 * b_2;
// c_03 -= a_0 * b_3;
// c_13 -= a_1 * b_3;
// c_23 -= a_2 * b_3;
c_33 -= a_3 * b_3;
// k = 3
a_0 = A[12];
a_1 = A[13];
a_2 = A[14];
a_3 = A[15];
b_0 = B[12];
b_1 = B[13];
b_2 = B[14];
b_3 = B[15];
c_00 -= a_0 * b_0;
c_10 -= a_1 * b_0;
c_20 -= a_2 * b_0;
c_30 -= a_3 * b_0;
// c_01 -= a_0 * b_1;
c_11 -= a_1 * b_1;
c_21 -= a_2 * b_1;
c_31 -= a_3 * b_1;
// c_02 -= a_0 * b_2;
// c_12 -= a_1 * b_2;
c_22 -= a_2 * b_2;
c_32 -= a_3 * b_2;
// c_03 -= a_0 * b_3;
// c_13 -= a_1 * b_3;
// c_23 -= a_2 * b_3;
c_33 -= a_3 * b_3;
A += 16;
B += 16;
}
for(; k<kmax; k++)
{
// k = 0
a_0 = A[0];
a_1 = A[1];
a_2 = A[2];
a_3 = A[3];
b_0 = B[0];
b_1 = B[1];
b_2 = B[2];
b_3 = B[3];
c_00 -= a_0 * b_0;
c_10 -= a_1 * b_0;
c_20 -= a_2 * b_0;
c_30 -= a_3 * b_0;
// c_01 -= a_0 * b_1;
c_11 -= a_1 * b_1;
c_21 -= a_2 * b_1;
c_31 -= a_3 * b_1;
// c_02 -= a_0 * b_2;
// c_12 -= a_1 * b_2;
c_22 -= a_2 * b_2;
c_32 -= a_3 * b_2;
// c_03 -= a_0 * b_3;
// c_13 -= a_1 * b_3;
// c_23 -= a_2 * b_3;
c_33 -= a_3 * b_3;
A += 4;
B += 4;
}
c_00 = C[0+bs*0] + c_00;
c_10 = C[1+bs*0] + c_10;
c_20 = C[2+bs*0] + c_20;
c_30 = C[3+bs*0] + c_30;
// c_01 = C[0+bs*1] + c_01;
c_11 = C[1+bs*1] + c_11;
c_21 = C[2+bs*1] + c_21;
c_31 = C[3+bs*1] + c_31;
// c_02 = C[0+bs*2] + c_02;
// c_12 = C[1+bs*2] + c_12;
c_22 = C[2+bs*2] + c_22;
c_32 = C[3+bs*2] + c_32;
// c_03 = C[0+bs*3] + c_03;
// c_13 = C[1+bs*3] + c_13;
// c_23 = C[2+bs*3] + c_23;
c_33 = C[3+bs*3] + c_33;
if(c_00>0)
{
c_00 = sqrt(c_00);
tmp = 1.0/c_00;
}
else
{
c_00 = 0.0;
tmp = 0.0;
}
c_10 *= tmp;
c_20 *= tmp;
c_30 *= tmp;
inv_diag_D[0] = tmp;
if(kn==1)
goto store;
c_11 -= c_10 * c_10;
c_21 -= c_20 * c_10;
c_31 -= c_30 * c_10;
if(c_11>0)
{
c_11 = sqrt(c_11);
tmp = 1.0/c_11;
}
else
{
c_11 = 0.0;
tmp = 0.0;
}
c_21 *= tmp;
c_31 *= tmp;
inv_diag_D[1] = tmp;
if(kn==2)
goto store;
c_22 -= c_20 * c_20;
c_32 -= c_30 * c_20;
c_22 -= c_21 * c_21;
c_32 -= c_31 * c_21;
if(c_22>0)
{
c_22 = sqrt(c_22);
tmp = 1.0/c_22;
}
else
{
c_22 = 0.0;
tmp = 0.0;
}
c_32 *= tmp;
inv_diag_D[2] = tmp;
if(kn==3)
goto store;
c_33 -= c_30 * c_30;
c_33 -= c_31 * c_31;
c_33 -= c_32 * c_32;
if(c_33>0)
{
c_33 = sqrt(c_33);
tmp = 1.0/c_33;
}
else
{
c_33 = 0.0;
tmp = 0.0;
}
inv_diag_D[3] = tmp;
store:
if(km>=4)
{
D[0+bs*0] = c_00;
D[1+bs*0] = c_10;
D[2+bs*0] = c_20;
D[3+bs*0] = c_30;
if(kn==1)
return;
// D[0+bs*1] = c_01;
D[1+bs*1] = c_11;
D[2+bs*1] = c_21;
D[3+bs*1] = c_31;
if(kn==2)
return;
// D[0+bs*2] = c_02;
// D[1+bs*2] = c_12;
D[2+bs*2] = c_22;
D[3+bs*2] = c_32;
if(kn==3)
return;
// D[0+bs*3] = c_03;
// D[1+bs*3] = c_13;
// D[2+bs*3] = c_23;
D[3+bs*3] = c_33;
}
else if(km>=3)
{
D[0+bs*0] = c_00;
D[1+bs*0] = c_10;
D[2+bs*0] = c_20;
if(kn==1)
return;
// D[0+bs*1] = c_01;
D[1+bs*1] = c_11;
D[2+bs*1] = c_21;
if(kn==2)
return;
// D[0+bs*2] = c_02;
// D[1+bs*2] = c_12;
D[2+bs*2] = c_22;
// if(kn==3)
// return;
// D[0+bs*3] = c_03;
// D[1+bs*3] = c_13;
// D[2+bs*3] = c_23;
}
else if(km>=2)
{
D[0+bs*0] = c_00;
D[1+bs*0] = c_10;
if(kn==1)
return;
// D[0+bs*1] = c_01;
D[1+bs*1] = c_11;
// if(kn==2)
// return;
// D[0+bs*2] = c_02;
// D[1+bs*2] = c_12;
// if(kn==3)
// return;
// D[0+bs*3] = c_03;
// D[1+bs*3] = c_13;
}
else //if(km>=1)
{
D[0+bs*0] = c_00;
// if(kn==1)
// return;
// D[0+bs*1] = c_01;
// if(kn==2)
// return;
// D[0+bs*2] = c_02;
// if(kn==3)
// return;
// D[0+bs*3] = c_03;
}
}
#endif
#if defined(TARGET_GENERIC) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
void kernel_dpotrf_nt_l_4x4_lib4(int kmax, double *A, double *B, double *C, double *D, double *inv_diag_D)
{
kernel_dpotrf_nt_l_4x4_vs_lib4(kmax, A, B, C, D, inv_diag_D, 4, 4);
}
#endif
#if defined(TARGET_GENERIC) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
void kernel_dsyrk_dpotrf_nt_l_4x4_vs_lib4(int kp, double *Ap, double *Bp, int km_, double *Am, double *Bm, double *C, double *D, double *inv_diag_D, int km, int kn)
{
double alpha = 1.0;
double beta = 1.0;
kernel_dsyrk_nt_l_4x4_vs_lib4(kp, &alpha, Ap, Bp, &beta, C, D, km, kn);
kernel_dpotrf_nt_l_4x4_vs_lib4(km_, Am, Bm, D, D, inv_diag_D, km, kn);
}
#endif
#if defined(TARGET_GENERIC) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
void kernel_dsyrk_dpotrf_nt_l_4x4_lib4(int kp, double *Ap, double *Bp, int km_, double *Am, double *Bm, double *C, double *D, double *inv_diag_D)
{
double alpha = 1.0;
double beta = 1.0;
kernel_dsyrk_nt_l_4x4_lib4(kp, &alpha, Ap, Bp, &beta, C, D);
kernel_dpotrf_nt_l_4x4_lib4(km_, Am, Bm, D, D, inv_diag_D);
}
#endif
#if defined(TARGET_GENERIC) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
void kernel_dtrsm_nt_rl_inv_4x4_vs_lib4(int kmax, double *A, double *B, double *C, double *D, double *E, double *inv_diag_E, int km, int kn)
{
const int bs = 4;
double
a_0, a_1, a_2, a_3,
b_0, b_1, b_2, b_3,
tmp,
c_00=0, c_01=0, c_02=0, c_03=0,
c_10=0, c_11=0, c_12=0, c_13=0,
c_20=0, c_21=0, c_22=0, c_23=0,
c_30=0, c_31=0, c_32=0, c_33=0;
int k;
for(k=0; k<kmax-3; k+=4)
{
// k = 0
a_0 = A[0];
a_1 = A[1];
a_2 = A[2];
a_3 = A[3];
b_0 = B[0];
b_1 = B[1];
b_2 = B[2];
b_3 = B[3];
c_00 -= a_0 * b_0;
c_10 -= a_1 * b_0;
c_20 -= a_2 * b_0;
c_30 -= a_3 * b_0;
c_01 -= a_0 * b_1;
c_11 -= a_1 * b_1;
c_21 -= a_2 * b_1;
c_31 -= a_3 * b_1;
c_02 -= a_0 * b_2;
c_12 -= a_1 * b_2;
c_22 -= a_2 * b_2;
c_32 -= a_3 * b_2;
c_03 -= a_0 * b_3;
c_13 -= a_1 * b_3;
c_23 -= a_2 * b_3;
c_33 -= a_3 * b_3;
// k = 1
a_0 = A[4];
a_1 = A[5];
a_2 = A[6];
a_3 = A[7];
b_0 = B[4];
b_1 = B[5];
b_2 = B[6];
b_3 = B[7];
c_00 -= a_0 * b_0;
c_10 -= a_1 * b_0;
c_20 -= a_2 * b_0;
c_30 -= a_3 * b_0;
c_01 -= a_0 * b_1;
c_11 -= a_1 * b_1;
c_21 -= a_2 * b_1;
c_31 -= a_3 * b_1;
c_02 -= a_0 * b_2;
c_12 -= a_1 * b_2;
c_22 -= a_2 * b_2;
c_32 -= a_3 * b_2;
c_03 -= a_0 * b_3;
c_13 -= a_1 * b_3;
c_23 -= a_2 * b_3;
c_33 -= a_3 * b_3;
// k = 2
a_0 = A[8];
a_1 = A[9];
a_2 = A[10];
a_3 = A[11];
b_0 = B[8];
b_1 = B[9];
b_2 = B[10];
b_3 = B[11];
c_00 -= a_0 * b_0;
c_10 -= a_1 * b_0;
c_20 -= a_2 * b_0;
c_30 -= a_3 * b_0;
c_01 -= a_0 * b_1;
c_11 -= a_1 * b_1;
c_21 -= a_2 * b_1;
c_31 -= a_3 * b_1;
c_02 -= a_0 * b_2;
c_12 -= a_1 * b_2;
c_22 -= a_2 * b_2;
c_32 -= a_3 * b_2;
c_03 -= a_0 * b_3;
c_13 -= a_1 * b_3;
c_23 -= a_2 * b_3;
c_33 -= a_3 * b_3;
// k = 3
a_0 = A[12];
a_1 = A[13];
a_2 = A[14];
a_3 = A[15];
b_0 = B[12];
b_1 = B[13];
b_2 = B[14];
b_3 = B[15];
c_00 -= a_0 * b_0;
c_10 -= a_1 * b_0;
c_20 -= a_2 * b_0;
c_30 -= a_3 * b_0;
c_01 -= a_0 * b_1;
c_11 -= a_1 * b_1;
c_21 -= a_2 * b_1;
c_31 -= a_3 * b_1;
c_02 -= a_0 * b_2;
c_12 -= a_1 * b_2;
c_22 -= a_2 * b_2;
c_32 -= a_3 * b_2;
c_03 -= a_0 * b_3;
c_13 -= a_1 * b_3;
c_23 -= a_2 * b_3;
c_33 -= a_3 * b_3;
A += 16;
B += 16;
}
for(; k<kmax; k++)
{
// k = 0
a_0 = A[0];
a_1 = A[1];
a_2 = A[2];
a_3 = A[3];
b_0 = B[0];
b_1 = B[1];
b_2 = B[2];
b_3 = B[3];
c_00 -= a_0 * b_0;
c_10 -= a_1 * b_0;
c_20 -= a_2 * b_0;
c_30 -= a_3 * b_0;
c_01 -= a_0 * b_1;
c_11 -= a_1 * b_1;
c_21 -= a_2 * b_1;
c_31 -= a_3 * b_1;
c_02 -= a_0 * b_2;
c_12 -= a_1 * b_2;
c_22 -= a_2 * b_2;
c_32 -= a_3 * b_2;
c_03 -= a_0 * b_3;
c_13 -= a_1 * b_3;
c_23 -= a_2 * b_3;
c_33 -= a_3 * b_3;
A += 4;
B += 4;
}
c_00 = C[0+bs*0] + c_00;
c_10 = C[1+bs*0] + c_10;
c_20 = C[2+bs*0] + c_20;
c_30 = C[3+bs*0] + c_30;
c_01 = C[0+bs*1] + c_01;
c_11 = C[1+bs*1] + c_11;
c_21 = C[2+bs*1] + c_21;
c_31 = C[3+bs*1] + c_31;
c_02 = C[0+bs*2] + c_02;
c_12 = C[1+bs*2] + c_12;
c_22 = C[2+bs*2] + c_22;
c_32 = C[3+bs*2] + c_32;
c_03 = C[0+bs*3] + c_03;
c_13 = C[1+bs*3] + c_13;
c_23 = C[2+bs*3] + c_23;
c_33 = C[3+bs*3] + c_33;
tmp = inv_diag_E[0];
c_00 *= tmp;
c_10 *= tmp;
c_20 *= tmp;
c_30 *= tmp;
if(kn==1)
goto store;
tmp = E[1+bs*0];
c_01 -= c_00 * tmp;
c_11 -= c_10 * tmp;
c_21 -= c_20 * tmp;
c_31 -= c_30 * tmp;
tmp = inv_diag_E[1];
c_01 *= tmp;
c_11 *= tmp;
c_21 *= tmp;
c_31 *= tmp;
if(kn==2)
goto store;
tmp = E[2+bs*0];
c_02 -= c_00 * tmp;
c_12 -= c_10 * tmp;
c_22 -= c_20 * tmp;
c_32 -= c_30 * tmp;
tmp = E[2+bs*1];
c_02 -= c_01 * tmp;
c_12 -= c_11 * tmp;
c_22 -= c_21 * tmp;
c_32 -= c_31 * tmp;
tmp = inv_diag_E[2];
c_02 *= tmp;
c_12 *= tmp;
c_22 *= tmp;
c_32 *= tmp;
if(kn==3)
goto store;
tmp = E[3+bs*0];
c_03 -= c_00 * tmp;
c_13 -= c_10 * tmp;
c_23 -= c_20 * tmp;
c_33 -= c_30 * tmp;
tmp = E[3+bs*1];
c_03 -= c_01 * tmp;
c_13 -= c_11 * tmp;
c_23 -= c_21 * tmp;
c_33 -= c_31 * tmp;
tmp = E[3+bs*2];
c_03 -= c_02 * tmp;
c_13 -= c_12 * tmp;
c_23 -= c_22 * tmp;
c_33 -= c_32 * tmp;
tmp = inv_diag_E[3];
c_03 *= tmp;
c_13 *= tmp;
c_23 *= tmp;
c_33 *= tmp;
store:
if(km>=4)
{
D[0+bs*0] = c_00;
D[1+bs*0] = c_10;
D[2+bs*0] = c_20;
D[3+bs*0] = c_30;
if(kn==1)
return;
D[0+bs*1] = c_01;
D[1+bs*1] = c_11;
D[2+bs*1] = c_21;
D[3+bs*1] = c_31;
if(kn==2)
return;
D[0+bs*2] = c_02;
D[1+bs*2] = c_12;
D[2+bs*2] = c_22;
D[3+bs*2] = c_32;
if(kn==3)
return;
D[0+bs*3] = c_03;
D[1+bs*3] = c_13;
D[2+bs*3] = c_23;
D[3+bs*3] = c_33;
}
else if(km>=3)
{
D[0+bs*0] = c_00;
D[1+bs*0] = c_10;
D[2+bs*0] = c_20;
if(kn==1)
return;
D[0+bs*1] = c_01;
D[1+bs*1] = c_11;
D[2+bs*1] = c_21;
if(kn==2)
return;
D[0+bs*2] = c_02;
D[1+bs*2] = c_12;
D[2+bs*2] = c_22;
if(kn==3)
return;
D[0+bs*3] = c_03;
D[1+bs*3] = c_13;
D[2+bs*3] = c_23;
}
else if(km>=2)
{
D[0+bs*0] = c_00;
D[1+bs*0] = c_10;
if(kn==1)
return;
D[0+bs*1] = c_01;
D[1+bs*1] = c_11;
if(kn==2)
return;
D[0+bs*2] = c_02;
D[1+bs*2] = c_12;
if(kn==3)
return;
D[0+bs*3] = c_03;
D[1+bs*3] = c_13;
}
else //if(km>=1)
{
D[0+bs*0] = c_00;
if(kn==1)
return;
D[0+bs*1] = c_01;
if(kn==2)
return;
D[0+bs*2] = c_02;
if(kn==3)
return;
D[0+bs*3] = c_03;
}
}
#endif
#if defined(TARGET_GENERIC) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
void kernel_dtrsm_nt_rl_inv_4x4_lib4(int k, double *A, double *B, double *C, double *D, double *E, double *inv_diag_E)
{
kernel_dtrsm_nt_rl_inv_4x4_vs_lib4(k, A, B, C, D, E, inv_diag_E, 4, 4);
}
#endif
#if defined(TARGET_GENERIC) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
void kernel_dgemm_dtrsm_nt_rl_inv_4x4_vs_lib4(int kp, double *Ap, double *Bp, int km_, double *Am, double *Bm, double *C, double *D, double *E, double *inv_diag_E, int km, int kn)
{
double alpha = 1.0;
double beta = 1.0;
kernel_dgemm_nt_4x4_vs_lib4(kp, &alpha, Ap, Bp, &beta, C, D, km, kn);
kernel_dtrsm_nt_rl_inv_4x4_vs_lib4(km_, Am, Bm, D, D, E, inv_diag_E, km, kn);
}
#endif
#if defined(TARGET_GENERIC) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
void kernel_dgemm_dtrsm_nt_rl_inv_4x4_lib4(int kp, double *Ap, double *Bp, int km_, double *Am, double *Bm, double *C, double *D, double *E, double *inv_diag_E)
{
double alpha = 1.0;
double beta = 1.0;
kernel_dgemm_nt_4x4_lib4(kp, &alpha, Ap, Bp, &beta, C, D);
kernel_dtrsm_nt_rl_inv_4x4_lib4(km_, Am, Bm, D, D, E, inv_diag_E);
}
#endif
#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
void kernel_dtrsm_nt_rl_one_4x4_vs_lib4(int kmax, double *A, double *B, double *C, double *D, double *E, int km, int kn)
{
const int bs = 4;
double
a_0, a_1, a_2, a_3,
b_0, b_1, b_2, b_3,
tmp,
c_00=0, c_01=0, c_02=0, c_03=0,
c_10=0, c_11=0, c_12=0, c_13=0,
c_20=0, c_21=0, c_22=0, c_23=0,
c_30=0, c_31=0, c_32=0, c_33=0;
int k;
for(k=0; k<kmax-3; k+=4)
{
// k = 0
a_0 = A[0];
a_1 = A[1];
a_2 = A[2];
a_3 = A[3];
b_0 = B[0];
b_1 = B[1];
b_2 = B[2];
b_3 = B[3];
c_00 -= a_0 * b_0;
c_10 -= a_1 * b_0;
c_20 -= a_2 * b_0;
c_30 -= a_3 * b_0;
c_01 -= a_0 * b_1;
c_11 -= a_1 * b_1;
c_21 -= a_2 * b_1;
c_31 -= a_3 * b_1;
c_02 -= a_0 * b_2;
c_12 -= a_1 * b_2;
c_22 -= a_2 * b_2;
c_32 -= a_3 * b_2;
c_03 -= a_0 * b_3;
c_13 -= a_1 * b_3;
c_23 -= a_2 * b_3;
c_33 -= a_3 * b_3;
// k = 1
a_0 = A[4];
a_1 = A[5];
a_2 = A[6];
a_3 = A[7];
b_0 = B[4];
b_1 = B[5];
b_2 = B[6];
b_3 = B[7];
c_00 -= a_0 * b_0;
c_10 -= a_1 * b_0;
c_20 -= a_2 * b_0;
c_30 -= a_3 * b_0;
c_01 -= a_0 * b_1;
c_11 -= a_1 * b_1;
c_21 -= a_2 * b_1;
c_31 -= a_3 * b_1;
c_02 -= a_0 * b_2;
c_12 -= a_1 * b_2;
c_22 -= a_2 * b_2;
c_32 -= a_3 * b_2;
c_03 -= a_0 * b_3;
c_13 -= a_1 * b_3;
c_23 -= a_2 * b_3;
c_33 -= a_3 * b_3;
// k = 2
a_0 = A[8];
a_1 = A[9];
a_2 = A[10];
a_3 = A[11];
b_0 = B[8];
b_1 = B[9];
b_2 = B[10];
b_3 = B[11];
c_00 -= a_0 * b_0;
c_10 -= a_1 * b_0;
c_20 -= a_2 * b_0;
c_30 -= a_3 * b_0;
c_01 -= a_0 * b_1;
c_11 -= a_1 * b_1;
c_21 -= a_2 * b_1;
c_31 -= a_3 * b_1;
c_02 -= a_0 * b_2;
c_12 -= a_1 * b_2;
c_22 -= a_2 * b_2;
c_32 -= a_3 * b_2;
c_03 -= a_0 * b_3;
c_13 -= a_1 * b_3;
c_23 -= a_2 * b_3;
c_33 -= a_3 * b_3;
// k = 3
a_0 = A[12];
a_1 = A[13];
a_2 = A[14];
a_3 = A[15];
b_0 = B[12];
b_1 = B[13];
b_2 = B[14];
b_3 = B[15];
c_00 -= a_0 * b_0;
c_10 -= a_1 * b_0;
c_20 -= a_2 * b_0;
c_30 -= a_3 * b_0;
c_01 -= a_0 * b_1;
c_11 -= a_1 * b_1;
c_21 -= a_2 * b_1;
c_31 -= a_3 * b_1;
c_02 -= a_0 * b_2;
c_12 -= a_1 * b_2;
c_22 -= a_2 * b_2;
c_32 -= a_3 * b_2;
c_03 -= a_0 * b_3;
c_13 -= a_1 * b_3;
c_23 -= a_2 * b_3;
c_33 -= a_3 * b_3;
A += 16;
B += 16;
}
for(; k<kmax; k++)
{
// k = 0
a_0 = A[0];
a_1 = A[1];
a_2 = A[2];
a_3 = A[3];
b_0 = B[0];
b_1 = B[1];
b_2 = B[2];
b_3 = B[3];
c_00 -= a_0 * b_0;
c_10 -= a_1 * b_0;
c_20 -= a_2 * b_0;
c_30 -= a_3 * b_0;
c_01 -= a_0 * b_1;
c_11 -= a_1 * b_1;
c_21 -= a_2 * b_1;
c_31 -= a_3 * b_1;
c_02 -= a_0 * b_2;
c_12 -= a_1 * b_2;
c_22 -= a_2 * b_2;
c_32 -= a_3 * b_2;
c_03 -= a_0 * b_3;
c_13 -= a_1 * b_3;
c_23 -= a_2 * b_3;
c_33 -= a_3 * b_3;
A += 4;
B += 4;
}
c_00 = C[0+bs*0] + c_00;
c_10 = C[1+bs*0] + c_10;
c_20 = C[2+bs*0] + c_20;
c_30 = C[3+bs*0] + c_30;
c_01 = C[0+bs*1] + c_01;
c_11 = C[1+bs*1] + c_11;
c_21 = C[2+bs*1] + c_21;
c_31 = C[3+bs*1] + c_31;
c_02 = C[0+bs*2] + c_02;
c_12 = C[1+bs*2] + c_12;
c_22 = C[2+bs*2] + c_22;
c_32 = C[3+bs*2] + c_32;
c_03 = C[0+bs*3] + c_03;
c_13 = C[1+bs*3] + c_13;
c_23 = C[2+bs*3] + c_23;
c_33 = C[3+bs*3] + c_33;
if(kn==1)
goto store;
tmp = E[1+bs*0];
c_01 -= c_00 * tmp;
c_11 -= c_10 * tmp;
c_21 -= c_20 * tmp;
c_31 -= c_30 * tmp;
if(kn==2)
goto store;
tmp = E[2+bs*0];
c_02 -= c_00 * tmp;
c_12 -= c_10 * tmp;
c_22 -= c_20 * tmp;
c_32 -= c_30 * tmp;
tmp = E[2+bs*1];
c_02 -= c_01 * tmp;
c_12 -= c_11 * tmp;
c_22 -= c_21 * tmp;
c_32 -= c_31 * tmp;
if(kn==3)
goto store;
tmp = E[3+bs*0];
c_03 -= c_00 * tmp;
c_13 -= c_10 * tmp;
c_23 -= c_20 * tmp;
c_33 -= c_30 * tmp;
tmp = E[3+bs*1];
c_03 -= c_01 * tmp;
c_13 -= c_11 * tmp;
c_23 -= c_21 * tmp;
c_33 -= c_31 * tmp;
tmp = E[3+bs*2];
c_03 -= c_02 * tmp;
c_13 -= c_12 * tmp;
c_23 -= c_22 * tmp;
c_33 -= c_32 * tmp;
store:
if(km>=4)
{
D[0+bs*0] = c_00;
D[1+bs*0] = c_10;
D[2+bs*0] = c_20;
D[3+bs*0] = c_30;
if(kn==1)
return;
D[0+bs*1] = c_01;
D[1+bs*1] = c_11;
D[2+bs*1] = c_21;
D[3+bs*1] = c_31;
if(kn==2)
return;
D[0+bs*2] = c_02;
D[1+bs*2] = c_12;
D[2+bs*2] = c_22;
D[3+bs*2] = c_32;
if(kn==3)
return;
D[0+bs*3] = c_03;
D[1+bs*3] = c_13;
D[2+bs*3] = c_23;
D[3+bs*3] = c_33;
}
else if(km>=3)
{
D[0+bs*0] = c_00;
D[1+bs*0] = c_10;
D[2+bs*0] = c_20;
if(kn==1)
return;
D[0+bs*1] = c_01;
D[1+bs*1] = c_11;
D[2+bs*1] = c_21;
if(kn==2)
return;
D[0+bs*2] = c_02;
D[1+bs*2] = c_12;
D[2+bs*2] = c_22;
if(kn==3)
return;
D[0+bs*3] = c_03;
D[1+bs*3] = c_13;
D[2+bs*3] = c_23;
}
else if(km>=2)
{
D[0+bs*0] = c_00;
D[1+bs*0] = c_10;
if(kn==1)
return;
D[0+bs*1] = c_01;
D[1+bs*1] = c_11;
if(kn==2)
return;
D[0+bs*2] = c_02;
D[1+bs*2] = c_12;
if(kn==3)
return;
D[0+bs*3] = c_03;
D[1+bs*3] = c_13;
}
else //if(km>=1)
{
D[0+bs*0] = c_00;
if(kn==1)
return;
D[0+bs*1] = c_01;
if(kn==2)
return;
D[0+bs*2] = c_02;
if(kn==3)
return;
D[0+bs*3] = c_03;
}
}
#endif
#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
void kernel_dtrsm_nt_rl_one_4x4_lib4(int k, double *A, double *B, double *C, double *D, double *E)
{
kernel_dtrsm_nt_rl_one_4x4_vs_lib4(k, A, B, C, D, E, 4, 4);
}
#endif
#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
void kernel_dtrsm_nt_ru_inv_4x4_vs_lib4(int kmax, double *A, double *B, double *C, double *D, double *E, double *inv_diag_E, int km, int kn)
{
const int bs = 4;
double
a_0, a_1, a_2, a_3,
b_0, b_1, b_2, b_3,
tmp,
c_00=0, c_01=0, c_02=0, c_03=0,
c_10=0, c_11=0, c_12=0, c_13=0,
c_20=0, c_21=0, c_22=0, c_23=0,
c_30=0, c_31=0, c_32=0, c_33=0;
int k;
for(k=0; k<kmax-3; k+=4)
{
// k = 0
a_0 = A[0];
a_1 = A[1];
a_2 = A[2];
a_3 = A[3];
b_0 = B[0];
b_1 = B[1];
b_2 = B[2];
b_3 = B[3];
c_00 -= a_0 * b_0;
c_10 -= a_1 * b_0;
c_20 -= a_2 * b_0;
c_30 -= a_3 * b_0;
c_01 -= a_0 * b_1;
c_11 -= a_1 * b_1;
c_21 -= a_2 * b_1;
c_31 -= a_3 * b_1;
c_02 -= a_0 * b_2;
c_12 -= a_1 * b_2;
c_22 -= a_2 * b_2;
c_32 -= a_3 * b_2;
c_03 -= a_0 * b_3;
c_13 -= a_1 * b_3;
c_23 -= a_2 * b_3;
c_33 -= a_3 * b_3;
// k = 1
a_0 = A[4];
a_1 = A[5];
a_2 = A[6];
a_3 = A[7];
b_0 = B[4];
b_1 = B[5];
b_2 = B[6];
b_3 = B[7];
c_00 -= a_0 * b_0;
c_10 -= a_1 * b_0;
c_20 -= a_2 * b_0;
c_30 -= a_3 * b_0;
c_01 -= a_0 * b_1;
c_11 -= a_1 * b_1;
c_21 -= a_2 * b_1;
c_31 -= a_3 * b_1;
c_02 -= a_0 * b_2;
c_12 -= a_1 * b_2;
c_22 -= a_2 * b_2;
c_32 -= a_3 * b_2;
c_03 -= a_0 * b_3;
c_13 -= a_1 * b_3;
c_23 -= a_2 * b_3;
c_33 -= a_3 * b_3;
// k = 2
a_0 = A[8];
a_1 = A[9];
a_2 = A[10];
a_3 = A[11];
b_0 = B[8];
b_1 = B[9];
b_2 = B[10];
b_3 = B[11];
c_00 -= a_0 * b_0;
c_10 -= a_1 * b_0;
c_20 -= a_2 * b_0;
c_30 -= a_3 * b_0;
c_01 -= a_0 * b_1;
c_11 -= a_1 * b_1;
c_21 -= a_2 * b_1;
c_31 -= a_3 * b_1;
c_02 -= a_0 * b_2;
c_12 -= a_1 * b_2;
c_22 -= a_2 * b_2;
c_32 -= a_3 * b_2;
c_03 -= a_0 * b_3;
c_13 -= a_1 * b_3;
c_23 -= a_2 * b_3;
c_33 -= a_3 * b_3;
// k = 3
a_0 = A[12];
a_1 = A[13];
a_2 = A[14];
a_3 = A[15];
b_0 = B[12];
b_1 = B[13];
b_2 = B[14];
b_3 = B[15];
c_00 -= a_0 * b_0;
c_10 -= a_1 * b_0;
c_20 -= a_2 * b_0;
c_30 -= a_3 * b_0;
c_01 -= a_0 * b_1;
c_11 -= a_1 * b_1;
c_21 -= a_2 * b_1;
c_31 -= a_3 * b_1;
c_02 -= a_0 * b_2;
c_12 -= a_1 * b_2;
c_22 -= a_2 * b_2;
c_32 -= a_3 * b_2;
c_03 -= a_0 * b_3;
c_13 -= a_1 * b_3;
c_23 -= a_2 * b_3;
c_33 -= a_3 * b_3;
A += 16;
B += 16;
}
for(; k<kmax; k++)
{
// k = 0
a_0 = A[0];
a_1 = A[1];
a_2 = A[2];
a_3 = A[3];
b_0 = B[0];
b_1 = B[1];
b_2 = B[2];
b_3 = B[3];
c_00 -= a_0 * b_0;
c_10 -= a_1 * b_0;
c_20 -= a_2 * b_0;
c_30 -= a_3 * b_0;
c_01 -= a_0 * b_1;
c_11 -= a_1 * b_1;
c_21 -= a_2 * b_1;
c_31 -= a_3 * b_1;
c_02 -= a_0 * b_2;
c_12 -= a_1 * b_2;
c_22 -= a_2 * b_2;
c_32 -= a_3 * b_2;
c_03 -= a_0 * b_3;
c_13 -= a_1 * b_3;
c_23 -= a_2 * b_3;
c_33 -= a_3 * b_3;
A += 4;
B += 4;
}
c_00 = C[0+bs*0] + c_00;
c_10 = C[1+bs*0] + c_10;
c_20 = C[2+bs*0] + c_20;
c_30 = C[3+bs*0] + c_30;
c_01 = C[0+bs*1] + c_01;
c_11 = C[1+bs*1] + c_11;
c_21 = C[2+bs*1] + c_21;
c_31 = C[3+bs*1] + c_31;
c_02 = C[0+bs*2] + c_02;
c_12 = C[1+bs*2] + c_12;
c_22 = C[2+bs*2] + c_22;
c_32 = C[3+bs*2] + c_32;
c_03 = C[0+bs*3] + c_03;
c_13 = C[1+bs*3] + c_13;
c_23 = C[2+bs*3] + c_23;
c_33 = C[3+bs*3] + c_33;
if(kn>3)
{
tmp = inv_diag_E[3];
c_03 *= tmp;
c_13 *= tmp;
c_23 *= tmp;
c_33 *= tmp;
tmp = E[2+bs*3];
c_02 -= c_03 * tmp;
c_12 -= c_13 * tmp;
c_22 -= c_23 * tmp;
c_32 -= c_33 * tmp;
tmp = E[1+bs*3];
c_01 -= c_03 * tmp;
c_11 -= c_13 * tmp;
c_21 -= c_23 * tmp;
c_31 -= c_33 * tmp;
tmp = E[0+bs*3];
c_00 -= c_03 * tmp;
c_10 -= c_13 * tmp;
c_20 -= c_23 * tmp;
c_30 -= c_33 * tmp;
}
if(kn>2)
{
tmp = inv_diag_E[2];
c_02 *= tmp;
c_12 *= tmp;
c_22 *= tmp;
c_32 *= tmp;
tmp = E[1+bs*2];
c_01 -= c_02 * tmp;
c_11 -= c_12 * tmp;
c_21 -= c_22 * tmp;
c_31 -= c_32 * tmp;
tmp = E[0+bs*2];
c_00 -= c_02 * tmp;
c_10 -= c_12 * tmp;
c_20 -= c_22 * tmp;
c_30 -= c_32 * tmp;
}
if(kn>1)
{
tmp = inv_diag_E[1];
c_01 *= tmp;
c_11 *= tmp;
c_21 *= tmp;
c_31 *= tmp;
tmp = E[0+bs*1];
c_00 -= c_01 * tmp;
c_10 -= c_11 * tmp;
c_20 -= c_21 * tmp;
c_30 -= c_31 * tmp;
}
tmp = inv_diag_E[0];
c_00 *= tmp;
c_10 *= tmp;
c_20 *= tmp;
c_30 *= tmp;
store:
if(km>=4)
{
D[0+bs*0] = c_00;
D[1+bs*0] = c_10;
D[2+bs*0] = c_20;
D[3+bs*0] = c_30;
if(kn==1)
return;
D[0+bs*1] = c_01;
D[1+bs*1] = c_11;
D[2+bs*1] = c_21;
D[3+bs*1] = c_31;
if(kn==2)
return;
D[0+bs*2] = c_02;
D[1+bs*2] = c_12;
D[2+bs*2] = c_22;
D[3+bs*2] = c_32;
if(kn==3)
return;
D[0+bs*3] = c_03;
D[1+bs*3] = c_13;
D[2+bs*3] = c_23;
D[3+bs*3] = c_33;
}
else if(km>=3)
{
D[0+bs*0] = c_00;
D[1+bs*0] = c_10;
D[2+bs*0] = c_20;
if(kn==1)
return;
D[0+bs*1] = c_01;
D[1+bs*1] = c_11;
D[2+bs*1] = c_21;
if(kn==2)
return;
D[0+bs*2] = c_02;
D[1+bs*2] = c_12;
D[2+bs*2] = c_22;
if(kn==3)
return;
D[0+bs*3] = c_03;
D[1+bs*3] = c_13;
D[2+bs*3] = c_23;
}
else if(km>=2)
{
D[0+bs*0] = c_00;
D[1+bs*0] = c_10;
if(kn==1)
return;
D[0+bs*1] = c_01;
D[1+bs*1] = c_11;
if(kn==2)
return;
D[0+bs*2] = c_02;
D[1+bs*2] = c_12;
if(kn==3)
return;
D[0+bs*3] = c_03;
D[1+bs*3] = c_13;
}
else //if(km>=1)
{
D[0+bs*0] = c_00;
if(kn==1)
return;
D[0+bs*1] = c_01;
if(kn==2)
return;
D[0+bs*2] = c_02;
if(kn==3)
return;
D[0+bs*3] = c_03;
}
}
#endif
#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
void kernel_dtrsm_nt_ru_inv_4x4_lib4(int k, double *A, double *B, double *C, double *D, double *E, double *inv_diag_E)
{
kernel_dtrsm_nt_ru_inv_4x4_vs_lib4(k, A, B, C, D, E, inv_diag_E, 4, 4);
}
#endif
#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
void kernel_dgetrf_nn_4x4_vs_lib4(int kmax, double *A, double *B, int sdb, double *C, double *D, double *inv_diag_D, int km, int kn)
{
const int bs = 4;
int k;
double
tmp,
a_0, a_1, a_2, a_3,
b_0, b_1, b_2, b_3,
c_00=0, c_01=0, c_02=0, c_03=0,
c_10=0, c_11=0, c_12=0, c_13=0,
c_20=0, c_21=0, c_22=0, c_23=0,
c_30=0, c_31=0, c_32=0, c_33=0;
if(kmax<=0)
goto add;
for(k=0; k<kmax-3; k+=4)
{
a_0 = A[0+bs*0];
a_1 = A[1+bs*0];
a_2 = A[2+bs*0];
a_3 = A[3+bs*0];
b_0 = B[0+bs*0];
b_1 = B[0+bs*1];
b_2 = B[0+bs*2];
b_3 = B[0+bs*3];
c_00 -= a_0 * b_0;
c_10 -= a_1 * b_0;
c_20 -= a_2 * b_0;
c_30 -= a_3 * b_0;
c_01 -= a_0 * b_1;
c_11 -= a_1 * b_1;
c_21 -= a_2 * b_1;
c_31 -= a_3 * b_1;
c_02 -= a_0 * b_2;
c_12 -= a_1 * b_2;
c_22 -= a_2 * b_2;
c_32 -= a_3 * b_2;
c_03 -= a_0 * b_3;
c_13 -= a_1 * b_3;
c_23 -= a_2 * b_3;
c_33 -= a_3 * b_3;
a_0 = A[0+bs*1];
a_1 = A[1+bs*1];
a_2 = A[2+bs*1];
a_3 = A[3+bs*1];
b_0 = B[1+bs*0];
b_1 = B[1+bs*1];
b_2 = B[1+bs*2];
b_3 = B[1+bs*3];
c_00 -= a_0 * b_0;
c_10 -= a_1 * b_0;
c_20 -= a_2 * b_0;
c_30 -= a_3 * b_0;
c_01 -= a_0 * b_1;
c_11 -= a_1 * b_1;
c_21 -= a_2 * b_1;
c_31 -= a_3 * b_1;
c_02 -= a_0 * b_2;
c_12 -= a_1 * b_2;
c_22 -= a_2 * b_2;
c_32 -= a_3 * b_2;
c_03 -= a_0 * b_3;
c_13 -= a_1 * b_3;
c_23 -= a_2 * b_3;
c_33 -= a_3 * b_3;
a_0 = A[0+bs*2];
a_1 = A[1+bs*2];
a_2 = A[2+bs*2];
a_3 = A[3+bs*2];
b_0 = B[2+bs*0];
b_1 = B[2+bs*1];
b_2 = B[2+bs*2];
b_3 = B[2+bs*3];
c_00 -= a_0 * b_0;
c_10 -= a_1 * b_0;
c_20 -= a_2 * b_0;
c_30 -= a_3 * b_0;
c_01 -= a_0 * b_1;
c_11 -= a_1 * b_1;
c_21 -= a_2 * b_1;
c_31 -= a_3 * b_1;
c_02 -= a_0 * b_2;
c_12 -= a_1 * b_2;
c_22 -= a_2 * b_2;
c_32 -= a_3 * b_2;
c_03 -= a_0 * b_3;
c_13 -= a_1 * b_3;
c_23 -= a_2 * b_3;
c_33 -= a_3 * b_3;
a_0 = A[0+bs*3];
a_1 = A[1+bs*3];
a_2 = A[2+bs*3];
a_3 = A[3+bs*3];
b_0 = B[3+bs*0];
b_1 = B[3+bs*1];
b_2 = B[3+bs*2];
b_3 = B[3+bs*3];
c_00 -= a_0 * b_0;
c_10 -= a_1 * b_0;
c_20 -= a_2 * b_0;
c_30 -= a_3 * b_0;
c_01 -= a_0 * b_1;
c_11 -= a_1 * b_1;
c_21 -= a_2 * b_1;
c_31 -= a_3 * b_1;
c_02 -= a_0 * b_2;
c_12 -= a_1 * b_2;
c_22 -= a_2 * b_2;
c_32 -= a_3 * b_2;
c_03 -= a_0 * b_3;
c_13 -= a_1 * b_3;
c_23 -= a_2 * b_3;
c_33 -= a_3 * b_3;
A += 16;
B += 4*sdb;
}
for(; k<kmax; k++)
{
a_0 = A[0+bs*0];
a_1 = A[1+bs*0];
a_2 = A[2+bs*0];
a_3 = A[3+bs*0];
b_0 = B[0+bs*0];
b_1 = B[0+bs*1];
b_2 = B[0+bs*2];
b_3 = B[0+bs*3];
c_00 -= a_0 * b_0;
c_10 -= a_1 * b_0;
c_20 -= a_2 * b_0;
c_30 -= a_3 * b_0;
c_01 -= a_0 * b_1;
c_11 -= a_1 * b_1;
c_21 -= a_2 * b_1;
c_31 -= a_3 * b_1;
c_02 -= a_0 * b_2;
c_12 -= a_1 * b_2;
c_22 -= a_2 * b_2;
c_32 -= a_3 * b_2;
c_03 -= a_0 * b_3;
c_13 -= a_1 * b_3;
c_23 -= a_2 * b_3;
c_33 -= a_3 * b_3;
A += 4;
B += 1;
}
add:
c_00 += C[0+bs*0];
c_10 += C[1+bs*0];
c_20 += C[2+bs*0];
c_30 += C[3+bs*0];
c_01 += C[0+bs*1];
c_11 += C[1+bs*1];
c_21 += C[2+bs*1];
c_31 += C[3+bs*1];
c_02 += C[0+bs*2];
c_12 += C[1+bs*2];
c_22 += C[2+bs*2];
c_32 += C[3+bs*2];
c_03 += C[0+bs*3];
c_13 += C[1+bs*3];
c_23 += C[2+bs*3];
c_33 += C[3+bs*3];
// factorization
// first column
tmp = 1.0 / c_00;
c_10 *= tmp;
c_20 *= tmp;
c_30 *= tmp;
inv_diag_D[0] = tmp;
if(kn==1)
goto store;
// second column
c_11 -= c_10 * c_01;
c_21 -= c_20 * c_01;
c_31 -= c_30 * c_01;
tmp = 1.0 / c_11;
c_21 *= tmp;
c_31 *= tmp;
inv_diag_D[1] = tmp;
if(kn==2)
goto store;
// third column
c_12 -= c_10 * c_02;
c_22 -= c_20 * c_02;
c_32 -= c_30 * c_02;
c_22 -= c_21 * c_12;
c_32 -= c_31 * c_12;
tmp = 1.0 / c_22;
c_32 *= tmp;
inv_diag_D[2] = tmp;
if(kn==3)
goto store;
// fourth column
c_13 -= c_10 * c_03;
c_23 -= c_20 * c_03;
c_33 -= c_30 * c_03;
c_23 -= c_21 * c_13;
c_33 -= c_31 * c_13;
c_33 -= c_32 * c_23;
tmp = 1.0 / c_33;
inv_diag_D[3] = tmp;
store:
if(km>=4)
{
D[0+bs*0] = c_00;
D[1+bs*0] = c_10;
D[2+bs*0] = c_20;
D[3+bs*0] = c_30;
if(kn==1)
return;
D[0+bs*1] = c_01;
D[1+bs*1] = c_11;
D[2+bs*1] = c_21;
D[3+bs*1] = c_31;
if(kn==2)
return;
D[0+bs*2] = c_02;
D[1+bs*2] = c_12;
D[2+bs*2] = c_22;
D[3+bs*2] = c_32;
if(kn==3)
return;
D[0+bs*3] = c_03;
D[1+bs*3] = c_13;
D[2+bs*3] = c_23;
D[3+bs*3] = c_33;
}
else if(km>=3)
{
D[0+bs*0] = c_00;
D[1+bs*0] = c_10;
D[2+bs*0] = c_20;
if(kn==1)
return;
D[0+bs*1] = c_01;
D[1+bs*1] = c_11;
D[2+bs*1] = c_21;
if(kn==2)
return;
D[0+bs*2] = c_02;
D[1+bs*2] = c_12;
D[2+bs*2] = c_22;
if(kn==3)
return;
D[0+bs*3] = c_03;
D[1+bs*3] = c_13;
D[2+bs*3] = c_23;
}
else if(km>=2)
{
D[0+bs*0] = c_00;
D[1+bs*0] = c_10;
if(kn==1)
return;
D[0+bs*1] = c_01;
D[1+bs*1] = c_11;
if(kn==2)
return;
D[0+bs*2] = c_02;
D[1+bs*2] = c_12;
if(kn==3)
return;
D[0+bs*3] = c_03;
D[1+bs*3] = c_13;
}
else //if(km>=1)
{
D[0+bs*0] = c_00;
if(kn==1)
return;
D[0+bs*1] = c_01;
if(kn==2)
return;
D[0+bs*2] = c_02;
if(kn==3)
return;
D[0+bs*3] = c_03;
}
return;
}
#endif
#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
void kernel_dgetrf_nn_4x4_lib4(int kmax, double *A, double *B, int sdb, double *C, double *D, double *inv_diag_D)
{
kernel_dgetrf_nn_4x4_vs_lib4(kmax, A, B, sdb, C, D, inv_diag_D, 4, 4);
}
#endif
#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
void kernel_dtrsm_nn_ll_one_4x4_vs_lib4(int kmax, double *A, double *B, int sdb, double *C, double *D, double *E, int km, int kn)
{
const int bs = 4;
int k;
double
tmp,
a_0, a_1, a_2, a_3,
b_0, b_1, b_2, b_3,
e_1, e_2, e_3,
c_00=0, c_01=0, c_02=0, c_03=0,
c_10=0, c_11=0, c_12=0, c_13=0,
c_20=0, c_21=0, c_22=0, c_23=0,
c_30=0, c_31=0, c_32=0, c_33=0;
if(kmax<=0)
goto add;
for(k=0; k<kmax-3; k+=4)
{
a_0 = A[0+bs*0];
a_1 = A[1+bs*0];
a_2 = A[2+bs*0];
a_3 = A[3+bs*0];
b_0 = B[0+bs*0];
b_1 = B[0+bs*1];
b_2 = B[0+bs*2];
b_3 = B[0+bs*3];
c_00 -= a_0 * b_0;
c_10 -= a_1 * b_0;
c_20 -= a_2 * b_0;
c_30 -= a_3 * b_0;
c_01 -= a_0 * b_1;
c_11 -= a_1 * b_1;
c_21 -= a_2 * b_1;
c_31 -= a_3 * b_1;
c_02 -= a_0 * b_2;
c_12 -= a_1 * b_2;
c_22 -= a_2 * b_2;
c_32 -= a_3 * b_2;
c_03 -= a_0 * b_3;
c_13 -= a_1 * b_3;
c_23 -= a_2 * b_3;
c_33 -= a_3 * b_3;
a_0 = A[0+bs*1];
a_1 = A[1+bs*1];
a_2 = A[2+bs*1];
a_3 = A[3+bs*1];
b_0 = B[1+bs*0];
b_1 = B[1+bs*1];
b_2 = B[1+bs*2];
b_3 = B[1+bs*3];
c_00 -= a_0 * b_0;
c_10 -= a_1 * b_0;
c_20 -= a_2 * b_0;
c_30 -= a_3 * b_0;
c_01 -= a_0 * b_1;
c_11 -= a_1 * b_1;
c_21 -= a_2 * b_1;
c_31 -= a_3 * b_1;
c_02 -= a_0 * b_2;
c_12 -= a_1 * b_2;
c_22 -= a_2 * b_2;
c_32 -= a_3 * b_2;
c_03 -= a_0 * b_3;
c_13 -= a_1 * b_3;
c_23 -= a_2 * b_3;
c_33 -= a_3 * b_3;
a_0 = A[0+bs*2];
a_1 = A[1+bs*2];
a_2 = A[2+bs*2];
a_3 = A[3+bs*2];
b_0 = B[2+bs*0];
b_1 = B[2+bs*1];
b_2 = B[2+bs*2];
b_3 = B[2+bs*3];
c_00 -= a_0 * b_0;
c_10 -= a_1 * b_0;
c_20 -= a_2 * b_0;
c_30 -= a_3 * b_0;
c_01 -= a_0 * b_1;
c_11 -= a_1 * b_1;
c_21 -= a_2 * b_1;
c_31 -= a_3 * b_1;
c_02 -= a_0 * b_2;
c_12 -= a_1 * b_2;
c_22 -= a_2 * b_2;
c_32 -= a_3 * b_2;
c_03 -= a_0 * b_3;
c_13 -= a_1 * b_3;
c_23 -= a_2 * b_3;
c_33 -= a_3 * b_3;
a_0 = A[0+bs*3];
a_1 = A[1+bs*3];
a_2 = A[2+bs*3];
a_3 = A[3+bs*3];
b_0 = B[3+bs*0];
b_1 = B[3+bs*1];
b_2 = B[3+bs*2];
b_3 = B[3+bs*3];
c_00 -= a_0 * b_0;
c_10 -= a_1 * b_0;
c_20 -= a_2 * b_0;
c_30 -= a_3 * b_0;
c_01 -= a_0 * b_1;
c_11 -= a_1 * b_1;
c_21 -= a_2 * b_1;
c_31 -= a_3 * b_1;
c_02 -= a_0 * b_2;
c_12 -= a_1 * b_2;
c_22 -= a_2 * b_2;
c_32 -= a_3 * b_2;
c_03 -= a_0 * b_3;
c_13 -= a_1 * b_3;
c_23 -= a_2 * b_3;
c_33 -= a_3 * b_3;
A += 16;
B += 4*sdb;
}
for(; k<kmax; k++)
{
a_0 = A[0+bs*0];
a_1 = A[1+bs*0];
a_2 = A[2+bs*0];
a_3 = A[3+bs*0];
b_0 = B[0+bs*0];
b_1 = B[0+bs*1];
b_2 = B[0+bs*2];
b_3 = B[0+bs*3];
c_00 -= a_0 * b_0;
c_10 -= a_1 * b_0;
c_20 -= a_2 * b_0;
c_30 -= a_3 * b_0;
c_01 -= a_0 * b_1;
c_11 -= a_1 * b_1;
c_21 -= a_2 * b_1;
c_31 -= a_3 * b_1;
c_02 -= a_0 * b_2;
c_12 -= a_1 * b_2;
c_22 -= a_2 * b_2;
c_32 -= a_3 * b_2;
c_03 -= a_0 * b_3;
c_13 -= a_1 * b_3;
c_23 -= a_2 * b_3;
c_33 -= a_3 * b_3;
A += 4;
B += 1;
}
add:
c_00 += C[0+bs*0];
c_10 += C[1+bs*0];
c_20 += C[2+bs*0];
c_30 += C[3+bs*0];
c_01 += C[0+bs*1];
c_11 += C[1+bs*1];
c_21 += C[2+bs*1];
c_31 += C[3+bs*1];
c_02 += C[0+bs*2];
c_12 += C[1+bs*2];
c_22 += C[2+bs*2];
c_32 += C[3+bs*2];
c_03 += C[0+bs*3];
c_13 += C[1+bs*3];
c_23 += C[2+bs*3];
c_33 += C[3+bs*3];
// solution
if(km==1)
goto store;
e_1 = E[1+bs*0];
e_2 = E[2+bs*0];
e_3 = E[3+bs*0];
c_10 -= e_1 * c_00;
c_20 -= e_2 * c_00;
c_30 -= e_3 * c_00;
c_11 -= e_1 * c_01;
c_21 -= e_2 * c_01;
c_31 -= e_3 * c_01;
c_12 -= e_1 * c_02;
c_22 -= e_2 * c_02;
c_32 -= e_3 * c_02;
c_13 -= e_1 * c_03;
c_23 -= e_2 * c_03;
c_33 -= e_3 * c_03;
if(km==2)
goto store;
e_2 = E[2+bs*1];
e_3 = E[3+bs*1];
c_20 -= e_2 * c_10;
c_30 -= e_3 * c_10;
c_21 -= e_2 * c_11;
c_31 -= e_3 * c_11;
c_22 -= e_2 * c_12;
c_32 -= e_3 * c_12;
c_23 -= e_2 * c_13;
c_33 -= e_3 * c_13;
if(km==3)
goto store;
e_3 = E[3+bs*2];
c_30 -= e_3 * c_20;
c_31 -= e_3 * c_21;
c_32 -= e_3 * c_22;
c_33 -= e_3 * c_23;
store:
if(km>=4)
{
D[0+bs*0] = c_00;
D[1+bs*0] = c_10;
D[2+bs*0] = c_20;
D[3+bs*0] = c_30;
if(kn==1)
return;
D[0+bs*1] = c_01;
D[1+bs*1] = c_11;
D[2+bs*1] = c_21;
D[3+bs*1] = c_31;
if(kn==2)
return;
D[0+bs*2] = c_02;
D[1+bs*2] = c_12;
D[2+bs*2] = c_22;
D[3+bs*2] = c_32;
if(kn==3)
return;
D[0+bs*3] = c_03;
D[1+bs*3] = c_13;
D[2+bs*3] = c_23;
D[3+bs*3] = c_33;
}
else if(km>=3)
{
D[0+bs*0] = c_00;
D[1+bs*0] = c_10;
D[2+bs*0] = c_20;
if(kn==1)
return;
D[0+bs*1] = c_01;
D[1+bs*1] = c_11;
D[2+bs*1] = c_21;
if(kn==2)
return;
D[0+bs*2] = c_02;
D[1+bs*2] = c_12;
D[2+bs*2] = c_22;
if(kn==3)
return;
D[0+bs*3] = c_03;
D[1+bs*3] = c_13;
D[2+bs*3] = c_23;
}
else if(km>=2)
{
D[0+bs*0] = c_00;
D[1+bs*0] = c_10;
if(kn==1)
return;
D[0+bs*1] = c_01;
D[1+bs*1] = c_11;
if(kn==2)
return;
D[0+bs*2] = c_02;
D[1+bs*2] = c_12;
if(kn==3)
return;
D[0+bs*3] = c_03;
D[1+bs*3] = c_13;
}
else //if(km>=1)
{
D[0+bs*0] = c_00;
if(kn==1)
return;
D[0+bs*1] = c_01;
if(kn==2)
return;
D[0+bs*2] = c_02;
if(kn==3)
return;
D[0+bs*3] = c_03;
}
return;
}
#endif
#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
void kernel_dtrsm_nn_ll_one_4x4_lib4(int kmax, double *A, double *B, int sdb, double *C, double *D, double *E)
{
kernel_dtrsm_nn_ll_one_4x4_vs_lib4(kmax, A, B, sdb, C, D, E, 4, 4);
}
#endif
#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
void kernel_dtrsm_nn_ru_inv_4x4_vs_lib4(int kmax, double *A, double *B, int sdb, double *C, double *D, double *E, double *inv_diag_E, int km, int kn)
{
const int bs = 4;
int k;
double
tmp,
a_0, a_1, a_2, a_3,
b_0, b_1, b_2, b_3,
e_00, e_01, e_02, e_03,
e_11, e_12, e_13,
e_22, e_23,
e_33,
c_00=0, c_01=0, c_02=0, c_03=0,
c_10=0, c_11=0, c_12=0, c_13=0,
c_20=0, c_21=0, c_22=0, c_23=0,
c_30=0, c_31=0, c_32=0, c_33=0;
if(kmax<=0)
goto add;
for(k=0; k<kmax-3; k+=4)
{
a_0 = A[0+bs*0];
a_1 = A[1+bs*0];
a_2 = A[2+bs*0];
a_3 = A[3+bs*0];
b_0 = B[0+bs*0];
b_1 = B[0+bs*1];
b_2 = B[0+bs*2];
b_3 = B[0+bs*3];
c_00 -= a_0 * b_0;
c_10 -= a_1 * b_0;
c_20 -= a_2 * b_0;
c_30 -= a_3 * b_0;
c_01 -= a_0 * b_1;
c_11 -= a_1 * b_1;
c_21 -= a_2 * b_1;
c_31 -= a_3 * b_1;
c_02 -= a_0 * b_2;
c_12 -= a_1 * b_2;
c_22 -= a_2 * b_2;
c_32 -= a_3 * b_2;
c_03 -= a_0 * b_3;
c_13 -= a_1 * b_3;
c_23 -= a_2 * b_3;
c_33 -= a_3 * b_3;
a_0 = A[0+bs*1];
a_1 = A[1+bs*1];
a_2 = A[2+bs*1];
a_3 = A[3+bs*1];
b_0 = B[1+bs*0];
b_1 = B[1+bs*1];
b_2 = B[1+bs*2];
b_3 = B[1+bs*3];
c_00 -= a_0 * b_0;
c_10 -= a_1 * b_0;
c_20 -= a_2 * b_0;
c_30 -= a_3 * b_0;
c_01 -= a_0 * b_1;
c_11 -= a_1 * b_1;
c_21 -= a_2 * b_1;
c_31 -= a_3 * b_1;
c_02 -= a_0 * b_2;
c_12 -= a_1 * b_2;
c_22 -= a_2 * b_2;
c_32 -= a_3 * b_2;
c_03 -= a_0 * b_3;
c_13 -= a_1 * b_3;
c_23 -= a_2 * b_3;
c_33 -= a_3 * b_3;
a_0 = A[0+bs*2];
a_1 = A[1+bs*2];
a_2 = A[2+bs*2];
a_3 = A[3+bs*2];
b_0 = B[2+bs*0];
b_1 = B[2+bs*1];
b_2 = B[2+bs*2];
b_3 = B[2+bs*3];
c_00 -= a_0 * b_0;
c_10 -= a_1 * b_0;
c_20 -= a_2 * b_0;
c_30 -= a_3 * b_0;
c_01 -= a_0 * b_1;
c_11 -= a_1 * b_1;
c_21 -= a_2 * b_1;
c_31 -= a_3 * b_1;
c_02 -= a_0 * b_2;
c_12 -= a_1 * b_2;
c_22 -= a_2 * b_2;
c_32 -= a_3 * b_2;
c_03 -= a_0 * b_3;
c_13 -= a_1 * b_3;
c_23 -= a_2 * b_3;
c_33 -= a_3 * b_3;
a_0 = A[0+bs*3];
a_1 = A[1+bs*3];
a_2 = A[2+bs*3];
a_3 = A[3+bs*3];
b_0 = B[3+bs*0];
b_1 = B[3+bs*1];
b_2 = B[3+bs*2];
b_3 = B[3+bs*3];
c_00 -= a_0 * b_0;
c_10 -= a_1 * b_0;
c_20 -= a_2 * b_0;
c_30 -= a_3 * b_0;
c_01 -= a_0 * b_1;
c_11 -= a_1 * b_1;
c_21 -= a_2 * b_1;
c_31 -= a_3 * b_1;
c_02 -= a_0 * b_2;
c_12 -= a_1 * b_2;
c_22 -= a_2 * b_2;
c_32 -= a_3 * b_2;
c_03 -= a_0 * b_3;
c_13 -= a_1 * b_3;
c_23 -= a_2 * b_3;
c_33 -= a_3 * b_3;
A += 16;
B += 4*sdb;
}
for(; k<kmax; k++)
{
a_0 = A[0+bs*0];
a_1 = A[1+bs*0];
a_2 = A[2+bs*0];
a_3 = A[3+bs*0];
b_0 = B[0+bs*0];
b_1 = B[0+bs*1];
b_2 = B[0+bs*2];
b_3 = B[0+bs*3];
c_00 -= a_0 * b_0;
c_10 -= a_1 * b_0;
c_20 -= a_2 * b_0;
c_30 -= a_3 * b_0;
c_01 -= a_0 * b_1;
c_11 -= a_1 * b_1;
c_21 -= a_2 * b_1;
c_31 -= a_3 * b_1;
c_02 -= a_0 * b_2;
c_12 -= a_1 * b_2;
c_22 -= a_2 * b_2;
c_32 -= a_3 * b_2;
c_03 -= a_0 * b_3;
c_13 -= a_1 * b_3;
c_23 -= a_2 * b_3;
c_33 -= a_3 * b_3;
A += 4;
B += 1;
}
add:
c_00 += C[0+bs*0];
c_10 += C[1+bs*0];
c_20 += C[2+bs*0];
c_30 += C[3+bs*0];
c_01 += C[0+bs*1];
c_11 += C[1+bs*1];
c_21 += C[2+bs*1];
c_31 += C[3+bs*1];
c_02 += C[0+bs*2];
c_12 += C[1+bs*2];
c_22 += C[2+bs*2];
c_32 += C[3+bs*2];
c_03 += C[0+bs*3];
c_13 += C[1+bs*3];
c_23 += C[2+bs*3];
c_33 += C[3+bs*3];
// solve
e_00 = inv_diag_E[0];
c_00 *= e_00;
c_10 *= e_00;
c_20 *= e_00;
c_30 *= e_00;
if(kn==1)
goto store;
e_01 = E[0+bs*1];
e_11 = inv_diag_E[1];
c_01 -= c_00 * e_01;
c_11 -= c_10 * e_01;
c_21 -= c_20 * e_01;
c_31 -= c_30 * e_01;
c_01 *= e_11;
c_11 *= e_11;
c_21 *= e_11;
c_31 *= e_11;
if(kn==2)
goto store;
e_02 = E[0+bs*2];
e_12 = E[1+bs*2];
e_22 = inv_diag_E[2];
c_02 -= c_00 * e_02;
c_12 -= c_10 * e_02;
c_22 -= c_20 * e_02;
c_32 -= c_30 * e_02;
c_02 -= c_01 * e_12;
c_12 -= c_11 * e_12;
c_22 -= c_21 * e_12;
c_32 -= c_31 * e_12;
c_02 *= e_22;
c_12 *= e_22;
c_22 *= e_22;
c_32 *= e_22;
if(kn==3)
goto store;
e_03 = E[0+bs*3];
e_13 = E[1+bs*3];
e_23 = E[2+bs*3];
e_33 = inv_diag_E[3];
c_03 -= c_00 * e_03;
c_13 -= c_10 * e_03;
c_23 -= c_20 * e_03;
c_33 -= c_30 * e_03;
c_03 -= c_01 * e_13;
c_13 -= c_11 * e_13;
c_23 -= c_21 * e_13;
c_33 -= c_31 * e_13;
c_03 -= c_02 * e_23;
c_13 -= c_12 * e_23;
c_23 -= c_22 * e_23;
c_33 -= c_32 * e_23;
c_03 *= e_33;
c_13 *= e_33;
c_23 *= e_33;
c_33 *= e_33;
store:
if(km>=4)
{
D[0+bs*0] = c_00;
D[1+bs*0] = c_10;
D[2+bs*0] = c_20;
D[3+bs*0] = c_30;
if(kn==1)
return;
D[0+bs*1] = c_01;
D[1+bs*1] = c_11;
D[2+bs*1] = c_21;
D[3+bs*1] = c_31;
if(kn==2)
return;
D[0+bs*2] = c_02;
D[1+bs*2] = c_12;
D[2+bs*2] = c_22;
D[3+bs*2] = c_32;
if(kn==3)
return;
D[0+bs*3] = c_03;
D[1+bs*3] = c_13;
D[2+bs*3] = c_23;
D[3+bs*3] = c_33;
}
else if(km>=3)
{
D[0+bs*0] = c_00;
D[1+bs*0] = c_10;
D[2+bs*0] = c_20;
if(kn==1)
return;
D[0+bs*1] = c_01;
D[1+bs*1] = c_11;
D[2+bs*1] = c_21;
if(kn==2)
return;
D[0+bs*2] = c_02;
D[1+bs*2] = c_12;
D[2+bs*2] = c_22;
if(kn==3)
return;
D[0+bs*3] = c_03;
D[1+bs*3] = c_13;
D[2+bs*3] = c_23;
}
else if(km>=2)
{
D[0+bs*0] = c_00;
D[1+bs*0] = c_10;
if(kn==1)
return;
D[0+bs*1] = c_01;
D[1+bs*1] = c_11;
if(kn==2)
return;
D[0+bs*2] = c_02;
D[1+bs*2] = c_12;
if(kn==3)
return;
D[0+bs*3] = c_03;
D[1+bs*3] = c_13;
}
else //if(km>=1)
{
D[0+bs*0] = c_00;
if(kn==1)
return;
D[0+bs*1] = c_01;
if(kn==2)
return;
D[0+bs*2] = c_02;
if(kn==3)
return;
D[0+bs*3] = c_03;
}
return;
}
#endif
#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
void kernel_dtrsm_nn_ru_inv_4x4_lib4(int kmax, double *A, double *B, int sdb, double *C, double *D, double *E, double *inv_diag_E)
{
kernel_dtrsm_nn_ru_inv_4x4_vs_lib4(kmax, A, B, sdb, C, D, E, inv_diag_E, 4, 4);
}
#endif
#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
void kernel_dtrsm_nn_lu_inv_4x4_vs_lib4(int kmax, double *A, double *B, int sdb, double *C, double *D, double *E, double *inv_diag_E, int km, int kn)
{
const int bs = 4;
int k;
double
tmp,
a_0, a_1, a_2, a_3,
b_0, b_1, b_2, b_3,
e_00, e_01, e_02, e_03,
e_11, e_12, e_13,
e_22, e_23,
e_33,
c_00=0, c_01=0, c_02=0, c_03=0,
c_10=0, c_11=0, c_12=0, c_13=0,
c_20=0, c_21=0, c_22=0, c_23=0,
c_30=0, c_31=0, c_32=0, c_33=0;
if(kmax<=0)
goto add;
for(k=0; k<kmax-3; k+=4)
{
a_0 = A[0+bs*0];
a_1 = A[1+bs*0];
a_2 = A[2+bs*0];
a_3 = A[3+bs*0];
b_0 = B[0+bs*0];
b_1 = B[0+bs*1];
b_2 = B[0+bs*2];
b_3 = B[0+bs*3];
c_00 -= a_0 * b_0;
c_10 -= a_1 * b_0;
c_20 -= a_2 * b_0;
c_30 -= a_3 * b_0;
c_01 -= a_0 * b_1;
c_11 -= a_1 * b_1;
c_21 -= a_2 * b_1;
c_31 -= a_3 * b_1;
c_02 -= a_0 * b_2;
c_12 -= a_1 * b_2;
c_22 -= a_2 * b_2;
c_32 -= a_3 * b_2;
c_03 -= a_0 * b_3;
c_13 -= a_1 * b_3;
c_23 -= a_2 * b_3;
c_33 -= a_3 * b_3;
a_0 = A[0+bs*1];
a_1 = A[1+bs*1];
a_2 = A[2+bs*1];
a_3 = A[3+bs*1];
b_0 = B[1+bs*0];
b_1 = B[1+bs*1];
b_2 = B[1+bs*2];
b_3 = B[1+bs*3];
c_00 -= a_0 * b_0;
c_10 -= a_1 * b_0;
c_20 -= a_2 * b_0;
c_30 -= a_3 * b_0;
c_01 -= a_0 * b_1;
c_11 -= a_1 * b_1;
c_21 -= a_2 * b_1;
c_31 -= a_3 * b_1;
c_02 -= a_0 * b_2;
c_12 -= a_1 * b_2;
c_22 -= a_2 * b_2;
c_32 -= a_3 * b_2;
c_03 -= a_0 * b_3;
c_13 -= a_1 * b_3;
c_23 -= a_2 * b_3;
c_33 -= a_3 * b_3;
a_0 = A[0+bs*2];
a_1 = A[1+bs*2];
a_2 = A[2+bs*2];
a_3 = A[3+bs*2];
b_0 = B[2+bs*0];
b_1 = B[2+bs*1];
b_2 = B[2+bs*2];
b_3 = B[2+bs*3];
c_00 -= a_0 * b_0;
c_10 -= a_1 * b_0;
c_20 -= a_2 * b_0;
c_30 -= a_3 * b_0;
c_01 -= a_0 * b_1;
c_11 -= a_1 * b_1;
c_21 -= a_2 * b_1;
c_31 -= a_3 * b_1;
c_02 -= a_0 * b_2;
c_12 -= a_1 * b_2;
c_22 -= a_2 * b_2;
c_32 -= a_3 * b_2;
c_03 -= a_0 * b_3;
c_13 -= a_1 * b_3;
c_23 -= a_2 * b_3;
c_33 -= a_3 * b_3;
a_0 = A[0+bs*3];
a_1 = A[1+bs*3];
a_2 = A[2+bs*3];
a_3 = A[3+bs*3];
b_0 = B[3+bs*0];
b_1 = B[3+bs*1];
b_2 = B[3+bs*2];
b_3 = B[3+bs*3];
c_00 -= a_0 * b_0;
c_10 -= a_1 * b_0;
c_20 -= a_2 * b_0;
c_30 -= a_3 * b_0;
c_01 -= a_0 * b_1;
c_11 -= a_1 * b_1;
c_21 -= a_2 * b_1;
c_31 -= a_3 * b_1;
c_02 -= a_0 * b_2;
c_12 -= a_1 * b_2;
c_22 -= a_2 * b_2;
c_32 -= a_3 * b_2;
c_03 -= a_0 * b_3;
c_13 -= a_1 * b_3;
c_23 -= a_2 * b_3;
c_33 -= a_3 * b_3;
A += 16;
B += 4*sdb;
}
for(; k<kmax; k++)
{
a_0 = A[0+bs*0];
a_1 = A[1+bs*0];
a_2 = A[2+bs*0];
a_3 = A[3+bs*0];
b_0 = B[0+bs*0];
b_1 = B[0+bs*1];
b_2 = B[0+bs*2];
b_3 = B[0+bs*3];
c_00 -= a_0 * b_0;
c_10 -= a_1 * b_0;
c_20 -= a_2 * b_0;
c_30 -= a_3 * b_0;
c_01 -= a_0 * b_1;
c_11 -= a_1 * b_1;
c_21 -= a_2 * b_1;
c_31 -= a_3 * b_1;
c_02 -= a_0 * b_2;
c_12 -= a_1 * b_2;
c_22 -= a_2 * b_2;
c_32 -= a_3 * b_2;
c_03 -= a_0 * b_3;
c_13 -= a_1 * b_3;
c_23 -= a_2 * b_3;
c_33 -= a_3 * b_3;
A += 4;
B += 1;
}
add:
c_00 += C[0+bs*0];
c_10 += C[1+bs*0];
c_20 += C[2+bs*0];
c_30 += C[3+bs*0];
c_01 += C[0+bs*1];
c_11 += C[1+bs*1];
c_21 += C[2+bs*1];
c_31 += C[3+bs*1];
c_02 += C[0+bs*2];
c_12 += C[1+bs*2];
c_22 += C[2+bs*2];
c_32 += C[3+bs*2];
c_03 += C[0+bs*3];
c_13 += C[1+bs*3];
c_23 += C[2+bs*3];
c_33 += C[3+bs*3];
// printf("\n%f %f %f %f\n", c_00, c_01, c_02, c_03);
// printf("\n%f %f %f %f\n", c_10, c_11, c_12, c_13);
// printf("\n%f %f %f %f\n", c_20, c_21, c_22, c_23);
// printf("\n%f %f %f %f\n", c_30, c_31, c_32, c_33);
// solve
if(km>3)
{
e_03 = E[0+bs*3];
e_13 = E[1+bs*3];
e_23 = E[2+bs*3];
e_33 = inv_diag_E[3];
c_30 *= e_33;
c_31 *= e_33;
c_32 *= e_33;
c_33 *= e_33;
c_00 -= e_03 * c_30;
c_01 -= e_03 * c_31;
c_02 -= e_03 * c_32;
c_03 -= e_03 * c_33;
c_10 -= e_13 * c_30;
c_11 -= e_13 * c_31;
c_12 -= e_13 * c_32;
c_13 -= e_13 * c_33;
c_20 -= e_23 * c_30;
c_21 -= e_23 * c_31;
c_22 -= e_23 * c_32;
c_23 -= e_23 * c_33;
}
if(km>2)
{
e_02 = E[0+bs*2];
e_12 = E[1+bs*2];
e_22 = inv_diag_E[2];
c_20 *= e_22;
c_21 *= e_22;
c_22 *= e_22;
c_23 *= e_22;
c_00 -= e_02 * c_20;
c_01 -= e_02 * c_21;
c_02 -= e_02 * c_22;
c_03 -= e_02 * c_23;
c_10 -= e_12 * c_20;
c_11 -= e_12 * c_21;
c_12 -= e_12 * c_22;
c_13 -= e_12 * c_23;
}
if(km>1)
{
e_01 = E[0+bs*1];
e_11 = inv_diag_E[1];
c_10 *= e_11;
c_11 *= e_11;
c_12 *= e_11;
c_13 *= e_11;
c_00 -= e_01 * c_10;
c_01 -= e_01 * c_11;
c_02 -= e_01 * c_12;
c_03 -= e_01 * c_13;
}
e_00 = inv_diag_E[0];
c_00 *= e_00;
c_01 *= e_00;
c_02 *= e_00;
c_03 *= e_00;
store:
if(km>=4)
{
D[0+bs*0] = c_00;
D[1+bs*0] = c_10;
D[2+bs*0] = c_20;
D[3+bs*0] = c_30;
if(kn==1)
return;
D[0+bs*1] = c_01;
D[1+bs*1] = c_11;
D[2+bs*1] = c_21;
D[3+bs*1] = c_31;
if(kn==2)
return;
D[0+bs*2] = c_02;
D[1+bs*2] = c_12;
D[2+bs*2] = c_22;
D[3+bs*2] = c_32;
if(kn==3)
return;
D[0+bs*3] = c_03;
D[1+bs*3] = c_13;
D[2+bs*3] = c_23;
D[3+bs*3] = c_33;
}
else if(km>=3)
{
D[0+bs*0] = c_00;
D[1+bs*0] = c_10;
D[2+bs*0] = c_20;
if(kn==1)
return;
D[0+bs*1] = c_01;
D[1+bs*1] = c_11;
D[2+bs*1] = c_21;
if(kn==2)
return;
D[0+bs*2] = c_02;
D[1+bs*2] = c_12;
D[2+bs*2] = c_22;
if(kn==3)
return;
D[0+bs*3] = c_03;
D[1+bs*3] = c_13;
D[2+bs*3] = c_23;
}
else if(km>=2)
{
D[0+bs*0] = c_00;
D[1+bs*0] = c_10;
if(kn==1)
return;
D[0+bs*1] = c_01;
D[1+bs*1] = c_11;
if(kn==2)
return;
D[0+bs*2] = c_02;
D[1+bs*2] = c_12;
if(kn==3)
return;
D[0+bs*3] = c_03;
D[1+bs*3] = c_13;
}
else //if(km>=1)
{
D[0+bs*0] = c_00;
if(kn==1)
return;
D[0+bs*1] = c_01;
if(kn==2)
return;
D[0+bs*2] = c_02;
if(kn==3)
return;
D[0+bs*3] = c_03;
}
return;
}
#endif
#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
void kernel_dtrsm_nn_lu_inv_4x4_lib4(int kmax, double *A, double *B, int sdb, double *C, double *D, double *E, double *inv_diag_E)
{
kernel_dtrsm_nn_lu_inv_4x4_vs_lib4(kmax, A, B, sdb, C, D, E, inv_diag_E, 4, 4);
}
#endif