blob: cab8e3c29d79af9bec9046439ccccaf5c703a515 [file] [log] [blame]
/**************************************************************************************************
* *
* This file is part of BLASFEO. *
* *
* BLASFEO -- BLAS For Embedded Optimization. *
* Copyright (C) 2016-2017 by Gianluca Frison. *
* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl. *
* All rights reserved. *
* *
* HPMPC is free software; you can redistribute it and/or *
* modify it under the terms of the GNU Lesser General Public *
* License as published by the Free Software Foundation; either *
* version 2.1 of the License, or (at your option) any later version. *
* *
* HPMPC is distributed in the hope that it will be useful, *
* but WITHOUT ANY WARRANTY; without even the implied warranty of *
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. *
* See the GNU Lesser General Public License for more details. *
* *
* You should have received a copy of the GNU Lesser General Public *
* License along with HPMPC; if not, write to the Free Software *
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA *
* *
* Author: Gianluca Frison, giaf (at) dtu.dk *
* gianluca.frison (at) imtek.uni-freiburg.de *
* *
**************************************************************************************************/
#include <stdlib.h>
#include <stdio.h>
#include "../include/blasfeo_common.h"
#include "../include/blasfeo_d_kernel.h"
#include "../include/blasfeo_d_aux.h"
void dtrsv_ln_inv_lib(int m, int n, double *pA, int sda, double *inv_diag_A, double *x, double *y)
{
if(m<=0 || n<=0)
return;
// suppose m>=n
if(m<n)
m = n;
const int bs = 4;
double alpha = -1.0;
double beta = 1.0;
int i;
if(x!=y)
{
for(i=0; i<m; i++)
y[i] = x[i];
}
i = 0;
for( ; i<n-3; i+=4)
{
kernel_dtrsv_ln_inv_4_lib4(i, &pA[i*sda], &inv_diag_A[i], y, &y[i], &y[i]);
}
if(i<n)
{
kernel_dtrsv_ln_inv_4_vs_lib4(i, &pA[i*sda], &inv_diag_A[i], y, &y[i], &y[i], m-i, n-i);
i+=4;
}
#if defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_HASWELL)
for( ; i<m-7; i+=8)
{
kernel_dgemv_n_8_lib4(n, &alpha, &pA[i*sda], sda, y, &beta, &y[i], &y[i]);
}
if(i<m-3)
{
kernel_dgemv_n_4_lib4(n, &alpha, &pA[i*sda], y, &beta, &y[i], &y[i]);
i+=4;
}
#else
for( ; i<m-3; i+=4)
{
kernel_dgemv_n_4_lib4(n, &alpha, &pA[i*sda], y, &beta, &y[i], &y[i]);
}
#endif
if(i<m)
{
kernel_dgemv_n_4_gen_lib4(n, &alpha, &pA[i*sda], y, &beta, &y[i], &y[i], 0, m-i);
i+=4;
}
}
void dtrsv_lt_inv_lib(int m, int n, double *pA, int sda, double *inv_diag_A, double *x, double *y)
{
if(m<=0 || n<=0)
return;
if(n>m)
n = m;
const int bs = 4;
int i;
if(x!=y)
for(i=0; i<m; i++)
y[i] = x[i];
i=0;
if(n%4==1)
{
kernel_dtrsv_lt_inv_1_lib4(m-n+i+1, &pA[n/bs*bs*sda+(n-i-1)*bs], sda, &inv_diag_A[n-i-1], &y[n-i-1], &y[n-i-1], &y[n-i-1]);
i++;
}
else if(n%4==2)
{
kernel_dtrsv_lt_inv_2_lib4(m-n+i+2, &pA[n/bs*bs*sda+(n-i-2)*bs], sda, &inv_diag_A[n-i-2], &y[n-i-2], &y[n-i-2], &y[n-i-2]);
i+=2;
}
else if(n%4==3)
{
kernel_dtrsv_lt_inv_3_lib4(m-n+i+3, &pA[n/bs*bs*sda+(n-i-3)*bs], sda, &inv_diag_A[n-i-3], &y[n-i-3], &y[n-i-3], &y[n-i-3]);
i+=3;
}
for(; i<n-3; i+=4)
{
kernel_dtrsv_lt_inv_4_lib4(m-n+i+4, &pA[(n-i-4)/bs*bs*sda+(n-i-4)*bs], sda, &inv_diag_A[n-i-4], &y[n-i-4], &y[n-i-4], &y[n-i-4]);
}
}
void dgemv_nt_lib(int m, int n, double alpha_n, double alpha_t, double *pA, int sda, double *x_n, double *x_t, double beta_n, double beta_t, double *y_n, double *y_t, double *z_n, double *z_t)
{
if(m<=0 | n<=0)
return;
const int bs = 4;
int ii;
// copy and scale y_n int z_n
ii = 0;
for(; ii<m-3; ii+=4)
{
z_n[ii+0] = beta_n*y_n[ii+0];
z_n[ii+1] = beta_n*y_n[ii+1];
z_n[ii+2] = beta_n*y_n[ii+2];
z_n[ii+3] = beta_n*y_n[ii+3];
}
for(; ii<m; ii++)
{
z_n[ii+0] = beta_n*y_n[ii+0];
}
ii = 0;
#if defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE)
for(; ii<n-5; ii+=6)
{
kernel_dgemv_nt_6_lib4(m, &alpha_n, &alpha_t, pA+ii*bs, sda, x_n+ii, x_t, &beta_t, y_t+ii, z_n, z_t+ii);
}
#endif
for(; ii<n-3; ii+=4)
{
kernel_dgemv_nt_4_lib4(m, &alpha_n, &alpha_t, pA+ii*bs, sda, x_n+ii, x_t, &beta_t, y_t+ii, z_n, z_t+ii);
}
if(ii<n)
{
kernel_dgemv_nt_4_vs_lib4(m, &alpha_n, &alpha_t, pA+ii*bs, sda, x_n+ii, x_t, &beta_t, y_t+ii, z_n, z_t+ii, n-ii);
}
return;
}
#if defined(LA_HIGH_PERFORMANCE)
void dgemv_n_libstr(int m, int n, double alpha, struct d_strmat *sA, int ai, int aj, struct d_strvec *sx, int xi, double beta, struct d_strvec *sy, int yi, struct d_strvec *sz, int zi)
{
if(m<0)
return;
const int bs = 4;
int i;
int sda = sA->cn;
double *pA = sA->pA + aj*bs + ai/bs*bs*sda;
double *x = sx->pa + xi;
double *y = sy->pa + yi;
double *z = sz->pa + zi;
i = 0;
// clean up at the beginning
if(ai%bs!=0)
{
kernel_dgemv_n_4_gen_lib4(n, &alpha, pA, x, &beta, y-ai%bs, z-ai%bs, ai%bs, m+ai%bs);
pA += bs*sda;
y += 4 - ai%bs;
z += 4 - ai%bs;
m -= 4 - ai%bs;
}
// main loop
#if defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_HASWELL)
#if defined(TARGET_X64_INTEL_SANDY_BRIDGE)
for( ; i<m-11; i+=12)
{
kernel_dgemv_n_12_lib4(n, &alpha, &pA[i*sda], sda, x, &beta, &y[i], &z[i]);
}
#endif
for( ; i<m-7; i+=8)
{
kernel_dgemv_n_8_lib4(n, &alpha, &pA[i*sda], sda, x, &beta, &y[i], &z[i]);
}
if(i<m-3)
{
kernel_dgemv_n_4_lib4(n, &alpha, &pA[i*sda], x, &beta, &y[i], &z[i]);
i+=4;
}
#else
for( ; i<m-3; i+=4)
{
kernel_dgemv_n_4_lib4(n, &alpha, &pA[i*sda], x, &beta, &y[i], &z[i]);
}
#endif
if(i<m)
{
kernel_dgemv_n_4_vs_lib4(n, &alpha, &pA[i*sda], x, &beta, &y[i], &z[i], m-i);
}
return;
}
void dgemv_t_libstr(int m, int n, double alpha, struct d_strmat *sA, int ai, int aj, struct d_strvec *sx, int xi, double beta, struct d_strvec *sy, int yi, struct d_strvec *sz, int zi)
{
if(n<=0)
return;
const int bs = 4;
int i;
int sda = sA->cn;
double *pA = sA->pA + aj*bs + ai/bs*bs*sda + ai%bs;
double *x = sx->pa + xi;
double *y = sy->pa + yi;
double *z = sz->pa + zi;
if(ai%bs==0)
{
i = 0;
#if defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_HASWELL)
#if defined(TARGET_X64_INTEL_SANDY_BRIDGE)
for( ; i<n-11; i+=12)
{
kernel_dgemv_t_12_lib4(m, &alpha, &pA[i*bs], sda, x, &beta, &y[i], &z[i]);
}
#endif
for( ; i<n-7; i+=8)
{
kernel_dgemv_t_8_lib4(m, &alpha, &pA[i*bs], sda, x, &beta, &y[i], &z[i]);
}
if(i<n-3)
{
kernel_dgemv_t_4_lib4(m, &alpha, &pA[i*bs], sda, x, &beta, &y[i], &z[i]);
i+=4;
}
#else
for( ; i<n-3; i+=4)
{
kernel_dgemv_t_4_lib4(m, &alpha, &pA[i*bs], sda, x, &beta, &y[i], &z[i]);
}
#endif
if(i<n)
{
kernel_dgemv_t_4_vs_lib4(m, &alpha, &pA[i*bs], sda, x, &beta, &y[i], &z[i], n-i);
}
}
else // TODO kernel 8
{
i = 0;
for( ; i<n; i+=4)
{
kernel_dgemv_t_4_gen_lib4(m, &alpha, ai%bs, &pA[i*bs], sda, x, &beta, &y[i], &z[i], n-i);
}
}
return;
}
void dgemv_nt_libstr(int m, int n, double alpha_n, double alpha_t, struct d_strmat *sA, int ai, int aj, struct d_strvec *sx_n, int xi_n, struct d_strvec *sx_t, int xi_t, double beta_n, double beta_t, struct d_strvec *sy_n, int yi_n, struct d_strvec *sy_t, int yi_t, struct d_strvec *sz_n, int zi_n, struct d_strvec *sz_t, int zi_t)
{
if(ai!=0)
{
printf("\ndgemv_nt_libstr: feature not implemented yet: ai=%d\n", ai);
exit(1);
}
const int bs = 4;
int sda = sA->cn;
double *pA = sA->pA + aj*bs; // TODO ai
double *x_n = sx_n->pa + xi_n;
double *x_t = sx_t->pa + xi_t;
double *y_n = sy_n->pa + yi_n;
double *y_t = sy_t->pa + yi_t;
double *z_n = sz_n->pa + zi_n;
double *z_t = sz_t->pa + zi_t;
dgemv_nt_lib(m, n, alpha_n, alpha_t, pA, sda, x_n, x_t, beta_n, beta_t, y_n, y_t, z_n, z_t);
return;
}
void dsymv_l_libstr(int m, int n, double alpha, struct d_strmat *sA, int ai, int aj, struct d_strvec *sx, int xi, double beta, struct d_strvec *sy, int yi, struct d_strvec *sz, int zi)
{
if(m<=0 | n<=0)
return;
const int bs = 4;
int ii, n1;
int sda = sA->cn;
double *pA = sA->pA + aj*bs + ai/bs*bs*sda + ai%bs;
double *x = sx->pa + xi;
double *y = sy->pa + yi;
double *z = sz->pa + zi;
// copy and scale y int z
ii = 0;
for(; ii<m-3; ii+=4)
{
z[ii+0] = beta*y[ii+0];
z[ii+1] = beta*y[ii+1];
z[ii+2] = beta*y[ii+2];
z[ii+3] = beta*y[ii+3];
}
for(; ii<m; ii++)
{
z[ii+0] = beta*y[ii+0];
}
// clean up at the beginning
if(ai%bs!=0) // 1, 2, 3
{
n1 = 4-ai%bs;
kernel_dsymv_l_4_gen_lib4(m, &alpha, ai%bs, &pA[0], sda, &x[0], &z[0], n<n1 ? n : n1);
pA += n1 + n1*bs + (sda-1)*bs;
x += n1;
z += n1;
m -= n1;
n -= n1;
}
// main loop
ii = 0;
for(; ii<n-3; ii+=4)
{
kernel_dsymv_l_4_lib4(m-ii, &alpha, &pA[ii*bs+ii*sda], sda, &x[ii], &z[ii]);
}
// clean up at the end
if(ii<n)
{
kernel_dsymv_l_4_gen_lib4(m-ii, &alpha, 0, &pA[ii*bs+ii*sda], sda, &x[ii], &z[ii], n-ii);
}
return;
}
// m >= n
void dtrmv_lnn_libstr(int m, int n, struct d_strmat *sA, int ai, int aj, struct d_strvec *sx, int xi, struct d_strvec *sz, int zi)
{
if(m<=0)
return;
const int bs = 4;
int sda = sA->cn;
double *pA = sA->pA + aj*bs + ai/bs*bs*sda + ai%bs;
double *x = sx->pa + xi;
double *z = sz->pa + zi;
if(m-n>0)
dgemv_n_libstr(m-n, n, 1.0, sA, ai+n, aj, sx, xi, 0.0, sz, zi+n, sz, zi+n);
double *pA2 = pA;
double *z2 = z;
int m2 = n;
int n2 = 0;
double *pA3, *x3;
double alpha = 1.0;
double beta = 1.0;
double zt[4];
int ii, jj, jj_end;
ii = 0;
if(ai%4!=0)
{
pA2 += sda*bs - ai%bs;
z2 += bs-ai%bs;
m2 -= bs-ai%bs;
n2 += bs-ai%bs;
}
pA2 += m2/bs*bs*sda;
z2 += m2/bs*bs;
n2 += m2/bs*bs;
if(m2%bs!=0)
{
//
pA3 = pA2 + bs*n2;
x3 = x + n2;
zt[3] = pA3[3+bs*0]*x3[0] + pA3[3+bs*1]*x3[1] + pA3[3+bs*2]*x3[2] + pA3[3+bs*3]*x3[3];
zt[2] = pA3[2+bs*0]*x3[0] + pA3[2+bs*1]*x3[1] + pA3[2+bs*2]*x3[2];
zt[1] = pA3[1+bs*0]*x3[0] + pA3[1+bs*1]*x3[1];
zt[0] = pA3[0+bs*0]*x3[0];
kernel_dgemv_n_4_lib4(n2, &alpha, pA2, x, &beta, zt, zt);
for(jj=0; jj<m2%bs; jj++)
z2[jj] = zt[jj];
}
for(; ii<m2-3; ii+=4)
{
pA2 -= bs*sda;
z2 -= 4;
n2 -= 4;
pA3 = pA2 + bs*n2;
x3 = x + n2;
z2[3] = pA3[3+bs*0]*x3[0] + pA3[3+bs*1]*x3[1] + pA3[3+bs*2]*x3[2] + pA3[3+bs*3]*x3[3];
z2[2] = pA3[2+bs*0]*x3[0] + pA3[2+bs*1]*x3[1] + pA3[2+bs*2]*x3[2];
z2[1] = pA3[1+bs*0]*x3[0] + pA3[1+bs*1]*x3[1];
z2[0] = pA3[0+bs*0]*x3[0];
kernel_dgemv_n_4_lib4(n2, &alpha, pA2, x, &beta, z2, z2);
}
if(ai%4!=0)
{
if(ai%bs==1)
{
zt[2] = pA[2+bs*0]*x[0] + pA[2+bs*1]*x[1] + pA[2+bs*2]*x[2];
zt[1] = pA[1+bs*0]*x[0] + pA[1+bs*1]*x[1];
zt[0] = pA[0+bs*0]*x[0];
jj_end = 4-ai%bs<n ? 4-ai%bs : n;
for(jj=0; jj<jj_end; jj++)
z[jj] = zt[jj];
}
else if(ai%bs==2)
{
zt[1] = pA[1+bs*0]*x[0] + pA[1+bs*1]*x[1];
zt[0] = pA[0+bs*0]*x[0];
jj_end = 4-ai%bs<n ? 4-ai%bs : n;
for(jj=0; jj<jj_end; jj++)
z[jj] = zt[jj];
}
else // if (ai%bs==3)
{
z[0] = pA[0+bs*0]*x[0];
}
}
return;
}
// m >= n
void dtrmv_ltn_libstr(int m, int n, struct d_strmat *sA, int ai, int aj, struct d_strvec *sx, int xi, struct d_strvec *sz, int zi)
{
if(m<=0)
return;
const int bs = 4;
int sda = sA->cn;
double *pA = sA->pA + aj*bs + ai/bs*bs*sda + ai%bs;
double *x = sx->pa + xi;
double *z = sz->pa + zi;
double xt[4];
double zt[4];
double alpha = 1.0;
double beta = 1.0;
int ii, jj, ll, ll_max;
jj = 0;
if(ai%bs!=0)
{
if(ai%bs==1)
{
ll_max = m-jj<3 ? m-jj : 3;
for(ll=0; ll<ll_max; ll++)
xt[ll] = x[ll];
for(; ll<3; ll++)
xt[ll] = 0.0;
zt[0] = pA[0+bs*0]*xt[0] + pA[1+bs*0]*xt[1] + pA[2+bs*0]*xt[2];
zt[1] = pA[1+bs*1]*xt[1] + pA[2+bs*1]*xt[2];
zt[2] = pA[2+bs*2]*xt[2];
pA += bs*sda - 1;
x += 3;
kernel_dgemv_t_4_lib4(m-3-jj, &alpha, pA, sda, x, &beta, zt, zt);
ll_max = n-jj<3 ? n-jj : 3;
for(ll=0; ll<ll_max; ll++)
z[ll] = zt[ll];
pA += bs*3;
z += 3;
jj += 3;
}
else if(ai%bs==2)
{
ll_max = m-jj<2 ? m-jj : 2;
for(ll=0; ll<ll_max; ll++)
xt[ll] = x[ll];
for(; ll<2; ll++)
xt[ll] = 0.0;
zt[0] = pA[0+bs*0]*xt[0] + pA[1+bs*0]*xt[1];
zt[1] = pA[1+bs*1]*xt[1];
pA += bs*sda - 2;
x += 2;
kernel_dgemv_t_4_lib4(m-2-jj, &alpha, pA, sda, x, &beta, zt, zt);
ll_max = n-jj<2 ? n-jj : 2;
for(ll=0; ll<ll_max; ll++)
z[ll] = zt[ll];
pA += bs*2;
z += 2;
jj += 2;
}
else // if(ai%bs==3)
{
ll_max = m-jj<1 ? m-jj : 1;
for(ll=0; ll<ll_max; ll++)
xt[ll] = x[ll];
for(; ll<1; ll++)
xt[ll] = 0.0;
zt[0] = pA[0+bs*0]*xt[0];
pA += bs*sda - 3;
x += 1;
kernel_dgemv_t_4_lib4(m-1-jj, &alpha, pA, sda, x, &beta, zt, zt);
ll_max = n-jj<1 ? n-jj : 1;
for(ll=0; ll<ll_max; ll++)
z[ll] = zt[ll];
pA += bs*1;
z += 1;
jj += 1;
}
}
for(; jj<n-3; jj+=4)
{
zt[0] = pA[0+bs*0]*x[0] + pA[1+bs*0]*x[1] + pA[2+bs*0]*x[2] + pA[3+bs*0]*x[3];
zt[1] = pA[1+bs*1]*x[1] + pA[2+bs*1]*x[2] + pA[3+bs*1]*x[3];
zt[2] = pA[2+bs*2]*x[2] + pA[3+bs*2]*x[3];
zt[3] = pA[3+bs*3]*x[3];
pA += bs*sda;
x += 4;
kernel_dgemv_t_4_lib4(m-4-jj, &alpha, pA, sda, x, &beta, zt, z);
pA += bs*4;
z += 4;
}
if(jj<n)
{
ll_max = m-jj<4 ? m-jj : 4;
for(ll=0; ll<ll_max; ll++)
xt[ll] = x[ll];
for(; ll<4; ll++)
xt[ll] = 0.0;
zt[0] = pA[0+bs*0]*xt[0] + pA[1+bs*0]*xt[1] + pA[2+bs*0]*xt[2] + pA[3+bs*0]*xt[3];
zt[1] = pA[1+bs*1]*xt[1] + pA[2+bs*1]*xt[2] + pA[3+bs*1]*xt[3];
zt[2] = pA[2+bs*2]*xt[2] + pA[3+bs*2]*xt[3];
zt[3] = pA[3+bs*3]*xt[3];
pA += bs*sda;
x += 4;
kernel_dgemv_t_4_lib4(m-4-jj, &alpha, pA, sda, x, &beta, zt, zt);
for(ll=0; ll<n-jj; ll++)
z[ll] = zt[ll];
// pA += bs*4;
// z += 4;
}
return;
}
void dtrmv_unn_libstr(int m, struct d_strmat *sA, int ai, int aj, struct d_strvec *sx, int xi, struct d_strvec *sz, int zi)
{
if(m<=0)
return;
if(ai!=0)
{
printf("\ndtrmv_unn_libstr: feature not implemented yet: ai=%d\n", ai);
exit(1);
}
const int bs = 4;
int sda = sA->cn;
double *pA = sA->pA + aj*bs; // TODO ai
double *x = sx->pa + xi;
double *z = sz->pa + zi;
int i;
i=0;
#if defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE)
for(; i<m-7; i+=8)
{
kernel_dtrmv_un_8_lib4(m-i, pA, sda, x, z);
pA += 8*sda+8*bs;
x += 8;
z += 8;
}
#endif
for(; i<m-3; i+=4)
{
kernel_dtrmv_un_4_lib4(m-i, pA, x, z);
pA += 4*sda+4*bs;
x += 4;
z += 4;
}
if(m>i)
{
if(m-i==1)
{
z[0] = pA[0+bs*0]*x[0];
}
else if(m-i==2)
{
z[0] = pA[0+bs*0]*x[0] + pA[0+bs*1]*x[1];
z[1] = pA[1+bs*1]*x[1];
}
else // if(m-i==3)
{
z[0] = pA[0+bs*0]*x[0] + pA[0+bs*1]*x[1] + pA[0+bs*2]*x[2];
z[1] = pA[1+bs*1]*x[1] + pA[1+bs*2]*x[2];
z[2] = pA[2+bs*2]*x[2];
}
}
return;
}
void dtrmv_utn_libstr(int m, struct d_strmat *sA, int ai, int aj, struct d_strvec *sx, int xi, struct d_strvec *sz, int zi)
{
if(m<=0)
return;
if(ai!=0)
{
printf("\ndtrmv_utn_libstr: feature not implemented yet: ai=%d\n", ai);
exit(1);
}
const int bs = 4;
int sda = sA->cn;
double *pA = sA->pA + aj*bs; // TODO ai
double *x = sx->pa + xi;
double *z = sz->pa + zi;
int ii, idx;
double *ptrA;
ii=0;
idx = m/bs*bs;
if(m%bs!=0)
{
kernel_dtrmv_ut_4_vs_lib4(m, pA+idx*bs, sda, x, z+idx, m%bs);
ii += m%bs;
}
idx -= 4;
for(; ii<m; ii+=4)
{
kernel_dtrmv_ut_4_lib4(idx+4, pA+idx*bs, sda, x, z+idx);
idx -= 4;
}
return;
}
void dtrsv_lnn_mn_libstr(int m, int n, struct d_strmat *sA, int ai, int aj, struct d_strvec *sx, int xi, struct d_strvec *sz, int zi)
{
if(m==0 | n==0)
return;
#if defined(DIM_CHECK)
// non-negative size
if(m<0) printf("\n****** dtrsv_lnn_mn_libstr : m<0 : %d<0 *****\n", m);
if(n<0) printf("\n****** dtrsv_lnn_mn_libstr : n<0 : %d<0 *****\n", n);
// non-negative offset
if(ai<0) printf("\n****** dtrsv_lnn_mn_libstr : ai<0 : %d<0 *****\n", ai);
if(aj<0) printf("\n****** dtrsv_lnn_mn_libstr : aj<0 : %d<0 *****\n", aj);
if(xi<0) printf("\n****** dtrsv_lnn_mn_libstr : xi<0 : %d<0 *****\n", xi);
if(zi<0) printf("\n****** dtrsv_lnn_mn_libstr : zi<0 : %d<0 *****\n", zi);
// inside matrix
// A: m x k
if(ai+m > sA->m) printf("\n***** dtrsv_lnn_mn_libstr : ai+m > row(A) : %d+%d > %d *****\n", ai, m, sA->m);
if(aj+n > sA->n) printf("\n***** dtrsv_lnn_mn_libstr : aj+n > col(A) : %d+%d > %d *****\n", aj, n, sA->n);
// x: m
if(xi+m > sx->m) printf("\n***** dtrsv_lnn_mn_libstr : xi+m > size(x) : %d+%d > %d *****\n", xi, m, sx->m);
// z: m
if(zi+m > sz->m) printf("\n***** dtrsv_lnn_mn_libstr : zi+m > size(z) : %d+%d > %d *****\n", zi, m, sz->m);
#endif
if(ai!=0 | xi%4!=0)
{
printf("\ndtrsv_lnn_mn_libstr: feature not implemented yet: ai=%d\n", ai);
exit(1);
}
const int bs = 4;
int sda = sA->cn;
double *pA = sA->pA + aj*bs; // TODO ai
double *dA = sA->dA;
double *x = sx->pa + xi;
double *z = sz->pa + zi;
int ii;
if(ai==0 & aj==0)
{
if(sA->use_dA!=1)
{
ddiaex_lib(n, 1.0, ai, pA, sda, dA);
for(ii=0; ii<n; ii++)
dA[ii] = 1.0 / dA[ii];
sA->use_dA = 1;
}
}
else
{
ddiaex_lib(n, 1.0, ai, pA, sda, dA);
for(ii=0; ii<n; ii++)
dA[ii] = 1.0 / dA[ii];
sA->use_dA = 0;
}
dtrsv_ln_inv_lib(m, n, pA, sda, dA, x, z);
return;
}
void dtrsv_ltn_mn_libstr(int m, int n, struct d_strmat *sA, int ai, int aj, struct d_strvec *sx, int xi, struct d_strvec *sz, int zi)
{
if(m==0)
return;
#if defined(DIM_CHECK)
// non-negative size
if(m<0) printf("\n****** dtrsv_ltn_mn_libstr : m<0 : %d<0 *****\n", m);
if(n<0) printf("\n****** dtrsv_ltn_mn_libstr : n<0 : %d<0 *****\n", n);
// non-negative offset
if(ai<0) printf("\n****** dtrsv_ltn_mn_libstr : ai<0 : %d<0 *****\n", ai);
if(aj<0) printf("\n****** dtrsv_ltn_mn_libstr : aj<0 : %d<0 *****\n", aj);
if(xi<0) printf("\n****** dtrsv_ltn_mn_libstr : xi<0 : %d<0 *****\n", xi);
if(zi<0) printf("\n****** dtrsv_ltn_mn_libstr : zi<0 : %d<0 *****\n", zi);
// inside matrix
// A: m x k
if(ai+m > sA->m) printf("\n***** dtrsv_ltn_mn_libstr : ai+m > row(A) : %d+%d > %d *****\n", ai, m, sA->m);
if(aj+n > sA->n) printf("\n***** dtrsv_ltn_mn_libstr : aj+n > col(A) : %d+%d > %d *****\n", aj, n, sA->n);
// x: m
if(xi+m > sx->m) printf("\n***** dtrsv_ltn_mn_libstr : xi+m > size(x) : %d+%d > %d *****\n", xi, m, sx->m);
// z: m
if(zi+m > sz->m) printf("\n***** dtrsv_ltn_mn_libstr : zi+m > size(z) : %d+%d > %d *****\n", zi, m, sz->m);
#endif
if(ai!=0 | xi%4!=0)
{
printf("\ndtrsv_ltn_mn_libstr: feature not implemented yet: ai=%d\n", ai);
exit(1);
}
const int bs = 4;
int sda = sA->cn;
double *pA = sA->pA + aj*bs; // TODO ai
double *dA = sA->dA;
double *x = sx->pa + xi;
double *z = sz->pa + zi;
int ii;
if(ai==0 & aj==0)
{
if(sA->use_dA!=1)
{
ddiaex_lib(n, 1.0, ai, pA, sda, dA);
for(ii=0; ii<n; ii++)
dA[ii] = 1.0 / dA[ii];
sA->use_dA = 1;
}
}
else
{
ddiaex_lib(n, 1.0, ai, pA, sda, dA);
for(ii=0; ii<n; ii++)
dA[ii] = 1.0 / dA[ii];
sA->use_dA = 0;
}
dtrsv_lt_inv_lib(m, n, pA, sda, dA, x, z);
return;
}
void dtrsv_lnn_libstr(int m, struct d_strmat *sA, int ai, int aj, struct d_strvec *sx, int xi, struct d_strvec *sz, int zi)
{
if(m==0)
return;
#if defined(DIM_CHECK)
// non-negative size
if(m<0) printf("\n****** dtrsv_lnn_libstr : m<0 : %d<0 *****\n", m);
// non-negative offset
if(ai<0) printf("\n****** dtrsv_lnn_libstr : ai<0 : %d<0 *****\n", ai);
if(aj<0) printf("\n****** dtrsv_lnn_libstr : aj<0 : %d<0 *****\n", aj);
if(xi<0) printf("\n****** dtrsv_lnn_libstr : xi<0 : %d<0 *****\n", xi);
if(zi<0) printf("\n****** dtrsv_lnn_libstr : zi<0 : %d<0 *****\n", zi);
// inside matrix
// A: m x k
if(ai+m > sA->m) printf("\n***** dtrsv_lnn_libstr : ai+m > row(A) : %d+%d > %d *****\n", ai, m, sA->m);
if(aj+m > sA->n) printf("\n***** dtrsv_lnn_libstr : aj+m > col(A) : %d+%d > %d *****\n", aj, m, sA->n);
// x: m
if(xi+m > sx->m) printf("\n***** dtrsv_lnn_libstr : xi+m > size(x) : %d+%d > %d *****\n", xi, m, sx->m);
// z: m
if(zi+m > sz->m) printf("\n***** dtrsv_lnn_libstr : zi+m > size(z) : %d+%d > %d *****\n", zi, m, sz->m);
#endif
if(ai!=0 | xi%4!=0)
{
printf("\ndtrsv_lnn_libstr: feature not implemented yet: ai=%d\n", ai);
exit(1);
}
const int bs = 4;
int sda = sA->cn;
double *pA = sA->pA + aj*bs; // TODO ai
double *dA = sA->dA;
double *x = sx->pa + xi;
double *z = sz->pa + zi;
int ii;
if(ai==0 & aj==0)
{
if(sA->use_dA!=1)
{
ddiaex_lib(m, 1.0, ai, pA, sda, dA);
for(ii=0; ii<m; ii++)
dA[ii] = 1.0 / dA[ii];
sA->use_dA = 1;
}
}
else
{
ddiaex_lib(m, 1.0, ai, pA, sda, dA);
for(ii=0; ii<m; ii++)
dA[ii] = 1.0 / dA[ii];
sA->use_dA = 0;
}
dtrsv_ln_inv_lib(m, m, pA, sda, dA, x, z);
return;
}
void dtrsv_lnu_libstr(int m, struct d_strmat *sA, int ai, int aj, struct d_strvec *sx, int xi, struct d_strvec *sz, int zi)
{
if(m==0)
return;
#if defined(DIM_CHECK)
// non-negative size
if(m<0) printf("\n****** dtrsv_lnu_libstr : m<0 : %d<0 *****\n", m);
// non-negative offset
if(ai<0) printf("\n****** dtrsv_lnu_libstr : ai<0 : %d<0 *****\n", ai);
if(aj<0) printf("\n****** dtrsv_lnu_libstr : aj<0 : %d<0 *****\n", aj);
if(xi<0) printf("\n****** dtrsv_lnu_libstr : xi<0 : %d<0 *****\n", xi);
if(zi<0) printf("\n****** dtrsv_lnu_libstr : zi<0 : %d<0 *****\n", zi);
// inside matrix
// A: m x k
if(ai+m > sA->m) printf("\n***** dtrsv_lnu_libstr : ai+m > row(A) : %d+%d > %d *****\n", ai, m, sA->m);
if(aj+m > sA->n) printf("\n***** dtrsv_lnu_libstr : aj+m > col(A) : %d+%d > %d *****\n", aj, m, sA->n);
// x: m
if(xi+m > sx->m) printf("\n***** dtrsv_lnu_libstr : xi+m > size(x) : %d+%d > %d *****\n", xi, m, sx->m);
// z: m
if(zi+m > sz->m) printf("\n***** dtrsv_lnu_libstr : zi+m > size(z) : %d+%d > %d *****\n", zi, m, sz->m);
#endif
printf("\n***** dtrsv_lnu_libstr : feature not implemented yet *****\n");
exit(1);
}
void dtrsv_ltn_libstr(int m, struct d_strmat *sA, int ai, int aj, struct d_strvec *sx, int xi, struct d_strvec *sz, int zi)
{
if(m==0)
return;
#if defined(DIM_CHECK)
// non-negative size
if(m<0) printf("\n****** dtrsv_ltn_libstr : m<0 : %d<0 *****\n", m);
// non-negative offset
if(ai<0) printf("\n****** dtrsv_ltn_libstr : ai<0 : %d<0 *****\n", ai);
if(aj<0) printf("\n****** dtrsv_ltn_libstr : aj<0 : %d<0 *****\n", aj);
if(xi<0) printf("\n****** dtrsv_ltn_libstr : xi<0 : %d<0 *****\n", xi);
if(zi<0) printf("\n****** dtrsv_ltn_libstr : zi<0 : %d<0 *****\n", zi);
// inside matrix
// A: m x k
if(ai+m > sA->m) printf("\n***** dtrsv_ltn_libstr : ai+m > row(A) : %d+%d > %d *****\n", ai, m, sA->m);
if(aj+m > sA->n) printf("\n***** dtrsv_ltn_libstr : aj+m > col(A) : %d+%d > %d *****\n", aj, m, sA->n);
// x: m
if(xi+m > sx->m) printf("\n***** dtrsv_ltn_libstr : xi+m > size(x) : %d+%d > %d *****\n", xi, m, sx->m);
// z: m
if(zi+m > sz->m) printf("\n***** dtrsv_ltn_libstr : zi+m > size(z) : %d+%d > %d *****\n", zi, m, sz->m);
#endif
if(ai!=0 | xi%4!=0)
{
printf("\ndtrsv_ltn_libstr: feature not implemented yet: ai=%d\n", ai);
exit(1);
}
const int bs = 4;
int sda = sA->cn;
double *pA = sA->pA + aj*bs; // TODO ai
double *dA = sA->dA;
double *x = sx->pa + xi;
double *z = sz->pa + zi;
int ii;
if(ai==0 & aj==0)
{
if(sA->use_dA!=1)
{
ddiaex_lib(m, 1.0, ai, pA, sda, dA);
for(ii=0; ii<m; ii++)
dA[ii] = 1.0 / dA[ii];
sA->use_dA = 1;
}
}
else
{
ddiaex_lib(m, 1.0, ai, pA, sda, dA);
for(ii=0; ii<m; ii++)
dA[ii] = 1.0 / dA[ii];
sA->use_dA = 0;
}
dtrsv_lt_inv_lib(m, m, pA, sda, dA, x, z);
return;
}
void dtrsv_ltu_libstr(int m, struct d_strmat *sA, int ai, int aj, struct d_strvec *sx, int xi, struct d_strvec *sz, int zi)
{
if(m==0)
return;
#if defined(DIM_CHECK)
// non-negative size
if(m<0) printf("\n****** dtrsv_ltu_libstr : m<0 : %d<0 *****\n", m);
// non-negative offset
if(ai<0) printf("\n****** dtrsv_ltu_libstr : ai<0 : %d<0 *****\n", ai);
if(aj<0) printf("\n****** dtrsv_ltu_libstr : aj<0 : %d<0 *****\n", aj);
if(xi<0) printf("\n****** dtrsv_ltu_libstr : xi<0 : %d<0 *****\n", xi);
if(zi<0) printf("\n****** dtrsv_ltu_libstr : zi<0 : %d<0 *****\n", zi);
// inside matrix
// A: m x k
if(ai+m > sA->m) printf("\n***** dtrsv_ltu_libstr : ai+m > row(A) : %d+%d > %d *****\n", ai, m, sA->m);
if(aj+m > sA->n) printf("\n***** dtrsv_ltu_libstr : aj+m > col(A) : %d+%d > %d *****\n", aj, m, sA->n);
// x: m
if(xi+m > sx->m) printf("\n***** dtrsv_ltu_libstr : xi+m > size(x) : %d+%d > %d *****\n", xi, m, sx->m);
// z: m
if(zi+m > sz->m) printf("\n***** dtrsv_ltu_libstr : zi+m > size(z) : %d+%d > %d *****\n", zi, m, sz->m);
#endif
printf("\n***** dtrsv_ltu_libstr : feature not implemented yet *****\n");
exit(1);
}
void dtrsv_unn_libstr(int m, struct d_strmat *sA, int ai, int aj, struct d_strvec *sx, int xi, struct d_strvec *sz, int zi)
{
if(m==0)
return;
#if defined(DIM_CHECK)
// non-negative size
if(m<0) printf("\n****** dtrsv_unn_libstr : m<0 : %d<0 *****\n", m);
// non-negative offset
if(ai<0) printf("\n****** dtrsv_unn_libstr : ai<0 : %d<0 *****\n", ai);
if(aj<0) printf("\n****** dtrsv_unn_libstr : aj<0 : %d<0 *****\n", aj);
if(xi<0) printf("\n****** dtrsv_unn_libstr : xi<0 : %d<0 *****\n", xi);
if(zi<0) printf("\n****** dtrsv_unn_libstr : zi<0 : %d<0 *****\n", zi);
// inside matrix
// A: m x k
if(ai+m > sA->m) printf("\n***** dtrsv_unn_libstr : ai+m > row(A) : %d+%d > %d *****\n", ai, m, sA->m);
if(aj+m > sA->n) printf("\n***** dtrsv_unn_libstr : aj+m > col(A) : %d+%d > %d *****\n", aj, m, sA->n);
// x: m
if(xi+m > sx->m) printf("\n***** dtrsv_unn_libstr : xi+m > size(x) : %d+%d > %d *****\n", xi, m, sx->m);
// z: m
if(zi+m > sz->m) printf("\n***** dtrsv_unn_libstr : zi+m > size(z) : %d+%d > %d *****\n", zi, m, sz->m);
#endif
printf("\n***** dtrsv_unn_libstr : feature not implemented yet *****\n");
exit(1);
}
void dtrsv_utn_libstr(int m, struct d_strmat *sA, int ai, int aj, struct d_strvec *sx, int xi, struct d_strvec *sz, int zi)
{
if(m==0)
return;
#if defined(DIM_CHECK)
// non-negative size
if(m<0) printf("\n****** dtrsv_utn_libstr : m<0 : %d<0 *****\n", m);
// non-negative offset
if(ai<0) printf("\n****** dtrsv_utn_libstr : ai<0 : %d<0 *****\n", ai);
if(aj<0) printf("\n****** dtrsv_utn_libstr : aj<0 : %d<0 *****\n", aj);
if(xi<0) printf("\n****** dtrsv_utn_libstr : xi<0 : %d<0 *****\n", xi);
if(zi<0) printf("\n****** dtrsv_utn_libstr : zi<0 : %d<0 *****\n", zi);
// inside matrix
// A: m x k
if(ai+m > sA->m) printf("\n***** dtrsv_utn_libstr : ai+m > row(A) : %d+%d > %d *****\n", ai, m, sA->m);
if(aj+m > sA->n) printf("\n***** dtrsv_utn_libstr : aj+m > col(A) : %d+%d > %d *****\n", aj, m, sA->n);
// x: m
if(xi+m > sx->m) printf("\n***** dtrsv_utn_libstr : xi+m > size(x) : %d+%d > %d *****\n", xi, m, sx->m);
// z: m
if(zi+m > sz->m) printf("\n***** dtrsv_utn_libstr : zi+m > size(z) : %d+%d > %d *****\n", zi, m, sz->m);
#endif
printf("\n***** dtrsv_utn_libstr : feature not implemented yet *****\n");
exit(1);
}
#else
#error : wrong LA choice
#endif