Squashed 'third_party/blasfeo/' content from commit 2a828ca
Change-Id: If1c3caa4799b2d4eb287ef83fa17043587ef07a3
git-subtree-dir: third_party/blasfeo
git-subtree-split: 2a828ca5442108c4c58e4b42b061a0469043f6ea
diff --git a/blas/Makefile b/blas/Makefile
new file mode 100644
index 0000000..304b448
--- /dev/null
+++ b/blas/Makefile
@@ -0,0 +1,88 @@
+###################################################################################################
+# #
+# This file is part of BLASFEO. #
+# #
+# BLASFEO -- BLAS For Embedded Optimization. #
+# Copyright (C) 2016-2017 by Gianluca Frison. #
+# Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl. #
+# All rights reserved. #
+# #
+# BLASFEO is free software; you can redistribute it and/or #
+# modify it under the terms of the GNU Lesser General Public #
+# License as published by the Free Software Foundation; either #
+# version 2.1 of the License, or (at your option) any later version. #
+# #
+# BLASFEO is distributed in the hope that it will be useful, #
+# but WITHOUT ANY WARRANTY; without even the implied warranty of #
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. #
+# See the GNU Lesser General Public License for more details. #
+# #
+# You should have received a copy of the GNU Lesser General Public #
+# License along with BLASFEO; if not, write to the Free Software #
+# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA #
+# #
+# Author: Gianluca Frison, giaf (at) dtu.dk #
+# gianluca.frison (at) imtek.uni-freiburg.de #
+# #
+###################################################################################################
+
+include ../Makefile.rule
+
+OBJS =
+
+ifeq ($(LA), HIGH_PERFORMANCE)
+
+ifeq ($(TARGET), X64_INTEL_HASWELL)
+OBJS += d_blas1_lib4.o d_blas2_lib4.o d_blas2_diag_lib.o d_blas3_lib4.o d_blas3_diag_lib4.o d_lapack_lib4.o
+OBJS += s_blas1_lib8.o s_blas2_lib8.o s_blas2_diag_lib.o s_blas3_lib8.o s_blas3_diag_lib8.o s_lapack_lib8.o
+endif
+ifeq ($(TARGET), X64_INTEL_SANDY_BRIDGE)
+OBJS += d_blas1_lib4.o d_blas2_lib4.o d_blas2_diag_lib.o d_blas3_lib4.o d_blas3_diag_lib4.o d_lapack_lib4.o
+OBJS += s_blas1_lib8.o s_blas2_lib8.o s_blas2_diag_lib.o s_blas3_lib8.o s_blas3_diag_lib8.o s_lapack_lib8.o
+endif
+ifeq ($(TARGET), X64_INTEL_CORE)
+OBJS += d_blas1_lib4.o d_blas2_lib4.o d_blas2_diag_lib.o d_blas3_lib4.o d_blas3_diag_lib4.o d_lapack_lib4.o
+OBJS += s_blas1_lib4.o s_blas2_lib4.o s_blas2_diag_lib.o s_blas3_lib4.o s_blas3_diag_lib4.o s_lapack_lib4.o
+endif
+ifeq ($(TARGET), X64_AMD_BULLDOZER)
+OBJS += d_blas1_lib4.o d_blas2_lib4.o d_blas2_diag_lib.o d_blas3_lib4.o d_blas3_diag_lib4.o d_lapack_lib4.o
+OBJS += s_blas1_lib4.o s_blas2_lib4.o s_blas2_diag_lib.o s_blas3_lib4.o s_blas3_diag_lib4.o s_lapack_lib4.o
+endif
+ifeq ($(TARGET), ARMV8A_ARM_CORTEX_A57)
+OBJS += d_blas1_lib4.o d_blas2_lib4.o d_blas2_diag_lib.o d_blas3_lib4.o d_blas3_diag_lib4.o d_lapack_lib4.o
+OBJS += s_blas1_lib4.o s_blas2_lib4.o s_blas2_diag_lib.o s_blas3_lib4.o s_blas3_diag_lib4.o s_lapack_lib4.o
+endif
+ifeq ($(TARGET), ARMV7A_ARM_CORTEX_A15)
+OBJS += d_blas1_lib4.o d_blas2_lib4.o d_blas2_diag_lib.o d_blas3_lib4.o d_blas3_diag_lib4.o d_lapack_lib4.o
+OBJS += s_blas1_lib4.o s_blas2_lib4.o s_blas2_diag_lib.o s_blas3_lib4.o s_blas3_diag_lib4.o s_lapack_lib4.o
+endif
+ifeq ($(TARGET), GENERIC)
+OBJS += d_blas1_lib4.o d_blas2_lib4.o d_blas2_diag_lib.o d_blas3_lib4.o d_blas3_diag_lib4.o d_lapack_lib4.o
+OBJS += s_blas1_lib4.o s_blas2_lib4.o s_blas2_diag_lib.o s_blas3_lib4.o s_blas3_diag_lib4.o s_lapack_lib4.o
+endif
+
+else # LA_REFERENCE | LA_BLAS
+
+OBJS += d_blas1_lib.o d_blas2_lib.o d_blas2_diag_lib.o d_blas3_lib.o d_blas3_diag_lib.o d_lapack_lib.o
+OBJS += s_blas1_lib.o s_blas2_lib.o s_blas2_diag_lib.o s_blas3_lib.o s_blas3_diag_lib.o s_lapack_lib.o
+
+endif # LA choice
+
+obj: $(OBJS)
+
+clean:
+ rm -f *.o
+ rm -f *.s
+
+d_blas1_lib.o: d_blas1_lib.c x_blas1_lib.c
+s_blas1_lib.o: s_blas1_lib.c x_blas1_lib.c
+d_blas2_lib.o: d_blas2_lib.c x_blas2_lib.c
+s_blas2_lib.o: s_blas2_lib.c x_blas2_lib.c
+d_blas2_diag_lib.o: d_blas2_diag_lib.c x_blas2_diag_lib.c
+s_blas2_diag_lib.o: s_blas2_diag_lib.c x_blas2_diag_lib.c
+d_blas3_lib.o: d_blas3_lib.c x_blas3_lib.c
+s_blas3_lib.o: s_blas3_lib.c x_blas3_lib.c
+d_blas3_diag_lib.o: d_blas3_diag_lib.c x_blas3_diag_lib.c
+s_blas3_diag_lib.o: s_blas3_diag_lib.c x_blas3_diag_lib.c
+d_lapack_lib.o: d_lapack_lib.c x_lapack_lib.c
+s_lapack_lib.o: s_lapack_lib.c x_lapack_lib.c
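+
+# Example invocation (sketch; TARGET and LA are normally set once in
+# ../Makefile.rule rather than on the command line):
+#   make obj TARGET=X64_INTEL_HASWELL LA=HIGH_PERFORMANCE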
diff --git a/blas/d_blas.h b/blas/d_blas.h
new file mode 100644
index 0000000..fc5058b
--- /dev/null
+++ b/blas/d_blas.h
@@ -0,0 +1,66 @@
+/**************************************************************************************************
+* *
+* This file is part of BLASFEO. *
+* *
+* BLASFEO -- BLAS For Embedded Optimization. *
+* Copyright (C) 2016-2017 by Gianluca Frison. *
+* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl. *
+* All rights reserved. *
+* *
+* BLASFEO is free software; you can redistribute it and/or *
+* modify it under the terms of the GNU Lesser General Public *
+* License as published by the Free Software Foundation; either *
+* version 2.1 of the License, or (at your option) any later version. *
+* *
+* BLASFEO is distributed in the hope that it will be useful, *
+* but WITHOUT ANY WARRANTY; without even the implied warranty of *
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. *
+* See the GNU Lesser General Public License for more details. *
+* *
+* You should have received a copy of the GNU Lesser General Public *
+* License along with BLASFEO; if not, write to the Free Software *
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA *
+* *
+* Author: Gianluca Frison, giaf (at) dtu.dk *
+* gianluca.frison (at) imtek.uni-freiburg.de *
+* *
+**************************************************************************************************/
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+
+
+// prototypes of the BLAS and LAPACK routines employed in the BLASFEO wrapper (WR) version
+
+// level 1
+void dcopy_(int *m, double *x, int *incx, double *y, int *incy);
+void daxpy_(int *m, double *alpha, double *x, int *incx, double *y, int *incy);
+void dscal_(int *m, double *alpha, double *x, int *incx);
+
+// level 2
+void dgemv_(char *ta, int *m, int *n, double *alpha, double *A, int *lda, double *x, int *incx, double *beta, double *y, int *incy);
+void dsymv_(char *uplo, int *m, double *alpha, double *A, int *lda, double *x, int *incx, double *beta, double *y, int *incy);
+void dtrmv_(char *uplo, char *trans, char *diag, int *n, double *A, int *lda, double *x, int *incx);
+void dtrsv_(char *uplo, char *trans, char *diag, int *n, double *A, int *lda, double *x, int *incx);
+void dger_(int *m, int *n, double *alpha, double *x, int *incx, double *y, int *incy, double *A, int *lda);
+
+// level 3
+void dgemm_(char *ta, char *tb, int *m, int *n, int *k, double *alpha, double *A, int *lda, double *B, int *ldb, double *beta, double *C, int *ldc);
+void dsyrk_(char *uplo, char *trans, int *n, int *k, double *alpha, double *A, int *lda, double *beta, double *C, int *ldc);
+void dtrmm_(char *side, char *uplo, char *trans, char *diag, int *m, int *n, double *alpha, double *A, int *lda, double *B, int *ldb);
+void dtrsm_(char *side, char *uplo, char *trans, char *diag, int *m, int *n, double *alpha, double *A, int *lda, double *B, int *ldb);
+
+// lapack
+int dpotrf_(char *uplo, int *m, double *A, int *lda, int *info);
+int dgetrf_(int *m, int *n, double *A, int *lda, int *ipiv, int *info);
+void dgeqrf_(int *m, int *n, double *A, int *lda, double *tau, double *work, int *lwork, int *info);
+void dgeqr2_(int *m, int *n, double *A, int *lda, double *tau, double *work, int *info);
+void dgelqf_(int *m, int *n, double *A, int *lda, double *tau, double *work, int *lwork, int *info);
+
+
+
+#ifdef __cplusplus
+}
+#endif
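+
+
+
+// Usage sketch (illustrative only, not part of the BLASFEO API): the
+// prototypes above follow the Fortran calling convention, so every argument
+// is passed by pointer and matrices are stored column-major. The call below
+// computes C = 1.0 * A * B + 0.0 * C for 2x2 matrices.
+static inline void example_dgemm_call()
+	{
+	int n = 2, ld = 2;
+	double d_one = 1.0, d_zero = 0.0;
+	double A[4] = {1.0, 2.0, 3.0, 4.0}; // 2x2, column-major
+	double B[4] = {1.0, 0.0, 0.0, 1.0}; // 2x2 identity
+	double C[4];
+	dgemm_("N", "N", &n, &n, &n, &d_one, A, &ld, B, &ld, &d_zero, C, &ld);
+	}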
diff --git a/blas/d_blas1_lib.c b/blas/d_blas1_lib.c
new file mode 100644
index 0000000..1fd19d3
--- /dev/null
+++ b/blas/d_blas1_lib.c
@@ -0,0 +1,54 @@
+/**************************************************************************************************
+* *
+* This file is part of BLASFEO. *
+* *
+* BLASFEO -- BLAS For Embedded Optimization. *
+* Copyright (C) 2016-2017 by Gianluca Frison. *
+* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl. *
+* All rights reserved. *
+* *
+* BLASFEO is free software; you can redistribute it and/or *
+* modify it under the terms of the GNU Lesser General Public *
+* License as published by the Free Software Foundation; either *
+* version 2.1 of the License, or (at your option) any later version. *
+* *
+* BLASFEO is distributed in the hope that it will be useful, *
+* but WITHOUT ANY WARRANTY; without even the implied warranty of *
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. *
+* See the GNU Lesser General Public License for more details. *
+* *
+* You should have received a copy of the GNU Lesser General Public *
+* License along with BLASFEO; if not, write to the Free Software *
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA *
+* *
+* Author: Gianluca Frison, giaf (at) dtu.dk *
+* gianluca.frison (at) imtek.uni-freiburg.de *
+* *
+**************************************************************************************************/
+
+#include <stdlib.h>
+#include <stdio.h>
+
+#if defined(LA_BLAS)
+#include "d_blas.h"
+#endif
+
+#include "../include/blasfeo_common.h"
+#include "../include/blasfeo_d_kernel.h"
+
+
+
+#define REAL double
+
+#define STRMAT d_strmat
+#define STRVEC d_strvec
+
+#define AXPY_LIBSTR daxpy_libstr
+#define VECMULDOT_LIBSTR dvecmuldot_libstr
+#define DOT_LIBSTR ddot_libstr
+
+#define AXPY daxpy_
+#define COPY dcopy_
+
+
+#include "x_blas1_lib.c"
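+
+// Note: the REAL/STRMAT/STRVEC and AXPY/COPY macros above rename the generic
+// symbols before x_blas1_lib.c is textually included, instantiating it for
+// double precision; s_blas1_lib.c repeats the same pattern for single
+// precision.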
diff --git a/blas/d_blas1_lib4.c b/blas/d_blas1_lib4.c
new file mode 100644
index 0000000..a4155a9
--- /dev/null
+++ b/blas/d_blas1_lib4.c
@@ -0,0 +1,263 @@
+/**************************************************************************************************
+* *
+* This file is part of BLASFEO. *
+* *
+* BLASFEO -- BLAS For Embedded Optimization. *
+* Copyright (C) 2016-2017 by Gianluca Frison. *
+* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl. *
+* All rights reserved. *
+* *
+* BLASFEO is free software; you can redistribute it and/or *
+* modify it under the terms of the GNU Lesser General Public *
+* License as published by the Free Software Foundation; either *
+* version 2.1 of the License, or (at your option) any later version. *
+* *
+* BLASFEO is distributed in the hope that it will be useful, *
+* but WITHOUT ANY WARRANTY; without even the implied warranty of *
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. *
+* See the GNU Lesser General Public License for more details. *
+* *
+* You should have received a copy of the GNU Lesser General Public *
+* License along with BLASFEO; if not, write to the Free Software *
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA *
+* *
+* Author: Gianluca Frison, giaf (at) dtu.dk *
+* gianluca.frison (at) imtek.uni-freiburg.de *
+* *
+**************************************************************************************************/
+
+#include <stdlib.h>
+#include <stdio.h>
+
+#if defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE)
+#include <mmintrin.h>
+#include <xmmintrin.h> // SSE
+#include <emmintrin.h> // SSE2
+#include <pmmintrin.h> // SSE3
+#include <smmintrin.h> // SSE4
+#include <immintrin.h> // AVX
+#endif
+
+#include "../include/blasfeo_common.h"
+#include "../include/blasfeo_d_kernel.h"
+
+
+
+#if defined(LA_HIGH_PERFORMANCE)
+
+
+
+void daxpy_libstr(int m, double alpha, struct d_strvec *sx, int xi, struct d_strvec *sy, int yi, struct d_strvec *sz, int zi)
+ {
+
+ if(m<=0)
+ return;
+
+ double *x = sx->pa + xi;
+ double *y = sy->pa + yi;
+ double *z = sz->pa + zi;
+
+ int ii;
+
+#if defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE)
+ __m256d
+ v_alpha, v_tmp,
+ v_x0, v_y0,
+ v_x1, v_y1;
+#endif
+
+ ii = 0;
+#if defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE)
+ v_alpha = _mm256_broadcast_sd( &alpha );
+ for( ; ii<m-7; ii+=8)
+ {
+ v_x0 = _mm256_loadu_pd( &x[ii+0] );
+ v_x1 = _mm256_loadu_pd( &x[ii+4] );
+ v_y0 = _mm256_loadu_pd( &y[ii+0] );
+ v_y1 = _mm256_loadu_pd( &y[ii+4] );
+#if defined(TARGET_X64_INTEL_HASWELL)
+ v_y0 = _mm256_fmadd_pd( v_alpha, v_x0, v_y0 );
+ v_y1 = _mm256_fmadd_pd( v_alpha, v_x1, v_y1 );
+#else // sandy bridge
+ v_tmp = _mm256_mul_pd( v_alpha, v_x0 );
+ v_y0 = _mm256_add_pd( v_tmp, v_y0 );
+ v_tmp = _mm256_mul_pd( v_alpha, v_x1 );
+ v_y1 = _mm256_add_pd( v_tmp, v_y1 );
+#endif
+ _mm256_storeu_pd( &z[ii+0], v_y0 );
+ _mm256_storeu_pd( &z[ii+4], v_y1 );
+ }
+ for( ; ii<m-3; ii+=4)
+ {
+ v_x0 = _mm256_loadu_pd( &x[ii] );
+ v_y0 = _mm256_loadu_pd( &y[ii] );
+#if defined(TARGET_X64_INTEL_HASWELL)
+ v_y0 = _mm256_fmadd_pd( v_alpha, v_x0, v_y0 );
+#else // sandy bridge
+ v_tmp = _mm256_mul_pd( v_alpha, v_x0 );
+ v_y0 = _mm256_add_pd( v_tmp, v_y0 );
+#endif
+ _mm256_storeu_pd( &z[ii], v_y0 );
+ }
+#else
+ for( ; ii<m-3; ii+=4)
+ {
+ z[ii+0] = y[ii+0] + alpha*x[ii+0];
+ z[ii+1] = y[ii+1] + alpha*x[ii+1];
+ z[ii+2] = y[ii+2] + alpha*x[ii+2];
+ z[ii+3] = y[ii+3] + alpha*x[ii+3];
+ }
+#endif
+ for( ; ii<m; ii++)
+ {
+ z[ii+0] = y[ii+0] + alpha*x[ii+0];
+ }
+
+ return;
+ }
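+
+// Usage sketch (illustrative only): daxpy_libstr dereferences only the pa
+// field of its d_strvec arguments, so for a quick standalone test plain
+// arrays can be wrapped directly (normally vectors come from the aux-module
+// allocators). The call below computes z = 2.0*x + y over 8 entries:
+//
+//	double x[8], y[8], z[8];
+//	struct d_strvec sx, sy, sz;
+//	sx.pa = x; sy.pa = y; sz.pa = z;
+//	daxpy_libstr(8, 2.0, &sx, 0, &sy, 0, &sz, 0);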
+
+
+
+// multiply two vectors and compute dot product
+double dvecmuldot_libstr(int m, struct d_strvec *sx, int xi, struct d_strvec *sy, int yi, struct d_strvec *sz, int zi)
+ {
+
+ if(m<=0)
+ return 0.0;
+
+ double *x = sx->pa + xi;
+ double *y = sy->pa + yi;
+ double *z = sz->pa + zi;
+ int ii;
+ double dot = 0.0;
+#if defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE)
+ __m128d
+ u_tmp, u_dot;
+ __m256d
+ v_tmp,
+ v_x0, v_y0, v_z0;
+
+ v_tmp = _mm256_setzero_pd();
+#endif
+
+ ii = 0;
+
+#if defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE)
+ for(; ii<m-3; ii+=4)
+ {
+ v_x0 = _mm256_loadu_pd( &x[ii+0] );
+ v_y0 = _mm256_loadu_pd( &y[ii+0] );
+ v_z0 = _mm256_mul_pd( v_x0, v_y0 );
+ _mm256_storeu_pd( &z[ii+0], v_z0 );
+ v_tmp = _mm256_add_pd( v_tmp, v_z0 );
+ }
+#endif
+ for(; ii<m; ii++)
+ {
+ z[ii+0] = x[ii+0] * y[ii+0];
+ dot += z[ii+0];
+ }
+#if defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE)
+ // dot product
+ u_tmp = _mm_add_pd( _mm256_castpd256_pd128( v_tmp ), _mm256_extractf128_pd( v_tmp, 0x1 ) );
+ u_tmp = _mm_hadd_pd( u_tmp, u_tmp);
+ u_dot = _mm_load_sd( &dot );
+ u_dot = _mm_add_sd( u_dot, u_tmp );
+ _mm_store_sd( &dot, u_dot );
+#endif
+ return dot;
+ }
+
+
+
+// compute dot product of two vectors
+double ddot_libstr(int m, struct d_strvec *sx, int xi, struct d_strvec *sy, int yi)
+ {
+
+ if(m<=0)
+ return 0.0;
+
+ double *x = sx->pa + xi;
+ double *y = sy->pa + yi;
+ int ii;
+ double dot = 0.0;
+
+#if defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE)
+ __m128d
+ u_dot0, u_x0, u_y0, u_tmp;
+ __m256d
+ v_dot0, v_dot1, v_x0, v_x1, v_y0, v_y1, v_tmp;
+
+ v_dot0 = _mm256_setzero_pd();
+ v_dot1 = _mm256_setzero_pd();
+ u_dot0 = _mm_setzero_pd();
+
+ ii = 0;
+ for(; ii<m-7; ii+=8)
+ {
+ v_x0 = _mm256_loadu_pd( &x[ii+0] );
+ v_x1 = _mm256_loadu_pd( &x[ii+4] );
+ v_y0 = _mm256_loadu_pd( &y[ii+0] );
+ v_y1 = _mm256_loadu_pd( &y[ii+4] );
+#if defined(TARGET_X64_INTEL_HASWELL)
+ v_dot0 = _mm256_fmadd_pd( v_x0, v_y0, v_dot0 );
+ v_dot1 = _mm256_fmadd_pd( v_x1, v_y1, v_dot1 );
+#else // sandy bridge
+ v_tmp = _mm256_mul_pd( v_x0, v_y0 );
+ v_dot0 = _mm256_add_pd( v_dot0, v_tmp );
+ v_tmp = _mm256_mul_pd( v_x1, v_y1 );
+ v_dot1 = _mm256_add_pd( v_dot1, v_tmp );
+#endif
+ }
+ for(; ii<m-3; ii+=4)
+ {
+ v_x0 = _mm256_loadu_pd( &x[ii+0] );
+ v_y0 = _mm256_loadu_pd( &y[ii+0] );
+#if defined(TARGET_X64_INTEL_HASWELL)
+ v_dot0 = _mm256_fmadd_pd( v_x0, v_y0, v_dot0 );
+#else // sandy bridge
+ v_tmp = _mm256_mul_pd( v_x0, v_y0 );
+ v_dot0 = _mm256_add_pd( v_dot0, v_tmp );
+#endif
+ }
+ for(; ii<m; ii++)
+ {
+ u_x0 = _mm_load_sd( &x[ii+0] );
+ u_y0 = _mm_load_sd( &y[ii+0] );
+#if defined(TARGET_X64_INTEL_HASWELL)
+ u_dot0 = _mm_fmadd_sd( u_x0, u_y0, u_dot0 );
+#else // sandy bridge
+ u_tmp = _mm_mul_sd( u_x0, u_y0 );
+ u_dot0 = _mm_add_sd( u_dot0, u_tmp );
+#endif
+ }
+ // reduce
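+	// sum the two 4-wide accumulators, fold the upper 128-bit half onto the
+	// lower one, add the two remaining lanes horizontally, then combine with
+	// the remainder accumulated in u_dot0 by the cleanup loop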
+ v_dot0 = _mm256_add_pd( v_dot0, v_dot1 );
+ u_tmp = _mm_add_pd( _mm256_castpd256_pd128( v_dot0 ), _mm256_extractf128_pd( v_dot0, 0x1 ) );
+ u_tmp = _mm_hadd_pd( u_tmp, u_tmp);
+ u_dot0 = _mm_add_sd( u_dot0, u_tmp );
+ _mm_store_sd( &dot, u_dot0 );
+#else // no haswell, no sandy bridge
+ ii = 0;
+ for(; ii<m-3; ii+=4)
+ {
+ dot += x[ii+0] * y[ii+0];
+ dot += x[ii+1] * y[ii+1];
+ dot += x[ii+2] * y[ii+2];
+ dot += x[ii+3] * y[ii+3];
+ }
+ for(; ii<m; ii++)
+ {
+ dot += x[ii+0] * y[ii+0];
+ }
+#endif // haswell, sandy bridge
+ return dot;
+ }
+
+
+
+#else
+
+#error : wrong LA choice
+
+#endif
diff --git a/blas/d_blas2_diag_lib.c b/blas/d_blas2_diag_lib.c
new file mode 100644
index 0000000..8bc3f68
--- /dev/null
+++ b/blas/d_blas2_diag_lib.c
@@ -0,0 +1,45 @@
+/**************************************************************************************************
+* *
+* This file is part of BLASFEO. *
+* *
+* BLASFEO -- BLAS For Embedded Optimization. *
+* Copyright (C) 2016-2017 by Gianluca Frison. *
+* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl. *
+* All rights reserved. *
+* *
+* BLASFEO is free software; you can redistribute it and/or *
+* modify it under the terms of the GNU Lesser General Public *
+* License as published by the Free Software Foundation; either *
+* version 2.1 of the License, or (at your option) any later version. *
+* *
+* BLASFEO is distributed in the hope that it will be useful, *
+* but WITHOUT ANY WARRANTY; without even the implied warranty of *
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. *
+* See the GNU Lesser General Public License for more details. *
+* *
+* You should have received a copy of the GNU Lesser General Public *
+* License along with BLASFEO; if not, write to the Free Software *
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA *
+* *
+* Author: Gianluca Frison, giaf (at) dtu.dk *
+* gianluca.frison (at) imtek.uni-freiburg.de *
+* *
+**************************************************************************************************/
+
+#include <stdlib.h>
+#include <stdio.h>
+
+#include "../include/blasfeo_common.h"
+#include "../include/blasfeo_d_kernel.h"
+
+
+
+#define REAL double
+
+#define STRVEC d_strvec
+
+#define GEMV_DIAG_LIBSTR dgemv_diag_libstr
+
+
+
+#include "x_blas2_diag_lib.c"
diff --git a/blas/d_blas2_lib.c b/blas/d_blas2_lib.c
new file mode 100644
index 0000000..9c39fe2
--- /dev/null
+++ b/blas/d_blas2_lib.c
@@ -0,0 +1,71 @@
+/**************************************************************************************************
+* *
+* This file is part of BLASFEO. *
+* *
+* BLASFEO -- BLAS For Embedded Optimization. *
+* Copyright (C) 2016-2017 by Gianluca Frison. *
+* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl. *
+* All rights reserved. *
+* *
+* BLASFEO is free software; you can redistribute it and/or *
+* modify it under the terms of the GNU Lesser General Public *
+* License as published by the Free Software Foundation; either *
+* version 2.1 of the License, or (at your option) any later version. *
+* *
+* BLASFEO is distributed in the hope that it will be useful, *
+* but WITHOUT ANY WARRANTY; without even the implied warranty of *
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. *
+* See the GNU Lesser General Public License for more details. *
+* *
+* You should have received a copy of the GNU Lesser General Public *
+* License along with BLASFEO; if not, write to the Free Software *
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA *
+* *
+* Author: Gianluca Frison, giaf (at) dtu.dk *
+* gianluca.frison (at) imtek.uni-freiburg.de *
+* *
+**************************************************************************************************/
+
+#include <stdlib.h>
+#include <stdio.h>
+
+#if defined(LA_BLAS)
+#include "d_blas.h"
+#endif
+
+#include "../include/blasfeo_common.h"
+#include "../include/blasfeo_d_aux.h"
+
+
+
+#define REAL double
+
+#define STRMAT d_strmat
+#define STRVEC d_strvec
+
+#define GEMV_N_LIBSTR dgemv_n_libstr
+#define GEMV_NT_LIBSTR dgemv_nt_libstr
+#define GEMV_T_LIBSTR dgemv_t_libstr
+#define SYMV_L_LIBSTR dsymv_l_libstr
+#define TRMV_LNN_LIBSTR dtrmv_lnn_libstr
+#define TRMV_LTN_LIBSTR dtrmv_ltn_libstr
+#define TRMV_UNN_LIBSTR dtrmv_unn_libstr
+#define TRMV_UTN_LIBSTR dtrmv_utn_libstr
+#define TRSV_LNN_LIBSTR dtrsv_lnn_libstr
+#define TRSV_LNN_MN_LIBSTR dtrsv_lnn_mn_libstr
+#define TRSV_LNU_LIBSTR dtrsv_lnu_libstr
+#define TRSV_LTN_LIBSTR dtrsv_ltn_libstr
+#define TRSV_LTN_MN_LIBSTR dtrsv_ltn_mn_libstr
+#define TRSV_LTU_LIBSTR dtrsv_ltu_libstr
+#define TRSV_UNN_LIBSTR dtrsv_unn_libstr
+#define TRSV_UTN_LIBSTR dtrsv_utn_libstr
+
+#define COPY dcopy_
+#define GEMV dgemv_
+#define SYMV dsymv_
+#define TRMV dtrmv_
+#define TRSV dtrsv_
+
+
+
+#include "x_blas2_lib.c"
diff --git a/blas/d_blas2_lib4.c b/blas/d_blas2_lib4.c
new file mode 100644
index 0000000..cab8e3c
--- /dev/null
+++ b/blas/d_blas2_lib4.c
@@ -0,0 +1,1060 @@
+/**************************************************************************************************
+* *
+* This file is part of BLASFEO. *
+* *
+* BLASFEO -- BLAS For Embedded Optimization. *
+* Copyright (C) 2016-2017 by Gianluca Frison. *
+* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl. *
+* All rights reserved. *
+* *
+* BLASFEO is free software; you can redistribute it and/or *
+* modify it under the terms of the GNU Lesser General Public *
+* License as published by the Free Software Foundation; either *
+* version 2.1 of the License, or (at your option) any later version. *
+* *
+* BLASFEO is distributed in the hope that it will be useful, *
+* but WITHOUT ANY WARRANTY; without even the implied warranty of *
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. *
+* See the GNU Lesser General Public License for more details. *
+* *
+* You should have received a copy of the GNU Lesser General Public *
+* License along with BLASFEO; if not, write to the Free Software *
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA *
+* *
+* Author: Gianluca Frison, giaf (at) dtu.dk *
+* gianluca.frison (at) imtek.uni-freiburg.de *
+* *
+**************************************************************************************************/
+
+#include <stdlib.h>
+#include <stdio.h>
+
+#include "../include/blasfeo_common.h"
+#include "../include/blasfeo_d_kernel.h"
+#include "../include/blasfeo_d_aux.h"
+
+
+
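+// y = inv(L) * x, forward substitution on 4-wide panels: the leading n x n
+// lower-triangular block is solved with trsv kernels (using the pre-inverted
+// diagonal passed in inv_diag_A), then the trailing m-n entries are updated
+// with gemv kernels (alpha=-1.0, beta=1.0).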
+void dtrsv_ln_inv_lib(int m, int n, double *pA, int sda, double *inv_diag_A, double *x, double *y)
+ {
+
+ if(m<=0 || n<=0)
+ return;
+
+	// the kernels assume m>=n: enforce it
+ if(m<n)
+ m = n;
+
+ const int bs = 4;
+
+ double alpha = -1.0;
+ double beta = 1.0;
+
+ int i;
+
+ if(x!=y)
+ {
+ for(i=0; i<m; i++)
+ y[i] = x[i];
+ }
+
+ i = 0;
+ for( ; i<n-3; i+=4)
+ {
+ kernel_dtrsv_ln_inv_4_lib4(i, &pA[i*sda], &inv_diag_A[i], y, &y[i], &y[i]);
+ }
+ if(i<n)
+ {
+ kernel_dtrsv_ln_inv_4_vs_lib4(i, &pA[i*sda], &inv_diag_A[i], y, &y[i], &y[i], m-i, n-i);
+ i+=4;
+ }
+#if defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_HASWELL)
+ for( ; i<m-7; i+=8)
+ {
+ kernel_dgemv_n_8_lib4(n, &alpha, &pA[i*sda], sda, y, &beta, &y[i], &y[i]);
+ }
+ if(i<m-3)
+ {
+ kernel_dgemv_n_4_lib4(n, &alpha, &pA[i*sda], y, &beta, &y[i], &y[i]);
+ i+=4;
+ }
+#else
+ for( ; i<m-3; i+=4)
+ {
+ kernel_dgemv_n_4_lib4(n, &alpha, &pA[i*sda], y, &beta, &y[i], &y[i]);
+ }
+#endif
+ if(i<m)
+ {
+ kernel_dgemv_n_4_gen_lib4(n, &alpha, &pA[i*sda], y, &beta, &y[i], &y[i], 0, m-i);
+ i+=4;
+ }
+
+ }
+
+
+
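+// y = inv(L^T) * x, backward substitution on 4-wide panels: the n%4 remainder
+// rows are solved first, then full 4-row blocks stepping backwards; the
+// pre-inverted diagonal of L is again passed in inv_diag_A.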
+void dtrsv_lt_inv_lib(int m, int n, double *pA, int sda, double *inv_diag_A, double *x, double *y)
+ {
+
+ if(m<=0 || n<=0)
+ return;
+
+ if(n>m)
+ n = m;
+
+ const int bs = 4;
+
+ int i;
+
+ if(x!=y)
+ for(i=0; i<m; i++)
+ y[i] = x[i];
+
+ i=0;
+ if(n%4==1)
+ {
+ kernel_dtrsv_lt_inv_1_lib4(m-n+i+1, &pA[n/bs*bs*sda+(n-i-1)*bs], sda, &inv_diag_A[n-i-1], &y[n-i-1], &y[n-i-1], &y[n-i-1]);
+ i++;
+ }
+ else if(n%4==2)
+ {
+ kernel_dtrsv_lt_inv_2_lib4(m-n+i+2, &pA[n/bs*bs*sda+(n-i-2)*bs], sda, &inv_diag_A[n-i-2], &y[n-i-2], &y[n-i-2], &y[n-i-2]);
+ i+=2;
+ }
+ else if(n%4==3)
+ {
+ kernel_dtrsv_lt_inv_3_lib4(m-n+i+3, &pA[n/bs*bs*sda+(n-i-3)*bs], sda, &inv_diag_A[n-i-3], &y[n-i-3], &y[n-i-3], &y[n-i-3]);
+ i+=3;
+ }
+ for(; i<n-3; i+=4)
+ {
+ kernel_dtrsv_lt_inv_4_lib4(m-n+i+4, &pA[(n-i-4)/bs*bs*sda+(n-i-4)*bs], sda, &inv_diag_A[n-i-4], &y[n-i-4], &y[n-i-4], &y[n-i-4]);
+ }
+
+ }
+
+
+
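+// fused matrix-vector products, sharing a single pass over A:
+// z_n = beta_n * y_n + alpha_n * A   * x_n
+// z_t = beta_t * y_t + alpha_t * A^T * x_t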
+void dgemv_nt_lib(int m, int n, double alpha_n, double alpha_t, double *pA, int sda, double *x_n, double *x_t, double beta_n, double beta_t, double *y_n, double *y_t, double *z_n, double *z_t)
+ {
+
+ if(m<=0 | n<=0)
+ return;
+
+ const int bs = 4;
+
+ int ii;
+
+	// copy and scale y_n into z_n
+ ii = 0;
+ for(; ii<m-3; ii+=4)
+ {
+ z_n[ii+0] = beta_n*y_n[ii+0];
+ z_n[ii+1] = beta_n*y_n[ii+1];
+ z_n[ii+2] = beta_n*y_n[ii+2];
+ z_n[ii+3] = beta_n*y_n[ii+3];
+ }
+ for(; ii<m; ii++)
+ {
+ z_n[ii+0] = beta_n*y_n[ii+0];
+ }
+
+ ii = 0;
+#if defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE)
+ for(; ii<n-5; ii+=6)
+ {
+ kernel_dgemv_nt_6_lib4(m, &alpha_n, &alpha_t, pA+ii*bs, sda, x_n+ii, x_t, &beta_t, y_t+ii, z_n, z_t+ii);
+ }
+#endif
+ for(; ii<n-3; ii+=4)
+ {
+ kernel_dgemv_nt_4_lib4(m, &alpha_n, &alpha_t, pA+ii*bs, sda, x_n+ii, x_t, &beta_t, y_t+ii, z_n, z_t+ii);
+ }
+ if(ii<n)
+ {
+ kernel_dgemv_nt_4_vs_lib4(m, &alpha_n, &alpha_t, pA+ii*bs, sda, x_n+ii, x_t, &beta_t, y_t+ii, z_n, z_t+ii, n-ii);
+ }
+
+ return;
+
+ }
+
+
+
+#if defined(LA_HIGH_PERFORMANCE)
+
+
+
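+// z = beta * y + alpha * A * x ; A is m x n in panel-major storage (panel
+// height 4), so a row offset ai that is not a multiple of 4 is handled by a
+// _gen kernel before the aligned main loop.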
+void dgemv_n_libstr(int m, int n, double alpha, struct d_strmat *sA, int ai, int aj, struct d_strvec *sx, int xi, double beta, struct d_strvec *sy, int yi, struct d_strvec *sz, int zi)
+ {
+
+ if(m<0)
+ return;
+
+ const int bs = 4;
+
+ int i;
+
+ int sda = sA->cn;
+ double *pA = sA->pA + aj*bs + ai/bs*bs*sda;
+ double *x = sx->pa + xi;
+ double *y = sy->pa + yi;
+ double *z = sz->pa + zi;
+
+ i = 0;
+ // clean up at the beginning
+ if(ai%bs!=0)
+ {
+ kernel_dgemv_n_4_gen_lib4(n, &alpha, pA, x, &beta, y-ai%bs, z-ai%bs, ai%bs, m+ai%bs);
+ pA += bs*sda;
+ y += 4 - ai%bs;
+ z += 4 - ai%bs;
+ m -= 4 - ai%bs;
+ }
+ // main loop
+#if defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_HASWELL)
+#if defined(TARGET_X64_INTEL_SANDY_BRIDGE)
+ for( ; i<m-11; i+=12)
+ {
+ kernel_dgemv_n_12_lib4(n, &alpha, &pA[i*sda], sda, x, &beta, &y[i], &z[i]);
+ }
+#endif
+ for( ; i<m-7; i+=8)
+ {
+ kernel_dgemv_n_8_lib4(n, &alpha, &pA[i*sda], sda, x, &beta, &y[i], &z[i]);
+ }
+ if(i<m-3)
+ {
+ kernel_dgemv_n_4_lib4(n, &alpha, &pA[i*sda], x, &beta, &y[i], &z[i]);
+ i+=4;
+ }
+#else
+ for( ; i<m-3; i+=4)
+ {
+ kernel_dgemv_n_4_lib4(n, &alpha, &pA[i*sda], x, &beta, &y[i], &z[i]);
+ }
+#endif
+ if(i<m)
+ {
+ kernel_dgemv_n_4_vs_lib4(n, &alpha, &pA[i*sda], x, &beta, &y[i], &z[i], m-i);
+ }
+
+ return;
+
+ }
+
+
+
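+// z = beta * y + alpha * A^T * x ; A is m x n in panel-major storage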
+void dgemv_t_libstr(int m, int n, double alpha, struct d_strmat *sA, int ai, int aj, struct d_strvec *sx, int xi, double beta, struct d_strvec *sy, int yi, struct d_strvec *sz, int zi)
+ {
+
+ if(n<=0)
+ return;
+
+ const int bs = 4;
+
+ int i;
+
+ int sda = sA->cn;
+ double *pA = sA->pA + aj*bs + ai/bs*bs*sda + ai%bs;
+ double *x = sx->pa + xi;
+ double *y = sy->pa + yi;
+ double *z = sz->pa + zi;
+
+ if(ai%bs==0)
+ {
+ i = 0;
+#if defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_HASWELL)
+#if defined(TARGET_X64_INTEL_SANDY_BRIDGE)
+ for( ; i<n-11; i+=12)
+ {
+ kernel_dgemv_t_12_lib4(m, &alpha, &pA[i*bs], sda, x, &beta, &y[i], &z[i]);
+ }
+#endif
+ for( ; i<n-7; i+=8)
+ {
+ kernel_dgemv_t_8_lib4(m, &alpha, &pA[i*bs], sda, x, &beta, &y[i], &z[i]);
+ }
+ if(i<n-3)
+ {
+ kernel_dgemv_t_4_lib4(m, &alpha, &pA[i*bs], sda, x, &beta, &y[i], &z[i]);
+ i+=4;
+ }
+#else
+ for( ; i<n-3; i+=4)
+ {
+ kernel_dgemv_t_4_lib4(m, &alpha, &pA[i*bs], sda, x, &beta, &y[i], &z[i]);
+ }
+#endif
+ if(i<n)
+ {
+ kernel_dgemv_t_4_vs_lib4(m, &alpha, &pA[i*bs], sda, x, &beta, &y[i], &z[i], n-i);
+ }
+ }
+ else // TODO kernel 8
+ {
+ i = 0;
+ for( ; i<n; i+=4)
+ {
+ kernel_dgemv_t_4_gen_lib4(m, &alpha, ai%bs, &pA[i*bs], sda, x, &beta, &y[i], &z[i], n-i);
+ }
+ }
+
+ return;
+
+ }
+
+
+
+void dgemv_nt_libstr(int m, int n, double alpha_n, double alpha_t, struct d_strmat *sA, int ai, int aj, struct d_strvec *sx_n, int xi_n, struct d_strvec *sx_t, int xi_t, double beta_n, double beta_t, struct d_strvec *sy_n, int yi_n, struct d_strvec *sy_t, int yi_t, struct d_strvec *sz_n, int zi_n, struct d_strvec *sz_t, int zi_t)
+ {
+ if(ai!=0)
+ {
+ printf("\ndgemv_nt_libstr: feature not implemented yet: ai=%d\n", ai);
+ exit(1);
+ }
+ const int bs = 4;
+ int sda = sA->cn;
+ double *pA = sA->pA + aj*bs; // TODO ai
+ double *x_n = sx_n->pa + xi_n;
+ double *x_t = sx_t->pa + xi_t;
+ double *y_n = sy_n->pa + yi_n;
+ double *y_t = sy_t->pa + yi_t;
+ double *z_n = sz_n->pa + zi_n;
+ double *z_t = sz_t->pa + zi_t;
+ dgemv_nt_lib(m, n, alpha_n, alpha_t, pA, sda, x_n, x_t, beta_n, beta_t, y_n, y_t, z_n, z_t);
+ return;
+ }
+
+
+
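+// z = beta * y + alpha * A * x, with A symmetric and only its
+// lower-triangular part referenced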
+void dsymv_l_libstr(int m, int n, double alpha, struct d_strmat *sA, int ai, int aj, struct d_strvec *sx, int xi, double beta, struct d_strvec *sy, int yi, struct d_strvec *sz, int zi)
+ {
+
+ if(m<=0 | n<=0)
+ return;
+
+ const int bs = 4;
+
+ int ii, n1;
+
+ int sda = sA->cn;
+ double *pA = sA->pA + aj*bs + ai/bs*bs*sda + ai%bs;
+ double *x = sx->pa + xi;
+ double *y = sy->pa + yi;
+ double *z = sz->pa + zi;
+
+	// copy and scale y into z
+ ii = 0;
+ for(; ii<m-3; ii+=4)
+ {
+ z[ii+0] = beta*y[ii+0];
+ z[ii+1] = beta*y[ii+1];
+ z[ii+2] = beta*y[ii+2];
+ z[ii+3] = beta*y[ii+3];
+ }
+ for(; ii<m; ii++)
+ {
+ z[ii+0] = beta*y[ii+0];
+ }
+
+ // clean up at the beginning
+ if(ai%bs!=0) // 1, 2, 3
+ {
+ n1 = 4-ai%bs;
+ kernel_dsymv_l_4_gen_lib4(m, &alpha, ai%bs, &pA[0], sda, &x[0], &z[0], n<n1 ? n : n1);
+ pA += n1 + n1*bs + (sda-1)*bs;
+ x += n1;
+ z += n1;
+ m -= n1;
+ n -= n1;
+ }
+ // main loop
+ ii = 0;
+ for(; ii<n-3; ii+=4)
+ {
+ kernel_dsymv_l_4_lib4(m-ii, &alpha, &pA[ii*bs+ii*sda], sda, &x[ii], &z[ii]);
+ }
+ // clean up at the end
+ if(ii<n)
+ {
+ kernel_dsymv_l_4_gen_lib4(m-ii, &alpha, 0, &pA[ii*bs+ii*sda], sda, &x[ii], &z[ii], n-ii);
+ }
+
+ return;
+ }
+
+
+
+// z = A * x ; A lower triangular, assuming m >= n
+void dtrmv_lnn_libstr(int m, int n, struct d_strmat *sA, int ai, int aj, struct d_strvec *sx, int xi, struct d_strvec *sz, int zi)
+ {
+
+ if(m<=0)
+ return;
+
+ const int bs = 4;
+
+ int sda = sA->cn;
+ double *pA = sA->pA + aj*bs + ai/bs*bs*sda + ai%bs;
+ double *x = sx->pa + xi;
+ double *z = sz->pa + zi;
+
+ if(m-n>0)
+ dgemv_n_libstr(m-n, n, 1.0, sA, ai+n, aj, sx, xi, 0.0, sz, zi+n, sz, zi+n);
+
+ double *pA2 = pA;
+ double *z2 = z;
+ int m2 = n;
+ int n2 = 0;
+ double *pA3, *x3;
+
+ double alpha = 1.0;
+ double beta = 1.0;
+
+ double zt[4];
+
+ int ii, jj, jj_end;
+
+ ii = 0;
+
+ if(ai%4!=0)
+ {
+ pA2 += sda*bs - ai%bs;
+ z2 += bs-ai%bs;
+ m2 -= bs-ai%bs;
+ n2 += bs-ai%bs;
+ }
+
+ pA2 += m2/bs*bs*sda;
+ z2 += m2/bs*bs;
+ n2 += m2/bs*bs;
+
+ if(m2%bs!=0)
+ {
+ //
+ pA3 = pA2 + bs*n2;
+ x3 = x + n2;
+ zt[3] = pA3[3+bs*0]*x3[0] + pA3[3+bs*1]*x3[1] + pA3[3+bs*2]*x3[2] + pA3[3+bs*3]*x3[3];
+ zt[2] = pA3[2+bs*0]*x3[0] + pA3[2+bs*1]*x3[1] + pA3[2+bs*2]*x3[2];
+ zt[1] = pA3[1+bs*0]*x3[0] + pA3[1+bs*1]*x3[1];
+ zt[0] = pA3[0+bs*0]*x3[0];
+ kernel_dgemv_n_4_lib4(n2, &alpha, pA2, x, &beta, zt, zt);
+ for(jj=0; jj<m2%bs; jj++)
+ z2[jj] = zt[jj];
+ }
+ for(; ii<m2-3; ii+=4)
+ {
+ pA2 -= bs*sda;
+ z2 -= 4;
+ n2 -= 4;
+ pA3 = pA2 + bs*n2;
+ x3 = x + n2;
+ z2[3] = pA3[3+bs*0]*x3[0] + pA3[3+bs*1]*x3[1] + pA3[3+bs*2]*x3[2] + pA3[3+bs*3]*x3[3];
+ z2[2] = pA3[2+bs*0]*x3[0] + pA3[2+bs*1]*x3[1] + pA3[2+bs*2]*x3[2];
+ z2[1] = pA3[1+bs*0]*x3[0] + pA3[1+bs*1]*x3[1];
+ z2[0] = pA3[0+bs*0]*x3[0];
+ kernel_dgemv_n_4_lib4(n2, &alpha, pA2, x, &beta, z2, z2);
+ }
+ if(ai%4!=0)
+ {
+ if(ai%bs==1)
+ {
+ zt[2] = pA[2+bs*0]*x[0] + pA[2+bs*1]*x[1] + pA[2+bs*2]*x[2];
+ zt[1] = pA[1+bs*0]*x[0] + pA[1+bs*1]*x[1];
+ zt[0] = pA[0+bs*0]*x[0];
+ jj_end = 4-ai%bs<n ? 4-ai%bs : n;
+ for(jj=0; jj<jj_end; jj++)
+ z[jj] = zt[jj];
+ }
+ else if(ai%bs==2)
+ {
+ zt[1] = pA[1+bs*0]*x[0] + pA[1+bs*1]*x[1];
+ zt[0] = pA[0+bs*0]*x[0];
+ jj_end = 4-ai%bs<n ? 4-ai%bs : n;
+ for(jj=0; jj<jj_end; jj++)
+ z[jj] = zt[jj];
+ }
+ else // if (ai%bs==3)
+ {
+ z[0] = pA[0+bs*0]*x[0];
+ }
+ }
+
+ return;
+
+ }
+
+
+
+// z = A^T * x ; A lower triangular, assuming m >= n
+void dtrmv_ltn_libstr(int m, int n, struct d_strmat *sA, int ai, int aj, struct d_strvec *sx, int xi, struct d_strvec *sz, int zi)
+ {
+
+ if(m<=0)
+ return;
+
+ const int bs = 4;
+
+ int sda = sA->cn;
+ double *pA = sA->pA + aj*bs + ai/bs*bs*sda + ai%bs;
+ double *x = sx->pa + xi;
+ double *z = sz->pa + zi;
+
+ double xt[4];
+ double zt[4];
+
+ double alpha = 1.0;
+ double beta = 1.0;
+
+ int ii, jj, ll, ll_max;
+
+ jj = 0;
+
+ if(ai%bs!=0)
+ {
+
+ if(ai%bs==1)
+ {
+ ll_max = m-jj<3 ? m-jj : 3;
+ for(ll=0; ll<ll_max; ll++)
+ xt[ll] = x[ll];
+ for(; ll<3; ll++)
+ xt[ll] = 0.0;
+ zt[0] = pA[0+bs*0]*xt[0] + pA[1+bs*0]*xt[1] + pA[2+bs*0]*xt[2];
+ zt[1] = pA[1+bs*1]*xt[1] + pA[2+bs*1]*xt[2];
+ zt[2] = pA[2+bs*2]*xt[2];
+ pA += bs*sda - 1;
+ x += 3;
+ kernel_dgemv_t_4_lib4(m-3-jj, &alpha, pA, sda, x, &beta, zt, zt);
+ ll_max = n-jj<3 ? n-jj : 3;
+ for(ll=0; ll<ll_max; ll++)
+ z[ll] = zt[ll];
+ pA += bs*3;
+ z += 3;
+ jj += 3;
+ }
+ else if(ai%bs==2)
+ {
+ ll_max = m-jj<2 ? m-jj : 2;
+ for(ll=0; ll<ll_max; ll++)
+ xt[ll] = x[ll];
+ for(; ll<2; ll++)
+ xt[ll] = 0.0;
+ zt[0] = pA[0+bs*0]*xt[0] + pA[1+bs*0]*xt[1];
+ zt[1] = pA[1+bs*1]*xt[1];
+ pA += bs*sda - 2;
+ x += 2;
+ kernel_dgemv_t_4_lib4(m-2-jj, &alpha, pA, sda, x, &beta, zt, zt);
+ ll_max = n-jj<2 ? n-jj : 2;
+ for(ll=0; ll<ll_max; ll++)
+ z[ll] = zt[ll];
+ pA += bs*2;
+ z += 2;
+ jj += 2;
+ }
+ else // if(ai%bs==3)
+ {
+ ll_max = m-jj<1 ? m-jj : 1;
+ for(ll=0; ll<ll_max; ll++)
+ xt[ll] = x[ll];
+ for(; ll<1; ll++)
+ xt[ll] = 0.0;
+ zt[0] = pA[0+bs*0]*xt[0];
+ pA += bs*sda - 3;
+ x += 1;
+ kernel_dgemv_t_4_lib4(m-1-jj, &alpha, pA, sda, x, &beta, zt, zt);
+ ll_max = n-jj<1 ? n-jj : 1;
+ for(ll=0; ll<ll_max; ll++)
+ z[ll] = zt[ll];
+ pA += bs*1;
+ z += 1;
+ jj += 1;
+ }
+
+ }
+
+ for(; jj<n-3; jj+=4)
+ {
+ zt[0] = pA[0+bs*0]*x[0] + pA[1+bs*0]*x[1] + pA[2+bs*0]*x[2] + pA[3+bs*0]*x[3];
+ zt[1] = pA[1+bs*1]*x[1] + pA[2+bs*1]*x[2] + pA[3+bs*1]*x[3];
+ zt[2] = pA[2+bs*2]*x[2] + pA[3+bs*2]*x[3];
+ zt[3] = pA[3+bs*3]*x[3];
+ pA += bs*sda;
+ x += 4;
+ kernel_dgemv_t_4_lib4(m-4-jj, &alpha, pA, sda, x, &beta, zt, z);
+ pA += bs*4;
+ z += 4;
+ }
+ if(jj<n)
+ {
+ ll_max = m-jj<4 ? m-jj : 4;
+ for(ll=0; ll<ll_max; ll++)
+ xt[ll] = x[ll];
+ for(; ll<4; ll++)
+ xt[ll] = 0.0;
+ zt[0] = pA[0+bs*0]*xt[0] + pA[1+bs*0]*xt[1] + pA[2+bs*0]*xt[2] + pA[3+bs*0]*xt[3];
+ zt[1] = pA[1+bs*1]*xt[1] + pA[2+bs*1]*xt[2] + pA[3+bs*1]*xt[3];
+ zt[2] = pA[2+bs*2]*xt[2] + pA[3+bs*2]*xt[3];
+ zt[3] = pA[3+bs*3]*xt[3];
+ pA += bs*sda;
+ x += 4;
+ kernel_dgemv_t_4_lib4(m-4-jj, &alpha, pA, sda, x, &beta, zt, zt);
+ for(ll=0; ll<n-jj; ll++)
+ z[ll] = zt[ll];
+// pA += bs*4;
+// z += 4;
+ }
+
+ return;
+
+ }
+
+
+
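+// z = A * x, with A upper triangular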
+void dtrmv_unn_libstr(int m, struct d_strmat *sA, int ai, int aj, struct d_strvec *sx, int xi, struct d_strvec *sz, int zi)
+ {
+
+ if(m<=0)
+ return;
+
+ if(ai!=0)
+ {
+ printf("\ndtrmv_unn_libstr: feature not implemented yet: ai=%d\n", ai);
+ exit(1);
+ }
+
+ const int bs = 4;
+
+ int sda = sA->cn;
+ double *pA = sA->pA + aj*bs; // TODO ai
+ double *x = sx->pa + xi;
+ double *z = sz->pa + zi;
+
+ int i;
+
+ i=0;
+#if defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE)
+ for(; i<m-7; i+=8)
+ {
+ kernel_dtrmv_un_8_lib4(m-i, pA, sda, x, z);
+ pA += 8*sda+8*bs;
+ x += 8;
+ z += 8;
+ }
+#endif
+ for(; i<m-3; i+=4)
+ {
+ kernel_dtrmv_un_4_lib4(m-i, pA, x, z);
+ pA += 4*sda+4*bs;
+ x += 4;
+ z += 4;
+ }
+ if(m>i)
+ {
+ if(m-i==1)
+ {
+ z[0] = pA[0+bs*0]*x[0];
+ }
+ else if(m-i==2)
+ {
+ z[0] = pA[0+bs*0]*x[0] + pA[0+bs*1]*x[1];
+ z[1] = pA[1+bs*1]*x[1];
+ }
+ else // if(m-i==3)
+ {
+ z[0] = pA[0+bs*0]*x[0] + pA[0+bs*1]*x[1] + pA[0+bs*2]*x[2];
+ z[1] = pA[1+bs*1]*x[1] + pA[1+bs*2]*x[2];
+ z[2] = pA[2+bs*2]*x[2];
+ }
+ }
+
+ return;
+
+ }
+
+
+
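+// z = A^T * x, with A upper triangular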
+void dtrmv_utn_libstr(int m, struct d_strmat *sA, int ai, int aj, struct d_strvec *sx, int xi, struct d_strvec *sz, int zi)
+ {
+
+ if(m<=0)
+ return;
+
+ if(ai!=0)
+ {
+ printf("\ndtrmv_utn_libstr: feature not implemented yet: ai=%d\n", ai);
+ exit(1);
+ }
+
+ const int bs = 4;
+
+ int sda = sA->cn;
+ double *pA = sA->pA + aj*bs; // TODO ai
+ double *x = sx->pa + xi;
+ double *z = sz->pa + zi;
+
+ int ii, idx;
+
+ double *ptrA;
+
+ ii=0;
+ idx = m/bs*bs;
+ if(m%bs!=0)
+ {
+ kernel_dtrmv_ut_4_vs_lib4(m, pA+idx*bs, sda, x, z+idx, m%bs);
+ ii += m%bs;
+ }
+ idx -= 4;
+ for(; ii<m; ii+=4)
+ {
+ kernel_dtrmv_ut_4_lib4(idx+4, pA+idx*bs, sda, x, z+idx);
+ idx -= 4;
+ }
+
+ return;
+
+ }
+
+
+
+void dtrsv_lnn_mn_libstr(int m, int n, struct d_strmat *sA, int ai, int aj, struct d_strvec *sx, int xi, struct d_strvec *sz, int zi)
+ {
+ if(m==0 | n==0)
+ return;
+#if defined(DIM_CHECK)
+ // non-negative size
+ if(m<0) printf("\n****** dtrsv_lnn_mn_libstr : m<0 : %d<0 *****\n", m);
+ if(n<0) printf("\n****** dtrsv_lnn_mn_libstr : n<0 : %d<0 *****\n", n);
+ // non-negative offset
+ if(ai<0) printf("\n****** dtrsv_lnn_mn_libstr : ai<0 : %d<0 *****\n", ai);
+ if(aj<0) printf("\n****** dtrsv_lnn_mn_libstr : aj<0 : %d<0 *****\n", aj);
+ if(xi<0) printf("\n****** dtrsv_lnn_mn_libstr : xi<0 : %d<0 *****\n", xi);
+ if(zi<0) printf("\n****** dtrsv_lnn_mn_libstr : zi<0 : %d<0 *****\n", zi);
+ // inside matrix
+ // A: m x k
+ if(ai+m > sA->m) printf("\n***** dtrsv_lnn_mn_libstr : ai+m > row(A) : %d+%d > %d *****\n", ai, m, sA->m);
+ if(aj+n > sA->n) printf("\n***** dtrsv_lnn_mn_libstr : aj+n > col(A) : %d+%d > %d *****\n", aj, n, sA->n);
+ // x: m
+ if(xi+m > sx->m) printf("\n***** dtrsv_lnn_mn_libstr : xi+m > size(x) : %d+%d > %d *****\n", xi, m, sx->m);
+ // z: m
+ if(zi+m > sz->m) printf("\n***** dtrsv_lnn_mn_libstr : zi+m > size(z) : %d+%d > %d *****\n", zi, m, sz->m);
+#endif
+ if(ai!=0 | xi%4!=0)
+ {
+		printf("\ndtrsv_lnn_mn_libstr: feature not implemented yet: ai=%d, xi=%d\n", ai, xi);
+ exit(1);
+ }
+ const int bs = 4;
+ int sda = sA->cn;
+ double *pA = sA->pA + aj*bs; // TODO ai
+ double *dA = sA->dA;
+ double *x = sx->pa + xi;
+ double *z = sz->pa + zi;
+ int ii;
+ if(ai==0 & aj==0)
+ {
+ if(sA->use_dA!=1)
+ {
+ ddiaex_lib(n, 1.0, ai, pA, sda, dA);
+ for(ii=0; ii<n; ii++)
+ dA[ii] = 1.0 / dA[ii];
+ sA->use_dA = 1;
+ }
+ }
+ else
+ {
+ ddiaex_lib(n, 1.0, ai, pA, sda, dA);
+ for(ii=0; ii<n; ii++)
+ dA[ii] = 1.0 / dA[ii];
+ sA->use_dA = 0;
+ }
+ dtrsv_ln_inv_lib(m, n, pA, sda, dA, x, z);
+ return;
+ }
+
+
+
+void dtrsv_ltn_mn_libstr(int m, int n, struct d_strmat *sA, int ai, int aj, struct d_strvec *sx, int xi, struct d_strvec *sz, int zi)
+ {
+ if(m==0)
+ return;
+#if defined(DIM_CHECK)
+ // non-negative size
+ if(m<0) printf("\n****** dtrsv_ltn_mn_libstr : m<0 : %d<0 *****\n", m);
+ if(n<0) printf("\n****** dtrsv_ltn_mn_libstr : n<0 : %d<0 *****\n", n);
+ // non-negative offset
+ if(ai<0) printf("\n****** dtrsv_ltn_mn_libstr : ai<0 : %d<0 *****\n", ai);
+ if(aj<0) printf("\n****** dtrsv_ltn_mn_libstr : aj<0 : %d<0 *****\n", aj);
+ if(xi<0) printf("\n****** dtrsv_ltn_mn_libstr : xi<0 : %d<0 *****\n", xi);
+ if(zi<0) printf("\n****** dtrsv_ltn_mn_libstr : zi<0 : %d<0 *****\n", zi);
+ // inside matrix
+ // A: m x k
+ if(ai+m > sA->m) printf("\n***** dtrsv_ltn_mn_libstr : ai+m > row(A) : %d+%d > %d *****\n", ai, m, sA->m);
+ if(aj+n > sA->n) printf("\n***** dtrsv_ltn_mn_libstr : aj+n > col(A) : %d+%d > %d *****\n", aj, n, sA->n);
+ // x: m
+ if(xi+m > sx->m) printf("\n***** dtrsv_ltn_mn_libstr : xi+m > size(x) : %d+%d > %d *****\n", xi, m, sx->m);
+ // z: m
+ if(zi+m > sz->m) printf("\n***** dtrsv_ltn_mn_libstr : zi+m > size(z) : %d+%d > %d *****\n", zi, m, sz->m);
+#endif
+ if(ai!=0 | xi%4!=0)
+ {
+		printf("\ndtrsv_ltn_mn_libstr: feature not implemented yet: ai=%d, xi=%d\n", ai, xi);
+ exit(1);
+ }
+ const int bs = 4;
+ int sda = sA->cn;
+ double *pA = sA->pA + aj*bs; // TODO ai
+ double *dA = sA->dA;
+ double *x = sx->pa + xi;
+ double *z = sz->pa + zi;
+ int ii;
+ if(ai==0 & aj==0)
+ {
+ if(sA->use_dA!=1)
+ {
+ ddiaex_lib(n, 1.0, ai, pA, sda, dA);
+ for(ii=0; ii<n; ii++)
+ dA[ii] = 1.0 / dA[ii];
+ sA->use_dA = 1;
+ }
+ }
+ else
+ {
+ ddiaex_lib(n, 1.0, ai, pA, sda, dA);
+ for(ii=0; ii<n; ii++)
+ dA[ii] = 1.0 / dA[ii];
+ sA->use_dA = 0;
+ }
+ dtrsv_lt_inv_lib(m, n, pA, sda, dA, x, z);
+ return;
+ }
+
+
+
+void dtrsv_lnn_libstr(int m, struct d_strmat *sA, int ai, int aj, struct d_strvec *sx, int xi, struct d_strvec *sz, int zi)
+ {
+ if(m==0)
+ return;
+#if defined(DIM_CHECK)
+ // non-negative size
+ if(m<0) printf("\n****** dtrsv_lnn_libstr : m<0 : %d<0 *****\n", m);
+ // non-negative offset
+ if(ai<0) printf("\n****** dtrsv_lnn_libstr : ai<0 : %d<0 *****\n", ai);
+ if(aj<0) printf("\n****** dtrsv_lnn_libstr : aj<0 : %d<0 *****\n", aj);
+ if(xi<0) printf("\n****** dtrsv_lnn_libstr : xi<0 : %d<0 *****\n", xi);
+ if(zi<0) printf("\n****** dtrsv_lnn_libstr : zi<0 : %d<0 *****\n", zi);
+ // inside matrix
+ // A: m x k
+ if(ai+m > sA->m) printf("\n***** dtrsv_lnn_libstr : ai+m > row(A) : %d+%d > %d *****\n", ai, m, sA->m);
+ if(aj+m > sA->n) printf("\n***** dtrsv_lnn_libstr : aj+m > col(A) : %d+%d > %d *****\n", aj, m, sA->n);
+ // x: m
+ if(xi+m > sx->m) printf("\n***** dtrsv_lnn_libstr : xi+m > size(x) : %d+%d > %d *****\n", xi, m, sx->m);
+ // z: m
+ if(zi+m > sz->m) printf("\n***** dtrsv_lnn_libstr : zi+m > size(z) : %d+%d > %d *****\n", zi, m, sz->m);
+#endif
+ if(ai!=0 | xi%4!=0)
+ {
+		printf("\ndtrsv_lnn_libstr: feature not implemented yet: ai=%d, xi=%d\n", ai, xi);
+ exit(1);
+ }
+ const int bs = 4;
+ int sda = sA->cn;
+ double *pA = sA->pA + aj*bs; // TODO ai
+ double *dA = sA->dA;
+ double *x = sx->pa + xi;
+ double *z = sz->pa + zi;
+ int ii;
+ if(ai==0 & aj==0)
+ {
+ if(sA->use_dA!=1)
+ {
+ ddiaex_lib(m, 1.0, ai, pA, sda, dA);
+ for(ii=0; ii<m; ii++)
+ dA[ii] = 1.0 / dA[ii];
+ sA->use_dA = 1;
+ }
+ }
+ else
+ {
+ ddiaex_lib(m, 1.0, ai, pA, sda, dA);
+ for(ii=0; ii<m; ii++)
+ dA[ii] = 1.0 / dA[ii];
+ sA->use_dA = 0;
+ }
+ dtrsv_ln_inv_lib(m, m, pA, sda, dA, x, z);
+ return;
+ }
+
+
+
+void dtrsv_lnu_libstr(int m, struct d_strmat *sA, int ai, int aj, struct d_strvec *sx, int xi, struct d_strvec *sz, int zi)
+ {
+ if(m==0)
+ return;
+#if defined(DIM_CHECK)
+ // non-negative size
+ if(m<0) printf("\n****** dtrsv_lnu_libstr : m<0 : %d<0 *****\n", m);
+ // non-negative offset
+ if(ai<0) printf("\n****** dtrsv_lnu_libstr : ai<0 : %d<0 *****\n", ai);
+ if(aj<0) printf("\n****** dtrsv_lnu_libstr : aj<0 : %d<0 *****\n", aj);
+ if(xi<0) printf("\n****** dtrsv_lnu_libstr : xi<0 : %d<0 *****\n", xi);
+ if(zi<0) printf("\n****** dtrsv_lnu_libstr : zi<0 : %d<0 *****\n", zi);
+ // inside matrix
+ // A: m x k
+ if(ai+m > sA->m) printf("\n***** dtrsv_lnu_libstr : ai+m > row(A) : %d+%d > %d *****\n", ai, m, sA->m);
+ if(aj+m > sA->n) printf("\n***** dtrsv_lnu_libstr : aj+m > col(A) : %d+%d > %d *****\n", aj, m, sA->n);
+ // x: m
+ if(xi+m > sx->m) printf("\n***** dtrsv_lnu_libstr : xi+m > size(x) : %d+%d > %d *****\n", xi, m, sx->m);
+ // z: m
+ if(zi+m > sz->m) printf("\n***** dtrsv_lnu_libstr : zi+m > size(z) : %d+%d > %d *****\n", zi, m, sz->m);
+#endif
+ printf("\n***** dtrsv_lnu_libstr : feature not implemented yet *****\n");
+ exit(1);
+ }
+
+
+
+void dtrsv_ltn_libstr(int m, struct d_strmat *sA, int ai, int aj, struct d_strvec *sx, int xi, struct d_strvec *sz, int zi)
+ {
+ if(m==0)
+ return;
+#if defined(DIM_CHECK)
+ // non-negative size
+ if(m<0) printf("\n****** dtrsv_ltn_libstr : m<0 : %d<0 *****\n", m);
+ // non-negative offset
+ if(ai<0) printf("\n****** dtrsv_ltn_libstr : ai<0 : %d<0 *****\n", ai);
+ if(aj<0) printf("\n****** dtrsv_ltn_libstr : aj<0 : %d<0 *****\n", aj);
+ if(xi<0) printf("\n****** dtrsv_ltn_libstr : xi<0 : %d<0 *****\n", xi);
+ if(zi<0) printf("\n****** dtrsv_ltn_libstr : zi<0 : %d<0 *****\n", zi);
+ // inside matrix
+ // A: m x k
+ if(ai+m > sA->m) printf("\n***** dtrsv_ltn_libstr : ai+m > row(A) : %d+%d > %d *****\n", ai, m, sA->m);
+ if(aj+m > sA->n) printf("\n***** dtrsv_ltn_libstr : aj+m > col(A) : %d+%d > %d *****\n", aj, m, sA->n);
+ // x: m
+ if(xi+m > sx->m) printf("\n***** dtrsv_ltn_libstr : xi+m > size(x) : %d+%d > %d *****\n", xi, m, sx->m);
+ // z: m
+ if(zi+m > sz->m) printf("\n***** dtrsv_ltn_libstr : zi+m > size(z) : %d+%d > %d *****\n", zi, m, sz->m);
+#endif
+ if(ai!=0 | xi%4!=0)
+ {
+		printf("\ndtrsv_ltn_libstr: feature not implemented yet: ai=%d, xi=%d\n", ai, xi);
+ exit(1);
+ }
+ const int bs = 4;
+ int sda = sA->cn;
+ double *pA = sA->pA + aj*bs; // TODO ai
+ double *dA = sA->dA;
+ double *x = sx->pa + xi;
+ double *z = sz->pa + zi;
+ int ii;
+ if(ai==0 & aj==0)
+ {
+ if(sA->use_dA!=1)
+ {
+ ddiaex_lib(m, 1.0, ai, pA, sda, dA);
+ for(ii=0; ii<m; ii++)
+ dA[ii] = 1.0 / dA[ii];
+ sA->use_dA = 1;
+ }
+ }
+ else
+ {
+ ddiaex_lib(m, 1.0, ai, pA, sda, dA);
+ for(ii=0; ii<m; ii++)
+ dA[ii] = 1.0 / dA[ii];
+ sA->use_dA = 0;
+ }
+ dtrsv_lt_inv_lib(m, m, pA, sda, dA, x, z);
+ return;
+ }
+
+
+
+void dtrsv_ltu_libstr(int m, struct d_strmat *sA, int ai, int aj, struct d_strvec *sx, int xi, struct d_strvec *sz, int zi)
+ {
+ if(m==0)
+ return;
+#if defined(DIM_CHECK)
+ // non-negative size
+ if(m<0) printf("\n****** dtrsv_ltu_libstr : m<0 : %d<0 *****\n", m);
+ // non-negative offset
+ if(ai<0) printf("\n****** dtrsv_ltu_libstr : ai<0 : %d<0 *****\n", ai);
+ if(aj<0) printf("\n****** dtrsv_ltu_libstr : aj<0 : %d<0 *****\n", aj);
+ if(xi<0) printf("\n****** dtrsv_ltu_libstr : xi<0 : %d<0 *****\n", xi);
+ if(zi<0) printf("\n****** dtrsv_ltu_libstr : zi<0 : %d<0 *****\n", zi);
+ // inside matrix
+ // A: m x k
+ if(ai+m > sA->m) printf("\n***** dtrsv_ltu_libstr : ai+m > row(A) : %d+%d > %d *****\n", ai, m, sA->m);
+ if(aj+m > sA->n) printf("\n***** dtrsv_ltu_libstr : aj+m > col(A) : %d+%d > %d *****\n", aj, m, sA->n);
+ // x: m
+ if(xi+m > sx->m) printf("\n***** dtrsv_ltu_libstr : xi+m > size(x) : %d+%d > %d *****\n", xi, m, sx->m);
+ // z: m
+ if(zi+m > sz->m) printf("\n***** dtrsv_ltu_libstr : zi+m > size(z) : %d+%d > %d *****\n", zi, m, sz->m);
+#endif
+ printf("\n***** dtrsv_ltu_libstr : feature not implemented yet *****\n");
+ exit(1);
+ }
+
+
+
+void dtrsv_unn_libstr(int m, struct d_strmat *sA, int ai, int aj, struct d_strvec *sx, int xi, struct d_strvec *sz, int zi)
+ {
+ if(m==0)
+ return;
+#if defined(DIM_CHECK)
+ // non-negative size
+ if(m<0) printf("\n****** dtrsv_unn_libstr : m<0 : %d<0 *****\n", m);
+ // non-negative offset
+ if(ai<0) printf("\n****** dtrsv_unn_libstr : ai<0 : %d<0 *****\n", ai);
+ if(aj<0) printf("\n****** dtrsv_unn_libstr : aj<0 : %d<0 *****\n", aj);
+ if(xi<0) printf("\n****** dtrsv_unn_libstr : xi<0 : %d<0 *****\n", xi);
+ if(zi<0) printf("\n****** dtrsv_unn_libstr : zi<0 : %d<0 *****\n", zi);
+ // inside matrix
+ // A: m x k
+ if(ai+m > sA->m) printf("\n***** dtrsv_unn_libstr : ai+m > row(A) : %d+%d > %d *****\n", ai, m, sA->m);
+ if(aj+m > sA->n) printf("\n***** dtrsv_unn_libstr : aj+m > col(A) : %d+%d > %d *****\n", aj, m, sA->n);
+ // x: m
+ if(xi+m > sx->m) printf("\n***** dtrsv_unn_libstr : xi+m > size(x) : %d+%d > %d *****\n", xi, m, sx->m);
+ // z: m
+ if(zi+m > sz->m) printf("\n***** dtrsv_unn_libstr : zi+m > size(z) : %d+%d > %d *****\n", zi, m, sz->m);
+#endif
+ printf("\n***** dtrsv_unn_libstr : feature not implemented yet *****\n");
+ exit(1);
+ }
+
+
+
+void dtrsv_utn_libstr(int m, struct d_strmat *sA, int ai, int aj, struct d_strvec *sx, int xi, struct d_strvec *sz, int zi)
+ {
+ if(m==0)
+ return;
+#if defined(DIM_CHECK)
+ // non-negative size
+ if(m<0) printf("\n****** dtrsv_utn_libstr : m<0 : %d<0 *****\n", m);
+ // non-negative offset
+ if(ai<0) printf("\n****** dtrsv_utn_libstr : ai<0 : %d<0 *****\n", ai);
+ if(aj<0) printf("\n****** dtrsv_utn_libstr : aj<0 : %d<0 *****\n", aj);
+ if(xi<0) printf("\n****** dtrsv_utn_libstr : xi<0 : %d<0 *****\n", xi);
+ if(zi<0) printf("\n****** dtrsv_utn_libstr : zi<0 : %d<0 *****\n", zi);
+ // inside matrix
+ // A: m x k
+ if(ai+m > sA->m) printf("\n***** dtrsv_utn_libstr : ai+m > row(A) : %d+%d > %d *****\n", ai, m, sA->m);
+ if(aj+m > sA->n) printf("\n***** dtrsv_utn_libstr : aj+m > col(A) : %d+%d > %d *****\n", aj, m, sA->n);
+ // x: m
+ if(xi+m > sx->m) printf("\n***** dtrsv_utn_libstr : xi+m > size(x) : %d+%d > %d *****\n", xi, m, sx->m);
+ // z: m
+ if(zi+m > sz->m) printf("\n***** dtrsv_utn_libstr : zi+m > size(z) : %d+%d > %d *****\n", zi, m, sz->m);
+#endif
+ printf("\n***** dtrsv_utn_libstr : feature not implemented yet *****\n");
+ exit(1);
+ }
+
+
+
+#else
+
+#error : wrong LA choice
+
+#endif
diff --git a/blas/d_blas3_diag_lib.c b/blas/d_blas3_diag_lib.c
new file mode 100644
index 0000000..ff69317
--- /dev/null
+++ b/blas/d_blas3_diag_lib.c
@@ -0,0 +1,47 @@
+/**************************************************************************************************
+* *
+* This file is part of BLASFEO. *
+* *
+* BLASFEO -- BLAS For Embedded Optimization. *
+* Copyright (C) 2016-2017 by Gianluca Frison. *
+* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl. *
+* All rights reserved. *
+* *
+* BLASFEO is free software; you can redistribute it and/or *
+* modify it under the terms of the GNU Lesser General Public *
+* License as published by the Free Software Foundation; either *
+* version 2.1 of the License, or (at your option) any later version. *
+* *
+* BLASFEO is distributed in the hope that it will be useful, *
+* but WITHOUT ANY WARRANTY; without even the implied warranty of *
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. *
+* See the GNU Lesser General Public License for more details. *
+* *
+* You should have received a copy of the GNU Lesser General Public *
+* License along with BLASFEO; if not, write to the Free Software *
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA *
+* *
+* Author: Gianluca Frison, giaf (at) dtu.dk *
+* gianluca.frison (at) imtek.uni-freiburg.de *
+* *
+**************************************************************************************************/
+
+#include <stdlib.h>
+#include <stdio.h>
+
+#include "../include/blasfeo_common.h"
+#include "../include/blasfeo_d_kernel.h"
+
+
+
+#define REAL double
+
+#define STRMAT d_strmat
+#define STRVEC d_strvec
+
+#define GEMM_R_DIAG_LIBSTR dgemm_r_diag_libstr
+#define GEMM_L_DIAG_LIBSTR dgemm_l_diag_libstr
+
+
+
+#include "x_blas3_diag_lib.c"
diff --git a/blas/d_blas3_diag_lib4.c b/blas/d_blas3_diag_lib4.c
new file mode 100644
index 0000000..2731d1f
--- /dev/null
+++ b/blas/d_blas3_diag_lib4.c
@@ -0,0 +1,184 @@
+/**************************************************************************************************
+* *
+* This file is part of BLASFEO. *
+* *
+* BLASFEO -- BLAS For Embedded Optimization. *
+* Copyright (C) 2016-2017 by Gianluca Frison. *
+* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl. *
+* All rights reserved. *
+* *
+* BLASFEO is free software; you can redistribute it and/or *
+* modify it under the terms of the GNU Lesser General Public *
+* License as published by the Free Software Foundation; either *
+* version 2.1 of the License, or (at your option) any later version. *
+* *
+* BLASFEO is distributed in the hope that it will be useful, *
+* but WITHOUT ANY WARRANTY; without even the implied warranty of *
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. *
+* See the GNU Lesser General Public License for more details. *
+* *
+* You should have received a copy of the GNU Lesser General Public *
+* License along with BLASFEO; if not, write to the Free Software *
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA *
+* *
+* Author: Gianluca Frison, giaf (at) dtu.dk *
+* gianluca.frison (at) imtek.uni-freiburg.de *
+* *
+**************************************************************************************************/
+
+#include <stdlib.h>
+#include <stdio.h>
+
+#include "../include/blasfeo_common.h"
+#include "../include/blasfeo_d_kernel.h"
+
+
+
+/****************************
+* old interface
+****************************/
+
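+// D = beta * C + alpha * diag(dA) * B, i.e. row i of B is scaled by dA[i]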
+void dgemm_diag_left_lib(int m, int n, double alpha, double *dA, double *pB, int sdb, double beta, double *pC, int sdc, double *pD, int sdd)
+ {
+
+ if(m<=0 || n<=0)
+ return;
+
+ const int bs = 4;
+
+ int ii;
+
+ ii = 0;
+ if(beta==0.0)
+ {
+ for( ; ii<m-3; ii+=4)
+ {
+ kernel_dgemm_diag_left_4_a0_lib4(n, &alpha, &dA[ii], &pB[ii*sdb], &pD[ii*sdd]);
+ }
+ }
+ else
+ {
+ for( ; ii<m-3; ii+=4)
+ {
+ kernel_dgemm_diag_left_4_lib4(n, &alpha, &dA[ii], &pB[ii*sdb], &beta, &pC[ii*sdc], &pD[ii*sdd]);
+ }
+ }
+ if(m-ii>0)
+ {
+ if(m-ii==1)
+ kernel_dgemm_diag_left_1_lib4(n, &alpha, &dA[ii], &pB[ii*sdb], &beta, &pC[ii*sdc], &pD[ii*sdd]);
+ else if(m-ii==2)
+ kernel_dgemm_diag_left_2_lib4(n, &alpha, &dA[ii], &pB[ii*sdb], &beta, &pC[ii*sdc], &pD[ii*sdd]);
+ else // if(m-ii==3)
+ kernel_dgemm_diag_left_3_lib4(n, &alpha, &dA[ii], &pB[ii*sdb], &beta, &pC[ii*sdc], &pD[ii*sdd]);
+ }
+
+ }
+
+
+
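+// D = beta * C + alpha * A * diag(dB), i.e. column j of A is scaled by dB[j]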
+void dgemm_diag_right_lib(int m, int n, double alpha, double *pA, int sda, double *dB, double beta, double *pC, int sdc, double *pD, int sdd)
+ {
+
+ if(m<=0 || n<=0)
+ return;
+
+ const int bs = 4;
+
+ int ii;
+
+ ii = 0;
+ if(beta==0.0)
+ {
+ for( ; ii<n-3; ii+=4)
+ {
+ kernel_dgemm_diag_right_4_a0_lib4(m, &alpha, &pA[ii*bs], sda, &dB[ii], &pD[ii*bs], sdd);
+ }
+ }
+ else
+ {
+ for( ; ii<n-3; ii+=4)
+ {
+ kernel_dgemm_diag_right_4_lib4(m, &alpha, &pA[ii*bs], sda, &dB[ii], &beta, &pC[ii*bs], sdc, &pD[ii*bs], sdd);
+ }
+ }
+ if(n-ii>0)
+ {
+ if(n-ii==1)
+ kernel_dgemm_diag_right_1_lib4(m, &alpha, &pA[ii*bs], sda, &dB[ii], &beta, &pC[ii*bs], sdc, &pD[ii*bs], sdd);
+ else if(n-ii==2)
+ kernel_dgemm_diag_right_2_lib4(m, &alpha, &pA[ii*bs], sda, &dB[ii], &beta, &pC[ii*bs], sdc, &pD[ii*bs], sdd);
+ else // if(n-ii==3)
+ kernel_dgemm_diag_right_3_lib4(m, &alpha, &pA[ii*bs], sda, &dB[ii], &beta, &pC[ii*bs], sdc, &pD[ii*bs], sdd);
+ }
+
+ }
+
+
+
+/****************************
+* new interface
+****************************/
+
+
+
+#if defined(LA_HIGH_PERFORMANCE)
+
+
+
+// dgemm with A diagonal matrix (stored as strvec)
+void dgemm_l_diag_libstr(int m, int n, double alpha, struct d_strvec *sA, int ai, struct d_strmat *sB, int bi, int bj, double beta, struct d_strmat *sC, int ci, int cj, struct d_strmat *sD, int di, int dj)
+ {
+ if(m<=0 | n<=0)
+ return;
+ if(bi!=0 | ci!=0 | di!=0)
+ {
+ printf("\ndgemm_l_diag_libstr: feature not implemented yet: bi=%d, ci=%d, di=%d\n", bi, ci, di);
+ exit(1);
+ }
+ const int bs = 4;
+ int sdb = sB->cn;
+ int sdc = sC->cn;
+ int sdd = sD->cn;
+ double *dA = sA->pa + ai;
+ double *pB = sB->pA + bj*bs;
+ double *pC = sC->pA + cj*bs;
+ double *pD = sD->pA + dj*bs;
+ dgemm_diag_left_lib(m, n, alpha, dA, pB, sdb, beta, pC, sdc, pD, sdd);
+ return;
+ }
+
+
+
+// dgemm with B diagonal matrix (stored as strvec)
+void dgemm_r_diag_libstr(int m, int n, double alpha, struct d_strmat *sA, int ai, int aj, struct d_strvec *sB, int bi, double beta, struct d_strmat *sC, int ci, int cj, struct d_strmat *sD, int di, int dj)
+ {
+ if(m<=0 | n<=0)
+ return;
+ if(ai!=0 | ci!=0 | di!=0)
+ {
+ printf("\ndgemm_r_diag_libstr: feature not implemented yet: ai=%d, ci=%d, di=%d\n", ai, ci, di);
+ exit(1);
+ }
+ const int bs = 4;
+ int sda = sA->cn;
+ int sdc = sC->cn;
+ int sdd = sD->cn;
+ double *pA = sA->pA + aj*bs;
+ double *dB = sB->pa + bi;
+ double *pC = sC->pA + cj*bs;
+ double *pD = sD->pA + dj*bs;
+ dgemm_diag_right_lib(m, n, alpha, pA, sda, dB, beta, pC, sdc, pD, sdd);
+ return;
+ }
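+
+// Usage sketch (illustrative only; it assumes the matrices and the vector
+// were created with the aux-module allocators d_allocate_strmat /
+// d_allocate_strvec). The call scales column j of A by b[j]:
+//
+//	dgemm_r_diag_libstr(m, n, 1.0, &sA, 0, 0, &sb, 0, 0.0, &sC, 0, 0, &sD, 0, 0);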
+
+
+
+#else
+
+#error : wrong LA choice
+
+#endif
+
+
+
diff --git a/blas/d_blas3_lib.c b/blas/d_blas3_lib.c
new file mode 100644
index 0000000..27c20ab
--- /dev/null
+++ b/blas/d_blas3_lib.c
@@ -0,0 +1,69 @@
+/**************************************************************************************************
+* *
+* This file is part of BLASFEO. *
+* *
+* BLASFEO -- BLAS For Embedded Optimization. *
+* Copyright (C) 2016-2017 by Gianluca Frison. *
+* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl. *
+* All rights reserved. *
+* *
+* BLASFEO is free software; you can redistribute it and/or *
+* modify it under the terms of the GNU Lesser General Public *
+* License as published by the Free Software Foundation; either *
+* version 2.1 of the License, or (at your option) any later version. *
+* *
+* BLASFEO is distributed in the hope that it will be useful, *
+* but WITHOUT ANY WARRANTY; without even the implied warranty of *
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. *
+* See the GNU Lesser General Public License for more details. *
+* *
+* You should have received a copy of the GNU Lesser General Public *
+* License along with BLASFEO; if not, write to the Free Software *
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA *
+* *
+* Author: Gianluca Frison, giaf (at) dtu.dk *
+* gianluca.frison (at) imtek.uni-freiburg.de *
+* *
+**************************************************************************************************/
+
+#include <stdlib.h>
+#include <stdio.h>
+
+#if defined(LA_BLAS)
+#if defined(REF_BLAS_BLIS)
+#include "d_blas_64.h"
+#else
+#include "d_blas.h"
+#endif
+#endif
+
+#include "../include/blasfeo_common.h"
+#include "../include/blasfeo_d_aux.h"
+
+
+
+#define REAL double
+
+#define STRMAT d_strmat
+
+#define GEMM_NN_LIBSTR dgemm_nn_libstr
+#define GEMM_NT_LIBSTR dgemm_nt_libstr
+#define SYRK_LN_LIBSTR dsyrk_ln_libstr
+#define SYRK_LN_MN_LIBSTR dsyrk_ln_mn_libstr
+#define TRMM_RLNN_LIBSTR dtrmm_rlnn_libstr
+#define TRMM_RUTN_LIBSTR dtrmm_rutn_libstr
+#define TRSM_LLNU_LIBSTR dtrsm_llnu_libstr
+#define TRSM_LUNN_LIBSTR dtrsm_lunn_libstr
+#define TRSM_RLTN_LIBSTR dtrsm_rltn_libstr
+#define TRSM_RLTU_LIBSTR dtrsm_rltu_libstr
+#define TRSM_RUTN_LIBSTR dtrsm_rutn_libstr
+
+#define COPY dcopy_
+#define GEMM dgemm_
+#define SYRK dsyrk_
+#define TRMM dtrmm_
+#define TRSM dtrsm_
+
+
+
+#include "x_blas3_lib.c"
diff --git a/blas/d_blas3_lib4.c b/blas/d_blas3_lib4.c
new file mode 100644
index 0000000..dfa3cb8
--- /dev/null
+++ b/blas/d_blas3_lib4.c
@@ -0,0 +1,2728 @@
+/**************************************************************************************************
+* *
+* This file is part of BLASFEO. *
+* *
+* BLASFEO -- BLAS For Embedded Optimization. *
+* Copyright (C) 2016-2017 by Gianluca Frison. *
+* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl. *
+* All rights reserved. *
+* *
+* HPMPC is free software; you can redistribute it and/or *
+* modify it under the terms of the GNU Lesser General Public *
+* License as published by the Free Software Foundation; either *
+* version 2.1 of the License, or (at your option) any later version. *
+* *
+* HPMPC is distributed in the hope that it will be useful, *
+* but WITHOUT ANY WARRANTY; without even the implied warranty of *
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. *
+* See the GNU Lesser General Public License for more details. *
+* *
+* You should have received a copy of the GNU Lesser General Public *
+* License along with HPMPC; if not, write to the Free Software *
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA *
+* *
+* Author: Gianluca Frison, giaf (at) dtu.dk *
+* gianluca.frison (at) imtek.uni-freiburg.de *
+* *
+**************************************************************************************************/
+
+#include <stdlib.h>
+#include <stdio.h>
+
+#include "../include/blasfeo_common.h"
+#include "../include/blasfeo_d_kernel.h"
+#include "../include/blasfeo_d_aux.h"
+
+
+
+/****************************
+* old interface
+****************************/
+
+void dgemm_nt_lib(int m, int n, int k, double alpha, double *pA, int sda, double *pB, int sdb, double beta, double *pC, int sdc, double *pD, int sdd)
+ {
+
+ if(m<=0 || n<=0)
+ return;
+
+ const int ps = 4;
+
+ int i, j, l;
+
+ i = 0;
+
+#if defined(TARGET_X64_INTEL_HASWELL)
+ for(; i<m-11; i+=12)
+ {
+ j = 0;
+ for(; j<n-3; j+=4)
+ {
+ kernel_dgemm_nt_12x4_lib4(k, &alpha, &pA[i*sda], sda, &pB[j*sdb], &beta, &pC[j*ps+i*sdc], sdc, &pD[j*ps+i*sdd], sdd);
+ }
+ if(j<n)
+ {
+ kernel_dgemm_nt_12x4_vs_lib4(k, &alpha, &pA[i*sda], sda, &pB[j*sdb], &beta, &pC[j*ps+i*sdc], sdc, &pD[j*ps+i*sdd], sdd, m-i, n-j);
+ }
+ }
+ if(m>i)
+ {
+ if(m-i<=4)
+ {
+ goto left_4;
+ }
+ else if(m-i<=8)
+ {
+ goto left_8;
+ }
+ else
+ {
+ goto left_12;
+ }
+ }
+#elif defined(TARGET_X64_INTEL_SANDY_BRIDGE)
+ for(; i<m-7; i+=8)
+ {
+ j = 0;
+ for(; j<n-3; j+=4)
+ {
+ kernel_dgemm_nt_8x4_lib4(k, &alpha, &pA[i*sda], sda, &pB[j*sdb], &beta, &pC[j*ps+i*sdc], sdc, &pD[j*ps+i*sdd], sdd);
+ }
+ if(j<n)
+ {
+ kernel_dgemm_nt_8x4_vs_lib4(k, &alpha, &pA[i*sda], sda, &pB[j*sdb], &beta, &pC[j*ps+i*sdc], sdc, &pD[j*ps+i*sdd], sdd, m-i, n-j);
+ }
+ }
+ if(m>i)
+ {
+ if(m-i<=4)
+ {
+ goto left_4;
+ }
+ else
+ {
+ goto left_8;
+ }
+ }
+#elif defined(TARGET_ARMV8A_ARM_CORTEX_A57)
+ for(; i<m-7; i+=8)
+ {
+ j = 0;
+ for(; j<n-3; j+=4)
+ {
+ kernel_dgemm_nt_8x4_lib4(k, &alpha, &pA[i*sda], sda, &pB[j*sdb], &beta, &pC[j*ps+i*sdc], sdc, &pD[j*ps+i*sdd], sdd);
+ }
+ if(j<n)
+ {
+ kernel_dgemm_nt_4x4_vs_lib4(k, &alpha, &pA[i*sda], &pB[j*sdb], &beta, &pC[j*ps+i*sdc], &pD[j*ps+i*sdd], m-i, n-j);
+ kernel_dgemm_nt_4x4_vs_lib4(k, &alpha, &pA[(i+4)*sda], &pB[j*sdb], &beta, &pC[j*ps+(i+4)*sdc], &pD[j*ps+(i+4)*sdd], m-(i+4), n-j);
+ }
+ }
+ if(m>i)
+ {
+ if(m-i<=4)
+ {
+ goto left_4;
+ }
+ else
+ {
+ goto left_8;
+ }
+ }
+#else
+ for(; i<m-3; i+=4)
+ {
+ j = 0;
+ for(; j<n-3; j+=4)
+ {
+ kernel_dgemm_nt_4x4_lib4(k, &alpha, &pA[i*sda], &pB[j*sdb], &beta, &pC[j*ps+i*sdc], &pD[j*ps+i*sdd]);
+ }
+ if(j<n)
+ {
+ kernel_dgemm_nt_4x4_vs_lib4(k, &alpha, &pA[i*sda], &pB[j*sdb], &beta, &pC[j*ps+i*sdc], &pD[j*ps+i*sdd], m-i, n-j);
+ }
+ }
+ if(m>i)
+ {
+ goto left_4;
+ }
+#endif
+
+ // common return if i==m
+ return;
+
+	// clean-up loop definitions
+
+#if defined(TARGET_X64_INTEL_HASWELL)
+ left_12:
+ j = 0;
+ for(; j<n; j+=4)
+ {
+ kernel_dgemm_nt_12x4_vs_lib4(k, &alpha, &pA[i*sda], sda, &pB[j*sdb], &beta, &pC[j*ps+i*sdc], sdc, &pD[j*ps+i*sdd], sdd, m-i, n-j);
+ }
+ return;
+#endif
+
+#if defined(TARGET_X64_INTEL_HASWELL)
+ left_8:
+ j = 0;
+ for(; j<n-8; j+=12)
+ {
+ kernel_dgemm_nt_8x8l_vs_lib4(k, &alpha, &pA[i*sda], sda, &pB[j*sdb], sdb, &beta, &pC[j*ps+i*sdc], sdc, &pD[j*ps+i*sdd], sdd, m-i, n-j);
+ kernel_dgemm_nt_8x8u_vs_lib4(k, &alpha, &pA[i*sda], sda, &pB[(j+4)*sdb], sdb, &beta, &pC[(j+4)*ps+i*sdc], sdc, &pD[(j+4)*ps+i*sdd], sdd, m-i, n-(j+4));
+ }
+
+ if(j<n-4)
+ {
+ kernel_dgemm_nt_8x8l_vs_lib4(k, &alpha, &pA[i*sda], sda, &pB[j*sdb], sdb, &beta, &pC[j*ps+i*sdc], sdc, &pD[j*ps+i*sdd], sdd, m-i, n-j);
+ kernel_dgemm_nt_4x4_vs_lib4(k, &alpha, &pA[i*sda], &pB[(j+4)*sdb], &beta, &pC[(j+4)*ps+i*sdc], &pD[(j+4)*ps+i*sdd], m-i, n-(j+4));
+ }
+ else if(j<n)
+ {
+ kernel_dgemm_nt_8x4_vs_lib4(k, &alpha, &pA[i*sda], sda, &pB[j*sdb], &beta, &pC[j*ps+i*sdc], sdc, &pD[j*ps+i*sdd], sdd, m-i, n-j);
+ }
+ return;
+#endif
+#if defined(TARGET_X64_INTEL_SANDY_BRIDGE)
+ left_8:
+ j = 0;
+ for(; j<n; j+=4)
+ {
+ kernel_dgemm_nt_8x4_vs_lib4(k, &alpha, &pA[i*sda], sda, &pB[j*sdb], &beta, &pC[j*ps+i*sdc], sdc, &pD[j*ps+i*sdd], sdd, m-i, n-j);
+ }
+ return;
+#endif
+#if defined(TARGET_ARMV8A_ARM_CORTEX_A57)
+ left_8:
+ j = 0;
+ for(; j<n; j+=4)
+ {
+ kernel_dgemm_nt_4x4_vs_lib4(k, &alpha, &pA[i*sda], &pB[j*sdb], &beta, &pC[j*ps+i*sdc], &pD[j*ps+i*sdd], m-i, n-j);
+ kernel_dgemm_nt_4x4_vs_lib4(k, &alpha, &pA[(i+4)*sda], &pB[j*sdb], &beta, &pC[j*ps+(i+4)*sdc], &pD[j*ps+(i+4)*sdd], m-(i+4), n-j);
+ }
+ return;
+#endif
+
+#if defined(TARGET_X64_INTEL_HASWELL)
+ left_4:
+ j = 0;
+ for(; j<n-8; j+=12)
+ {
+ kernel_dgemm_nt_4x12_vs_lib4(k, &alpha, &pA[i*sda], &pB[j*sdb], sdb, &beta, &pC[j*ps+i*sdc], &pD[j*ps+i*sdd], m-i, n-j);
+ }
+ if(j<n-4)
+ {
+ kernel_dgemm_nt_4x8_vs_lib4(k, &alpha, &pA[i*sda], &pB[j*sdb], sdb, &beta, &pC[j*ps+i*sdc], &pD[j*ps+i*sdd], m-i, n-j);
+ }
+ else if(j<n)
+ {
+ kernel_dgemm_nt_4x4_vs_lib4(k, &alpha, &pA[i*sda], &pB[j*sdb], &beta, &pC[j*ps+i*sdc], &pD[j*ps+i*sdd], m-i, n-j);
+ }
+ return;
+#elif defined(TARGET_X64_INTEL_SANDY_BRIDGE)
+ left_4:
+ j = 0;
+ for(; j<n-4; j+=8)
+ {
+ kernel_dgemm_nt_4x8_vs_lib4(k, &alpha, &pA[i*sda], &pB[j*sdb], sdb, &beta, &pC[j*ps+i*sdc], &pD[j*ps+i*sdd], m-i, n-j);
+ }
+ if(j<n)
+ {
+ kernel_dgemm_nt_4x4_vs_lib4(k, &alpha, &pA[i*sda], &pB[j*sdb], &beta, &pC[j*ps+i*sdc], &pD[j*ps+i*sdd], m-i, n-j);
+ }
+ return;
+#else
+ left_4:
+ j = 0;
+ for(; j<n; j+=4)
+ {
+ kernel_dgemm_nt_4x4_vs_lib4(k, &alpha, &pA[i*sda], &pB[j*sdb], &beta, &pC[j*ps+i*sdc], &pD[j*ps+i*sdd], m-i, n-j);
+ }
+ return;
+#endif
+
+ }
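+
+
+
+// a note on the panel-major ("lib4") layout used above: rows are packed in
+// panels of ps=4 held column by column, and the panel stride sda (the padded
+// number of columns) makes one panel occupy ps*sda doubles; a minimal sketch
+// of the address computation (illustration only, not compiled):
+#if 0
+static double pm_get_el(double *pA, int sda, int i, int j)
+	{
+	const int ps = 4;
+	// panel i/ps starts at (i/ps)*ps*sda, column j adds j*ps,
+	// the row within the panel adds i%ps
+	return pA[(i/ps)*ps*sda + j*ps + i%ps];
+	}
+// for i a multiple of 4, (i/ps)*ps*sda == i*sda, which is why the loops
+// above address a row panel as &pA[i*sda] and a 4x4 block as &pC[j*ps+i*sdc]
+#endif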
+
+
+
+#if 0
+void dgemm_nn_lib(int m, int n, int k, double alpha, double *pA, int sda, double *pB, int sdb, double beta, double *pC, int sdc, double *pD, int sdd)
+ {
+
+ if(m<=0 || n<=0)
+ return;
+
+ const int ps = 4;
+
+ int i, j, l;
+
+ i = 0;
+
+#if defined(TARGET_X64_INTEL_HASWELL)
+ for(; i<m-11; i+=12)
+ {
+ j = 0;
+ for(; j<n-3; j+=4)
+ {
+ kernel_dgemm_nn_12x4_lib4(k, &alpha, &pA[i*sda], sda, &pB[j*ps], sdb, &beta, &pC[j*ps+i*sdc], sdc, &pD[j*ps+i*sdd], sdd);
+ }
+ if(j<n)
+ {
+ kernel_dgemm_nn_12x4_vs_lib4(k, &alpha, &pA[i*sda], sda, &pB[j*ps], sdb, &beta, &pC[j*ps+i*sdc], sdc, &pD[j*ps+i*sdd], sdd, m-i, n-j);
+ }
+ }
+ if(m>i)
+ {
+ if(m-i<=4)
+ {
+ goto left_4;
+ }
+ else if(m-i<=8)
+ {
+ goto left_8;
+ }
+ else
+ {
+ goto left_12;
+ }
+ }
+#elif defined(TARGET_X64_INTEL_SANDY_BRIDGE)
+ for(; i<m-7; i+=8)
+ {
+ j = 0;
+ for(; j<n-3; j+=4)
+ {
+ kernel_dgemm_nn_8x4_lib4(k, &alpha, &pA[i*sda], sda, &pB[j*ps], sdb, &beta, &pC[j*ps+i*sdc], sdc, &pD[j*ps+i*sdd], sdd);
+ }
+ if(j<n)
+ {
+ kernel_dgemm_nn_8x4_vs_lib4(k, &alpha, &pA[i*sda], sda, &pB[j*ps], sdb, &beta, &pC[j*ps+i*sdc], sdc, &pD[j*ps+i*sdd], sdd, m-i, n-j);
+ }
+ }
+ if(m>i)
+ {
+ if(m-i<=4)
+ {
+ goto left_4;
+ }
+ else
+ {
+ goto left_8;
+ }
+ }
+#else
+ for(; i<m-3; i+=4)
+ {
+ j = 0;
+ for(; j<n-3; j+=4)
+ {
+ kernel_dgemm_nn_4x4_lib4(k, &alpha, &pA[i*sda], 0, &pB[j*ps], sdb, &beta, &pC[j*ps+i*sdc], &pD[j*ps+i*sdd]);
+ }
+ if(j<n)
+ {
+ kernel_dgemm_nn_4x4_vs_lib4(k, &alpha, &pA[i*sda], &pB[j*ps], sdb, &beta, &pC[j*ps+i*sdc], &pD[j*ps+i*sdd], m-i, n-j);
+ }
+ }
+ if(m>i)
+ {
+ goto left_4;
+ }
+#endif
+
+ // common return if i==m
+ return;
+
+	// clean-up loop definitions
+
+#if defined(TARGET_X64_INTEL_HASWELL)
+ left_12:
+ j = 0;
+ for(; j<n; j+=4)
+ {
+ kernel_dgemm_nn_12x4_vs_lib4(k, &alpha, &pA[i*sda], sda, &pB[j*sdb], sdb, &beta, &pC[j*ps+i*sdc], sdc, &pD[j*ps+i*sdd], sdd, m-i, n-j);
+ }
+ return;
+#endif
+
+#if defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_HASWELL)
+ left_8:
+ j = 0;
+ for(; j<n; j+=4)
+ {
+ kernel_dgemm_nn_8x4_vs_lib4(k, &alpha, &pA[i*sda], sda, &pB[j*ps], sdb, &beta, &pC[j*ps+i*sdc], sdc, &pD[j*ps+i*sdd], sdd, m-i, n-j);
+ }
+ return;
+#endif
+
+ left_4:
+ j = 0;
+ for(; j<n; j+=4)
+ {
+ kernel_dgemm_nn_4x4_vs_lib4(k, &alpha, &pA[i*sda], &pB[j*ps], sdb, &beta, &pC[j*ps+i*sdc], &pD[j*ps+i*sdd], m-i, n-j);
+ }
+ return;
+
+ }
+#endif
+
+
+
+void dtrmm_nt_ru_lib(int m, int n, double alpha, double *pA, int sda, double *pB, int sdb, double beta, double *pC, int sdc, double *pD, int sdd)
+ {
+
+ if(m<=0 || n<=0)
+ return;
+
+ const int ps = 4;
+
+ int i, j;
+
+ i = 0;
+// XXX there is a bug here !!!!!!
+#if 0//defined(TARGET_X64_INTEL_HASWELL)
+ for(; i<m-11; i+=12)
+ {
+ j = 0;
+ for(; j<n-3; j+=4)
+ {
+ kernel_dtrmm_nt_ru_12x4_lib4(n-j, &alpha, &pA[j*ps+i*sda], sda, &pB[j*ps+j*sdb], &beta, &pC[j*ps+i*sdc], sdc, &pD[j*ps+i*sdd], sdd);
+ }
+ if(j<n) // TODO specialized edge routine
+ {
+ kernel_dtrmm_nt_ru_12x4_vs_lib4(n-j, &alpha, &pA[j*ps+i*sda], sda, &pB[j*ps+j*sdb], &beta, &pC[j*ps+i*sdc], sdc, &pD[j*ps+i*sdd], sdd, m-i, n-j);
+ }
+ }
+ if(i<m)
+ {
+ if(m-i<5)
+ {
+ goto left_4;
+ }
+ if(m-i<9)
+ {
+ goto left_8;
+ }
+ else
+ {
+ goto left_12;
+ }
+ }
+
+#elif defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_HASWELL)
+ for(; i<m-7; i+=8)
+ {
+ j = 0;
+ for(; j<n-3; j+=4)
+ {
+ kernel_dtrmm_nt_ru_8x4_lib4(n-j, &alpha, &pA[j*ps+i*sda], sda, &pB[j*ps+j*sdb], &beta, &pC[j*ps+i*sdc], sdc, &pD[j*ps+i*sdd], sdd);
+ }
+ if(j<n) // TODO specialized edge routine
+ {
+ kernel_dtrmm_nt_ru_8x4_vs_lib4(n-j, &alpha, &pA[j*ps+i*sda], sda, &pB[j*ps+j*sdb], &beta, &pC[j*ps+i*sdc], sdc, &pD[j*ps+i*sdd], sdd, m-i, n-j);
+ }
+ }
+ if(i<m)
+ {
+ if(m-i<5)
+ {
+ goto left_4;
+ }
+ else
+ {
+ goto left_8;
+ }
+ }
+
+#else
+ for(; i<m-3; i+=4)
+ {
+ j = 0;
+ for(; j<n-3; j+=4)
+ {
+ kernel_dtrmm_nt_ru_4x4_lib4(n-j, &alpha, &pA[j*ps+i*sda], &pB[j*ps+j*sdb], &beta, &pC[j*ps+i*sdc], &pD[j*ps+i*sdd]);
+ }
+ if(j<n) // TODO specialized edge routine
+ {
+ kernel_dtrmm_nt_ru_4x4_vs_lib4(n-j, &alpha, &pA[j*ps+i*sda], &pB[j*ps+j*sdb], &beta, &pC[j*ps+i*sdc], &pD[j*ps+i*sdd], m-i, n-j);
+ }
+ }
+ if(i<m)
+ {
+ goto left_4;
+ }
+#endif
+
+ // common return
+ return;
+
+#if defined(TARGET_X64_INTEL_HASWELL)
+ // clean up
+ left_12:
+ j = 0;
+// for(; j<n-3; j+=4)
+ for(; j<n; j+=4)
+ {
+ kernel_dtrmm_nt_ru_12x4_vs_lib4(n-j, &alpha, &pA[j*ps+i*sda], sda, &pB[j*ps+j*sdb], &beta, &pC[j*ps+i*sdc], sdc, &pD[j*ps+i*sdd], sdd, m-i, n-j);
+ }
+// if(j<n) // TODO specialized edge routine
+// {
+// kernel_dtrmm_nt_ru_8x4_vs_lib4(n-j, &pA[j*ps+i*sda], sda, &pB[j*ps+j*sdb], alg, &pC[j*ps+i*sdc], sdc, &pD[j*ps+i*sdd], sdd, m-i, n-j);
+// }
+ return;
+#endif
+
+#if defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_HASWELL)
+ // clean up
+ left_8:
+ j = 0;
+// for(; j<n-3; j+=4)
+ for(; j<n; j+=4)
+ {
+ kernel_dtrmm_nt_ru_8x4_vs_lib4(n-j, &alpha, &pA[j*ps+i*sda], sda, &pB[j*ps+j*sdb], &beta, &pC[j*ps+i*sdc], sdc, &pD[j*ps+i*sdd], sdd, m-i, n-j);
+ }
+// if(j<n) // TODO specialized edge routine
+// {
+// kernel_dtrmm_nt_ru_8x4_vs_lib4(n-j, &pA[j*ps+i*sda], sda, &pB[j*ps+j*sdb], alg, &pC[j*ps+i*sdc], sdc, &pD[j*ps+i*sdd], sdd, m-i, n-j);
+// }
+ return;
+#endif
+
+ left_4:
+ j = 0;
+// for(; j<n-3; j+=4)
+ for(; j<n; j+=4)
+ {
+ kernel_dtrmm_nt_ru_4x4_vs_lib4(n-j, &alpha, &pA[j*ps+i*sda], &pB[j*ps+j*sdb], &beta, &pC[j*ps+i*sdc], &pD[j*ps+i*sdd], m-i, n-j);
+ }
+// if(j<n) // TODO specialized edge routine
+// {
+// kernel_dtrmm_nt_ru_4x4_vs_lib4(n-j, &pA[j*ps+i*sda], &pB[j*ps+j*sdb], alg, &pC[j*ps+i*sdc], &pD[j*ps+i*sdd], m-i, n-j);
+// }
+ return;
+
+ }
+
+
+
+// D <= B * A^{-T} , with A lower triangular with unit diagonal
+void dtrsm_nt_rl_one_lib(int m, int n, double *pA, int sda, double *pB, int sdb, double *pD, int sdd)
+ {
+
+ if(m<=0 || n<=0)
+ return;
+
+ const int ps = 4;
+
+ int i, j;
+
+ i = 0;
+
+#if defined(TARGET_X64_INTEL_HASWELL)
+ for(; i<m-11; i+=12)
+ {
+ j = 0;
+ for(; j<n-3; j+=4)
+ {
+ kernel_dtrsm_nt_rl_one_12x4_lib4(j, &pD[i*sdd], sdd, &pA[j*sda], &pB[j*ps+i*sdb], sdb, &pD[j*ps+i*sdd], sdd, &pA[j*ps+j*sda]);
+ }
+ if(j<n)
+ {
+ kernel_dtrsm_nt_rl_one_12x4_vs_lib4(j, &pD[i*sdd], sdd, &pA[j*sda], &pB[j*ps+i*sdb], sdb, &pD[j*ps+i*sdd], sdd, &pA[j*ps+j*sda], m-i, n-j);
+ }
+ }
+ if(m>i)
+ {
+ if(m-i<=4)
+ {
+ goto left_4;
+ }
+ else if(m-i<=8)
+ {
+ goto left_8;
+ }
+ else
+ {
+ goto left_12;
+ }
+ }
+#elif defined(TARGET_X64_INTEL_SANDY_BRIDGE)
+ for(; i<m-7; i+=8)
+ {
+ j = 0;
+ for(; j<n-3; j+=4)
+ {
+ kernel_dtrsm_nt_rl_one_8x4_lib4(j, &pD[i*sdd], sdd, &pA[j*sda], &pB[j*ps+i*sdb], sdb, &pD[j*ps+i*sdd], sdd, &pA[j*ps+j*sda]);
+ }
+ if(j<n)
+ {
+ kernel_dtrsm_nt_rl_one_8x4_vs_lib4(j, &pD[i*sdd], sdd, &pA[j*sda], &pB[j*ps+i*sdb], sdb, &pD[j*ps+i*sdd], sdd, &pA[j*ps+j*sda], m-i, n-j);
+ }
+ }
+ if(m>i)
+ {
+ if(m-i<=4)
+ {
+ goto left_4;
+ }
+ else
+ {
+ goto left_8;
+ }
+ }
+#else
+ for(; i<m-3; i+=4)
+ {
+ j = 0;
+ for(; j<n-3; j+=4)
+ {
+ kernel_dtrsm_nt_rl_one_4x4_lib4(j, &pD[i*sdd], &pA[j*sda], &pB[j*ps+i*sdb], &pD[j*ps+i*sdd], &pA[j*ps+j*sda]);
+ }
+ if(j<n)
+ {
+ kernel_dtrsm_nt_rl_one_4x4_vs_lib4(j, &pD[i*sdd], &pA[j*sda], &pB[j*ps+i*sdb], &pD[j*ps+i*sdd], &pA[j*ps+j*sda], m-i, n-j);
+ }
+ }
+ if(m>i)
+ {
+ goto left_4;
+ }
+#endif
+
+ // common return if i==m
+ return;
+
+#if defined(TARGET_X64_INTEL_HASWELL)
+ left_12:
+ j = 0;
+ for(; j<n; j+=4)
+ {
+ kernel_dtrsm_nt_rl_one_12x4_vs_lib4(j, &pD[i*sdd], sdd, &pA[j*sda], &pB[j*ps+i*sdb], sdb, &pD[j*ps+i*sdd], sdd, &pA[j*ps+j*sda], m-i, n-j);
+ }
+ return;
+#endif
+
+#if defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_HASWELL)
+ left_8:
+ j = 0;
+ for(; j<n; j+=4)
+ {
+ kernel_dtrsm_nt_rl_one_8x4_vs_lib4(j, &pD[i*sdd], sdd, &pA[j*sda], &pB[j*ps+i*sdb], sdb, &pD[j*ps+i*sdd], sdd, &pA[j*ps+j*sda], m-i, n-j);
+ }
+ return;
+#endif
+
+ left_4:
+ j = 0;
+ for(; j<n; j+=4)
+ {
+ kernel_dtrsm_nt_rl_one_4x4_vs_lib4(j, &pD[i*sdd], &pA[j*sda], &pB[j*ps+i*sdb], &pD[j*ps+i*sdd], &pA[j*ps+j*sda], m-i, n-j);
+ }
+ return;
+
+ }
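+
+
+
+// note on the blocking above: for each 4-wide block column j, the kernel
+// first downdates with the already-computed columns (its first argument j
+// is the width of that accumulated update),
+//     D_j = ( B_j - D_[0:j) * A_[j,0:j)^T ) * A_[j,j]^{-T} ,
+// and then solves the unit-diagonal 4x4 triangle A_[j,j] in registers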
+
+
+
+// D <= B * A^{-T} , with A upper triangular employing explicit inverse of diagonal
+void dtrsm_nt_ru_inv_lib(int m, int n, double *pA, int sda, double *inv_diag_A, double *pB, int sdb, double *pD, int sdd)
+ {
+
+ if(m<=0 || n<=0)
+ return;
+
+ const int ps = 4;
+
+ int i, j, idx;
+
+ int rn = n%4;
+
+	double *dummy = NULL; // passed only with kmax=0, never dereferenced
+
+ i = 0;
+
+#if defined(TARGET_X64_INTEL_HASWELL)
+ for(; i<m-11; i+=12)
+ {
+ j = 0;
+ // clean at the end
+ if(rn>0)
+ {
+ idx = n-rn;
+ kernel_dtrsm_nt_ru_inv_12x4_vs_lib4(0, dummy, 0, dummy, &pB[i*sdb+idx*ps], sdb, &pD[i*sdd+idx*ps], sdd, &pA[idx*sda+idx*ps], &inv_diag_A[idx], m-i, rn);
+ j += rn;
+ }
+ for(; j<n; j+=4)
+ {
+ idx = n-j-4;
+ kernel_dtrsm_nt_ru_inv_12x4_lib4(j, &pD[i*sdd+(idx+4)*ps], sdd, &pA[idx*sda+(idx+4)*ps], &pB[i*sdb+idx*ps], sdb, &pD[i*sdd+idx*ps], sdd, &pA[idx*sda+idx*ps], &inv_diag_A[idx]);
+ }
+ }
+ if(m>i)
+ {
+ if(m-i<=4)
+ {
+ goto left_4;
+ }
+ else if(m-i<=8)
+ {
+ goto left_8;
+ }
+ else
+ {
+ goto left_12;
+ }
+ }
+#elif defined(TARGET_X64_INTEL_SANDY_BRIDGE)
+ for(; i<m-7; i+=8)
+ {
+ j = 0;
+ // clean at the end
+ if(rn>0)
+ {
+ idx = n-rn;
+ kernel_dtrsm_nt_ru_inv_8x4_vs_lib4(0, dummy, 0, dummy, &pB[i*sdb+idx*ps], sdb, &pD[i*sdd+idx*ps], sdd, &pA[idx*sda+idx*ps], &inv_diag_A[idx], m-i, rn);
+ j += rn;
+ }
+ for(; j<n; j+=4)
+ {
+ idx = n-j-4;
+ kernel_dtrsm_nt_ru_inv_8x4_lib4(j, &pD[i*sdd+(idx+4)*ps], sdd, &pA[idx*sda+(idx+4)*ps], &pB[i*sdb+idx*ps], sdb, &pD[i*sdd+idx*ps], sdd, &pA[idx*sda+idx*ps], &inv_diag_A[idx]);
+ }
+ }
+ if(m>i)
+ {
+ if(m-i<=4)
+ {
+ goto left_4;
+ }
+ else
+ {
+ goto left_8;
+ }
+ }
+#else
+ for(; i<m-3; i+=4)
+ {
+ j = 0;
+ // clean at the end
+ if(rn>0)
+ {
+ idx = n-rn;
+ kernel_dtrsm_nt_ru_inv_4x4_vs_lib4(0, dummy, dummy, &pB[i*sdb+idx*ps], &pD[i*sdd+idx*ps], &pA[idx*sda+idx*ps], &inv_diag_A[idx], m-i, rn);
+ j += rn;
+ }
+ for(; j<n; j+=4)
+ {
+ idx = n-j-4;
+ kernel_dtrsm_nt_ru_inv_4x4_lib4(j, &pD[i*sdd+(idx+4)*ps], &pA[idx*sda+(idx+4)*ps], &pB[i*sdb+idx*ps], &pD[i*sdd+idx*ps], &pA[idx*sda+idx*ps], &inv_diag_A[idx]);
+ }
+ }
+ if(m>i)
+ {
+ goto left_4;
+ }
+#endif
+
+ // common return if i==m
+ return;
+
+#if defined(TARGET_X64_INTEL_HASWELL)
+ left_12:
+ j = 0;
+ // TODO
+ // clean at the end
+ if(rn>0)
+ {
+ idx = n-rn;
+ kernel_dtrsm_nt_ru_inv_12x4_vs_lib4(0, dummy, 0, dummy, &pB[i*sdb+idx*ps], sdb, &pD[i*sdd+idx*ps], sdd, &pA[idx*sda+idx*ps], &inv_diag_A[idx], m-i, rn);
+ j += rn;
+ }
+ for(; j<n; j+=4)
+ {
+ idx = n-j-4;
+ kernel_dtrsm_nt_ru_inv_12x4_vs_lib4(j, &pD[i*sdd+(idx+4)*ps], sdd, &pA[idx*sda+(idx+4)*ps], &pB[i*sdb+idx*ps], sdb, &pD[i*sdd+idx*ps], sdd, &pA[idx*sda+idx*ps], &inv_diag_A[idx], m-i, 4);
+ }
+ return;
+
+#endif
+
+#if defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_HASWELL)
+ left_8:
+ j = 0;
+ // TODO
+ // clean at the end
+ if(rn>0)
+ {
+ idx = n-rn;
+ kernel_dtrsm_nt_ru_inv_8x4_vs_lib4(0, dummy, 0, dummy, &pB[i*sdb+idx*ps], sdb, &pD[i*sdd+idx*ps], sdd, &pA[idx*sda+idx*ps], &inv_diag_A[idx], m-i, rn);
+ j += rn;
+ }
+ for(; j<n; j+=4)
+ {
+ idx = n-j-4;
+ kernel_dtrsm_nt_ru_inv_8x4_vs_lib4(j, &pD[i*sdd+(idx+4)*ps], sdd, &pA[idx*sda+(idx+4)*ps], &pB[i*sdb+idx*ps], sdb, &pD[i*sdd+idx*ps], sdd, &pA[idx*sda+idx*ps], &inv_diag_A[idx], m-i, 4);
+ }
+ return;
+
+#endif
+
+ left_4:
+ j = 0;
+ // TODO
+ // clean at the end
+ if(rn>0)
+ {
+ idx = n-rn;
+ kernel_dtrsm_nt_ru_inv_4x4_vs_lib4(0, dummy, dummy, &pB[i*sdb+idx*ps], &pD[i*sdd+idx*ps], &pA[idx*sda+idx*ps], &inv_diag_A[idx], m-i, rn);
+ j += rn;
+ }
+ for(; j<n; j+=4)
+ {
+ idx = n-j-4;
+ kernel_dtrsm_nt_ru_inv_4x4_vs_lib4(j, &pD[i*sdd+(idx+4)*ps], &pA[idx*sda+(idx+4)*ps], &pB[i*sdb+idx*ps], &pD[i*sdd+idx*ps], &pA[idx*sda+idx*ps], &inv_diag_A[idx], m-i, 4);
+ }
+ return;
+
+ }
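+
+
+
+// note: since A is upper triangular, D*A^T = B is solved backward over the
+// block columns (idx = n-j-4 walks right to left), and the rn = n%4
+// remainder is the rightmost block, so it is cleaned first with kmax=0
+// (no columns to its right feed its downdate)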
+
+
+
+// D <= A^{-1} * B , with A lower triangular with unit diagonal
+void dtrsm_nn_ll_one_lib(int m, int n, double *pA, int sda, double *pB, int sdb, double *pD, int sdd)
+ {
+
+ if(m<=0 || n<=0)
+ return;
+
+ const int ps = 4;
+
+ int i, j;
+
+ i = 0;
+
+#if defined(TARGET_X64_INTEL_HASWELL)
+ for( ; i<m-11; i+=12)
+ {
+ j = 0;
+ for( ; j<n-3; j+=4)
+ {
+ kernel_dtrsm_nn_ll_one_12x4_lib4(i, pA+i*sda, sda, pD+j*ps, sdd, pB+i*sdb+j*ps, sdb, pD+i*sdd+j*ps, sdd, pA+i*sda+i*ps, sda);
+ }
+ if(j<n)
+ {
+ kernel_dtrsm_nn_ll_one_12x4_vs_lib4(i, pA+i*sda, sda, pD+j*ps, sdd, pB+i*sdb+j*ps, sdb, pD+i*sdd+j*ps, sdd, pA+i*sda+i*ps, sda, m-i, n-j);
+ }
+ }
+ if(i<m)
+ {
+ if(m-i<=4)
+ goto left_4;
+ if(m-i<=8)
+ goto left_8;
+ else
+ goto left_12;
+ }
+#elif defined(TARGET_X64_INTEL_SANDY_BRIDGE)
+ for( ; i<m-7; i+=8)
+ {
+ j = 0;
+ for( ; j<n-3; j+=4)
+ {
+ kernel_dtrsm_nn_ll_one_8x4_lib4(i, pA+i*sda, sda, pD+j*ps, sdd, pB+i*sdb+j*ps, sdb, pD+i*sdd+j*ps, sdd, pA+i*sda+i*ps, sda);
+ }
+ if(j<n)
+ {
+ kernel_dtrsm_nn_ll_one_8x4_vs_lib4(i, pA+i*sda, sda, pD+j*ps, sdd, pB+i*sdb+j*ps, sdb, pD+i*sdd+j*ps, sdd, pA+i*sda+i*ps, sda, m-i, n-j);
+ }
+ }
+ if(i<m)
+ {
+ if(m-i<=4)
+ goto left_4;
+ else
+ goto left_8;
+ }
+#else
+ for( ; i<m-3; i+=4)
+ {
+ j = 0;
+ for( ; j<n-3; j+=4)
+ {
+ kernel_dtrsm_nn_ll_one_4x4_lib4(i, pA+i*sda, pD+j*ps, sdd, pB+i*sdb+j*ps, pD+i*sdd+j*ps, pA+i*sda+i*ps);
+ }
+ if(j<n)
+ {
+ kernel_dtrsm_nn_ll_one_4x4_vs_lib4(i, pA+i*sda, pD+j*ps, sdd, pB+i*sdb+j*ps, pD+i*sdd+j*ps, pA+i*sda+i*ps, m-i, n-j);
+ }
+ }
+ if(i<m)
+ {
+ goto left_4;
+ }
+#endif
+
+ // common return
+ return;
+
+#if defined(TARGET_X64_INTEL_HASWELL)
+ left_12:
+ j = 0;
+ for( ; j<n; j+=4)
+ {
+ kernel_dtrsm_nn_ll_one_12x4_vs_lib4(i, pA+i*sda, sda, pD+j*ps, sdd, pB+i*sdb+j*ps, sdb, pD+i*sdd+j*ps, sdd, pA+i*sda+i*ps, sda, m-i, n-j);
+ }
+ return;
+#endif
+
+#if defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_HASWELL)
+ left_8:
+ j = 0;
+ for( ; j<n; j+=4)
+ {
+ kernel_dtrsm_nn_ll_one_8x4_vs_lib4(i, pA+i*sda, sda, pD+j*ps, sdd, pB+i*sdb+j*ps, sdb, pD+i*sdd+j*ps, sdd, pA+i*sda+i*ps, sda, m-i, n-j);
+ }
+ return;
+#endif
+
+ left_4:
+ j = 0;
+ for( ; j<n; j+=4)
+ {
+ kernel_dtrsm_nn_ll_one_4x4_vs_lib4(i, pA+i*sda, pD+j*ps, sdd, pB+i*sdb+j*ps, pD+i*sdd+j*ps, pA+i*sda+i*ps, m-i, n-j);
+ }
+ return;
+
+ }
+
+
+
+// D <= A^{-1} * B , with A upper triangular employing explicit inverse of diagonal
+void dtrsm_nn_lu_inv_lib(int m, int n, double *pA, int sda, double *inv_diag_A, double *pB, int sdb, double *pD, int sdd)
+ {
+
+ if(m<=0 || n<=0)
+ return;
+
+ const int ps = 4;
+
+ int i, j, idx;
+	double *dummy = NULL; // passed only with kmax=0, never dereferenced
+
+ i = 0;
+ int rm = m%4;
+ if(rm>0)
+ {
+		// TODO code the final case explicitly
+ idx = m-rm; // position of the part to do
+ j = 0;
+ for( ; j<n; j+=4)
+ {
+ kernel_dtrsm_nn_lu_inv_4x4_vs_lib4(0, dummy, dummy, 0, pB+idx*sdb+j*ps, pD+idx*sdd+j*ps, pA+idx*sda+idx*ps, inv_diag_A+idx, rm, n-j);
+ }
+ // TODO
+ i += rm;
+ }
+// int em = m-rm;
+#if defined(TARGET_X64_INTEL_HASWELL)
+ for( ; i<m-8; i+=12)
+ {
+ idx = m-i; // position of already done part
+ j = 0;
+ for( ; j<n-3; j+=4)
+ {
+ kernel_dtrsm_nn_lu_inv_12x4_lib4(i, pA+(idx-12)*sda+idx*ps, sda, pD+idx*sdd+j*ps, sdd, pB+(idx-12)*sdb+j*ps, sdb, pD+(idx-12)*sdd+j*ps, sdd, pA+(idx-12)*sda+(idx-12)*ps, sda, inv_diag_A+(idx-12));
+ }
+ if(j<n)
+ {
+ kernel_dtrsm_nn_lu_inv_12x4_vs_lib4(i, pA+(idx-12)*sda+idx*ps, sda, pD+idx*sdd+j*ps, sdd, pB+(idx-12)*sdb+j*ps, sdb, pD+(idx-12)*sdd+j*ps, sdd, pA+(idx-12)*sda+(idx-12)*ps, sda, inv_diag_A+(idx-12), 12, n-j);
+// kernel_dtrsm_nn_lu_inv_4x4_vs_lib4(i, pA+(idx-4)*sda+idx*ps, pD+idx*sdd+j*ps, sdd, pB+(idx-4)*sdb+j*ps, pD+(idx-4)*sdd+j*ps, pA+(idx-4)*sda+(idx-4)*ps, inv_diag_A+(idx-4), 4, n-j);
+// kernel_dtrsm_nn_lu_inv_4x4_vs_lib4(i+4, pA+(idx-8)*sda+(idx-4)*ps, pD+(idx-4)*sdd+j*ps, sdd, pB+(idx-8)*sdb+j*ps, pD+(idx-8)*sdd+j*ps, pA+(idx-8)*sda+(idx-8)*ps, inv_diag_A+(idx-8), 4, n-j);
+// kernel_dtrsm_nn_lu_inv_4x4_vs_lib4(i+8, pA+(idx-12)*sda+(idx-8)*ps, pD+(idx-8)*sdd+j*ps, sdd, pB+(idx-12)*sdb+j*ps, pD+(idx-12)*sdd+j*ps, pA+(idx-12)*sda+(idx-12)*ps, inv_diag_A+(idx-12), 4, n-j);
+ }
+ }
+#endif
+#if defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_HASWELL)
+ for( ; i<m-4; i+=8)
+ {
+ idx = m-i; // position of already done part
+ j = 0;
+ for( ; j<n-3; j+=4)
+ {
+ kernel_dtrsm_nn_lu_inv_8x4_lib4(i, pA+(idx-8)*sda+idx*ps, sda, pD+idx*sdd+j*ps, sdd, pB+(idx-8)*sdb+j*ps, sdb, pD+(idx-8)*sdd+j*ps, sdd, pA+(idx-8)*sda+(idx-8)*ps, sda, inv_diag_A+(idx-8));
+ }
+ if(j<n)
+ {
+ kernel_dtrsm_nn_lu_inv_8x4_vs_lib4(i, pA+(idx-8)*sda+idx*ps, sda, pD+idx*sdd+j*ps, sdd, pB+(idx-8)*sdb+j*ps, sdb, pD+(idx-8)*sdd+j*ps, sdd, pA+(idx-8)*sda+(idx-8)*ps, sda, inv_diag_A+(idx-8), 8, n-j);
+// kernel_dtrsm_nn_lu_inv_4x4_vs_lib4(i, pA+(idx-4)*sda+idx*ps, pD+idx*sdd+j*ps, sdd, pB+(idx-4)*sdb+j*ps, pD+(idx-4)*sdd+j*ps, pA+(idx-4)*sda+(idx-4)*ps, inv_diag_A+(idx-4), 4, n-j);
+// kernel_dtrsm_nn_lu_inv_4x4_vs_lib4(i+4, pA+(idx-8)*sda+(idx-4)*ps, pD+(idx-4)*sdd+j*ps, sdd, pB+(idx-8)*sdb+j*ps, pD+(idx-8)*sdd+j*ps, pA+(idx-8)*sda+(idx-8)*ps, inv_diag_A+(idx-8), 4, n-j);
+ }
+ }
+#endif
+ for( ; i<m; i+=4)
+ {
+ idx = m-i; // position of already done part
+ j = 0;
+ for( ; j<n-3; j+=4)
+ {
+ kernel_dtrsm_nn_lu_inv_4x4_lib4(i, pA+(idx-4)*sda+idx*ps, pD+idx*sdd+j*ps, sdd, pB+(idx-4)*sdb+j*ps, pD+(idx-4)*sdd+j*ps, pA+(idx-4)*sda+(idx-4)*ps, inv_diag_A+(idx-4));
+ }
+ if(j<n)
+ {
+ kernel_dtrsm_nn_lu_inv_4x4_vs_lib4(i, pA+(idx-4)*sda+idx*ps, pD+idx*sdd+j*ps, sdd, pB+(idx-4)*sdb+j*ps, pD+(idx-4)*sdd+j*ps, pA+(idx-4)*sda+(idx-4)*ps, inv_diag_A+(idx-4), 4, n-j);
+ }
+ }
+
+ // common return
+ return;
+
+ }
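+
+
+
+// note: since A is upper triangular, D = A^{-1}*B is solved bottom-up over
+// the row panels (idx = m-i walks up from the last row block), the rm = m%4
+// remainder being the bottom block and hence solved first with kmax=0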
+
+
+
+#if 0
+void dlauum_blk_nt_l_lib(int m, int n, int nv, int *rv, int *cv, double *pA, int sda, double *pB, int sdb, int alg, double *pC, int sdc, double *pD, int sdd)
+ {
+
+ if(m<=0 || n<=0)
+ return;
+
+ // TODO remove
+ double alpha, beta;
+ if(alg==0)
+ {
+ alpha = 1.0;
+ beta = 0.0;
+ }
+ else if(alg==1)
+ {
+ alpha = 1.0;
+ beta = 1.0;
+ }
+ else
+ {
+ alpha = -1.0;
+ beta = 1.0;
+ }
+
+ // TODO remove
+ int k = cv[nv-1];
+
+ const int ps = 4;
+
+ int i, j, l;
+ int ii, iii, jj, kii, kiii, kjj, k0, k1;
+
+ i = 0;
+ ii = 0;
+ iii = 0;
+
+#if defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_HASWELL)
+ for(; i<m-7; i+=8)
+ {
+
+ while(ii<nv && rv[ii]<i+8)
+ ii++;
+ if(ii<nv)
+ kii = cv[ii];
+ else
+ kii = cv[ii-1];
+
+ j = 0;
+ jj = 0;
+ for(; j<i && j<n-3; j+=4)
+ {
+
+ while(jj<nv && rv[jj]<j+4)
+ jj++;
+ if(jj<nv)
+ kjj = cv[jj];
+ else
+ kjj = cv[jj-1];
+ k0 = kii<kjj ? kii : kjj;
+
+ kernel_dgemm_nt_8x4_lib4(k0, &alpha, &pA[i*sda], sda, &pB[j*sdb], &beta, &pC[j*ps+i*sdc], sdc, &pD[j*ps+i*sdd], sdd);
+ }
+ if(j<n)
+ {
+
+ while(jj<nv && rv[jj]<j+4)
+ jj++;
+ if(jj<nv)
+ kjj = cv[jj];
+ else
+ kjj = cv[jj-1];
+ k0 = kii<kjj ? kii : kjj;
+
+ if(j<i) // dgemm
+ {
+ kernel_dgemm_nt_8x4_vs_lib4(k0, &alpha, &pA[i*sda], sda, &pB[j*sdb], &beta, &pC[j*ps+i*sdc], sdc, &pD[j*ps+i*sdd], sdd, 8, n-j);
+ }
+ else // dsyrk
+ {
+ kernel_dsyrk_nt_l_8x4_vs_lib4(k0, &alpha, &pA[i*sda], sda, &pB[j*sdb], &beta, &pC[j*ps+i*sdc], sdc, &pD[j*ps+i*sdd], sdd, 8, n-j);
+ if(j<n-4)
+ {
+ kernel_dsyrk_nt_l_4x4_vs_lib4(k, &alpha, &pA[(i+4)*sda], &pB[(j+4)*sdb], &beta, &pC[(j+4)*ps+(i+4)*sdc], &pD[(j+4)*ps+(i+4)*sdd], 4, n-j-4); // TODO
+ }
+ }
+ }
+ }
+ if(m>i)
+ {
+ if(m-i<=4)
+ {
+ goto left_4;
+ }
+ else
+ {
+ goto left_8;
+ }
+ }
+#else
+ for(; i<m-3; i+=4)
+ {
+
+ while(ii<nv && rv[ii]<i+4)
+ ii++;
+ if(ii<nv)
+ kii = cv[ii];
+ else
+ kii = cv[ii-1];
+// k0 = kii;
+// printf("\nii %d %d %d %d %d\n", i, ii, rv[ii], cv[ii], kii);
+
+ j = 0;
+ jj = 0;
+ for(; j<i && j<n-3; j+=4)
+ {
+
+ while(jj<nv && rv[jj]<j+4)
+ jj++;
+ if(jj<nv)
+ kjj = cv[jj];
+ else
+ kjj = cv[jj-1];
+ k0 = kii<kjj ? kii : kjj;
+// printf("\njj %d %d %d %d %d\n", j, jj, rv[jj], cv[jj], kjj);
+
+ kernel_dgemm_nt_4x4_lib4(k0, &alpha, &pA[i*sda], &pB[j*sdb], &beta, &pC[j*ps+i*sdc], &pD[j*ps+i*sdd]);
+ }
+ if(j<n)
+ {
+
+ while(jj<nv && rv[jj]<j+4)
+ jj++;
+ if(jj<nv)
+ kjj = cv[jj];
+ else
+ kjj = cv[jj-1];
+ k0 = kii<kjj ? kii : kjj;
+// printf("\njj %d %d %d %d %d\n", j, jj, rv[jj], cv[jj], kjj);
+
+ if(i<j) // dgemm
+ {
+ kernel_dgemm_nt_4x4_vs_lib4(k0, &alpha, &pA[i*sda], &pB[j*sdb], &beta, &pC[j*ps+i*sdc], &pD[j*ps+i*sdd], 4, n-j);
+ }
+ else // dsyrk
+ {
+ kernel_dsyrk_nt_l_4x4_vs_lib4(k0, &alpha, &pA[i*sda], &pB[j*sdb], &beta, &pC[j*ps+i*sdc], &pD[j*ps+i*sdd], 4, n-j);
+ }
+ }
+ }
+ if(m>i)
+ {
+ goto left_4;
+ }
+#endif
+
+ // common return if i==m
+ return;
+
+	// clean-up loop definitions
+
+#if defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_HASWELL)
+ left_8:
+
+ kii = cv[nv-1];
+
+ j = 0;
+ jj = 0;
+ for(; j<i && j<n-3; j+=4)
+ {
+
+ while(jj<nv && rv[jj]<j+4)
+ jj++;
+ if(jj<nv)
+ kjj = cv[jj];
+ else
+ kjj = cv[jj-1];
+ k0 = kii<kjj ? kii : kjj;
+
+ kernel_dgemm_nt_8x4_vs_lib4(k0, &alpha, &pA[i*sda], sda, &pB[j*sdb], &beta, &pC[j*ps+i*sdc], sdc, &pD[j*ps+i*sdd], sdd, m-i, n-j);
+ }
+ if(j<n)
+ {
+
+ while(jj<nv && rv[jj]<j+4)
+ jj++;
+ if(jj<nv)
+ kjj = cv[jj];
+ else
+ kjj = cv[jj-1];
+ k0 = kii<kjj ? kii : kjj;
+
+ if(j<i) // dgemm
+ {
+ kernel_dgemm_nt_8x4_vs_lib4(k0, &alpha, &pA[i*sda], sda, &pB[j*sdb], &beta, &pC[j*ps+i*sdc], sdc, &pD[j*ps+i*sdd], sdd, m-i, n-j);
+ }
+ else // dsyrk
+ {
+ kernel_dsyrk_nt_l_8x4_vs_lib4(k0, &alpha, &pA[i*sda], sda, &pB[j*sdb], &beta, &pC[j*ps+i*sdc], sdc, &pD[j*ps+i*sdd], sdd, m-i, n-j);
+ if(j<n-4)
+ {
+ kernel_dsyrk_nt_l_4x4_vs_lib4(k, &alpha, &pA[(i+4)*sda], &pB[(j+4)*sdb], &beta, &pC[(j+4)*ps+(i+4)*sdc], &pD[(j+4)*ps+(i+4)*sdd], m-i-4, n-j-4); // TODO
+ }
+ }
+ }
+ return;
+#endif
+
+ left_4:
+
+ kii = cv[nv-1];
+
+ j = 0;
+ jj = 0;
+ for(; j<i && j<n-3; j+=4)
+ {
+
+ while(jj<nv && rv[jj]<j+4)
+ jj++;
+ if(jj<nv)
+ kjj = cv[jj];
+ else
+ kjj = cv[jj-1];
+ k0 = kii<kjj ? kii : kjj;
+
+ kernel_dgemm_nt_4x4_vs_lib4(k0, &alpha, &pA[i*sda], &pB[j*sdb], &beta, &pC[j*ps+i*sdc], &pD[j*ps+i*sdd], m-i, n-j);
+ }
+ if(j<n)
+ {
+
+ while(jj<nv && rv[jj]<j+4)
+ jj++;
+ if(jj<nv)
+ kjj = cv[jj];
+ else
+ kjj = cv[jj-1];
+ k0 = kii<kjj ? kii : kjj;
+
+ if(j<i) // dgemm
+ {
+ kernel_dgemm_nt_4x4_vs_lib4(k0, &alpha, &pA[i*sda], &pB[j*sdb], &beta, &pC[j*ps+i*sdc], &pD[j*ps+i*sdd], m-i, n-j);
+ }
+ else // dsyrk
+ {
+ kernel_dsyrk_nt_l_4x4_vs_lib4(k0, &alpha, &pA[i*sda], &pB[j*sdb], &beta, &pC[j*ps+i*sdc], &pD[j*ps+i*sdd], m-i, n-j);
+ }
+ }
+ return;
+
+ }
+#endif
+
+
+
+/****************************
+* new interface
+****************************/
+
+
+
+#if defined(LA_HIGH_PERFORMANCE)
+
+
+
+// dgemm nt
+void dgemm_nt_libstr(int m, int n, int k, double alpha, struct d_strmat *sA, int ai, int aj, struct d_strmat *sB, int bi, int bj, double beta, struct d_strmat *sC, int ci, int cj, struct d_strmat *sD, int di, int dj)
+ {
+
+ if(m<=0 | n<=0)
+ return;
+
+ const int ps = 4;
+
+ int sda = sA->cn;
+ int sdb = sB->cn;
+ int sdc = sC->cn;
+ int sdd = sD->cn;
+ int air = ai & (ps-1);
+ int bir = bi & (ps-1);
+ double *pA = sA->pA + aj*ps + (ai-air)*sda;
+ double *pB = sB->pA + bj*ps + (bi-bir)*sdb;
+ double *pC = sC->pA + cj*ps;
+ double *pD = sD->pA + dj*ps;
+
+ if(ai==0 & bi==0 & ci==0 & di==0)
+ {
+ dgemm_nt_lib(m, n, k, alpha, pA, sda, pB, sdb, beta, pC, sdc, pD, sdd);
+ return;
+ }
+
+ int ci0 = ci-air;
+ int di0 = di-air;
+ int offsetC;
+ int offsetD;
+ if(ci0>=0)
+ {
+		pC += ci0/ps*ps*sdc;
+ offsetC = ci0%ps;
+ }
+ else
+ {
+ pC += -4*sdc;
+ offsetC = ps+ci0;
+ }
+ if(di0>=0)
+ {
+ pD += di0/ps*ps*sdd;
+ offsetD = di0%ps;
+ }
+ else
+ {
+ pD += -4*sdd;
+ offsetD = ps+di0;
+ }
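+	// worked example of the alignment above (ps=4): ai=6 gives air=2; with
+	// ci=1, ci0=ci-air=-1<0, so pC steps back one full panel and offsetC
+	// becomes ps+ci0=3, placing kernel row air=2 exactly on row ci of C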
+
+ int i, j, l;
+
+ int idxB;
+
+ // clean up at the beginning
+ if(air!=0)
+ {
+#if defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE)
+ if(m>5)
+ {
+ j = 0;
+ idxB = 0;
+ // clean up at the beginning
+ if(bir!=0)
+ {
+ kernel_dgemm_nt_8x4_gen_lib4(k, &alpha, &pA[0], sda, &pB[idxB*sdb], &beta, offsetC, &pC[j*ps]-bir*ps, sdc, offsetD, &pD[j*ps]-bir*ps, sdd, air, air+m, bir, n-j);
+ j += ps-bir;
+ idxB += 4;
+ }
+ // main loop
+ for(; j<n; j+=4)
+ {
+ kernel_dgemm_nt_8x4_gen_lib4(k, &alpha, &pA[0], sda, &pB[idxB*sdb], &beta, offsetC, &pC[j*ps], sdc, offsetD, &pD[j*ps], sdd, air, air+m, 0, n-j);
+ idxB += 4;
+ }
+ m -= 2*ps-air;
+ pA += 2*ps*sda;
+ pC += 2*ps*sdc;
+ pD += 2*ps*sdd;
+ }
+ else // m<=4
+ {
+#endif
+ j = 0;
+ idxB = 0;
+ // clean up at the beginning
+ if(bir!=0)
+ {
+ kernel_dgemm_nt_4x4_gen_lib4(k, &alpha, &pA[0], &pB[idxB*sdb], &beta, offsetC, &pC[j*ps]-bir*ps, sdc, offsetD, &pD[j*ps]-bir*ps, sdd, air, air+m, bir, n-j);
+ j += ps-bir;
+ idxB += 4;
+ }
+ // main loop
+ for(; j<n; j+=4)
+ {
+ kernel_dgemm_nt_4x4_gen_lib4(k, &alpha, &pA[0], &pB[idxB*sdb], &beta, offsetC, &pC[j*ps], sdc, offsetD, &pD[j*ps], sdd, air, air+m, 0, n-j);
+ idxB += 4;
+ }
+ m -= ps-air;
+ pA += ps*sda;
+ pC += ps*sdc;
+ pD += ps*sdd;
+#if defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE)
+ // nothing more to do
+ return;
+ }
+#endif
+ }
+ i = 0;
+ // main loop
+#if defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE)
+ for(; i<m-4; i+=8)
+ {
+ j = 0;
+ idxB = 0;
+ // clean up at the beginning
+ if(bir!=0)
+ {
+ kernel_dgemm_nt_8x4_gen_lib4(k, &alpha, &pA[i*sda], sda, &pB[idxB*sdb], &beta, offsetC, &pC[j*ps+i*sdc]-bir*ps, sdc, offsetD, &pD[j*ps+i*sdd]-bir*ps, sdd, 0, m-i, bir, n-j);
+ j += ps-bir;
+ idxB += 4;
+ }
+ // main loop
+ for(; j<n; j+=4)
+ {
+ kernel_dgemm_nt_8x4_gen_lib4(k, &alpha, &pA[i*sda], sda, &pB[idxB*sdb], &beta, offsetC, &pC[j*ps+i*sdc], sdc, offsetD, &pD[j*ps+i*sdd], sdd, 0, m-i, 0, n-j);
+ idxB += 4;
+ }
+ }
+ if(i<m)
+ {
+ j = 0;
+ idxB = 0;
+ // clean up at the beginning
+ if(bir!=0)
+ {
+ kernel_dgemm_nt_4x4_gen_lib4(k, &alpha, &pA[i*sda], &pB[idxB*sdb], &beta, offsetC, &pC[j*ps+i*sdc]-bir*ps, sdc, offsetD, &pD[j*ps+i*sdd]-bir*ps, sdd, 0, m-i, bir, n-j);
+ j += ps-bir;
+ idxB += 4;
+ }
+ // main loop
+ for(; j<n; j+=4)
+ {
+ kernel_dgemm_nt_4x4_gen_lib4(k, &alpha, &pA[i*sda], &pB[idxB*sdb], &beta, offsetC, &pC[j*ps+i*sdc], sdc, offsetD, &pD[j*ps+i*sdd], sdd, 0, m-i, 0, n-j);
+ idxB += 4;
+ }
+ }
+#else
+ for(; i<m; i+=4)
+ {
+ j = 0;
+ idxB = 0;
+ // clean up at the beginning
+ if(bir!=0)
+ {
+ kernel_dgemm_nt_4x4_gen_lib4(k, &alpha, &pA[i*sda], &pB[idxB*sdb], &beta, offsetC, &pC[j*ps+i*sdc]-bir*ps, sdc, offsetD, &pD[j*ps+i*sdd]-bir*ps, sdd, 0, m-i, bir, n-j);
+ j += ps-bir;
+ idxB += 4;
+ }
+ // main loop
+ for(; j<n; j+=4)
+ {
+ kernel_dgemm_nt_4x4_gen_lib4(k, &alpha, &pA[i*sda], &pB[idxB*sdb], &beta, offsetC, &pC[j*ps+i*sdc], sdc, offsetD, &pD[j*ps+i*sdd], sdd, 0, m-i, 0, n-j);
+ idxB += 4;
+ }
+ }
+#endif
+
+ return;
+
+ }
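+
+
+
+// example (illustration only, not compiled): D = alpha*A*B^T + beta*C with
+// A 8x5, B 6x5, C and D 8x6; the d_allocate_strmat helper is assumed to be
+// available from blasfeo_d_aux
+#if 0
+void example_dgemm_nt()
+	{
+	struct d_strmat sA, sB, sC, sD;
+	d_allocate_strmat(8, 5, &sA);
+	d_allocate_strmat(6, 5, &sB);
+	d_allocate_strmat(8, 6, &sC);
+	d_allocate_strmat(8, 6, &sD);
+	// ... fill sA, sB, sC ...
+	dgemm_nt_libstr(8, 6, 5, 1.0, &sA, 0, 0, &sB, 0, 0, 1.0, &sC, 0, 0, &sD, 0, 0);
+	}
+#endif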
+
+
+
+// dgemm nn
+void dgemm_nn_libstr(int m, int n, int k, double alpha, struct d_strmat *sA, int ai, int aj, struct d_strmat *sB, int bi, int bj, double beta, struct d_strmat *sC, int ci, int cj, struct d_strmat *sD, int di, int dj)
+ {
+
+ if(m<=0 || n<=0)
+ return;
+
+ const int ps = 4;
+
+ int sda = sA->cn;
+ int sdb = sB->cn;
+ int sdc = sC->cn;
+ int sdd = sD->cn;
+ int air = ai & (ps-1);
+ int bir = bi & (ps-1);
+ double *pA = sA->pA + aj*ps + (ai-air)*sda;
+ double *pB = sB->pA + bj*ps + (bi-bir)*sdb;
+ double *pC = sC->pA + cj*ps;
+ double *pD = sD->pA + dj*ps;
+
+ int offsetB = bir;
+
+ int ci0 = ci-air;
+ int di0 = di-air;
+ int offsetC;
+ int offsetD;
+ if(ci0>=0)
+ {
+		pC += ci0/ps*ps*sdc;
+ offsetC = ci0%ps;
+ }
+ else
+ {
+ pC += -4*sdc;
+ offsetC = ps+ci0;
+ }
+ if(di0>=0)
+ {
+ pD += di0/ps*ps*sdd;
+ offsetD = di0%ps;
+ }
+ else
+ {
+ pD += -4*sdd;
+ offsetD = ps+di0;
+ }
+
+ int i, j, l;
+
+ // clean up at the beginning
+ if(air!=0)
+ {
+#if defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_HASWELL)
+ if(m>5)
+ {
+ j = 0;
+ for(; j<n; j+=4)
+ {
+ kernel_dgemm_nn_8x4_gen_lib4(k, &alpha, &pA[0], sda, offsetB, &pB[j*ps], sdb, &beta, offsetC, &pC[j*ps], sdc, offsetD, &pD[j*ps], sdd, air, air+m, 0, n-j);
+ }
+ m -= 2*ps-air;
+ pA += 2*ps*sda;
+			pC += 2*ps*sdc;
+			pD += 2*ps*sdd;
+ }
+ else // m-i<=4
+ {
+#endif
+ j = 0;
+ for(; j<n; j+=4)
+ {
+ kernel_dgemm_nn_4x4_gen_lib4(k, &alpha, &pA[0], offsetB, &pB[j*ps], sdb, &beta, offsetC, &pC[j*ps], sdc, offsetD, &pD[j*ps], sdd, air, air+m, 0, n-j);
+ }
+			m -= ps-air;
+			pA += ps*sda;
+			pC += ps*sdc;
+			pD += ps*sdd;
+#if defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_HASWELL)
+ // nothing more to do
+ return;
+ }
+#endif
+ }
+ // main loop
+ i = 0;
+ if(offsetC==0 & offsetD==0)
+ {
+#if defined(TARGET_X64_INTEL_HASWELL)
+ for(; i<m-11; i+=12)
+ {
+ j = 0;
+ for(; j<n-3; j+=4)
+ {
+ kernel_dgemm_nn_12x4_lib4(k, &alpha, &pA[i*sda], sda, offsetB, &pB[j*ps], sdb, &beta, &pC[j*ps+i*sdc], sdc, &pD[j*ps+i*sdd], sdd);
+ }
+ if(j<n)
+ {
+// kernel_dgemm_nn_12x4_gen_lib4(k, &alpha, &pA[i*sda], sda, offsetB, &pB[j*ps], sdb, &beta, 0, &pC[j*ps+i*sdc], sdc, 0, &pD[j*ps+i*sdd], sdd, m-i, n-j);
+ kernel_dgemm_nn_8x4_gen_lib4(k, &alpha, &pA[i*sda], sda, offsetB, &pB[j*ps], sdb, &beta, 0, &pC[j*ps+i*sdc], sdc, 0, &pD[j*ps+i*sdd], sdd, 0, m-i, 0, n-j);
+ kernel_dgemm_nn_4x4_gen_lib4(k, &alpha, &pA[(i+8)*sda], offsetB, &pB[j*ps], sdb, &beta, 0, &pC[j*ps+(i+8)*sdc], sdc, 0, &pD[j*ps+(i+8)*sdd], sdd, 0, m-(i+8), 0, n-j);
+ }
+ }
+ if(m>i)
+ {
+ if(m-i<=4)
+ {
+ goto left_4;
+ }
+ else if(m-i<=8)
+ {
+ goto left_8;
+ }
+ else
+ {
+ goto left_12;
+ }
+ }
+#elif defined(TARGET_X64_INTEL_SANDY_BRIDGE)
+ for(; i<m-7; i+=8)
+ {
+ j = 0;
+ for(; j<n-3; j+=4)
+ {
+ kernel_dgemm_nn_8x4_lib4(k, &alpha, &pA[i*sda], sda, offsetB, &pB[j*ps], sdb, &beta, &pC[j*ps+i*sdc], sdc, &pD[j*ps+i*sdd], sdd);
+ }
+ if(j<n)
+ {
+ kernel_dgemm_nn_8x4_gen_lib4(k, &alpha, &pA[i*sda], sda, offsetB, &pB[j*ps], sdb, &beta, 0, &pC[j*ps+i*sdc], sdc, 0, &pD[j*ps+i*sdd], sdd, 0, m-i, 0, n-j);
+ }
+ }
+ if(m>i)
+ {
+ if(m-i<=4)
+ {
+ goto left_4;
+ }
+ else
+ {
+ goto left_8;
+ }
+ }
+#else
+ for(; i<m-3; i+=4)
+ {
+ j = 0;
+ for(; j<n-3; j+=4)
+ {
+ kernel_dgemm_nn_4x4_lib4(k, &alpha, &pA[i*sda], offsetB, &pB[j*ps], sdb, &beta, &pC[j*ps+i*sdc], &pD[j*ps+i*sdd]);
+ }
+ if(j<n)
+ {
+ kernel_dgemm_nn_4x4_gen_lib4(k, &alpha, &pA[i*sda], offsetB, &pB[j*ps], sdb, &beta, 0, &pC[j*ps+i*sdc], sdc, 0, &pD[j*ps+i*sdd], sdd, 0, m-i, 0, n-j);
+ }
+ }
+ if(m>i)
+ {
+ goto left_4;
+ }
+#endif
+ }
+ else
+ {
+// TODO 12x4
+#if defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE)
+ for(; i<m-4; i+=8)
+ {
+ j = 0;
+ for(; j<n; j+=4)
+ {
+ kernel_dgemm_nn_8x4_gen_lib4(k, &alpha, &pA[i*sda], sda, offsetB, &pB[j*ps], sdb, &beta, offsetC, &pC[j*ps+i*sdc], sdc, offsetD, &pD[j*ps+i*sdd], sdd, 0, m-i, 0, n-j);
+ }
+ }
+ if(m>i)
+ {
+ goto left_4;
+ }
+#else
+ for(; i<m; i+=4)
+ {
+ j = 0;
+ for(; j<n; j+=4)
+ {
+ kernel_dgemm_nn_4x4_gen_lib4(k, &alpha, &pA[i*sda], offsetB, &pB[j*ps], sdb, &beta, offsetC, &pC[j*ps+i*sdc], sdc, offsetD, &pD[j*ps+i*sdd], sdd, 0, m-i, 0, n-j);
+ }
+ }
+#endif
+ }
+
+ // common return if i==m
+ return;
+
+	// clean-up loop definitions
+
+#if defined(TARGET_X64_INTEL_HASWELL)
+ left_12:
+ j = 0;
+ for(; j<n; j+=4)
+ {
+//		kernel_dgemm_nn_12x4_gen_lib4(k, &alpha, &pA[i*sda], sda, offsetB, &pB[j*ps], sdb, &beta, offsetC, &pC[j*ps+i*sdc], sdc, offsetD, &pD[j*ps+i*sdd], sdd, 0, m-i, 0, n-j);
+		kernel_dgemm_nn_8x4_gen_lib4(k, &alpha, &pA[i*sda], sda, offsetB, &pB[j*ps], sdb, &beta, offsetC, &pC[j*ps+i*sdc], sdc, offsetD, &pD[j*ps+i*sdd], sdd, 0, m-i, 0, n-j);
+		kernel_dgemm_nn_4x4_gen_lib4(k, &alpha, &pA[(i+8)*sda], offsetB, &pB[j*ps], sdb, &beta, offsetC, &pC[j*ps+(i+8)*sdc], sdc, offsetD, &pD[j*ps+(i+8)*sdd], sdd, 0, m-(i+8), 0, n-j);
+ }
+ return;
+#endif
+
+#if defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_HASWELL)
+ left_8:
+ j = 0;
+ for(; j<n; j+=4)
+ {
+ kernel_dgemm_nn_8x4_gen_lib4(k, &alpha, &pA[i*sda], sda, offsetB, &pB[j*ps], sdb, &beta, offsetC, &pC[j*ps+i*sdc], sdc, offsetD, &pD[j*ps+i*sdd], sdd, 0, m-i, 0, n-j);
+ }
+ return;
+#endif
+
+ left_4:
+ j = 0;
+ for(; j<n; j+=4)
+ {
+ kernel_dgemm_nn_4x4_gen_lib4(k, &alpha, &pA[i*sda], offsetB, &pB[j*ps], sdb, &beta, offsetC, &pC[j*ps+i*sdc], sdc, offsetD, &pD[j*ps+i*sdd], sdd, 0, m-i, 0, n-j);
+ }
+ return;
+
+ }
+
+
+
+// dtrsm_nn_llu
+void dtrsm_llnu_libstr(int m, int n, double alpha, struct d_strmat *sA, int ai, int aj, struct d_strmat *sB, int bi, int bj, struct d_strmat *sD, int di, int dj)
+ {
+ if(ai!=0 | bi!=0 | di!=0 | alpha!=1.0)
+ {
+ printf("\ndtrsm_llnu_libstr: feature not implemented yet: ai=%d, bi=%d, di=%d, alpha=%f\n", ai, bi, di, alpha);
+ exit(1);
+ }
+ const int ps = 4;
+ // TODO alpha
+ int sda = sA->cn;
+ int sdb = sB->cn;
+ int sdd = sD->cn;
+ double *pA = sA->pA + aj*ps;
+ double *pB = sB->pA + bj*ps;
+ double *pD = sD->pA + dj*ps;
+ dtrsm_nn_ll_one_lib(m, n, pA, sda, pB, sdb, pD, sdd);
+ return;
+ }
+
+
+
+// dtrsm_nn_lun
+void dtrsm_lunn_libstr(int m, int n, double alpha, struct d_strmat *sA, int ai, int aj, struct d_strmat *sB, int bi, int bj, struct d_strmat *sD, int di, int dj)
+ {
+ if(ai!=0 | bi!=0 | di!=0 | alpha!=1.0)
+ {
+ printf("\ndtrsm_lunn_libstr: feature not implemented yet: ai=%d, bi=%d, di=%d, alpha=%f\n", ai, bi, di, alpha);
+ exit(1);
+ }
+ const int ps = 4;
+ // TODO alpha
+ int sda = sA->cn;
+ int sdb = sB->cn;
+ int sdd = sD->cn;
+ double *pA = sA->pA + aj*ps;
+ double *pB = sB->pA + bj*ps;
+ double *pD = sD->pA + dj*ps;
+ double *dA = sA->dA;
+ int ii;
+ if(ai==0 & aj==0)
+ {
+ if(sA->use_dA!=1)
+ {
+ ddiaex_lib(n, 1.0, ai, pA, sda, dA);
+ for(ii=0; ii<n; ii++)
+ dA[ii] = 1.0 / dA[ii];
+ sA->use_dA = 1;
+ }
+ }
+ else
+ {
+ ddiaex_lib(n, 1.0, ai, pA, sda, dA);
+ for(ii=0; ii<n; ii++)
+ dA[ii] = 1.0 / dA[ii];
+ sA->use_dA = 0;
+ }
+ dtrsm_nn_lu_inv_lib(m, n, pA, sda, dA, pB, sdb, pD, sdd);
+ return;
+ }
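+
+
+
+// note: when the factor starts at (ai=0, aj=0), the inverted diagonal of
+// sA is cached in sA->dA and flagged with use_dA, so repeated solves with
+// the same factor skip the ddiaex_lib extraction and the n divisions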
+
+
+
+// dtrsm_right_lower_transposed_notunit
+void dtrsm_rltn_libstr(int m, int n, double alpha, struct d_strmat *sA, int ai, int aj, struct d_strmat *sB, int bi, int bj, struct d_strmat *sD, int di, int dj)
+ {
+
+ if(m<=0 || n<=0)
+ return;
+
+ if(ai!=0 | bi!=0 | di!=0 | alpha!=1.0)
+ {
+ printf("\ndtrsm_rltn_libstr: feature not implemented yet: ai=%d, bi=%d, di=%d, alpha=%f\n", ai, bi, di, alpha);
+ exit(1);
+ }
+
+ const int ps = 4;
+
+ // TODO alpha !!!!!
+
+ int sda = sA->cn;
+ int sdb = sB->cn;
+ int sdd = sD->cn;
+ double *pA = sA->pA + aj*ps;
+ double *pB = sB->pA + bj*ps;
+ double *pD = sD->pA + dj*ps;
+ double *dA = sA->dA;
+
+ int i, j;
+
+ if(ai==0 & aj==0)
+ {
+ if(sA->use_dA!=1)
+ {
+ ddiaex_lib(n, 1.0, ai, pA, sda, dA);
+ for(i=0; i<n; i++)
+ dA[i] = 1.0 / dA[i];
+ sA->use_dA = 1;
+ }
+ }
+ else
+ {
+ ddiaex_lib(n, 1.0, ai, pA, sda, dA);
+ for(i=0; i<n; i++)
+ dA[i] = 1.0 / dA[i];
+ sA->use_dA = 0;
+ }
+
+// dtrsm_nt_rl_inv_lib(m, n, pA, sda, dA, pB, sdb, pD, sdd);
+ i = 0;
+#if defined(TARGET_X64_INTEL_HASWELL)
+ for(; i<m-11; i+=12)
+ {
+ j = 0;
+ for(; j<n-3; j+=4)
+ {
+ kernel_dtrsm_nt_rl_inv_12x4_lib4(j, &pD[i*sdd], sdd, &pA[j*sda], &pB[j*ps+i*sdb], sdb, &pD[j*ps+i*sdd], sdd, &pA[j*ps+j*sda], &dA[j]);
+ }
+ if(j<n)
+ {
+ kernel_dtrsm_nt_rl_inv_12x4_vs_lib4(j, &pD[i*sdd], sdd, &pA[j*sda], &pB[j*ps+i*sdb], sdb, &pD[j*ps+i*sdd], sdd, &pA[j*ps+j*sda], &dA[j], m-i, n-j);
+ }
+ }
+ if(m>i)
+ {
+ if(m-i<=4)
+ {
+ goto left_4;
+ }
+ else if(m-i<=8)
+ {
+ goto left_8;
+ }
+ else
+ {
+ goto left_12;
+ }
+ }
+#elif defined(TARGET_X64_INTEL_SANDY_BRIDGE)
+ for(; i<m-7; i+=8)
+ {
+ j = 0;
+ for(; j<n-3; j+=4)
+ {
+ kernel_dtrsm_nt_rl_inv_8x4_lib4(j, &pD[i*sdd], sdd, &pA[j*sda], &pB[j*ps+i*sdb], sdb, &pD[j*ps+i*sdd], sdd, &pA[j*ps+j*sda], &dA[j]);
+ }
+ if(j<n)
+ {
+ kernel_dtrsm_nt_rl_inv_8x4_vs_lib4(j, &pD[i*sdd], sdd, &pA[j*sda], &pB[j*ps+i*sdb], sdb, &pD[j*ps+i*sdd], sdd, &pA[j*ps+j*sda], &dA[j], m-i, n-j);
+ }
+ }
+ if(m>i)
+ {
+ if(m-i<=4)
+ {
+ goto left_4;
+ }
+ else
+ {
+ goto left_8;
+ }
+ }
+#else
+ for(; i<m-3; i+=4)
+ {
+ j = 0;
+ for(; j<n-3; j+=4)
+ {
+ kernel_dtrsm_nt_rl_inv_4x4_lib4(j, &pD[i*sdd], &pA[j*sda], &pB[j*ps+i*sdb], &pD[j*ps+i*sdd], &pA[j*ps+j*sda], &dA[j]);
+ }
+ if(j<n)
+ {
+ kernel_dtrsm_nt_rl_inv_4x4_vs_lib4(j, &pD[i*sdd], &pA[j*sda], &pB[j*ps+i*sdb], &pD[j*ps+i*sdd], &pA[j*ps+j*sda], &dA[j], m-i, n-j);
+ }
+ }
+ if(m>i)
+ {
+ goto left_4;
+ }
+#endif
+
+ // common return if i==m
+ return;
+
+#if defined(TARGET_X64_INTEL_HASWELL)
+ left_12:
+ j = 0;
+ for(; j<n; j+=4)
+ {
+ kernel_dtrsm_nt_rl_inv_12x4_vs_lib4(j, &pD[i*sdd], sdd, &pA[j*sda], &pB[j*ps+i*sdb], sdb, &pD[j*ps+i*sdd], sdd, &pA[j*ps+j*sda], &dA[j], m-i, n-j);
+ }
+ return;
+#endif
+
+#if defined(TARGET_X64_INTEL_HASWELL)
+ left_8:
+ j = 0;
+ for(; j<n-8; j+=12)
+ {
+ kernel_dtrsm_nt_rl_inv_8x8l_vs_lib4(j, &pD[i*sdd], sdd, &pA[j*sda], sda, &pB[j*ps+i*sdb], sdb, &pD[j*ps+i*sdd], sdd, &pA[j*ps+j*sda], sda, &dA[j], m-i, n-j);
+ kernel_dtrsm_nt_rl_inv_8x8u_vs_lib4((j+4), &pD[i*sdd], sdd, &pA[(j+4)*sda], sda, &pB[(j+4)*ps+i*sdb], sdb, &pD[(j+4)*ps+i*sdd], sdd, &pA[(j+4)*ps+(j+4)*sda], sda, &dA[(j+4)], m-i, n-(j+4));
+ }
+ if(j<n-4)
+ {
+ kernel_dtrsm_nt_rl_inv_8x8l_vs_lib4(j, &pD[i*sdd], sdd, &pA[j*sda], sda, &pB[j*ps+i*sdb], sdb, &pD[j*ps+i*sdd], sdd, &pA[j*ps+j*sda], sda, &dA[j], m-i, n-j);
+ kernel_dtrsm_nt_rl_inv_4x4_vs_lib4((j+4), &pD[i*sdd], &pA[(j+4)*sda], &pB[(j+4)*ps+i*sdb], &pD[(j+4)*ps+i*sdd], &pA[(j+4)*ps+(j+4)*sda], &dA[(j+4)], m-i, n-(j+4));
+ j += 8;
+ }
+ else if(j<n)
+ {
+ kernel_dtrsm_nt_rl_inv_8x4_vs_lib4(j, &pD[i*sdd], sdd, &pA[j*sda], &pB[j*ps+i*sdb], sdb, &pD[j*ps+i*sdd], sdd, &pA[j*ps+j*sda], &dA[j], m-i, n-j);
+ j += 4;
+ }
+ return;
+#elif defined(TARGET_X64_INTEL_SANDY_BRIDGE)
+ left_8:
+ j = 0;
+ for(; j<n; j+=4)
+ {
+ kernel_dtrsm_nt_rl_inv_8x4_vs_lib4(j, &pD[i*sdd], sdd, &pA[j*sda], &pB[j*ps+i*sdb], sdb, &pD[j*ps+i*sdd], sdd, &pA[j*ps+j*sda], &dA[j], m-i, n-j);
+ }
+ return;
+#endif
+
+#if defined(TARGET_X64_INTEL_HASWELL)
+ left_4:
+ j = 0;
+ for(; j<n-8; j+=12)
+ {
+ kernel_dtrsm_nt_rl_inv_4x12_vs_lib4(j, &pD[i*sdd], &pA[j*sda], sda, &pB[j*ps+i*sdb], &pD[j*ps+i*sdd], &pA[j*ps+j*sda], sda, &dA[j], m-i, n-j);
+ }
+ if(j<n-4)
+ {
+ kernel_dtrsm_nt_rl_inv_4x8_vs_lib4(j, &pD[i*sdd], &pA[j*sda], sda, &pB[j*ps+i*sdb], &pD[j*ps+i*sdd], &pA[j*ps+j*sda], sda, &dA[j], m-i, n-j);
+ j += 8;
+ }
+ else if(j<n)
+ {
+ kernel_dtrsm_nt_rl_inv_4x4_vs_lib4(j, &pD[i*sdd], &pA[j*sda], &pB[j*ps+i*sdb], &pD[j*ps+i*sdd], &pA[j*ps+j*sda], &dA[j], m-i, n-j);
+ j += 4;
+ }
+ return;
+#else
+ left_4:
+ j = 0;
+ for(; j<n; j+=4)
+ {
+ kernel_dtrsm_nt_rl_inv_4x4_vs_lib4(j, &pD[i*sdd], &pA[j*sda], &pB[j*ps+i*sdb], &pD[j*ps+i*sdd], &pA[j*ps+j*sda], &dA[j], m-i, n-j);
+ }
+ return;
+#endif
+
+ }
+
+
+
+// dtrsm_right_lower_transposed_unit
+void dtrsm_rltu_libstr(int m, int n, double alpha, struct d_strmat *sA, int ai, int aj, struct d_strmat *sB, int bi, int bj, struct d_strmat *sD, int di, int dj)
+ {
+ if(ai!=0 | bi!=0 | di!=0 | alpha!=1.0)
+ {
+ printf("\ndtrsm_rltu_libstr: feature not implemented yet: ai=%d, bi=%d, di=%d, alpha=%f\n", ai, bi, di, alpha);
+ exit(1);
+ }
+ const int ps = 4;
+ // TODO alpha
+ int sda = sA->cn;
+ int sdb = sB->cn;
+ int sdd = sD->cn;
+ double *pA = sA->pA + aj*ps;
+ double *pB = sB->pA + bj*ps;
+ double *pD = sD->pA + dj*ps;
+ dtrsm_nt_rl_one_lib(m, n, pA, sda, pB, sdb, pD, sdd);
+ return;
+ }
+
+
+
+// dtrsm_right_upper_transposed_notunit
+void dtrsm_rutn_libstr(int m, int n, double alpha, struct d_strmat *sA, int ai, int aj, struct d_strmat *sB, int bi, int bj, struct d_strmat *sD, int di, int dj)
+ {
+ if(ai!=0 | bi!=0 | di!=0 | alpha!=1.0)
+ {
+ printf("\ndtrsm_rutn_libstr: feature not implemented yet: ai=%d, bi=%d, di=%d, alpha=%f\n", ai, bi, di, alpha);
+ exit(1);
+ }
+ const int ps = 4;
+ // TODO alpha
+ int sda = sA->cn;
+ int sdb = sB->cn;
+ int sdd = sD->cn;
+ double *pA = sA->pA + aj*ps;
+ double *pB = sB->pA + bj*ps;
+ double *pD = sD->pA + dj*ps;
+ double *dA = sA->dA;
+ int ii;
+ if(ai==0 & aj==0)
+ {
+ if(sA->use_dA!=1)
+ {
+ ddiaex_lib(n, 1.0, ai, pA, sda, dA);
+ for(ii=0; ii<n; ii++)
+ dA[ii] = 1.0 / dA[ii];
+ sA->use_dA = 1;
+ }
+ }
+ else
+ {
+ ddiaex_lib(n, 1.0, ai, pA, sda, dA);
+ for(ii=0; ii<n; ii++)
+ dA[ii] = 1.0 / dA[ii];
+ sA->use_dA = 0;
+ }
+ dtrsm_nt_ru_inv_lib(m, n, pA, sda, dA, pB, sdb, pD, sdd);
+ return;
+ }
+
+
+
+// dtrmm_right_upper_transposed_notunit (B, i.e. the first matrix, is triangular !!!)
+void dtrmm_rutn_libstr(int m, int n, double alpha, struct d_strmat *sB, int bi, int bj, struct d_strmat *sA, int ai, int aj, struct d_strmat *sD, int di, int dj)
+ {
+ if(ai!=0 | bi!=0 | di!=0)
+ {
+ printf("\ndtrmm_rutn_libstr: feature not implemented yet: ai=%d, bi=%d, di=%d\n", ai, bi, di);
+ exit(1);
+ }
+ const int ps = 4;
+ int sda = sA->cn;
+ int sdb = sB->cn;
+ int sdd = sD->cn;
+ double *pA = sA->pA + aj*ps;
+ double *pB = sB->pA + bj*ps;
+ double *pD = sD->pA + dj*ps;
+ dtrmm_nt_ru_lib(m, n, alpha, pA, sda, pB, sdb, 0.0, pD, sdd, pD, sdd);
+ return;
+ }
+
+
+
+// dtrmm_right_lower_nottransposed_notunit (B, i.e. the first matrix, is triangular !!!)
+void dtrmm_rlnn_libstr(int m, int n, double alpha, struct d_strmat *sB, int bi, int bj, struct d_strmat *sA, int ai, int aj, struct d_strmat *sD, int di, int dj)
+ {
+
+ const int ps = 4;
+
+ int sda = sA->cn;
+ int sdb = sB->cn;
+ int sdd = sD->cn;
+ int air = ai & (ps-1);
+ int bir = bi & (ps-1);
+ double *pA = sA->pA + aj*ps + (ai-air)*sda;
+ double *pB = sB->pA + bj*ps + (bi-bir)*sdb;
+ double *pD = sD->pA + dj*ps;
+
+ int offsetB = bir;
+
+ int di0 = di-air;
+ int offsetD;
+ if(di0>=0)
+ {
+ pD += di0/ps*ps*sdd;
+ offsetD = di0%ps;
+ }
+ else
+ {
+ pD += -4*sdd;
+ offsetD = ps+di0;
+ }
+
+ int ii, jj;
+
+ if(air!=0)
+ {
+ jj = 0;
+ for(; jj<n; jj+=4)
+ {
+ kernel_dtrmm_nn_rl_4x4_gen_lib4(n-jj, &alpha, &pA[jj*ps], offsetB, &pB[jj*sdb+jj*ps], sdb, offsetD, &pD[jj*ps], sdd, air, air+m, 0, n-jj);
+ }
+ m -= ps-air;
+ pA += ps*sda;
+ pD += ps*sdd;
+ }
+ ii = 0;
+ if(offsetD==0)
+ {
+#if defined(TARGET_X64_INTEL_HASWELL)
+ for(; ii<m-11; ii+=12)
+ {
+ jj = 0;
+ for(; jj<n-5; jj+=4)
+ {
+ kernel_dtrmm_nn_rl_12x4_lib4(n-jj, &alpha, &pA[ii*sda+jj*ps], sda, offsetB, &pB[jj*sdb+jj*ps], sdb, &pD[ii*sdd+jj*ps], sdd); // n-j>=6 !!!!!
+ }
+ for(; jj<n; jj+=4)
+ {
+ kernel_dtrmm_nn_rl_12x4_vs_lib4(n-jj, &alpha, &pA[ii*sda+jj*ps], sda, offsetB, &pB[jj*sdb+jj*ps], sdb, &pD[ii*sdd+jj*ps], sdd, 12, n-jj);
+// kernel_dtrmm_nn_rl_8x4_vs_lib4(n-jj, &alpha, &pA[ii*sda+jj*ps], sda, offsetB, &pB[jj*sdb+jj*ps], sdb, &pD[ii*sdd+jj*ps], sdd, 8, n-jj);
+// kernel_dtrmm_nn_rl_4x4_gen_lib4(n-jj, &alpha, &pA[(ii+8)*sda+jj*ps], offsetB, &pB[jj*sdb+jj*ps], sdb, 0, &pD[(ii+8)*sdd+jj*ps], sdd, 0, 4, 0, n-jj);
+ }
+ }
+ if(ii<m)
+ {
+ if(ii<m-8)
+ goto left_12;
+ else if(ii<m-4)
+ goto left_8;
+ else
+ goto left_4_gen;
+ }
+#elif defined(TARGET_X64_INTEL_SANDY_BRIDGE)
+ for(; ii<m-7; ii+=8)
+ {
+ jj = 0;
+ for(; jj<n-5; jj+=4)
+ {
+ kernel_dtrmm_nn_rl_8x4_lib4(n-jj, &alpha, &pA[ii*sda+jj*ps], sda, offsetB, &pB[jj*sdb+jj*ps], sdb, &pD[ii*sdd+jj*ps], sdd);
+ }
+ for(; jj<n; jj+=4)
+ {
+ kernel_dtrmm_nn_rl_8x4_gen_lib4(n-jj, &alpha, &pA[ii*sda+jj*ps], sda, offsetB, &pB[jj*sdb+jj*ps], sdb, 0, &pD[ii*sdd+jj*ps], sdd, 0, 8, 0, n-jj);
+ }
+ }
+ if(ii<m)
+ {
+ if(ii<m-4)
+ goto left_8_gen;
+ else
+ goto left_4_gen;
+ }
+#else
+ for(; ii<m-3; ii+=4)
+ {
+ jj = 0;
+ for(; jj<n-5; jj+=4)
+ {
+ kernel_dtrmm_nn_rl_4x4_lib4(n-jj, &alpha, &pA[ii*sda+jj*ps], offsetB, &pB[jj*sdb+jj*ps], sdb, &pD[ii*sdd+jj*ps]);
+ }
+ for(; jj<n; jj+=4)
+ {
+ kernel_dtrmm_nn_rl_4x4_gen_lib4(n-jj, &alpha, &pA[ii*sda+jj*ps], offsetB, &pB[jj*sdb+jj*ps], sdb, 0, &pD[ii*sdd+jj*ps], sdd, 0, 4, 0, n-jj);
+ }
+ }
+ if(ii<m)
+ {
+ goto left_4_gen;
+ }
+#endif
+ }
+ else
+ {
+#if defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE)
+ for(; ii<m-4; ii+=8)
+ {
+ jj = 0;
+ for(; jj<n; jj+=4)
+ {
+ kernel_dtrmm_nn_rl_8x4_gen_lib4(n-jj, &alpha, &pA[ii*sda+jj*ps], sda, offsetB, &pB[jj*sdb+jj*ps], sdb, offsetD, &pD[ii*sdd+jj*ps], sdd, 0, m-ii, 0, n-jj);
+ }
+ }
+ if(ii<m)
+ {
+ goto left_4_gen;
+ }
+#else
+ for(; ii<m; ii+=4)
+ {
+ jj = 0;
+ for(; jj<n; jj+=4)
+ {
+ kernel_dtrmm_nn_rl_4x4_gen_lib4(n-jj, &alpha, &pA[ii*sda+jj*ps], offsetB, &pB[jj*sdb+jj*ps], sdb, offsetD, &pD[ii*sdd+jj*ps], sdd, 0, m-ii, 0, n-jj);
+ }
+ }
+#endif
+ }
+
+ // common return if i==m
+ return;
+
+	// clean-up loop definitions
+
+#if defined(TARGET_X64_INTEL_HASWELL)
+ left_12:
+ jj = 0;
+ for(; jj<n; jj+=4)
+ {
+ kernel_dtrmm_nn_rl_12x4_vs_lib4(n-jj, &alpha, &pA[ii*sda+jj*ps], sda, offsetB, &pB[jj*sdb+jj*ps], sdb, &pD[ii*sdd+jj*ps], sdd, m-ii, n-jj);
+ }
+ return;
+#endif
+
+#if defined(TARGET_X64_INTEL_HASWELL)
+ left_8:
+ jj = 0;
+ for(; jj<n; jj+=4)
+ {
+ kernel_dtrmm_nn_rl_8x4_vs_lib4(n-jj, &alpha, &pA[ii*sda+jj*ps], sda, offsetB, &pB[jj*sdb+jj*ps], sdb, &pD[ii*sdd+jj*ps], sdd, m-ii, n-jj);
+ }
+ return;
+#endif
+
+#if defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE)
+ left_8_gen:
+ jj = 0;
+ for(; jj<n; jj+=4)
+ {
+ kernel_dtrmm_nn_rl_8x4_gen_lib4(n-jj, &alpha, &pA[ii*sda+jj*ps], sda, offsetB, &pB[jj*sdb+jj*ps], sdb, offsetD, &pD[ii*sdd+jj*ps], sdd, 0, m-ii, 0, n-jj);
+ }
+ return;
+#endif
+
+ left_4_gen:
+ jj = 0;
+ for(; jj<n; jj+=4)
+ {
+ kernel_dtrmm_nn_rl_4x4_gen_lib4(n-jj, &alpha, &pA[ii*sda+jj*ps], offsetB, &pB[jj*sdb+jj*ps], sdb, offsetD, &pD[ii*sdd+jj*ps], sdd, 0, m-ii, 0, n-jj);
+ }
+ return;
+
+ }
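+
+
+
+// note: since B is lower triangular, block column jj of alpha*A*B only
+// involves columns jj..n-1 of A and the subdiagonal part of B, hence the
+// kernels above start at the diagonal block of B with kmax = n-jj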
+
+
+
+void dsyrk_ln_libstr(int m, int k, double alpha, struct d_strmat *sA, int ai, int aj, struct d_strmat *sB, int bi, int bj, double beta, struct d_strmat *sC, int ci, int cj, struct d_strmat *sD, int di, int dj)
+ {
+
+ if(m<=0)
+ return;
+
+ if(ai!=0 | bi!=0)
+ {
+ printf("\ndsyrk_ln_libstr: feature not implemented yet: ai=%d, bi=%d\n", ai, bi);
+ exit(1);
+ }
+
+ const int ps = 4;
+
+ int i, j;
+
+ int sda = sA->cn;
+ int sdb = sB->cn;
+ int sdc = sC->cn;
+ int sdd = sD->cn;
+ double *pA = sA->pA + aj*ps;
+ double *pB = sB->pA + bj*ps;
+ double *pC = sC->pA + cj*ps + (ci-(ci&(ps-1)))*sdc;
+ double *pD = sD->pA + dj*ps + (di-(di&(ps-1)))*sdd;
+
+ // TODO ai and bi
+ int offsetC;
+ int offsetD;
+ offsetC = ci&(ps-1);
+ offsetD = di&(ps-1);
+
+ // main loop
+ i = 0;
+ if(offsetC==0 & offsetD==0)
+ {
+#if defined(TARGET_X64_INTEL_HASWELL)
+ for(; i<m-11; i+=12)
+ {
+ j = 0;
+ for(; j<i; j+=4)
+ {
+ kernel_dgemm_nt_12x4_lib4(k, &alpha, &pA[i*sda], sda, &pB[j*sdb], &beta, &pC[j*ps+i*sdc], sdc, &pD[j*ps+i*sdd], sdd);
+ }
+ kernel_dsyrk_nt_l_12x4_lib4(k, &alpha, &pA[i*sda], sda, &pB[j*sdb], &beta, &pC[j*ps+i*sdc], sdc, &pD[j*ps+i*sdd], sdd);
+ kernel_dsyrk_nt_l_8x8_lib4(k, &alpha, &pA[(i+4)*sda], sda, &pB[(j+4)*sdb], sdb, &beta, &pC[(j+4)*ps+(i+4)*sdc], sdc, &pD[(j+4)*ps+(i+4)*sdd], sdd);
+ }
+ if(m>i)
+ {
+ if(m-i<=4)
+ {
+ goto left_4;
+ }
+ else if(m-i<=8)
+ {
+ goto left_8;
+ }
+ else
+ {
+ goto left_12;
+ }
+ }
+#elif defined(TARGET_X64_INTEL_SANDY_BRIDGE)
+ for(; i<m-7; i+=8)
+ {
+ j = 0;
+ for(; j<i; j+=4)
+ {
+ kernel_dgemm_nt_8x4_lib4(k, &alpha, &pA[i*sda], sda, &pB[j*sdb], &beta, &pC[j*ps+i*sdc], sdc, &pD[j*ps+i*sdd], sdd);
+ }
+ kernel_dsyrk_nt_l_8x4_lib4(k, &alpha, &pA[i*sda], sda, &pB[j*sdb], &beta, &pC[j*ps+i*sdc], sdc, &pD[j*ps+i*sdd], sdd);
+ kernel_dsyrk_nt_l_4x4_lib4(k, &alpha, &pA[(i+4)*sda], &pB[(j+4)*sdb], &beta, &pC[(j+4)*ps+(i+4)*sdc], &pD[(j+4)*ps+(i+4)*sdd]);
+ }
+ if(m>i)
+ {
+ if(m-i<=4)
+ {
+ goto left_4;
+ }
+ else
+ {
+ goto left_8;
+ }
+ }
+#else
+ for(; i<m-3; i+=4)
+ {
+ j = 0;
+ for(; j<i; j+=4)
+ {
+ kernel_dgemm_nt_4x4_lib4(k, &alpha, &pA[i*sda], &pB[j*sdb], &beta, &pC[j*ps+i*sdc], &pD[j*ps+i*sdd]);
+ }
+ kernel_dsyrk_nt_l_4x4_lib4(k, &alpha, &pA[i*sda], &pB[j*sdb], &beta, &pC[j*ps+i*sdc], &pD[j*ps+i*sdd]);
+ }
+ if(m>i)
+ {
+ goto left_4;
+ }
+#endif
+ }
+ else
+ {
+#if defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE)
+ for(; i<m-4; i+=8)
+ {
+ j = 0;
+ for(; j<i; j+=4)
+ {
+ kernel_dgemm_nt_8x4_gen_lib4(k, &alpha, &pA[i*sda], sda, &pB[j*sdb], &beta, offsetC, &pC[j*ps+i*sdc], sdc, offsetD, &pD[j*ps+i*sdd], sdd, 0, m-i, 0, m-j);
+ }
+ kernel_dsyrk_nt_l_8x4_gen_lib4(k, &alpha, &pA[i*sda], sda, &pB[j*sdb], &beta, offsetC, &pC[j*ps+i*sdc], sdc, offsetD, &pD[j*ps+i*sdd], sdd, 0, m-i, 0, m-j);
+ kernel_dsyrk_nt_l_4x4_gen_lib4(k, &alpha, &pA[(i+4)*sda], &pB[(j+4)*sdb], &beta, offsetC, &pC[(j+4)*ps+(i+4)*sdc], sdc, offsetD, &pD[(j+4)*ps+(i+4)*sdd], sdd, 0, m-i-4, 0, m-j-4);
+ }
+ if(m>i)
+ {
+ goto left_4_gen;
+ }
+#else
+ for(; i<m; i+=4)
+ {
+ j = 0;
+ for(; j<i; j+=4)
+ {
+ kernel_dgemm_nt_4x4_gen_lib4(k, &alpha, &pA[i*sda], &pB[j*sdb], &beta, offsetC, &pC[j*ps+i*sdc], sdc, offsetD, &pD[j*ps+i*sdd], sdd, 0, m-i, 0, m-j);
+ }
+ kernel_dsyrk_nt_l_4x4_gen_lib4(k, &alpha, &pA[i*sda], &pB[j*sdb], &beta, offsetC, &pC[j*ps+i*sdc], sdc, offsetD, &pD[j*ps+i*sdd], sdd, 0, m-i, 0, m-j);
+ }
+#endif
+ }
+
+ // common return if i==m
+ return;
+
+	// clean-up loop definitions
+
+#if defined(TARGET_X64_INTEL_HASWELL)
+ left_12:
+ j = 0;
+ for(; j<i; j+=4)
+ {
+ kernel_dgemm_nt_12x4_vs_lib4(k, &alpha, &pA[i*sda], sda, &pB[j*sdb], &beta, &pC[j*ps+i*sdc], sdc, &pD[j*ps+i*sdd], sdd, m-i, m-j);
+ }
+ kernel_dsyrk_nt_l_12x4_vs_lib4(k, &alpha, &pA[i*sda], sda, &pB[j*sdb], &beta, &pC[j*ps+i*sdc], sdc, &pD[j*ps+i*sdd], sdd, m-i, m-j);
+ kernel_dsyrk_nt_l_8x8_vs_lib4(k, &alpha, &pA[(i+4)*sda], sda, &pB[(j+4)*sdb], sdb, &beta, &pC[(j+4)*ps+(i+4)*sdc], sdc, &pD[(j+4)*ps+(i+4)*sdd], sdd, m-i-4, m-j-4);
+// kernel_dsyrk_nt_l_4x4_vs_lib4(k, &alpha, &pA[(i+8)*sda], &pB[(j+8)*sdb], &beta, &pC[(j+8)*ps+(i+8)*sdc], &pD[(j+8)*ps+(i+8)*sdd], m-i-8, n-j-8);
+ return;
+#endif
+
+#if defined(TARGET_X64_INTEL_HASWELL)
+ left_8:
+ j = 0;
+ for(; j<i-8; j+=12)
+ {
+ kernel_dgemm_nt_8x8l_vs_lib4(k, &alpha, &pA[i*sda], sda, &pB[j*sdb], sdb, &beta, &pC[j*ps+i*sdc], sdc, &pD[j*ps+i*sdd], sdd, m-i, m-j);
+ kernel_dgemm_nt_8x8u_vs_lib4(k, &alpha, &pA[i*sda], sda, &pB[(j+4)*sdb], sdb, &beta, &pC[(j+4)*ps+i*sdc], sdc, &pD[(j+4)*ps+i*sdd], sdd, m-i, m-(j+4));
+ }
+ if(j<i-4)
+ {
+ kernel_dgemm_nt_8x8l_vs_lib4(k, &alpha, &pA[i*sda], sda, &pB[j*sdb], sdb, &beta, &pC[j*ps+i*sdc], sdc, &pD[j*ps+i*sdd], sdd, m-i, m-j);
+ kernel_dgemm_nt_4x4_vs_lib4(k, &alpha, &pA[i*sda], &pB[(j+4)*sdb], &beta, &pC[(j+4)*ps+i*sdc], &pD[(j+4)*ps+i*sdd], m-i, m-(j+4));
+ j += 8;
+ }
+ else if(j<i)
+ {
+ kernel_dgemm_nt_8x4_vs_lib4(k, &alpha, &pA[i*sda], sda, &pB[j*sdb], &beta, &pC[j*ps+i*sdc], sdc, &pD[j*ps+i*sdd], sdd, m-i, m-j);
+ j += 4;
+ }
+ kernel_dsyrk_nt_l_8x8_vs_lib4(k, &alpha, &pA[i*sda], sda, &pB[j*sdb], sdb, &beta, &pC[j*ps+i*sdc], sdc, &pD[j*ps+i*sdd], sdd, m-i, m-j);
+// kernel_dsyrk_nt_l_4x4_vs_lib4(k, &alpha, &pA[(i+4)*sda], &pB[(j+4)*sdb], &beta, &pC[(j+4)*ps+(i+4)*sdc], &pD[(j+4)*ps+(i+4)*sdd], m-i-4, n-j-4);
+ return;
+#elif defined(TARGET_X64_INTEL_SANDY_BRIDGE)
+ left_8:
+ j = 0;
+ for(; j<i; j+=4)
+ {
+ kernel_dgemm_nt_8x4_vs_lib4(k, &alpha, &pA[i*sda], sda, &pB[j*sdb], &beta, &pC[j*ps+i*sdc], sdc, &pD[j*ps+i*sdd], sdd, m-i, m-j);
+ }
+ kernel_dsyrk_nt_l_8x4_vs_lib4(k, &alpha, &pA[i*sda], sda, &pB[j*sdb], &beta, &pC[j*ps+i*sdc], sdc, &pD[j*ps+i*sdd], sdd, m-i, m-j);
+ kernel_dsyrk_nt_l_4x4_vs_lib4(k, &alpha, &pA[(i+4)*sda], &pB[(j+4)*sdb], &beta, &pC[(j+4)*ps+(i+4)*sdc], &pD[(j+4)*ps+(i+4)*sdd], m-i-4, m-j-4);
+ return;
+#endif
+
+#if defined(TARGET_X64_INTEL_HASWELL)
+ left_4:
+ j = 0;
+ for(; j<i-8; j+=12)
+ {
+ kernel_dgemm_nt_4x12_vs_lib4(k, &alpha, &pA[i*sda], &pB[j*sdb], sdb, &beta, &pC[j*ps+i*sdc], &pD[j*ps+i*sdd], m-i, m-j);
+ }
+ if(j<i-4)
+ {
+ kernel_dgemm_nt_4x8_vs_lib4(k, &alpha, &pA[i*sda], &pB[j*sdb], sdb, &beta, &pC[j*ps+i*sdc], &pD[j*ps+i*sdd], m-i, m-j);
+ j += 8;
+ }
+ else if(j<i)
+ {
+ kernel_dgemm_nt_4x4_vs_lib4(k, &alpha, &pA[i*sda], &pB[j*sdb], &beta, &pC[j*ps+i*sdc], &pD[j*ps+i*sdd], m-i, m-j);
+ j += 4;
+ }
+ kernel_dsyrk_nt_l_4x4_vs_lib4(k, &alpha, &pA[i*sda], &pB[j*sdb], &beta, &pC[j*ps+i*sdc], &pD[j*ps+i*sdd], m-i, m-j);
+ return;
+#else
+ left_4:
+ j = 0;
+ for(; j<i; j+=4)
+ {
+ kernel_dgemm_nt_4x4_vs_lib4(k, &alpha, &pA[i*sda], &pB[j*sdb], &beta, &pC[j*ps+i*sdc], &pD[j*ps+i*sdd], m-i, m-j);
+ }
+ kernel_dsyrk_nt_l_4x4_vs_lib4(k, &alpha, &pA[i*sda], &pB[j*sdb], &beta, &pC[j*ps+i*sdc], &pD[j*ps+i*sdd], m-i, m-j);
+ return;
+#endif
+
+ left_4_gen:
+ j = 0;
+ for(; j<i; j+=4)
+ {
+ kernel_dgemm_nt_4x4_gen_lib4(k, &alpha, &pA[i*sda], &pB[j*sdb], &beta, offsetC, &pC[j*ps+i*sdc], sdc, offsetD, &pD[j*ps+i*sdd], sdd, 0, m-i, 0, m-j);
+ }
+ kernel_dsyrk_nt_l_4x4_gen_lib4(k, &alpha, &pA[i*sda], &pB[j*sdb], &beta, offsetC, &pC[j*ps+i*sdc], sdc, offsetD, &pD[j*ps+i*sdd], sdd, 0, m-i, 0, m-j);
+ return;
+
+ }
+
+
+
+void dsyrk_ln_mn_libstr(int m, int n, int k, double alpha, struct d_strmat *sA, int ai, int aj, struct d_strmat *sB, int bi, int bj, double beta, struct d_strmat *sC, int ci, int cj, struct d_strmat *sD, int di, int dj)
+ {
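+	// variant of dsyrk_ln_libstr above that computes only the first n
+	// columns of the m x m lower-triangular result (trapezoidal D for m>n)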
+
+ if(m<=0 | n<=0)
+ return;
+
+ if(ai!=0 | bi!=0)
+ {
+		printf("\ndsyrk_ln_mn_libstr: feature not implemented yet: ai=%d, bi=%d\n", ai, bi);
+ exit(1);
+ }
+
+ const int ps = 4;
+
+ int i, j;
+
+ int sda = sA->cn;
+ int sdb = sB->cn;
+ int sdc = sC->cn;
+ int sdd = sD->cn;
+ double *pA = sA->pA + aj*ps;
+ double *pB = sB->pA + bj*ps;
+ double *pC = sC->pA + cj*ps + (ci-(ci&(ps-1)))*sdc;
+ double *pD = sD->pA + dj*ps + (di-(di&(ps-1)))*sdd;
+
+ // TODO ai and bi
+ int offsetC;
+ int offsetD;
+ offsetC = ci&(ps-1);
+ offsetD = di&(ps-1);
+
+ // main loop
+ i = 0;
+ if(offsetC==0 & offsetD==0)
+ {
+#if defined(TARGET_X64_INTEL_HASWELL)
+ for(; i<m-11; i+=12)
+ {
+ j = 0;
+ for(; j<i & j<n-3; j+=4)
+ {
+ kernel_dgemm_nt_12x4_lib4(k, &alpha, &pA[i*sda], sda, &pB[j*sdb], &beta, &pC[j*ps+i*sdc], sdc, &pD[j*ps+i*sdd], sdd);
+ }
+ if(j<n)
+ {
+ if(j<i) // dgemm
+ {
+ kernel_dgemm_nt_12x4_vs_lib4(k, &alpha, &pA[i*sda], sda, &pB[j*sdb], &beta, &pC[j*ps+i*sdc], sdc, &pD[j*ps+i*sdd], sdd, m-i, n-j);
+ }
+ else // dsyrk
+ {
+ if(j<n-11)
+ {
+ kernel_dsyrk_nt_l_12x4_lib4(k, &alpha, &pA[i*sda], sda, &pB[j*sdb], &beta, &pC[j*ps+i*sdc], sdc, &pD[j*ps+i*sdd], sdd);
+ kernel_dsyrk_nt_l_8x8_lib4(k, &alpha, &pA[(i+4)*sda], sda, &pB[(j+4)*sdb], sdb, &beta, &pC[(j+4)*ps+(i+4)*sdc], sdc, &pD[(j+4)*ps+(i+4)*sdd], sdd);
+ }
+ else
+ {
+ kernel_dsyrk_nt_l_12x4_vs_lib4(k, &alpha, &pA[i*sda], sda, &pB[j*sdb], &beta, &pC[j*ps+i*sdc], sdc, &pD[j*ps+i*sdd], sdd, m-i, n-j);
+ if(j<n-4)
+ {
+ kernel_dsyrk_nt_l_8x4_vs_lib4(k, &alpha, &pA[(i+4)*sda], sda, &pB[(j+4)*sdb], &beta, &pC[(j+4)*ps+(i+4)*sdc], sdc, &pD[(j+4)*ps+(i+4)*sdd], sdd, m-i-4, n-j-4);
+ if(j<n-8)
+ {
+ kernel_dsyrk_nt_l_4x4_vs_lib4(k, &alpha, &pA[(i+8)*sda], &pB[(j+8)*sdb], &beta, &pC[(j+8)*ps+(i+8)*sdc], &pD[(j+8)*ps+(i+8)*sdd], m-i-8, n-j-8);
+ }
+ }
+ }
+ }
+ }
+ }
+ if(m>i)
+ {
+ if(m-i<=4)
+ {
+ goto left_4;
+ }
+ else if(m-i<=8)
+ {
+ goto left_8;
+ }
+ else
+ {
+ goto left_12;
+ }
+ }
+#elif defined(TARGET_X64_INTEL_SANDY_BRIDGE)
+ for(; i<m-7; i+=8)
+ {
+ j = 0;
+ for(; j<i & j<n-3; j+=4)
+ {
+ kernel_dgemm_nt_8x4_lib4(k, &alpha, &pA[i*sda], sda, &pB[j*sdb], &beta, &pC[j*ps+i*sdc], sdc, &pD[j*ps+i*sdd], sdd);
+ }
+ if(j<n)
+ {
+ if(j<i) // dgemm
+ {
+ kernel_dgemm_nt_8x4_vs_lib4(k, &alpha, &pA[i*sda], sda, &pB[j*sdb], &beta, &pC[j*ps+i*sdc], sdc, &pD[j*ps+i*sdd], sdd, m-i, n-j);
+ }
+ else // dsyrk
+ {
+ if(j<n-7)
+ {
+ kernel_dsyrk_nt_l_8x4_lib4(k, &alpha, &pA[i*sda], sda, &pB[j*sdb], &beta, &pC[j*ps+i*sdc], sdc, &pD[j*ps+i*sdd], sdd);
+ kernel_dsyrk_nt_l_4x4_lib4(k, &alpha, &pA[(i+4)*sda], &pB[(j+4)*sdb], &beta, &pC[(j+4)*ps+(i+4)*sdc], &pD[(j+4)*ps+(i+4)*sdd]);
+ }
+ else
+ {
+ kernel_dsyrk_nt_l_8x4_vs_lib4(k, &alpha, &pA[i*sda], sda, &pB[j*sdb], &beta, &pC[j*ps+i*sdc], sdc, &pD[j*ps+i*sdd], sdd, m-i, n-j);
+ if(j<n-4)
+ {
+ kernel_dsyrk_nt_l_4x4_vs_lib4(k, &alpha, &pA[(i+4)*sda], &pB[(j+4)*sdb], &beta, &pC[(j+4)*ps+(i+4)*sdc], &pD[(j+4)*ps+(i+4)*sdd], m-i-4, n-j-4);
+ }
+ }
+ }
+ }
+ }
+ if(m>i)
+ {
+ if(m-i<=4)
+ {
+ goto left_4;
+ }
+ else
+ {
+ goto left_8;
+ }
+ }
+#else
+ for(; i<m-3; i+=4)
+ {
+ j = 0;
+ for(; j<i & j<n-3; j+=4)
+ {
+ kernel_dgemm_nt_4x4_lib4(k, &alpha, &pA[i*sda], &pB[j*sdb], &beta, &pC[j*ps+i*sdc], &pD[j*ps+i*sdd]);
+ }
+ if(j<n)
+ {
+			if(j<i) // dgemm
+ {
+ kernel_dgemm_nt_4x4_vs_lib4(k, &alpha, &pA[i*sda], &pB[j*sdb], &beta, &pC[j*ps+i*sdc], &pD[j*ps+i*sdd], m-i, n-j);
+ }
+ else // dsyrk
+ {
+ if(j<n-3)
+ {
+ kernel_dsyrk_nt_l_4x4_lib4(k, &alpha, &pA[i*sda], &pB[j*sdb], &beta, &pC[j*ps+i*sdc], &pD[j*ps+i*sdd]);
+ }
+ else
+ {
+ kernel_dsyrk_nt_l_4x4_vs_lib4(k, &alpha, &pA[i*sda], &pB[j*sdb], &beta, &pC[j*ps+i*sdc], &pD[j*ps+i*sdd], m-i, n-j);
+ }
+ }
+ }
+ }
+ if(m>i)
+ {
+ goto left_4;
+ }
+#endif
+ }
+ else
+ {
+#if defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE)
+ for(; i<m-4; i+=8)
+ {
+ j = 0;
+ for(; j<i & j<n; j+=4)
+ {
+ kernel_dgemm_nt_8x4_gen_lib4(k, &alpha, &pA[i*sda], sda, &pB[j*sdb], &beta, offsetC, &pC[j*ps+i*sdc], sdc, offsetD, &pD[j*ps+i*sdd], sdd, 0, m-i, 0, n-j);
+ }
+ if(j<n)
+ {
+ kernel_dsyrk_nt_l_8x4_gen_lib4(k, &alpha, &pA[i*sda], sda, &pB[j*sdb], &beta, offsetC, &pC[j*ps+i*sdc], sdc, offsetD, &pD[j*ps+i*sdd], sdd, 0, m-i, 0, n-j);
+ if(j<n-4)
+ {
+ kernel_dsyrk_nt_l_4x4_gen_lib4(k, &alpha, &pA[(i+4)*sda], &pB[(j+4)*sdb], &beta, offsetC, &pC[(j+4)*ps+(i+4)*sdc], sdc, offsetD, &pD[(j+4)*ps+(i+4)*sdd], sdd, 0, m-i-4, 0, n-j-4);
+ }
+ }
+ }
+ if(m>i)
+ {
+ goto left_4_gen;
+ }
+#else
+ for(; i<m; i+=4)
+ {
+ j = 0;
+ for(; j<i & j<n; j+=4)
+ {
+ kernel_dgemm_nt_4x4_gen_lib4(k, &alpha, &pA[i*sda], &pB[j*sdb], &beta, offsetC, &pC[j*ps+i*sdc], sdc, offsetD, &pD[j*ps+i*sdd], sdd, 0, m-i, 0, n-j);
+ }
+ if(j<n)
+ {
+ kernel_dsyrk_nt_l_4x4_gen_lib4(k, &alpha, &pA[i*sda], &pB[j*sdb], &beta, offsetC, &pC[j*ps+i*sdc], sdc, offsetD, &pD[j*ps+i*sdd], sdd, 0, m-i, 0, n-j);
+ }
+ }
+#endif
+ }
+
+ // common return if i==m
+ return;
+
+	// clean-up loop definitions
+
+#if defined(TARGET_X64_INTEL_HASWELL)
+ left_12:
+ j = 0;
+ for(; j<i & j<n; j+=4)
+ {
+ kernel_dgemm_nt_12x4_vs_lib4(k, &alpha, &pA[i*sda], sda, &pB[j*sdb], &beta, &pC[j*ps+i*sdc], sdc, &pD[j*ps+i*sdd], sdd, m-i, n-j);
+ }
+ if(j<n)
+ {
+ kernel_dsyrk_nt_l_12x4_vs_lib4(k, &alpha, &pA[i*sda], sda, &pB[j*sdb], &beta, &pC[j*ps+i*sdc], sdc, &pD[j*ps+i*sdd], sdd, m-i, n-j);
+ if(j<n-4)
+ {
+ kernel_dsyrk_nt_l_8x4_vs_lib4(k, &alpha, &pA[(i+4)*sda], sda, &pB[(j+4)*sdb], &beta, &pC[(j+4)*ps+(i+4)*sdc], sdc, &pD[(j+4)*ps+(i+4)*sdd], sdd, m-i-4, n-j-4);
+ if(j<n-8)
+ {
+ kernel_dsyrk_nt_l_4x4_vs_lib4(k, &alpha, &pA[(i+8)*sda], &pB[(j+8)*sdb], &beta, &pC[(j+8)*ps+(i+8)*sdc], &pD[(j+8)*ps+(i+8)*sdd], m-i-8, n-j-8);
+ }
+ }
+ }
+ return;
+#endif
+
+#if defined(TARGET_X64_INTEL_HASWELL)
+ left_8:
+ j = 0;
+ for(; j<i-8 & j<n-8; j+=12)
+ {
+ kernel_dgemm_nt_8x8l_vs_lib4(k, &alpha, &pA[i*sda], sda, &pB[j*sdb], sdb, &beta, &pC[j*ps+i*sdc], sdc, &pD[j*ps+i*sdd], sdd, m-i, n-j);
+ kernel_dgemm_nt_8x8u_vs_lib4(k, &alpha, &pA[i*sda], sda, &pB[(j+4)*sdb], sdb, &beta, &pC[(j+4)*ps+i*sdc], sdc, &pD[(j+4)*ps+i*sdd], sdd, m-i, n-(j+4));
+ }
+ if(j<i-4 & j<n-4)
+ {
+ kernel_dgemm_nt_8x8l_vs_lib4(k, &alpha, &pA[i*sda], sda, &pB[j*sdb], sdb, &beta, &pC[j*ps+i*sdc], sdc, &pD[j*ps+i*sdd], sdd, m-i, n-j);
+ kernel_dgemm_nt_4x4_vs_lib4(k, &alpha, &pA[i*sda], &pB[(j+4)*sdb], &beta, &pC[(j+4)*ps+i*sdc], &pD[(j+4)*ps+i*sdd], m-i, n-(j+4));
+ j += 8;
+ }
+ if(j<i & j<n)
+ {
+ kernel_dgemm_nt_8x4_vs_lib4(k, &alpha, &pA[i*sda], sda, &pB[j*sdb], &beta, &pC[j*ps+i*sdc], sdc, &pD[j*ps+i*sdd], sdd, m-i, n-j);
+ j += 4;
+ }
+ if(j<n)
+ {
+ kernel_dsyrk_nt_l_8x4_vs_lib4(k, &alpha, &pA[i*sda], sda, &pB[j*sdb], &beta, &pC[j*ps+i*sdc], sdc, &pD[j*ps+i*sdd], sdd, m-i, n-j);
+ if(j<n-4)
+ {
+ kernel_dsyrk_nt_l_4x4_vs_lib4(k, &alpha, &pA[(i+4)*sda], &pB[(j+4)*sdb], &beta, &pC[(j+4)*ps+(i+4)*sdc], &pD[(j+4)*ps+(i+4)*sdd], m-i-4, n-j-4);
+ }
+ }
+ return;
+#elif defined(TARGET_X64_INTEL_SANDY_BRIDGE)
+ left_8:
+ j = 0;
+ for(; j<i & j<n; j+=4)
+ {
+ kernel_dgemm_nt_8x4_vs_lib4(k, &alpha, &pA[i*sda], sda, &pB[j*sdb], &beta, &pC[j*ps+i*sdc], sdc, &pD[j*ps+i*sdd], sdd, m-i, n-j);
+ }
+ if(j<n)
+ {
+ kernel_dsyrk_nt_l_8x4_vs_lib4(k, &alpha, &pA[i*sda], sda, &pB[j*sdb], &beta, &pC[j*ps+i*sdc], sdc, &pD[j*ps+i*sdd], sdd, m-i, n-j);
+ if(j<n-4)
+ {
+ kernel_dsyrk_nt_l_4x4_vs_lib4(k, &alpha, &pA[(i+4)*sda], &pB[(j+4)*sdb], &beta, &pC[(j+4)*ps+(i+4)*sdc], &pD[(j+4)*ps+(i+4)*sdd], m-i-4, n-j-4);
+ }
+ }
+ return;
+#endif
+
+#if defined(TARGET_X64_INTEL_HASWELL)
+ left_4:
+ j = 0;
+ for(; j<i-8 & j<n-8; j+=12)
+ {
+ kernel_dgemm_nt_4x12_vs_lib4(k, &alpha, &pA[i*sda], &pB[j*sdb], sdb, &beta, &pC[j*ps+i*sdc], &pD[j*ps+i*sdd], m-i, n-j);
+ }
+ if(j<i-4 & j<n-4)
+ {
+ kernel_dgemm_nt_4x8_vs_lib4(k, &alpha, &pA[i*sda], &pB[j*sdb], sdb, &beta, &pC[j*ps+i*sdc], &pD[j*ps+i*sdd], m-i, n-j);
+ j += 8;
+ }
+ else if(j<i & j<n)
+ {
+ kernel_dgemm_nt_4x4_vs_lib4(k, &alpha, &pA[i*sda], &pB[j*sdb], &beta, &pC[j*ps+i*sdc], &pD[j*ps+i*sdd], m-i, n-j);
+ j += 4;
+ }
+ if(j<n)
+ {
+ kernel_dsyrk_nt_l_4x4_vs_lib4(k, &alpha, &pA[i*sda], &pB[j*sdb], &beta, &pC[j*ps+i*sdc], &pD[j*ps+i*sdd], m-i, n-j);
+ }
+ return;
+#else
+ left_4:
+ j = 0;
+ for(; j<i & j<n; j+=4)
+ {
+ kernel_dgemm_nt_4x4_vs_lib4(k, &alpha, &pA[i*sda], &pB[j*sdb], &beta, &pC[j*ps+i*sdc], &pD[j*ps+i*sdd], m-i, n-j);
+ }
+ if(j<n)
+ {
+ kernel_dsyrk_nt_l_4x4_vs_lib4(k, &alpha, &pA[i*sda], &pB[j*sdb], &beta, &pC[j*ps+i*sdc], &pD[j*ps+i*sdd], m-i, n-j);
+ }
+ return;
+#endif
+
+ left_4_gen:
+ j = 0;
+ for(; j<i & j<n; j+=4)
+ {
+ kernel_dgemm_nt_4x4_gen_lib4(k, &alpha, &pA[i*sda], &pB[j*sdb], &beta, offsetC, &pC[j*ps+i*sdc], sdc, offsetD, &pD[j*ps+i*sdd], sdd, 0, m-i, 0, n-j);
+ }
+ if(j<n)
+ {
+ kernel_dsyrk_nt_l_4x4_gen_lib4(k, &alpha, &pA[i*sda], &pB[j*sdb], &beta, offsetC, &pC[j*ps+i*sdc], sdc, offsetD, &pD[j*ps+i*sdd], sdd, 0, m-i, 0, n-j);
+ }
+ return;
+
+ }
+
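+#if 0
+// minimal usage sketch of the routine above (illustrative only: the
+// helpers d_allocate_strmat, d_cvt_mat2strmat and d_free_strmat are
+// assumed from the blasfeo_d_aux API)
+static void example_dsyrk_ln_mn(int m, int n, int k, double *A, double *C)
+	{
+	struct d_strmat sA, sC, sD;
+	d_allocate_strmat(m, k, &sA);
+	d_allocate_strmat(m, m, &sC);
+	d_allocate_strmat(m, m, &sD);
+	d_cvt_mat2strmat(m, k, A, m, &sA, 0, 0);
+	d_cvt_mat2strmat(m, m, C, m, &sC, 0, 0);
+	// D[0:m,0:n] <= 1.0*C + 1.0*A*A^T, lower-triangular part only
+	dsyrk_ln_mn_libstr(m, n, k, 1.0, &sA, 0, 0, &sA, 0, 0, 1.0, &sC, 0, 0, &sD, 0, 0);
+	d_free_strmat(&sA);
+	d_free_strmat(&sC);
+	d_free_strmat(&sD);
+	}
+#endif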
+
+
+#else
+
+#error : wrong LA choice
+
+#endif
+
+
+
diff --git a/blas/d_blas_64.h b/blas/d_blas_64.h
new file mode 100644
index 0000000..8e6aba2
--- /dev/null
+++ b/blas/d_blas_64.h
@@ -0,0 +1,65 @@
+/**************************************************************************************************
+* *
+* This file is part of BLASFEO. *
+* *
+* BLASFEO -- BLAS For Embedded Optimization. *
+* Copyright (C) 2016-2017 by Gianluca Frison. *
+* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl. *
+* All rights reserved. *
+* *
+* HPMPC is free software; you can redistribute it and/or *
+* modify it under the terms of the GNU Lesser General Public *
+* License as published by the Free Software Foundation; either *
+* version 2.1 of the License, or (at your option) any later version. *
+* *
+* HPMPC is distributed in the hope that it will be useful, *
+* but WITHOUT ANY WARRANTY; without even the implied warranty of *
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. *
+* See the GNU Lesser General Public License for more details. *
+* *
+* You should have received a copy of the GNU Lesser General Public *
+* License along with HPMPC; if not, write to the Free Software *
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA *
+* *
+* Author: Gianluca Frison, giaf (at) dtu.dk *
+* gianluca.frison (at) imtek.uni-freiburg.de *
+* *
+**************************************************************************************************/
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+
+
+// prototypes of the BLAS and LAPACK routines employed in the BLASFEO WR (wrapper) version
+
+// level 1
+void dcopy_(long long *m, double *x, long long *incx, double *y, long long *incy);
+void daxpy_(long long *m, double *alpha, double *x, long long *incx, double *y, long long *incy);
+void dscal_(long long *m, double *alpha, double *x, long long *incx);
+
+// level 2
+void dgemv_(char *ta, long long *m, long long *n, double *alpha, double *A, long long *lda, double *x, long long *incx, double *beta, double *y, long long *incy);
+void dsymv_(char *uplo, long long *m, double *alpha, double *A, long long *lda, double *x, long long *incx, double *beta, double *y, long long *incy);
+void dtrmv_(char *uplo, char *trans, char *diag, long long *n, double *A, long long *lda, double *x, long long *incx);
+void dtrsv_(char *uplo, char *trans, char *diag, long long *n, double *A, long long *lda, double *x, long long *incx);
+void dger_(long long *m, long long *n, double *alpha, double *x, long long *incx, double *y, long long *incy, double *A, long long *lda);
+
+// level 3
+void dgemm_(char *ta, char *tb, long long *m, long long *n, long long *k, double *alpha, double *A, long long *lda, double *B, long long *ldb, double *beta, double *C, long long *ldc);
+void dsyrk_(char *uplo, char *trans, long long *n, long long *k, double *alpha, double *A, long long *lda, double *beta, double *C, long long *ldc);
+void dtrmm_(char *side, char *uplo, char *trans, char *diag, long long *m, long long *n, double *alpha, double *A, long long *lda, double *B, long long *ldb);
+void dtrsm_(char *side, char *uplo, char *trans, char *diag, long long *m, long long *n, double *alpha, double *A, long long *lda, double *B, long long *ldb);
+
+// lapack
+long long dpotrf_(char *uplo, long long *m, double *A, long long *lda, long long *info);
+long long dgetrf_(long long *m, long long *n, double *A, long long *lda, long long *ipiv, long long *info);
+void dgeqrf_(long long *m, long long *n, double *A, long long *lda, double *tau, double *work, long long *lwork, long long *info);
+void dgeqr2_(long long *m, long long *n, double *A, long long *lda, double *tau, double *work, long long *info);
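+
+// note: all integer arguments above are long long: this header is meant
+// for a BLAS/LAPACK built with 64-bit integers (see d_lapack_lib.c, which
+// selects it when REF_BLAS_BLIS is defined); d_blas.h carries the usual
+// 32-bit-int prototypes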
+
+
+
+#ifdef __cplusplus
+}
+#endif
diff --git a/blas/d_lapack_lib.c b/blas/d_lapack_lib.c
new file mode 100644
index 0000000..ce68c3d
--- /dev/null
+++ b/blas/d_lapack_lib.c
@@ -0,0 +1,75 @@
+/**************************************************************************************************
+* *
+* This file is part of BLASFEO. *
+* *
+* BLASFEO -- BLAS For Embedded Optimization. *
+* Copyright (C) 2016-2017 by Gianluca Frison. *
+* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl. *
+* All rights reserved. *
+* *
+* HPMPC is free software; you can redistribute it and/or *
+* modify it under the terms of the GNU Lesser General Public *
+* License as published by the Free Software Foundation; either *
+* version 2.1 of the License, or (at your option) any later version. *
+* *
+* HPMPC is distributed in the hope that it will be useful, *
+* but WITHOUT ANY WARRANTY; without even the implied warranty of *
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. *
+* See the GNU Lesser General Public License for more details. *
+* *
+* You should have received a copy of the GNU Lesser General Public *
+* License along with HPMPC; if not, write to the Free Software *
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA *
+* *
+* Author: Gianluca Frison, giaf (at) dtu.dk *
+* gianluca.frison (at) imtek.uni-freiburg.de *
+* *
+**************************************************************************************************/
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <math.h>
+
+#if defined(LA_BLAS)
+#if defined(REF_BLAS_BLIS)
+#include "d_blas_64.h"
+#else
+#include "d_blas.h"
+#endif
+#endif
+
+#include "../include/blasfeo_common.h"
+#include "../include/blasfeo_d_aux.h"
+
+
+
+#define REAL double
+
+#define STRMAT d_strmat
+#define STRVEC d_strvec
+
+#define GELQF_LIBSTR dgelqf_libstr
+#define GELQF_WORK_SIZE_LIBSTR dgelqf_work_size_libstr
+#define GEQRF_LIBSTR dgeqrf_libstr
+#define GEQRF_WORK_SIZE_LIBSTR dgeqrf_work_size_libstr
+#define GETF2_NOPIVOT dgetf2_nopivot
+#define GETRF_NOPIVOT_LIBSTR dgetrf_nopivot_libstr
+#define GETRF_LIBSTR dgetrf_libstr
+#define POTRF_L_LIBSTR dpotrf_l_libstr
+#define POTRF_L_MN_LIBSTR dpotrf_l_mn_libstr
+#define SYRK_POTRF_LN_LIBSTR dsyrk_dpotrf_ln_libstr
+
+#define COPY dcopy_
+#define GELQF dgelqf_
+#define GEMM dgemm_
+#define GER dger_
+#define GEQRF dgeqrf_
+#define GEQR2 dgeqr2_
+#define GETRF dgetrf_
+#define POTRF dpotrf_
+#define SCAL dscal_
+#define SYRK dsyrk_
+#define TRSM dtrsm_
+
+
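+// x_lapack_lib.c is a precision-agnostic template: including it after the
+// macro definitions above instantiates the double-precision routines, e.g.
+// a template body written as
+//
+//	void POTRF_L_LIBSTR(int m, struct STRMAT *sC, ...)
+//
+// expands here to
+//
+//	void dpotrf_l_libstr(int m, struct d_strmat *sC, ...)
+//
+// (argument lists abbreviated); the single-precision file presumably
+// repeats the same pattern with the s_ names
+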
+#include "x_lapack_lib.c"
diff --git a/blas/d_lapack_lib4.c b/blas/d_lapack_lib4.c
new file mode 100644
index 0000000..75a4a4f
--- /dev/null
+++ b/blas/d_lapack_lib4.c
@@ -0,0 +1,2671 @@
+/**************************************************************************************************
+* *
+* This file is part of BLASFEO. *
+* *
+* BLASFEO -- BLAS For Embedded Optimization. *
+* Copyright (C) 2016-2017 by Gianluca Frison. *
+* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl. *
+* All rights reserved. *
+* *
+* HPMPC is free software; you can redistribute it and/or *
+* modify it under the terms of the GNU Lesser General Public *
+* License as published by the Free Software Foundation; either *
+* version 2.1 of the License, or (at your option) any later version. *
+* *
+* HPMPC is distributed in the hope that it will be useful, *
+* but WITHOUT ANY WARRANTY; without even the implied warranty of *
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. *
+* See the GNU Lesser General Public License for more details. *
+* *
+* You should have received a copy of the GNU Lesser General Public *
+* License along with HPMPC; if not, write to the Free Software *
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA *
+* *
+* Author: Gianluca Frison, giaf (at) dtu.dk *
+* gianluca.frison (at) imtek.uni-freiburg.de *
+* *
+**************************************************************************************************/
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <math.h>
+
+#include "../include/blasfeo_common.h"
+#include "../include/blasfeo_d_aux.h"
+#include "../include/blasfeo_d_kernel.h"
+
+
+
+/****************************
+* old interface
+****************************/
+
+
+
+void dgetrf_nn_nopivot_lib(int m, int n, double *pC, int sdc, double *pD, int sdd, double *inv_diag_D)
+ {
+
+ if(m<=0 || n<=0)
+ return;
+
+ const int ps = 4;
+
+ int ii, jj, ie;
+
+ // main loop
+ ii = 0;
+#if defined(TARGET_X64_INTEL_HASWELL)
+ for( ; ii<m-11; ii+=12)
+ {
+ jj = 0;
+ // solve lower
+ ie = n<ii ? n : ii; // ie is multiple of 4
+ for( ; jj<ie-3; jj+=4)
+ {
+ kernel_dtrsm_nn_ru_inv_12x4_lib4(jj, &pD[ii*sdd], sdd, &pD[jj*ps], sdd, &pC[jj*ps+ii*sdc], sdc, &pD[jj*ps+ii*sdd], sdd, &pD[jj*ps+jj*sdd], &inv_diag_D[jj]);
+ }
+ if(jj<ie)
+ {
+ kernel_dtrsm_nn_ru_inv_12x4_vs_lib4(jj, &pD[ii*sdd], sdd, &pD[jj*ps], sdd, &pC[jj*ps+ii*sdc], sdc, &pD[jj*ps+ii*sdd], sdd, &pD[jj*ps+jj*sdd], &inv_diag_D[jj], m-ii, ie-jj);
+ jj+=4;
+ }
+ // factorize
+ if(jj<n-3)
+ {
+ kernel_dgetrf_nn_l_12x4_lib4(ii, &pD[ii*sdd], sdd, &pD[jj*ps], sdd, &pC[jj*ps+ii*sdc], sdc, &pD[jj*ps+ii*sdd], sdd, &inv_diag_D[jj]);
+ jj+=4;
+ }
+ else if(jj<n)
+ {
+ kernel_dgetrf_nn_l_12x4_vs_lib4(ii, &pD[ii*sdd], sdd, &pD[jj*ps], sdd, &pC[jj*ps+ii*sdc], sdc, &pD[jj*ps+ii*sdd], sdd, &inv_diag_D[jj], m-ii, n-jj);
+ jj+=4;
+ }
+ if(jj<n-3)
+ {
+ kernel_dgetrf_nn_m_12x4_lib4(ii, &pD[ii*sdd], sdd, &pD[jj*ps], sdd, &pC[jj*ps+ii*sdc], sdc, &pD[jj*ps+ii*sdd], sdd, &inv_diag_D[jj]);
+ jj+=4;
+ }
+ else if(jj<n)
+ {
+ kernel_dgetrf_nn_m_12x4_vs_lib4(jj, &pD[ii*sdd], sdd, &pD[jj*ps], sdd, &pC[jj*ps+ii*sdc], sdc, &pD[jj*ps+ii*sdd], sdd, &inv_diag_D[jj], m-ii, n-jj);
+ jj+=4;
+ }
+ if(jj<n-3)
+ {
+ kernel_dgetrf_nn_r_12x4_lib4(ii, &pD[ii*sdd], sdd, &pD[jj*ps], sdd, &pC[jj*ps+ii*sdc], sdc, &pD[jj*ps+ii*sdd], sdd, &inv_diag_D[jj]);
+ jj+=4;
+ }
+ else if(jj<n)
+ {
+ kernel_dgetrf_nn_r_12x4_vs_lib4(ii, &pD[ii*sdd], sdd, &pD[jj*ps], sdd, &pC[jj*ps+ii*sdc], sdc, &pD[jj*ps+ii*sdd], sdd, &inv_diag_D[jj], m-ii, n-jj);
+ jj+=4;
+ }
+ // solve upper
+ for( ; jj<n-3; jj+=4)
+ {
+ kernel_dtrsm_nn_ll_one_12x4_lib4(ii, &pD[ii*sdd], sdd, &pD[jj*ps], sdd, &pC[jj*ps+ii*sdc], sdc, &pD[jj*ps+ii*sdd], sdd, &pD[ii*ps+ii*sdd], sdd);
+ }
+ if(jj<n)
+ {
+ kernel_dtrsm_nn_ll_one_12x4_vs_lib4(ii, &pD[ii*sdd], sdd, &pD[jj*ps], sdd, &pC[jj*ps+ii*sdc], sdc, &pD[jj*ps+ii*sdd], sdd, &pD[ii*ps+ii*sdd], sdd, m-ii, n-jj);
+ }
+ }
+ if(m>ii)
+ {
+ if(m-ii<=4)
+ {
+ goto left_4;
+ }
+ else if(m-ii<=8)
+ {
+ goto left_8;
+ }
+ else
+ {
+ goto left_12;
+ }
+ }
+#elif defined(TARGET_X64_INTEL_SANDY_BRIDGE)
+ for( ; ii<m-7; ii+=8)
+ {
+ jj = 0;
+ // solve lower
+ ie = n<ii ? n : ii; // ie is multiple of 4
+ for( ; jj<ie-3; jj+=4)
+ {
+ kernel_dtrsm_nn_ru_inv_8x4_lib4(jj, &pD[ii*sdd], sdd, &pD[jj*ps], sdd, &pC[jj*ps+ii*sdc], sdc, &pD[jj*ps+ii*sdd], sdd, &pD[jj*ps+jj*sdd], &inv_diag_D[jj]);
+ }
+ if(jj<ie)
+ {
+ kernel_dtrsm_nn_ru_inv_8x4_vs_lib4(jj, &pD[ii*sdd], sdd, &pD[jj*ps], sdd, &pC[jj*ps+ii*sdc], sdc, &pD[jj*ps+ii*sdd], sdd, &pD[jj*ps+jj*sdd], &inv_diag_D[jj], m-ii, ie-jj);
+ jj+=4;
+ }
+ // factorize
+ if(jj<n-3)
+ {
+ kernel_dgetrf_nn_l_8x4_lib4(jj, &pD[ii*sdd], sdd, &pD[jj*ps], sdd, &pC[jj*ps+ii*sdc], sdc, &pD[jj*ps+ii*sdd], sdd, &inv_diag_D[jj]);
+// kernel_dgetrf_nn_4x4_lib4(jj, &pD[ii*sdd], &pD[jj*ps], sdd, &pC[jj*ps+ii*sdc], &pD[jj*ps+ii*sdd], &inv_diag_D[jj]);
+// kernel_dtrsm_nn_ru_inv_4x4_lib4(jj, &pD[(ii+4)*sdd], &pD[jj*ps], sdd, &pC[jj*ps+(ii+4)*sdc], &pD[jj*ps+(ii+4)*sdd], &pD[jj*ps+jj*sdd], &inv_diag_D[jj]);
+ jj+=4;
+ }
+ else if(jj<n)
+ {
+ kernel_dgetrf_nn_l_8x4_vs_lib4(jj, &pD[ii*sdd], sdd, &pD[jj*ps], sdd, &pC[jj*ps+ii*sdc], sdc, &pD[jj*ps+ii*sdd], sdd, &inv_diag_D[jj], m-ii, n-jj);
+// kernel_dgetrf_nn_4x4_vs_lib4(jj, &pD[ii*sdd], &pD[jj*ps], sdd, &pC[jj*ps+ii*sdc], &pD[jj*ps+ii*sdd], &inv_diag_D[jj], m-ii, n-jj);
+// kernel_dtrsm_nn_ru_inv_4x4_vs_lib4(jj, &pD[(ii+4)*sdd], &pD[jj*ps], sdd, &pC[jj*ps+(ii+4)*sdc], &pD[jj*ps+(ii+4)*sdd], &pD[jj*ps+jj*sdd], &inv_diag_D[jj], m-(ii+4), n-jj);
+ jj+=4;
+ }
+ if(jj<n-3)
+ {
+ kernel_dtrsm_nn_ll_one_4x4_lib4(ii, &pD[ii*sdd], &pD[jj*ps], sdd, &pC[jj*ps+ii*sdc], &pD[jj*ps+ii*sdd], &pD[ii*ps+ii*sdd]);
+ kernel_dgetrf_nn_4x4_lib4(jj, &pD[(ii+4)*sdd], &pD[jj*ps], sdd, &pC[jj*ps+(ii+4)*sdc], &pD[jj*ps+(ii+4)*sdd], &inv_diag_D[jj]);
+ jj+=4;
+ }
+ else if(jj<n)
+ {
+ kernel_dtrsm_nn_ll_one_4x4_vs_lib4(ii, &pD[ii*sdd], &pD[jj*ps], sdd, &pC[jj*ps+ii*sdc], &pD[jj*ps+ii*sdd], &pD[ii*ps+ii*sdd], m-ii, n-jj);
+ kernel_dgetrf_nn_4x4_vs_lib4(jj, &pD[(ii+4)*sdd], &pD[jj*ps], sdd, &pC[jj*ps+(ii+4)*sdc], &pD[jj*ps+(ii+4)*sdd], &inv_diag_D[jj], m-(ii+4), n-jj);
+ jj+=4;
+ }
+ // solve upper
+ for( ; jj<n-3; jj+=4)
+ {
+ kernel_dtrsm_nn_ll_one_8x4_lib4(ii, &pD[ii*sdd], sdd, &pD[jj*ps], sdd, &pC[jj*ps+ii*sdc], sdc, &pD[jj*ps+ii*sdd],sdd, &pD[ii*ps+ii*sdd], sdd);
+ }
+ if(jj<n)
+ {
+ kernel_dtrsm_nn_ll_one_8x4_vs_lib4(ii, &pD[ii*sdd], sdd, &pD[jj*ps], sdd, &pC[jj*ps+ii*sdc], sdc, &pD[jj*ps+ii*sdd], sdd, &pD[ii*ps+ii*sdd], sdd, m-ii, n-jj);
+ }
+ }
+ if(m>ii)
+ {
+ if(m-ii<=4)
+ {
+ goto left_4;
+ }
+ else
+ {
+ goto left_8;
+ }
+ }
+#else
+ for( ; ii<m-3; ii+=4)
+ {
+ jj = 0;
+ // solve lower
+ ie = n<ii ? n : ii; // ie is multiple of 4
+ for( ; jj<ie-3; jj+=4)
+ {
+ kernel_dtrsm_nn_ru_inv_4x4_lib4(jj, &pD[ii*sdd], &pD[jj*ps], sdd, &pC[jj*ps+ii*sdc], &pD[jj*ps+ii*sdd], &pD[jj*ps+jj*sdd], &inv_diag_D[jj]);
+ }
+ if(jj<ie)
+ {
+ kernel_dtrsm_nn_ru_inv_4x4_vs_lib4(jj, &pD[ii*sdd], &pD[jj*ps], sdd, &pC[jj*ps+ii*sdc], &pD[jj*ps+ii*sdd], &pD[jj*ps+jj*sdd], &inv_diag_D[jj], m-ii, ie-jj);
+ jj+=4;
+ }
+ // factorize
+ if(jj<n-3)
+ {
+ kernel_dgetrf_nn_4x4_lib4(jj, &pD[ii*sdd], &pD[jj*ps], sdd, &pC[jj*ps+ii*sdc], &pD[jj*ps+ii*sdd], &inv_diag_D[jj]);
+ jj+=4;
+ }
+ else if(jj<n)
+ {
+ kernel_dgetrf_nn_4x4_vs_lib4(jj, &pD[ii*sdd], &pD[jj*ps], sdd, &pC[jj*ps+ii*sdc], &pD[jj*ps+ii*sdd], &inv_diag_D[jj], m-ii, n-jj);
+ jj+=4;
+ }
+ // solve upper
+ for( ; jj<n-3; jj+=4)
+ {
+ kernel_dtrsm_nn_ll_one_4x4_lib4(ii, &pD[ii*sdd], &pD[jj*ps], sdd, &pC[jj*ps+ii*sdc], &pD[jj*ps+ii*sdd], &pD[ii*ps+ii*sdd]);
+ }
+ if(jj<n)
+ {
+ kernel_dtrsm_nn_ll_one_4x4_vs_lib4(ii, &pD[ii*sdd], &pD[jj*ps], sdd, &pC[jj*ps+ii*sdc], &pD[jj*ps+ii*sdd], &pD[ii*ps+ii*sdd], m-ii, n-jj);
+ }
+ }
+ if(m>ii)
+ {
+ goto left_4;
+ }
+
+#endif
+
+ // common return if i==m
+ return;
+
+#if defined(TARGET_X64_INTEL_HASWELL)
+ left_12:
+ jj = 0;
+ // solve lower
+ ie = n<ii ? n : ii; // ie is multiple of 4
+ for( ; jj<ie; jj+=4)
+ {
+ kernel_dtrsm_nn_ru_inv_12x4_vs_lib4(jj, &pD[ii*sdd], sdd, &pD[jj*ps], sdd, &pC[jj*ps+ii*sdc], sdc, &pD[jj*ps+ii*sdd], sdd, &pD[jj*ps+jj*sdd], &inv_diag_D[jj], m-ii, ie-jj);
+ }
+ // factorize
+ if(jj<n)
+ {
+ kernel_dgetrf_nn_l_12x4_vs_lib4(ii, &pD[ii*sdd], sdd, &pD[jj*ps], sdd, &pC[jj*ps+ii*sdc], sdc, &pD[jj*ps+ii*sdd], sdd, &inv_diag_D[jj], m-ii, n-jj);
+ jj+=4;
+ }
+ if(jj<n)
+ {
+		kernel_dgetrf_nn_m_12x4_vs_lib4(ii, &pD[ii*sdd], sdd, &pD[jj*ps], sdd, &pC[jj*ps+ii*sdc], sdc, &pD[jj*ps+ii*sdd], sdd, &inv_diag_D[jj], m-ii, n-jj);
+ jj+=4;
+ }
+ if(jj<n)
+ {
+ kernel_dgetrf_nn_r_12x4_vs_lib4(ii, &pD[ii*sdd], sdd, &pD[jj*ps], sdd, &pC[jj*ps+ii*sdc], sdc, &pD[jj*ps+ii*sdd], sdd, &inv_diag_D[jj], m-ii, n-jj);
+ jj+=4;
+ }
+ // solve upper
+ for( ; jj<n; jj+=4)
+ {
+ kernel_dtrsm_nn_ll_one_12x4_vs_lib4(ii, &pD[ii*sdd], sdd, &pD[jj*ps], sdd, &pC[jj*ps+ii*sdc], sdc, &pD[jj*ps+ii*sdd], sdd, &pD[ii*ps+ii*sdd], sdd, m-ii, n-jj);
+ }
+ return;
+
+#endif
+
+#if defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE)
+ left_8:
+ jj = 0;
+ // solve lower
+ ie = n<ii ? n : ii; // ie is multiple of 4
+ for( ; jj<ie; jj+=4)
+ {
+ kernel_dtrsm_nn_ru_inv_8x4_vs_lib4(jj, &pD[ii*sdd], sdd, &pD[jj*ps], sdd, &pC[jj*ps+ii*sdc], sdc, &pD[jj*ps+ii*sdd], sdd, &pD[jj*ps+jj*sdd], &inv_diag_D[jj], m-ii, ie-jj);
+ }
+ // factorize
+ if(jj<n)
+ {
+ kernel_dgetrf_nn_l_8x4_vs_lib4(jj, &pD[ii*sdd], sdd, &pD[jj*ps], sdd, &pC[jj*ps+ii*sdc], sdc, &pD[jj*ps+ii*sdd], sdd, &inv_diag_D[jj], m-ii, n-jj);
+// kernel_dgetrf_nn_4x4_vs_lib4(jj, &pD[ii*sdd], &pD[jj*ps], sdd, &pC[jj*ps+ii*sdc], &pD[jj*ps+ii*sdd], &inv_diag_D[jj], m-ii, n-jj);
+// kernel_dtrsm_nn_ru_inv_4x4_vs_lib4(jj, &pD[(ii+4)*sdd], &pD[jj*ps], sdd, &pC[jj*ps+(ii+4)*sdc], &pD[jj*ps+(ii+4)*sdd], &pD[jj*ps+jj*sdd], &inv_diag_D[jj], m-(ii+4), n-jj);
+ jj+=4;
+ }
+ if(jj<n)
+ {
+ kernel_dtrsm_nn_ll_one_4x4_vs_lib4(ii, &pD[ii*sdd], &pD[jj*ps], sdd, &pC[jj*ps+ii*sdc], &pD[jj*ps+ii*sdd], &pD[ii*ps+ii*sdd], m-ii, n-jj);
+ kernel_dgetrf_nn_4x4_vs_lib4(jj, &pD[(ii+4)*sdd], &pD[jj*ps], sdd, &pC[jj*ps+(ii+4)*sdc], &pD[jj*ps+(ii+4)*sdd], &inv_diag_D[jj], m-(ii+4), n-jj);
+ jj+=4;
+ }
+ // solve upper
+ for( ; jj<n; jj+=4)
+ {
+ kernel_dtrsm_nn_ll_one_8x4_vs_lib4(ii, &pD[ii*sdd], sdd, &pD[jj*ps], sdd, &pC[jj*ps+ii*sdc], sdc, &pD[jj*ps+ii*sdd], sdd, &pD[ii*ps+ii*sdd], sdd, m-ii, n-jj);
+ }
+ return;
+
+#endif
+
+ left_4:
+ jj = 0;
+ // solve lower
+ ie = n<ii ? n : ii; // ie is multiple of 4
+ for( ; jj<ie; jj+=4)
+ {
+ kernel_dtrsm_nn_ru_inv_4x4_vs_lib4(jj, &pD[ii*sdd], &pD[jj*ps], sdd, &pC[jj*ps+ii*sdc], &pD[jj*ps+ii*sdd], &pD[jj*ps+jj*sdd], &inv_diag_D[jj], m-ii, ie-jj);
+ }
+ // factorize
+ if(jj<n)
+ {
+ kernel_dgetrf_nn_4x4_vs_lib4(jj, &pD[ii*sdd], &pD[jj*ps], sdd, &pC[jj*ps+ii*sdc], &pD[jj*ps+ii*sdd], &inv_diag_D[jj], m-ii, n-jj);
+ jj+=4;
+ }
+ // solve upper
+ for( ; jj<n; jj+=4)
+ {
+ kernel_dtrsm_nn_ll_one_4x4_vs_lib4(ii, &pD[ii*sdd], &pD[jj*ps], sdd, &pC[jj*ps+ii*sdc], &pD[jj*ps+ii*sdd], &pD[ii*ps+ii*sdd], m-ii, n-jj);
+ }
+ return;
+
+ }
+
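+// usage sketch (illustrative only): factorize the m x n matrix packed in
+// ps=4 row-panels at pC (panel stride sdc) as D = L*U, with unit-diagonal
+// L, storing the result in pD and the reciprocals of the diagonal of U in
+// inv_diag_D:
+//
+//	dgetrf_nn_nopivot_lib(m, n, pC, sdc, pD, sdd, inv_diag_D);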
+
+
+void dgetrf_nn_lib(int m, int n, double *pC, int sdc, double *pD, int sdd, double *inv_diag_D, int *ipiv)
+ {
+
+ if(m<=0)
+ return;
+
+ const int ps = 4;
+
+ int ii, jj, i0, i1, j0, ll, p;
+
+ double d1 = 1.0;
+ double dm1 = -1.0;
+
+	// the routine needs to perform row-exchanges on the yet-to-be-factorized matrix too
+ if(pC!=pD)
+ dgecp_lib(m, n, 1.0, 0, pC, sdc, 0, pD, sdd);
+
+ // minimum matrix size
+ p = n<m ? n : m; // XXX
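+
+	// row i of the packed matrix starts at pD + i/ps*ps*sdd + i%ps (element
+	// i%ps of panel i/ps); the drowsw_lib calls below swap two such rows to
+	// apply the partial-pivoting permutation on the panel-major format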
+
+ // main loop
+#if defined(TARGET_X64_INTEL_HASWELL)
+ // 12 columns at a time
+ jj = 0;
+ for(; jj<p-11; jj+=12)
+ {
+ // pivot & factorize & solve lower
+ // left block-column
+ ii = jj;
+ i0 = ii;
+ for( ; ii<m-11; ii+=12)
+ {
+ kernel_dgemm_nn_12x4_lib4(jj, &dm1, &pD[ii*sdd], sdd, 0, &pD[jj*ps], sdd, &d1, &pD[jj*ps+ii*sdd], sdd, &pD[jj*ps+ii*sdd], sdd);
+ }
+ if(m-ii>0)
+ {
+ if(m-ii>8)
+ {
+ kernel_dgemm_nn_12x4_vs_lib4(jj, &dm1, &pD[ii*sdd], sdd, &pD[jj*ps], sdd, &d1, &pD[jj*ps+ii*sdd], sdd, &pD[jj*ps+ii*sdd], sdd, m-ii, 4);
+ }
+ else if(m-ii>4)
+ {
+ kernel_dgemm_nn_8x4_gen_lib4(jj, &dm1, &pD[ii*sdd], sdd, 0, &pD[jj*ps], sdd, &d1, 0, &pD[jj*ps+ii*sdd], sdd, 0, &pD[jj*ps+ii*sdd], sdd, 0, m-ii, 0, 4);
+ }
+ else
+ {
+ kernel_dgemm_nn_4x4_gen_lib4(jj, &dm1, &pD[ii*sdd], 0, &pD[jj*ps], sdd, &d1, 0, &pD[jj*ps+ii*sdd], sdd, 0, &pD[jj*ps+ii*sdd], sdd, 0, m-ii, 0, 4);
+ }
+ }
+ kernel_dgetrf_pivot_4_lib4(m-i0, &pD[jj*ps+i0*sdd], sdd, &inv_diag_D[jj], &ipiv[i0]);
+ ipiv[i0+0] += i0;
+ if(ipiv[i0+0]!=i0+0)
+ {
+ drowsw_lib(jj, pD+(i0+0)/ps*ps*sdd+(i0+0)%ps, pD+(ipiv[i0+0])/ps*ps*sdd+(ipiv[i0+0])%ps);
+ drowsw_lib(n-jj-4, pD+(i0+0)/ps*ps*sdd+(i0+0)%ps+(jj+4)*ps, pD+(ipiv[i0+0])/ps*ps*sdd+(ipiv[i0+0])%ps+(jj+4)*ps);
+ }
+ ipiv[i0+1] += i0;
+ if(ipiv[i0+1]!=i0+1)
+ {
+ drowsw_lib(jj, pD+(i0+1)/ps*ps*sdd+(i0+1)%ps, pD+(ipiv[i0+1])/ps*ps*sdd+(ipiv[i0+1])%ps);
+ drowsw_lib(n-jj-4, pD+(i0+1)/ps*ps*sdd+(i0+1)%ps+(jj+4)*ps, pD+(ipiv[i0+1])/ps*ps*sdd+(ipiv[i0+1])%ps+(jj+4)*ps);
+ }
+ ipiv[i0+2] += i0;
+ if(ipiv[i0+2]!=i0+2)
+ {
+ drowsw_lib(jj, pD+(i0+2)/ps*ps*sdd+(i0+2)%ps, pD+(ipiv[i0+2])/ps*ps*sdd+(ipiv[i0+2])%ps);
+ drowsw_lib(n-jj-4, pD+(i0+2)/ps*ps*sdd+(i0+2)%ps+(jj+4)*ps, pD+(ipiv[i0+2])/ps*ps*sdd+(ipiv[i0+2])%ps+(jj+4)*ps);
+ }
+ ipiv[i0+3] += i0;
+ if(ipiv[i0+3]!=i0+3)
+ {
+ drowsw_lib(jj, pD+(i0+3)/ps*ps*sdd+(i0+3)%ps, pD+(ipiv[i0+3])/ps*ps*sdd+(ipiv[i0+3])%ps);
+ drowsw_lib(n-jj-4, pD+(i0+3)/ps*ps*sdd+(i0+3)%ps+(jj+4)*ps, pD+(ipiv[i0+3])/ps*ps*sdd+(ipiv[i0+3])%ps+(jj+4)*ps);
+ }
+ // middle block-column
+ ii = i0;
+ kernel_dtrsm_nn_ll_one_4x4_lib4(ii, &pD[ii*sdd], &pD[(jj+4)*ps], sdd, &pD[(jj+4)*ps+ii*sdd], &pD[(jj+4)*ps+ii*sdd], &pD[ii*ps+ii*sdd]);
+ ii += 4;
+ i1 = ii;
+ for( ; ii<m-11; ii+=12)
+ {
+ kernel_dgemm_nn_12x4_lib4((jj+4), &dm1, &pD[ii*sdd], sdd, 0, &pD[(jj+4)*ps], sdd, &d1, &pD[(jj+4)*ps+ii*sdd], sdd, &pD[(jj+4)*ps+ii*sdd], sdd);
+ }
+ if(m-ii>0)
+ {
+ if(m-ii>8)
+ {
+ kernel_dgemm_nn_12x4_vs_lib4((jj+4), &dm1, &pD[ii*sdd], sdd, &pD[(jj+4)*ps], sdd, &d1, &pD[(jj+4)*ps+ii*sdd], sdd, &pD[(jj+4)*ps+ii*sdd], sdd, m-ii, 4);
+ }
+ else if(m-ii>4)
+ {
+ kernel_dgemm_nn_8x4_gen_lib4((jj+4), &dm1, &pD[ii*sdd], sdd, 0, &pD[(jj+4)*ps], sdd, &d1, 0, &pD[(jj+4)*ps+ii*sdd], sdd, 0, &pD[(jj+4)*ps+ii*sdd], sdd, 0, m-ii, 0, 4);
+ }
+ else
+ {
+ kernel_dgemm_nn_4x4_gen_lib4((jj+4), &dm1, &pD[ii*sdd], 0, &pD[(jj+4)*ps], sdd, &d1, 0, &pD[(jj+4)*ps+ii*sdd], sdd, 0, &pD[(jj+4)*ps+ii*sdd], sdd, 0, m-ii, 0, 4);
+ }
+ }
+ kernel_dgetrf_pivot_4_lib4(m-i1, &pD[(jj+4)*ps+i1*sdd], sdd, &inv_diag_D[(jj+4)], &ipiv[i1]);
+ ipiv[i1+0] += i1;
+ if(ipiv[i1+0]!=i1+0)
+ {
+ drowsw_lib(jj+4, pD+(i1+0)/ps*ps*sdd+(i1+0)%ps, pD+(ipiv[i1+0])/ps*ps*sdd+(ipiv[i1+0])%ps);
+ drowsw_lib(n-jj-8, pD+(i1+0)/ps*ps*sdd+(i1+0)%ps+(jj+8)*ps, pD+(ipiv[i1+0])/ps*ps*sdd+(ipiv[i1+0])%ps+(jj+8)*ps);
+ }
+ ipiv[i1+1] += i1;
+ if(ipiv[i1+1]!=i1+1)
+ {
+ drowsw_lib(jj+4, pD+(i1+1)/ps*ps*sdd+(i1+1)%ps, pD+(ipiv[i1+1])/ps*ps*sdd+(ipiv[i1+1])%ps);
+ drowsw_lib(n-jj-8, pD+(i1+1)/ps*ps*sdd+(i1+1)%ps+(jj+8)*ps, pD+(ipiv[i1+1])/ps*ps*sdd+(ipiv[i1+1])%ps+(jj+8)*ps);
+ }
+ ipiv[i1+2] += i1;
+ if(ipiv[i1+2]!=i1+2)
+ {
+ drowsw_lib(jj+4, pD+(i1+2)/ps*ps*sdd+(i1+2)%ps, pD+(ipiv[i1+2])/ps*ps*sdd+(ipiv[i1+2])%ps);
+ drowsw_lib(n-jj-8, pD+(i1+2)/ps*ps*sdd+(i1+2)%ps+(jj+8)*ps, pD+(ipiv[i1+2])/ps*ps*sdd+(ipiv[i1+2])%ps+(jj+8)*ps);
+ }
+ ipiv[i1+3] += i1;
+ if(ipiv[i1+3]!=i1+3)
+ {
+ drowsw_lib(jj+4, pD+(i1+3)/ps*ps*sdd+(i1+3)%ps, pD+(ipiv[i1+3])/ps*ps*sdd+(ipiv[i1+3])%ps);
+ drowsw_lib(n-jj-8, pD+(i1+3)/ps*ps*sdd+(i1+3)%ps+(jj+8)*ps, pD+(ipiv[i1+3])/ps*ps*sdd+(ipiv[i1+3])%ps+(jj+8)*ps);
+ }
+ // right block-column
+ ii = i0;
+ kernel_dtrsm_nn_ll_one_8x4_lib4(ii, &pD[ii*sdd], sdd, &pD[(jj+8)*ps], sdd, &pD[(jj+8)*ps+ii*sdd], sdd, &pD[(jj+8)*ps+ii*sdd], sdd, &pD[ii*ps+ii*sdd], sdd);
+ ii += 8;
+ i1 = ii;
+ for( ; ii<m-11; ii+=12)
+ {
+ kernel_dgemm_nn_12x4_lib4((jj+8), &dm1, &pD[ii*sdd], sdd, 0, &pD[(jj+8)*ps], sdd, &d1, &pD[(jj+8)*ps+ii*sdd], sdd, &pD[(jj+8)*ps+ii*sdd], sdd);
+ }
+ if(m-ii>0)
+ {
+ if(m-ii>8)
+ {
+ kernel_dgemm_nn_12x4_vs_lib4((jj+8), &dm1, &pD[ii*sdd], sdd, &pD[(jj+8)*ps], sdd, &d1, &pD[(jj+8)*ps+ii*sdd], sdd, &pD[(jj+8)*ps+ii*sdd], sdd, m-ii, 4);
+ }
+ else if(m-ii>4)
+ {
+ kernel_dgemm_nn_8x4_gen_lib4((jj+8), &dm1, &pD[ii*sdd], sdd, 0, &pD[(jj+8)*ps], sdd, &d1, 0, &pD[(jj+8)*ps+ii*sdd], sdd, 0, &pD[(jj+8)*ps+ii*sdd], sdd, 0, m-ii, 0, 4);
+ }
+ else
+ {
+ kernel_dgemm_nn_4x4_gen_lib4((jj+8), &dm1, &pD[ii*sdd], 0, &pD[(jj+8)*ps], sdd, &d1, 0, &pD[(jj+8)*ps+ii*sdd], sdd, 0, &pD[(jj+8)*ps+ii*sdd], sdd, 0, m-ii, 0, 4);
+ }
+ }
+ kernel_dgetrf_pivot_4_lib4(m-i1, &pD[(jj+8)*ps+i1*sdd], sdd, &inv_diag_D[(jj+8)], &ipiv[i1]);
+ ipiv[i1+0] += i1;
+ if(ipiv[i1+0]!=i1+0)
+ {
+ drowsw_lib(jj+8, pD+(i1+0)/ps*ps*sdd+(i1+0)%ps, pD+(ipiv[i1+0])/ps*ps*sdd+(ipiv[i1+0])%ps);
+ drowsw_lib(n-jj-12, pD+(i1+0)/ps*ps*sdd+(i1+0)%ps+(jj+12)*ps, pD+(ipiv[i1+0])/ps*ps*sdd+(ipiv[i1+0])%ps+(jj+12)*ps);
+ }
+ ipiv[i1+1] += i1;
+ if(ipiv[i1+1]!=i1+1)
+ {
+ drowsw_lib(jj+8, pD+(i1+1)/ps*ps*sdd+(i1+1)%ps, pD+(ipiv[i1+1])/ps*ps*sdd+(ipiv[i1+1])%ps);
+ drowsw_lib(n-jj-12, pD+(i1+1)/ps*ps*sdd+(i1+1)%ps+(jj+12)*ps, pD+(ipiv[i1+1])/ps*ps*sdd+(ipiv[i1+1])%ps+(jj+12)*ps);
+ }
+ ipiv[i1+2] += i1;
+ if(ipiv[i1+2]!=i1+2)
+ {
+ drowsw_lib(jj+8, pD+(i1+2)/ps*ps*sdd+(i1+2)%ps, pD+(ipiv[i1+2])/ps*ps*sdd+(ipiv[i1+2])%ps);
+ drowsw_lib(n-jj-12, pD+(i1+2)/ps*ps*sdd+(i1+2)%ps+(jj+12)*ps, pD+(ipiv[i1+2])/ps*ps*sdd+(ipiv[i1+2])%ps+(jj+12)*ps);
+ }
+ ipiv[i1+3] += i1;
+ if(ipiv[i1+3]!=i1+3)
+ {
+ drowsw_lib(jj+8, pD+(i1+3)/ps*ps*sdd+(i1+3)%ps, pD+(ipiv[i1+3])/ps*ps*sdd+(ipiv[i1+3])%ps);
+ drowsw_lib(n-jj-12, pD+(i1+3)/ps*ps*sdd+(i1+3)%ps+(jj+12)*ps, pD+(ipiv[i1+3])/ps*ps*sdd+(ipiv[i1+3])%ps+(jj+12)*ps);
+ }
+
+ // solve upper
+// i0 -= 8; // 4 ???
+ ll = jj+12;
+ for( ; ll<n-3; ll+=4)
+ {
+ kernel_dtrsm_nn_ll_one_12x4_lib4(i0, &pD[i0*sdd], sdd, &pD[ll*ps], sdd, &pD[ll*ps+i0*sdd], sdd, &pD[ll*ps+i0*sdd], sdd, &pD[i0*ps+i0*sdd], sdd);
+ }
+ if(ll<n)
+ {
+ kernel_dtrsm_nn_ll_one_12x4_vs_lib4(i0, &pD[i0*sdd], sdd, &pD[ll*ps], sdd, &pD[ll*ps+i0*sdd], sdd, &pD[ll*ps+i0*sdd], sdd, &pD[i0*ps+i0*sdd], sdd, 12, n-ll);
+ }
+ }
+ if(m>=n)
+ {
+ if(n-jj>0)
+ {
+ if(n-jj<=4)
+ goto left_n_4;
+ else if(n-jj<=8)
+ goto left_n_8;
+ else
+ goto left_n_12;
+ }
+ }
+ else // n>m
+ {
+ if(m-jj>0)
+ {
+ if(m-jj<=4)
+ goto left_m_4;
+ else if(m-jj<=8)
+ goto left_m_8;
+ else
+ goto left_m_12;
+ }
+ }
+#elif defined(TARGET_X64_INTEL_SANDY_BRIDGE)
+ // 8 columns at a time
+ jj = 0;
+ for(; jj<p-7; jj+=8)
+ {
+ // pivot & factorize & solve lower
+ // left block-column
+ ii = jj;
+ i0 = ii;
+#if defined(TARGET_X64_INTEL_HASWELL) // XXX
+ for( ; ii<m-11; ii+=12)
+ {
+ kernel_dgemm_nn_12x4_lib4(jj, &dm1, &pD[ii*sdd], sdd, 0, &pD[jj*ps], sdd, &d1, &pD[jj*ps+ii*sdd], sdd, &pD[jj*ps+ii*sdd], sdd);
+ }
+ if(m-ii>0)
+ {
+ if(m-ii>8)
+ {
+ kernel_dgemm_nn_12x4_vs_lib4(jj, &dm1, &pD[ii*sdd], sdd, &pD[jj*ps], sdd, &d1, &pD[jj*ps+ii*sdd], sdd, &pD[jj*ps+ii*sdd], sdd, m-ii, 4);
+ }
+ else if(m-ii>4)
+ {
+ kernel_dgemm_nn_8x4_gen_lib4(jj, &dm1, &pD[ii*sdd], sdd, 0, &pD[jj*ps], sdd, &d1, 0, &pD[jj*ps+ii*sdd], sdd, 0, &pD[jj*ps+ii*sdd], sdd, 0, m-ii, 0, 4);
+ }
+ else
+ {
+ kernel_dgemm_nn_4x4_gen_lib4(jj, &dm1, &pD[ii*sdd], 0, &pD[jj*ps], sdd, &d1, 0, &pD[jj*ps+ii*sdd], sdd, 0, &pD[jj*ps+ii*sdd], sdd, 0, m-ii, 0, 4);
+ }
+ }
+#else // SANDY_BRIDGE
+ for( ; ii<m-7; ii+=8)
+ {
+ kernel_dgemm_nn_8x4_lib4(jj, &dm1, &pD[ii*sdd], sdd, 0, &pD[jj*ps], sdd, &d1, &pD[jj*ps+ii*sdd], sdd, &pD[jj*ps+ii*sdd], sdd);
+ }
+ if(m-ii>0)
+ {
+ if(m-ii>4)
+ {
+ kernel_dgemm_nn_8x4_gen_lib4(jj, &dm1, &pD[ii*sdd], sdd, 0, &pD[jj*ps], sdd, &d1, 0, &pD[jj*ps+ii*sdd], sdd, 0, &pD[jj*ps+ii*sdd], sdd, 0, m-ii, 0, 4);
+ }
+ else
+ {
+ kernel_dgemm_nn_4x4_gen_lib4(jj, &dm1, &pD[ii*sdd], 0, &pD[jj*ps], sdd, &d1, 0, &pD[jj*ps+ii*sdd], sdd, 0, &pD[jj*ps+ii*sdd], sdd, 0, m-ii, 0, 4);
+ }
+ }
+#endif
+ kernel_dgetrf_pivot_4_lib4(m-i0, &pD[jj*ps+i0*sdd], sdd, &inv_diag_D[jj], &ipiv[i0]);
+ ipiv[i0+0] += i0;
+ if(ipiv[i0+0]!=i0+0)
+ {
+ drowsw_lib(jj, pD+(i0+0)/ps*ps*sdd+(i0+0)%ps, pD+(ipiv[i0+0])/ps*ps*sdd+(ipiv[i0+0])%ps);
+ drowsw_lib(n-jj-4, pD+(i0+0)/ps*ps*sdd+(i0+0)%ps+(jj+4)*ps, pD+(ipiv[i0+0])/ps*ps*sdd+(ipiv[i0+0])%ps+(jj+4)*ps);
+ }
+ ipiv[i0+1] += i0;
+ if(ipiv[i0+1]!=i0+1)
+ {
+ drowsw_lib(jj, pD+(i0+1)/ps*ps*sdd+(i0+1)%ps, pD+(ipiv[i0+1])/ps*ps*sdd+(ipiv[i0+1])%ps);
+ drowsw_lib(n-jj-4, pD+(i0+1)/ps*ps*sdd+(i0+1)%ps+(jj+4)*ps, pD+(ipiv[i0+1])/ps*ps*sdd+(ipiv[i0+1])%ps+(jj+4)*ps);
+ }
+ ipiv[i0+2] += i0;
+ if(ipiv[i0+2]!=i0+2)
+ {
+ drowsw_lib(jj, pD+(i0+2)/ps*ps*sdd+(i0+2)%ps, pD+(ipiv[i0+2])/ps*ps*sdd+(ipiv[i0+2])%ps);
+ drowsw_lib(n-jj-4, pD+(i0+2)/ps*ps*sdd+(i0+2)%ps+(jj+4)*ps, pD+(ipiv[i0+2])/ps*ps*sdd+(ipiv[i0+2])%ps+(jj+4)*ps);
+ }
+ ipiv[i0+3] += i0;
+ if(ipiv[i0+3]!=i0+3)
+ {
+ drowsw_lib(jj, pD+(i0+3)/ps*ps*sdd+(i0+3)%ps, pD+(ipiv[i0+3])/ps*ps*sdd+(ipiv[i0+3])%ps);
+ drowsw_lib(n-jj-4, pD+(i0+3)/ps*ps*sdd+(i0+3)%ps+(jj+4)*ps, pD+(ipiv[i0+3])/ps*ps*sdd+(ipiv[i0+3])%ps+(jj+4)*ps);
+ }
+ // right block-column
+ ii = i0;
+ kernel_dtrsm_nn_ll_one_4x4_lib4(ii, &pD[ii*sdd], &pD[(jj+4)*ps], sdd, &pD[(jj+4)*ps+ii*sdd], &pD[(jj+4)*ps+ii*sdd], &pD[ii*ps+ii*sdd]);
+ ii += 4;
+ i0 = ii;
+#if defined(TARGET_X64_INTEL_HASWELL) // XXX
+ for( ; ii<m-11; ii+=12)
+ {
+ kernel_dgemm_nn_12x4_lib4((jj+4), &dm1, &pD[ii*sdd], sdd, 0, &pD[(jj+4)*ps], sdd, &d1, &pD[(jj+4)*ps+ii*sdd], sdd, &pD[(jj+4)*ps+ii*sdd], sdd);
+ }
+ if(m-ii>0)
+ {
+ if(m-ii>8)
+ {
+ kernel_dgemm_nn_12x4_vs_lib4((jj+4), &dm1, &pD[ii*sdd], sdd, &pD[(jj+4)*ps], sdd, &d1, &pD[(jj+4)*ps+ii*sdd], sdd, &pD[(jj+4)*ps+ii*sdd], sdd, m-ii, 4);
+ }
+ else if(m-ii>4)
+ {
+ kernel_dgemm_nn_8x4_gen_lib4((jj+4), &dm1, &pD[ii*sdd], sdd, 0, &pD[(jj+4)*ps], sdd, &d1, 0, &pD[(jj+4)*ps+ii*sdd], sdd, 0, &pD[(jj+4)*ps+ii*sdd], sdd, 0, m-ii, 0, 4);
+ }
+ else
+ {
+ kernel_dgemm_nn_4x4_gen_lib4((jj+4), &dm1, &pD[ii*sdd], 0, &pD[(jj+4)*ps], sdd, &d1, 0, &pD[(jj+4)*ps+ii*sdd], sdd, 0, &pD[(jj+4)*ps+ii*sdd], sdd, 0, m-ii, 0, 4);
+ }
+ }
+#else // SANDY_BRIDGE
+ for( ; ii<m-7; ii+=8)
+ {
+ kernel_dgemm_nn_8x4_lib4((jj+4), &dm1, &pD[ii*sdd], sdd, 0, &pD[(jj+4)*ps], sdd, &d1, &pD[(jj+4)*ps+ii*sdd], sdd, &pD[(jj+4)*ps+ii*sdd], sdd);
+ }
+ if(m-ii>0)
+ {
+ if(m-ii>4)
+ {
+ kernel_dgemm_nn_8x4_gen_lib4((jj+4), &dm1, &pD[ii*sdd], sdd, 0, &pD[(jj+4)*ps], sdd, &d1, 0, &pD[(jj+4)*ps+ii*sdd], sdd, 0, &pD[(jj+4)*ps+ii*sdd], sdd, 0, m-ii, 0, 4);
+ }
+ else
+ {
+ kernel_dgemm_nn_4x4_gen_lib4((jj+4), &dm1, &pD[ii*sdd], 0, &pD[(jj+4)*ps], sdd, &d1, 0, &pD[(jj+4)*ps+ii*sdd], sdd, 0, &pD[(jj+4)*ps+ii*sdd], sdd, 0, m-ii, 0, 4);
+ }
+ }
+#endif
+ kernel_dgetrf_pivot_4_lib4(m-i0, &pD[(jj+4)*ps+i0*sdd], sdd, &inv_diag_D[(jj+4)], &ipiv[i0]);
+ ipiv[i0+0] += i0;
+ if(ipiv[i0+0]!=i0+0)
+ {
+ drowsw_lib(jj+4, pD+(i0+0)/ps*ps*sdd+(i0+0)%ps, pD+(ipiv[i0+0])/ps*ps*sdd+(ipiv[i0+0])%ps);
+ drowsw_lib(n-jj-8, pD+(i0+0)/ps*ps*sdd+(i0+0)%ps+(jj+8)*ps, pD+(ipiv[i0+0])/ps*ps*sdd+(ipiv[i0+0])%ps+(jj+8)*ps);
+ }
+ ipiv[i0+1] += i0;
+ if(ipiv[i0+1]!=i0+1)
+ {
+ drowsw_lib(jj+4, pD+(i0+1)/ps*ps*sdd+(i0+1)%ps, pD+(ipiv[i0+1])/ps*ps*sdd+(ipiv[i0+1])%ps);
+ drowsw_lib(n-jj-8, pD+(i0+1)/ps*ps*sdd+(i0+1)%ps+(jj+8)*ps, pD+(ipiv[i0+1])/ps*ps*sdd+(ipiv[i0+1])%ps+(jj+8)*ps);
+ }
+ ipiv[i0+2] += i0;
+ if(ipiv[i0+2]!=i0+2)
+ {
+ drowsw_lib(jj+4, pD+(i0+2)/ps*ps*sdd+(i0+2)%ps, pD+(ipiv[i0+2])/ps*ps*sdd+(ipiv[i0+2])%ps);
+ drowsw_lib(n-jj-8, pD+(i0+2)/ps*ps*sdd+(i0+2)%ps+(jj+8)*ps, pD+(ipiv[i0+2])/ps*ps*sdd+(ipiv[i0+2])%ps+(jj+8)*ps);
+ }
+ ipiv[i0+3] += i0;
+ if(ipiv[i0+3]!=i0+3)
+ {
+ drowsw_lib(jj+4, pD+(i0+3)/ps*ps*sdd+(i0+3)%ps, pD+(ipiv[i0+3])/ps*ps*sdd+(ipiv[i0+3])%ps);
+ drowsw_lib(n-jj-8, pD+(i0+3)/ps*ps*sdd+(i0+3)%ps+(jj+8)*ps, pD+(ipiv[i0+3])/ps*ps*sdd+(ipiv[i0+3])%ps+(jj+8)*ps);
+ }
+
+ // solve upper
+ i0 -= 4;
+ ll = jj+8;
+ for( ; ll<n-3; ll+=4)
+ {
+ kernel_dtrsm_nn_ll_one_8x4_lib4(i0, &pD[i0*sdd], sdd, &pD[ll*ps], sdd, &pD[ll*ps+i0*sdd], sdd, &pD[ll*ps+i0*sdd], sdd, &pD[i0*ps+i0*sdd], sdd);
+ }
+ if(ll<n)
+ {
+ kernel_dtrsm_nn_ll_one_8x4_vs_lib4(i0, &pD[i0*sdd], sdd, &pD[ll*ps], sdd, &pD[ll*ps+i0*sdd], sdd, &pD[ll*ps+i0*sdd], sdd, &pD[i0*ps+i0*sdd], sdd, 8, n-ll);
+ }
+ }
+ if(m>=n)
+ {
+ if(n-jj>0)
+ {
+			if(n-jj<=4) // (m>=1 && n==1) || (m>=2 && n==2) || (m>=3 && n==3)
+ {
+ goto left_n_4;
+ }
+ else // (m>=5 && n==5) || (m>=6 && n==6) || (m>=7 && n==7)
+ goto left_n_8;
+ }
+ }
+ else // n>m
+ {
+ if(m-jj>0)
+ {
+ if(m-jj<=4) // (m==1 && n>=2) || (m==2 && n>=3) || (m==3 && n>=4) || (m==4 && n>=5)
+ goto left_m_4;
+ else // (m==5 && n>=6) || (m==6 && n>=7) || (m==7 && n>=8)
+ {
+ goto left_m_8;
+ }
+ }
+ }
+#else
+ // 4 columns at a time
+ jj = 0;
+ for(; jj<p-3; jj+=4) // XXX
+ {
+ // pivot & factorize & solve lower
+ ii = jj;
+ i0 = ii;
+#if defined(TARGET_X64_INTEL_HASWELL) // XXX
+ for( ; ii<m-11; ii+=12)
+ {
+ kernel_dgemm_nn_12x4_lib4(jj, &dm1, &pD[ii*sdd], sdd, 0, &pD[jj*ps], sdd, &d1, &pD[jj*ps+ii*sdd], sdd, &pD[jj*ps+ii*sdd], sdd);
+ }
+ if(m-ii>0)
+ {
+ if(m-ii>8)
+ {
+ kernel_dgemm_nn_12x4_vs_lib4(jj, &dm1, &pD[ii*sdd], sdd, &pD[jj*ps], sdd, &d1, &pD[jj*ps+ii*sdd], sdd, &pD[jj*ps+ii*sdd], sdd, m-ii, 4);
+ }
+ else if(m-ii>4)
+ {
+ kernel_dgemm_nn_8x4_gen_lib4(jj, &dm1, &pD[ii*sdd], sdd, 0, &pD[jj*ps], sdd, &d1, 0, &pD[jj*ps+ii*sdd], sdd, 0, &pD[jj*ps+ii*sdd], sdd, 0, m-ii, 0, 4);
+ }
+ else
+ {
+ kernel_dgemm_nn_4x4_gen_lib4(jj, &dm1, &pD[ii*sdd], 0, &pD[jj*ps], sdd, &d1, 0, &pD[jj*ps+ii*sdd], sdd, 0, &pD[jj*ps+ii*sdd], sdd, 0, m-ii, 0, 4);
+ }
+ }
+#elif defined(TARGET_X64_INTEL_SANDY_BRIDGE) // XXX
+ for( ; ii<m-7; ii+=8)
+ {
+ kernel_dgemm_nn_8x4_lib4(jj, &dm1, &pD[ii*sdd], sdd, 0, &pD[jj*ps], sdd, &d1, &pD[jj*ps+ii*sdd], sdd, &pD[jj*ps+ii*sdd], sdd);
+ }
+ if(m-ii>0)
+ {
+ if(m-ii>4)
+ {
+ kernel_dgemm_nn_8x4_gen_lib4(jj, &dm1, &pD[ii*sdd], sdd, 0, &pD[jj*ps], sdd, &d1, 0, &pD[jj*ps+ii*sdd], sdd, 0, &pD[jj*ps+ii*sdd], sdd, 0, m-ii, 0, 4);
+ }
+ else
+ {
+ kernel_dgemm_nn_4x4_gen_lib4(jj, &dm1, &pD[ii*sdd], 0, &pD[jj*ps], sdd, &d1, 0, &pD[jj*ps+ii*sdd], sdd, 0, &pD[jj*ps+ii*sdd], sdd, 0, m-ii, 0, 4);
+ }
+ }
+#else
+ for( ; ii<m-3; ii+=4)
+ {
+ kernel_dgemm_nn_4x4_lib4(jj, &dm1, &pD[ii*sdd], 0, &pD[jj*ps], sdd, &d1, &pD[jj*ps+ii*sdd], &pD[jj*ps+ii*sdd]);
+ }
+ if(m-ii>0)
+ {
+ kernel_dgemm_nn_4x4_gen_lib4(jj, &dm1, &pD[ii*sdd], 0, &pD[jj*ps], sdd, &d1, 0, &pD[jj*ps+ii*sdd], sdd, 0, &pD[jj*ps+ii*sdd], sdd, 0, m-ii, 0, 4);
+ }
+#endif
+ kernel_dgetrf_pivot_4_lib4(m-i0, &pD[jj*ps+i0*sdd], sdd, &inv_diag_D[jj], &ipiv[i0]);
+ ipiv[i0+0] += i0;
+ if(ipiv[i0+0]!=i0+0)
+ {
+ drowsw_lib(jj, pD+(i0+0)/ps*ps*sdd+(i0+0)%ps, pD+(ipiv[i0+0])/ps*ps*sdd+(ipiv[i0+0])%ps);
+ drowsw_lib(n-jj-4, pD+(i0+0)/ps*ps*sdd+(i0+0)%ps+(jj+4)*ps, pD+(ipiv[i0+0])/ps*ps*sdd+(ipiv[i0+0])%ps+(jj+4)*ps);
+ }
+ ipiv[i0+1] += i0;
+ if(ipiv[i0+1]!=i0+1)
+ {
+ drowsw_lib(jj, pD+(i0+1)/ps*ps*sdd+(i0+1)%ps, pD+(ipiv[i0+1])/ps*ps*sdd+(ipiv[i0+1])%ps);
+ drowsw_lib(n-jj-4, pD+(i0+1)/ps*ps*sdd+(i0+1)%ps+(jj+4)*ps, pD+(ipiv[i0+1])/ps*ps*sdd+(ipiv[i0+1])%ps+(jj+4)*ps);
+ }
+ ipiv[i0+2] += i0;
+ if(ipiv[i0+2]!=i0+2)
+ {
+ drowsw_lib(jj, pD+(i0+2)/ps*ps*sdd+(i0+2)%ps, pD+(ipiv[i0+2])/ps*ps*sdd+(ipiv[i0+2])%ps);
+ drowsw_lib(n-jj-4, pD+(i0+2)/ps*ps*sdd+(i0+2)%ps+(jj+4)*ps, pD+(ipiv[i0+2])/ps*ps*sdd+(ipiv[i0+2])%ps+(jj+4)*ps);
+ }
+ ipiv[i0+3] += i0;
+ if(ipiv[i0+3]!=i0+3)
+ {
+ drowsw_lib(jj, pD+(i0+3)/ps*ps*sdd+(i0+3)%ps, pD+(ipiv[i0+3])/ps*ps*sdd+(ipiv[i0+3])%ps);
+ drowsw_lib(n-jj-4, pD+(i0+3)/ps*ps*sdd+(i0+3)%ps+(jj+4)*ps, pD+(ipiv[i0+3])/ps*ps*sdd+(ipiv[i0+3])%ps+(jj+4)*ps);
+ }
+
+ // solve upper
+ ll = jj+4;
+ for( ; ll<n-3; ll+=4)
+ {
+ kernel_dtrsm_nn_ll_one_4x4_lib4(i0, &pD[i0*sdd], &pD[ll*ps], sdd, &pD[ll*ps+i0*sdd], &pD[ll*ps+i0*sdd], &pD[i0*ps+i0*sdd]);
+ }
+ if(n-ll>0)
+ {
+ kernel_dtrsm_nn_ll_one_4x4_vs_lib4(i0, &pD[i0*sdd], &pD[ll*ps], sdd, &pD[ll*ps+i0*sdd], &pD[ll*ps+i0*sdd], &pD[i0*ps+i0*sdd], 4, n-ll);
+ }
+ }
+ if(m>=n)
+ {
+ if(n-jj>0)
+ {
+ goto left_n_4;
+ }
+ }
+ else
+ {
+ if(m-jj>0)
+ {
+ goto left_m_4;
+ }
+ }
+#endif
+
+ // common return if jj==n
+ return;
+
+
+ // clean up
+#if defined(TARGET_X64_INTEL_HASWELL)
+ left_n_12:
+ // 9-12 columns at a time
+ // pivot & factorize & solve lower
+ // left block-column
+ ii = jj;
+ i0 = ii;
+ for( ; ii<m-8; ii+=12)
+ {
+ kernel_dgemm_nn_12x4_vs_lib4(jj, &dm1, &pD[ii*sdd], sdd, &pD[jj*ps], sdd, &d1, &pD[jj*ps+ii*sdd], sdd, &pD[jj*ps+ii*sdd], sdd, m-ii, 4);
+ }
+ if(m-ii>4)
+ {
+ kernel_dgemm_nn_8x4_gen_lib4(jj, &dm1, &pD[ii*sdd], sdd, 0, &pD[jj*ps], sdd, &d1, 0, &pD[jj*ps+ii*sdd], sdd, 0, &pD[jj*ps+ii*sdd], sdd, 0, m-ii, 0, 4);
+// ii+=8;
+ }
+ else if(m-ii>0)
+ {
+ kernel_dgemm_nn_4x4_gen_lib4(jj, &dm1, &pD[ii*sdd], 0, &pD[jj*ps], sdd, &d1, 0, &pD[jj*ps+ii*sdd], sdd, 0, &pD[jj*ps+ii*sdd], sdd, 0, m-ii, 0, 4);
+// ii+=4;
+ }
+ kernel_dgetrf_pivot_4_lib4(m-i0, &pD[jj*ps+i0*sdd], sdd, &inv_diag_D[jj], &ipiv[i0]);
+ ipiv[i0+0] += i0;
+ if(ipiv[i0+0]!=i0+0)
+ {
+ drowsw_lib(jj, pD+(i0+0)/ps*ps*sdd+(i0+0)%ps, pD+(ipiv[i0+0])/ps*ps*sdd+(ipiv[i0+0])%ps);
+ drowsw_lib(n-jj-4, pD+(i0+0)/ps*ps*sdd+(i0+0)%ps+(jj+4)*ps, pD+(ipiv[i0+0])/ps*ps*sdd+(ipiv[i0+0])%ps+(jj+4)*ps);
+ }
+ ipiv[i0+1] += i0;
+ if(ipiv[i0+1]!=i0+1)
+ {
+ drowsw_lib(jj, pD+(i0+1)/ps*ps*sdd+(i0+1)%ps, pD+(ipiv[i0+1])/ps*ps*sdd+(ipiv[i0+1])%ps);
+ drowsw_lib(n-jj-4, pD+(i0+1)/ps*ps*sdd+(i0+1)%ps+(jj+4)*ps, pD+(ipiv[i0+1])/ps*ps*sdd+(ipiv[i0+1])%ps+(jj+4)*ps);
+ }
+ ipiv[i0+2] += i0;
+ if(ipiv[i0+2]!=i0+2)
+ {
+ drowsw_lib(jj, pD+(i0+2)/ps*ps*sdd+(i0+2)%ps, pD+(ipiv[i0+2])/ps*ps*sdd+(ipiv[i0+2])%ps);
+ drowsw_lib(n-jj-4, pD+(i0+2)/ps*ps*sdd+(i0+2)%ps+(jj+4)*ps, pD+(ipiv[i0+2])/ps*ps*sdd+(ipiv[i0+2])%ps+(jj+4)*ps);
+ }
+ ipiv[i0+3] += i0;
+ if(ipiv[i0+3]!=i0+3)
+ {
+ drowsw_lib(jj, pD+(i0+3)/ps*ps*sdd+(i0+3)%ps, pD+(ipiv[i0+3])/ps*ps*sdd+(ipiv[i0+3])%ps);
+ drowsw_lib(n-jj-4, pD+(i0+3)/ps*ps*sdd+(i0+3)%ps+(jj+4)*ps, pD+(ipiv[i0+3])/ps*ps*sdd+(ipiv[i0+3])%ps+(jj+4)*ps);
+ }
+ // middle block-column
+ ii = i0;
+ kernel_dtrsm_nn_ll_one_4x4_vs_lib4(ii, &pD[ii*sdd], &pD[(jj+4)*ps], sdd, &pD[(jj+4)*ps+ii*sdd], &pD[(jj+4)*ps+ii*sdd], &pD[ii*ps+ii*sdd], 4, n-jj-4);
+ ii += 4;
+ i1 = ii;
+ for( ; ii<m-8; ii+=12)
+ {
+ kernel_dgemm_nn_12x4_vs_lib4((jj+4), &dm1, &pD[ii*sdd], sdd, &pD[(jj+4)*ps], sdd, &d1, &pD[(jj+4)*ps+ii*sdd], sdd, &pD[(jj+4)*ps+ii*sdd], sdd, m-ii, n-jj-4);
+ }
+ if(m-ii>4)
+ {
+ kernel_dgemm_nn_8x4_gen_lib4((jj+4), &dm1, &pD[ii*sdd], sdd, 0, &pD[(jj+4)*ps], sdd, &d1, 0, &pD[(jj+4)*ps+ii*sdd], sdd, 0, &pD[(jj+4)*ps+ii*sdd], sdd, 0, m-ii, 0, n-jj-4);
+ }
+ else if(m-ii>0)
+ {
+ kernel_dgemm_nn_4x4_gen_lib4((jj+4), &dm1, &pD[ii*sdd], 0, &pD[(jj+4)*ps], sdd, &d1, 0, &pD[(jj+4)*ps+ii*sdd], sdd, 0, &pD[(jj+4)*ps+ii*sdd], sdd, 0, m-ii, 0, n-jj-4);
+ }
+ kernel_dgetrf_pivot_4_vs_lib4(m-i1, n-jj-4, &pD[(jj+4)*ps+i1*sdd], sdd, &inv_diag_D[(jj+4)], &ipiv[i1]);
+ ipiv[i1+0] += i1;
+ if(ipiv[i1+0]!=i1+0)
+ {
+ drowsw_lib(jj+4, pD+(i1+0)/ps*ps*sdd+(i1+0)%ps, pD+(ipiv[i1+0])/ps*ps*sdd+(ipiv[i1+0])%ps);
+ drowsw_lib(n-jj-8, pD+(i1+0)/ps*ps*sdd+(i1+0)%ps+(jj+8)*ps, pD+(ipiv[i1+0])/ps*ps*sdd+(ipiv[i1+0])%ps+(jj+8)*ps);
+ }
+ if(n-jj-4>1)
+ {
+ ipiv[i1+1] += i1;
+ if(ipiv[i1+1]!=i1+1)
+ {
+ drowsw_lib(jj+4, pD+(i1+1)/ps*ps*sdd+(i1+1)%ps, pD+(ipiv[i1+1])/ps*ps*sdd+(ipiv[i1+1])%ps);
+ drowsw_lib(n-jj-8, pD+(i1+1)/ps*ps*sdd+(i1+1)%ps+(jj+8)*ps, pD+(ipiv[i1+1])/ps*ps*sdd+(ipiv[i1+1])%ps+(jj+8)*ps);
+ }
+ if(n-jj-4>2)
+ {
+ ipiv[i1+2] += i1;
+ if(ipiv[i1+2]!=i1+2)
+ {
+ drowsw_lib(jj+4, pD+(i1+2)/ps*ps*sdd+(i1+2)%ps, pD+(ipiv[i1+2])/ps*ps*sdd+(ipiv[i1+2])%ps);
+ drowsw_lib(n-jj-8, pD+(i1+2)/ps*ps*sdd+(i1+2)%ps+(jj+8)*ps, pD+(ipiv[i1+2])/ps*ps*sdd+(ipiv[i1+2])%ps+(jj+8)*ps);
+ }
+ if(n-jj-4>3)
+ {
+ ipiv[i1+3] += i1;
+ if(ipiv[i1+3]!=i1+3)
+ {
+ drowsw_lib(jj+4, pD+(i1+3)/ps*ps*sdd+(i1+3)%ps, pD+(ipiv[i1+3])/ps*ps*sdd+(ipiv[i1+3])%ps);
+ drowsw_lib(n-jj-8, pD+(i1+3)/ps*ps*sdd+(i1+3)%ps+(jj+8)*ps, pD+(ipiv[i1+3])/ps*ps*sdd+(ipiv[i1+3])%ps+(jj+8)*ps);
+ }
+ }
+ }
+ }
+ // right block-column
+ ii = i0;
+ kernel_dtrsm_nn_ll_one_8x4_vs_lib4(ii, &pD[ii*sdd], sdd, &pD[(jj+8)*ps], sdd, &pD[(jj+8)*ps+ii*sdd], sdd, &pD[(jj+8)*ps+ii*sdd], sdd, &pD[ii*ps+ii*sdd], sdd, 8, n-jj-8);
+ ii += 8;
+ i1 = ii;
+ for( ; ii<m-8; ii+=12)
+ {
+ kernel_dgemm_nn_12x4_vs_lib4((jj+8), &dm1, &pD[ii*sdd], sdd, &pD[(jj+8)*ps], sdd, &d1, &pD[(jj+8)*ps+ii*sdd], sdd, &pD[(jj+8)*ps+ii*sdd], sdd, m-ii, n-jj-8);
+ }
+ if(m-ii>4)
+ {
+ kernel_dgemm_nn_8x4_gen_lib4((jj+8), &dm1, &pD[ii*sdd], sdd, 0, &pD[(jj+8)*ps], sdd, &d1, 0, &pD[(jj+8)*ps+ii*sdd], sdd, 0, &pD[(jj+8)*ps+ii*sdd], sdd, 0, m-ii, 0, n-jj-8);
+ }
+ else if(m-ii>0)
+ {
+ kernel_dgemm_nn_4x4_gen_lib4((jj+8), &dm1, &pD[ii*sdd], 0, &pD[(jj+8)*ps], sdd, &d1, 0, &pD[(jj+8)*ps+ii*sdd], sdd, 0, &pD[(jj+8)*ps+ii*sdd], sdd, 0, m-ii, 0, n-jj-8);
+ }
+ kernel_dgetrf_pivot_4_vs_lib4(m-i1, n-jj-8, &pD[(jj+8)*ps+i1*sdd], sdd, &inv_diag_D[(jj+8)], &ipiv[i1]);
+ ipiv[i1+0] += i1;
+ if(ipiv[i1+0]!=i1+0)
+ {
+ drowsw_lib(jj+8, pD+(i1+0)/ps*ps*sdd+(i1+0)%ps, pD+(ipiv[i1+0])/ps*ps*sdd+(ipiv[i1+0])%ps);
+ drowsw_lib(n-jj-12, pD+(i1+0)/ps*ps*sdd+(i1+0)%ps+(jj+12)*ps, pD+(ipiv[i1+0])/ps*ps*sdd+(ipiv[i1+0])%ps+(jj+12)*ps);
+ }
+ if(n-jj-8>1)
+ {
+ ipiv[i1+1] += i1;
+ if(ipiv[i1+1]!=i1+1)
+ {
+ drowsw_lib(jj+8, pD+(i1+1)/ps*ps*sdd+(i1+1)%ps, pD+(ipiv[i1+1])/ps*ps*sdd+(ipiv[i1+1])%ps);
+ drowsw_lib(n-jj-12, pD+(i1+1)/ps*ps*sdd+(i1+1)%ps+(jj+12)*ps, pD+(ipiv[i1+1])/ps*ps*sdd+(ipiv[i1+1])%ps+(jj+12)*ps);
+ }
+ if(n-jj-8>2)
+ {
+ ipiv[i1+2] += i1;
+ if(ipiv[i1+2]!=i1+2)
+ {
+ drowsw_lib(jj+8, pD+(i1+2)/ps*ps*sdd+(i1+2)%ps, pD+(ipiv[i1+2])/ps*ps*sdd+(ipiv[i1+2])%ps);
+ drowsw_lib(n-jj-12, pD+(i1+2)/ps*ps*sdd+(i1+2)%ps+(jj+12)*ps, pD+(ipiv[i1+2])/ps*ps*sdd+(ipiv[i1+2])%ps+(jj+12)*ps);
+ }
+ if(n-jj-8>3)
+ {
+ ipiv[i1+3] += i1;
+ if(ipiv[i1+3]!=i1+3)
+ {
+ drowsw_lib(jj+8, pD+(i1+3)/ps*ps*sdd+(i1+3)%ps, pD+(ipiv[i1+3])/ps*ps*sdd+(ipiv[i1+3])%ps);
+ drowsw_lib(n-jj-12, pD+(i1+3)/ps*ps*sdd+(i1+3)%ps+(jj+12)*ps, pD+(ipiv[i1+3])/ps*ps*sdd+(ipiv[i1+3])%ps+(jj+12)*ps);
+ }
+ }
+ }
+ }
+
+ // solve upper
+ // there is no upper
+ return;
+#endif
+
+
+#if defined(TARGET_X64_INTEL_HASWELL)
+ left_m_12:
+ // 9-12 rows at a time
+ // pivot & factorize & solve lower
+ // left block-column
+ ii = jj;
+ i0 = ii;
+ kernel_dgemm_nn_12x4_vs_lib4(jj, &dm1, &pD[ii*sdd], sdd, &pD[jj*ps], sdd, &d1, &pD[jj*ps+ii*sdd], sdd, &pD[jj*ps+ii*sdd], sdd, m-ii, 4);
+ kernel_dgetrf_pivot_4_lib4(m-i0, &pD[jj*ps+i0*sdd], sdd, &inv_diag_D[jj], &ipiv[i0]);
+ ipiv[i0+0] += i0;
+ if(ipiv[i0+0]!=i0+0)
+ {
+ drowsw_lib(jj, pD+(i0+0)/ps*ps*sdd+(i0+0)%ps, pD+(ipiv[i0+0])/ps*ps*sdd+(ipiv[i0+0])%ps);
+ drowsw_lib(n-jj-4, pD+(i0+0)/ps*ps*sdd+(i0+0)%ps+(jj+4)*ps, pD+(ipiv[i0+0])/ps*ps*sdd+(ipiv[i0+0])%ps+(jj+4)*ps);
+ }
+ ipiv[i0+1] += i0;
+ if(ipiv[i0+1]!=i0+1)
+ {
+ drowsw_lib(jj, pD+(i0+1)/ps*ps*sdd+(i0+1)%ps, pD+(ipiv[i0+1])/ps*ps*sdd+(ipiv[i0+1])%ps);
+ drowsw_lib(n-jj-4, pD+(i0+1)/ps*ps*sdd+(i0+1)%ps+(jj+4)*ps, pD+(ipiv[i0+1])/ps*ps*sdd+(ipiv[i0+1])%ps+(jj+4)*ps);
+ }
+ ipiv[i0+2] += i0;
+ if(ipiv[i0+2]!=i0+2)
+ {
+ drowsw_lib(jj, pD+(i0+2)/ps*ps*sdd+(i0+2)%ps, pD+(ipiv[i0+2])/ps*ps*sdd+(ipiv[i0+2])%ps);
+ drowsw_lib(n-jj-4, pD+(i0+2)/ps*ps*sdd+(i0+2)%ps+(jj+4)*ps, pD+(ipiv[i0+2])/ps*ps*sdd+(ipiv[i0+2])%ps+(jj+4)*ps);
+ }
+ ipiv[i0+3] += i0;
+ if(ipiv[i0+3]!=i0+3)
+ {
+ drowsw_lib(jj, pD+(i0+3)/ps*ps*sdd+(i0+3)%ps, pD+(ipiv[i0+3])/ps*ps*sdd+(ipiv[i0+3])%ps);
+ drowsw_lib(n-jj-4, pD+(i0+3)/ps*ps*sdd+(i0+3)%ps+(jj+4)*ps, pD+(ipiv[i0+3])/ps*ps*sdd+(ipiv[i0+3])%ps+(jj+4)*ps);
+ }
+ // middle block-column
+ ii = i0;
+ kernel_dtrsm_nn_ll_one_4x4_vs_lib4(ii, &pD[ii*sdd], &pD[(jj+4)*ps], sdd, &pD[(jj+4)*ps+ii*sdd], &pD[(jj+4)*ps+ii*sdd], &pD[ii*ps+ii*sdd], 4, n-jj-4);
+ ii += 4;
+ i1 = ii;
+ kernel_dgemm_nn_8x4_gen_lib4((jj+4), &dm1, &pD[ii*sdd], sdd, 0, &pD[(jj+4)*ps], sdd, &d1, 0, &pD[(jj+4)*ps+ii*sdd], sdd, 0, &pD[(jj+4)*ps+ii*sdd], sdd, 0, m-ii, 0, n-jj-4);
+ kernel_dgetrf_pivot_4_vs_lib4(m-i1, n-jj-4, &pD[(jj+4)*ps+i1*sdd], sdd, &inv_diag_D[(jj+4)], &ipiv[i1]);
+ ipiv[i1+0] += i1;
+ if(ipiv[i1+0]!=i1+0)
+ {
+ drowsw_lib(jj+4, pD+(i1+0)/ps*ps*sdd+(i1+0)%ps, pD+(ipiv[i1+0])/ps*ps*sdd+(ipiv[i1+0])%ps);
+ drowsw_lib(n-jj-8, pD+(i1+0)/ps*ps*sdd+(i1+0)%ps+(jj+8)*ps, pD+(ipiv[i1+0])/ps*ps*sdd+(ipiv[i1+0])%ps+(jj+8)*ps);
+ }
+ if(m-jj-4>1)
+ {
+ ipiv[i1+1] += i1;
+ if(ipiv[i1+1]!=i1+1)
+ {
+ drowsw_lib(jj+4, pD+(i1+1)/ps*ps*sdd+(i1+1)%ps, pD+(ipiv[i1+1])/ps*ps*sdd+(ipiv[i1+1])%ps);
+ drowsw_lib(n-jj-8, pD+(i1+1)/ps*ps*sdd+(i1+1)%ps+(jj+8)*ps, pD+(ipiv[i1+1])/ps*ps*sdd+(ipiv[i1+1])%ps+(jj+8)*ps);
+ }
+ if(m-jj-4>2)
+ {
+ ipiv[i1+2] += i1;
+ if(ipiv[i1+2]!=i1+2)
+ {
+ drowsw_lib(jj+4, pD+(i1+2)/ps*ps*sdd+(i1+2)%ps, pD+(ipiv[i1+2])/ps*ps*sdd+(ipiv[i1+2])%ps);
+ drowsw_lib(n-jj-8, pD+(i1+2)/ps*ps*sdd+(i1+2)%ps+(jj+8)*ps, pD+(ipiv[i1+2])/ps*ps*sdd+(ipiv[i1+2])%ps+(jj+8)*ps);
+ }
+ if(m-jj-4>3)
+ {
+ ipiv[i1+3] += i1;
+ if(ipiv[i1+3]!=i1+3)
+ {
+ drowsw_lib(jj+4, pD+(i1+3)/ps*ps*sdd+(i1+3)%ps, pD+(ipiv[i1+3])/ps*ps*sdd+(ipiv[i1+3])%ps);
+ drowsw_lib(n-jj-8, pD+(i1+3)/ps*ps*sdd+(i1+3)%ps+(jj+8)*ps, pD+(ipiv[i1+3])/ps*ps*sdd+(ipiv[i1+3])%ps+(jj+8)*ps);
+ }
+ }
+ }
+ }
+ // right block-column
+ ii = i0;
+ kernel_dtrsm_nn_ll_one_8x4_vs_lib4(ii, &pD[ii*sdd], sdd, &pD[(jj+8)*ps], sdd, &pD[(jj+8)*ps+ii*sdd], sdd, &pD[(jj+8)*ps+ii*sdd], sdd, &pD[ii*ps+ii*sdd], sdd, 8, n-jj-8);
+ ii += 8;
+ i1 = ii;
+ kernel_dgemm_nn_4x4_gen_lib4((jj+8), &dm1, &pD[ii*sdd], 0, &pD[(jj+8)*ps], sdd, &d1, 0, &pD[(jj+8)*ps+ii*sdd], sdd, 0, &pD[(jj+8)*ps+ii*sdd], sdd, 0, m-ii, 0, n-jj-8);
+ kernel_dgetrf_pivot_4_vs_lib4(m-i1, n-jj-8, &pD[(jj+8)*ps+i1*sdd], sdd, &inv_diag_D[(jj+8)], &ipiv[i1]);
+ ipiv[i1+0] += i1;
+ if(ipiv[i1+0]!=i1+0)
+ {
+ drowsw_lib(jj+8, pD+(i1+0)/ps*ps*sdd+(i1+0)%ps, pD+(ipiv[i1+0])/ps*ps*sdd+(ipiv[i1+0])%ps);
+ drowsw_lib(n-jj-12, pD+(i1+0)/ps*ps*sdd+(i1+0)%ps+(jj+12)*ps, pD+(ipiv[i1+0])/ps*ps*sdd+(ipiv[i1+0])%ps+(jj+12)*ps);
+ }
+ if(m-jj-8>1)
+ {
+ ipiv[i1+1] += i1;
+ if(ipiv[i1+1]!=i1+1)
+ {
+ drowsw_lib(jj+8, pD+(i1+1)/ps*ps*sdd+(i1+1)%ps, pD+(ipiv[i1+1])/ps*ps*sdd+(ipiv[i1+1])%ps);
+ drowsw_lib(n-jj-12, pD+(i1+1)/ps*ps*sdd+(i1+1)%ps+(jj+12)*ps, pD+(ipiv[i1+1])/ps*ps*sdd+(ipiv[i1+1])%ps+(jj+12)*ps);
+ }
+ if(m-jj-8>2)
+ {
+ ipiv[i1+2] += i1;
+ if(ipiv[i1+2]!=i1+2)
+ {
+ drowsw_lib(jj+8, pD+(i1+2)/ps*ps*sdd+(i1+2)%ps, pD+(ipiv[i1+2])/ps*ps*sdd+(ipiv[i1+2])%ps);
+ drowsw_lib(n-jj-12, pD+(i1+2)/ps*ps*sdd+(i1+2)%ps+(jj+12)*ps, pD+(ipiv[i1+2])/ps*ps*sdd+(ipiv[i1+2])%ps+(jj+12)*ps);
+ }
+ if(m-jj-8>3)
+ {
+ ipiv[i1+3] += i1;
+ if(ipiv[i1+3]!=i1+3)
+ {
+ drowsw_lib(jj+8, pD+(i1+3)/ps*ps*sdd+(i1+3)%ps, pD+(ipiv[i1+3])/ps*ps*sdd+(ipiv[i1+3])%ps);
+ drowsw_lib(n-jj-12, pD+(i1+3)/ps*ps*sdd+(i1+3)%ps+(jj+12)*ps, pD+(ipiv[i1+3])/ps*ps*sdd+(ipiv[i1+3])%ps+(jj+12)*ps);
+ }
+ }
+ }
+ }
+
+ // solve upper
+// i0 -= 8;
+ ll = jj+12;
+ for( ; ll<n; ll+=4)
+ {
+ kernel_dtrsm_nn_ll_one_12x4_vs_lib4(i0, &pD[i0*sdd], sdd, &pD[ll*ps], sdd, &pD[ll*ps+i0*sdd], sdd, &pD[ll*ps+i0*sdd], sdd, &pD[i0*ps+i0*sdd], sdd, m-i0, n-ll);
+ }
+ return;
+#endif
+
+
+#if defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE)
+ left_n_8:
+ // 5-8 columns at a time
+ // pivot & factorize & solve lower
+ // left block-column
+ ii = jj;
+ i0 = ii;
+ for( ; ii<m-4; ii+=8)
+ {
+ kernel_dgemm_nn_8x4_gen_lib4(jj, &dm1, &pD[ii*sdd], sdd, 0, &pD[jj*ps], sdd, &d1, 0, &pD[jj*ps+ii*sdd], sdd, 0, &pD[jj*ps+ii*sdd], sdd, 0, m-ii, 0, 4);
+ }
+ if(m-ii>0)
+ {
+ kernel_dgemm_nn_4x4_gen_lib4(jj, &dm1, &pD[ii*sdd], 0, &pD[jj*ps], sdd, &d1, 0, &pD[jj*ps+ii*sdd], sdd, 0, &pD[jj*ps+ii*sdd], sdd, 0, m-ii, 0, 4);
+// ii+=4;
+ }
+ kernel_dgetrf_pivot_4_lib4(m-i0, &pD[jj*ps+i0*sdd], sdd, &inv_diag_D[jj], &ipiv[i0]);
+ ipiv[i0+0] += i0;
+ if(ipiv[i0+0]!=i0+0)
+ {
+ drowsw_lib(jj, pD+(i0+0)/ps*ps*sdd+(i0+0)%ps, pD+(ipiv[i0+0])/ps*ps*sdd+(ipiv[i0+0])%ps);
+ drowsw_lib(n-jj-4, pD+(i0+0)/ps*ps*sdd+(i0+0)%ps+(jj+4)*ps, pD+(ipiv[i0+0])/ps*ps*sdd+(ipiv[i0+0])%ps+(jj+4)*ps);
+ }
+ ipiv[i0+1] += i0;
+ if(ipiv[i0+1]!=i0+1)
+ {
+ drowsw_lib(jj, pD+(i0+1)/ps*ps*sdd+(i0+1)%ps, pD+(ipiv[i0+1])/ps*ps*sdd+(ipiv[i0+1])%ps);
+ drowsw_lib(n-jj-4, pD+(i0+1)/ps*ps*sdd+(i0+1)%ps+(jj+4)*ps, pD+(ipiv[i0+1])/ps*ps*sdd+(ipiv[i0+1])%ps+(jj+4)*ps);
+ }
+ ipiv[i0+2] += i0;
+ if(ipiv[i0+2]!=i0+2)
+ {
+ drowsw_lib(jj, pD+(i0+2)/ps*ps*sdd+(i0+2)%ps, pD+(ipiv[i0+2])/ps*ps*sdd+(ipiv[i0+2])%ps);
+ drowsw_lib(n-jj-4, pD+(i0+2)/ps*ps*sdd+(i0+2)%ps+(jj+4)*ps, pD+(ipiv[i0+2])/ps*ps*sdd+(ipiv[i0+2])%ps+(jj+4)*ps);
+ }
+ ipiv[i0+3] += i0;
+ if(ipiv[i0+3]!=i0+3)
+ {
+ drowsw_lib(jj, pD+(i0+3)/ps*ps*sdd+(i0+3)%ps, pD+(ipiv[i0+3])/ps*ps*sdd+(ipiv[i0+3])%ps);
+ drowsw_lib(n-jj-4, pD+(i0+3)/ps*ps*sdd+(i0+3)%ps+(jj+4)*ps, pD+(ipiv[i0+3])/ps*ps*sdd+(ipiv[i0+3])%ps+(jj+4)*ps);
+ }
+ // right block-column
+ ii = i0;
+ kernel_dtrsm_nn_ll_one_4x4_vs_lib4(ii, &pD[ii*sdd], &pD[(jj+4)*ps], sdd, &pD[(jj+4)*ps+ii*sdd], &pD[(jj+4)*ps+ii*sdd], &pD[ii*ps+ii*sdd], 4, n-jj-4);
+ ii += 4;
+ i0 = ii;
+ for( ; ii<m-4; ii+=8)
+ {
+ kernel_dgemm_nn_8x4_gen_lib4((jj+4), &dm1, &pD[ii*sdd], sdd, 0, &pD[(jj+4)*ps], sdd, &d1, 0, &pD[(jj+4)*ps+ii*sdd], sdd, 0, &pD[(jj+4)*ps+ii*sdd], sdd, 0, m-ii, 0, n-jj-4);
+ }
+ if(m-ii>0)
+ {
+ kernel_dgemm_nn_4x4_gen_lib4((jj+4), &dm1, &pD[ii*sdd], 0, &pD[(jj+4)*ps], sdd, &d1, 0, &pD[(jj+4)*ps+ii*sdd], sdd, 0, &pD[(jj+4)*ps+ii*sdd], sdd, 0, m-ii, 0, n-jj-4);
+ }
+ kernel_dgetrf_pivot_4_vs_lib4(m-i0, n-jj-4, &pD[(jj+4)*ps+i0*sdd], sdd, &inv_diag_D[(jj+4)], &ipiv[i0]);
+ ipiv[i0+0] += i0;
+ if(ipiv[i0+0]!=i0+0)
+ {
+ drowsw_lib(jj+4, pD+(i0+0)/ps*ps*sdd+(i0+0)%ps, pD+(ipiv[i0+0])/ps*ps*sdd+(ipiv[i0+0])%ps);
+ drowsw_lib(n-jj-8, pD+(i0+0)/ps*ps*sdd+(i0+0)%ps+(jj+8)*ps, pD+(ipiv[i0+0])/ps*ps*sdd+(ipiv[i0+0])%ps+(jj+8)*ps);
+ }
+ if(n-jj-4>1)
+ {
+ ipiv[i0+1] += i0;
+ if(ipiv[i0+1]!=i0+1)
+ {
+ drowsw_lib(jj+4, pD+(i0+1)/ps*ps*sdd+(i0+1)%ps, pD+(ipiv[i0+1])/ps*ps*sdd+(ipiv[i0+1])%ps);
+ drowsw_lib(n-jj-8, pD+(i0+1)/ps*ps*sdd+(i0+1)%ps+(jj+8)*ps, pD+(ipiv[i0+1])/ps*ps*sdd+(ipiv[i0+1])%ps+(jj+8)*ps);
+ }
+ if(n-jj-4>2)
+ {
+ ipiv[i0+2] += i0;
+ if(ipiv[i0+2]!=i0+2)
+ {
+ drowsw_lib(jj+4, pD+(i0+2)/ps*ps*sdd+(i0+2)%ps, pD+(ipiv[i0+2])/ps*ps*sdd+(ipiv[i0+2])%ps);
+ drowsw_lib(n-jj-8, pD+(i0+2)/ps*ps*sdd+(i0+2)%ps+(jj+8)*ps, pD+(ipiv[i0+2])/ps*ps*sdd+(ipiv[i0+2])%ps+(jj+8)*ps);
+ }
+ if(n-jj-4>3)
+ {
+ ipiv[i0+3] += i0;
+ if(ipiv[i0+3]!=i0+3)
+ {
+ drowsw_lib(jj+4, pD+(i0+3)/ps*ps*sdd+(i0+3)%ps, pD+(ipiv[i0+3])/ps*ps*sdd+(ipiv[i0+3])%ps);
+ drowsw_lib(n-jj-8, pD+(i0+3)/ps*ps*sdd+(i0+3)%ps+(jj+8)*ps, pD+(ipiv[i0+3])/ps*ps*sdd+(ipiv[i0+3])%ps+(jj+8)*ps);
+ }
+ }
+ }
+ }
+
+ // solve upper
+ // there is no upper
+ return;
+#endif
+
+
+#if defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE)
+ left_m_8:
+ // 5-8 rows at a time
+ // pivot & factorize & solve lower
+ // left block-column
+ ii = jj;
+ i0 = ii;
+ kernel_dgemm_nn_8x4_gen_lib4(jj, &dm1, &pD[ii*sdd], sdd, 0, &pD[jj*ps], sdd, &d1, 0, &pD[jj*ps+ii*sdd], sdd, 0, &pD[jj*ps+ii*sdd], sdd, 0, m-ii, 0, 4);
+ kernel_dgetrf_pivot_4_lib4(m-i0, &pD[jj*ps+i0*sdd], sdd, &inv_diag_D[jj], &ipiv[i0]);
+ ipiv[i0+0] += i0;
+ if(ipiv[i0+0]!=i0+0)
+ {
+ drowsw_lib(jj, pD+(i0+0)/ps*ps*sdd+(i0+0)%ps, pD+(ipiv[i0+0])/ps*ps*sdd+(ipiv[i0+0])%ps);
+ drowsw_lib(n-jj-4, pD+(i0+0)/ps*ps*sdd+(i0+0)%ps+(jj+4)*ps, pD+(ipiv[i0+0])/ps*ps*sdd+(ipiv[i0+0])%ps+(jj+4)*ps);
+ }
+ ipiv[i0+1] += i0;
+ if(ipiv[i0+1]!=i0+1)
+ {
+ drowsw_lib(jj, pD+(i0+1)/ps*ps*sdd+(i0+1)%ps, pD+(ipiv[i0+1])/ps*ps*sdd+(ipiv[i0+1])%ps);
+ drowsw_lib(n-jj-4, pD+(i0+1)/ps*ps*sdd+(i0+1)%ps+(jj+4)*ps, pD+(ipiv[i0+1])/ps*ps*sdd+(ipiv[i0+1])%ps+(jj+4)*ps);
+ }
+ ipiv[i0+2] += i0;
+ if(ipiv[i0+2]!=i0+2)
+ {
+ drowsw_lib(jj, pD+(i0+2)/ps*ps*sdd+(i0+2)%ps, pD+(ipiv[i0+2])/ps*ps*sdd+(ipiv[i0+2])%ps);
+ drowsw_lib(n-jj-4, pD+(i0+2)/ps*ps*sdd+(i0+2)%ps+(jj+4)*ps, pD+(ipiv[i0+2])/ps*ps*sdd+(ipiv[i0+2])%ps+(jj+4)*ps);
+ }
+ ipiv[i0+3] += i0;
+ if(ipiv[i0+3]!=i0+3)
+ {
+ drowsw_lib(jj, pD+(i0+3)/ps*ps*sdd+(i0+3)%ps, pD+(ipiv[i0+3])/ps*ps*sdd+(ipiv[i0+3])%ps);
+ drowsw_lib(n-jj-4, pD+(i0+3)/ps*ps*sdd+(i0+3)%ps+(jj+4)*ps, pD+(ipiv[i0+3])/ps*ps*sdd+(ipiv[i0+3])%ps+(jj+4)*ps);
+ }
+ // right block-column
+ ii = i0;
+ kernel_dtrsm_nn_ll_one_4x4_vs_lib4(ii, &pD[ii*sdd], &pD[(jj+4)*ps], sdd, &pD[(jj+4)*ps+ii*sdd], &pD[(jj+4)*ps+ii*sdd], &pD[ii*ps+ii*sdd], 4, n-jj-4);
+ ii += 4;
+ i0 = ii;
+ kernel_dgemm_nn_4x4_gen_lib4((jj+4), &dm1, &pD[ii*sdd], 0, &pD[(jj+4)*ps], sdd, &d1, 0, &pD[(jj+4)*ps+ii*sdd], sdd, 0, &pD[(jj+4)*ps+ii*sdd], sdd, 0, m-ii, 0, n-jj-4);
+ kernel_dgetrf_pivot_4_vs_lib4(m-i0, n-jj-4, &pD[(jj+4)*ps+i0*sdd], sdd, &inv_diag_D[(jj+4)], &ipiv[i0]);
+ ipiv[i0+0] += i0;
+ if(ipiv[i0+0]!=i0+0)
+ {
+ drowsw_lib(jj+4, pD+(i0+0)/ps*ps*sdd+(i0+0)%ps, pD+(ipiv[i0+0])/ps*ps*sdd+(ipiv[i0+0])%ps);
+ drowsw_lib(n-jj-8, pD+(i0+0)/ps*ps*sdd+(i0+0)%ps+(jj+8)*ps, pD+(ipiv[i0+0])/ps*ps*sdd+(ipiv[i0+0])%ps+(jj+8)*ps);
+ }
+ if(m-jj-4>1)
+ {
+ ipiv[i0+1] += i0;
+ if(ipiv[i0+1]!=i0+1)
+ {
+ drowsw_lib(jj+4, pD+(i0+1)/ps*ps*sdd+(i0+1)%ps, pD+(ipiv[i0+1])/ps*ps*sdd+(ipiv[i0+1])%ps);
+ drowsw_lib(n-jj-8, pD+(i0+1)/ps*ps*sdd+(i0+1)%ps+(jj+8)*ps, pD+(ipiv[i0+1])/ps*ps*sdd+(ipiv[i0+1])%ps+(jj+8)*ps);
+ }
+ if(m-jj-4>2)
+ {
+ ipiv[i0+2] += i0;
+ if(ipiv[i0+2]!=i0+2)
+ {
+ drowsw_lib(jj+4, pD+(i0+2)/ps*ps*sdd+(i0+2)%ps, pD+(ipiv[i0+2])/ps*ps*sdd+(ipiv[i0+2])%ps);
+ drowsw_lib(n-jj-8, pD+(i0+2)/ps*ps*sdd+(i0+2)%ps+(jj+8)*ps, pD+(ipiv[i0+2])/ps*ps*sdd+(ipiv[i0+2])%ps+(jj+8)*ps);
+ }
+ if(m-jj-4>3)
+ {
+ ipiv[i0+3] += i0;
+ if(ipiv[i0+3]!=i0+3)
+ {
+ drowsw_lib(jj+4, pD+(i0+3)/ps*ps*sdd+(i0+3)%ps, pD+(ipiv[i0+3])/ps*ps*sdd+(ipiv[i0+3])%ps);
+ drowsw_lib(n-jj-8, pD+(i0+3)/ps*ps*sdd+(i0+3)%ps+(jj+8)*ps, pD+(ipiv[i0+3])/ps*ps*sdd+(ipiv[i0+3])%ps+(jj+8)*ps);
+ }
+ }
+ }
+ }
+
+ // solve upper
+ i0 -= 4;
+ ll = jj+8;
+ for( ; ll<n; ll+=4)
+ {
+ kernel_dtrsm_nn_ll_one_8x4_vs_lib4(i0, &pD[i0*sdd], sdd, &pD[ll*ps], sdd, &pD[ll*ps+i0*sdd], sdd, &pD[ll*ps+i0*sdd], sdd, &pD[i0*ps+i0*sdd], sdd, m-i0, n-ll);
+ }
+ return;
+#endif
+
+
+ left_n_4:
+ // 1-4 columns at a time
+ // pivot & factorize & solve lower
+ ii = jj;
+ i0 = ii;
+#if 0//defined(TARGET_X64_AVX2) || defined(TARGET_X64_AVX)
+ for( ; ii<m-4; ii+=8)
+ {
+ kernel_dgemm_nn_8x4_vs_lib4(m-ii, n-jj, jj, &pD[ii*sdd], sdd, &pD[jj*ps], sdd, -1, &pD[jj*ps+ii*sdd], sdd, &pD[jj*ps+ii*sdd], sdd, 0, 0);
+ }
+ if(m-ii>0)
+ {
+ kernel_dgemm_nn_4x4_vs_lib4(m-ii, n-jj, jj, &pD[ii*sdd], &pD[jj*ps], sdd, -1, &pD[jj*ps+ii*sdd], &pD[jj*ps+ii*sdd], 0, 0);
+// ii+=4;
+ }
+#else
+ for( ; ii<m; ii+=4)
+ {
+ kernel_dgemm_nn_4x4_gen_lib4(jj, &dm1, &pD[ii*sdd], 0, &pD[jj*ps], sdd, &d1, 0, &pD[jj*ps+ii*sdd], sdd, 0, &pD[jj*ps+ii*sdd], sdd, 0, m-ii, 0, n-jj);
+ }
+#endif
+ kernel_dgetrf_pivot_4_vs_lib4(m-i0, n-jj, &pD[jj*ps+i0*sdd], sdd, &inv_diag_D[jj], &ipiv[i0]);
+ ipiv[i0+0] += i0;
+ if(ipiv[i0+0]!=i0+0)
+ {
+ drowsw_lib(jj, pD+(i0+0)/ps*ps*sdd+(i0+0)%ps, pD+(ipiv[i0+0])/ps*ps*sdd+(ipiv[i0+0])%ps);
+ drowsw_lib(n-jj-4, pD+(i0+0)/ps*ps*sdd+(i0+0)%ps+(jj+4)*ps, pD+(ipiv[i0+0])/ps*ps*sdd+(ipiv[i0+0])%ps+(jj+4)*ps);
+ }
+ if(n-jj>1)
+ {
+ ipiv[i0+1] += i0;
+ if(ipiv[i0+1]!=i0+1)
+ {
+ drowsw_lib(jj, pD+(i0+1)/ps*ps*sdd+(i0+1)%ps, pD+(ipiv[i0+1])/ps*ps*sdd+(ipiv[i0+1])%ps);
+ drowsw_lib(n-jj-4, pD+(i0+1)/ps*ps*sdd+(i0+1)%ps+(jj+4)*ps, pD+(ipiv[i0+1])/ps*ps*sdd+(ipiv[i0+1])%ps+(jj+4)*ps);
+ }
+ if(n-jj>2)
+ {
+ ipiv[i0+2] += i0;
+ if(ipiv[i0+2]!=i0+2)
+ {
+ drowsw_lib(jj, pD+(i0+2)/ps*ps*sdd+(i0+2)%ps, pD+(ipiv[i0+2])/ps*ps*sdd+(ipiv[i0+2])%ps);
+ drowsw_lib(n-jj-4, pD+(i0+2)/ps*ps*sdd+(i0+2)%ps+(jj+4)*ps, pD+(ipiv[i0+2])/ps*ps*sdd+(ipiv[i0+2])%ps+(jj+4)*ps);
+ }
+ if(n-jj>3)
+ {
+ ipiv[i0+3] += i0;
+ if(ipiv[i0+3]!=i0+3)
+ {
+ drowsw_lib(jj, pD+(i0+3)/ps*ps*sdd+(i0+3)%ps, pD+(ipiv[i0+3])/ps*ps*sdd+(ipiv[i0+3])%ps);
+ drowsw_lib(n-jj-4, pD+(i0+3)/ps*ps*sdd+(i0+3)%ps+(jj+4)*ps, pD+(ipiv[i0+3])/ps*ps*sdd+(ipiv[i0+3])%ps+(jj+4)*ps);
+ }
+ }
+ }
+ }
+
+ // solve upper
+ if(0) // there is no upper
+ {
+ ll = jj+4;
+ for( ; ll<n; ll+=4)
+ {
+ kernel_dtrsm_nn_ll_one_4x4_vs_lib4(i0, &pD[i0*sdd], &pD[ll*ps], sdd, &pD[ll*ps+i0*sdd], &pD[ll*ps+i0*sdd], &pD[i0*ps+i0*sdd], m-i0, n-ll);
+ }
+ }
+ return;
+
+
+ left_m_4:
+ // 1-4 rows at a time
+ // pivot & factorize & solve lower
+ ii = jj;
+ i0 = ii;
+ kernel_dgemm_nn_4x4_gen_lib4(jj, &dm1, &pD[ii*sdd], 0, &pD[jj*ps], sdd, &d1, 0, &pD[jj*ps+ii*sdd], sdd, 0, &pD[jj*ps+ii*sdd], sdd, 0, m-ii, 0, n-jj);
+ kernel_dgetrf_pivot_4_vs_lib4(m-i0, n-jj, &pD[jj*ps+i0*sdd], sdd, &inv_diag_D[jj], &ipiv[i0]);
+ ipiv[i0+0] += i0;
+ if(ipiv[i0+0]!=i0+0)
+ {
+ drowsw_lib(jj, pD+(i0+0)/ps*ps*sdd+(i0+0)%ps, pD+(ipiv[i0+0])/ps*ps*sdd+(ipiv[i0+0])%ps);
+ drowsw_lib(n-jj-4, pD+(i0+0)/ps*ps*sdd+(i0+0)%ps+(jj+4)*ps, pD+(ipiv[i0+0])/ps*ps*sdd+(ipiv[i0+0])%ps+(jj+4)*ps);
+ }
+ if(m-i0>1)
+ {
+ ipiv[i0+1] += i0;
+ if(ipiv[i0+1]!=i0+1)
+ {
+ drowsw_lib(jj, pD+(i0+1)/ps*ps*sdd+(i0+1)%ps, pD+(ipiv[i0+1])/ps*ps*sdd+(ipiv[i0+1])%ps);
+ drowsw_lib(n-jj-4, pD+(i0+1)/ps*ps*sdd+(i0+1)%ps+(jj+4)*ps, pD+(ipiv[i0+1])/ps*ps*sdd+(ipiv[i0+1])%ps+(jj+4)*ps);
+ }
+ if(m-i0>2)
+ {
+ ipiv[i0+2] += i0;
+ if(ipiv[i0+2]!=i0+2)
+ {
+ drowsw_lib(jj, pD+(i0+2)/ps*ps*sdd+(i0+2)%ps, pD+(ipiv[i0+2])/ps*ps*sdd+(ipiv[i0+2])%ps);
+ drowsw_lib(n-jj-4, pD+(i0+2)/ps*ps*sdd+(i0+2)%ps+(jj+4)*ps, pD+(ipiv[i0+2])/ps*ps*sdd+(ipiv[i0+2])%ps+(jj+4)*ps);
+ }
+ if(m-i0>3)
+ {
+ ipiv[i0+3] += i0;
+ if(ipiv[i0+3]!=i0+3)
+ {
+ drowsw_lib(jj, pD+(i0+3)/ps*ps*sdd+(i0+3)%ps, pD+(ipiv[i0+3])/ps*ps*sdd+(ipiv[i0+3])%ps);
+ drowsw_lib(n-jj-4, pD+(i0+3)/ps*ps*sdd+(i0+3)%ps+(jj+4)*ps, pD+(ipiv[i0+3])/ps*ps*sdd+(ipiv[i0+3])%ps+(jj+4)*ps);
+ }
+ }
+ }
+ }
+
+ // solve upper
+ ll = jj+4;
+ for( ; ll<n; ll+=4)
+ {
+ kernel_dtrsm_nn_ll_one_4x4_vs_lib4(i0, &pD[i0*sdd], &pD[ll*ps], sdd, &pD[ll*ps+i0*sdd], &pD[ll*ps+i0*sdd], &pD[i0*ps+i0*sdd], m-i0, n-ll);
+ }
+ return;
+
+ }
+
+
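+/* Note on the address arithmetic in the drowsw_lib calls above: the matrix is
+   stored panel-major with panel size ps=4, so row i of a block-column starting
+   at pD lives at pD + i/ps*ps*sdd + i%ps (the integer division selects the
+   4-row panel, the remainder the row inside it). A hypothetical helper, not
+   part of BLASFEO, spelling this out: */
+#if 0
+static double * row_ptr(double *pD, int sdd, int i)
+ {
+ const int ps = 4; // panel size (rows per panel)
+ return pD + i/ps*ps*sdd + i%ps; // e.g. i=6: pD + 4*sdd + 2
+ }
+#endif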
+#if 0
+void dlauum_dpotrf_blk_nt_l_lib(int m, int n, int nv, int *rv, int *cv, double *pA, int sda, double *pB, int sdb, int alg, double *pC, int sdc, double *pD, int sdd, double *inv_diag_D)
+ {
+
+ if(m<=0 || n<=0)
+ return;
+
+ // TODO remove
+ int k = cv[nv-1];
+
+ const int ps = 4;
+
+ int i, j, l;
+ int ii, iii, jj, kii, kiii, kjj, k0, k1;
+
+ i = 0;
+ ii = 0;
+ iii = 0;
+
+#if defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_HASWELL)
+ for(; i<m-7; i+=8)
+ {
+
+ while(ii<nv && rv[ii]<i+8)
+ ii++;
+ if(ii<nv)
+ kii = cv[ii];
+ else
+ kii = cv[ii-1];
+
+ j = 0;
+ jj = 0;
+ for(; j<i && j<n-3; j+=4)
+ {
+
+ while(jj<nv && rv[jj]<j+4)
+ jj++;
+ if(jj<nv)
+ kjj = cv[jj];
+ else
+ kjj = cv[jj-1];
+ k0 = kii<kjj ? kii : kjj;
+
+ kernel_dgemm_dtrsm_nt_rl_inv_8x4_lib4(k0, &pA[i*sda], sda, &pB[j*sdb], j, &pD[i*sdd], sdd, &pD[j*sdd], alg, &pC[j*ps+i*sdc], sdc, &pD[j*ps+i*sdd], sdd, &pD[j*ps+j*sdd], &inv_diag_D[j]);
+ }
+ if(j<n)
+ {
+
+ while(jj<nv && rv[jj]<j+4)
+ jj++;
+ if(jj<nv)
+ kjj = cv[jj];
+ else
+ kjj = cv[jj-1];
+ k0 = kii<kjj ? kii : kjj;
+
+ if(j<i) // dgemm
+ {
+ kernel_dgemm_dtrsm_nt_rl_inv_8x4_vs_lib4(k0, &pA[i*sda], sda, &pB[j*sdb], j, &pD[i*sdd], sdd, &pD[j*sdd], alg, &pC[j*ps+i*sdc], sdc, &pD[j*ps+i*sdd], sdd, &pD[j*ps+j*sdd], &inv_diag_D[j], 8, n-j);
+ }
+ else // dsyrk
+ {
+ kernel_dsyrk_dpotrf_nt_l_8x4_vs_lib4(k0, &pA[i*sda], sda, &pB[j*sdb], j, &pD[i*sdd], sdd, &pD[j*sdd], alg, &pC[j*ps+j*sdc], sdc, &pD[j*ps+j*sdd], sdd, &inv_diag_D[j], 8, n-j);
+ if(j<n-4)
+ {
+ kernel_dsyrk_dpotrf_nt_l_4x4_vs_lib4(k, &pA[(i+4)*sda], &pB[(j+4)*sdb], j+4, &pD[(i+4)*sdd], &pD[(j+4)*sdd], alg, &pC[(j+4)*ps+(j+4)*sdc], &pD[(j+4)*ps+(j+4)*sdd], &inv_diag_D[j+4], 4, n-j-4); // TODO
+ }
+ }
+ }
+ }
+ if(m>i)
+ {
+ if(m-i<=4)
+ {
+ goto left_4;
+ }
+ else
+ {
+ goto left_8;
+ }
+ }
+#else
+ for(; i<m-3; i+=4)
+ {
+
+ while(ii<nv && rv[ii]<i+4)
+ ii++;
+ if(ii<nv)
+ kii = cv[ii];
+ else
+ kii = cv[ii-1];
+
+ j = 0;
+ jj = 0;
+ for(; j<i && j<n-3; j+=4)
+ {
+
+ while(jj<nv && rv[jj]<j+4)
+ jj++;
+ if(jj<nv)
+ kjj = cv[jj];
+ else
+ kjj = cv[jj-1];
+ k0 = kii<kjj ? kii : kjj;
+
+ kernel_dgemm_dtrsm_nt_rl_inv_4x4_lib4(k0, &pA[i*sda], &pB[j*sdb], j, &pD[i*sdd], &pD[j*sdd], alg, &pC[j*ps+i*sdc], &pD[j*ps+i*sdd], &pD[j*ps+j*sdd], &inv_diag_D[j]);
+ }
+ if(j<n)
+ {
+
+ while(jj<nv && rv[jj]<j+4)
+ jj++;
+ if(jj<nv)
+ kjj = cv[jj];
+ else
+ kjj = cv[jj-1];
+ k0 = kii<kjj ? kii : kjj;
+
+ if(i<j) // dgemm
+ {
+ kernel_dgemm_dtrsm_nt_rl_inv_4x4_vs_lib4(k0, &pA[i*sda], &pB[j*sdb], j, &pD[i*sdd], &pD[j*sdd], alg, &pC[j*ps+i*sdc], &pD[j*ps+i*sdd], &pD[j*ps+j*sdd], &inv_diag_D[j], 4, n-j);
+ }
+ else // dsyrk
+ {
+ kernel_dsyrk_dpotrf_nt_l_4x4_vs_lib4(k0, &pA[i*sda], &pB[j*sdb], j, &pD[i*sdd], &pD[j*sdd], alg, &pC[j*ps+j*sdc], &pD[j*ps+j*sdd], &inv_diag_D[j], 4, n-j);
+ }
+ }
+ }
+ if(m>i)
+ {
+ goto left_4;
+ }
+#endif
+
+ // common return if i==m
+ return;
+
+ // clean-up loop definitions
+
+#if defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_HASWELL)
+ left_8:
+
+ kii = cv[nv-1];
+
+ j = 0;
+ jj = 0;
+ for(; j<i && j<n-3; j+=4)
+ {
+
+ while(jj<nv && rv[jj]<j+4)
+ jj++;
+ if(jj<nv)
+ kjj = cv[jj];
+ else
+ kjj = cv[jj-1];
+ k0 = kii<kjj ? kii : kjj;
+
+ kernel_dgemm_dtrsm_nt_rl_inv_8x4_vs_lib4(k0, &pA[i*sda], sda, &pB[j*sdb], j, &pD[i*sdd], sdd, &pD[j*sdd], alg, &pC[j*ps+i*sdc], sdc, &pD[j*ps+i*sdd], sdd, &pD[j*ps+j*sdd], &inv_diag_D[j], m-i, n-j);
+ }
+ if(j<n)
+ {
+
+ while(jj<nv && rv[jj]<j+4)
+ jj++;
+ if(jj<nv)
+ kjj = cv[jj];
+ else
+ kjj = cv[jj-1];
+ k0 = kii<kjj ? kii : kjj;
+
+ if(j<i) // dgemm
+ {
+ kernel_dgemm_dtrsm_nt_rl_inv_8x4_vs_lib4(k0, &pA[i*sda], sda, &pB[j*sdb], j, &pD[i*sdd], sdd, &pD[j*sdd], alg, &pC[j*ps+i*sdc], sdc, &pD[j*ps+i*sdd], sdd, &pD[j*ps+j*sdd], &inv_diag_D[j], m-i, n-j);
+ }
+ else // dsyrk
+ {
+ kernel_dsyrk_dpotrf_nt_l_8x4_vs_lib4(k0, &pA[i*sda], sda, &pB[j*sdb], j, &pD[i*sdd], sdd, &pD[j*sdd], alg, &pC[j*ps+j*sdc], sdc, &pD[j*ps+j*sdd], sdd, &inv_diag_D[j], m-i, n-j);
+ if(j<n-4)
+ {
+ kernel_dsyrk_dpotrf_nt_l_4x4_vs_lib4(k, &pA[(i+4)*sda], &pB[(j+4)*sdb], j+4, &pD[(i+4)*sdd], &pD[(j+4)*sdd], alg, &pC[(j+4)*ps+(j+4)*sdc], &pD[(j+4)*ps+(j+4)*sdd], &inv_diag_D[j+4], m-i-4, n-j-4); // TODO
+ }
+ }
+ }
+ return;
+#endif
+
+ left_4:
+
+ kii = cv[nv-1];
+
+ j = 0;
+ jj = 0;
+ for(; j<i && j<n-3; j+=4)
+ {
+
+ while(jj<nv && rv[jj]<j+4)
+ jj++;
+ if(jj<nv)
+ kjj = cv[jj];
+ else
+ kjj = cv[jj-1];
+ k0 = kii<kjj ? kii : kjj;
+
+ kernel_dgemm_dtrsm_nt_rl_inv_4x4_vs_lib4(k0, &pA[i*sda], &pB[j*sdb], j, &pD[i*sdd], &pD[j*sdd], alg, &pC[j*ps+i*sdc], &pD[j*ps+i*sdd], &pD[j*ps+j*sdd], &inv_diag_D[j], m-i, n-j);
+ }
+ if(j<n)
+ {
+
+ while(jj<nv && rv[jj]<j+4)
+ jj++;
+ if(jj<nv)
+ kjj = cv[jj];
+ else
+ kjj = cv[jj-1];
+ k0 = kii<kjj ? kii : kjj;
+
+ if(j<i) // dgemm
+ {
+ kernel_dgemm_dtrsm_nt_rl_inv_4x4_vs_lib4(k0, &pA[i*sda], &pB[j*sdb], j, &pD[i*sdd], &pD[j*sdd], alg, &pC[j*ps+i*sdc], &pD[j*ps+i*sdd], &pD[j*ps+j*sdd], &inv_diag_D[j], m-i, n-j);
+ }
+ else // dsyrk
+ {
+ kernel_dsyrk_dpotrf_nt_l_4x4_vs_lib4(k0, &pA[i*sda], &pB[j*sdb], j, &pD[i*sdd], &pD[j*sdd], alg, &pC[j*ps+j*sdc], &pD[j*ps+j*sdd], &inv_diag_D[j], m-i, n-j);
+ }
+ }
+ return;
+
+ }
+#endif
+
+
+
+
+/****************************
+* new interface
+****************************/
+
+
+
+#if defined(LA_HIGH_PERFORMANCE)
+
+
+
+// dpotrf
+void dpotrf_l_libstr(int m, struct d_strmat *sC, int ci, int cj, struct d_strmat *sD, int di, int dj)
+ {
+
+ if(m<=0)
+ return;
+
+ if(ci!=0 | di!=0)
+ {
+ printf("\ndpotrf_l_libstr: feature not implemented yet: ci=%d, di=%d\n", ci, di);
+ exit(1);
+ }
+
+ const int ps = 4;
+
+ int sdc = sC->cn;
+ int sdd = sD->cn;
+ double *pC = sC->pA + cj*ps;
+ double *pD = sD->pA + dj*ps;
+ double *dD = sD->dA;
+
+ if(di==0 & dj==0) // XXX what to do if di and dj are not zero
+ sD->use_dA = 1;
+ else
+ sD->use_dA = 0;
+
+ int i, j, l;
+
+ i = 0;
+#if defined(TARGET_X64_INTEL_HASWELL)
+ for(; i<m-11; i+=12)
+ {
+ j = 0;
+ for(; j<i; j+=4)
+ {
+ kernel_dtrsm_nt_rl_inv_12x4_lib4(j, &pD[i*sdd], sdd, &pD[j*sdd], &pC[j*ps+i*sdc], sdc, &pD[j*ps+i*sdd], sdd, &pD[j*ps+j*sdd], &dD[j]);
+ }
+ kernel_dpotrf_nt_l_12x4_lib4(j, &pD[i*sdd], sdd, &pD[j*sdd], &pC[j*ps+j*sdc], sdc, &pD[j*ps+j*sdd], sdd, &dD[j]);
+ kernel_dpotrf_nt_l_8x8_lib4(j+4, &pD[(i+4)*sdd], sdd, &pD[(j+4)*sdd], sdd, &pC[(j+4)*ps+(i+4)*sdc], sdc, &pD[(j+4)*ps+(i+4)*sdd], sdd, &dD[j+4]);
+ }
+ if(m>i)
+ {
+ if(m-i<=4)
+ {
+ goto left_4;
+ }
+ else if(m-i<=8)
+ {
+ goto left_8;
+ }
+ else
+ {
+ goto left_12;
+ }
+ }
+#elif defined(TARGET_X64_INTEL_SANDY_BRIDGE)
+ for(; i<m-7; i+=8)
+ {
+ j = 0;
+ for(; j<i; j+=4)
+ {
+ kernel_dtrsm_nt_rl_inv_8x4_lib4(j, &pD[i*sdd], sdd, &pD[j*sdd], &pC[j*ps+i*sdc], sdc, &pD[j*ps+i*sdd], sdd, &pD[j*ps+j*sdd], &dD[j]);
+ }
+ kernel_dpotrf_nt_l_8x4_lib4(j, &pD[i*sdd], sdd, &pD[j*sdd], &pC[j*ps+j*sdc], sdc, &pD[j*ps+j*sdd], sdd, &dD[j]);
+ kernel_dpotrf_nt_l_4x4_lib4(j+4, &pD[(i+4)*sdd], &pD[(j+4)*sdd], &pC[(j+4)*ps+(i+4)*sdc], &pD[(j+4)*ps+(i+4)*sdd], &dD[j+4]);
+ }
+ if(m>i)
+ {
+ if(m-i<=4)
+ {
+ goto left_4;
+ }
+ else
+ {
+ goto left_8;
+ }
+ }
+#else
+ for(; i<m-3; i+=4)
+ {
+ j = 0;
+ for(; j<i; j+=4)
+ {
+ kernel_dtrsm_nt_rl_inv_4x4_lib4(j, &pD[i*sdd], &pD[j*sdd], &pC[j*ps+i*sdc], &pD[j*ps+i*sdd], &pD[j*ps+j*sdd], &dD[j]);
+ }
+ kernel_dpotrf_nt_l_4x4_lib4(j, &pD[i*sdd], &pD[j*sdd], &pC[j*ps+j*sdc], &pD[j*ps+j*sdd], &dD[j]);
+ }
+ if(m>i)
+ {
+ goto left_4;
+ }
+#endif
+
+ // common return if i==m
+ return;
+
+ // clean-up loop definitions
+
+#if defined(TARGET_X64_INTEL_HASWELL)
+ left_12: // 9 - 12
+ j = 0;
+ for(; j<i; j+=4)
+ {
+ kernel_dtrsm_nt_rl_inv_12x4_vs_lib4(j, &pD[i*sdd], sdd, &pD[j*sdd], &pC[j*ps+i*sdc], sdc, &pD[j*ps+i*sdd], sdd, &pD[j*ps+j*sdd], &dD[j], m-i, m-j);
+ }
+ kernel_dpotrf_nt_l_12x4_vs_lib4(j, &pD[i*sdd], sdd, &pD[j*sdd], &pC[j*ps+j*sdc], sdc, &pD[j*ps+j*sdd], sdd, &dD[j], m-i, m-j);
+ kernel_dpotrf_nt_l_8x8_vs_lib4(j+4, &pD[(i+4)*sdd], sdd, &pD[(j+4)*sdd], sdd, &pC[(j+4)*ps+(i+4)*sdc], sdc, &pD[(j+4)*ps+(i+4)*sdd], sdd, &dD[j+4], m-i-4, m-j-4);
+ return;
+#endif
+
+#if defined(TARGET_X64_INTEL_HASWELL)
+ left_8:
+ j = 0;
+ for(; j<i-8; j+=12)
+ {
+ kernel_dtrsm_nt_rl_inv_8x8l_vs_lib4(j, &pD[i*sdd], sdd, &pD[j*sdd], sdd, &pC[j*ps+i*sdc], sdc, &pD[j*ps+i*sdd], sdd, &pD[j*ps+j*sdd], sdd, &dD[j], m-i, m-j);
+ kernel_dtrsm_nt_rl_inv_8x8u_vs_lib4((j+4), &pD[i*sdd], sdd, &pD[(j+4)*sdd], sdd, &pC[(j+4)*ps+i*sdc], sdc, &pD[(j+4)*ps+i*sdd], sdd, &pD[(j+4)*ps+(j+4)*sdd], sdd, &dD[(j+4)], m-i, m-(j+4));
+ }
+ if(j<i-4)
+ {
+ kernel_dtrsm_nt_rl_inv_8x8l_vs_lib4(j, &pD[i*sdd], sdd, &pD[j*sdd], sdd, &pC[j*ps+i*sdc], sdc, &pD[j*ps+i*sdd], sdd, &pD[j*ps+j*sdd], sdd, &dD[j], m-i, m-j);
+ kernel_dtrsm_nt_rl_inv_4x4_vs_lib4((j+4), &pD[i*sdd], &pD[(j+4)*sdd], &pC[(j+4)*ps+i*sdc], &pD[(j+4)*ps+i*sdd], &pD[(j+4)*ps+(j+4)*sdd], &dD[(j+4)], m-i, m-(j+4));
+ j += 8;
+ }
+ else if(j<i)
+ {
+ kernel_dtrsm_nt_rl_inv_8x4_vs_lib4(j, &pD[i*sdd], sdd, &pD[j*sdd], &pC[j*ps+i*sdc], sdc, &pD[j*ps+i*sdd], sdd, &pD[j*ps+j*sdd], &dD[j], m-i, m-j);
+ j += 4;
+ }
+ kernel_dpotrf_nt_l_8x8_vs_lib4(j, &pD[i*sdd], sdd, &pD[j*sdd], sdd, &pC[j*ps+j*sdc], sdc, &pD[j*ps+j*sdd], sdd, &dD[j], m-i, m-j);
+ return;
+#endif
+#if defined(TARGET_X64_INTEL_SANDY_BRIDGE)
+ left_8:
+ j = 0;
+ for(; j<i; j+=4)
+ {
+ kernel_dtrsm_nt_rl_inv_8x4_vs_lib4(j, &pD[i*sdd], sdd, &pD[j*sdd], &pC[j*ps+i*sdc], sdc, &pD[j*ps+i*sdd], sdd, &pD[j*ps+j*sdd], &dD[j], m-i, m-j);
+ }
+ kernel_dpotrf_nt_l_8x4_vs_lib4(j, &pD[i*sdd], sdd, &pD[j*sdd], &pC[j*ps+j*sdc], sdc, &pD[j*ps+j*sdd], sdd, &dD[j], m-i, m-j);
+ kernel_dpotrf_nt_l_4x4_vs_lib4(j+4, &pD[(i+4)*sdd], &pD[(j+4)*sdd], &pC[(j+4)*ps+(i+4)*sdc], &pD[(j+4)*ps+(i+4)*sdd], &dD[j+4], m-i-4, m-j-4);
+ return;
+#endif
+
+#if defined(TARGET_X64_INTEL_HASWELL)
+ left_4:
+ j = 0;
+ for(; j<i-8; j+=12)
+ {
+ kernel_dtrsm_nt_rl_inv_4x12_vs_lib4(j, &pD[i*sdd], &pD[j*sdd], sdd, &pC[j*ps+i*sdc], &pD[j*ps+i*sdd], &pD[j*ps+j*sdd], sdd, &dD[j], m-i, m-j);
+ }
+ if(j<i-4)
+ {
+ kernel_dtrsm_nt_rl_inv_4x8_vs_lib4(j, &pD[i*sdd], &pD[j*sdd], sdd, &pC[j*ps+i*sdc], &pD[j*ps+i*sdd], &pD[j*ps+j*sdd], sdd, &dD[j], m-i, m-j);
+ j += 8;
+ }
+ else if(j<i)
+ {
+ kernel_dtrsm_nt_rl_inv_4x4_vs_lib4(j, &pD[i*sdd], &pD[j*sdd], &pC[j*ps+i*sdc], &pD[j*ps+i*sdd], &pD[j*ps+j*sdd], &dD[j], m-i, m-j);
+ j += 4;
+ }
+ kernel_dpotrf_nt_l_4x4_vs_lib4(j, &pD[i*sdd], &pD[j*sdd], &pC[j*ps+j*sdc], &pD[j*ps+j*sdd], &dD[j], m-i, m-j);
+ return;
+#else
+ left_4:
+ j = 0;
+ for(; j<i; j+=4)
+ {
+ kernel_dtrsm_nt_rl_inv_4x4_vs_lib4(j, &pD[i*sdd], &pD[j*sdd], &pC[j*ps+i*sdc], &pD[j*ps+i*sdd], &pD[j*ps+j*sdd], &dD[j], m-i, m-j);
+ }
+ kernel_dpotrf_nt_l_4x4_vs_lib4(j, &pD[i*sdd], &pD[j*sdd], &pC[j*ps+j*sdc], &pD[j*ps+j*sdd], &dD[j], m-i, m-j);
+ return;
+#endif
+
+ }
+
+
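+/* A minimal usage sketch for dpotrf_l_libstr below (kept disabled). It assumes
+   the allocation helpers d_allocate_strmat / d_cvt_mat2strmat / d_free_strmat
+   from the auxiliary module; matrix names and sizes are illustrative only, not
+   part of the library: */
+#if 0
+void example_dpotrf_l(void)
+ {
+ int n = 8;
+ double A[8*8];
+ int ii, jj;
+ for(ii=0; ii<n; ii++)
+ for(jj=0; jj<n; jj++)
+ A[ii+n*jj] = ii==jj ? n : 1.0; // diagonally dominant => SPD
+ struct d_strmat sA, sL;
+ d_allocate_strmat(n, n, &sA);
+ d_allocate_strmat(n, n, &sL);
+ d_cvt_mat2strmat(n, n, A, n, &sA, 0, 0);
+ dpotrf_l_libstr(n, &sA, 0, 0, &sL, 0, 0); // lower Cholesky factor in sL
+ d_free_strmat(&sA);
+ d_free_strmat(&sL);
+ }
+#endif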
+
+// dpotrf, rectangular m x n variant
+void dpotrf_l_mn_libstr(int m, int n, struct d_strmat *sC, int ci, int cj, struct d_strmat *sD, int di, int dj)
+ {
+
+ if(m<=0 || n<=0)
+ return;
+
+ if(ci!=0 | di!=0)
+ {
+ printf("\ndpotrf_l_mn_libstr: feature not implemented yet: ci=%d, di=%d\n", ci, di);
+ exit(1);
+ }
+
+ const int ps = 4;
+
+ int sdc = sC->cn;
+ int sdd = sD->cn;
+ double *pC = sC->pA + cj*ps;
+ double *pD = sD->pA + dj*ps;
+ double *dD = sD->dA;
+
+ if(di==0 & dj==0) // XXX what to do if di and dj are not zero
+ sD->use_dA = 1;
+ else
+ sD->use_dA = 0;
+
+ int i, j, l;
+
+ i = 0;
+#if defined(TARGET_X64_INTEL_HASWELL)
+ for(; i<m-11; i+=12)
+ {
+ j = 0;
+ for(; j<i & j<n-3; j+=4)
+ {
+ kernel_dtrsm_nt_rl_inv_12x4_lib4(j, &pD[i*sdd], sdd, &pD[j*sdd], &pC[j*ps+i*sdc], sdc, &pD[j*ps+i*sdd], sdd, &pD[j*ps+j*sdd], &dD[j]);
+ }
+ if(j<n)
+ {
+ if(j<i) // dtrsm
+ {
+ kernel_dtrsm_nt_rl_inv_12x4_vs_lib4(j, &pD[i*sdd], sdd, &pD[j*sdd], &pC[j*ps+i*sdc], sdc, &pD[j*ps+i*sdd], sdd, &pD[j*ps+j*sdd], &dD[j], m-i, n-j);
+ }
+ else // dpotrf
+ {
+ if(j<n-11)
+ {
+ kernel_dpotrf_nt_l_12x4_lib4(j, &pD[i*sdd], sdd, &pD[j*sdd], &pC[j*ps+j*sdc], sdc, &pD[j*ps+j*sdd], sdd, &dD[j]);
+ kernel_dpotrf_nt_l_8x8_lib4(j+4, &pD[(i+4)*sdd], sdd, &pD[(j+4)*sdd], sdd, &pC[(j+4)*ps+(i+4)*sdc], sdc, &pD[(j+4)*ps+(i+4)*sdd], sdd, &dD[j+4]);
+ }
+ else
+ {
+ kernel_dpotrf_nt_l_12x4_vs_lib4(j, &pD[i*sdd], sdd, &pD[j*sdd], &pC[j*ps+j*sdc], sdc, &pD[j*ps+j*sdd], sdd, &dD[j], m-i, n-j);
+ if(j<n-4)
+ {
+ kernel_dpotrf_nt_l_8x4_vs_lib4(j+4, &pD[(i+4)*sdd], sdd, &pD[(j+4)*sdd], &pC[(j+4)*ps+(i+4)*sdc], sdc, &pD[(j+4)*ps+(i+4)*sdd], sdd, &dD[j+4], m-i-4, n-j-4);
+ if(j<n-8)
+ {
+ kernel_dpotrf_nt_l_4x4_vs_lib4(j+8, &pD[(i+8)*sdd], &pD[(j+8)*sdd], &pC[(j+8)*ps+(i+8)*sdc], &pD[(j+8)*ps+(i+8)*sdd], &dD[j+8], m-i-8, n-j-8);
+ }
+ }
+ }
+ }
+ }
+ }
+ if(m>i)
+ {
+ if(m-i<=4)
+ {
+ goto left_4;
+ }
+ else if(m-i<=8)
+ {
+ goto left_8;
+ }
+ else
+ {
+ goto left_12;
+ }
+ }
+#elif defined(TARGET_X64_INTEL_SANDY_BRIDGE)
+ for(; i<m-7; i+=8)
+ {
+ j = 0;
+ for(; j<i & j<n-3; j+=4)
+ {
+ kernel_dtrsm_nt_rl_inv_8x4_lib4(j, &pD[i*sdd], sdd, &pD[j*sdd], &pC[j*ps+i*sdc], sdc, &pD[j*ps+i*sdd], sdd, &pD[j*ps+j*sdd], &dD[j]);
+ }
+ if(j<n)
+ {
+ if(j<i) // dtrsm
+ {
+ kernel_dtrsm_nt_rl_inv_8x4_vs_lib4(j, &pD[i*sdd], sdd, &pD[j*sdd], &pC[j*ps+i*sdc], sdc, &pD[j*ps+i*sdd], sdd, &pD[j*ps+j*sdd], &dD[j], m-i, n-j);
+ }
+ else // dpotrf
+ {
+ if(j<n-7)
+// if(0)
+ {
+ kernel_dpotrf_nt_l_8x4_lib4(j, &pD[i*sdd], sdd, &pD[j*sdd], &pC[j*ps+j*sdc], sdc, &pD[j*ps+j*sdd], sdd, &dD[j]);
+ kernel_dpotrf_nt_l_4x4_lib4(j+4, &pD[(i+4)*sdd], &pD[(j+4)*sdd], &pC[(j+4)*ps+(i+4)*sdc], &pD[(j+4)*ps+(i+4)*sdd], &dD[j+4]);
+ }
+ else
+ {
+ kernel_dpotrf_nt_l_8x4_vs_lib4(j, &pD[i*sdd], sdd, &pD[j*sdd], &pC[j*ps+j*sdc], sdc, &pD[j*ps+j*sdd], sdd, &dD[j], m-i, n-j);
+ if(j<n-4)
+ {
+ kernel_dpotrf_nt_l_4x4_vs_lib4(j+4, &pD[(i+4)*sdd], &pD[(j+4)*sdd], &pC[(j+4)*ps+(i+4)*sdc], &pD[(j+4)*ps+(i+4)*sdd], &dD[j+4], m-i-4, n-j-4);
+ }
+ }
+ }
+ }
+ }
+ if(m>i)
+ {
+ if(m-i<=4)
+ {
+ goto left_4;
+ }
+ else
+ {
+ goto left_8;
+ }
+ }
+#else
+ for(; i<m-3; i+=4)
+ {
+ j = 0;
+ for(; j<i & j<n-3; j+=4)
+ {
+ kernel_dtrsm_nt_rl_inv_4x4_lib4(j, &pD[i*sdd], &pD[j*sdd], &pC[j*ps+i*sdc], &pD[j*ps+i*sdd], &pD[j*ps+j*sdd], &dD[j]);
+ }
+ if(j<n)
+ {
+ if(i<j) // dtrsm
+ {
+ kernel_dtrsm_nt_rl_inv_4x4_vs_lib4(j, &pD[i*sdd], &pD[j*sdd], &pC[j*ps+i*sdc], &pD[j*ps+i*sdd], &pD[j*ps+j*sdd], &dD[j], m-i, n-j);
+ }
+ else // dpotrf
+ {
+ if(j<n-3)
+ {
+ kernel_dpotrf_nt_l_4x4_lib4(j, &pD[i*sdd], &pD[j*sdd], &pC[j*ps+j*sdc], &pD[j*ps+j*sdd], &dD[j]);
+ }
+ else
+ {
+ kernel_dpotrf_nt_l_4x4_vs_lib4(j, &pD[i*sdd], &pD[j*sdd], &pC[j*ps+j*sdc], &pD[j*ps+j*sdd], &dD[j], m-i, n-j);
+ }
+ }
+ }
+ }
+ if(m>i)
+ {
+ goto left_4;
+ }
+#endif
+
+ // common return if i==m
+ return;
+
+ // clean-up loop definitions
+
+#if defined(TARGET_X64_INTEL_HASWELL)
+ left_12:
+ j = 0;
+ for(; j<i & j<n; j+=4)
+ {
+ kernel_dtrsm_nt_rl_inv_12x4_vs_lib4(j, &pD[i*sdd], sdd, &pD[j*sdd], &pC[j*ps+i*sdc], sdc, &pD[j*ps+i*sdd], sdd, &pD[j*ps+j*sdd], &dD[j], m-i, n-j);
+ }
+ if(j<n)
+ {
+ kernel_dpotrf_nt_l_12x4_vs_lib4(j, &pD[i*sdd], sdd, &pD[j*sdd], &pC[j*ps+j*sdc], sdc, &pD[j*ps+j*sdd], sdd, &dD[j], m-i, n-j);
+ if(j<n-4)
+ {
+ kernel_dpotrf_nt_l_8x4_vs_lib4(j+4, &pD[(i+4)*sdd], sdd, &pD[(j+4)*sdd], &pC[(j+4)*ps+(i+4)*sdc], sdc, &pD[(j+4)*ps+(i+4)*sdd], sdd, &dD[j+4], m-i-4, n-j-4);
+ if(j<n-8)
+ {
+ kernel_dpotrf_nt_l_4x4_vs_lib4(j+8, &pD[(i+8)*sdd], &pD[(j+8)*sdd], &pC[(j+8)*ps+(i+8)*sdc], &pD[(j+8)*ps+(i+8)*sdd], &dD[j+8], m-i-8, n-j-8);
+ }
+ }
+ }
+ return;
+#endif
+
+#if defined(TARGET_X64_INTEL_HASWELL)
+ left_8:
+ j = 0;
+ for(; j<i-8 & j<n-8; j+=12)
+ {
+ kernel_dtrsm_nt_rl_inv_8x8l_vs_lib4(j, &pD[i*sdd], sdd, &pD[j*sdd], sdd, &pC[j*ps+i*sdc], sdc, &pD[j*ps+i*sdd], sdd, &pD[j*ps+j*sdd], sdd, &dD[j], m-i, n-j);
+ kernel_dtrsm_nt_rl_inv_8x8u_vs_lib4((j+4), &pD[i*sdd], sdd, &pD[(j+4)*sdd], sdd, &pC[(j+4)*ps+i*sdc], sdc, &pD[(j+4)*ps+i*sdd], sdd, &pD[(j+4)*ps+(j+4)*sdd], sdd, &dD[(j+4)], m-i, n-(j+4));
+ }
+ if(j<i-4 & j<n-4)
+ {
+ kernel_dtrsm_nt_rl_inv_8x8l_vs_lib4(j, &pD[i*sdd], sdd, &pD[j*sdd], sdd, &pC[j*ps+i*sdc], sdc, &pD[j*ps+i*sdd], sdd, &pD[j*ps+j*sdd], sdd, &dD[j], m-i, n-j);
+ kernel_dtrsm_nt_rl_inv_4x4_vs_lib4((j+4), &pD[i*sdd], &pD[(j+4)*sdd], &pC[(j+4)*ps+i*sdc], &pD[(j+4)*ps+i*sdd], &pD[(j+4)*ps+(j+4)*sdd], &dD[(j+4)], m-i, n-(j+4));
+ j += 8;
+ }
+ else if(j<i & j<n)
+ {
+ kernel_dtrsm_nt_rl_inv_8x4_vs_lib4(j, &pD[i*sdd], sdd, &pD[j*sdd], &pC[j*ps+i*sdc], sdc, &pD[j*ps+i*sdd], sdd, &pD[j*ps+j*sdd], &dD[j], m-i, n-j);
+ j += 4;
+ }
+ if(j<n)
+ {
+ kernel_dpotrf_nt_l_8x4_vs_lib4(j, &pD[i*sdd], sdd, &pD[j*sdd], &pC[j*ps+j*sdc], sdc, &pD[j*ps+j*sdd], sdd, &dD[j], m-i, n-j);
+ if(j<n-4)
+ {
+ kernel_dpotrf_nt_l_4x4_vs_lib4(j+4, &pD[(i+4)*sdd], &pD[(j+4)*sdd], &pC[(j+4)*ps+(i+4)*sdc], &pD[(j+4)*ps+(i+4)*sdd], &dD[j+4], m-i-4, n-j-4);
+ }
+ }
+ return;
+#elif defined(TARGET_X64_INTEL_SANDY_BRIDGE)
+ left_8:
+ j = 0;
+ for(; j<i & j<n; j+=4)
+ {
+ kernel_dtrsm_nt_rl_inv_8x4_vs_lib4(j, &pD[i*sdd], sdd, &pD[j*sdd], &pC[j*ps+i*sdc], sdc, &pD[j*ps+i*sdd], sdd, &pD[j*ps+j*sdd], &dD[j], m-i, n-j);
+ }
+ if(j<n)
+ {
+ kernel_dpotrf_nt_l_8x4_vs_lib4(j, &pD[i*sdd], sdd, &pD[j*sdd], &pC[j*ps+j*sdc], sdc, &pD[j*ps+j*sdd], sdd, &dD[j], m-i, n-j);
+ if(j<n-4)
+ {
+ kernel_dpotrf_nt_l_4x4_vs_lib4(j+4, &pD[(i+4)*sdd], &pD[(j+4)*sdd], &pC[(j+4)*ps+(i+4)*sdc], &pD[(j+4)*ps+(i+4)*sdd], &dD[j+4], m-i-4, n-j-4);
+ }
+ }
+ return;
+#endif
+
+#if defined(TARGET_X64_INTEL_HASWELL)
+ left_4:
+ j = 0;
+ for(; j<i-8 & j<n-8; j+=12)
+ {
+ kernel_dtrsm_nt_rl_inv_4x12_vs_lib4(j, &pD[i*sdd], &pD[j*sdd], sdd, &pC[j*ps+i*sdc], &pD[j*ps+i*sdd], &pD[j*ps+j*sdd], sdd, &dD[j], m-i, n-j);
+ }
+ if(j<i-4 & j<n-4)
+ {
+ kernel_dtrsm_nt_rl_inv_4x8_vs_lib4(j, &pD[i*sdd], &pD[j*sdd], sdd, &pC[j*ps+i*sdc], &pD[j*ps+i*sdd], &pD[j*ps+j*sdd], sdd, &dD[j], m-i, n-j);
+ j += 8;
+ }
+ else if(j<i & j<n)
+ {
+ kernel_dtrsm_nt_rl_inv_4x4_vs_lib4(j, &pD[i*sdd], &pD[j*sdd], &pC[j*ps+i*sdc], &pD[j*ps+i*sdd], &pD[j*ps+j*sdd], &dD[j], m-i, n-j);
+ j += 4;
+ }
+ if(j<n)
+ {
+ kernel_dpotrf_nt_l_4x4_vs_lib4(j, &pD[i*sdd], &pD[j*sdd], &pC[j*ps+j*sdc], &pD[j*ps+j*sdd], &dD[j], m-i, n-j);
+ }
+ return;
+#else
+ left_4:
+ j = 0;
+ for(; j<i & j<n; j+=4)
+ {
+ kernel_dtrsm_nt_rl_inv_4x4_vs_lib4(j, &pD[i*sdd], &pD[j*sdd], &pC[j*ps+i*sdc], &pD[j*ps+i*sdd], &pD[j*ps+j*sdd], &dD[j], m-i, n-j);
+ }
+ if(j<n)
+ {
+ kernel_dpotrf_nt_l_4x4_vs_lib4(j, &pD[i*sdd], &pD[j*sdd], &pC[j*ps+j*sdc], &pD[j*ps+j*sdd], &dD[j], m-i, n-j);
+ }
+ return;
+#endif
+
+ }
+
+
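+/* Note: dpotrf_l_mn_libstr above is the rectangular variant: for m>=n it
+   computes, in effect, the first n columns of the m x m lower Cholesky factor
+   (an n x n factorization plus a triangular solve for the trailing m-n rows).
+   It is called like the square version but with separate row and column
+   counts, e.g. dpotrf_l_mn_libstr(m, n, &sC, 0, 0, &sD, 0, 0); (sketch, same
+   assumed helpers as above). */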
+
+// dsyrk dpotrf
+void dsyrk_dpotrf_ln_libstr(int m, int n, int k, struct d_strmat *sA, int ai, int aj, struct d_strmat *sB, int bi, int bj, struct d_strmat *sC, int ci, int cj, struct d_strmat *sD, int di, int dj)
+ {
+
+ if(m<=0 || n<=0)
+ return;
+
+ if(ai!=0 | bi!=0 | ci!=0 | di!=0)
+ {
+ printf("\ndsyrk_dpotrf_ln_libstr: feature not implemented yet: ai=%d, bi=%d, ci=%d, di=%d\n", ai, bi, ci, di);
+ exit(1);
+ }
+
+ const int ps = 4;
+
+ int sda = sA->cn;
+ int sdb = sB->cn;
+ int sdc = sC->cn;
+ int sdd = sD->cn;
+ double *pA = sA->pA + aj*ps;
+ double *pB = sB->pA + bj*ps;
+ double *pC = sC->pA + cj*ps;
+ double *pD = sD->pA + dj*ps;
+ double *dD = sD->dA; // XXX what to do if di and dj are not zero
+
+ if(di==0 & dj==0)
+ sD->use_dA = 1;
+ else
+ sD->use_dA = 0;
+
+ int i, j, l;
+
+ i = 0;
+
+#if defined(TARGET_X64_INTEL_HASWELL)
+ for(; i<m-11; i+=12)
+ {
+ j = 0;
+ for(; j<i & j<n-3; j+=4)
+ {
+ kernel_dgemm_dtrsm_nt_rl_inv_12x4_lib4(k, &pA[i*sda], sda, &pB[j*sdb], j, &pD[i*sdd], sdd, &pD[j*sdd], &pC[j*ps+i*sdc], sdc, &pD[j*ps+i*sdd], sdd, &pD[j*ps+j*sdd], &dD[j]);
+ }
+ if(j<n)
+ {
+ if(j<i) // dgemm
+ {
+ kernel_dgemm_dtrsm_nt_rl_inv_12x4_vs_lib4(k, &pA[i*sda], sda, &pB[j*sdb], j, &pD[i*sdd], sdd, &pD[j*sdd], &pC[j*ps+i*sdc], sdc, &pD[j*ps+i*sdd], sdd, &pD[j*ps+j*sdd], &dD[j], m-i, n-j);
+ }
+ else // dsyrk
+ {
+ if(j<n-11)
+ {
+ kernel_dsyrk_dpotrf_nt_l_12x4_lib4(k, &pA[i*sda], sda, &pB[j*sdb], j, &pD[i*sdd], sdd, &pD[j*sdd], &pC[j*ps+j*sdc], sdc, &pD[j*ps+j*sdd], sdd, &dD[j]);
+ kernel_dsyrk_dpotrf_nt_l_8x8_lib4(k, &pA[(i+4)*sda], sda, &pB[(j+4)*sdb], sdb, j+4, &pD[(i+4)*sdd], sdd, &pD[(j+4)*sdd], sdd, &pC[(j+4)*ps+(i+4)*sdc], sdc, &pD[(j+4)*ps+(i+4)*sdd], sdd, &dD[j+4]);
+ }
+ else
+ {
+ kernel_dsyrk_dpotrf_nt_l_12x4_vs_lib4(k, &pA[i*sda], sda, &pB[j*sdb], j, &pD[i*sdd], sdd, &pD[j*sdd], &pC[j*ps+j*sdc], sdc, &pD[j*ps+j*sdd], sdd, &dD[j], m-i, n-j);
+ if(j<n-4)
+ {
+ if(j<n-8)
+ {
+ kernel_dsyrk_dpotrf_nt_l_8x8_vs_lib4(k, &pA[(i+4)*sda], sda, &pB[(j+4)*sdb], sdb, j+4, &pD[(i+4)*sdd], sdd, &pD[(j+4)*sdd], sdd, &pC[(j+4)*ps+(i+4)*sdc], sdc, &pD[(j+4)*ps+(i+4)*sdd], sdd, &dD[j+4], m-i-4, n-j-4);
+ }
+ else
+ {
+ kernel_dsyrk_dpotrf_nt_l_8x4_vs_lib4(k, &pA[(i+4)*sda], sda, &pB[(j+4)*sdb], j+4, &pD[(i+4)*sdd], sdd, &pD[(j+4)*sdd], &pC[(j+4)*ps+(i+4)*sdc], sdc, &pD[(j+4)*ps+(i+4)*sdd], sdd, &dD[j+4], m-i-4, n-j-4);
+ }
+ }
+ }
+ }
+ }
+ }
+ if(m>i)
+ {
+ if(m-i<=4)
+ {
+ goto left_4;
+ }
+ else if(m-i<=8)
+ {
+ goto left_8;
+ }
+ else
+ {
+ goto left_12;
+ }
+ }
+#elif defined(TARGET_X64_INTEL_SANDY_BRIDGE)
+ for(; i<m-7; i+=8)
+ {
+ j = 0;
+ for(; j<i & j<n-3; j+=4)
+ {
+ kernel_dgemm_dtrsm_nt_rl_inv_8x4_lib4(k, &pA[i*sda], sda, &pB[j*sdb], j, &pD[i*sdd], sdd, &pD[j*sdd], &pC[j*ps+i*sdc], sdc, &pD[j*ps+i*sdd], sdd, &pD[j*ps+j*sdd], &dD[j]);
+ }
+ if(j<n)
+ {
+ if(j<i) // dgemm
+ {
+ kernel_dgemm_dtrsm_nt_rl_inv_8x4_vs_lib4(k, &pA[i*sda], sda, &pB[j*sdb], j, &pD[i*sdd], sdd, &pD[j*sdd], &pC[j*ps+i*sdc], sdc, &pD[j*ps+i*sdd], sdd, &pD[j*ps+j*sdd], &dD[j], m-i, n-j);
+ }
+ else // dsyrk
+ {
+ if(j<n-7)
+// if(0)
+ {
+ kernel_dsyrk_dpotrf_nt_l_8x4_lib4(k, &pA[i*sda], sda, &pB[j*sdb], j, &pD[i*sdd], sdd, &pD[j*sdd], &pC[j*ps+j*sdc], sdc, &pD[j*ps+j*sdd], sdd, &dD[j]);
+ kernel_dsyrk_dpotrf_nt_l_4x4_lib4(k, &pA[(i+4)*sda], &pB[(j+4)*sdb], j+4, &pD[(i+4)*sdd], &pD[(j+4)*sdd], &pC[(j+4)*ps+(i+4)*sdc], &pD[(j+4)*ps+(i+4)*sdd], &dD[j+4]);
+ }
+ else
+ {
+ kernel_dsyrk_dpotrf_nt_l_8x4_vs_lib4(k, &pA[i*sda], sda, &pB[j*sdb], j, &pD[i*sdd], sdd, &pD[j*sdd], &pC[j*ps+j*sdc], sdc, &pD[j*ps+j*sdd], sdd, &dD[j], m-i, n-j);
+ if(j<n-4)
+ {
+ kernel_dsyrk_dpotrf_nt_l_4x4_vs_lib4(k, &pA[(i+4)*sda], &pB[(j+4)*sdb], j+4, &pD[(i+4)*sdd], &pD[(j+4)*sdd], &pC[(j+4)*ps+(i+4)*sdc], &pD[(j+4)*ps+(i+4)*sdd], &dD[j+4], m-i-4, n-j-4);
+ }
+ }
+ }
+ }
+ }
+ if(m>i)
+ {
+ if(m-i<=4)
+ {
+ goto left_4;
+ }
+ else
+ {
+ goto left_8;
+ }
+ }
+#else
+ for(; i<m-3; i+=4)
+ {
+ j = 0;
+ for(; j<i & j<n-3; j+=4)
+ {
+ kernel_dgemm_dtrsm_nt_rl_inv_4x4_lib4(k, &pA[i*sda], &pB[j*sdb], j, &pD[i*sdd], &pD[j*sdd], &pC[j*ps+i*sdc], &pD[j*ps+i*sdd], &pD[j*ps+j*sdd], &dD[j]);
+ }
+ if(j<n)
+ {
+ if(i<j) // dgemm
+ {
+ kernel_dgemm_dtrsm_nt_rl_inv_4x4_vs_lib4(k, &pA[i*sda], &pB[j*sdb], j, &pD[i*sdd], &pD[j*sdd], &pC[j*ps+i*sdc], &pD[j*ps+i*sdd], &pD[j*ps+j*sdd], &dD[j], m-i, n-j);
+ }
+ else // dsyrk
+ {
+ if(j<n-3)
+ {
+ kernel_dsyrk_dpotrf_nt_l_4x4_lib4(k, &pA[i*sda], &pB[j*sdb], j, &pD[i*sdd], &pD[j*sdd], &pC[j*ps+j*sdc], &pD[j*ps+j*sdd], &dD[j]);
+ }
+ else
+ {
+ kernel_dsyrk_dpotrf_nt_l_4x4_vs_lib4(k, &pA[i*sda], &pB[j*sdb], j, &pD[i*sdd], &pD[j*sdd], &pC[j*ps+j*sdc], &pD[j*ps+j*sdd], &dD[j], m-i, n-j);
+ }
+ }
+ }
+ }
+ if(m>i)
+ {
+ goto left_4;
+ }
+#endif
+
+ // common return if i==m
+ return;
+
+ // clean-up loop definitions
+
+#if defined(TARGET_X64_INTEL_HASWELL)
+ left_12:
+ j = 0;
+ for(; j<i & j<n; j+=4)
+ {
+ kernel_dgemm_dtrsm_nt_rl_inv_12x4_vs_lib4(k, &pA[i*sda], sda, &pB[j*sdb], j, &pD[i*sdd], sdd, &pD[j*sdd], &pC[j*ps+i*sdc], sdc, &pD[j*ps+i*sdd], sdd, &pD[j*ps+j*sdd], &dD[j], m-i, n-j);
+ }
+ if(j<n)
+ {
+ kernel_dsyrk_dpotrf_nt_l_12x4_vs_lib4(k, &pA[i*sda], sda, &pB[j*sdb], j, &pD[i*sdd], sdd, &pD[j*sdd], &pC[j*ps+j*sdc], sdc, &pD[j*ps+j*sdd], sdd, &dD[j], m-i, n-j);
+ if(j<n-4)
+ {
+ kernel_dsyrk_dpotrf_nt_l_8x4_vs_lib4(k, &pA[(i+4)*sda], sda, &pB[(j+4)*sdb], j+4, &pD[(i+4)*sdd], sdd, &pD[(j+4)*sdd], &pC[(j+4)*ps+(i+4)*sdc], sdc, &pD[(j+4)*ps+(i+4)*sdd], sdd, &dD[j+4], m-i-4, n-j-4);
+ if(j<n-8)
+ {
+ kernel_dsyrk_dpotrf_nt_l_4x4_vs_lib4(k, &pA[(i+8)*sda], &pB[(j+8)*sdb], j+8, &pD[(i+8)*sdd], &pD[(j+8)*sdd], &pC[(j+8)*ps+(i+8)*sdc], &pD[(j+8)*ps+(i+8)*sdd], &dD[j+8], m-i-8, n-j-8);
+ }
+ }
+ }
+ return;
+#endif
+
+#if defined(TARGET_X64_INTEL_HASWELL)
+ left_8:
+ j = 0;
+ for(; j<i-8 & j<n-8; j+=12)
+ {
+ kernel_dgemm_dtrsm_nt_rl_inv_8x8l_vs_lib4(k, &pA[i*sda], sda, &pB[j*sdb], sdb, j, &pD[i*sdd], sdd, &pD[j*sdd], sdd, &pC[j*ps+i*sdc], sdc, &pD[j*ps+i*sdd], sdd, &pD[j*ps+j*sdd], sdd, &dD[j], m-i, n-j);
+ kernel_dgemm_dtrsm_nt_rl_inv_8x8u_vs_lib4(k, &pA[i*sda], sda, &pB[(j+4)*sdb], sdb, (j+4), &pD[i*sdd], sdd, &pD[(j+4)*sdd], sdd, &pC[(j+4)*ps+i*sdc], sdc, &pD[(j+4)*ps+i*sdd], sdd, &pD[(j+4)*ps+(j+4)*sdd], sdd, &dD[(j+4)], m-i, n-(j+4));
+ }
+ if(j<i-4 & j<n-4)
+ {
+ kernel_dgemm_dtrsm_nt_rl_inv_8x8l_vs_lib4(k, &pA[i*sda], sda, &pB[j*sdb], sdb, j, &pD[i*sdd], sdd, &pD[j*sdd], sdd, &pC[j*ps+i*sdc], sdc, &pD[j*ps+i*sdd], sdd, &pD[j*ps+j*sdd], sdd, &dD[j], m-i, n-j);
+ kernel_dgemm_dtrsm_nt_rl_inv_4x4_vs_lib4(k, &pA[i*sda], &pB[(j+4)*sdb], (j+4), &pD[i*sdd], &pD[(j+4)*sdd], &pC[(j+4)*ps+i*sdc], &pD[(j+4)*ps+i*sdd], &pD[(j+4)*ps+(j+4)*sdd], &dD[(j+4)], m-i, n-(j+4));
+ j += 8;
+ }
+ else if(j<i & j<n)
+ {
+ kernel_dgemm_dtrsm_nt_rl_inv_8x4_vs_lib4(k, &pA[i*sda], sda, &pB[j*sdb], j, &pD[i*sdd], sdd, &pD[j*sdd], &pC[j*ps+i*sdc], sdc, &pD[j*ps+i*sdd], sdd, &pD[j*ps+j*sdd], &dD[j], m-i, n-j);
+ j += 4;
+ }
+ if(j<n)
+ {
+ kernel_dsyrk_dpotrf_nt_l_8x4_vs_lib4(k, &pA[i*sda], sda, &pB[j*sdb], j, &pD[i*sdd], sdd, &pD[j*sdd], &pC[j*ps+j*sdc], sdc, &pD[j*ps+j*sdd], sdd, &dD[j], m-i, n-j);
+ if(j<n-4)
+ {
+ kernel_dsyrk_dpotrf_nt_l_4x4_vs_lib4(k, &pA[(i+4)*sda], &pB[(j+4)*sdb], j+4, &pD[(i+4)*sdd], &pD[(j+4)*sdd], &pC[(j+4)*ps+(i+4)*sdc], &pD[(j+4)*ps+(i+4)*sdd], &dD[j+4], m-i-4, n-j-4);
+ }
+ }
+ return;
+#elif defined(TARGET_X64_INTEL_SANDY_BRIDGE)
+ left_8:
+ j = 0;
+ for(; j<i & j<n; j+=4)
+ {
+ kernel_dgemm_dtrsm_nt_rl_inv_8x4_vs_lib4(k, &pA[i*sda], sda, &pB[j*sdb], j, &pD[i*sdd], sdd, &pD[j*sdd], &pC[j*ps+i*sdc], sdc, &pD[j*ps+i*sdd], sdd, &pD[j*ps+j*sdd], &dD[j], m-i, n-j);
+ }
+ if(j<n)
+ {
+ kernel_dsyrk_dpotrf_nt_l_8x4_vs_lib4(k, &pA[i*sda], sda, &pB[j*sdb], j, &pD[i*sdd], sdd, &pD[j*sdd], &pC[j*ps+j*sdc], sdc, &pD[j*ps+j*sdd], sdd, &dD[j], m-i, n-j);
+ if(j<n-4)
+ {
+ kernel_dsyrk_dpotrf_nt_l_4x4_vs_lib4(k, &pA[(i+4)*sda], &pB[(j+4)*sdb], j+4, &pD[(i+4)*sdd], &pD[(j+4)*sdd], &pC[(j+4)*ps+(i+4)*sdc], &pD[(j+4)*ps+(i+4)*sdd], &dD[j+4], m-i-4, n-j-4);
+ }
+ }
+ return;
+#endif
+
+#if defined(TARGET_X64_INTEL_HASWELL)
+ left_4:
+ j = 0;
+ for(; j<i-8 & j<n-8; j+=12)
+ {
+ kernel_dgemm_dtrsm_nt_rl_inv_4x12_vs_lib4(k, &pA[i*sda], &pB[j*sdb], sdb, j, &pD[i*sdd], &pD[j*sdd], sdd, &pC[j*ps+i*sdc], &pD[j*ps+i*sdd], &pD[j*ps+j*sdd], sdd, &dD[j], m-i, n-j);
+ }
+ if(j<i-4 & j<n-4)
+ {
+ kernel_dgemm_dtrsm_nt_rl_inv_4x8_vs_lib4(k, &pA[i*sda], &pB[j*sdb], sdb, j, &pD[i*sdd], &pD[j*sdd], sdd, &pC[j*ps+i*sdc], &pD[j*ps+i*sdd], &pD[j*ps+j*sdd], sdd, &dD[j], m-i, n-j);
+ j += 8;
+ }
+ else if(j<i & j<n)
+ {
+ kernel_dgemm_dtrsm_nt_rl_inv_4x4_vs_lib4(k, &pA[i*sda], &pB[j*sdb], j, &pD[i*sdd], &pD[j*sdd], &pC[j*ps+i*sdc], &pD[j*ps+i*sdd], &pD[j*ps+j*sdd], &dD[j], m-i, n-j);
+ j += 4;
+ }
+ if(j<n)
+ {
+ kernel_dsyrk_dpotrf_nt_l_4x4_vs_lib4(k, &pA[i*sda], &pB[j*sdb], j, &pD[i*sdd], &pD[j*sdd], &pC[j*ps+j*sdc], &pD[j*ps+j*sdd], &dD[j], m-i, n-j);
+ }
+#else
+ left_4:
+ j = 0;
+ for(; j<i & j<n; j+=4)
+ {
+ kernel_dgemm_dtrsm_nt_rl_inv_4x4_vs_lib4(k, &pA[i*sda], &pB[j*sdb], j, &pD[i*sdd], &pD[j*sdd], &pC[j*ps+i*sdc], &pD[j*ps+i*sdd], &pD[j*ps+j*sdd], &dD[j], m-i, n-j);
+ }
+ if(j<n)
+ {
+ kernel_dsyrk_dpotrf_nt_l_4x4_vs_lib4(k, &pA[i*sda], &pB[j*sdb], j, &pD[i*sdd], &pD[j*sdd], &pC[j*ps+j*sdc], &pD[j*ps+j*sdd], &dD[j], m-i, n-j);
+ }
+#endif
+
+ return;
+
+ }
+
+
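+/* A usage sketch for the fused routine above (kept disabled): in effect it
+   factorizes C + A*B^T (lower part) in a single pass instead of a separate
+   dsyrk followed by dpotrf. Operand names and sizes are illustrative only: */
+#if 0
+// sA: m x k, sB: n x k, sC/sD: m x n (lower part referenced)
+dsyrk_dpotrf_ln_libstr(m, n, k, &sA, 0, 0, &sB, 0, 0, &sC, 0, 0, &sD, 0, 0);
+#endif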
+
+// dgetrf without pivoting
+void dgetrf_nopivot_libstr(int m, int n, struct d_strmat *sC, int ci, int cj, struct d_strmat *sD, int di, int dj)
+ {
+ if(ci!=0 | di!=0)
+ {
+ printf("\ndgetf_nopivot_libstr: feature not implemented yet: ci=%d, di=%d\n", ci, di);
+ exit(1);
+ }
+ const int ps = 4;
+ int sdc = sC->cn;
+ int sdd = sD->cn;
+ double *pC = sC->pA + cj*ps;
+ double *pD = sD->pA + dj*ps;
+ double *dD = sD->dA; // XXX what to do if di and dj are not zero
+ dgetrf_nn_nopivot_lib(m, n, pC, sdc, pD, sdd, dD);
+ if(di==0 && dj==0)
+ sD->use_dA = 1;
+ else
+ sD->use_dA = 0;
+ return;
+ }
+
+
+
+
+// dgetrf with row pivoting
+void dgetrf_libstr(int m, int n, struct d_strmat *sC, int ci, int cj, struct d_strmat *sD, int di, int dj, int *ipiv)
+ {
+ if(ci!=0 | di!=0)
+ {
+ printf("\ndgetrf_libstr: feature not implemented yet: ci=%d, di=%d\n", ci, di);
+ exit(1);
+ }
+ const int ps = 4;
+ int sdc = sC->cn;
+ int sdd = sD->cn;
+ double *pC = sC->pA + cj*ps;
+ double *pD = sD->pA + dj*ps;
+ double *dD = sD->dA; // XXX what to do if di and dj are not zero
+ dgetrf_nn_lib(m, n, pC, sdc, pD, sdd, dD, ipiv);
+ if(di==0 && dj==0)
+ sD->use_dA = 1;
+ else
+ sD->use_dA = 0;
+ return;
+ }
+
+
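+/* A usage sketch for dgetrf_libstr below (kept disabled; same assumed helpers
+   as above). On return sD holds the unit-lower L and upper U factors and
+   ipiv[] the zero-based pivot row indices, already offset to absolute rows by
+   the ipiv[...] += i0 adjustments in dgetrf_nn_lib: */
+#if 0
+int ipiv[8];
+dgetrf_libstr(8, 8, &sC, 0, 0, &sD, 0, 0, ipiv);
+#endif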
+
+int dgeqrf_work_size_libstr(int m, int n)
+ {
+ const int ps = 4;
+ int cm = (m+ps-1)/ps*ps;
+ int cn = (n+ps-1)/ps*ps;
+ return ps*(cm+cn)*sizeof(double);
+// return 0;
+ }
+
+
+
+void dgeqrf_libstr(int m, int n, struct d_strmat *sC, int ci, int cj, struct d_strmat *sD, int di, int dj, void *v_work)
+ {
+ char *work = (char *) v_work;
+ if(m<=0 | n<=0)
+ return;
+ const int ps = 4;
+ int sdc = sC->cn;
+ int sdd = sD->cn;
+ double *pC = &(DMATEL_LIBSTR(sC,ci,cj));
+ double *pD = &(DMATEL_LIBSTR(sD,di,dj));
+ double *dD = sD->dA + di;
+ int cm = (m+ps-1)/ps*ps;
+ int cn = (n+ps-1)/ps*ps;
+ double *pVt = (double *) work;
+ work += ps*cm*sizeof(double);
+ double *pW = (double *) work;
+ work += ps*cn*sizeof(double);
+ if(pC!=pD)
+ dgecp_lib(m, n, 1.0, ci&(ps-1), pC, sdc, di&(ps-1), pD, sdd);
+ int ii;
+ int imax0 = (ps-(di&(ps-1)))&(ps-1);
+ int imax = m<n ? m : n;
+ imax0 = imax<imax0 ? imax : imax0;
+ if(imax0>0)
+ {
+ kernel_dgeqrf_vs_lib4(m, n, imax0, di&(ps-1), pD, sdd, dD);
+ pD += imax0-ps+ps*sdd+imax0*ps;
+ dD += imax0;
+ m -= imax0;
+ n -= imax0;
+ imax -= imax0;
+ }
+ for(ii=0; ii<imax-3; ii+=4)
+ {
+ kernel_dgeqrf_4_lib4(m-ii, pD+ii*sdd+ii*ps, sdd, dD+ii);
+#if 0
+ kernel_dlarf_4_lib4(m-ii, n-ii-4, pD+ii*sdd+ii*ps, sdd, dD+ii, pD+ii*sdd+(ii+4)*ps, sdd);
+#else
+ kernel_dgetr_4_0_lib4(m-ii, pD+ii*sdd+ii*ps, sdd, pVt);
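+ // the top 4x4 block of the reflector panel V is implicitly unit lower
+ // triangular; write its unit diagonal and the zeros above it explicitly
+ // into the transposed copy pVt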
+ pVt[0+ps*0] = 1.0;
+ pVt[1+ps*0] = 0.0;
+ pVt[2+ps*0] = 0.0;
+ pVt[3+ps*0] = 0.0;
+ pVt[1+ps*1] = 1.0;
+ pVt[2+ps*1] = 0.0;
+ pVt[3+ps*1] = 0.0;
+ pVt[2+ps*2] = 1.0;
+ pVt[3+ps*2] = 0.0;
+ pVt[3+ps*3] = 1.0;
+ kernel_dlarf_t_4_lib4(m-ii, n-ii-4, pD+ii*sdd+ii*ps, sdd, pVt, dD+ii, pD+ii*sdd+(ii+4)*ps, sdd, pW);
+#endif
+ }
+ if(ii<imax)
+ {
+ kernel_dgeqrf_vs_lib4(m-ii, n-ii, imax-ii, ii&(ps-1), pD+ii*sdd+ii*ps, sdd, dD+ii);
+ }
+ return;
+ }
+
+
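+/* A usage sketch for dgeqrf_libstr above (kept disabled): the caller provides
+   the scratch buffer sized by dgeqrf_work_size_libstr, which reserves one
+   ps-wide panel for the transposed reflectors (pVt) and one for the row
+   workspace (pW): */
+#if 0
+void *work = malloc(dgeqrf_work_size_libstr(m, n));
+dgeqrf_libstr(m, n, &sC, 0, 0, &sD, 0, 0, work);
+free(work);
+#endif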
+
+int dgelqf_work_size_libstr(int m, int n)
+ {
+ return 0;
+ }
+
+
+
+void dgelqf_libstr(int m, int n, struct d_strmat *sC, int ci, int cj, struct d_strmat *sD, int di, int dj, void *work)
+ {
+ if(m<=0 | n<=0)
+ return;
+ const int ps = 4;
+ int sdc = sC->cn;
+ int sdd = sD->cn;
+ double *pC = &(DMATEL_LIBSTR(sC,ci,cj));
+ double *pD = &(DMATEL_LIBSTR(sD,di,dj));
+ double *dD = sD->dA + di;
+#if defined(TARGET_X64_INTEL_HASWELL)
+ double pT[144] __attribute__ ((aligned (64))) = {0};
+ double pK[96] __attribute__ ((aligned (64))) = {0};
+#else
+ double pT[144] = {0};
+ double pK[96] = {0};
+#endif
+ if(pC!=pD)
+ dgecp_lib(m, n, 1.0, ci&(ps-1), pC, sdc, di&(ps-1), pD, sdd);
+ int ii, jj, ll;
+ int imax0 = (ps-(di&(ps-1)))&(ps-1);
+ int imax = m<n ? m : n;
+#if 0
+ kernel_dgelqf_vs_lib4(m, n, imax, di&(ps-1), pD, sdd, dD);
+#else
+ imax0 = imax<imax0 ? imax : imax0;
+ if(imax0>0)
+ {
+ kernel_dgelqf_vs_lib4(m, n, imax0, di&(ps-1), pD, sdd, dD);
+ pD += imax0-ps+ps*sdd+imax0*ps;
+ dD += imax0;
+ m -= imax0;
+ n -= imax0;
+ imax -= imax0;
+ }
+ ii = 0;
+#if defined(TARGET_X64_INTEL_HASWELL)
+// for(; ii<imax-11; ii+=12)
+ for(; ii<imax-127; ii+=12) // crossover point ~ ii=128
+ {
+ kernel_dgelqf_dlarft12_12_lib4(n-(ii+0), pD+(ii+0)*sdd+(ii+0)*ps, sdd, dD+(ii+0), &pT[0+0*12+0*ps]);
+ jj = ii+12;
+ for(; jj<m; jj+=4)
+ {
+ kernel_dlarfb12_r_4_lib4(n-ii, pD+ii*sdd+ii*ps, sdd, pT, pD+jj*sdd+ii*ps, pK, m-jj);
+ }
+ }
+ for(; ii<imax-11; ii+=4)
+ {
+ kernel_dgelqf_dlarft4_12_lib4(n-ii, pD+ii*sdd+ii*ps, sdd, dD+ii, pT);
+ jj = ii+12;
+ for(; jj<m-11; jj+=12)
+ {
+ kernel_dlarfb4_r_12_lib4(n-ii, pD+ii*sdd+ii*ps, pT, pD+jj*sdd+ii*ps, sdd);
+ }
+ for(; jj<m-7; jj+=8)
+ {
+ kernel_dlarfb4_r_8_lib4(n-ii, pD+ii*sdd+ii*ps, pT, pD+jj*sdd+ii*ps, sdd);
+ }
+ for(; jj<m-3; jj+=4)
+ {
+ kernel_dlarfb4_r_4_lib4(n-ii, pD+ii*sdd+ii*ps, pT, pD+jj*sdd+ii*ps);
+ }
+ for(ll=0; ll<m-jj; ll++)
+ {
+ kernel_dlarfb4_r_1_lib4(n-ii, pD+ii*sdd+ii*ps, pT, pD+ll+jj*sdd+ii*ps);
+ }
+ }
+ // 8 9 10 11
+ if(ii<imax-7)
+ {
+ kernel_dgelqf_dlarft4_8_lib4(n-ii, pD+ii*sdd+ii*ps, sdd, dD+ii, pT);
+ jj = ii+8;
+ if(jj<m)
+ {
+ for(; jj<m-11; jj+=12)
+ {
+ kernel_dlarfb4_r_12_lib4(n-ii, pD+ii*sdd+ii*ps, pT, pD+jj*sdd+ii*ps, sdd);
+ }
+ for(; jj<m-7; jj+=8)
+ {
+ kernel_dlarfb4_r_8_lib4(n-ii, pD+ii*sdd+ii*ps, pT, pD+jj*sdd+ii*ps, sdd);
+ }
+ for(; jj<m-3; jj+=4)
+ {
+ kernel_dlarfb4_r_4_lib4(n-ii, pD+ii*sdd+ii*ps, pT, pD+jj*sdd+ii*ps);
+ }
+ for(ll=0; ll<m-jj; ll++)
+ {
+ kernel_dlarfb4_r_1_lib4(n-ii, pD+ii*sdd+ii*ps, pT, pD+ll+jj*sdd+ii*ps);
+ }
+ }
+ ii += 4;
+ }
+ // 4 5 6 7
+ if(ii<imax-3)
+ {
+ kernel_dgelqf_dlarft4_4_lib4(n-ii, pD+ii*sdd+ii*ps, dD+ii, pT);
+ jj = ii+4;
+ if(jj<m)
+ {
+ for(; jj<m-11; jj+=12)
+ {
+ kernel_dlarfb4_r_12_lib4(n-ii, pD+ii*sdd+ii*ps, pT, pD+jj*sdd+ii*ps, sdd);
+ }
+ for(; jj<m-7; jj+=8)
+ {
+ kernel_dlarfb4_r_8_lib4(n-ii, pD+ii*sdd+ii*ps, pT, pD+jj*sdd+ii*ps, sdd);
+ }
+ for(; jj<m-3; jj+=4)
+ {
+ kernel_dlarfb4_r_4_lib4(n-ii, pD+ii*sdd+ii*ps, pT, pD+jj*sdd+ii*ps);
+ }
+ for(ll=0; ll<m-jj; ll++)
+ {
+ kernel_dlarfb4_r_1_lib4(n-ii, pD+ii*sdd+ii*ps, pT, pD+ll+jj*sdd+ii*ps);
+ }
+ }
+ ii += 4;
+ }
+ // 1 2 3
+ if(ii<imax)
+ {
+ kernel_dgelqf_vs_lib4(m-ii, n-ii, imax-ii, ii&(ps-1), pD+ii*sdd+ii*ps, sdd, dD+ii);
+ }
+#else // no haswell
+ for(ii=0; ii<imax-4; ii+=4)
+ {
+// kernel_dgelqf_vs_lib4(4, n-ii, 4, 0, pD+ii*sdd+ii*ps, sdd, dD+ii);
+// kernel_dgelqf_4_lib4(n-ii, pD+ii*sdd+ii*ps, dD+ii);
+// kernel_dlarft_4_lib4(n-ii, pD+ii*sdd+ii*ps, dD+ii, pT);
+ kernel_dgelqf_dlarft4_4_lib4(n-ii, pD+ii*sdd+ii*ps, dD+ii, pT);
+ jj = ii+4;
+#if defined(TARGET_X64_INTEL_SANDY_BRIDGE)
+ for(; jj<m-7; jj+=8)
+ {
+ kernel_dlarfb4_r_8_lib4(n-ii, pD+ii*sdd+ii*ps, pT, pD+jj*sdd+ii*ps, sdd);
+ }
+#endif
+ for(; jj<m-3; jj+=4)
+ {
+ kernel_dlarfb4_r_4_lib4(n-ii, pD+ii*sdd+ii*ps, pT, pD+jj*sdd+ii*ps);
+ }
+ for(ll=0; ll<m-jj; ll++)
+ {
+ kernel_dlarfb4_r_1_lib4(n-ii, pD+ii*sdd+ii*ps, pT, pD+ll+jj*sdd+ii*ps);
+ }
+ }
+ if(ii<imax)
+ {
+ if(ii==imax-4)
+ {
+ kernel_dgelqf_4_lib4(n-ii, pD+ii*sdd+ii*ps, dD+ii);
+ }
+ else
+ {
+ kernel_dgelqf_vs_lib4(m-ii, n-ii, imax-ii, ii&(ps-1), pD+ii*sdd+ii*ps, sdd, dD+ii);
+ }
+ }
+#endif // no haswell
+#endif
+ return;
+ }
+
+
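+/* A usage sketch for dgelqf_libstr above (kept disabled): no external
+   workspace is required (dgelqf_work_size_libstr returns 0 and the pT/pK
+   scratch lives on the stack), so the work pointer goes unused: */
+#if 0
+dgelqf_libstr(m, n, &sC, 0, 0, &sD, 0, 0, NULL);
+#endif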
+
+#else
+
+#error : wrong LA choice
+
+#endif
diff --git a/blas/s_blas.h b/blas/s_blas.h
new file mode 100644
index 0000000..b6a92a7
--- /dev/null
+++ b/blas/s_blas.h
@@ -0,0 +1,66 @@
+/**************************************************************************************************
+* *
+* This file is part of BLASFEO. *
+* *
+* BLASFEO -- BLAS For Embedded Optimization. *
+* Copyright (C) 2016-2017 by Gianluca Frison. *
+* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl. *
+* All rights reserved. *
+* *
+* HPMPC is free software; you can redistribute it and/or *
+* modify it under the terms of the GNU Lesser General Public *
+* License as published by the Free Software Foundation; either *
+* version 2.1 of the License, or (at your option) any later version. *
+* *
+* HPMPC is distributed in the hope that it will be useful, *
+* but WITHOUT ANY WARRANTY; without even the implied warranty of *
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. *
+* See the GNU Lesser General Public License for more details. *
+* *
+* You should have received a copy of the GNU Lesser General Public *
+* License along with HPMPC; if not, write to the Free Software *
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA *
+* *
+* Author: Gianluca Frison, giaf (at) dtu.dk *
+* gianluca.frison (at) imtek.uni-freiburg.de *
+* *
+**************************************************************************************************/
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+
+
+// prototypes of the external BLAS and LAPACK routines employed by the BLASFEO WR (BLAS wrapper) version
+
+// level 1
+void scopy_(int *m, float *x, int *incx, float *y, int *incy);
+void saxpy_(int *m, float *alpha, float *x, int *incx, float *y, int *incy);
+void sscal_(int *m, float *alpha, float *x, int *incx);
+
+// level 2
+void sgemv_(char *ta, int *m, int *n, float *alpha, float *A, int *lda, float *x, int *incx, float *beta, float *y, int *incy);
+void ssymv_(char *uplo, int *m, float *alpha, float *A, int *lda, float *x, int *incx, float *beta, float *y, int *incy);
+void strmv_(char *uplo, char *trans, char *diag, int *n, float *A, int *lda, float *x, int *incx);
+void strsv_(char *uplo, char *trans, char *diag, int *n, float *A, int *lda, float *x, int *incx);
+void sger_(int *m, int *n, float *alpha, float *x, int *incx, float *y, int *incy, float *A, int *lda);
+
+// level 3
+void sgemm_(char *ta, char *tb, int *m, int *n, int *k, float *alpha, float *A, int *lda, float *B, int *ldb, float *beta, float *C, int *ldc);
+void ssyrk_(char *uplo, char *trans, int *n, int *k, float *alpha, float *A, int *lda, float *beta, float *C, int *ldc);
+void strmm_(char *side, char *uplo, char *transa, char *diag, int *m, int *n, float *alpha, float *A, int *lda, float *B, int *ldb);
+void strsm_(char *side, char *uplo, char *transa, char *diag, int *m, int *n, float *alpha, float *A, int *lda, float *B, int *ldb);
+
+// lapack
+int spotrf_(char *uplo, int *m, float *A, int *lda, int *info);
+int sgetrf_(int *m, int *n, float *A, int *lda, int *ipiv, int *info);
+void sgeqrf_(int *m, int *n, float *A, int *lda, float *tau, float *work, int *lwork, int *info);
+void sgeqr2_(int *m, int *n, float *A, int *lda, float *tau, float *work, int *info);
+void sgelqf_(int *m, int *n, float *A, int *lda, float *tau, float *work, int *lwork, int *info);
+
+
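+/* These are the standard Fortran BLAS/LAPACK entry points: every argument is
+   passed by pointer and matrices are column-major. An illustrative call, not
+   part of this header (A, B, C are plain float arrays): */
+#if 0
+float alpha = 1.0f, beta = 0.0f;
+int m = 4, n = 4, k = 4, lda = 4, ldb = 4, ldc = 4;
+sgemm_("N", "N", &m, &n, &k, &alpha, A, &lda, B, &ldb, &beta, C, &ldc);
+#endif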
+
+#ifdef __cplusplus
+}
+#endif
diff --git a/blas/s_blas1_lib.c b/blas/s_blas1_lib.c
new file mode 100644
index 0000000..67fec77
--- /dev/null
+++ b/blas/s_blas1_lib.c
@@ -0,0 +1,54 @@
+/**************************************************************************************************
+* *
+* This file is part of BLASFEO. *
+* *
+* BLASFEO -- BLAS For Embedded Optimization. *
+* Copyright (C) 2016-2017 by Gianluca Frison. *
+* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl. *
+* All rights reserved. *
+* *
+* HPMPC is free software; you can redistribute it and/or *
+* modify it under the terms of the GNU Lesser General Public *
+* License as published by the Free Software Foundation; either *
+* version 2.1 of the License, or (at your option) any later version. *
+* *
+* HPMPC is distributed in the hope that it will be useful, *
+* but WITHOUT ANY WARRANTY; without even the implied warranty of *
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. *
+* See the GNU Lesser General Public License for more details. *
+* *
+* You should have received a copy of the GNU Lesser General Public *
+* License along with HPMPC; if not, write to the Free Software *
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA *
+* *
+* Author: Gianluca Frison, giaf (at) dtu.dk *
+* gianluca.frison (at) imtek.uni-freiburg.de *
+* *
+**************************************************************************************************/
+
+#include <stdlib.h>
+#include <stdio.h>
+
+#if defined(LA_BLAS)
+#include "s_blas.h"
+#endif
+
+#include "../include/blasfeo_common.h"
+#include "../include/blasfeo_s_kernel.h"
+
+
+
+#define REAL float
+
+#define STRVEC s_strvec
+
+#define AXPY_LIBSTR saxpy_libstr
+#define VECMULDOT_LIBSTR svecmuldot_libstr
+#define DOT_LIBSTR sdot_libstr
+
+#define AXPY saxpy_
+#define COPY scopy_
+
+
+#include "x_blas1_lib.c"
+
diff --git a/blas/s_blas1_lib4.c b/blas/s_blas1_lib4.c
new file mode 100644
index 0000000..8588020
--- /dev/null
+++ b/blas/s_blas1_lib4.c
@@ -0,0 +1,123 @@
+/**************************************************************************************************
+* *
+* This file is part of BLASFEO. *
+* *
+* BLASFEO -- BLAS For Embedded Optimization. *
+* Copyright (C) 2016-2017 by Gianluca Frison. *
+* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl. *
+* All rights reserved. *
+* *
+* HPMPC is free software; you can redistribute it and/or *
+* modify it under the terms of the GNU Lesser General Public *
+* License as published by the Free Software Foundation; either *
+* version 2.1 of the License, or (at your option) any later version. *
+* *
+* HPMPC is distributed in the hope that it will be useful, *
+* but WITHOUT ANY WARRANTY; without even the implied warranty of *
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. *
+* See the GNU Lesser General Public License for more details. *
+* *
+* You should have received a copy of the GNU Lesser General Public *
+* License along with HPMPC; if not, write to the Free Software *
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA *
+* *
+* Author: Gianluca Frison, giaf (at) dtu.dk *
+* gianluca.frison (at) imtek.uni-freiburg.de *
+* *
+**************************************************************************************************/
+
+#include <stdlib.h>
+#include <stdio.h>
+
+#include "../include/blasfeo_common.h"
+#include "../include/blasfeo_s_kernel.h"
+
+
+
+#if defined(LA_HIGH_PERFORMANCE)
+
+
+
+// z = y + alpha*x, with increments equal to 1
+void saxpy_libstr(int m, float alpha, struct s_strvec *sx, int xi, struct s_strvec *sy, int yi, struct s_strvec *sz, int zi)
+ {
+ float *x = sx->pa + xi;
+ float *y = sy->pa + yi;
+ float *z = sz->pa + zi;
+ int ii;
+ ii = 0;
+ for( ; ii<m-3; ii+=4)
+ {
+ z[ii+0] = y[ii+0] + alpha*x[ii+0];
+ z[ii+1] = y[ii+1] + alpha*x[ii+1];
+ z[ii+2] = y[ii+2] + alpha*x[ii+2];
+ z[ii+3] = y[ii+3] + alpha*x[ii+3];
+ }
+ for( ; ii<m; ii++)
+ {
+ z[ii+0] = y[ii+0] + alpha*x[ii+0];
+ }
+ return;
+ }
+
+
+
+void saxpy_bkp_libstr(int m, float alpha, struct s_strvec *sx, int xi, struct s_strvec *sy, int yi, struct s_strvec *sz, int zi)
+ {
+ float *x = sx->pa + xi;
+ float *y = sy->pa + yi;
+ float *z = sz->pa + zi;
+ int ii;
+ ii = 0;
+ for( ; ii<m-3; ii+=4)
+ {
+ z[ii+0] = y[ii+0];
+ y[ii+0] = y[ii+0] + alpha*x[ii+0];
+ z[ii+1] = y[ii+1];
+ y[ii+1] = y[ii+1] + alpha*x[ii+1];
+ z[ii+2] = y[ii+2];
+ y[ii+2] = y[ii+2] + alpha*x[ii+2];
+ z[ii+3] = y[ii+3];
+ y[ii+3] = y[ii+3] + alpha*x[ii+3];
+ }
+ for( ; ii<m; ii++)
+ {
+ z[ii+0] = y[ii+0];
+ y[ii+0] = y[ii+0] + alpha*x[ii+0];
+ }
+ return;
+ }
+
+
+
+// multiply two vectors and compute dot product
+float svecmuldot_libstr(int m, struct s_strvec *sx, int xi, struct s_strvec *sy, int yi, struct s_strvec *sz, int zi)
+ {
+
+ if(m<=0)
+ return 0.0;
+
+ float *x = sx->pa + xi;
+ float *y = sy->pa + yi;
+ float *z = sz->pa + zi;
+ int ii;
+ float dot = 0.0;
+
+ ii = 0;
+
+ for(; ii<m; ii++)
+ {
+ z[ii+0] = x[ii+0] * y[ii+0];
+ dot += z[ii+0];
+ }
+ return dot;
+ }
+
+
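+/* A usage sketch for the two routines above (kept disabled; it assumes the
+   auxiliary helper s_allocate_strvec and an already chosen length n): */
+#if 0
+struct s_strvec sx, sy, sz;
+s_allocate_strvec(n, &sx);
+s_allocate_strvec(n, &sy);
+s_allocate_strvec(n, &sz);
+saxpy_libstr(n, 2.0f, &sx, 0, &sy, 0, &sz, 0); // z = y + 2*x
+float dot = svecmuldot_libstr(n, &sx, 0, &sy, 0, &sz, 0); // z = x.*y, dot = sum(z)
+#endif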
+
+#else
+
+#error : wrong LA choice
+
+#endif
+
diff --git a/blas/s_blas1_lib8.c b/blas/s_blas1_lib8.c
new file mode 100644
index 0000000..538c012
--- /dev/null
+++ b/blas/s_blas1_lib8.c
@@ -0,0 +1,124 @@
+/**************************************************************************************************
+* *
+* This file is part of BLASFEO. *
+* *
+* BLASFEO -- BLAS For Embedded Optimization. *
+* Copyright (C) 2016-2017 by Gianluca Frison. *
+* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl. *
+* All rights reserved. *
+* *
+* HPMPC is free software; you can redistribute it and/or *
+* modify it under the terms of the GNU Lesser General Public *
+* License as published by the Free Software Foundation; either *
+* version 2.1 of the License, or (at your option) any later version. *
+* *
+* HPMPC is distributed in the hope that it will be useful, *
+* but WITHOUT ANY WARRANTY; without even the implied warranty of *
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. *
+* See the GNU Lesser General Public License for more details. *
+* *
+* You should have received a copy of the GNU Lesser General Public *
+* License along with HPMPC; if not, write to the Free Software *
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA *
+* *
+* Author: Gianluca Frison, giaf (at) dtu.dk *
+* gianluca.frison (at) imtek.uni-freiburg.de *
+* *
+**************************************************************************************************/
+
+#include <stdlib.h>
+#include <stdio.h>
+
+#include "../include/blasfeo_common.h"
+#include "../include/blasfeo_s_kernel.h"
+
+
+
+#if defined(LA_HIGH_PERFORMANCE)
+
+
+
+// z = y + alpha*x, with increments equal to 1
+void saxpy_libstr(int m, float alpha, struct s_strvec *sx, int xi, struct s_strvec *sy, int yi, struct s_strvec *sz, int zi)
+ {
+ float *x = sx->pa + xi;
+ float *y = sy->pa + yi;
+ float *z = sz->pa + zi;
+ int ii;
+ ii = 0;
+ for( ; ii<m-3; ii+=4)
+ {
+ z[ii+0] = y[ii+0] + alpha*x[ii+0];
+ z[ii+1] = y[ii+1] + alpha*x[ii+1];
+ z[ii+2] = y[ii+2] + alpha*x[ii+2];
+ z[ii+3] = y[ii+3] + alpha*x[ii+3];
+ }
+ for( ; ii<m; ii++)
+ {
+ z[ii+0] = y[ii+0] + alpha*x[ii+0];
+ }
+ return;
+ }
+
+
+
+void saxpy_bkp_libstr(int m, float alpha, struct s_strvec *sx, int xi, struct s_strvec *sy, int yi, struct s_strvec *sz, int zi)
+ {
+ float *x = sx->pa + xi;
+ float *y = sy->pa + yi;
+ float *z = sz->pa + zi;
+ int ii;
+ ii = 0;
+ for( ; ii<m-3; ii+=4)
+ {
+ z[ii+0] = y[ii+0];
+ y[ii+0] = y[ii+0] + alpha*x[ii+0];
+ z[ii+1] = y[ii+1];
+ y[ii+1] = y[ii+1] + alpha*x[ii+1];
+ z[ii+2] = y[ii+2];
+ y[ii+2] = y[ii+2] + alpha*x[ii+2];
+ z[ii+3] = y[ii+3];
+ y[ii+3] = y[ii+3] + alpha*x[ii+3];
+ }
+ for( ; ii<m; ii++)
+ {
+ z[ii+0] = y[ii+0];
+ y[ii+0] = y[ii+0] + alpha*x[ii+0];
+ }
+ return;
+ }
+
+
+
+// multiply two vectors and compute dot product
+float svecmuldot_libstr(int m, struct s_strvec *sx, int xi, struct s_strvec *sy, int yi, struct s_strvec *sz, int zi)
+ {
+
+ if(m<=0)
+ return 0.0;
+
+ float *x = sx->pa + xi;
+ float *y = sy->pa + yi;
+ float *z = sz->pa + zi;
+ int ii;
+ float dot = 0.0;
+
+ ii = 0;
+
+ for(; ii<m; ii++)
+ {
+ z[ii+0] = x[ii+0] * y[ii+0];
+ dot += z[ii+0];
+ }
+ return dot;
+ }
+
+
+
+#else
+
+#error : wrong LA choice
+
+#endif
+
+
diff --git a/blas/s_blas2_diag_lib.c b/blas/s_blas2_diag_lib.c
new file mode 100644
index 0000000..1dde42f
--- /dev/null
+++ b/blas/s_blas2_diag_lib.c
@@ -0,0 +1,46 @@
+/**************************************************************************************************
+* *
+* This file is part of BLASFEO. *
+* *
+* BLASFEO -- BLAS For Embedded Optimization. *
+* Copyright (C) 2016-2017 by Gianluca Frison. *
+* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl. *
+* All rights reserved. *
+* *
+* HPMPC is free software; you can redistribute it and/or *
+* modify it under the terms of the GNU Lesser General Public *
+* License as published by the Free Software Foundation; either *
+* version 2.1 of the License, or (at your option) any later version. *
+* *
+* HPMPC is distributed in the hope that it will be useful, *
+* but WITHOUT ANY WARRANTY; without even the implied warranty of *
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. *
+* See the GNU Lesser General Public License for more details. *
+* *
+* You should have received a copy of the GNU Lesser General Public *
+* License along with HPMPC; if not, write to the Free Software *
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA *
+* *
+* Author: Gianluca Frison, giaf (at) dtu.dk *
+* gianluca.frison (at) imtek.uni-freiburg.de *
+* *
+**************************************************************************************************/
+
+#include <stdlib.h>
+#include <stdio.h>
+
+#include "../include/blasfeo_common.h"
+#include "../include/blasfeo_d_kernel.h"
+
+
+
+#define REAL float
+
+#define STRVEC s_strvec
+
+#define GEMV_DIAG_LIBSTR sgemv_diag_libstr
+
+
+
+#include "x_blas2_diag_lib.c"
+
diff --git a/blas/s_blas2_lib.c b/blas/s_blas2_lib.c
new file mode 100644
index 0000000..7ab8dc2
--- /dev/null
+++ b/blas/s_blas2_lib.c
@@ -0,0 +1,72 @@
+/**************************************************************************************************
+* *
+* This file is part of BLASFEO. *
+* *
+* BLASFEO -- BLAS For Embedded Optimization. *
+* Copyright (C) 2016-2017 by Gianluca Frison. *
+* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl. *
+* All rights reserved. *
+* *
+* HPMPC is free software; you can redistribute it and/or *
+* modify it under the terms of the GNU Lesser General Public *
+* License as published by the Free Software Foundation; either *
+* version 2.1 of the License, or (at your option) any later version. *
+* *
+* HPMPC is distributed in the hope that it will be useful, *
+* but WITHOUT ANY WARRANTY; without even the implied warranty of *
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. *
+* See the GNU Lesser General Public License for more details. *
+* *
+* You should have received a copy of the GNU Lesser General Public *
+* License along with HPMPC; if not, write to the Free Software *
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA *
+* *
+* Author: Gianluca Frison, giaf (at) dtu.dk *
+* gianluca.frison (at) imtek.uni-freiburg.de *
+* *
+**************************************************************************************************/
+
+#include <stdlib.h>
+#include <stdio.h>
+
+#if defined(LA_BLAS)
+#include "s_blas.h"
+#endif
+
+#include "../include/blasfeo_common.h"
+#include "../include/blasfeo_s_aux.h"
+
+
+
+#define REAL float
+
+#define STRMAT s_strmat
+#define STRVEC s_strvec
+
+#define GEMV_N_LIBSTR sgemv_n_libstr
+#define GEMV_NT_LIBSTR sgemv_nt_libstr
+#define GEMV_T_LIBSTR sgemv_t_libstr
+#define SYMV_L_LIBSTR ssymv_l_libstr
+#define TRMV_LNN_LIBSTR strmv_lnn_libstr
+#define TRMV_LTN_LIBSTR strmv_ltn_libstr
+#define TRMV_UNN_LIBSTR strmv_unn_libstr
+#define TRMV_UTN_LIBSTR strmv_utn_libstr
+#define TRSV_LNN_LIBSTR strsv_lnn_libstr
+#define TRSV_LNN_MN_LIBSTR strsv_lnn_mn_libstr
+#define TRSV_LNU_LIBSTR strsv_lnu_libstr
+#define TRSV_LTN_LIBSTR strsv_ltn_libstr
+#define TRSV_LTN_MN_LIBSTR strsv_ltn_mn_libstr
+#define TRSV_LTU_LIBSTR strsv_ltu_libstr
+#define TRSV_UNN_LIBSTR strsv_unn_libstr
+#define TRSV_UTN_LIBSTR strsv_utn_libstr
+
+#define COPY scopy_
+#define GEMV sgemv_
+#define SYMV ssymv_
+#define TRMV strmv_
+#define TRSV strsv_
+
+
+
+#include "x_blas2_lib.c"
+
diff --git a/blas/s_blas2_lib4.c b/blas/s_blas2_lib4.c
new file mode 100644
index 0000000..b7a947d
--- /dev/null
+++ b/blas/s_blas2_lib4.c
@@ -0,0 +1,1045 @@
+/**************************************************************************************************
+* *
+* This file is part of BLASFEO. *
+* *
+* BLASFEO -- BLAS For Embedded Optimization. *
+* Copyright (C) 2016-2017 by Gianluca Frison. *
+* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl. *
+* All rights reserved. *
+* *
+* HPMPC is free software; you can redistribute it and/or *
+* modify it under the terms of the GNU Lesser General Public *
+* License as published by the Free Software Foundation; either *
+* version 2.1 of the License, or (at your option) any later version. *
+* *
+* HPMPC is distributed in the hope that it will be useful, *
+* but WITHOUT ANY WARRANTY; without even the implied warranty of *
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. *
+* See the GNU Lesser General Public License for more details. *
+* *
+* You should have received a copy of the GNU Lesser General Public *
+* License along with HPMPC; if not, write to the Free Software *
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA *
+* *
+* Author: Gianluca Frison, giaf (at) dtu.dk *
+* gianluca.frison (at) imtek.uni-freiburg.de *
+* *
+**************************************************************************************************/
+
+#include <stdlib.h>
+#include <stdio.h>
+
+#include "../include/blasfeo_common.h"
+#include "../include/blasfeo_s_kernel.h"
+#include "../include/blasfeo_s_aux.h"
+
+
+
+
+#if defined(LA_HIGH_PERFORMANCE)
+
+
+
+void sgemv_n_libstr(int m, int n, float alpha, struct s_strmat *sA, int ai, int aj, struct s_strvec *sx, int xi, float beta, struct s_strvec *sy, int yi, struct s_strvec *sz, int zi)
+ {
+
+ if(m<0)
+ return;
+
+ const int bs = 4;
+
+ int i;
+
+ int sda = sA->cn;
+ float *pA = sA->pA + aj*bs + ai/bs*bs*sda;
+ float *x = sx->pa + xi;
+ float *y = sy->pa + yi;
+ float *z = sz->pa + zi;
+
+ i = 0;
+ // clean up at the beginning
+ if(ai%bs!=0)
+ {
+ kernel_sgemv_n_4_gen_lib4(n, &alpha, pA, x, &beta, y-ai%bs, z-ai%bs, ai%bs, m+ai%bs);
+ pA += bs*sda;
+ y += 4 - ai%bs;
+ z += 4 - ai%bs;
+ m -= 4 - ai%bs;
+ }
+ // main loop
+ for( ; i<m-3; i+=4)
+ {
+ kernel_sgemv_n_4_lib4(n, &alpha, &pA[i*sda], x, &beta, &y[i], &z[i]);
+ }
+ if(i<m)
+ {
+ kernel_sgemv_n_4_vs_lib4(n, &alpha, &pA[i*sda], x, &beta, &y[i], &z[i], m-i);
+ }
+
+ return;
+
+ }
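+
+// Panel-major addressing sketch (bs = 4 on this target): element
+// (ai+i, aj+j) of the matrix is stored at
+//	sA->pA[(ai+i)/bs*bs*sda + (ai+i)%bs + (aj+j)*bs]
+// e.g. with sda = 8 and ai = aj = 0, element (5, 2) sits at offset
+// 1*4*8 + 1 + 2*4 = 41. The ai%bs!=0 branch above handles sub-matrices
+// whose first row falls inside a panel.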
+
+
+
+void sgemv_t_libstr(int m, int n, float alpha, struct s_strmat *sA, int ai, int aj, struct s_strvec *sx, int xi, float beta, struct s_strvec *sy, int yi, struct s_strvec *sz, int zi)
+ {
+
+ if(n<=0)
+ return;
+
+ const int bs = 4;
+
+ int i;
+
+ int sda = sA->cn;
+ float *pA = sA->pA + aj*bs + ai/bs*bs*sda + ai%bs;
+ float *x = sx->pa + xi;
+ float *y = sy->pa + yi;
+ float *z = sz->pa + zi;
+
+ if(ai%bs==0)
+ {
+ i = 0;
+ for( ; i<n-3; i+=4)
+ {
+ kernel_sgemv_t_4_lib4(m, &alpha, &pA[i*bs], sda, x, &beta, &y[i], &z[i]);
+ }
+ if(i<n)
+ {
+ kernel_sgemv_t_4_vs_lib4(m, &alpha, &pA[i*bs], sda, x, &beta, &y[i], &z[i], n-i);
+ }
+ }
+ else // TODO kernel 8
+ {
+ i = 0;
+ for( ; i<n; i+=4)
+ {
+ kernel_sgemv_t_4_gen_lib4(m, &alpha, ai%bs, &pA[i*bs], sda, x, &beta, &y[i], &z[i], n-i);
+ }
+ }
+
+ return;
+
+ }
+
+
+
+void sgemv_nt_libstr(int m, int n, float alpha_n, float alpha_t, struct s_strmat *sA, int ai, int aj, struct s_strvec *sx_n, int xi_n, struct s_strvec *sx_t, int xi_t, float beta_n, float beta_t, struct s_strvec *sy_n, int yi_n, struct s_strvec *sy_t, int yi_t, struct s_strvec *sz_n, int zi_n, struct s_strvec *sz_t, int zi_t)
+ {
+
+ if(ai!=0)
+ {
+ printf("\nsgemv_nt_libstr: feature not implemented yet: ai=%d\n", ai);
+ exit(1);
+ }
+
+ const int bs = 4;
+
+ int sda = sA->cn;
+ float *pA = sA->pA + aj*bs; // TODO ai
+ float *x_n = sx_n->pa + xi_n;
+ float *x_t = sx_t->pa + xi_t;
+ float *y_n = sy_n->pa + yi_n;
+ float *y_t = sy_t->pa + yi_t;
+ float *z_n = sz_n->pa + zi_n;
+ float *z_t = sz_t->pa + zi_t;
+
+// sgemv_nt_lib(m, n, alpha_n, alpha_t, pA, sda, x_n, x_t, beta_n, beta_t, y_n, y_t, z_n, z_t);
+
+// if(m<=0 | n<=0)
+// return;
+
+ int ii;
+
+ // copy and scale y_n into z_n
+ ii = 0;
+ for(; ii<m-3; ii+=4)
+ {
+ z_n[ii+0] = beta_n*y_n[ii+0];
+ z_n[ii+1] = beta_n*y_n[ii+1];
+ z_n[ii+2] = beta_n*y_n[ii+2];
+ z_n[ii+3] = beta_n*y_n[ii+3];
+ }
+ for(; ii<m; ii++)
+ {
+ z_n[ii+0] = beta_n*y_n[ii+0];
+ }
+
+ ii = 0;
+ for(; ii<n-3; ii+=4)
+ {
+ kernel_sgemv_nt_4_lib4(m, &alpha_n, &alpha_t, pA+ii*bs, sda, x_n+ii, x_t, &beta_t, y_t+ii, z_n, z_t+ii);
+ }
+ if(ii<n)
+ {
+ kernel_sgemv_nt_4_vs_lib4(m, &alpha_n, &alpha_t, pA+ii*bs, sda, x_n+ii, x_t, &beta_t, y_t+ii, z_n, z_t+ii, n-ii);
+ }
+
+ return;
+ }
+
+
+
+void ssymv_l_libstr(int m, int n, float alpha, struct s_strmat *sA, int ai, int aj, struct s_strvec *sx, int xi, float beta, struct s_strvec *sy, int yi, struct s_strvec *sz, int zi)
+ {
+
+ if(m<=0 | n<=0)
+ return;
+
+ const int bs = 4;
+
+ int ii, n1;
+
+ int sda = sA->cn;
+ float *pA = sA->pA + aj*bs + ai/bs*bs*sda + ai%bs;
+ float *x = sx->pa + xi;
+ float *y = sy->pa + yi;
+ float *z = sz->pa + zi;
+
+ // copy and scale y into z
+ ii = 0;
+ for(; ii<m-3; ii+=4)
+ {
+ z[ii+0] = beta*y[ii+0];
+ z[ii+1] = beta*y[ii+1];
+ z[ii+2] = beta*y[ii+2];
+ z[ii+3] = beta*y[ii+3];
+ }
+ for(; ii<m; ii++)
+ {
+ z[ii+0] = beta*y[ii+0];
+ }
+
+ // clean up at the beginning
+ if(ai%bs!=0) // 1, 2, 3
+ {
+ n1 = 4-ai%bs;
+ kernel_ssymv_l_4_gen_lib4(m, &alpha, ai%bs, &pA[0], sda, &x[0], &z[0], n<n1 ? n : n1);
+ pA += n1 + n1*bs + (sda-1)*bs;
+ x += n1;
+ z += n1;
+ m -= n1;
+ n -= n1;
+ }
+ // main loop
+ ii = 0;
+ for(; ii<n-3; ii+=4)
+ {
+ kernel_ssymv_l_4_lib4(m-ii, &alpha, &pA[ii*bs+ii*sda], sda, &x[ii], &z[ii]);
+ }
+ // clean up at the end
+ if(ii<n)
+ {
+ kernel_ssymv_l_4_gen_lib4(m-ii, &alpha, 0, &pA[ii*bs+ii*sda], sda, &x[ii], &z[ii], n-ii);
+ }
+
+ return;
+ }
+
+
+
+// m >= n
+void strmv_lnn_libstr(int m, int n, struct s_strmat *sA, int ai, int aj, struct s_strvec *sx, int xi, struct s_strvec *sz, int zi)
+ {
+
+ if(m<=0)
+ return;
+
+ const int bs = 4;
+
+ int sda = sA->cn;
+ float *pA = sA->pA + aj*bs + ai/bs*bs*sda + ai%bs;
+ float *x = sx->pa + xi;
+ float *z = sz->pa + zi;
+
+ if(m-n>0)
+ sgemv_n_libstr(m-n, n, 1.0, sA, ai+n, aj, sx, xi, 0.0, sz, zi+n, sz, zi+n);
+
+ float *pA2 = pA;
+ float *z2 = z;
+ int m2 = n;
+ int n2 = 0;
+ float *pA3, *x3;
+
+ float alpha = 1.0;
+ float beta = 1.0;
+
+ float zt[4];
+
+ int ii, jj, jj_end;
+
+ ii = 0;
+
+ if(ai%bs!=0)
+ {
+ pA2 += sda*bs - ai%bs;
+ z2 += bs-ai%bs;
+ m2 -= bs-ai%bs;
+ n2 += bs-ai%bs;
+ }
+
+ pA2 += m2/bs*bs*sda;
+ z2 += m2/bs*bs;
+ n2 += m2/bs*bs;
+
+ if(m2%bs!=0)
+ {
+ //
+ pA3 = pA2 + bs*n2;
+ x3 = x + n2;
+ zt[3] = pA3[3+bs*0]*x3[0] + pA3[3+bs*1]*x3[1] + pA3[3+bs*2]*x3[2] + pA3[3+bs*3]*x3[3];
+ zt[2] = pA3[2+bs*0]*x3[0] + pA3[2+bs*1]*x3[1] + pA3[2+bs*2]*x3[2];
+ zt[1] = pA3[1+bs*0]*x3[0] + pA3[1+bs*1]*x3[1];
+ zt[0] = pA3[0+bs*0]*x3[0];
+ kernel_sgemv_n_4_lib4(n2, &alpha, pA2, x, &beta, zt, zt);
+ for(jj=0; jj<m2%bs; jj++)
+ z2[jj] = zt[jj];
+ }
+ for(; ii<m2-3; ii+=4)
+ {
+ pA2 -= bs*sda;
+ z2 -= 4;
+ n2 -= 4;
+ pA3 = pA2 + bs*n2;
+ x3 = x + n2;
+ z2[3] = pA3[3+bs*0]*x3[0] + pA3[3+bs*1]*x3[1] + pA3[3+bs*2]*x3[2] + pA3[3+bs*3]*x3[3];
+ z2[2] = pA3[2+bs*0]*x3[0] + pA3[2+bs*1]*x3[1] + pA3[2+bs*2]*x3[2];
+ z2[1] = pA3[1+bs*0]*x3[0] + pA3[1+bs*1]*x3[1];
+ z2[0] = pA3[0+bs*0]*x3[0];
+ kernel_sgemv_n_4_lib4(n2, &alpha, pA2, x, &beta, z2, z2);
+ }
+ if(ai%bs!=0)
+ {
+ if(ai%bs==1)
+ {
+ zt[2] = pA[2+bs*0]*x[0] + pA[2+bs*1]*x[1] + pA[2+bs*2]*x[2];
+ zt[1] = pA[1+bs*0]*x[0] + pA[1+bs*1]*x[1];
+ zt[0] = pA[0+bs*0]*x[0];
+ jj_end = 4-ai%bs<n ? 4-ai%bs : n;
+ for(jj=0; jj<jj_end; jj++)
+ z[jj] = zt[jj];
+ }
+ else if(ai%bs==2)
+ {
+ zt[1] = pA[1+bs*0]*x[0] + pA[1+bs*1]*x[1];
+ zt[0] = pA[0+bs*0]*x[0];
+ jj_end = 4-ai%bs<n ? 4-ai%bs : n;
+ for(jj=0; jj<jj_end; jj++)
+ z[jj] = zt[jj];
+ }
+ else // if (ai%bs==3)
+ {
+ z[0] = pA[0+bs*0]*x[0];
+ }
+ }
+
+ return;
+
+ }
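+
+// Note: the triangular part is traversed from the bottom block row
+// upwards, so every entry of x is read before the corresponding entry of
+// z is overwritten; the result therefore stays correct when sx and sz
+// alias the same vector. In-place call sketch (illustrative):
+//
+//	strmv_lnn_libstr(m, m, &sA, 0, 0, &sx, 0, &sx, 0); // x <- L*x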
+
+
+
+// m >= n
+void strmv_ltn_libstr(int m, int n, struct s_strmat *sA, int ai, int aj, struct s_strvec *sx, int xi, struct s_strvec *sz, int zi)
+ {
+
+ if(m<=0)
+ return;
+
+ const int bs = 4;
+
+ int sda = sA->cn;
+ float *pA = sA->pA + aj*bs + ai/bs*bs*sda + ai%bs;
+ float *x = sx->pa + xi;
+ float *z = sz->pa + zi;
+
+ float xt[4];
+ float zt[4];
+
+ float alpha = 1.0;
+ float beta = 1.0;
+
+ int ii, jj, ll, ll_max;
+
+ jj = 0;
+
+ if(ai%bs!=0)
+ {
+
+ if(ai%bs==1)
+ {
+ ll_max = m-jj<3 ? m-jj : 3;
+ for(ll=0; ll<ll_max; ll++)
+ xt[ll] = x[ll];
+ for(; ll<3; ll++)
+ xt[ll] = 0.0;
+ zt[0] = pA[0+bs*0]*xt[0] + pA[1+bs*0]*xt[1] + pA[2+bs*0]*xt[2];
+ zt[1] = pA[1+bs*1]*xt[1] + pA[2+bs*1]*xt[2];
+ zt[2] = pA[2+bs*2]*xt[2];
+ pA += bs*sda - 1;
+ x += 3;
+ kernel_sgemv_t_4_lib4(m-3-jj, &alpha, pA, sda, x, &beta, zt, zt);
+ ll_max = n-jj<3 ? n-jj : 3;
+ for(ll=0; ll<ll_max; ll++)
+ z[ll] = zt[ll];
+ pA += bs*3;
+ z += 3;
+ jj += 3;
+ }
+ else if(ai%bs==2)
+ {
+ ll_max = m-jj<2 ? m-jj : 2;
+ for(ll=0; ll<ll_max; ll++)
+ xt[ll] = x[ll];
+ for(; ll<2; ll++)
+ xt[ll] = 0.0;
+ zt[0] = pA[0+bs*0]*xt[0] + pA[1+bs*0]*xt[1];
+ zt[1] = pA[1+bs*1]*xt[1];
+ pA += bs*sda - 2;
+ x += 2;
+ kernel_sgemv_t_4_lib4(m-2-jj, &alpha, pA, sda, x, &beta, zt, zt);
+ ll_max = n-jj<2 ? n-jj : 2;
+ for(ll=0; ll<ll_max; ll++)
+ z[ll] = zt[ll];
+ pA += bs*2;
+ z += 2;
+ jj += 2;
+ }
+ else // if(ai%bs==3)
+ {
+ ll_max = m-jj<1 ? m-jj : 1;
+ for(ll=0; ll<ll_max; ll++)
+ xt[ll] = x[ll];
+ for(; ll<1; ll++)
+ xt[ll] = 0.0;
+ zt[0] = pA[0+bs*0]*xt[0];
+ pA += bs*sda - 3;
+ x += 1;
+ kernel_sgemv_t_4_lib4(m-1-jj, &alpha, pA, sda, x, &beta, zt, zt);
+ ll_max = n-jj<1 ? n-jj : 1;
+ for(ll=0; ll<ll_max; ll++)
+ z[ll] = zt[ll];
+ pA += bs*1;
+ z += 1;
+ jj += 1;
+ }
+
+ }
+
+ for(; jj<n-3; jj+=4)
+ {
+ zt[0] = pA[0+bs*0]*x[0] + pA[1+bs*0]*x[1] + pA[2+bs*0]*x[2] + pA[3+bs*0]*x[3];
+ zt[1] = pA[1+bs*1]*x[1] + pA[2+bs*1]*x[2] + pA[3+bs*1]*x[3];
+ zt[2] = pA[2+bs*2]*x[2] + pA[3+bs*2]*x[3];
+ zt[3] = pA[3+bs*3]*x[3];
+ pA += bs*sda;
+ x += 4;
+ kernel_sgemv_t_4_lib4(m-4-jj, &alpha, pA, sda, x, &beta, zt, z);
+ pA += bs*4;
+ z += 4;
+ }
+ if(jj<n)
+ {
+ ll_max = m-jj<4 ? m-jj : 4;
+ for(ll=0; ll<ll_max; ll++)
+ xt[ll] = x[ll];
+ for(; ll<4; ll++)
+ xt[ll] = 0.0;
+ zt[0] = pA[0+bs*0]*xt[0] + pA[1+bs*0]*xt[1] + pA[2+bs*0]*xt[2] + pA[3+bs*0]*xt[3];
+ zt[1] = pA[1+bs*1]*xt[1] + pA[2+bs*1]*xt[2] + pA[3+bs*1]*xt[3];
+ zt[2] = pA[2+bs*2]*xt[2] + pA[3+bs*2]*xt[3];
+ zt[3] = pA[3+bs*3]*xt[3];
+ pA += bs*sda;
+ x += 4;
+ kernel_sgemv_t_4_lib4(m-4-jj, &alpha, pA, sda, x, &beta, zt, zt);
+ for(ll=0; ll<n-jj; ll++)
+ z[ll] = zt[ll];
+// pA += bs*4;
+// z += 4;
+ }
+
+ return;
+
+ }
+
+
+
+void strmv_unn_libstr(int m, struct s_strmat *sA, int ai, int aj, struct s_strvec *sx, int xi, struct s_strvec *sz, int zi)
+ {
+
+ if(m<=0)
+ return;
+
+ if(ai!=0)
+ {
+ printf("\ndtrmv_unn_libstr: feature not implemented yet: ai=%d\n", ai);
+ exit(1);
+ }
+
+ const int bs = 4;
+
+ int sda = sA->cn;
+ float *pA = sA->pA + aj*bs; // TODO ai
+ float *x = sx->pa + xi;
+ float *z = sz->pa + zi;
+
+ int i;
+
+ i=0;
+ for(; i<m-3; i+=4)
+ {
+ kernel_strmv_un_4_lib4(m-i, pA, x, z);
+ pA += 4*sda+4*bs;
+ x += 4;
+ z += 4;
+ }
+ if(m>i)
+ {
+ if(m-i==1)
+ {
+ z[0] = pA[0+bs*0]*x[0];
+ }
+ else if(m-i==2)
+ {
+ z[0] = pA[0+bs*0]*x[0] + pA[0+bs*1]*x[1];
+ z[1] = pA[1+bs*1]*x[1];
+ }
+ else // if(m-i==3)
+ {
+ z[0] = pA[0+bs*0]*x[0] + pA[0+bs*1]*x[1] + pA[0+bs*2]*x[2];
+ z[1] = pA[1+bs*1]*x[1] + pA[1+bs*2]*x[2];
+ z[2] = pA[2+bs*2]*x[2];
+ }
+ }
+
+ return;
+
+ }
+
+
+
+void strmv_utn_libstr(int m, struct s_strmat *sA, int ai, int aj, struct s_strvec *sx, int xi, struct s_strvec *sz, int zi)
+ {
+
+ if(m<=0)
+ return;
+
+ if(ai!=0)
+ {
+ printf("\nstrmv_utn_libstr: feature not implemented yet: ai=%d\n", ai);
+ exit(1);
+ }
+
+ const int bs = 4;
+
+ int sda = sA->cn;
+ float *pA = sA->pA + aj*bs; // TODO ai
+ float *x = sx->pa + xi;
+ float *z = sz->pa + zi;
+
+ int ii, idx;
+
+ float *ptrA;
+
+ ii=0;
+ idx = m/bs*bs;
+ if(m%bs!=0)
+ {
+ kernel_strmv_ut_4_vs_lib4(m, pA+idx*bs, sda, x, z+idx, m%bs);
+ ii += m%bs;
+ }
+ idx -= 4;
+ for(; ii<m; ii+=4)
+ {
+ kernel_strmv_ut_4_lib4(idx+4, pA+idx*bs, sda, x, z+idx);
+ idx -= 4;
+ }
+
+ return;
+
+ }
+
+
+
+void strsv_lnn_libstr(int m, struct s_strmat *sA, int ai, int aj, struct s_strvec *sx, int xi, struct s_strvec *sz, int zi)
+ {
+
+ if(m==0)
+ return;
+
+#if defined(DIM_CHECK)
+ // non-negative size
+ if(m<0) printf("\n****** strsv_lnn_libstr : m<0 : %d<0 *****\n", m);
+ // non-negative offset
+ if(ai<0) printf("\n****** strsv_lnn_libstr : ai<0 : %d<0 *****\n", ai);
+ if(aj<0) printf("\n****** strsv_lnn_libstr : aj<0 : %d<0 *****\n", aj);
+ if(xi<0) printf("\n****** strsv_lnn_libstr : xi<0 : %d<0 *****\n", xi);
+ if(zi<0) printf("\n****** strsv_lnn_libstr : zi<0 : %d<0 *****\n", zi);
+ // inside matrix
+ // A: m x k
+ if(ai+m > sA->m) printf("\n***** strsv_lnn_libstr : ai+m > row(A) : %d+%d > %d *****\n", ai, m, sA->m);
+ if(aj+m > sA->n) printf("\n***** strsv_lnn_libstr : aj+m > col(A) : %d+%d > %d *****\n", aj, m, sA->n);
+ // x: m
+ if(xi+m > sx->m) printf("\n***** strsv_lnn_libstr : xi+m > size(x) : %d+%d > %d *****\n", xi, m, sx->m);
+ // z: m
+ if(zi+m > sz->m) printf("\n***** strsv_lnn_libstr : zi+m > size(z) : %d+%d > %d *****\n", zi, m, sz->m);
+#endif
+
+ if(ai!=0)
+ {
+ printf("\nstrsv_lnn_libstr: feature not implemented yet: ai=%d\n", ai);
+ exit(1);
+ }
+
+ const int bs = 4;
+
+ int sda = sA->cn;
+ float *pA = sA->pA + aj*bs; // TODO ai
+ float *dA = sA->dA;
+ float *x = sx->pa + xi;
+ float *z = sz->pa + zi;
+
+ int ii;
+
+ if(ai==0 & aj==0)
+ {
+ if(sA->use_dA!=1)
+ {
+ sdiaex_lib(m, 1.0, ai, pA, sda, dA);
+ for(ii=0; ii<m; ii++)
+ dA[ii] = 1.0 / dA[ii];
+ sA->use_dA = 1;
+ }
+ }
+ else
+ {
+ sdiaex_lib(m, 1.0, ai, pA, sda, dA);
+ for(ii=0; ii<m; ii++)
+ dA[ii] = 1.0 / dA[ii];
+ sA->use_dA = 0;
+ }
+
+ int i;
+
+ if(x!=z)
+ {
+ for(i=0; i<m; i++)
+ z[i] = x[i];
+ }
+
+ i = 0;
+ for( ; i<m-3; i+=4)
+ {
+ kernel_strsv_ln_inv_4_lib4(i, &pA[i*sda], &dA[i], z, &z[i], &z[i]);
+ }
+ if(i<m)
+ {
+ kernel_strsv_ln_inv_4_vs_lib4(i, &pA[i*sda], &dA[i], z, &z[i], &z[i], m-i, m-i);
+ i+=4;
+ }
+
+ return;
+
+ }
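+
+// Note on the inverted-diagonal cache: for operands starting at (0, 0)
+// the reciprocals of the diagonal of A are extracted once into sA->dA and
+// sA->use_dA is set, so later solves with the same matrix skip the
+// extraction and divisions. Sketch (spotrf_l_libstr is assumed to be the
+// Cholesky routine of this library's lapack layer):
+//
+//	spotrf_l_libstr(m, &sA, 0, 0, &sA, 0, 0);         // factorize once
+//	strsv_lnn_libstr(m, &sA, 0, 0, &sb1, 0, &sx1, 0); // fills the dA cache
+//	strsv_lnn_libstr(m, &sA, 0, 0, &sb2, 0, &sx2, 0); // reuses the dA cache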
+
+
+
+void strsv_lnn_mn_libstr(int m, int n, struct s_strmat *sA, int ai, int aj, struct s_strvec *sx, int xi, struct s_strvec *sz, int zi)
+ {
+
+ if(m==0 | n==0)
+ return;
+
+#if defined(DIM_CHECK)
+ // non-negative size
+ if(m<0) printf("\n****** strsv_lnn_mn_libstr : m<0 : %d<0 *****\n", m);
+ if(n<0) printf("\n****** strsv_lnn_mn_libstr : n<0 : %d<0 *****\n", n);
+ // non-negative offset
+ if(ai<0) printf("\n****** strsv_lnn_mn_libstr : ai<0 : %d<0 *****\n", ai);
+ if(aj<0) printf("\n****** strsv_lnn_mn_libstr : aj<0 : %d<0 *****\n", aj);
+ if(xi<0) printf("\n****** strsv_lnn_mn_libstr : xi<0 : %d<0 *****\n", xi);
+ if(zi<0) printf("\n****** strsv_lnn_mn_libstr : zi<0 : %d<0 *****\n", zi);
+ // inside matrix
+ // A: m x k
+ if(ai+m > sA->m) printf("\n***** strsv_lnn_mn_libstr : ai+m > row(A) : %d+%d > %d *****\n", ai, m, sA->m);
+ if(aj+n > sA->n) printf("\n***** strsv_lnn_mn_libstr : aj+n > col(A) : %d+%d > %d *****\n", aj, n, sA->n);
+ // x: m
+ if(xi+m > sx->m) printf("\n***** strsv_lnn_mn_libstr : xi+m > size(x) : %d+%d > %d *****\n", xi, m, sx->m);
+ // z: m
+ if(zi+m > sz->m) printf("\n***** strsv_lnn_mn_libstr : zi+m > size(z) : %d+%d > %d *****\n", zi, m, sz->m);
+#endif
+
+ if(ai!=0)
+ {
+ printf("\nstrsv_lnn_mn_libstr: feature not implemented yet: ai=%d\n", ai);
+ exit(1);
+ }
+
+ const int bs = 4;
+
+ int sda = sA->cn;
+ float *pA = sA->pA + aj*bs; // TODO ai
+ float *dA = sA->dA;
+ float *x = sx->pa + xi;
+ float *z = sz->pa + zi;
+
+ int ii;
+
+ if(ai==0 & aj==0)
+ {
+ if(sA->use_dA!=1)
+ {
+ sdiaex_lib(n, 1.0, ai, pA, sda, dA);
+ for(ii=0; ii<n; ii++)
+ dA[ii] = 1.0 / dA[ii];
+ sA->use_dA = 1;
+ }
+ }
+ else
+ {
+ sdiaex_lib(n, 1.0, ai, pA, sda, dA);
+ for(ii=0; ii<n; ii++)
+ dA[ii] = 1.0 / dA[ii];
+ sA->use_dA = 0;
+ }
+
+ if(m<n)
+ m = n;
+
+ float alpha = -1.0;
+ float beta = 1.0;
+
+ int i;
+
+ if(x!=z)
+ {
+ for(i=0; i<m; i++)
+ z[i] = x[i];
+ }
+
+ i = 0;
+ for( ; i<n-3; i+=4)
+ {
+ kernel_strsv_ln_inv_4_lib4(i, &pA[i*sda], &dA[i], z, &z[i], &z[i]);
+ }
+ if(i<n)
+ {
+ kernel_strsv_ln_inv_4_vs_lib4(i, &pA[i*sda], &dA[i], z, &z[i], &z[i], m-i, n-i);
+ i+=4;
+ }
+ for( ; i<m-3; i+=4)
+ {
+ kernel_sgemv_n_4_lib4(n, &alpha, &pA[i*sda], z, &beta, &z[i], &z[i]);
+ }
+ if(i<m)
+ {
+ kernel_sgemv_n_4_vs_lib4(n, &alpha, &pA[i*sda], z, &beta, &z[i], &z[i], m-i);
+ i+=4;
+ }
+
+ return;
+
+ }
+
+
+
+void strsv_ltn_libstr(int m, struct s_strmat *sA, int ai, int aj, struct s_strvec *sx, int xi, struct s_strvec *sz, int zi)
+ {
+
+ if(m==0)
+ return;
+
+#if defined(DIM_CHECK)
+ // non-negative size
+ if(m<0) printf("\n****** strsv_ltn_libstr : m<0 : %d<0 *****\n", m);
+ // non-negative offset
+ if(ai<0) printf("\n****** strsv_ltn_libstr : ai<0 : %d<0 *****\n", ai);
+ if(aj<0) printf("\n****** strsv_ltn_libstr : aj<0 : %d<0 *****\n", aj);
+ if(xi<0) printf("\n****** strsv_ltn_libstr : xi<0 : %d<0 *****\n", xi);
+ if(zi<0) printf("\n****** strsv_ltn_libstr : zi<0 : %d<0 *****\n", zi);
+ // inside matrix
+ // A: m x k
+ if(ai+m > sA->m) printf("\n***** strsv_ltn_libstr : ai+m > row(A) : %d+%d > %d *****\n", ai, m, sA->m);
+ if(aj+m > sA->n) printf("\n***** strsv_ltn_libstr : aj+m > col(A) : %d+%d > %d *****\n", aj, m, sA->n);
+ // x: m
+ if(xi+m > sx->m) printf("\n***** strsv_ltn_libstr : xi+m > size(x) : %d+%d > %d *****\n", xi, m, sx->m);
+ // z: m
+ if(zi+m > sz->m) printf("\n***** strsv_ltn_libstr : zi+m > size(z) : %d+%d > %d *****\n", zi, m, sz->m);
+#endif
+
+ if(ai!=0)
+ {
+ printf("\nstrsv_ltn_libstr: feature not implemented yet: ai=%d\n", ai);
+ exit(1);
+ }
+
+ const int bs = 4;
+
+ int sda = sA->cn;
+ float *pA = sA->pA + aj*bs; // TODO ai
+ float *dA = sA->dA;
+ float *x = sx->pa + xi;
+ float *z = sz->pa + zi;
+
+ int ii;
+
+ if(ai==0 & aj==0)
+ {
+ if(sA->use_dA!=1)
+ {
+ sdiaex_lib(m, 1.0, ai, pA, sda, dA);
+ for(ii=0; ii<m; ii++)
+ dA[ii] = 1.0 / dA[ii];
+ sA->use_dA = 1;
+ }
+ }
+ else
+ {
+ sdiaex_lib(m, 1.0, ai, pA, sda, dA);
+ for(ii=0; ii<m; ii++)
+ dA[ii] = 1.0 / dA[ii];
+ sA->use_dA = 0;
+ }
+
+ int i;
+
+ if(x!=z)
+ for(i=0; i<m; i++)
+ z[i] = x[i];
+
+ i=0;
+ if(m%4==1)
+ {
+ kernel_strsv_lt_inv_1_lib4(i+1, &pA[m/bs*bs*sda+(m-i-1)*bs], sda, &dA[m-i-1], &z[m-i-1], &z[m-i-1], &z[m-i-1]);
+ i++;
+ }
+ else if(m%4==2)
+ {
+ kernel_strsv_lt_inv_2_lib4(i+2, &pA[m/bs*bs*sda+(m-i-2)*bs], sda, &dA[m-i-2], &z[m-i-2], &z[m-i-2], &z[m-i-2]);
+ i+=2;
+ }
+ else if(m%4==3)
+ {
+ kernel_strsv_lt_inv_3_lib4(i+3, &pA[m/bs*bs*sda+(m-i-3)*bs], sda, &dA[m-i-3], &z[m-i-3], &z[m-i-3], &z[m-i-3]);
+ i+=3;
+ }
+ for(; i<m-3; i+=4)
+ {
+ kernel_strsv_lt_inv_4_lib4(i+4, &pA[(m-i-4)/bs*bs*sda+(m-i-4)*bs], sda, &dA[m-i-4], &z[m-i-4], &z[m-i-4], &z[m-i-4]);
+ }
+
+ return;
+
+ }
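+
+// Note: the transposed solve runs from the bottom of the vector upwards
+// (backward substitution); the m%4 remainder rows are eliminated first so
+// that the main loop only sees full 4-row panels.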
+
+
+
+void strsv_ltn_mn_libstr(int m, int n, struct s_strmat *sA, int ai, int aj, struct s_strvec *sx, int xi, struct s_strvec *sz, int zi)
+ {
+
+ if(m==0)
+ return;
+
+#if defined(DIM_CHECK)
+ // non-negative size
+ if(m<0) printf("\n****** strsv_ltn_mn_libstr : m<0 : %d<0 *****\n", m);
+ if(n<0) printf("\n****** strsv_ltn_mn_libstr : n<0 : %d<0 *****\n", n);
+ // non-negative offset
+ if(ai<0) printf("\n****** strsv_ltn_mn_libstr : ai<0 : %d<0 *****\n", ai);
+ if(aj<0) printf("\n****** strsv_ltn_mn_libstr : aj<0 : %d<0 *****\n", aj);
+ if(xi<0) printf("\n****** strsv_ltn_mn_libstr : xi<0 : %d<0 *****\n", xi);
+ if(zi<0) printf("\n****** strsv_ltn_mn_libstr : zi<0 : %d<0 *****\n", zi);
+ // inside matrix
+ // A: m x k
+ if(ai+m > sA->m) printf("\n***** strsv_ltn_mn_libstr : ai+m > row(A) : %d+%d > %d *****\n", ai, m, sA->m);
+ if(aj+n > sA->n) printf("\n***** strsv_ltn_mn_libstr : aj+n > col(A) : %d+%d > %d *****\n", aj, n, sA->n);
+ // x: m
+ if(xi+m > sx->m) printf("\n***** strsv_ltn_mn_libstr : xi+m > size(x) : %d+%d > %d *****\n", xi, m, sx->m);
+ // z: m
+ if(zi+m > sz->m) printf("\n***** strsv_ltn_mn_libstr : zi+m > size(z) : %d+%d > %d *****\n", zi, m, sz->m);
+#endif
+
+ if(ai!=0)
+ {
+ printf("\nstrsv_ltn_mn_libstr: feature not implemented yet: ai=%d\n", ai);
+ exit(1);
+ }
+
+ const int bs = 4;
+
+ int sda = sA->cn;
+ float *pA = sA->pA + aj*bs; // TODO ai
+ float *dA = sA->dA;
+ float *x = sx->pa + xi;
+ float *z = sz->pa + zi;
+
+ int ii;
+
+ if(ai==0 & aj==0)
+ {
+ if(sA->use_dA!=1)
+ {
+ sdiaex_lib(n, 1.0, ai, pA, sda, dA);
+ for(ii=0; ii<n; ii++)
+ dA[ii] = 1.0 / dA[ii];
+ sA->use_dA = 1;
+ }
+ }
+ else
+ {
+ sdiaex_lib(n, 1.0, ai, pA, sda, dA);
+ for(ii=0; ii<n; ii++)
+ dA[ii] = 1.0 / dA[ii];
+ sA->use_dA = 0;
+ }
+
+ if(n>m)
+ n = m;
+
+ int i;
+
+ if(x!=z)
+ for(i=0; i<m; i++)
+ z[i] = x[i];
+
+ i=0;
+ if(n%4==1)
+ {
+ kernel_strsv_lt_inv_1_lib4(m-n+i+1, &pA[n/bs*bs*sda+(n-i-1)*bs], sda, &dA[n-i-1], &z[n-i-1], &z[n-i-1], &z[n-i-1]);
+ i++;
+ }
+ else if(n%4==2)
+ {
+ kernel_strsv_lt_inv_2_lib4(m-n+i+2, &pA[n/bs*bs*sda+(n-i-2)*bs], sda, &dA[n-i-2], &z[n-i-2], &z[n-i-2], &z[n-i-2]);
+ i+=2;
+ }
+ else if(n%4==3)
+ {
+ kernel_strsv_lt_inv_3_lib4(m-n+i+3, &pA[n/bs*bs*sda+(n-i-3)*bs], sda, &dA[n-i-3], &z[n-i-3], &z[n-i-3], &z[n-i-3]);
+ i+=3;
+ }
+ for(; i<n-3; i+=4)
+ {
+ kernel_strsv_lt_inv_4_lib4(m-n+i+4, &pA[(n-i-4)/bs*bs*sda+(n-i-4)*bs], sda, &dA[n-i-4], &z[n-i-4], &z[n-i-4], &z[n-i-4]);
+ }
+
+ return;
+
+ }
+
+
+
+void strsv_lnu_libstr(int m, struct s_strmat *sA, int ai, int aj, struct s_strvec *sx, int xi, struct s_strvec *sz, int zi)
+ {
+ if(m==0)
+ return;
+#if defined(DIM_CHECK)
+ // non-negative size
+ if(m<0) printf("\n****** strsv_lnu_libstr : m<0 : %d<0 *****\n", m);
+ // non-negative offset
+ if(ai<0) printf("\n****** strsv_lnu_libstr : ai<0 : %d<0 *****\n", ai);
+ if(aj<0) printf("\n****** strsv_lnu_libstr : aj<0 : %d<0 *****\n", aj);
+ if(xi<0) printf("\n****** strsv_lnu_libstr : xi<0 : %d<0 *****\n", xi);
+ if(zi<0) printf("\n****** strsv_lnu_libstr : zi<0 : %d<0 *****\n", zi);
+ // inside matrix
+ // A: m x k
+ if(ai+m > sA->m) printf("\n***** strsv_lnu_libstr : ai+m > row(A) : %d+%d > %d *****\n", ai, m, sA->m);
+ if(aj+m > sA->n) printf("\n***** strsv_lnu_libstr : aj+m > col(A) : %d+%d > %d *****\n", aj, m, sA->n);
+ // x: m
+ if(xi+m > sx->m) printf("\n***** strsv_lnu_libstr : xi+m > size(x) : %d+%d > %d *****\n", xi, m, sx->m);
+ // z: m
+ if(zi+m > sz->m) printf("\n***** strsv_lnu_libstr : zi+m > size(z) : %d+%d > %d *****\n", zi, m, sz->m);
+#endif
+ printf("\n***** strsv_lnu_libstr : feature not implemented yet *****\n");
+ exit(1);
+ }
+
+
+
+void strsv_ltu_libstr(int m, struct s_strmat *sA, int ai, int aj, struct s_strvec *sx, int xi, struct s_strvec *sz, int zi)
+ {
+ if(m==0)
+ return;
+#if defined(DIM_CHECK)
+ // non-negative size
+ if(m<0) printf("\n****** strsv_ltu_libstr : m<0 : %d<0 *****\n", m);
+ // non-negative offset
+ if(ai<0) printf("\n****** strsv_ltu_libstr : ai<0 : %d<0 *****\n", ai);
+ if(aj<0) printf("\n****** strsv_ltu_libstr : aj<0 : %d<0 *****\n", aj);
+ if(xi<0) printf("\n****** strsv_ltu_libstr : xi<0 : %d<0 *****\n", xi);
+ if(zi<0) printf("\n****** strsv_ltu_libstr : zi<0 : %d<0 *****\n", zi);
+ // inside matrix
+ // A: m x k
+ if(ai+m > sA->m) printf("\n***** strsv_ltu_libstr : ai+m > row(A) : %d+%d > %d *****\n", ai, m, sA->m);
+ if(aj+m > sA->n) printf("\n***** strsv_ltu_libstr : aj+m > col(A) : %d+%d > %d *****\n", aj, m, sA->n);
+ // x: m
+ if(xi+m > sx->m) printf("\n***** strsv_ltu_libstr : xi+m > size(x) : %d+%d > %d *****\n", xi, m, sx->m);
+ // z: m
+ if(zi+m > sz->m) printf("\n***** strsv_ltu_libstr : zi+m > size(z) : %d+%d > %d *****\n", zi, m, sz->m);
+#endif
+ printf("\n***** strsv_ltu_libstr : feature not implemented yet *****\n");
+ exit(1);
+ }
+
+
+
+void strsv_unn_libstr(int m, struct s_strmat *sA, int ai, int aj, struct s_strvec *sx, int xi, struct s_strvec *sz, int zi)
+ {
+ if(m==0)
+ return;
+#if defined(DIM_CHECK)
+ // non-negative size
+ if(m<0) printf("\n****** strsv_unn_libstr : m<0 : %d<0 *****\n", m);
+ // non-negative offset
+ if(ai<0) printf("\n****** strsv_unn_libstr : ai<0 : %d<0 *****\n", ai);
+ if(aj<0) printf("\n****** strsv_unn_libstr : aj<0 : %d<0 *****\n", aj);
+ if(xi<0) printf("\n****** strsv_unn_libstr : xi<0 : %d<0 *****\n", xi);
+ if(zi<0) printf("\n****** strsv_unn_libstr : zi<0 : %d<0 *****\n", zi);
+ // inside matrix
+ // A: m x k
+ if(ai+m > sA->m) printf("\n***** strsv_unn_libstr : ai+m > row(A) : %d+%d > %d *****\n", ai, m, sA->m);
+ if(aj+m > sA->n) printf("\n***** strsv_unn_libstr : aj+m > col(A) : %d+%d > %d *****\n", aj, m, sA->n);
+ // x: m
+ if(xi+m > sx->m) printf("\n***** strsv_unn_libstr : xi+m > size(x) : %d+%d > %d *****\n", xi, m, sx->m);
+ // z: m
+ if(zi+m > sz->m) printf("\n***** strsv_unn_libstr : zi+m > size(z) : %d+%d > %d *****\n", zi, m, sz->m);
+#endif
+ printf("\n***** strsv_unn_libstr : feature not implemented yet *****\n");
+ exit(1);
+ }
+
+
+
+void strsv_utn_libstr(int m, struct s_strmat *sA, int ai, int aj, struct s_strvec *sx, int xi, struct s_strvec *sz, int zi)
+ {
+ if(m==0)
+ return;
+#if defined(DIM_CHECK)
+ // non-negative size
+ if(m<0) printf("\n****** strsv_utn_libstr : m<0 : %d<0 *****\n", m);
+ // non-negative offset
+ if(ai<0) printf("\n****** strsv_utn_libstr : ai<0 : %d<0 *****\n", ai);
+ if(aj<0) printf("\n****** strsv_utn_libstr : aj<0 : %d<0 *****\n", aj);
+ if(xi<0) printf("\n****** strsv_utn_libstr : xi<0 : %d<0 *****\n", xi);
+ if(zi<0) printf("\n****** strsv_utn_libstr : zi<0 : %d<0 *****\n", zi);
+ // inside matrix
+ // A: m x k
+ if(ai+m > sA->m) printf("\n***** strsv_utn_libstr : ai+m > row(A) : %d+%d > %d *****\n", ai, m, sA->m);
+ if(aj+m > sA->n) printf("\n***** strsv_utn_libstr : aj+m > col(A) : %d+%d > %d *****\n", aj, m, sA->n);
+ // x: m
+ if(xi+m > sx->m) printf("\n***** strsv_utn_libstr : xi+m > size(x) : %d+%d > %d *****\n", xi, m, sx->m);
+ // z: m
+ if(zi+m > sz->m) printf("\n***** strsv_utn_libstr : zi+m > size(z) : %d+%d > %d *****\n", zi, m, sz->m);
+#endif
+ printf("\n***** strsv_utn_libstr : feature not implemented yet *****\n");
+ exit(1);
+ }
+
+
+
+#else
+
+#error : wrong LA choice
+
+#endif
diff --git a/blas/s_blas2_lib8.c b/blas/s_blas2_lib8.c
new file mode 100644
index 0000000..41a78c4
--- /dev/null
+++ b/blas/s_blas2_lib8.c
@@ -0,0 +1,1008 @@
+/**************************************************************************************************
+* *
+* This file is part of BLASFEO. *
+* *
+* BLASFEO -- BLAS For Embedded Optimization. *
+* Copyright (C) 2016-2017 by Gianluca Frison. *
+* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl. *
+* All rights reserved. *
+* *
+* HPMPC is free software; you can redistribute it and/or *
+* modify it under the terms of the GNU Lesser General Public *
+* License as published by the Free Software Foundation; either *
+* version 2.1 of the License, or (at your option) any later version. *
+* *
+* HPMPC is distributed in the hope that it will be useful, *
+* but WITHOUT ANY WARRANTY; without even the implied warranty of *
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. *
+* See the GNU Lesser General Public License for more details. *
+* *
+* You should have received a copy of the GNU Lesser General Public *
+* License along with HPMPC; if not, write to the Free Software *
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA *
+* *
+* Author: Gianluca Frison, giaf (at) dtu.dk *
+* gianluca.frison (at) imtek.uni-freiburg.de *
+* *
+**************************************************************************************************/
+
+#include <stdlib.h>
+#include <stdio.h>
+
+#include "../include/blasfeo_common.h"
+#include "../include/blasfeo_s_kernel.h"
+#include "../include/blasfeo_s_aux.h"
+
+
+
+#if defined(LA_HIGH_PERFORMANCE)
+
+
+
+void sgemv_n_libstr(int m, int n, float alpha, struct s_strmat *sA, int ai, int aj, struct s_strvec *sx, int xi, float beta, struct s_strvec *sy, int yi, struct s_strvec *sz, int zi)
+ {
+
+ if(m<0)
+ return;
+
+ const int bs = 8;
+
+ int i;
+
+ int sda = sA->cn;
+ float *pA = sA->pA + aj*bs + ai/bs*bs*sda;
+ float *x = sx->pa + xi;
+ float *y = sy->pa + yi;
+ float *z = sz->pa + zi;
+
+ i = 0;
+ // clean up at the beginning
+ if(ai%bs!=0)
+ {
+ kernel_sgemv_n_8_gen_lib8(n, &alpha, pA, x, &beta, y-ai%bs, z-ai%bs, ai%bs, m+ai%bs);
+ pA += bs*sda;
+ y += 8 - ai%bs;
+ z += 8 - ai%bs;
+ m -= 8 - ai%bs;
+ }
+ // main loop
+ for( ; i<m-7; i+=8)
+ {
+ kernel_sgemv_n_8_lib8(n, &alpha, &pA[i*sda], x, &beta, &y[i], &z[i]);
+ }
+ if(i<m)
+ {
+ kernel_sgemv_n_8_vs_lib8(n, &alpha, &pA[i*sda], x, &beta, &y[i], &z[i], m-i);
+ }
+
+ return;
+
+ }
+
+
+
+void sgemv_t_libstr(int m, int n, float alpha, struct s_strmat *sA, int ai, int aj, struct s_strvec *sx, int xi, float beta, struct s_strvec *sy, int yi, struct s_strvec *sz, int zi)
+ {
+
+ if(n<=0)
+ return;
+
+ const int bs = 8;
+
+ int i;
+
+ int sda = sA->cn;
+ float *pA = sA->pA + aj*bs + ai/bs*bs*sda + ai%bs;
+ float *x = sx->pa + xi;
+ float *y = sy->pa + yi;
+ float *z = sz->pa + zi;
+
+ if(ai%bs==0)
+ {
+ i = 0;
+ for( ; i<n-7; i+=8)
+ {
+ kernel_sgemv_t_8_lib8(m, &alpha, &pA[i*bs], sda, x, &beta, &y[i], &z[i]);
+ }
+ if(i<n)
+ {
+ if(n-i<=4)
+ {
+ kernel_sgemv_t_4_vs_lib8(m, &alpha, &pA[i*bs], sda, x, &beta, &y[i], &z[i], n-i);
+ }
+ else
+ {
+ kernel_sgemv_t_8_vs_lib8(m, &alpha, &pA[i*bs], sda, x, &beta, &y[i], &z[i], n-i);
+ }
+ }
+ }
+ else
+ {
+ i = 0;
+ for( ; i<n-4; i+=8)
+ {
+ kernel_sgemv_t_8_gen_lib8(m, &alpha, ai%bs, &pA[i*bs], sda, x, &beta, &y[i], &z[i], n-i);
+ }
+ if(i<n)
+ {
+ kernel_sgemv_t_4_gen_lib8(m, &alpha, ai%bs, &pA[i*bs], sda, x, &beta, &y[i], &z[i], n-i);
+ }
+ }
+
+ return;
+
+ }
+
+
+
+// m >= n
+void strmv_lnn_libstr(int m, int n, struct s_strmat *sA, int ai, int aj, struct s_strvec *sx, int xi, struct s_strvec *sz, int zi)
+ {
+
+ if(m<=0)
+ return;
+
+ const int bs = 8;
+
+ int sda = sA->cn;
+ float *pA = sA->pA + aj*bs + ai/bs*bs*sda + ai%bs;
+ float *x = sx->pa + xi;
+ float *z = sz->pa + zi;
+
+ if(m-n>0)
+ sgemv_n_libstr(m-n, n, 1.0, sA, ai+n, aj, sx, xi, 0.0, sz, zi+n, sz, zi+n);
+
+ float *pA2 = pA;
+ float *z2 = z;
+ int m2 = n;
+ int n2 = 0;
+ float *pA3, *x3;
+
+ float alpha = 1.0;
+ float beta = 1.0;
+
+ float zt[8];
+
+ int ii, jj, jj_end;
+
+ ii = 0;
+
+ if(ai%bs!=0)
+ {
+ pA2 += sda*bs - ai%bs;
+ z2 += bs-ai%bs;
+ m2 -= bs-ai%bs;
+ n2 += bs-ai%bs;
+ }
+
+ pA2 += m2/bs*bs*sda;
+ z2 += m2/bs*bs;
+ n2 += m2/bs*bs;
+
+ if(m2%bs!=0)
+ {
+ //
+ pA3 = pA2 + bs*n2;
+ x3 = x + n2;
+ zt[7] = pA3[7+bs*0]*x3[0] + pA3[7+bs*1]*x3[1] + pA3[7+bs*2]*x3[2] + pA3[7+bs*3]*x3[3] + pA3[7+bs*4]*x3[4] + pA3[7+bs*5]*x3[5] + pA3[7+bs*6]*x3[6] + pA3[7+bs*7]*x3[7];
+ zt[6] = pA3[6+bs*0]*x3[0] + pA3[6+bs*1]*x3[1] + pA3[6+bs*2]*x3[2] + pA3[6+bs*3]*x3[3] + pA3[6+bs*4]*x3[4] + pA3[6+bs*5]*x3[5] + pA3[6+bs*6]*x3[6];
+ zt[5] = pA3[5+bs*0]*x3[0] + pA3[5+bs*1]*x3[1] + pA3[5+bs*2]*x3[2] + pA3[5+bs*3]*x3[3] + pA3[5+bs*4]*x3[4] + pA3[5+bs*5]*x3[5];
+ zt[4] = pA3[4+bs*0]*x3[0] + pA3[4+bs*1]*x3[1] + pA3[4+bs*2]*x3[2] + pA3[4+bs*3]*x3[3] + pA3[4+bs*4]*x3[4];
+ zt[3] = pA3[3+bs*0]*x3[0] + pA3[3+bs*1]*x3[1] + pA3[3+bs*2]*x3[2] + pA3[3+bs*3]*x3[3];
+ zt[2] = pA3[2+bs*0]*x3[0] + pA3[2+bs*1]*x3[1] + pA3[2+bs*2]*x3[2];
+ zt[1] = pA3[1+bs*0]*x3[0] + pA3[1+bs*1]*x3[1];
+ zt[0] = pA3[0+bs*0]*x3[0];
+ kernel_sgemv_n_8_lib8(n2, &alpha, pA2, x, &beta, zt, zt);
+ for(jj=0; jj<m2%bs; jj++)
+ z2[jj] = zt[jj];
+ }
+ for(; ii<m2-7; ii+=8)
+ {
+ pA2 -= bs*sda;
+ z2 -= 8;
+ n2 -= 8;
+ pA3 = pA2 + bs*n2;
+ x3 = x + n2;
+ z2[7] = pA3[7+bs*0]*x3[0] + pA3[7+bs*1]*x3[1] + pA3[7+bs*2]*x3[2] + pA3[7+bs*3]*x3[3] + pA3[7+bs*4]*x3[4] + pA3[7+bs*5]*x3[5] + pA3[7+bs*6]*x3[6] + pA3[7+bs*7]*x3[7];
+ z2[6] = pA3[6+bs*0]*x3[0] + pA3[6+bs*1]*x3[1] + pA3[6+bs*2]*x3[2] + pA3[6+bs*3]*x3[3] + pA3[6+bs*4]*x3[4] + pA3[6+bs*5]*x3[5] + pA3[6+bs*6]*x3[6];
+ z2[5] = pA3[5+bs*0]*x3[0] + pA3[5+bs*1]*x3[1] + pA3[5+bs*2]*x3[2] + pA3[5+bs*3]*x3[3] + pA3[5+bs*4]*x3[4] + pA3[5+bs*5]*x3[5];
+ z2[4] = pA3[4+bs*0]*x3[0] + pA3[4+bs*1]*x3[1] + pA3[4+bs*2]*x3[2] + pA3[4+bs*3]*x3[3] + pA3[4+bs*4]*x3[4];
+ z2[3] = pA3[3+bs*0]*x3[0] + pA3[3+bs*1]*x3[1] + pA3[3+bs*2]*x3[2] + pA3[3+bs*3]*x3[3];
+ z2[2] = pA3[2+bs*0]*x3[0] + pA3[2+bs*1]*x3[1] + pA3[2+bs*2]*x3[2];
+ z2[1] = pA3[1+bs*0]*x3[0] + pA3[1+bs*1]*x3[1];
+ z2[0] = pA3[0+bs*0]*x3[0];
+ kernel_sgemv_n_8_lib8(n2, &alpha, pA2, x, &beta, z2, z2);
+ }
+ if(ai%bs!=0)
+ {
+ if(ai%bs==1)
+ {
+ zt[6] = pA[6+bs*0]*x[0] + pA[6+bs*1]*x[1] + pA[6+bs*2]*x[2] + pA[6+bs*3]*x[3] + pA[6+bs*4]*x[4] + pA[6+bs*5]*x[5] + pA[6+bs*6]*x[6];
+ zt[5] = pA[5+bs*0]*x[0] + pA[5+bs*1]*x[1] + pA[5+bs*2]*x[2] + pA[5+bs*3]*x[3] + pA[5+bs*4]*x[4] + pA[5+bs*5]*x[5];
+ zt[4] = pA[4+bs*0]*x[0] + pA[4+bs*1]*x[1] + pA[4+bs*2]*x[2] + pA[4+bs*3]*x[3] + pA[4+bs*4]*x[4];
+ zt[3] = pA[3+bs*0]*x[0] + pA[3+bs*1]*x[1] + pA[3+bs*2]*x[2] + pA[3+bs*3]*x[3];
+ zt[2] = pA[2+bs*0]*x[0] + pA[2+bs*1]*x[1] + pA[2+bs*2]*x[2];
+ zt[1] = pA[1+bs*0]*x[0] + pA[1+bs*1]*x[1];
+ zt[0] = pA[0+bs*0]*x[0];
+ jj_end = 8-ai%bs<n ? 8-ai%bs : n;
+ for(jj=0; jj<jj_end; jj++)
+ z[jj] = zt[jj];
+ }
+ else if(ai%bs==2)
+ {
+ zt[5] = pA[5+bs*0]*x[0] + pA[5+bs*1]*x[1] + pA[5+bs*2]*x[2] + pA[5+bs*3]*x[3] + pA[5+bs*4]*x[4] + pA[5+bs*5]*x[5];
+ zt[4] = pA[4+bs*0]*x[0] + pA[4+bs*1]*x[1] + pA[4+bs*2]*x[2] + pA[4+bs*3]*x[3] + pA[4+bs*4]*x[4];
+ zt[3] = pA[3+bs*0]*x[0] + pA[3+bs*1]*x[1] + pA[3+bs*2]*x[2] + pA[3+bs*3]*x[3];
+ zt[2] = pA[2+bs*0]*x[0] + pA[2+bs*1]*x[1] + pA[2+bs*2]*x[2];
+ zt[1] = pA[1+bs*0]*x[0] + pA[1+bs*1]*x[1];
+ zt[0] = pA[0+bs*0]*x[0];
+ jj_end = 8-ai%bs<n ? 8-ai%bs : n;
+ for(jj=0; jj<jj_end; jj++)
+ z[jj] = zt[jj];
+ }
+ else if(ai%bs==3)
+ {
+ zt[4] = pA[4+bs*0]*x[0] + pA[4+bs*1]*x[1] + pA[4+bs*2]*x[2] + pA[4+bs*3]*x[3] + pA[4+bs*4]*x[4];
+ zt[3] = pA[3+bs*0]*x[0] + pA[3+bs*1]*x[1] + pA[3+bs*2]*x[2] + pA[3+bs*3]*x[3];
+ zt[2] = pA[2+bs*0]*x[0] + pA[2+bs*1]*x[1] + pA[2+bs*2]*x[2];
+ zt[1] = pA[1+bs*0]*x[0] + pA[1+bs*1]*x[1];
+ zt[0] = pA[0+bs*0]*x[0];
+ jj_end = 8-ai%bs<n ? 8-ai%bs : n;
+ for(jj=0; jj<jj_end; jj++)
+ z[jj] = zt[jj];
+ }
+ else if(ai%bs==4)
+ {
+ zt[3] = pA[3+bs*0]*x[0] + pA[3+bs*1]*x[1] + pA[3+bs*2]*x[2] + pA[3+bs*3]*x[3];
+ zt[2] = pA[2+bs*0]*x[0] + pA[2+bs*1]*x[1] + pA[2+bs*2]*x[2];
+ zt[1] = pA[1+bs*0]*x[0] + pA[1+bs*1]*x[1];
+ zt[0] = pA[0+bs*0]*x[0];
+ jj_end = 8-ai%bs<n ? 8-ai%bs : n;
+ for(jj=0; jj<jj_end; jj++)
+ z[jj] = zt[jj];
+ }
+ else if(ai%bs==5)
+ {
+ zt[2] = pA[2+bs*0]*x[0] + pA[2+bs*1]*x[1] + pA[2+bs*2]*x[2];
+ zt[1] = pA[1+bs*0]*x[0] + pA[1+bs*1]*x[1];
+ zt[0] = pA[0+bs*0]*x[0];
+ jj_end = 8-ai%bs<n ? 8-ai%bs : n;
+ for(jj=0; jj<jj_end; jj++)
+ z[jj] = zt[jj];
+ }
+ else if(ai%bs==6)
+ {
+ zt[1] = pA[1+bs*0]*x[0] + pA[1+bs*1]*x[1];
+ zt[0] = pA[0+bs*0]*x[0];
+ jj_end = 8-ai%bs<n ? 8-ai%bs : n;
+ for(jj=0; jj<jj_end; jj++)
+ z[jj] = zt[jj];
+ }
+ else // if (ai%bs==7)
+ {
+ z[0] = pA[0+bs*0]*x[0];
+ }
+ }
+
+ return;
+
+ }
+
+
+
+// m >= n
+void strmv_ltn_libstr(int m, int n, struct s_strmat *sA, int ai, int aj, struct s_strvec *sx, int xi, struct s_strvec *sz, int zi)
+ {
+
+ if(m<=0)
+ return;
+
+ const int bs = 8;
+
+ int sda = sA->cn;
+ float *pA = sA->pA + aj*bs + ai/bs*bs*sda + ai%bs;
+ float *x = sx->pa + xi;
+ float *z = sz->pa + zi;
+
+ float xt[8];
+ float zt[8];
+
+ float alpha = 1.0;
+ float beta = 1.0;
+
+ int ii, jj, ll, ll_max;
+
+ jj = 0;
+
+ if(ai%bs!=0)
+ {
+
+ if(ai%bs==1)
+ {
+ ll_max = m-jj<7 ? m-jj : 7;
+ for(ll=0; ll<ll_max; ll++)
+ xt[ll] = x[ll];
+ for(; ll<7; ll++)
+ xt[ll] = 0.0;
+ zt[0] = pA[0+bs*0]*xt[0] + pA[1+bs*0]*xt[1] + pA[2+bs*0]*xt[2] + pA[3+bs*0]*xt[3] + pA[4+bs*0]*xt[4] + pA[5+bs*0]*xt[5] + pA[6+bs*0]*xt[6];
+ zt[1] = pA[1+bs*1]*xt[1] + pA[2+bs*1]*xt[2] + pA[3+bs*1]*xt[3] + pA[4+bs*1]*xt[4] + pA[5+bs*1]*xt[5] + pA[6+bs*1]*xt[6];
+ zt[2] = pA[2+bs*2]*xt[2] + pA[3+bs*2]*xt[3] + pA[4+bs*2]*xt[4] + pA[5+bs*2]*xt[5] + pA[6+bs*2]*xt[6];
+ zt[3] = pA[3+bs*3]*xt[3] + pA[4+bs*3]*xt[4] + pA[5+bs*3]*xt[5] + pA[6+bs*3]*xt[6];
+ zt[4] = pA[4+bs*4]*xt[4] + pA[5+bs*4]*xt[5] + pA[6+bs*4]*xt[6];
+ zt[5] = pA[5+bs*5]*xt[5] + pA[6+bs*5]*xt[6];
+ zt[6] = pA[6+bs*6]*xt[6];
+ pA += bs*sda - 1;
+ x += 7;
+ kernel_sgemv_t_8_lib8(m-7-jj, &alpha, pA, sda, x, &beta, zt, zt);
+ ll_max = n-jj<7 ? n-jj : 7;
+ for(ll=0; ll<ll_max; ll++)
+ z[ll] = zt[ll];
+ pA += bs*7;
+ z += 7;
+ jj += 7;
+ }
+ else if(ai%bs==2)
+ {
+ ll_max = m-jj<6 ? m-jj : 6;
+ for(ll=0; ll<ll_max; ll++)
+ xt[ll] = x[ll];
+ for(; ll<6; ll++)
+ xt[ll] = 0.0;
+ zt[0] = pA[0+bs*0]*xt[0] + pA[1+bs*0]*xt[1] + pA[2+bs*0]*xt[2] + pA[3+bs*0]*xt[3] + pA[4+bs*0]*xt[4] + pA[5+bs*0]*xt[5];
+ zt[1] = pA[1+bs*1]*xt[1] + pA[2+bs*1]*xt[2] + pA[3+bs*1]*xt[3] + pA[4+bs*1]*xt[4] + pA[5+bs*1]*xt[5];
+ zt[2] = pA[2+bs*2]*xt[2] + pA[3+bs*2]*xt[3] + pA[4+bs*2]*xt[4] + pA[5+bs*2]*xt[5];
+ zt[3] = pA[3+bs*3]*xt[3] + pA[4+bs*3]*xt[4] + pA[5+bs*3]*xt[5];
+ zt[4] = pA[4+bs*4]*xt[4] + pA[5+bs*4]*xt[5];
+ zt[5] = pA[5+bs*5]*xt[5];
+ pA += bs*sda - 2;
+ x += 6;
+ kernel_sgemv_t_8_lib8(m-6-jj, &alpha, pA, sda, x, &beta, zt, zt);
+ ll_max = n-jj<6 ? n-jj : 6;
+ for(ll=0; ll<ll_max; ll++)
+ z[ll] = zt[ll];
+ pA += bs*6;
+ z += 6;
+ jj += 6;
+ }
+ else if(ai%bs==3)
+ {
+ ll_max = m-jj<5 ? m-jj : 5;
+ for(ll=0; ll<ll_max; ll++)
+ xt[ll] = x[ll];
+ for(; ll<5; ll++)
+ xt[ll] = 0.0;
+ zt[0] = pA[0+bs*0]*xt[0] + pA[1+bs*0]*xt[1] + pA[2+bs*0]*xt[2] + pA[3+bs*0]*xt[3] + pA[4+bs*0]*xt[4];
+ zt[1] = pA[1+bs*1]*xt[1] + pA[2+bs*1]*xt[2] + pA[3+bs*1]*xt[3] + pA[4+bs*1]*xt[4];
+ zt[2] = pA[2+bs*2]*xt[2] + pA[3+bs*2]*xt[3] + pA[4+bs*2]*xt[4];
+ zt[3] = pA[3+bs*3]*xt[3] + pA[4+bs*3]*xt[4];
+ zt[4] = pA[4+bs*4]*xt[4];
+ pA += bs*sda - 3;
+ x += 5;
+ kernel_sgemv_t_8_lib8(m-5-jj, &alpha, pA, sda, x, &beta, zt, zt);
+ ll_max = n-jj<5 ? n-jj : 5;
+ for(ll=0; ll<ll_max; ll++)
+ z[ll] = zt[ll];
+ pA += bs*5;
+ z += 5;
+ jj += 5;
+ }
+ else if(ai%bs==4)
+ {
+ ll_max = m-jj<4 ? m-jj : 4;
+ for(ll=0; ll<ll_max; ll++)
+ xt[ll] = x[ll];
+ for(; ll<4; ll++)
+ xt[ll] = 0.0;
+ zt[0] = pA[0+bs*0]*xt[0] + pA[1+bs*0]*xt[1] + pA[2+bs*0]*xt[2] + pA[3+bs*0]*xt[3];
+ zt[1] = pA[1+bs*1]*xt[1] + pA[2+bs*1]*xt[2] + pA[3+bs*1]*xt[3];
+ zt[2] = pA[2+bs*2]*xt[2] + pA[3+bs*2]*xt[3];
+ zt[3] = pA[3+bs*3]*xt[3];
+ pA += bs*sda - 4;
+ x += 4;
+ kernel_sgemv_t_8_lib8(m-4-jj, &alpha, pA, sda, x, &beta, zt, zt);
+ ll_max = n-jj<4 ? n-jj : 4;
+ for(ll=0; ll<ll_max; ll++)
+ z[ll] = zt[ll];
+ pA += bs*4;
+ z += 4;
+ jj += 4;
+ }
+ else if(ai%bs==5)
+ {
+ ll_max = m-jj<3 ? m-jj : 3;
+ for(ll=0; ll<ll_max; ll++)
+ xt[ll] = x[ll];
+ for(; ll<3; ll++)
+ xt[ll] = 0.0;
+ zt[0] = pA[0+bs*0]*xt[0] + pA[1+bs*0]*xt[1] + pA[2+bs*0]*xt[2];
+ zt[1] = pA[1+bs*1]*xt[1] + pA[2+bs*1]*xt[2];
+ zt[2] = pA[2+bs*2]*xt[2];
+ pA += bs*sda - 5;
+ x += 3;
+ kernel_sgemv_t_8_lib8(m-3-jj, &alpha, pA, sda, x, &beta, zt, zt);
+ ll_max = n-jj<3 ? n-jj : 3;
+ for(ll=0; ll<ll_max; ll++)
+ z[ll] = zt[ll];
+ pA += bs*3;
+ z += 3;
+ jj += 3;
+ }
+ else if(ai%bs==6)
+ {
+ ll_max = m-jj<2 ? m-jj : 2;
+ for(ll=0; ll<ll_max; ll++)
+ xt[ll] = x[ll];
+ for(; ll<2; ll++)
+ xt[ll] = 0.0;
+ zt[0] = pA[0+bs*0]*xt[0] + pA[1+bs*0]*xt[1];
+ zt[1] = pA[1+bs*1]*xt[1];
+ pA += bs*sda - 6;
+ x += 2;
+ kernel_sgemv_t_8_lib8(m-2-jj, &alpha, pA, sda, x, &beta, zt, zt);
+ ll_max = n-jj<2 ? n-jj : 2;
+ for(ll=0; ll<ll_max; ll++)
+ z[ll] = zt[ll];
+ pA += bs*2;
+ z += 2;
+ jj += 2;
+ }
+ else // if(ai%bs==7)
+ {
+ ll_max = m-jj<1 ? m-jj : 1;
+ for(ll=0; ll<ll_max; ll++)
+ xt[ll] = x[ll];
+ for(; ll<1; ll++)
+ xt[ll] = 0.0;
+ zt[0] = pA[0+bs*0]*xt[0];
+ pA += bs*sda - 7;
+ x += 1;
+ kernel_sgemv_t_8_lib8(m-1-jj, &alpha, pA, sda, x, &beta, zt, zt);
+ ll_max = n-jj<1 ? n-jj : 1;
+ for(ll=0; ll<ll_max; ll++)
+ z[ll] = zt[ll];
+ pA += bs*1;
+ z += 1;
+ jj += 1;
+ }
+
+ }
+
+ for(; jj<n-7; jj+=8)
+ {
+ zt[0] = pA[0+bs*0]*x[0] + pA[1+bs*0]*x[1] + pA[2+bs*0]*x[2] + pA[3+bs*0]*x[3] + pA[4+bs*0]*x[4] + pA[5+bs*0]*x[5] + pA[6+bs*0]*x[6] + pA[7+bs*0]*x[7];
+ zt[1] = pA[1+bs*1]*x[1] + pA[2+bs*1]*x[2] + pA[3+bs*1]*x[3] + pA[4+bs*1]*x[4] + pA[5+bs*1]*x[5] + pA[6+bs*1]*x[6] + pA[7+bs*1]*x[7];
+ zt[2] = pA[2+bs*2]*x[2] + pA[3+bs*2]*x[3] + pA[4+bs*2]*x[4] + pA[5+bs*2]*x[5] + pA[6+bs*2]*x[6] + pA[7+bs*2]*x[7];
+ zt[3] = pA[3+bs*3]*x[3] + pA[4+bs*3]*x[4] + pA[5+bs*3]*x[5] + pA[6+bs*3]*x[6] + pA[7+bs*3]*x[7];
+ zt[4] = pA[4+bs*4]*x[4] + pA[5+bs*4]*x[5] + pA[6+bs*4]*x[6] + pA[7+bs*4]*x[7];
+ zt[5] = pA[5+bs*5]*x[5] + pA[6+bs*5]*x[6] + pA[7+bs*5]*x[7];
+ zt[6] = pA[6+bs*6]*x[6] + pA[7+bs*6]*x[7];
+ zt[7] = pA[7+bs*7]*x[7];
+ pA += bs*sda;
+ x += 8;
+ kernel_sgemv_t_8_lib8(m-8-jj, &alpha, pA, sda, x, &beta, zt, z);
+ pA += bs*8;
+ z += 8;
+ }
+ if(jj<n)
+ {
+ if(n-jj<=4)
+ {
+ ll_max = m-jj<4 ? m-jj : 4;
+ for(ll=0; ll<ll_max; ll++)
+ xt[ll] = x[ll];
+ for(; ll<4; ll++)
+ xt[ll] = 0.0;
+ zt[0] = pA[0+bs*0]*xt[0] + pA[1+bs*0]*xt[1] + pA[2+bs*0]*xt[2] + pA[3+bs*0]*xt[3];
+ zt[1] = pA[1+bs*1]*xt[1] + pA[2+bs*1]*xt[2] + pA[3+bs*1]*xt[3];
+ zt[2] = pA[2+bs*2]*xt[2] + pA[3+bs*2]*xt[3];
+ zt[3] = pA[3+bs*3]*xt[3];
+ pA += bs*sda;
+ x += 4;
+ kernel_sgemv_t_4_lib8(m-4-jj, &alpha, pA, sda, x, &beta, zt, zt);
+ for(ll=0; ll<n-jj; ll++)
+ z[ll] = zt[ll];
+// pA += bs*4;
+// z += 4;
+ }
+ else
+ {
+ ll_max = m-jj<8 ? m-jj : 8;
+ for(ll=0; ll<ll_max; ll++)
+ xt[ll] = x[ll];
+ for(; ll<8; ll++)
+ xt[ll] = 0.0;
+ zt[0] = pA[0+bs*0]*xt[0] + pA[1+bs*0]*xt[1] + pA[2+bs*0]*xt[2] + pA[3+bs*0]*xt[3] + pA[4+bs*0]*xt[4] + pA[5+bs*0]*xt[5] + pA[6+bs*0]*xt[6] + pA[7+bs*0]*xt[7];
+ zt[1] = pA[1+bs*1]*xt[1] + pA[2+bs*1]*xt[2] + pA[3+bs*1]*xt[3] + pA[4+bs*1]*xt[4] + pA[5+bs*1]*xt[5] + pA[6+bs*1]*xt[6] + pA[7+bs*1]*xt[7];
+ zt[2] = pA[2+bs*2]*xt[2] + pA[3+bs*2]*xt[3] + pA[4+bs*2]*xt[4] + pA[5+bs*2]*xt[5] + pA[6+bs*2]*xt[6] + pA[7+bs*2]*xt[7];
+ zt[3] = pA[3+bs*3]*xt[3] + pA[4+bs*3]*xt[4] + pA[5+bs*3]*xt[5] + pA[6+bs*3]*xt[6] + pA[7+bs*3]*xt[7];
+ zt[4] = pA[4+bs*4]*xt[4] + pA[5+bs*4]*xt[5] + pA[6+bs*4]*xt[6] + pA[7+bs*4]*xt[7];
+ zt[5] = pA[5+bs*5]*xt[5] + pA[6+bs*5]*xt[6] + pA[7+bs*5]*xt[7];
+ zt[6] = pA[6+bs*6]*xt[6] + pA[7+bs*6]*xt[7];
+ zt[7] = pA[7+bs*7]*xt[7];
+ pA += bs*sda;
+ x += 8;
+ kernel_sgemv_t_8_lib8(m-8-jj, &alpha, pA, sda, x, &beta, zt, zt);
+ for(ll=0; ll<n-jj; ll++)
+ z[ll] = zt[ll];
+// pA += bs*8;
+// z += 8;
+ }
+ }
+
+ return;
+
+ }
+
+
+
+void strsv_lnn_libstr(int m, struct s_strmat *sA, int ai, int aj, struct s_strvec *sx, int xi, struct s_strvec *sz, int zi)
+ {
+
+ if(m==0)
+ return;
+
+#if defined(DIM_CHECK)
+ // non-negative size
+ if(m<0) printf("\n****** strsv_lnn_libstr : m<0 : %d<0 *****\n", m);
+ // non-negative offset
+ if(ai<0) printf("\n****** strsv_lnn_libstr : ai<0 : %d<0 *****\n", ai);
+ if(aj<0) printf("\n****** strsv_lnn_libstr : aj<0 : %d<0 *****\n", aj);
+ if(xi<0) printf("\n****** strsv_lnn_libstr : xi<0 : %d<0 *****\n", xi);
+ if(zi<0) printf("\n****** strsv_lnn_libstr : zi<0 : %d<0 *****\n", zi);
+ // inside matrix
+ // A: m x k
+ if(ai+m > sA->m) printf("\n***** strsv_lnn_libstr : ai+m > row(A) : %d+%d > %d *****\n", ai, m, sA->m);
+ if(aj+m > sA->n) printf("\n***** strsv_lnn_libstr : aj+m > col(A) : %d+%d > %d *****\n", aj, m, sA->n);
+ // x: m
+ if(xi+m > sx->m) printf("\n***** strsv_lnn_libstr : xi+m > size(x) : %d+%d > %d *****\n", xi, m, sx->m);
+ // z: m
+ if(zi+m > sz->m) printf("\n***** strsv_lnn_libstr : zi+m > size(z) : %d+%d > %d *****\n", zi, m, sz->m);
+#endif
+
+ if(ai!=0)
+ {
+ printf("\nstrsv_lnn_libstr: feature not implemented yet: ai=%d\n", ai);
+ exit(1);
+ }
+
+ const int bs = 8;
+
+ int sda = sA->cn;
+ float *pA = sA->pA + aj*bs; // TODO ai
+ float *dA = sA->dA;
+ float *x = sx->pa + xi;
+ float *z = sz->pa + zi;
+
+ int ii;
+
+ if(ai==0 & aj==0)
+ {
+ if(sA->use_dA!=1)
+ {
+ sdiaex_lib(m, 1.0, ai, pA, sda, dA);
+ for(ii=0; ii<m; ii++)
+ dA[ii] = 1.0 / dA[ii];
+ sA->use_dA = 1;
+ }
+ }
+ else
+ {
+ sdiaex_lib(m, 1.0, ai, pA, sda, dA);
+ for(ii=0; ii<m; ii++)
+ dA[ii] = 1.0 / dA[ii];
+ sA->use_dA = 0;
+ }
+
+ int i;
+
+ if(x!=z)
+ {
+ for(i=0; i<m; i++)
+ z[i] = x[i];
+ }
+
+ i = 0;
+ for( ; i<m-7; i+=8)
+ {
+ kernel_strsv_ln_inv_8_lib8(i, &pA[i*sda], &dA[i], z, &z[i], &z[i]);
+ }
+ if(i<m)
+ {
+ kernel_strsv_ln_inv_8_vs_lib8(i, &pA[i*sda], &dA[i], z, &z[i], &z[i], m-i, m-i);
+ i+=8;
+ }
+
+ return;
+
+ }
+
+
+
+void strsv_lnn_mn_libstr(int m, int n, struct s_strmat *sA, int ai, int aj, struct s_strvec *sx, int xi, struct s_strvec *sz, int zi)
+ {
+
+ if(m==0 | n==0)
+ return;
+
+#if defined(DIM_CHECK)
+ // non-negative size
+ if(m<0) printf("\n****** strsv_lnn_mn_libstr : m<0 : %d<0 *****\n", m);
+ if(n<0) printf("\n****** strsv_lnn_mn_libstr : n<0 : %d<0 *****\n", n);
+ // non-negative offset
+ if(ai<0) printf("\n****** strsv_lnn_mn_libstr : ai<0 : %d<0 *****\n", ai);
+ if(aj<0) printf("\n****** strsv_lnn_mn_libstr : aj<0 : %d<0 *****\n", aj);
+ if(xi<0) printf("\n****** strsv_lnn_mn_libstr : xi<0 : %d<0 *****\n", xi);
+ if(zi<0) printf("\n****** strsv_lnn_mn_libstr : zi<0 : %d<0 *****\n", zi);
+ // inside matrix
+ // A: m x k
+ if(ai+m > sA->m) printf("\n***** strsv_lnn_mn_libstr : ai+m > row(A) : %d+%d > %d *****\n", ai, m, sA->m);
+ if(aj+n > sA->n) printf("\n***** strsv_lnn_mn_libstr : aj+n > col(A) : %d+%d > %d *****\n", aj, n, sA->n);
+ // x: m
+ if(xi+m > sx->m) printf("\n***** strsv_lnn_mn_libstr : xi+m > size(x) : %d+%d > %d *****\n", xi, m, sx->m);
+ // z: m
+ if(zi+m > sz->m) printf("\n***** strsv_lnn_mn_libstr : zi+m > size(z) : %d+%d > %d *****\n", zi, m, sz->m);
+#endif
+
+ if(ai!=0)
+ {
+ printf("\nstrsv_lnn_mn_libstr: feature not implemented yet: ai=%d\n", ai);
+ exit(1);
+ }
+
+ const int bs = 8;
+
+ int sda = sA->cn;
+ float *pA = sA->pA + aj*bs; // TODO ai
+ float *dA = sA->dA;
+ float *x = sx->pa + xi;
+ float *z = sz->pa + zi;
+
+ int ii;
+
+ if(ai==0 & aj==0)
+ {
+ if(sA->use_dA!=1)
+ {
+ sdiaex_lib(n, 1.0, ai, pA, sda, dA);
+ for(ii=0; ii<n; ii++)
+ dA[ii] = 1.0 / dA[ii];
+ sA->use_dA = 1;
+ }
+ }
+ else
+ {
+ sdiaex_lib(n, 1.0, ai, pA, sda, dA);
+ for(ii=0; ii<n; ii++)
+ dA[ii] = 1.0 / dA[ii];
+ sA->use_dA = 0;
+ }
+
+ if(m<n)
+ m = n;
+
+ float alpha = -1.0;
+ float beta = 1.0;
+
+ int i;
+
+ if(x!=z)
+ {
+ for(i=0; i<m; i++)
+ z[i] = x[i];
+ }
+
+ i = 0;
+ for( ; i<n-7; i+=8)
+ {
+ kernel_strsv_ln_inv_8_lib8(i, &pA[i*sda], &dA[i], z, &z[i], &z[i]);
+ }
+ if(i<n)
+ {
+ kernel_strsv_ln_inv_8_vs_lib8(i, &pA[i*sda], &dA[i], z, &z[i], &z[i], m-i, n-i);
+ i+=8;
+ }
+ for( ; i<m-7; i+=8)
+ {
+ kernel_sgemv_n_8_lib8(n, &alpha, &pA[i*sda], z, &beta, &z[i], &z[i]);
+ }
+ if(i<m)
+ {
+ kernel_sgemv_n_8_vs_lib8(n, &alpha, &pA[i*sda], z, &beta, &z[i], &z[i], m-i);
+ i+=8;
+ }
+
+ return;
+
+ }
+
+
+
+void strsv_ltn_libstr(int m, struct s_strmat *sA, int ai, int aj, struct s_strvec *sx, int xi, struct s_strvec *sz, int zi)
+ {
+
+ if(m==0)
+ return;
+
+#if defined(DIM_CHECK)
+ // non-negative size
+ if(m<0) printf("\n****** strsv_ltn_libstr : m<0 : %d<0 *****\n", m);
+ // non-negative offset
+ if(ai<0) printf("\n****** strsv_ltn_libstr : ai<0 : %d<0 *****\n", ai);
+ if(aj<0) printf("\n****** strsv_ltn_libstr : aj<0 : %d<0 *****\n", aj);
+ if(xi<0) printf("\n****** strsv_ltn_libstr : xi<0 : %d<0 *****\n", xi);
+ if(zi<0) printf("\n****** strsv_ltn_libstr : zi<0 : %d<0 *****\n", zi);
+ // inside matrix
+ // A: m x k
+ if(ai+m > sA->m) printf("\n***** strsv_ltn_libstr : ai+m > row(A) : %d+%d > %d *****\n", ai, m, sA->m);
+ if(aj+m > sA->n) printf("\n***** strsv_ltn_libstr : aj+m > col(A) : %d+%d > %d *****\n", aj, m, sA->n);
+ // x: m
+ if(xi+m > sx->m) printf("\n***** strsv_ltn_libstr : xi+m > size(x) : %d+%d > %d *****\n", xi, m, sx->m);
+ // z: m
+ if(zi+m > sz->m) printf("\n***** strsv_ltn_libstr : zi+m > size(z) : %d+%d > %d *****\n", zi, m, sz->m);
+#endif
+
+ if(ai!=0)
+ {
+ printf("\nstrsv_ltn_libstr: feature not implemented yet: ai=%d\n", ai);
+ exit(1);
+ }
+
+ const int bs = 8;
+
+ int sda = sA->cn;
+ float *pA = sA->pA + aj*bs; // TODO ai
+ float *dA = sA->dA;
+ float *x = sx->pa + xi;
+ float *z = sz->pa + zi;
+
+ int ii;
+
+ if(ai==0 & aj==0)
+ {
+ if(sA->use_dA!=1)
+ {
+ sdiaex_lib(m, 1.0, ai, pA, sda, dA);
+ for(ii=0; ii<m; ii++)
+ dA[ii] = 1.0 / dA[ii];
+ sA->use_dA = 1;
+ }
+ }
+ else
+ {
+ sdiaex_lib(m, 1.0, ai, pA, sda, dA);
+ for(ii=0; ii<m; ii++)
+ dA[ii] = 1.0 / dA[ii];
+ sA->use_dA = 0;
+ }
+
+ int i, i1;
+
+ if(x!=z)
+ for(i=0; i<m; i++)
+ z[i] = x[i];
+
+ i=0;
+ i1 = m%8;
+ if(i1!=0)
+ {
+ kernel_strsv_lt_inv_8_vs_lib8(i+i1, &pA[m/bs*bs*sda+(m-i-i1)*bs], sda, &dA[m-i-i1], &z[m-i-i1], &z[m-i-i1], &z[m-i-i1], i1, i1);
+ i += i1;
+ }
+ for(; i<m-7; i+=8)
+ {
+ kernel_strsv_lt_inv_8_lib8(i+8, &pA[(m-i-8)/bs*bs*sda+(m-i-8)*bs], sda, &dA[m-i-8], &z[m-i-8], &z[m-i-8], &z[m-i-8]);
+ }
+
+ return;
+
+ }
+
+
+
+void strsv_ltn_mn_libstr(int m, int n, struct s_strmat *sA, int ai, int aj, struct s_strvec *sx, int xi, struct s_strvec *sz, int zi)
+ {
+
+ if(m==0)
+ return;
+
+#if defined(DIM_CHECK)
+ // non-negative size
+ if(m<0) printf("\n****** strsv_ltn_mn_libstr : m<0 : %d<0 *****\n", m);
+ if(n<0) printf("\n****** strsv_ltn_mn_libstr : n<0 : %d<0 *****\n", n);
+ // non-negative offset
+ if(ai<0) printf("\n****** strsv_ltn_mn_libstr : ai<0 : %d<0 *****\n", ai);
+ if(aj<0) printf("\n****** strsv_ltn_mn_libstr : aj<0 : %d<0 *****\n", aj);
+ if(xi<0) printf("\n****** strsv_ltn_mn_libstr : xi<0 : %d<0 *****\n", xi);
+ if(zi<0) printf("\n****** strsv_ltn_mn_libstr : zi<0 : %d<0 *****\n", zi);
+ // inside matrix
+ // A: m x k
+ if(ai+m > sA->m) printf("\n***** strsv_ltn_mn_libstr : ai+m > row(A) : %d+%d > %d *****\n", ai, m, sA->m);
+ if(aj+n > sA->n) printf("\n***** strsv_ltn_mn_libstr : aj+n > col(A) : %d+%d > %d *****\n", aj, n, sA->n);
+ // x: m
+ if(xi+m > sx->m) printf("\n***** strsv_ltn_mn_libstr : xi+m > size(x) : %d+%d > %d *****\n", xi, m, sx->m);
+ // z: m
+ if(zi+m > sz->m) printf("\n***** strsv_ltn_mn_libstr : zi+m > size(z) : %d+%d > %d *****\n", zi, m, sz->m);
+#endif
+
+ if(ai!=0)
+ {
+ printf("\nstrsv_ltn_mn_libstr: feature not implemented yet: ai=%d\n", ai);
+ exit(1);
+ }
+
+ const int bs = 8;
+
+ int sda = sA->cn;
+ float *pA = sA->pA + aj*bs; // TODO ai
+ float *dA = sA->dA;
+ float *x = sx->pa + xi;
+ float *z = sz->pa + zi;
+
+ int ii;
+
+ if(ai==0 & aj==0)
+ {
+ if(sA->use_dA!=1)
+ {
+ sdiaex_lib(n, 1.0, ai, pA, sda, dA);
+ for(ii=0; ii<n; ii++)
+ dA[ii] = 1.0 / dA[ii];
+ sA->use_dA = 1;
+ }
+ }
+ else
+ {
+ sdiaex_lib(n, 1.0, ai, pA, sda, dA);
+ for(ii=0; ii<n; ii++)
+ dA[ii] = 1.0 / dA[ii];
+ sA->use_dA = 0;
+ }
+
+ if(n>m)
+ n = m;
+
+ int i, i1;
+
+ if(x!=z)
+ for(i=0; i<m; i++)
+ z[i] = x[i];
+
+ i=0;
+ i1 = n%8;
+ if(i1!=0)
+ {
+ kernel_strsv_lt_inv_8_vs_lib8(m-n+i1, &pA[n/bs*bs*sda+(n-i1)*bs], sda, &dA[n-i1], &z[n-i1], &z[n-i1], &z[n-i1], m-n+i1, i1);
+ i += i1;
+ }
+ for(; i<n-7; i+=8)
+ {
+ kernel_strsv_lt_inv_8_lib8(m-n+i+8, &pA[(n-i-8)/bs*bs*sda+(n-i-8)*bs], sda, &dA[n-i-8], &z[n-i-8], &z[n-i-8], &z[n-i-8]);
+ }
+
+ return;
+
+ }
+
+
+
+void sgemv_nt_libstr(int m, int n, float alpha_n, float alpha_t, struct s_strmat *sA, int ai, int aj, struct s_strvec *sx_n, int xi_n, struct s_strvec *sx_t, int xi_t, float beta_n, float beta_t, struct s_strvec *sy_n, int yi_n, struct s_strvec *sy_t, int yi_t, struct s_strvec *sz_n, int zi_n, struct s_strvec *sz_t, int zi_t)
+ {
+
+ if(ai!=0)
+ {
+ printf("\nsgemv_nt_libstr: feature not implemented yet: ai=%d\n", ai);
+ exit(1);
+ }
+
+ const int bs = 8;
+
+ int sda = sA->cn;
+ float *pA = sA->pA + aj*bs; // TODO ai
+ float *x_n = sx_n->pa + xi_n;
+ float *x_t = sx_t->pa + xi_t;
+ float *y_n = sy_n->pa + yi_n;
+ float *y_t = sy_t->pa + yi_t;
+ float *z_n = sz_n->pa + zi_n;
+ float *z_t = sz_t->pa + zi_t;
+
+// if(m<=0 | n<=0)
+// return;
+
+ int ii;
+
+	// copy and scale y_n into z_n
+ ii = 0;
+ for(; ii<m-3; ii+=4)
+ {
+ z_n[ii+0] = beta_n*y_n[ii+0];
+ z_n[ii+1] = beta_n*y_n[ii+1];
+ z_n[ii+2] = beta_n*y_n[ii+2];
+ z_n[ii+3] = beta_n*y_n[ii+3];
+ }
+ for(; ii<m; ii++)
+ {
+ z_n[ii+0] = beta_n*y_n[ii+0];
+ }
+
+ ii = 0;
+ for(; ii<n-3; ii+=4)
+ {
+ kernel_sgemv_nt_4_lib8(m, &alpha_n, &alpha_t, pA+ii*bs, sda, x_n+ii, x_t, &beta_t, y_t+ii, z_n, z_t+ii);
+ }
+ if(ii<n)
+ {
+ kernel_sgemv_nt_4_vs_lib8(m, &alpha_n, &alpha_t, pA+ii*bs, sda, x_n+ii, x_t, &beta_t, y_t+ii, z_n, z_t+ii, n-ii);
+ }
+
+ return;
+ }
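+
+// Usage sketch (illustrative; s_allocate_strvec from blasfeo_s_aux is assumed): sgemv_nt computes
+// z_n <= beta_n*y_n + alpha_n*A*x_n and z_t <= beta_t*y_t + alpha_t*A'*x_t in one sweep over the
+// column panels of A, e.g.
+//   sgemv_nt_libstr(m, n, 1.0, 1.0, &sA, 0, 0, &sxn, 0, &sxt, 0, 0.0, 0.0, &syn, 0, &syt, 0, &szn, 0, &szt, 0);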
+
+
+
+void ssymv_l_libstr(int m, int n, float alpha, struct s_strmat *sA, int ai, int aj, struct s_strvec *sx, int xi, float beta, struct s_strvec *sy, int yi, struct s_strvec *sz, int zi)
+ {
+
+// if(m<=0 | n<=0)
+// return;
+
+ const int bs = 8;
+
+ int ii, n1, n2;
+
+ int sda = sA->cn;
+ float *pA = sA->pA + aj*bs + ai/bs*bs*sda + ai%bs;
+ float *x = sx->pa + xi;
+ float *y = sy->pa + yi;
+ float *z = sz->pa + zi;
+
+	// copy and scale y into z
+ ii = 0;
+ for(; ii<m-3; ii+=4)
+ {
+ z[ii+0] = beta*y[ii+0];
+ z[ii+1] = beta*y[ii+1];
+ z[ii+2] = beta*y[ii+2];
+ z[ii+3] = beta*y[ii+3];
+ }
+ for(; ii<m; ii++)
+ {
+ z[ii+0] = beta*y[ii+0];
+ }
+
+ // clean up at the beginning
+ if(ai%bs!=0) // 1, 2, 3
+ {
+ n1 = 8-ai%bs;
+ n2 = n<n1 ? n : n1;
+ kernel_ssymv_l_4l_gen_lib8(m-0, &alpha, ai%bs, &pA[0+(0)*bs], sda, &x[0], &z[0], n2-0);
+ kernel_ssymv_l_4r_gen_lib8(m-4, &alpha, ai%bs, &pA[4+(4)*bs], sda, &x[4], &z[4], n2-4);
+ pA += n1 + n1*bs + (sda-1)*bs;
+ x += n1;
+ z += n1;
+ m -= n1;
+ n -= n1;
+ }
+ // main loop
+ ii = 0;
+ for(; ii<n-7; ii+=8)
+ {
+ kernel_ssymv_l_4l_lib8(m-ii-0, &alpha, &pA[0+(ii+0)*bs+ii*sda], sda, &x[ii+0], &z[ii+0]);
+ kernel_ssymv_l_4r_lib8(m-ii-4, &alpha, &pA[4+(ii+4)*bs+ii*sda], sda, &x[ii+4], &z[ii+4]);
+ }
+ // clean up at the end
+ if(ii<n)
+ {
+ kernel_ssymv_l_4l_gen_lib8(m-ii-0, &alpha, 0, &pA[0+(ii+0)*bs+ii*sda], sda, &x[ii+0], &z[ii+0], n-ii-0);
+ kernel_ssymv_l_4r_gen_lib8(m-ii-4, &alpha, 0, &pA[4+(ii+4)*bs+ii*sda], sda, &x[ii+4], &z[ii+4], n-ii-4);
+ }
+
+ return;
+ }
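+
+// Usage sketch (illustrative): the kernels reference only the lower triangle of A, so with n==m
+// and a symmetric A stored in its lower part this computes z <= beta*y + alpha*A*x:
+//   ssymv_l_libstr(m, m, alpha, &sA, 0, 0, &sx, 0, beta, &sy, 0, &sz, 0);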
+
+
+
+#else
+
+#error : wrong LA choice
+
+#endif
diff --git a/blas/s_blas3_diag_lib.c b/blas/s_blas3_diag_lib.c
new file mode 100644
index 0000000..23f8e0f
--- /dev/null
+++ b/blas/s_blas3_diag_lib.c
@@ -0,0 +1,49 @@
+
+/**************************************************************************************************
+* *
+* This file is part of BLASFEO. *
+* *
+* BLASFEO -- BLAS For Embedded Optimization. *
+* Copyright (C) 2016-2017 by Gianluca Frison. *
+* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl. *
+* All rights reserved. *
+* *
+* HPMPC is free software; you can redistribute it and/or *
+* modify it under the terms of the GNU Lesser General Public *
+* License as published by the Free Software Foundation; either *
+* version 2.1 of the License, or (at your option) any later version. *
+* *
+* HPMPC is distributed in the hope that it will be useful, *
+* but WITHOUT ANY WARRANTY; without even the implied warranty of *
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. *
+* See the GNU Lesser General Public License for more details. *
+* *
+* You should have received a copy of the GNU Lesser General Public *
+* License along with HPMPC; if not, write to the Free Software *
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA *
+* *
+* Author: Gianluca Frison, giaf (at) dtu.dk *
+* gianluca.frison (at) imtek.uni-freiburg.de *
+* *
+**************************************************************************************************/
+
+#include <stdlib.h>
+#include <stdio.h>
+
+#include "../include/blasfeo_common.h"
+#include "../include/blasfeo_s_kernel.h"
+
+
+
+#define REAL float
+
+#define STRMAT s_strmat
+#define STRVEC s_strvec
+
+#define GEMM_L_DIAG_LIBSTR sgemm_l_diag_libstr
+#define GEMM_R_DIAG_LIBSTR sgemm_r_diag_libstr
+
+
+
+#include "x_blas3_diag_lib.c"
+
diff --git a/blas/s_blas3_diag_lib4.c b/blas/s_blas3_diag_lib4.c
new file mode 100644
index 0000000..0319212
--- /dev/null
+++ b/blas/s_blas3_diag_lib4.c
@@ -0,0 +1,161 @@
+/**************************************************************************************************
+* *
+* This file is part of BLASFEO. *
+* *
+* BLASFEO -- BLAS For Embedded Optimization. *
+* Copyright (C) 2016-2017 by Gianluca Frison. *
+* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl. *
+* All rights reserved. *
+* *
+* HPMPC is free software; you can redistribute it and/or *
+* modify it under the terms of the GNU Lesser General Public *
+* License as published by the Free Software Foundation; either *
+* version 2.1 of the License, or (at your option) any later version. *
+* *
+* HPMPC is distributed in the hope that it will be useful, *
+* but WITHOUT ANY WARRANTY; without even the implied warranty of *
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. *
+* See the GNU Lesser General Public License for more details. *
+* *
+* You should have received a copy of the GNU Lesser General Public *
+* License along with HPMPC; if not, write to the Free Software *
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA *
+* *
+* Author: Gianluca Frison, giaf (at) dtu.dk *
+* gianluca.frison (at) imtek.uni-freiburg.de *
+* *
+**************************************************************************************************/
+
+#include <stdlib.h>
+#include <stdio.h>
+
+#include "../include/blasfeo_common.h"
+#include "../include/blasfeo_s_kernel.h"
+
+
+
+#if defined(LA_HIGH_PERFORMANCE)
+
+
+
+// sgemm with A diagonal matrix (stored as strvec)
+void sgemm_l_diag_libstr(int m, int n, float alpha, struct s_strvec *sA, int ai, struct s_strmat *sB, int bi, int bj, float beta, struct s_strmat *sC, int ci, int cj, struct s_strmat *sD, int di, int dj)
+ {
+
+ if(m<=0 | n<=0)
+ return;
+
+ if(bi!=0 | ci!=0 | di!=0)
+ {
+ printf("\nsgemm_l_diag_libstr: feature not implemented yet: bi=%d, ci=%d, di=%d\n", bi, ci, di);
+ exit(1);
+ }
+
+ const int bs = 4;
+
+ int sdb = sB->cn;
+ int sdc = sC->cn;
+ int sdd = sD->cn;
+ float *dA = sA->pa + ai;
+ float *pB = sB->pA + bj*bs;
+ float *pC = sC->pA + cj*bs;
+ float *pD = sD->pA + dj*bs;
+
+// sgemm_diag_left_lib(m, n, alpha, dA, pB, sdb, beta, pC, sdc, pD, sdd);
+ int ii;
+
+ ii = 0;
+ if(beta==0.0)
+ {
+ for( ; ii<m-3; ii+=4)
+ {
+ kernel_sgemm_diag_left_4_a0_lib4(n, &alpha, &dA[ii], &pB[ii*sdb], &pD[ii*sdd]);
+ }
+ }
+ else
+ {
+ for( ; ii<m-3; ii+=4)
+ {
+ kernel_sgemm_diag_left_4_lib4(n, &alpha, &dA[ii], &pB[ii*sdb], &beta, &pC[ii*sdc], &pD[ii*sdd]);
+ }
+ }
+ if(m-ii>0)
+ {
+ if(m-ii==1)
+ kernel_sgemm_diag_left_1_lib4(n, &alpha, &dA[ii], &pB[ii*sdb], &beta, &pC[ii*sdc], &pD[ii*sdd]);
+ else if(m-ii==2)
+ kernel_sgemm_diag_left_2_lib4(n, &alpha, &dA[ii], &pB[ii*sdb], &beta, &pC[ii*sdc], &pD[ii*sdd]);
+ else // if(m-ii==3)
+ kernel_sgemm_diag_left_3_lib4(n, &alpha, &dA[ii], &pB[ii*sdb], &beta, &pC[ii*sdc], &pD[ii*sdd]);
+ }
+
+ return;
+
+ }
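+
+// Usage sketch (illustrative): with the diagonal of A packed in a strvec of length m, the call
+//   sgemm_l_diag_libstr(m, n, alpha, &sdA, 0, &sB, 0, 0, beta, &sC, 0, 0, &sD, 0, 0);
+// computes D <= alpha*diag(dA)*B + beta*C; sgemm_r_diag_libstr below is the mirror case
+// D <= alpha*A*diag(dB) + beta*C.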
+
+
+
+// sgemm with B diagonal matrix (stored as strvec)
+void sgemm_r_diag_libstr(int m, int n, float alpha, struct s_strmat *sA, int ai, int aj, struct s_strvec *sB, int bi, float beta, struct s_strmat *sC, int ci, int cj, struct s_strmat *sD, int di, int dj)
+ {
+
+ if(m<=0 | n<=0)
+ return;
+
+ if(ai!=0 | ci!=0 | di!=0)
+ {
+ printf("\nsgemm_r_diag_libstr: feature not implemented yet: ai=%d, ci=%d, di=%d\n", ai, ci, di);
+ exit(1);
+ }
+
+ const int bs = 4;
+
+ int sda = sA->cn;
+ int sdc = sC->cn;
+ int sdd = sD->cn;
+ float *pA = sA->pA + aj*bs;
+ float *dB = sB->pa + bi;
+ float *pC = sC->pA + cj*bs;
+ float *pD = sD->pA + dj*bs;
+
+ int ii;
+
+ ii = 0;
+ if(beta==0.0)
+ {
+ for( ; ii<n-3; ii+=4)
+ {
+ kernel_sgemm_diag_right_4_a0_lib4(m, &alpha, &pA[ii*bs], sda, &dB[ii], &pD[ii*bs], sdd);
+ }
+ }
+ else
+ {
+ for( ; ii<n-3; ii+=4)
+ {
+ kernel_sgemm_diag_right_4_lib4(m, &alpha, &pA[ii*bs], sda, &dB[ii], &beta, &pC[ii*bs], sdc, &pD[ii*bs], sdd);
+ }
+ }
+ if(n-ii>0)
+ {
+ if(n-ii==1)
+ kernel_sgemm_diag_right_1_lib4(m, &alpha, &pA[ii*bs], sda, &dB[ii], &beta, &pC[ii*bs], sdc, &pD[ii*bs], sdd);
+ else if(n-ii==2)
+ kernel_sgemm_diag_right_2_lib4(m, &alpha, &pA[ii*bs], sda, &dB[ii], &beta, &pC[ii*bs], sdc, &pD[ii*bs], sdd);
+ else // if(n-ii==3)
+ kernel_sgemm_diag_right_3_lib4(m, &alpha, &pA[ii*bs], sda, &dB[ii], &beta, &pC[ii*bs], sdc, &pD[ii*bs], sdd);
+ }
+ return;
+
+ }
+
+
+
+#else
+
+#error : wrong LA choice
+
+#endif
+
+
+
+
diff --git a/blas/s_blas3_diag_lib8.c b/blas/s_blas3_diag_lib8.c
new file mode 100644
index 0000000..8469345
--- /dev/null
+++ b/blas/s_blas3_diag_lib8.c
@@ -0,0 +1,105 @@
+/**************************************************************************************************
+* *
+* This file is part of BLASFEO. *
+* *
+* BLASFEO -- BLAS For Embedded Optimization. *
+* Copyright (C) 2016-2017 by Gianluca Frison. *
+* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl. *
+* All rights reserved. *
+* *
+* HPMPC is free software; you can redistribute it and/or *
+* modify it under the terms of the GNU Lesser General Public *
+* License as published by the Free Software Foundation; either *
+* version 2.1 of the License, or (at your option) any later version. *
+* *
+* HPMPC is distributed in the hope that it will be useful, *
+* but WITHOUT ANY WARRANTY; without even the implied warranty of *
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. *
+* See the GNU Lesser General Public License for more details. *
+* *
+* You should have received a copy of the GNU Lesser General Public *
+* License along with HPMPC; if not, write to the Free Software *
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA *
+* *
+* Author: Gianluca Frison, giaf (at) dtu.dk *
+* gianluca.frison (at) imtek.uni-freiburg.de *
+* *
+**************************************************************************************************/
+
+#include <stdlib.h>
+#include <stdio.h>
+
+#include "../include/blasfeo_common.h"
+#include "../include/blasfeo_s_kernel.h"
+
+
+
+
+#if defined(LA_HIGH_PERFORMANCE)
+
+
+
+// sgemm with B diagonal matrix (stored as strvec)
+void sgemm_r_diag_libstr(int m, int n, float alpha, struct s_strmat *sA, int ai, int aj, struct s_strvec *sB, int bi, float beta, struct s_strmat *sC, int ci, int cj, struct s_strmat *sD, int di, int dj)
+ {
+
+ if(m<=0 | n<=0)
+ return;
+
+ if(ai!=0 | ci!=0 | di!=0)
+ {
+ printf("\nsgemm_r_diag_libstr: feature not implemented yet: ai=%d, ci=%d, di=%d\n", ai, ci, di);
+ exit(1);
+ }
+
+ const int bs = 8;
+
+ int sda = sA->cn;
+ int sdc = sC->cn;
+ int sdd = sD->cn;
+ float *pA = sA->pA + aj*bs;
+ float *dB = sB->pa + bi;
+ float *pC = sC->pA + cj*bs;
+ float *pD = sD->pA + dj*bs;
+
+ int ii;
+
+ ii = 0;
+ if(beta==0.0)
+ {
+ for( ; ii<n-3; ii+=4)
+ {
+ kernel_sgemm_diag_right_4_a0_lib4(m, &alpha, &pA[ii*bs], sda, &dB[ii], &pD[ii*bs], sdd);
+ }
+ }
+ else
+ {
+ for( ; ii<n-3; ii+=4)
+ {
+ kernel_sgemm_diag_right_4_lib4(m, &alpha, &pA[ii*bs], sda, &dB[ii], &beta, &pC[ii*bs], sdc, &pD[ii*bs], sdd);
+ }
+ }
+ if(n-ii>0)
+ {
+ if(n-ii==1)
+ kernel_sgemm_diag_right_1_lib4(m, &alpha, &pA[ii*bs], sda, &dB[ii], &beta, &pC[ii*bs], sdc, &pD[ii*bs], sdd);
+ else if(n-ii==2)
+ kernel_sgemm_diag_right_2_lib4(m, &alpha, &pA[ii*bs], sda, &dB[ii], &beta, &pC[ii*bs], sdc, &pD[ii*bs], sdd);
+ else // if(n-ii==3)
+ kernel_sgemm_diag_right_3_lib4(m, &alpha, &pA[ii*bs], sda, &dB[ii], &beta, &pC[ii*bs], sdc, &pD[ii*bs], sdd);
+ }
+ return;
+
+ }
+
+
+
+#else
+
+#error : wrong LA choice
+
+#endif
+
+
+
+
diff --git a/blas/s_blas3_lib.c b/blas/s_blas3_lib.c
new file mode 100644
index 0000000..dca98ff
--- /dev/null
+++ b/blas/s_blas3_lib.c
@@ -0,0 +1,70 @@
+/**************************************************************************************************
+* *
+* This file is part of BLASFEO. *
+* *
+* BLASFEO -- BLAS For Embedded Optimization. *
+* Copyright (C) 2016-2017 by Gianluca Frison. *
+* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl. *
+* All rights reserved. *
+* *
+* HPMPC is free software; you can redistribute it and/or *
+* modify it under the terms of the GNU Lesser General Public *
+* License as published by the Free Software Foundation; either *
+* version 2.1 of the License, or (at your option) any later version. *
+* *
+* HPMPC is distributed in the hope that it will be useful, *
+* but WITHOUT ANY WARRANTY; without even the implied warranty of *
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. *
+* See the GNU Lesser General Public License for more details. *
+* *
+* You should have received a copy of the GNU Lesser General Public *
+* License along with HPMPC; if not, write to the Free Software *
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA *
+* *
+* Author: Gianluca Frison, giaf (at) dtu.dk *
+* gianluca.frison (at) imtek.uni-freiburg.de *
+* *
+**************************************************************************************************/
+
+#include <stdlib.h>
+#include <stdio.h>
+
+#if defined(LA_BLAS)
+#if defined(REF_BLAS_BLIS)
+#include "s_blas_64.h"
+#else
+#include "s_blas.h"
+#endif
+#endif
+
+#include "../include/blasfeo_common.h"
+#include "../include/blasfeo_s_aux.h"
+
+
+
+#define REAL float
+
+#define STRMAT s_strmat
+
+#define GEMM_NN_LIBSTR sgemm_nn_libstr
+#define GEMM_NT_LIBSTR sgemm_nt_libstr
+#define SYRK_LN_LIBSTR ssyrk_ln_libstr
+#define SYRK_LN_MN_LIBSTR ssyrk_ln_mn_libstr
+#define TRMM_RLNN_LIBSTR strmm_rlnn_libstr
+#define TRMM_RUTN_LIBSTR strmm_rutn_libstr
+#define TRSM_LLNU_LIBSTR strsm_llnu_libstr
+#define TRSM_LUNN_LIBSTR strsm_lunn_libstr
+#define TRSM_RLTN_LIBSTR strsm_rltn_libstr
+#define TRSM_RLTU_LIBSTR strsm_rltu_libstr
+#define TRSM_RUTN_LIBSTR strsm_rutn_libstr
+
+#define COPY scopy_
+#define GEMM sgemm_
+#define SYRK ssyrk_
+#define TRMM strmm_
+#define TRSM strsm_
+
+
+
+#include "x_blas3_lib.c"
+
diff --git a/blas/s_blas3_lib4.c b/blas/s_blas3_lib4.c
new file mode 100644
index 0000000..c6be38f
--- /dev/null
+++ b/blas/s_blas3_lib4.c
@@ -0,0 +1,1062 @@
+/**************************************************************************************************
+* *
+* This file is part of BLASFEO. *
+* *
+* BLASFEO -- BLAS For Embedded Optimization. *
+* Copyright (C) 2016-2017 by Gianluca Frison. *
+* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl. *
+* All rights reserved. *
+* *
+* HPMPC is free software; you can redistribute it and/or *
+* modify it under the terms of the GNU Lesser General Public *
+* License as published by the Free Software Foundation; either *
+* version 2.1 of the License, or (at your option) any later version. *
+* *
+* HPMPC is distributed in the hope that it will be useful, *
+* but WITHOUT ANY WARRANTY; without even the implied warranty of *
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. *
+* See the GNU Lesser General Public License for more details. *
+* *
+* You should have received a copy of the GNU Lesser General Public *
+* License along with HPMPC; if not, write to the Free Software *
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA *
+* *
+* Author: Gianluca Frison, giaf (at) dtu.dk *
+* gianluca.frison (at) imtek.uni-freiburg.de *
+* *
+**************************************************************************************************/
+
+#include <stdlib.h>
+#include <stdio.h>
+
+#include "../include/blasfeo_common.h"
+#include "../include/blasfeo_s_kernel.h"
+#include "../include/blasfeo_s_aux.h"
+
+
+
+/****************************
+* old interface
+****************************/
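+
+// Storage recap (panel-major layout, as used throughout BLASFEO): a lib4 matrix is stored in
+// horizontal panels of bs=4 rows, with columns interleaved inside each panel, so element (i,j)
+// of pA with panel stride sda lives at pA[i/4*4*sda + i%4 + j*4]; the kernels below index whole
+// panels as &pA[i*sda + j*bs] with i a multiple of 4.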
+
+void sgemm_nt_lib(int m, int n, int k, float alpha, float *pA, int sda, float *pB, int sdb, float beta, float *pC, int sdc, float *pD, int sdd)
+ {
+
+ if(m<=0 || n<=0)
+ return;
+
+ const int bs = 4;
+
+ int i, j, l;
+
+ i = 0;
+
+#if defined(TARGET_ARMV8A_ARM_CORTEX_A57)
+ for(; i<m-15; i+=16)
+ {
+ j = 0;
+ for(; j<n-3; j+=4)
+ {
+ kernel_sgemm_nt_16x4_lib4(k, &alpha, &pA[i*sda], sda, &pB[j*sdb], &beta, &pC[j*bs+i*sdc], sdc, &pD[j*bs+i*sdd], sdd);
+ }
+ if(j<n)
+ {
+ kernel_sgemm_nt_4x4_vs_lib4(k, &alpha, &pA[(i+0)*sda], &pB[j*sdb], &beta, &pC[j*bs+(i+0)*sdc], &pD[j*bs+(i+0)*sdd], m-(i+0), n-j);
+ kernel_sgemm_nt_4x4_vs_lib4(k, &alpha, &pA[(i+4)*sda], &pB[j*sdb], &beta, &pC[j*bs+(i+4)*sdc], &pD[j*bs+(i+4)*sdd], m-(i+4), n-j);
+ kernel_sgemm_nt_4x4_vs_lib4(k, &alpha, &pA[(i+8)*sda], &pB[j*sdb], &beta, &pC[j*bs+(i+8)*sdc], &pD[j*bs+(i+8)*sdd], m-(i+8), n-j);
+ kernel_sgemm_nt_4x4_vs_lib4(k, &alpha, &pA[(i+12)*sda], &pB[j*sdb], &beta, &pC[j*bs+(i+12)*sdc], &pD[j*bs+(i+12)*sdd], m-(i+12), n-j);
+ }
+ }
+#endif
+#if defined(TARGET_ARMV7A_ARM_CORTEX_A15) | defined(TARGET_ARMV8A_ARM_CORTEX_A57)
+ for(; i<m-11; i+=12)
+ {
+ j = 0;
+ for(; j<n-3; j+=4)
+ {
+ kernel_sgemm_nt_12x4_lib4(k, &alpha, &pA[i*sda], sda, &pB[j*sdb], &beta, &pC[j*bs+i*sdc], sdc, &pD[j*bs+i*sdd], sdd);
+ }
+ if(j<n)
+ {
+ kernel_sgemm_nt_4x4_vs_lib4(k, &alpha, &pA[(i+0)*sda], &pB[j*sdb], &beta, &pC[j*bs+(i+0)*sdc], &pD[j*bs+(i+0)*sdd], m-(i+0), n-j);
+ kernel_sgemm_nt_4x4_vs_lib4(k, &alpha, &pA[(i+4)*sda], &pB[j*sdb], &beta, &pC[j*bs+(i+4)*sdc], &pD[j*bs+(i+4)*sdd], m-(i+4), n-j);
+ kernel_sgemm_nt_4x4_vs_lib4(k, &alpha, &pA[(i+8)*sda], &pB[j*sdb], &beta, &pC[j*bs+(i+8)*sdc], &pD[j*bs+(i+8)*sdd], m-(i+8), n-j);
+ }
+ }
+#endif
+#if defined(TARGET_ARMV8A_ARM_CORTEX_A57) | defined(TARGET_ARMV7A_ARM_CORTEX_A15)
+ for(; i<m-7; i+=8)
+ {
+ j = 0;
+#if defined(TARGET_ARMV8A_ARM_CORTEX_A57)
+ for(; j<n-7; j+=8)
+ {
+ kernel_sgemm_nt_8x8_lib4(k, &alpha, &pA[i*sda], sda, &pB[j*sdb], sdb, &beta, &pC[j*bs+i*sdc], sdc, &pD[j*bs+i*sdd], sdd);
+ }
+#endif
+ for(; j<n-3; j+=4)
+ {
+ kernel_sgemm_nt_8x4_lib4(k, &alpha, &pA[i*sda], sda, &pB[j*sdb], &beta, &pC[j*bs+i*sdc], sdc, &pD[j*bs+i*sdd], sdd);
+ }
+ if(j<n)
+ {
+ kernel_sgemm_nt_4x4_vs_lib4(k, &alpha, &pA[(i+0)*sda], &pB[j*sdb], &beta, &pC[j*bs+(i+0)*sdc], &pD[j*bs+(i+0)*sdd], m-(i+0), n-j);
+ kernel_sgemm_nt_4x4_vs_lib4(k, &alpha, &pA[(i+4)*sda], &pB[j*sdb], &beta, &pC[j*bs+(i+4)*sdc], &pD[j*bs+(i+4)*sdd], m-(i+4), n-j);
+ }
+ }
+#endif
+ for(; i<m-3; i+=4)
+ {
+ j = 0;
+ for(; j<n-3; j+=4)
+ {
+ kernel_sgemm_nt_4x4_lib4(k, &alpha, &pA[i*sda], &pB[j*sdb], &beta, &pC[j*bs+i*sdc], &pD[j*bs+i*sdd]);
+ }
+ if(j<n)
+ {
+ kernel_sgemm_nt_4x4_vs_lib4(k, &alpha, &pA[i*sda], &pB[j*sdb], &beta, &pC[j*bs+i*sdc], &pD[j*bs+i*sdd], m-i, n-j);
+ }
+ }
+ if(m>i)
+ {
+ goto left_4;
+ }
+
+ // common return if i==m
+ return;
+
+ // clean up loops definitions
+
+ left_12:
+ j = 0;
+ for(; j<n; j+=4)
+ {
+ kernel_sgemm_nt_4x4_vs_lib4(k, &alpha, &pA[(i+0)*sda], &pB[j*sdb], &beta, &pC[j*bs+(i+0)*sdc], &pD[j*bs+(i+0)*sdd], m-(i+0), n-j);
+ kernel_sgemm_nt_4x4_vs_lib4(k, &alpha, &pA[(i+4)*sda], &pB[j*sdb], &beta, &pC[j*bs+(i+4)*sdc], &pD[j*bs+(i+4)*sdd], m-(i+4), n-j);
+ kernel_sgemm_nt_4x4_vs_lib4(k, &alpha, &pA[(i+8)*sda], &pB[j*sdb], &beta, &pC[j*bs+(i+8)*sdc], &pD[j*bs+(i+8)*sdd], m-(i+8), n-j);
+ }
+ return;
+
+ left_8:
+ j = 0;
+ for(; j<n; j+=4)
+ {
+ kernel_sgemm_nt_4x4_vs_lib4(k, &alpha, &pA[(i+0)*sda], &pB[j*sdb], &beta, &pC[j*bs+(i+0)*sdc], &pD[j*bs+(i+0)*sdd], m-(i+0), n-j);
+ kernel_sgemm_nt_4x4_vs_lib4(k, &alpha, &pA[(i+4)*sda], &pB[j*sdb], &beta, &pC[j*bs+(i+4)*sdc], &pD[j*bs+(i+4)*sdd], m-(i+4), n-j);
+ }
+ return;
+
+ left_4:
+ j = 0;
+ for(; j<n; j+=4)
+ {
+ kernel_sgemm_nt_4x4_vs_lib4(k, &alpha, &pA[i*sda], &pB[j*sdb], &beta, &pC[j*bs+i*sdc], &pD[j*bs+i*sdd], m-i, n-j);
+ }
+ return;
+
+ }
+
+
+
+void sgemm_nn_lib(int m, int n, int k, float alpha, float *pA, int sda, float *pB, int sdb, float beta, float *pC, int sdc, float *pD, int sdd)
+ {
+
+ if(m<=0 || n<=0)
+ return;
+
+ const int bs = 4;
+
+ int i, j, l;
+
+ i = 0;
+
+ for(; i<m-3; i+=4)
+ {
+ j = 0;
+ for(; j<n-3; j+=4)
+ {
+ kernel_sgemm_nn_4x4_lib4(k, &alpha, &pA[i*sda], &pB[j*bs], sdb, &beta, &pC[j*bs+i*sdc], &pD[j*bs+i*sdd]);
+ }
+ if(j<n)
+ {
+ kernel_sgemm_nn_4x4_vs_lib4(k, &alpha, &pA[i*sda], &pB[j*bs], sdb, &beta, &pC[j*bs+i*sdc], &pD[j*bs+i*sdd], m-i, n-j);
+ }
+ }
+ if(m>i)
+ {
+ goto left_4;
+ }
+
+ // common return if i==m
+ return;
+
+ // clean up loops definitions
+
+ left_4:
+ j = 0;
+ for(; j<n; j+=4)
+ {
+ kernel_sgemm_nn_4x4_vs_lib4(k, &alpha, &pA[i*sda], &pB[j*bs], sdb, &beta, &pC[j*bs+i*sdc], &pD[j*bs+i*sdd], m-i, n-j);
+ }
+ return;
+
+ }
+
+
+
+void strmm_nt_ru_lib(int m, int n, float alpha, float *pA, int sda, float *pB, int sdb, float beta, float *pC, int sdc, float *pD, int sdd)
+ {
+
+ if(m<=0 || n<=0)
+ return;
+
+ const int bs = 4;
+
+ int i, j;
+
+ i = 0;
+ for(; i<m-3; i+=4)
+ {
+ j = 0;
+ for(; j<n-3; j+=4)
+ {
+ kernel_strmm_nt_ru_4x4_lib4(n-j, &alpha, &pA[j*bs+i*sda], &pB[j*bs+j*sdb], &beta, &pC[j*bs+i*sdc], &pD[j*bs+i*sdd]);
+ }
+ if(j<n) // TODO specialized edge routine
+ {
+ kernel_strmm_nt_ru_4x4_vs_lib4(n-j, &alpha, &pA[j*bs+i*sda], &pB[j*bs+j*sdb], &beta, &pC[j*bs+i*sdc], &pD[j*bs+i*sdd], m-i, n-j);
+ }
+ }
+ if(i<m)
+ {
+ goto left_4;
+ }
+
+ // common return
+ return;
+
+ left_4:
+ j = 0;
+// for(; j<n-3; j+=4)
+ for(; j<n; j+=4)
+ {
+ kernel_strmm_nt_ru_4x4_vs_lib4(n-j, &alpha, &pA[j*bs+i*sda], &pB[j*bs+j*sdb], &beta, &pC[j*bs+i*sdc], &pD[j*bs+i*sdd], m-i, n-j);
+ }
+// if(j<n) // TODO specialized edge routine
+// {
+// kernel_strmm_nt_ru_4x4_vs_lib4(n-j, &pA[j*bs+i*sda], &pB[j*bs+j*sdb], alg, &pC[j*bs+i*sdc], &pD[j*bs+i*sdd], m-i, n-j);
+// }
+ return;
+
+ }
+
+
+
+// D <= B * A^{-T} , with A lower triangular with unit diagonal
+void strsm_nt_rl_one_lib(int m, int n, float *pA, int sda, float *pB, int sdb, float *pD, int sdd)
+ {
+
+ if(m<=0 || n<=0)
+ return;
+
+ const int bs = 4;
+
+ int i, j;
+
+ i = 0;
+
+ for(; i<m-3; i+=4)
+ {
+ j = 0;
+ for(; j<n-3; j+=4)
+ {
+ kernel_strsm_nt_rl_one_4x4_lib4(j, &pD[i*sdd], &pA[j*sda], &pB[j*bs+i*sdb], &pD[j*bs+i*sdd], &pA[j*bs+j*sda]);
+ }
+ if(j<n)
+ {
+ kernel_strsm_nt_rl_one_4x4_vs_lib4(j, &pD[i*sdd], &pA[j*sda], &pB[j*bs+i*sdb], &pD[j*bs+i*sdd], &pA[j*bs+j*sda], m-i, n-j);
+ }
+ }
+ if(m>i)
+ {
+ goto left_4;
+ }
+
+ // common return if i==m
+ return;
+
+ left_4:
+ j = 0;
+ for(; j<n; j+=4)
+ {
+ kernel_strsm_nt_rl_one_4x4_vs_lib4(j, &pD[i*sdd], &pA[j*sda], &pB[j*bs+i*sdb], &pD[j*bs+i*sdd], &pA[j*bs+j*sda], m-i, n-j);
+ }
+ return;
+
+ }
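+
+// Worked note (illustrative): thanks to the unit diagonal no division is needed; each 4-column
+// block of D is B minus the contributions of the already-solved columns pD[0..j), which is what
+// the kmax argument j of the kernel accounts for.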
+
+
+
+// D <= B * A^{-T} , with A upper triangular employing explicit inverse of diagonal
+void strsm_nt_ru_inv_lib(int m, int n, float *pA, int sda, float *inv_diag_A, float *pB, int sdb, float *pD, int sdd)
+ {
+
+ if(m<=0 || n<=0)
+ return;
+
+ const int bs = 4;
+
+ int i, j, idx;
+
+ int rn = n%4;
+
+	float *dummy; // never dereferenced: only passed to edge kernels called with kmax==0
+
+ i = 0;
+
+ for(; i<m-3; i+=4)
+ {
+ j = 0;
+ // clean at the end
+ if(rn>0)
+ {
+ idx = n-rn;
+ kernel_strsm_nt_ru_inv_4x4_vs_lib4(0, dummy, dummy, &pB[i*sdb+idx*bs], &pD[i*sdd+idx*bs], &pA[idx*sda+idx*bs], &inv_diag_A[idx], m-i, rn);
+ j += rn;
+ }
+ for(; j<n; j+=4)
+ {
+ idx = n-j-4;
+ kernel_strsm_nt_ru_inv_4x4_lib4(j, &pD[i*sdd+(idx+4)*bs], &pA[idx*sda+(idx+4)*bs], &pB[i*sdb+idx*bs], &pD[i*sdd+idx*bs], &pA[idx*sda+idx*bs], &inv_diag_A[idx]);
+ }
+ }
+ if(m>i)
+ {
+ goto left_4;
+ }
+
+ // common return if i==m
+ return;
+
+ left_4:
+ j = 0;
+ // TODO
+ // clean at the end
+ if(rn>0)
+ {
+ idx = n-rn;
+ kernel_strsm_nt_ru_inv_4x4_vs_lib4(0, dummy, dummy, &pB[i*sdb+idx*bs], &pD[i*sdd+idx*bs], &pA[idx*sda+idx*bs], &inv_diag_A[idx], m-i, rn);
+ j += rn;
+ }
+ for(; j<n; j+=4)
+ {
+ idx = n-j-4;
+ kernel_strsm_nt_ru_inv_4x4_vs_lib4(j, &pD[i*sdd+(idx+4)*bs], &pA[idx*sda+(idx+4)*bs], &pB[i*sdb+idx*bs], &pD[i*sdd+idx*bs], &pA[idx*sda+idx*bs], &inv_diag_A[idx], m-i, 4);
+ }
+ return;
+
+ }
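+
+// Note: inv_diag_A is expected to hold the reciprocals of the diagonal of A (inv_diag_A[ii] ==
+// 1.0/A(ii,ii)), as precomputed via sdiaex_lib by the strsm_*_libstr wrappers below; storing the
+// inverses lets the kernels multiply instead of divide.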
+
+
+
+// D <= A^{-1} * B , with A lower triangular with unit diagonal
+void strsm_nn_ll_one_lib(int m, int n, float *pA, int sda, float *pB, int sdb, float *pD, int sdd)
+ {
+
+ if(m<=0 || n<=0)
+ return;
+
+ const int bs = 4;
+
+ int i, j;
+
+ i = 0;
+
+ for( ; i<m-3; i+=4)
+ {
+ j = 0;
+ for( ; j<n-3; j+=4)
+ {
+ kernel_strsm_nn_ll_one_4x4_lib4(i, pA+i*sda, pD+j*bs, sdd, pB+i*sdb+j*bs, pD+i*sdd+j*bs, pA+i*sda+i*bs);
+ }
+ if(j<n)
+ {
+ kernel_strsm_nn_ll_one_4x4_vs_lib4(i, pA+i*sda, pD+j*bs, sdd, pB+i*sdb+j*bs, pD+i*sdd+j*bs, pA+i*sda+i*bs, m-i, n-j);
+ }
+ }
+ if(i<m)
+ {
+ goto left_4;
+ }
+
+ // common return
+ return;
+
+ left_4:
+ j = 0;
+ for( ; j<n; j+=4)
+ {
+ kernel_strsm_nn_ll_one_4x4_vs_lib4(i, pA+i*sda, pD+j*bs, sdd, pB+i*sdb+j*bs, pD+i*sdd+j*bs, pA+i*sda+i*bs, m-i, n-j);
+ }
+ return;
+
+ }
+
+
+
+// D <= A^{-1} * B , with A upper triangular employing explicit inverse of diagonal
+void strsm_nn_lu_inv_lib(int m, int n, float *pA, int sda, float *inv_diag_A, float *pB, int sdb, float *pD, int sdd)
+ {
+
+ if(m<=0 || n<=0)
+ return;
+
+ const int bs = 4;
+
+ int i, j, idx;
+	float *dummy; // never dereferenced: only passed to edge kernels called with kmax==0
+
+ i = 0;
+ int rm = m%4;
+ if(rm>0)
+ {
+		// TODO code explicitly the final case
+ idx = m-rm; // position of the part to do
+ j = 0;
+ for( ; j<n; j+=4)
+ {
+ kernel_strsm_nn_lu_inv_4x4_vs_lib4(0, dummy, dummy, 0, pB+idx*sdb+j*bs, pD+idx*sdd+j*bs, pA+idx*sda+idx*bs, inv_diag_A+idx, rm, n-j);
+ }
+ // TODO
+ i += rm;
+ }
+// int em = m-rm;
+ for( ; i<m; i+=4)
+ {
+ idx = m-i; // position of already done part
+ j = 0;
+ for( ; j<n-3; j+=4)
+ {
+ kernel_strsm_nn_lu_inv_4x4_lib4(i, pA+(idx-4)*sda+idx*bs, pD+idx*sdd+j*bs, sdd, pB+(idx-4)*sdb+j*bs, pD+(idx-4)*sdd+j*bs, pA+(idx-4)*sda+(idx-4)*bs, inv_diag_A+(idx-4));
+ }
+ if(j<n)
+ {
+ kernel_strsm_nn_lu_inv_4x4_vs_lib4(i, pA+(idx-4)*sda+idx*bs, pD+idx*sdd+j*bs, sdd, pB+(idx-4)*sdb+j*bs, pD+(idx-4)*sdd+j*bs, pA+(idx-4)*sda+(idx-4)*bs, inv_diag_A+(idx-4), 4, n-j);
+ }
+ }
+
+ // common return
+ return;
+
+ }
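+
+// Note: the solve proceeds bottom-up; idx marks the boundary of the already-solved rows, the
+// kernel kmax argument i is the number of rows of D computed below it, and each 4x4 diagonal
+// block is solved by backward substitution using the precomputed reciprocal diagonal.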
+
+
+
+/****************************
+* new interface
+****************************/
+
+
+
+#if defined(LA_HIGH_PERFORMANCE)
+
+
+
+// sgemm nt
+void sgemm_nt_libstr(int m, int n, int k, float alpha, struct s_strmat *sA, int ai, int aj, struct s_strmat *sB, int bi, int bj, float beta, struct s_strmat *sC, int ci, int cj, struct s_strmat *sD, int di, int dj)
+ {
+
+ if(m<=0 | n<=0)
+ return;
+
+ const int bs = 4;
+
+ int sda = sA->cn;
+ int sdb = sB->cn;
+ int sdc = sC->cn;
+ int sdd = sD->cn;
+ float *pA = sA->pA + aj*bs;
+ float *pB = sB->pA + bj*bs;
+ float *pC = sC->pA + cj*bs;
+ float *pD = sD->pA + dj*bs;
+
+ if(ai==0 & bi==0 & ci==0 & di==0)
+ {
+ sgemm_nt_lib(m, n, k, alpha, pA, sda, pB, sdb, beta, pC, sdc, pD, sdd);
+ return;
+ }
+
+ pA += ai/bs*bs*sda;
+	pB += bi/bs*bs*sdb;
+ int ci0 = ci-ai%bs;
+ int di0 = di-ai%bs;
+ int offsetC;
+ int offsetD;
+ if(ci0>=0)
+ {
+		pC += ci0/bs*bs*sdc;
+ offsetC = ci0%bs;
+ }
+ else
+ {
+ pC += -4*sdc;
+ offsetC = bs+ci0;
+ }
+ if(di0>=0)
+ {
+ pD += di0/bs*bs*sdd;
+ offsetD = di0%bs;
+ }
+ else
+ {
+ pD += -4*sdd;
+ offsetD = bs+di0;
+ }
+
+ int i, j, l;
+
+ int idxB;
+
+ i = 0;
+ // clean up at the beginning
+ if(ai%bs!=0)
+ {
+ j = 0;
+ idxB = 0;
+ // clean up at the beginning
+ if(bi%bs!=0)
+ {
+ kernel_sgemm_nt_4x4_gen_lib4(k, &alpha, &pA[i*sda], &pB[idxB*sdb], &beta, offsetC, &pC[j*bs+i*sdc]-bi%bs*bs, sdc, offsetD, &pD[j*bs+i*sdd]-bi%bs*bs, sdd, ai%bs, m-i, bi%bs, n-j);
+ j += bs-bi%bs;
+ idxB += 4;
+ }
+ // main loop
+ for(; j<n; j+=4)
+ {
+ kernel_sgemm_nt_4x4_gen_lib4(k, &alpha, &pA[i*sda], &pB[idxB*sdb], &beta, offsetC, &pC[j*bs+i*sdc], sdc, offsetD, &pD[j*bs+i*sdd], sdd, ai%bs, m-i, 0, n-j);
+ idxB += 4;
+ }
+ m -= bs-ai%bs;
+ pA += bs*sda;
+ pC += bs*sdc;
+ pD += bs*sdd;
+ }
+ // main loop
+ for(; i<m; i+=4)
+ {
+ j = 0;
+ idxB = 0;
+ // clean up at the beginning
+ if(bi%bs!=0)
+ {
+ kernel_sgemm_nt_4x4_gen_lib4(k, &alpha, &pA[i*sda], &pB[idxB*sdb], &beta, offsetC, &pC[j*bs+i*sdc]-bi%bs*bs, sdc, offsetD, &pD[j*bs+i*sdd]-bi%bs*bs, sdd, 0, m-i, bi%bs, n-j);
+ j += bs-bi%bs;
+ idxB += 4;
+ }
+ // main loop
+ for(; j<n; j+=4)
+ {
+ kernel_sgemm_nt_4x4_gen_lib4(k, &alpha, &pA[i*sda], &pB[idxB*sdb], &beta, offsetC, &pC[j*bs+i*sdc], sdc, offsetD, &pD[j*bs+i*sdd], sdd, 0, m-i, 0, n-j);
+ idxB += 4;
+ }
+ }
+
+ return;
+
+ }
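+
+// Usage sketch (illustrative; s_allocate_strmat from blasfeo_s_aux is assumed):
+//   struct s_strmat sA, sB, sC, sD;
+//   s_allocate_strmat(m, k, &sA);
+//   s_allocate_strmat(n, k, &sB);
+//   s_allocate_strmat(m, n, &sC);
+//   s_allocate_strmat(m, n, &sD);
+//   // ... pack data into sA, sB, sC ...
+//   sgemm_nt_libstr(m, n, k, 1.0, &sA, 0, 0, &sB, 0, 0, 0.0, &sC, 0, 0, &sD, 0, 0); // D <= A * B'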
+
+
+
+// sgemm nn
+void sgemm_nn_libstr(int m, int n, int k, float alpha, struct s_strmat *sA, int ai, int aj, struct s_strmat *sB, int bi, int bj, float beta, struct s_strmat *sC, int ci, int cj, struct s_strmat *sD, int di, int dj)
+ {
+ if(m<=0 || n<=0)
+ return;
+ if(ai!=0 | bi!=0 | ci!=0 | di!=0)
+ {
+ printf("\nsgemm_nn_libstr: feature not implemented yet: ai=%d, bi=%d, ci=%d, di=%d\n", ai, bi, ci, di);
+ exit(1);
+ }
+ const int bs = 4;
+ int sda = sA->cn;
+ int sdb = sB->cn;
+ int sdc = sC->cn;
+ int sdd = sD->cn;
+ float *pA = sA->pA + aj*bs;
+ float *pB = sB->pA + bj*bs;
+ float *pC = sC->pA + cj*bs;
+ float *pD = sD->pA + dj*bs;
+ sgemm_nn_lib(m, n, k, alpha, pA, sda, pB, sdb, beta, pC, sdc, pD, sdd);
+ return;
+ }
+
+
+
+// strsm_nn_llu
+void strsm_llnu_libstr(int m, int n, float alpha, struct s_strmat *sA, int ai, int aj, struct s_strmat *sB, int bi, int bj, struct s_strmat *sD, int di, int dj)
+ {
+ if(ai!=0 | bi!=0 | di!=0 | alpha!=1.0)
+ {
+ printf("\nstrsm_llnu_libstr: feature not implemented yet: ai=%d, bi=%d, di=%d, alpha=%f\n", ai, bi, di, alpha);
+ exit(1);
+ }
+ const int bs = 4;
+ // TODO alpha
+ int sda = sA->cn;
+ int sdb = sB->cn;
+ int sdd = sD->cn;
+ float *pA = sA->pA + aj*bs;
+ float *pB = sB->pA + bj*bs;
+ float *pD = sD->pA + dj*bs;
+ strsm_nn_ll_one_lib(m, n, pA, sda, pB, sdb, pD, sdd);
+ return;
+ }
+
+
+
+// strsm_nn_lun
+void strsm_lunn_libstr(int m, int n, float alpha, struct s_strmat *sA, int ai, int aj, struct s_strmat *sB, int bi, int bj, struct s_strmat *sD, int di, int dj)
+ {
+ if(ai!=0 | bi!=0 | di!=0 | alpha!=1.0)
+ {
+ printf("\nstrsm_lunn_libstr: feature not implemented yet: ai=%d, bi=%d, di=%d, alpha=%f\n", ai, bi, di, alpha);
+ exit(1);
+ }
+ const int bs = 4;
+ // TODO alpha
+ int sda = sA->cn;
+ int sdb = sB->cn;
+ int sdd = sD->cn;
+ float *pA = sA->pA + aj*bs;
+ float *pB = sB->pA + bj*bs;
+ float *pD = sD->pA + dj*bs;
+ float *dA = sA->dA;
+ int ii;
+ if(ai==0 & aj==0)
+ {
+ if(sA->use_dA!=1)
+ {
+ sdiaex_lib(n, 1.0, ai, pA, sda, dA);
+ for(ii=0; ii<n; ii++)
+ dA[ii] = 1.0 / dA[ii];
+ sA->use_dA = 1;
+ }
+ }
+ else
+ {
+ sdiaex_lib(n, 1.0, ai, pA, sda, dA);
+ for(ii=0; ii<n; ii++)
+ dA[ii] = 1.0 / dA[ii];
+ sA->use_dA = 0;
+ }
+ strsm_nn_lu_inv_lib(m, n, pA, sda, dA, pB, sdb, pD, sdd);
+ return;
+ }
+
+
+
+// strsm_right_lower_transposed_notunit
+void strsm_rltn_libstr(int m, int n, float alpha, struct s_strmat *sA, int ai, int aj, struct s_strmat *sB, int bi, int bj, struct s_strmat *sD, int di, int dj)
+ {
+
+ if(ai!=0 | bi!=0 | di!=0 | alpha!=1.0)
+ {
+ printf("\nstrsm_rltn_libstr: feature not implemented yet: ai=%d, bi=%d, di=%d, alpha=%f\n", ai, bi, di, alpha);
+ exit(1);
+ }
+
+ const int bs = 4;
+
+ // TODO alpha
+
+ int sda = sA->cn;
+ int sdb = sB->cn;
+ int sdd = sD->cn;
+ float *pA = sA->pA + aj*bs;
+ float *pB = sB->pA + bj*bs;
+ float *pD = sD->pA + dj*bs;
+ float *dA = sA->dA;
+
+ int i, j;
+
+ if(ai==0 & aj==0)
+ {
+ if(sA->use_dA!=1)
+ {
+ sdiaex_lib(n, 1.0, ai, pA, sda, dA);
+ for(i=0; i<n; i++)
+ dA[i] = 1.0 / dA[i];
+ sA->use_dA = 1;
+ }
+ }
+ else
+ {
+ sdiaex_lib(n, 1.0, ai, pA, sda, dA);
+ for(i=0; i<n; i++)
+ dA[i] = 1.0 / dA[i];
+ sA->use_dA = 0;
+ }
+
+ if(m<=0 || n<=0)
+ return;
+
+ i = 0;
+
+ for(; i<m-3; i+=4)
+ {
+ j = 0;
+ for(; j<n-3; j+=4)
+ {
+ kernel_strsm_nt_rl_inv_4x4_lib4(j, &pD[i*sdd], &pA[j*sda], &pB[j*bs+i*sdb], &pD[j*bs+i*sdd], &pA[j*bs+j*sda], &dA[j]);
+ }
+ if(j<n)
+ {
+ kernel_strsm_nt_rl_inv_4x4_vs_lib4(j, &pD[i*sdd], &pA[j*sda], &pB[j*bs+i*sdb], &pD[j*bs+i*sdd], &pA[j*bs+j*sda], &dA[j], m-i, n-j);
+ }
+ }
+ if(m>i)
+ {
+ goto left_4;
+ }
+
+ // common return if i==m
+ return;
+
+ left_4:
+ j = 0;
+ for(; j<n; j+=4)
+ {
+ kernel_strsm_nt_rl_inv_4x4_vs_lib4(j, &pD[i*sdd], &pA[j*sda], &pB[j*bs+i*sdb], &pD[j*bs+i*sdd], &pA[j*bs+j*sda], &dA[j], m-i, n-j);
+ }
+ return;
+
+ }
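+
+// Usage sketch (illustrative): with sL holding a lower Cholesky factor L, this solves D*L' = B,
+// i.e. D <= B * L^{-T} (alpha must be 1.0, see the check above):
+//   strsm_rltn_libstr(m, n, 1.0, &sL, 0, 0, &sB, 0, 0, &sD, 0, 0);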
+
+
+
+// strsm_right_lower_transposed_unit
+void strsm_rltu_libstr(int m, int n, float alpha, struct s_strmat *sA, int ai, int aj, struct s_strmat *sB, int bi, int bj, struct s_strmat *sD, int di, int dj)
+ {
+ if(ai!=0 | bi!=0 | di!=0 | alpha!=1.0)
+ {
+ printf("\nstrsm_rltu_libstr: feature not implemented yet: ai=%d, bi=%d, di=%d, alpha=%f\n", ai, bi, di, alpha);
+ exit(1);
+ }
+ const int bs = 4;
+ // TODO alpha
+ int sda = sA->cn;
+ int sdb = sB->cn;
+ int sdd = sD->cn;
+ float *pA = sA->pA + aj*bs;
+ float *pB = sB->pA + bj*bs;
+ float *pD = sD->pA + dj*bs;
+ strsm_nt_rl_one_lib(m, n, pA, sda, pB, sdb, pD, sdd);
+ return;
+ }
+
+
+
+// strsm_right_upper_transposed_notunit
+void strsm_rutn_libstr(int m, int n, float alpha, struct s_strmat *sA, int ai, int aj, struct s_strmat *sB, int bi, int bj, struct s_strmat *sD, int di, int dj)
+ {
+ if(ai!=0 | bi!=0 | di!=0 | alpha!=1.0)
+ {
+ printf("\nstrsm_rutn_libstr: feature not implemented yet: ai=%d, bi=%d, di=%d, alpha=%f\n", ai, bi, di, alpha);
+ exit(1);
+ }
+ const int bs = 4;
+ // TODO alpha
+ int sda = sA->cn;
+ int sdb = sB->cn;
+ int sdd = sD->cn;
+ float *pA = sA->pA + aj*bs;
+ float *pB = sB->pA + bj*bs;
+ float *pD = sD->pA + dj*bs;
+ float *dA = sA->dA;
+ int ii;
+ if(ai==0 & aj==0)
+ {
+ if(sA->use_dA!=1)
+ {
+ sdiaex_lib(n, 1.0, ai, pA, sda, dA);
+ for(ii=0; ii<n; ii++)
+ dA[ii] = 1.0 / dA[ii];
+ sA->use_dA = 1;
+ }
+ }
+ else
+ {
+ sdiaex_lib(n, 1.0, ai, pA, sda, dA);
+ for(ii=0; ii<n; ii++)
+ dA[ii] = 1.0 / dA[ii];
+ sA->use_dA = 0;
+ }
+ strsm_nt_ru_inv_lib(m, n, pA, sda, dA, pB, sdb, pD, sdd);
+ return;
+ }
+
+
+
+// strmm_right_upper_transposed_notunit (B, i.e. the first matrix, is triangular !!!)
+void strmm_rutn_libstr(int m, int n, float alpha, struct s_strmat *sB, int bi, int bj, struct s_strmat *sA, int ai, int aj, struct s_strmat *sD, int di, int dj)
+ {
+ if(ai!=0 | bi!=0 | di!=0)
+ {
+ printf("\nstrmm_rutn_libstr: feature not implemented yet: ai=%d, bi=%d, di=%d\n", ai, bi, di);
+ exit(1);
+ }
+ const int bs = 4;
+ int sda = sA->cn;
+ int sdb = sB->cn;
+ int sdd = sD->cn;
+ float *pA = sA->pA + aj*bs;
+ float *pB = sB->pA + bj*bs;
+ float *pD = sD->pA + dj*bs;
+ strmm_nt_ru_lib(m, n, alpha, pA, sda, pB, sdb, 0.0, pD, sdd, pD, sdd);
+ return;
+ }
+
+
+
+// strmm_right_lower_nottransposed_notunit (B, i.e. the first matrix, is triangular !!!)
+void strmm_rlnn_libstr(int m, int n, float alpha, struct s_strmat *sB, int bi, int bj, struct s_strmat *sA, int ai, int aj, struct s_strmat *sD, int di, int dj)
+ {
+
+ const int bs = 4;
+
+ int sda = sA->cn;
+ int sdb = sB->cn;
+ int sdd = sD->cn;
+ float *pA = sA->pA + aj*bs;
+ float *pB = sB->pA + bj*bs;
+ float *pD = sD->pA + dj*bs;
+
+ pA += ai/bs*bs*sda;
+ pB += bi/bs*bs*sdb;
+ int offsetB = bi%bs;
+ int di0 = di-ai%bs;
+ int offsetD;
+ if(di0>=0)
+ {
+ pD += di0/bs*bs*sdd;
+ offsetD = di0%bs;
+ }
+ else
+ {
+ pD += -4*sdd;
+ offsetD = bs+di0;
+ }
+
+ int ii, jj;
+
+ ii = 0;
+ if(ai%bs!=0)
+ {
+ jj = 0;
+ for(; jj<n; jj+=4)
+ {
+ kernel_strmm_nn_rl_4x4_gen_lib4(n-jj, &alpha, &pA[ii*sda+jj*bs], offsetB, &pB[jj*sdb+jj*bs], sdb, offsetD, &pD[ii*sdd+jj*bs], sdd, ai%bs, m-ii, 0, n-jj);
+ }
+ m -= bs-ai%bs;
+ pA += bs*sda;
+ pD += bs*sdd;
+ }
+ if(offsetD==0)
+ {
+ for(; ii<m-3; ii+=4)
+ {
+ jj = 0;
+ for(; jj<n-5; jj+=4)
+ {
+ kernel_strmm_nn_rl_4x4_lib4(n-jj, &alpha, &pA[ii*sda+jj*bs], offsetB, &pB[jj*sdb+jj*bs], sdb, &pD[ii*sdd+jj*bs]);
+ }
+ for(; jj<n; jj+=4)
+ {
+ kernel_strmm_nn_rl_4x4_gen_lib4(n-jj, &alpha, &pA[ii*sda+jj*bs], offsetB, &pB[jj*sdb+jj*bs], sdb, 0, &pD[ii*sdd+jj*bs], sdd, 0, 4, 0, n-jj);
+ }
+ }
+ if(ii<m)
+ {
+ goto left_4;
+ }
+ }
+ else
+ {
+ for(; ii<m; ii+=4)
+ {
+ jj = 0;
+ for(; jj<n; jj+=4)
+ {
+ kernel_strmm_nn_rl_4x4_gen_lib4(n-jj, &alpha, &pA[ii*sda+jj*bs], offsetB, &pB[jj*sdb+jj*bs], sdb, offsetD, &pD[ii*sdd+jj*bs], sdd, 0, m-ii, 0, n-jj);
+ }
+ }
+ }
+
+ // common return if i==m
+ return;
+
+ // clean up loops definitions
+
+ left_4:
+ jj = 0;
+ for(; jj<n; jj+=4)
+ {
+ kernel_strmm_nn_rl_4x4_gen_lib4(n-jj, &alpha, &pA[ii*sda+jj*bs], offsetB, &pB[jj*sdb+jj*bs], sdb, offsetD, &pD[ii*sdd+jj*bs], sdd, 0, m-ii, 0, n-jj);
+ }
+ return;
+
+ }
+
+
+
+void ssyrk_ln_libstr(int m, int k, float alpha, struct s_strmat *sA, int ai, int aj, struct s_strmat *sB, int bi, int bj, float beta, struct s_strmat *sC, int ci, int cj, struct s_strmat *sD, int di, int dj)
+ {
+
+ if(m<=0)
+ return;
+
+ if(ai!=0 | bi!=0 | ci!=0 | di!=0)
+ {
+		printf("\nssyrk_ln_libstr: feature not implemented yet: ai=%d, bi=%d, ci=%d, di=%d\n", ai, bi, ci, di);
+ exit(1);
+ }
+
+ const int bs = 4;
+
+ int sda = sA->cn;
+ int sdb = sB->cn;
+ int sdc = sC->cn;
+ int sdd = sD->cn;
+ float *pA = sA->pA + aj*bs;
+ float *pB = sB->pA + bj*bs;
+ float *pC = sC->pA + cj*bs;
+ float *pD = sD->pA + dj*bs;
+
+// ssyrk_nt_l_lib(m, n, k, alpha, pA, sda, pB, sdb, beta, pC, sdc, pD, sdd);
+
+ int i, j, l;
+
+ i = 0;
+
+ for(; i<m-3; i+=4)
+ {
+ j = 0;
+ for(; j<i; j+=4)
+ {
+ kernel_sgemm_nt_4x4_lib4(k, &alpha, &pA[i*sda], &pB[j*sdb], &beta, &pC[j*bs+i*sdc], &pD[j*bs+i*sdd]);
+ }
+ kernel_ssyrk_nt_l_4x4_lib4(k, &alpha, &pA[i*sda], &pB[j*sdb], &beta, &pC[j*bs+i*sdc], &pD[j*bs+i*sdd]);
+ }
+ if(m>i)
+ {
+ goto left_4;
+ }
+
+ // common return if i==m
+ return;
+
+ // clean up loops definitions
+
+ left_4:
+ j = 0;
+ for(; j<i; j+=4)
+ {
+ kernel_sgemm_nt_4x4_vs_lib4(k, &alpha, &pA[i*sda], &pB[j*sdb], &beta, &pC[j*bs+i*sdc], &pD[j*bs+i*sdd], m-i, m-j);
+ }
+ kernel_ssyrk_nt_l_4x4_vs_lib4(k, &alpha, &pA[i*sda], &pB[j*sdb], &beta, &pC[j*bs+i*sdc], &pD[j*bs+i*sdd], m-i, m-j);
+ return;
+
+ }
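+
+// Usage sketch (illustrative): passing sA twice computes the lower triangle of the Gram matrix,
+// D <= beta*C + alpha*A*A':
+//   ssyrk_ln_libstr(m, k, 1.0, &sA, 0, 0, &sA, 0, 0, 0.0, &sC, 0, 0, &sD, 0, 0);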
+
+
+
+void ssyrk_ln_mn_libstr(int m, int n, int k, float alpha, struct s_strmat *sA, int ai, int aj, struct s_strmat *sB, int bi, int bj, float beta, struct s_strmat *sC, int ci, int cj, struct s_strmat *sD, int di, int dj)
+ {
+
+ if(m<=0 || n<=0)
+ return;
+
+ if(ai!=0 | bi!=0 | ci!=0 | di!=0)
+ {
+		printf("\nssyrk_ln_mn_libstr: feature not implemented yet: ai=%d, bi=%d, ci=%d, di=%d\n", ai, bi, ci, di);
+ exit(1);
+ }
+
+ const int bs = 4;
+
+ int sda = sA->cn;
+ int sdb = sB->cn;
+ int sdc = sC->cn;
+ int sdd = sD->cn;
+ float *pA = sA->pA + aj*bs;
+ float *pB = sB->pA + bj*bs;
+ float *pC = sC->pA + cj*bs;
+ float *pD = sD->pA + dj*bs;
+
+// ssyrk_nt_l_lib(m, n, k, alpha, pA, sda, pB, sdb, beta, pC, sdc, pD, sdd);
+
+ int i, j, l;
+
+ i = 0;
+
+ for(; i<m-3; i+=4)
+ {
+ j = 0;
+ for(; j<i && j<n-3; j+=4)
+ {
+ kernel_sgemm_nt_4x4_lib4(k, &alpha, &pA[i*sda], &pB[j*sdb], &beta, &pC[j*bs+i*sdc], &pD[j*bs+i*sdd]);
+ }
+ if(j<n)
+ {
+ if(i<j) // dgemm
+ {
+ kernel_sgemm_nt_4x4_vs_lib4(k, &alpha, &pA[i*sda], &pB[j*sdb], &beta, &pC[j*bs+i*sdc], &pD[j*bs+i*sdd], m-i, n-j);
+ }
+ else // dsyrk
+ {
+ if(j<n-3)
+ {
+ kernel_ssyrk_nt_l_4x4_lib4(k, &alpha, &pA[i*sda], &pB[j*sdb], &beta, &pC[j*bs+i*sdc], &pD[j*bs+i*sdd]);
+ }
+ else
+ {
+ kernel_ssyrk_nt_l_4x4_vs_lib4(k, &alpha, &pA[i*sda], &pB[j*sdb], &beta, &pC[j*bs+i*sdc], &pD[j*bs+i*sdd], m-i, n-j);
+ }
+ }
+ }
+ }
+ if(m>i)
+ {
+ goto left_4;
+ }
+
+ // common return if i==m
+ return;
+
+ // clean up loops definitions
+
+ left_4:
+ j = 0;
+ for(; j<i && j<n; j+=4)
+ {
+ kernel_sgemm_nt_4x4_vs_lib4(k, &alpha, &pA[i*sda], &pB[j*sdb], &beta, &pC[j*bs+i*sdc], &pD[j*bs+i*sdd], m-i, n-j);
+ }
+ if(j<n)
+ {
+ kernel_ssyrk_nt_l_4x4_vs_lib4(k, &alpha, &pA[i*sda], &pB[j*sdb], &beta, &pC[j*bs+i*sdc], &pD[j*bs+i*sdd], m-i, n-j);
+ }
+ return;
+
+ }
+
+
+
+#else
+
+#error : wrong LA choice
+
+#endif
+
+
+
+
diff --git a/blas/s_blas3_lib8.c b/blas/s_blas3_lib8.c
new file mode 100644
index 0000000..f0f5144
--- /dev/null
+++ b/blas/s_blas3_lib8.c
@@ -0,0 +1,1325 @@
+/**************************************************************************************************
+* *
+* This file is part of BLASFEO. *
+* *
+* BLASFEO -- BLAS For Embedded Optimization. *
+* Copyright (C) 2016-2017 by Gianluca Frison. *
+* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl. *
+* All rights reserved. *
+* *
+* HPMPC is free software; you can redistribute it and/or *
+* modify it under the terms of the GNU Lesser General Public *
+* License as published by the Free Software Foundation; either *
+* version 2.1 of the License, or (at your option) any later version. *
+* *
+* HPMPC is distributed in the hope that it will be useful, *
+* but WITHOUT ANY WARRANTY; without even the implied warranty of *
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. *
+* See the GNU Lesser General Public License for more details. *
+* *
+* You should have received a copy of the GNU Lesser General Public *
+* License along with HPMPC; if not, write to the Free Software *
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA *
+* *
+* Author: Gianluca Frison, giaf (at) dtu.dk *
+* gianluca.frison (at) imtek.uni-freiburg.de *
+* *
+**************************************************************************************************/
+
+#include <stdlib.h>
+#include <stdio.h> // printf is also used unconditionally by the not-implemented-yet error paths below
+
+#include "../include/blasfeo_common.h"
+#include "../include/blasfeo_s_kernel.h"
+#include "../include/blasfeo_s_aux.h"
+
+
+
+void sgemm_nt_libstr(int m, int n, int k, float alpha, struct s_strmat *sA, int ai, int aj, struct s_strmat *sB, int bi, int bj, float beta, struct s_strmat *sC, int ci, int cj, struct s_strmat *sD, int di, int dj)
+ {
+
+ if(m==0 | n==0)
+ return;
+
+#if defined(DIM_CHECK)
+	// TODO check that sA!=sD or that if sA==sD then they do not overlap (same for sB)
+ // non-negative size
+ if(m<0) printf("\n****** sgemm_nt_libstr : m<0 : %d<0 *****\n", m);
+ if(n<0) printf("\n****** sgemm_nt_libstr : n<0 : %d<0 *****\n", n);
+ if(k<0) printf("\n****** sgemm_nt_libstr : k<0 : %d<0 *****\n", k);
+ // non-negative offset
+ if(ai<0) printf("\n****** sgemm_nt_libstr : ai<0 : %d<0 *****\n", ai);
+ if(aj<0) printf("\n****** sgemm_nt_libstr : aj<0 : %d<0 *****\n", aj);
+ if(bi<0) printf("\n****** sgemm_nt_libstr : bi<0 : %d<0 *****\n", bi);
+ if(bj<0) printf("\n****** sgemm_nt_libstr : bj<0 : %d<0 *****\n", bj);
+ if(ci<0) printf("\n****** sgemm_nt_libstr : ci<0 : %d<0 *****\n", ci);
+ if(cj<0) printf("\n****** sgemm_nt_libstr : cj<0 : %d<0 *****\n", cj);
+ if(di<0) printf("\n****** sgemm_nt_libstr : di<0 : %d<0 *****\n", di);
+ if(dj<0) printf("\n****** sgemm_nt_libstr : dj<0 : %d<0 *****\n", dj);
+ // inside matrix
+ // A: m x k
+ if(ai+m > sA->m) printf("\n***** sgemm_nt_libstr : ai+m > row(A) : %d+%d > %d *****\n", ai, m, sA->m);
+ if(aj+k > sA->n) printf("\n***** sgemm_nt_libstr : aj+k > col(A) : %d+%d > %d *****\n", aj, k, sA->n);
+ // B: n x k
+ if(bi+n > sB->m) printf("\n***** sgemm_nt_libstr : bi+n > row(B) : %d+%d > %d *****\n", bi, n, sB->m);
+ if(bj+k > sB->n) printf("\n***** sgemm_nt_libstr : bj+k > col(B) : %d+%d > %d *****\n", bj, k, sB->n);
+ // C: m x n
+	if(ci+m > sC->m) printf("\n***** sgemm_nt_libstr : ci+m > row(C) : %d+%d > %d *****\n", ci, m, sC->m);
+	if(cj+n > sC->n) printf("\n***** sgemm_nt_libstr : cj+n > col(C) : %d+%d > %d *****\n", cj, n, sC->n);
+	// D: m x n
+	if(di+m > sD->m) printf("\n***** sgemm_nt_libstr : di+m > row(D) : %d+%d > %d *****\n", di, m, sD->m);
+	if(dj+n > sD->n) printf("\n***** sgemm_nt_libstr : dj+n > col(D) : %d+%d > %d *****\n", dj, n, sD->n);
+#endif
+
+ const int bs = 8;
+
+ int sda = sA->cn;
+ int sdb = sB->cn;
+ int sdc = sC->cn;
+ int sdd = sD->cn;
+ float *pA = sA->pA + aj*bs;
+ float *pB = sB->pA + bj*bs;
+ float *pC = sC->pA + cj*bs;
+ float *pD = sD->pA + dj*bs;
+
+ int i, j, l;
+
+ i = 0;
+
+#if defined(TARGET_X64_INTEL_HASWELL)
+ for(; i<m-23; i+=24)
+ {
+ j = 0;
+ for(; j<n-7; j+=8)
+ {
+ kernel_sgemm_nt_24x4_lib8(k, &alpha, &pA[i*sda], sda, &pB[0+j*sdb], &beta, &pC[(j+0)*bs+i*sdc], sdc, &pD[(j+0)*bs+i*sdd], sdd);
+ kernel_sgemm_nt_24x4_lib8(k, &alpha, &pA[i*sda], sda, &pB[4+j*sdb], &beta, &pC[(j+4)*bs+i*sdc], sdc, &pD[(j+4)*bs+i*sdd], sdd);
+ }
+ if(j<n)
+ {
+ if(j<n-3)
+ {
+ kernel_sgemm_nt_24x4_lib8(k, &alpha, &pA[i*sda], sda, &pB[0+j*sdb], &beta, &pC[(j+0)*bs+i*sdc], sdc, &pD[(j+0)*bs+i*sdd], sdd);
+ if(j<n-4)
+ {
+ kernel_sgemm_nt_24x4_vs_lib8(k, &alpha, &pA[i*sda], sda, &pB[4+j*sdb], &beta, &pC[(j+4)*bs+i*sdc], sdc, &pD[(j+4)*bs+i*sdd], sdd, 8, n-(j+4));
+ }
+ }
+ else
+ {
+ kernel_sgemm_nt_24x4_vs_lib8(k, &alpha, &pA[i*sda], sda, &pB[0+j*sdb], &beta, &pC[(j+0)*bs+i*sdc], sdc, &pD[(j+0)*bs+i*sdd], sdd, 8, n-j);
+ }
+ }
+ }
+ if(m-i>0)
+ {
+ if(m-i<=4)
+ {
+ goto left_4;
+ }
+ else if(m-i<=8)
+ {
+ goto left_8;
+ }
+ else if(m-i<=12)
+ {
+ goto left_12;
+ }
+ else if(m-i<=16)
+ {
+ goto left_16;
+ }
+// else if(m-i<=20)
+// {
+// goto left_20;
+// }
+ else
+ {
+ goto left_24;
+ }
+ }
+#else
+ for(; i<m-15; i+=16)
+ {
+ j = 0;
+ for(; j<n-7; j+=8)
+ {
+ kernel_sgemm_nt_16x4_lib8(k, &alpha, &pA[i*sda], sda, &pB[0+j*sdb], &beta, &pC[(j+0)*bs+i*sdc], sdc, &pD[(j+0)*bs+i*sdd], sdd);
+ kernel_sgemm_nt_16x4_lib8(k, &alpha, &pA[i*sda], sda, &pB[4+j*sdb], &beta, &pC[(j+4)*bs+i*sdc], sdc, &pD[(j+4)*bs+i*sdd], sdd);
+ }
+ if(j<n)
+ {
+ if(j<n-3)
+ {
+ kernel_sgemm_nt_16x4_lib8(k, &alpha, &pA[i*sda], sda, &pB[0+j*sdb], &beta, &pC[(j+0)*bs+i*sdc], sdc, &pD[(j+0)*bs+i*sdd], sdd);
+ if(j<n-4)
+ {
+ kernel_sgemm_nt_16x4_vs_lib8(k, &alpha, &pA[i*sda], sda, &pB[4+j*sdb], &beta, &pC[(j+4)*bs+i*sdc], sdc, &pD[(j+4)*bs+i*sdd], sdd, 8, n-(j+4));
+ }
+ }
+ else
+ {
+ kernel_sgemm_nt_16x4_vs_lib8(k, &alpha, &pA[i*sda], sda, &pB[0+j*sdb], &beta, &pC[(j+0)*bs+i*sdc], sdc, &pD[(j+0)*bs+i*sdd], sdd, 8, n-j);
+ }
+ }
+ }
+ if(m-i>0)
+ {
+ if(m-i<=4)
+ {
+ goto left_4;
+ }
+ else if(m-i<=8)
+ {
+ goto left_8;
+ }
+ else if(m-i<=12)
+ {
+ goto left_12;
+ }
+ else
+ {
+ goto left_16;
+ }
+ }
+#endif
+
+ // common return if i==m
+ return;
+
+ // clean up loops definitions
+
+ left_24:
+ j = 0;
+ for(; j<n-4; j+=8)
+ {
+ kernel_sgemm_nt_24x4_vs_lib8(k, &alpha, &pA[i*sda], sda, &pB[0+j*sdb], &beta, &pC[(j+0)*bs+i*sdc], sdc, &pD[(j+0)*bs+i*sdd], sdd, m-i, 4);
+ kernel_sgemm_nt_24x4_vs_lib8(k, &alpha, &pA[i*sda], sda, &pB[4+j*sdb], &beta, &pC[(j+4)*bs+i*sdc], sdc, &pD[(j+4)*bs+i*sdd], sdd, m-i, n-(j+4));
+ }
+ if(j<n)
+ {
+ kernel_sgemm_nt_24x4_vs_lib8(k, &alpha, &pA[i*sda], sda, &pB[0+j*sdb], &beta, &pC[(j+0)*bs+i*sdc], sdc, &pD[(j+0)*bs+i*sdd], sdd, m-i, n-j);
+ }
+ return;
+
+#if defined(TARGET_X64_INTEL_HASWELL)
+ left_20:
+ j = 0;
+ for(; j<n-4; j+=8)
+ {
+ kernel_sgemm_nt_16x4_vs_lib8(k, &alpha, &pA[i*sda], sda, &pB[0+j*sdb], &beta, &pC[(j+0)*bs+i*sdc], sdc, &pD[(j+0)*bs+i*sdd], sdd, m-i, 4);
+ kernel_sgemm_nt_16x4_vs_lib8(k, &alpha, &pA[i*sda], sda, &pB[4+j*sdb], &beta, &pC[(j+4)*bs+i*sdc], sdc, &pD[(j+4)*bs+i*sdd], sdd, m-i, n-(j+4));
+ kernel_sgemm_nt_4x8_vs_lib8(k, &alpha, &pA[(i+16)*sda], &pB[0+j*sdb], &beta, &pC[(j+0)*bs+(i+16)*sdc], &pD[(j+0)*bs+(i+16)*sdd], m-(i+16), n-j);
+ }
+ if(j<n)
+ {
+ kernel_sgemm_nt_16x4_vs_lib8(k, &alpha, &pA[i*sda], sda, &pB[0+j*sdb], &beta, &pC[(j+0)*bs+i*sdc], sdc, &pD[(j+0)*bs+i*sdd], sdd, m-i, n-j);
+ kernel_sgemm_nt_4x8_vs_lib8(k, &alpha, &pA[(i+16)*sda], &pB[0+j*sdb], &beta, &pC[(j+0)*bs+(i+16)*sdc], &pD[(j+0)*bs+(i+16)*sdd], m-(i+16), n-j);
+ }
+ return;
+#endif
+
+ left_16:
+ j = 0;
+ for(; j<n-4; j+=8)
+ {
+ kernel_sgemm_nt_16x4_vs_lib8(k, &alpha, &pA[i*sda], sda, &pB[0+j*sdb], &beta, &pC[(j+0)*bs+i*sdc], sdc, &pD[(j+0)*bs+i*sdd], sdd, m-i, 4);
+ kernel_sgemm_nt_16x4_vs_lib8(k, &alpha, &pA[i*sda], sda, &pB[4+j*sdb], &beta, &pC[(j+4)*bs+i*sdc], sdc, &pD[(j+4)*bs+i*sdd], sdd, m-i, n-(j+4));
+ }
+ if(j<n)
+ {
+ kernel_sgemm_nt_16x4_vs_lib8(k, &alpha, &pA[i*sda], sda, &pB[0+j*sdb], &beta, &pC[(j+0)*bs+i*sdc], sdc, &pD[(j+0)*bs+i*sdd], sdd, m-i, n-j);
+ }
+ return;
+
+#if defined(TARGET_X64_INTEL_HASWELL) | defined(TARGET_X64_INTEL_SANDY_BRIDGE)
+	left_12:
+ j = 0;
+ for(; j<n-4; j+=8)
+ {
+ kernel_sgemm_nt_8x8_vs_lib8(k, &alpha, &pA[i*sda], &pB[0+j*sdb], &beta, &pC[(j+0)*bs+i*sdc], &pD[(j+0)*bs+i*sdd], m-i, n-j);
+ kernel_sgemm_nt_4x8_vs_lib8(k, &alpha, &pA[(i+8)*sda], &pB[0+j*sdb], &beta, &pC[(j+0)*bs+(i+8)*sdc], &pD[(j+0)*bs+(i+8)*sdd], m-(i+8), n-j);
+ }
+ if(j<n)
+ {
+ kernel_sgemm_nt_8x4_vs_lib8(k, &alpha, &pA[i*sda], &pB[0+j*sdb], &beta, &pC[(j+0)*bs+i*sdc], &pD[(j+0)*bs+i*sdd], m-i, n-j);
+ kernel_sgemm_nt_4x8_vs_lib8(k, &alpha, &pA[(i+8)*sda], &pB[0+j*sdb], &beta, &pC[(j+0)*bs+(i+8)*sdc], &pD[(j+0)*bs+(i+8)*sdd], m-(i+8), n-j);
+ }
+ return;
+#endif
+
+ left_8:
+ j = 0;
+ for(; j<n-4; j+=8)
+ {
+ kernel_sgemm_nt_8x8_vs_lib8(k, &alpha, &pA[i*sda], &pB[0+j*sdb], &beta, &pC[(j+0)*bs+i*sdc], &pD[(j+0)*bs+i*sdd], m-i, n-j);
+ }
+ if(j<n)
+ {
+ kernel_sgemm_nt_8x4_vs_lib8(k, &alpha, &pA[i*sda], &pB[0+j*sdb], &beta, &pC[(j+0)*bs+i*sdc], &pD[(j+0)*bs+i*sdd], m-i, n-j);
+ }
+ return;
+
+#if defined(TARGET_X64_INTEL_HASWELL) | defined(TARGET_X64_INTEL_SANDY_BRIDGE)
+ left_4:
+ j = 0;
+ for(; j<n; j+=8)
+ {
+ kernel_sgemm_nt_4x8_vs_lib8(k, &alpha, &pA[i*sda], &pB[0+j*sdb], &beta, &pC[(j+0)*bs+i*sdc], &pD[(j+0)*bs+i*sdd], m-i, n-j);
+ }
+ return;
+#endif
+
+ }
+
+
+
+void sgemm_nn_libstr(int m, int n, int k, float alpha, struct s_strmat *sA, int ai, int aj, struct s_strmat *sB, int bi, int bj, float beta, struct s_strmat *sC, int ci, int cj, struct s_strmat *sD, int di, int dj)
+ {
+
+ if(m==0 | n==0)
+ return;
+
+#if defined(DIM_CHECK)
+ // non-negative size
+	if(m<0) printf("\n****** sgemm_nn_libstr : m<0 : %d<0 *****\n", m);
+	if(n<0) printf("\n****** sgemm_nn_libstr : n<0 : %d<0 *****\n", n);
+	if(k<0) printf("\n****** sgemm_nn_libstr : k<0 : %d<0 *****\n", k);
+	// non-negative offset
+	if(ai<0) printf("\n****** sgemm_nn_libstr : ai<0 : %d<0 *****\n", ai);
+	if(aj<0) printf("\n****** sgemm_nn_libstr : aj<0 : %d<0 *****\n", aj);
+	if(bi<0) printf("\n****** sgemm_nn_libstr : bi<0 : %d<0 *****\n", bi);
+	if(bj<0) printf("\n****** sgemm_nn_libstr : bj<0 : %d<0 *****\n", bj);
+	if(ci<0) printf("\n****** sgemm_nn_libstr : ci<0 : %d<0 *****\n", ci);
+	if(cj<0) printf("\n****** sgemm_nn_libstr : cj<0 : %d<0 *****\n", cj);
+	if(di<0) printf("\n****** sgemm_nn_libstr : di<0 : %d<0 *****\n", di);
+	if(dj<0) printf("\n****** sgemm_nn_libstr : dj<0 : %d<0 *****\n", dj);
+ // inside matrix
+ // A: m x k
+ if(ai+m > sA->m) printf("\n***** sgemm_nn_libstr : ai+m > row(A) : %d+%d > %d *****\n\n", ai, m, sA->m);
+ if(aj+k > sA->n) printf("\n***** sgemm_nn_libstr : aj+k > col(A) : %d+%d > %d *****\n\n", aj, k, sA->n);
+ // B: k x n
+ if(bi+k > sB->m) printf("\n***** sgemm_nn_libstr : bi+k > row(B) : %d+%d > %d *****\n\n", bi, k, sB->m);
+ if(bj+n > sB->n) printf("\n***** sgemm_nn_libstr : bj+n > col(B) : %d+%d > %d *****\n\n", bj, n, sB->n);
+ // C: m x n
+	if(ci+m > sC->m) printf("\n***** sgemm_nn_libstr : ci+m > row(C) : %d+%d > %d *****\n\n", ci, m, sC->m);
+	if(cj+n > sC->n) printf("\n***** sgemm_nn_libstr : cj+n > col(C) : %d+%d > %d *****\n\n", cj, n, sC->n);
+	// D: m x n
+	if(di+m > sD->m) printf("\n***** sgemm_nn_libstr : di+m > row(D) : %d+%d > %d *****\n\n", di, m, sD->m);
+	if(dj+n > sD->n) printf("\n***** sgemm_nn_libstr : dj+n > col(D) : %d+%d > %d *****\n\n", dj, n, sD->n);
+#endif
+
+ const int bs = 8;
+
+ int sda = sA->cn;
+ int sdb = sB->cn;
+ int sdc = sC->cn;
+ int sdd = sD->cn;
+ float *pA = sA->pA + aj*bs;
+ float *pB = sB->pA + bj*bs + bi/bs*bs*sdb;
+ float *pC = sC->pA + cj*bs;
+ float *pD = sD->pA + dj*bs;
+
+ int offsetB = bi%bs;
+
+ int i, j, l;
+
+ i = 0;
+
+#if defined(TARGET_X64_INTEL_HASWELL)
+ for(; i<m-23; i+=24)
+ {
+ j = 0;
+ for(; j<n-7; j+=8)
+ {
+ kernel_sgemm_nn_24x4_lib8(k, &alpha, &pA[i*sda], sda, offsetB, &pB[(j+0)*bs], sdb, &beta, &pC[(j+0)*bs+i*sdc], sdc, &pD[(j+0)*bs+i*sdd], sdd);
+ kernel_sgemm_nn_24x4_lib8(k, &alpha, &pA[i*sda], sda, offsetB, &pB[(j+4)*bs], sdb, &beta, &pC[(j+4)*bs+i*sdc], sdc, &pD[(j+4)*bs+i*sdd], sdd);
+ }
+ if(j<n)
+ {
+ if(j<n-3)
+ {
+ kernel_sgemm_nn_24x4_lib8(k, &alpha, &pA[i*sda], sda, offsetB, &pB[(j+0)*bs], sdb, &beta, &pC[(j+0)*bs+i*sdc], sdc, &pD[(j+0)*bs+i*sdd], sdd);
+ if(j<n-4)
+ {
+ kernel_sgemm_nn_24x4_vs_lib8(k, &alpha, &pA[i*sda], sda, offsetB, &pB[(j+4)*bs], sdb, &beta, &pC[(j+4)*bs+i*sdc], sdc, &pD[(j+4)*bs+i*sdd], sdd, 16, n-(j+4));
+ }
+ }
+ else
+ {
+ kernel_sgemm_nn_24x4_vs_lib8(k, &alpha, &pA[i*sda], sda, offsetB, &pB[(j+0)*bs], sdb, &beta, &pC[(j+0)*bs+i*sdc], sdc, &pD[(j+0)*bs+i*sdd], sdd, 16, n-j);
+ }
+ }
+ }
+ if(m>i)
+ {
+ if(m-i<=8)
+ {
+ goto left_8;
+ }
+ else if(m-i<=16)
+ {
+ goto left_16;
+ }
+ else
+ {
+ goto left_24;
+ }
+ }
+#else
+#if 1
+ for(; i<m-15; i+=16)
+ {
+ j = 0;
+ for(; j<n-7; j+=8)
+ {
+ kernel_sgemm_nn_16x4_lib8(k, &alpha, &pA[i*sda], sda, offsetB, &pB[(j+0)*bs], sdb, &beta, &pC[(j+0)*bs+i*sdc], sdc, &pD[(j+0)*bs+i*sdd], sdd);
+ kernel_sgemm_nn_16x4_lib8(k, &alpha, &pA[i*sda], sda, offsetB, &pB[(j+4)*bs], sdb, &beta, &pC[(j+4)*bs+i*sdc], sdc, &pD[(j+4)*bs+i*sdd], sdd);
+ }
+ if(j<n)
+ {
+ if(j<n-3)
+ {
+ kernel_sgemm_nn_16x4_lib8(k, &alpha, &pA[i*sda], sda, offsetB, &pB[(j+0)*bs], sdb, &beta, &pC[(j+0)*bs+i*sdc], sdc, &pD[(j+0)*bs+i*sdd], sdd);
+ if(j<n-4)
+ {
+ kernel_sgemm_nn_16x4_vs_lib8(k, &alpha, &pA[i*sda], sda, offsetB, &pB[(j+4)*bs], sdb, &beta, &pC[(j+4)*bs+i*sdc], sdc, &pD[(j+4)*bs+i*sdd], sdd, 16, n-(j+4));
+ }
+ }
+ else
+ {
+ kernel_sgemm_nn_16x4_vs_lib8(k, &alpha, &pA[i*sda], sda, offsetB, &pB[(j+0)*bs], sdb, &beta, &pC[(j+0)*bs+i*sdc], sdc, &pD[(j+0)*bs+i*sdd], sdd, 16, n-j);
+ }
+ }
+ }
+ if(m>i)
+ {
+ if(m-i<=8)
+ {
+ goto left_8;
+ }
+ else
+ {
+ goto left_16;
+ }
+ }
+#else
+ for(; i<m-7; i+=8)
+ {
+ j = 0;
+ for(; j<n-7; j+=8)
+ {
+#if 1
+ kernel_sgemm_nn_8x8_lib8(k, &alpha, &pA[i*sda], offsetB, &pB[(j+0)*bs], sdb, &beta, &pC[(j+0)*bs+i*sdc], &pD[(j+0)*bs+i*sdd]);
+#else
+ kernel_sgemm_nn_8x4_lib8(k, &alpha, &pA[i*sda], offsetB, &pB[(j+0)*bs], sdb, &beta, &pC[(j+0)*bs+i*sdc], &pD[(j+0)*bs+i*sdd]);
+ kernel_sgemm_nn_8x4_lib8(k, &alpha, &pA[i*sda], offsetB, &pB[(j+4)*bs], sdb, &beta, &pC[(j+4)*bs+i*sdc], &pD[(j+4)*bs+i*sdd]);
+#endif
+ }
+ if(j<n)
+ {
+ if(j<n-3)
+ {
+ kernel_sgemm_nn_8x4_lib8(k, &alpha, &pA[i*sda], offsetB, &pB[(j+0)*bs], sdb, &beta, &pC[(j+0)*bs+i*sdc], &pD[(j+0)*bs+i*sdd]);
+ if(j<n-4)
+ {
+ kernel_sgemm_nn_8x4_gen_lib8(k, &alpha, &pA[i*sda], offsetB, &pB[(j+4)*bs], sdb, &beta, 0, &pC[(j+4)*bs+i*sdc], sdc, 0, &pD[(j+4)*bs+i*sdd], sdd, 0, 8, 0, n-(j+4));
+ }
+ }
+ else
+ {
+ kernel_sgemm_nn_8x4_gen_lib8(k, &alpha, &pA[i*sda], offsetB, &pB[(j+0)*bs], sdb, &beta, 0, &pC[(j+0)*bs+i*sdc], sdc, 0, &pD[(j+0)*bs+i*sdd], sdd, 0, 8, 0, n-j);
+ }
+ }
+ }
+ if(m>i)
+ {
+ goto left_8;
+ }
+#endif
+#endif
+
+ // common return if i==m
+ return;
+
+#if defined(TARGET_X64_INTEL_HASWELL)
+ left_24:
+ j = 0;
+ for(; j<n-4; j+=8)
+ {
+ kernel_sgemm_nn_24x4_vs_lib8(k, &alpha, &pA[i*sda], sda, offsetB, &pB[(j+0)*bs], sdb, &beta, &pC[(j+0)*bs+i*sdc], sdc, &pD[(j+0)*bs+i*sdd], sdd, m-i, n-j);
+ kernel_sgemm_nn_24x4_vs_lib8(k, &alpha, &pA[i*sda], sda, offsetB, &pB[(j+4)*bs], sdb, &beta, &pC[(j+4)*bs+i*sdc], sdc, &pD[(j+4)*bs+i*sdd], sdd, m-i, n-(j+4));
+ }
+ if(j<n)
+ {
+ kernel_sgemm_nn_24x4_vs_lib8(k, &alpha, &pA[i*sda], sda, offsetB, &pB[(j+0)*bs], sdb, &beta, &pC[(j+0)*bs+i*sdc], sdc, &pD[(j+0)*bs+i*sdd], sdd, m-i, n-j);
+ }
+ return;
+#endif
+
+ left_16:
+ j = 0;
+ for(; j<n-4; j+=8)
+ {
+ kernel_sgemm_nn_16x4_vs_lib8(k, &alpha, &pA[i*sda], sda, offsetB, &pB[(j+0)*bs], sdb, &beta, &pC[(j+0)*bs+i*sdc], sdc, &pD[(j+0)*bs+i*sdd], sdd, m-i, n-j);
+ kernel_sgemm_nn_16x4_vs_lib8(k, &alpha, &pA[i*sda], sda, offsetB, &pB[(j+4)*bs], sdb, &beta, &pC[(j+4)*bs+i*sdc], sdc, &pD[(j+4)*bs+i*sdd], sdd, m-i, n-(j+4));
+ }
+ if(j<n)
+ {
+ kernel_sgemm_nn_16x4_vs_lib8(k, &alpha, &pA[i*sda], sda, offsetB, &pB[(j+0)*bs], sdb, &beta, &pC[(j+0)*bs+i*sdc], sdc, &pD[(j+0)*bs+i*sdd], sdd, m-i, n-j);
+ }
+ return;
+
+ left_8:
+ j = 0;
+ for(; j<n-4; j+=8)
+ {
+ kernel_sgemm_nn_8x8_vs_lib8(k, &alpha, &pA[i*sda], offsetB, &pB[(j+0)*bs], sdb, &beta, &pC[(j+0)*bs+i*sdc], &pD[(j+0)*bs+i*sdd], m-i, n-j);
+ }
+ if(j<n)
+ {
+ kernel_sgemm_nn_8x4_vs_lib8(k, &alpha, &pA[i*sda], offsetB, &pB[(j+0)*bs], sdb, &beta, &pC[(j+0)*bs+i*sdc], &pD[(j+0)*bs+i*sdd], m-i, n-j);
+ }
+ return;
+
+ }
+
+
+
+void ssyrk_ln_libstr(int m, int k, float alpha, struct s_strmat *sA, int ai, int aj, struct s_strmat *sB, int bi, int bj, float beta, struct s_strmat *sC, int ci, int cj, struct s_strmat *sD, int di, int dj)
+ {
+
+ if(m<=0)
+ return;
+
+ if(ci>0 | di>0)
+ {
+		printf("\nssyrk_ln_libstr: feature not implemented yet: ci=%d, di=%d\n", ci, di);
+ exit(1);
+ }
+
+ const int bs = 8;
+
+ int i, j;
+
+ int sda = sA->cn;
+ int sdb = sB->cn;
+ int sdc = sC->cn;
+ int sdd = sD->cn;
+ float *pA = sA->pA + aj*bs;
+ float *pB = sB->pA + bj*bs;
+ float *pC = sC->pA + cj*bs;
+ float *pD = sD->pA + dj*bs;
+
+ i = 0;
+#if defined(TARGET_X64_INTEL_HASWELL)
+ for(; i<m-23; i+=24)
+ {
+ j = 0;
+ for(; j<i; j+=8)
+ {
+ kernel_sgemm_nt_24x4_lib8(k, &alpha, &pA[i*sda], sda, &pB[0+j*sdb], &beta, &pC[(j+0)*bs+i*sdc], sdc, &pD[(j+0)*bs+i*sdd], sdd);
+ kernel_sgemm_nt_24x4_lib8(k, &alpha, &pA[i*sda], sda, &pB[4+j*sdb], &beta, &pC[(j+4)*bs+i*sdc], sdc, &pD[(j+4)*bs+i*sdd], sdd);
+ }
+
+ kernel_ssyrk_nt_l_24x4_lib8(k, &alpha, &pA[(j+0)*sda], sda, &pB[0+j*sdb], &beta, &pC[(j+0)*bs+(j+0)*sdc], sdc, &pD[(j+0)*bs+(j+0)*sdd], sdd);
+ kernel_ssyrk_nt_l_20x4_lib8(k, &alpha, &pA[(j+0)*sda], sda, &pB[4+j*sdb], &beta, &pC[(j+4)*bs+(j+0)*sdc], sdc, &pD[(j+4)*bs+(j+0)*sdd], sdd);
+ kernel_ssyrk_nt_l_16x4_lib8(k, &alpha, &pA[(j+8)*sda], sda, &pB[0+(j+8)*sdb], &beta, &pC[(j+8)*bs+(j+8)*sdc], sdc, &pD[(j+8)*bs+(j+8)*sdd], sdd);
+ kernel_ssyrk_nt_l_12x4_lib8(k, &alpha, &pA[(j+8)*sda], sda, &pB[4+(j+8)*sdb], &beta, &pC[(j+12)*bs+(j+8)*sdc], sdc, &pD[(j+12)*bs+(j+8)*sdd], sdd);
+ kernel_ssyrk_nt_l_8x8_lib8(k, &alpha, &pA[(j+16)*sda], &pB[0+(j+16)*sdb], &beta, &pC[(j+16)*bs+(j+16)*sdc], &pD[(j+16)*bs+(j+16)*sdd]);
+ }
+ if(m>i)
+ {
+ if(m-i<=4)
+ {
+ goto left_4;
+ }
+ else if(m-i<=8)
+ {
+ goto left_8;
+ }
+ else if(m-i<=12)
+ {
+ goto left_12;
+ }
+ else if(m-i<=16)
+ {
+ goto left_16;
+ }
+ else
+ {
+ goto left_24;
+ }
+ }
+#else
+ for(; i<m-15; i+=16)
+ {
+ j = 0;
+ for(; j<i; j+=8)
+ {
+ kernel_sgemm_nt_16x4_lib8(k, &alpha, &pA[i*sda], sda, &pB[0+j*sdb], &beta, &pC[(j+0)*bs+i*sdc], sdc, &pD[(j+0)*bs+i*sdd], sdd);
+ kernel_sgemm_nt_16x4_lib8(k, &alpha, &pA[i*sda], sda, &pB[4+j*sdb], &beta, &pC[(j+4)*bs+i*sdc], sdc, &pD[(j+4)*bs+i*sdd], sdd);
+ }
+ kernel_ssyrk_nt_l_16x4_lib8(k, &alpha, &pA[(j+0)*sda], sda, &pB[0+(j+0)*sdb], &beta, &pC[(j+0)*bs+(j+0)*sdc], sdc, &pD[(j+0)*bs+(j+0)*sdd], sdd);
+ kernel_ssyrk_nt_l_12x4_lib8(k, &alpha, &pA[(j+0)*sda], sda, &pB[4+(j+0)*sdb], &beta, &pC[(j+4)*bs+(j+0)*sdc], sdc, &pD[(j+4)*bs+(j+0)*sdd], sdd);
+ kernel_ssyrk_nt_l_8x8_lib8(k, &alpha, &pA[(j+8)*sda], &pB[0+(j+8)*sdb], &beta, &pC[(j+8)*bs+(j+8)*sdc], &pD[(j+8)*bs+(j+8)*sdd]);
+ }
+ if(m>i)
+ {
+ if(m-i<=4)
+ {
+ goto left_4;
+ }
+ else if(m-i<=8)
+ {
+ goto left_8;
+ }
+ else if(m-i<=12)
+ {
+ goto left_12;
+ }
+ else
+ {
+ goto left_16;
+ }
+ }
+#endif
+
+ // common return if i==m
+ return;
+
+ // clean up loops definitions
+
+#if defined(TARGET_X64_INTEL_HASWELL)
+ left_24: // 17 <= m <= 23
+ j = 0;
+ for(; j<i & j<m-7; j+=8)
+ {
+ kernel_sgemm_nt_24x4_vs_lib8(k, &alpha, &pA[i*sda], sda, &pB[0+j*sdb], &beta, &pC[(j+0)*bs+i*sdc], sdc, &pD[(j+0)*bs+i*sdd], sdd, m-i, m-(j+0));
+ kernel_sgemm_nt_24x4_vs_lib8(k, &alpha, &pA[i*sda], sda, &pB[4+j*sdb], &beta, &pC[(j+4)*bs+i*sdc], sdc, &pD[(j+4)*bs+i*sdd], sdd, m-i, m-(j+4));
+ }
+ kernel_ssyrk_nt_l_24x4_vs_lib8(k, &alpha, &pA[(j+0)*sda], sda, &pB[0+j*sdb], &beta, &pC[(j+0)*bs+(j+0)*sdc], sdc, &pD[(j+0)*bs+(j+0)*sdd], sdd, m-(i+0), m-(j+0));
+ kernel_ssyrk_nt_l_20x4_vs_lib8(k, &alpha, &pA[(j+0)*sda], sda, &pB[4+j*sdb], &beta, &pC[(j+4)*bs+(j+0)*sdc], sdc, &pD[(j+4)*bs+(j+0)*sdd], sdd, m-(i+0), m-(j+4));
+ kernel_ssyrk_nt_l_16x4_vs_lib8(k, &alpha, &pA[(j+8)*sda], sda, &pB[0+(j+8)*sdb], &beta, &pC[(j+8)*bs+(j+8)*sdc], sdc, &pD[(j+8)*bs+(j+8)*sdd], sdd, m-(i+8), m-(j+8));
+ kernel_ssyrk_nt_l_12x4_vs_lib8(k, &alpha, &pA[(j+8)*sda], sda, &pB[4+(j+8)*sdb], &beta, &pC[(j+12)*bs+(j+8)*sdc], sdc, &pD[(j+12)*bs+(j+8)*sdd], sdd, m-(i+8), m-(j+12));
+ if(j<m-20) // 21 - 23
+ {
+ kernel_ssyrk_nt_l_8x8_vs_lib8(k, &alpha, &pA[(j+16)*sda], &pB[0+(j+16)*sdb], &beta, &pC[(j+16)*bs+(j+16)*sdc], &pD[(j+16)*bs+(j+16)*sdd], m-(i+16), m-(j+16));
+ }
+ else // 17 18 19 20
+ {
+ kernel_ssyrk_nt_l_8x4_vs_lib8(k, &alpha, &pA[(j+16)*sda], &pB[0+(j+16)*sdb], &beta, &pC[(j+16)*bs+(j+16)*sdc], &pD[(j+16)*bs+(j+16)*sdd], m-(i+16), m-(j+16));
+ }
+ return;
+#endif
+
+ left_16: // 13 <= m <= 16
+ j = 0;
+ for(; j<i; j+=8)
+ {
+ kernel_sgemm_nt_16x4_vs_lib8(k, &alpha, &pA[i*sda], sda, &pB[0+j*sdb], &beta, &pC[(j+0)*bs+i*sdc], sdc, &pD[(j+0)*bs+i*sdd], sdd, m-i, m-(j+0));
+ kernel_sgemm_nt_16x4_vs_lib8(k, &alpha, &pA[i*sda], sda, &pB[4+j*sdb], &beta, &pC[(j+4)*bs+i*sdc], sdc, &pD[(j+4)*bs+i*sdd], sdd, m-i, m-(j+4));
+ }
+ kernel_ssyrk_nt_l_16x4_vs_lib8(k, &alpha, &pA[(j+0)*sda], sda, &pB[0+(j+0)*sdb], &beta, &pC[(j+0)*bs+(j+0)*sdc], sdc, &pD[(j+0)*bs+(j+0)*sdd], sdd, m-(i+0), m-(j+0));
+ kernel_ssyrk_nt_l_12x4_vs_lib8(k, &alpha, &pA[(j+0)*sda], sda, &pB[4+(j+0)*sdb], &beta, &pC[(j+4)*bs+(j+0)*sdc], sdc, &pD[(j+4)*bs+(j+0)*sdd], sdd, m-(i+0), m-(j+4));
+ if(j<m-12) // 13 - 16
+ {
+ kernel_ssyrk_nt_l_8x8_vs_lib8(k, &alpha, &pA[(j+8)*sda], &pB[0+(j+8)*sdb], &beta, &pC[(j+8)*bs+(j+8)*sdc], &pD[(j+8)*bs+(j+8)*sdd], m-(i+8), m-(j+8));
+ }
+ else // 9 - 12
+ {
+ kernel_ssyrk_nt_l_8x4_vs_lib8(k, &alpha, &pA[(j+8)*sda], &pB[0+(j+8)*sdb], &beta, &pC[(j+8)*bs+(j+8)*sdc], &pD[(j+8)*bs+(j+8)*sdd], m-(i+8), m-(j+8));
+ }
+ return;
+
+ left_12: // 9 <= m <= 12
+ j = 0;
+ for(; j<i; j+=8)
+ {
+ kernel_sgemm_nt_8x8_vs_lib8(k, &alpha, &pA[(i+0)*sda], &pB[0+(j+0)*sdb], &beta, &pC[(j+0)*bs+(i+0)*sdc], &pD[(j+0)*bs+(i+0)*sdd], m-(i+0), m-(j+0));
+ kernel_sgemm_nt_4x8_vs_lib8(k, &alpha, &pA[(i+8)*sda], &pB[0+(j+0)*sdb], &beta, &pC[(j+0)*bs+(i+8)*sdc], &pD[(j+0)*bs+(i+8)*sdd], m-(i+8), m-(j+0));
+ }
+ kernel_ssyrk_nt_l_8x8_vs_lib8(k, &alpha, &pA[(j+0)*sda], &pB[0+(j+0)*sdb], &beta, &pC[(j+0)*bs+(j+0)*sdc], &pD[(j+0)*bs+(j+0)*sdd], m-(i+0), m-(j+0));
+ kernel_sgemm_nt_4x8_vs_lib8(k, &alpha, &pA[(j+8)*sda], &pB[0+(j+0)*sdb], &beta, &pC[(j+0)*bs+(j+8)*sdc], &pD[(j+0)*bs+(j+8)*sdd], m-(i+8), m-(j+0));
+ if(j<m-8) // 9 - 12
+ {
+ kernel_ssyrk_nt_l_8x4_vs_lib8(k, &alpha, &pA[(j+8)*sda], &pB[0+(j+8)*sdb], &beta, &pC[(j+8)*bs+(j+8)*sdc], &pD[(j+8)*bs+(j+8)*sdd], m-(i+8), m-(j+8));
+ }
+ return;
+
+ left_8: // 5 <= m <= 8
+ j = 0;
+ for(; j<i; j+=8)
+ {
+ kernel_sgemm_nt_8x8_vs_lib8(k, &alpha, &pA[(i+0)*sda], &pB[0+(j+0)*sdb], &beta, &pC[(j+0)*bs+(i+0)*sdc], &pD[(j+0)*bs+(i+0)*sdd], m-(i+0), m-(j+0));
+ }
+ if(j<m-4) // 5 - 8
+ {
+ kernel_ssyrk_nt_l_8x8_vs_lib8(k, &alpha, &pA[(j+0)*sda], &pB[0+(j+0)*sdb], &beta, &pC[(j+0)*bs+(j+0)*sdc], &pD[(j+0)*bs+(j+0)*sdd], m-(i+0), m-(j+0));
+ }
+ else // 1 - 4
+ {
+ kernel_ssyrk_nt_l_8x4_vs_lib8(k, &alpha, &pA[(j+0)*sda], &pB[0+(j+0)*sdb], &beta, &pC[(j+0)*bs+(j+0)*sdc], &pD[(j+0)*bs+(j+0)*sdd], m-(i+0), m-(j+0));
+ }
+ return;
+
+ left_4: // 1 <= m <= 4
+ j = 0;
+ for(; j<i; j+=8)
+ {
+ kernel_sgemm_nt_4x8_vs_lib8(k, &alpha, &pA[(i+0)*sda], &pB[0+(j+0)*sdb], &beta, &pC[(j+0)*bs+(i+0)*sdc], &pD[(j+0)*bs+(i+0)*sdd], m-(i+0), m-(j+0));
+ }
+ kernel_ssyrk_nt_l_8x4_vs_lib8(k, &alpha, &pA[(j+0)*sda], &pB[0+(j+0)*sdb], &beta, &pC[(j+0)*bs+(j+0)*sdc], &pD[(j+0)*bs+(j+0)*sdd], m-(i+0), m-(j+0));
+ return;
+
+ }
+
+
+
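+// ssyrk_lower_nottransposed, m x n variant: as ssyrk_ln_libstr, but only the first n columns of the lower triangle are computed (sgemm kernels below the diagonal block, ssyrk kernels on it)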
+void ssyrk_ln_mn_libstr(int m, int n, int k, float alpha, struct s_strmat *sA, int ai, int aj, struct s_strmat *sB, int bi, int bj, float beta, struct s_strmat *sC, int ci, int cj, struct s_strmat *sD, int di, int dj)
+ {
+
+ if(m<=0)
+ return;
+
+ if(ci>0 | di>0)
+ {
+ printf("\nssyrk_ln_mn_libstr: feature not implemented yet: ci>0, di>0\n");
+ exit(1);
+ }
+
+ const int bs = 8;
+
+ int i, j;
+
+ int sda = sA->cn;
+ int sdb = sB->cn;
+ int sdc = sC->cn;
+ int sdd = sD->cn;
+ float *pA = sA->pA + aj*bs;
+ float *pB = sB->pA + bj*bs;
+ float *pC = sC->pA + cj*bs;
+ float *pD = sD->pA + dj*bs;
+
+ i = 0;
+#if defined(TARGET_X64_INTEL_HASWELL)
+ for(; i<m-23; i+=24)
+ {
+ j = 0;
+ for(; j<i & j<n-7; j+=8)
+ {
+ kernel_sgemm_nt_24x4_lib8(k, &alpha, &pA[i*sda], sda, &pB[0+j*sdb], &beta, &pC[(j+0)*bs+i*sdc], sdc, &pD[(j+0)*bs+i*sdd], sdd);
+ kernel_sgemm_nt_24x4_lib8(k, &alpha, &pA[i*sda], sda, &pB[4+j*sdb], &beta, &pC[(j+4)*bs+i*sdc], sdc, &pD[(j+4)*bs+i*sdd], sdd);
+ }
+ if(j<n)
+ {
+ if(i<j) // sgemm
+ {
+ kernel_sgemm_nt_24x4_vs_lib8(k, &alpha, &pA[i*sda], sda, &pB[0+j*sdb], &beta, &pC[(j+0)*bs+i*sdc], sdc, &pD[(j+0)*bs+i*sdd], sdd, m-i, n-(j+0));
+ if(j<n-4) // 5 6 7
+ {
+ kernel_sgemm_nt_24x4_vs_lib8(k, &alpha, &pA[i*sda], sda, &pB[4+j*sdb], &beta, &pC[(j+4)*bs+i*sdc], sdc, &pD[(j+4)*bs+i*sdd], sdd, m-i, n-(j+4));
+ }
+ }
+ else // ssyrk
+ {
+ if(j<n-23)
+ {
+ kernel_ssyrk_nt_l_24x4_lib8(k, &alpha, &pA[(i+0)*sda], sda, &pB[(j+0)*sdb], &beta, &pC[(j+0)*bs+(j+0)*sdc], sdc, &pD[(j+0)*bs+(j+0)*sdd], sdd);
+ kernel_ssyrk_nt_l_20x4_lib8(k, &alpha, &pA[(i+0)*sda], sda, &pB[4+(j+0)*sdb], &beta, &pC[(j+4)*bs+(j+0)*sdc], sdc, &pD[(j+4)*bs+(j+0)*sdd], sdd);
+ kernel_ssyrk_nt_l_16x4_lib8(k, &alpha, &pA[(i+8)*sda], sda, &pB[(j+8)*sdb], &beta, &pC[(j+8)*bs+(j+8)*sdc], sdc, &pD[(j+8)*bs+(j+8)*sdd], sdd);
+ kernel_ssyrk_nt_l_12x4_lib8(k, &alpha, &pA[(i+8)*sda], sda, &pB[4+(j+8)*sdb], &beta, &pC[(j+12)*bs+(j+8)*sdc], sdc, &pD[(j+12)*bs+(j+8)*sdd], sdd);
+ kernel_ssyrk_nt_l_8x8_lib8(k, &alpha, &pA[(i+16)*sda], &pB[(j+16)*sdb], &beta, &pC[(j+16)*bs+(j+16)*sdc], &pD[(j+16)*bs+(j+16)*sdd]);
+ }
+ else
+ {
+ if(j<n-4) // 5 - 23
+ {
+ kernel_ssyrk_nt_l_24x4_vs_lib8(k, &alpha, &pA[(i+0)*sda], sda, &pB[(j+0)*sdb], &beta, &pC[(j+0)*bs+(j+0)*sdc], sdc, &pD[(j+0)*bs+(j+0)*sdd], sdd, m-(i+0), n-(j+0));
+ kernel_ssyrk_nt_l_20x4_vs_lib8(k, &alpha, &pA[(i+0)*sda], sda, &pB[4+(j+0)*sdb], &beta, &pC[(j+4)*bs+(j+0)*sdc], sdc, &pD[(j+4)*bs+(j+0)*sdd], sdd, m-(i+0), n-(j+4));
+ if(j==n-8)
+ return;
+ if(j<n-12) // 13 - 23
+ {
+ kernel_ssyrk_nt_l_16x4_vs_lib8(k, &alpha, &pA[(i+8)*sda], sda, &pB[(j+8)*sdb], &beta, &pC[(j+8)*bs+(j+8)*sdc], sdc, &pD[(j+8)*bs+(j+8)*sdd], sdd, m-(i+8), n-(j+8));
+ kernel_ssyrk_nt_l_12x4_vs_lib8(k, &alpha, &pA[(i+8)*sda], sda, &pB[4+(j+8)*sdb], &beta, &pC[(j+12)*bs+(j+8)*sdc], sdc, &pD[(j+12)*bs+(j+8)*sdd], sdd, m-(i+8), n-(j+12));
+ if(j==n-16)
+ return;
+ if(j<n-20) // 21 - 23
+ {
+ kernel_ssyrk_nt_l_8x8_vs_lib8(k, &alpha, &pA[(i+16)*sda], &pB[(j+16)*sdb], &beta, &pC[(j+16)*bs+(j+16)*sdc], &pD[(j+16)*bs+(j+16)*sdd], m-(i+16), n-(j+16));
+ }
+ else // 17 18 19 20
+ {
+ kernel_ssyrk_nt_l_8x4_vs_lib8(k, &alpha, &pA[(i+16)*sda], &pB[(j+16)*sdb], &beta, &pC[(j+16)*bs+(j+16)*sdc], &pD[(j+16)*bs+(j+16)*sdd], m-(i+16), n-(j+16));
+ }
+ }
+ else // 9 10 11 12
+ {
+ kernel_ssyrk_nt_l_16x4_vs_lib8(k, &alpha, &pA[(i+8)*sda], sda, &pB[(j+8)*sdb], &beta, &pC[(j+8)*bs+(j+8)*sdc], sdc, &pD[(j+8)*bs+(j+8)*sdd], sdd, m-(i+8), n-(j+8));
+ }
+ }
+ else // 1 2 3 4
+ {
+ kernel_ssyrk_nt_l_24x4_vs_lib8(k, &alpha, &pA[(i+0)*sda], sda, &pB[j*sdb], &beta, &pC[j*bs+j*sdc], sdc, &pD[j*bs+j*sdd], sdd, m-(i+0), n-j);
+ }
+ }
+ }
+ }
+ }
+ if(m>i)
+ {
+ if(m-i<=8)
+ {
+ goto left_8;
+ }
+ else if(m-i<=16)
+ {
+ goto left_16;
+ }
+ else
+ {
+ goto left_24;
+ }
+ }
+#else
+ for(; i<m-15; i+=16)
+ {
+ j = 0;
+ for(; j<i & j<n-7; j+=8)
+ {
+ kernel_sgemm_nt_16x4_lib8(k, &alpha, &pA[i*sda], sda, &pB[0+j*sdb], &beta, &pC[(j+0)*bs+i*sdc], sdc, &pD[(j+0)*bs+i*sdd], sdd);
+ kernel_sgemm_nt_16x4_lib8(k, &alpha, &pA[i*sda], sda, &pB[4+j*sdb], &beta, &pC[(j+4)*bs+i*sdc], sdc, &pD[(j+4)*bs+i*sdd], sdd);
+ }
+ if(j<n)
+ {
+ if(i<j) // sgemm
+ {
+ kernel_sgemm_nt_16x4_vs_lib8(k, &alpha, &pA[i*sda], sda, &pB[0+j*sdb], &beta, &pC[(j+0)*bs+i*sdc], sdc, &pD[(j+0)*bs+i*sdd], sdd, m-i, n-(j+0));
+ if(j<n-4) // 5 6 7
+ {
+ kernel_sgemm_nt_16x4_vs_lib8(k, &alpha, &pA[i*sda], sda, &pB[4+j*sdb], &beta, &pC[(j+4)*bs+i*sdc], sdc, &pD[(j+4)*bs+i*sdd], sdd, m-i, n-(j+4));
+ }
+ }
+ else // ssyrk
+ {
+ if(j<n-15)
+ {
+ kernel_ssyrk_nt_l_16x4_lib8(k, &alpha, &pA[i*sda], sda, &pB[0+j*sdb], &beta, &pC[(j+0)*bs+(j+0)*sdc], sdc, &pD[(j+0)*bs+(j+0)*sdd], sdd);
+ kernel_ssyrk_nt_l_12x4_lib8(k, &alpha, &pA[i*sda], sda, &pB[4+j*sdb], &beta, &pC[(j+4)*bs+(j+0)*sdc], sdc, &pD[(j+4)*bs+(j+0)*sdd], sdd);
+ kernel_ssyrk_nt_l_8x8_lib8(k, &alpha, &pA[(i+8)*sda], &pB[(j+8)*sdb], &beta, &pC[(j+8)*bs+(j+8)*sdc], &pD[(j+8)*bs+(j+8)*sdd]);
+ }
+ else
+ {
+ if(j<n-4) // 5 - 15
+ {
+ kernel_ssyrk_nt_l_16x4_vs_lib8(k, &alpha, &pA[(i+0)*sda], sda, &pB[(j+0)*sdb], &beta, &pC[(j+0)*bs+(j+0)*sdc], sdc, &pD[(j+0)*bs+(j+0)*sdd], sdd, m-(i+0), n-(j+0));
+ kernel_ssyrk_nt_l_12x4_vs_lib8(k, &alpha, &pA[(i+0)*sda], sda, &pB[4+(j+0)*sdb], &beta, &pC[(j+4)*bs+(j+0)*sdc], sdc, &pD[(j+4)*bs+(j+0)*sdd], sdd, m-(i+0), n-(j+4));
+ if(j==n-8) // 8
+ return;
+ if(j<n-12) // 13 - 15
+ {
+ kernel_ssyrk_nt_l_8x8_vs_lib8(k, &alpha, &pA[(i+8)*sda], &pB[(j+8)*sdb], &beta, &pC[(j+8)*bs+(j+8)*sdc], &pD[(j+8)*bs+(j+8)*sdd], m-(i+8), n-(j+8));
+ }
+ else // 9 10 11 12
+ {
+ kernel_ssyrk_nt_l_8x4_vs_lib8(k, &alpha, &pA[(i+8)*sda], &pB[(j+8)*sdb], &beta, &pC[(j+8)*bs+(j+8)*sdc], &pD[(j+8)*bs+(j+8)*sdd], m-(i+8), n-(j+8));
+ }
+ }
+ else // 1 2 3 4
+ {
+ kernel_ssyrk_nt_l_16x4_vs_lib8(k, &alpha, &pA[(i+0)*sda], sda, &pB[j*sdb], &beta, &pC[j*bs+j*sdc], sdc, &pD[j*bs+j*sdd], sdd, m-(i+0), n-j);
+ }
+ }
+ }
+ }
+ }
+ if(m>i)
+ {
+ if(m-i<=8)
+ {
+ goto left_8;
+ }
+ else
+ {
+ goto left_16;
+ }
+ }
+#endif
+
+ // common return if i==m
+ return;
+
+ // clean up loops definitions
+
+#if defined(TARGET_X64_INTEL_HASWELL)
+ left_24:
+ j = 0;
+ for(; j<i & j<n-7; j+=8)
+ {
+ kernel_sgemm_nt_24x4_vs_lib8(k, &alpha, &pA[i*sda], sda, &pB[0+j*sdb], &beta, &pC[(j+0)*bs+i*sdc], sdc, &pD[(j+0)*bs+i*sdd], sdd, m-i, n-(j+0));
+ kernel_sgemm_nt_24x4_vs_lib8(k, &alpha, &pA[i*sda], sda, &pB[4+j*sdb], &beta, &pC[(j+4)*bs+i*sdc], sdc, &pD[(j+4)*bs+i*sdd], sdd, m-i, n-(j+4));
+ }
+ if(j<n)
+ {
+ if(j<i) // sgemm
+ {
+ kernel_sgemm_nt_24x4_vs_lib8(k, &alpha, &pA[i*sda], sda, &pB[0+j*sdb], &beta, &pC[(j+0)*bs+i*sdc], sdc, &pD[(j+0)*bs+i*sdd], sdd, m-i, n-(j+0));
+ if(j<n-4) // 5 6 7
+ {
+ kernel_sgemm_nt_24x4_vs_lib8(k, &alpha, &pA[i*sda], sda, &pB[4+j*sdb], &beta, &pC[(j+4)*bs+i*sdc], sdc, &pD[(j+4)*bs+i*sdd], sdd, m-i, n-(j+4));
+ }
+ }
+ else // ssyrk
+ {
+ if(j<n-4) // 5 - 23
+ {
+ kernel_ssyrk_nt_l_24x4_vs_lib8(k, &alpha, &pA[(i+0)*sda], sda, &pB[(j+0)*sdb], &beta, &pC[(j+0)*bs+(j+0)*sdc], sdc, &pD[(j+0)*bs+(j+0)*sdd], sdd, m-(i+0), n-(j+0));
+ kernel_ssyrk_nt_l_20x4_vs_lib8(k, &alpha, &pA[(i+0)*sda], sda, &pB[4+(j+0)*sdb], &beta, &pC[(j+4)*bs+(j+0)*sdc], sdc, &pD[(j+4)*bs+(j+0)*sdd], sdd, m-(i+0), n-(j+4));
+ if(j>=n-8)
+ return;
+ if(j<n-12) // 13 - 23
+ {
+ kernel_ssyrk_nt_l_16x4_vs_lib8(k, &alpha, &pA[(i+8)*sda], sda, &pB[(j+8)*sdb], &beta, &pC[(j+8)*bs+(j+8)*sdc], sdc, &pD[(j+8)*bs+(j+8)*sdd], sdd, m-(i+8), n-(j+8));
+ kernel_ssyrk_nt_l_12x4_vs_lib8(k, &alpha, &pA[(i+8)*sda], sda, &pB[4+(j+8)*sdb], &beta, &pC[(j+12)*bs+(j+8)*sdc], sdc, &pD[(j+12)*bs+(j+8)*sdd], sdd, m-(i+8), n-(j+12));
+ if(j>=n-16)
+ return;
+ if(j<n-20) // 21 - 23
+ {
+ kernel_ssyrk_nt_l_8x8_vs_lib8(k, &alpha, &pA[(i+16)*sda], &pB[(j+16)*sdb], &beta, &pC[(j+16)*bs+(j+16)*sdc], &pD[(j+16)*bs+(j+16)*sdd], m-(i+16), n-(j+16));
+ }
+ else // 17 18 19 20
+ {
+ kernel_ssyrk_nt_l_8x4_vs_lib8(k, &alpha, &pA[(i+16)*sda], &pB[(j+16)*sdb], &beta, &pC[(j+16)*bs+(j+16)*sdc], &pD[(j+16)*bs+(j+16)*sdd], m-(i+16), n-(j+16));
+ }
+ }
+ else // 9 10 11 12
+ {
+ kernel_ssyrk_nt_l_16x4_vs_lib8(k, &alpha, &pA[(i+8)*sda], sda, &pB[(j+8)*sdb], &beta, &pC[(j+8)*bs+(j+8)*sdc], sdc, &pD[(j+8)*bs+(j+8)*sdd], sdd, m-(i+8), n-(j+8));
+ }
+ }
+ else // 1 2 3 4
+ {
+ kernel_ssyrk_nt_l_24x4_vs_lib8(k, &alpha, &pA[(i+0)*sda], sda, &pB[j*sdb], &beta, &pC[j*bs+j*sdc], sdc, &pD[j*bs+j*sdd], sdd, m-(i+0), n-j);
+ }
+ }
+ }
+ return;
+#endif
+
+ left_16:
+ j = 0;
+ for(; j<i & j<n-7; j+=8)
+ {
+ kernel_sgemm_nt_16x4_vs_lib8(k, &alpha, &pA[i*sda], sda, &pB[0+j*sdb], &beta, &pC[(j+0)*bs+i*sdc], sdc, &pD[(j+0)*bs+i*sdd], sdd, m-i, n-(j+0));
+ kernel_sgemm_nt_16x4_vs_lib8(k, &alpha, &pA[i*sda], sda, &pB[4+j*sdb], &beta, &pC[(j+4)*bs+i*sdc], sdc, &pD[(j+4)*bs+i*sdd], sdd, m-i, n-(j+4));
+ }
+ if(j<n)
+ {
+ if(j<i) // sgemm
+ {
+ kernel_sgemm_nt_16x4_vs_lib8(k, &alpha, &pA[i*sda], sda, &pB[0+j*sdb], &beta, &pC[(j+0)*bs+i*sdc], sdc, &pD[(j+0)*bs+i*sdd], sdd, m-i, n-(j+0));
+ if(j<n-4) // 5 6 7
+ {
+ kernel_sgemm_nt_16x4_vs_lib8(k, &alpha, &pA[i*sda], sda, &pB[4+j*sdb], &beta, &pC[(j+4)*bs+i*sdc], sdc, &pD[(j+4)*bs+i*sdd], sdd, m-i, n-(j+4));
+ }
+ }
+ else // ssyrk
+ {
+ if(j<n-4) // 5 - 15
+ {
+ kernel_ssyrk_nt_l_16x4_vs_lib8(k, &alpha, &pA[(i+0)*sda], sda, &pB[0+j*sdb], &beta, &pC[(j+0)*bs+j*sdc], sdc, &pD[(j+0)*bs+j*sdd], sdd, m-(i+0), n-(j+0));
+ kernel_ssyrk_nt_l_12x4_vs_lib8(k, &alpha, &pA[(i+0)*sda], sda, &pB[4+j*sdb], &beta, &pC[(j+4)*bs+j*sdc], sdc, &pD[(j+4)*bs+j*sdd], sdd, m-(i+0), n-(j+4));
+ if(j>=n-8)
+ return;
+ if(j<n-12) // 13 - 15
+ {
+ kernel_ssyrk_nt_l_8x8_vs_lib8(k, &alpha, &pA[(i+8)*sda], &pB[(j+8)*sdb], &beta, &pC[(j+8)*bs+(j+8)*sdc], &pD[(j+8)*bs+(j+8)*sdd], m-(i+8), n-(j+8));
+ }
+ else // 9 - 12
+ {
+ kernel_ssyrk_nt_l_8x4_vs_lib8(k, &alpha, &pA[(i+8)*sda], &pB[(j+8)*sdb], &beta, &pC[(j+8)*bs+(j+8)*sdc], &pD[(j+8)*bs+(j+8)*sdd], m-(i+8), n-(j+8));
+ }
+ }
+ else // 1 2 3 4
+ {
+ kernel_ssyrk_nt_l_16x4_vs_lib8(k, &alpha, &pA[(i+0)*sda], sda, &pB[j*sdb], &beta, &pC[j*bs+j*sdc], sdc, &pD[j*bs+j*sdd], sdd, m-(i+0), n-j);
+ }
+ }
+ }
+ return;
+
+ left_8:
+ j = 0;
+ for(; j<i & j<n-7; j+=8)
+ {
+ kernel_sgemm_nt_8x8_vs_lib8(k, &alpha, &pA[i*sda], &pB[j*sdb], &beta, &pC[j*bs+i*sdc], &pD[j*bs+i*sdd], m-i, n-j);
+ }
+ if(j<n)
+ {
+ if(j<i) // sgemm
+ {
+ if(j<n-4) // 5 6 7
+ {
+ kernel_sgemm_nt_8x8_vs_lib8(k, &alpha, &pA[i*sda], &pB[j*sdb], &beta, &pC[j*bs+i*sdc], &pD[j*bs+i*sdd], m-i, n-j);
+ }
+ else // 1 2 3 4
+ {
+ kernel_sgemm_nt_8x4_vs_lib8(k, &alpha, &pA[i*sda], &pB[j*sdb], &beta, &pC[j*bs+i*sdc], &pD[j*bs+i*sdd], m-i, n-j);
+ }
+ }
+ else // ssyrk
+ {
+ if(j<n-4) // 5 6 7
+ {
+ kernel_ssyrk_nt_l_8x8_vs_lib8(k, &alpha, &pA[i*sda], &pB[j*sdb], &beta, &pC[j*bs+j*sdc], &pD[j*bs+j*sdd], m-i, n-j);
+ }
+ else // 1 2 3 4
+ {
+ kernel_ssyrk_nt_l_8x4_vs_lib8(k, &alpha, &pA[i*sda], &pB[j*sdb], &beta, &pC[j*bs+j*sdc], &pD[j*bs+j*sdd], m-i, n-j);
+ }
+ }
+ }
+ return;
+
+ }
+
+
+
+// strmm_right_lower_nottransposed_notunit (B, i.e. the first matrix argument, is the triangular one !!!)
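+// i.e. D = alpha * A * B, with B lower triangular and multiplying A from the right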
+void strmm_rlnn_libstr(int m, int n, float alpha, struct s_strmat *sB, int bi, int bj, struct s_strmat *sA, int ai, int aj, struct s_strmat *sD, int di, int dj)
+ {
+
+ const int bs = 8;
+
+ int sda = sA->cn;
+ int sdb = sB->cn;
+ int sdd = sD->cn;
+ float *pA = sA->pA + aj*bs;
+ float *pB = sB->pA + bj*bs;
+ float *pD = sD->pA + dj*bs;
+
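+ // align pA and pB to the top of their 8-row panels; the residual row offsets
+ // inside the panels (ai%bs for A, offsetB for B) are passed down to the _gen_
+ // kernels, while di0 measures how the output row alignment differs from A's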
+ pA += ai/bs*bs*sda;
+ pB += bi/bs*bs*sdb;
+ int offsetB = bi%bs;
+ int di0 = di-ai%bs;
+ int offsetD;
+ if(di0>=0)
+ {
+ pD += di0/bs*bs*sdd;
+ offsetD = di0%bs;
+ }
+ else
+ {
+ pD += -8*sdd;
+ offsetD = bs+di0;
+ }
+
+ int ii, jj;
+
+ int offsetB4;
+
+ if(offsetB<4)
+ {
+ offsetB4 = offsetB+4;
+ ii = 0;
+ if(ai%bs!=0)
+ {
+ jj = 0;
+ for(; jj<n-4; jj+=8)
+ {
+ kernel_strmm_nn_rl_8x4_gen_lib8(n-jj, &alpha, &pA[ii*sda+jj*bs], offsetB, &pB[jj*sdb+jj*bs], sdb, offsetD, &pD[ii*sdd+jj*bs], sdd, ai%bs, m-ii, 0, n-jj);
+ kernel_strmm_nn_rl_8x4_gen_lib8(n-jj-4, &alpha, &pA[ii*sda+(jj+4)*bs], offsetB4, &pB[jj*sdb+(jj+4)*bs], sdb, offsetD, &pD[ii*sdd+(jj+4)*bs], sdd, ai%bs, m-ii, 0, n-jj-4);
+ }
+ if(n-jj>0) // last 1-4 columns of this row panel, as in the aligned loops below
+ {
+ kernel_strmm_nn_rl_8x4_gen_lib8(n-jj, &alpha, &pA[ii*sda+jj*bs], offsetB, &pB[jj*sdb+jj*bs], sdb, offsetD, &pD[ii*sdd+jj*bs], sdd, ai%bs, m-ii, 0, n-jj);
+ }
+ m -= bs-ai%bs;
+ pA += bs*sda;
+ pD += bs*sdd;
+ }
+ if(offsetD==0)
+ {
+#if defined(TARGET_X64_INTEL_HASWELL)
+ // XXX create left_24 once the _gen_ kernel exists !!!
+ for(; ii<m-23; ii+=24)
+ {
+ jj = 0;
+ for(; jj<n-7; jj+=8)
+ {
+ kernel_strmm_nn_rl_24x4_lib8(n-jj, &alpha, &pA[ii*sda+jj*bs], sda, offsetB, &pB[jj*sdb+jj*bs], sdb, &pD[ii*sdd+jj*bs], sdd);
+ kernel_strmm_nn_rl_24x4_lib8(n-jj-4, &alpha, &pA[ii*sda+(jj+4)*bs], sda, offsetB4, &pB[jj*sdb+(jj+4)*bs], sdb, &pD[ii*sdd+(jj+4)*bs], sdd);
+ }
+ if(n-jj>0)
+ {
+ kernel_strmm_nn_rl_24x4_vs_lib8(n-jj, &alpha, &pA[ii*sda+jj*bs], sda, offsetB, &pB[jj*sdb+jj*bs], sdb, &pD[ii*sdd+jj*bs], sdd, 24, n-jj);
+ if(n-jj>4)
+ {
+ kernel_strmm_nn_rl_24x4_vs_lib8(n-jj-4, &alpha, &pA[ii*sda+(jj+4)*bs], sda, offsetB4, &pB[jj*sdb+(jj+4)*bs], sdb, &pD[ii*sdd+(jj+4)*bs], sdd, 24, n-jj-4);
+ }
+ }
+ }
+#endif
+ for(; ii<m-15; ii+=16)
+ {
+ jj = 0;
+ for(; jj<n-7; jj+=8)
+ {
+ kernel_strmm_nn_rl_16x4_lib8(n-jj, &alpha, &pA[ii*sda+jj*bs], sda, offsetB, &pB[jj*sdb+jj*bs], sdb, &pD[ii*sdd+jj*bs], sdd);
+ kernel_strmm_nn_rl_16x4_lib8(n-jj-4, &alpha, &pA[ii*sda+(jj+4)*bs], sda, offsetB4, &pB[jj*sdb+(jj+4)*bs], sdb, &pD[ii*sdd+(jj+4)*bs], sdd);
+ }
+ if(n-jj>0)
+ {
+ kernel_strmm_nn_rl_16x4_vs_lib8(n-jj, &alpha, &pA[ii*sda+jj*bs], sda, offsetB, &pB[jj*sdb+jj*bs], sdb, &pD[ii*sdd+jj*bs], sdd, 16, n-jj);
+ if(n-jj>4)
+ {
+ kernel_strmm_nn_rl_16x4_vs_lib8(n-jj-4, &alpha, &pA[ii*sda+(jj+4)*bs], sda, offsetB4, &pB[jj*sdb+(jj+4)*bs], sdb, &pD[ii*sdd+(jj+4)*bs], sdd, 16, n-jj-4);
+ }
+ }
+ }
+ if(m-ii>0)
+ {
+ if(m-ii<=8)
+ goto left_8;
+ else
+ goto left_16;
+ }
+ }
+ else
+ {
+ for(; ii<m-8; ii+=16)
+ {
+ jj = 0;
+ for(; jj<n-4; jj+=8)
+ {
+ kernel_strmm_nn_rl_16x4_gen_lib8(n-jj, &alpha, &pA[ii*sda+jj*bs], sda, offsetB, &pB[jj*sdb+jj*bs], sdb, offsetD, &pD[ii*sdd+jj*bs], sdd, 0, m-ii, 0, n-jj);
+ kernel_strmm_nn_rl_16x4_gen_lib8(n-jj-4, &alpha, &pA[ii*sda+(jj+4)*bs], sda, offsetB4, &pB[jj*sdb+(jj+4)*bs], sdb, offsetD, &pD[ii*sdd+(jj+4)*bs], sdd, 0, m-ii, 0, n-jj-4);
+ }
+ if(n-jj>0)
+ {
+ kernel_strmm_nn_rl_16x4_gen_lib8(n-jj, &alpha, &pA[ii*sda+jj*bs], sda, offsetB, &pB[jj*sdb+jj*bs], sdb, offsetD, &pD[ii*sdd+jj*bs], sdd, 0, m-ii, 0, n-jj);
+ }
+ }
+ if(m-ii>0)
+ goto left_8;
+ }
+ }
+ else
+ {
+ offsetB4 = offsetB-4;
+ ii = 0;
+ if(ai%bs!=0)
+ {
+ jj = 0;
+ for(; jj<n-4; jj+=8)
+ {
+ kernel_strmm_nn_rl_8x4_gen_lib8(n-jj, &alpha, &pA[ii*sda+jj*bs], offsetB, &pB[jj*sdb+jj*bs], sdb, offsetD, &pD[ii*sdd+jj*bs], sdd, ai%bs, m-ii, 0, n-jj);
+ kernel_strmm_nn_rl_8x4_gen_lib8(n-jj-4, &alpha, &pA[ii*sda+(jj+4)*bs], offsetB4, &pB[(jj+8)*sdb+(jj+4)*bs], sdb, offsetD, &pD[ii*sdd+(jj+4)*bs], sdd, ai%bs, m-ii, 0, n-jj-4);
+ }
+ if(n-jj>0) // last 1-4 columns of this row panel, as in the aligned loops below
+ {
+ kernel_strmm_nn_rl_8x4_gen_lib8(n-jj, &alpha, &pA[ii*sda+jj*bs], offsetB, &pB[jj*sdb+jj*bs], sdb, offsetD, &pD[ii*sdd+jj*bs], sdd, ai%bs, m-ii, 0, n-jj);
+ }
+ m -= bs-ai%bs;
+ pA += bs*sda;
+ pD += bs*sdd;
+ }
+ if(offsetD==0)
+ {
+ for(; ii<m-15; ii+=16)
+ {
+ jj = 0;
+ for(; jj<n-7; jj+=8)
+ {
+ kernel_strmm_nn_rl_16x4_lib8(n-jj, &alpha, &pA[ii*sda+jj*bs], sda, offsetB, &pB[jj*sdb+jj*bs], sdb, &pD[ii*sdd+jj*bs], sdd);
+ kernel_strmm_nn_rl_16x4_lib8(n-jj-4, &alpha, &pA[ii*sda+(jj+4)*bs], sda, offsetB4, &pB[(jj+8)*sdb+(jj+4)*bs], sdb, &pD[ii*sdd+(jj+4)*bs], sdd);
+ }
+ if(n-jj>0)
+ {
+ kernel_strmm_nn_rl_16x4_vs_lib8(n-jj, &alpha, &pA[ii*sda+jj*bs], sda, offsetB, &pB[jj*sdb+jj*bs], sdb, &pD[ii*sdd+jj*bs], sdd, 16, n-jj);
+ if(n-jj>4)
+ {
+ kernel_strmm_nn_rl_16x4_vs_lib8(n-jj-4, &alpha, &pA[ii*sda+(jj+4)*bs], sda, offsetB4, &pB[(jj+8)*sdb+(jj+4)*bs], sdb, &pD[ii*sdd+(jj+4)*bs], sdd, 16, n-jj-4);
+ }
+ }
+ }
+ if(m-ii>0)
+ {
+ if(m-ii<=8)
+ goto left_8;
+ else
+ goto left_16;
+ }
+ }
+ else
+ {
+ for(; ii<m-8; ii+=16)
+ {
+ jj = 0;
+ for(; jj<n-4; jj+=8)
+ {
+ kernel_strmm_nn_rl_16x4_gen_lib8(n-jj, &alpha, &pA[ii*sda+jj*bs], sda, offsetB, &pB[jj*sdb+jj*bs], sdb, offsetD, &pD[ii*sdd+jj*bs], sdd, 0, m-ii, 0, n-jj);
+ kernel_strmm_nn_rl_16x4_gen_lib8(n-jj-4, &alpha, &pA[ii*sda+(jj+4)*bs], sda, offsetB4, &pB[(jj+8)*sdb+(jj+4)*bs], sdb, offsetD, &pD[ii*sdd+(jj+4)*bs], sdd, 0, m-ii, 0, n-jj-4);
+ }
+ if(n-jj>0)
+ {
+ kernel_strmm_nn_rl_16x4_gen_lib8(n-jj, &alpha, &pA[ii*sda+jj*bs], sda, offsetB, &pB[jj*sdb+jj*bs], sdb, offsetD, &pD[ii*sdd+jj*bs], sdd, 0, m-ii, 0, n-jj);
+ }
+ }
+ if(m-ii>0)
+ goto left_8;
+ }
+ }
+
+ // common return if ii==m
+ return;
+
+ // clean up loops definitions
+
+ left_16:
+ if(offsetB<4)
+ {
+ jj = 0;
+ for(; jj<n-4; jj+=8)
+ {
+ kernel_strmm_nn_rl_16x4_gen_lib8(n-jj, &alpha, &pA[ii*sda+jj*bs], sda, offsetB, &pB[jj*sdb+jj*bs], sdb, offsetD, &pD[ii*sdd+jj*bs], sdd, 0, m-ii, 0, n-jj);
+ kernel_strmm_nn_rl_16x4_gen_lib8(n-jj-4, &alpha, &pA[ii*sda+(jj+4)*bs], sda, offsetB4, &pB[jj*sdb+(jj+4)*bs], sdb, offsetD, &pD[ii*sdd+(jj+4)*bs], sdd, 0, m-ii, 0, n-jj-4);
+ }
+ if(n-jj>0)
+ {
+ kernel_strmm_nn_rl_16x4_gen_lib8(n-jj, &alpha, &pA[ii*sda+jj*bs], sda, offsetB, &pB[jj*sdb+jj*bs], sdb, offsetD, &pD[ii*sdd+jj*bs], sdd, 0, m-ii, 0, n-jj);
+ }
+ }
+ else
+ {
+ jj = 0;
+ for(; jj<n-4; jj+=8)
+ {
+ kernel_strmm_nn_rl_16x4_gen_lib8(n-jj, &alpha, &pA[ii*sda+jj*bs], sda, offsetB, &pB[jj*sdb+jj*bs], sdb, offsetD, &pD[ii*sdd+jj*bs], sdd, 0, m-ii, 0, n-jj);
+ kernel_strmm_nn_rl_16x4_gen_lib8(n-jj-4, &alpha, &pA[ii*sda+(jj+4)*bs], sda, offsetB4, &pB[(jj+8)*sdb+(jj+4)*bs], sdb, offsetD, &pD[ii*sdd+(jj+4)*bs], sdd, 0, m-ii, 0, n-jj-4);
+ }
+ if(n-jj>0)
+ {
+ kernel_strmm_nn_rl_16x4_gen_lib8(n-jj, &alpha, &pA[ii*sda+jj*bs], sda, offsetB, &pB[jj*sdb+jj*bs], sdb, offsetD, &pD[ii*sdd+jj*bs], sdd, 0, m-ii, 0, n-jj);
+ }
+ }
+ return;
+
+ left_8:
+ if(offsetB<4)
+ {
+ jj = 0;
+ for(; jj<n-4; jj+=8)
+ {
+ kernel_strmm_nn_rl_8x4_gen_lib8(n-jj, &alpha, &pA[ii*sda+jj*bs], offsetB, &pB[jj*sdb+jj*bs], sdb, offsetD, &pD[ii*sdd+jj*bs], sdd, 0, m-ii, 0, n-jj);
+ kernel_strmm_nn_rl_8x4_gen_lib8(n-jj-4, &alpha, &pA[ii*sda+(jj+4)*bs], offsetB4, &pB[jj*sdb+(jj+4)*bs], sdb, offsetD, &pD[ii*sdd+(jj+4)*bs], sdd, 0, m-ii, 0, n-jj-4);
+ }
+ if(n-jj>0)
+ {
+ kernel_strmm_nn_rl_8x4_gen_lib8(n-jj, &alpha, &pA[ii*sda+jj*bs], offsetB, &pB[jj*sdb+jj*bs], sdb, offsetD, &pD[ii*sdd+jj*bs], sdd, 0, m-ii, 0, n-jj);
+ }
+ }
+ else
+ {
+ jj = 0;
+ for(; jj<n-4; jj+=8)
+ {
+ kernel_strmm_nn_rl_8x4_gen_lib8(n-jj, &alpha, &pA[ii*sda+jj*bs], offsetB, &pB[jj*sdb+jj*bs], sdb, offsetD, &pD[ii*sdd+jj*bs], sdd, 0, m-ii, 0, n-jj);
+ kernel_strmm_nn_rl_8x4_gen_lib8(n-jj-4, &alpha, &pA[ii*sda+(jj+4)*bs], offsetB4, &pB[(jj+8)*sdb+(jj+4)*bs], sdb, offsetD, &pD[ii*sdd+(jj+4)*bs], sdd, 0, m-ii, 0, n-jj-4);
+ }
+ if(n-jj>0)
+ {
+ kernel_strmm_nn_rl_8x4_gen_lib8(n-jj, &alpha, &pA[ii*sda+jj*bs], offsetB, &pB[jj*sdb+jj*bs], sdb, offsetD, &pD[ii*sdd+jj*bs], sdd, 0, m-ii, 0, n-jj);
+ }
+ }
+ return;
+
+ }
+
+
+
+// strsm_right_lower_transposed_notunit
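+// i.e. solve D * A^T = alpha * B for D, with A lower triangular and a non-unit diagonal (only alpha==1.0 is supported below)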
+void strsm_rltn_libstr(int m, int n, float alpha, struct s_strmat *sA, int ai, int aj, struct s_strmat *sB, int bi, int bj, struct s_strmat *sD, int di, int dj)
+ {
+
+ if(ai!=0 | bi!=0 | di!=0 | alpha!=1.0)
+ {
+ printf("\nstrsm_rltn_libstr: feature not implemented yet: ai=%d, bi=%d, di=%d, alpha=%f\n", ai, bi, di, alpha);
+ exit(1);
+ }
+
+ const int bs = 8;
+
+ // TODO alpha
+
+ int sda = sA->cn;
+ int sdb = sB->cn;
+ int sdd = sD->cn;
+ float *pA = sA->pA + aj*bs;
+ float *pB = sB->pA + bj*bs;
+ float *pD = sD->pA + dj*bs;
+ float *dA = sA->dA;
+
+ int i, j;
+
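+ // cache the reciprocals of the diagonal of A in sA->dA: the kernels multiply
+ // by 1/diag(A) instead of dividing, and use_dA marks the cache as valid so
+ // repeated solves against the same factor skip the extraction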
+ if(ai==0 & aj==0)
+ {
+ if(sA->use_dA!=1)
+ {
+ sdiaex_lib(n, 1.0, ai, pA, sda, dA);
+ for(i=0; i<n; i++)
+ dA[i] = 1.0 / dA[i];
+ sA->use_dA = 1;
+ }
+ }
+ else
+ {
+ sdiaex_lib(n, 1.0, ai, pA, sda, dA);
+ for(i=0; i<n; i++)
+ dA[i] = 1.0 / dA[i];
+ sA->use_dA = 0;
+ }
+
+ if(m<=0 || n<=0)
+ return;
+
+ i = 0;
+
+ for(; i<m-7; i+=8)
+ {
+ j = 0;
+ for(; j<n-7; j+=8)
+ {
+ kernel_strsm_nt_rl_inv_8x4_lib8(j+0, &pD[i*sdd], &pA[0+j*sda], &pB[(j+0)*bs+i*sdb], &pD[(j+0)*bs+i*sdd], &pA[0+(j+0)*bs+j*sda], &dA[j+0]);
+ kernel_strsm_nt_rl_inv_8x4_lib8(j+4, &pD[i*sdd], &pA[4+j*sda], &pB[(j+4)*bs+i*sdb], &pD[(j+4)*bs+i*sdd], &pA[4+(j+4)*bs+j*sda], &dA[j+4]);
+ }
+ if(n-j>0)
+ {
+ kernel_strsm_nt_rl_inv_8x4_vs_lib8(j+0, &pD[i*sdd], &pA[0+j*sda], &pB[(j+0)*bs+i*sdb], &pD[(j+0)*bs+i*sdd], &pA[0+(j+0)*bs+j*sda], &dA[j+0], m-i, n-j-0);
+ if(n-j>4)
+ {
+ kernel_strsm_nt_rl_inv_8x4_vs_lib8(j+4, &pD[i*sdd], &pA[4+j*sda], &pB[(j+4)*bs+i*sdb], &pD[(j+4)*bs+i*sdd], &pA[4+(j+4)*bs+j*sda], &dA[j+4], m-i, n-j-4);
+ }
+ }
+ }
+ if(m>i)
+ {
+ goto left_8;
+ }
+
+ // common return if i==m
+ return;
+
+ left_8:
+ j = 0;
+ for(; j<n-4; j+=8)
+ {
+ kernel_strsm_nt_rl_inv_8x4_vs_lib8(j+0, &pD[i*sdd], &pA[0+j*sda], &pB[(j+0)*bs+i*sdb], &pD[(j+0)*bs+i*sdd], &pA[0+(j+0)*bs+j*sda], &dA[j+0], m-i, n-j-0);
+ kernel_strsm_nt_rl_inv_8x4_vs_lib8(j+4, &pD[i*sdd], &pA[4+j*sda], &pB[(j+4)*bs+i*sdb], &pD[(j+4)*bs+i*sdd], &pA[4+(j+4)*bs+j*sda], &dA[j+4], m-i, n-j-4);
+ }
+ if(n-j>0)
+ {
+ kernel_strsm_nt_rl_inv_8x4_vs_lib8(j+0, &pD[i*sdd], &pA[0+j*sda], &pB[(j+0)*bs+i*sdb], &pD[(j+0)*bs+i*sdd], &pA[0+(j+0)*bs+j*sda], &dA[j+0], m-i, n-j-0);
+ }
+ return;
+
+ }
+
+
+
+
diff --git a/blas/s_blas_64.h b/blas/s_blas_64.h
new file mode 100644
index 0000000..1589867
--- /dev/null
+++ b/blas/s_blas_64.h
@@ -0,0 +1,65 @@
+/**************************************************************************************************
+* *
+* This file is part of BLASFEO. *
+* *
+* BLASFEO -- BLAS For Embedded Optimization. *
+* Copyright (C) 2016-2017 by Gianluca Frison. *
+* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl. *
+* All rights reserved. *
+* *
+* HPMPC is free software; you can redistribute it and/or *
+* modify it under the terms of the GNU Lesser General Public *
+* License as published by the Free Software Foundation; either *
+* version 2.1 of the License, or (at your option) any later version. *
+* *
+* HPMPC is distributed in the hope that it will be useful, *
+* but WITHOUT ANY WARRANTY; without even the implied warranty of *
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. *
+* See the GNU Lesser General Public License for more details. *
+* *
+* You should have received a copy of the GNU Lesser General Public *
+* License along with HPMPC; if not, write to the Free Software *
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA *
+* *
+* Author: Gianluca Frison, giaf (at) dtu.dk *
+* gianluca.frison (at) imtek.uni-freiburg.de *
+* *
+**************************************************************************************************/
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+
+
+// headers to reference BLAS and LAPACK routines employed in BLASFEO WR
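+// note: the integer arguments are long long to match an ILP64 (64-bit integer)
+// BLAS/LAPACK build, as used by the REF_BLAS_BLIS configuration that selects
+// this header (see s_lapack_lib.c)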
+
+// level 1
+void scopy_(long long *m, float *x, long long *incx, float *y, long long *incy);
+void saxpy_(long long *m, float *alpha, float *x, long long *incx, float *y, long long *incy);
+void sscal_(long long *m, float *alpha, float *x, long long *incx);
+
+// level 2
+void sgemv_(char *ta, long long *m, long long *n, float *alpha, float *A, long long *lda, float *x, long long *incx, float *beta, float *y, long long *incy);
+void ssymv_(char *uplo, long long *m, float *alpha, float *A, long long *lda, float *x, long long *incx, float *beta, float *y, long long *incy);
+void strmv_(char *uplo, char *trans, char *diag, long long *n, float *A, long long *lda, float *x, long long *incx);
+void strsv_(char *uplo, char *trans, char *diag, long long *n, float *A, long long *lda, float *x, long long *incx);
+void sger_(long long *m, long long *n, float *alpha, float *x, long long *incx, float *y, long long *incy, float *A, long long *lda);
+
+// level 3
+void sgemm_(char *ta, char *tb, long long *m, long long *n, long long *k, float *alpha, float *A, long long *lda, float *B, long long *ldb, float *beta, float *C, long long *ldc);
+void ssyrk_(char *uplo, char *trans, long long *n, long long *k, float *alpha, float *A, long long *lda, float *beta, float *C, long long *ldc);
+void strmm_(char *side, char *uplo, char *transa, char *diag, long long *m, long long *n, float *alpha, float *A, long long *lda, float *B, long long *ldb);
+void strsm_(char *side, char *uplo, char *transa, char *diag, long long *m, long long *n, float *alpha, float *A, long long *lda, float *B, long long *ldb);
+
+// lapack
+long long spotrf_(char *uplo, long long *m, float *A, long long *lda, long long *info);
+long long sgetrf_(long long *m, long long *n, float *A, long long *lda, long long *ipiv, long long *info);
+void sgeqrf_(long long *m, long long *n, float *A, long long *lda, float *tau, float *work, long long *lwork, long long *info);
+void sgeqr2_(long long *m, long long *n, float *A, long long *lda, float *tau, float *work, long long *info);
+
+
+
+#ifdef __cplusplus
+}
+#endif
diff --git a/blas/s_lapack_lib.c b/blas/s_lapack_lib.c
new file mode 100644
index 0000000..c7cb56b
--- /dev/null
+++ b/blas/s_lapack_lib.c
@@ -0,0 +1,76 @@
+/**************************************************************************************************
+* *
+* This file is part of BLASFEO. *
+* *
+* BLASFEO -- BLAS For Embedded Optimization. *
+* Copyright (C) 2016-2017 by Gianluca Frison. *
+* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl. *
+* All rights reserved. *
+* *
+* HPMPC is free software; you can redistribute it and/or *
+* modify it under the terms of the GNU Lesser General Public *
+* License as published by the Free Software Foundation; either *
+* version 2.1 of the License, or (at your option) any later version. *
+* *
+* HPMPC is distributed in the hope that it will be useful, *
+* but WITHOUT ANY WARRANTY; without even the implied warranty of *
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. *
+* See the GNU Lesser General Public License for more details. *
+* *
+* You should have received a copy of the GNU Lesser General Public *
+* License along with HPMPC; if not, write to the Free Software *
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA *
+* *
+* Author: Gianluca Frison, giaf (at) dtu.dk *
+* gianluca.frison (at) imtek.uni-freiburg.de *
+* *
+**************************************************************************************************/
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <math.h>
+
+#if defined(LA_BLAS)
+#if defined(REF_BLAS_BLIS)
+#include "s_blas_64.h"
+#else
+#include "s_blas.h"
+#endif
+#endif
+
+#include "../include/blasfeo_common.h"
+#include "../include/blasfeo_s_aux.h"
+
+
+
+#define REAL float
+
+#define STRMAT s_strmat
+#define STRVEC s_strvec
+
+#define GELQF_LIBSTR sgelqf_libstr
+#define GELQF_WORK_SIZE_LIBSTR sgelqf_work_size_libstr
+#define GEQRF_LIBSTR sgeqrf_libstr
+#define GEQRF_WORK_SIZE_LIBSTR sgeqrf_work_size_libstr
+#define GETF2_NOPIVOT sgetf2_nopivot
+#define GETRF_NOPIVOT_LIBSTR sgetrf_nopivot_libstr
+#define GETRF_LIBSTR sgetrf_libstr
+#define POTRF_L_LIBSTR spotrf_l_libstr
+#define POTRF_L_MN_LIBSTR spotrf_l_mn_libstr
+#define SYRK_POTRF_LN_LIBSTR ssyrk_spotrf_ln_libstr
+
+#define COPY scopy_
+#define GELQF sgelqf_
+#define GEMM sgemm_
+#define GER sger_
+#define GEQRF sgeqrf_
+#define GEQR2 sgeqr2_
+#define GETRF sgetrf_
+#define POTRF spotrf_
+#define SCAL sscal_
+#define SYRK ssyrk_
+#define TRSM strsm_
+
+
+#include "x_lapack_lib.c"
+
diff --git a/blas/s_lapack_lib4.c b/blas/s_lapack_lib4.c
new file mode 100644
index 0000000..7d02d36
--- /dev/null
+++ b/blas/s_lapack_lib4.c
@@ -0,0 +1,664 @@
+/**************************************************************************************************
+* *
+* This file is part of BLASFEO. *
+* *
+* BLASFEO -- BLAS For Embedded Optimization. *
+* Copyright (C) 2016-2017 by Gianluca Frison. *
+* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl. *
+* All rights reserved. *
+* *
+* HPMPC is free software; you can redistribute it and/or *
+* modify it under the terms of the GNU Lesser General Public *
+* License as published by the Free Software Foundation; either *
+* version 2.1 of the License, or (at your option) any later version. *
+* *
+* HPMPC is distributed in the hope that it will be useful, *
+* but WITHOUT ANY WARRANTY; without even the implied warranty of *
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. *
+* See the GNU Lesser General Public License for more details. *
+* *
+* You should have received a copy of the GNU Lesser General Public *
+* License along with HPMPC; if not, write to the Free Software *
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA *
+* *
+* Author: Gianluca Frison, giaf (at) dtu.dk *
+* gianluca.frison (at) imtek.uni-freiburg.de *
+* *
+**************************************************************************************************/
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <math.h>
+
+#include "../include/blasfeo_common.h"
+#include "../include/blasfeo_s_aux.h"
+#include "../include/blasfeo_s_kernel.h"
+
+
+
+/****************************
+* old interface
+****************************/
+
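+// fused ssyrk + spotrf: computes the lower Cholesky factor of C + A * B^T in a
+// single sweep (the rank-k update and the factorization are fused at the kernel
+// level); the reciprocals of the diagonal of the factor are returned in inv_diag_D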
+void ssyrk_spotrf_nt_l_lib(int m, int n, int k, float *pA, int sda, float *pB, int sdb, float *pC, int sdc, float *pD, int sdd, float *inv_diag_D)
+ {
+
+ if(m<=0 || n<=0)
+ return;
+
+ int alg = 1; // XXX
+
+ const int bs = 4;
+
+ int i, j, l;
+
+ i = 0;
+
+ for(; i<m-3; i+=4)
+ {
+ j = 0;
+ for(; j<i && j<n-3; j+=4)
+ {
+ kernel_sgemm_strsm_nt_rl_inv_4x4_lib4(k, &pA[i*sda], &pB[j*sdb], j, &pD[i*sdd], &pD[j*sdd], &pC[j*bs+i*sdc], &pD[j*bs+i*sdd], &pD[j*bs+j*sdd], &inv_diag_D[j]);
+ }
+ if(j<n)
+ {
+ if(i<j) // sgemm + strsm
+ {
+ kernel_sgemm_strsm_nt_rl_inv_4x4_vs_lib4(k, &pA[i*sda], &pB[j*sdb], j, &pD[i*sdd], &pD[j*sdd], &pC[j*bs+i*sdc], &pD[j*bs+i*sdd], &pD[j*bs+j*sdd], &inv_diag_D[j], m-i, n-j);
+ }
+ else // ssyrk + spotrf
+ {
+ if(j<n-3)
+ {
+ kernel_ssyrk_spotrf_nt_l_4x4_lib4(k, &pA[i*sda], &pB[j*sdb], j, &pD[i*sdd], &pD[j*sdd], &pC[j*bs+j*sdc], &pD[j*bs+j*sdd], &inv_diag_D[j]);
+ }
+ else
+ {
+ kernel_ssyrk_spotrf_nt_l_4x4_vs_lib4(k, &pA[i*sda], &pB[j*sdb], j, &pD[i*sdd], &pD[j*sdd], &pC[j*bs+j*sdc], &pD[j*bs+j*sdd], &inv_diag_D[j], m-i, n-j);
+ }
+ }
+ }
+ }
+ if(m>i)
+ {
+ goto left_4;
+ }
+
+ // common return if i==m
+ return;
+
+ // clean up loops definitions
+
+ left_4:
+ j = 0;
+ for(; j<i && j<n-3; j+=4)
+ {
+ kernel_sgemm_strsm_nt_rl_inv_4x4_vs_lib4(k, &pA[i*sda], &pB[j*sdb], j, &pD[i*sdd], &pD[j*sdd], &pC[j*bs+i*sdc], &pD[j*bs+i*sdd], &pD[j*bs+j*sdd], &inv_diag_D[j], m-i, n-j);
+ }
+ if(j<n)
+ {
+ if(j<i) // sgemm + strsm
+ {
+ kernel_sgemm_strsm_nt_rl_inv_4x4_vs_lib4(k, &pA[i*sda], &pB[j*sdb], j, &pD[i*sdd], &pD[j*sdd], &pC[j*bs+i*sdc], &pD[j*bs+i*sdd], &pD[j*bs+j*sdd], &inv_diag_D[j], m-i, n-j);
+ }
+ else // ssyrk + spotrf
+ {
+ kernel_ssyrk_spotrf_nt_l_4x4_vs_lib4(k, &pA[i*sda], &pB[j*sdb], j, &pD[i*sdd], &pD[j*sdd], &pC[j*bs+j*sdc], &pD[j*bs+j*sdd], &inv_diag_D[j], m-i, n-j);
+ }
+ }
+ return;
+
+ }
+
+
+
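+// LU factorization without pivoting: for each 4-column block, solve the lower
+// part against the already factorized panels, factorize the diagonal block,
+// then solve the upper part against the unit-lower-triangular diagonal block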
+void sgetrf_nn_nopivot_lib(int m, int n, float *pC, int sdc, float *pD, int sdd, float *inv_diag_D)
+ {
+
+ if(m<=0 || n<=0)
+ return;
+
+ const int bs = 4;
+
+ int ii, jj, ie;
+
+ // main loop
+ ii = 0;
+ for( ; ii<m-3; ii+=4)
+ {
+ jj = 0;
+ // solve lower
+ ie = n<ii ? n : ii; // ie is multiple of 4
+ for( ; jj<ie-3; jj+=4)
+ {
+ kernel_strsm_nn_ru_inv_4x4_lib4(jj, &pD[ii*sdd], &pD[jj*bs], sdd, &pC[jj*bs+ii*sdc], &pD[jj*bs+ii*sdd], &pD[jj*bs+jj*sdd], &inv_diag_D[jj]);
+ }
+ if(jj<ie)
+ {
+ kernel_strsm_nn_ru_inv_4x4_vs_lib4(jj, &pD[ii*sdd], &pD[jj*bs], sdd, &pC[jj*bs+ii*sdc], &pD[jj*bs+ii*sdd], &pD[jj*bs+jj*sdd], &inv_diag_D[jj], m-ii, ie-jj);
+ jj+=4;
+ }
+ // factorize
+ if(jj<n-3)
+ {
+ kernel_sgetrf_nn_4x4_lib4(jj, &pD[ii*sdd], &pD[jj*bs], sdd, &pC[jj*bs+ii*sdc], &pD[jj*bs+ii*sdd], &inv_diag_D[jj]);
+ jj+=4;
+ }
+ else if(jj<n)
+ {
+ kernel_sgetrf_nn_4x4_vs_lib4(jj, &pD[ii*sdd], &pD[jj*bs], sdd, &pC[jj*bs+ii*sdc], &pD[jj*bs+ii*sdd], &inv_diag_D[jj], m-ii, n-jj);
+ jj+=4;
+ }
+ // solve upper
+ for( ; jj<n-3; jj+=4)
+ {
+ kernel_strsm_nn_ll_one_4x4_lib4(ii, &pD[ii*sdd], &pD[jj*bs], sdd, &pC[jj*bs+ii*sdc], &pD[jj*bs+ii*sdd], &pD[ii*bs+ii*sdd]);
+ }
+ if(jj<n)
+ {
+ kernel_strsm_nn_ll_one_4x4_vs_lib4(ii, &pD[ii*sdd], &pD[jj*bs], sdd, &pC[jj*bs+ii*sdc], &pD[jj*bs+ii*sdd], &pD[ii*bs+ii*sdd], m-ii, n-jj);
+ }
+ }
+ if(m>ii)
+ {
+ goto left_4;
+ }
+
+ // common return if ii==m
+ return;
+
+ left_4:
+ jj = 0;
+ // solve lower
+ ie = n<ii ? n : ii; // ie is multiple of 4
+ for( ; jj<ie; jj+=4)
+ {
+ kernel_strsm_nn_ru_inv_4x4_vs_lib4(jj, &pD[ii*sdd], &pD[jj*bs], sdd, &pC[jj*bs+ii*sdc], &pD[jj*bs+ii*sdd], &pD[jj*bs+jj*sdd], &inv_diag_D[jj], m-ii, ie-jj);
+ }
+ // factorize
+ if(jj<n)
+ {
+ kernel_sgetrf_nn_4x4_vs_lib4(jj, &pD[ii*sdd], &pD[jj*bs], sdd, &pC[jj*bs+ii*sdc], &pD[jj*bs+ii*sdd], &inv_diag_D[jj], m-ii, n-jj);
+ jj+=4;
+ }
+ // solve upper
+ for( ; jj<n; jj+=4)
+ {
+ kernel_strsm_nn_ll_one_4x4_vs_lib4(ii, &pD[ii*sdd], &pD[jj*bs], sdd, &pC[jj*bs+ii*sdc], &pD[jj*bs+ii*sdd], &pD[ii*bs+ii*sdd], m-ii, n-jj);
+ }
+ return;
+
+ }
+
+
+
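+// LU factorization with partial (row) pivoting, 4 columns at a time: update the
+// current column block with the panels factorized so far, factorize it picking
+// the pivots, apply the row interchanges to the rest of the matrix, then solve
+// the rows of U to the right of the block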
+void sgetrf_nn_lib(int m, int n, float *pC, int sdc, float *pD, int sdd, float *inv_diag_D, int *ipiv)
+ {
+
+ if(m<=0)
+ return;
+
+ const int bs = 4;
+
+ int ii, jj, i0, i1, j0, ll, p;
+
+ float d1 = 1.0;
+ float dm1 = -1.0;
+
+// // needs to perform row-exchanges on the yet-to-be-factorized matrix too
+// if(pC!=pD)
+// sgecp_lib(m, n, 1.0, 0, pC, sdc, 0, pD, sdd);
+
+ // minimum matrix size
+ p = n<m ? n : m; // XXX
+
+ // main loop
+ // 4 columns at a time
+ jj = 0;
+ for(; jj<p-3; jj+=4) // XXX
+ {
+ // pivot & factorize & solve lower
+ ii = jj;
+ i0 = ii;
+ for( ; ii<m-3; ii+=4)
+ {
+ kernel_sgemm_nn_4x4_lib4(jj, &dm1, &pD[ii*sdd], &pD[jj*bs], sdd, &d1, &pD[jj*bs+ii*sdd], &pD[jj*bs+ii*sdd]);
+ }
+ if(m-ii>0)
+ {
+ kernel_sgemm_nn_4x4_vs_lib4(jj, &dm1, &pD[ii*sdd], &pD[jj*bs], sdd, &d1, &pD[jj*bs+ii*sdd], &pD[jj*bs+ii*sdd], m-ii, 4);
+ }
+ kernel_sgetrf_pivot_4_lib4(m-i0, &pD[jj*bs+i0*sdd], sdd, &inv_diag_D[jj], &ipiv[i0]);
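+ // the pivot kernel returns ipiv relative to the current block: shift to
+ // absolute row indices, then swap the row segments to the left (jj columns)
+ // and to the right (n-jj-4 columns) of the 4-column block; the block itself
+ // is handled inside the pivot kernel, hence the swaps below skip its columns
+ // (row r of the panel-major pD is addressed as pD + (r/bs)*bs*sdd + r%bs)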
+ ipiv[i0+0] += i0;
+ if(ipiv[i0+0]!=i0+0)
+ {
+ srowsw_lib(jj, pD+(i0+0)/bs*bs*sdd+(i0+0)%bs, pD+(ipiv[i0+0])/bs*bs*sdd+(ipiv[i0+0])%bs);
+ srowsw_lib(n-jj-4, pD+(i0+0)/bs*bs*sdd+(i0+0)%bs+(jj+4)*bs, pD+(ipiv[i0+0])/bs*bs*sdd+(ipiv[i0+0])%bs+(jj+4)*bs);
+ }
+ ipiv[i0+1] += i0;
+ if(ipiv[i0+1]!=i0+1)
+ {
+ srowsw_lib(jj, pD+(i0+1)/bs*bs*sdd+(i0+1)%bs, pD+(ipiv[i0+1])/bs*bs*sdd+(ipiv[i0+1])%bs);
+ srowsw_lib(n-jj-4, pD+(i0+1)/bs*bs*sdd+(i0+1)%bs+(jj+4)*bs, pD+(ipiv[i0+1])/bs*bs*sdd+(ipiv[i0+1])%bs+(jj+4)*bs);
+ }
+ ipiv[i0+2] += i0;
+ if(ipiv[i0+2]!=i0+2)
+ {
+ srowsw_lib(jj, pD+(i0+2)/bs*bs*sdd+(i0+2)%bs, pD+(ipiv[i0+2])/bs*bs*sdd+(ipiv[i0+2])%bs);
+ srowsw_lib(n-jj-4, pD+(i0+2)/bs*bs*sdd+(i0+2)%bs+(jj+4)*bs, pD+(ipiv[i0+2])/bs*bs*sdd+(ipiv[i0+2])%bs+(jj+4)*bs);
+ }
+ ipiv[i0+3] += i0;
+ if(ipiv[i0+3]!=i0+3)
+ {
+ srowsw_lib(jj, pD+(i0+3)/bs*bs*sdd+(i0+3)%bs, pD+(ipiv[i0+3])/bs*bs*sdd+(ipiv[i0+3])%bs);
+ srowsw_lib(n-jj-4, pD+(i0+3)/bs*bs*sdd+(i0+3)%bs+(jj+4)*bs, pD+(ipiv[i0+3])/bs*bs*sdd+(ipiv[i0+3])%bs+(jj+4)*bs);
+ }
+
+ // solve upper
+ ll = jj+4;
+ for( ; ll<n-3; ll+=4)
+ {
+ kernel_strsm_nn_ll_one_4x4_lib4(i0, &pD[i0*sdd], &pD[ll*bs], sdd, &pD[ll*bs+i0*sdd], &pD[ll*bs+i0*sdd], &pD[i0*bs+i0*sdd]);
+ }
+ if(n-ll>0)
+ {
+ kernel_strsm_nn_ll_one_4x4_vs_lib4(i0, &pD[i0*sdd], &pD[ll*bs], sdd, &pD[ll*bs+i0*sdd], &pD[ll*bs+i0*sdd], &pD[i0*bs+i0*sdd], 4, n-ll);
+ }
+ }
+ if(m>=n)
+ {
+ if(n-jj>0)
+ {
+ goto left_n_4;
+ }
+ }
+ else
+ {
+ if(m-jj>0)
+ {
+ goto left_m_4;
+ }
+ }
+
+ // common return if jj==n
+ return;
+
+ // clean up
+
+ left_n_4:
+ // 1-4 columns at a time
+ // pivot & factorize & solve lower
+ ii = jj;
+ i0 = ii;
+ for( ; ii<m; ii+=4)
+ {
+ kernel_sgemm_nn_4x4_vs_lib4(jj, &dm1, &pD[ii*sdd], &pD[jj*bs], sdd, &d1, &pD[jj*bs+ii*sdd], &pD[jj*bs+ii*sdd], m-ii, n-jj);
+ }
+ kernel_sgetrf_pivot_4_vs_lib4(m-i0, n-jj, &pD[jj*bs+i0*sdd], sdd, &inv_diag_D[jj], &ipiv[i0]);
+ ipiv[i0+0] += i0;
+ if(ipiv[i0+0]!=i0+0)
+ {
+ srowsw_lib(jj, pD+(i0+0)/bs*bs*sdd+(i0+0)%bs, pD+(ipiv[i0+0])/bs*bs*sdd+(ipiv[i0+0])%bs);
+ srowsw_lib(n-jj-4, pD+(i0+0)/bs*bs*sdd+(i0+0)%bs+(jj+4)*bs, pD+(ipiv[i0+0])/bs*bs*sdd+(ipiv[i0+0])%bs+(jj+4)*bs);
+ }
+ if(n-jj>1)
+ {
+ ipiv[i0+1] += i0;
+ if(ipiv[i0+1]!=i0+1)
+ {
+ srowsw_lib(jj, pD+(i0+1)/bs*bs*sdd+(i0+1)%bs, pD+(ipiv[i0+1])/bs*bs*sdd+(ipiv[i0+1])%bs);
+ srowsw_lib(n-jj-4, pD+(i0+1)/bs*bs*sdd+(i0+1)%bs+(jj+4)*bs, pD+(ipiv[i0+1])/bs*bs*sdd+(ipiv[i0+1])%bs+(jj+4)*bs);
+ }
+ if(n-jj>2)
+ {
+ ipiv[i0+2] += i0;
+ if(ipiv[i0+2]!=i0+2)
+ {
+ srowsw_lib(jj, pD+(i0+2)/bs*bs*sdd+(i0+2)%bs, pD+(ipiv[i0+2])/bs*bs*sdd+(ipiv[i0+2])%bs);
+ srowsw_lib(n-jj-4, pD+(i0+2)/bs*bs*sdd+(i0+2)%bs+(jj+4)*bs, pD+(ipiv[i0+2])/bs*bs*sdd+(ipiv[i0+2])%bs+(jj+4)*bs);
+ }
+ if(n-jj>3)
+ {
+ ipiv[i0+3] += i0;
+ if(ipiv[i0+3]!=i0+3)
+ {
+ srowsw_lib(jj, pD+(i0+3)/bs*bs*sdd+(i0+3)%bs, pD+(ipiv[i0+3])/bs*bs*sdd+(ipiv[i0+3])%bs);
+ srowsw_lib(n-jj-4, pD+(i0+3)/bs*bs*sdd+(i0+3)%bs+(jj+4)*bs, pD+(ipiv[i0+3])/bs*bs*sdd+(ipiv[i0+3])%bs+(jj+4)*bs);
+ }
+ }
+ }
+ }
+
+ // solve upper
+ if(0) // there is no upper
+ {
+ ll = jj+4;
+ for( ; ll<n; ll+=4)
+ {
+ kernel_strsm_nn_ll_one_4x4_vs_lib4(i0, &pD[i0*sdd], &pD[ll*bs], sdd, &pD[ll*bs+i0*sdd], &pD[ll*bs+i0*sdd], &pD[i0*bs+i0*sdd], m-i0, n-ll);
+ }
+ }
+ return;
+
+
+ left_m_4:
+ // 1-4 rows at a time
+ // pivot & factorize & solve lower
+ ii = jj;
+ i0 = ii;
+ kernel_sgemm_nn_4x4_vs_lib4(jj, &dm1, &pD[ii*sdd], &pD[jj*bs], sdd, &d1, &pD[jj*bs+ii*sdd], &pD[jj*bs+ii*sdd], m-ii, n-jj);
+ kernel_sgetrf_pivot_4_vs_lib4(m-i0, n-jj, &pD[jj*bs+i0*sdd], sdd, &inv_diag_D[jj], &ipiv[i0]);
+ ipiv[i0+0] += i0;
+ if(ipiv[i0+0]!=i0+0)
+ {
+ srowsw_lib(jj, pD+(i0+0)/bs*bs*sdd+(i0+0)%bs, pD+(ipiv[i0+0])/bs*bs*sdd+(ipiv[i0+0])%bs);
+ srowsw_lib(n-jj-4, pD+(i0+0)/bs*bs*sdd+(i0+0)%bs+(jj+4)*bs, pD+(ipiv[i0+0])/bs*bs*sdd+(ipiv[i0+0])%bs+(jj+4)*bs);
+ }
+ if(m-i0>1)
+ {
+ ipiv[i0+1] += i0;
+ if(ipiv[i0+1]!=i0+1)
+ {
+ srowsw_lib(jj, pD+(i0+1)/bs*bs*sdd+(i0+1)%bs, pD+(ipiv[i0+1])/bs*bs*sdd+(ipiv[i0+1])%bs);
+ srowsw_lib(n-jj-4, pD+(i0+1)/bs*bs*sdd+(i0+1)%bs+(jj+4)*bs, pD+(ipiv[i0+1])/bs*bs*sdd+(ipiv[i0+1])%bs+(jj+4)*bs);
+ }
+ if(m-i0>2)
+ {
+ ipiv[i0+2] += i0;
+ if(ipiv[i0+2]!=i0+2)
+ {
+ srowsw_lib(jj, pD+(i0+2)/bs*bs*sdd+(i0+2)%bs, pD+(ipiv[i0+2])/bs*bs*sdd+(ipiv[i0+2])%bs);
+ srowsw_lib(n-jj-4, pD+(i0+2)/bs*bs*sdd+(i0+2)%bs+(jj+4)*bs, pD+(ipiv[i0+2])/bs*bs*sdd+(ipiv[i0+2])%bs+(jj+4)*bs);
+ }
+ if(m-i0>3)
+ {
+ ipiv[i0+3] += i0;
+ if(ipiv[i0+3]!=i0+3)
+ {
+ srowsw_lib(jj, pD+(i0+3)/bs*bs*sdd+(i0+3)%bs, pD+(ipiv[i0+3])/bs*bs*sdd+(ipiv[i0+3])%bs);
+ srowsw_lib(n-jj-4, pD+(i0+3)/bs*bs*sdd+(i0+3)%bs+(jj+4)*bs, pD+(ipiv[i0+3])/bs*bs*sdd+(ipiv[i0+3])%bs+(jj+4)*bs);
+ }
+ }
+ }
+ }
+
+ // solve upper
+ ll = jj+4;
+ for( ; ll<n; ll+=4)
+ {
+ kernel_strsm_nn_ll_one_4x4_vs_lib4(i0, &pD[i0*sdd], &pD[ll*bs], sdd, &pD[ll*bs+i0*sdd], &pD[ll*bs+i0*sdd], &pD[i0*bs+i0*sdd], m-i0, n-ll);
+ }
+ return;
+
+ }
+
+
+
+/****************************
+* new interface
+****************************/
+
+
+
+#if defined(LA_HIGH_PERFORMANCE)
+
+
+
+// spotrf
+void spotrf_l_libstr(int m, struct s_strmat *sC, int ci, int cj, struct s_strmat *sD, int di, int dj)
+ {
+
+ if(m<=0)
+ return;
+
+ if(ci!=0 | di!=0)
+ {
+ printf("\nspotrf_l_libstr: feature not implemented yet: ci=%d, di=%d\n", ci, di);
+ exit(1);
+ }
+
+ const int bs = 4;
+
+ int sdc = sC->cn;
+ int sdd = sD->cn;
+ float *pC = sC->pA + cj*bs;
+ float *pD = sD->pA + dj*bs;
+ float *dD = sD->dA;
+ if(di==0 && dj==0) // XXX what to do if di and dj are not zero
+ sD->use_dA = 1;
+ else
+ sD->use_dA = 0;
+
+ int i, j, l;
+
+ i = 0;
+ for(; i<m-3; i+=4)
+ {
+ j = 0;
+ for(; j<i; j+=4)
+ {
+ kernel_strsm_nt_rl_inv_4x4_lib4(j, &pD[i*sdd], &pD[j*sdd], &pC[j*bs+i*sdc], &pD[j*bs+i*sdd], &pD[j*bs+j*sdd], &dD[j]);
+ }
+ kernel_spotrf_nt_l_4x4_lib4(j, &pD[i*sdd], &pD[j*sdd], &pC[j*bs+j*sdc], &pD[j*bs+j*sdd], &dD[j]);
+ }
+ if(m>i)
+ {
+ goto left_4;
+ }
+
+ // common return if i==m
+ return;
+
+ // clean up loops definitions
+
+ left_4: // 1 - 3
+ j = 0;
+ for(; j<i; j+=4)
+ {
+ kernel_strsm_nt_rl_inv_4x4_vs_lib4(j, &pD[i*sdd], &pD[j*sdd], &pC[j*bs+i*sdc], &pD[j*bs+i*sdd], &pD[j*bs+j*sdd], &dD[j], m-i, m-j);
+ }
+ kernel_spotrf_nt_l_4x4_vs_lib4(j, &pD[i*sdd], &pD[j*sdd], &pC[j*bs+j*sdc], &pD[j*bs+j*sdd], &dD[j], m-i, m-j);
+ return;
+ }
+
+
+
+// spotrf, m x n variant
+void spotrf_l_mn_libstr(int m, int n, struct s_strmat *sC, int ci, int cj, struct s_strmat *sD, int di, int dj)
+ {
+
+ if(m<=0 || n<=0)
+ return;
+
+ if(ci!=0 | di!=0)
+ {
+ printf("\nspotrf_l_libstr: feature not implemented yet: ci=%d, di=%d\n", ci, di);
+ exit(1);
+ }
+
+ const int bs = 4;
+
+ int sdc = sC->cn;
+ int sdd = sD->cn;
+ float *pC = sC->pA + cj*bs;
+ float *pD = sD->pA + dj*bs;
+ float *dD = sD->dA;
+ if(di==0 && dj==0) // XXX what to do if di and dj are not zero
+ sD->use_dA = 1;
+ else
+ sD->use_dA = 0;
+
+ int i, j, l;
+
+ i = 0;
+ for(; i<m-3; i+=4)
+ {
+ j = 0;
+ for(; j<i && j<n-3; j+=4)
+ {
+ kernel_strsm_nt_rl_inv_4x4_lib4(j, &pD[i*sdd], &pD[j*sdd], &pC[j*bs+i*sdc], &pD[j*bs+i*sdd], &pD[j*bs+j*sdd], &dD[j]);
+ }
+ if(j<n)
+ {
+ if(i<j) // strsm
+ {
+ kernel_strsm_nt_rl_inv_4x4_vs_lib4(j, &pD[i*sdd], &pD[j*sdd], &pC[j*bs+i*sdc], &pD[j*bs+i*sdd], &pD[j*bs+j*sdd], &dD[j], m-i, n-j);
+ }
+ else // spotrf
+ {
+ if(j<n-3)
+ {
+ kernel_spotrf_nt_l_4x4_lib4(j, &pD[i*sdd], &pD[j*sdd], &pC[j*bs+j*sdc], &pD[j*bs+j*sdd], &dD[j]);
+ }
+ else
+ {
+ kernel_spotrf_nt_l_4x4_vs_lib4(j, &pD[i*sdd], &pD[j*sdd], &pC[j*bs+j*sdc], &pD[j*bs+j*sdd], &dD[j], m-i, n-j);
+ }
+ }
+ }
+ }
+ if(m>i)
+ {
+ goto left_4;
+ }
+
+ // common return if i==m
+ return;
+
+ // clean up loops definitions
+
+ left_4:
+ j = 0;
+ for(; j<i && j<n-3; j+=4)
+ {
+ kernel_strsm_nt_rl_inv_4x4_vs_lib4(j, &pD[i*sdd], &pD[j*sdd], &pC[j*bs+i*sdc], &pD[j*bs+i*sdd], &pD[j*bs+j*sdd], &dD[j], m-i, n-j);
+ }
+ if(j<n)
+ {
+ if(j<i) // strsm
+ {
+ kernel_strsm_nt_rl_inv_4x4_vs_lib4(j, &pD[i*sdd], &pD[j*sdd], &pC[j*bs+i*sdc], &pD[j*bs+i*sdd], &pD[j*bs+j*sdd], &dD[j], m-i, n-j);
+ }
+ else // spotrf
+ {
+ kernel_spotrf_nt_l_4x4_vs_lib4(j, &pD[i*sdd], &pD[j*sdd], &pC[j*bs+j*sdc], &pD[j*bs+j*sdd], &dD[j], m-i, n-j);
+ }
+ }
+ return;
+ }
+
+
+
+// ssyrk + spotrf (fused)
+void ssyrk_spotrf_ln_libstr(int m, int n, int k, struct s_strmat *sA, int ai, int aj, struct s_strmat *sB, int bi, int bj, struct s_strmat *sC, int ci, int cj, struct s_strmat *sD, int di, int dj)
+ {
+ if(ai!=0 | bi!=0 | ci!=0 | di!=0)
+ {
+ printf("\nssyrk_spotrf_ln_libstr: feature not implemented yet: ai=%d, bi=%d, ci=%d, di=%d\n", ai, bi, ci, di);
+ exit(1);
+ }
+ const int bs = 4;
+ int sda = sA->cn;
+ int sdb = sB->cn;
+ int sdc = sC->cn;
+ int sdd = sD->cn;
+ float *pA = sA->pA + aj*bs;
+ float *pB = sB->pA + bj*bs;
+ float *pC = sC->pA + cj*bs;
+ float *pD = sD->pA + dj*bs;
+ float *dD = sD->dA; // XXX what to do if di and dj are not zero
+ ssyrk_spotrf_nt_l_lib(m, n, k, pA, sda, pB, sdb, pC, sdc, pD, sdd, dD);
+ if(di==0 && dj==0)
+ sD->use_dA = 1;
+ else
+ sD->use_dA = 0;
+ return;
+ }
+
+
+
+// sgetrf without pivoting
+void sgetrf_nopivot_libstr(int m, int n, struct s_strmat *sC, int ci, int cj, struct s_strmat *sD, int di, int dj)
+ {
+ if(ci!=0 | di!=0)
+ {
+ printf("\nsgetf_nopivot_libstr: feature not implemented yet: ci=%d, di=%d\n", ci, di);
+ exit(1);
+ }
+ const int bs = 4;
+ int sdc = sC->cn;
+ int sdd = sD->cn;
+ float *pC = sC->pA + cj*bs;
+ float *pD = sD->pA + dj*bs;
+ float *dD = sD->dA; // XXX what to do if di and dj are not zero
+ sgetrf_nn_nopivot_lib(m, n, pC, sdc, pD, sdd, dD);
+ if(di==0 && dj==0)
+ sD->use_dA = 1;
+ else
+ sD->use_dA = 0;
+ return;
+ }
+
+
+
+
+// sgetrf with partial pivoting
+void sgetrf_libstr(int m, int n, struct s_strmat *sC, int ci, int cj, struct s_strmat *sD, int di, int dj, int *ipiv)
+ {
+ if(ci!=0 | di!=0)
+ {
+ printf("\nsgetrf_libstr: feature not implemented yet: ci=%d, di=%d\n", ci, di);
+ exit(1);
+ }
+ const int bs = 4;
+ int sdc = sC->cn;
+ int sdd = sD->cn;
+ float *pC = sC->pA + cj*bs;
+ float *pD = sD->pA + dj*bs;
+ float *dD = sD->dA; // XXX what to do if di and dj are not zero
+ // needs to perform row-exchanges on the yet-to-be-factorized matrix too
+ if(pC!=pD)
+ sgecp_libstr(m, n, sC, ci, cj, sD, di, dj);
+ sgetrf_nn_lib(m, n, pC, sdc, pD, sdd, dD, ipiv);
+ if(di==0 && dj==0)
+ sD->use_dA = 1;
+ else
+ sD->use_dA = 0;
+ return;
+ }
+
+
+
+int sgeqrf_work_size_libstr(int m, int n)
+ {
+ printf("\nsgeqrf_work_size_libstr: feature not implemented yet\n");
+ exit(1);
+ return 0;
+ }
+
+
+
+void sgeqrf_libstr(int m, int n, struct s_strmat *sC, int ci, int cj, struct s_strmat *sD, int di, int dj, void *work)
+ {
+ if(m<=0 | n<=0)
+ return;
+ printf("\nsgeqrf_libstr: feature not implemented yet\n");
+ exit(1);
+ return;
+ }
+
+
+
+#else
+
+#error : wrong LA choice
+
+#endif
+
+
+
diff --git a/blas/s_lapack_lib8.c b/blas/s_lapack_lib8.c
new file mode 100644
index 0000000..3b5239e
--- /dev/null
+++ b/blas/s_lapack_lib8.c
@@ -0,0 +1,872 @@
+/**************************************************************************************************
+* *
+* This file is part of BLASFEO. *
+* *
+* BLASFEO -- BLAS For Embedded Optimization. *
+* Copyright (C) 2016-2017 by Gianluca Frison. *
+* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl. *
+* All rights reserved. *
+* *
+* HPMPC is free software; you can redistribute it and/or *
+* modify it under the terms of the GNU Lesser General Public *
+* License as published by the Free Software Foundation; either *
+* version 2.1 of the License, or (at your option) any later version. *
+* *
+* HPMPC is distributed in the hope that it will be useful, *
+* but WITHOUT ANY WARRANTY; without even the implied warranty of *
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. *
+* See the GNU Lesser General Public License for more details. *
+* *
+* You should have received a copy of the GNU Lesser General Public *
+* License along with HPMPC; if not, write to the Free Software *
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA *
+* *
+* Author: Gianluca Frison, giaf (at) dtu.dk *
+* gianluca.frison (at) imtek.uni-freiburg.de *
+* *
+**************************************************************************************************/
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <math.h>
+
+#include "../include/blasfeo_common.h"
+#include "../include/blasfeo_s_aux.h"
+#include "../include/blasfeo_s_kernel.h"
+
+
+
+void spotrf_l_libstr(int m, struct s_strmat *sC, int ci, int cj, struct s_strmat *sD, int di, int dj)
+ {
+
+ if(m<=0)
+ return;
+
+ if(ci>0 | di>0)
+ {
+ printf("\nspotrf_l_libstr: feature not implemented yet: ci>0, di>0\n");
+ exit(1);
+ }
+
+ const int bs = 8;
+
+ int i, j;
+
+ int sdc = sC->cn;
+ int sdd = sD->cn;
+ float *pC = sC->pA + cj*bs;
+ float *pD = sD->pA + dj*bs;
+ float *dD = sD->dA; // XXX what to do if di and dj are not zero
+ if(di==0 & dj==0)
+ sD->use_dA = 1;
+ else
+ sD->use_dA = 0;
+
+ i = 0;
+#if defined(TARGET_X64_INTEL_HASWELL)
+ for(; i<m-23; i+=24)
+ {
+ j = 0;
+ for(; j<i; j+=8)
+ {
+ kernel_strsm_nt_rl_inv_24x4_lib8(j+0, &pD[i*sdd], sdd, &pD[0+j*sdd], &pC[(j+0)*bs+i*sdc], sdc, &pD[(j+0)*bs+i*sdd], sdd, &pD[0+(j+0)*bs+(j+0)*sdd], &dD[j+0]);
+ kernel_strsm_nt_rl_inv_24x4_lib8(j+4, &pD[i*sdd], sdd, &pD[4+j*sdd], &pC[(j+4)*bs+i*sdc], sdc, &pD[(j+4)*bs+i*sdd], sdd, &pD[4+(j+4)*bs+(j+0)*sdd], &dD[j+4]);
+ }
+ kernel_spotrf_nt_l_24x4_lib8((j+0), &pD[(i+0)*sdd], sdd, &pD[(j+0)*sdd], &pC[(j+0)*bs+(j+0)*sdc], sdc, &pD[(j+0)*bs+(j+0)*sdd], sdd, &dD[j+0]);
+ kernel_spotrf_nt_l_20x4_lib8((j+4), &pD[(i+0)*sdd], sdd, &pD[4+(j+0)*sdd], &pC[(j+4)*bs+(j+0)*sdc], sdc, &pD[(j+4)*bs+(j+0)*sdd], sdd, &dD[j+4]);
+ kernel_spotrf_nt_l_16x4_lib8((j+8), &pD[(i+8)*sdd], sdd, &pD[(j+8)*sdd], &pC[(j+8)*bs+(j+8)*sdc], sdc, &pD[(j+8)*bs+(j+8)*sdd], sdd, &dD[j+8]);
+ kernel_spotrf_nt_l_12x4_lib8((j+12), &pD[(i+8)*sdd], sdd, &pD[4+(j+8)*sdd], &pC[(j+12)*bs+(j+8)*sdc], sdc, &pD[(j+12)*bs+(j+8)*sdd], sdd, &dD[j+12]);
+ kernel_spotrf_nt_l_8x8_lib8(j+16, &pD[(i+16)*sdd], &pD[(j+16)*sdd], &pC[(j+16)*bs+(j+16)*sdc], &pD[(j+16)*bs+(j+16)*sdd], &dD[j+16]);
+ }
+ if(m>i)
+ {
+ if(m-i<=4)
+ {
+ goto left_4;
+ }
+ else if(m-i<=8)
+ {
+ goto left_8;
+ }
+ else if(m-i<=12)
+ {
+ goto left_12;
+ }
+ else if(m-i<=16)
+ {
+ goto left_16;
+ }
+ else
+ {
+ goto left_24;
+ }
+ }
+#else
+ for(; i<m-15; i+=16)
+ {
+ j = 0;
+ for(; j<i; j+=8)
+ {
+ kernel_strsm_nt_rl_inv_16x4_lib8(j+0, &pD[i*sdd], sdd, &pD[0+j*sdd], &pC[(j+0)*bs+i*sdc], sdc, &pD[(j+0)*bs+i*sdd], sdd, &pD[0+(j+0)*bs+(j+0)*sdd], &dD[j+0]);
+ kernel_strsm_nt_rl_inv_16x4_lib8(j+4, &pD[i*sdd], sdd, &pD[4+j*sdd], &pC[(j+4)*bs+i*sdc], sdc, &pD[(j+4)*bs+i*sdd], sdd, &pD[4+(j+4)*bs+(j+0)*sdd], &dD[j+4]);
+ }
+ kernel_spotrf_nt_l_16x4_lib8(j+0, &pD[i*sdd], sdd, &pD[0+j*sdd], &pC[(j+0)*bs+(j+0)*sdc], sdc, &pD[(j+0)*bs+(j+0)*sdd], sdd, &dD[j+0]);
+ kernel_spotrf_nt_l_12x4_lib8(j+4, &pD[i*sdd], sdd, &pD[4+j*sdd], &pC[(j+4)*bs+(j+0)*sdc], sdc, &pD[(j+4)*bs+(j+0)*sdd], sdd, &dD[j+4]);
+ kernel_spotrf_nt_l_8x8_lib8((j+8), &pD[(i+8)*sdd], &pD[(j+8)*sdd], &pC[(j+8)*bs+(j+8)*sdc], &pD[(j+8)*bs+(j+8)*sdd], &dD[j+8]);
+ }
+ if(m>i)
+ {
+ if(m-i<=8)
+ {
+ goto left_8;
+ }
+ else
+ {
+ goto left_16;
+ }
+ }
+#endif
+
+ // common return if i==m
+ return;
+
+ // clean up loops definitions
+
+#if defined(TARGET_X64_INTEL_HASWELL)
+ left_24: // 17 <= m <= 23
+ j = 0;
+ for(; j<i & j<m-7; j+=8)
+ {
+ kernel_strsm_nt_rl_inv_24x4_vs_lib8(j+0, &pD[i*sdd], sdd, &pD[0+j*sdd], &pC[(j+0)*bs+i*sdc], sdc, &pD[(j+0)*bs+i*sdd], sdd, &pD[0+(j+0)*bs+(j+0)*sdd], &dD[j+0], m-i, m-(j+0));
+ kernel_strsm_nt_rl_inv_24x4_vs_lib8(j+4, &pD[i*sdd], sdd, &pD[4+j*sdd], &pC[(j+4)*bs+i*sdc], sdc, &pD[(j+4)*bs+i*sdd], sdd, &pD[4+(j+4)*bs+(j+0)*sdd], &dD[j+4], m-i, m-(j+4));
+ }
+ kernel_spotrf_nt_l_24x4_vs_lib8((j+0), &pD[(i+0)*sdd], sdd, &pD[(j+0)*sdd], &pC[(j+0)*bs+(j+0)*sdc], sdc, &pD[(j+0)*bs+(j+0)*sdd], sdd, &dD[j+0], m-(i+0), m-(j+0));
+ kernel_spotrf_nt_l_20x4_vs_lib8((j+4), &pD[(i+0)*sdd], sdd, &pD[4+(j+0)*sdd], &pC[(j+4)*bs+(j+0)*sdc], sdc, &pD[(j+4)*bs+(j+0)*sdd], sdd, &dD[j+4], m-(i+0), m-(j+4));
+ kernel_spotrf_nt_l_16x4_vs_lib8((j+8), &pD[(i+8)*sdd], sdd, &pD[(j+8)*sdd], &pC[(j+8)*bs+(j+8)*sdc], sdc, &pD[(j+8)*bs+(j+8)*sdd], sdd, &dD[j+8], m-(i+8), m-(j+8));
+ kernel_spotrf_nt_l_12x4_vs_lib8((j+12), &pD[(i+8)*sdd], sdd, &pD[4+(j+8)*sdd], &pC[(j+12)*bs+(j+8)*sdc], sdc, &pD[(j+12)*bs+(j+8)*sdd], sdd, &dD[j+12], m-(i+8), m-(j+12));
+ if(j<m-20) // 21 - 23
+ {
+ kernel_spotrf_nt_l_8x8_vs_lib8(j+16, &pD[(i+16)*sdd], &pD[(j+16)*sdd], &pC[(j+16)*bs+(j+16)*sdc], &pD[(j+16)*bs+(j+16)*sdd], &dD[j+16], m-(i+16), m-(j+16));
+ }
+ else // 17 18 19 20
+ {
+ kernel_spotrf_nt_l_8x4_vs_lib8(j+16, &pD[(i+16)*sdd], &pD[(j+16)*sdd], &pC[(j+16)*bs+(j+16)*sdc], &pD[(j+16)*bs+(j+16)*sdd], &dD[j+16], m-(i+16), m-(j+16));
+ }
+ return;
+#endif
+
+ left_16: // 9 <= m <= 16
+ j = 0;
+ for(; j<i; j+=8)
+ {
+ kernel_strsm_nt_rl_inv_16x4_vs_lib8(j+0, &pD[i*sdd], sdd, &pD[0+j*sdd], &pC[(j+0)*bs+i*sdc], sdc, &pD[(j+0)*bs+i*sdd], sdd, &pD[0+(j+0)*bs+(j+0)*sdd], &dD[j+0], m-i, m-(j+0));
+ kernel_strsm_nt_rl_inv_16x4_vs_lib8(j+4, &pD[i*sdd], sdd, &pD[4+j*sdd], &pC[(j+4)*bs+i*sdc], sdc, &pD[(j+4)*bs+i*sdd], sdd, &pD[4+(j+4)*bs+(j+0)*sdd], &dD[j+4], m-i, m-(j+4));
+ }
+ kernel_spotrf_nt_l_16x4_vs_lib8(j+0, &pD[(i+0)*sdd], sdd, &pD[0+j*sdd], &pC[(j+0)*bs+j*sdc], sdc, &pD[(j+0)*bs+j*sdd], sdd, &dD[j+0], m-(i+0), m-(j+0));
+ kernel_spotrf_nt_l_12x4_vs_lib8(j+4, &pD[(i+0)*sdd], sdd, &pD[4+j*sdd], &pC[(j+4)*bs+j*sdc], sdc, &pD[(j+4)*bs+j*sdd], sdd, &dD[j+4], m-(i+0), m-(j+4));
+ if(j<m-12) // 13 - 16
+ {
+ kernel_spotrf_nt_l_8x8_vs_lib8((j+8), &pD[(i+8)*sdd], &pD[(j+8)*sdd], &pC[(j+8)*bs+(j+8)*sdc], &pD[(j+8)*bs+(j+8)*sdd], &dD[j+8], m-(i+8), m-(j+8));
+ }
+ else // 9 - 12
+ {
+ kernel_spotrf_nt_l_8x4_vs_lib8((j+8), &pD[(i+8)*sdd], &pD[(j+8)*sdd], &pC[(j+8)*bs+(j+8)*sdc], &pD[(j+8)*bs+(j+8)*sdd], &dD[j+8], m-(i+8), m-(j+8));
+ }
+ return;
+
+#if defined(TARGET_X64_INTEL_HASWELL)
+ left_12: // 9 <= m <= 12
+ j = 0;
+ for(; j<i; j+=8)
+ {
+ kernel_strsm_nt_rl_inv_8x8_vs_lib8(j, &pD[i*sdd], &pD[j*sdd], &pC[j*bs+i*sdc], &pD[j*bs+i*sdd], &pD[j*bs+j*sdd], &dD[j], m-i, m-j);
+ kernel_strsm_nt_rl_inv_4x8_vs_lib8(j, &pD[(i+8)*sdd], &pD[j*sdd], &pC[j*bs+(i+8)*sdc], &pD[j*bs+(i+8)*sdd], &pD[j*bs+j*sdd], &dD[j], m-(i+8), m-j);
+ }
+ kernel_spotrf_nt_l_8x8_vs_lib8(j, &pD[i*sdd], &pD[j*sdd], &pC[j*bs+j*sdc], &pD[j*bs+j*sdd], &dD[j], m-i, m-j);
+ kernel_strsm_nt_rl_inv_4x8_vs_lib8(j, &pD[(i+8)*sdd], &pD[j*sdd], &pC[j*bs+(i+8)*sdc], &pD[j*bs+(i+8)*sdd], &pD[j*bs+j*sdd], &dD[j], m-(i+8), m-j);
+ if(j<m-8) // 9 - 12
+ {
+ kernel_spotrf_nt_l_8x4_vs_lib8((j+8), &pD[(i+8)*sdd], &pD[(j+8)*sdd], &pC[(j+8)*bs+(j+8)*sdc], &pD[(j+8)*bs+(j+8)*sdd], &dD[(j+8)], m-(i+8), m-(j+8));
+ }
+ return;
+#endif
+
+ left_8: // 1 <= m <= 8
+ j = 0;
+ for(; j<i; j+=8)
+ {
+ kernel_strsm_nt_rl_inv_8x8_vs_lib8(j, &pD[i*sdd], &pD[j*sdd], &pC[j*bs+i*sdc], &pD[j*bs+i*sdd], &pD[j*bs+j*sdd], &dD[j], m-i, m-j);
+ }
+ if(j<m-4) // 5 - 8
+ {
+ kernel_spotrf_nt_l_8x8_vs_lib8(j, &pD[i*sdd], &pD[j*sdd], &pC[j*bs+j*sdc], &pD[j*bs+j*sdd], &dD[j], m-i, m-j);
+ }
+ else // 1 - 4
+ {
+ kernel_spotrf_nt_l_8x4_vs_lib8(j, &pD[i*sdd], &pD[j*sdd], &pC[j*bs+j*sdc], &pD[j*bs+j*sdd], &dD[j], m-i, m-j);
+ }
+ return;
+
+#if defined(TARGET_X64_INTEL_HASWELL)
+ left_4: // 1 <= m <= 4
+ j = 0;
+ for(; j<i; j+=8)
+ {
+ kernel_strsm_nt_rl_inv_4x8_vs_lib8(j, &pD[i*sdd], &pD[j*sdd], &pC[j*bs+i*sdc], &pD[j*bs+i*sdd], &pD[j*bs+j*sdd], &dD[j], m-i, m-j);
+ }
+ kernel_spotrf_nt_l_8x4_vs_lib8(j, &pD[i*sdd], &pD[j*sdd], &pC[j*bs+j*sdc], &pD[j*bs+j*sdd], &dD[j], m-i, m-j);
+ return;
+#endif
+
+ }
+
+
+
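+// spotrf_l_mn: single-precision Cholesky factorization of an m x n panel (typically m>=n):
+// the n x n head of D gets the lower Cholesky factor of the head of C, and the
+// m-n rows below get the corresponding right triangular solve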
+void spotrf_l_mn_libstr(int m, int n, struct s_strmat *sC, int ci, int cj, struct s_strmat *sD, int di, int dj)
+ {
+
+ if(m<=0 | n<=0)
+ return;
+
+ if(ci>0 | di>0)
+ {
+ printf("\nspotrf_l_mn_libstr: feature not implemented yet: ci>0, di>0\n");
+ exit(1);
+ }
+
+ const int bs = 8;
+
+ int i, j;
+
+ int sdc = sC->cn;
+ int sdd = sD->cn;
+ float *pC = sC->pA + cj*bs;
+ float *pD = sD->pA + dj*bs;
+ float *dD = sD->dA; // XXX what to do if di and dj are not zero
+ if(di==0 & dj==0)
+ sD->use_dA = 1;
+ else
+ sD->use_dA = 0;
+
+ i = 0;
+#if defined(TARGET_X64_INTEL_HASWELL)
+ for(; i<m-23; i+=24)
+ {
+ j = 0;
+ for(; j<i & j<n-7; j+=8)
+ {
+ kernel_strsm_nt_rl_inv_24x4_lib8(j+0, &pD[i*sdd], sdd, &pD[0+j*sdd], &pC[(j+0)*bs+i*sdc], sdc, &pD[(j+0)*bs+i*sdd], sdd, &pD[0+(j+0)*bs+(j+0)*sdd], &dD[j+0]);
+ kernel_strsm_nt_rl_inv_24x4_lib8(j+4, &pD[i*sdd], sdd, &pD[4+j*sdd], &pC[(j+4)*bs+i*sdc], sdc, &pD[(j+4)*bs+i*sdd], sdd, &pD[4+(j+4)*bs+(j+0)*sdd], &dD[j+4]);
+ }
+ if(j<n)
+ {
+			if(j<i) // dtrsm
+ {
+ kernel_strsm_nt_rl_inv_24x4_vs_lib8(j+0, &pD[i*sdd], sdd, &pD[0+j*sdd], &pC[(j+0)*bs+i*sdc], sdc, &pD[(j+0)*bs+i*sdd], sdd, &pD[(j+0)*bs+(j+0)*sdd], &dD[j+0], m-i, n-(j+0));
+ if(j<n-4) // 5 6 7
+ {
+ kernel_strsm_nt_rl_inv_24x4_vs_lib8(j+4, &pD[i*sdd], sdd, &pD[4+j*sdd], &pC[(j+4)*bs+i*sdc], sdc, &pD[(j+4)*bs+i*sdd], sdd, &pD[(j+4)*bs+(j+4)*sdd], &dD[j+4], m-i, n-(j+4));
+ }
+ }
+ else // dpotrf
+ {
+ if(j<n-23)
+ {
+ kernel_spotrf_nt_l_24x4_lib8((j+0), &pD[(i+0)*sdd], sdd, &pD[(j+0)*sdd], &pC[(j+0)*bs+(j+0)*sdc], sdc, &pD[(j+0)*bs+(j+0)*sdd], sdd, &dD[j+0]);
+ kernel_spotrf_nt_l_20x4_lib8((j+4), &pD[(i+0)*sdd], sdd, &pD[4+(j+0)*sdd], &pC[(j+4)*bs+(j+0)*sdc], sdc, &pD[(j+4)*bs+(j+0)*sdd], sdd, &dD[j+4]);
+ kernel_spotrf_nt_l_16x4_lib8((j+8), &pD[(i+8)*sdd], sdd, &pD[(j+8)*sdd], &pC[(j+8)*bs+(j+8)*sdc], sdc, &pD[(j+8)*bs+(j+8)*sdd], sdd, &dD[j+8]);
+ kernel_spotrf_nt_l_12x4_lib8((j+12), &pD[(i+8)*sdd], sdd, &pD[4+(j+8)*sdd], &pC[(j+12)*bs+(j+8)*sdc], sdc, &pD[(j+12)*bs+(j+8)*sdd], sdd, &dD[j+12]);
+ kernel_spotrf_nt_l_8x8_lib8((j+16), &pD[(i+16)*sdd], &pD[(j+16)*sdd], &pC[(j+16)*bs+(j+16)*sdc], &pD[(j+16)*bs+(j+16)*sdd], &dD[j+16]);
+ }
+ else
+ {
+ if(j<n-4) // 5 - 23
+ {
+ kernel_spotrf_nt_l_24x4_vs_lib8((j+0), &pD[(i+0)*sdd], sdd, &pD[(j+0)*sdd], &pC[(j+0)*bs+(j+0)*sdc], sdc, &pD[(j+0)*bs+(j+0)*sdd], sdd, &dD[j+0], m-(i+0), n-(j+0));
+ kernel_spotrf_nt_l_20x4_vs_lib8((j+4), &pD[(i+0)*sdd], sdd, &pD[4+(j+0)*sdd], &pC[(j+4)*bs+(j+0)*sdc], sdc, &pD[(j+4)*bs+(j+0)*sdd], sdd, &dD[j+4], m-(i+0), n-(j+4));
+ if(j==n-8)
+ return;
+ if(j<n-12) // 13 - 23
+ {
+ kernel_spotrf_nt_l_16x4_vs_lib8((j+8), &pD[(i+8)*sdd], sdd, &pD[(j+8)*sdd], &pC[(j+8)*bs+(j+8)*sdc], sdc, &pD[(j+8)*bs+(j+8)*sdd], sdd, &dD[j+8], m-(i+8), n-(j+8));
+ kernel_spotrf_nt_l_12x4_vs_lib8((j+12), &pD[(i+8)*sdd], sdd, &pD[4+(j+8)*sdd], &pC[(j+12)*bs+(j+8)*sdc], sdc, &pD[(j+12)*bs+(j+8)*sdd], sdd, &dD[j+12], m-(i+8), n-(j+12));
+ if(j==n-16)
+ return;
+ if(j<n-20) // 21 - 23
+ {
+ kernel_spotrf_nt_l_8x8_vs_lib8(j+16, &pD[(i+16)*sdd], &pD[(j+16)*sdd], &pC[(j+16)*bs+(j+16)*sdc], &pD[(j+16)*bs+(j+16)*sdd], &dD[j+16], m-(i+16), n-(j+16));
+ }
+ else // 17 18 19 20
+ {
+ kernel_spotrf_nt_l_8x4_vs_lib8(j+16, &pD[(i+16)*sdd], &pD[(j+16)*sdd], &pC[(j+16)*bs+(j+16)*sdc], &pD[(j+16)*bs+(j+16)*sdd], &dD[j+16], m-(i+16), n-(j+16));
+ }
+ }
+ else // 9 10 11 12
+ {
+ kernel_spotrf_nt_l_16x4_vs_lib8(j+8, &pD[(i+8)*sdd], sdd, &pD[(j+8)*sdd], &pC[(j+8)*bs+(j+8)*sdc], sdc, &pD[(j+8)*bs+(j+8)*sdd], sdd, &dD[j+8], m-(i+8), n-(j+8));
+ }
+ }
+ else // 1 2 3 4
+ {
+ kernel_spotrf_nt_l_24x4_vs_lib8(j, &pD[(i+0)*sdd], sdd, &pD[j*sdd], &pC[j*bs+j*sdc], sdc, &pD[j*bs+j*sdd], sdd, &dD[j], m-(i+0), n-j);
+ }
+ }
+ }
+ }
+ }
+ if(m>i)
+ {
+ if(m-i<=8)
+ {
+ goto left_8;
+ }
+ else if(m-i<=16)
+ {
+ goto left_16;
+ }
+ else
+ {
+ goto left_24;
+ }
+ }
+#else
+ for(; i<m-15; i+=16)
+ {
+ j = 0;
+ for(; j<i & j<n-7; j+=8)
+ {
+ kernel_strsm_nt_rl_inv_16x4_lib8(j+0, &pD[i*sdd], sdd, &pD[0+j*sdd], &pC[(j+0)*bs+i*sdc], sdc, &pD[(j+0)*bs+i*sdd], sdd, &pD[0+(j+0)*bs+(j+0)*sdd], &dD[j+0]);
+ kernel_strsm_nt_rl_inv_16x4_lib8(j+4, &pD[i*sdd], sdd, &pD[4+j*sdd], &pC[(j+4)*bs+i*sdc], sdc, &pD[(j+4)*bs+i*sdd], sdd, &pD[4+(j+4)*bs+(j+0)*sdd], &dD[j+4]);
+ }
+ if(j<n)
+ {
+			if(j<i) // dtrsm
+ {
+ kernel_strsm_nt_rl_inv_16x4_vs_lib8(j+0, &pD[i*sdd], sdd, &pD[0+j*sdd], &pC[(j+0)*bs+i*sdc], sdc, &pD[(j+0)*bs+i*sdd], sdd, &pD[(j+0)*bs+(j+0)*sdd], &dD[j+0], m-i, n-(j+0));
+ if(j<n-4) // 5 6 7
+ {
+ kernel_strsm_nt_rl_inv_16x4_vs_lib8(j+4, &pD[i*sdd], sdd, &pD[4+j*sdd], &pC[(j+4)*bs+i*sdc], sdc, &pD[(j+4)*bs+i*sdd], sdd, &pD[(j+4)*bs+(j+4)*sdd], &dD[j+4], m-i, n-(j+4));
+ }
+ }
+ else // dpotrf
+ {
+ if(j<n-15)
+ {
+ kernel_spotrf_nt_l_16x4_lib8(j+0, &pD[i*sdd], sdd, &pD[0+j*sdd], &pC[(j+0)*bs+(j+0)*sdc], sdc, &pD[(j+0)*bs+(j+0)*sdd], sdd, &dD[j+0]);
+ kernel_spotrf_nt_l_12x4_lib8(j+4, &pD[i*sdd], sdd, &pD[4+j*sdd], &pC[(j+4)*bs+(j+0)*sdc], sdc, &pD[(j+4)*bs+(j+0)*sdd], sdd, &dD[j+4]);
+ kernel_spotrf_nt_l_8x8_lib8((j+8), &pD[(i+8)*sdd], &pD[(j+8)*sdd], &pC[(j+8)*bs+(j+8)*sdc], &pD[(j+8)*bs+(j+8)*sdd], &dD[j+8]);
+ }
+ else
+ {
+ if(j<n-4) // 5 - 15
+ {
+ kernel_spotrf_nt_l_16x4_vs_lib8((j+0), &pD[(i+0)*sdd], sdd, &pD[(j+0)*sdd], &pC[(j+0)*bs+(j+0)*sdc], sdc, &pD[(j+0)*bs+(j+0)*sdd], sdd, &dD[j+0], m-(i+0), n-(j+0));
+ kernel_spotrf_nt_l_12x4_vs_lib8((j+4), &pD[(i+0)*sdd], sdd, &pD[4+(j+0)*sdd], &pC[(j+4)*bs+(j+0)*sdc], sdc, &pD[(j+4)*bs+(j+0)*sdd], sdd, &dD[j+4], m-(i+0), n-(j+4));
+ if(j==n-8) // 8
+ return;
+ if(j<n-12) // 13 - 15
+ {
+ kernel_spotrf_nt_l_8x8_vs_lib8(j+8, &pD[(i+8)*sdd], &pD[(j+8)*sdd], &pC[(j+8)*bs+(j+8)*sdc], &pD[(j+8)*bs+(j+8)*sdd], &dD[j+8], m-(i+8), n-(j+8));
+ }
+ else // 9 10 11 12
+ {
+ kernel_spotrf_nt_l_8x4_vs_lib8(j+8, &pD[(i+8)*sdd], &pD[(j+8)*sdd], &pC[(j+8)*bs+(j+8)*sdc], &pD[(j+8)*bs+(j+8)*sdd], &dD[j+8], m-(i+8), n-(j+8));
+ }
+ }
+ else // 1 2 3 4
+ {
+ kernel_spotrf_nt_l_16x4_vs_lib8(j, &pD[(i+0)*sdd], sdd, &pD[j*sdd], &pC[j*bs+j*sdc], sdc, &pD[j*bs+j*sdd], sdd, &dD[j], m-(i+0), n-j);
+ }
+ }
+ }
+ }
+ }
+ if(m>i)
+ {
+ if(m-i<=8)
+ {
+ goto left_8;
+ }
+ else
+ {
+ goto left_16;
+ }
+ }
+#endif
+
+ // common return if i==m
+ return;
+
+	// clean-up loop definitions
+
+#if defined(TARGET_X64_INTEL_HASWELL)
+ left_24:
+ j = 0;
+ for(; j<i & j<n-7; j+=8)
+ {
+ kernel_strsm_nt_rl_inv_24x4_vs_lib8(j+0, &pD[i*sdd], sdd, &pD[0+j*sdd], &pC[(j+0)*bs+i*sdc], sdc, &pD[(j+0)*bs+i*sdd], sdd, &pD[0+(j+0)*bs+(j+0)*sdd], &dD[j+0], m-i, n-(j+0));
+ kernel_strsm_nt_rl_inv_24x4_vs_lib8(j+4, &pD[i*sdd], sdd, &pD[4+j*sdd], &pC[(j+4)*bs+i*sdc], sdc, &pD[(j+4)*bs+i*sdd], sdd, &pD[4+(j+4)*bs+(j+0)*sdd], &dD[j+4], m-i, n-(j+4));
+ }
+ if(j<n)
+ {
+ if(j<i) // dtrsm
+ {
+ kernel_strsm_nt_rl_inv_24x4_vs_lib8(j+0, &pD[i*sdd], sdd, &pD[0+j*sdd], &pC[(j+0)*bs+i*sdc], sdc, &pD[(j+0)*bs+i*sdd], sdd, &pD[(j+0)*bs+(j+0)*sdd], &dD[j+0], m-i, n-(j+0));
+ if(j<n-4) // 5 6 7
+ {
+ kernel_strsm_nt_rl_inv_24x4_vs_lib8(j+4, &pD[i*sdd], sdd, &pD[4+j*sdd], &pC[(j+4)*bs+i*sdc], sdc, &pD[(j+4)*bs+i*sdd], sdd, &pD[4+(j+4)*bs+(j+0)*sdd], &dD[j+4], m-i, n-(j+4));
+ }
+ }
+ else // dpotrf
+ {
+ if(j<n-4) // 5 - 23
+ {
+ kernel_spotrf_nt_l_24x4_vs_lib8((j+0), &pD[(i+0)*sdd], sdd, &pD[(j+0)*sdd], &pC[(j+0)*bs+(j+0)*sdc], sdc, &pD[(j+0)*bs+(j+0)*sdd], sdd, &dD[j+0], m-(i+0), n-(j+0));
+ kernel_spotrf_nt_l_20x4_vs_lib8((j+4), &pD[(i+0)*sdd], sdd, &pD[4+(j+0)*sdd], &pC[(j+4)*bs+(j+0)*sdc], sdc, &pD[(j+4)*bs+(j+0)*sdd], sdd, &dD[j+4], m-(i+0), n-(j+4));
+ if(j>=n-8)
+ return;
+ if(j<n-12) // 13 - 23
+ {
+ kernel_spotrf_nt_l_16x4_vs_lib8((j+8), &pD[(i+8)*sdd], sdd, &pD[(j+8)*sdd], &pC[(j+8)*bs+(j+8)*sdc], sdc, &pD[(j+8)*bs+(j+8)*sdd], sdd, &dD[j+8], m-(i+8), n-(j+8));
+ kernel_spotrf_nt_l_12x4_vs_lib8((j+12), &pD[(i+8)*sdd], sdd, &pD[4+(j+8)*sdd], &pC[(j+12)*bs+(j+8)*sdc], sdc, &pD[(j+12)*bs+(j+8)*sdd], sdd, &dD[j+12], m-(i+8), n-(j+12));
+ if(j>=n-16)
+ return;
+ if(j<n-20) // 21 - 23
+ {
+ kernel_spotrf_nt_l_8x8_vs_lib8(j+16, &pD[(i+16)*sdd], &pD[(j+16)*sdd], &pC[(j+16)*bs+(j+16)*sdc], &pD[(j+16)*bs+(j+16)*sdd], &dD[j+16], m-(i+16), n-(j+16));
+ }
+ else // 17 18 19 20
+ {
+ kernel_spotrf_nt_l_8x4_vs_lib8(j+16, &pD[(i+16)*sdd], &pD[(j+16)*sdd], &pC[(j+16)*bs+(j+16)*sdc], &pD[(j+16)*bs+(j+16)*sdd], &dD[j+16], m-(i+16), n-(j+16));
+ }
+ }
+ else // 9 10 11 12
+ {
+ kernel_spotrf_nt_l_16x4_vs_lib8(j+8, &pD[(i+8)*sdd], sdd, &pD[(j+8)*sdd], &pC[(j+8)*bs+(j+8)*sdc], sdc, &pD[(j+8)*bs+(j+8)*sdd], sdd, &dD[j+8], m-(i+8), n-(j+8));
+ }
+ }
+ else // 1 2 3 4
+ {
+ kernel_spotrf_nt_l_24x4_vs_lib8(j, &pD[(i+0)*sdd], sdd, &pD[j*sdd], &pC[j*bs+j*sdc], sdc, &pD[j*bs+j*sdd], sdd, &dD[j], m-(i+0), n-j);
+ }
+ }
+ }
+ return;
+#endif
+
+ left_16:
+ j = 0;
+ for(; j<i & j<n-7; j+=8)
+ {
+ kernel_strsm_nt_rl_inv_16x4_vs_lib8(j+0, &pD[i*sdd], sdd, &pD[0+j*sdd], &pC[(j+0)*bs+i*sdc], sdc, &pD[(j+0)*bs+i*sdd], sdd, &pD[0+(j+0)*bs+(j+0)*sdd], &dD[j+0], m-i, n-(j+0));
+ kernel_strsm_nt_rl_inv_16x4_vs_lib8(j+4, &pD[i*sdd], sdd, &pD[4+j*sdd], &pC[(j+4)*bs+i*sdc], sdc, &pD[(j+4)*bs+i*sdd], sdd, &pD[4+(j+4)*bs+(j+0)*sdd], &dD[j+4], m-i, n-(j+4));
+ }
+ if(j<n)
+ {
+ if(j<i) // dtrsm
+ {
+ kernel_strsm_nt_rl_inv_16x4_vs_lib8(j+0, &pD[i*sdd], sdd, &pD[0+j*sdd], &pC[(j+0)*bs+i*sdc], sdc, &pD[(j+0)*bs+i*sdd], sdd, &pD[(j+0)*bs+(j+0)*sdd], &dD[j+0], m-i, n-(j+0));
+ if(j<n-4) // 5 6 7
+ {
+ kernel_strsm_nt_rl_inv_16x4_vs_lib8(j+4, &pD[i*sdd], sdd, &pD[4+j*sdd], &pC[(j+4)*bs+i*sdc], sdc, &pD[(j+4)*bs+i*sdd], sdd, &pD[4+(j+4)*bs+(j+0)*sdd], &dD[j+4], m-i, n-(j+4));
+ }
+ }
+ else // dpotrf
+ {
+ if(j<n-4) // 5 - 15
+ {
+ kernel_spotrf_nt_l_16x4_vs_lib8(j+0, &pD[(i+0)*sdd], sdd, &pD[0+j*sdd], &pC[(j+0)*bs+j*sdc], sdc, &pD[(j+0)*bs+j*sdd], sdd, &dD[j+0], m-(i+0), n-(j+0));
+ kernel_spotrf_nt_l_12x4_vs_lib8(j+4, &pD[(i+0)*sdd], sdd, &pD[4+j*sdd], &pC[(j+4)*bs+j*sdc], sdc, &pD[(j+4)*bs+j*sdd], sdd, &dD[j+4], m-(i+0), n-(j+4));
+ if(j>=n-8)
+ return;
+ if(j<n-12) // 13 - 15
+ {
+ kernel_spotrf_nt_l_8x8_vs_lib8((j+8), &pD[(i+8)*sdd], &pD[(j+8)*sdd], &pC[(j+8)*bs+(j+8)*sdc], &pD[(j+8)*bs+(j+8)*sdd], &dD[j+8], m-(i+8), n-(j+8));
+ }
+ else // 9 - 12
+ {
+ kernel_spotrf_nt_l_8x4_vs_lib8((j+8), &pD[(i+8)*sdd], &pD[(j+8)*sdd], &pC[(j+8)*bs+(j+8)*sdc], &pD[(j+8)*bs+(j+8)*sdd], &dD[j+8], m-(i+8), n-(j+8));
+ }
+ }
+ else // 1 2 3 4
+ {
+ kernel_spotrf_nt_l_16x4_vs_lib8(j, &pD[(i+0)*sdd], sdd, &pD[j*sdd], &pC[j*bs+j*sdc], sdc, &pD[j*bs+j*sdd], sdd, &dD[j], m-(i+0), n-j);
+ }
+ }
+ }
+ return;
+
+ left_8:
+ j = 0;
+ for(; j<i & j<n-7; j+=8)
+ {
+ kernel_strsm_nt_rl_inv_8x8_vs_lib8(j, &pD[i*sdd], &pD[j*sdd], &pC[j*bs+i*sdc], &pD[j*bs+i*sdd], &pD[j*bs+j*sdd], &dD[j], m-i, n-j);
+ }
+ if(j<n)
+ {
+ if(j<i) // dtrsm
+ {
+ if(j<n-4) // 5 6 7
+ {
+ kernel_strsm_nt_rl_inv_8x8_vs_lib8(j, &pD[i*sdd], &pD[j*sdd], &pC[j*bs+i*sdc], &pD[j*bs+i*sdd], &pD[j*bs+j*sdd], &dD[j], m-i, n-j);
+ }
+ else // 1 2 3 4
+ {
+ kernel_strsm_nt_rl_inv_8x4_vs_lib8(j, &pD[i*sdd], &pD[j*sdd], &pC[j*bs+i*sdc], &pD[j*bs+i*sdd], &pD[j*bs+j*sdd], &dD[j], m-i, n-j);
+ }
+ }
+ else // dpotrf
+ {
+ if(j<n-4) // 5 6 7
+ {
+ kernel_spotrf_nt_l_8x8_vs_lib8(j, &pD[i*sdd], &pD[j*sdd], &pC[j*bs+j*sdc], &pD[j*bs+j*sdd], &dD[j], m-i, n-j);
+ }
+ else // 1 2 3 4
+ {
+ kernel_spotrf_nt_l_8x4_vs_lib8(j, &pD[i*sdd], &pD[j*sdd], &pC[j*bs+j*sdc], &pD[j*bs+j*sdd], &dD[j], m-i, n-j);
+ }
+ }
+ }
+
+ return;
+
+ }
+
+
+
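+// ssyrk_spotrf_ln: fused syrk + potrf, D <= lower Cholesky factor of (C + A * B')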
+void ssyrk_spotrf_ln_libstr(int m, int n, int k, struct s_strmat *sA, int ai, int aj, struct s_strmat *sB, int bi, int bj, struct s_strmat *sC, int ci, int cj, struct s_strmat *sD, int di, int dj)
+ {
+
+ if(ai!=0 | bi!=0 | ci!=0 | di!=0)
+ {
+ printf("\nssyrk_spotrf_ln_libstr: feature not implemented yet: ai=%d, bi=%d, ci=%d, di=%d\n", ai, bi, ci, di);
+ exit(1);
+ }
+
+ const int bs = 8;
+
+ int i, j;
+
+ int sda = sA->cn;
+ int sdb = sB->cn;
+ int sdc = sC->cn;
+ int sdd = sD->cn;
+ float *pA = sA->pA + aj*bs;
+ float *pB = sB->pA + bj*bs;
+ float *pC = sC->pA + cj*bs;
+ float *pD = sD->pA + dj*bs;
+ float *dD = sD->dA; // XXX what to do if di and dj are not zero
+
+// ssyrk_spotrf_nt_l_lib(m, n, k, pA, sda, pB, sdb, pC, sdc, pD, sdd, dD);
+
+ if(di==0 && dj==0)
+ sD->use_dA = 1;
+ else
+ sD->use_dA = 0;
+
+ i = 0;
+#if defined(TARGET_X64_INTEL_HASWELL)
+ for(; i<m-23; i+=24)
+ {
+ j = 0;
+ for(; j<i & j<n-7; j+=8)
+ {
+ kernel_sgemm_strsm_nt_rl_inv_24x4_lib8(k, &pA[i*sda], sda, &pB[0+j*sdb], j+0, &pD[i*sdd], sdd, &pD[0+j*sdd], &pC[(j+0)*bs+i*sdc], sdc, &pD[(j+0)*bs+i*sdd], sdd, &pD[0+(j+0)*bs+(j+0)*sdd], &dD[j+0]);
+ kernel_sgemm_strsm_nt_rl_inv_24x4_lib8(k, &pA[i*sda], sda, &pB[4+j*sdb], j+4, &pD[i*sdd], sdd, &pD[4+j*sdd], &pC[(j+4)*bs+i*sdc], sdc, &pD[(j+4)*bs+i*sdd], sdd, &pD[4+(j+4)*bs+(j+0)*sdd], &dD[j+4]);
+ }
+ if(j<n)
+ {
+			if(j<i) // dtrsm
+ {
+ kernel_sgemm_strsm_nt_rl_inv_24x4_vs_lib8(k, &pA[i*sda], sda, &pB[0+j*sdb], j+0, &pD[i*sdd], sdd, &pD[0+j*sdd], &pC[(j+0)*bs+i*sdc], sdc, &pD[(j+0)*bs+i*sdd], sdd, &pD[(j+0)*bs+(j+0)*sdd], &dD[j+0], m-i, n-(j+0));
+ if(j<n-4) // 5 6 7
+ {
+ kernel_sgemm_strsm_nt_rl_inv_24x4_vs_lib8(k, &pA[i*sda], sda, &pB[4+j*sdb], j+4, &pD[i*sdd], sdd, &pD[4+j*sdd], &pC[(j+4)*bs+i*sdc], sdc, &pD[(j+4)*bs+i*sdd], sdd, &pD[(j+4)*bs+(j+4)*sdd], &dD[j+4], m-i, n-(j+4));
+ }
+ }
+ else // dpotrf
+ {
+ if(j<n-23)
+ {
+ kernel_ssyrk_spotrf_nt_l_24x4_lib8(k, &pA[(i+0)*sda], sda, &pB[(j+0)*sdb], (j+0), &pD[(i+0)*sdd], sdd, &pD[(j+0)*sdd], &pC[(j+0)*bs+(j+0)*sdc], sdc, &pD[(j+0)*bs+(j+0)*sdd], sdd, &dD[j+0]);
+ kernel_ssyrk_spotrf_nt_l_20x4_lib8(k, &pA[(i+0)*sda], sda, &pB[4+(j+0)*sdb], (j+4), &pD[(i+0)*sdd], sdd, &pD[4+(j+0)*sdd], &pC[(j+4)*bs+(j+0)*sdc], sdc, &pD[(j+4)*bs+(j+0)*sdd], sdd, &dD[j+4]);
+ kernel_ssyrk_spotrf_nt_l_16x4_lib8(k, &pA[(i+8)*sda], sda, &pB[(j+8)*sdb], (j+8), &pD[(i+8)*sdd], sdd, &pD[(j+8)*sdd], &pC[(j+8)*bs+(j+8)*sdc], sdc, &pD[(j+8)*bs+(j+8)*sdd], sdd, &dD[j+8]);
+ kernel_ssyrk_spotrf_nt_l_12x4_lib8(k, &pA[(i+8)*sda], sda, &pB[4+(j+8)*sdb], (j+12), &pD[(i+8)*sdd], sdd, &pD[4+(j+8)*sdd], &pC[(j+12)*bs+(j+8)*sdc], sdc, &pD[(j+12)*bs+(j+8)*sdd], sdd, &dD[j+12]);
+ kernel_ssyrk_spotrf_nt_l_8x8_lib8(k, &pA[(i+16)*sda], &pB[(j+16)*sdb], (j+16), &pD[(i+16)*sdd], &pD[(j+16)*sdd], &pC[(j+16)*bs+(j+16)*sdc], &pD[(j+16)*bs+(j+16)*sdd], &dD[j+16]);
+ }
+ else
+ {
+ if(j<n-4) // 5 - 23
+ {
+ kernel_ssyrk_spotrf_nt_l_24x4_vs_lib8(k, &pA[(i+0)*sda], sda, &pB[(j+0)*sdb], (j+0), &pD[(i+0)*sdd], sdd, &pD[(j+0)*sdd], &pC[(j+0)*bs+(j+0)*sdc], sdc, &pD[(j+0)*bs+(j+0)*sdd], sdd, &dD[j+0], m-(i+0), n-(j+0));
+ kernel_ssyrk_spotrf_nt_l_20x4_vs_lib8(k, &pA[(i+0)*sda], sda, &pB[4+(j+0)*sdb], (j+4), &pD[(i+0)*sdd], sdd, &pD[4+(j+0)*sdd], &pC[(j+4)*bs+(j+0)*sdc], sdc, &pD[(j+4)*bs+(j+0)*sdd], sdd, &dD[j+4], m-(i+0), n-(j+4));
+ if(j==n-8)
+ return;
+ if(j<n-12) // 13 - 23
+ {
+ kernel_ssyrk_spotrf_nt_l_16x4_vs_lib8(k, &pA[(i+8)*sda], sda, &pB[(j+8)*sdb], (j+8), &pD[(i+8)*sdd], sdd, &pD[(j+8)*sdd], &pC[(j+8)*bs+(j+8)*sdc], sdc, &pD[(j+8)*bs+(j+8)*sdd], sdd, &dD[j+8], m-(i+8), n-(j+8));
+ kernel_ssyrk_spotrf_nt_l_12x4_vs_lib8(k, &pA[(i+8)*sda], sda, &pB[4+(j+8)*sdb], (j+12), &pD[(i+8)*sdd], sdd, &pD[4+(j+8)*sdd], &pC[(j+12)*bs+(j+8)*sdc], sdc, &pD[(j+12)*bs+(j+8)*sdd], sdd, &dD[j+12], m-(i+8), n-(j+12));
+ if(j==n-16)
+ return;
+ if(j<n-20) // 21 - 23
+ {
+ kernel_ssyrk_spotrf_nt_l_8x8_vs_lib8(k, &pA[(i+16)*sda], &pB[(j+16)*sdb], j+16, &pD[(i+16)*sdd], &pD[(j+16)*sdd], &pC[(j+16)*bs+(j+16)*sdc], &pD[(j+16)*bs+(j+16)*sdd], &dD[j+16], m-(i+16), n-(j+16));
+ }
+ else // 17 18 19 20
+ {
+ kernel_ssyrk_spotrf_nt_l_8x4_vs_lib8(k, &pA[(i+16)*sda], &pB[(j+16)*sdb], j+16, &pD[(i+16)*sdd], &pD[(j+16)*sdd], &pC[(j+16)*bs+(j+16)*sdc], &pD[(j+16)*bs+(j+16)*sdd], &dD[j+16], m-(i+16), n-(j+16));
+ }
+ }
+ else // 9 10 11 12
+ {
+ kernel_ssyrk_spotrf_nt_l_16x4_vs_lib8(k, &pA[(i+8)*sda], sda, &pB[(j+8)*sdb], j+8, &pD[(i+8)*sdd], sdd, &pD[(j+8)*sdd], &pC[(j+8)*bs+(j+8)*sdc], sdc, &pD[(j+8)*bs+(j+8)*sdd], sdd, &dD[j+8], m-(i+8), n-(j+8));
+ }
+ }
+ else // 1 2 3 4
+ {
+ kernel_ssyrk_spotrf_nt_l_24x4_vs_lib8(k, &pA[(i+0)*sda], sda, &pB[j*sdb], j, &pD[(i+0)*sdd], sdd, &pD[j*sdd], &pC[j*bs+j*sdc], sdc, &pD[j*bs+j*sdd], sdd, &dD[j], m-(i+0), n-j);
+ }
+ }
+ }
+ }
+ }
+ if(m>i)
+ {
+ if(m-i<=8)
+ {
+ goto left_8;
+ }
+ else if(m-i<=16)
+ {
+ goto left_16;
+ }
+ else
+ {
+ goto left_24;
+ }
+ }
+#else
+ for(; i<m-15; i+=16)
+ {
+ j = 0;
+ for(; j<i & j<n-7; j+=8)
+ {
+ kernel_sgemm_strsm_nt_rl_inv_16x4_lib8(k, &pA[i*sda], sda, &pB[0+j*sdb], j+0, &pD[i*sdd], sdd, &pD[0+j*sdd], &pC[(j+0)*bs+i*sdc], sdc, &pD[(j+0)*bs+i*sdd], sdd, &pD[0+(j+0)*bs+(j+0)*sdd], &dD[j+0]);
+ kernel_sgemm_strsm_nt_rl_inv_16x4_lib8(k, &pA[i*sda], sda, &pB[4+j*sdb], j+4, &pD[i*sdd], sdd, &pD[4+j*sdd], &pC[(j+4)*bs+i*sdc], sdc, &pD[(j+4)*bs+i*sdd], sdd, &pD[4+(j+4)*bs+(j+0)*sdd], &dD[j+4]);
+ }
+ if(j<n)
+ {
+			if(j<i) // dtrsm
+ {
+ kernel_sgemm_strsm_nt_rl_inv_16x4_vs_lib8(k, &pA[i*sda], sda, &pB[0+j*sdb], j+0, &pD[i*sdd], sdd, &pD[0+j*sdd], &pC[(j+0)*bs+i*sdc], sdc, &pD[(j+0)*bs+i*sdd], sdd, &pD[(j+0)*bs+(j+0)*sdd], &dD[j+0], m-i, n-(j+0));
+ if(j<n-4) // 5 6 7
+ {
+ kernel_sgemm_strsm_nt_rl_inv_16x4_vs_lib8(k, &pA[i*sda], sda, &pB[4+j*sdb], j+4, &pD[i*sdd], sdd, &pD[4+j*sdd], &pC[(j+4)*bs+i*sdc], sdc, &pD[(j+4)*bs+i*sdd], sdd, &pD[(j+4)*bs+(j+4)*sdd], &dD[j+4], m-i, n-(j+4));
+ }
+ }
+ else // dpotrf
+ {
+ if(j<n-15)
+ {
+ kernel_ssyrk_spotrf_nt_l_16x4_lib8(k, &pA[i*sda], sda, &pB[0+j*sdb], j+0, &pD[i*sdd], sdd, &pD[0+j*sdd], &pC[(j+0)*bs+(j+0)*sdc], sdc, &pD[(j+0)*bs+(j+0)*sdd], sdd, &dD[j+0]);
+ kernel_ssyrk_spotrf_nt_l_12x4_lib8(k, &pA[i*sda], sda, &pB[4+j*sdb], j+4, &pD[i*sdd], sdd, &pD[4+j*sdd], &pC[(j+4)*bs+(j+0)*sdc], sdc, &pD[(j+4)*bs+(j+0)*sdd], sdd, &dD[j+4]);
+ kernel_ssyrk_spotrf_nt_l_8x8_lib8(k, &pA[(i+8)*sda], &pB[(j+8)*sdb], (j+8), &pD[(i+8)*sdd], &pD[(j+8)*sdd], &pC[(j+8)*bs+(j+8)*sdc], &pD[(j+8)*bs+(j+8)*sdd], &dD[j+8]);
+ }
+ else
+ {
+ if(j<n-4) // 5 - 15
+ {
+ kernel_ssyrk_spotrf_nt_l_16x4_vs_lib8(k, &pA[(i+0)*sda], sda, &pB[(j+0)*sdb], (j+0), &pD[(i+0)*sdd], sdd, &pD[(j+0)*sdd], &pC[(j+0)*bs+(j+0)*sdc], sdc, &pD[(j+0)*bs+(j+0)*sdd], sdd, &dD[j+0], m-(i+0), n-(j+0));
+ kernel_ssyrk_spotrf_nt_l_12x4_vs_lib8(k, &pA[(i+0)*sda], sda, &pB[4+(j+0)*sdb], j+4, &pD[(i+0)*sdd], sdd, &pD[4+(j+0)*sdd], &pC[(j+4)*bs+(j+0)*sdc], sdc, &pD[(j+4)*bs+(j+0)*sdd], sdd, &dD[j+4], m-(i+0), n-(j+4));
+ if(j==n-8) // 8
+ return;
+ if(j<n-12) // 13 - 15
+ {
+ kernel_ssyrk_spotrf_nt_l_8x8_vs_lib8(k, &pA[(i+8)*sda], &pB[(j+8)*sdb], j+8, &pD[(i+8)*sdd], &pD[(j+8)*sdd], &pC[(j+8)*bs+(j+8)*sdc], &pD[(j+8)*bs+(j+8)*sdd], &dD[j+8], m-(i+8), n-(j+8));
+ }
+ else // 9 10 11 12
+ {
+ kernel_ssyrk_spotrf_nt_l_8x4_vs_lib8(k, &pA[(i+8)*sda], &pB[(j+8)*sdb], j+8, &pD[(i+8)*sdd], &pD[(j+8)*sdd], &pC[(j+8)*bs+(j+8)*sdc], &pD[(j+8)*bs+(j+8)*sdd], &dD[j+8], m-(i+8), n-(j+8));
+ }
+ }
+ else // 1 2 3 4
+ {
+ kernel_ssyrk_spotrf_nt_l_16x4_vs_lib8(k, &pA[(i+0)*sda], sda, &pB[j*sdb], j, &pD[(i+0)*sdd], sdd, &pD[j*sdd], &pC[j*bs+j*sdc], sdc, &pD[j*bs+j*sdd], sdd, &dD[j], m-(i+0), n-j);
+ }
+ }
+ }
+ }
+ }
+ if(m>i)
+ {
+ if(m-i<=8)
+ {
+ goto left_8;
+ }
+ else
+ {
+ goto left_16;
+ }
+ }
+#endif
+
+ // common return if i==m
+ return;
+
+	// clean-up loop definitions
+
+#if defined(TARGET_X64_INTEL_HASWELL)
+ left_24:
+ j = 0;
+ for(; j<i & j<n-7; j+=8)
+ {
+ kernel_sgemm_strsm_nt_rl_inv_24x4_vs_lib8(k, &pA[i*sda], sda, &pB[0+j*sdb], j+0, &pD[i*sdd], sdd, &pD[0+j*sdd], &pC[(j+0)*bs+i*sdc], sdc, &pD[(j+0)*bs+i*sdd], sdd, &pD[0+(j+0)*bs+(j+0)*sdd], &dD[j+0], m-i, n-(j+0));
+ kernel_sgemm_strsm_nt_rl_inv_24x4_vs_lib8(k, &pA[i*sda], sda, &pB[4+j*sdb], j+4, &pD[i*sdd], sdd, &pD[4+j*sdd], &pC[(j+4)*bs+i*sdc], sdc, &pD[(j+4)*bs+i*sdd], sdd, &pD[4+(j+4)*bs+(j+0)*sdd], &dD[j+4], m-i, n-(j+4));
+ }
+ if(j<n)
+ {
+ if(j<i) // dtrsm
+ {
+ kernel_sgemm_strsm_nt_rl_inv_24x4_vs_lib8(k, &pA[i*sda], sda, &pB[0+j*sdb], j+0, &pD[i*sdd], sdd, &pD[0+j*sdd], &pC[(j+0)*bs+i*sdc], sdc, &pD[(j+0)*bs+i*sdd], sdd, &pD[(j+0)*bs+(j+0)*sdd], &dD[j+0], m-i, n-(j+0));
+ if(j<n-4) // 5 6 7
+ {
+ kernel_sgemm_strsm_nt_rl_inv_24x4_vs_lib8(k, &pA[i*sda], sda, &pB[4+j*sdb], j+4, &pD[i*sdd], sdd, &pD[4+j*sdd], &pC[(j+4)*bs+i*sdc], sdc, &pD[(j+4)*bs+i*sdd], sdd, &pD[4+(j+4)*bs+(j+0)*sdd], &dD[j+4], m-i, n-(j+4));
+ }
+ }
+ else // dpotrf
+ {
+ if(j<n-4) // 5 - 23
+ {
+ kernel_ssyrk_spotrf_nt_l_24x4_vs_lib8(k, &pA[(i+0)*sda], sda, &pB[(j+0)*sdb], (j+0), &pD[(i+0)*sdd], sdd, &pD[(j+0)*sdd], &pC[(j+0)*bs+(j+0)*sdc], sdc, &pD[(j+0)*bs+(j+0)*sdd], sdd, &dD[j+0], m-(i+0), n-(j+0));
+ kernel_ssyrk_spotrf_nt_l_20x4_vs_lib8(k, &pA[(i+0)*sda], sda, &pB[4+(j+0)*sdb], (j+4), &pD[(i+0)*sdd], sdd, &pD[4+(j+0)*sdd], &pC[(j+4)*bs+(j+0)*sdc], sdc, &pD[(j+4)*bs+(j+0)*sdd], sdd, &dD[j+4], m-(i+0), n-(j+4));
+ if(j>=n-8)
+ return;
+ if(j<n-12) // 13 - 23
+ {
+ kernel_ssyrk_spotrf_nt_l_16x4_vs_lib8(k, &pA[(i+8)*sda], sda, &pB[(j+8)*sdb], (j+8), &pD[(i+8)*sdd], sdd, &pD[(j+8)*sdd], &pC[(j+8)*bs+(j+8)*sdc], sdc, &pD[(j+8)*bs+(j+8)*sdd], sdd, &dD[j+8], m-(i+8), n-(j+8));
+ kernel_ssyrk_spotrf_nt_l_12x4_vs_lib8(k, &pA[(i+8)*sda], sda, &pB[4+(j+8)*sdb], j+12, &pD[(i+8)*sdd], sdd, &pD[4+(j+8)*sdd], &pC[(j+12)*bs+(j+8)*sdc], sdc, &pD[(j+12)*bs+(j+8)*sdd], sdd, &dD[j+12], m-(i+8), n-(j+12));
+ if(j>=n-16)
+ return;
+ if(j<n-20) // 21 - 23
+ {
+ kernel_ssyrk_spotrf_nt_l_8x8_vs_lib8(k, &pA[(i+16)*sda], &pB[(j+16)*sdb], j+16, &pD[(i+16)*sdd], &pD[(j+16)*sdd], &pC[(j+16)*bs+(j+16)*sdc], &pD[(j+16)*bs+(j+16)*sdd], &dD[j+16], m-(i+16), n-(j+16));
+ }
+ else // 17 18 19 20
+ {
+ kernel_ssyrk_spotrf_nt_l_8x4_vs_lib8(k, &pA[(i+16)*sda], &pB[(j+16)*sdb], j+16, &pD[(i+16)*sdd], &pD[(j+16)*sdd], &pC[(j+16)*bs+(j+16)*sdc], &pD[(j+16)*bs+(j+16)*sdd], &dD[j+16], m-(i+16), n-(j+16));
+ }
+ }
+ else // 9 10 11 12
+ {
+ kernel_ssyrk_spotrf_nt_l_16x4_vs_lib8(k, &pA[(i+8)*sda], sda, &pB[(j+8)*sdb], j+8, &pD[(i+8)*sdd], sdd, &pD[(j+8)*sdd], &pC[(j+8)*bs+(j+8)*sdc], sdc, &pD[(j+8)*bs+(j+8)*sdd], sdd, &dD[j+8], m-(i+8), n-(j+8));
+ }
+ }
+ else // 1 2 3 4
+ {
+ kernel_ssyrk_spotrf_nt_l_24x4_vs_lib8(k, &pA[(i+0)*sda], sda, &pB[j*sdb], j, &pD[(i+0)*sdd], sdd, &pD[j*sdd], &pC[j*bs+j*sdc], sdc, &pD[j*bs+j*sdd], sdd, &dD[j], m-(i+0), n-j);
+ }
+ }
+ }
+ return;
+#endif
+
+ left_16:
+ j = 0;
+ for(; j<i & j<n-7; j+=8)
+ {
+ kernel_sgemm_strsm_nt_rl_inv_16x4_vs_lib8(k, &pA[i*sda], sda, &pB[0+j*sdb], j+0, &pD[i*sdd], sdd, &pD[0+j*sdd], &pC[(j+0)*bs+i*sdc], sdc, &pD[(j+0)*bs+i*sdd], sdd, &pD[0+(j+0)*bs+(j+0)*sdd], &dD[j+0], m-i, n-(j+0));
+ kernel_sgemm_strsm_nt_rl_inv_16x4_vs_lib8(k, &pA[i*sda], sda, &pB[4+j*sdb], j+4, &pD[i*sdd], sdd, &pD[4+j*sdd], &pC[(j+4)*bs+i*sdc], sdc, &pD[(j+4)*bs+i*sdd], sdd, &pD[4+(j+4)*bs+(j+0)*sdd], &dD[j+4], m-i, n-(j+4));
+ }
+ if(j<n)
+ {
+ if(j<i) // dtrsm
+ {
+ kernel_sgemm_strsm_nt_rl_inv_16x4_vs_lib8(k, &pA[i*sda], sda, &pB[0+j*sdb], j+0, &pD[i*sdd], sdd, &pD[0+j*sdd], &pC[(j+0)*bs+i*sdc], sdc, &pD[(j+0)*bs+i*sdd], sdd, &pD[(j+0)*bs+(j+0)*sdd], &dD[j+0], m-i, n-(j+0));
+ if(j<n-4) // 5 6 7
+ {
+ kernel_sgemm_strsm_nt_rl_inv_16x4_vs_lib8(k, &pA[i*sda], sda, &pB[4+j*sdb], j+4, &pD[i*sdd], sdd, &pD[4+j*sdd], &pC[(j+4)*bs+i*sdc], sdc, &pD[(j+4)*bs+i*sdd], sdd, &pD[4+(j+4)*bs+(j+0)*sdd], &dD[j+4], m-i, n-(j+4));
+ }
+ }
+ else // dpotrf
+ {
+ if(j<n-4) // 5 - 15
+ {
+ kernel_ssyrk_spotrf_nt_l_16x4_vs_lib8(k, &pA[(i+0)*sda], sda, &pB[0+j*sdb], j+0, &pD[(i+0)*sdd], sdd, &pD[0+j*sdd], &pC[(j+0)*bs+j*sdc], sdc, &pD[(j+0)*bs+j*sdd], sdd, &dD[j+0], m-(i+0), n-(j+0));
+ kernel_ssyrk_spotrf_nt_l_12x4_vs_lib8(k, &pA[(i+0)*sda], sda, &pB[4+j*sdb], j+4, &pD[(i+0)*sdd], sdd, &pD[4+j*sdd], &pC[(j+4)*bs+j*sdc], sdc, &pD[(j+4)*bs+j*sdd], sdd, &dD[j+4], m-(i+0), n-(j+4));
+ if(j>=n-8)
+ return;
+ if(j<n-12) // 13 - 15
+ {
+ kernel_ssyrk_spotrf_nt_l_8x8_vs_lib8(k, &pA[(i+8)*sda], &pB[(j+8)*sdb], (j+8), &pD[(i+8)*sdd], &pD[(j+8)*sdd], &pC[(j+8)*bs+(j+8)*sdc], &pD[(j+8)*bs+(j+8)*sdd], &dD[j+8], m-(i+8), n-(j+8));
+ }
+ else // 9 - 12
+ {
+ kernel_ssyrk_spotrf_nt_l_8x4_vs_lib8(k, &pA[(i+8)*sda], &pB[(j+8)*sdb], j+8, &pD[(i+8)*sdd], &pD[(j+8)*sdd], &pC[(j+8)*bs+(j+8)*sdc], &pD[(j+8)*bs+(j+8)*sdd], &dD[j+8], m-(i+8), n-(j+8));
+ }
+ }
+ else // 1 2 3 4
+ {
+ kernel_ssyrk_spotrf_nt_l_16x4_vs_lib8(k, &pA[(i+0)*sda], sda, &pB[j*sdb], j, &pD[(i+0)*sdd], sdd, &pD[j*sdd], &pC[j*bs+j*sdc], sdc, &pD[j*bs+j*sdd], sdd, &dD[j], m-(i+0), n-j);
+ }
+ }
+ }
+ return;
+
+ left_8:
+ j = 0;
+ for(; j<i & j<n-7; j+=8)
+ {
+ kernel_sgemm_strsm_nt_rl_inv_8x8_vs_lib8(k, &pA[i*sda], &pB[j*sdb], j, &pD[i*sdd], &pD[j*sdd], &pC[j*bs+i*sdc], &pD[j*bs+i*sdd], &pD[j*bs+j*sdd], &dD[j], m-i, n-j);
+ }
+ if(j<n)
+ {
+ if(j<i) // dtrsm
+ {
+ if(j<n-4) // 5 6 7
+ {
+ kernel_sgemm_strsm_nt_rl_inv_8x8_vs_lib8(k, &pA[i*sda], &pB[j*sdb], j, &pD[i*sdd], &pD[j*sdd], &pC[j*bs+i*sdc], &pD[j*bs+i*sdd], &pD[j*bs+j*sdd], &dD[j], m-i, n-j);
+ }
+ else // 1 2 3 4
+ {
+ kernel_sgemm_strsm_nt_rl_inv_8x4_vs_lib8(k, &pA[i*sda], &pB[j*sdb], j, &pD[i*sdd], &pD[j*sdd], &pC[j*bs+i*sdc], &pD[j*bs+i*sdd], &pD[j*bs+j*sdd], &dD[j], m-i, n-j);
+ }
+ }
+ else // dpotrf
+ {
+ if(j<n-4) // 5 6 7
+ {
+ kernel_ssyrk_spotrf_nt_l_8x8_vs_lib8(k, &pA[i*sda], &pB[j*sdb], j, &pD[i*sdd], &pD[j*sdd], &pC[j*bs+j*sdc], &pD[j*bs+j*sdd], &dD[j], m-i, n-j);
+ }
+ else // 1 2 3 4
+ {
+ kernel_ssyrk_spotrf_nt_l_8x4_vs_lib8(k, &pA[i*sda], &pB[j*sdb], j, &pD[i*sdd], &pD[j*sdd], &pC[j*bs+j*sdc], &pD[j*bs+j*sdd], &dD[j], m-i, n-j);
+ }
+ }
+ }
+ return;
+
+ }
+
+
+
+int sgeqrf_work_size_libstr(int m, int n)
+ {
+ printf("\nsgeqrf_work_size_libstr: feature not implemented yet\n");
+ exit(1);
+ return 0;
+ }
+
+
+
+void sgeqrf_libstr(int m, int n, struct s_strmat *sC, int ci, int cj, struct s_strmat *sD, int di, int dj, void *work)
+ {
+ if(m<=0 | n<=0)
+ return;
+ printf("\nsgeqrf_libstr: feature not implemented yet\n");
+ exit(1);
+ return;
+ }
+
+
+
+
diff --git a/blas/x_blas1_lib.c b/blas/x_blas1_lib.c
new file mode 100644
index 0000000..5f8fc2e
--- /dev/null
+++ b/blas/x_blas1_lib.c
@@ -0,0 +1,186 @@
+/**************************************************************************************************
+* *
+* This file is part of BLASFEO. *
+* *
+* BLASFEO -- BLAS For Embedded Optimization. *
+* Copyright (C) 2016-2017 by Gianluca Frison. *
+* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl. *
+* All rights reserved. *
+* *
+* HPMPC is free software; you can redistribute it and/or *
+* modify it under the terms of the GNU Lesser General Public *
+* License as published by the Free Software Foundation; either *
+* version 2.1 of the License, or (at your option) any later version. *
+* *
+* HPMPC is distributed in the hope that it will be useful, *
+* but WITHOUT ANY WARRANTY; without even the implied warranty of *
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. *
+* See the GNU Lesser General Public License for more details. *
+* *
+* You should have received a copy of the GNU Lesser General Public *
+* License along with HPMPC; if not, write to the Free Software *
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA *
+* *
+* Author: Gianluca Frison, giaf (at) dtu.dk *
+* gianluca.frison (at) imtek.uni-freiburg.de *
+* *
+**************************************************************************************************/
+
+
+
+#if defined(LA_REFERENCE)
+
+
+
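+// z <= y + alpha*x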
+void AXPY_LIBSTR(int m, REAL alpha, struct STRVEC *sx, int xi, struct STRVEC *sy, int yi, struct STRVEC *sz, int zi)
+ {
+ if(m<=0)
+ return;
+ int ii;
+ REAL *x = sx->pa + xi;
+ REAL *y = sy->pa + yi;
+ REAL *z = sz->pa + zi;
+ ii = 0;
+ for(; ii<m-3; ii+=4)
+ {
+ z[ii+0] = y[ii+0] + alpha*x[ii+0];
+ z[ii+1] = y[ii+1] + alpha*x[ii+1];
+ z[ii+2] = y[ii+2] + alpha*x[ii+2];
+ z[ii+3] = y[ii+3] + alpha*x[ii+3];
+ }
+ for(; ii<m; ii++)
+ z[ii+0] = y[ii+0] + alpha*x[ii+0];
+ return;
+ }
+
+
+
+// multiply two vectors and compute dot product
+REAL VECMULDOT_LIBSTR(int m, struct STRVEC *sx, int xi, struct STRVEC *sy, int yi, struct STRVEC *sz, int zi)
+ {
+ if(m<=0)
+ return 0.0;
+ REAL *x = sx->pa + xi;
+ REAL *y = sy->pa + yi;
+ REAL *z = sz->pa + zi;
+ int ii;
+ REAL dot = 0.0;
+ ii = 0;
+ for(; ii<m-3; ii+=4)
+ {
+ z[ii+0] = x[ii+0] * y[ii+0];
+ z[ii+1] = x[ii+1] * y[ii+1];
+ z[ii+2] = x[ii+2] * y[ii+2];
+ z[ii+3] = x[ii+3] * y[ii+3];
+ dot += z[ii+0] + z[ii+1] + z[ii+2] + z[ii+3];
+ }
+ for(; ii<m; ii++)
+ {
+ z[ii+0] = x[ii+0] * y[ii+0];
+ dot += z[ii+0];
+ }
+ return dot;
+ }
+
+
+
+// compute dot product of two vectors
+REAL DOT_LIBSTR(int m, struct STRVEC *sx, int xi, struct STRVEC *sy, int yi)
+ {
+ if(m<=0)
+ return 0.0;
+ REAL *x = sx->pa + xi;
+ REAL *y = sy->pa + yi;
+ int ii;
+ REAL dot = 0.0;
+ ii = 0;
+ for(; ii<m-3; ii+=4)
+ {
+ dot += x[ii+0] * y[ii+0];
+ dot += x[ii+1] * y[ii+1];
+ dot += x[ii+2] * y[ii+2];
+ dot += x[ii+3] * y[ii+3];
+ }
+ for(; ii<m; ii++)
+ {
+ dot += x[ii+0] * y[ii+0];
+ }
+ return dot;
+ }
+
+
+
+#elif defined(LA_BLAS)
+
+
+
+void AXPY_LIBSTR(int m, REAL alpha, struct STRVEC *sx, int xi, struct STRVEC *sy, int yi, struct STRVEC *sz, int zi)
+ {
+ if(m<=0)
+ return;
+ int i1 = 1;
+ REAL *x = sx->pa + xi;
+ REAL *y = sy->pa + yi;
+ REAL *z = sz->pa + zi;
+ if(y!=z)
+ COPY(&m, y, &i1, z, &i1);
+ AXPY(&m, &alpha, x, &i1, z, &i1);
+ return;
+ }
+
+
+
+// multiply two vectors and compute dot product
+REAL VECMULDOT_LIBSTR(int m, struct STRVEC *sx, int xi, struct STRVEC *sy, int yi, struct STRVEC *sz, int zi)
+ {
+ if(m<=0)
+ return 0.0;
+ REAL *x = sx->pa + xi;
+ REAL *y = sy->pa + yi;
+ REAL *z = sz->pa + zi;
+ int ii;
+ REAL dot = 0.0;
+ ii = 0;
+ for(; ii<m; ii++)
+ {
+ z[ii+0] = x[ii+0] * y[ii+0];
+ dot += z[ii+0];
+ }
+ return dot;
+ }
+
+
+
+// compute dot product of two vectors
+REAL DOT_LIBSTR(int m, struct STRVEC *sx, int xi, struct STRVEC *sy, int yi)
+ {
+ if(m<=0)
+ return 0.0;
+ REAL *x = sx->pa + xi;
+ REAL *y = sy->pa + yi;
+ int ii;
+ REAL dot = 0.0;
+ ii = 0;
+ for(; ii<m-3; ii+=4)
+ {
+ dot += x[ii+0] * y[ii+0];
+ dot += x[ii+1] * y[ii+1];
+ dot += x[ii+2] * y[ii+2];
+ dot += x[ii+3] * y[ii+3];
+ }
+ for(; ii<m; ii++)
+ {
+ dot += x[ii+0] * y[ii+0];
+ }
+ return dot;
+ }
+
+
+
+#else
+
+#error : wrong LA choice
+
+#endif
+
+
diff --git a/blas/x_blas2_diag_lib.c b/blas/x_blas2_diag_lib.c
new file mode 100644
index 0000000..e90cbd6
--- /dev/null
+++ b/blas/x_blas2_diag_lib.c
@@ -0,0 +1,51 @@
+/**************************************************************************************************
+* *
+* This file is part of BLASFEO. *
+* *
+* BLASFEO -- BLAS For Embedded Optimization. *
+* Copyright (C) 2016-2017 by Gianluca Frison. *
+* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl. *
+* All rights reserved. *
+* *
+* HPMPC is free software; you can redistribute it and/or *
+* modify it under the terms of the GNU Lesser General Public *
+* License as published by the Free Software Foundation; either *
+* version 2.1 of the License, or (at your option) any later version. *
+* *
+* HPMPC is distributed in the hope that it will be useful, *
+* but WITHOUT ANY WARRANTY; without even the implied warranty of *
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. *
+* See the GNU Lesser General Public License for more details. *
+* *
+* You should have received a copy of the GNU Lesser General Public *
+* License along with HPMPC; if not, write to the Free Software *
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA *
+* *
+* Author: Gianluca Frison, giaf (at) dtu.dk *
+* gianluca.frison (at) imtek.uni-freiburg.de *
+* *
+**************************************************************************************************/
+
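+// z <= alpha * diag(a) * x + beta * y, with a fast path for alpha==1.0 & beta==1.0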
+void GEMV_DIAG_LIBSTR(int m, REAL alpha, struct STRVEC *sA, int ai, struct STRVEC *sx, int xi, REAL beta, struct STRVEC *sy, int yi, struct STRVEC *sz, int zi)
+ {
+ if(m<=0)
+ return;
+ int ii;
+ REAL *a = sA->pa + ai;
+ REAL *x = sx->pa + xi;
+ REAL *y = sy->pa + yi;
+ REAL *z = sz->pa + zi;
+ if(alpha==1.0 & beta==1.0)
+ {
+ for(ii=0; ii<m; ii++)
+ z[ii] = a[ii]*x[ii] + y[ii];
+ }
+ else
+ {
+ for(ii=0; ii<m; ii++)
+ z[ii] = alpha*a[ii]*x[ii] + beta*y[ii];
+ }
+
+ return;
+
+ }
diff --git a/blas/x_blas2_lib.c b/blas/x_blas2_lib.c
new file mode 100644
index 0000000..32e1e0a
--- /dev/null
+++ b/blas/x_blas2_lib.c
@@ -0,0 +1,1466 @@
+/**************************************************************************************************
+* *
+* This file is part of BLASFEO. *
+* *
+* BLASFEO -- BLAS For Embedded Optimization. *
+* Copyright (C) 2016-2017 by Gianluca Frison. *
+* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl. *
+* All rights reserved. *
+* *
+* HPMPC is free software; you can redistribute it and/or *
+* modify it under the terms of the GNU Lesser General Public *
+* License as published by the Free Software Foundation; either *
+* version 2.1 of the License, or (at your option) any later version. *
+* *
+* HPMPC is distributed in the hope that it will be useful, *
+* but WITHOUT ANY WARRANTY; without even the implied warranty of *
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. *
+* See the GNU Lesser General Public License for more details. *
+* *
+* You should have received a copy of the GNU Lesser General Public *
+* License along with HPMPC; if not, write to the Free Software *
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA *
+* *
+* Author: Gianluca Frison, giaf (at) dtu.dk *
+* gianluca.frison (at) imtek.uni-freiburg.de *
+* *
+**************************************************************************************************/
+
+
+
+#if defined(LA_REFERENCE)
+
+
+
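+// z <= beta * y + alpha * A * x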
+void GEMV_N_LIBSTR(int m, int n, REAL alpha, struct STRMAT *sA, int ai, int aj, struct STRVEC *sx, int xi, REAL beta, struct STRVEC *sy, int yi, struct STRVEC *sz, int zi)
+ {
+ int ii, jj;
+ REAL
+ y_0, y_1, y_2, y_3,
+ x_0, x_1;
+ int lda = sA->m;
+ REAL *pA = sA->pA + ai + aj*lda;
+ REAL *x = sx->pa + xi;
+ REAL *y = sy->pa + yi;
+ REAL *z = sz->pa + zi;
+#if 1 // y reg version
+ ii = 0;
+ for(; ii<m-1; ii+=2)
+ {
+ y_0 = 0.0;
+ y_1 = 0.0;
+ jj = 0;
+ for(; jj<n-1; jj+=2)
+ {
+ y_0 += pA[ii+0+lda*(jj+0)] * x[jj+0] + pA[ii+0+lda*(jj+1)] * x[jj+1];
+ y_1 += pA[ii+1+lda*(jj+0)] * x[jj+0] + pA[ii+1+lda*(jj+1)] * x[jj+1];
+ }
+ if(jj<n)
+ {
+ y_0 += pA[ii+0+lda*jj] * x[jj];
+ y_1 += pA[ii+1+lda*jj] * x[jj];
+ }
+ z[ii+0] = beta * y[ii+0] + alpha * y_0;
+ z[ii+1] = beta * y[ii+1] + alpha * y_1;
+ }
+ for(; ii<m; ii++)
+ {
+ y_0 = 0.0;
+ for(jj=0; jj<n; jj++)
+ {
+ y_0 += pA[ii+lda*jj] * x[jj];
+ }
+ z[ii] = beta * y[ii] + alpha * y_0;
+ }
+#else // x reg version
+ for(ii=0; ii<n; ii++)
+ {
+ z[ii] = beta * y[ii];
+ }
+ jj = 0;
+ for(; jj<n-1; jj+=2)
+ {
+ x_0 = alpha * x[jj+0];
+ x_1 = alpha * x[jj+1];
+ ii = 0;
+ for(; ii<m-1; ii+=2)
+ {
+ z[ii+0] += pA[ii+0+lda*(jj+0)] * x_0 + pA[ii+0+lda*(jj+1)] * x_1;
+ z[ii+1] += pA[ii+1+lda*(jj+0)] * x_0 + pA[ii+1+lda*(jj+1)] * x_1;
+ }
+ for(; ii<m; ii++)
+ {
+ z[ii] += pA[ii+lda*(jj+0)] * x_0;
+ z[ii] += pA[ii+lda*(jj+1)] * x_1;
+ }
+ }
+ for(; jj<n; jj++)
+ {
+ x_0 = alpha * x[jj+0];
+ for(ii=0; ii<m; ii++)
+ {
+ z[ii] += pA[ii+lda*(jj+0)] * x_0;
+ }
+ }
+#endif
+ return;
+ }
+
+
+
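+// z <= beta * y + alpha * A' * x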
+void GEMV_T_LIBSTR(int m, int n, REAL alpha, struct STRMAT *sA, int ai, int aj, struct STRVEC *sx, int xi, REAL beta, struct STRVEC *sy, int yi, struct STRVEC *sz, int zi)
+ {
+ int ii, jj;
+ REAL
+ y_0, y_1;
+ int lda = sA->m;
+ REAL *pA = sA->pA + ai + aj*lda;
+ REAL *x = sx->pa + xi;
+ REAL *y = sy->pa + yi;
+ REAL *z = sz->pa + zi;
+ jj = 0;
+ for(; jj<n-1; jj+=2)
+ {
+ y_0 = 0.0;
+ y_1 = 0.0;
+ ii = 0;
+ for(; ii<m-1; ii+=2)
+ {
+ y_0 += pA[ii+0+lda*(jj+0)] * x[ii+0] + pA[ii+1+lda*(jj+0)] * x[ii+1];
+ y_1 += pA[ii+0+lda*(jj+1)] * x[ii+0] + pA[ii+1+lda*(jj+1)] * x[ii+1];
+ }
+ if(ii<m)
+ {
+ y_0 += pA[ii+lda*(jj+0)] * x[ii];
+ y_1 += pA[ii+lda*(jj+1)] * x[ii];
+ }
+ z[jj+0] = beta * y[jj+0] + alpha * y_0;
+ z[jj+1] = beta * y[jj+1] + alpha * y_1;
+ }
+ for(; jj<n; jj++)
+ {
+ y_0 = 0.0;
+ for(ii=0; ii<m; ii++)
+ {
+ y_0 += pA[ii+lda*(jj+0)] * x[ii];
+ }
+ z[jj+0] = beta * y[jj+0] + alpha * y_0;
+ }
+ return;
+ }
+
+
+
+// TODO optimize !!!!!
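+// z_n <= beta_n * y_n + alpha_n * A * x_n ; z_t <= beta_t * y_t + alpha_t * A' * x_t, in a single pass over A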
+void GEMV_NT_LIBSTR(int m, int n, REAL alpha_n, REAL alpha_t, struct STRMAT *sA, int ai, int aj, struct STRVEC *sx_n, int xi_n, struct STRVEC *sx_t, int xi_t, REAL beta_n, REAL beta_t, struct STRVEC *sy_n, int yi_n, struct STRVEC *sy_t, int yi_t, struct STRVEC *sz_n, int zi_n, struct STRVEC *sz_t, int zi_t)
+ {
+ int ii, jj;
+ REAL
+ a_00,
+ x_n_0,
+ y_t_0;
+ int lda = sA->m;
+ REAL *pA = sA->pA + ai + aj*lda;
+ REAL *x_n = sx_n->pa + xi_n;
+ REAL *x_t = sx_t->pa + xi_t;
+ REAL *y_n = sy_n->pa + yi_n;
+ REAL *y_t = sy_t->pa + yi_t;
+ REAL *z_n = sz_n->pa + zi_n;
+ REAL *z_t = sz_t->pa + zi_t;
+ for(ii=0; ii<m; ii++)
+ {
+ z_n[ii] = beta_n * y_n[ii];
+ }
+ for(jj=0; jj<n; jj++)
+ {
+ y_t_0 = 0.0;
+ x_n_0 = alpha_n * x_n[jj];
+ for(ii=0; ii<m; ii++)
+ {
+ a_00 = pA[ii+lda*jj];
+ z_n[ii] += a_00 * x_n_0;
+ y_t_0 += a_00 * x_t[ii];
+ }
+ z_t[jj] = beta_t * y_t[jj] + alpha_t * y_t_0;
+ }
+ return;
+ }
+
+
+
+// TODO optimize !!!!!
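+// z <= beta * y + alpha * A * x, with A symmetric and only its lower-triangular part referenced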
+void SYMV_L_LIBSTR(int m, int n, REAL alpha, struct STRMAT *sA, int ai, int aj, struct STRVEC *sx, int xi, REAL beta, struct STRVEC *sy, int yi, struct STRVEC *sz, int zi)
+ {
+ int ii, jj;
+ REAL
+ y_0;
+ int lda = sA->m;
+ REAL *pA = sA->pA + ai + aj*lda;
+ REAL *x = sx->pa + xi;
+ REAL *y = sy->pa + yi;
+ REAL *z = sz->pa + zi;
+ for(ii=0; ii<n; ii++)
+ {
+ y_0 = 0.0;
+ jj = 0;
+ for(; jj<=ii; jj++)
+ {
+ y_0 += pA[ii+lda*jj] * x[jj];
+ }
+ for( ; jj<m; jj++)
+ {
+ y_0 += pA[jj+lda*ii] * x[jj];
+ }
+ z[ii] = beta * y[ii] + alpha * y_0;
+ }
+ return;
+ }
+
+
+
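+// z <= A * x, with A (m x n, m>=n) lower triangular, not transposed, not unit diagonal;
+// processed from the bottom up, so z may alias x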
+void TRMV_LNN_LIBSTR(int m, int n, struct STRMAT *sA, int ai, int aj, struct STRVEC *sx, int xi, struct STRVEC *sz, int zi)
+ {
+ int ii, jj;
+ REAL
+ y_0, y_1;
+ int lda = sA->m;
+ REAL *pA = sA->pA + ai + aj*lda;
+ REAL *x = sx->pa + xi;
+ REAL *z = sz->pa + zi;
+ if(m-n>0)
+ {
+ GEMV_N_LIBSTR(m-n, n, 1.0, sA, ai+n, aj, sx, xi, 0.0, sz, zi+n, sz, zi+n);
+ }
+ if(n%2!=0)
+ {
+ ii = n-1;
+ y_0 = x[ii];
+ y_0 *= pA[ii+lda*ii];
+ for(jj=0; jj<ii; jj++)
+ {
+ y_0 += pA[ii+lda*jj] * x[jj];
+ }
+ z[ii] = y_0;
+ n -= 1;
+ }
+ for(ii=n-2; ii>=0; ii-=2)
+ {
+ y_0 = x[ii+0];
+ y_1 = x[ii+1];
+ y_1 *= pA[ii+1+lda*(ii+1)];
+ y_1 += pA[ii+1+lda*(ii+0)] * y_0;
+ y_0 *= pA[ii+0+lda*(ii+0)];
+ jj = 0;
+ for(; jj<ii-1; jj+=2)
+ {
+ y_0 += pA[ii+0+lda*(jj+0)] * x[jj+0] + pA[ii+0+lda*(jj+1)] * x[jj+1];
+ y_1 += pA[ii+1+lda*(jj+0)] * x[jj+0] + pA[ii+1+lda*(jj+1)] * x[jj+1];
+ }
+// no clean-up loop is needed here: n has been made even above, so ii is even and the paired loop covers all jj<ii
+// for(; jj<ii; jj++)
+// {
+// y_0 += pA[ii+0+lda*jj] * x[jj];
+// y_1 += pA[ii+1+lda*jj] * x[jj];
+// }
+ z[ii+0] = y_0;
+ z[ii+1] = y_1;
+ }
+ return;
+ }
+
+
+
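+// z <= A' * x, with A (m x n) lower triangular, not unit diagonal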
+void TRMV_LTN_LIBSTR(int m, int n, struct STRMAT *sA, int ai, int aj, struct STRVEC *sx, int xi, struct STRVEC *sz, int zi)
+ {
+ int ii, jj;
+ REAL
+ y_0, y_1;
+ int lda = sA->m;
+ REAL *pA = sA->pA + ai + aj*lda;
+ REAL *x = sx->pa + xi;
+ REAL *z = sz->pa + zi;
+ jj = 0;
+ for(; jj<n-1; jj+=2)
+ {
+ y_0 = x[jj+0];
+ y_1 = x[jj+1];
+ y_0 *= pA[jj+0+lda*(jj+0)];
+ y_0 += pA[jj+1+lda*(jj+0)] * y_1;
+ y_1 *= pA[jj+1+lda*(jj+1)];
+ ii = jj+2;
+ for(; ii<m-1; ii+=2)
+ {
+ y_0 += pA[ii+0+lda*(jj+0)] * x[ii+0] + pA[ii+1+lda*(jj+0)] * x[ii+1];
+ y_1 += pA[ii+0+lda*(jj+1)] * x[ii+0] + pA[ii+1+lda*(jj+1)] * x[ii+1];
+ }
+ for(; ii<m; ii++)
+ {
+ y_0 += pA[ii+lda*(jj+0)] * x[ii];
+ y_1 += pA[ii+lda*(jj+1)] * x[ii];
+ }
+ z[jj+0] = y_0;
+ z[jj+1] = y_1;
+ }
+ for(; jj<n; jj++)
+ {
+ y_0 = x[jj];
+ y_0 *= pA[jj+lda*jj];
+ for(ii=jj+1; ii<m; ii++)
+ {
+ y_0 += pA[ii+lda*jj] * x[ii];
+ }
+ z[jj] = y_0;
+ }
+ return;
+ }
+
+
+
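+// z <= A * x, with A (m x m) upper triangular, not transposed, not unit diagonal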
+void TRMV_UNN_LIBSTR(int m, struct STRMAT *sA, int ai, int aj, struct STRVEC *sx, int xi, struct STRVEC *sz, int zi)
+ {
+ int ii, jj;
+ REAL
+ y_0, y_1,
+ x_0, x_1;
+ int lda = sA->m;
+ REAL *pA = sA->pA + ai + aj*lda;
+ REAL *x = sx->pa + xi;
+ REAL *z = sz->pa + zi;
+#if 1 // y reg version
+ jj = 0;
+ for(; jj<m-1; jj+=2)
+ {
+ y_0 = x[jj+0];
+ y_1 = x[jj+1];
+ y_0 = pA[jj+0+lda*(jj+0)] * y_0;
+ y_0 += pA[jj+0+lda*(jj+1)] * y_1;
+ y_1 = pA[jj+1+lda*(jj+1)] * y_1;
+ ii = jj+2;
+ for(; ii<m-1; ii+=2)
+ {
+ y_0 += pA[jj+0+lda*(ii+0)] * x[ii+0] + pA[jj+0+lda*(ii+1)] * x[ii+1];
+ y_1 += pA[jj+1+lda*(ii+0)] * x[ii+0] + pA[jj+1+lda*(ii+1)] * x[ii+1];
+ }
+ if(ii<m)
+ {
+ y_0 += pA[jj+0+lda*(ii+0)] * x[ii+0];
+ y_1 += pA[jj+1+lda*(ii+0)] * x[ii+0];
+ }
+ z[jj+0] = y_0;
+ z[jj+1] = y_1;
+ }
+ for(; jj<m; jj++)
+ {
+ y_0 = pA[jj+lda*jj] * x[jj];
+ for(ii=jj+1; ii<m; ii++)
+ {
+ y_0 += pA[jj+lda*ii] * x[ii];
+ }
+ z[jj] = y_0;
+ }
+#else // x reg version
+ if(x != z)
+ {
+ for(ii=0; ii<m; ii++)
+ z[ii] = x[ii];
+ }
+ jj = 0;
+ for(; jj<m-1; jj+=2)
+ {
+ x_0 = z[jj+0];
+ x_1 = z[jj+1];
+ ii = 0;
+ for(; ii<jj-1; ii+=2)
+ {
+ z[ii+0] += pA[ii+0+lda*(jj+0)] * x_0 + pA[ii+0+lda*(jj+1)] * x_1;
+ z[ii+1] += pA[ii+1+lda*(jj+0)] * x_0 + pA[ii+1+lda*(jj+1)] * x_1;
+ }
+// no clean-up loop is needed: jj advances by 2 from 0, so it is always even and the paired loop covers all ii<jj
+// for(; ii<jj; ii++)
+// {
+// z[ii+0] += pA[ii+0+lda*(jj+0)] * x_0 + pA[ii+0+lda*(jj+1)] * x_1;
+// }
+ x_0 *= pA[jj+0+lda*(jj+0)];
+ x_0 += pA[jj+0+lda*(jj+1)] * x_1;
+ x_1 *= pA[jj+1+lda*(jj+1)];
+ z[jj+0] = x_0;
+ z[jj+1] = x_1;
+ }
+ for(; jj<m; jj++)
+ {
+ x_0 = z[jj];
+ for(ii=0; ii<jj; ii++)
+ {
+ z[ii] += pA[ii+lda*jj] * x_0;
+ }
+ x_0 *= pA[jj+lda*jj];
+ z[jj] = x_0;
+ }
+#endif
+ return;
+ }
+
+
+
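+// z <= A' * x, with A (m x m) upper triangular; processed from the bottom up, so z may alias x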
+void TRMV_UTN_LIBSTR(int m, struct STRMAT *sA, int ai, int aj, struct STRVEC *sx, int xi, struct STRVEC *sz, int zi)
+ {
+ int ii, jj;
+ REAL
+ y_0, y_1;
+ int lda = sA->m;
+ REAL *pA = sA->pA + ai + aj*lda;
+ REAL *x = sx->pa + xi;
+ REAL *z = sz->pa + zi;
+ if(m%2!=0)
+ {
+ jj = m-1;
+ y_0 = pA[jj+lda*jj] * x[jj];
+ for(ii=0; ii<jj; ii++)
+ {
+ y_0 += pA[ii+lda*jj] * x[ii];
+ }
+ z[jj] = y_0;
+	m -= 1; // m is now even for the paired loop below
+ }
+ for(jj=m-2; jj>=0; jj-=2)
+ {
+ y_1 = pA[jj+1+lda*(jj+1)] * x[jj+1];
+ y_1 += pA[jj+0+lda*(jj+1)] * x[jj+0];
+ y_0 = pA[jj+0+lda*(jj+0)] * x[jj+0];
+ for(ii=0; ii<jj-1; ii+=2)
+ {
+ y_0 += pA[ii+0+lda*(jj+0)] * x[ii+0] + pA[ii+1+lda*(jj+0)] * x[ii+1];
+ y_1 += pA[ii+0+lda*(jj+1)] * x[ii+0] + pA[ii+1+lda*(jj+1)] * x[ii+1];
+ }
+// no clean-up loop is needed: m has been made even above, so jj is even and the paired loop covers all ii<jj
+// if(ii<jj)
+// {
+// y_0 += pA[ii+lda*(jj+0)] * x[ii];
+// y_1 += pA[ii+lda*(jj+1)] * x[ii];
+// }
+ z[jj+0] = y_0;
+ z[jj+1] = y_1;
+ }
+ return;
+ }
+
+
+
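+// forward substitution with A (m x n, m>=n) lower triangular, not unit diagonal:
+// z[0:n] <= inv(A[0:n,0:n]) * x[0:n], then z[n:m] <= x[n:m] - A[n:m,0:n] * z[0:n]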
+void TRSV_LNN_MN_LIBSTR(int m, int n, struct STRMAT *sA, int ai, int aj, struct STRVEC *sx, int xi, struct STRVEC *sz, int zi)
+ {
+ if(m==0 | n==0)
+ return;
+#if defined(DIM_CHECK)
+ // non-negative size
+ if(m<0) printf("\n****** trsv_lnn_mn_libstr : m<0 : %d<0 *****\n", m);
+ if(n<0) printf("\n****** trsv_lnn_mn_libstr : n<0 : %d<0 *****\n", n);
+ // non-negative offset
+ if(ai<0) printf("\n****** trsv_lnn_mn_libstr : ai<0 : %d<0 *****\n", ai);
+ if(aj<0) printf("\n****** trsv_lnn_mn_libstr : aj<0 : %d<0 *****\n", aj);
+ if(xi<0) printf("\n****** trsv_lnn_mn_libstr : xi<0 : %d<0 *****\n", xi);
+ if(zi<0) printf("\n****** trsv_lnn_mn_libstr : zi<0 : %d<0 *****\n", zi);
+ // inside matrix
+ // A: m x k
+ if(ai+m > sA->m) printf("\n***** trsv_lnn_mn_libstr : ai+m > row(A) : %d+%d > %d *****\n", ai, m, sA->m);
+ if(aj+n > sA->n) printf("\n***** trsv_lnn_mn_libstr : aj+n > col(A) : %d+%d > %d *****\n", aj, n, sA->n);
+ // x: m
+ if(xi+m > sx->m) printf("\n***** trsv_lnn_mn_libstr : xi+m > size(x) : %d+%d > %d *****\n", xi, m, sx->m);
+ // z: m
+ if(zi+m > sz->m) printf("\n***** trsv_lnn_mn_libstr : zi+m > size(z) : %d+%d > %d *****\n", zi, m, sz->m);
+#endif
+ int ii, jj, j1;
+ REAL
+ y_0, y_1,
+ x_0, x_1;
+ int lda = sA->m;
+ REAL *pA = sA->pA + ai + aj*lda;
+ REAL *dA = sA->dA;
+ REAL *x = sx->pa + xi;
+ REAL *z = sz->pa + zi;
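+	// pre-compute the reciprocals of the diagonal of A in dA; they are cached (via the
+	// use_dA flag) only when the submatrix starts at (0,0), and recomputed at every call otherwise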
+ if(ai==0 & aj==0)
+ {
+ if(sA->use_dA!=1)
+ {
+ for(ii=0; ii<n; ii++)
+ dA[ii] = 1.0 / pA[ii+lda*ii];
+ sA->use_dA = 1;
+ }
+ }
+ else
+ {
+ for(ii=0; ii<n; ii++)
+ dA[ii] = 1.0 / pA[ii+lda*ii];
+ sA->use_dA = 0;
+ }
+#if 1 // y reg version
+ ii = 0;
+ for(; ii<n-1; ii+=2)
+ {
+ y_0 = x[ii+0];
+ y_1 = x[ii+1];
+ jj = 0;
+ for(; jj<ii-1; jj+=2)
+ {
+ y_0 -= pA[ii+0+lda*(jj+0)] * z[jj+0] + pA[ii+0+lda*(jj+1)] * z[jj+1];
+ y_1 -= pA[ii+1+lda*(jj+0)] * z[jj+0] + pA[ii+1+lda*(jj+1)] * z[jj+1];
+ }
+// no clean-up loop is needed: ii is even (it advances by 2 from 0), so the paired loop exits with jj==ii
+// if(jj<ii)
+// {
+// y_0 -= pA[ii+0+lda*(jj+0)] * z[jj+0];
+// y_1 -= pA[ii+1+lda*(jj+0)] * z[jj+0];
+// }
+ y_0 *= dA[ii+0];
+ y_1 -= pA[ii+1+lda*(jj+0)] * y_0;
+ y_1 *= dA[ii+1];
+ z[ii+0] = y_0;
+ z[ii+1] = y_1;
+ }
+ for(; ii<n; ii++)
+ {
+ y_0 = x[ii];
+ for(jj=0; jj<ii; jj++)
+ {
+ y_0 -= pA[ii+lda*jj] * z[jj];
+ }
+ y_0 *= dA[ii];
+ z[ii] = y_0;
+ }
+ for(; ii<m-1; ii+=2)
+ {
+ y_0 = x[ii+0];
+ y_1 = x[ii+1];
+ jj = 0;
+ for(; jj<n-1; jj+=2)
+ {
+ y_0 -= pA[ii+0+lda*(jj+0)] * z[jj+0] + pA[ii+0+lda*(jj+1)] * z[jj+1];
+ y_1 -= pA[ii+1+lda*(jj+0)] * z[jj+0] + pA[ii+1+lda*(jj+1)] * z[jj+1];
+ }
+ if(jj<n)
+ {
+ y_0 -= pA[ii+0+lda*(jj+0)] * z[jj+0];
+ y_1 -= pA[ii+1+lda*(jj+0)] * z[jj+0];
+ }
+ z[ii+0] = y_0;
+ z[ii+1] = y_1;
+ }
+ for(; ii<m; ii++)
+ {
+ y_0 = x[ii];
+ for(jj=0; jj<n; jj++)
+ {
+ y_0 -= pA[ii+lda*jj] * z[jj];
+ }
+ z[ii] = y_0;
+ }
+#else // x reg version
+ if(x != z)
+ {
+ for(ii=0; ii<m; ii++)
+ z[ii] = x[ii];
+ }
+ jj = 0;
+ for(; jj<n-1; jj+=2)
+ {
+ x_0 = dA[jj+0] * z[jj+0];
+ x_1 = z[jj+1] - pA[jj+1+lda*(jj+0)] * x_0;
+ x_1 = dA[jj+1] * x_1;
+ z[jj+0] = x_0;
+ z[jj+1] = x_1;
+ ii = jj+2;
+ for(; ii<m-1; ii+=2)
+ {
+ z[ii+0] -= pA[ii+0+lda*(jj+0)] * x_0 + pA[ii+0+lda*(jj+1)] * x_1;
+ z[ii+1] -= pA[ii+1+lda*(jj+0)] * x_0 + pA[ii+1+lda*(jj+1)] * x_1;
+ }
+ for(; ii<m; ii++)
+ {
+ z[ii] -= pA[ii+lda*(jj+0)] * x_0 + pA[ii+lda*(jj+1)] * x_1;
+ }
+ }
+ for(; jj<n; jj++)
+ {
+ x_0 = dA[jj] * z[jj];
+ z[jj] = x_0;
+ for(ii=jj+1; ii<m; ii++)
+ {
+ z[ii] -= pA[ii+lda*jj] * x_0;
+ }
+ }
+#endif
+ return;
+ }
+
+
+
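+// backward substitution with A (m x n, m>=n) lower triangular, not unit diagonal:
+// z[0:n] <= inv(A[0:n,0:n]') * (x[0:n] - A[n:m,0:n]' * z[n:m]); note that z[n:m] is read but
+// never written, so presumably callers pass z aliasing x (or with the tail already filled in)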
+void TRSV_LTN_MN_LIBSTR(int m, int n, struct STRMAT *sA, int ai, int aj, struct STRVEC *sx, int xi, struct STRVEC *sz, int zi)
+ {
+ if(m==0)
+ return;
+#if defined(DIM_CHECK)
+ // non-negative size
+ if(m<0) printf("\n****** trsv_ltn_mn_libstr : m<0 : %d<0 *****\n", m);
+ if(n<0) printf("\n****** trsv_ltn_mn_libstr : n<0 : %d<0 *****\n", n);
+ // non-negative offset
+ if(ai<0) printf("\n****** trsv_ltn_mn_libstr : ai<0 : %d<0 *****\n", ai);
+ if(aj<0) printf("\n****** trsv_ltn_mn_libstr : aj<0 : %d<0 *****\n", aj);
+ if(xi<0) printf("\n****** trsv_ltn_mn_libstr : xi<0 : %d<0 *****\n", xi);
+ if(zi<0) printf("\n****** trsv_ltn_mn_libstr : zi<0 : %d<0 *****\n", zi);
+ // inside matrix
+ // A: m x k
+ if(ai+m > sA->m) printf("\n***** trsv_ltn_mn_libstr : ai+m > row(A) : %d+%d > %d *****\n", ai, m, sA->m);
+ if(aj+n > sA->n) printf("\n***** trsv_ltn_mn_libstr : aj+n > col(A) : %d+%d > %d *****\n", aj, n, sA->n);
+ // x: m
+ if(xi+m > sx->m) printf("\n***** trsv_ltn_mn_libstr : xi+m > size(x) : %d+%d > %d *****\n", xi, m, sx->m);
+ // z: m
+ if(zi+m > sz->m) printf("\n***** trsv_ltn_mn_libstr : zi+m > size(z) : %d+%d > %d *****\n", zi, m, sz->m);
+#endif
+ int ii, jj;
+ REAL
+ y_0, y_1;
+ int lda = sA->m;
+ REAL *pA = sA->pA + ai + aj*lda;
+ REAL *dA = sA->dA;
+ REAL *x = sx->pa + xi;
+ REAL *z = sz->pa + zi;
+ if(ai==0 & aj==0)
+ {
+ if(sA->use_dA!=1)
+ {
+ for(ii=0; ii<n; ii++)
+ dA[ii] = 1.0 / pA[ii+lda*ii];
+ sA->use_dA = 1;
+ }
+ }
+ else
+ {
+ for(ii=0; ii<n; ii++)
+ dA[ii] = 1.0 / pA[ii+lda*ii];
+ sA->use_dA = 0;
+ }
+ if(n%2!=0)
+ {
+ jj = n-1;
+ y_0 = x[jj];
+ for(ii=jj+1; ii<m; ii++)
+ {
+ y_0 -= pA[ii+lda*jj] * z[ii];
+ }
+ y_0 *= dA[jj];
+ z[jj] = y_0;
+ jj -= 2;
+ }
+ else
+ {
+ jj = n-2;
+ }
+ for(; jj>=0; jj-=2)
+ {
+ y_0 = x[jj+0];
+ y_1 = x[jj+1];
+ ii = jj+2;
+ for(; ii<m-1; ii+=2)
+ {
+ y_0 -= pA[ii+0+lda*(jj+0)] * z[ii+0] + pA[ii+1+lda*(jj+0)] * z[ii+1];
+ y_1 -= pA[ii+0+lda*(jj+1)] * z[ii+0] + pA[ii+1+lda*(jj+1)] * z[ii+1];
+ }
+ if(ii<m)
+ {
+ y_0 -= pA[ii+lda*(jj+0)] * z[ii];
+ y_1 -= pA[ii+lda*(jj+1)] * z[ii];
+ }
+ y_1 *= dA[jj+1];
+ y_0 -= pA[jj+1+lda*(jj+0)] * y_1;
+ y_0 *= dA[jj+0];
+ z[jj+0] = y_0;
+ z[jj+1] = y_1;
+ }
+ return;
+ }
+
+
+
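+// z <= inv(A) * x, with A (m x m) lower triangular, not transposed, not unit diagonal (forward substitution)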
+void TRSV_LNN_LIBSTR(int m, struct STRMAT *sA, int ai, int aj, struct STRVEC *sx, int xi, struct STRVEC *sz, int zi)
+ {
+ if(m==0)
+ return;
+#if defined(DIM_CHECK)
+ // non-negative size
+ if(m<0) printf("\n****** trsv_lnn_libstr : m<0 : %d<0 *****\n", m);
+ // non-negative offset
+ if(ai<0) printf("\n****** trsv_lnn_libstr : ai<0 : %d<0 *****\n", ai);
+ if(aj<0) printf("\n****** trsv_lnn_libstr : aj<0 : %d<0 *****\n", aj);
+ if(xi<0) printf("\n****** trsv_lnn_libstr : xi<0 : %d<0 *****\n", xi);
+ if(zi<0) printf("\n****** trsv_lnn_libstr : zi<0 : %d<0 *****\n", zi);
+ // inside matrix
+ // A: m x k
+ if(ai+m > sA->m) printf("\n***** trsv_lnn_libstr : ai+m > row(A) : %d+%d > %d *****\n", ai, m, sA->m);
+ if(aj+m > sA->n) printf("\n***** trsv_lnn_libstr : aj+m > col(A) : %d+%d > %d *****\n", aj, m, sA->n);
+ // x: m
+ if(xi+m > sx->m) printf("\n***** trsv_lnn_libstr : xi+m > size(x) : %d+%d > %d *****\n", xi, m, sx->m);
+ // z: m
+ if(zi+m > sz->m) printf("\n***** trsv_lnn_libstr : zi+m > size(z) : %d+%d > %d *****\n", zi, m, sz->m);
+#endif
+ int ii, jj, j1;
+ REAL
+ y_0, y_1,
+ x_0, x_1;
+ int lda = sA->m;
+ REAL *pA = sA->pA + ai + aj*lda;
+ REAL *dA = sA->dA;
+ REAL *x = sx->pa + xi;
+ REAL *z = sz->pa + zi;
+ if(ai==0 & aj==0)
+ {
+ if(sA->use_dA!=1)
+ {
+ for(ii=0; ii<m; ii++)
+ dA[ii] = 1.0 / pA[ii+lda*ii];
+ sA->use_dA = 1;
+ }
+ }
+ else
+ {
+ for(ii=0; ii<m; ii++)
+ dA[ii] = 1.0 / pA[ii+lda*ii];
+ sA->use_dA = 0;
+ }
+ ii = 0;
+ for(; ii<m-1; ii+=2)
+ {
+ y_0 = x[ii+0];
+ y_1 = x[ii+1];
+ jj = 0;
+ for(; jj<ii-1; jj+=2)
+ {
+ y_0 -= pA[ii+0+lda*(jj+0)] * z[jj+0] + pA[ii+0+lda*(jj+1)] * z[jj+1];
+ y_1 -= pA[ii+1+lda*(jj+0)] * z[jj+0] + pA[ii+1+lda*(jj+1)] * z[jj+1];
+ }
+ y_0 *= dA[ii+0];
+ y_1 -= pA[ii+1+lda*(jj+0)] * y_0;
+ y_1 *= dA[ii+1];
+ z[ii+0] = y_0;
+ z[ii+1] = y_1;
+ }
+ for(; ii<m; ii++)
+ {
+ y_0 = x[ii];
+ for(jj=0; jj<ii; jj++)
+ {
+ y_0 -= pA[ii+lda*jj] * z[jj];
+ }
+ y_0 *= dA[ii];
+ z[ii] = y_0;
+ }
+ return;
+ }
+
+
+
+void TRSV_LNU_LIBSTR(int m, struct STRMAT *sA, int ai, int aj, struct STRVEC *sx, int xi, struct STRVEC *sz, int zi)
+ {
+ if(m==0)
+ return;
+#if defined(DIM_CHECK)
+ // non-negative size
+ if(m<0) printf("\n****** trsv_lnu_libstr : m<0 : %d<0 *****\n", m);
+ // non-negative offset
+ if(ai<0) printf("\n****** trsv_lnu_libstr : ai<0 : %d<0 *****\n", ai);
+ if(aj<0) printf("\n****** trsv_lnu_libstr : aj<0 : %d<0 *****\n", aj);
+ if(xi<0) printf("\n****** trsv_lnu_libstr : xi<0 : %d<0 *****\n", xi);
+ if(zi<0) printf("\n****** trsv_lnu_libstr : zi<0 : %d<0 *****\n", zi);
+ // inside matrix
+ // A: m x k
+ if(ai+m > sA->m) printf("\n***** trsv_lnu_libstr : ai+m > row(A) : %d+%d > %d *****\n", ai, m, sA->m);
+ if(aj+m > sA->n) printf("\n***** trsv_lnu_libstr : aj+m > col(A) : %d+%d > %d *****\n", aj, m, sA->n);
+ // x: m
+ if(xi+m > sx->m) printf("\n***** trsv_lnu_libstr : xi+m > size(x) : %d+%d > %d *****\n", xi, m, sx->m);
+ // z: m
+ if(zi+m > sz->m) printf("\n***** trsv_lnu_libstr : zi+m > size(z) : %d+%d > %d *****\n", zi, m, sz->m);
+#endif
+ printf("\n***** trsv_lnu_libstr : feature not implemented yet *****\n");
+ exit(1);
+ }
+
+
+
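+// z <= inv(A') * x, with A (m x m) lower triangular, not unit diagonal (backward substitution)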
+void TRSV_LTN_LIBSTR(int m, struct STRMAT *sA, int ai, int aj, struct STRVEC *sx, int xi, struct STRVEC *sz, int zi)
+ {
+ if(m==0)
+ return;
+#if defined(DIM_CHECK)
+ // non-negative size
+ if(m<0) printf("\n****** trsv_ltn_libstr : m<0 : %d<0 *****\n", m);
+ // non-negative offset
+ if(ai<0) printf("\n****** trsv_ltn_libstr : ai<0 : %d<0 *****\n", ai);
+ if(aj<0) printf("\n****** trsv_ltn_libstr : aj<0 : %d<0 *****\n", aj);
+ if(xi<0) printf("\n****** trsv_ltn_libstr : xi<0 : %d<0 *****\n", xi);
+ if(zi<0) printf("\n****** trsv_ltn_libstr : zi<0 : %d<0 *****\n", zi);
+ // inside matrix
+ // A: m x k
+ if(ai+m > sA->m) printf("\n***** trsv_ltn_libstr : ai+m > row(A) : %d+%d > %d *****\n", ai, m, sA->m);
+ if(aj+m > sA->n) printf("\n***** trsv_ltn_libstr : aj+m > col(A) : %d+%d > %d *****\n", aj, m, sA->n);
+ // x: m
+ if(xi+m > sx->m) printf("\n***** trsv_ltn_libstr : xi+m > size(x) : %d+%d > %d *****\n", xi, m, sx->m);
+ // z: m
+ if(zi+m > sz->m) printf("\n***** trsv_ltn_libstr : zi+m > size(z) : %d+%d > %d *****\n", zi, m, sz->m);
+#endif
+ int ii, jj;
+ REAL
+ y_0, y_1;
+ int lda = sA->m;
+ REAL *pA = sA->pA + ai + aj*lda;
+ REAL *dA = sA->dA;
+ REAL *x = sx->pa + xi;
+ REAL *z = sz->pa + zi;
+ if(ai==0 & aj==0)
+ {
+ if(sA->use_dA!=1)
+ {
+ for(ii=0; ii<m; ii++)
+ dA[ii] = 1.0 / pA[ii+lda*ii];
+ sA->use_dA = 1;
+ }
+ }
+ else
+ {
+ for(ii=0; ii<m; ii++)
+ dA[ii] = 1.0 / pA[ii+lda*ii];
+ sA->use_dA = 0;
+ }
+ if(m%2!=0)
+ {
+ jj = m-1;
+ y_0 = x[jj];
+ y_0 *= dA[jj];
+ z[jj] = y_0;
+ jj -= 2;
+ }
+ else
+ {
+ jj = m-2;
+ }
+ for(; jj>=0; jj-=2)
+ {
+ y_0 = x[jj+0];
+ y_1 = x[jj+1];
+ ii = jj+2;
+ for(; ii<m-1; ii+=2)
+ {
+ y_0 -= pA[ii+0+lda*(jj+0)] * z[ii+0] + pA[ii+1+lda*(jj+0)] * z[ii+1];
+ y_1 -= pA[ii+0+lda*(jj+1)] * z[ii+0] + pA[ii+1+lda*(jj+1)] * z[ii+1];
+ }
+ if(ii<m)
+ {
+ y_0 -= pA[ii+lda*(jj+0)] * z[ii];
+ y_1 -= pA[ii+lda*(jj+1)] * z[ii];
+ }
+ y_1 *= dA[jj+1];
+ y_0 -= pA[jj+1+lda*(jj+0)] * y_1;
+ y_0 *= dA[jj+0];
+ z[jj+0] = y_0;
+ z[jj+1] = y_1;
+ }
+ return;
+ }
+
+
+
+void TRSV_LTU_LIBSTR(int m, struct STRMAT *sA, int ai, int aj, struct STRVEC *sx, int xi, struct STRVEC *sz, int zi)
+ {
+ if(m==0)
+ return;
+#if defined(DIM_CHECK)
+ // non-negative size
+ if(m<0) printf("\n****** trsv_ltu_libstr : m<0 : %d<0 *****\n", m);
+ // non-negative offset
+ if(ai<0) printf("\n****** trsv_ltu_libstr : ai<0 : %d<0 *****\n", ai);
+ if(aj<0) printf("\n****** trsv_ltu_libstr : aj<0 : %d<0 *****\n", aj);
+ if(xi<0) printf("\n****** trsv_ltu_libstr : xi<0 : %d<0 *****\n", xi);
+ if(zi<0) printf("\n****** trsv_ltu_libstr : zi<0 : %d<0 *****\n", zi);
+ // inside matrix
+ // A: m x k
+ if(ai+m > sA->m) printf("\n***** trsv_ltu_libstr : ai+m > row(A) : %d+%d > %d *****\n", ai, m, sA->m);
+ if(aj+m > sA->n) printf("\n***** trsv_ltu_libstr : aj+m > col(A) : %d+%d > %d *****\n", aj, m, sA->n);
+ // x: m
+ if(xi+m > sx->m) printf("\n***** trsv_ltu_libstr : xi+m > size(x) : %d+%d > %d *****\n", xi, m, sx->m);
+ // z: m
+ if(zi+m > sz->m) printf("\n***** trsv_ltu_libstr : zi+m > size(z) : %d+%d > %d *****\n", zi, m, sz->m);
+#endif
+ printf("\n***** trsv_ltu_libstr : feature not implemented yet *****\n");
+ exit(1);
+ }
+
+
+
+void TRSV_UNN_LIBSTR(int m, struct STRMAT *sA, int ai, int aj, struct STRVEC *sx, int xi, struct STRVEC *sz, int zi)
+ {
+ if(m==0)
+ return;
+#if defined(DIM_CHECK)
+ // non-negative size
+ if(m<0) printf("\n****** trsv_unn_libstr : m<0 : %d<0 *****\n", m);
+ // non-negative offset
+ if(ai<0) printf("\n****** trsv_unn_libstr : ai<0 : %d<0 *****\n", ai);
+ if(aj<0) printf("\n****** trsv_unn_libstr : aj<0 : %d<0 *****\n", aj);
+ if(xi<0) printf("\n****** trsv_unn_libstr : xi<0 : %d<0 *****\n", xi);
+ if(zi<0) printf("\n****** trsv_unn_libstr : zi<0 : %d<0 *****\n", zi);
+ // inside matrix
+ // A: m x k
+ if(ai+m > sA->m) printf("\n***** trsv_unn_libstr : ai+m > row(A) : %d+%d > %d *****\n", ai, m, sA->m);
+ if(aj+m > sA->n) printf("\n***** trsv_unn_libstr : aj+m > col(A) : %d+%d > %d *****\n", aj, m, sA->n);
+ // x: m
+ if(xi+m > sx->m) printf("\n***** trsv_unn_libstr : xi+m > size(x) : %d+%d > %d *****\n", xi, m, sx->m);
+ // z: m
+ if(zi+m > sz->m) printf("\n***** trsv_unn_libstr : zi+m > size(z) : %d+%d > %d *****\n", zi, m, sz->m);
+#endif
+ printf("\n***** trsv_unn_libstr : feature not implemented yet *****\n");
+ exit(1);
+ }
+
+
+
+void TRSV_UTN_LIBSTR(int m, struct STRMAT *sA, int ai, int aj, struct STRVEC *sx, int xi, struct STRVEC *sz, int zi)
+ {
+ if(m==0)
+ return;
+#if defined(DIM_CHECK)
+ // non-negative size
+ if(m<0) printf("\n****** trsv_utn_libstr : m<0 : %d<0 *****\n", m);
+ // non-negative offset
+ if(ai<0) printf("\n****** trsv_utn_libstr : ai<0 : %d<0 *****\n", ai);
+ if(aj<0) printf("\n****** trsv_utn_libstr : aj<0 : %d<0 *****\n", aj);
+ if(xi<0) printf("\n****** trsv_utn_libstr : xi<0 : %d<0 *****\n", xi);
+ if(zi<0) printf("\n****** trsv_utn_libstr : zi<0 : %d<0 *****\n", zi);
+ // inside matrix
+ // A: m x k
+ if(ai+m > sA->m) printf("\n***** trsv_utn_libstr : ai+m > row(A) : %d+%d > %d *****\n", ai, m, sA->m);
+ if(aj+m > sA->n) printf("\n***** trsv_utn_libstr : aj+m > col(A) : %d+%d > %d *****\n", aj, m, sA->n);
+ // x: m
+ if(xi+m > sx->m) printf("\n***** trsv_utn_libstr : xi+m > size(x) : %d+%d > %d *****\n", xi, m, sx->m);
+ // z: m
+ if(zi+m > sz->m) printf("\n***** trsv_utn_libstr : zi+m > size(z) : %d+%d > %d *****\n", zi, m, sz->m);
+#endif
+ printf("\n***** trsv_utn_libstr : feature not implemented yet *****\n");
+ exit(1);
+ }
+
+
+
+#elif defined(LA_BLAS)
+
+
+
+void GEMV_N_LIBSTR(int m, int n, REAL alpha, struct STRMAT *sA, int ai, int aj, struct STRVEC *sx, int xi, REAL beta, struct STRVEC *sy, int yi, struct STRVEC *sz, int zi)
+ {
+ char cl = 'l';
+ char cn = 'n';
+ char cr = 'r';
+ char ct = 't';
+ char cu = 'u';
+ int i1 = 1;
+ int lda = sA->m;
+ REAL *pA = sA->pA + ai + aj*lda;
+ REAL *x = sx->pa + xi;
+ REAL *y = sy->pa + yi;
+ REAL *z = sz->pa + zi;
+ COPY(&m, y, &i1, z, &i1);
+ GEMV(&cn, &m, &n, &alpha, pA, &lda, x, &i1, &beta, z, &i1);
+ return;
+ }
+
+
+
+void GEMV_T_LIBSTR(int m, int n, REAL alpha, struct STRMAT *sA, int ai, int aj, struct STRVEC *sx, int xi, REAL beta, struct STRVEC *sy, int yi, struct STRVEC *sz, int zi)
+ {
+ char cl = 'l';
+ char cn = 'n';
+ char cr = 'r';
+ char ct = 't';
+ char cu = 'u';
+ int i1 = 1;
+ int lda = sA->m;
+ REAL *pA = sA->pA + ai + aj*lda;
+ REAL *x = sx->pa + xi;
+ REAL *y = sy->pa + yi;
+ REAL *z = sz->pa + zi;
+ COPY(&n, y, &i1, z, &i1);
+ GEMV(&ct, &m, &n, &alpha, pA, &lda, x, &i1, &beta, z, &i1);
+ return;
+ }
+
+
+
+void GEMV_NT_LIBSTR(int m, int n, REAL alpha_n, REAL alpha_t, struct STRMAT *sA, int ai, int aj, struct STRVEC *sx_n, int xi_n, struct STRVEC *sx_t, int xi_t, REAL beta_n, REAL beta_t, struct STRVEC *sy_n, int yi_n, struct STRVEC *sy_t, int yi_t, struct STRVEC *sz_n, int zi_n, struct STRVEC *sz_t, int zi_t)
+ {
+ char cl = 'l';
+ char cn = 'n';
+ char cr = 'r';
+ char ct = 't';
+ char cu = 'u';
+ int i1 = 1;
+ int lda = sA->m;
+ REAL *pA = sA->pA + ai + aj*lda;
+ REAL *x_n = sx_n->pa + xi_n;
+ REAL *x_t = sx_t->pa + xi_t;
+ REAL *y_n = sy_n->pa + yi_n;
+ REAL *y_t = sy_t->pa + yi_t;
+ REAL *z_n = sz_n->pa + zi_n;
+ REAL *z_t = sz_t->pa + zi_t;
+ COPY(&m, y_n, &i1, z_n, &i1);
+ GEMV(&cn, &m, &n, &alpha_n, pA, &lda, x_n, &i1, &beta_n, z_n, &i1);
+ COPY(&n, y_t, &i1, z_t, &i1);
+ GEMV(&ct, &m, &n, &alpha_t, pA, &lda, x_t, &i1, &beta_t, z_t, &i1);
+ return;
+ }
+
+
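+// gemv_nt computes the two independent products
+//	z_n = beta_n * y_n + alpha_n * A   * x_n
+//	z_t = beta_t * y_t + alpha_t * A^T * x_t
+// In this BLAS wrapper they are simply the two GEMV calls above on the same
+// pA; the point of fusing them into one routine is that an optimized backend
+// can stream A through the memory hierarchy once for both products.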
+
+void SYMV_L_LIBSTR(int m, int n, REAL alpha, struct STRMAT *sA, int ai, int aj, struct STRVEC *sx, int xi, REAL beta, struct STRVEC *sy, int yi, struct STRVEC *sz, int zi)
+ {
+ char cl = 'l';
+ char cn = 'n';
+ char cr = 'r';
+ char ct = 't';
+ char cu = 'u';
+ int i1 = 1;
+ REAL d1 = 1.0;
+ int lda = sA->m;
+ REAL *pA = sA->pA + ai + aj*lda;
+ REAL *x = sx->pa + xi;
+ REAL *y = sy->pa + yi;
+ REAL *z = sz->pa + zi;
+ int tmp = m-n;
+ COPY(&m, y, &i1, z, &i1);
+ SYMV(&cl, &n, &alpha, pA, &lda, x, &i1, &beta, z, &i1);
+ GEMV(&cn, &tmp, &n, &alpha, pA+n, &lda, x, &i1, &beta, z+n, &i1);
+ GEMV(&ct, &tmp, &n, &alpha, pA+n, &lda, x+n, &i1, &d1, z, &i1);
+ return;
+ }
+
+
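+// For m>=n, symv_l views A as [A11; A21], with A11 (n x n) symmetric and
+// stored in the lower triangle and A21 ((m-n) x n) full, so the three calls
+// above assemble
+//	z1 = beta * y1 + alpha * ( A11 * x1 + A21^T * x2 )
+//	z2 = beta * y2 + alpha *   A21 * x1
+// i.e. one SYMV on the leading block plus two GEMV on the rectangular part.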
+
+void TRMV_LNN_LIBSTR(int m, int n, struct STRMAT *sA, int ai, int aj, struct STRVEC *sx, int xi, struct STRVEC *sz, int zi)
+ {
+ char cl = 'l';
+ char cn = 'n';
+ char cr = 'r';
+ char ct = 't';
+ char cu = 'u';
+ int i1 = 1;
+ REAL d1 = 1.0;
+ REAL d0 = 0.0;
+ REAL dm1 = -1.0;
+ int lda = sA->m;
+ REAL *pA = sA->pA + ai + aj*lda;
+ REAL *x = sx->pa + xi;
+ REAL *z = sz->pa + zi;
+ int tmp = m-n;
+ if(x!=z)
+ COPY(&n, x, &i1, z, &i1);
+ GEMV(&cn, &tmp, &n, &d1, pA+n, &lda, x, &i1, &d0, z+n, &i1);
+ TRMV(&cl, &cn, &cn, &n, pA, &lda, z, &i1);
+ return;
+ }
+
+
+
+void TRMV_LTN_LIBSTR(int m, int n, struct STRMAT *sA, int ai, int aj, struct STRVEC *sx, int xi, struct STRVEC *sz, int zi)
+ {
+ char cl = 'l';
+ char cn = 'n';
+ char cr = 'r';
+ char ct = 't';
+ char cu = 'u';
+ int i1 = 1;
+ REAL d1 = 1.0;
+ REAL dm1 = -1.0;
+ int lda = sA->m;
+ REAL *pA = sA->pA + ai + aj*lda;
+ REAL *x = sx->pa + xi;
+ REAL *z = sz->pa + zi;
+ int tmp = m-n;
+ if(x!=z)
+ COPY(&n, x, &i1, z, &i1);
+ TRMV(&cl, &ct, &cn, &n, pA, &lda, z, &i1);
+ GEMV(&ct, &tmp, &n, &d1, pA+n, &lda, x+n, &i1, &d1, z, &i1);
+ return;
+ }
+
+
+
+void TRMV_UNN_LIBSTR(int m, struct STRMAT *sA, int ai, int aj, struct STRVEC *sx, int xi, struct STRVEC *sz, int zi)
+ {
+ char cl = 'l';
+ char cn = 'n';
+ char cr = 'r';
+ char ct = 't';
+ char cu = 'u';
+ int i1 = 1;
+ REAL d1 = 1.0;
+ REAL dm1 = -1.0;
+ int lda = sA->m;
+ REAL *pA = sA->pA + ai + aj*lda;
+ REAL *x = sx->pa + xi;
+ REAL *z = sz->pa + zi;
+ COPY(&m, x, &i1, z, &i1);
+ TRMV(&cu, &cn, &cn, &m, pA, &lda, z, &i1);
+ return;
+ }
+
+
+
+void TRMV_UTN_LIBSTR(int m, struct STRMAT *sA, int ai, int aj, struct STRVEC *sx, int xi, struct STRVEC *sz, int zi)
+ {
+ char cl = 'l';
+ char cn = 'n';
+ char cr = 'r';
+ char ct = 't';
+ char cu = 'u';
+ int i1 = 1;
+ REAL d1 = 1.0;
+ REAL dm1 = -1.0;
+ int lda = sA->m;
+ REAL *pA = sA->pA + ai + aj*lda;
+ REAL *x = sx->pa + xi;
+ REAL *z = sz->pa + zi;
+ COPY(&m, x, &i1, z, &i1);
+ TRMV(&cu, &ct, &cn, &m, pA, &lda, z, &i1);
+ return;
+ }
+
+
+
+void TRSV_LNN_MN_LIBSTR(int m, int n, struct STRMAT *sA, int ai, int aj, struct STRVEC *sx, int xi, struct STRVEC *sz, int zi)
+ {
+ if(m==0 | n==0)
+ return;
+#if defined(DIM_CHECK)
+ // non-negative size
+ if(m<0) printf("\n****** trsv_lnn_mn_libstr : m<0 : %d<0 *****\n", m);
+ if(n<0) printf("\n****** trsv_lnn_mn_libstr : n<0 : %d<0 *****\n", n);
+ // non-negative offset
+ if(ai<0) printf("\n****** trsv_lnn_mn_libstr : ai<0 : %d<0 *****\n", ai);
+ if(aj<0) printf("\n****** trsv_lnn_mn_libstr : aj<0 : %d<0 *****\n", aj);
+ if(xi<0) printf("\n****** trsv_lnn_mn_libstr : xi<0 : %d<0 *****\n", xi);
+ if(zi<0) printf("\n****** trsv_lnn_mn_libstr : zi<0 : %d<0 *****\n", zi);
+ // inside matrix
+ // A: m x k
+ if(ai+m > sA->m) printf("\n***** trsv_lnn_mn_libstr : ai+m > row(A) : %d+%d > %d *****\n", ai, m, sA->m);
+ if(aj+n > sA->n) printf("\n***** trsv_lnn_mn_libstr : aj+n > col(A) : %d+%d > %d *****\n", aj, n, sA->n);
+ // x: m
+ if(xi+m > sx->m) printf("\n***** trsv_lnn_mn_libstr : xi+m > size(x) : %d+%d > %d *****\n", xi, m, sx->m);
+ // z: m
+ if(zi+m > sz->m) printf("\n***** trsv_lnn_mn_libstr : zi+m > size(z) : %d+%d > %d *****\n", zi, m, sz->m);
+#endif
+ char cl = 'l';
+ char cn = 'n';
+ char cr = 'r';
+ char ct = 't';
+ char cu = 'u';
+ int i1 = 1;
+ REAL d1 = 1.0;
+ REAL dm1 = -1.0;
+ int mmn = m-n;
+ int lda = sA->m;
+ REAL *pA = sA->pA + ai + aj*lda;
+ REAL *x = sx->pa + xi;
+ REAL *z = sz->pa + zi;
+ COPY(&m, x, &i1, z, &i1);
+ TRSV(&cl, &cn, &cn, &n, pA, &lda, z, &i1);
+ GEMV(&cn, &mmn, &n, &dm1, pA+n, &lda, z, &i1, &d1, z+n, &i1);
+ return;
+ }
+
+
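+// The _mn variant solves with an m x n trapezoidal factor [L11; A21]: first
+// the triangular part, then a rank update of the trailing entries,
+//	z1 = L11^{-1} * x1     (TRSV on the leading n x n block)
+//	z2 = x2 - A21 * z1     (GEMV with alpha=-1, beta=1)
+// which is exactly the TRSV + GEMV pair above.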
+
+void TRSV_LTN_MN_LIBSTR(int m, int n, struct STRMAT *sA, int ai, int aj, struct STRVEC *sx, int xi, struct STRVEC *sz, int zi)
+ {
+ if(m==0)
+ return;
+#if defined(DIM_CHECK)
+ // non-negative size
+ if(m<0) printf("\n****** trsv_ltn_mn_libstr : m<0 : %d<0 *****\n", m);
+ if(n<0) printf("\n****** trsv_ltn_mn_libstr : n<0 : %d<0 *****\n", n);
+ // non-negative offset
+ if(ai<0) printf("\n****** trsv_ltn_mn_libstr : ai<0 : %d<0 *****\n", ai);
+ if(aj<0) printf("\n****** trsv_ltn_mn_libstr : aj<0 : %d<0 *****\n", aj);
+ if(xi<0) printf("\n****** trsv_ltn_mn_libstr : xi<0 : %d<0 *****\n", xi);
+ if(zi<0) printf("\n****** trsv_ltn_mn_libstr : zi<0 : %d<0 *****\n", zi);
+ // inside matrix
+ // A: m x k
+ if(ai+m > sA->m) printf("\n***** trsv_ltn_mn_libstr : ai+m > row(A) : %d+%d > %d *****\n", ai, m, sA->m);
+ if(aj+n > sA->n) printf("\n***** trsv_ltn_mn_libstr : aj+n > col(A) : %d+%d > %d *****\n", aj, n, sA->n);
+ // x: m
+ if(xi+m > sx->m) printf("\n***** trsv_ltn_mn_libstr : xi+m > size(x) : %d+%d > %d *****\n", xi, m, sx->m);
+ // z: m
+ if(zi+m > sz->m) printf("\n***** trsv_ltn_mn_libstr : zi+m > size(z) : %d+%d > %d *****\n", zi, m, sz->m);
+#endif
+ char cl = 'l';
+ char cn = 'n';
+ char cr = 'r';
+ char ct = 't';
+ char cu = 'u';
+ int i1 = 1;
+ REAL d1 = 1.0;
+ REAL dm1 = -1.0;
+ int mmn = m-n;
+ int lda = sA->m;
+ REAL *pA = sA->pA + ai + aj*lda;
+ REAL *x = sx->pa + xi;
+ REAL *z = sz->pa + zi;
+ COPY(&m, x, &i1, z, &i1);
+ GEMV(&ct, &mmn, &n, &dm1, pA+n, &lda, z+n, &i1, &d1, z, &i1);
+ TRSV(&cl, &ct, &cn, &n, pA, &lda, z, &i1);
+ return;
+ }
+
+
+
+void TRSV_LNN_LIBSTR(int m, struct STRMAT *sA, int ai, int aj, struct STRVEC *sx, int xi, struct STRVEC *sz, int zi)
+ {
+ if(m==0)
+ return;
+#if defined(DIM_CHECK)
+ // non-negative size
+ if(m<0) printf("\n****** trsv_lnn_libstr : m<0 : %d<0 *****\n", m);
+ // non-negative offset
+ if(ai<0) printf("\n****** trsv_lnn_libstr : ai<0 : %d<0 *****\n", ai);
+ if(aj<0) printf("\n****** trsv_lnn_libstr : aj<0 : %d<0 *****\n", aj);
+ if(xi<0) printf("\n****** trsv_lnn_libstr : xi<0 : %d<0 *****\n", xi);
+ if(zi<0) printf("\n****** trsv_lnn_libstr : zi<0 : %d<0 *****\n", zi);
+ // inside matrix
+ // A: m x k
+ if(ai+m > sA->m) printf("\n***** trsv_lnn_libstr : ai+m > row(A) : %d+%d > %d *****\n", ai, m, sA->m);
+ if(aj+m > sA->n) printf("\n***** trsv_lnn_libstr : aj+m > col(A) : %d+%d > %d *****\n", aj, m, sA->n);
+ // x: m
+ if(xi+m > sx->m) printf("\n***** trsv_lnn_libstr : xi+m > size(x) : %d+%d > %d *****\n", xi, m, sx->m);
+ // z: m
+ if(zi+m > sz->m) printf("\n***** trsv_lnn_libstr : zi+m > size(z) : %d+%d > %d *****\n", zi, m, sz->m);
+#endif
+ char cl = 'l';
+ char cn = 'n';
+ char cr = 'r';
+ char ct = 't';
+ char cu = 'u';
+ int i1 = 1;
+ REAL d1 = 1.0;
+ REAL dm1 = -1.0;
+ int lda = sA->m;
+ REAL *pA = sA->pA + ai + aj*lda;
+ REAL *x = sx->pa + xi;
+ REAL *z = sz->pa + zi;
+ COPY(&m, x, &i1, z, &i1);
+ TRSV(&cl, &cn, &cn, &m, pA, &lda, z, &i1);
+ return;
+ }
+
+
+
+void TRSV_LNU_LIBSTR(int m, struct STRMAT *sA, int ai, int aj, struct STRVEC *sx, int xi, struct STRVEC *sz, int zi)
+ {
+ if(m==0)
+ return;
+#if defined(DIM_CHECK)
+ // non-negative size
+ if(m<0) printf("\n****** trsv_lnu_libstr : m<0 : %d<0 *****\n", m);
+ // non-negative offset
+ if(ai<0) printf("\n****** trsv_lnu_libstr : ai<0 : %d<0 *****\n", ai);
+ if(aj<0) printf("\n****** trsv_lnu_libstr : aj<0 : %d<0 *****\n", aj);
+ if(xi<0) printf("\n****** trsv_lnu_libstr : xi<0 : %d<0 *****\n", xi);
+ if(zi<0) printf("\n****** trsv_lnu_libstr : zi<0 : %d<0 *****\n", zi);
+ // inside matrix
+ // A: m x k
+ if(ai+m > sA->m) printf("\n***** trsv_lnu_libstr : ai+m > row(A) : %d+%d > %d *****\n", ai, m, sA->m);
+ if(aj+m > sA->n) printf("\n***** trsv_lnu_libstr : aj+m > col(A) : %d+%d > %d *****\n", aj, m, sA->n);
+ // x: m
+ if(xi+m > sx->m) printf("\n***** trsv_lnu_libstr : xi+m > size(x) : %d+%d > %d *****\n", xi, m, sx->m);
+ // z: m
+ if(zi+m > sz->m) printf("\n***** trsv_lnu_libstr : zi+m > size(z) : %d+%d > %d *****\n", zi, m, sz->m);
+#endif
+ char cl = 'l';
+ char cn = 'n';
+ char cr = 'r';
+ char ct = 't';
+ char cu = 'u';
+ int i1 = 1;
+ REAL d1 = 1.0;
+ REAL dm1 = -1.0;
+ int lda = sA->m;
+ REAL *pA = sA->pA + ai + aj*lda;
+ REAL *x = sx->pa + xi;
+ REAL *z = sz->pa + zi;
+ COPY(&m, x, &i1, z, &i1);
+ TRSV(&cl, &cn, &cu, &m, pA, &lda, z, &i1);
+ return;
+ }
+
+
+
+void TRSV_LTN_LIBSTR(int m, struct STRMAT *sA, int ai, int aj, struct STRVEC *sx, int xi, struct STRVEC *sz, int zi)
+ {
+ if(m==0)
+ return;
+#if defined(DIM_CHECK)
+ // non-negative size
+ if(m<0) printf("\n****** trsv_ltn_libstr : m<0 : %d<0 *****\n", m);
+ // non-negative offset
+ if(ai<0) printf("\n****** trsv_ltn_libstr : ai<0 : %d<0 *****\n", ai);
+ if(aj<0) printf("\n****** trsv_ltn_libstr : aj<0 : %d<0 *****\n", aj);
+ if(xi<0) printf("\n****** trsv_ltn_libstr : xi<0 : %d<0 *****\n", xi);
+ if(zi<0) printf("\n****** trsv_ltn_libstr : zi<0 : %d<0 *****\n", zi);
+ // inside matrix
+ // A: m x k
+ if(ai+m > sA->m) printf("\n***** trsv_ltn_libstr : ai+m > row(A) : %d+%d > %d *****\n", ai, m, sA->m);
+ if(aj+m > sA->n) printf("\n***** trsv_ltn_libstr : aj+m > col(A) : %d+%d > %d *****\n", aj, m, sA->n);
+ // x: m
+ if(xi+m > sx->m) printf("\n***** trsv_ltn_libstr : xi+m > size(x) : %d+%d > %d *****\n", xi, m, sx->m);
+ // z: m
+ if(zi+m > sz->m) printf("\n***** trsv_ltn_libstr : zi+m > size(z) : %d+%d > %d *****\n", zi, m, sz->m);
+#endif
+ char cl = 'l';
+ char cn = 'n';
+ char cr = 'r';
+ char ct = 't';
+ char cu = 'u';
+ int i1 = 1;
+ REAL d1 = 1.0;
+ REAL dm1 = -1.0;
+ int lda = sA->m;
+ REAL *pA = sA->pA + ai + aj*lda;
+ REAL *x = sx->pa + xi;
+ REAL *z = sz->pa + zi;
+ COPY(&m, x, &i1, z, &i1);
+ TRSV(&cl, &ct, &cn, &m, pA, &lda, z, &i1);
+ return;
+ }
+
+
+
+void TRSV_LTU_LIBSTR(int m, struct STRMAT *sA, int ai, int aj, struct STRVEC *sx, int xi, struct STRVEC *sz, int zi)
+ {
+ if(m==0)
+ return;
+#if defined(DIM_CHECK)
+ // non-negative size
+ if(m<0) printf("\n****** trsv_ltu_libstr : m<0 : %d<0 *****\n", m);
+ // non-negative offset
+ if(ai<0) printf("\n****** trsv_ltu_libstr : ai<0 : %d<0 *****\n", ai);
+ if(aj<0) printf("\n****** trsv_ltu_libstr : aj<0 : %d<0 *****\n", aj);
+ if(xi<0) printf("\n****** trsv_ltu_libstr : xi<0 : %d<0 *****\n", xi);
+ if(zi<0) printf("\n****** trsv_ltu_libstr : zi<0 : %d<0 *****\n", zi);
+ // inside matrix
+ // A: m x k
+ if(ai+m > sA->m) printf("\n***** trsv_ltu_libstr : ai+m > row(A) : %d+%d > %d *****\n", ai, m, sA->m);
+ if(aj+m > sA->n) printf("\n***** trsv_ltu_libstr : aj+m > col(A) : %d+%d > %d *****\n", aj, m, sA->n);
+ // x: m
+ if(xi+m > sx->m) printf("\n***** trsv_ltu_libstr : xi+m > size(x) : %d+%d > %d *****\n", xi, m, sx->m);
+ // z: m
+ if(zi+m > sz->m) printf("\n***** trsv_ltu_libstr : zi+m > size(z) : %d+%d > %d *****\n", zi, m, sz->m);
+#endif
+ char cl = 'l';
+ char cn = 'n';
+ char cr = 'r';
+ char ct = 't';
+ char cu = 'u';
+ int i1 = 1;
+ REAL d1 = 1.0;
+ REAL dm1 = -1.0;
+ int lda = sA->m;
+ REAL *pA = sA->pA + ai + aj*lda;
+ REAL *x = sx->pa + xi;
+ REAL *z = sz->pa + zi;
+ COPY(&m, x, &i1, z, &i1);
+ TRSV(&cl, &ct, &cu, &m, pA, &lda, z, &i1);
+ return;
+ }
+
+
+
+void TRSV_UNN_LIBSTR(int m, struct STRMAT *sA, int ai, int aj, struct STRVEC *sx, int xi, struct STRVEC *sz, int zi)
+ {
+ if(m==0)
+ return;
+#if defined(DIM_CHECK)
+ // non-negative size
+ if(m<0) printf("\n****** trsv_unn_libstr : m<0 : %d<0 *****\n", m);
+ // non-negative offset
+ if(ai<0) printf("\n****** trsv_unn_libstr : ai<0 : %d<0 *****\n", ai);
+ if(aj<0) printf("\n****** trsv_unn_libstr : aj<0 : %d<0 *****\n", aj);
+ if(xi<0) printf("\n****** trsv_unn_libstr : xi<0 : %d<0 *****\n", xi);
+ if(zi<0) printf("\n****** trsv_unn_libstr : zi<0 : %d<0 *****\n", zi);
+ // inside matrix
+ // A: m x k
+ if(ai+m > sA->m) printf("\n***** trsv_unn_libstr : ai+m > row(A) : %d+%d > %d *****\n", ai, m, sA->m);
+ if(aj+m > sA->n) printf("\n***** trsv_unn_libstr : aj+m > col(A) : %d+%d > %d *****\n", aj, m, sA->n);
+ // x: m
+ if(xi+m > sx->m) printf("\n***** trsv_unn_libstr : xi+m > size(x) : %d+%d > %d *****\n", xi, m, sx->m);
+ // z: m
+ if(zi+m > sz->m) printf("\n***** trsv_unn_libstr : zi+m > size(z) : %d+%d > %d *****\n", zi, m, sz->m);
+#endif
+ char cl = 'l';
+ char cn = 'n';
+ char cr = 'r';
+ char ct = 't';
+ char cu = 'u';
+ int i1 = 1;
+ REAL d1 = 1.0;
+ REAL dm1 = -1.0;
+ int lda = sA->m;
+ REAL *pA = sA->pA + ai + aj*lda;
+ REAL *x = sx->pa + xi;
+ REAL *z = sz->pa + zi;
+ COPY(&m, x, &i1, z, &i1);
+ TRSV(&cu, &cn, &cn, &m, pA, &lda, z, &i1);
+ return;
+ }
+
+
+
+void TRSV_UTN_LIBSTR(int m, struct STRMAT *sA, int ai, int aj, struct STRVEC *sx, int xi, struct STRVEC *sz, int zi)
+ {
+ if(m==0)
+ return;
+#if defined(DIM_CHECK)
+ // non-negative size
+ if(m<0) printf("\n****** trsv_utn_libstr : m<0 : %d<0 *****\n", m);
+ // non-negative offset
+ if(ai<0) printf("\n****** trsv_utn_libstr : ai<0 : %d<0 *****\n", ai);
+ if(aj<0) printf("\n****** trsv_utn_libstr : aj<0 : %d<0 *****\n", aj);
+ if(xi<0) printf("\n****** trsv_utn_libstr : xi<0 : %d<0 *****\n", xi);
+ if(zi<0) printf("\n****** trsv_utn_libstr : zi<0 : %d<0 *****\n", zi);
+ // inside matrix
+ // A: m x k
+ if(ai+m > sA->m) printf("\n***** trsv_utn_libstr : ai+m > row(A) : %d+%d > %d *****\n", ai, m, sA->m);
+ if(aj+m > sA->n) printf("\n***** trsv_utn_libstr : aj+m > col(A) : %d+%d > %d *****\n", aj, m, sA->n);
+ // x: m
+ if(xi+m > sx->m) printf("\n***** trsv_utn_libstr : xi+m > size(x) : %d+%d > %d *****\n", xi, m, sx->m);
+ // z: m
+ if(zi+m > sz->m) printf("\n***** trsv_utn_libstr : zi+m > size(z) : %d+%d > %d *****\n", zi, m, sz->m);
+#endif
+ char cl = 'l';
+ char cn = 'n';
+ char cr = 'r';
+ char ct = 't';
+ char cu = 'u';
+ int i1 = 1;
+ REAL d1 = 1.0;
+ REAL dm1 = -1.0;
+ int lda = sA->m;
+ REAL *pA = sA->pA + ai + aj*lda;
+ REAL *x = sx->pa + xi;
+ REAL *z = sz->pa + zi;
+ COPY(&m, x, &i1, z, &i1);
+ TRSV(&cu, &ct, &cn, &m, pA, &lda, z, &i1);
+ return;
+ }
+
+
+
+#else
+
+#error : wrong LA choice
+
+#endif
+
+
diff --git a/blas/x_blas3_diag_lib.c b/blas/x_blas3_diag_lib.c
new file mode 100644
index 0000000..d5cce93
--- /dev/null
+++ b/blas/x_blas3_diag_lib.c
@@ -0,0 +1,170 @@
+/**************************************************************************************************
+* *
+* This file is part of BLASFEO. *
+* *
+* BLASFEO -- BLAS For Embedded Optimization. *
+* Copyright (C) 2016-2017 by Gianluca Frison. *
+* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl. *
+* All rights reserved. *
+* *
+* HPMPC is free software; you can redistribute it and/or *
+* modify it under the terms of the GNU Lesser General Public *
+* License as published by the Free Software Foundation; either *
+* version 2.1 of the License, or (at your option) any later version. *
+* *
+* HPMPC is distributed in the hope that it will be useful, *
+* but WITHOUT ANY WARRANTY; without even the implied warranty of *
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. *
+* See the GNU Lesser General Public License for more details. *
+* *
+* You should have received a copy of the GNU Lesser General Public *
+* License along with HPMPC; if not, write to the Free Software *
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA *
+* *
+* Author: Gianluca Frison, giaf (at) dtu.dk *
+* gianluca.frison (at) imtek.uni-freiburg.de *
+* *
+**************************************************************************************************/
+
+
+
+#if defined(LA_REFERENCE) | defined(LA_BLAS)
+
+
+
+// dgemm with A diagonal matrix (stored as strvec)
+void GEMM_L_DIAG_LIBSTR(int m, int n, REAL alpha, struct STRVEC *sA, int ai, struct STRMAT *sB, int bi, int bj, REAL beta, struct STRMAT *sC, int ci, int cj, struct STRMAT *sD, int di, int dj)
+ {
+ if(m<=0 | n<=0)
+ return;
+ int ii, jj;
+ int ldb = sB->m;
+ int ldd = sD->m;
+ REAL *dA = sA->pa + ai;
+ REAL *pB = sB->pA + bi + bj*ldb;
+ REAL *pD = sD->pA + di + dj*ldd;
+ REAL a0, a1;
+ if(beta==0.0)
+ {
+ ii = 0;
+ for(; ii<m-1; ii+=2)
+ {
+ a0 = alpha * dA[ii+0];
+ a1 = alpha * dA[ii+1];
+ for(jj=0; jj<n; jj++)
+ {
+ pD[ii+0+ldd*jj] = a0 * pB[ii+0+ldb*jj];
+ pD[ii+1+ldd*jj] = a1 * pB[ii+1+ldb*jj];
+ }
+ }
+ for(; ii<m; ii++)
+ {
+ a0 = alpha * dA[ii];
+ for(jj=0; jj<n; jj++)
+ {
+ pD[ii+0+ldd*jj] = a0 * pB[ii+0+ldb*jj];
+ }
+ }
+ }
+ else
+ {
+ int ldc = sC->m;
+ REAL *pC = sC->pA + ci + cj*ldc;
+ ii = 0;
+ for(; ii<m-1; ii+=2)
+ {
+ a0 = alpha * dA[ii+0];
+ a1 = alpha * dA[ii+1];
+ for(jj=0; jj<n; jj++)
+ {
+ pD[ii+0+ldd*jj] = a0 * pB[ii+0+ldb*jj] + beta * pC[ii+0+ldc*jj];
+ pD[ii+1+ldd*jj] = a1 * pB[ii+1+ldb*jj] + beta * pC[ii+1+ldc*jj];
+ }
+ }
+ for(; ii<m; ii++)
+ {
+ a0 = alpha * dA[ii];
+ for(jj=0; jj<n; jj++)
+ {
+ pD[ii+0+ldd*jj] = a0 * pB[ii+0+ldb*jj] + beta * pC[ii+0+ldc*jj];
+ }
+ }
+ }
+ return;
+ }
+
+
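+// gemm_l_diag computes D = alpha * diag(a) * B + beta * C, i.e. row ii of B
+// scaled by alpha*a[ii]; the code above only unrolls two rows at a time. An
+// equivalent naive version (illustrative only):
+//
+//	for(jj=0; jj<n; jj++)
+//		for(ii=0; ii<m; ii++)
+//			pD[ii+ldd*jj] = alpha * dA[ii] * pB[ii+ldb*jj] + beta * pC[ii+ldc*jj];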
+
+// dgemm with B diagonal matrix (stored as strvec)
+void GEMM_R_DIAG_LIBSTR(int m, int n, REAL alpha, struct STRMAT *sA, int ai, int aj, struct STRVEC *sB, int bi, REAL beta, struct STRMAT *sC, int ci, int cj, struct STRMAT *sD, int di, int dj)
+ {
+ if(m<=0 | n<=0)
+ return;
+ int ii, jj;
+ int lda = sA->m;
+ int ldd = sD->m;
+ REAL *pA = sA->pA + ai + aj*lda;
+ REAL *dB = sB->pa + bi;
+ REAL *pD = sD->pA + di + dj*ldd;
+ REAL a0, a1;
+	if(beta==0.0)
+ {
+ jj = 0;
+ for(; jj<n-1; jj+=2)
+ {
+ a0 = alpha * dB[jj+0];
+ a1 = alpha * dB[jj+1];
+ for(ii=0; ii<m; ii++)
+ {
+ pD[ii+ldd*(jj+0)] = a0 * pA[ii+lda*(jj+0)];
+ pD[ii+ldd*(jj+1)] = a1 * pA[ii+lda*(jj+1)];
+ }
+ }
+ for(; jj<n; jj++)
+ {
+ a0 = alpha * dB[jj+0];
+ for(ii=0; ii<m; ii++)
+ {
+ pD[ii+ldd*(jj+0)] = a0 * pA[ii+lda*(jj+0)];
+ }
+ }
+ }
+ else
+ {
+ int ldc = sC->m;
+ REAL *pC = sC->pA + ci + cj*ldc;
+ jj = 0;
+ for(; jj<n-1; jj+=2)
+ {
+ a0 = alpha * dB[jj+0];
+ a1 = alpha * dB[jj+1];
+ for(ii=0; ii<m; ii++)
+ {
+ pD[ii+ldd*(jj+0)] = a0 * pA[ii+lda*(jj+0)] + beta * pC[ii+ldc*(jj+0)];
+ pD[ii+ldd*(jj+1)] = a1 * pA[ii+lda*(jj+1)] + beta * pC[ii+ldc*(jj+1)];
+ }
+ }
+ for(; jj<n; jj++)
+ {
+ a0 = alpha * dB[jj+0];
+ for(ii=0; ii<m; ii++)
+ {
+ pD[ii+ldd*(jj+0)] = a0 * pA[ii+lda*(jj+0)] + beta * pC[ii+ldc*(jj+0)];
+ }
+ }
+ }
+ return;
+ }
+
+
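+// gemm_r_diag is the column-wise mirror image: D = alpha * A * diag(b) + beta * C,
+// so column jj of A is scaled by alpha*b[jj] and the two-way unrolling runs
+// over column pairs instead of row pairs.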
+
+#else
+
+#error : wrong LA choice
+
+#endif
+
+
+
+
+
diff --git a/blas/x_blas3_lib.c b/blas/x_blas3_lib.c
new file mode 100644
index 0000000..29a33c7
--- /dev/null
+++ b/blas/x_blas3_lib.c
@@ -0,0 +1,1531 @@
+/**************************************************************************************************
+* *
+* This file is part of BLASFEO. *
+* *
+* BLASFEO -- BLAS For Embedded Optimization. *
+* Copyright (C) 2016-2017 by Gianluca Frison. *
+* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl. *
+* All rights reserved. *
+* *
+* HPMPC is free software; you can redistribute it and/or *
+* modify it under the terms of the GNU Lesser General Public *
+* License as published by the Free Software Foundation; either *
+* version 2.1 of the License, or (at your option) any later version. *
+* *
+* HPMPC is distributed in the hope that it will be useful, *
+* but WITHOUT ANY WARRANTY; without even the implied warranty of *
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. *
+* See the GNU Lesser General Public License for more details. *
+* *
+* You should have received a copy of the GNU Lesser General Public *
+* License along with HPMPC; if not, write to the Free Software *
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA *
+* *
+* Author: Gianluca Frison, giaf (at) dtu.dk *
+* gianluca.frison (at) imtek.uni-freiburg.de *
+* *
+**************************************************************************************************/
+
+
+
+#if defined(LA_REFERENCE)
+
+
+
+// dgemm nt
+void GEMM_NT_LIBSTR(int m, int n, int k, REAL alpha, struct STRMAT *sA, int ai, int aj, struct STRMAT *sB, int bi, int bj, REAL beta, struct STRMAT *sC, int ci, int cj, struct STRMAT *sD, int di, int dj)
+ {
+ if(m<=0 | n<=0)
+ return;
+ int ii, jj, kk;
+ REAL
+ c_00, c_01,
+ c_10, c_11;
+ int lda = sA->m;
+ int ldb = sB->m;
+ int ldc = sC->m;
+ int ldd = sD->m;
+ REAL *pA = sA->pA + ai + aj*lda;
+ REAL *pB = sB->pA + bi + bj*ldb;
+ REAL *pC = sC->pA + ci + cj*ldc;
+ REAL *pD = sD->pA + di + dj*ldd;
+ jj = 0;
+ for(; jj<n-1; jj+=2)
+ {
+ ii = 0;
+ for(; ii<m-1; ii+=2)
+ {
+ c_00 = 0.0;
+ c_10 = 0.0;
+ c_01 = 0.0;
+ c_11 = 0.0;
+ for(kk=0; kk<k; kk++)
+ {
+ c_00 += pA[(ii+0)+lda*kk] * pB[(jj+0)+ldb*kk];
+ c_10 += pA[(ii+1)+lda*kk] * pB[(jj+0)+ldb*kk];
+ c_01 += pA[(ii+0)+lda*kk] * pB[(jj+1)+ldb*kk];
+ c_11 += pA[(ii+1)+lda*kk] * pB[(jj+1)+ldb*kk];
+ }
+ pD[(ii+0)+ldd*(jj+0)] = alpha * c_00 + beta * pC[(ii+0)+ldc*(jj+0)];
+ pD[(ii+1)+ldd*(jj+0)] = alpha * c_10 + beta * pC[(ii+1)+ldc*(jj+0)];
+ pD[(ii+0)+ldd*(jj+1)] = alpha * c_01 + beta * pC[(ii+0)+ldc*(jj+1)];
+ pD[(ii+1)+ldd*(jj+1)] = alpha * c_11 + beta * pC[(ii+1)+ldc*(jj+1)];
+ }
+ for(; ii<m; ii++)
+ {
+ c_00 = 0.0;
+ c_01 = 0.0;
+ for(kk=0; kk<k; kk++)
+ {
+ c_00 += pA[(ii+0)+lda*kk] * pB[(jj+0)+ldb*kk];
+ c_01 += pA[(ii+0)+lda*kk] * pB[(jj+1)+ldb*kk];
+ }
+ pD[(ii+0)+ldd*(jj+0)] = alpha * c_00 + beta * pC[(ii+0)+ldc*(jj+0)];
+ pD[(ii+0)+ldd*(jj+1)] = alpha * c_01 + beta * pC[(ii+0)+ldc*(jj+1)];
+ }
+ }
+ for(; jj<n; jj++)
+ {
+ ii = 0;
+ for(; ii<m-1; ii+=2)
+ {
+ c_00 = 0.0;
+ c_10 = 0.0;
+ for(kk=0; kk<k; kk++)
+ {
+ c_00 += pA[(ii+0)+lda*kk] * pB[(jj+0)+ldb*kk];
+ c_10 += pA[(ii+1)+lda*kk] * pB[(jj+0)+ldb*kk];
+ }
+ pD[(ii+0)+ldd*(jj+0)] = alpha * c_00 + beta * pC[(ii+0)+ldc*(jj+0)];
+ pD[(ii+1)+ldd*(jj+0)] = alpha * c_10 + beta * pC[(ii+1)+ldc*(jj+0)];
+ }
+ for(; ii<m; ii++)
+ {
+ c_00 = 0.0;
+ for(kk=0; kk<k; kk++)
+ {
+ c_00 += pA[(ii+0)+lda*kk] * pB[(jj+0)+ldb*kk];
+ }
+ pD[(ii+0)+ldd*(jj+0)] = alpha * c_00 + beta * pC[(ii+0)+ldc*(jj+0)];
+ }
+ }
+ return;
+ }
+
+
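+// The double unrolling above keeps a 2x2 block of the result in the scalar
+// accumulators c_00..c_11, so every pA/pB element loaded in the kk loop is
+// reused twice; per block (A is m x k, B is n x k) the update is
+//
+//	D(ii:ii+1, jj:jj+1) = alpha * A(ii:ii+1, :) * B(jj:jj+1, :)^T
+//	                    + beta  * C(ii:ii+1, jj:jj+1)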
+
+// dgemm nn
+void GEMM_NN_LIBSTR(int m, int n, int k, REAL alpha, struct STRMAT *sA, int ai, int aj, struct STRMAT *sB, int bi, int bj, REAL beta, struct STRMAT *sC, int ci, int cj, struct STRMAT *sD, int di, int dj)
+ {
+ if(m<=0 | n<=0)
+ return;
+ int ii, jj, kk;
+ REAL
+ c_00, c_01,
+ c_10, c_11;
+ int lda = sA->m;
+ int ldb = sB->m;
+ int ldc = sC->m;
+ int ldd = sD->m;
+ REAL *pA = sA->pA + ai + aj*lda;
+ REAL *pB = sB->pA + bi + bj*ldb;
+ REAL *pC = sC->pA + ci + cj*ldc;
+ REAL *pD = sD->pA + di + dj*ldd;
+ jj = 0;
+ for(; jj<n-1; jj+=2)
+ {
+ ii = 0;
+ for(; ii<m-1; ii+=2)
+ {
+			c_00 = 0.0;
+			c_10 = 0.0;
+			c_01 = 0.0;
+			c_11 = 0.0;
+ for(kk=0; kk<k; kk++)
+ {
+ c_00 += pA[(ii+0)+lda*kk] * pB[kk+ldb*(jj+0)];
+ c_10 += pA[(ii+1)+lda*kk] * pB[kk+ldb*(jj+0)];
+ c_01 += pA[(ii+0)+lda*kk] * pB[kk+ldb*(jj+1)];
+ c_11 += pA[(ii+1)+lda*kk] * pB[kk+ldb*(jj+1)];
+ }
+ pD[(ii+0)+ldd*(jj+0)] = alpha * c_00 + beta * pC[(ii+0)+ldc*(jj+0)];
+ pD[(ii+1)+ldd*(jj+0)] = alpha * c_10 + beta * pC[(ii+1)+ldc*(jj+0)];
+ pD[(ii+0)+ldd*(jj+1)] = alpha * c_01 + beta * pC[(ii+0)+ldc*(jj+1)];
+ pD[(ii+1)+ldd*(jj+1)] = alpha * c_11 + beta * pC[(ii+1)+ldc*(jj+1)];
+ }
+ for(; ii<m; ii++)
+ {
+			c_00 = 0.0;
+			c_01 = 0.0;
+ for(kk=0; kk<k; kk++)
+ {
+ c_00 += pA[(ii+0)+lda*kk] * pB[kk+ldb*(jj+0)];
+ c_01 += pA[(ii+0)+lda*kk] * pB[kk+ldb*(jj+1)];
+ }
+ pD[(ii+0)+ldd*(jj+0)] = alpha * c_00 + beta * pC[(ii+0)+ldc*(jj+0)];
+ pD[(ii+0)+ldd*(jj+1)] = alpha * c_01 + beta * pC[(ii+0)+ldc*(jj+1)];
+ }
+ }
+ for(; jj<n; jj++)
+ {
+ ii = 0;
+ for(; ii<m-1; ii+=2)
+ {
+			c_00 = 0.0;
+			c_10 = 0.0;
+ for(kk=0; kk<k; kk++)
+ {
+ c_00 += pA[(ii+0)+lda*kk] * pB[kk+ldb*(jj+0)];
+ c_10 += pA[(ii+1)+lda*kk] * pB[kk+ldb*(jj+0)];
+ }
+ pD[(ii+0)+ldd*(jj+0)] = alpha * c_00 + beta * pC[(ii+0)+ldc*(jj+0)];
+ pD[(ii+1)+ldd*(jj+0)] = alpha * c_10 + beta * pC[(ii+1)+ldc*(jj+0)];
+ }
+ for(; ii<m; ii++)
+ {
+			c_00 = 0.0;
+ for(kk=0; kk<k; kk++)
+ {
+ c_00 += pA[(ii+0)+lda*kk] * pB[kk+ldb*(jj+0)];
+ }
+ pD[(ii+0)+ldd*(jj+0)] = alpha * c_00 + beta * pC[(ii+0)+ldc*(jj+0)];
+ }
+ }
+ return;
+ }
+
+
+
+// dtrsm_left_lower_nottransposed_unit
+void TRSM_LLNU_LIBSTR(int m, int n, REAL alpha, struct STRMAT *sA, int ai, int aj, struct STRMAT *sB, int bi, int bj, struct STRMAT *sD, int di, int dj)
+ {
+ if(m<=0 | n<=0)
+ return;
+ int ii, jj, kk;
+ REAL
+ d_00, d_01,
+ d_10, d_11;
+ int lda = sA->m;
+ int ldb = sB->m;
+ int ldd = sD->m;
+ REAL *pA = sA->pA + ai + aj*lda; // triangular
+ REAL *pB = sB->pA + bi + bj*ldb;
+ REAL *pD = sD->pA + di + dj*ldd;
+#if 1
+ // solve
+ jj = 0;
+ for(; jj<n-1; jj+=2)
+ {
+ ii = 0;
+ for(; ii<m-1; ii+=2)
+ {
+ d_00 = alpha * pB[ii+0+ldb*(jj+0)];
+ d_10 = alpha * pB[ii+1+ldb*(jj+0)];
+ d_01 = alpha * pB[ii+0+ldb*(jj+1)];
+ d_11 = alpha * pB[ii+1+ldb*(jj+1)];
+ kk = 0;
+#if 0
+ for(; kk<ii-1; kk+=2)
+ {
+ d_00 -= pA[ii+0+lda*(kk+0)] * pD[kk+ldd*(jj+0)];
+ d_10 -= pA[ii+1+lda*(kk+0)] * pD[kk+ldd*(jj+0)];
+ d_01 -= pA[ii+0+lda*(kk+0)] * pD[kk+ldd*(jj+1)];
+ d_11 -= pA[ii+1+lda*(kk+0)] * pD[kk+ldd*(jj+1)];
+				d_00 -= pA[ii+0+lda*(kk+1)] * pD[kk+1+ldd*(jj+0)];
+				d_10 -= pA[ii+1+lda*(kk+1)] * pD[kk+1+ldd*(jj+0)];
+				d_01 -= pA[ii+0+lda*(kk+1)] * pD[kk+1+ldd*(jj+1)];
+				d_11 -= pA[ii+1+lda*(kk+1)] * pD[kk+1+ldd*(jj+1)];
+ }
+ if(kk<ii)
+#else
+ for(; kk<ii; kk++)
+#endif
+ {
+ d_00 -= pA[ii+0+lda*kk] * pD[kk+ldd*(jj+0)];
+ d_10 -= pA[ii+1+lda*kk] * pD[kk+ldd*(jj+0)];
+ d_01 -= pA[ii+0+lda*kk] * pD[kk+ldd*(jj+1)];
+ d_11 -= pA[ii+1+lda*kk] * pD[kk+ldd*(jj+1)];
+ }
+ d_10 -= pA[ii+1+lda*kk] * d_00;
+ d_11 -= pA[ii+1+lda*kk] * d_01;
+ pD[ii+0+ldd*(jj+0)] = d_00;
+ pD[ii+1+ldd*(jj+0)] = d_10;
+ pD[ii+0+ldd*(jj+1)] = d_01;
+ pD[ii+1+ldd*(jj+1)] = d_11;
+ }
+ for(; ii<m; ii++)
+ {
+ d_00 = alpha * pB[ii+ldb*(jj+0)];
+ d_01 = alpha * pB[ii+ldb*(jj+1)];
+ for(kk=0; kk<ii; kk++)
+ {
+ d_00 -= pA[ii+lda*kk] * pD[kk+ldd*(jj+0)];
+ d_01 -= pA[ii+lda*kk] * pD[kk+ldd*(jj+1)];
+ }
+ pD[ii+ldd*(jj+0)] = d_00;
+ pD[ii+ldd*(jj+1)] = d_01;
+ }
+ }
+ for(; jj<n; jj++)
+ {
+ ii = 0;
+ for(; ii<m-1; ii+=2)
+ {
+ d_00 = alpha * pB[ii+0+ldb*jj];
+ d_10 = alpha * pB[ii+1+ldb*jj];
+ for(kk=0; kk<ii; kk++)
+ {
+ d_00 -= pA[ii+0+lda*kk] * pD[kk+ldd*jj];
+ d_10 -= pA[ii+1+lda*kk] * pD[kk+ldd*jj];
+ }
+ d_10 -= pA[ii+1+lda*kk] * d_00;
+ pD[ii+0+ldd*jj] = d_00;
+ pD[ii+1+ldd*jj] = d_10;
+ }
+ for(; ii<m; ii++)
+ {
+ d_00 = alpha * pB[ii+ldb*jj];
+ for(kk=0; kk<ii; kk++)
+ {
+ d_00 -= pA[ii+lda*kk] * pD[kk+ldd*jj];
+ }
+ pD[ii+ldd*jj] = d_00;
+ }
+ }
+#else
+ // copy
+ if(!(pB==pD))
+ {
+ for(jj=0; jj<n; jj++)
+ for(ii=0; ii<m; ii++)
+ pD[ii+ldd*jj] = alpha * pB[ii+ldb*jj];
+ }
+ for(jj=0; jj<n; jj++)
+ {
+ ii = 0;
+ for(; ii<m; ii++)
+ {
+ d_00 = pD[ii+ldd*jj];
+ for(kk=ii+1; kk<m; kk++)
+ {
+ pD[kk+ldd*jj] -= pA[kk+lda*ii] * d_00;
+ }
+ }
+ }
+#endif
+ return;
+ }
+
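+// trsm_llnu is forward substitution on each right-hand-side column; the unit
+// diagonal makes any division unnecessary:
+//
+//	D[ii,jj] = alpha * B[ii,jj] - sum_{kk<ii} A[ii,kk] * D[kk,jj]
+//
+// The disabled #else branch keeps the same arithmetic in its naive,
+// column-sweep form.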
+
+
+// dtrsm_left_upper_nottransposed_notunit
+void TRSM_LUNN_LIBSTR(int m, int n, REAL alpha, struct STRMAT *sA, int ai, int aj, struct STRMAT *sB, int bi, int bj, struct STRMAT *sD, int di, int dj)
+ {
+ if(m<=0 | n<=0)
+ return;
+ int ii, jj, kk, id;
+ REAL
+ d_00, d_01,
+ d_10, d_11;
+ int lda = sA->m;
+ int ldb = sB->m;
+ int ldd = sD->m;
+ REAL *pA = sA->pA + ai + aj*lda; // triangular
+ REAL *pB = sB->pA + bi + bj*ldb;
+ REAL *pD = sD->pA + di + dj*ldd;
+ REAL *dA = sA->dA;
+ if(!(sA->use_dA==1 & ai==0 & aj==0))
+ {
+		// invert the diagonal of pA
+		for(ii=0; ii<m; ii++)
+			dA[ii] = 1.0/pA[ii+lda*ii];
+		// valid for this call only: the inverted diagonal refers to a submatrix
+		sA->use_dA = 0;
+ }
+#if 1
+ jj = 0;
+ for(; jj<n-1; jj+=2)
+ {
+ ii = 0;
+ for(; ii<m-1; ii+=2)
+ {
+ id = m-ii-2;
+ d_00 = alpha * pB[id+0+ldb*(jj+0)];
+ d_10 = alpha * pB[id+1+ldb*(jj+0)];
+ d_01 = alpha * pB[id+0+ldb*(jj+1)];
+ d_11 = alpha * pB[id+1+ldb*(jj+1)];
+ kk = id+2;
+#if 0
+ for(; kk<m-1; kk+=2)
+ {
+ d_00 -= pA[id+0+lda*(kk+0)] * pD[kk+0+ldd*(jj+0)];
+ d_10 -= pA[id+1+lda*(kk+0)] * pD[kk+0+ldd*(jj+0)];
+ d_01 -= pA[id+0+lda*(kk+0)] * pD[kk+0+ldd*(jj+1)];
+ d_11 -= pA[id+1+lda*(kk+0)] * pD[kk+0+ldd*(jj+1)];
+ d_00 -= pA[id+0+lda*(kk+1)] * pD[kk+1+ldd*(jj+0)];
+ d_10 -= pA[id+1+lda*(kk+1)] * pD[kk+1+ldd*(jj+0)];
+ d_01 -= pA[id+0+lda*(kk+1)] * pD[kk+1+ldd*(jj+1)];
+ d_11 -= pA[id+1+lda*(kk+1)] * pD[kk+1+ldd*(jj+1)];
+ }
+ if(kk<m)
+#else
+ for(; kk<m; kk++)
+#endif
+ {
+ d_00 -= pA[id+0+lda*(kk+0)] * pD[kk+0+ldd*(jj+0)];
+ d_10 -= pA[id+1+lda*(kk+0)] * pD[kk+0+ldd*(jj+0)];
+ d_01 -= pA[id+0+lda*(kk+0)] * pD[kk+0+ldd*(jj+1)];
+ d_11 -= pA[id+1+lda*(kk+0)] * pD[kk+0+ldd*(jj+1)];
+ }
+ d_10 *= dA[id+1];
+ d_11 *= dA[id+1];
+ d_00 -= pA[id+0+lda*(id+1)] * d_10;
+ d_01 -= pA[id+0+lda*(id+1)] * d_11;
+ d_00 *= dA[id+0];
+ d_01 *= dA[id+0];
+ pD[id+0+ldd*(jj+0)] = d_00;
+ pD[id+1+ldd*(jj+0)] = d_10;
+ pD[id+0+ldd*(jj+1)] = d_01;
+ pD[id+1+ldd*(jj+1)] = d_11;
+ }
+ for(; ii<m; ii++)
+ {
+ id = m-ii-1;
+ d_00 = alpha * pB[id+0+ldb*(jj+0)];
+ d_01 = alpha * pB[id+0+ldb*(jj+1)];
+ kk = id+1;
+ for(; kk<m; kk++)
+ {
+ d_00 -= pA[id+0+lda*(kk+0)] * pD[kk+0+ldd*(jj+0)];
+ d_01 -= pA[id+0+lda*(kk+0)] * pD[kk+0+ldd*(jj+1)];
+ }
+ d_00 *= dA[id+0];
+ d_01 *= dA[id+0];
+ pD[id+0+ldd*(jj+0)] = d_00;
+ pD[id+0+ldd*(jj+1)] = d_01;
+ }
+ }
+ for(; jj<n; jj++)
+ {
+ ii = 0;
+ for(; ii<m-1; ii+=2)
+ {
+ id = m-ii-2;
+ d_00 = alpha * pB[id+0+ldb*(jj+0)];
+ d_10 = alpha * pB[id+1+ldb*(jj+0)];
+ kk = id+2;
+ for(; kk<m; kk++)
+ {
+ d_00 -= pA[id+0+lda*(kk+0)] * pD[kk+0+ldd*(jj+0)];
+ d_10 -= pA[id+1+lda*(kk+0)] * pD[kk+0+ldd*(jj+0)];
+ }
+ d_10 *= dA[id+1];
+ d_00 -= pA[id+0+lda*(id+1)] * d_10;
+ d_00 *= dA[id+0];
+ pD[id+0+ldd*(jj+0)] = d_00;
+ pD[id+1+ldd*(jj+0)] = d_10;
+ }
+ for(; ii<m; ii++)
+ {
+ id = m-ii-1;
+ d_00 = alpha * pB[id+0+ldb*(jj+0)];
+ kk = id+1;
+ for(; kk<m; kk++)
+ {
+ d_00 -= pA[id+0+lda*(kk+0)] * pD[kk+0+ldd*(jj+0)];
+ }
+ d_00 *= dA[id+0];
+ pD[id+0+ldd*(jj+0)] = d_00;
+ }
+ }
+#else
+ // copy
+ if(!(pB==pD))
+ {
+ for(jj=0; jj<n; jj++)
+ for(ii=0; ii<m; ii++)
+ pD[ii+ldd*jj] = alpha * pB[ii+ldb*jj];
+ }
+ // solve
+ for(jj=0; jj<n; jj++)
+ {
+ for(ii=m-1; ii>=0; ii--)
+ {
+ d_00 = pD[ii+ldd*jj] * dA[ii];
+ pD[ii+ldd*jj] = d_00;
+ for(kk=0; kk<ii; kk++)
+ {
+ pD[kk+ldd*jj] -= pA[kk+lda*ii] * d_00;
+ }
+ }
+ }
+#endif
+ return;
+ }
+
+
+
+// dtrsm_right_lower_transposed_unit
+void TRSM_RLTU_LIBSTR(int m, int n, REAL alpha, struct STRMAT *sA, int ai, int aj, struct STRMAT *sB, int bi, int bj, struct STRMAT *sD, int di, int dj)
+ {
+ if(m<=0 | n<=0)
+ return;
+ int ii, jj, kk;
+ int lda = sA->m;
+ int ldb = sB->m;
+ int ldd = sD->m;
+ REAL *pA = sA->pA + ai + aj*lda;
+ REAL *pB = sB->pA + bi + bj*ldb;
+ REAL *pD = sD->pA + di + dj*ldd;
+ REAL
+ f_10,
+ c_00, c_01,
+ c_10, c_11;
+ jj = 0;
+ for(; jj<n-1; jj+=2)
+ {
+ f_10 = pA[jj+1+lda*(jj+0)];
+ ii = 0;
+ for(; ii<m-1; ii+=2)
+ {
+ c_00 = alpha * pB[ii+0+ldb*(jj+0)];
+ c_10 = alpha * pB[ii+1+ldb*(jj+0)];
+ c_01 = alpha * pB[ii+0+ldb*(jj+1)];
+ c_11 = alpha * pB[ii+1+ldb*(jj+1)];
+ for(kk=0; kk<jj; kk++)
+ {
+ c_00 -= pD[ii+0+ldd*kk] * pA[jj+0+lda*kk];
+ c_10 -= pD[ii+1+ldd*kk] * pA[jj+0+lda*kk];
+ c_01 -= pD[ii+0+ldd*kk] * pA[jj+1+lda*kk];
+ c_11 -= pD[ii+1+ldd*kk] * pA[jj+1+lda*kk];
+ }
+ pD[ii+0+ldd*(jj+0)] = c_00;
+ pD[ii+1+ldd*(jj+0)] = c_10;
+ c_01 -= c_00 * f_10;
+ c_11 -= c_10 * f_10;
+ pD[ii+0+ldd*(jj+1)] = c_01;
+ pD[ii+1+ldd*(jj+1)] = c_11;
+ }
+ for(; ii<m; ii++)
+ {
+ c_00 = alpha * pB[ii+0+ldb*(jj+0)];
+ c_01 = alpha * pB[ii+0+ldb*(jj+1)];
+ for(kk=0; kk<jj; kk++)
+ {
+				c_00 -= pD[ii+0+ldd*kk] * pA[jj+0+lda*kk];
+				c_01 -= pD[ii+0+ldd*kk] * pA[jj+1+lda*kk];
+ }
+ pD[ii+0+ldd*(jj+0)] = c_00;
+ c_01 -= c_00 * f_10;
+ pD[ii+0+ldd*(jj+1)] = c_01;
+ }
+ }
+ for(; jj<n; jj++)
+ {
+		// remainder column (no diagonal scaling: unit diagonal)
+ for(ii=0; ii<m; ii++)
+ {
+ c_00 = alpha * pB[ii+ldb*jj];
+ for(kk=0; kk<jj; kk++)
+ {
+ c_00 -= pD[ii+ldd*kk] * pA[jj+lda*kk];
+ }
+ pD[ii+ldd*jj] = c_00;
+ }
+ }
+ return;
+ }
+
+
+
+// dtrsm_right_lower_transposed_notunit
+void TRSM_RLTN_LIBSTR(int m, int n, REAL alpha, struct STRMAT *sA, int ai, int aj, struct STRMAT *sB, int bi, int bj, struct STRMAT *sD, int di, int dj)
+ {
+ if(m<=0 | n<=0)
+ return;
+ int ii, jj, kk;
+ int lda = sA->m;
+ int ldb = sB->m;
+ int ldd = sD->m;
+ REAL *pA = sA->pA + ai + aj*lda;
+ REAL *pB = sB->pA + bi + bj*ldb;
+ REAL *pD = sD->pA + di + dj*ldd;
+ REAL *dA = sA->dA;
+ if(ai==0 & aj==0)
+ {
+ if(sA->use_dA!=1)
+ {
+ for(ii=0; ii<n; ii++)
+ dA[ii] = 1.0 / pA[ii+lda*ii];
+ sA->use_dA = 1;
+ }
+ }
+ else
+ {
+ for(ii=0; ii<n; ii++)
+ dA[ii] = 1.0 / pA[ii+lda*ii];
+ sA->use_dA = 0;
+ }
+ REAL
+ f_00_inv,
+ f_10, f_11_inv,
+ c_00, c_01,
+ c_10, c_11;
+ jj = 0;
+ for(; jj<n-1; jj+=2)
+ {
+ f_00_inv = dA[jj+0];
+ f_10 = pA[jj+1+lda*(jj+0)];
+ f_11_inv = dA[jj+1];
+ ii = 0;
+ for(; ii<m-1; ii+=2)
+ {
+ c_00 = alpha * pB[ii+0+ldb*(jj+0)];
+ c_10 = alpha * pB[ii+1+ldb*(jj+0)];
+ c_01 = alpha * pB[ii+0+ldb*(jj+1)];
+ c_11 = alpha * pB[ii+1+ldb*(jj+1)];
+ for(kk=0; kk<jj; kk++)
+ {
+ c_00 -= pD[ii+0+ldd*kk] * pA[jj+0+lda*kk];
+ c_10 -= pD[ii+1+ldd*kk] * pA[jj+0+lda*kk];
+ c_01 -= pD[ii+0+ldd*kk] * pA[jj+1+lda*kk];
+ c_11 -= pD[ii+1+ldd*kk] * pA[jj+1+lda*kk];
+ }
+ c_00 *= f_00_inv;
+ c_10 *= f_00_inv;
+ pD[ii+0+ldd*(jj+0)] = c_00;
+ pD[ii+1+ldd*(jj+0)] = c_10;
+ c_01 -= c_00 * f_10;
+ c_11 -= c_10 * f_10;
+ c_01 *= f_11_inv;
+ c_11 *= f_11_inv;
+ pD[ii+0+ldd*(jj+1)] = c_01;
+ pD[ii+1+ldd*(jj+1)] = c_11;
+ }
+ for(; ii<m; ii++)
+ {
+ c_00 = alpha * pB[ii+0+ldb*(jj+0)];
+ c_01 = alpha * pB[ii+0+ldb*(jj+1)];
+ for(kk=0; kk<jj; kk++)
+ {
+ c_00 -= pD[ii+0+ldd*kk] * pA[jj+0+lda*kk];
+ c_01 -= pD[ii+0+ldd*kk] * pA[jj+1+lda*kk];
+ }
+ c_00 *= f_00_inv;
+ pD[ii+0+ldd*(jj+0)] = c_00;
+ c_01 -= c_00 * f_10;
+ c_01 *= f_11_inv;
+ pD[ii+0+ldd*(jj+1)] = c_01;
+ }
+ }
+ for(; jj<n; jj++)
+ {
+		// remainder column: scale by the inverted diagonal
+ f_00_inv = dA[jj];
+ for(ii=0; ii<m; ii++)
+ {
+ c_00 = alpha * pB[ii+ldb*jj];
+ for(kk=0; kk<jj; kk++)
+ {
+ c_00 -= pD[ii+ldd*kk] * pA[jj+lda*kk];
+ }
+ c_00 *= f_00_inv;
+ pD[ii+ldd*jj] = c_00;
+ }
+ }
+ return;
+ }
+
+
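+// trsm_rltn solves D * L^T = alpha * B one column (pair) at a time:
+//
+//	D[:,jj] = ( alpha * B[:,jj] - sum_{kk<jj} D[:,kk] * A[jj,kk] ) / A[jj,jj]
+//
+// with the reciprocals 1/A[jj,jj] precomputed into dA[] above (and cached in
+// sA via use_dA when the operation starts at the matrix origin).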
+
+// dtrsm_right_upper_transposed_notunit
+void TRSM_RUTN_LIBSTR(int m, int n, REAL alpha, struct STRMAT *sA, int ai, int aj, struct STRMAT *sB, int bi, int bj, struct STRMAT *sD, int di, int dj)
+ {
+ int jj;
+ char cl = 'l';
+ char cn = 'n';
+ char cr = 'r';
+ char ct = 't';
+ char cu = 'u';
+ int i1 = 1;
+ REAL *pA = sA->pA+ai+aj*sA->m;
+ REAL *pB = sB->pA+bi+bj*sB->m;
+ REAL *pD = sD->pA+di+dj*sD->m;
+	printf("\n***** dtrsm_rutn_libstr : feature not implemented yet *****\n");
+ exit(1);
+// if(!(pB==pD))
+// {
+// for(jj=0; jj<n; jj++)
+// COPY(&m, pB+jj*sB->m, &i1, pD+jj*sD->m, &i1);
+// }
+// TRSM(&cr, &cu, &ct, &cn, &m, &n, &alpha, pA, &(sA->m), pD, &(sD->m));
+ return;
+ }
+
+
+
+// dtrmm_right_upper_transposed_notunit (A triangular !!!)
+void TRMM_RUTN_LIBSTR(int m, int n, REAL alpha, struct STRMAT *sA, int ai, int aj, struct STRMAT *sB, int bi, int bj, struct STRMAT *sD, int di, int dj)
+ {
+ if(m<=0 | n<=0)
+ return;
+ int ii, jj, kk;
+ REAL
+ c_00, c_01,
+ c_10, c_11;
+ int lda = sA->m;
+ int ldb = sB->m;
+ int ldd = sD->m;
+ REAL *pA = sA->pA + ai + aj*lda;
+ REAL *pB = sB->pA + bi + bj*ldb;
+ REAL *pD = sD->pA + di + dj*ldd;
+ jj = 0;
+ for(; jj<n-1; jj+=2)
+ {
+ ii = 0;
+ for(; ii<m-1; ii+=2)
+ {
+ c_00 = 0.0;
+ c_10 = 0.0;
+ c_01 = 0.0;
+ c_11 = 0.0;
+ kk = jj;
+ c_00 += pB[(ii+0)+ldb*kk] * pA[(jj+0)+lda*kk];
+ c_10 += pB[(ii+1)+ldb*kk] * pA[(jj+0)+lda*kk];
+ kk++;
+ for(; kk<n; kk++)
+ {
+ c_00 += pB[(ii+0)+ldb*kk] * pA[(jj+0)+lda*kk];
+ c_10 += pB[(ii+1)+ldb*kk] * pA[(jj+0)+lda*kk];
+ c_01 += pB[(ii+0)+ldb*kk] * pA[(jj+1)+lda*kk];
+ c_11 += pB[(ii+1)+ldb*kk] * pA[(jj+1)+lda*kk];
+ }
+ pD[(ii+0)+ldd*(jj+0)] = alpha * c_00;
+ pD[(ii+1)+ldd*(jj+0)] = alpha * c_10;
+ pD[(ii+0)+ldd*(jj+1)] = alpha * c_01;
+ pD[(ii+1)+ldd*(jj+1)] = alpha * c_11;
+ }
+ for(; ii<m; ii++)
+ {
+ c_00 = 0.0;
+ c_01 = 0.0;
+ kk = jj;
+ c_00 += pB[(ii+0)+ldb*kk] * pA[(jj+0)+lda*kk];
+ kk++;
+ for(; kk<n; kk++)
+ {
+ c_00 += pB[(ii+0)+ldb*kk] * pA[(jj+0)+lda*kk];
+ c_01 += pB[(ii+0)+ldb*kk] * pA[(jj+1)+lda*kk];
+ }
+ pD[(ii+0)+ldd*(jj+0)] = alpha * c_00;
+ pD[(ii+0)+ldd*(jj+1)] = alpha * c_01;
+ }
+ }
+ for(; jj<n; jj++)
+ {
+ ii = 0;
+ for(; ii<m-1; ii+=2)
+ {
+ c_00 = 0.0;
+ c_10 = 0.0;
+ for(kk=jj; kk<n; kk++)
+ {
+ c_00 += pB[(ii+0)+ldb*kk] * pA[(jj+0)+lda*kk];
+ c_10 += pB[(ii+1)+ldb*kk] * pA[(jj+0)+lda*kk];
+ }
+ pD[(ii+0)+ldd*(jj+0)] = alpha * c_00;
+ pD[(ii+1)+ldd*(jj+0)] = alpha * c_10;
+ }
+ for(; ii<m; ii++)
+ {
+ c_00 = 0.0;
+ for(kk=jj; kk<n; kk++)
+ {
+ c_00 += pB[(ii+0)+ldb*kk] * pA[(jj+0)+lda*kk];
+ }
+ pD[(ii+0)+ldd*(jj+0)] = alpha * c_00;
+ }
+ }
+ return;
+ }
+
+
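+// trmm_rutn computes D = alpha * B * A^T with A upper triangular, so only
+// columns kk >= jj of B contribute to column jj of D:
+//
+//	D[ii,jj] = alpha * sum_{kk=jj..n-1} B[ii,kk] * A[jj,kk]
+//
+// The first iteration (kk==jj) is peeled because A[jj+1,jj] is zero in an
+// upper triangular matrix, so the c_01/c_11 terms can be skipped there.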
+
+// dtrmm_right_lower_nottransposed_notunit (A triangular !!!)
+void TRMM_RLNN_LIBSTR(int m, int n, REAL alpha, struct STRMAT *sA, int ai, int aj, struct STRMAT *sB, int bi, int bj, struct STRMAT *sD, int di, int dj)
+ {
+ if(m<=0 | n<=0)
+ return;
+ int ii, jj, kk;
+ REAL
+ c_00, c_01,
+ c_10, c_11;
+ int lda = sA->m;
+ int ldb = sB->m;
+ int ldd = sD->m;
+ REAL *pA = sA->pA + ai + aj*lda;
+ REAL *pB = sB->pA + bi + bj*ldb;
+ REAL *pD = sD->pA + di + dj*ldd;
+ jj = 0;
+ for(; jj<n-1; jj+=2)
+ {
+ ii = 0;
+ for(; ii<m-1; ii+=2)
+ {
+			c_00 = 0.0;
+			c_10 = 0.0;
+			c_01 = 0.0;
+			c_11 = 0.0;
+ kk = jj;
+ c_00 += pB[(ii+0)+ldb*kk] * pA[kk+lda*(jj+0)];
+ c_10 += pB[(ii+1)+ldb*kk] * pA[kk+lda*(jj+0)];
+ kk++;
+ for(; kk<n; kk++)
+ {
+ c_00 += pB[(ii+0)+ldb*kk] * pA[kk+lda*(jj+0)];
+ c_10 += pB[(ii+1)+ldb*kk] * pA[kk+lda*(jj+0)];
+ c_01 += pB[(ii+0)+ldb*kk] * pA[kk+lda*(jj+1)];
+ c_11 += pB[(ii+1)+ldb*kk] * pA[kk+lda*(jj+1)];
+ }
+ pD[(ii+0)+ldd*(jj+0)] = alpha * c_00;
+ pD[(ii+1)+ldd*(jj+0)] = alpha * c_10;
+ pD[(ii+0)+ldd*(jj+1)] = alpha * c_01;
+ pD[(ii+1)+ldd*(jj+1)] = alpha * c_11;
+ }
+ for(; ii<m; ii++)
+ {
+			c_00 = 0.0;
+			c_01 = 0.0;
+ kk = jj;
+ c_00 += pB[(ii+0)+ldb*kk] * pA[kk+lda*(jj+0)];
+ kk++;
+ for(; kk<n; kk++)
+ {
+ c_00 += pB[(ii+0)+ldb*kk] * pA[kk+lda*(jj+0)];
+ c_01 += pB[(ii+0)+ldb*kk] * pA[kk+lda*(jj+1)];
+ }
+ pD[(ii+0)+ldd*(jj+0)] = alpha * c_00;
+ pD[(ii+0)+ldd*(jj+1)] = alpha * c_01;
+ }
+ }
+ for(; jj<n; jj++)
+ {
+ ii = 0;
+ for(; ii<m-1; ii+=2)
+ {
+			c_00 = 0.0;
+			c_10 = 0.0;
+ for(kk=jj; kk<n; kk++)
+ {
+ c_00 += pB[(ii+0)+ldb*kk] * pA[kk+lda*(jj+0)];
+ c_10 += pB[(ii+1)+ldb*kk] * pA[kk+lda*(jj+0)];
+ }
+ pD[(ii+0)+ldd*(jj+0)] = alpha * c_00;
+ pD[(ii+1)+ldd*(jj+0)] = alpha * c_10;
+ }
+ for(; ii<m; ii++)
+ {
+			c_00 = 0.0;
+ for(kk=jj; kk<n; kk++)
+ {
+ c_00 += pB[(ii+0)+ldb*kk] * pA[kk+lda*(jj+0)];
+ }
+ pD[(ii+0)+ldd*(jj+0)] = alpha * c_00;
+ }
+ }
+ return;
+ }
+
+
+
+// dsyrk_lower_nottransposed (allowing for different factors => use dgemm !!!)
+void SYRK_LN_LIBSTR(int m, int k, REAL alpha, struct STRMAT *sA, int ai, int aj, struct STRMAT *sB, int bi, int bj, REAL beta, struct STRMAT *sC, int ci, int cj, struct STRMAT *sD, int di, int dj)
+ {
+ if(m<=0)
+ return;
+ int ii, jj, kk;
+ int n = m; // TODO optimize for this case !!!!!!!!!
+ REAL
+ c_00, c_01,
+ c_10, c_11;
+ int lda = sA->m;
+ int ldb = sB->m;
+ int ldc = sC->m;
+ int ldd = sD->m;
+ REAL *pA = sA->pA + ai + aj*lda;
+ REAL *pB = sB->pA + bi + bj*ldb;
+ REAL *pC = sC->pA + ci + cj*ldc;
+ REAL *pD = sD->pA + di + dj*ldd;
+ jj = 0;
+ for(; jj<n-1; jj+=2)
+ {
+ // diagonal
+ c_00 = 0.0;
+ c_10 = 0.0;
+ c_11 = 0.0;
+ for(kk=0; kk<k; kk++)
+ {
+ c_00 += pA[jj+0+lda*kk] * pB[jj+0+ldb*kk];
+ c_10 += pA[jj+1+lda*kk] * pB[jj+0+ldb*kk];
+ c_11 += pA[jj+1+lda*kk] * pB[jj+1+ldb*kk];
+ }
+ pD[jj+0+ldd*(jj+0)] = beta * pC[jj+0+ldc*(jj+0)] + alpha * c_00;
+ pD[jj+1+ldd*(jj+0)] = beta * pC[jj+1+ldc*(jj+0)] + alpha * c_10;
+ pD[jj+1+ldd*(jj+1)] = beta * pC[jj+1+ldc*(jj+1)] + alpha * c_11;
+ // lower
+ ii = jj+2;
+ for(; ii<m-1; ii+=2)
+ {
+ c_00 = 0.0;
+ c_10 = 0.0;
+ c_01 = 0.0;
+ c_11 = 0.0;
+ for(kk=0; kk<k; kk++)
+ {
+ c_00 += pA[ii+0+lda*kk] * pB[jj+0+ldb*kk];
+ c_10 += pA[ii+1+lda*kk] * pB[jj+0+ldb*kk];
+ c_01 += pA[ii+0+lda*kk] * pB[jj+1+ldb*kk];
+ c_11 += pA[ii+1+lda*kk] * pB[jj+1+ldb*kk];
+ }
+ pD[ii+0+ldd*(jj+0)] = beta * pC[ii+0+ldc*(jj+0)] + alpha * c_00;
+ pD[ii+1+ldd*(jj+0)] = beta * pC[ii+1+ldc*(jj+0)] + alpha * c_10;
+ pD[ii+0+ldd*(jj+1)] = beta * pC[ii+0+ldc*(jj+1)] + alpha * c_01;
+ pD[ii+1+ldd*(jj+1)] = beta * pC[ii+1+ldc*(jj+1)] + alpha * c_11;
+ }
+ for(; ii<m; ii++)
+ {
+ c_00 = 0.0;
+ c_01 = 0.0;
+ for(kk=0; kk<k; kk++)
+ {
+ c_00 += pA[ii+0+lda*kk] * pB[jj+0+ldb*kk];
+ c_01 += pA[ii+0+lda*kk] * pB[jj+1+ldb*kk];
+ }
+ pD[ii+0+ldd*(jj+0)] = beta * pC[ii+0+ldc*(jj+0)] + alpha * c_00;
+ pD[ii+0+ldd*(jj+1)] = beta * pC[ii+0+ldc*(jj+1)] + alpha * c_01;
+ }
+ }
+ for(; jj<n; jj++)
+ {
+ // diagonal
+ c_00 = 0.0;
+ for(kk=0; kk<k; kk++)
+ {
+ c_00 += pA[jj+lda*kk] * pB[jj+ldb*kk];
+ }
+ pD[jj+ldd*jj] = beta * pC[jj+ldc*jj] + alpha * c_00;
+ // lower
+ for(ii=jj+1; ii<m; ii++)
+ {
+ c_00 = 0.0;
+ for(kk=0; kk<k; kk++)
+ {
+ c_00 += pA[ii+lda*kk] * pB[jj+ldb*kk];
+ }
+ pD[ii+ldd*jj] = beta * pC[ii+ldc*jj] + alpha * c_00;
+ }
+ }
+ return;
+ }
+
+
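+// syrk_ln updates only the lower triangle (ii >= jj) of
+//
+//	D = beta * C + alpha * A * B^T
+//
+// Allowing A != B makes this a gemm_nt restricted to the lower triangle
+// rather than a strict BLAS syrk (which requires B == A); the LA_BLAS wrapper
+// below dispatches to GEMM whenever the two factors differ for this reason.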
+
+// dsyrk_lower_nottransposed (allowing for different factors => use dgemm !!!)
+void SYRK_LN_MN_LIBSTR(int m, int n, int k, REAL alpha, struct STRMAT *sA, int ai, int aj, struct STRMAT *sB, int bi, int bj, REAL beta, struct STRMAT *sC, int ci, int cj, struct STRMAT *sD, int di, int dj)
+ {
+ if(m<=0 | n<=0)
+ return;
+ int ii, jj, kk;
+ REAL
+ c_00, c_01,
+ c_10, c_11;
+ int lda = sA->m;
+ int ldb = sB->m;
+ int ldc = sC->m;
+ int ldd = sD->m;
+ REAL *pA = sA->pA + ai + aj*lda;
+ REAL *pB = sB->pA + bi + bj*ldb;
+ REAL *pC = sC->pA + ci + cj*ldc;
+ REAL *pD = sD->pA + di + dj*ldd;
+ jj = 0;
+ for(; jj<n-1; jj+=2)
+ {
+ // diagonal
+ c_00 = 0.0;
+ c_10 = 0.0;
+ c_11 = 0.0;
+ for(kk=0; kk<k; kk++)
+ {
+ c_00 += pA[jj+0+lda*kk] * pB[jj+0+ldb*kk];
+ c_10 += pA[jj+1+lda*kk] * pB[jj+0+ldb*kk];
+ c_11 += pA[jj+1+lda*kk] * pB[jj+1+ldb*kk];
+ }
+ pD[jj+0+ldd*(jj+0)] = beta * pC[jj+0+ldc*(jj+0)] + alpha * c_00;
+ pD[jj+1+ldd*(jj+0)] = beta * pC[jj+1+ldc*(jj+0)] + alpha * c_10;
+ pD[jj+1+ldd*(jj+1)] = beta * pC[jj+1+ldc*(jj+1)] + alpha * c_11;
+ // lower
+ ii = jj+2;
+ for(; ii<m-1; ii+=2)
+ {
+ c_00 = 0.0;
+ c_10 = 0.0;
+ c_01 = 0.0;
+ c_11 = 0.0;
+ for(kk=0; kk<k; kk++)
+ {
+ c_00 += pA[ii+0+lda*kk] * pB[jj+0+ldb*kk];
+ c_10 += pA[ii+1+lda*kk] * pB[jj+0+ldb*kk];
+ c_01 += pA[ii+0+lda*kk] * pB[jj+1+ldb*kk];
+ c_11 += pA[ii+1+lda*kk] * pB[jj+1+ldb*kk];
+ }
+ pD[ii+0+ldd*(jj+0)] = beta * pC[ii+0+ldc*(jj+0)] + alpha * c_00;
+ pD[ii+1+ldd*(jj+0)] = beta * pC[ii+1+ldc*(jj+0)] + alpha * c_10;
+ pD[ii+0+ldd*(jj+1)] = beta * pC[ii+0+ldc*(jj+1)] + alpha * c_01;
+ pD[ii+1+ldd*(jj+1)] = beta * pC[ii+1+ldc*(jj+1)] + alpha * c_11;
+ }
+ for(; ii<m; ii++)
+ {
+ c_00 = 0.0;
+ c_01 = 0.0;
+ for(kk=0; kk<k; kk++)
+ {
+ c_00 += pA[ii+0+lda*kk] * pB[jj+0+ldb*kk];
+ c_01 += pA[ii+0+lda*kk] * pB[jj+1+ldb*kk];
+ }
+ pD[ii+0+ldd*(jj+0)] = beta * pC[ii+0+ldc*(jj+0)] + alpha * c_00;
+ pD[ii+0+ldd*(jj+1)] = beta * pC[ii+0+ldc*(jj+1)] + alpha * c_01;
+ }
+ }
+ for(; jj<n; jj++)
+ {
+ // diagonal
+ c_00 = 0.0;
+ for(kk=0; kk<k; kk++)
+ {
+ c_00 += pA[jj+lda*kk] * pB[jj+ldb*kk];
+ }
+ pD[jj+ldd*jj] = beta * pC[jj+ldc*jj] + alpha * c_00;
+ // lower
+ for(ii=jj+1; ii<m; ii++)
+ {
+ c_00 = 0.0;
+ for(kk=0; kk<k; kk++)
+ {
+ c_00 += pA[ii+lda*kk] * pB[jj+ldb*kk];
+ }
+ pD[ii+ldd*jj] = beta * pC[ii+ldc*jj] + alpha * c_00;
+ }
+ }
+ return;
+ }
+
+
+
+#elif defined(LA_BLAS)
+
+
+
+// dgemm nt
+void GEMM_NT_LIBSTR(int m, int n, int k, REAL alpha, struct STRMAT *sA, int ai, int aj, struct STRMAT *sB, int bi, int bj, REAL beta, struct STRMAT *sC, int ci, int cj, struct STRMAT *sD, int di, int dj)
+ {
+ int jj;
+ char cn = 'n';
+ char ct = 't';
+ REAL *pA = sA->pA+ai+aj*sA->m;
+ REAL *pB = sB->pA+bi+bj*sB->m;
+ REAL *pC = sC->pA+ci+cj*sC->m;
+ REAL *pD = sD->pA+di+dj*sD->m;
+#if defined(REF_BLAS_BLIS)
+ long long i1 = 1;
+ long long mm = m;
+ long long nn = n;
+ long long kk = k;
+ long long lda = sA->m;
+ long long ldb = sB->m;
+ long long ldc = sC->m;
+ long long ldd = sD->m;
+ if(!(beta==0.0 || pC==pD))
+ {
+ for(jj=0; jj<n; jj++)
+ COPY(&mm, pC+jj*ldc, &i1, pD+jj*ldd, &i1);
+ }
+ GEMM(&cn, &ct, &mm, &nn, &kk, &alpha, pA, &lda, pB, &ldb, &beta, pD, &ldd);
+#else
+ int i1 = 1;
+ int lda = sA->m;
+ int ldb = sB->m;
+ int ldc = sC->m;
+ int ldd = sD->m;
+ if(!(beta==0.0 || pC==pD))
+ {
+ for(jj=0; jj<n; jj++)
+ COPY(&m, pC+jj*ldc, &i1, pD+jj*ldd, &i1);
+ }
+ GEMM(&cn, &ct, &m, &n, &k, &alpha, pA, &lda, pB, &ldb, &beta, pD, &ldd);
+#endif
+ return;
+ }
+
+
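+// The explicit COPY above is needed because BLAS xGEMM updates its C argument
+// in place, while the LIBSTR interface keeps the input C and the output D
+// distinct. A minimal caller sketch, assuming the double-precision build in
+// which this macro expands to dgemm_nt_libstr operating on struct d_strmat
+// (the matrix variables in the sketch are hypothetical):
+//
+//	// sD = 1.0 * sA * sB^T + 0.0 * sC, on whole matrices (all offsets zero)
+//	dgemm_nt_libstr(m, n, k, 1.0, &sA, 0, 0, &sB, 0, 0, 0.0, &sC, 0, 0, &sD, 0, 0);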
+
+// dgemm nn
+void GEMM_NN_LIBSTR(int m, int n, int k, REAL alpha, struct STRMAT *sA, int ai, int aj, struct STRMAT *sB, int bi, int bj, REAL beta, struct STRMAT *sC, int ci, int cj, struct STRMAT *sD, int di, int dj)
+ {
+ int jj;
+ char cn = 'n';
+ REAL *pA = sA->pA+ai+aj*sA->m;
+ REAL *pB = sB->pA+bi+bj*sB->m;
+ REAL *pC = sC->pA+ci+cj*sC->m;
+ REAL *pD = sD->pA+di+dj*sD->m;
+#if defined(REF_BLAS_BLIS)
+ long long i1 = 1;
+ long long mm = m;
+ long long nn = n;
+ long long kk = k;
+ long long lda = sA->m;
+ long long ldb = sB->m;
+ long long ldc = sC->m;
+ long long ldd = sD->m;
+ if(!(beta==0.0 || pC==pD))
+ {
+ for(jj=0; jj<n; jj++)
+ COPY(&mm, pC+jj*ldc, &i1, pD+jj*ldd, &i1);
+ }
+ GEMM(&cn, &cn, &mm, &nn, &kk, &alpha, pA, &lda, pB, &ldb, &beta, pD, &ldd);
+#else
+ int i1 = 1;
+ int lda = sA->m;
+ int ldb = sB->m;
+ int ldc = sC->m;
+ int ldd = sD->m;
+ if(!(beta==0.0 || pC==pD))
+ {
+ for(jj=0; jj<n; jj++)
+ COPY(&m, pC+jj*ldc, &i1, pD+jj*ldd, &i1);
+ }
+ GEMM(&cn, &cn, &m, &n, &k, &alpha, pA, &lda, pB, &ldb, &beta, pD, &ldd);
+#endif
+ return;
+ }
+
+
+
+// dtrsm_left_lower_nottransposed_unit
+void TRSM_LLNU_LIBSTR(int m, int n, REAL alpha, struct STRMAT *sA, int ai, int aj, struct STRMAT *sB, int bi, int bj, struct STRMAT *sD, int di, int dj)
+ {
+ int jj;
+ char cl = 'l';
+ char cn = 'n';
+ char cu = 'u';
+ REAL *pA = sA->pA+ai+aj*sA->m;
+ REAL *pB = sB->pA+bi+bj*sB->m;
+ REAL *pD = sD->pA+di+dj*sD->m;
+#if defined(REF_BLAS_BLIS)
+ long long i1 = 1;
+ long long mm = m;
+ long long nn = n;
+ long long lda = sA->m;
+ long long ldb = sB->m;
+ long long ldd = sD->m;
+ if(!(pB==pD))
+ {
+ for(jj=0; jj<n; jj++)
+ COPY(&mm, pB+jj*ldb, &i1, pD+jj*ldd, &i1);
+ }
+ TRSM(&cl, &cl, &cn, &cu, &mm, &nn, &alpha, pA, &lda, pD, &ldd);
+#else
+ int i1 = 1;
+ int lda = sA->m;
+ int ldb = sB->m;
+ int ldd = sD->m;
+ if(!(pB==pD))
+ {
+ for(jj=0; jj<n; jj++)
+			COPY(&m, pB+jj*ldb, &i1, pD+jj*ldd, &i1);
+ }
+ TRSM(&cl, &cl, &cn, &cu, &m, &n, &alpha, pA, &lda, pD, &ldd);
+#endif
+ return;
+ }
+
+
+
+// dtrsm_left_upper_nottransposed_notunit
+void TRSM_LUNN_LIBSTR(int m, int n, REAL alpha, struct STRMAT *sA, int ai, int aj, struct STRMAT *sB, int bi, int bj, struct STRMAT *sD, int di, int dj)
+ {
+ int jj;
+ char cl = 'l';
+ char cn = 'n';
+ char cu = 'u';
+ REAL *pA = sA->pA+ai+aj*sA->m;
+ REAL *pB = sB->pA+bi+bj*sB->m;
+ REAL *pD = sD->pA+di+dj*sD->m;
+#if defined(REF_BLAS_BLIS)
+ long long i1 = 1;
+ long long mm = m;
+ long long nn = n;
+ long long lda = sA->m;
+ long long ldb = sB->m;
+ long long ldd = sD->m;
+ if(!(pB==pD))
+ {
+ for(jj=0; jj<n; jj++)
+ COPY(&mm, pB+jj*ldb, &i1, pD+jj*ldd, &i1);
+ }
+ TRSM(&cl, &cu, &cn, &cn, &mm, &nn, &alpha, pA, &lda, pD, &ldd);
+#else
+ int i1 = 1;
+ int lda = sA->m;
+ int ldb = sB->m;
+ int ldd = sD->m;
+ if(!(pB==pD))
+ {
+ for(jj=0; jj<n; jj++)
+ COPY(&m, pB+jj*ldb, &i1, pD+jj*ldd, &i1);
+ }
+ TRSM(&cl, &cu, &cn, &cn, &m, &n, &alpha, pA, &lda, pD, &ldd);
+#endif
+ return;
+ }
+
+
+
+// dtrsm_right_lower_transposed_unit
+void TRSM_RLTU_LIBSTR(int m, int n, REAL alpha, struct STRMAT *sA, int ai, int aj, struct STRMAT *sB, int bi, int bj, struct STRMAT *sD, int di, int dj)
+ {
+ int jj;
+ char cl = 'l';
+ char cn = 'n';
+ char cr = 'r';
+ char ct = 't';
+ char cu = 'u';
+ REAL *pA = sA->pA+ai+aj*sA->m;
+ REAL *pB = sB->pA+bi+bj*sB->m;
+ REAL *pD = sD->pA+di+dj*sD->m;
+#if defined(REF_BLAS_BLIS)
+ long long i1 = 1;
+ long long mm = m;
+ long long nn = n;
+ long long lda = sA->m;
+ long long ldb = sB->m;
+ long long ldd = sD->m;
+ if(!(pB==pD))
+ {
+ for(jj=0; jj<n; jj++)
+ COPY(&mm, pB+jj*ldb, &i1, pD+jj*ldd, &i1);
+ }
+ TRSM(&cr, &cl, &ct, &cu, &mm, &nn, &alpha, pA, &lda, pD, &ldd);
+#else
+ int i1 = 1;
+ int lda = sA->m;
+ int ldb = sB->m;
+ int ldd = sD->m;
+ if(!(pB==pD))
+ {
+ for(jj=0; jj<n; jj++)
+ COPY(&m, pB+jj*ldb, &i1, pD+jj*ldd, &i1);
+ }
+ TRSM(&cr, &cl, &ct, &cu, &m, &n, &alpha, pA, &lda, pD, &ldd);
+#endif
+ return;
+ }
+
+
+
+// dtrsm_right_lower_transposed_notunit
+void TRSM_RLTN_LIBSTR(int m, int n, REAL alpha, struct STRMAT *sA, int ai, int aj, struct STRMAT *sB, int bi, int bj, struct STRMAT *sD, int di, int dj)
+ {
+ int jj;
+ char cl = 'l';
+ char cn = 'n';
+ char cr = 'r';
+ char ct = 't';
+ char cu = 'u';
+ REAL *pA = sA->pA+ai+aj*sA->m;
+ REAL *pB = sB->pA+bi+bj*sB->m;
+ REAL *pD = sD->pA+di+dj*sD->m;
+#if defined(REF_BLAS_BLIS)
+ long long i1 = 1;
+ long long mm = m;
+ long long nn = n;
+ long long lda = sA->m;
+ long long ldb = sB->m;
+ long long ldd = sD->m;
+ if(!(pB==pD))
+ {
+ for(jj=0; jj<n; jj++)
+ COPY(&mm, pB+jj*ldb, &i1, pD+jj*ldd, &i1);
+ }
+ TRSM(&cr, &cl, &ct, &cn, &mm, &nn, &alpha, pA, &lda, pD, &ldd);
+#else
+ int i1 = 1;
+ int lda = sA->m;
+ int ldb = sB->m;
+ int ldd = sD->m;
+ if(!(pB==pD))
+ {
+ for(jj=0; jj<n; jj++)
+ COPY(&m, pB+jj*ldb, &i1, pD+jj*ldd, &i1);
+ }
+ TRSM(&cr, &cl, &ct, &cn, &m, &n, &alpha, pA, &lda, pD, &ldd);
+#endif
+ return;
+ }
+
+
+
+// dtrsm_right_upper_transposed_notunit
+void TRSM_RUTN_LIBSTR(int m, int n, REAL alpha, struct STRMAT *sA, int ai, int aj, struct STRMAT *sB, int bi, int bj, struct STRMAT *sD, int di, int dj)
+ {
+ int jj;
+ char cl = 'l';
+ char cn = 'n';
+ char cr = 'r';
+ char ct = 't';
+ char cu = 'u';
+ REAL *pA = sA->pA+ai+aj*sA->m;
+ REAL *pB = sB->pA+bi+bj*sB->m;
+ REAL *pD = sD->pA+di+dj*sD->m;
+#if defined(REF_BLAS_BLIS)
+ long long i1 = 1;
+ long long mm = m;
+ long long nn = n;
+ long long lda = sA->m;
+ long long ldb = sB->m;
+ long long ldd = sD->m;
+ if(!(pB==pD))
+ {
+ for(jj=0; jj<n; jj++)
+ COPY(&mm, pB+jj*ldb, &i1, pD+jj*ldd, &i1);
+ }
+ TRSM(&cr, &cu, &ct, &cn, &mm, &nn, &alpha, pA, &lda, pD, &ldd);
+#else
+ int i1 = 1;
+ int lda = sA->m;
+ int ldb = sB->m;
+ int ldd = sD->m;
+ if(!(pB==pD))
+ {
+ for(jj=0; jj<n; jj++)
+ COPY(&m, pB+jj*ldb, &i1, pD+jj*ldd, &i1);
+ }
+ TRSM(&cr, &cu, &ct, &cn, &m, &n, &alpha, pA, &lda, pD, &ldd);
+#endif
+ return;
+ }
+
+
+
+// dtrmm_right_upper_transposed_notunit (A triangular !!!)
+void TRMM_RUTN_LIBSTR(int m, int n, REAL alpha, struct STRMAT *sA, int ai, int aj, struct STRMAT *sB, int bi, int bj, struct STRMAT *sD, int di, int dj)
+ {
+ int jj;
+ char cl = 'l';
+ char cn = 'n';
+ char cr = 'r';
+ char ct = 't';
+ char cu = 'u';
+ REAL *pA = sA->pA+ai+aj*sA->m;
+ REAL *pB = sB->pA+bi+bj*sB->m;
+ REAL *pD = sD->pA+di+dj*sD->m;
+#if defined(REF_BLAS_BLIS)
+ long long i1 = 1;
+ long long mm = m;
+ long long nn = n;
+ long long lda = sA->m;
+ long long ldb = sB->m;
+ long long ldd = sD->m;
+ if(!(pB==pD))
+ {
+ for(jj=0; jj<n; jj++)
+ COPY(&mm, pB+jj*ldb, &i1, pD+jj*ldd, &i1);
+ }
+ TRMM(&cr, &cu, &ct, &cn, &mm, &nn, &alpha, pA, &lda, pD, &ldd);
+#else
+ int i1 = 1;
+ int lda = sA->m;
+ int ldb = sB->m;
+ int ldd = sD->m;
+ if(!(pB==pD))
+ {
+ for(jj=0; jj<n; jj++)
+ COPY(&m, pB+jj*ldb, &i1, pD+jj*ldd, &i1);
+ }
+ TRMM(&cr, &cu, &ct, &cn, &m, &n, &alpha, pA, &lda, pD, &ldd);
+#endif
+ return;
+ }
+
+
+
+// dtrmm_right_lower_nottransposed_notunit (A triangular !!!)
+void TRMM_RLNN_LIBSTR(int m, int n, REAL alpha, struct STRMAT *sA, int ai, int aj, struct STRMAT *sB, int bi, int bj, struct STRMAT *sD, int di, int dj)
+ {
+ int jj;
+ char cl = 'l';
+ char cn = 'n';
+ char cr = 'r';
+ char ct = 't';
+ char cu = 'u';
+ REAL *pA = sA->pA+ai+aj*sA->m;
+ REAL *pB = sB->pA+bi+bj*sB->m;
+ REAL *pD = sD->pA+di+dj*sD->m;
+#if defined(REF_BLAS_BLIS)
+ long long i1 = 1;
+ long long mm = m;
+ long long nn = n;
+ long long lda = sA->m;
+ long long ldb = sB->m;
+ long long ldd = sD->m;
+ if(!(pB==pD))
+ {
+ for(jj=0; jj<n; jj++)
+ COPY(&mm, pB+jj*ldb, &i1, pD+jj*ldd, &i1);
+ }
+ TRMM(&cr, &cl, &cn, &cn, &mm, &nn, &alpha, pA, &lda, pD, &ldd);
+#else
+ int i1 = 1;
+ int lda = sA->m;
+ int ldb = sB->m;
+ int ldd = sD->m;
+ if(!(pB==pD))
+ {
+ for(jj=0; jj<n; jj++)
+ COPY(&m, pB+jj*ldb, &i1, pD+jj*ldd, &i1);
+ }
+ TRMM(&cr, &cl, &cn, &cn, &m, &n, &alpha, pA, &lda, pD, &ldd);
+#endif
+ return;
+ }
+
+
+
+// dsyrk_lower_nottransposed (allowing for different factors => use dgemm !!!)
+void SYRK_LN_LIBSTR(int m, int k, REAL alpha, struct STRMAT *sA, int ai, int aj, struct STRMAT *sB, int bi, int bj, REAL beta, struct STRMAT *sC, int ci, int cj, struct STRMAT *sD, int di, int dj)
+ {
+ int jj;
+ char cl = 'l';
+ char cn = 'n';
+ char cr = 'r';
+ char ct = 't';
+ char cu = 'u';
+ REAL *pA = sA->pA + ai + aj*sA->m;
+ REAL *pB = sB->pA + bi + bj*sB->m;
+ REAL *pC = sC->pA + ci + cj*sC->m;
+ REAL *pD = sD->pA + di + dj*sD->m;
+#if defined(REF_BLAS_BLIS)
+ long long i1 = 1;
+ long long mm = m;
+ long long kk = k;
+ long long lda = sA->m;
+ long long ldb = sB->m;
+ long long ldc = sC->m;
+ long long ldd = sD->m;
+ if(!(beta==0.0 || pC==pD))
+ {
+ for(jj=0; jj<m; jj++)
+ COPY(&mm, pC+jj*ldc, &i1, pD+jj*ldd, &i1);
+ }
+ if(pA==pB)
+ {
+ SYRK(&cl, &cn, &mm, &kk, &alpha, pA, &lda, &beta, pD, &ldd);
+ }
+ else
+ {
+ GEMM(&cn, &ct, &mm, &mm, &kk, &alpha, pA, &lda, pB, &ldb, &beta, pD, &ldd);
+ }
+#else
+ int i1 = 1;
+ int lda = sA->m;
+ int ldb = sB->m;
+ int ldc = sC->m;
+ int ldd = sD->m;
+ if(!(beta==0.0 || pC==pD))
+ {
+ for(jj=0; jj<m; jj++)
+ COPY(&m, pC+jj*sC->m, &i1, pD+jj*sD->m, &i1);
+ }
+ if(pA==pB)
+ {
+ SYRK(&cl, &cn, &m, &k, &alpha, pA, &lda, &beta, pD, &ldd);
+ }
+ else
+ {
+ GEMM(&cn, &ct, &m, &m, &k, &alpha, pA, &lda, pB, &ldb, &beta, pD, &ldd);
+ }
+#endif
+ return;
+ }
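+
+// (editor's note) SYRK_LN computes the lower triangle of D = beta*C + alpha*A*B^T;
+// when pA==pB the product is genuinely symmetric and the BLAS SYRK kernel applies,
+// otherwise the general GEMM is used, as the "use dgemm" remark above indicates.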
+
+// dsyrk_lower_nottransposed (allowing for different factors => use dgemm !!!)
+void SYRK_LN_MN_LIBSTR(int m, int n, int k, REAL alpha, struct STRMAT *sA, int ai, int aj, struct STRMAT *sB, int bi, int bj, REAL beta, struct STRMAT *sC, int ci, int cj, struct STRMAT *sD, int di, int dj)
+ {
+ int jj;
+ char cl = 'l';
+ char cn = 'n';
+ char cr = 'r';
+ char ct = 't';
+ char cu = 'u';
+ REAL *pA = sA->pA + ai + aj*sA->m;
+ REAL *pB = sB->pA + bi + bj*sB->m;
+ REAL *pC = sC->pA + ci + cj*sC->m;
+ REAL *pD = sD->pA + di + dj*sD->m;
+#if defined(REF_BLAS_BLIS)
+ long long i1 = 1;
+ long long mm = m;
+ long long nn = n;
+ long long kk = k;
+ long long mmn = mm-nn;
+ long long lda = sA->m;
+ long long ldb = sB->m;
+ long long ldc = sC->m;
+ long long ldd = sD->m;
+ if(!(beta==0.0 || pC==pD))
+ {
+ for(jj=0; jj<n; jj++)
+ COPY(&mm, pC+jj*ldc, &i1, pD+jj*ldd, &i1);
+ }
+ if(pA==pB)
+ {
+ SYRK(&cl, &cn, &nn, &kk, &alpha, pA, &lda, &beta, pD, &ldd);
+ GEMM(&cn, &ct, &mmn, &nn, &kk, &alpha, pA+n, &lda, pB, &ldb, &beta, pD+n, &ldd);
+ }
+ else
+ {
+ GEMM(&cn, &ct, &mm, &nn, &kk, &alpha, pA, &lda, pB, &ldb, &beta, pD, &ldd);
+ }
+#else
+ int i1 = 1;
+ int mmn = m-n;
+ int lda = sA->m;
+ int ldb = sB->m;
+ int ldc = sC->m;
+ int ldd = sD->m;
+ if(!(beta==0.0 || pC==pD))
+ {
+ for(jj=0; jj<n; jj++)
+ COPY(&m, pC+jj*sC->m, &i1, pD+jj*sD->m, &i1);
+ }
+ if(pA==pB)
+ {
+ SYRK(&cl, &cn, &n, &k, &alpha, pA, &lda, &beta, pD, &ldd);
+ GEMM(&cn, &ct, &mmn, &n, &k, &alpha, pA+n, &lda, pB, &ldb, &beta, pD+n, &ldd);
+ }
+ else
+ {
+ GEMM(&cn, &ct, &m, &n, &k, &alpha, pA, &lda, pB, &ldb, &beta, pD, &ldd);
+ }
+#endif
+ return;
+ }
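+
+// (editor's note) for m >= n the m x n result of SYRK_LN_MN above is built in two
+// panels, which is why the pA==pB branch issues one SYRK followed by one GEMM at
+// row offset n:
+//
+//	D(0:n, 0:n) = beta*C(0:n, 0:n) + alpha*A(0:n, :)*B^T   (SYRK, lower triangle)
+//	D(n:m, 0:n) = beta*C(n:m, 0:n) + alpha*A(n:m, :)*B^T   (GEMM on the tail rows)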
+
+#else
+
+#error : wrong LA choice
+
+#endif
+
+
+
+
+
diff --git a/blas/x_lapack_lib.c b/blas/x_lapack_lib.c
new file mode 100644
index 0000000..762a8a0
--- /dev/null
+++ b/blas/x_lapack_lib.c
@@ -0,0 +1,2112 @@
+/**************************************************************************************************
+* *
+* This file is part of BLASFEO. *
+* *
+* BLASFEO -- BLAS For Embedded Optimization. *
+* Copyright (C) 2016-2017 by Gianluca Frison. *
+* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl. *
+* All rights reserved. *
+* *
+* HPMPC is free software; you can redistribute it and/or *
+* modify it under the terms of the GNU Lesser General Public *
+* License as published by the Free Software Foundation; either *
+* version 2.1 of the License, or (at your option) any later version. *
+* *
+* HPMPC is distributed in the hope that it will be useful, *
+* but WITHOUT ANY WARRANTY; without even the implied warranty of *
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. *
+* See the GNU Lesser General Public License for more details. *
+* *
+* You should have received a copy of the GNU Lesser General Public *
+* License along with HPMPC; if not, write to the Free Software *
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA *
+* *
+* Author: Gianluca Frison, giaf (at) dtu.dk *
+* gianluca.frison (at) imtek.uni-freiburg.de *
+* *
+**************************************************************************************************/
+
+
+
+#if defined(LA_REFERENCE)
+
+
+
+// dpotrf
+void POTRF_L_LIBSTR(int m, struct STRMAT *sC, int ci, int cj, struct STRMAT *sD, int di, int dj)
+ {
+ if(m<=0)
+ return;
+ int ii, jj, kk;
+ REAL
+ f_00_inv,
+ f_10, f_11_inv,
+ c_00, c_01,
+ c_10, c_11;
+ int ldc = sC->m;
+ int ldd = sD->m;
+ REAL *pC = sC->pA + ci + cj*ldc;
+ REAL *pD = sD->pA + di + dj*ldd;
+ REAL *dD = sD->dA;
+ if(di==0 & dj==0)
+ sD->use_dA = 1;
+ else
+ sD->use_dA = 0;
+ jj = 0;
+ for(; jj<m-1; jj+=2)
+ {
+ // factorize diagonal
+		c_00 = pC[jj+0+ldc*(jj+0)];
+		c_10 = pC[jj+1+ldc*(jj+0)];
+		c_11 = pC[jj+1+ldc*(jj+1)];
+ for(kk=0; kk<jj; kk++)
+ {
+ c_00 -= pD[jj+0+ldd*kk] * pD[jj+0+ldd*kk];
+ c_10 -= pD[jj+1+ldd*kk] * pD[jj+0+ldd*kk];
+ c_11 -= pD[jj+1+ldd*kk] * pD[jj+1+ldd*kk];
+ }
+ if(c_00>0)
+ {
+ f_00_inv = 1.0/sqrt(c_00);
+ }
+ else
+ {
+ f_00_inv = 0.0;
+ }
+ dD[jj+0] = f_00_inv;
+ pD[jj+0+ldd*(jj+0)] = c_00 * f_00_inv;
+ f_10 = c_10 * f_00_inv;
+ pD[jj+1+ldd*(jj+0)] = f_10;
+ c_11 -= f_10 * f_10;
+ if(c_11>0)
+ {
+ f_11_inv = 1.0/sqrt(c_11);
+ }
+ else
+ {
+ f_11_inv = 0.0;
+ }
+ dD[jj+1] = f_11_inv;
+ pD[jj+1+ldd*(jj+1)] = c_11 * f_11_inv;
+ // solve lower
+ ii = jj+2;
+ for(; ii<m-1; ii+=2)
+ {
+ c_00 = pC[ii+0+ldc*(jj+0)];
+ c_10 = pC[ii+1+ldc*(jj+0)];
+ c_01 = pC[ii+0+ldc*(jj+1)];
+ c_11 = pC[ii+1+ldc*(jj+1)];
+ for(kk=0; kk<jj; kk++)
+ {
+ c_00 -= pD[ii+0+ldd*kk] * pD[jj+0+ldd*kk];
+ c_10 -= pD[ii+1+ldd*kk] * pD[jj+0+ldd*kk];
+ c_01 -= pD[ii+0+ldd*kk] * pD[jj+1+ldd*kk];
+ c_11 -= pD[ii+1+ldd*kk] * pD[jj+1+ldd*kk];
+ }
+ c_00 *= f_00_inv;
+ c_10 *= f_00_inv;
+ pD[ii+0+ldd*(jj+0)] = c_00;
+ pD[ii+1+ldd*(jj+0)] = c_10;
+ c_01 -= c_00 * f_10;
+ c_11 -= c_10 * f_10;
+ pD[ii+0+ldd*(jj+1)] = c_01 * f_11_inv;
+ pD[ii+1+ldd*(jj+1)] = c_11 * f_11_inv;
+ }
+ for(; ii<m; ii++)
+ {
+ c_00 = pC[ii+0+ldc*(jj+0)];
+ c_01 = pC[ii+0+ldc*(jj+1)];
+ for(kk=0; kk<jj; kk++)
+ {
+ c_00 -= pD[ii+0+ldd*kk] * pD[jj+0+ldd*kk];
+ c_01 -= pD[ii+0+ldd*kk] * pD[jj+1+ldd*kk];
+ }
+ c_00 *= f_00_inv;
+ pD[ii+0+ldd*(jj+0)] = c_00;
+ c_01 -= c_00 * f_10;
+ pD[ii+0+ldd*(jj+1)] = c_01 * f_11_inv;
+ }
+ }
+ for(; jj<m; jj++)
+ {
+ // factorize diagonal
+		c_00 = pC[jj+ldc*jj];
+ for(kk=0; kk<jj; kk++)
+ {
+ c_00 -= pD[jj+ldd*kk] * pD[jj+ldd*kk];
+ }
+ if(c_00>0)
+ {
+ f_00_inv = 1.0/sqrt(c_00);
+ }
+ else
+ {
+ f_00_inv = 0.0;
+ }
+ dD[jj] = f_00_inv;
+ pD[jj+ldd*jj] = c_00 * f_00_inv;
+		// solve lower
+		// (nothing to do here: in the square m x m case this cleanup loop only
+		// handles the last column, which has no rows below the diagonal)
+ }
+ return;
+ }
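+
+// (editor's note) the 2x2 diagonal step above is the standard Cholesky recurrence
+// on a column pair: with s_ab = C(a,b) - sum_{k<jj} L(a,k)*L(b,k),
+//
+//	L(jj  ,jj  ) = sqrt(s_00)
+//	L(jj+1,jj  ) = s_10 / L(jj,jj)
+//	L(jj+1,jj+1) = sqrt(s_11 - L(jj+1,jj)^2)
+//
+// a non-positive pivot stores a 0.0 inverse, zeroing the column instead of
+// failing, and dD caches 1/L(jj,jj) for use in later triangular solves.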
+
+
+
+// dpotrf
+void POTRF_L_MN_LIBSTR(int m, int n, struct STRMAT *sC, int ci, int cj, struct STRMAT *sD, int di, int dj)
+ {
+ if(m<=0 | n<=0)
+ return;
+ int ii, jj, kk;
+ REAL
+ f_00_inv,
+ f_10, f_11_inv,
+ c_00, c_01,
+ c_10, c_11;
+ int ldc = sC->m;
+ int ldd = sD->m;
+ REAL *pC = sC->pA + ci + cj*ldc;
+ REAL *pD = sD->pA + di + dj*ldd;
+ REAL *dD = sD->dA;
+ if(di==0 & dj==0)
+ sD->use_dA = 1;
+ else
+ sD->use_dA = 0;
+ jj = 0;
+ for(; jj<n-1; jj+=2)
+ {
+ // factorize diagonal
+		c_00 = pC[jj+0+ldc*(jj+0)];
+		c_10 = pC[jj+1+ldc*(jj+0)];
+		c_11 = pC[jj+1+ldc*(jj+1)];
+ for(kk=0; kk<jj; kk++)
+ {
+ c_00 -= pD[jj+0+ldd*kk] * pD[jj+0+ldd*kk];
+ c_10 -= pD[jj+1+ldd*kk] * pD[jj+0+ldd*kk];
+ c_11 -= pD[jj+1+ldd*kk] * pD[jj+1+ldd*kk];
+ }
+ if(c_00>0)
+ {
+ f_00_inv = 1.0/sqrt(c_00);
+ }
+ else
+ {
+ f_00_inv = 0.0;
+ }
+ dD[jj+0] = f_00_inv;
+ pD[jj+0+ldd*(jj+0)] = c_00 * f_00_inv;
+ f_10 = c_10 * f_00_inv;
+ pD[jj+1+ldd*(jj+0)] = f_10;
+ c_11 -= f_10 * f_10;
+ if(c_11>0)
+ {
+ f_11_inv = 1.0/sqrt(c_11);
+ }
+ else
+ {
+ f_11_inv = 0.0;
+ }
+ dD[jj+1] = f_11_inv;
+ pD[jj+1+ldd*(jj+1)] = c_11 * f_11_inv;
+ // solve lower
+ ii = jj+2;
+ for(; ii<m-1; ii+=2)
+ {
+ c_00 = pC[ii+0+ldc*(jj+0)];
+ c_10 = pC[ii+1+ldc*(jj+0)];
+ c_01 = pC[ii+0+ldc*(jj+1)];
+ c_11 = pC[ii+1+ldc*(jj+1)];
+ for(kk=0; kk<jj; kk++)
+ {
+ c_00 -= pD[ii+0+ldd*kk] * pD[jj+0+ldd*kk];
+ c_10 -= pD[ii+1+ldd*kk] * pD[jj+0+ldd*kk];
+ c_01 -= pD[ii+0+ldd*kk] * pD[jj+1+ldd*kk];
+ c_11 -= pD[ii+1+ldd*kk] * pD[jj+1+ldd*kk];
+ }
+ c_00 *= f_00_inv;
+ c_10 *= f_00_inv;
+ pD[ii+0+ldd*(jj+0)] = c_00;
+ pD[ii+1+ldd*(jj+0)] = c_10;
+ c_01 -= c_00 * f_10;
+ c_11 -= c_10 * f_10;
+ pD[ii+0+ldd*(jj+1)] = c_01 * f_11_inv;
+ pD[ii+1+ldd*(jj+1)] = c_11 * f_11_inv;
+ }
+ for(; ii<m; ii++)
+ {
+ c_00 = pC[ii+0+ldc*(jj+0)];
+ c_01 = pC[ii+0+ldc*(jj+1)];
+ for(kk=0; kk<jj; kk++)
+ {
+ c_00 -= pD[ii+0+ldd*kk] * pD[jj+0+ldd*kk];
+ c_01 -= pD[ii+0+ldd*kk] * pD[jj+1+ldd*kk];
+ }
+ c_00 *= f_00_inv;
+ pD[ii+0+ldd*(jj+0)] = c_00;
+ c_01 -= c_00 * f_10;
+ pD[ii+0+ldd*(jj+1)] = c_01 * f_11_inv;
+ }
+ }
+ for(; jj<n; jj++)
+ {
+ // factorize diagonal
+		c_00 = pC[jj+ldc*jj];
+ for(kk=0; kk<jj; kk++)
+ {
+ c_00 -= pD[jj+ldd*kk] * pD[jj+ldd*kk];
+ }
+ if(c_00>0)
+ {
+ f_00_inv = 1.0/sqrt(c_00);
+ }
+ else
+ {
+ f_00_inv = 0.0;
+ }
+ dD[jj] = f_00_inv;
+ pD[jj+ldd*jj] = c_00 * f_00_inv;
+ // solve lower
+ for(ii=jj+1; ii<m; ii++)
+ {
+ c_00 = pC[ii+ldc*jj];
+ for(kk=0; kk<jj; kk++)
+ {
+ c_00 -= pD[ii+ldd*kk] * pD[jj+ldd*kk];
+ }
+ pD[ii+ldd*jj] = c_00 * f_00_inv;
+ }
+ }
+ return;
+ }
+
+
+
+// dsyrk dpotrf
+void SYRK_POTRF_LN_LIBSTR(int m, int n, int k, struct STRMAT *sA, int ai, int aj, struct STRMAT *sB, int bi, int bj, struct STRMAT *sC, int ci, int cj, struct STRMAT *sD, int di, int dj)
+ {
+ int ii, jj, kk;
+ REAL
+ f_00_inv,
+ f_10, f_11_inv,
+ c_00, c_01,
+ c_10, c_11;
+ int lda = sA->m;
+ int ldb = sB->m;
+ int ldc = sC->m;
+ int ldd = sD->m;
+ REAL *pA = sA->pA + ai + aj*lda;
+ REAL *pB = sB->pA + bi + bj*ldb;
+ REAL *pC = sC->pA + ci + cj*ldc;
+ REAL *pD = sD->pA + di + dj*ldd;
+ REAL *dD = sD->dA;
+ if(di==0 & dj==0)
+ sD->use_dA = 1;
+ else
+ sD->use_dA = 0;
+ jj = 0;
+ for(; jj<n-1; jj+=2)
+ {
+ // factorize diagonal
+		c_00 = pC[jj+0+ldc*(jj+0)];
+		c_10 = pC[jj+1+ldc*(jj+0)];
+		c_11 = pC[jj+1+ldc*(jj+1)];
+ for(kk=0; kk<k; kk++)
+ {
+ c_00 += pA[jj+0+lda*kk] * pB[jj+0+ldb*kk];
+ c_10 += pA[jj+1+lda*kk] * pB[jj+0+ldb*kk];
+ c_11 += pA[jj+1+lda*kk] * pB[jj+1+ldb*kk];
+ }
+ for(kk=0; kk<jj; kk++)
+ {
+ c_00 -= pD[jj+0+ldd*kk] * pD[jj+0+ldd*kk];
+ c_10 -= pD[jj+1+ldd*kk] * pD[jj+0+ldd*kk];
+ c_11 -= pD[jj+1+ldd*kk] * pD[jj+1+ldd*kk];
+ }
+ if(c_00>0)
+ {
+ f_00_inv = 1.0/sqrt(c_00);
+ }
+ else
+ {
+ f_00_inv = 0.0;
+ }
+ dD[jj+0] = f_00_inv;
+ pD[jj+0+ldd*(jj+0)] = c_00 * f_00_inv;
+ f_10 = c_10 * f_00_inv;
+ pD[jj+1+ldd*(jj+0)] = f_10;
+ c_11 -= f_10 * f_10;
+ if(c_11>0)
+ {
+ f_11_inv = 1.0/sqrt(c_11);
+ }
+ else
+ {
+ f_11_inv = 0.0;
+ }
+ dD[jj+1] = f_11_inv;
+ pD[jj+1+ldd*(jj+1)] = c_11 * f_11_inv;
+ // solve lower
+ ii = jj+2;
+ for(; ii<m-1; ii+=2)
+ {
+ c_00 = pC[ii+0+ldc*(jj+0)];
+ c_10 = pC[ii+1+ldc*(jj+0)];
+ c_01 = pC[ii+0+ldc*(jj+1)];
+ c_11 = pC[ii+1+ldc*(jj+1)];
+ for(kk=0; kk<k; kk++)
+ {
+ c_00 += pA[ii+0+lda*kk] * pB[jj+0+ldb*kk];
+ c_10 += pA[ii+1+lda*kk] * pB[jj+0+ldb*kk];
+ c_01 += pA[ii+0+lda*kk] * pB[jj+1+ldb*kk];
+ c_11 += pA[ii+1+lda*kk] * pB[jj+1+ldb*kk];
+ }
+ for(kk=0; kk<jj; kk++)
+ {
+ c_00 -= pD[ii+0+ldd*kk] * pD[jj+0+ldd*kk];
+ c_10 -= pD[ii+1+ldd*kk] * pD[jj+0+ldd*kk];
+ c_01 -= pD[ii+0+ldd*kk] * pD[jj+1+ldd*kk];
+ c_11 -= pD[ii+1+ldd*kk] * pD[jj+1+ldd*kk];
+ }
+ c_00 *= f_00_inv;
+ c_10 *= f_00_inv;
+ pD[ii+0+ldd*(jj+0)] = c_00;
+ pD[ii+1+ldd*(jj+0)] = c_10;
+ c_01 -= c_00 * f_10;
+ c_11 -= c_10 * f_10;
+ pD[ii+0+ldd*(jj+1)] = c_01 * f_11_inv;
+ pD[ii+1+ldd*(jj+1)] = c_11 * f_11_inv;
+ }
+ for(; ii<m; ii++)
+ {
+ c_00 = pC[ii+0+ldc*(jj+0)];
+ c_01 = pC[ii+0+ldc*(jj+1)];
+ for(kk=0; kk<k; kk++)
+ {
+ c_00 += pA[ii+0+lda*kk] * pB[jj+0+ldb*kk];
+ c_01 += pA[ii+0+lda*kk] * pB[jj+1+ldb*kk];
+ }
+ for(kk=0; kk<jj; kk++)
+ {
+ c_00 -= pD[ii+0+ldd*kk] * pD[jj+0+ldd*kk];
+ c_01 -= pD[ii+0+ldd*kk] * pD[jj+1+ldd*kk];
+ }
+ c_00 *= f_00_inv;
+ pD[ii+0+ldd*(jj+0)] = c_00;
+ c_01 -= c_00 * f_10;
+ pD[ii+0+ldd*(jj+1)] = c_01 * f_11_inv;
+ }
+ }
+ for(; jj<n; jj++)
+ {
+ // factorize diagonal
+		c_00 = pC[jj+ldc*jj];
+ for(kk=0; kk<k; kk++)
+ {
+ c_00 += pA[jj+lda*kk] * pB[jj+ldb*kk];
+ }
+ for(kk=0; kk<jj; kk++)
+ {
+ c_00 -= pD[jj+ldd*kk] * pD[jj+ldd*kk];
+ }
+ if(c_00>0)
+ {
+ f_00_inv = 1.0/sqrt(c_00);
+ }
+ else
+ {
+ f_00_inv = 0.0;
+ }
+ dD[jj] = f_00_inv;
+ pD[jj+ldd*jj] = c_00 * f_00_inv;
+ // solve lower
+ for(ii=jj+1; ii<m; ii++)
+ {
+ c_00 = pC[ii+ldc*jj];
+ for(kk=0; kk<k; kk++)
+ {
+ c_00 += pA[ii+lda*kk] * pB[jj+ldb*kk];
+ }
+ for(kk=0; kk<jj; kk++)
+ {
+ c_00 -= pD[ii+ldd*kk] * pD[jj+ldd*kk];
+ }
+ pD[ii+ldd*jj] = c_00 * f_00_inv;
+ }
+ }
+ return;
+ }
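+
+// (editor's note) SYRK and POTRF are fused above: each entry first accumulates the
+// rank-k product (c += A(i,:)*B(j,:)^T over k) and then the Cholesky downdate
+// (c -= sum_{l<jj} L(i,l)*L(j,l)) before the usual factorize/solve step, saving a
+// full extra pass over the n x n result compared with calling SYRK then POTRF.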
+
+
+
+// dgetrf without pivoting
+void GETF2_NOPIVOT(int m, int n, REAL *A, int lda, REAL *dA)
+ {
+	int ii, jj, kk, itmp0, itmp1;
+	int iimax = m<n ? m : n;
+	REAL dtmp;
+
+ for(ii=0; ii<iimax; ii++)
+ {
+ itmp0 = m-ii-1;
+ dtmp = 1.0/A[ii+lda*ii];
+ dA[ii] = dtmp;
+ for(jj=0; jj<itmp0; jj++)
+ {
+ A[ii+1+jj+lda*ii] *= dtmp;
+ }
+ itmp1 = n-ii-1;
+ for(jj=0; jj<itmp1; jj++)
+ {
+ for(kk=0; kk<itmp0; kk++)
+ {
+ A[(ii+1+kk)+lda*(ii+1+jj)] -= A[(ii+1+kk)+lda*ii] * A[ii+lda*(ii+1+jj)];
+ }
+ }
+ }
+ return;
+ }
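+
+// (editor's note) GETF2_NOPIVOT is the unblocked right-looking LU: each step scales
+// the subdiagonal of the pivot column by 1.0/A(ii,ii) to form the L column, then
+// applies the rank-1 update A(i,j) -= A(i,ii)*A(ii,j) to the trailing submatrix,
+// caching the pivot reciprocal in dA; with no row exchanges a zero pivot is fatal.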
+
+
+
+// dgetrf without pivoting
+void GETRF_NOPIVOT_LIBSTR(int m, int n, struct STRMAT *sC, int ci, int cj, struct STRMAT *sD, int di, int dj)
+ {
+ int ii, jj, kk;
+ REAL
+ d_00_inv, d_11_inv,
+ d_00, d_01,
+ d_10, d_11;
+ int ldc = sC->m;
+ int ldd = sD->m;
+ REAL *pC = sC->pA + ci + cj*ldc;
+ REAL *pD = sD->pA + di + dj*ldd;
+ REAL *dD = sD->dA;
+ if(di==0 & dj==0)
+ sD->use_dA = 1;
+ else
+ sD->use_dA = 0;
+#if 1
+ jj = 0;
+ for(; jj<n-1; jj+=2)
+ {
+ // upper
+ ii = 0;
+ for(; ii<jj-1; ii+=2)
+ {
+ // correct upper
+ d_00 = pC[(ii+0)+ldc*(jj+0)];
+ d_10 = pC[(ii+1)+ldc*(jj+0)];
+ d_01 = pC[(ii+0)+ldc*(jj+1)];
+ d_11 = pC[(ii+1)+ldc*(jj+1)];
+ for(kk=0; kk<ii; kk++)
+ {
+ d_00 -= pD[(ii+0)+ldd*kk] * pD[kk+ldd*(jj+0)];
+ d_10 -= pD[(ii+1)+ldd*kk] * pD[kk+ldd*(jj+0)];
+ d_01 -= pD[(ii+0)+ldd*kk] * pD[kk+ldd*(jj+1)];
+ d_11 -= pD[(ii+1)+ldd*kk] * pD[kk+ldd*(jj+1)];
+ }
+ // solve upper
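+			// (kk == ii after the loop above, so pD[(ii+1)+ldd*kk] is L(ii+1,ii))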
+ d_10 -= pD[(ii+1)+ldd*kk] * d_00;
+ d_11 -= pD[(ii+1)+ldd*kk] * d_01;
+ pD[(ii+0)+ldd*(jj+0)] = d_00;
+ pD[(ii+1)+ldd*(jj+0)] = d_10;
+ pD[(ii+0)+ldd*(jj+1)] = d_01;
+ pD[(ii+1)+ldd*(jj+1)] = d_11;
+ }
+ for(; ii<jj; ii++)
+ {
+ // correct upper
+ d_00 = pC[(ii+0)+ldc*(jj+0)];
+ d_01 = pC[(ii+0)+ldc*(jj+1)];
+ for(kk=0; kk<ii; kk++)
+ {
+ d_00 -= pD[(ii+0)+ldd*kk] * pD[kk+ldd*(jj+0)];
+ d_01 -= pD[(ii+0)+ldd*kk] * pD[kk+ldd*(jj+1)];
+ }
+ // solve upper
+ pD[(ii+0)+ldd*(jj+0)] = d_00;
+ pD[(ii+0)+ldd*(jj+1)] = d_01;
+ }
+ // diagonal
+ ii = jj;
+ if(ii<m-1)
+ {
+ // correct diagonal
+ d_00 = pC[(ii+0)+ldc*(jj+0)];
+ d_10 = pC[(ii+1)+ldc*(jj+0)];
+ d_01 = pC[(ii+0)+ldc*(jj+1)];
+ d_11 = pC[(ii+1)+ldc*(jj+1)];
+ for(kk=0; kk<ii; kk++)
+ {
+ d_00 -= pD[(ii+0)+ldd*kk] * pD[kk+ldd*(jj+0)];
+ d_10 -= pD[(ii+1)+ldd*kk] * pD[kk+ldd*(jj+0)];
+ d_01 -= pD[(ii+0)+ldd*kk] * pD[kk+ldd*(jj+1)];
+ d_11 -= pD[(ii+1)+ldd*kk] * pD[kk+ldd*(jj+1)];
+ }
+ // factorize diagonal
+ d_00_inv = 1.0/d_00;
+ d_10 *= d_00_inv;
+ d_11 -= d_10 * d_01;
+ d_11_inv = 1.0/d_11;
+ pD[(ii+0)+ldd*(jj+0)] = d_00;
+ pD[(ii+1)+ldd*(jj+0)] = d_10;
+ pD[(ii+0)+ldd*(jj+1)] = d_01;
+ pD[(ii+1)+ldd*(jj+1)] = d_11;
+ dD[ii+0] = d_00_inv;
+ dD[ii+1] = d_11_inv;
+ ii += 2;
+ }
+ else if(ii<m)
+ {
+ // correct diagonal
+ d_00 = pC[(ii+0)+ldc*(jj+0)];
+ d_01 = pC[(ii+0)+ldc*(jj+1)];
+ for(kk=0; kk<ii; kk++)
+ {
+ d_00 -= pD[(ii+0)+ldd*kk] * pD[kk+ldd*(jj+0)];
+ d_01 -= pD[(ii+0)+ldd*kk] * pD[kk+ldd*(jj+1)];
+ }
+ // factorize diagonal
+ d_00_inv = 1.0/d_00;
+ pD[(ii+0)+ldd*(jj+0)] = d_00;
+ pD[(ii+0)+ldd*(jj+1)] = d_01;
+ dD[ii+0] = d_00_inv;
+ ii += 1;
+ }
+ // lower
+ for(; ii<m-1; ii+=2)
+ {
+ // correct lower
+ d_00 = pC[(ii+0)+ldc*(jj+0)];
+ d_10 = pC[(ii+1)+ldc*(jj+0)];
+ d_01 = pC[(ii+0)+ldc*(jj+1)];
+ d_11 = pC[(ii+1)+ldc*(jj+1)];
+ for(kk=0; kk<jj; kk++)
+ {
+ d_00 -= pD[(ii+0)+ldd*kk] * pD[kk+ldd*(jj+0)];
+ d_10 -= pD[(ii+1)+ldd*kk] * pD[kk+ldd*(jj+0)];
+ d_01 -= pD[(ii+0)+ldd*kk] * pD[kk+ldd*(jj+1)];
+ d_11 -= pD[(ii+1)+ldd*kk] * pD[kk+ldd*(jj+1)];
+ }
+ // solve lower
+ d_00 *= d_00_inv;
+ d_10 *= d_00_inv;
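+			// (kk == jj after the correction loop, so pD[kk+ldd*(jj+1)] is U(jj,jj+1))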
+ d_01 -= d_00 * pD[kk+ldd*(jj+1)];
+ d_11 -= d_10 * pD[kk+ldd*(jj+1)];
+ d_01 *= d_11_inv;
+ d_11 *= d_11_inv;
+ pD[(ii+0)+ldd*(jj+0)] = d_00;
+ pD[(ii+1)+ldd*(jj+0)] = d_10;
+ pD[(ii+0)+ldd*(jj+1)] = d_01;
+ pD[(ii+1)+ldd*(jj+1)] = d_11;
+ }
+ for(; ii<m; ii++)
+ {
+ // correct lower
+ d_00 = pC[(ii+0)+ldc*(jj+0)];
+ d_01 = pC[(ii+0)+ldc*(jj+1)];
+ for(kk=0; kk<jj; kk++)
+ {
+ d_00 -= pD[(ii+0)+ldd*kk] * pD[kk+ldd*(jj+0)];
+ d_01 -= pD[(ii+0)+ldd*kk] * pD[kk+ldd*(jj+1)];
+ }
+ // solve lower
+ d_00 *= d_00_inv;
+ d_01 -= d_00 * pD[kk+ldd*(jj+1)];
+ d_01 *= d_11_inv;
+ pD[(ii+0)+ldd*(jj+0)] = d_00;
+ pD[(ii+0)+ldd*(jj+1)] = d_01;
+ }
+ }
+ for(; jj<n; jj++)
+ {
+ // upper
+ ii = 0;
+ for(; ii<jj-1; ii+=2)
+ {
+ // correct upper
+ d_00 = pC[(ii+0)+ldc*jj];
+ d_10 = pC[(ii+1)+ldc*jj];
+ for(kk=0; kk<ii; kk++)
+ {
+ d_00 -= pD[(ii+0)+ldd*kk] * pD[kk+ldd*jj];
+ d_10 -= pD[(ii+1)+ldd*kk] * pD[kk+ldd*jj];
+ }
+ // solve upper
+ d_10 -= pD[(ii+1)+ldd*kk] * d_00;
+ pD[(ii+0)+ldd*jj] = d_00;
+ pD[(ii+1)+ldd*jj] = d_10;
+ }
+ for(; ii<jj; ii++)
+ {
+ // correct upper
+ d_00 = pC[(ii+0)+ldc*jj];
+ for(kk=0; kk<ii; kk++)
+ {
+ d_00 -= pD[(ii+0)+ldd*kk] * pD[kk+ldd*jj];
+ }
+ // solve upper
+ pD[(ii+0)+ldd*jj] = d_00;
+ }
+ // diagonal
+ ii = jj;
+ if(ii<m-1)
+ {
+ // correct diagonal
+ d_00 = pC[(ii+0)+ldc*jj];
+ d_10 = pC[(ii+1)+ldc*jj];
+ for(kk=0; kk<ii; kk++)
+ {
+ d_00 -= pD[(ii+0)+ldd*kk] * pD[kk+ldd*jj];
+ d_10 -= pD[(ii+1)+ldd*kk] * pD[kk+ldd*jj];
+ }
+ // factorize diagonal
+ d_00_inv = 1.0/d_00;
+ d_10 *= d_00_inv;
+ pD[(ii+0)+ldd*jj] = d_00;
+ pD[(ii+1)+ldd*jj] = d_10;
+ dD[ii+0] = d_00_inv;
+ ii += 2;
+ }
+ else if(ii<m)
+ {
+ // correct diagonal
+ d_00 = pC[(ii+0)+ldc*jj];
+ for(kk=0; kk<ii; kk++)
+ {
+ d_00 -= pD[(ii+0)+ldd*kk] * pD[kk+ldd*jj];
+ }
+ // factorize diagonal
+ d_00_inv = 1.0/d_00;
+ pD[(ii+0)+ldd*jj] = d_00;
+ dD[ii+0] = d_00_inv;
+ ii += 1;
+ }
+ // lower
+ for(; ii<m-1; ii+=2)
+ {
+ // correct lower
+ d_00 = pC[(ii+0)+ldc*jj];
+ d_10 = pC[(ii+1)+ldc*jj];
+ for(kk=0; kk<jj; kk++)
+ {
+ d_00 -= pD[(ii+0)+ldd*kk] * pD[kk+ldd*jj];
+ d_10 -= pD[(ii+1)+ldd*kk] * pD[kk+ldd*jj];
+ }
+ // solve lower
+ d_00 *= d_00_inv;
+ d_10 *= d_00_inv;
+ pD[(ii+0)+ldd*jj] = d_00;
+ pD[(ii+1)+ldd*jj] = d_10;
+ }
+ for(; ii<m; ii++)
+ {
+ // correct lower
+ d_00 = pC[(ii+0)+ldc*jj];
+ for(kk=0; kk<jj; kk++)
+ {
+ d_00 -= pD[(ii+0)+ldd*kk] * pD[kk+ldd*jj];
+ }
+ // solve lower
+ d_00 *= d_00_inv;
+ pD[(ii+0)+ldd*jj] = d_00;
+ }
+ }
+#else
+ if(pC!=pD)
+ {
+ for(jj=0; jj<n; jj++)
+ {
+ for(ii=0; ii<m; ii++)
+ {
+ pD[ii+ldd*jj] = pC[ii+ldc*jj];
+ }
+ }
+ }
+ GETF2_NOPIVOT(m, n, pD, ldd, dD);
+#endif
+ return;
+ }
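+
+// (editor's note) the #if 1 path above unrolls the no-pivot factorization 2x2 by
+// column pairs; the #else branch is the reference fallback that copies C into D
+// and runs the unblocked GETF2_NOPIVOT on it.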
+
+
+
+// dgetrf with row pivoting
+void GETRF_LIBSTR(int m, int n, struct STRMAT *sC, int ci, int cj, struct STRMAT *sD, int di, int dj, int *ipiv)
+ {
+ int ii, i0, jj, kk, ip, itmp0, itmp1;
+ REAL dtmp, dmax;
+ REAL
+ d_00_inv, d_11_inv,
+ d_00, d_01,
+ d_10, d_11;
+ int i1 = 1;
+ REAL d1 = 1.0;
+ int ldc = sC->m;
+ int ldd = sD->m;
+ REAL *pC = sC->pA+ci+cj*ldc;
+ REAL *pD = sD->pA+di+dj*ldd;
+ REAL *dD = sD->dA;
+ if(di==0 & dj==0)
+ sD->use_dA = 1;
+ else
+ sD->use_dA = 0;
+ // copy if needed
+ if(pC!=pD)
+ {
+ for(jj=0; jj<n; jj++)
+ {
+ for(ii=0; ii<m; ii++)
+ {
+ pD[ii+ldd*jj] = pC[ii+ldc*jj];
+ }
+ }
+ }
+ // factorize
+#if 1
+ jj = 0;
+ for(; jj<n-1; jj+=2)
+ {
+ ii = 0;
+ for(; ii<jj-1; ii+=2)
+ {
+ // correct upper
+ d_00 = pD[(ii+0)+ldd*(jj+0)];
+ d_10 = pD[(ii+1)+ldd*(jj+0)];
+ d_01 = pD[(ii+0)+ldd*(jj+1)];
+ d_11 = pD[(ii+1)+ldd*(jj+1)];
+ for(kk=0; kk<ii; kk++)
+ {
+ d_00 -= pD[(ii+0)+ldd*kk] * pD[kk+ldd*(jj+0)];
+ d_10 -= pD[(ii+1)+ldd*kk] * pD[kk+ldd*(jj+0)];
+ d_01 -= pD[(ii+0)+ldd*kk] * pD[kk+ldd*(jj+1)];
+ d_11 -= pD[(ii+1)+ldd*kk] * pD[kk+ldd*(jj+1)];
+ }
+ // solve upper
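+			// (kk == ii after the loop above, so pD[(ii+1)+ldd*kk] is L(ii+1,ii))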
+ d_10 -= pD[(ii+1)+ldd*kk] * d_00;
+ d_11 -= pD[(ii+1)+ldd*kk] * d_01;
+ pD[(ii+0)+ldd*(jj+0)] = d_00;
+ pD[(ii+1)+ldd*(jj+0)] = d_10;
+ pD[(ii+0)+ldd*(jj+1)] = d_01;
+ pD[(ii+1)+ldd*(jj+1)] = d_11;
+ }
+ for(; ii<jj; ii++)
+ {
+ // correct upper
+ d_00 = pD[(ii+0)+ldd*(jj+0)];
+ d_01 = pD[(ii+0)+ldd*(jj+1)];
+ for(kk=0; kk<ii; kk++)
+ {
+ d_00 -= pD[(ii+0)+ldd*kk] * pD[kk+ldd*(jj+0)];
+ d_01 -= pD[(ii+0)+ldd*kk] * pD[kk+ldd*(jj+1)];
+ }
+ // solve upper
+ pD[(ii+0)+ldd*(jj+0)] = d_00;
+ pD[(ii+0)+ldd*(jj+1)] = d_01;
+ }
+ // correct diagonal and lower and look for pivot
+ // correct
+ ii = jj;
+ i0 = ii;
+ for(; ii<m-1; ii+=2)
+ {
+ d_00 = pD[(ii+0)+ldd*(jj+0)];
+ d_10 = pD[(ii+1)+ldd*(jj+0)];
+ d_01 = pD[(ii+0)+ldd*(jj+1)];
+ d_11 = pD[(ii+1)+ldd*(jj+1)];
+ for(kk=0; kk<jj; kk++)
+ {
+ d_00 -= pD[(ii+0)+ldd*kk] * pD[kk+ldd*(jj+0)];
+ d_10 -= pD[(ii+1)+ldd*kk] * pD[kk+ldd*(jj+0)];
+ d_01 -= pD[(ii+0)+ldd*kk] * pD[kk+ldd*(jj+1)];
+ d_11 -= pD[(ii+1)+ldd*kk] * pD[kk+ldd*(jj+1)];
+ }
+ pD[(ii+0)+ldd*(jj+0)] = d_00;
+ pD[(ii+1)+ldd*(jj+0)] = d_10;
+ pD[(ii+0)+ldd*(jj+1)] = d_01;
+ pD[(ii+1)+ldd*(jj+1)] = d_11;
+ }
+ for(; ii<m; ii++)
+ {
+ d_00 = pD[(ii+0)+ldd*(jj+0)];
+ d_01 = pD[(ii+0)+ldd*(jj+1)];
+ for(kk=0; kk<jj; kk++)
+ {
+ d_00 -= pD[(ii+0)+ldd*kk] * pD[kk+ldd*(jj+0)];
+ d_01 -= pD[(ii+0)+ldd*kk] * pD[kk+ldd*(jj+1)];
+ }
+ pD[(ii+0)+ldd*(jj+0)] = d_00;
+ pD[(ii+0)+ldd*(jj+1)] = d_01;
+ }
+ // look for pivot & solve
+ // left column
+ ii = i0;
+ dmax = 0;
+ ip = ii;
+ for(; ii<m-1; ii+=2)
+ {
+ d_00 = pD[(ii+0)+ldd*jj];
+ d_10 = pD[(ii+1)+ldd*jj];
+ dtmp = d_00>0.0 ? d_00 : -d_00;
+ if(dtmp>dmax)
+ {
+ dmax = dtmp;
+ ip = ii+0;
+ }
+ dtmp = d_10>0.0 ? d_10 : -d_10;
+ if(dtmp>dmax)
+ {
+ dmax = dtmp;
+ ip = ii+1;
+ }
+ }
+ for(; ii<m; ii++)
+ {
+ d_00 = pD[(ii+0)+ldd*jj];
+ dtmp = d_00>0.0 ? d_00 : -d_00;
+ if(dtmp>dmax)
+ {
+ dmax = dtmp;
+ ip = ii+0;
+ }
+ }
+ // row swap
+ ii = i0;
+ ipiv[ii] = ip;
+ if(ip!=ii)
+ {
+ for(kk=0; kk<n; kk++)
+ {
+ dtmp = pD[ii+ldd*kk];
+ pD[ii+ldd*kk] = pD[ip+ldd*kk];
+ pD[ip+ldd*kk] = dtmp;
+ }
+ }
+ // factorize diagonal
+ d_00 = pD[(ii+0)+ldd*(jj+0)];
+ d_00_inv = 1.0/d_00;
+ pD[(ii+0)+ldd*(jj+0)] = d_00;
+ dD[ii] = d_00_inv;
+ ii += 1;
+ // solve & compute next pivot
+ dmax = 0;
+ ip = ii;
+ for(; ii<m-1; ii+=2)
+ {
+ d_00 = pD[(ii+0)+ldd*(jj+0)];
+ d_10 = pD[(ii+1)+ldd*(jj+0)];
+ d_00 *= d_00_inv;
+ d_10 *= d_00_inv;
+ d_01 = pD[(ii+0)+ldd*(jj+1)];
+ d_11 = pD[(ii+1)+ldd*(jj+1)];
+ d_01 -= d_00 * pD[jj+ldd*(jj+1)];
+ d_11 -= d_10 * pD[jj+ldd*(jj+1)];
+ dtmp = d_01>0.0 ? d_01 : -d_01;
+ if(dtmp>dmax)
+ {
+ dmax = dtmp;
+ ip = ii+0;
+ }
+ dtmp = d_11>0.0 ? d_11 : -d_11;
+ if(dtmp>dmax)
+ {
+ dmax = dtmp;
+ ip = ii+1;
+ }
+ pD[(ii+0)+ldd*(jj+0)] = d_00;
+ pD[(ii+1)+ldd*(jj+0)] = d_10;
+ pD[(ii+0)+ldd*(jj+1)] = d_01;
+ pD[(ii+1)+ldd*(jj+1)] = d_11;
+ }
+ for(; ii<m; ii++)
+ {
+ d_00 = pD[(ii+0)+ldd*(jj+0)];
+ d_00 *= d_00_inv;
+ d_01 = pD[(ii+0)+ldd*(jj+1)];
+ d_01 -= d_00 * pD[jj+ldd*(jj+1)];
+ dtmp = d_01>0.0 ? d_01 : -d_01;
+ if(dtmp>dmax)
+ {
+ dmax = dtmp;
+ ip = ii+0;
+ }
+ pD[(ii+0)+ldd*(jj+0)] = d_00;
+ pD[(ii+0)+ldd*(jj+1)] = d_01;
+ }
+ // row swap
+ ii = i0+1;
+ ipiv[ii] = ip;
+ if(ip!=ii)
+ {
+ for(kk=0; kk<n; kk++)
+ {
+ dtmp = pD[ii+ldd*kk];
+ pD[ii+ldd*kk] = pD[ip+ldd*kk];
+ pD[ip+ldd*kk] = dtmp;
+ }
+ }
+ // factorize diagonal
+ d_00 = pD[ii+ldd*(jj+1)];
+ d_00_inv = 1.0/d_00;
+ pD[ii+ldd*(jj+1)] = d_00;
+ dD[ii] = d_00_inv;
+ ii += 1;
+ // solve lower
+ for(; ii<m; ii++)
+ {
+ d_00 = pD[ii+ldd*(jj+1)];
+ d_00 *= d_00_inv;
+ pD[ii+ldd*(jj+1)] = d_00;
+ }
+ }
+ for(; jj<n; jj++)
+ {
+ ii = 0;
+ for(; ii<jj-1; ii+=2)
+ {
+ // correct upper
+ d_00 = pD[(ii+0)+ldd*jj];
+ d_10 = pD[(ii+1)+ldd*jj];
+ for(kk=0; kk<ii; kk++)
+ {
+ d_00 -= pD[(ii+0)+ldd*kk] * pD[kk+ldd*jj];
+ d_10 -= pD[(ii+1)+ldd*kk] * pD[kk+ldd*jj];
+ }
+ // solve upper
+ d_10 -= pD[(ii+1)+ldd*kk] * d_00;
+ pD[(ii+0)+ldd*jj] = d_00;
+ pD[(ii+1)+ldd*jj] = d_10;
+ }
+ for(; ii<jj; ii++)
+ {
+ // correct upper
+ d_00 = pD[ii+ldd*jj];
+ for(kk=0; kk<ii; kk++)
+ {
+ d_00 -= pD[ii+ldd*kk] * pD[kk+ldd*jj];
+ }
+ // solve upper
+ pD[ii+ldd*jj] = d_00;
+ }
+ i0 = ii;
+ ii = jj;
+ // correct diagonal and lower and look for pivot
+ dmax = 0;
+ ip = ii;
+ for(; ii<m-1; ii+=2)
+ {
+ d_00 = pD[(ii+0)+ldd*jj];
+ d_10 = pD[(ii+1)+ldd*jj];
+ for(kk=0; kk<jj; kk++)
+ {
+ d_00 -= pD[(ii+0)+ldd*kk] * pD[kk+ldd*jj];
+ d_10 -= pD[(ii+1)+ldd*kk] * pD[kk+ldd*jj];
+ }
+ dtmp = d_00>0.0 ? d_00 : -d_00;
+ if(dtmp>dmax)
+ {
+ dmax = dtmp;
+ ip = ii+0;
+ }
+ dtmp = d_10>0.0 ? d_10 : -d_10;
+ if(dtmp>dmax)
+ {
+ dmax = dtmp;
+ ip = ii+1;
+ }
+ pD[(ii+0)+ldd*jj] = d_00;
+ pD[(ii+1)+ldd*jj] = d_10;
+ }
+ for(; ii<m; ii++)
+ {
+ d_00 = pD[(ii+0)+ldd*jj];
+ for(kk=0; kk<jj; kk++)
+ {
+ d_00 -= pD[(ii+0)+ldd*kk] * pD[kk+ldd*jj];
+ }
+ dtmp = d_00>0.0 ? d_00 : -d_00;
+ if(dtmp>dmax)
+ {
+ dmax = dtmp;
+ ip = ii+0;
+ }
+ pD[(ii+0)+ldd*jj] = d_00;
+ }
+ // row swap
+ ii = i0;
+ ipiv[ii] = ip;
+ if(ip!=ii)
+ {
+ for(kk=0; kk<n; kk++)
+ {
+ dtmp = pD[ii+ldd*kk];
+ pD[ii+ldd*kk] = pD[ip+ldd*kk];
+ pD[ip+ldd*kk] = dtmp;
+ }
+ }
+ // factorize diagonal
+ d_00 = pD[ii+ldd*jj];
+ d_00_inv = 1.0/d_00;
+ pD[ii+ldd*jj] = d_00;
+ dD[ii] = d_00_inv;
+ ii += 1;
+ for(; ii<m; ii++)
+ {
+ // correct lower
+ d_00 = pD[ii+ldd*jj];
+ // solve lower
+ d_00 *= d_00_inv;
+ pD[ii+ldd*jj] = d_00;
+ }
+ }
+#else
+ int iimax = m<n ? m : n;
+ for(ii=0; ii<iimax; ii++)
+ {
+ dmax = (pD[ii+ldd*ii]>0 ? pD[ii+ldd*ii] : -pD[ii+ldd*ii]);
+ ip = ii;
+ for(jj=1; jj<m-ii; jj++)
+ {
+ dtmp = pD[ii+jj+ldd*ii]>0 ? pD[ii+jj+ldd*ii] : -pD[ii+jj+ldd*ii];
+ if(dtmp>dmax)
+ {
+ dmax = dtmp;
+ ip = ii+jj;
+ }
+ }
+ ipiv[ii] = ip;
+ if(ip!=ii)
+ {
+ for(jj=0; jj<n; jj++)
+ {
+ dtmp = pD[ii+jj*ldd];
+ pD[ii+jj*ldd] = pD[ip+jj*ldd];
+ pD[ip+jj*ldd] = dtmp;
+ }
+ }
+ itmp0 = m-ii-1;
+ dtmp = 1.0/pD[ii+ldd*ii];
+ dD[ii] = dtmp;
+ for(jj=0; jj<itmp0; jj++)
+ {
+ pD[ii+1+jj+ldd*ii] *= dtmp;
+ }
+ itmp1 = n-ii-1;
+ for(jj=0; jj<itmp1; jj++)
+ {
+ for(kk=0; kk<itmp0; kk++)
+ {
+ pD[(ii+1+kk)+ldd*(ii+1+jj)] -= pD[(ii+1+kk)+ldd*ii] * pD[ii+ldd*(ii+1+jj)];
+ }
+ }
+ }
+#endif
+ return;
+ }
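+
+// (editor's sketch, hypothetical helper) ipiv records the 0-based row swaps in the
+// order they were applied, so a right-hand side b must be permuted the same way
+// before the triangular solves:
+//
+//	for(ii=0; ii<(m<n ? m : n); ii++)
+//		if(ipiv[ii]!=ii)
+//			{ tmp = b[ii]; b[ii] = b[ipiv[ii]]; b[ipiv[ii]] = tmp; }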
+
+
+
+int GEQRF_WORK_SIZE_LIBSTR(int m, int n)
+ {
+ return 0;
+ }
+
+
+
+void GEQRF_LIBSTR(int m, int n, struct STRMAT *sA, int ai, int aj, struct STRMAT *sD, int di, int dj, void *work)
+ {
+ if(m<=0 | n<=0)
+ return;
+ int ii, jj, kk;
+ int lda = sA->m;
+ int ldd = sD->m;
+ REAL *pA = sA->pA+ai+aj*lda;
+ REAL *pD = sD->pA+di+dj*ldd; // matrix of QR
+ REAL *dD = sD->dA+di; // vectors of tau
+ REAL alpha, beta, tmp, w0, w1;
+ REAL *pC00, *pC01, *pC11, *pv0, *pv1;
+ REAL pW[4] = {0.0, 0.0, 0.0, 0.0};
+ int ldw = 2;
+ REAL pT[4] = {0.0, 0.0, 0.0, 0.0};
+ int ldb = 2;
+ int imax, jmax, kmax;
+ // copy if needed
+ if(pA!=pD)
+ {
+ for(jj=0; jj<n; jj++)
+ {
+ for(ii=0; ii<m; ii++)
+ {
+ pD[ii+ldd*jj] = pA[ii+lda*jj];
+ }
+ }
+ }
+ imax = m<n ? m : n;
+ ii = 0;
+#if 1
+ for(; ii<imax-1; ii+=2)
+ {
+ // first column
+ pC00 = &pD[ii+ldd*ii];
+ beta = 0.0;
+ for(jj=1; jj<m-ii; jj++)
+ {
+ tmp = pC00[jj+ldd*0];
+ beta += tmp*tmp;
+ }
+ if(beta==0.0)
+ {
+ // tau0
+ dD[ii] = 0.0;
+ }
+ else
+ {
+ alpha = pC00[0+ldd*0];
+ beta += alpha*alpha;
+ beta = sqrt(beta);
+ if(alpha>0)
+ beta = -beta;
+ // tau0
+ dD[ii] = (beta-alpha) / beta;
+ tmp = 1.0 / (alpha-beta);
+ // compute v0
+ pC00[0+ldd*0] = beta;
+ for(jj=1; jj<m-ii; jj++)
+ {
+ pC00[jj+ldd*0] *= tmp;
+ }
+ }
+ // gemv_t & ger
+ pC01 = &pC00[0+ldd*1];
+ pv0 = &pC00[0+ldd*0];
+ kmax = m-ii;
+ w0 = pC01[0+ldd*0]; // pv0[0] = 1.0
+ for(kk=1; kk<kmax; kk++)
+ {
+ w0 += pC01[kk+ldd*0] * pv0[kk];
+ }
+ w0 = - dD[ii] * w0;
+ pC01[0+ldd*0] += w0; // pv0[0] = 1.0
+ for(kk=1; kk<kmax; kk++)
+ {
+ pC01[kk+ldd*0] += w0 * pv0[kk];
+ }
+ // second column
+ pC11 = &pD[(ii+1)+ldd*(ii+1)];
+ beta = 0.0;
+ for(jj=1; jj<m-(ii+1); jj++)
+ {
+ tmp = pC11[jj+ldd*0];
+ beta += tmp*tmp;
+ }
+ if(beta==0.0)
+ {
+ // tau1
+ dD[(ii+1)] = 0.0;
+ }
+ else
+ {
+ alpha = pC11[0+ldd*0];
+ beta += alpha*alpha;
+ beta = sqrt(beta);
+ if(alpha>0)
+ beta = -beta;
+ // tau1
+ dD[(ii+1)] = (beta-alpha) / beta;
+ tmp = 1.0 / (alpha-beta);
+ // compute v1
+ pC11[0+ldd*0] = beta;
+ for(jj=1; jj<m-(ii+1); jj++)
+ pC11[jj+ldd*0] *= tmp;
+ }
+ // compute lower triangular T containing tau for matrix update
+ pv0 = &pC00[0+ldd*0];
+ pv1 = &pC00[0+ldd*1];
+ kmax = m-ii;
+ tmp = pv0[1];
+ for(kk=2; kk<kmax; kk++)
+ tmp += pv0[kk]*pv1[kk];
+ pT[0+ldb*0] = dD[ii+0];
+ pT[1+ldb*0] = - dD[ii+1] * tmp * dD[ii+0];
+ pT[1+ldb*1] = dD[ii+1];
+ jmax = n-ii-2;
+ jj = 0;
+ for(; jj<jmax-1; jj+=2)
+ {
+ // compute W^T = C^T * V
+ pW[0+ldw*0] = pC00[0+ldd*(jj+0+2)] + pC00[1+ldd*(jj+0+2)] * pv0[1];
+ pW[1+ldw*0] = pC00[0+ldd*(jj+1+2)] + pC00[1+ldd*(jj+1+2)] * pv0[1];
+ pW[0+ldw*1] = pC00[1+ldd*(jj+0+2)];
+ pW[1+ldw*1] = pC00[1+ldd*(jj+1+2)];
+ kk = 2;
+ for(; kk<kmax; kk++)
+ {
+ tmp = pC00[kk+ldd*(jj+0+2)];
+ pW[0+ldw*0] += tmp * pv0[kk];
+ pW[0+ldw*1] += tmp * pv1[kk];
+ tmp = pC00[kk+ldd*(jj+1+2)];
+ pW[1+ldw*0] += tmp * pv0[kk];
+ pW[1+ldw*1] += tmp * pv1[kk];
+ }
+ // compute W^T *= T
+ pW[0+ldw*1] = pT[1+ldb*0]*pW[0+ldw*0] + pT[1+ldb*1]*pW[0+ldw*1];
+ pW[1+ldw*1] = pT[1+ldb*0]*pW[1+ldw*0] + pT[1+ldb*1]*pW[1+ldw*1];
+ pW[0+ldw*0] = pT[0+ldb*0]*pW[0+ldw*0];
+ pW[1+ldw*0] = pT[0+ldb*0]*pW[1+ldw*0];
+ // compute C -= V * W^T
+ pC00[0+ldd*(jj+0+2)] -= pW[0+ldw*0];
+ pC00[0+ldd*(jj+1+2)] -= pW[1+ldw*0];
+ pC00[1+ldd*(jj+0+2)] -= pv0[1]*pW[0+ldw*0] + pW[0+ldw*1];
+ pC00[1+ldd*(jj+1+2)] -= pv0[1]*pW[1+ldw*0] + pW[1+ldw*1];
+ kk = 2;
+ for(; kk<kmax-1; kk+=2)
+ {
+ pC00[kk+0+ldd*(jj+0+2)] -= pv0[kk+0]*pW[0+ldw*0] + pv1[kk+0]*pW[0+ldw*1];
+ pC00[kk+1+ldd*(jj+0+2)] -= pv0[kk+1]*pW[0+ldw*0] + pv1[kk+1]*pW[0+ldw*1];
+ pC00[kk+0+ldd*(jj+1+2)] -= pv0[kk+0]*pW[1+ldw*0] + pv1[kk+0]*pW[1+ldw*1];
+ pC00[kk+1+ldd*(jj+1+2)] -= pv0[kk+1]*pW[1+ldw*0] + pv1[kk+1]*pW[1+ldw*1];
+ }
+ for(; kk<kmax; kk++)
+ {
+ pC00[kk+ldd*(jj+0+2)] -= pv0[kk]*pW[0+ldw*0] + pv1[kk]*pW[0+ldw*1];
+ pC00[kk+ldd*(jj+1+2)] -= pv0[kk]*pW[1+ldw*0] + pv1[kk]*pW[1+ldw*1];
+ }
+ }
+ for(; jj<jmax; jj++)
+ {
+ // compute W = T * V^T * C
+ pW[0+ldw*0] = pC00[0+ldd*(jj+0+2)] + pC00[1+ldd*(jj+0+2)] * pv0[1];
+ pW[0+ldw*1] = pC00[1+ldd*(jj+0+2)];
+ for(kk=2; kk<kmax; kk++)
+ {
+ tmp = pC00[kk+ldd*(jj+0+2)];
+ pW[0+ldw*0] += tmp * pv0[kk];
+ pW[0+ldw*1] += tmp * pv1[kk];
+ }
+ pW[0+ldw*1] = pT[1+ldb*0]*pW[0+ldw*0] + pT[1+ldb*1]*pW[0+ldw*1];
+ pW[0+ldw*0] = pT[0+ldb*0]*pW[0+ldw*0];
+ // compute C -= V * W^T
+ pC00[0+ldd*(jj+0+2)] -= pW[0+ldw*0];
+ pC00[1+ldd*(jj+0+2)] -= pv0[1]*pW[0+ldw*0] + pW[0+ldw*1];
+ for(kk=2; kk<kmax; kk++)
+ {
+ pC00[kk+ldd*(jj+0+2)] -= pv0[kk]*pW[0+ldw*0] + pv1[kk]*pW[0+ldw*1];
+ }
+ }
+ }
+#endif
+ for(; ii<imax; ii++)
+ {
+ pC00 = &pD[ii+ldd*ii];
+ beta = 0.0;
+ for(jj=1; jj<m-ii; jj++)
+ {
+ tmp = pC00[jj+ldd*0];
+ beta += tmp*tmp;
+ }
+ if(beta==0.0)
+ {
+ dD[ii] = 0.0;
+ }
+ else
+ {
+ alpha = pC00[0+ldd*0];
+ beta += alpha*alpha;
+ beta = sqrt(beta);
+ if(alpha>0)
+ beta = -beta;
+ dD[ii] = (beta-alpha) / beta;
+ tmp = 1.0 / (alpha-beta);
+ for(jj=1; jj<m-ii; jj++)
+ pC00[jj+ldd*0] *= tmp;
+ pC00[0+ldd*0] = beta;
+ }
+ if(ii<n)
+ {
+ // gemv_t & ger
+ pC01 = &pC00[0+ldd*1];
+ pv0 = &pC00[0+ldd*0];
+ jmax = n-ii-1;
+ kmax = m-ii;
+ jj = 0;
+ for(; jj<jmax-1; jj+=2)
+ {
+ w0 = pC01[0+ldd*(jj+0)]; // pv0[0] = 1.0
+ w1 = pC01[0+ldd*(jj+1)]; // pv0[0] = 1.0
+ for(kk=1; kk<kmax; kk++)
+ {
+ w0 += pC01[kk+ldd*(jj+0)] * pv0[kk];
+ w1 += pC01[kk+ldd*(jj+1)] * pv0[kk];
+ }
+ w0 = - dD[ii] * w0;
+ w1 = - dD[ii] * w1;
+ pC01[0+ldd*(jj+0)] += w0; // pv0[0] = 1.0
+ pC01[0+ldd*(jj+1)] += w1; // pv0[0] = 1.0
+ for(kk=1; kk<kmax; kk++)
+ {
+ pC01[kk+ldd*(jj+0)] += w0 * pv0[kk];
+ pC01[kk+ldd*(jj+1)] += w1 * pv0[kk];
+ }
+ }
+ for(; jj<jmax; jj++)
+ {
+ w0 = pC01[0+ldd*jj]; // pv0[0] = 1.0
+ for(kk=1; kk<kmax; kk++)
+ {
+ w0 += pC01[kk+ldd*jj] * pv0[kk];
+ }
+ w0 = - dD[ii] * w0;
+ pC01[0+ldd*jj] += w0; // pv0[0] = 1.0
+ for(kk=1; kk<kmax; kk++)
+ {
+ pC01[kk+ldd*jj] += w0 * pv0[kk];
+ }
+ }
+ }
+ }
+ return;
+ }
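+
+// (editor's note) each column step above builds a Householder reflector
+// H = I - tau*v*v^T with v[0] = 1 stored implicitly below the diagonal:
+//
+//	beta = -sign(alpha) * sqrt(alpha^2 + ||x||^2)
+//	tau  = (beta-alpha)/beta,  v[1:] = x[1:]/(alpha-beta),  R(ii,ii) = beta
+//
+// (the dlarfg convention); pairs of reflectors are applied to the trailing columns
+// through the 2x2 lower triangular factor pT (compact WY): H1*H0 = I - V*T*V^T.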
+
+
+
+int GELQF_WORK_SIZE_LIBSTR(int m, int n)
+ {
+ return 0;
+ }
+
+
+
+void GELQF_LIBSTR(int m, int n, struct STRMAT *sA, int ai, int aj, struct STRMAT *sD, int di, int dj, void *work)
+ {
+ if(m<=0 | n<=0)
+ return;
+ int ii, jj, kk;
+ int lda = sA->m;
+ int ldd = sD->m;
+ REAL *pA = sA->pA+ai+aj*lda;
+	REAL *pD = sD->pA+di+dj*ldd; // matrix of LQ
+ REAL *dD = sD->dA+di; // vectors of tau
+ REAL alpha, beta, tmp, w0, w1;
+ REAL *pC00, *pC10, *pC11, *pv0, *pv1;
+ REAL pW[4] = {0.0, 0.0, 0.0, 0.0};
+ int ldw = 2;
+ REAL pT[4] = {0.0, 0.0, 0.0, 0.0};
+ int ldb = 2;
+ int imax, jmax, kmax;
+ // copy if needed
+ if(pA!=pD)
+ {
+ for(jj=0; jj<n; jj++)
+ {
+ for(ii=0; ii<m; ii++)
+ {
+ pD[ii+ldd*jj] = pA[ii+lda*jj];
+ }
+ }
+ }
+ imax = m<n ? m : n;
+ ii = 0;
+#if 1
+ for(; ii<imax-1; ii+=2)
+ {
+		// first row
+ pC00 = &pD[ii+ldd*ii];
+ beta = 0.0;
+ for(jj=1; jj<n-ii; jj++)
+ {
+ tmp = pC00[0+ldd*jj];
+ beta += tmp*tmp;
+ }
+ if(beta==0.0)
+ {
+ // tau0
+ dD[ii] = 0.0;
+ }
+ else
+ {
+ alpha = pC00[0+ldd*0];
+ beta += alpha*alpha;
+ beta = sqrt(beta);
+ if(alpha>0)
+ beta = -beta;
+ // tau0
+ dD[ii] = (beta-alpha) / beta;
+ tmp = 1.0 / (alpha-beta);
+ // compute v0
+ pC00[0+ldd*0] = beta;
+ for(jj=1; jj<n-ii; jj++)
+ {
+ pC00[0+ldd*jj] *= tmp;
+ }
+ }
+ // gemv_t & ger
+ pC10 = &pC00[1+ldd*0];
+ pv0 = &pC00[0+ldd*0];
+ kmax = n-ii;
+ w0 = pC10[0+ldd*0]; // pv0[0] = 1.0
+ for(kk=1; kk<kmax; kk++)
+ {
+ w0 += pC10[0+ldd*kk] * pv0[0+ldd*kk];
+ }
+ w0 = - dD[ii] * w0;
+ pC10[0+ldd*0] += w0; // pv0[0] = 1.0
+ for(kk=1; kk<kmax; kk++)
+ {
+ pC10[0+ldd*kk] += w0 * pv0[0+ldd*kk];
+ }
+ // second row
+ pC11 = &pD[(ii+1)+ldd*(ii+1)];
+ beta = 0.0;
+ for(jj=1; jj<n-(ii+1); jj++)
+ {
+ tmp = pC11[0+ldd*jj];
+ beta += tmp*tmp;
+ }
+ if(beta==0.0)
+ {
+ // tau1
+ dD[(ii+1)] = 0.0;
+ }
+ else
+ {
+ alpha = pC11[0+ldd*0];
+ beta += alpha*alpha;
+ beta = sqrt(beta);
+ if(alpha>0)
+ beta = -beta;
+ // tau1
+ dD[(ii+1)] = (beta-alpha) / beta;
+ tmp = 1.0 / (alpha-beta);
+ // compute v1
+ pC11[0+ldd*0] = beta;
+ for(jj=1; jj<n-(ii+1); jj++)
+ pC11[0+ldd*jj] *= tmp;
+ }
+ // compute lower triangular T containing tau for matrix update
+ pv0 = &pC00[0+ldd*0];
+ pv1 = &pC00[1+ldd*0];
+ kmax = n-ii;
+ tmp = pv0[0+ldd*1];
+ for(kk=2; kk<kmax; kk++)
+ tmp += pv0[0+ldd*kk]*pv1[0+ldd*kk];
+ pT[0+ldb*0] = dD[ii+0];
+ pT[1+ldb*0] = - dD[ii+1] * tmp * dD[ii+0];
+ pT[1+ldb*1] = dD[ii+1];
+ // downgrade
+ jmax = m-ii-2;
+ jj = 0;
+ for(; jj<jmax-1; jj+=2)
+ {
+ // compute W^T = C^T * V
+ pW[0+ldw*0] = pC00[jj+0+2+ldd*0] + pC00[jj+0+2+ldd*1] * pv0[0+ldd*1];
+ pW[1+ldw*0] = pC00[jj+1+2+ldd*0] + pC00[jj+1+2+ldd*1] * pv0[0+ldd*1];
+ pW[0+ldw*1] = pC00[jj+0+2+ldd*1];
+ pW[1+ldw*1] = pC00[jj+1+2+ldd*1];
+ kk = 2;
+ for(; kk<kmax; kk++)
+ {
+ tmp = pC00[jj+0+2+ldd*kk];
+ pW[0+ldw*0] += tmp * pv0[0+ldd*kk];
+ pW[0+ldw*1] += tmp * pv1[0+ldd*kk];
+ tmp = pC00[jj+1+2+ldd*kk];
+ pW[1+ldw*0] += tmp * pv0[0+ldd*kk];
+ pW[1+ldw*1] += tmp * pv1[0+ldd*kk];
+ }
+ // compute W^T *= T
+ pW[0+ldw*1] = pT[1+ldb*0]*pW[0+ldw*0] + pT[1+ldb*1]*pW[0+ldw*1];
+ pW[1+ldw*1] = pT[1+ldb*0]*pW[1+ldw*0] + pT[1+ldb*1]*pW[1+ldw*1];
+ pW[0+ldw*0] = pT[0+ldb*0]*pW[0+ldw*0];
+ pW[1+ldw*0] = pT[0+ldb*0]*pW[1+ldw*0];
+ // compute C -= V * W^T
+ pC00[jj+0+2+ldd*0] -= pW[0+ldw*0];
+ pC00[jj+1+2+ldd*0] -= pW[1+ldw*0];
+ pC00[jj+0+2+ldd*1] -= pv0[0+ldd*1]*pW[0+ldw*0] + pW[0+ldw*1];
+ pC00[jj+1+2+ldd*1] -= pv0[0+ldd*1]*pW[1+ldw*0] + pW[1+ldw*1];
+ kk = 2;
+ for(; kk<kmax-1; kk+=2)
+ {
+ pC00[jj+0+2+ldd*(kk+0)] -= pv0[0+ldd*(kk+0)]*pW[0+ldw*0] + pv1[0+ldd*(kk+0)]*pW[0+ldw*1];
+ pC00[jj+0+2+ldd*(kk+1)] -= pv0[0+ldd*(kk+1)]*pW[0+ldw*0] + pv1[0+ldd*(kk+1)]*pW[0+ldw*1];
+ pC00[jj+1+2+ldd*(kk+0)] -= pv0[0+ldd*(kk+0)]*pW[1+ldw*0] + pv1[0+ldd*(kk+0)]*pW[1+ldw*1];
+ pC00[jj+1+2+ldd*(kk+1)] -= pv0[0+ldd*(kk+1)]*pW[1+ldw*0] + pv1[0+ldd*(kk+1)]*pW[1+ldw*1];
+ }
+ for(; kk<kmax; kk++)
+ {
+ pC00[jj+0+2+ldd*kk] -= pv0[0+ldd*kk]*pW[0+ldw*0] + pv1[0+ldd*kk]*pW[0+ldw*1];
+ pC00[jj+1+2+ldd*kk] -= pv0[0+ldd*kk]*pW[1+ldw*0] + pv1[0+ldd*kk]*pW[1+ldw*1];
+ }
+ }
+ for(; jj<jmax; jj++)
+ {
+ // compute W = T * V^T * C
+ pW[0+ldw*0] = pC00[jj+0+2+ldd*0] + pC00[jj+0+2+ldd*1] * pv0[0+ldd*1];
+ pW[0+ldw*1] = pC00[jj+0+2+ldd*1];
+ for(kk=2; kk<kmax; kk++)
+ {
+ tmp = pC00[jj+0+2+ldd*kk];
+ pW[0+ldw*0] += tmp * pv0[0+ldd*kk];
+ pW[0+ldw*1] += tmp * pv1[0+ldd*kk];
+ }
+ pW[0+ldw*1] = pT[1+ldb*0]*pW[0+ldw*0] + pT[1+ldb*1]*pW[0+ldw*1];
+ pW[0+ldw*0] = pT[0+ldb*0]*pW[0+ldw*0];
+ // compute C -= V * W^T
+ pC00[jj+0+2+ldd*0] -= pW[0+ldw*0];
+ pC00[jj+0+2+ldd*1] -= pv0[0+ldd*1]*pW[0+ldw*0] + pW[0+ldw*1];
+ for(kk=2; kk<kmax; kk++)
+ {
+ pC00[jj+0+2+ldd*kk] -= pv0[0+ldd*kk]*pW[0+ldw*0] + pv1[0+ldd*kk]*pW[0+ldw*1];
+ }
+ }
+ }
+#endif
+ for(; ii<imax; ii++)
+ {
+ pC00 = &pD[ii+ldd*ii];
+ beta = 0.0;
+ for(jj=1; jj<n-ii; jj++)
+ {
+ tmp = pC00[0+ldd*jj];
+ beta += tmp*tmp;
+ }
+ if(beta==0.0)
+ {
+ dD[ii] = 0.0;
+ }
+ else
+ {
+ alpha = pC00[0+ldd*0];
+ beta += alpha*alpha;
+ beta = sqrt(beta);
+ if(alpha>0)
+ beta = -beta;
+ dD[ii] = (beta-alpha) / beta;
+ tmp = 1.0 / (alpha-beta);
+ for(jj=1; jj<n-ii; jj++)
+ pC00[0+ldd*jj] *= tmp;
+ pC00[0+ldd*0] = beta;
+ }
+ if(ii<n)
+ {
+ // gemv_t & ger
+ pC10 = &pC00[1+ldd*0];
+ pv0 = &pC00[0+ldd*0];
+ jmax = m-ii-1;
+ kmax = n-ii;
+ jj = 0;
+ for(; jj<jmax-1; jj+=2)
+ {
+ w0 = pC10[jj+0+ldd*0]; // pv0[0] = 1.0
+ w1 = pC10[jj+1+ldd*0]; // pv0[0] = 1.0
+ for(kk=1; kk<kmax; kk++)
+ {
+ w0 += pC10[jj+0+ldd*kk] * pv0[0+ldd*kk];
+ w1 += pC10[jj+1+ldd*kk] * pv0[0+ldd*kk];
+ }
+ w0 = - dD[ii] * w0;
+ w1 = - dD[ii] * w1;
+ pC10[jj+0+ldd*0] += w0; // pv0[0] = 1.0
+ pC10[jj+1+ldd*0] += w1; // pv0[0] = 1.0
+ for(kk=1; kk<kmax; kk++)
+ {
+ pC10[jj+0+ldd*kk] += w0 * pv0[0+ldd*kk];
+ pC10[jj+1+ldd*kk] += w1 * pv0[0+ldd*kk];
+ }
+ }
+ for(; jj<jmax; jj++)
+ {
+ w0 = pC10[jj+ldd*0]; // pv0[0] = 1.0
+ for(kk=1; kk<kmax; kk++)
+ {
+ w0 += pC10[jj+ldd*kk] * pv0[0+ldd*kk];
+ }
+ w0 = - dD[ii] * w0;
+ pC10[jj+ldd*0] += w0; // pv0[0] = 1.0
+ for(kk=1; kk<kmax; kk++)
+ {
+ pC10[jj+ldd*kk] += w0 * pv0[0+ldd*kk];
+ }
+ }
+ }
+ }
+ return;
+ }
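+
+// (editor's note) GELQF is the row-wise mirror of GEQRF: the reflectors are built
+// from rows instead of columns, L is stored on and below the diagonal with the
+// Householder vectors to its right, and the same pairwise compact-WY update applies.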
+
+
+
+#elif defined(LA_BLAS)
+
+
+
+// dpotrf
+void POTRF_L_LIBSTR(int m, struct STRMAT *sC, int ci, int cj, struct STRMAT *sD, int di, int dj)
+ {
+ if(m<=0)
+ return;
+ int jj;
+ char cl = 'l';
+ char cn = 'n';
+ char cr = 'r';
+ char ct = 't';
+ REAL d1 = 1.0;
+ REAL *pC = sC->pA+ci+cj*sC->m;
+ REAL *pD = sD->pA+di+dj*sD->m;
+#if defined(REF_BLAS_BLIS)
+ long long i1 = 1;
+ long long mm = m;
+ long long info;
+ long long tmp;
+ long long ldc = sC->m;
+ long long ldd = sD->m;
+ if(!(pC==pD))
+ {
+ for(jj=0; jj<m; jj++)
+ {
+ tmp = m-jj;
+ COPY(&tmp, pC+jj*ldc+jj, &i1, pD+jj*ldd+jj, &i1);
+ }
+ }
+ POTRF(&cl, &mm, pD, &ldd, &info);
+#else
+ int i1 = 1;
+ int info;
+ int tmp;
+ int ldc = sC->m;
+ int ldd = sD->m;
+ if(!(pC==pD))
+ {
+ for(jj=0; jj<m; jj++)
+ {
+ tmp = m-jj;
+ COPY(&tmp, pC+jj*ldc+jj, &i1, pD+jj*ldd+jj, &i1);
+ }
+ }
+ POTRF(&cl, &m, pD, &ldd, &info);
+#endif
+ return;
+ }
+
+
+
+// dpotrf
+void POTRF_L_MN_LIBSTR(int m, int n, struct STRMAT *sC, int ci, int cj, struct STRMAT *sD, int di, int dj)
+ {
+ if(m<=0 | n<=0)
+ return;
+ int jj;
+ char cl = 'l';
+ char cn = 'n';
+ char cr = 'r';
+ char ct = 't';
+ REAL d1 = 1.0;
+ REAL *pC = sC->pA+ci+cj*sC->m;
+ REAL *pD = sD->pA+di+dj*sD->m;
+#if defined(REF_BLAS_BLIS)
+ long long i1 = 1;
+ long long mm = m;
+ long long nn = n;
+ long long mmn = mm-nn;
+ long long info;
+ long long tmp;
+ long long ldc = sC->m;
+ long long ldd = sD->m;
+ if(!(pC==pD))
+ {
+ for(jj=0; jj<n; jj++)
+ {
+ tmp = m-jj;
+ COPY(&tmp, pC+jj*ldc+jj, &i1, pD+jj*ldd+jj, &i1);
+ }
+ }
+ POTRF(&cl, &nn, pD, &ldd, &info);
+ TRSM(&cr, &cl, &ct, &cn, &mmn, &nn, &d1, pD, &ldd, pD+n, &ldd);
+#else
+ int i1 = 1;
+ int mmn = m-n;
+ int info;
+ int tmp;
+ int ldc = sC->m;
+ int ldd = sD->m;
+ if(!(pC==pD))
+ {
+ for(jj=0; jj<n; jj++)
+ {
+ tmp = m-jj;
+ COPY(&tmp, pC+jj*ldc+jj, &i1, pD+jj*ldd+jj, &i1);
+ }
+ }
+ POTRF(&cl, &n, pD, &ldd, &info);
+ TRSM(&cr, &cl, &ct, &cn, &mmn, &n, &d1, pD, &ldd, pD+n, &ldd);
+#endif
+ return;
+ }
+
+
+
+// dsyrk dpotrf
+void SYRK_POTRF_LN_LIBSTR(int m, int n, int k, struct STRMAT *sA, int ai, int aj, struct STRMAT *sB, int bi, int bj, struct STRMAT *sC, int ci, int cj, struct STRMAT *sD, int di, int dj)
+ {
+ if(m<=0 | n<=0)
+ return;
+ int jj;
+ char cl = 'l';
+ char cn = 'n';
+ char cr = 'r';
+ char ct = 't';
+ char cu = 'u';
+ REAL d1 = 1.0;
+ REAL *pA = sA->pA + ai + aj*sA->m;
+ REAL *pB = sB->pA + bi + bj*sB->m;
+ REAL *pC = sC->pA + ci + cj*sC->m;
+ REAL *pD = sD->pA + di + dj*sD->m;
+#if defined(REF_BLAS_BLIS)
+ long long i1 = 1;
+ long long mm = m;
+ long long nn = n;
+ long long kk = k;
+ long long mmn = mm-nn;
+ long long info;
+ long long lda = sA->m;
+ long long ldb = sB->m;
+ long long ldc = sC->m;
+ long long ldd = sD->m;
+ if(!(pC==pD))
+ {
+ for(jj=0; jj<n; jj++)
+ COPY(&mm, pC+jj*ldc, &i1, pD+jj*ldd, &i1);
+ }
+ if(pA==pB)
+ {
+ SYRK(&cl, &cn, &nn, &kk, &d1, pA, &lda, &d1, pD, &ldd);
+ GEMM(&cn, &ct, &mmn, &nn, &kk, &d1, pA+n, &lda, pB, &ldb, &d1, pD+n, &ldd);
+ POTRF(&cl, &nn, pD, &ldd, &info);
+ TRSM(&cr, &cl, &ct, &cn, &mmn, &nn, &d1, pD, &ldd, pD+n, &ldd);
+ }
+ else
+ {
+ GEMM(&cn, &ct, &mm, &nn, &kk, &d1, pA, &lda, pB, &ldb, &d1, pD, &ldd);
+ POTRF(&cl, &nn, pD, &ldd, &info);
+ TRSM(&cr, &cl, &ct, &cn, &mmn, &nn, &d1, pD, &ldd, pD+n, &ldd);
+ }
+#else
+ int i1 = 1;
+ int mmn = m-n;
+ int info;
+ int lda = sA->m;
+ int ldb = sB->m;
+ int ldc = sC->m;
+ int ldd = sD->m;
+ if(!(pC==pD))
+ {
+ for(jj=0; jj<n; jj++)
+ COPY(&m, pC+jj*ldc, &i1, pD+jj*ldd, &i1);
+ }
+ if(pA==pB)
+ {
+ SYRK(&cl, &cn, &n, &k, &d1, pA, &lda, &d1, pD, &ldd);
+ GEMM(&cn, &ct, &mmn, &n, &k, &d1, pA+n, &lda, pB, &ldb, &d1, pD+n, &ldd);
+ POTRF(&cl, &n, pD, &ldd, &info);
+ TRSM(&cr, &cl, &ct, &cn, &mmn, &n, &d1, pD, &ldd, pD+n, &ldd);
+ }
+ else
+ {
+ GEMM(&cn, &ct, &m, &n, &k, &d1, pA, &lda, pB, &ldb, &d1, pD, &ldd);
+ POTRF(&cl, &n, pD, &ldd, &info);
+ TRSM(&cr, &cl, &ct, &cn, &mmn, &n, &d1, pD, &ldd, pD+n, &ldd);
+ }
+#endif
+ return;
+ }
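+
+// (editor's sketch) in this branch the fused kernel reduces to plain BLAS/LAPACK:
+// form M = C + A*B^T (SYRK or GEMM), factorize its leading n x n block as L*L^T
+// with POTRF, then recover the trailing rows of the factor with one TRSM,
+//
+//	D(n:m, 0:n) = M(n:m, 0:n) * L^{-T}
+//
+// exactly the call sequence issued in each branch above.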
+
+
+
+// dgetrf without pivoting
+#if defined(REF_BLAS_BLIS)
+static void GETF2_NOPIVOT(long long m, long long n, REAL *A, long long lda)
+ {
+ if(m<=0 | n<=0)
+ return;
+ long long i, j;
+ long long jmax = m<n ? m : n;
+ REAL dtmp;
+ REAL dm1 = -1.0;
+ long long itmp0, itmp1;
+ long long i1 = 1;
+ for(j=0; j<jmax; j++)
+ {
+ itmp0 = m-j-1;
+ dtmp = 1.0/A[j+lda*j];
+ SCAL(&itmp0, &dtmp, &A[(j+1)+lda*j], &i1);
+ itmp1 = n-j-1;
+ GER(&itmp0, &itmp1, &dm1, &A[(j+1)+lda*j], &i1, &A[j+lda*(j+1)], &lda, &A[(j+1)+lda*(j+1)], &lda);
+ }
+ return;
+ }
+#else
+static void GETF2_NOPIVOT(int m, int n, REAL *A, int lda)
+ {
+ if(m<=0 | n<=0)
+ return;
+ int i, j;
+ int jmax = m<n ? m : n;
+ REAL dtmp;
+ REAL dm1 = -1.0;
+ int itmp0, itmp1;
+ int i1 = 1;
+ for(j=0; j<jmax; j++)
+ {
+ itmp0 = m-j-1;
+ dtmp = 1.0/A[j+lda*j];
+ SCAL(&itmp0, &dtmp, &A[(j+1)+lda*j], &i1);
+ itmp1 = n-j-1;
+ GER(&itmp0, &itmp1, &dm1, &A[(j+1)+lda*j], &i1, &A[j+lda*(j+1)], &lda, &A[(j+1)+lda*(j+1)], &lda);
+ }
+ return;
+ }
+#endif
+
+
+
+// dgetrf without pivoting
+void GETRF_NOPIVOT_LIBSTR(int m, int n, struct STRMAT *sC, int ci, int cj, struct STRMAT *sD, int di, int dj)
+ {
+ // TODO with custom level 2 LAPACK + level 3 BLAS
+ if(m<=0 | n<=0)
+ return;
+ int jj;
+ REAL d1 = 1.0;
+ REAL *pC = sC->pA+ci+cj*sC->m;
+ REAL *pD = sD->pA+di+dj*sD->m;
+#if defined(REF_BLAS_BLIS)
+ long long i1 = 1;
+ long long mm = m;
+ long long nn = n;
+ long long ldc = sC->m;
+ long long ldd = sD->m;
+ if(!(pC==pD))
+ {
+ for(jj=0; jj<n; jj++)
+ COPY(&mm, pC+jj*ldc, &i1, pD+jj*ldd, &i1);
+ }
+ GETF2_NOPIVOT(mm, nn, pD, ldd);
+#else
+ int i1 = 1;
+ int ldc = sC->m;
+ int ldd = sD->m;
+ if(!(pC==pD))
+ {
+ for(jj=0; jj<n; jj++)
+ COPY(&m, pC+jj*ldc, &i1, pD+jj*ldd, &i1);
+ }
+ GETF2_NOPIVOT(m, n, pD, ldd);
+#endif
+ return;
+ }
+
+
+
+// dgetrf with row pivoting
+void GETRF_LIBSTR(int m, int n, struct STRMAT *sC, int ci, int cj, struct STRMAT *sD, int di, int dj, int *ipiv)
+ {
+ // TODO with custom level 2 LAPACK + level 3 BLAS
+ if(m<=0 | n<=0)
+ return;
+ int jj;
+ int tmp = m<n ? m : n;
+ REAL d1 = 1.0;
+ REAL *pC = sC->pA+ci+cj*sC->m;
+ REAL *pD = sD->pA+di+dj*sD->m;
+#if defined(REF_BLAS_BLIS)
+ long long i1 = 1;
+ long long info;
+ long long mm = m;
+ long long nn = n;
+ long long ldc = sC->m;
+ long long ldd = sD->m;
+ if(!(pC==pD))
+ {
+ for(jj=0; jj<n; jj++)
+ COPY(&mm, pC+jj*ldc, &i1, pD+jj*ldd, &i1);
+ }
+	// with 64-bit BLAS/LAPACK integers GETRF writes min(m,n) long long entries,
+	// so the caller must size ipiv accordingly; compact the result forward into
+	// the int view while shifting to 0-based indexing
+	long long *ipiv_ll = (long long *) ipiv;
+	GETRF(&mm, &nn, pD, &ldd, ipiv_ll, &info);
+	for(jj=0; jj<tmp; jj++)
+		ipiv[jj] = ipiv_ll[jj] - 1;
+#else
+ int i1 = 1;
+ int info;
+ int ldc = sC->m;
+ int ldd = sD->m;
+ if(!(pC==pD))
+ {
+ for(jj=0; jj<n; jj++)
+ COPY(&m, pC+jj*ldc, &i1, pD+jj*ldd, &i1);
+ }
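+	// LAPACK returns 1-based pivot indices; shift them to the 0-based LIBSTR convention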
+ GETRF(&m, &n, pD, &ldd, ipiv, &info);
+ for(jj=0; jj<tmp; jj++)
+ ipiv[jj] -= 1;
+#endif
+ return;
+ }
+
+
+
+int GEQRF_WORK_SIZE_LIBSTR(int m, int n)
+ {
+	REAL dwork;
+	// lwork = -1 below is the standard LAPACK workspace query: the routine only
+	// writes the optimal size into dwork, so pD and dD are never dereferenced
+	REAL *pD = 0, *dD = 0;
+#if defined(REF_BLAS_BLIS)
+ long long mm = m;
+ long long nn = n;
+ long long lwork = -1;
+ long long info;
+ long long ldd = mm;
+ GEQRF(&mm, &nn, pD, &ldd, dD, &dwork, &lwork, &info);
+#else
+ int lwork = -1;
+ int info;
+ int ldd = m;
+ GEQRF(&m, &n, pD, &ldd, dD, &dwork, &lwork, &info);
+#endif
+ int size = dwork;
+ return size*sizeof(REAL);
+ }
+
+
+
+void GEQRF_LIBSTR(int m, int n, struct STRMAT *sC, int ci, int cj, struct STRMAT *sD, int di, int dj, void *work)
+ {
+ if(m<=0 | n<=0)
+ return;
+ int jj;
+ REAL *pC = sC->pA+ci+cj*sC->m;
+ REAL *pD = sD->pA+di+dj*sD->m;
+ REAL *dD = sD->dA+di;
+ REAL *dwork = (REAL *) work;
+#if defined(REF_BLAS_BLIS)
+ long long i1 = 1;
+ long long info = -1;
+ long long mm = m;
+ long long nn = n;
+ long long ldc = sC->m;
+ long long ldd = sD->m;
+ if(!(pC==pD))
+ {
+ for(jj=0; jj<n; jj++)
+ COPY(&mm, pC+jj*ldc, &i1, pD+jj*ldd, &i1);
+ }
+// GEQR2(&mm, &nn, pD, &ldd, dD, dwork, &info);
+ long long lwork = -1;
+ GEQRF(&mm, &nn, pD, &ldd, dD, dwork, &lwork, &info);
+ lwork = dwork[0];
+ GEQRF(&mm, &nn, pD, &ldd, dD, dwork, &lwork, &info);
+#else
+ int i1 = 1;
+ int info = -1;
+ int ldc = sC->m;
+ int ldd = sD->m;
+ if(!(pC==pD))
+ {
+ for(jj=0; jj<n; jj++)
+ COPY(&m, pC+jj*ldc, &i1, pD+jj*ldd, &i1);
+ }
+// GEQR2(&m, &n, pD, &ldd, dD, dwork, &info);
+ int lwork = -1;
+ GEQRF(&m, &n, pD, &ldd, dD, dwork, &lwork, &info);
+ lwork = dwork[0];
+ GEQRF(&m, &n, pD, &ldd, dD, dwork, &lwork, &info);
+#endif
+ return;
+ }
+
+
+
+int GELQF_WORK_SIZE_LIBSTR(int m, int n)
+ {
+	REAL dwork;
+	// lwork = -1 below is the standard LAPACK workspace query: the routine only
+	// writes the optimal size into dwork, so pD and dD are never dereferenced
+	REAL *pD = 0, *dD = 0;
+#if defined(REF_BLAS_BLIS)
+ long long mm = m;
+ long long nn = n;
+ long long lwork = -1;
+ long long info;
+ long long ldd = mm;
+ GELQF(&mm, &nn, pD, &ldd, dD, &dwork, &lwork, &info);
+#else
+ int lwork = -1;
+ int info;
+ int ldd = m;
+ GELQF(&m, &n, pD, &ldd, dD, &dwork, &lwork, &info);
+#endif
+ int size = dwork;
+ return size*sizeof(REAL);
+ }
+
+
+
+void GELQF_LIBSTR(int m, int n, struct STRMAT *sC, int ci, int cj, struct STRMAT *sD, int di, int dj, void *work)
+ {
+ if(m<=0 | n<=0)
+ return;
+ int jj;
+ REAL *pC = sC->pA+ci+cj*sC->m;
+ REAL *pD = sD->pA+di+dj*sD->m;
+ REAL *dD = sD->dA+di;
+ REAL *dwork = (REAL *) work;
+#if defined(REF_BLAS_BLIS)
+ long long i1 = 1;
+ long long info = -1;
+ long long mm = m;
+ long long nn = n;
+ long long ldc = sC->m;
+ long long ldd = sD->m;
+ if(!(pC==pD))
+ {
+ for(jj=0; jj<n; jj++)
+ COPY(&mm, pC+jj*ldc, &i1, pD+jj*ldd, &i1);
+ }
+// GEQR2(&mm, &nn, pD, &ldd, dD, dwork, &info);
+ long long lwork = -1;
+ GELQF(&mm, &nn, pD, &ldd, dD, dwork, &lwork, &info);
+ lwork = dwork[0];
+ GELQF(&mm, &nn, pD, &ldd, dD, dwork, &lwork, &info);
+#else
+ int i1 = 1;
+ int info = -1;
+ int ldc = sC->m;
+ int ldd = sD->m;
+ if(!(pC==pD))
+ {
+ for(jj=0; jj<n; jj++)
+ COPY(&m, pC+jj*ldc, &i1, pD+jj*ldd, &i1);
+ }
+// GEQR2(&m, &n, pD, &ldd, dD, dwork, &info);
+ int lwork = -1;
+ GELQF(&m, &n, pD, &ldd, dD, dwork, &lwork, &info);
+ lwork = dwork[0];
+ GELQF(&m, &n, pD, &ldd, dD, dwork, &lwork, &info);
+#endif
+ return;
+ }
+
+
+
+#else
+
+#error : wrong LA choice
+
+#endif
+
+
+
+