Squashed 'third_party/blasfeo/' content from commit 2a828ca
Change-Id: If1c3caa4799b2d4eb287ef83fa17043587ef07a3
git-subtree-dir: third_party/blasfeo
git-subtree-split: 2a828ca5442108c4c58e4b42b061a0469043f6ea
diff --git a/examples/Makefile b/examples/Makefile
new file mode 100644
index 0000000..7204cba
--- /dev/null
+++ b/examples/Makefile
@@ -0,0 +1,69 @@
+###################################################################################################
+# #
+# This file is part of BLASFEO. #
+# #
+# BLASFEO -- BLAS For Embedded Optimization. #
+# Copyright (C) 2016-2017 by Gianluca Frison. #
+# Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl. #
+# All rights reserved. #
+# #
+# HPMPC is free software; you can redistribute it and/or #
+# modify it under the terms of the GNU Lesser General Public #
+# License as published by the Free Software Foundation; either #
+# version 2.1 of the License, or (at your option) any later version. #
+# #
+# HPMPC is distributed in the hope that it will be useful, #
+# but WITHOUT ANY WARRANTY; without even the implied warranty of #
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. #
+# See the GNU Lesser General Public License for more details. #
+# #
+# You should have received a copy of the GNU Lesser General Public #
+# License along with HPMPC; if not, write to the Free Software #
+# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA #
+# #
+# Author: Gianluca Frison, giaf (at) dtu.dk #
+# gianluca.frison (at) imtek.uni-freiburg.de #
+# #
+###################################################################################################
+
+include ../Makefile.rule
+
+ifeq ($(REF_BLAS), 0)
+LIBS = -lm
+endif
+ifeq ($(REF_BLAS), OPENBLAS)
+LIBS = /opt/openblas/lib/libopenblas.a -pthread -lm
+endif
+ifeq ($(REF_BLAS), BLIS)
+LIBS = -lblis -lm -fopenmp
+endif
+ifeq ($(REF_BLAS), NETLIB)
+LIBS = /opt/netlib/liblapack.a /opt/netlib/libblas.a -lgfortran -lm
+endif
+ifeq ($(REF_BLAS), MKL)
+LIBS = -Wl,--start-group /opt/intel/mkl/lib/intel64/libmkl_gf_lp64.a /opt/intel/mkl/lib/intel64/libmkl_core.a /opt/intel/mkl/lib/intel64/libmkl_sequential.a -Wl,--end-group -ldl -lpthread -lm
+endif
+
+ifneq ($(NUM_THREAD), 1)
+LIBS += -pthread
+endif
+
+#OBJS_TEST = example_d_lu_factorization.o
+#OBJS_TEST = example_s_lu_factorization.o
+OBJS_TEST = tools.o example_d_riccati_recursion.o
+#OBJS_TEST = tools.o example_s_riccati_recursion.o
+
+all: clean obj run
+
+obj: $(OBJS_TEST)
+ cp ../libblasfeo.a .
+ $(CC) -o test.out $(OBJS_TEST) -L. libblasfeo.a $(LIBS) #-pg
+
+run:
+ ./test.out
+
+clean:
+ rm -f *.o
+ rm -f test.out
+ rm -f libblasfeo.a
+
diff --git a/examples/example_d_lu_factorization.c b/examples/example_d_lu_factorization.c
new file mode 100644
index 0000000..62b3413
--- /dev/null
+++ b/examples/example_d_lu_factorization.c
@@ -0,0 +1,210 @@
+/**************************************************************************************************
+* *
+* This file is part of BLASFEO. *
+* *
+* BLASFEO -- BLAS For Embedded Optimization. *
+* Copyright (C) 2016-2017 by Gianluca Frison. *
+* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl. *
+* All rights reserved. *
+* *
+* HPMPC is free software; you can redistribute it and/or *
+* modify it under the terms of the GNU Lesser General Public *
+* License as published by the Free Software Foundation; either *
+* version 2.1 of the License, or (at your option) any later version. *
+* *
+* HPMPC is distributed in the hope that it will be useful, *
+* but WITHOUT ANY WARRANTY; without even the implied warranty of *
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. *
+* See the GNU Lesser General Public License for more details. *
+* *
+* You should have received a copy of the GNU Lesser General Public *
+* License along with HPMPC; if not, write to the Free Software *
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA *
+* *
+* Author: Gianluca Frison, giaf (at) dtu.dk *
+* gianluca.frison (at) imtek.uni-freiburg.de *
+* *
+**************************************************************************************************/
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <sys/time.h>
+
+#include "../include/blasfeo_common.h"
+#include "../include/blasfeo_i_aux_ext_dep.h"
+#include "../include/blasfeo_v_aux_ext_dep.h"
+#include "../include/blasfeo_d_aux_ext_dep.h"
+#include "../include/blasfeo_d_aux.h"
+#include "../include/blasfeo_d_kernel.h"
+#include "../include/blasfeo_d_blas.h"
+
+
+int main()
+ {
+
+ printf("\nExample of LU factorization and backsolve\n\n");
+
+#if defined(LA_HIGH_PERFORMANCE)
+
+ printf("\nLA provided by BLASFEO\n\n");
+
+#elif defined(LA_REFERENCE)
+
+ printf("\nLA provided by REFERENCE\n\n");
+
+#elif defined(LA_BLAS)
+
+ printf("\nLA provided by BLAS\n\n");
+
+#else
+
+ printf("\nLA provided by ???\n\n");
+ exit(2);
+
+#endif
+
+ int ii;
+
+ int n = 16;
+
+ //
+ // matrices in column-major format
+ //
+
+ double *A; d_zeros(&A, n, n);
+ for(ii=0; ii<n*n; ii++) A[ii] = ii;
+// d_print_mat(n, n, A, n);
+
+ // spd matrix
+ double *B; d_zeros(&B, n, n);
+ for(ii=0; ii<n; ii++) B[ii*(n+1)] = 1.0;
+// d_print_mat(n, n, B, n);
+
+ // identity
+ double *I; d_zeros(&I, n, n);
+ for(ii=0; ii<n; ii++) I[ii*(n+1)] = 1.0;
+// d_print_mat(n, n, B, n);
+
+ // result matrix
+ double *D; d_zeros(&D, n, n);
+// d_print_mat(n, n, D, n);
+
+ // permutation indeces
+ int *ipiv; int_zeros(&ipiv, n, 1);
+
+ //
+ // matrices in matrix struct format
+ //
+
+ // work space enough for 5 matrix structs for size n times n
+ int size_strmat = 5*d_size_strmat(n, n);
+ void *memory_strmat; v_zeros_align(&memory_strmat, size_strmat);
+ char *ptr_memory_strmat = (char *) memory_strmat;
+
+ struct d_strmat sA;
+// d_allocate_strmat(n, n, &sA);
+ d_create_strmat(n, n, &sA, ptr_memory_strmat);
+ ptr_memory_strmat += sA.memory_size;
+ // convert from column major matrix to strmat
+ d_cvt_mat2strmat(n, n, A, n, &sA, 0, 0);
+ printf("\nA = \n");
+ d_print_strmat(n, n, &sA, 0, 0);
+
+ struct d_strmat sB;
+// d_allocate_strmat(n, n, &sB);
+ d_create_strmat(n, n, &sB, ptr_memory_strmat);
+ ptr_memory_strmat += sB.memory_size;
+ // convert from column major matrix to strmat
+ d_cvt_mat2strmat(n, n, B, n, &sB, 0, 0);
+ printf("\nB = \n");
+ d_print_strmat(n, n, &sB, 0, 0);
+
+ struct d_strmat sI;
+// d_allocate_strmat(n, n, &sI);
+ d_create_strmat(n, n, &sI, ptr_memory_strmat);
+ ptr_memory_strmat += sI.memory_size;
+ // convert from column major matrix to strmat
+
+ struct d_strmat sD;
+// d_allocate_strmat(n, n, &sD);
+ d_create_strmat(n, n, &sD, ptr_memory_strmat);
+ ptr_memory_strmat += sD.memory_size;
+
+ struct d_strmat sLU;
+// d_allocate_strmat(n, n, &sD);
+ d_create_strmat(n, n, &sLU, ptr_memory_strmat);
+ ptr_memory_strmat += sLU.memory_size;
+
+ dgemm_nt_libstr(n, n, n, 1.0, &sA, 0, 0, &sA, 0, 0, 1.0, &sB, 0, 0, &sD, 0, 0);
+ printf("\nB+A*A' = \n");
+ d_print_strmat(n, n, &sD, 0, 0);
+
+// dgetrf_nopivot_libstr(n, n, &sD, 0, 0, &sD, 0, 0);
+ dgetrf_libstr(n, n, &sD, 0, 0, &sLU, 0, 0, ipiv);
+ printf("\nLU = \n");
+ d_print_strmat(n, n, &sLU, 0, 0);
+ printf("\nipiv = \n");
+ int_print_mat(1, n, ipiv, 1);
+
+#if 0 // solve P L U X = P B
+ d_cvt_mat2strmat(n, n, I, n, &sI, 0, 0);
+ printf("\nI = \n");
+ d_print_strmat(n, n, &sI, 0, 0);
+
+ drowpe_libstr(n, ipiv, &sI);
+ printf("\nperm(I) = \n");
+ d_print_strmat(n, n, &sI, 0, 0);
+
+ dtrsm_llnu_libstr(n, n, 1.0, &sLU, 0, 0, &sI, 0, 0, &sD, 0, 0);
+ printf("\nperm(inv(L)) = \n");
+ d_print_strmat(n, n, &sD, 0, 0);
+ dtrsm_lunn_libstr(n, n, 1.0, &sLU, 0, 0, &sD, 0, 0, &sD, 0, 0);
+ printf("\ninv(A) = \n");
+ d_print_strmat(n, n, &sD, 0, 0);
+
+ // convert from strmat to column major matrix
+ d_cvt_strmat2mat(n, n, &sD, 0, 0, D, n);
+#else // solve X^T (P L U)^T = B^T P^T
+ d_cvt_tran_mat2strmat(n, n, I, n, &sI, 0, 0);
+ printf("\nI' = \n");
+ d_print_strmat(n, n, &sI, 0, 0);
+
+ dcolpe_libstr(n, ipiv, &sB);
+ printf("\nperm(I') = \n");
+ d_print_strmat(n, n, &sB, 0, 0);
+
+ dtrsm_rltu_libstr(n, n, 1.0, &sLU, 0, 0, &sB, 0, 0, &sD, 0, 0);
+ printf("\nperm(inv(L')) = \n");
+ d_print_strmat(n, n, &sD, 0, 0);
+ dtrsm_rutn_libstr(n, n, 1.0, &sLU, 0, 0, &sD, 0, 0, &sD, 0, 0);
+ printf("\ninv(A') = \n");
+ d_print_strmat(n, n, &sD, 0, 0);
+
+ // convert from strmat to column major matrix
+ d_cvt_tran_strmat2mat(n, n, &sD, 0, 0, D, n);
+#endif
+
+ // print matrix in column-major format
+ printf("\ninv(A) = \n");
+ d_print_mat(n, n, D, n);
+
+
+
+ //
+ // free memory
+ //
+
+ d_free(A);
+ d_free(B);
+ d_free(D);
+ d_free(I);
+ int_free(ipiv);
+// d_free_strmat(&sA);
+// d_free_strmat(&sB);
+// d_free_strmat(&sD);
+// d_free_strmat(&sI);
+ v_free_align(memory_strmat);
+
+ return 0;
+
+ }
diff --git a/examples/example_d_riccati_recursion.c b/examples/example_d_riccati_recursion.c
new file mode 100644
index 0000000..1618ce9
--- /dev/null
+++ b/examples/example_d_riccati_recursion.c
@@ -0,0 +1,595 @@
+/**************************************************************************************************
+* *
+* This file is part of BLASFEO. *
+* *
+* BLASFEO -- BLAS For Embedded Optimization. *
+* Copyright (C) 2016-2017 by Gianluca Frison. *
+* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl. *
+* All rights reserved. *
+* *
+* HPMPC is free software; you can redistribute it and/or *
+* modify it under the terms of the GNU Lesser General Public *
+* License as published by the Free Software Foundation; either *
+* version 2.1 of the License, or (at your option) any later version. *
+* *
+* HPMPC is distributed in the hope that it will be useful, *
+* but WITHOUT ANY WARRANTY; without even the implied warranty of *
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. *
+* See the GNU Lesser General Public License for more details. *
+* *
+* You should have received a copy of the GNU Lesser General Public *
+* License along with HPMPC; if not, write to the Free Software *
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA *
+* *
+* Author: Gianluca Frison, giaf (at) dtu.dk *
+* gianluca.frison (at) imtek.uni-freiburg.de *
+* *
+**************************************************************************************************/
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <sys/time.h>
+
+#include "tools.h"
+
+#include "../include/blasfeo_common.h"
+#include "../include/blasfeo_i_aux_ext_dep.h"
+#include "../include/blasfeo_d_aux_ext_dep.h"
+#include "../include/blasfeo_d_aux.h"
+#include "../include/blasfeo_d_kernel.h"
+#include "../include/blasfeo_d_blas.h"
+
+
+
+static void d_back_ric_sv_libstr(int N, int *nx, int *nu, struct d_strmat *hsBAbt, struct d_strmat *hsRSQrq, struct d_strmat *hsL, struct d_strvec *hsux, struct d_strvec *hspi, struct d_strmat *hswork_mat, struct d_strvec *hswork_vec)
+ {
+
+ int nn;
+
+ // factorization and backward substitution
+
+ // last stage
+ dpotrf_l_libstr(nx[N]+1, nx[N], &hsRSQrq[N], 0, 0, &hsL[N], 0, 0);
+
+ // middle stages
+ for(nn=0; nn<N; nn++)
+ {
+ dtrmm_rlnn_libstr(nu[N-nn-1]+nx[N-nn-1]+1, nx[N-nn], 1.0, &hsL[N-nn], nu[N-nn], nu[N-nn], &hsBAbt[N-nn-1], 0, 0, &hswork_mat[0], 0, 0);
+ dgead_libstr(1, nx[N-nn], 1.0, &hsL[N-nn], nu[N-nn]+nx[N-nn], nu[N-nn], &hswork_mat[0], nu[N-nn-1]+nx[N-nn-1], 0);
+#if 1
+ dsyrk_dpotrf_ln_libstr(nu[N-nn-1]+nx[N-nn-1]+1, nu[N-nn-1]+nx[N-nn-1], nx[N-nn], &hswork_mat[0], 0, 0, &hswork_mat[0], 0, 0, &hsRSQrq[N-nn-1], 0, 0, &hsL[N-nn-1], 0, 0);
+#else
+ dsyrk_ln_libstr(nu[N-nn-1]+nx[N-nn-1]+1, nu[N-nn-1]+nx[N-nn-1], nx[N-nn], 1.0, &hswork_mat[0], 0, 0, &hswork_mat[0], 0, 0, 1.0, &hsRSQrq[N-nn-1], 0, 0, &hsL[N-nn-1], 0, 0);
+ dpotrf_l_libstr(nu[N-nn-1]+nx[N-nn-1]+1, nu[N-nn-1]+nx[N-nn-1], &hsL[N-nn-1], 0, 0, &hsL[N-nn-1], 0, 0);
+#endif
+ }
+
+ // forward substitution
+
+ // first stage
+ nn = 0;
+ drowex_libstr(nu[nn]+nx[nn], -1.0, &hsL[nn], nu[nn]+nx[nn], 0, &hsux[nn], 0);
+ dtrsv_ltn_libstr(nu[nn]+nx[nn], nu[nn]+nx[nn], &hsL[nn], 0, 0, &hsux[nn], 0, &hsux[nn], 0);
+ drowex_libstr(nx[nn+1], 1.0, &hsBAbt[nn], nu[nn]+nx[nn], 0, &hsux[nn+1], nu[nn+1]);
+ dgemv_t_libstr(nu[nn]+nx[nn], nx[nn+1], 1.0, &hsBAbt[nn], 0, 0, &hsux[nn], 0, 1.0, &hsux[nn+1], nu[nn+1], &hsux[nn+1], nu[nn+1]);
+ dveccp_libstr(nx[nn+1], 1.0, &hsux[nn+1], nu[nn+1], &hspi[nn], 0);
+ drowex_libstr(nx[nn+1], 1.0, &hsL[nn+1], nu[nn+1]+nx[nn+1], nu[nn+1], &hswork_vec[0], 0);
+ dtrmv_ltn_libstr(nx[nn+1], nx[nn+1], &hsL[nn+1], nu[nn+1], nu[nn+1], &hspi[nn], 0, &hspi[nn], 0);
+ daxpy_libstr(nx[nn+1], 1.0, &hswork_vec[0], 0, &hspi[nn], 0);
+ dtrmv_lnn_libstr(nx[nn+1], nx[nn+1], &hsL[nn+1], nu[nn+1], nu[nn+1], &hspi[nn], 0, &hspi[nn], 0);
+
+ // middle stages
+ for(nn=1; nn<N; nn++)
+ {
+ drowex_libstr(nu[nn], -1.0, &hsL[nn], nu[nn]+nx[nn], 0, &hsux[nn], 0);
+ dtrsv_ltn_libstr(nu[nn]+nx[nn], nu[nn], &hsL[nn], 0, 0, &hsux[nn], 0, &hsux[nn], 0);
+ drowex_libstr(nx[nn+1], 1.0, &hsBAbt[nn], nu[nn]+nx[nn], 0, &hsux[nn+1], nu[nn+1]);
+ dgemv_t_libstr(nu[nn]+nx[nn], nx[nn+1], 1.0, &hsBAbt[nn], 0, 0, &hsux[nn], 0, 1.0, &hsux[nn+1], nu[nn+1], &hsux[nn+1], nu[nn+1]);
+ dveccp_libstr(nx[nn+1], 1.0, &hsux[nn+1], nu[nn+1], &hspi[nn], 0);
+ drowex_libstr(nx[nn+1], 1.0, &hsL[nn+1], nu[nn+1]+nx[nn+1], nu[nn+1], &hswork_vec[0], 0);
+ dtrmv_ltn_libstr(nx[nn+1], nx[nn+1], &hsL[nn+1], nu[nn+1], nu[nn+1], &hspi[nn], 0, &hspi[nn], 0);
+ daxpy_libstr(nx[nn+1], 1.0, &hswork_vec[0], 0, &hspi[nn], 0);
+ dtrmv_lnn_libstr(nx[nn+1], nx[nn+1], &hsL[nn+1], nu[nn+1], nu[nn+1], &hspi[nn], 0, &hspi[nn], 0);
+ }
+
+ return;
+
+ }
+
+
+
+static void d_back_ric_trf_libstr(int N, int *nx, int *nu, struct d_strmat *hsBAbt, struct d_strmat *hsRSQrq, struct d_strmat *hsL, struct d_strmat *hswork_mat)
+ {
+
+ int nn;
+
+ // factorization
+
+ // last stage
+ dpotrf_l_libstr(nx[N], nx[N], &hsRSQrq[N], 0, 0, &hsL[N], 0, 0);
+
+ // middle stages
+ for(nn=0; nn<N; nn++)
+ {
+ dtrmm_rlnn_libstr(nu[N-nn-1]+nx[N-nn-1], nx[N-nn], 1.0, &hsL[N-nn], nu[N-nn], nu[N-nn], &hsBAbt[N-nn-1], 0, 0, &hswork_mat[0], 0, 0);
+#if 1
+ dsyrk_dpotrf_ln_libstr(nu[N-nn-1]+nx[N-nn-1], nu[N-nn-1]+nx[N-nn-1], nx[N-nn], &hswork_mat[0], 0, 0, &hswork_mat[0], 0, 0, &hsRSQrq[N-nn-1], 0, 0, &hsL[N-nn-1], 0, 0);
+#else
+ dsyrk_ln_libstr(nu[N-nn-1]+nx[N-nn-1], nu[N-nn-1]+nx[N-nn-1], nx[N-nn], 1.0, &hswork_mat[0], 0, 0, &hswork_mat[0], 0, 0, 1.0, &hsRSQrq[N-nn-1], 0, 0, &hsL[N-nn-1], 0, 0);
+ dpotrf_l_libstr(nu[N-nn-1]+nx[N-nn-1], nu[N-nn-1]+nx[N-nn-1], &hsL[N-nn-1], 0, 0, &hsL[N-nn-1], 0, 0);
+#endif
+ }
+
+ return;
+
+ }
+
+
+
+static void d_back_ric_trs_libstr(int N, int *nx, int *nu, struct d_strmat *hsBAbt, struct d_strvec *hsb, struct d_strvec *hsrq, struct d_strmat *hsL, struct d_strvec *hsPb, struct d_strvec *hsux, struct d_strvec *hspi, struct d_strvec *hswork_vec)
+ {
+
+ int nn;
+
+ // backward substitution
+
+ // last stage
+ dveccp_libstr(nu[N]+nx[N], 1.0, &hsrq[N], 0, &hsux[N], 0);
+
+ // middle stages
+ for(nn=0; nn<N-1; nn++)
+ {
+ // compute Pb
+ dtrmv_ltn_libstr(nx[N-nn], nx[N-nn], &hsL[N-nn], nu[N-nn], nu[N-nn], &hsb[N-nn-1], 0, &hsPb[N-nn-1], 0);
+ dtrmv_lnn_libstr(nx[N-nn], nx[N-nn], &hsL[N-nn], nu[N-nn], nu[N-nn], &hsPb[N-nn-1], 0, &hsPb[N-nn-1], 0);
+ dveccp_libstr(nu[N-nn-1]+nx[N-nn-1], 1.0, &hsrq[N-nn-1], 0, &hsux[N-nn-1], 0);
+ dveccp_libstr(nx[N-nn], 1.0, &hsPb[N-nn-1], 0, &hswork_vec[0], 0);
+ daxpy_libstr(nx[N-nn], 1.0, &hsux[N-nn], nu[N-nn], &hswork_vec[0], 0);
+ dgemv_n_libstr(nu[N-nn-1]+nx[N-nn-1], nx[N-nn], 1.0, &hsBAbt[N-nn-1], 0, 0, &hswork_vec[0], 0, 1.0, &hsux[N-nn-1], 0, &hsux[N-nn-1], 0);
+ dtrsv_lnn_libstr(nu[N-nn-1]+nx[N-nn-1], nu[N-nn-1], &hsL[N-nn-1], 0, 0, &hsux[N-nn-1], 0, &hsux[N-nn-1], 0);
+ }
+
+ // first stage
+ nn = N-1;
+ dtrmv_ltn_libstr(nx[N-nn], nx[N-nn], &hsL[N-nn], nu[N-nn], nu[N-nn], &hsb[N-nn-1], 0, &hsPb[N-nn-1], 0);
+ dtrmv_lnn_libstr(nx[N-nn], nx[N-nn], &hsL[N-nn], nu[N-nn], nu[N-nn], &hsPb[N-nn-1], 0, &hsPb[N-nn-1], 0);
+ dveccp_libstr(nu[N-nn-1]+nx[N-nn-1], 1.0, &hsrq[N-nn-1], 0, &hsux[N-nn-1], 0);
+ dveccp_libstr(nx[N-nn], 1.0, &hsPb[N-nn-1], 0, &hswork_vec[0], 0);
+ daxpy_libstr(nx[N-nn], 1.0, &hsux[N-nn], nu[N-nn], &hswork_vec[0], 0);
+ dgemv_n_libstr(nu[N-nn-1]+nx[N-nn-1], nx[N-nn], 1.0, &hsBAbt[N-nn-1], 0, 0, &hswork_vec[0], 0, 1.0, &hsux[N-nn-1], 0, &hsux[N-nn-1], 0);
+ dtrsv_lnn_libstr(nu[N-nn-1]+nx[N-nn-1], nu[N-nn-1]+nx[N-nn-1], &hsL[N-nn-1], 0, 0, &hsux[N-nn-1], 0, &hsux[N-nn-1], 0);
+
+ // forward substitution
+
+ // first stage
+ nn = 0;
+ dveccp_libstr(nx[nn+1], 1.0, &hsux[nn+1], nu[nn+1], &hspi[nn], 0);
+ dveccp_libstr(nu[nn]+nx[nn], -1.0, &hsux[nn], 0, &hsux[nn], 0);
+ dtrsv_ltn_libstr(nu[nn]+nx[nn], nu[nn]+nx[nn], &hsL[nn], 0, 0, &hsux[nn], 0, &hsux[nn], 0);
+ dgemv_t_libstr(nu[nn]+nx[nn], nx[nn+1], 1.0, &hsBAbt[nn], 0, 0, &hsux[nn], 0, 1.0, &hsb[nn], 0, &hsux[nn+1], nu[nn+1]);
+ dveccp_libstr(nx[nn+1], 1.0, &hsux[nn+1], nu[nn+1], &hswork_vec[0], 0);
+ dtrmv_ltn_libstr(nx[nn+1], nx[nn+1], &hsL[nn+1], nu[nn+1], nu[nn+1], &hswork_vec[0], 0, &hswork_vec[0], 0);
+ dtrmv_lnn_libstr(nx[nn+1], nx[nn+1], &hsL[nn+1], nu[nn+1], nu[nn+1], &hswork_vec[0], 0, &hswork_vec[0], 0);
+ daxpy_libstr(nx[nn+1], 1.0, &hswork_vec[0], 0, &hspi[nn], 0);
+
+ // middle stages
+ for(nn=1; nn<N; nn++)
+ {
+ dveccp_libstr(nx[nn+1], 1.0, &hsux[nn+1], nu[nn+1], &hspi[nn], 0);
+ dveccp_libstr(nu[nn], -1.0, &hsux[nn], 0, &hsux[nn], 0);
+ dtrsv_ltn_libstr(nu[nn]+nx[nn], nu[nn], &hsL[nn], 0, 0, &hsux[nn], 0, &hsux[nn], 0);
+ dgemv_t_libstr(nu[nn]+nx[nn], nx[nn+1], 1.0, &hsBAbt[nn], 0, 0, &hsux[nn], 0, 1.0, &hsb[nn], 0, &hsux[nn+1], nu[nn+1]);
+ dveccp_libstr(nx[nn+1], 1.0, &hsux[nn+1], nu[nn+1], &hswork_vec[0], 0);
+ dtrmv_ltn_libstr(nx[nn+1], nx[nn+1], &hsL[nn+1], nu[nn+1], nu[nn+1], &hswork_vec[0], 0, &hswork_vec[0], 0);
+ dtrmv_lnn_libstr(nx[nn+1], nx[nn+1], &hsL[nn+1], nu[nn+1], nu[nn+1], &hswork_vec[0], 0, &hswork_vec[0], 0);
+ daxpy_libstr(nx[nn+1], 1.0, &hswork_vec[0], 0, &hspi[nn], 0);
+ }
+
+ return;
+
+ }
+
+
+
+/************************************************
+Mass-spring system: nx/2 masses connected each other with springs (in a row), and the first and the last one to walls. nu (<=nx) controls act on the first nu masses. The system is sampled with sampling time Ts.
+************************************************/
+static void d_mass_spring_system(double Ts, int nx, int nu, int N, double *A, double *B, double *b, double *x0)
+ {
+
+ int nx2 = nx*nx;
+
+ int info = 0;
+
+ int pp = nx/2; // number of masses
+
+/************************************************
+* build the continuous time system
+************************************************/
+
+ double *T; d_zeros(&T, pp, pp);
+ int ii;
+ for(ii=0; ii<pp; ii++) T[ii*(pp+1)] = -2;
+ for(ii=0; ii<pp-1; ii++) T[ii*(pp+1)+1] = 1;
+ for(ii=1; ii<pp; ii++) T[ii*(pp+1)-1] = 1;
+
+ double *Z; d_zeros(&Z, pp, pp);
+ double *I; d_zeros(&I, pp, pp); for(ii=0; ii<pp; ii++) I[ii*(pp+1)]=1.0; // = eye(pp);
+ double *Ac; d_zeros(&Ac, nx, nx);
+ dmcopy(pp, pp, Z, pp, Ac, nx);
+ dmcopy(pp, pp, T, pp, Ac+pp, nx);
+ dmcopy(pp, pp, I, pp, Ac+pp*nx, nx);
+ dmcopy(pp, pp, Z, pp, Ac+pp*(nx+1), nx);
+ free(T);
+ free(Z);
+ free(I);
+
+ d_zeros(&I, nu, nu); for(ii=0; ii<nu; ii++) I[ii*(nu+1)]=1.0; //I = eye(nu);
+ double *Bc; d_zeros(&Bc, nx, nu);
+ dmcopy(nu, nu, I, nu, Bc+pp, nx);
+ free(I);
+
+/************************************************
+* compute the discrete time system
+************************************************/
+
+ double *bb; d_zeros(&bb, nx, 1);
+ dmcopy(nx, 1, bb, nx, b, nx);
+
+ dmcopy(nx, nx, Ac, nx, A, nx);
+ dscal_3l(nx2, Ts, A);
+ expm(nx, A);
+
+ d_zeros(&T, nx, nx);
+ d_zeros(&I, nx, nx); for(ii=0; ii<nx; ii++) I[ii*(nx+1)]=1.0; //I = eye(nx);
+ dmcopy(nx, nx, A, nx, T, nx);
+ daxpy_3l(nx2, -1.0, I, T);
+ dgemm_nn_3l(nx, nu, nx, T, nx, Bc, nx, B, nx);
+ free(T);
+ free(I);
+
+ int *ipiv = (int *) malloc(nx*sizeof(int));
+ dgesv_3l(nx, nu, Ac, nx, ipiv, B, nx, &info);
+ free(ipiv);
+
+ free(Ac);
+ free(Bc);
+ free(bb);
+
+
+/************************************************
+* initial state
+************************************************/
+
+ if(nx==4)
+ {
+ x0[0] = 5;
+ x0[1] = 10;
+ x0[2] = 15;
+ x0[3] = 20;
+ }
+ else
+ {
+ int jj;
+ for(jj=0; jj<nx; jj++)
+ x0[jj] = 1;
+ }
+
+ }
+
+
+
+int main()
+ {
+
+ printf("\nExample of LU factorization and backsolve\n\n");
+
+#if defined(LA_HIGH_PERFORMANCE)
+
+ printf("\nLA provided by BLASFEO\n\n");
+
+#elif defined(LA_BLAS)
+
+ printf("\nLA provided by BLAS\n\n");
+
+#elif defined(LA_REFERENCE)
+
+ printf("\nLA provided by REFERENCE\n\n");
+
+#else
+
+ printf("\nLA provided by ???\n\n");
+ exit(2);
+
+#endif
+
+ // loop index
+ int ii;
+
+/************************************************
+* problem size
+************************************************/
+
+ // problem size
+ int N = 4;
+ int nx_ = 4;
+ int nu_ = 1;
+
+ // stage-wise variant size
+ int nx[N+1];
+ nx[0] = 0;
+ for(ii=1; ii<=N; ii++)
+ nx[ii] = nx_;
+ nx[N] = nx_;
+
+ int nu[N+1];
+ for(ii=0; ii<N; ii++)
+ nu[ii] = nu_;
+ nu[N] = 0;
+
+/************************************************
+* dynamical system
+************************************************/
+
+ double *A; d_zeros(&A, nx_, nx_); // states update matrix
+
+ double *B; d_zeros(&B, nx_, nu_); // inputs matrix
+
+ double *b; d_zeros(&b, nx_, 1); // states offset
+ double *x0; d_zeros(&x0, nx_, 1); // initial state
+
+ double Ts = 0.5; // sampling time
+ d_mass_spring_system(Ts, nx_, nu_, N, A, B, b, x0);
+
+ for(ii=0; ii<nx_; ii++)
+ b[ii] = 0.1;
+
+ for(ii=0; ii<nx_; ii++)
+ x0[ii] = 0;
+ x0[0] = 2.5;
+ x0[1] = 2.5;
+
+ d_print_mat(nx_, nx_, A, nx_);
+ d_print_mat(nx_, nu_, B, nx_);
+ d_print_mat(1, nx_, b, 1);
+ d_print_mat(1, nx_, x0, 1);
+
+/************************************************
+* cost function
+************************************************/
+
+ double *R; d_zeros(&R, nu_, nu_);
+ for(ii=0; ii<nu_; ii++) R[ii*(nu_+1)] = 2.0;
+
+ double *S; d_zeros(&S, nu_, nx_);
+
+ double *Q; d_zeros(&Q, nx_, nx_);
+ for(ii=0; ii<nx_; ii++) Q[ii*(nx_+1)] = 1.0;
+
+ double *r; d_zeros(&r, nu_, 1);
+ for(ii=0; ii<nu_; ii++) r[ii] = 0.2;
+
+ double *q; d_zeros(&q, nx_, 1);
+ for(ii=0; ii<nx_; ii++) q[ii] = 0.1;
+
+ d_print_mat(nu_, nu_, R, nu_);
+ d_print_mat(nu_, nx_, S, nu_);
+ d_print_mat(nx_, nx_, Q, nx_);
+ d_print_mat(1, nu_, r, 1);
+ d_print_mat(1, nx_, q, 1);
+
+/************************************************
+* matrices as strmat
+************************************************/
+
+ struct d_strmat sA;
+ d_allocate_strmat(nx_, nx_, &sA);
+ d_cvt_mat2strmat(nx_, nx_, A, nx_, &sA, 0, 0);
+ struct d_strvec sb;
+ d_allocate_strvec(nx_, &sb);
+ d_cvt_vec2strvec(nx_, b, &sb, 0);
+ struct d_strvec sx0;
+ d_allocate_strvec(nx_, &sx0);
+ d_cvt_vec2strvec(nx_, x0, &sx0, 0);
+ struct d_strvec sb0;
+ d_allocate_strvec(nx_, &sb0);
+ double *b0; d_zeros(&b0, nx_, 1); // states offset
+ dgemv_n_libstr(nx_, nx_, 1.0, &sA, 0, 0, &sx0, 0, 1.0, &sb, 0, &sb0, 0);
+ d_print_tran_strvec(nx_, &sb0, 0);
+
+ struct d_strmat sBbt0;
+ d_allocate_strmat(nu_+nx_+1, nx_, &sBbt0);
+ d_cvt_tran_mat2strmat(nx_, nx_, B, nx_, &sBbt0, 0, 0);
+ drowin_libstr(nx_, 1.0, &sb0, 0, &sBbt0, nu_, 0);
+ d_print_strmat(nu_+1, nx_, &sBbt0, 0, 0);
+
+ struct d_strmat sBAbt1;
+ d_allocate_strmat(nu_+nx_+1, nx_, &sBAbt1);
+ d_cvt_tran_mat2strmat(nx_, nu_, B, nx_, &sBAbt1, 0, 0);
+ d_cvt_tran_mat2strmat(nx_, nx_, A, nx_, &sBAbt1, nu_, 0);
+ d_cvt_tran_mat2strmat(nx_, 1, b, nx_, &sBAbt1, nu_+nx_, 0);
+ d_print_strmat(nu_+nx_+1, nx_, &sBAbt1, 0, 0);
+
+ struct d_strvec sr0; // XXX no need to update r0 since S=0
+ d_allocate_strvec(nu_, &sr0);
+ d_cvt_vec2strvec(nu_, r, &sr0, 0);
+
+ struct d_strmat sRr0;
+ d_allocate_strmat(nu_+1, nu_, &sRr0);
+ d_cvt_mat2strmat(nu_, nu_, R, nu_, &sRr0, 0, 0);
+ drowin_libstr(nu_, 1.0, &sr0, 0, &sRr0, nu_, 0);
+ d_print_strmat(nu_+1, nu_, &sRr0, 0, 0);
+
+ struct d_strvec srq1;
+ d_allocate_strvec(nu_+nx_, &srq1);
+ d_cvt_vec2strvec(nu_, r, &srq1, 0);
+ d_cvt_vec2strvec(nx_, q, &srq1, nu_);
+
+ struct d_strmat sRSQrq1;
+ d_allocate_strmat(nu_+nx_+1, nu_+nx_, &sRSQrq1);
+ d_cvt_mat2strmat(nu_, nu_, R, nu_, &sRSQrq1, 0, 0);
+ d_cvt_tran_mat2strmat(nu_, nx_, S, nu_, &sRSQrq1, nu_, 0);
+ d_cvt_mat2strmat(nx_, nx_, Q, nx_, &sRSQrq1, nu_, nu_);
+ drowin_libstr(nu_+nx_, 1.0, &srq1, 0, &sRSQrq1, nu_+nx_, 0);
+ d_print_strmat(nu_+nx_+1, nu_+nx_, &sRSQrq1, 0, 0);
+
+ struct d_strvec sqN;
+ d_allocate_strvec(nx_, &sqN);
+ d_cvt_vec2strvec(nx_, q, &sqN, 0);
+
+ struct d_strmat sQqN;
+ d_allocate_strmat(nx_+1, nx_, &sQqN);
+ d_cvt_mat2strmat(nx_, nx_, Q, nx_, &sQqN, 0, 0);
+ drowin_libstr(nx_, 1.0, &sqN, 0, &sQqN, nx_, 0);
+ d_print_strmat(nx_+1, nx_, &sQqN, 0, 0);
+
+/************************************************
+* array of matrices
+************************************************/
+
+ struct d_strmat hsBAbt[N];
+ struct d_strvec hsb[N];
+ struct d_strmat hsRSQrq[N+1];
+ struct d_strvec hsrq[N+1];
+ struct d_strmat hsL[N+1];
+ struct d_strvec hsPb[N];
+ struct d_strvec hsux[N+1];
+ struct d_strvec hspi[N];
+ struct d_strmat hswork_mat[1];
+ struct d_strvec hswork_vec[1];
+
+ hsBAbt[0] = sBbt0;
+ hsb[0] = sb0;
+ hsRSQrq[0] = sRr0;
+ hsrq[0] = sr0;
+ d_allocate_strmat(nu_+1, nu_, &hsL[0]);
+ d_allocate_strvec(nx_, &hsPb[0]);
+ d_allocate_strvec(nx_+nu_+1, &hsux[0]);
+ d_allocate_strvec(nx_, &hspi[0]);
+ for(ii=1; ii<N; ii++)
+ {
+ hsBAbt[ii] = sBAbt1;
+ hsb[ii] = sb;
+ hsRSQrq[ii] = sRSQrq1;
+ hsrq[ii] = srq1;
+ d_allocate_strmat(nu_+nx_+1, nu_+nx_, &hsL[ii]);
+ d_allocate_strvec(nx_, &hsPb[ii]);
+ d_allocate_strvec(nx_+nu_+1, &hsux[ii]);
+ d_allocate_strvec(nx_, &hspi[ii]);
+ }
+ hsRSQrq[N] = sQqN;
+ hsrq[N] = sqN;
+ d_allocate_strmat(nx_+1, nx_, &hsL[N]);
+ d_allocate_strvec(nx_+nu_+1, &hsux[N]);
+ d_allocate_strmat(nu_+nx_+1, nx_, &hswork_mat[0]);
+ d_allocate_strvec(nx_, &hswork_vec[0]);
+
+// for(ii=0; ii<N; ii++)
+// d_print_strmat(nu[ii]+nx[ii]+1, nx[ii+1], &hsBAbt[ii], 0, 0);
+// return 0;
+
+/************************************************
+* call Riccati solver
+************************************************/
+
+ // timing
+ struct timeval tv0, tv1, tv2, tv3;
+ int nrep = 1000;
+ int rep;
+
+ gettimeofday(&tv0, NULL); // time
+
+ for(rep=0; rep<nrep; rep++)
+ {
+ d_back_ric_sv_libstr(N, nx, nu, hsBAbt, hsRSQrq, hsL, hsux, hspi, hswork_mat, hswork_vec);
+ }
+
+ gettimeofday(&tv1, NULL); // time
+
+ for(rep=0; rep<nrep; rep++)
+ {
+ d_back_ric_trf_libstr(N, nx, nu, hsBAbt, hsRSQrq, hsL, hswork_mat);
+ }
+
+ gettimeofday(&tv2, NULL); // time
+
+ for(rep=0; rep<nrep; rep++)
+ {
+ d_back_ric_trs_libstr(N, nx, nu, hsBAbt, hsb, hsrq, hsL, hsPb, hsux, hspi, hswork_vec);
+ }
+
+ gettimeofday(&tv3, NULL); // time
+
+ float time_sv = (float) (tv1.tv_sec-tv0.tv_sec)/(nrep+0.0)+(tv1.tv_usec-tv0.tv_usec)/(nrep*1e6);
+ float time_trf = (float) (tv2.tv_sec-tv1.tv_sec)/(nrep+0.0)+(tv2.tv_usec-tv1.tv_usec)/(nrep*1e6);
+ float time_trs = (float) (tv3.tv_sec-tv2.tv_sec)/(nrep+0.0)+(tv3.tv_usec-tv2.tv_usec)/(nrep*1e6);
+
+ // print sol
+ printf("\nux = \n\n");
+ for(ii=0; ii<=N; ii++)
+ d_print_tran_strvec(nu[ii]+nx[ii], &hsux[ii], 0);
+
+ printf("\npi = \n\n");
+ for(ii=0; ii<N; ii++)
+ d_print_tran_strvec(nx[ii+1], &hspi[ii], 0);
+
+// printf("\nL = \n\n");
+// for(ii=0; ii<=N; ii++)
+// d_print_strmat(nu[ii]+nx[ii]+1, nu[ii]+nx[ii], &hsL[ii], 0, 0);
+
+ printf("\ntime sv\t\ttime trf\t\ttime trs\n");
+ printf("\n%e\t%e\t%e\n", time_sv, time_trf, time_trs);
+ printf("\n");
+
+/************************************************
+* free memory
+************************************************/
+
+ d_free(A);
+ d_free(B);
+ d_free(b);
+ d_free(x0);
+ d_free(R);
+ d_free(S);
+ d_free(Q);
+ d_free(r);
+ d_free(q);
+ d_free(b0);
+ d_free_strmat(&sA);
+ d_free_strvec(&sb);
+ d_free_strmat(&sBbt0);
+ d_free_strvec(&sb0);
+ d_free_strmat(&sBAbt1);
+ d_free_strmat(&sRr0);
+ d_free_strvec(&sr0);
+ d_free_strmat(&sRSQrq1);
+ d_free_strvec(&srq1);
+ d_free_strmat(&sQqN);
+ d_free_strvec(&sqN);
+ d_free_strmat(&hsL[0]);
+ d_free_strvec(&hsPb[0]);
+ d_free_strvec(&hsux[0]);
+ d_free_strvec(&hspi[0]);
+ for(ii=1; ii<N; ii++)
+ {
+ d_free_strmat(&hsL[ii]);
+ d_free_strvec(&hsPb[ii]);
+ d_free_strvec(&hsux[ii]);
+ d_free_strvec(&hspi[ii]);
+ }
+ d_free_strmat(&hsL[N]);
+ d_free_strvec(&hsux[N]);
+ d_free_strmat(&hswork_mat[0]);
+ d_free_strvec(&hswork_vec[0]);
+
+
+/************************************************
+* return
+************************************************/
+
+ return 0;
+
+ }
+
+
+
diff --git a/examples/example_s_lu_factorization.c b/examples/example_s_lu_factorization.c
new file mode 100644
index 0000000..e298604
--- /dev/null
+++ b/examples/example_s_lu_factorization.c
@@ -0,0 +1,211 @@
+/**************************************************************************************************
+* *
+* This file is part of BLASFEO. *
+* *
+* BLASFEO -- BLAS For Embedded Optimization. *
+* Copyright (C) 2016-2017 by Gianluca Frison. *
+* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl. *
+* All rights reserved. *
+* *
+* HPMPC is free software; you can redistribute it and/or *
+* modify it under the terms of the GNU Lesser General Public *
+* License as published by the Free Software Foundation; either *
+* version 2.1 of the License, or (at your option) any later version. *
+* *
+* HPMPC is distributed in the hope that it will be useful, *
+* but WITHOUT ANY WARRANTY; without even the implied warranty of *
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. *
+* See the GNU Lesser General Public License for more details. *
+* *
+* You should have received a copy of the GNU Lesser General Public *
+* License along with HPMPC; if not, write to the Free Software *
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA *
+* *
+* Author: Gianluca Frison, giaf (at) dtu.dk *
+* gianluca.frison (at) imtek.uni-freiburg.de *
+* *
+**************************************************************************************************/
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <sys/time.h>
+
+#include "../include/blasfeo_common.h"
+#include "../include/blasfeo_i_aux_ext_dep.h"
+#include "../include/blasfeo_v_aux_ext_dep.h"
+#include "../include/blasfeo_s_aux_ext_dep.h"
+#include "../include/blasfeo_s_aux.h"
+#include "../include/blasfeo_s_kernel.h"
+#include "../include/blasfeo_s_blas.h"
+
+
+int main()
+ {
+
+ printf("\nExample of LU factorization and backsolve\n\n");
+
+#if defined(LA_HIGH_PERFORMANCE)
+
+ printf("\nLA provided by BLASFEO\n\n");
+
+#elif defined(LA_REFERENCE)
+
+ printf("\nLA provided by REFERENCE\n\n");
+
+#elif defined(LA_BLAS)
+
+ printf("\nLA provided by BLAS\n\n");
+
+#else
+
+ printf("\nLA provided by ???\n\n");
+ exit(2);
+
+#endif
+
+ int ii;
+
+ int n = 16;
+
+ //
+ // matrices in column-major format
+ //
+
+ float *A; s_zeros(&A, n, n);
+ for(ii=0; ii<n*n; ii++) A[ii] = ii;
+// s_print_mat(n, n, A, n);
+
+ // spd matrix
+ float *B; s_zeros(&B, n, n);
+ for(ii=0; ii<n; ii++) B[ii*(n+1)] = 1.0;
+// s_print_mat(n, n, B, n);
+
+ // identity
+ float *I; s_zeros(&I, n, n);
+ for(ii=0; ii<n; ii++) I[ii*(n+1)] = 1.0;
+// s_print_mat(n, n, B, n);
+
+ // result matrix
+ float *D; s_zeros(&D, n, n);
+// s_print_mat(n, n, D, n);
+
+ // permutation indeces
+ int *ipiv; int_zeros(&ipiv, n, 1);
+
+ //
+ // matrices in matrix struct format
+ //
+
+ // work space enough for 5 matrix structs for size n times n
+ int size_strmat = 5*s_size_strmat(n, n);
+ void *memory_strmat; v_zeros_align(&memory_strmat, size_strmat);
+ char *ptr_memory_strmat = (char *) memory_strmat;
+
+ struct s_strmat sA;
+// s_allocate_strmat(n, n, &sA);
+ s_create_strmat(n, n, &sA, ptr_memory_strmat);
+ ptr_memory_strmat += sA.memory_size;
+ // convert from column major matrix to strmat
+ s_cvt_mat2strmat(n, n, A, n, &sA, 0, 0);
+ printf("\nA = \n");
+ s_print_strmat(n, n, &sA, 0, 0);
+
+ struct s_strmat sB;
+// s_allocate_strmat(n, n, &sB);
+ s_create_strmat(n, n, &sB, ptr_memory_strmat);
+ ptr_memory_strmat += sB.memory_size;
+ // convert from column major matrix to strmat
+ s_cvt_mat2strmat(n, n, B, n, &sB, 0, 0);
+ printf("\nB = \n");
+ s_print_strmat(n, n, &sB, 0, 0);
+
+ struct s_strmat sI;
+// s_allocate_strmat(n, n, &sI);
+ s_create_strmat(n, n, &sI, ptr_memory_strmat);
+ ptr_memory_strmat += sI.memory_size;
+ // convert from column major matrix to strmat
+
+ struct s_strmat sD;
+// s_allocate_strmat(n, n, &sD);
+ s_create_strmat(n, n, &sD, ptr_memory_strmat);
+ ptr_memory_strmat += sD.memory_size;
+
+ struct s_strmat sLU;
+// s_allocate_strmat(n, n, &sD);
+ s_create_strmat(n, n, &sLU, ptr_memory_strmat);
+ ptr_memory_strmat += sLU.memory_size;
+
+ sgemm_nt_libstr(n, n, n, 1.0, &sA, 0, 0, &sA, 0, 0, 1.0, &sB, 0, 0, &sD, 0, 0);
+ printf("\nB+A*A' = \n");
+ s_print_strmat(n, n, &sD, 0, 0);
+
+// sgetrf_nopivot_libstr(n, n, &sD, 0, 0, &sD, 0, 0);
+ sgetrf_libstr(n, n, &sD, 0, 0, &sLU, 0, 0, ipiv);
+ printf("\nLU = \n");
+ s_print_strmat(n, n, &sLU, 0, 0);
+ printf("\nipiv = \n");
+ int_print_mat(1, n, ipiv, 1);
+
+#if 0 // solve P L U X = P B
+ s_cvt_mat2strmat(n, n, I, n, &sI, 0, 0);
+ printf("\nI = \n");
+ s_print_strmat(n, n, &sI, 0, 0);
+
+ srowpe_libstr(n, ipiv, &sI);
+ printf("\nperm(I) = \n");
+ s_print_strmat(n, n, &sI, 0, 0);
+
+ strsm_llnu_libstr(n, n, 1.0, &sLU, 0, 0, &sI, 0, 0, &sD, 0, 0);
+ printf("\nperm(inv(L)) = \n");
+ s_print_strmat(n, n, &sD, 0, 0);
+ strsm_lunn_libstr(n, n, 1.0, &sLU, 0, 0, &sD, 0, 0, &sD, 0, 0);
+ printf("\ninv(A) = \n");
+ s_print_strmat(n, n, &sD, 0, 0);
+
+ // convert from strmat to column major matrix
+ s_cvt_strmat2mat(n, n, &sD, 0, 0, D, n);
+#else // solve X^T (P L U)^T = B^T P^T
+ s_cvt_tran_mat2strmat(n, n, I, n, &sI, 0, 0);
+ printf("\nI' = \n");
+ s_print_strmat(n, n, &sI, 0, 0);
+
+ scolpe_libstr(n, ipiv, &sB);
+ printf("\nperm(I') = \n");
+ s_print_strmat(n, n, &sB, 0, 0);
+
+ strsm_rltu_libstr(n, n, 1.0, &sLU, 0, 0, &sB, 0, 0, &sD, 0, 0);
+ printf("\nperm(inv(L')) = \n");
+ s_print_strmat(n, n, &sD, 0, 0);
+ strsm_rutn_libstr(n, n, 1.0, &sLU, 0, 0, &sD, 0, 0, &sD, 0, 0);
+ printf("\ninv(A') = \n");
+ s_print_strmat(n, n, &sD, 0, 0);
+
+ // convert from strmat to column major matrix
+ s_cvt_tran_strmat2mat(n, n, &sD, 0, 0, D, n);
+#endif
+
+ // print matrix in column-major format
+ printf("\ninv(A) = \n");
+ s_print_mat(n, n, D, n);
+
+
+
+ //
+ // free memory
+ //
+
+ s_free(A);
+ s_free(B);
+ s_free(D);
+ s_free(I);
+ int_free(ipiv);
+// s_free_strmat(&sA);
+// s_free_strmat(&sB);
+// s_free_strmat(&sD);
+// s_free_strmat(&sI);
+ v_free_align(memory_strmat);
+
+ return 0;
+
+ }
+
diff --git a/examples/example_s_riccati_recursion.c b/examples/example_s_riccati_recursion.c
new file mode 100644
index 0000000..03b9fc6
--- /dev/null
+++ b/examples/example_s_riccati_recursion.c
@@ -0,0 +1,605 @@
+/**************************************************************************************************
+* *
+* This file is part of BLASFEO. *
+* *
+* BLASFEO -- BLAS For Embedded Optimization. *
+* Copyright (C) 2016-2017 by Gianluca Frison. *
+* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl. *
+* All rights reserved. *
+* *
+* HPMPC is free software; you can redistribute it and/or *
+* modify it under the terms of the GNU Lesser General Public *
+* License as published by the Free Software Foundation; either *
+* version 2.1 of the License, or (at your option) any later version. *
+* *
+* HPMPC is distributed in the hope that it will be useful, *
+* but WITHOUT ANY WARRANTY; without even the implied warranty of *
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. *
+* See the GNU Lesser General Public License for more details. *
+* *
+* You should have received a copy of the GNU Lesser General Public *
+* License along with HPMPC; if not, write to the Free Software *
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA *
+* *
+* Author: Gianluca Frison, giaf (at) dtu.dk *
+* gianluca.frison (at) imtek.uni-freiburg.de *
+* *
+**************************************************************************************************/
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <sys/time.h>
+
+#include "tools.h"
+
+#include "../include/blasfeo_common.h"
+#include "../include/blasfeo_i_aux_ext_dep.h"
+#include "../include/blasfeo_s_aux_ext_dep.h"
+#include "../include/blasfeo_s_aux.h"
+#include "../include/blasfeo_s_kernel.h"
+#include "../include/blasfeo_s_blas.h"
+
+
+
+static void s_back_ric_sv_libstr(int N, int *nx, int *nu, struct s_strmat *hsBAbt, struct s_strmat *hsRSQrq, struct s_strmat *hsL, struct s_strvec *hsux, struct s_strvec *hspi, struct s_strmat *hswork_mat, struct s_strvec *hswork_vec)
+ {
+
+ int nn;
+
+ // factorization and backward substitution
+
+ // last stage
+ spotrf_l_libstr(nx[N]+1, nx[N], &hsRSQrq[N], 0, 0, &hsL[N], 0, 0);
+
+ // middle stages
+ for(nn=0; nn<N; nn++)
+ {
+ strmm_rlnn_libstr(nu[N-nn-1]+nx[N-nn-1]+1, nx[N-nn], 1.0, &hsL[N-nn], nu[N-nn], nu[N-nn], &hsBAbt[N-nn-1], 0, 0, &hswork_mat[0], 0, 0);
+ sgead_libstr(1, nx[N-nn], 1.0, &hsL[N-nn], nu[N-nn]+nx[N-nn], nu[N-nn], &hswork_mat[0], nu[N-nn-1]+nx[N-nn-1], 0);
+#if 1
+ ssyrk_spotrf_ln_libstr(nu[N-nn-1]+nx[N-nn-1]+1, nu[N-nn-1]+nx[N-nn-1], nx[N-nn], &hswork_mat[0], 0, 0, &hswork_mat[0], 0, 0, &hsRSQrq[N-nn-1], 0, 0, &hsL[N-nn-1], 0, 0);
+#else
+ ssyrk_ln_libstr(nu[N-nn-1]+nx[N-nn-1]+1, nu[N-nn-1]+nx[N-nn-1], nx[N-nn], 1.0, &hswork_mat[0], 0, 0, &hswork_mat[0], 0, 0, 1.0, &hsRSQrq[N-nn-1], 0, 0, &hsL[N-nn-1], 0, 0);
+ spotrf_l_libstr(nu[N-nn-1]+nx[N-nn-1]+1, nu[N-nn-1]+nx[N-nn-1], &hsL[N-nn-1], 0, 0, &hsL[N-nn-1], 0, 0);
+#endif
+ }
+
+ // forward substitution
+
+ // first stage
+ nn = 0;
+ srowex_libstr(nu[nn]+nx[nn], -1.0, &hsL[nn], nu[nn]+nx[nn], 0, &hsux[nn], 0);
+ strsv_ltn_libstr(nu[nn]+nx[nn], nu[nn]+nx[nn], &hsL[nn], 0, 0, &hsux[nn], 0, &hsux[nn], 0);
+ srowex_libstr(nx[nn+1], 1.0, &hsBAbt[nn], nu[nn]+nx[nn], 0, &hsux[nn+1], nu[nn+1]);
+ sgemv_t_libstr(nu[nn]+nx[nn], nx[nn+1], 1.0, &hsBAbt[nn], 0, 0, &hsux[nn], 0, 1.0, &hsux[nn+1], nu[nn+1], &hsux[nn+1], nu[nn+1]);
+ sveccp_libstr(nx[nn+1], 1.0, &hsux[nn+1], nu[nn+1], &hspi[nn], 0);
+ srowex_libstr(nx[nn+1], 1.0, &hsL[nn+1], nu[nn+1]+nx[nn+1], nu[nn+1], &hswork_vec[0], 0);
+ strmv_ltn_libstr(nx[nn+1], nx[nn+1], &hsL[nn+1], nu[nn+1], nu[nn+1], &hspi[nn], 0, &hspi[nn], 0);
+ saxpy_libstr(nx[nn+1], 1.0, &hswork_vec[0], 0, &hspi[nn], 0);
+ strmv_lnn_libstr(nx[nn+1], nx[nn+1], &hsL[nn+1], nu[nn+1], nu[nn+1], &hspi[nn], 0, &hspi[nn], 0);
+
+ // middle stages
+ for(nn=1; nn<N; nn++)
+ {
+ srowex_libstr(nu[nn], -1.0, &hsL[nn], nu[nn]+nx[nn], 0, &hsux[nn], 0);
+ strsv_ltn_libstr(nu[nn]+nx[nn], nu[nn], &hsL[nn], 0, 0, &hsux[nn], 0, &hsux[nn], 0);
+ srowex_libstr(nx[nn+1], 1.0, &hsBAbt[nn], nu[nn]+nx[nn], 0, &hsux[nn+1], nu[nn+1]);
+ sgemv_t_libstr(nu[nn]+nx[nn], nx[nn+1], 1.0, &hsBAbt[nn], 0, 0, &hsux[nn], 0, 1.0, &hsux[nn+1], nu[nn+1], &hsux[nn+1], nu[nn+1]);
+ sveccp_libstr(nx[nn+1], 1.0, &hsux[nn+1], nu[nn+1], &hspi[nn], 0);
+ srowex_libstr(nx[nn+1], 1.0, &hsL[nn+1], nu[nn+1]+nx[nn+1], nu[nn+1], &hswork_vec[0], 0);
+ strmv_ltn_libstr(nx[nn+1], nx[nn+1], &hsL[nn+1], nu[nn+1], nu[nn+1], &hspi[nn], 0, &hspi[nn], 0);
+ saxpy_libstr(nx[nn+1], 1.0, &hswork_vec[0], 0, &hspi[nn], 0);
+ strmv_lnn_libstr(nx[nn+1], nx[nn+1], &hsL[nn+1], nu[nn+1], nu[nn+1], &hspi[nn], 0, &hspi[nn], 0);
+ }
+
+ return;
+
+ }
+
+
+
+static void s_back_ric_trf_libstr(int N, int *nx, int *nu, struct s_strmat *hsBAbt, struct s_strmat *hsRSQrq, struct s_strmat *hsL, struct s_strmat *hswork_mat)
+ {
+
+ int nn;
+
+ // factorization
+
+ // last stage
+ spotrf_l_libstr(nx[N], nx[N], &hsRSQrq[N], 0, 0, &hsL[N], 0, 0);
+
+ // middle stages
+ for(nn=0; nn<N; nn++)
+ {
+ strmm_rlnn_libstr(nu[N-nn-1]+nx[N-nn-1], nx[N-nn], 1.0, &hsL[N-nn], nu[N-nn], nu[N-nn], &hsBAbt[N-nn-1], 0, 0, &hswork_mat[0], 0, 0);
+#if 1
+ ssyrk_spotrf_ln_libstr(nu[N-nn-1]+nx[N-nn-1], nu[N-nn-1]+nx[N-nn-1], nx[N-nn], &hswork_mat[0], 0, 0, &hswork_mat[0], 0, 0, &hsRSQrq[N-nn-1], 0, 0, &hsL[N-nn-1], 0, 0);
+#else
+ ssyrk_ln_libstr(nu[N-nn-1]+nx[N-nn-1], nu[N-nn-1]+nx[N-nn-1], nx[N-nn], 1.0, &hswork_mat[0], 0, 0, &hswork_mat[0], 0, 0, 1.0, &hsRSQrq[N-nn-1], 0, 0, &hsL[N-nn-1], 0, 0);
+ spotrf_l_libstr(nu[N-nn-1]+nx[N-nn-1], nu[N-nn-1]+nx[N-nn-1], &hsL[N-nn-1], 0, 0, &hsL[N-nn-1], 0, 0);
+#endif
+ }
+
+ return;
+
+ }
+
+
+
+static void s_back_ric_trs_libstr(int N, int *nx, int *nu, struct s_strmat *hsBAbt, struct s_strvec *hsb, struct s_strvec *hsrq, struct s_strmat *hsL, struct s_strvec *hsPb, struct s_strvec *hsux, struct s_strvec *hspi, struct s_strvec *hswork_vec)
+ {
+
+ int nn;
+
+ // backward substitution
+
+ // last stage
+ sveccp_libstr(nu[N]+nx[N], 1.0, &hsrq[N], 0, &hsux[N], 0);
+
+ // middle stages
+ for(nn=0; nn<N-1; nn++)
+ {
+ // compute Pb
+ strmv_ltn_libstr(nx[N-nn], nx[N-nn], &hsL[N-nn], nu[N-nn], nu[N-nn], &hsb[N-nn-1], 0, &hsPb[N-nn-1], 0);
+ strmv_lnn_libstr(nx[N-nn], nx[N-nn], &hsL[N-nn], nu[N-nn], nu[N-nn], &hsPb[N-nn-1], 0, &hsPb[N-nn-1], 0);
+ sveccp_libstr(nu[N-nn-1]+nx[N-nn-1], 1.0, &hsrq[N-nn-1], 0, &hsux[N-nn-1], 0);
+ sveccp_libstr(nx[N-nn], 1.0, &hsPb[N-nn-1], 0, &hswork_vec[0], 0);
+ saxpy_libstr(nx[N-nn], 1.0, &hsux[N-nn], nu[N-nn], &hswork_vec[0], 0);
+ sgemv_n_libstr(nu[N-nn-1]+nx[N-nn-1], nx[N-nn], 1.0, &hsBAbt[N-nn-1], 0, 0, &hswork_vec[0], 0, 1.0, &hsux[N-nn-1], 0, &hsux[N-nn-1], 0);
+ strsv_lnn_libstr(nu[N-nn-1]+nx[N-nn-1], nu[N-nn-1], &hsL[N-nn-1], 0, 0, &hsux[N-nn-1], 0, &hsux[N-nn-1], 0);
+ }
+
+ // first stage
+ nn = N-1;
+ strmv_ltn_libstr(nx[N-nn], nx[N-nn], &hsL[N-nn], nu[N-nn], nu[N-nn], &hsb[N-nn-1], 0, &hsPb[N-nn-1], 0);
+ strmv_lnn_libstr(nx[N-nn], nx[N-nn], &hsL[N-nn], nu[N-nn], nu[N-nn], &hsPb[N-nn-1], 0, &hsPb[N-nn-1], 0);
+ sveccp_libstr(nu[N-nn-1]+nx[N-nn-1], 1.0, &hsrq[N-nn-1], 0, &hsux[N-nn-1], 0);
+ sveccp_libstr(nx[N-nn], 1.0, &hsPb[N-nn-1], 0, &hswork_vec[0], 0);
+ saxpy_libstr(nx[N-nn], 1.0, &hsux[N-nn], nu[N-nn], &hswork_vec[0], 0);
+ sgemv_n_libstr(nu[N-nn-1]+nx[N-nn-1], nx[N-nn], 1.0, &hsBAbt[N-nn-1], 0, 0, &hswork_vec[0], 0, 1.0, &hsux[N-nn-1], 0, &hsux[N-nn-1], 0);
+ strsv_lnn_libstr(nu[N-nn-1]+nx[N-nn-1], nu[N-nn-1]+nx[N-nn-1], &hsL[N-nn-1], 0, 0, &hsux[N-nn-1], 0, &hsux[N-nn-1], 0);
+
+ // forward substitution
+
+ // first stage
+ nn = 0;
+ sveccp_libstr(nx[nn+1], 1.0, &hsux[nn+1], nu[nn+1], &hspi[nn], 0);
+ sveccp_libstr(nu[nn]+nx[nn], -1.0, &hsux[nn], 0, &hsux[nn], 0);
+ strsv_ltn_libstr(nu[nn]+nx[nn], nu[nn]+nx[nn], &hsL[nn], 0, 0, &hsux[nn], 0, &hsux[nn], 0);
+ sgemv_t_libstr(nu[nn]+nx[nn], nx[nn+1], 1.0, &hsBAbt[nn], 0, 0, &hsux[nn], 0, 1.0, &hsb[nn], 0, &hsux[nn+1], nu[nn+1]);
+ sveccp_libstr(nx[nn+1], 1.0, &hsux[nn+1], nu[nn+1], &hswork_vec[0], 0);
+ strmv_ltn_libstr(nx[nn+1], nx[nn+1], &hsL[nn+1], nu[nn+1], nu[nn+1], &hswork_vec[0], 0, &hswork_vec[0], 0);
+ strmv_lnn_libstr(nx[nn+1], nx[nn+1], &hsL[nn+1], nu[nn+1], nu[nn+1], &hswork_vec[0], 0, &hswork_vec[0], 0);
+ saxpy_libstr(nx[nn+1], 1.0, &hswork_vec[0], 0, &hspi[nn], 0);
+
+ // middle stages
+ for(nn=1; nn<N; nn++)
+ {
+ sveccp_libstr(nx[nn+1], 1.0, &hsux[nn+1], nu[nn+1], &hspi[nn], 0);
+ sveccp_libstr(nu[nn], -1.0, &hsux[nn], 0, &hsux[nn], 0);
+ strsv_ltn_libstr(nu[nn]+nx[nn], nu[nn], &hsL[nn], 0, 0, &hsux[nn], 0, &hsux[nn], 0);
+ sgemv_t_libstr(nu[nn]+nx[nn], nx[nn+1], 1.0, &hsBAbt[nn], 0, 0, &hsux[nn], 0, 1.0, &hsb[nn], 0, &hsux[nn+1], nu[nn+1]);
+ sveccp_libstr(nx[nn+1], 1.0, &hsux[nn+1], nu[nn+1], &hswork_vec[0], 0);
+ strmv_ltn_libstr(nx[nn+1], nx[nn+1], &hsL[nn+1], nu[nn+1], nu[nn+1], &hswork_vec[0], 0, &hswork_vec[0], 0);
+ strmv_lnn_libstr(nx[nn+1], nx[nn+1], &hsL[nn+1], nu[nn+1], nu[nn+1], &hswork_vec[0], 0, &hswork_vec[0], 0);
+ saxpy_libstr(nx[nn+1], 1.0, &hswork_vec[0], 0, &hspi[nn], 0);
+ }
+
+ return;
+
+ }
+
+
+
+/************************************************
+Mass-spring system: nx/2 masses connected each other with springs (in a row), and the first and the last one to walls. nu (<=nx) controls act on the first nu masses. The system is sampled with sampling time Ts.
+************************************************/
+static void d_mass_spring_system(double Ts, int nx, int nu, int N, double *A, double *B, double *b, double *x0)
+ {
+
+ int nx2 = nx*nx;
+
+ int info = 0;
+
+ int pp = nx/2; // number of masses
+
+/************************************************
+* build the continuous time system
+************************************************/
+
+ double *T; d_zeros(&T, pp, pp);
+ int ii;
+ for(ii=0; ii<pp; ii++) T[ii*(pp+1)] = -2;
+ for(ii=0; ii<pp-1; ii++) T[ii*(pp+1)+1] = 1;
+ for(ii=1; ii<pp; ii++) T[ii*(pp+1)-1] = 1;
+
+ double *Z; d_zeros(&Z, pp, pp);
+ double *I; d_zeros(&I, pp, pp); for(ii=0; ii<pp; ii++) I[ii*(pp+1)]=1.0; // = eye(pp);
+ double *Ac; d_zeros(&Ac, nx, nx);
+ dmcopy(pp, pp, Z, pp, Ac, nx);
+ dmcopy(pp, pp, T, pp, Ac+pp, nx);
+ dmcopy(pp, pp, I, pp, Ac+pp*nx, nx);
+ dmcopy(pp, pp, Z, pp, Ac+pp*(nx+1), nx);
+ free(T);
+ free(Z);
+ free(I);
+
+ d_zeros(&I, nu, nu); for(ii=0; ii<nu; ii++) I[ii*(nu+1)]=1.0; //I = eye(nu);
+ double *Bc; d_zeros(&Bc, nx, nu);
+ dmcopy(nu, nu, I, nu, Bc+pp, nx);
+ free(I);
+
+/************************************************
+* compute the discrete time system
+************************************************/
+
+ double *bb; d_zeros(&bb, nx, 1);
+ dmcopy(nx, 1, bb, nx, b, nx);
+
+ dmcopy(nx, nx, Ac, nx, A, nx);
+ dscal_3l(nx2, Ts, A);
+ expm(nx, A);
+
+ d_zeros(&T, nx, nx);
+ d_zeros(&I, nx, nx); for(ii=0; ii<nx; ii++) I[ii*(nx+1)]=1.0; //I = eye(nx);
+ dmcopy(nx, nx, A, nx, T, nx);
+ daxpy_3l(nx2, -1.0, I, T);
+ dgemm_nn_3l(nx, nu, nx, T, nx, Bc, nx, B, nx);
+ free(T);
+ free(I);
+
+ int *ipiv = (int *) malloc(nx*sizeof(int));
+ dgesv_3l(nx, nu, Ac, nx, ipiv, B, nx, &info);
+ free(ipiv);
+
+ free(Ac);
+ free(Bc);
+ free(bb);
+
+
+/************************************************
+* initial state
+************************************************/
+
+ if(nx==4)
+ {
+ x0[0] = 5;
+ x0[1] = 10;
+ x0[2] = 15;
+ x0[3] = 20;
+ }
+ else
+ {
+ int jj;
+ for(jj=0; jj<nx; jj++)
+ x0[jj] = 1;
+ }
+
+ }
+
+
+
+int main()
+ {
+
+ printf("\nExample of LU factorization and backsolve\n\n");
+
+#if defined(LA_HIGH_PERFORMANCE)
+
+ printf("\nLA provided by BLASFEO\n\n");
+
+#elif defined(LA_BLAS)
+
+ printf("\nLA provided by BLAS\n\n");
+
+#elif defined(LA_REFERENCE)
+
+ printf("\nLA provided by REFERENCE\n\n");
+
+#else
+
+ printf("\nLA provided by ???\n\n");
+ exit(2);
+
+#endif
+
+ // loop index
+ int ii;
+
+/************************************************
+* problem size
+************************************************/
+
+ // problem size
+ int N = 4;
+ int nx_ = 4;
+ int nu_ = 1;
+
+ // stage-wise variant size
+ int nx[N+1];
+ nx[0] = 0;
+ for(ii=1; ii<=N; ii++)
+ nx[ii] = nx_;
+ nx[N] = nx_;
+
+ int nu[N+1];
+ for(ii=0; ii<N; ii++)
+ nu[ii] = nu_;
+ nu[N] = 0;
+
+/************************************************
+* dynamical system
+************************************************/
+
+ double *Ad; d_zeros(&Ad, nx_, nx_); // states update matrix
+
+ double *Bd; d_zeros(&Bd, nx_, nu_); // inputs matrix
+
+ double *bd; d_zeros(&bd, nx_, 1); // states offset
+ double *x0d; d_zeros(&x0d, nx_, 1); // initial state
+
+ double Ts = 0.5; // sampling time
+ d_mass_spring_system(Ts, nx_, nu_, N, Ad, Bd, bd, x0d);
+
+ float *A; s_zeros(&A, nx_, nx_); for(ii=0; ii<nx_*nx_; ii++) A[ii] = (float) Ad[ii];
+ float *B; s_zeros(&B, nx_, nu_); for(ii=0; ii<nx_*nu_; ii++) B[ii] = (float) Bd[ii];
+ float *b; s_zeros(&b, nx_, 1); for(ii=0; ii<nx_; ii++) b[ii] = (float) bd[ii];
+ float *x0; s_zeros(&x0, nx_, 1); for(ii=0; ii<nx_; ii++) x0[ii] = (float) x0d[ii];
+
+ for(ii=0; ii<nx_; ii++)
+ b[ii] = 0.1;
+
+ for(ii=0; ii<nx_; ii++)
+ x0[ii] = 0;
+ x0[0] = 2.5;
+ x0[1] = 2.5;
+
+ s_print_mat(nx_, nx_, A, nx_);
+ s_print_mat(nx_, nu_, B, nx_);
+ s_print_mat(1, nx_, b, 1);
+ s_print_mat(1, nx_, x0, 1);
+
+/************************************************
+* cost function
+************************************************/
+
+ float *R; s_zeros(&R, nu_, nu_);
+ for(ii=0; ii<nu_; ii++) R[ii*(nu_+1)] = 2.0;
+
+ float *S; s_zeros(&S, nu_, nx_);
+
+ float *Q; s_zeros(&Q, nx_, nx_);
+ for(ii=0; ii<nx_; ii++) Q[ii*(nx_+1)] = 1.0;
+
+ float *r; s_zeros(&r, nu_, 1);
+ for(ii=0; ii<nu_; ii++) r[ii] = 0.2;
+
+ float *q; s_zeros(&q, nx_, 1);
+ for(ii=0; ii<nx_; ii++) q[ii] = 0.1;
+
+ s_print_mat(nu_, nu_, R, nu_);
+ s_print_mat(nu_, nx_, S, nu_);
+ s_print_mat(nx_, nx_, Q, nx_);
+ s_print_mat(1, nu_, r, 1);
+ s_print_mat(1, nx_, q, 1);
+
+/************************************************
+* matrices as strmat
+************************************************/
+
+ struct s_strmat sA;
+ s_allocate_strmat(nx_, nx_, &sA);
+ s_cvt_mat2strmat(nx_, nx_, A, nx_, &sA, 0, 0);
+ struct s_strvec sb;
+ s_allocate_strvec(nx_, &sb);
+ s_cvt_vec2strvec(nx_, b, &sb, 0);
+ struct s_strvec sx0;
+ s_allocate_strvec(nx_, &sx0);
+ s_cvt_vec2strvec(nx_, x0, &sx0, 0);
+ struct s_strvec sb0;
+ s_allocate_strvec(nx_, &sb0);
+ float *b0; d_zeros(&b0, nx_, 1); // states offset
+ sgemv_n_libstr(nx_, nx_, 1.0, &sA, 0, 0, &sx0, 0, 1.0, &sb, 0, &sb0, 0);
+ s_print_tran_strvec(nx_, &sb0, 0);
+
+ struct s_strmat sBbt0;
+ s_allocate_strmat(nu_+nx_+1, nx_, &sBbt0);
+ s_cvt_tran_mat2strmat(nx_, nx_, B, nx_, &sBbt0, 0, 0);
+ srowin_libstr(nx_, 1.0, &sb0, 0, &sBbt0, nu_, 0);
+ s_print_strmat(nu_+1, nx_, &sBbt0, 0, 0);
+
+ struct s_strmat sBAbt1;
+ s_allocate_strmat(nu_+nx_+1, nx_, &sBAbt1);
+ s_cvt_tran_mat2strmat(nx_, nu_, B, nx_, &sBAbt1, 0, 0);
+ s_cvt_tran_mat2strmat(nx_, nx_, A, nx_, &sBAbt1, nu_, 0);
+ s_cvt_tran_mat2strmat(nx_, 1, b, nx_, &sBAbt1, nu_+nx_, 0);
+ s_print_strmat(nu_+nx_+1, nx_, &sBAbt1, 0, 0);
+
+ struct s_strvec sr0; // XXX no need to update r0 since S=0
+ s_allocate_strvec(nu_, &sr0);
+ s_cvt_vec2strvec(nu_, r, &sr0, 0);
+
+ struct s_strmat sRr0;
+ s_allocate_strmat(nu_+1, nu_, &sRr0);
+ s_cvt_mat2strmat(nu_, nu_, R, nu_, &sRr0, 0, 0);
+ srowin_libstr(nu_, 1.0, &sr0, 0, &sRr0, nu_, 0);
+ s_print_strmat(nu_+1, nu_, &sRr0, 0, 0);
+
+ struct s_strvec srq1;
+ s_allocate_strvec(nu_+nx_, &srq1);
+ s_cvt_vec2strvec(nu_, r, &srq1, 0);
+ s_cvt_vec2strvec(nx_, q, &srq1, nu_);
+
+ struct s_strmat sRSQrq1;
+ s_allocate_strmat(nu_+nx_+1, nu_+nx_, &sRSQrq1);
+ s_cvt_mat2strmat(nu_, nu_, R, nu_, &sRSQrq1, 0, 0);
+ s_cvt_tran_mat2strmat(nu_, nx_, S, nu_, &sRSQrq1, nu_, 0);
+ s_cvt_mat2strmat(nx_, nx_, Q, nx_, &sRSQrq1, nu_, nu_);
+ srowin_libstr(nu_+nx_, 1.0, &srq1, 0, &sRSQrq1, nu_+nx_, 0);
+ s_print_strmat(nu_+nx_+1, nu_+nx_, &sRSQrq1, 0, 0);
+
+ struct s_strvec sqN;
+ s_allocate_strvec(nx_, &sqN);
+ s_cvt_vec2strvec(nx_, q, &sqN, 0);
+
+ struct s_strmat sQqN;
+ s_allocate_strmat(nx_+1, nx_, &sQqN);
+ s_cvt_mat2strmat(nx_, nx_, Q, nx_, &sQqN, 0, 0);
+ srowin_libstr(nx_, 1.0, &sqN, 0, &sQqN, nx_, 0);
+ s_print_strmat(nx_+1, nx_, &sQqN, 0, 0);
+
+/************************************************
+* array of matrices
+************************************************/
+
+ struct s_strmat hsBAbt[N];
+ struct s_strvec hsb[N];
+ struct s_strmat hsRSQrq[N+1];
+ struct s_strvec hsrq[N+1];
+ struct s_strmat hsL[N+1];
+ struct s_strvec hsPb[N];
+ struct s_strvec hsux[N+1];
+ struct s_strvec hspi[N];
+ struct s_strmat hswork_mat[1];
+ struct s_strvec hswork_vec[1];
+
+ hsBAbt[0] = sBbt0;
+ hsb[0] = sb0;
+ hsRSQrq[0] = sRr0;
+ hsrq[0] = sr0;
+ s_allocate_strmat(nu_+1, nu_, &hsL[0]);
+ s_allocate_strvec(nx_, &hsPb[0]);
+ s_allocate_strvec(nx_+nu_+1, &hsux[0]);
+ s_allocate_strvec(nx_, &hspi[0]);
+ for(ii=1; ii<N; ii++)
+ {
+ hsBAbt[ii] = sBAbt1;
+ hsb[ii] = sb;
+ hsRSQrq[ii] = sRSQrq1;
+ hsrq[ii] = srq1;
+ s_allocate_strmat(nu_+nx_+1, nu_+nx_, &hsL[ii]);
+ s_allocate_strvec(nx_, &hsPb[ii]);
+ s_allocate_strvec(nx_+nu_+1, &hsux[ii]);
+ s_allocate_strvec(nx_, &hspi[ii]);
+ }
+ hsRSQrq[N] = sQqN;
+ hsrq[N] = sqN;
+ s_allocate_strmat(nx_+1, nx_, &hsL[N]);
+ s_allocate_strvec(nx_+nu_+1, &hsux[N]);
+ s_allocate_strmat(nu_+nx_+1, nx_, &hswork_mat[0]);
+ s_allocate_strvec(nx_, &hswork_vec[0]);
+
+// for(ii=0; ii<N; ii++)
+// d_print_strmat(nu[ii]+nx[ii]+1, nx[ii+1], &hsBAbt[ii], 0, 0);
+// return 0;
+
+/************************************************
+* call Riccati solver
+************************************************/
+
+ // timing
+ struct timeval tv0, tv1, tv2, tv3;
+ int nrep = 1000;
+ int rep;
+
+ gettimeofday(&tv0, NULL); // time
+
+ for(rep=0; rep<nrep; rep++)
+ {
+ s_back_ric_sv_libstr(N, nx, nu, hsBAbt, hsRSQrq, hsL, hsux, hspi, hswork_mat, hswork_vec);
+ }
+
+ gettimeofday(&tv1, NULL); // time
+
+ for(rep=0; rep<nrep; rep++)
+ {
+ s_back_ric_trf_libstr(N, nx, nu, hsBAbt, hsRSQrq, hsL, hswork_mat);
+ }
+
+ gettimeofday(&tv2, NULL); // time
+
+ for(rep=0; rep<nrep; rep++)
+ {
+ s_back_ric_trs_libstr(N, nx, nu, hsBAbt, hsb, hsrq, hsL, hsPb, hsux, hspi, hswork_vec);
+ }
+
+ gettimeofday(&tv3, NULL); // time
+
+ float time_sv = (float) (tv1.tv_sec-tv0.tv_sec)/(nrep+0.0)+(tv1.tv_usec-tv0.tv_usec)/(nrep*1e6);
+ float time_trf = (float) (tv2.tv_sec-tv1.tv_sec)/(nrep+0.0)+(tv2.tv_usec-tv1.tv_usec)/(nrep*1e6);
+ float time_trs = (float) (tv3.tv_sec-tv2.tv_sec)/(nrep+0.0)+(tv3.tv_usec-tv2.tv_usec)/(nrep*1e6);
+
+ // print sol
+ printf("\nux = \n\n");
+ for(ii=0; ii<=N; ii++)
+ s_print_tran_strvec(nu[ii]+nx[ii], &hsux[ii], 0);
+
+ printf("\npi = \n\n");
+ for(ii=0; ii<N; ii++)
+ s_print_tran_strvec(nx[ii+1], &hspi[ii], 0);
+
+// printf("\nL = \n\n");
+// for(ii=0; ii<=N; ii++)
+// s_print_strmat(nu[ii]+nx[ii]+1, nu[ii]+nx[ii], &hsL[ii], 0, 0);
+
+ printf("\ntime sv\t\ttime trf\t\ttime trs\n");
+ printf("\n%e\t%e\t%e\n", time_sv, time_trf, time_trs);
+ printf("\n");
+
+/************************************************
+* free memory
+************************************************/
+
+ d_free(Ad);
+ d_free(Bd);
+ d_free(bd);
+ d_free(x0d);
+ s_free(A);
+ s_free(B);
+ s_free(b);
+ s_free(x0);
+ s_free(R);
+ s_free(S);
+ s_free(Q);
+ s_free(r);
+ s_free(q);
+ s_free(b0);
+ s_free_strmat(&sA);
+ s_free_strvec(&sb);
+ s_free_strmat(&sBbt0);
+ s_free_strvec(&sb0);
+ s_free_strmat(&sBAbt1);
+ s_free_strmat(&sRr0);
+ s_free_strvec(&sr0);
+ s_free_strmat(&sRSQrq1);
+ s_free_strvec(&srq1);
+ s_free_strmat(&sQqN);
+ s_free_strvec(&sqN);
+ s_free_strmat(&hsL[0]);
+ s_free_strvec(&hsPb[0]);
+ s_free_strvec(&hsux[0]);
+ s_free_strvec(&hspi[0]);
+ for(ii=1; ii<N; ii++)
+ {
+ s_free_strmat(&hsL[ii]);
+ s_free_strvec(&hsPb[ii]);
+ s_free_strvec(&hsux[ii]);
+ s_free_strvec(&hspi[ii]);
+ }
+ s_free_strmat(&hsL[N]);
+ s_free_strvec(&hsux[N]);
+ s_free_strmat(&hswork_mat[0]);
+ s_free_strvec(&hswork_vec[0]);
+
+
+/************************************************
+* return
+************************************************/
+
+ return 0;
+
+ }
+
+
+
+
diff --git a/examples/example_tree_riccati_recursion.c b/examples/example_tree_riccati_recursion.c
new file mode 100644
index 0000000..b61d2d3
--- /dev/null
+++ b/examples/example_tree_riccati_recursion.c
@@ -0,0 +1,638 @@
+/**************************************************************************************************
+* *
+* This file is part of BLASFEO. *
+* *
+* BLASFEO -- BLAS For Embedded Optimization. *
+* Copyright (C) 2016-2017 by Gianluca Frison. *
+* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl. *
+* All rights reserved. *
+* *
+* HPMPC is free software; you can redistribute it and/or *
+* modify it under the terms of the GNU Lesser General Public *
+* License as published by the Free Software Foundation; either *
+* version 2.1 of the License, or (at your option) any later version. *
+* *
+* HPMPC is distributed in the hope that it will be useful, *
+* but WITHOUT ANY WARRANTY; without even the implied warranty of *
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. *
+* See the GNU Lesser General Public License for more details. *
+* *
+* You should have received a copy of the GNU Lesser General Public *
+* License along with HPMPC; if not, write to the Free Software *
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA *
+* *
+* Author: Gianluca Frison, giaf (at) dtu.dk *
+* gianluca.frison (at) imtek.uni-freiburg.de *
+* *
+**************************************************************************************************/
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <sys/time.h>
+
+#include "tools.h"
+
+#include "../include/blasfeo_common.h"
+#include "../include/blasfeo_i_aux.h"
+#include "../include/blasfeo_d_aux.h"
+#include "../include/blasfeo_d_kernel.h"
+#include "../include/blasfeo_d_blas.h"
+
+
+
+void d_back_ric_sv_libstr(int N, int *nx, int *nu, struct d_strmat *hsBAbt, struct d_strmat *hsRSQrq, struct d_strmat *hsL, struct d_strmat *hsLxt, struct d_strvec *hsux, struct d_strvec *hspi, struct d_strmat *hswork_mat, struct d_strvec *hswork_vec)
+ {
+
+ int nn;
+
+ // factorization and backward substitution
+
+ // last stage
+ dpotrf_l_libstr(nx[N]+1, nx[N], &hsRSQrq[N], 0, 0, &hsL[N], 0, 0);
+ dtrtr_l_libstr(nx[N], &hsL[N], 0, 0, &hsLxt[N], 0, 0);
+
+ // middle stages
+ for(nn=0; nn<N; nn++)
+ {
+ dtrmm_rutn_libstr(nu[N-nn-1]+nx[N-nn-1]+1, nx[N-nn], 1.0, &hsBAbt[N-nn-1], 0, 0, &hsLxt[N-nn], 0, 0, 0.0, &hswork_mat[0], 0, 0, &hswork_mat[0], 0, 0);
+ dgead_libstr(1, nx[N-nn], 1.0, &hsL[N-nn], nu[N-nn]+nx[N-nn], nu[N-nn], &hswork_mat[0], nu[N-nn-1]+nx[N-nn-1], 0);
+#if 1
+ dsyrk_dpotrf_ln_libstr(nu[N-nn-1]+nx[N-nn-1]+1, nu[N-nn-1]+nx[N-nn-1], nx[N-nn], &hswork_mat[0], 0, 0, &hswork_mat[0], 0, 0, &hsRSQrq[N-nn-1], 0, 0, &hsL[N-nn-1], 0, 0);
+#else
+ dsyrk_ln_libstr(nu[N-nn-1]+nx[N-nn-1]+1, nu[N-nn-1]+nx[N-nn-1], nx[N-nn], 1.0, &hswork_mat[0], 0, 0, &hswork_mat[0], 0, 0, 1.0, &hsRSQrq[N-nn-1], 0, 0, &hsL[N-nn-1], 0, 0);
+ dpotrf_l_libstr(nu[N-nn-1]+nx[N-nn-1]+1, nu[N-nn-1]+nx[N-nn-1], &hsL[N-nn-1], 0, 0, &hsL[N-nn-1], 0, 0);
+#endif
+ dtrtr_l_libstr(nx[N-nn-1], &hsL[N-nn-1], nu[N-nn-1], nu[N-nn-1], &hsLxt[N-nn-1], 0, 0);
+ }
+
+ // forward substitution
+
+ // first stage
+ nn = 0;
+ drowex_libstr(nu[nn]+nx[nn], -1.0, &hsL[nn], nu[nn]+nx[nn], 0, &hsux[nn], 0);
+ dtrsv_ltn_libstr(nu[nn]+nx[nn], nu[nn]+nx[nn], &hsL[nn], 0, 0, &hsux[nn], 0, &hsux[nn], 0);
+ drowex_libstr(nx[nn+1], 1.0, &hsBAbt[nn], nu[nn]+nx[nn], 0, &hsux[nn+1], nu[nn+1]);
+ dgemv_t_libstr(nu[nn]+nx[nn], nx[nn+1], 1.0, &hsBAbt[nn], 0, 0, &hsux[nn], 0, 1.0, &hsux[nn+1], nu[nn+1], &hsux[nn+1], nu[nn+1]);
+ dveccp_libstr(nx[nn+1], 1.0, &hsux[nn+1], nu[nn+1], &hspi[nn], 0);
+ drowex_libstr(nx[nn+1], 1.0, &hsL[nn+1], nu[nn+1]+nx[nn+1], nu[nn+1], &hswork_vec[0], 0);
+ dtrmv_unn_libstr(nx[nn+1], &hsLxt[nn+1], 0, 0, &hspi[nn], 0, &hspi[nn], 0);
+ daxpy_libstr(nx[nn+1], 1.0, &hswork_vec[0], 0, &hspi[nn], 0);
+ dtrmv_utn_libstr(nx[nn+1], &hsLxt[nn+1], 0, 0, &hspi[nn], 0, &hspi[nn], 0);
+
+ // middle stages
+ for(nn=1; nn<N; nn++)
+ {
+ drowex_libstr(nu[nn], -1.0, &hsL[nn], nu[nn]+nx[nn], 0, &hsux[nn], 0);
+ dtrsv_ltn_libstr(nu[nn]+nx[nn], nu[nn], &hsL[nn], 0, 0, &hsux[nn], 0, &hsux[nn], 0);
+ drowex_libstr(nx[nn+1], 1.0, &hsBAbt[nn], nu[nn]+nx[nn], 0, &hsux[nn+1], nu[nn+1]);
+ dgemv_t_libstr(nu[nn]+nx[nn], nx[nn+1], 1.0, &hsBAbt[nn], 0, 0, &hsux[nn], 0, 1.0, &hsux[nn+1], nu[nn+1], &hsux[nn+1], nu[nn+1]);
+ dveccp_libstr(nx[nn+1], 1.0, &hsux[nn+1], nu[nn+1], &hspi[nn], 0);
+ drowex_libstr(nx[nn+1], 1.0, &hsL[nn+1], nu[nn+1]+nx[nn+1], nu[nn+1], &hswork_vec[0], 0);
+ dtrmv_unn_libstr(nx[nn+1], &hsLxt[nn+1], 0, 0, &hspi[nn], 0, &hspi[nn], 0);
+ daxpy_libstr(nx[nn+1], 1.0, &hswork_vec[0], 0, &hspi[nn], 0);
+ dtrmv_utn_libstr(nx[nn+1], &hsLxt[nn+1], 0, 0, &hspi[nn], 0, &hspi[nn], 0);
+ }
+
+ return;
+
+ }
+
+
+
+void d_back_ric_trf_funnel1_libstr(int md, int *nx, int *nu, struct d_strmat *hsBAbt, struct d_strmat *hsRSQrq, struct d_strmat *hsL, struct d_strmat *hsLxt_old, struct d_strmat *hsLxt_new, struct d_strmat *hswork_mat)
+ {
+
+ int ii;
+
+ ii = 0;
+ dtrmm_rutn_libstr(nu[0]+nx[0], nx[1], 1.0, &hsBAbt[ii], 0, 0, &hsLxt_old[ii], 0, 0, 0.0, &hswork_mat[0], 0, 0, &hswork_mat[0], 0, 0);
+ dsyrk_ln_libstr(nu[0]+nx[0], nu[0]+nx[0], nx[1], 1.0, &hswork_mat[0], 0, 0, &hswork_mat[0], 0, 0, 1.0, &hsRSQrq[0], 0, 0, &hsL[0], 0, 0);
+ for(ii=1; ii<md; ii++)
+ {
+ dtrmm_rutn_libstr(nu[0]+nx[0], nx[1], 1.0, &hsBAbt[ii], 0, 0, &hsLxt_old[ii], 0, 0, 0.0, &hswork_mat[0], 0, 0, &hswork_mat[0], 0, 0);
+ dsyrk_ln_libstr(nu[0]+nx[0], nu[0]+nx[0], nx[1], 1.0, &hswork_mat[0], 0, 0, &hswork_mat[0], 0, 0, 1.0, &hsL[0], 0, 0, &hsL[0], 0, 0);
+ }
+
+ dpotrf_l_libstr(nu[0]+nx[0], nu[0]+nx[0], &hsL[0], 0, 0, &hsL[0], 0, 0);
+ dtrtr_l_libstr(nx[0], &hsL[0], nu[0], nu[0], &hsLxt_new[0], 0, 0);
+
+ return;
+
+ }
+
+
+
+void d_back_ric_trf_step1_libstr(int *nx, int *nu, struct d_strmat *hsBAbt, struct d_strmat *hsRSQrq, struct d_strmat *hsL, struct d_strmat *hsLxt, struct d_strmat *hswork_mat)
+ {
+
+ dtrmm_rutn_libstr(nu[0]+nx[0], nx[1], 1.0, &hsBAbt[0], 0, 0, &hsLxt[1], 0, 0, 0.0, &hswork_mat[0], 0, 0, &hswork_mat[0], 0, 0);
+ dsyrk_ln_libstr(nu[0]+nx[0], nu[0]+nx[0], nx[1], 1.0, &hswork_mat[0], 0, 0, &hswork_mat[0], 0, 0, 1.0, &hsRSQrq[0], 0, 0, &hsL[0], 0, 0);
+ dpotrf_l_libstr(nu[0]+nx[0], nu[0]+nx[0], &hsL[0], 0, 0, &hsL[0], 0, 0);
+ dtrtr_l_libstr(nx[0], &hsL[0], nu[0], nu[0], &hsLxt[0], 0, 0);
+
+ return;
+
+ }
+
+
+
+void d_back_ric_trf_stepN_libstr(int *nx, struct d_strmat *hsRSQrq, struct d_strmat *hsL, struct d_strmat *hsLxt)
+ {
+
+ dpotrf_l_libstr(nx[0], nx[0], &hsRSQrq[0], 0, 0, &hsL[0], 0, 0);
+ dtrtr_l_libstr(nx[0], &hsL[0], 0, 0, &hsLxt[0], 0, 0);
+
+ return;
+
+ }
+
+
+
+void d_back_ric_trf_libstr(int N, int *nx, int *nu, struct d_strmat *hsBAbt, struct d_strmat *hsRSQrq, struct d_strmat *hsL, struct d_strmat *hsLxt, struct d_strmat *hswork_mat)
+ {
+
+ int nn;
+
+ // factorization
+
+ // last stage
+ d_back_ric_trf_stepN_libstr(&nx[N], &hsRSQrq[N], &hsL[N], &hsLxt[N]);
+
+ // middle stages
+ for(nn=0; nn<N; nn++)
+ {
+ d_back_ric_trf_step1_libstr(&nx[N-nn-1], &nu[N-nn-1], &hsBAbt[N-nn-1], &hsRSQrq[N-nn-1], &hsL[N-nn-1], &hsLxt[N-nn-1], hswork_mat);
+ }
+
+ return;
+
+ }
+
+
+
+void d_back_ric_trs_libstr(int N, int *nx, int *nu, struct d_strmat *hsBAbt, struct d_strvec *hsb, struct d_strvec *hsrq, struct d_strmat *hsL, struct d_strmat *hsLxt, struct d_strvec *hsPb, struct d_strvec *hsux, struct d_strvec *hspi, struct d_strvec *hswork_vec)
+ {
+
+ int nn;
+
+ // backward substitution
+
+ // last stage
+ dveccp_libstr(nu[N]+nx[N], 1.0, &hsrq[N], 0, &hsux[N], 0);
+
+ // middle stages
+ for(nn=0; nn<N-1; nn++)
+ {
+ // compute Pb
+ dtrmv_unn_libstr(nx[N-nn], &hsLxt[N-nn], 0, 0, &hsb[N-nn-1], 0, &hsPb[N-nn-1], 0);
+ dtrmv_utn_libstr(nx[N-nn], &hsLxt[N-nn], 0, 0, &hsPb[N-nn-1], 0, &hsPb[N-nn-1], 0);
+ dveccp_libstr(nu[N-nn-1]+nx[N-nn-1], 1.0, &hsrq[N-nn-1], 0, &hsux[N-nn-1], 0);
+ dveccp_libstr(nx[N-nn], 1.0, &hsPb[N-nn-1], 0, &hswork_vec[0], 0);
+ daxpy_libstr(nx[N-nn], 1.0, &hsux[N-nn], nu[N-nn], &hswork_vec[0], 0);
+ dgemv_n_libstr(nu[N-nn-1]+nx[N-nn-1], nx[N-nn], 1.0, &hsBAbt[N-nn-1], 0, 0, &hswork_vec[0], 0, 1.0, &hsux[N-nn-1], 0, &hsux[N-nn-1], 0);
+ dtrsv_lnn_libstr(nu[N-nn-1]+nx[N-nn-1], nu[N-nn-1], &hsL[N-nn-1], 0, 0, &hsux[N-nn-1], 0, &hsux[N-nn-1], 0);
+ }
+
+ // first stage
+ nn = N-1;
+ dtrmv_unn_libstr(nx[N-nn], &hsLxt[N-nn], 0, 0, &hsb[N-nn-1], 0, &hsPb[N-nn-1], 0);
+ dtrmv_utn_libstr(nx[N-nn], &hsLxt[N-nn], 0, 0, &hsPb[N-nn-1], 0, &hsPb[N-nn-1], 0);
+ dveccp_libstr(nu[N-nn-1]+nx[N-nn-1], 1.0, &hsrq[N-nn-1], 0, &hsux[N-nn-1], 0);
+ dveccp_libstr(nx[N-nn], 1.0, &hsPb[N-nn-1], 0, &hswork_vec[0], 0);
+ daxpy_libstr(nx[N-nn], 1.0, &hsux[N-nn], nu[N-nn], &hswork_vec[0], 0);
+ dgemv_n_libstr(nu[N-nn-1]+nx[N-nn-1], nx[N-nn], 1.0, &hsBAbt[N-nn-1], 0, 0, &hswork_vec[0], 0, 1.0, &hsux[N-nn-1], 0, &hsux[N-nn-1], 0);
+ dtrsv_lnn_libstr(nu[N-nn-1]+nx[N-nn-1], nu[N-nn-1]+nx[N-nn-1], &hsL[N-nn-1], 0, 0, &hsux[N-nn-1], 0, &hsux[N-nn-1], 0);
+
+ // forward substitution
+
+ // first stage
+ nn = 0;
+ dveccp_libstr(nx[nn+1], 1.0, &hsux[nn+1], nu[nn+1], &hspi[nn], 0);
+ dveccp_libstr(nu[nn]+nx[nn], -1.0, &hsux[nn], 0, &hsux[nn], 0);
+ dtrsv_ltn_libstr(nu[nn]+nx[nn], nu[nn]+nx[nn], &hsL[nn], 0, 0, &hsux[nn], 0, &hsux[nn], 0);
+ dgemv_t_libstr(nu[nn]+nx[nn], nx[nn+1], 1.0, &hsBAbt[nn], 0, 0, &hsux[nn], 0, 1.0, &hsb[nn], 0, &hsux[nn+1], nu[nn+1]);
+ dveccp_libstr(nx[nn+1], 1.0, &hsux[nn+1], nu[nn+1], &hswork_vec[0], 0);
+ dtrmv_unn_libstr(nx[nn+1], &hsLxt[nn+1], 0, 0, &hswork_vec[0], 0, &hswork_vec[0], 0);
+ dtrmv_utn_libstr(nx[nn+1], &hsLxt[nn+1], 0, 0, &hswork_vec[0], 0, &hswork_vec[0], 0);
+ daxpy_libstr(nx[nn+1], 1.0, &hswork_vec[0], 0, &hspi[nn], 0);
+
+ // middle stages
+ for(nn=1; nn<N; nn++)
+ {
+ dveccp_libstr(nx[nn+1], 1.0, &hsux[nn+1], nu[nn+1], &hspi[nn], 0);
+ dveccp_libstr(nu[nn], -1.0, &hsux[nn], 0, &hsux[nn], 0);
+ dtrsv_ltn_libstr(nu[nn]+nx[nn], nu[nn], &hsL[nn], 0, 0, &hsux[nn], 0, &hsux[nn], 0);
+ dgemv_t_libstr(nu[nn]+nx[nn], nx[nn+1], 1.0, &hsBAbt[nn], 0, 0, &hsux[nn], 0, 1.0, &hsb[nn], 0, &hsux[nn+1], nu[nn+1]);
+ dveccp_libstr(nx[nn+1], 1.0, &hsux[nn+1], nu[nn+1], &hswork_vec[0], 0);
+ dtrmv_unn_libstr(nx[nn+1], &hsLxt[nn+1], 0, 0, &hswork_vec[0], 0, &hswork_vec[0], 0);
+ dtrmv_utn_libstr(nx[nn+1], &hsLxt[nn+1], 0, 0, &hswork_vec[0], 0, &hswork_vec[0], 0);
+ daxpy_libstr(nx[nn+1], 1.0, &hswork_vec[0], 0, &hspi[nn], 0);
+ }
+
+ return;
+
+ }
+
+
+
+/************************************************
+Mass-spring system: nx/2 masses connected each other with springs (in a row), and the first and the last one to walls. nu (<=nx) controls act on the first nu masses. The system is sampled with sampling time Ts.
+************************************************/
+void mass_spring_system(double Ts, int nx, int nu, int N, double *A, double *B, double *b, double *x0)
+ {
+
+ int nx2 = nx*nx;
+
+ int info = 0;
+
+ int pp = nx/2; // number of masses
+
+/************************************************
+* build the continuous time system
+************************************************/
+
+ double *T; d_zeros(&T, pp, pp);
+ int ii;
+ for(ii=0; ii<pp; ii++) T[ii*(pp+1)] = -2;
+ for(ii=0; ii<pp-1; ii++) T[ii*(pp+1)+1] = 1;
+ for(ii=1; ii<pp; ii++) T[ii*(pp+1)-1] = 1;
+
+ double *Z; d_zeros(&Z, pp, pp);
+ double *I; d_zeros(&I, pp, pp); for(ii=0; ii<pp; ii++) I[ii*(pp+1)]=1.0; // = eye(pp);
+ double *Ac; d_zeros(&Ac, nx, nx);
+ dmcopy(pp, pp, Z, pp, Ac, nx);
+ dmcopy(pp, pp, T, pp, Ac+pp, nx);
+ dmcopy(pp, pp, I, pp, Ac+pp*nx, nx);
+ dmcopy(pp, pp, Z, pp, Ac+pp*(nx+1), nx);
+ free(T);
+ free(Z);
+ free(I);
+
+ d_zeros(&I, nu, nu); for(ii=0; ii<nu; ii++) I[ii*(nu+1)]=1.0; //I = eye(nu);
+ double *Bc; d_zeros(&Bc, nx, nu);
+ dmcopy(nu, nu, I, nu, Bc+pp, nx);
+ free(I);
+
+/************************************************
+* compute the discrete time system
+************************************************/
+
+ double *bb; d_zeros(&bb, nx, 1);
+ dmcopy(nx, 1, bb, nx, b, nx);
+
+ dmcopy(nx, nx, Ac, nx, A, nx);
+ dscal_3l(nx2, Ts, A);
+ expm(nx, A);
+
+ d_zeros(&T, nx, nx);
+ d_zeros(&I, nx, nx); for(ii=0; ii<nx; ii++) I[ii*(nx+1)]=1.0; //I = eye(nx);
+ dmcopy(nx, nx, A, nx, T, nx);
+ daxpy_3l(nx2, -1.0, I, T);
+ dgemm_nn_3l(nx, nu, nx, T, nx, Bc, nx, B, nx);
+ free(T);
+ free(I);
+
+ int *ipiv = (int *) malloc(nx*sizeof(int));
+ dgesv_3l(nx, nu, Ac, nx, ipiv, B, nx, &info);
+ free(ipiv);
+
+ free(Ac);
+ free(Bc);
+ free(bb);
+
+
+/************************************************
+* initial state
+************************************************/
+
+ if(nx==4)
+ {
+ x0[0] = 5;
+ x0[1] = 10;
+ x0[2] = 15;
+ x0[3] = 20;
+ }
+ else
+ {
+ int jj;
+ for(jj=0; jj<nx; jj++)
+ x0[jj] = 1;
+ }
+
+ }
+
+
+
+int main()
+ {
+
+ printf("\nExample of LU factorization and backsolve\n\n");
+
+#if defined(LA_HIGH_PERFORMANCE)
+
+ printf("\nLA provided by BLASFEO\n\n");
+
+#elif defined(LA_BLAS)
+
+ printf("\nLA provided by BLAS\n\n");
+
+#else
+
+ printf("\nLA provided by ???\n\n");
+ exit(2);
+
+#endif
+
+ // loop index
+ int ii;
+
+/************************************************
+* problem size
+************************************************/
+
+ // problem size
+ int N = 4;
+ int nx_ = 8;
+ int nu_ = 3;
+
+ // stage-wise variant size
+ int nx[N+1];
+ nx[0] = 0;
+ for(ii=1; ii<=N; ii++)
+ nx[ii] = nx_;
+ nx[N] = nx_;
+
+ int nu[N+1];
+ for(ii=0; ii<N; ii++)
+ nu[ii] = nu_;
+ nu[N] = 0;
+
+/************************************************
+* dynamical system
+************************************************/
+
+ double *A; d_zeros(&A, nx_, nx_); // states update matrix
+
+ double *B; d_zeros(&B, nx_, nu_); // inputs matrix
+
+ double *b; d_zeros(&b, nx_, 1); // states offset
+ double *x0; d_zeros_align(&x0, nx_, 1); // initial state
+
+ double Ts = 0.5; // sampling time
+ mass_spring_system(Ts, nx_, nu_, N, A, B, b, x0);
+
+ for(ii=0; ii<nx_; ii++)
+ b[ii] = 0.1;
+
+ for(ii=0; ii<nx_; ii++)
+ x0[ii] = 0;
+ x0[0] = 2.5;
+ x0[1] = 2.5;
+
+ d_print_mat(nx_, nx_, A, nx_);
+ d_print_mat(nx_, nu_, B, nx_);
+ d_print_mat(1, nx_, b, 1);
+ d_print_mat(1, nx_, x0, 1);
+
+/************************************************
+* cost function
+************************************************/
+
+ double *R; d_zeros(&R, nu_, nu_);
+ for(ii=0; ii<nu_; ii++) R[ii*(nu_+1)] = 2.0;
+
+ double *S; d_zeros(&S, nu_, nx_);
+
+ double *Q; d_zeros(&Q, nx_, nx_);
+ for(ii=0; ii<nx_; ii++) Q[ii*(nx_+1)] = 1.0;
+
+ double *r; d_zeros(&r, nu_, 1);
+ for(ii=0; ii<nu_; ii++) r[ii] = 0.2;
+
+ double *q; d_zeros(&q, nx_, 1);
+ for(ii=0; ii<nx_; ii++) q[ii] = 0.1;
+
+ d_print_mat(nu_, nu_, R, nu_);
+ d_print_mat(nu_, nx_, S, nu_);
+ d_print_mat(nx_, nx_, Q, nx_);
+ d_print_mat(1, nu_, r, 1);
+ d_print_mat(1, nx_, q, 1);
+
+/************************************************
+* matrices as strmat
+************************************************/
+
+ struct d_strmat sA;
+ d_allocate_strmat(nx_, nx_, &sA);
+ d_cvt_mat2strmat(nx_, nx_, A, nx_, &sA, 0, 0);
+ struct d_strvec sb;
+ d_allocate_strvec(nx_, &sb);
+ d_cvt_vec2strvec(nx_, b, &sb, 0);
+ struct d_strvec sx0;
+ d_allocate_strvec(nx_, &sx0);
+ d_cvt_vec2strvec(nx_, x0, &sx0, 0);
+ struct d_strvec sb0;
+ d_allocate_strvec(nx_, &sb0);
+ double *b0; d_zeros(&b0, nx_, 1); // states offset
+ dgemv_n_libstr(nx_, nx_, 1.0, &sA, 0, 0, &sx0, 0, 1.0, &sb, 0, &sb0, 0);
+ d_print_tran_strvec(nx_, &sb0, 0);
+
+ struct d_strmat sBbt0;
+ d_allocate_strmat(nu_+nx_+1, nx_, &sBbt0);
+ d_cvt_tran_mat2strmat(nx_, nx_, B, nx_, &sBbt0, 0, 0);
+ drowin_libstr(nx_, 1.0, &sb0, 0, &sBbt0, nu_, 0);
+ d_print_strmat(nu_+1, nx_, &sBbt0, 0, 0);
+
+ struct d_strmat sBAbt1;
+ d_allocate_strmat(nu_+nx_+1, nx_, &sBAbt1);
+ d_cvt_tran_mat2strmat(nx_, nu_, B, nx_, &sBAbt1, 0, 0);
+ d_cvt_tran_mat2strmat(nx_, nx_, A, nx_, &sBAbt1, nu_, 0);
+ d_cvt_tran_mat2strmat(nx_, 1, b, nx_, &sBAbt1, nu_+nx_, 0);
+ d_print_strmat(nu_+nx_+1, nx_, &sBAbt1, 0, 0);
+
+ struct d_strvec sr0; // XXX no need to update r0 since S=0
+ d_allocate_strvec(nu_, &sr0);
+ d_cvt_vec2strvec(nu_, r, &sr0, 0);
+
+ struct d_strmat sRr0;
+ d_allocate_strmat(nu_+1, nu_, &sRr0);
+ d_cvt_mat2strmat(nu_, nu_, R, nu_, &sRr0, 0, 0);
+ drowin_libstr(nu_, 1.0, &sr0, 0, &sRr0, nu_, 0);
+ d_print_strmat(nu_+1, nu_, &sRr0, 0, 0);
+
+ struct d_strvec srq1;
+ d_allocate_strvec(nu_+nx_, &srq1);
+ d_cvt_vec2strvec(nu_, r, &srq1, 0);
+ d_cvt_vec2strvec(nx_, q, &srq1, nu_);
+
+ struct d_strmat sRSQrq1;
+ d_allocate_strmat(nu_+nx_+1, nu_+nx_, &sRSQrq1);
+ d_cvt_mat2strmat(nu_, nu_, R, nu_, &sRSQrq1, 0, 0);
+ d_cvt_tran_mat2strmat(nu_, nx_, S, nu_, &sRSQrq1, nu_, 0);
+ d_cvt_mat2strmat(nx_, nx_, Q, nx_, &sRSQrq1, nu_, nu_);
+ drowin_libstr(nu_+nx_, 1.0, &srq1, 0, &sRSQrq1, nu_+nx_, 0);
+ d_print_strmat(nu_+nx_+1, nu_+nx_, &sRSQrq1, 0, 0);
+
+ struct d_strvec sqN;
+ d_allocate_strvec(nx_, &sqN);
+ d_cvt_vec2strvec(nx_, q, &sqN, 0);
+
+ struct d_strmat sQqN;
+ d_allocate_strmat(nx_+1, nx_, &sQqN);
+ d_cvt_mat2strmat(nx_, nx_, Q, nx_, &sQqN, 0, 0);
+ drowin_libstr(nx_, 1.0, &sqN, 0, &sQqN, nx_, 0);
+ d_print_strmat(nx_+1, nx_, &sQqN, 0, 0);
+
+/************************************************
+* array of matrices
+************************************************/
+
+ struct d_strmat hsBAbt[N];
+ struct d_strvec hsb[N];
+ struct d_strmat hsRSQrq[N+1];
+ struct d_strvec hsrq[N+1];
+ struct d_strmat hsL[N+1];
+ struct d_strmat hsLxt[N+1];
+ struct d_strvec hsPb[N];
+ struct d_strvec hsux[N+1];
+ struct d_strvec hspi[N];
+ struct d_strmat hswork_mat[1];
+ struct d_strvec hswork_vec[1];
+
+ hsBAbt[0] = sBbt0;
+ hsb[0] = sb0;
+ hsRSQrq[0] = sRr0;
+ hsrq[0] = sr0;
+ d_allocate_strmat(nu_+1, nu_, &hsL[0]);
+// d_allocate_strmat(nu_+1, nu_, &hsLxt[0]);
+ d_allocate_strvec(nx_, &hsPb[0]);
+ d_allocate_strvec(nx_+nu_+1, &hsux[0]);
+ d_allocate_strvec(nx_, &hspi[0]);
+ for(ii=1; ii<N; ii++)
+ {
+ hsBAbt[ii] = sBAbt1;
+ hsb[ii] = sb;
+ hsRSQrq[ii] = sRSQrq1;
+ hsrq[ii] = srq1;
+ d_allocate_strmat(nu_+nx_+1, nu_+nx_, &hsL[ii]);
+ d_allocate_strmat(nx_, nu_+nx_, &hsLxt[ii]);
+ d_allocate_strvec(nx_, &hsPb[ii]);
+ d_allocate_strvec(nx_+nu_+1, &hsux[ii]);
+ d_allocate_strvec(nx_, &hspi[ii]);
+ }
+ hsRSQrq[N] = sQqN;
+ hsrq[N] = sqN;
+ d_allocate_strmat(nx_+1, nx_, &hsL[N]);
+ d_allocate_strmat(nx_, nx_, &hsLxt[N]);
+ d_allocate_strvec(nx_+nu_+1, &hsux[N]);
+ d_allocate_strmat(nu_+nx_+1, nx_, &hswork_mat[0]);
+ d_allocate_strvec(nx_, &hswork_vec[0]);
+
+// for(ii=0; ii<N; ii++)
+// d_print_strmat(nu[ii]+nx[ii]+1, nx[ii+1], &hsBAbt[ii], 0, 0);
+// return 0;
+
+/************************************************
+* call Riccati solver
+************************************************/
+
+ // timing
+ struct timeval tv0, tv1, tv2, tv3;
+ int nrep = 1000;
+ int rep;
+
+ gettimeofday(&tv0, NULL); // time
+
+ for(rep=0; rep<nrep; rep++)
+ {
+// d_back_ric_sv_libstr(N, nx, nu, hsBAbt, hsRSQrq, hsL, hsLxt, hsux, hspi, hswork_mat, hswork_vec);
+ }
+
+ gettimeofday(&tv1, NULL); // time
+
+ for(rep=0; rep<nrep; rep++)
+ {
+ d_back_ric_trf_libstr(N, nx, nu, hsBAbt, hsRSQrq, hsL, hsLxt, hswork_mat);
+ }
+
+ gettimeofday(&tv2, NULL); // time
+
+ for(rep=0; rep<nrep; rep++)
+ {
+ d_back_ric_trs_libstr(N, nx, nu, hsBAbt, hsb, hsrq, hsL, hsLxt, hsPb, hsux, hspi, hswork_vec);
+ }
+
+ gettimeofday(&tv3, NULL); // time
+
+ float time_sv = (float) (tv1.tv_sec-tv0.tv_sec)/(nrep+0.0)+(tv1.tv_usec-tv0.tv_usec)/(nrep*1e6);
+ float time_trf = (float) (tv2.tv_sec-tv1.tv_sec)/(nrep+0.0)+(tv2.tv_usec-tv1.tv_usec)/(nrep*1e6);
+ float time_trs = (float) (tv3.tv_sec-tv2.tv_sec)/(nrep+0.0)+(tv3.tv_usec-tv2.tv_usec)/(nrep*1e6);
+
+ // print sol
+ printf("\nux = \n\n");
+ for(ii=0; ii<=N; ii++)
+ d_print_tran_strvec(nu[ii]+nx[ii], &hsux[ii], 0);
+
+ printf("\npi = \n\n");
+ for(ii=0; ii<N; ii++)
+ d_print_tran_strvec(nx[ii+1], &hspi[ii], 0);
+
+ printf("\ntime sv\t\ttime trf\t\ttime trs\n");
+ printf("\n%e\t%e\t%e\n", time_sv, time_trf, time_trs);
+ printf("\n");
+
+/************************************************
+* free memory
+************************************************/
+
+ d_free(A);
+ d_free(B);
+ d_free(b);
+ d_free_align(x0);
+ d_free(R);
+ d_free(S);
+ d_free(Q);
+ d_free(r);
+ d_free(q);
+ d_free(b0);
+ d_free_strmat(&sA);
+ d_free_strvec(&sb);
+ d_free_strmat(&sBbt0);
+ d_free_strvec(&sb0);
+ d_free_strmat(&sBAbt1);
+ d_free_strmat(&sRr0);
+ d_free_strvec(&sr0);
+ d_free_strmat(&sRSQrq1);
+ d_free_strvec(&srq1);
+ d_free_strmat(&sQqN);
+ d_free_strvec(&sqN);
+ d_free_strmat(&hsL[0]);
+// d_free_strmat(&hsLxt[0]);
+ d_free_strvec(&hsPb[0]);
+ d_free_strvec(&hsux[0]);
+ d_free_strvec(&hspi[0]);
+ for(ii=1; ii<N; ii++)
+ {
+ d_free_strmat(&hsL[ii]);
+ d_free_strmat(&hsLxt[ii]);
+ d_free_strvec(&hsPb[ii]);
+ d_free_strvec(&hsux[ii]);
+ d_free_strvec(&hspi[ii]);
+ }
+ d_free_strmat(&hsL[N]);
+ d_free_strmat(&hsLxt[N]);
+ d_free_strvec(&hsux[N]);
+ d_free_strmat(&hswork_mat[0]);
+ d_free_strvec(&hswork_vec[0]);
+
+
+/************************************************
+* return
+************************************************/
+
+ return 0;
+
+ }
+
+
+
diff --git a/examples/tools.c b/examples/tools.c
new file mode 100644
index 0000000..51d9e95
--- /dev/null
+++ b/examples/tools.c
@@ -0,0 +1,724 @@
+/**************************************************************************************************
+* *
+* This file is part of HPMPC. *
+* *
+* HPMPC -- Library for High-Performance implementation of solvers for MPC. *
+* Copyright (C) 2014-2015 by Technical University of Denmark. All rights reserved. *
+* *
+* HPMPC is free software; you can redistribute it and/or *
+* modify it under the terms of the GNU Lesser General Public *
+* License as published by the Free Software Foundation; either *
+* version 2.1 of the License, or (at your option) any later version. *
+* *
+* HPMPC is distributed in the hope that it will be useful, *
+* but WITHOUT ANY WARRANTY; without even the implied warranty of *
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. *
+* See the GNU Lesser General Public License for more details. *
+* *
+* You should have received a copy of the GNU Lesser General Public *
+* License along with HPMPC; if not, write to the Free Software *
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA *
+* *
+* Author: Gianluca Frison, giaf (at) dtu.dk *
+* *
+**************************************************************************************************/
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <math.h>
+
+//#include "../include/aux_d.h"
+
+//void dgemm_(char *transa, char *transb, int *m, int *n, int *k, double *alpha, double *A, int *lda, double *B, int *ldb, double *beta, double *C, int *ldc);
+//void dgesv_(int *n, int *nrhs, double *A, int *lda, int *ipiv, double *B, int *ldb, int *info);
+//void dcopy_(int *n, double *dx, int *incx, double *dy, int *incy);
+//void daxpy_(int *n, double *da, double *dx, int *incx, double *dy, int *incy);
+//void dscal_(int *n, double *da, double *dx, int *incx);
+
+int posix_memalign(void **memptr, size_t alignment, size_t size);
+
+
+
+/************************************************
+ matrix-matrix multiplication
+************************************************/
+void dgemm_nn_3l(int m, int n, int k, double *A, int lda , double *B, int ldb, double *C, int ldc)
+ {
+
+ int ii, jj, kk;
+
+ for(jj=0; jj<n; jj++)
+ {
+ for(ii=0; ii<m; ii++)
+ {
+ C[ii+ldc*jj] = 0;
+ for(kk=0; kk<k; kk++)
+ {
+ C[ii+ldc*jj] += A[ii+lda*kk] * B[kk+ldb*jj];
+ }
+ }
+ }
+
+ return;
+
+ }
+
+
+void daxpy_3l(int n, double da, double *dx, double *dy)
+ {
+ int i;
+ for(i=0; i<n; i++)
+ {
+ dy[i] += da*dx[i];
+ }
+ }
+
+
+
+void dscal_3l(int n, double da, double *dx)
+ {
+ int i;
+ for(i=0; i<n; i++)
+ {
+ dx[i] *= da;
+ }
+ }
+
+
+
+/************************************************
+ Routine that copies a matrix
+************************************************/
+void dmcopy(int row, int col, double *A, int lda, double *B, int ldb)
+ {
+ int i, j;
+ for(j=0; j<col; j++)
+ {
+ for(i=0; i<row; i++)
+ {
+ B[i+j*ldb] = A[i+j*lda];
+ }
+ }
+ }
+
+
+
+int idamax_3l(int n, double *x)
+ {
+
+ if(n<=0)
+ return 0;
+ if(n==1)
+ return 0;
+
+ double dabs;
+ double dmax = (x[0]>0 ? x[0] : -x[0]);
+ int idmax = 0;
+ int jj;
+ for(jj=1; jj<n; jj++)
+ {
+ dabs = (x[jj]>0 ? x[jj] : -x[jj]);
+ if(dabs>dmax)
+ {
+ dmax = dabs;
+ idmax = jj;
+ }
+ }
+
+ return idmax;
+
+ }
+
+
+
+void dswap_3l(int n, double *x, int incx, double *y, int incy)
+ {
+
+ if(n<=0)
+ return;
+
+ double temp;
+ int jj;
+ for(jj=0; jj<n; jj++)
+ {
+ temp = x[0];
+ x[0] = y[0];
+ y[0] = temp;
+ x += incx;
+ y += incy;
+ }
+
+ }
+
+
+
+void dger_3l(int m, int n, double alpha, double *x, int incx, double *y, int incy, double *A, int lda)
+ {
+
+ if(m==0 || n==0 || alpha==0.0)
+ return;
+
+ int i, j;
+ double *px, *py, temp;
+
+ py = y;
+ for(j=0; j<n; j++)
+ {
+ temp = alpha * py[0];
+ px = x;
+ for(i=0; i<m; i++)
+ {
+ A[i+lda*j] += px[0] * temp;
+ px += incx;
+ }
+ py += incy;
+ }
+
+ return;
+
+ }
+
+
+
+void dgetf2_3l(int m, int n, double *A, int lda, int *ipiv, int *info)
+ {
+
+ if(m<=0 || n<=0)
+ return;
+
+ int i, j, jp;
+
+ double Ajj;
+
+ int size_min = ( m<n ? m : n );
+
+ for(j=0; j<size_min; j++)
+ // find the pivot and test for singularity
+ {
+ jp = j + idamax_3l(m-j, &A[j+lda*j]);
+ ipiv[j] = jp;
+ if( A[jp+lda*j]!=0)
+ {
+ // apply the interchange to columns 0:n-1
+ if(jp!=j)
+ {
+ dswap_3l(n, &A[j], lda, &A[jp], lda);
+ }
+ // compute elements j+1:m-1 of j-th column
+ if(j<m-1)
+ {
+ Ajj = A[j+lda*j];
+ if( ( Ajj>0 ? Ajj : -Ajj ) >= 2.22e-16 )
+ {
+ dscal_3l(m-j-1, 1.0/Ajj, &A[j+1+lda*j]);
+ }
+ else
+ {
+ for(i=j+1; i<m; i++)
+ {
+ A[i+lda*j] /= Ajj;
+ }
+ }
+ }
+ }
+ else if(*info==0)
+ {
+ *info = j+1;
+ }
+
+ if( j < size_min )
+ {
+ // update trailing submatrix
+ dger_3l(m-j-1, n-j-1, -1.0, &A[j+1+lda*j], 1, &A[j+lda*(j+1)], lda, &A[j+1+lda*(j+1)], lda);
+ }
+
+ }
+
+ return;
+
+ }
+
+
+
+void dlaswp_3l(int n, double *A, int lda, int k1, int k2, int *ipiv)
+ {
+
+ int i, j, k, ix, ix0, i1, i2, n32, ip;
+ double temp;
+
+ ix0 = k1;
+ i1 = k1;
+ i2 = k2;
+
+ n32 = (n/32)*32;
+ if(n32!=0)
+ {
+ for(j=0; j<n32; j+=32)
+ {
+ ix = ix0;
+ for(i=i1; i<i2; i++)
+ {
+ ip = ipiv[ix];
+ if(ip!=i)
+ {
+ for(k=j; k<j+32; k++)
+ {
+ temp = A[i+lda*k];
+ A[i+lda*k] = A[ip+lda*k];
+ A[ip+lda*k] = temp;
+ }
+ }
+ ix++;
+ }
+ }
+ }
+ if(n32!=n)
+ {
+ ix = ix0;
+ for(i=i1; i<i2; i++)
+ {
+ ip = ipiv[ix];
+ if(ip!=i)
+ {
+ for(k=n32; k<n; k++)
+ {
+ temp = A[i+lda*k];
+ A[i+lda*k] = A[ip+lda*k];
+ A[ip+lda*k] = temp;
+ }
+ }
+ ix++;
+ }
+ }
+
+ return;
+
+ }
+
+
+
+// left lower no-transp unit
+void dtrsm_l_l_n_u_3l(int m, int n, double *A, int lda, double *B, int ldb)
+ {
+
+ if(m==0 || n==0)
+ return;
+
+ int i, j, k;
+
+ for(j=0; j<n; j++)
+ {
+ for(k=0; k<m; k++)
+ {
+ for(i=k+1; i<m; i++)
+ {
+ B[i+ldb*j] -= B[k+ldb*j] * A[i+lda*k];
+ }
+ }
+ }
+
+ return;
+
+ }
+
+
+
+// left upper no-transp non-unit
+void dtrsm_l_u_n_n_3l(int m, int n, double *A, int lda, double *B, int ldb)
+ {
+
+ if(m==0 || n==0)
+ return;
+
+ int i, j, k;
+
+ for(j=0; j<n; j++)
+ {
+ for(k=m-1; k>=0; k--)
+ {
+ B[k+ldb*j] /= A[k+lda*k];
+ for(i=0; i<k; i++)
+ {
+ B[i+ldb*j] -= B[k+ldb*j] * A[i+lda*k];
+ }
+ }
+ }
+
+ return;
+
+ }
+
+
+
+void dgetrs_3l(int n, int nrhs, double *A, int lda, int *ipiv, double *B, int ldb, int *info)
+ {
+
+ if(n==0 || nrhs==0)
+ return;
+
+ // solve A * X = B
+
+ // apply row interchanges to the rhs
+ dlaswp_3l(nrhs, B, ldb, 0, n, ipiv);
+
+ // solve L*X = B, overwriting B with X
+ dtrsm_l_l_n_u_3l(n, nrhs, A, lda, B, ldb);
+
+ // solve U*X = B, overwriting B with X
+ dtrsm_l_u_n_n_3l(n, nrhs, A, lda, B, ldb);
+
+ return;
+
+ }
+
+
+
+void dgesv_3l(int n, int nrhs, double *A, int lda, int *ipiv, double *B, int ldb, int *info)
+ {
+
+ // compute the LU factorization of A
+ dgetf2_3l(n, n, A, lda, ipiv, info);
+
+ if(*info==0)
+ {
+ // solve the system A*X = B, overwriting B with X
+ dgetrs_3l(n, nrhs, A, lda, ipiv, B, ldb, info);
+ }
+
+ return;
+
+ }
+
+
+
+/* one norm of a matrix */
+double onenorm(int row, int col, double *ptrA)
+ {
+ double max, temp;
+ int i, j;
+ temp = 0;
+ for(j=0; j<col; j++)
+ {
+ temp = abs(*(ptrA+j*row));
+ for(i=1; i<row; i++)
+ {
+ temp += abs(*(ptrA+j*row+i));
+ }
+ if(j==0) max = temp;
+ else if(max>temp) temp = max;
+ }
+ return temp;
+ }
+
+
+
+/* computes the Pade approximation of degree m of the matrix A */
+void padeapprox(int m, int row, double *A)
+ {
+ int ii;
+ int row2 = row*row;
+/* int i1 = 1;*/
+/* double d0 = 0;*/
+/* double d1 = 1;*/
+/* double dm1 = -1;*/
+
+ double *U = (double *) malloc(row*row*sizeof(double)); // d_zeros(&U, row, row);
+ double *V = (double *) malloc(row*row*sizeof(double)); // d_zeros(&V, row, row);
+
+ if(m==3)
+ {
+ double c[] = {120, 60, 12, 1};
+ double *A0 = (double *) malloc(row*row*sizeof(double)); // d_eye(&A0, row);
+ for(ii=0; ii<row*row; ii++)
+ A0[ii] = 0.0;
+ for(ii=0; ii<row; ii++)
+ A0[ii*(row+1)] = 1.0;
+ double *A2 = (double *) malloc(row*row*sizeof(double)); // d_zeros(&A2, row, row);
+ double *temp = malloc(row*row*sizeof(double)); // d_zeros(&temp, row, row);
+// char ta = 'n'; double alpha = 1; double beta = 0;
+// dgemm_(&ta, &ta, &row, &row, &row, &alpha, A, &row, A, &row, &beta, A2, &row);
+ dgemm_nn_3l(row, row, row, A, row, A, row, A2, row);
+// dscal_(&row2, &d0, temp, &i1);
+ dscal_3l(row2, 0, temp);
+// daxpy_(&row2, &c[3], A2, &i1, temp, &i1);
+ daxpy_3l(row2, c[3], A2, temp);
+// daxpy_(&row2, &c[1], A0, &i1, temp, &i1);
+ daxpy_3l(row2, c[1], A0, temp);
+// dgemm_(&ta, &ta, &row, &row, &row, &alpha, A, &row, temp, &row, &beta, U, &row);
+ dgemm_nn_3l(row, row, row, A, row, temp, row, U, row);
+// dscal_(&row2, &d0, V, &i1);
+ dscal_3l(row2, 0, V);
+// daxpy_(&row2, &c[2], A2, &i1, V, &i1);
+ daxpy_3l(row2, c[2], A2, V);
+// daxpy_(&row2, &c[0], A0, &i1, V, &i1);
+ daxpy_3l(row2, c[0], A0, V);
+ free(A0);
+ free(A2);
+ free(temp);
+ }
+ else if(m==5)
+ {
+ double c[] = {30240, 15120, 3360, 420, 30, 1};
+ double *A0 = (double *) malloc(row*row*sizeof(double)); // d_eye(&A0, row);
+ for(ii=0; ii<row*row; ii++)
+ A0[ii] = 0.0;
+ for(ii=0; ii<row; ii++)
+ A0[ii*(row+1)] = 1.0;
+ double *A2 = (double *) malloc(row*row*sizeof(double)); // d_zeros(&A2, row, row);
+ double *A4 = (double *) malloc(row*row*sizeof(double)); // d_zeros(&A2, row, row);
+ double *temp = malloc(row*row*sizeof(double)); // d_zeros(&temp, row, row);
+// char ta = 'n'; double alpha = 1; double beta = 0;
+// dgemm_(&ta, &ta, &row, &row, &row, &alpha, A, &row, A, &row, &beta, A2, &row);
+ dgemm_nn_3l(row, row, row, A, row, A, row, A2, row);
+// dgemm_(&ta, &ta, &row, &row, &row, &alpha, A2, &row, A2, &row, &beta, A4, &row);
+ dgemm_nn_3l(row, row, row, A2, row, A2, row, A4, row);
+ dmcopy(row, row, A4, row, V, row);
+ dmcopy(row, row, A4, row, temp, row);
+// daxpy_(&row2, &c[3], A2, &i1, temp, &i1);
+ daxpy_3l(row2, c[3], A2, temp);
+// daxpy_(&row2, &c[1], A0, &i1, temp, &i1);
+ daxpy_3l(row2, c[1], A0, temp);
+// dgemm_(&ta, &ta, &row, &row, &row, &alpha, A, &row, temp, &row, &beta, U, &row);
+ dgemm_nn_3l(row, row, row, A, row, temp, row, U, row);
+// dscal_(&row2, &c[4], V, &i1);
+ dscal_3l(row2, c[4], V);
+// daxpy_(&row2, &c[2], A2, &i1, V, &i1);
+ daxpy_3l(row2, c[2], A2, V);
+// daxpy_(&row2, &c[0], A0, &i1, V, &i1);
+ daxpy_3l(row2, c[0], A0, V);
+ free(A0);
+ free(A2);
+ free(A4);
+ free(temp);
+ }
+ else if(m==7)
+ {
+ double c[] = {17297280, 8648640, 1995840, 277200, 25200, 1512, 56, 1};
+ double *A0 = (double *) malloc(row*row*sizeof(double)); // d_eye(&A0, row);
+ for(ii=0; ii<row*row; ii++)
+ A0[ii] = 0.0;
+ for(ii=0; ii<row; ii++)
+ A0[ii*(row+1)] = 1.0;
+ double *A2 = (double *) malloc(row*row*sizeof(double)); // d_zeros(&A2, row, row);
+ double *A4 = (double *) malloc(row*row*sizeof(double)); // d_zeros(&A2, row, row);
+ double *A6 = (double *) malloc(row*row*sizeof(double)); // d_zeros(&A2, row, row);
+ double *temp = malloc(row*row*sizeof(double)); // d_zeros(&temp, row, row);
+// char ta = 'n'; double alpha = 1; double beta = 1;
+// dgemm_(&ta, &ta, &row, &row, &row, &alpha, A, &row, A, &row, &beta, A2, &row);
+ dgemm_nn_3l(row, row, row, A, row, A, row, A2, row);
+// dgemm_(&ta, &ta, &row, &row, &row, &alpha, A2, &row, A2, &row, &beta, A4, &row);
+ dgemm_nn_3l(row, row, row, A2, row, A2, row, A4, row);
+// dgemm_(&ta, &ta, &row, &row, &row, &alpha, A4, &row, A2, &row, &beta, A6, &row);
+ dgemm_nn_3l(row, row, row, A4, row, A2, row, A6, row);
+// dscal_(&row2, &d0, temp, &i1);
+ dscal_3l(row2, 0, temp);
+// daxpy_(&row2, &c[3], A2, &i1, temp, &i1);
+ daxpy_3l(row2, c[3], A2, temp);
+// daxpy_(&row2, &c[1], A0, &i1, temp, &i1);
+ daxpy_3l(row2, c[1], A0, temp);
+// daxpy_(&row2, &c[5], A4, &i1, temp, &i1);
+ daxpy_3l(row2, c[5], A4, temp);
+// daxpy_(&row2, &c[7], A6, &i1, temp, &i1);
+ daxpy_3l(row2, c[7], A6, temp);
+// dgemm_(&ta, &ta, &row, &row, &row, &alpha, A, &row, temp, &row, &beta, U, &row);
+ dgemm_nn_3l(row, row, row, A, row, temp, row, U, row);
+// dscal_(&row2, &d0, V, &i1);
+ dscal_3l(row2, 0, V);
+// daxpy_(&row2, &c[2], A2, &i1, V, &i1);
+ daxpy_3l(row2, c[2], A2, V);
+// daxpy_(&row2, &c[0], A0, &i1, V, &i1);
+ daxpy_3l(row2, c[0], A0, V);
+// daxpy_(&row2, &c[4], A4, &i1, V, &i1);
+ daxpy_3l(row2, c[4], A4, V);
+// daxpy_(&row2, &c[6], A6, &i1, V, &i1);
+ daxpy_3l(row2, c[6], A6, V);
+ free(A0);
+ free(A2);
+ free(A4);
+ free(A6);
+ free(temp);
+ }
+ else if(m==9)
+ {
+ double c[] = {17643225600, 8821612800, 2075673600, 302702400, 30270240, 2162160, 110880, 3960, 90, 1};
+ double *A0 = (double *) malloc(row*row*sizeof(double)); // d_eye(&A0, row);
+ for(ii=0; ii<row*row; ii++)
+ A0[ii] = 0.0;
+ for(ii=0; ii<row; ii++)
+ A0[ii*(row+1)] = 1.0;
+ double *A2 = (double *) malloc(row*row*sizeof(double)); // d_zeros(&A2, row, row);
+ double *A4 = (double *) malloc(row*row*sizeof(double)); // d_zeros(&A2, row, row);
+ double *A6 = (double *) malloc(row*row*sizeof(double)); // d_zeros(&A2, row, row);
+ double *A8 = (double *) malloc(row*row*sizeof(double)); // d_zeros(&A2, row, row);
+ double *temp = malloc(row*row*sizeof(double)); // d_zeros(&temp, row, row);
+// char ta = 'n'; double alpha = 1; double beta = 0;
+// dgemm_(&ta, &ta, &row, &row, &row, &alpha, A, &row, A, &row, &beta, A2, &row);
+ dgemm_nn_3l(row, row, row, A, row, A, row, A2, row);
+// dgemm_(&ta, &ta, &row, &row, &row, &alpha, A2, &row, A2, &row, &beta, A4, &row);
+ dgemm_nn_3l(row, row, row, A2, row, A2, row, A4, row);
+// dgemm_(&ta, &ta, &row, &row, &row, &alpha, A4, &row, A2, &row, &beta, A6, &row);
+ dgemm_nn_3l(row, row, row, A4, row, A2, row, A6, row);
+// dgemm_(&ta, &ta, &row, &row, &row, &alpha, A6, &row, A2, &row, &beta, A8, &row);
+ dgemm_nn_3l(row, row, row, A6, row, A2, row, A8, row);
+ dmcopy(row, row, A8, row, V, row);
+ dmcopy(row, row, A8, row, temp, row);
+// daxpy_(&row2, &c[3], A2, &i1, temp, &i1);
+ daxpy_3l(row2, c[3], A2, temp);
+// daxpy_(&row2, &c[1], A0, &i1, temp, &i1);
+ daxpy_3l(row2, c[1], A0, temp);
+// daxpy_(&row2, &c[5], A4, &i1, temp, &i1);
+ daxpy_3l(row2, c[5], A4, temp);
+// daxpy_(&row2, &c[7], A6, &i1, temp, &i1);
+ daxpy_3l(row2, c[7], A6, temp);
+// dgemm_(&ta, &ta, &row, &row, &row, &alpha, A, &row, temp, &row, &beta, U, &row);
+ dgemm_nn_3l(row, row, row, A, row, temp, row, U, row);
+// dscal_(&row2, &c[8], V, &i1);
+ dscal_3l(row2, c[8], V);
+// daxpy_(&row2, &c[2], A2, &i1, V, &i1);
+ daxpy_3l(row2, c[2], A2, V);
+// daxpy_(&row2, &c[0], A0, &i1, V, &i1);
+ daxpy_3l(row2, c[0], A0, V);
+// daxpy_(&row2, &c[4], A4, &i1, V, &i1);
+ daxpy_3l(row2, c[4], A4, V);
+// daxpy_(&row2, &c[6], A6, &i1, V, &i1);
+ daxpy_3l(row2, c[6], A6, V);
+ free(A0);
+ free(A2);
+ free(A4);
+ free(A6);
+ free(A8);
+ free(temp);
+ }
+ else if(m==13) // tested
+ {
+ double c[] = {64764752532480000, 32382376266240000, 7771770303897600, 1187353796428800, 129060195264000, 10559470521600, 670442572800, 33522128640, 1323241920, 40840800, 960960, 16380, 182, 1};
+ double *A0 = (double *) malloc(row*row*sizeof(double)); // d_eye(&A0, row);
+ for(ii=0; ii<row*row; ii++)
+ A0[ii] = 0.0;
+ for(ii=0; ii<row; ii++)
+ A0[ii*(row+1)] = 1.0;
+ double *A2 = (double *) malloc(row*row*sizeof(double)); // d_zeros(&A2, row, row);
+ double *A4 = (double *) malloc(row*row*sizeof(double)); // d_zeros(&A2, row, row);
+ double *A6 = (double *) malloc(row*row*sizeof(double)); // d_zeros(&A2, row, row);
+ double *temp = malloc(row*row*sizeof(double)); // d_zeros(&temp, row, row);
+// char ta = 'n'; double alpha = 1; double beta = 0;
+// dgemm_(&ta, &ta, &row, &row, &row, &alpha, A, &row, A, &row, &beta, A2, &row);
+ dgemm_nn_3l(row, row, row, A, row, A, row, A2, row);
+// dgemm_(&ta, &ta, &row, &row, &row, &alpha, A2, &row, A2, &row, &beta, A4, &row);
+ dgemm_nn_3l(row, row, row, A2, row, A2, row, A4, row);
+// dgemm_(&ta, &ta, &row, &row, &row, &alpha, A4, &row, A2, &row, &beta, A6, &row);
+ dgemm_nn_3l(row, row, row, A4, row, A2, row, A6, row);
+ dmcopy(row, row, A2, row, U, row);
+// dscal_(&row2, &c[9], U, &i1);
+ dscal_3l(row2, c[9], U);
+// daxpy_(&row2, &c[11], A4, &i1, U, &i1);
+ daxpy_3l(row2, c[11], A4, U);
+// daxpy_(&row2, &c[13], A6, &i1, U, &i1);
+ daxpy_3l(row2, c[13], A6, U);
+// dgemm_(&ta, &ta, &row, &row, &row, &alpha, A6, &row, U, &row, &beta, temp, &row);
+ dgemm_nn_3l(row, row, row, A6, row, U, row, temp, row);
+// daxpy_(&row2, &c[7], A6, &i1, temp, &i1);
+ daxpy_3l(row2, c[7], A6, temp);
+// daxpy_(&row2, &c[5], A4, &i1, temp, &i1);
+ daxpy_3l(row2, c[5], A4, temp);
+// daxpy_(&row2, &c[3], A2, &i1, temp, &i1);
+ daxpy_3l(row2, c[3], A2, temp);
+// daxpy_(&row2, &c[1], A0, &i1, temp, &i1);
+ daxpy_3l(row2, c[1], A0, temp);
+// dgemm_(&ta, &ta, &row, &row, &row, &alpha, A, &row, temp, &row, &beta, U, &row);
+ dgemm_nn_3l(row, row, row, A, row, temp, row, U, row);
+ dmcopy(row, row, A2, row, temp, row);
+// dscal_(&row2, &c[8], V, &i1);
+ dscal_3l(row2, c[8], V);
+// daxpy_(&row2, &c[12], A6, &i1, temp, &i1);
+ daxpy_3l(row2, c[12], A6, temp);
+// daxpy_(&row2, &c[10], A4, &i1, temp, &i1);
+ daxpy_3l(row2, c[10], A4, temp);
+// dgemm_(&ta, &ta, &row, &row, &row, &alpha, A6, &row, temp, &row, &beta, V, &row);
+ dgemm_nn_3l(row, row, row, A6, row, temp, row, V, row);
+// daxpy_(&row2, &c[6], A6, &i1, V, &i1);
+ daxpy_3l(row2, c[6], A6, V);
+// daxpy_(&row2, &c[4], A4, &i1, V, &i1);
+ daxpy_3l(row2, c[4], A4, V);
+// daxpy_(&row2, &c[2], A2, &i1, V, &i1);
+ daxpy_3l(row2, c[2], A2, V);
+// daxpy_(&row2, &c[0], A0, &i1, V, &i1);
+ daxpy_3l(row2, c[0], A0, V);
+ free(A0);
+ free(A2);
+ free(A4);
+ free(A6);
+ free(temp);
+ }
+ else
+ {
+ printf("%s\n", "Wrong Pade approximatin degree");
+ exit(1);
+ }
+ double *D = (double *) malloc(row*row*sizeof(double)); // d_zeros(&A2, row, row);
+// dcopy_(&row2, V, &i1, A, &i1);
+ dmcopy(row, row, V, row, A, row);
+// daxpy_(&row2, &d1, U, &i1, A, &i1);
+ daxpy_3l(row2, 1.0, U, A);
+// dcopy_(&row2, V, &i1, D, &i1);
+ dmcopy(row, row, V, row, D, row);
+// daxpy_(&row2, &dm1, U, &i1, D, &i1);
+ daxpy_3l(row2, -1.0, U, D);
+ int *ipiv = (int *) malloc(row*sizeof(int));
+ int info = 0;
+// dgesv_(&row, &row, D, &row, ipiv, A, &row, &info);
+ dgesv_3l(row, row, D, row, ipiv, A, row, &info);
+ free(ipiv);
+ free(D);
+ free(U);
+ free(V);
+ }
+
+
+
+void expm(int row, double *A)
+ {
+
+ int i;
+
+ int m_vals[] = {3, 5, 7, 9, 13};
+ double theta[] = {0.01495585217958292, 0.2539398330063230, 0.9504178996162932, 2.097847961257068, 5.371920351148152};
+ int lentheta = 5;
+
+ double normA = onenorm(row, row, A);
+
+ if(normA<=theta[4])
+ {
+ for(i=0; i<lentheta; i++)
+ {
+ if(normA<=theta[i])
+ {
+ padeapprox(m_vals[i], row, A);
+ break;
+ }
+ }
+ }
+ else
+ {
+ int s;
+ double t = frexp(normA/(theta[4]), &s);
+ s = s - (t==0.5);
+ t = pow(2,-s);
+ int row2 = row*row;
+/* int i1 = 1;*/
+// dscal_(&row2, &t, A, &i1);
+ dscal_3l(row2, t, A);
+ padeapprox(m_vals[4], row, A);
+ double *temp = (double *) malloc(row*row*sizeof(double)); // d_zeros(&A2, row, row);
+// char ta = 'n'; double alpha = 1; double beta = 0;
+ for(i=0; i<s; i++)
+ {
+// dgemm_(&ta, &ta, &row, &row, &row, &alpha, A, &row, A, &row, &beta, temp, &row);
+ dgemm_nn_3l(row, row, row, A, row, A, row, temp, row);
+ dmcopy(row, row, temp, row, A, row);
+ }
+ free(temp);
+ }
+ }
+
+
diff --git a/examples/tools.h b/examples/tools.h
new file mode 100644
index 0000000..b017301
--- /dev/null
+++ b/examples/tools.h
@@ -0,0 +1,37 @@
+/**************************************************************************************************
+* *
+* This file is part of HPMPC. *
+* *
+* HPMPC -- Library for High-Performance implementation of solvers for MPC. *
+* Copyright (C) 2014-2015 by Technical University of Denmark. All rights reserved. *
+* *
+* HPMPC is free software; you can redistribute it and/or *
+* modify it under the terms of the GNU Lesser General Public *
+* License as published by the Free Software Foundation; either *
+* version 2.1 of the License, or (at your option) any later version. *
+* *
+* HPMPC is distributed in the hope that it will be useful, *
+* but WITHOUT ANY WARRANTY; without even the implied warranty of *
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. *
+* See the GNU Lesser General Public License for more details. *
+* *
+* You should have received a copy of the GNU Lesser General Public *
+* License along with HPMPC; if not, write to the Free Software *
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA *
+* *
+* Author: Gianluca Frison, giaf (at) dtu.dk *
+* *
+**************************************************************************************************/
+
+void dgemm_nn_3l(int m, int n, int k, double *A, int lda , double *B, int ldb, double *C, int ldc);
+void daxpy_3l(int n, double da, double *dx, double *dy);
+void dscal_3l(int n, double da, double *dx);
+
+/* copies a matrix into another matrix */
+void dmcopy(int row, int col, double *ptrA, int lda, double *ptrB, int ldb);
+
+/* solution of a system of linear equations */
+void dgesv_3l(int n, int nrhs, double *A, int lda, int *ipiv, double *B, int ldb, int *info);
+
+/* matrix exponential */
+void expm(int row, double *A);