Squashed 'third_party/blasfeo/' content from commit 2a828ca

Change-Id: If1c3caa4799b2d4eb287ef83fa17043587ef07a3
git-subtree-dir: third_party/blasfeo
git-subtree-split: 2a828ca5442108c4c58e4b42b061a0469043f6ea
diff --git a/test_problems/CMakeLists.txt b/test_problems/CMakeLists.txt
new file mode 100644
index 0000000..77becb1
--- /dev/null
+++ b/test_problems/CMakeLists.txt
@@ -0,0 +1,32 @@
+###################################################################################################
+#                                                                                                 #
+# This file is part of HPIPM.                                                                     #
+#                                                                                                 #
+# HPIPM -- High Performance Interior Point Method.                                                #
+# Copyright (C) 2017 by Gianluca Frison.                                                          #
+# Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl.              #
+# All rights reserved.                                                                            #
+#                                                                                                 #
+# HPMPC is free software; you can redistribute it and/or                                          #
+# modify it under the terms of the GNU Lesser General Public                                      #
+# License as published by the Free Software Foundation; either                                    #
+# version 2.1 of the License, or (at your option) any later version.                              #
+#                                                                                                 #
+# HPMPC is distributed in the hope that it will be useful,                                        #
+# but WITHOUT ANY WARRANTY; without even the implied warranty of                                  #
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.                                            #
+# See the GNU Lesser General Public License for more details.                                     #
+#                                                                                                 #
+# You should have received a copy of the GNU Lesser General Public                                #
+# License along with HPMPC; if not, write to the Free Software                                    #
+# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA                  #
+#                                                                                                 #
+# Author: Gianluca Frison, gianluca.frison (at) imtek.uni-freiburg.de                             #
+#                                                                                                 #
+###################################################################################################
+
+add_executable(d_blas test_blas_d.c)
+target_link_libraries(d_blas blasfeo m)
+
+add_executable(s_blas test_blas_s.c)
+target_link_libraries(s_blas blasfeo m)
diff --git a/test_problems/Makefile b/test_problems/Makefile
new file mode 100644
index 0000000..f2e4741
--- /dev/null
+++ b/test_problems/Makefile
@@ -0,0 +1,67 @@
+###################################################################################################
+#                                                                                                 #
+# This file is part of BLASFEO.                                                                   #
+#                                                                                                 #
+# BLASFEO -- BLAS For Embedded Optimization.                                                      #
+# Copyright (C) 2016-2017 by Gianluca Frison.                                                     #
+# Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl.              #
+# All rights reserved.                                                                            #
+#                                                                                                 #
+# HPMPC is free software; you can redistribute it and/or                                          #
+# modify it under the terms of the GNU Lesser General Public                                      #
+# License as published by the Free Software Foundation; either                                    #
+# version 2.1 of the License, or (at your option) any later version.                              #
+#                                                                                                 #
+# HPMPC is distributed in the hope that it will be useful,                                        #
+# but WITHOUT ANY WARRANTY; without even the implied warranty of                                  #
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.                                            #
+# See the GNU Lesser General Public License for more details.                                     #
+#                                                                                                 #
+# You should have received a copy of the GNU Lesser General Public                                #
+# License along with HPMPC; if not, write to the Free Software                                    #
+# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA                  #
+#                                                                                                 #
+# Author: Gianluca Frison, giaf (at) dtu.dk                                                       #
+#                          gianluca.frison (at) imtek.uni-freiburg.de                             #
+#                                                                                                 #
+###################################################################################################
+
+include ../Makefile.rule
+
+ifeq ($(REF_BLAS), 0)
+LIBS = -lm 
+endif
+ifeq ($(REF_BLAS), OPENBLAS)
+LIBS = /opt/openblas/lib/libopenblas.a -pthread -lgfortran -lm
+endif
+ifeq ($(REF_BLAS), BLIS)
+LIBS = /opt/netlib/liblapack.a /opt/blis/lib/libblis.a -lgfortran -lm -fopenmp
+endif
+ifeq ($(REF_BLAS), NETLIB)
+LIBS = /opt/netlib/liblapack.a /opt/netlib/libblas.a -lgfortran -lm
+endif
+ifeq ($(REF_BLAS), MKL)
+LIBS = -Wl,--start-group /opt/intel/mkl/lib/intel64/libmkl_gf_lp64.a /opt/intel/mkl/lib/intel64/libmkl_core.a /opt/intel/mkl/lib/intel64/libmkl_sequential.a -Wl,--end-group -ldl -lpthread -lm
+endif
+ifeq ($(REF_BLAS), ATLAS)
+LIBS = /opt/atlas/lib/liblapack.a /opt/atlas/lib/libcblas.a /opt/atlas/lib/libf77blas.a /opt/atlas/lib/libatlas.a -lgfortran -lm
+endif
+
+#ifneq ($(NUM_THREAD), 1)
+#LIBS += -pthread 
+#endif
+
+OBJS_TEST = test_blas_d.o
+#OBJS_TEST = test_blas_s.o
+#OBJS_TEST = test_d_strmat.o
+#OBJS_TEST = test_s_strmat.o
+#OBJS_TEST = kernel_assembly.o test_assembly.o
+
+obj: $(OBJS_TEST)
+	$(CC) -o test.out $(OBJS_TEST) -L. libblasfeo.a $(LIBS) #-pg
+
+clean:
+	rm -f *.o
+	rm -f test.out
+	rm -f libblasfeo.a
+
diff --git a/test_problems/cpu_freq.h b/test_problems/cpu_freq.h
new file mode 100644
index 0000000..30320fc
--- /dev/null
+++ b/test_problems/cpu_freq.h
@@ -0,0 +1,31 @@
+/**************************************************************************************************
+*                                                                                                 *
+* This file is part of BLASFEO.                                                                   *
+*                                                                                                 *
+* BLASFEO -- BLAS For Embedded Optimization.                                                      *
+* Copyright (C) 2016-2017 by Gianluca Frison.                                                     *
+* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl.              *
+* All rights reserved.                                                                            *
+*                                                                                                 *
+* HPMPC is free software; you can redistribute it and/or                                          *
+* modify it under the terms of the GNU Lesser General Public                                      *
+* License as published by the Free Software Foundation; either                                    *
+* version 2.1 of the License, or (at your option) any later version.                              *
+*                                                                                                 *
+* HPMPC is distributed in the hope that it will be useful,                                        *
+* but WITHOUT ANY WARRANTY; without even the implied warranty of                                  *
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.                                            *
+* See the GNU Lesser General Public License for more details.                                     *
+*                                                                                                 *
+* You should have received a copy of the GNU Lesser General Public                                *
+* License along with HPMPC; if not, write to the Free Software                                    *
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA                  *
+*                                                                                                 *
+* Author: Gianluca Frison, giaf (at) dtu.dk                                                       *
+*                          gianluca.frison (at) imtek.uni-freiburg.de                             *
+*                                                                                                 *
+**************************************************************************************************/
+
+#ifndef GHZ_MAX
+#define GHZ_MAX 3.6
+#endif
diff --git a/test_problems/kernel_assembly.S b/test_problems/kernel_assembly.S
new file mode 100644
index 0000000..b393e0d
--- /dev/null
+++ b/test_problems/kernel_assembly.S
@@ -0,0 +1,633 @@
+#if defined(OS_LINUX) | defined(OS_MAC)
+
+//#define STACKSIZE 96
+#define STACKSIZE 64
+#define ARG1  %rdi
+#define ARG2  %rsi
+#define ARG3  %rdx
+#define ARG4  %rcx
+#define ARG5  %r8
+#define ARG6  %r9
+#define ARG7  STACKSIZE +  8(%rsp)
+#define ARG8  STACKSIZE + 16(%rsp)
+#define ARG9  STACKSIZE + 24(%rsp)
+#define ARG10 STACKSIZE + 32(%rsp)
+#define ARG11 STACKSIZE + 40(%rsp)
+#define ARG12 STACKSIZE + 48(%rsp)
+#define ARG13 STACKSIZE + 56(%rsp)
+#define ARG14 STACKSIZE + 64(%rsp)
+#define ARG15 STACKSIZE + 72(%rsp)
+#define ARG16 STACKSIZE + 80(%rsp)
+#define ARG17 STACKSIZE + 88(%rsp)
+#define ARG18 STACKSIZE + 96(%rsp)
+
+#elif defined(OS_WINDOWS)
+
+#define STACKSIZE 256
+#define ARG1  %rcx
+#define ARG2  %rdx
+#define ARG3  %r8
+#define ARG4  %r9
+#define ARG5  STACKSIZE + 40(%rsp)
+#define ARG6  STACKSIZE + 48(%rsp)
+#define ARG7  STACKSIZE + 56(%rsp)
+#define ARG8  STACKSIZE + 64(%rsp)
+#define ARG9  STACKSIZE + 72(%rsp)
+#define ARG10 STACKSIZE + 80(%rsp)
+#define ARG11 STACKSIZE + 88(%rsp)
+#define ARG12 STACKSIZE + 96(%rsp)
+#define ARG13 STACKSIZE + 104(%rsp)
+#define ARG14 STACKSIZE + 112(%rsp)
+#define ARG15 STACKSIZE + 120(%rsp)
+#define ARG16 STACKSIZE + 128(%rsp)
+#define ARG17 STACKSIZE + 136(%rsp)
+#define ARG18 STACKSIZE + 144(%rsp)
+
+#else
+
+#error wrong OS
+
+#endif
+
+
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	.text
+#elif defined(OS_MAC)
+	.section	__TEXT,__text,regular,pure_instructions
+#endif
+
+
+
+// common inner routine with file scope
+//
+// input arguments:
+// r10d   <- k
+// r11   <- A
+// r12   <- B
+// ymm0  <- [d00 d11 d22 d33]
+// ymm1  <- [d01 d10 d23 d32]
+// ymm2  <- [d03 d12 d21 d30]
+// ymm3  <- [d02 d13 d20 d31]
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+//
+// output arguments:
+// r10d  <- 0
+// r11   <- A+4*k*sizeof(double)
+// r12   <- B+4*k*sizeof(double)
+// ymm0  <- [d00 d11 d22 d33]
+// ymm1  <- [d01 d10 d23 d32]
+// ymm2  <- [d03 d12 d21 d30]
+// ymm3  <- [d02 d13 d20 d31]
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=2
+	.macro INNER_KERNEL_DGEMM_ADD_NT_4X4_LIB4
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_kernel_dgemm_add_nt_4x4_lib4, @function
+inner_kernel_dgemm_add_nt_4x4_lib4:
+#elif defined(OS_MAC)
+_inner_kernel_dgemm_add_nt_4x4_lib4:
+#elif defined(OS_WINDOWS)
+	.def inner_kernel_dgemm_add_nt_4x4_lib4; .scl 2; .type 32; .endef
+inner_kernel_dgemm_add_nt_4x4_lib4:
+#endif
+#endif
+	
+	cmpl	$0, %r10d
+	jle		2f // return
+
+	// prefetch
+	vmovapd 0(%r11), %ymm8 // A0[0]
+	vmovapd 0(%r12), %ymm12 // B[0]
+
+	cmpl	$4, %r10d
+	jle		0f // consider clean-up loop
+
+	// main loop
+	.p2align 3
+1: // main loop
+	
+	// unroll 0
+	vmovapd 32(%r12), %ymm13 // B[4]
+	vmulpd	%ymm8, %ymm12, %ymm15
+	vaddpd	%ymm0, %ymm15, %ymm0
+	vshufpd $0x5, %ymm12, %ymm12, %ymm14
+
+	vperm2f128 $0x1, %ymm14, %ymm14, %ymm12
+	vmulpd	%ymm8, %ymm14, %ymm15
+	vaddpd	%ymm1, %ymm15, %ymm1
+	vmovapd 32(%r11), %ymm10 // A0[4]
+
+	vmulpd	%ymm8, %ymm12, %ymm15
+	vaddpd	%ymm3, %ymm15, %ymm3
+	vshufpd $0x5, %ymm12, %ymm12, %ymm14
+
+	subl	$4, %r10d
+	vmulpd	%ymm8, %ymm14, %ymm15
+	vaddpd	%ymm2, %ymm15, %ymm2
+
+	// unroll 1
+	vmovapd 64(%r12), %ymm12 // B[8]
+	vmulpd	%ymm10, %ymm13, %ymm15
+	vaddpd	%ymm0, %ymm15, %ymm0
+	vshufpd $0x5, %ymm13, %ymm13, %ymm14
+
+	vperm2f128 $0x1, %ymm14, %ymm14, %ymm13
+	vmulpd	%ymm10, %ymm14, %ymm15
+	vaddpd	%ymm1, %ymm15, %ymm1
+	vmovapd 64(%r11), %ymm8 // A0[8]
+
+	vmulpd	%ymm10, %ymm13, %ymm15
+	vaddpd	%ymm3, %ymm15, %ymm3
+	vshufpd $0x5, %ymm13, %ymm13, %ymm14
+
+	vmulpd	%ymm10, %ymm14, %ymm15
+	vaddpd	%ymm2, %ymm15, %ymm2
+
+	// unroll 2
+	vmovapd 96(%r12), %ymm13 // B[12]
+	vmulpd	%ymm8, %ymm12, %ymm15
+	vaddpd	%ymm0, %ymm15, %ymm0
+	vshufpd $0x5, %ymm12, %ymm12, %ymm14
+
+	vperm2f128 $0x1, %ymm14, %ymm14, %ymm12
+	vmulpd	%ymm8, %ymm14, %ymm15
+	vaddpd	%ymm1, %ymm15, %ymm1
+	vmovapd 96(%r11), %ymm10 // A0[12]
+
+	vmulpd	%ymm8, %ymm12, %ymm15
+	vaddpd	%ymm3, %ymm15, %ymm3
+	vshufpd $0x5, %ymm12, %ymm12, %ymm14
+	addq	$128, %r12
+
+	vmulpd	%ymm8, %ymm14, %ymm15
+	vaddpd	%ymm2, %ymm15, %ymm2
+	addq	$128, %r11
+
+
+	// unroll 3
+	vmovapd 0(%r12), %ymm12 // B[0]
+	vmulpd	%ymm10, %ymm13, %ymm15
+	vaddpd	%ymm0, %ymm15, %ymm0
+	vshufpd $0x5, %ymm13, %ymm13, %ymm14
+
+	vperm2f128 $0x1, %ymm14, %ymm14, %ymm13
+	vmulpd	%ymm10, %ymm14, %ymm15
+	vaddpd	%ymm1, %ymm15, %ymm1
+	vmovapd 0(%r11), %ymm8 // A0[0]
+
+	vmulpd	%ymm10, %ymm13, %ymm15
+	vaddpd	%ymm3, %ymm15, %ymm3
+	vshufpd $0x5, %ymm13, %ymm13, %ymm14
+
+	vmulpd	%ymm10, %ymm14, %ymm15
+	vaddpd	%ymm2, %ymm15, %ymm2
+
+	cmpl	$4, %r10d
+	jg		1b // main loop 
+
+
+0: // consider clean4-up
+	
+	cmpl	$3, %r10d
+	jle		4f // clean1
+
+	// unroll 0
+	vmovapd 32(%r12), %ymm13 // B[4]
+	vmulpd	%ymm8, %ymm12, %ymm15
+	vaddpd	%ymm0, %ymm15, %ymm0
+	vshufpd $0x5, %ymm12, %ymm12, %ymm14
+
+	vperm2f128 $0x1, %ymm14, %ymm14, %ymm12
+	vmulpd	%ymm8, %ymm14, %ymm15
+	vaddpd	%ymm1, %ymm15, %ymm1
+	vmovapd 32(%r11), %ymm10 // A0[4]
+
+	vmulpd	%ymm8, %ymm12, %ymm15
+	vaddpd	%ymm3, %ymm15, %ymm3
+	vshufpd $0x5, %ymm12, %ymm12, %ymm14
+
+	subl	$4, %r10d
+	vmulpd	%ymm8, %ymm14, %ymm15
+	vaddpd	%ymm2, %ymm15, %ymm2
+
+	// unroll 1
+	vmovapd 64(%r12), %ymm12 // B[8]
+	vmulpd	%ymm10, %ymm13, %ymm15
+	vaddpd	%ymm0, %ymm15, %ymm0
+	vshufpd $0x5, %ymm13, %ymm13, %ymm14
+
+	vperm2f128 $0x1, %ymm14, %ymm14, %ymm13
+	vmulpd	%ymm10, %ymm14, %ymm15
+	vaddpd	%ymm1, %ymm15, %ymm1
+	vmovapd 64(%r11), %ymm8 // A0[8]
+
+	vmulpd	%ymm10, %ymm13, %ymm15
+	vaddpd	%ymm3, %ymm15, %ymm3
+	vshufpd $0x5, %ymm13, %ymm13, %ymm14
+
+	vmulpd	%ymm10, %ymm14, %ymm15
+	vaddpd	%ymm2, %ymm15, %ymm2
+
+	// unroll 2
+	vmovapd 96(%r12), %ymm13 // B[12]
+	vmulpd	%ymm8, %ymm12, %ymm15
+	vaddpd	%ymm0, %ymm15, %ymm0
+	vshufpd $0x5, %ymm12, %ymm12, %ymm14
+
+	vperm2f128 $0x1, %ymm14, %ymm14, %ymm12
+	vmulpd	%ymm8, %ymm14, %ymm15
+	vaddpd	%ymm1, %ymm15, %ymm1
+	vmovapd 96(%r11), %ymm10 // A0[12]
+
+	vmulpd	%ymm8, %ymm12, %ymm15
+	vaddpd	%ymm3, %ymm15, %ymm3
+	vshufpd $0x5, %ymm12, %ymm12, %ymm14
+	addq	$128, %r12
+
+	vmulpd	%ymm8, %ymm14, %ymm15
+	vaddpd	%ymm2, %ymm15, %ymm2
+	addq	$128, %r11
+
+
+	// unroll 3
+//	vmovapd 0(%r12), %ymm12 // B[0]
+	vmulpd	%ymm10, %ymm13, %ymm15
+	vaddpd	%ymm0, %ymm15, %ymm0
+	vshufpd $0x5, %ymm13, %ymm13, %ymm14
+
+	vperm2f128 $0x1, %ymm14, %ymm14, %ymm13
+	vmulpd	%ymm10, %ymm14, %ymm15
+	vaddpd	%ymm1, %ymm15, %ymm1
+//	vmovapd 0(%r11), %ymm8 // A0[0]
+
+	vmulpd	%ymm10, %ymm13, %ymm15
+	vaddpd	%ymm3, %ymm15, %ymm3
+	vshufpd $0x5, %ymm13, %ymm13, %ymm14
+
+	vmulpd	%ymm10, %ymm14, %ymm15
+	vaddpd	%ymm2, %ymm15, %ymm2
+
+
+//	cmpl	$3, %r10d
+	jmp		2f // return
+
+
+4: // consider clean1-up loop
+
+	cmpl	$0, %r10d
+	jle		2f // return
+
+	// clean-up loop
+3: // clean up loop
+	
+	vmovapd 0(%r12), %ymm12 // B[0]
+	vmovapd 0(%r11), %ymm8 // A0[0]
+	vmulpd	%ymm8, %ymm12, %ymm15
+	vaddpd	%ymm0, %ymm15, %ymm0
+	addq	$32, %r11
+
+	vshufpd $0x5, %ymm12, %ymm12, %ymm14
+	vmulpd	%ymm8, %ymm14, %ymm15
+	vaddpd	%ymm1, %ymm15, %ymm1
+	addq	$32, %r12
+
+	vperm2f128 $0x1, %ymm14, %ymm14, %ymm14
+	vmulpd	%ymm8, %ymm14, %ymm15
+	vaddpd	%ymm3, %ymm15, %ymm3
+	subl	$1, %r10d
+
+	vshufpd $0x5, %ymm14, %ymm14, %ymm14
+	vmulpd	%ymm8, %ymm14, %ymm15
+	vaddpd	%ymm2, %ymm15, %ymm2
+
+	cmpl	$0, %r10d
+	jg		3b // clean up loop 
+
+
+2: // return
+
+#if MACRO_LEVEL>=2
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_kernel_dgemm_add_nt_4x4_lib4, .-inner_kernel_dgemm_add_nt_4x4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// blend for generic alpha and beta
+//
+// input arguments:
+// r10   <- alpha
+// r11   <- beta
+// r12   <- C
+// ymm0 <- [d00 d11 d22 d33]
+// ymm1 <- [d01 d10 d23 d32]
+// ymm2 <- [d03 d12 d21 d30]
+// ymm3 <- [d02 d13 d20 d31]
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10   <- alpha
+// r11   <- beta
+// r12   <- C
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=1
+	.macro INNER_BLEND_SCALE_AB_4X4_LIB4
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_blend_scale_ab_4x4_lib4, @function
+inner_blend_scale_ab_4x4_lib4:
+#elif defined(OS_MAC)
+_inner_blend_scale_ab_4x4_lib4:
+#elif defined(OS_WINDOWS)
+	.def inner_blend_scale_ab_4x4_lib4; .scl 2; .type 32; .endef
+inner_blend_scale_ab_4x4_lib4:
+#endif
+#endif
+	
+	// alpha
+	vbroadcastsd	0(%r10), %ymm15
+
+	vblendpd	$0xa, %ymm1, %ymm0, %ymm8
+	vblendpd	$0x5, %ymm1, %ymm0, %ymm9
+	vblendpd	$0xa, %ymm3, %ymm2, %ymm10
+	vblendpd	$0x5, %ymm3, %ymm2, %ymm11
+
+	vblendpd	$0xc, %ymm10, %ymm8, %ymm0
+	vblendpd	$0x3, %ymm10, %ymm8, %ymm2
+	vblendpd	$0xc, %ymm11, %ymm9, %ymm1
+	vblendpd	$0x3, %ymm11, %ymm9, %ymm3
+
+	vmulpd		%ymm0, %ymm15, %ymm0
+	vmulpd		%ymm1, %ymm15, %ymm1
+	vmulpd		%ymm2, %ymm15, %ymm2
+	vmulpd		%ymm3, %ymm15, %ymm3
+
+	// beta
+	vbroadcastsd	0(%r11), %ymm14
+
+	vxorpd		%ymm15, %ymm15, %ymm15 // 0.0
+
+	vucomisd	%xmm15, %xmm14 // beta==0.0 ?
+	je			0f // end
+
+	vmovupd		0(%r12), %ymm15
+	vmulpd		%ymm15, %ymm14, %ymm15
+	vaddpd		%ymm0, %ymm15, %ymm0
+	vmovupd		32(%r12), %ymm15
+	vmulpd		%ymm15, %ymm14, %ymm15
+	vaddpd		%ymm1, %ymm15, %ymm1
+	vmovupd		64(%r12), %ymm15
+	vmulpd		%ymm15, %ymm14, %ymm15
+	vaddpd		%ymm2, %ymm15, %ymm2
+	vmovupd		96(%r12), %ymm15
+	vmulpd		%ymm15, %ymm14, %ymm15
+	vaddpd		%ymm3, %ymm15, %ymm3
+
+0:
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_blend_scale_ab_4x4_lib4, .-inner_blend_scale_ab_4x4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// store n
+//
+// input arguments:
+// r10  <- D
+// ymm0 <- [d00 d11 d22 d33]
+// ymm1 <- [d01 d10 d23 d32]
+// ymm2 <- [d03 d12 d21 d30]
+// ymm3 <- [d02 d13 d20 d31]
+//
+// output arguments:
+// r10  <- D
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+
+#if MACRO_LEVEL>=1
+	.macro INNER_STORE_4X4_LIB4
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_store_4x4_lib4, @function
+inner_store_4x4_lib4:
+#elif defined(OS_MAC)
+_inner_store_4x4_lib4:
+#elif defined(OS_WINDOWS)
+	.def inner_store_4x4_lib4; .scl 2; .type 32; .endef
+inner_store_4x4_lib4:
+#endif
+#endif
+	
+	vmovupd %ymm0,  0(%r10)
+	vmovupd %ymm1, 32(%r10)
+	vmovupd %ymm2, 64(%r10)
+	vmovupd %ymm3, 96(%r10)
+	
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_store_4x4_lib4, .-inner_store_4x4_lib4
+#endif
+#endif
+
+
+
+
+
+//                               1      2              3          4          5             6          7
+// void kernel_dgemm_nt_4x4_lib4(int k, double *alpha, double *A, double *B, double *beta, double *C, double *D);
+
+	.p2align 4,,15
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	.globl kernel_dgemm_nt_4x4_lib4_test
+#if defined(OS_LINUX)
+	.type kernel_dgemm_nt_4x4_lib4_test, @function
+#else // OS_WINDOWS
+	.def kernel_dgemm_nt_4x4_lib4_test; .scl 2; .type 32; .endef
+#endif
+kernel_dgemm_nt_4x4_lib4_test:
+#elif defined(OS_MAC)
+	.globl _kernel_dgemm_nt_4x4_lib4_test
+_kernel_dgemm_nt_4x4_lib4_test:
+#endif
+
+	// prologue
+
+	subq	$STACKSIZE, %rsp
+	movq	%rbx,   (%rsp)
+	movq	%rbp,  8(%rsp)
+	movq	%r12, 16(%rsp)
+	movq	%r13, 24(%rsp)
+	movq	%r14, 32(%rsp)
+	movq	%r15, 40(%rsp)
+#if defined(OS_WINDOWS)
+	movq	%rdi, 48(%rsp)
+	movq	%rsi, 56(%rsp)
+	vmovups	%xmm6, 64(%rsp)
+	vmovups	%xmm7, 80(%rsp)
+	vmovups	%xmm8, 96(%rsp)
+	vmovups	%xmm9, 112(%rsp)
+	vmovups	%xmm10, 128(%rsp)
+	vmovups	%xmm11, 144(%rsp)
+	vmovups	%xmm12, 160(%rsp)
+	vmovups	%xmm13, 176(%rsp)
+	vmovups	%xmm14, 192(%rsp)
+	vmovups	%xmm15, 208(%rsp)
+#endif
+
+	vzeroupper
+
+
+	// zero accumulation registers
+
+	vxorpd	%ymm0, %ymm0, %ymm0
+	vmovapd	%ymm0, %ymm1
+	vmovapd	%ymm0, %ymm2
+	vmovapd	%ymm0, %ymm3
+
+
+	// call inner dgemm kernel nt
+
+	movq	ARG1, %r10 // k
+	movq	ARG3, %r11 // A
+	movq	ARG4, %r12 // B
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_DGEMM_ADD_NT_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_dgemm_add_nt_4x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_kernel_dgemm_add_nt_4x4_lib4
+#endif
+#endif
+
+
+	// call inner blend scale
+
+	movq	ARG2, %r10 // alpha
+	movq	ARG5, %r11 // beta
+	movq	ARG6, %r12 // C
+
+#if MACRO_LEVEL>=1
+	INNER_BLEND_SCALE_AB_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_blend_scale_ab_4x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_blend_scale_ab_4x4_lib4
+#endif
+#endif
+
+
+	// store n
+
+	movq	ARG7, %r10 // D
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_4x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_store_4x4_lib4
+#endif
+#endif
+
+//	movq	ARG6, %rax
+//	movq	STACKSIZE + 48(%rsp), %rax
+
+
+	// epilogue
+
+	vzeroupper
+
+	movq	  (%rsp), %rbx 
+	movq	 8(%rsp), %rbp
+	movq	16(%rsp), %r12 
+	movq	24(%rsp), %r13 
+	movq	32(%rsp), %r14 
+	movq	40(%rsp), %r15 
+#if defined(OS_WINDOWS)
+	movq	48(%rsp), %rdi
+	movq	56(%rsp), %rsi
+	vmovups	64(%rsp), %xmm6
+	vmovups	80(%rsp), %xmm7
+	vmovups	96(%rsp), %xmm8
+	vmovups	112(%rsp), %xmm9
+	vmovups	128(%rsp), %xmm10
+	vmovups	144(%rsp), %xmm11
+	vmovups	160(%rsp), %xmm12
+	vmovups	176(%rsp), %xmm13
+	vmovups	192(%rsp), %xmm14
+	vmovups	208(%rsp), %xmm15
+#endif
+	addq	$STACKSIZE, %rsp
+
+
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_dgemm_nt_4x4_lib4_test, .-kernel_dgemm_nt_4x4_lib4_test
+#endif
+
+
diff --git a/test_problems/results/dummy.txt b/test_problems/results/dummy.txt
new file mode 100644
index 0000000..e69de29
--- /dev/null
+++ b/test_problems/results/dummy.txt
diff --git a/test_problems/test_assembly.c b/test_problems/test_assembly.c
new file mode 100644
index 0000000..3a07a13
--- /dev/null
+++ b/test_problems/test_assembly.c
@@ -0,0 +1,59 @@
+#include <stdlib.h>
+#include <stdio.h>
+
+#include "../include/blasfeo_common.h"
+#include "../include/blasfeo_i_aux_ext_dep.h"
+#include "../include/blasfeo_d_aux_ext_dep.h"
+#include "../include/blasfeo_v_aux_ext_dep.h"
+#include "../include/blasfeo_d_aux.h"
+#include "../include/blasfeo_d_kernel.h"
+#include "../include/blasfeo_d_blas.h"
+
+int main()
+	{
+
+	printf("\ntest assembly\n");
+
+	int ii;
+
+	int n = 12;
+
+	double *A; d_zeros(&A, n, n);
+	for(ii=0; ii<n*n; ii++) A[ii] = ii;
+	d_print_mat(n, n, A, n);
+
+	double *B; d_zeros(&B, n, n);
+	for(ii=0; ii<n; ii++) B[ii*(n+1)] = 1.0;
+	d_print_mat(n, n, B, n);
+
+	struct d_strmat sA;
+	d_allocate_strmat(n, n, &sA);
+	d_cvt_mat2strmat(n, n, A, n, &sA, 0, 0);
+	d_print_strmat(n, n, &sA, 0, 0);
+
+	struct d_strmat sB;
+	d_allocate_strmat(n, n, &sB);
+	d_cvt_mat2strmat(n, n, B, n, &sB, 0, 0);
+	d_print_strmat(n, n, &sB, 0, 0);
+
+	struct d_strmat sD;
+	d_allocate_strmat(n, n, &sD);
+
+	struct d_strmat sC;
+	d_allocate_strmat(n, n, &sC);
+
+	double alpha = 1.0;
+	double beta = 0.0;
+	int ret = kernel_dgemm_nt_4x4_lib4_test(n, &alpha, sB.pA, sA.pA, &beta, sB.pA, sD.pA);
+	d_print_strmat(n, n, &sD, 0, 0);
+//	printf("\n%ld %ld\n", (long long) n, ret);
+//	printf("\n%ld %ld\n", (long long) &alpha, ret);
+//	printf("\n%ld %ld\n", (long long) sA.pA, ret);
+//	printf("\n%ld %ld\n", (long long) sB.pA, ret);
+//	printf("\n%ld %ld\n", (long long) &beta, ret);
+//	printf("\n%ld %ld\n", (long long) sC.pA, ret);
+//	printf("\n%ld %ld\n", (long long) sD.pA, ret);
+
+	return 0;
+
+	}
diff --git a/test_problems/test_blas_d.c b/test_problems/test_blas_d.c
new file mode 100644
index 0000000..1e71494
--- /dev/null
+++ b/test_problems/test_blas_d.c
@@ -0,0 +1,480 @@
+/**************************************************************************************************
+*                                                                                                 *
+* This file is part of BLASFEO.                                                                   *
+*                                                                                                 *
+* BLASFEO -- BLAS For Embedded Optimization.                                                      *
+* Copyright (C) 2016-2017 by Gianluca Frison.                                                     *
+* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl.              *
+* All rights reserved.                                                                            *
+*                                                                                                 *
+* HPMPC is free software; you can redistribute it and/or                                          *
+* modify it under the terms of the GNU Lesser General Public                                      *
+* License as published by the Free Software Foundation; either                                    *
+* version 2.1 of the License, or (at your option) any later version.                              *
+*                                                                                                 *
+* HPMPC is distributed in the hope that it will be useful,                                        *
+* but WITHOUT ANY WARRANTY; without even the implied warranty of                                  *
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.                                            *
+* See the GNU Lesser General Public License for more details.                                     *
+*                                                                                                 *
+* You should have received a copy of the GNU Lesser General Public                                *
+* License along with HPMPC; if not, write to the Free Software                                    *
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA                  *
+*                                                                                                 *
+* Author: Gianluca Frison, giaf (at) dtu.dk                                                       *
+*                          gianluca.frison (at) imtek.uni-freiburg.de                             *
+*                                                                                                 *
+**************************************************************************************************/
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <sys/time.h>
+
+//#if defined(TARGET_X64_INTEL_SANDY_BRIDGE)
+//#include <xmmintrin.h> // needed to flush to zero sub-normals with _MM_SET_FLUSH_ZERO_MODE (_MM_FLUSH_ZERO_ON); in the main()
+//#endif
+
+#include "../include/blasfeo_common.h"
+#include "../include/blasfeo_d_aux_ext_dep.h"
+#include "../include/blasfeo_d_aux.h"
+#include "../include/blasfeo_i_aux_ext_dep.h"
+#include "../include/blasfeo_v_aux_ext_dep.h"
+#include "../include/blasfeo_d_kernel.h"
+#include "../include/blasfeo_d_blas.h"
+
+#ifndef D_PS
+#define D_PS 1
+#endif
+#ifndef D_NC
+#define D_NC 1
+#endif
+
+
+
+#if defined(REF_BLAS_OPENBLAS)
+void openblas_set_num_threads(int num_threads);
+#endif
+#if defined(REF_BLAS_BLIS)
+void omp_set_num_threads(int num_threads);
+#endif
+#if defined(REF_BLAS_MKL)
+#include "mkl.h"
+#endif
+
+
+
+#include "cpu_freq.h"
+
+
+
+int main()
+	{
+		
+#if defined(REF_BLAS_OPENBLAS)
+	openblas_set_num_threads(1);
+#endif
+#if defined(REF_BLAS_BLIS)
+	omp_set_num_threads(1);
+#endif
+#if defined(REF_BLAS_MKL)
+	mkl_set_num_threads(1);
+#endif
+
+//#if defined(TARGET_X64_INTEL_SANDY_BRIDGE)
+//	_MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_ON); // flush to zero subnormals !!! works only with one thread !!!
+//#endif
+
+	printf("\n");
+	printf("\n");
+	printf("\n");
+
+	printf("BLAS performance test - double precision\n");
+	printf("\n");
+
+	// maximum frequency of the processor
+	const float GHz_max = GHZ_MAX;
+	printf("Frequency used to compute theoretical peak: %5.1f GHz (edit test_param.h to modify this value).\n", GHz_max);
+	printf("\n");
+
+	// maximum flops per cycle, double precision
+#if defined(TARGET_X64_INTEL_HASWELL)
+	const float flops_max = 16;
+	printf("Testing BLAS version for AVX2 and FMA instruction sets, 64 bit (optimized for Intel Haswell): theoretical peak %5.1f Gflops\n", flops_max*GHz_max);
+#elif defined(TARGET_X64_INTEL_SANDY_BRIDGE)
+	const float flops_max = 8;
+	printf("Testing BLAS version for AVX instruction set, 64 bit (optimized for Intel Sandy Bridge): theoretical peak %5.1f Gflops\n", flops_max*GHz_max);
+#elif defined(TARGET_X64_INTEL_CORE)
+	const float flops_max = 4;
+	printf("Testing BLAS version for SSE3 instruction set, 64 bit (optimized for Intel Core): theoretical peak %5.1f Gflops\n", flops_max*GHz_max);
+#elif defined(TARGET_X64_AMD_BULLDOZER)
+	const float flops_max = 8;
+	printf("Testing BLAS version for SSE3 and FMA instruction set, 64 bit (optimized for AMD Bulldozer): theoretical peak %5.1f Gflops\n", flops_max*GHz_max);
+#elif defined(TARGET_ARMV8A_ARM_CORTEX_A57)
+	const float flops_max = 4;
+	printf("Testing BLAS version for NEONv2 instruction set, 64 bit (optimized for ARM Cortex A57): theoretical peak %5.1f Gflops\n", flops_max*GHz_max);
+#elif defined(TARGET_ARMV7A_ARM_CORTEX_A15)
+	const float flops_max = 2;
+	printf("Testing BLAS version for VFPv4 instruction set, 32 bit (optimized for ARM Cortex A15): theoretical peak %5.1f Gflops\n", flops_max*GHz_max);
+#elif defined(TARGET_GENERIC)
+	const float flops_max = 2;
+	printf("Testing BLAS version for generic scalar instruction set: theoretical peak %5.1f Gflops ???\n", flops_max*GHz_max);
+#endif
+	
+//	FILE *f;
+//	f = fopen("./test_problems/results/test_blas.m", "w"); // a
+
+#if defined(TARGET_X64_INTEL_HASWELL)
+//	fprintf(f, "C = 'd_x64_intel_haswell';\n");
+//	fprintf(f, "\n");
+#elif defined(TARGET_X64_INTEL_SANDY_BRIDGE)
+//	fprintf(f, "C = 'd_x64_intel_sandybridge';\n");
+//	fprintf(f, "\n");
+#elif defined(TARGET_X64_INTEL_CORE)
+//	fprintf(f, "C = 'd_x64_intel_core';\n");
+//	fprintf(f, "\n");
+#elif defined(TARGET_X64_AMD_BULLDOZER)
+//	fprintf(f, "C = 'd_x64_amd_bulldozer';\n");
+//	fprintf(f, "\n");
+#elif defined(TARGET_ARMV8A_ARM_CORTEX_A57)
+//	fprintf(f, "C = 'd_armv8a_arm_cortex_a57';\n");
+//	fprintf(f, "\n");
+#elif defined(TARGET_ARMV7A_ARM_CORTEX_A15)
+//	fprintf(f, "C = 'd_armv7a_arm_cortex_a15';\n");
+//	fprintf(f, "\n");
+#elif defined(TARGET_GENERIC)
+//	fprintf(f, "C = 'd_generic';\n");
+//	fprintf(f, "\n");
+#endif
+
+//	fprintf(f, "A = [%f %f];\n", GHz_max, flops_max);
+//	fprintf(f, "\n");
+
+//	fprintf(f, "B = [\n");
+	
+
+
+	int i, j, rep, ll;
+	
+	const int bsd = D_PS;
+	const int ncd = D_NC;
+
+/*	int info = 0;*/
+	
+	printf("\nn\t  dgemm_blasfeo\t  dgemm_blas\n");
+	printf("\nn\t Gflops\t    %%\t Gflops\n\n");
+	
+#if 1
+	int nn[] = {4, 8, 12, 16, 20, 24, 28, 32, 36, 40, 44, 48, 52, 56, 60, 64, 68, 72, 76, 80, 84, 88, 92, 96, 100, 104, 108, 112, 116, 120, 124, 128, 132, 136, 140, 144, 148, 152, 156, 160, 164, 168, 172, 176, 180, 184, 188, 192, 196, 200, 204, 208, 212, 216, 220, 224, 228, 232, 236, 240, 244, 248, 252, 256, 260, 264, 268, 272, 276, 280, 284, 288, 292, 296, 300, 304, 308, 312, 316, 320, 324, 328, 332, 336, 340, 344, 348, 352, 356, 360, 364, 368, 372, 376, 380, 384, 388, 392, 396, 400, 404, 408, 412, 416, 420, 424, 428, 432, 436, 440, 444, 448, 452, 456, 460, 500, 550, 600, 650, 700};
+	int nnrep[] = {10000, 10000, 10000, 10000, 10000, 10000, 10000, 10000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 400, 400, 400, 400, 400, 200, 200, 200, 200, 200, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 20, 20, 20, 20, 20, 20, 20, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 4, 4, 4, 4, 4};
+	
+//	for(ll=0; ll<24; ll++)
+	for(ll=0; ll<75; ll++)
+//	for(ll=0; ll<115; ll++)
+//	for(ll=0; ll<120; ll++)
+
+		{
+
+		int n = nn[ll];
+		int nrep = nnrep[ll];
+//		int n = ll+1;
+//		int nrep = nnrep[0];
+//		n = n<12 ? 12 : n;
+//		n = n<8 ? 8 : n;
+
+#else
+	int nn[] = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24};
+	
+	for(ll=0; ll<24; ll++)
+
+		{
+
+		int n = nn[ll];
+		int nrep = 40000; //nnrep[ll];
+#endif
+
+		double *A; d_zeros(&A, n, n);
+		double *B; d_zeros(&B, n, n);
+		double *C; d_zeros(&C, n, n);
+		double *M; d_zeros(&M, n, n);
+
+		char c_n = 'n';
+		char c_l = 'l';
+		char c_r = 'r';
+		char c_t = 't';
+		char c_u = 'u';
+		int i_1 = 1;
+		int i_t;
+		double d_1 = 1;
+		double d_0 = 0;
+	
+		for(i=0; i<n*n; i++)
+			A[i] = i;
+	
+		for(i=0; i<n; i++)
+			B[i*(n+1)] = 1;
+	
+		for(i=0; i<n*n; i++)
+			M[i] = 1;
+	
+		int n2 = n*n;
+		double *B2; d_zeros(&B2, n, n);
+		for(i=0; i<n*n; i++)
+			B2[i] = 1e-15;
+		for(i=0; i<n; i++)
+			B2[i*(n+1)] = 1;
+
+		int pnd = ((n+bsd-1)/bsd)*bsd;	
+		int cnd = ((n+ncd-1)/ncd)*ncd;	
+		int cnd2 = 2*((n+ncd-1)/ncd)*ncd;	
+
+		double *x; d_zeros_align(&x, pnd, 1);
+		double *y; d_zeros_align(&y, pnd, 1);
+		double *x2; d_zeros_align(&x2, pnd, 1);
+		double *y2; d_zeros_align(&y2, pnd, 1);
+		double *diag; d_zeros_align(&diag, pnd, 1);
+		int *ipiv; int_zeros(&ipiv, n, 1);
+
+		for(i=0; i<pnd; i++) x[i] = 1;
+		for(i=0; i<pnd; i++) x2[i] = 1;
+
+		// matrix struct
+#if 0
+		struct d_strmat sA; d_allocate_strmat(n+4, n+4, &sA);
+		struct d_strmat sB; d_allocate_strmat(n+4, n+4, &sB);
+		struct d_strmat sC; d_allocate_strmat(n+4, n+4, &sC);
+		struct d_strmat sD; d_allocate_strmat(n+4, n+4, &sD);
+		struct d_strmat sE; d_allocate_strmat(n+4, n+4, &sE);
+#else
+		struct d_strmat sA; d_allocate_strmat(n, n, &sA);
+		struct d_strmat sB; d_allocate_strmat(n, n, &sB);
+		struct d_strmat sB2; d_allocate_strmat(n, n, &sB2);
+		struct d_strmat sB3; d_allocate_strmat(n, n, &sB3);
+		struct d_strmat sC; d_allocate_strmat(n, n, &sC);
+		struct d_strmat sD; d_allocate_strmat(n, n, &sD);
+		struct d_strmat sE; d_allocate_strmat(n, n, &sE);
+#endif
+		struct d_strvec sx; d_allocate_strvec(n, &sx);
+		struct d_strvec sy; d_allocate_strvec(n, &sy);
+		struct d_strvec sz; d_allocate_strvec(n, &sz);
+
+		d_cvt_mat2strmat(n, n, A, n, &sA, 0, 0);
+		d_cvt_mat2strmat(n, n, B, n, &sB, 0, 0);
+		d_cvt_mat2strmat(n, n, B2, n, &sB2, 0, 0);
+		d_cvt_vec2strvec(n, x, &sx, 0);
+		int ii;
+		for(ii=0; ii<n; ii++)
+			{
+			DMATEL_LIBSTR(&sB3, ii, ii) = 1.0;
+//			DMATEL_LIBSTR(&sB3, n-1, ii) = 1.0;
+			DMATEL_LIBSTR(&sB3, ii, n-1) = 1.0;
+			DVECEL_LIBSTR(&sx, ii) = 1.0;
+			}
+//		d_print_strmat(n, n, &sB3, 0, 0);
+//		if(n==20) return;
+
+		int qr_work_size = 0;//dgeqrf_work_size_libstr(n, n);
+		void *qr_work;
+		v_zeros_align(&qr_work, qr_work_size);
+
+		int lq_work_size = 0;//dgelqf_work_size_libstr(n, n);
+		void *lq_work;
+		v_zeros_align(&lq_work, lq_work_size);
+
+		// create matrix to pivot all the time
+//		dgemm_nt_libstr(n, n, n, 1.0, &sA, 0, 0, &sA, 0, 0, 1.0, &sB, 0, 0, &sD, 0, 0);
+
+		double *dummy;
+
+		int info;
+
+		/* timing */
+		struct timeval tvm1, tv0, tv1, tv2, tv3, tv4, tv5, tv6, tv7, tv8, tv9, tv10, tv11, tv12, tv13, tv14, tv15, tv16;
+
+		/* warm up */
+		for(rep=0; rep<nrep; rep++)
+			{
+			dgemm_nt_libstr(n, n, n, 1.0, &sA, 0, 0, &sA, 0, 0, 1.0, &sB, 0, 0, &sC, 0, 0);
+			}
+
+		double alpha = 1.0;
+		double beta = 0.0;
+
+		gettimeofday(&tv0, NULL); // stop
+
+		for(rep=0; rep<nrep; rep++)
+			{
+
+//			dgemm_nt_lib(n, n, n, 1.0, pA, cnd, pB, cnd, 0.0, pC, cnd, pC, cnd);
+//			dgemm_nn_lib(n, n, n, 1.0, pA, cnd, pB, cnd, 0.0, pC, cnd, pC, cnd);
+//			dsyrk_nt_l_lib(n, n, n, 1.0, pA, cnd, pB, cnd, 1.0, pC, cnd, pD, cnd);
+//			dtrmm_nt_ru_lib(n, n, pA, cnd, pB, cnd, 0, pC, cnd, pD, cnd);
+//			dpotrf_nt_l_lib(n, n, pB, cnd, pD, cnd, diag);
+//			dsyrk_dpotrf_nt_l_lib(n, n, n, pA, cnd, pA, cnd, 1, pB, cnd, pD, cnd, diag);
+//			dsyrk_nt_l_lib(n, n, n, pA, cnd, pA, cnd, 1, pB, cnd, pD, cnd);
+//			dpotrf_nt_l_lib(n, n, pD, cnd, pD, cnd, diag);
+//			dgetrf_nn_nopivot_lib(n, n, pB, cnd, pB, cnd, diag);
+//			dgetrf_nn_lib(n, n, pB, cnd, pB, cnd, diag, ipiv);
+//			dtrsm_nn_ll_one_lib(n, n, pD, cnd, pB, cnd, pB, cnd);
+//			dtrsm_nn_lu_inv_lib(n, n, pD, cnd, diag, pB, cnd, pB, cnd);
+			}
+	
+		gettimeofday(&tv1, NULL); // stop
+
+		for(rep=0; rep<nrep; rep++)
+			{
+//			kernel_dgemm_nt_12x4_lib4(n, &alpha, sA.pA, sA.cn, sB.pA, &beta, sD.pA, sD.cn, sD.pA, sD.cn);
+//			kernel_dgemm_nt_8x8_lib4(n, &alpha, sA.pA, sA.cn, sB.pA, sB.cn, &beta, sD.pA, sD.cn, sD.pA, sD.cn);
+//			kernel_dsyrk_nt_l_8x8_lib4(n, &alpha, sA.pA, sA.cn, sB.pA, sB.cn, &beta, sD.pA, sD.cn, sD.pA, sD.cn);
+//			kernel_dgemm_nt_8x4_lib4(n, &alpha, sA.pA, sA.cn, sB.pA, &beta, sD.pA, sD.cn, sD.pA, sD.cn);
+//			kernel_dgemm_nt_4x8_lib4(n, &alpha, sA.pA, sB.pA, sB.cn, &beta, sD.pA, sD.pA);
+//			kernel_dgemm_nt_4x4_lib4(n, &alpha, sA.pA, sB.pA, &beta, sD.pA, sD.pA);
+//			kernel_dger4_12_sub_lib4(n, sA.pA, sA.cn, sB.pA, sD.pA, sD.cn);
+//			kernel_dger4_sub_12r_lib4(n, sA.pA, sA.cn, sB.pA, sD.pA, sD.cn);
+//			kernel_dger4_sub_8r_lib4(n, sA.pA, sA.cn, sB.pA, sD.pA, sD.cn);
+//			kernel_dger12_add_4r_lib4(n, sA.pA, sB.pA, sB.cn, sD.pA);
+//			kernel_dger8_add_4r_lib4(n, sA.pA, sB.pA, sB.cn, sD.pA);
+//			kernel_dger4_sub_4r_lib4(n, sA.pA, sB.pA, sD.pA);
+//			kernel_dger2_sub_4r_lib4(n, sA.pA, sB.pA, sD.pA);
+//			kernel_dger4_sub_8c_lib4(n, sA.pA, sA.cn, sB.pA, sD.pA, sD.cn);
+//			kernel_dger4_sub_4c_lib4(n, sA.pA, sA.cn, sB.pA, sD.pA, sD.cn);
+//			kernel_dgemm_nn_4x12_lib4(n, &alpha, sA.pA, 0, sB.pA, sB.cn, &beta, sD.pA, sD.pA);
+//			kernel_dgemm_nn_4x8_lib4(n, &alpha, sA.pA, 0, sB.pA, sB.cn, &beta, sD.pA, sD.pA);
+//			kernel_dgemm_nn_4x4_lib4(n, &alpha, sA.pA, 0, sB.pA, sB.cn, &beta, sD.pA, sD.pA);
+
+			dgemm_nt_libstr(n, n, n, 1.0, &sA, 0, 0, &sB, 0, 0, 0.0, &sC, 0, 0, &sD, 0, 0);
+//			dgemm_nn_libstr(n, n, n, 1.0, &sA, 0, 0, &sB, 0, 0, 0.0, &sC, 0, 0, &sD, 0, 0);
+//			dsyrk_ln_libstr(n, n, 1.0, &sA, 0, 0, &sA, 0, 0, 0.0, &sC, 0, 0, &sD, 0, 0);
+//			dsyrk_ln_mn_libstr(n, n, n, 1.0, &sA, 0, 0, &sA, 0, 0, 0.0, &sC, 0, 0, &sD, 0, 0);
+//			dpotrf_l_mn_libstr(n, n, &sB, 0, 0, &sB, 0, 0);
+//			dpotrf_l_libstr(n, &sB, 0, 0, &sB, 0, 0);
+//			dgetrf_nopivot_libstr(n, n, &sB, 0, 0, &sB, 0, 0);
+//			dgetrf_libstr(n, n, &sB, 0, 0, &sB, 0, 0, ipiv);
+//			dgeqrf_libstr(n, n, &sC, 0, 0, &sD, 0, 0, qr_work);
+//			dcolin_libstr(n, &sx, 0, &sB3, 0, n-1);
+//			dgelqf_libstr(n, n, &sB3, 0, 0, &sB3, 0, 0, lq_work);
+//			dtrmm_rlnn_libstr(n, n, 1.0, &sA, 0, 0, &sD, 0, 0, &sD, 0, 0); //
+//			dtrmm_rutn_libstr(n, n, 1.0, &sA, 0, 0, &sB, 0, 0, &sD, 0, 0);
+//			dtrsm_llnu_libstr(n, n, 1.0, &sD, 0, 0, &sB, 0, 0, &sB, 0, 0);
+//			dtrsm_lunn_libstr(n, n, 1.0, &sD, 0, 0, &sB, 0, 0, &sB, 0, 0);
+//			dtrsm_rltn_libstr(n, n, 1.0, &sB2, 0, 0, &sD, 0, 0, &sD, 0, 0); //
+//			dtrsm_rltu_libstr(n, n, 1.0, &sD, 0, 0, &sB, 0, 0, &sB, 0, 0);
+//			dtrsm_rutn_libstr(n, n, 1.0, &sD, 0, 0, &sB, 0, 0, &sB, 0, 0);
+//			dgemv_n_libstr(n, n, 1.0, &sA, 0, 0, &sx, 0, 0.0, &sy, 0, &sz, 0);
+//			dgemv_t_libstr(n, n, 1.0, &sA, 0, 0, &sx, 0, 0.0, &sy, 0, &sz, 0);
+//			dsymv_l_libstr(n, n, 1.0, &sA, 0, 0, &sx, 0, 0.0, &sy, 0, &sz, 0);
+//			dgemv_nt_libstr(n, n, 1.0, 1.0, &sA, 0, 0, &sx, 0, &sx, 0, 0.0, 0.0, &sy, 0, &sy, 0, &sz, 0, &sz, 0);
+			}
+
+//		d_print_strmat(n, n, &sD, 0, 0);
+
+		gettimeofday(&tv2, NULL); // stop
+
+		for(rep=0; rep<nrep; rep++)
+			{
+#if defined(REF_BLAS_OPENBLAS) || defined(REF_BLAS_NETLIB) || defined(REF_BLAS_MKL)
+//			dgemm_(&c_n, &c_t, &n, &n, &n, &d_1, A, &n, M, &n, &d_0, C, &n);
+//			dpotrf_(&c_l, &n, B2, &n, &info);
+//			dgemm_(&c_n, &c_n, &n, &n, &n, &d_1, A, &n, M, &n, &d_0, C, &n);
+//			dsyrk_(&c_l, &c_n, &n, &n, &d_1, A, &n, &d_0, C, &n);
+//			dtrmm_(&c_r, &c_u, &c_t, &c_n, &n, &n, &d_1, A, &n, C, &n);
+//			dgetrf_(&n, &n, B2, &n, ipiv, &info);
+//			dtrsm_(&c_l, &c_l, &c_n, &c_u, &n, &n, &d_1, B2, &n, B, &n);
+//			dtrsm_(&c_l, &c_u, &c_n, &c_n, &n, &n, &d_1, B2, &n, B, &n);
+//			dtrtri_(&c_l, &c_n, &n, B2, &n, &info);
+//			dlauum_(&c_l, &n, B, &n, &info);
+//			dgemv_(&c_n, &n, &n, &d_1, A, &n, x, &i_1, &d_0, y, &i_1);
+//			dgemv_(&c_t, &n, &n, &d_1, A, &n, x2, &i_1, &d_0, y2, &i_1);
+//			dtrmv_(&c_l, &c_n, &c_n, &n, B, &n, x, &i_1);
+//			dtrsv_(&c_l, &c_n, &c_n, &n, B, &n, x, &i_1);
+//			dsymv_(&c_l, &n, &d_1, A, &n, x, &i_1, &d_0, y, &i_1);
+
+//			for(i=0; i<n; i++)
+//				{
+//				i_t = n-i;
+//				dcopy_(&i_t, &B[i*(n+1)], &i_1, &C[i*(n+1)], &i_1);
+//				}
+//			dsyrk_(&c_l, &c_n, &n, &n, &d_1, A, &n, &d_1, C, &n);
+//			dpotrf_(&c_l, &n, C, &n, &info);
+
+#endif
+
+#if defined(REF_BLAS_BLIS)
+//			dgemm_(&c_n, &c_t, &n77, &n77, &n77, &d_1, A, &n77, B, &n77, &d_0, C, &n77);
+//			dgemm_(&c_n, &c_n, &n77, &n77, &n77, &d_1, A, &n77, B, &n77, &d_0, C, &n77);
+//			dsyrk_(&c_l, &c_n, &n77, &n77, &d_1, A, &n77, &d_0, C, &n77);
+//			dtrmm_(&c_r, &c_u, &c_t, &c_n, &n77, &n77, &d_1, A, &n77, C, &n77);
+//			dpotrf_(&c_l, &n77, B, &n77, &info);
+//			dtrtri_(&c_l, &c_n, &n77, B, &n77, &info);
+//			dlauum_(&c_l, &n77, B, &n77, &info);
+#endif
+			}
+
+		gettimeofday(&tv3, NULL); // stop
+
+		float Gflops_max = flops_max * GHz_max;
+
+//		float flop_operation = 4*16.0*2*n; // kernel 12x4
+//		float flop_operation = 3*16.0*2*n; // kernel 12x4
+//		float flop_operation = 2*16.0*2*n; // kernel 8x4
+//		float flop_operation = 1*16.0*2*n; // kernel 4x4
+//		float flop_operation = 0.5*16.0*2*n; // kernel 2x4
+
+		float flop_operation = 2.0*n*n*n; // dgemm
+//		float flop_operation = 1.0*n*n*n; // dsyrk dtrmm dtrsm
+//		float flop_operation = 1.0/3.0*n*n*n; // dpotrf dtrtri
+//		float flop_operation = 2.0/3.0*n*n*n; // dgetrf
+//		float flop_operation = 4.0/3.0*n*n*n; // dgeqrf
+//		float flop_operation = 2.0*n*n; // dgemv dsymv
+//		float flop_operation = 1.0*n*n; // dtrmv dtrsv
+//		float flop_operation = 4.0*n*n; // dgemv_nt
+
+//		float flop_operation = 4.0/3.0*n*n*n; // dsyrk+dpotrf
+
+		float time_hpmpc    = (float) (tv1.tv_sec-tv0.tv_sec)/(nrep+0.0)+(tv1.tv_usec-tv0.tv_usec)/(nrep*1e6);
+		float time_blasfeo  = (float) (tv2.tv_sec-tv1.tv_sec)/(nrep+0.0)+(tv2.tv_usec-tv1.tv_usec)/(nrep*1e6);
+		float time_blas     = (float) (tv3.tv_sec-tv2.tv_sec)/(nrep+0.0)+(tv3.tv_usec-tv2.tv_usec)/(nrep*1e6);
+
+		float Gflops_hpmpc    = 1e-9*flop_operation/time_hpmpc;
+		float Gflops_blasfeo  = 1e-9*flop_operation/time_blasfeo;
+		float Gflops_blas     = 1e-9*flop_operation/time_blas;
+
+
+//		printf("%d\t%7.2f\t%7.2f\t%7.2f\t%7.2f\t%7.2f\t%7.2f\n", n, Gflops_hpmpc, 100.0*Gflops_hpmpc/Gflops_max, Gflops_blasfeo, 100.0*Gflops_blasfeo/Gflops_max, Gflops_blas, 100.0*Gflops_blas/Gflops_max);
+//		fprintf(f, "%d\t%7.2f\t%7.2f\t%7.2f\t%7.2f\t%7.2f\t%7.2f\n", n, Gflops_hpmpc, 100.0*Gflops_hpmpc/Gflops_max, Gflops_blasfeo, 100.0*Gflops_blasfeo/Gflops_max, Gflops_blas, 100.0*Gflops_blas/Gflops_max);
+		printf("%d\t%7.2f\t%7.2f\t%7.2f\t%7.2f\n", n, Gflops_blasfeo, 100.0*Gflops_blasfeo/Gflops_max, Gflops_blas, 100.0*Gflops_blas/Gflops_max);
+//		fprintf(f, "%d\t%7.2f\t%7.2f\t%7.2f\t%7.2f\n", n, Gflops_blasfeo, 100.0*Gflops_blasfeo/Gflops_max, Gflops_blas, 100.0*Gflops_blas/Gflops_max);
+
+
+		d_free(A);
+		d_free(B);
+		d_free(B2);
+		d_free(M);
+		d_free_align(x);
+		d_free_align(y);
+		d_free_align(x2);
+		d_free_align(y2);
+		int_free(ipiv);
+		free(qr_work);
+		free(lq_work);
+		
+		d_free_strmat(&sA);
+		d_free_strmat(&sB);
+		d_free_strmat(&sB2);
+		d_free_strmat(&sB3);
+		d_free_strmat(&sC);
+		d_free_strmat(&sD);
+		d_free_strmat(&sE);
+		d_free_strvec(&sx);
+		d_free_strvec(&sy);
+		d_free_strvec(&sz);
+
+		}
+
+	printf("\n");
+
+//	fprintf(f, "];\n");
+//	fclose(f);
+
+	return 0;
+	
+	}
diff --git a/test_problems/test_blas_s.c b/test_problems/test_blas_s.c
new file mode 100644
index 0000000..3ea9f11
--- /dev/null
+++ b/test_problems/test_blas_s.c
@@ -0,0 +1,454 @@
+/**************************************************************************************************
+*                                                                                                 *
+* This file is part of BLASFEO.                                                                   *
+*                                                                                                 *
+* BLASFEO -- BLAS For Embedded Optimization.                                                      *
+* Copyright (C) 2016-2017 by Gianluca Frison.                                                     *
+* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl.              *
+* All rights reserved.                                                                            *
+*                                                                                                 *
+* HPMPC is free software; you can redistribute it and/or                                          *
+* modify it under the terms of the GNU Lesser General Public                                      *
+* License as published by the Free Software Foundation; either                                    *
+* version 2.1 of the License, or (at your option) any later version.                              *
+*                                                                                                 *
+* HPMPC is distributed in the hope that it will be useful,                                        *
+* but WITHOUT ANY WARRANTY; without even the implied warranty of                                  *
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.                                            *
+* See the GNU Lesser General Public License for more details.                                     *
+*                                                                                                 *
+* You should have received a copy of the GNU Lesser General Public                                *
+* License along with HPMPC; if not, write to the Free Software                                    *
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA                  *
+*                                                                                                 *
+* Author: Gianluca Frison, giaf (at) dtu.dk                                                       *
+*                          gianluca.frison (at) imtek.uni-freiburg.de                             *
+*                                                                                                 *
+**************************************************************************************************/
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <sys/time.h>
+
+
+#include "../include/blasfeo_common.h"
+#include "../include/blasfeo_s_aux_ext_dep.h"
+#include "../include/blasfeo_i_aux_ext_dep.h"
+#include "../include/blasfeo_s_aux.h"
+#include "../include/blasfeo_s_kernel.h"
+#include "../include/blasfeo_s_blas.h"
+
+#ifndef S_PS
+#define S_PS 1
+#endif
+#ifndef S_NC
+#define S_NC 1
+#endif
+
+
+
+#if defined(REF_BLAS_OPENBLAS)
+void openblas_set_num_threads(int num_threads);
+#endif
+#if defined(REF_BLAS_BLIS)
+void omp_set_num_threads(int num_threads);
+#endif
+#if defined(REF_BLAS_MKL)
+#include "mkl.h"
+#endif
+
+
+
+#include "cpu_freq.h"
+
+
+
+int main()
+	{
+		
+#if defined(REF_BLAS_OPENBLAS)
+	openblas_set_num_threads(1);
+#endif
+#if defined(REF_BLAS_BLIS)
+	omp_set_num_threads(1);
+#endif
+#if defined(REF_BLAS_MKL)
+	mkl_set_num_threads(1);
+#endif
+
+	printf("\n");
+	printf("\n");
+	printf("\n");
+
+	printf("BLAS performance test - float precision\n");
+	printf("\n");
+
+	// maximum frequency of the processor
+	const float GHz_max = GHZ_MAX;
+	printf("Frequency used to compute theoretical peak: %5.1f GHz (edit test_param.h to modify this value).\n", GHz_max);
+	printf("\n");
+
+	// maximum flops per cycle, single precision
+	// maxumum memops (sustained load->store of floats) per cycle, single precision
+#if defined(TARGET_X64_INTEL_HASWELL)
+	const float flops_max = 32; // 2x256 bit fma
+	const float memops_max = 8; // 2x256 bit load + 1x256 bit store
+	printf("Testing BLAS version for AVX2 and FMA instruction sets, 64 bit (optimized for Intel Haswell): theoretical peak %5.1f Gflops\n", flops_max*GHz_max);
+#elif defined(TARGET_X64_INTEL_SANDY_BRIDGE)
+	const float flops_max = 16; // 1x256 bit mul + 1x256 bit add
+	const float memops_max = 4; // 1x256 bit load + 1x128 bit store
+	printf("Testing BLAS version for AVX instruction set, 64 bit (optimized for Intel Sandy Bridge): theoretical peak %5.1f Gflops\n", flops_max*GHz_max);
+#elif defined(TARGET_X64_INTEL_CORE)
+	const float flops_max = 8; // 1x128 bit mul + 1x128 bit add
+	const float memops_max = 4; // 1x128 bit load + 1x128 bit store;
+	printf("Testing BLAS version for SSE3 instruction set, 64 bit (optimized for Intel Core): theoretical peak %5.1f Gflops\n", flops_max*GHz_max);
+#elif defined(TARGET_X64_AMD_BULLDOZER)
+	const float flops_max = 16; // 2x128 bit fma
+	const float memops_max = 4; // 1x256 bit load + 1x128 bit store
+	printf("Testing BLAS version for SSE3 and FMA instruction set, 64 bit (optimized for AMD Bulldozer): theoretical peak %5.1f Gflops\n", flops_max*GHz_max);
+#elif defined(TARGET_ARMV8A_ARM_CORTEX_A57)
+	const float flops_max = 8; // 1x128 bit fma
+	const float memops_max = 4; // ???
+	printf("Testing BLAS version for VFPv4 instruction set, 32 bit (optimized for ARM Cortex A15): theoretical peak %5.1f Gflops\n", flops_max*GHz_max);
+#elif defined(TARGET_ARMV7A_ARM_CORTEX_A15)
+	const float flops_max = 8; // 1x128 bit fma
+	const float memops_max = 4; // ???
+	printf("Testing BLAS version for VFPv4 instruction set, 32 bit (optimized for ARM Cortex A15): theoretical peak %5.1f Gflops\n", flops_max*GHz_max);
+#elif defined(TARGET_GENERIC)
+	const float flops_max = 2; // 1x32 bit mul + 1x32 bit add ???
+	const float memops_max = 1; // ???
+	printf("Testing BLAS version for generic scalar instruction set: theoretical peak %5.1f Gflops ???\n", flops_max*GHz_max);
+#endif
+	
+//	FILE *f;
+//	f = fopen("./test_problems/results/test_blas.m", "w"); // a
+
+#if defined(TARGET_X64_INTEL_HASWELL)
+//	fprintf(f, "C = 's_x64_intel_haswell';\n");
+//	fprintf(f, "\n");
+#elif defined(TARGET_X64_INTEL_SANDY_BRIDGE)
+//	fprintf(f, "C = 's_x64_intel_sandybridge';\n");
+//	fprintf(f, "\n");
+#elif defined(TARGET_X64_INTEL_CORE)
+//	fprintf(f, "C = 's_x64_intel_core';\n");
+//	fprintf(f, "\n");
+#elif defined(TARGET_X64_AMD_BULLDOZER)
+//	fprintf(f, "C = 's_x64_amd_bulldozer';\n");
+//	fprintf(f, "\n");
+#elif defined(TARGET_ARMV8A_ARM_CORTEX_A57)
+//	fprintf(f, "C = 's_armv7a_arm_cortex_a15';\n");
+//	fprintf(f, "\n");
+#elif defined(TARGET_ARMV7A_ARM_CORTEX_A15)
+//	fprintf(f, "C = 's_armv7a_arm_cortex_a15';\n");
+//	fprintf(f, "\n");
+#elif defined(TARGET_GENERIC)
+//	fprintf(f, "C = 's_generic';\n");
+//	fprintf(f, "\n");
+#endif
+
+//	fprintf(f, "A = [%f %f];\n", GHz_max, flops_max);
+//	fprintf(f, "\n");
+
+//	fprintf(f, "B = [\n");
+	
+
+
+	int i, j, rep, ll;
+	
+	const int bss = S_PS;
+	const int ncs = S_NC;
+
+/*	int info = 0;*/
+	
+	printf("\nn\t  sgemm_blasfeo\t  sgemm_blas\n");
+	printf("\nn\t Gflops\t    %%\t Gflops\t    %%\n\n");
+	
+#if 1
+	int nn[] = {4, 8, 12, 16, 20, 24, 28, 32, 36, 40, 44, 48, 52, 56, 60, 64, 68, 72, 76, 80, 84, 88, 92, 96, 100, 104, 108, 112, 116, 120, 124, 128, 132, 136, 140, 144, 148, 152, 156, 160, 164, 168, 172, 176, 180, 184, 188, 192, 196, 200, 204, 208, 212, 216, 220, 224, 228, 232, 236, 240, 244, 248, 252, 256, 260, 264, 268, 272, 276, 280, 284, 288, 292, 296, 300, 304, 308, 312, 316, 320, 324, 328, 332, 336, 340, 344, 348, 352, 356, 360, 364, 368, 372, 376, 380, 384, 388, 392, 396, 400, 404, 408, 412, 416, 420, 424, 428, 432, 436, 440, 444, 448, 452, 456, 460, 500, 550, 600, 650, 700};
+	int nnrep[] = {10000, 10000, 10000, 10000, 10000, 10000, 10000, 10000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 400, 400, 400, 400, 400, 200, 200, 200, 200, 200, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 20, 20, 20, 20, 20, 20, 20, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 4, 4, 4, 4, 4};
+	
+//	for(ll=0; ll<24; ll++)
+	for(ll=0; ll<75; ll++)
+//	for(ll=0; ll<115; ll++)
+//	for(ll=0; ll<120; ll++)
+
+		{
+
+		int n = nn[ll];
+		int nrep = nnrep[ll];
+//		int n = ll+1;
+//		int nrep = nnrep[0];
+//		n = n<16 ? 16 : n;
+
+		int n2 = n*n;
+
+#else
+	int nn[] = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24};
+	
+	for(ll=0; ll<24; ll++)
+
+		{
+
+		int n = nn[ll];
+		int nrep = 40000; //nnrep[ll];
+#endif
+
+		float *A; s_zeros(&A, n, n);
+		float *B; s_zeros(&B, n, n);
+		float *C; s_zeros(&C, n, n);
+		float *M; s_zeros(&M, n, n);
+
+		char c_n = 'n';
+		char c_l = 'l';
+		char c_r = 'r';
+		char c_t = 't';
+		char c_u = 'u';
+		int i_1 = 1;
+		int i_t;
+		float d_1 = 1;
+		float d_0 = 0;
+	
+		for(i=0; i<n*n; i++)
+			A[i] = i;
+	
+		for(i=0; i<n; i++)
+			B[i*(n+1)] = 1;
+	
+		for(i=0; i<n*n; i++)
+			M[i] = 1;
+	
+		float *B2; s_zeros(&B2, n, n);
+		for(i=0; i<n*n; i++)
+			B2[i] = 1e-15;
+		for(i=0; i<n; i++)
+			B2[i*(n+1)] = 1;
+
+		float *x; s_zeros(&x, n, 1);
+		float *y; s_zeros(&y, n, 1);
+		float *x2; s_zeros(&x2, n, 1);
+		float *y2; s_zeros(&y2, n, 1);
+		float *diag; s_zeros(&diag, n, 1);
+		int *ipiv; int_zeros(&ipiv, n, 1);
+
+//		for(i=0; i<n; i++) x[i] = 1;
+//		for(i=0; i<n; i++) x2[i] = 1;
+
+		// matrix struct
+#if 0
+		struct s_strmat sA; s_allocate_strmat(n+4, n+4, &sA);
+		struct s_strmat sB; s_allocate_strmat(n+4, n+4, &sB);
+		struct s_strmat sC; s_allocate_strmat(n+4, n+4, &sC);
+		struct s_strmat sD; s_allocate_strmat(n+4, n+4, &sD);
+		struct s_strmat sE; s_allocate_strmat(n+4, n+4, &sE);
+#else
+		struct s_strmat sA; s_allocate_strmat(n, n, &sA);
+		struct s_strmat sB; s_allocate_strmat(n, n, &sB);
+		struct s_strmat sC; s_allocate_strmat(n, n, &sC);
+		struct s_strmat sD; s_allocate_strmat(n, n, &sD);
+		struct s_strmat sE; s_allocate_strmat(n, n, &sE);
+#endif
+		struct s_strvec sx; s_allocate_strvec(n, &sx);
+		struct s_strvec sy; s_allocate_strvec(n, &sy);
+		struct s_strvec sz; s_allocate_strvec(n, &sz);
+
+		s_cvt_mat2strmat(n, n, A, n, &sA, 0, 0);
+		s_cvt_mat2strmat(n, n, B, n, &sB, 0, 0);
+		s_cvt_vec2strvec(n, x, &sx, 0);
+
+
+		// create matrix to pivot all the time
+//		sgemm_nt_libstr(n, n, n, 1.0, &sA, 0, 0, &sA, 0, 0, 1.0, &sB, 0, 0, &sD, 0, 0);
+
+		float *dummy;
+
+		int info;
+
+		/* timing */
+		struct timeval tvm1, tv0, tv1, tv2, tv3, tv4, tv5, tv6, tv7, tv8, tv9, tv10, tv11, tv12, tv13, tv14, tv15, tv16;
+
+		/* warm up */
+		for(rep=0; rep<nrep; rep++)
+			{
+			sgemm_nt_libstr(n, n, n, 1.0, &sA, 0, 0, &sB, 0, 0, 0.0, &sC, 0, 0, &sD, 0, 0);
+			}
+
+		float alpha = 1.0;
+		float beta = 0.0;
+
+		gettimeofday(&tv0, NULL); // stop
+
+		gettimeofday(&tv1, NULL); // stop
+
+		for(rep=0; rep<nrep; rep++)
+			{
+//			kernel_sgemm_nt_24x4_lib8(n, &alpha, sA.pA, sA.cn, sB.pA, &beta, sD.pA, sD.cn, sD.pA, sD.cn);
+//			kernel_sgemm_nt_16x4_lib8(n, &alpha, sA.pA, sA.cn, sB.pA, &beta, sD.pA, sD.cn, sD.pA, sD.cn);
+//			kernel_sgemm_nt_8x8_lib8(n, &alpha, sA.pA, sB.pA, &beta, sD.pA, sD.pA);
+//			kernel_sgemm_nt_8x4_lib8(n, &alpha, sA.pA, sB.pA, &beta, sD.pA, sD.pA);
+//			kernel_sgemm_nt_4x8_gen_lib8(n, &alpha, sA.pA, sB.pA, &beta, 0, sD.pA, sD.cn, 0, sD.pA, sD.cn, 0, 4, 0, 8);
+//			kernel_sgemm_nt_4x8_vs_lib8(n, &alpha, sA.pA, sB.pA, &beta, sD.pA, sD.pA, 4, 8);
+//			kernel_sgemm_nt_4x8_lib8(n, &alpha, sA.pA, sB.pA, &beta, sD.pA, sD.pA);
+//			kernel_sgemm_nt_12x4_lib4(n, &alpha, sA.pA, sA.cn, sB.pA, &beta, sD.pA, sD.cn, sD.pA, sD.cn);
+//			kernel_sgemm_nt_8x4_lib4(n, &alpha, sA.pA, sA.cn, sB.pA, &beta, sD.pA, sD.cn, sD.pA, sD.cn);
+//			kernel_sgemm_nt_4x4_lib4(n, &alpha, sA.pA, sB.pA, &beta, sD.pA, sD.pA);
+//			kernel_sgemm_nn_16x4_lib8(n, &alpha, sA.pA, sA.cn, 0, sB.pA, sB.cn, &beta, sD.pA, sD.cn, sD.pA, sD.cn);
+//			kernel_sgemm_nn_8x8_lib8(n, &alpha, sA.pA, 0, sB.pA, sB.cn, &beta, sD.pA, sD.pA);
+//			kernel_sgemm_nn_8x4_lib8(n, &alpha, sA.pA, 0, sB.pA, sB.cn, &beta, sD.pA, sD.pA);
+
+//			sgemm_nt_libstr(n, n, n, 1.0, &sA, 0, 0, &sB, 0, 0, 0.0, &sC, 0, 0, &sD, 0, 0);
+//			sgemm_nn_libstr(n, n, n, 1.0, &sA, 0, 0, &sB, 0, 0, 0.0, &sC, 0, 0, &sD, 0, 0);
+//			ssyrk_ln_libstr(n, n, 1.0, &sA, 0, 0, &sA, 0, 0, 0.0, &sC, 0, 0, &sD, 0, 0);
+//			spotrf_l_mn_libstr(n, n, &sB, 0, 0, &sB, 0, 0);
+			spotrf_l_libstr(n, &sB, 0, 0, &sB, 0, 0);
+//			sgetr_libstr(n, n, &sA, 0, 0, &sB, 0, 0);
+//			sgetrf_nopivot_libstr(n, n, &sB, 0, 0, &sB, 0, 0);
+//			sgetrf_libstr(n, n, &sB, 0, 0, &sB, 0, 0, ipiv);
+//			strmm_rlnn_libstr(n, n, 1.0, &sA, 0, 0, &sB, 0, 0, &sD, 0, 0);
+//			strmm_rutn_libstr(n, n, 1.0, &sA, 0, 0, &sB, 0, 0, &sD, 0, 0);
+//			strsm_llnu_libstr(n, n, 1.0, &sD, 0, 0, &sB, 0, 0, &sB, 0, 0);
+//			strsm_lunn_libstr(n, n, 1.0, &sD, 0, 0, &sB, 0, 0, &sB, 0, 0);
+//			strsm_rltn_libstr(n, n, 1.0, &sB, 0, 0, &sD, 0, 0, &sD, 0, 0);
+//			strsm_rltu_libstr(n, n, 1.0, &sD, 0, 0, &sB, 0, 0, &sB, 0, 0);
+//			strsm_rutn_libstr(n, n, 1.0, &sD, 0, 0, &sB, 0, 0, &sB, 0, 0);
+//			sgemv_n_libstr(n, n, 1.0, &sA, 0, 0, &sx, 0, 0.0, &sy, 0, &sz, 0);
+//			sgemv_t_libstr(n, n, 1.0, &sA, 0, 0, &sx, 0, 0.0, &sy, 0, &sz, 0);
+//			ssymv_l_libstr(n, n, 1.0, &sA, 0, 0, &sx, 0, 0.0, &sy, 0, &sz, 0);
+//			sgemv_nt_libstr(n, n, 1.0, 1.0, &sA, 0, 0, &sx, 0, &sx, 0, 0.0, 0.0, &sy, 0, &sy, 0, &sz, 0, &sz, 0);
+			}
+
+//		d_print_strmat(n, n, &sD, 0, 0);
+
+		gettimeofday(&tv2, NULL); // stop
+
+		for(rep=0; rep<nrep; rep++)
+			{
+#if defined(REF_BLAS_OPENBLAS) || defined(REF_BLAS_NETLIB) || defined(REF_BLAS_MKL)
+//			sgemm_(&c_n, &c_t, &n, &n, &n, &d_1, A, &n, M, &n, &d_0, C, &n);
+//			sgemm_(&c_n, &c_n, &n, &n, &n, &d_1, A, &n, M, &n, &d_0, C, &n);
+//			scopy_(&n2, A, &i_1, B, &i_1);
+//			ssyrk_(&c_l, &c_n, &n, &n, &d_1, A, &n, &d_0, C, &n);
+//			strmm_(&c_r, &c_u, &c_t, &c_n, &n, &n, &d_1, A, &n, C, &n);
+//			spotrf_(&c_l, &n, B2, &n, &info);
+//			sgetrf_(&n, &n, B2, &n, ipiv, &info);
+//			strsm_(&c_l, &c_l, &c_n, &c_u, &n, &n, &d_1, B2, &n, B, &n);
+//			strsm_(&c_l, &c_u, &c_n, &c_n, &n, &n, &d_1, B2, &n, B, &n);
+//			strtri_(&c_l, &c_n, &n, B2, &n, &info);
+//			slauum_(&c_l, &n, B, &n, &info);
+//			sgemv_(&c_n, &n, &n, &d_1, A, &n, x, &i_1, &d_0, y, &i_1);
+//			sgemv_(&c_t, &n, &n, &d_1, A, &n, x2, &i_1, &d_0, y2, &i_1);
+//			strmv_(&c_l, &c_n, &c_n, &n, B, &n, x, &i_1);
+//			strsv_(&c_l, &c_n, &c_n, &n, B, &n, x, &i_1);
+//			ssymv_(&c_l, &n, &d_1, A, &n, x, &i_1, &d_0, y, &i_1);
+
+//			for(i=0; i<n; i++)
+//				{
+//				i_t = n-i;
+//				scopy_(&i_t, &B[i*(n+1)], &i_1, &C[i*(n+1)], &i_1);
+//				}
+//			ssyrk_(&c_l, &c_n, &n, &n, &d_1, A, &n, &d_1, C, &n);
+//			spotrf_(&c_l, &n, C, &n, &info);
+
+#endif
+
+#if defined(REF_BLAS_BLIS)
+//			sgemm_(&c_n, &c_t, &n77, &n77, &n77, &d_1, A, &n77, B, &n77, &d_0, C, &n77);
+//			sgemm_(&c_n, &c_n, &n77, &n77, &n77, &d_1, A, &n77, B, &n77, &d_0, C, &n77);
+//			ssyrk_(&c_l, &c_n, &n77, &n77, &d_1, A, &n77, &d_0, C, &n77);
+//			strmm_(&c_r, &c_u, &c_t, &c_n, &n77, &n77, &d_1, A, &n77, C, &n77);
+//			spotrf_(&c_l, &n77, B, &n77, &info);
+//			strtri_(&c_l, &c_n, &n77, B, &n77, &info);
+//			slauum_(&c_l, &n77, B, &n77, &info);
+#endif
+			}
+
+		gettimeofday(&tv3, NULL); // stop
+
+		// flops
+		if(1)
+			{
+
+			float Gflops_max = flops_max * GHz_max;
+
+//			float flop_operation = 6*16.0*2*n; // kernel 24x4
+//			float flop_operation = 4*16.0*2*n; // kernel 16x4
+//			float flop_operation = 3*16.0*2*n; // kernel 12x4
+//			float flop_operation = 2*16.0*2*n; // kernel 8x4
+//			float flop_operation = 1*16.0*2*n; // kernel 4x4
+
+//			float flop_operation = 2.0*n*n*n; // dgemm
+//			float flop_operation = 1.0*n*n*n; // dsyrk dtrmm dtrsm
+			float flop_operation = 1.0/3.0*n*n*n; // dpotrf dtrtri
+//			float flop_operation = 2.0/3.0*n*n*n; // dgetrf
+//			float flop_operation = 2.0*n*n; // dgemv dsymv
+//			float flop_operation = 1.0*n*n; // dtrmv dtrsv
+//			float flop_operation = 4.0*n*n; // dgemv_nt
+//			float flop_operation = 3*16.0*2*n; // kernel 12x4
+
+//			float flop_operation = 4.0/3.0*n*n*n; // dsyrk+dpotrf
+
+			float time_hpmpc    = (float) (tv1.tv_sec-tv0.tv_sec)/(nrep+0.0)+(tv1.tv_usec-tv0.tv_usec)/(nrep*1e6);
+			float time_blasfeo  = (float) (tv2.tv_sec-tv1.tv_sec)/(nrep+0.0)+(tv2.tv_usec-tv1.tv_usec)/(nrep*1e6);
+			float time_blas     = (float) (tv3.tv_sec-tv2.tv_sec)/(nrep+0.0)+(tv3.tv_usec-tv2.tv_usec)/(nrep*1e6);
+
+			float Gflops_hpmpc    = 1e-9*flop_operation/time_hpmpc;
+			float Gflops_blasfeo  = 1e-9*flop_operation/time_blasfeo;
+			float Gflops_blas     = 1e-9*flop_operation/time_blas;
+
+
+			printf("%d\t%7.2f\t%7.2f\t%7.2f\t%7.2f\n", n, Gflops_blasfeo, 100.0*Gflops_blasfeo/Gflops_max, Gflops_blas, 100.0*Gflops_blas/Gflops_max);
+//			fprintf(f, "%d\t%7.2f\t%7.2f\t%7.2f\t%7.2f\n", n, Gflops_blasfeo, 100.0*Gflops_blasfeo/Gflops_max, Gflops_blas, 100.0*Gflops_blas/Gflops_max);
+
+			}
+		// memops
+		else
+			{
+
+			float Gmemops_max = memops_max * GHz_max;
+
+			float memop_operation = 1.0*n*n; // dgecp
+
+			float time_hpmpc    = (float) (tv1.tv_sec-tv0.tv_sec)/(nrep+0.0)+(tv1.tv_usec-tv0.tv_usec)/(nrep*1e6);
+			float time_blasfeo  = (float) (tv2.tv_sec-tv1.tv_sec)/(nrep+0.0)+(tv2.tv_usec-tv1.tv_usec)/(nrep*1e6);
+			float time_blas     = (float) (tv3.tv_sec-tv2.tv_sec)/(nrep+0.0)+(tv3.tv_usec-tv2.tv_usec)/(nrep*1e6);
+
+			float Gmemops_hpmpc    = 1e-9*memop_operation/time_hpmpc;
+			float Gmemops_blasfeo  = 1e-9*memop_operation/time_blasfeo;
+			float Gmemops_blas     = 1e-9*memop_operation/time_blas;
+
+
+			printf("%d\t%7.2f\t%7.2f\t%7.2f\t%7.2f\n", n, Gmemops_blasfeo, 100.0*Gmemops_blasfeo/Gmemops_max, Gmemops_blas, 100.0*Gmemops_blas/Gmemops_max);
+//			fprintf(f, "%d\t%7.2f\t%7.2f\t%7.2f\t%7.2f\n", n, Gmemops_blasfeo, 100.0*Gmemops_blasfeo/Gmemops_max, Gmemops_blas, 100.0*Gmemops_blas/Gmemops_max);
+
+			}
+
+
+		free(A);
+		free(B);
+		free(B2);
+		free(M);
+		free(x);
+		free(y);
+		free(x2);
+		free(y2);
+		free(ipiv);
+		
+		s_free_strmat(&sA);
+		s_free_strmat(&sB);
+		s_free_strmat(&sC);
+		s_free_strmat(&sD);
+		s_free_strmat(&sE);
+		s_free_strvec(&sx);
+		s_free_strvec(&sy);
+		s_free_strvec(&sz);
+
+		}
+
+	printf("\n");
+
+//	fprintf(f, "];\n");
+//	fclose(f);
+
+	return 0;
+	
+	}
+
diff --git a/test_problems/test_d_strmat.c b/test_problems/test_d_strmat.c
new file mode 100644
index 0000000..e06cf84
--- /dev/null
+++ b/test_problems/test_d_strmat.c
@@ -0,0 +1,512 @@
+/**************************************************************************************************
+*                                                                                                 *
+* This file is part of BLASFEO.                                                                   *
+*                                                                                                 *
+* BLASFEO -- BLAS For Embedded Optimization.                                                      *
+* Copyright (C) 2016-2017 by Gianluca Frison.                                                     *
+* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl.              *
+* All rights reserved.                                                                            *
+*                                                                                                 *
+* HPMPC is free software; you can redistribute it and/or                                          *
+* modify it under the terms of the GNU Lesser General Public                                      *
+* License as published by the Free Software Foundation; either                                    *
+* version 2.1 of the License, or (at your option) any later version.                              *
+*                                                                                                 *
+* HPMPC is distributed in the hope that it will be useful,                                        *
+* but WITHOUT ANY WARRANTY; without even the implied warranty of                                  *
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.                                            *
+* See the GNU Lesser General Public License for more details.                                     *
+*                                                                                                 *
+* You should have received a copy of the GNU Lesser General Public                                *
+* License along with HPMPC; if not, write to the Free Software                                    *
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA                  *
+*                                                                                                 *
+* Author: Gianluca Frison, giaf (at) dtu.dk                                                       *
+*                          gianluca.frison (at) imtek.uni-freiburg.de                             *
+*                                                                                                 *
+**************************************************************************************************/
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <sys/time.h>
+
+#include "../include/blasfeo_common.h"
+#include "../include/blasfeo_i_aux_ext_dep.h"
+#include "../include/blasfeo_d_aux_ext_dep.h"
+#include "../include/blasfeo_v_aux_ext_dep.h"
+#include "../include/blasfeo_d_aux.h"
+#include "../include/blasfeo_d_kernel.h"
+#include "../include/blasfeo_d_blas.h"
+
+
+int main()
+	{
+
+#if defined(LA_HIGH_PERFORMANCE)
+
+	printf("\nLA provided by HIGH_PERFORMANCE\n\n");
+
+#elif defined(LA_REFERENCE)
+
+	printf("\nLA provided by REFERENCE\n\n");
+
+#elif defined(LA_BLAS)
+
+	printf("\nLA provided by BLAS\n\n");
+
+#else
+
+	printf("\nLA provided by ???\n\n");
+	exit(2);
+
+#endif
+
+	int ii;
+
+	int n = 16;
+
+	//
+	// matrices in column-major format
+	//
+	double *A; d_zeros(&A, n, n);
+	for(ii=0; ii<n*n; ii++) A[ii] = ii;
+//	d_print_mat(n, n, A, n);
+
+	double *B; d_zeros(&B, n, n);
+	for(ii=0; ii<n; ii++) B[ii*(n+1)] = 1.0;
+//	d_print_mat(n, n, B, n);
+
+	double *C; d_zeros(&C, n, n);
+
+	double *D; d_zeros(&D, n, n);
+	for(ii=0; ii<n*n; ii++) D[ii] = -1;
+
+	double *x_n; d_zeros(&x_n, n, 1);
+//	for(ii=0; ii<n; ii++) x_n[ii] = 1.0;
+	x_n[1] = 1.0;
+//	x_n[1] = 1.0;
+//	x_n[2] = 2.0;
+//	x_n[3] = 3.0;
+	double *x_t; d_zeros(&x_t, n, 1);
+//	for(ii=0; ii<n; ii++) x_n[ii] = 1.0;
+	x_t[0] = 1.0;
+	double *y_n; d_zeros(&y_n, n, 1);
+	double *y_t; d_zeros(&y_t, n, 1);
+	double *z_n; d_zeros(&z_n, n, 1);
+	double *z_t; d_zeros(&z_t, n, 1);
+
+	double *x0; d_zeros(&x0, n, 1); x0[0] = 1.0;
+	double *x1; d_zeros(&x1, n, 1); x1[1] = 1.0;
+	double *x2; d_zeros(&x2, n, 1); x2[2] = 1.0;
+	double *x3; d_zeros(&x3, n, 1); x3[3] = 1.0;
+	double *x4; d_zeros(&x4, n, 1); x4[4] = 1.0;
+	double *x5; d_zeros(&x5, n, 1); x5[5] = 1.0;
+	double *x6; d_zeros(&x6, n, 1); x6[6] = 1.0;
+	double *x7; d_zeros(&x7, n, 1); x7[7] = 1.0;
+	double *x8; d_zeros(&x8, n, 1); x8[8] = 1.0;
+	double *x9; d_zeros(&x9, n, 1); x9[9] = 1.0;
+
+	int *ipiv; int_zeros(&ipiv, n, 1);
+
+	//
+	// matrices in matrix struct format
+	//
+	int size_strmat = 5*d_size_strmat(n, n);
+	void *memory_strmat; v_zeros_align(&memory_strmat, size_strmat);
+	char *ptr_memory_strmat = (char *) memory_strmat;
+
+	struct d_strmat sA;
+//	d_allocate_strmat(n, n, &sA);
+	d_create_strmat(n, n, &sA, ptr_memory_strmat);
+	ptr_memory_strmat += sA.memory_size;
+	d_cvt_mat2strmat(n, n, A, n, &sA, 0, 0);
+//	d_cast_mat2strmat(A, &sA);
+	d_print_strmat(n, n, &sA, 0, 0);
+
+	struct d_strmat sB;
+//	d_allocate_strmat(n, n, &sB);
+	d_create_strmat(n, n, &sB, ptr_memory_strmat);
+	ptr_memory_strmat += sB.memory_size;
+	d_cvt_mat2strmat(n, n, B, n, &sB, 0, 0);
+	d_print_strmat(n, n, &sB, 0, 0);
+
+	struct d_strmat sC;
+//	d_allocate_strmat(n, n, &sC);
+	d_create_strmat(n, n, &sC, ptr_memory_strmat);
+	ptr_memory_strmat += sC.memory_size;
+
+	struct d_strmat sD;
+//	d_allocate_strmat(n, n, &sD);
+	d_create_strmat(n, n, &sD, ptr_memory_strmat);
+	ptr_memory_strmat += sD.memory_size;
+	d_cvt_mat2strmat(n, n, D, n, &sD, 0, 0);
+
+	struct d_strmat sE;
+//	d_allocate_strmat(n, n, &sE);
+	d_create_strmat(n, n, &sE, ptr_memory_strmat);
+	ptr_memory_strmat += sE.memory_size;
+
+	struct d_strvec sx_n;
+	d_allocate_strvec(n, &sx_n);
+	d_cvt_vec2strvec(n, x_n, &sx_n, 0);
+
+	struct d_strvec sx_t;
+	d_allocate_strvec(n, &sx_t);
+	d_cvt_vec2strvec(n, x_t, &sx_t, 0);
+
+	struct d_strvec sy_n;
+	d_allocate_strvec(n, &sy_n);
+	d_cvt_vec2strvec(n, y_n, &sy_n, 0);
+
+	struct d_strvec sy_t;
+	d_allocate_strvec(n, &sy_t);
+	d_cvt_vec2strvec(n, y_t, &sy_t, 0);
+
+	struct d_strvec sz_n;
+	d_allocate_strvec(n, &sz_n);
+	d_cvt_vec2strvec(n, z_n, &sz_n, 0);
+
+	struct d_strvec sz_t;
+	d_allocate_strvec(n, &sz_t);
+	d_cvt_vec2strvec(n, z_t, &sz_t, 0);
+
+	struct d_strvec sx0; d_create_strvec(n, &sx0, x0);
+	struct d_strvec sx1; d_create_strvec(n, &sx1, x1);
+	struct d_strvec sx2; d_create_strvec(n, &sx2, x2);
+	struct d_strvec sx3; d_create_strvec(n, &sx3, x3);
+	struct d_strvec sx4; d_create_strvec(n, &sx4, x4);
+	struct d_strvec sx5; d_create_strvec(n, &sx5, x5);
+	struct d_strvec sx6; d_create_strvec(n, &sx6, x6);
+	struct d_strvec sx7; d_create_strvec(n, &sx7, x7);
+	struct d_strvec sx8; d_create_strvec(n, &sx8, x8);
+	struct d_strvec sx9; d_create_strvec(n, &sx9, x9);
+
+	struct d_strvec sz0; d_allocate_strvec(n, &sz0);
+	struct d_strvec sz1; d_allocate_strvec(n, &sz1);
+	struct d_strvec sz2; d_allocate_strvec(n, &sz2);
+	struct d_strvec sz3; d_allocate_strvec(n, &sz3);
+	struct d_strvec sz4; d_allocate_strvec(n, &sz4);
+	struct d_strvec sz5; d_allocate_strvec(n, &sz5);
+	struct d_strvec sz6; d_allocate_strvec(n, &sz6);
+	struct d_strvec sz7; d_allocate_strvec(n, &sz7);
+	struct d_strvec sz8; d_allocate_strvec(n, &sz8);
+	struct d_strvec sz9; d_allocate_strvec(n, &sz9);
+
+	// tests
+	double *v; d_zeros(&v, n, 1);
+	double *vp; d_zeros(&vp, n, 1);
+	double *vm; d_zeros(&vm, n, 1);
+	double *m; d_zeros(&m, n, 1);
+	double *r; d_zeros(&r, n, 1);
+
+	for(ii=0; ii<n; ii++) v[ii] = ii; // x
+	for(ii=0; ii<n; ii++) vp[ii] = 8.0; // upper
+	for(ii=0; ii<n; ii++) vm[ii] = 3.0; // lower
+	for(ii=0; ii<n; ii++) r[ii] = 2*ii+1; // x
+
+	d_print_mat(1, n, v, 1);
+	d_print_mat(1, n, vp, 1);
+	d_print_mat(1, n, vm, 1);
+	d_print_mat(1, n, r, 1);
+
+	struct d_strvec sv; d_create_strvec(n, &sv, v);
+	struct d_strvec svp; d_create_strvec(n, &svp, vp);
+	struct d_strvec svm; d_create_strvec(n, &svm, vm);
+	struct d_strvec sm; d_create_strvec(n, &sm, m);
+	struct d_strvec sr; d_create_strvec(n, &sr, r);
+
+//	d_print_tran_strvec(n, &sv, 0);
+//	d_print_tran_strvec(n, &svp, 0);
+//	d_print_tran_strvec(n, &svm, 0);
+//	d_print_tran_strvec(n, &sm, 0);
+//	d_print_tran_strvec(n, &sr, 0);
+
+//	d_print_tran_strvec(n, &sm, 0);
+//	DVECEL_LIBSTR(&sm, 0) = 0.0;
+//	DVECEL_LIBSTR(&sm, 1) = 1.0;
+//	DVECEL_LIBSTR(&sm, 2) = 2.0;
+//	d_print_tran_strvec(n, &sm, 0);
+//	return 0;
+
+	double alpha = 1.0;
+	double beta = 0.0;
+	kernel_dgemm_nt_4x4_gen_lib4(4, &alpha, sA.pA, sB.pA, &beta, 0, sD.pA, sA.cn, 0, sD.pA, sE.cn, 1, 3, 1, 3);
+	d_print_strmat(n, n, &sD, 0, 0);
+	return 0;
+	dtrmm_rlnn_libstr(8, 8, alpha, &sA, 3, 0, &sB, 0, 0, &sD, 0, 0);
+//	dgemm_nn_libstr(8, 8, 8, alpha, &sB, 0, 0, &sA, 1, 0, beta, &sA, 0, 0, &sD, 0, 0);
+	d_print_strmat(n, n, &sD, 0, 0);
+	return 0;
+//	dsyrk_ln_libstr(n, 15, n, 1.0, &sA, 0, 0, &sA, 0, 0, 1.0, &sB, 0, 0, &sD, 0, 0);
+//	dpotrf_l_mn_libstr(n, 15, &sD, 0, 0, &sD, 0, 0);
+//	dsyrk_dpotrf_ln_libstr(n, 15, n, &sA, 0, 0, &sA, 0, 0, &sB, 0, 0, &sD, 0, 0);
+//	dtrmm_rlnn_libstr(n, n, alpha, &sA, 0, 0, &sB, 0, 0, &sD, 0, 0);
+//	dgese_libstr(n, n, 0.0/0.0, &sD, 0, 0);
+//	kernel_dgemm_nt_4x8_lib4(n, &alpha, sA.pA, sB.pA, sB.cn, &beta, sC.pA, sD.pA);
+//	kernel_dgemm_nn_4x8_lib4(n, &alpha, sA.pA, 0, sB.pA, sB.cn, &beta, sC.pA, sD.pA);
+//	kernel_dsyrk_nt_l_4x4_gen_lib4(n, &alpha, sA.pA, sB.pA, &beta, 0, sC.pA, sC.cn, 3, sD.pA, sD.cn, 0, 4, 0, 4);
+//	kernel_dsyrk_nt_l_8x4_gen_lib4(n, &alpha, sA.pA, sA.cn, sB.pA, &beta, 0, sC.pA, sC.cn, 3, sD.pA, sD.cn, 0, 8, 0, 8);
+//	dsyrk_ln_libstr(10, 10, n, 1.0, &sA, 0, 0, &sB, 0, 0, 0.0, &sC, 0, 0, &sD, 1, 0);
+//	d_print_strmat(n, n, &sD, 0, 0);
+	dsymv_l_libstr(10, 10, alpha, &sA, 0, 0, &sx0, 0, beta, &sz0, 0, &sz0, 0);
+	dsymv_l_libstr(10, 10, alpha, &sA, 0, 0, &sx1, 0, beta, &sz1, 0, &sz1, 0);
+	dsymv_l_libstr(10, 10, alpha, &sA, 0, 0, &sx2, 0, beta, &sz2, 0, &sz2, 0);
+	dsymv_l_libstr(10, 10, alpha, &sA, 0, 0, &sx3, 0, beta, &sz3, 0, &sz3, 0);
+	dsymv_l_libstr(10, 10, alpha, &sA, 0, 0, &sx4, 0, beta, &sz4, 0, &sz4, 0);
+	dsymv_l_libstr(10, 10, alpha, &sA, 0, 0, &sx5, 0, beta, &sz5, 0, &sz5, 0);
+	dsymv_l_libstr(10, 10, alpha, &sA, 0, 0, &sx6, 0, beta, &sz6, 0, &sz6, 0);
+	dsymv_l_libstr(10, 10, alpha, &sA, 0, 0, &sx7, 0, beta, &sz7, 0, &sz7, 0);
+	dsymv_l_libstr(10, 10, alpha, &sA, 0, 0, &sx8, 0, beta, &sz8, 0, &sz8, 0);
+	dsymv_l_libstr(10, 10, alpha, &sA, 0, 0, &sx9, 0, beta, &sz9, 0, &sz9, 0);
+	d_print_tran_strvec(n, &sz0, 0);
+	d_print_tran_strvec(n, &sz1, 0);
+	d_print_tran_strvec(n, &sz2, 0);
+	d_print_tran_strvec(n, &sz3, 0);
+	d_print_tran_strvec(n, &sz4, 0);
+	d_print_tran_strvec(n, &sz5, 0);
+	d_print_tran_strvec(n, &sz6, 0);
+	d_print_tran_strvec(n, &sz7, 0);
+	d_print_tran_strvec(n, &sz8, 0);
+	d_print_tran_strvec(n, &sz9, 0);
+	return 0;
+
+//	d_print_strmat(n, n, &sC, 0, 0);
+//	dgese_libstr(n, n, 1.0, &sB, 0, 0);
+//	kernel_dger4_sub_4_lib4(6, sB.pA, sA.pA, sC.pA);
+//	kernel_dger4_sub_4_vs_lib4(6, sB.pA, sA.pA, sC.pA, 1);
+	return 0;
+
+//	d_print_strmat(n, n, &sC, 0, 0);
+//	dgese_libstr(n, n, 1.0, &sB, 0, 0);
+//	kernel_dger4_sub_4_lib4(6, sB.pA, sA.pA, sC.pA);
+//	kernel_dger4_sub_4_vs_lib4(6, sB.pA, sA.pA, sC.pA, 1);
+//	kernel_dger4_sub_8_lib4(5, sB.pA, sB.cn, sA.pA, sC.pA, sC.cn);
+//	kernel_dger4_sub_8_vs_lib4(5, sB.pA, sB.cn, sA.pA, sC.pA, sC.cn, 5);
+//	kernel_dger4_sub_12_lib4(5, sB.pA, sB.cn, sA.pA, sC.pA, sC.cn);
+//	kernel_dger4_sub_12_vs_lib4(5, sB.pA, sB.cn, sA.pA, sC.pA, sC.cn, 9);
+//	kernel_dger4_sub_8c_lib4(9, sB.pA, sA.cn, sA.pA, sC.pA, sC.cn);
+//	kernel_dger4_sub_4c_lib4(9, sB.pA, sA.cn, sA.pA, sC.pA, sC.cn);
+//	d_print_strmat(n, n, &sC, 0, 0);
+//	return 0;
+
+#if 1
+	dgemm_nt_libstr(n, n, n, 1.0, &sA, 0, 0, &sA, 0, 0, 1.0, &sB, 0, 0, &sC, 0, 0);
+#else
+	dgese_libstr(n, n, 0.1, &sC, 0, 0);
+	DMATEL_LIBSTR(&sC, 0, 0) = 1.0;
+//	DMATEL_LIBSTR(&sC, 0, 1) = 1.0;
+	for(ii=1; ii<n-1; ii++)
+		{
+//		DMATEL_LIBSTR(&sC, ii, ii-1) = 1.0;
+		DMATEL_LIBSTR(&sC, ii, ii) = 1.0;
+//		DMATEL_LIBSTR(&sC, ii, ii+1) = 1.0;
+		}
+//	DMATEL_LIBSTR(&sC, n-1, n-2) = 1.0;
+	DMATEL_LIBSTR(&sC, n-1, n-1) = 1.0;
+#endif
+	d_print_strmat(n, n, &sC, 0, 0);
+	dgese_libstr(n, n, 0.0/0.0, &sD, 0, 0);
+//	d_print_strmat(n, n, &sA, 0, 0);
+//	dgein1_libstr(12.0, &sA, 0, 0);
+//	DMATEL_LIBSTR(&sA, 0, 0) =   12.0;
+//	DMATEL_LIBSTR(&sA, 1, 0) =    6.0;
+//	DMATEL_LIBSTR(&sA, 2, 0) = -  4.0;
+//	DMATEL_LIBSTR(&sA, 0, 1) = - 51.0;
+//	DMATEL_LIBSTR(&sA, 1, 1) =  167.0;
+//	DMATEL_LIBSTR(&sA, 2, 1) =   24.0;
+//	DMATEL_LIBSTR(&sA, 0, 2) =    4.0;
+//	DMATEL_LIBSTR(&sA, 1, 2) = - 68.0;
+//	DMATEL_LIBSTR(&sA, 2, 2) = - 41.0;
+//	d_print_strmat(n, n, &sA, 0, 0);
+	d_print_strmat(n, n, &sC, 0, 0);
+//	printf("\n%f\n", DGEEL_LIBSTR(&sA, 0, 0));
+//	int qr_work_size = dgeqrf_work_size_libstr(n, n);
+	int qr_work_size = dgelqf_work_size_libstr(n, n);
+	void *qr_work;
+	v_zeros_align(&qr_work, qr_work_size);
+//	dgeqrf_libstr(10, 10, &sC, 0, 0, &sD, 0, 0, qr_work);
+	dgelqf_libstr(17, 17, &sC, 0, 0, &sD, 0, 0, qr_work);
+//	dgecp_libstr(10, 10, &sC, 0, 0, &sD, 0, 0);
+//	kernel_dgeqrf_4_lib4(16, 12, sD.pA, sD.cn, sD.dA, qr_work);
+//	d_print_strmat(n, n, &sA, 0, 0);
+//	kernel_dgeqrf_vs_lib4(10, 16, 0, sD.pA+0, sD.cn, sD.dA);
+//	kernel_dgelqf_vs_lib4(10, 10, 10, 0, sD.pA+0, sD.cn, sD.dA);
+	d_print_strmat(n, n, &sD, 0, 0);
+	free(qr_work);
+	return 0;
+
+//	dveccl_mask_libstr(n, &svm, 0, &sv, 0, &svp, 0, &sv, 0, &sm, 0);
+//	veccl_libstr(n, &svm, 0, &sv, 0, &svp, 0, &sv, 0);
+//	d_print_tran_strvec(12, &sv, 0);
+//	d_print_tran_strvec(12, &sm, 0);
+//	dvecze_libstr(n, &sm, 0, &sr, 0, &sr, 0);
+//	d_print_tran_strvec(12, &sr, 0);
+//	return 0;
+
+//	d_print_strmat(n, n, &sA, 0, 0);
+//	dtrsv_unn_libstr(n, &sA, 1, 0, &sx0, 0, &sz0, 0);
+//	d_print_tran_strvec(n, &sz0, 0);
+//	dtrsv_unn_libstr(n, &sA, 1, 0, &sx1, 0, &sz1, 0);
+//	d_print_tran_strvec(n, &sz1, 0);
+//	dtrsv_unn_libstr(n, &sA, 1, 0, &sx2, 0, &sz2, 0);
+//	d_print_tran_strvec(n, &sz2, 0);
+//	dtrsv_unn_libstr(n, &sA, 1, 0, &sx3, 0, &sz3, 0);
+//	d_print_tran_strvec(n, &sz3, 0);
+//	return 0;
+
+//	double alpha = 1.0;
+//	double beta = 1.0;
+//	kernel_dgemm_nt_4x12_vs_lib4(n, &alpha, sA.pA, sB.pA, sB.cn, &beta, sD.pA, sD.pA, 3, 10);
+//	kernel_dgemm_nt_8x8u_vs_lib4(n, &alpha, sA.pA, sA.cn, sB.pA, sB.cn, &beta, sD.pA, sD.cn, sD.pA, sD.cn, 7, 6);
+	dgemm_nn_libstr(n, n, n, 1.0, &sA, 0, 0, &sA, 0, 0, 1.0, &sB, 0, 0, &sD, 0, 0);
+	d_print_strmat(n, n, &sD, 0, 0);
+	dpotrf_l_libstr(16, &sD, 0, 0, &sD, 0, 0);
+	d_print_strmat(n, n, &sD, 0, 0);
+	return 0;;
+
+//	dmatse_libstr(n, n, 100.0, &sD, 0, 0);
+
+//	for(ii=0; ii<n; ii++)
+//		dvecin1_libstr(ii+1, &sx_n, ii);
+//	d_print_tran_strvec(n, &sx_n, 0);
+//	d_print_strmat(n, n, &sD, 0, 0);
+//	// ddiain_libstr(4, -1.0, &sx_n, 1, &sD, 3, 2);
+//	ddiaad_libstr(4, -1.0, &sx_n, 1, &sD, 3, 2);
+//	d_print_strmat(n, n, &sD, 0, 0);
+//	return 0;
+
+//	d_print_tran_strvec(n, &sx_n, 0);
+//	dgemm_l_diag_libstr(n, n, 1.0, &sx_n, 0, &sA, 0, 0, 0.0, &sD, 0, 0, &sD, 0, 0);
+//	dgemm_r_diag_libstr(n, n, 1.0, &sA, 0, 0, &sx_n, 0, 0.0, &sD, 0, 0, &sD, 0, 0);
+//	d_print_strmat(n, n, &sD, 0, 0);
+//	exit(1);
+
+//	dsetmat_libstr(n, n, 0.0, &sD, 0, 0);
+//	dmatin1_libstr(2.0, &sD, 0, 0);
+//	dmatin1_libstr(2.0, &sD, 1, 1);
+//	dmatin1_libstr(2.0, &sD, 2, 2);
+//	dmatin1_libstr(1.0, &sD, 1, 0);
+//	dmatin1_libstr(1.0, &sD, 2, 1);
+//	dmatin1_libstr(0.5, &sD, 2, 0);
+//	d_print_strmat(n, n, &sD, 0, 0);
+//	d_print_tran_strvec(n, &sx_n, 0);
+//	dtrsv_lnn_libstr(n, n, &sD, 0, 0, &sx_n, 0, &sz_n, 0);
+//	d_print_tran_strvec(n, &sz_n, 0);
+//	exit(1);
+
+//	dgemm_nt_libstr(8, 8, 8, 1.0, &sB, 0, 0, &sA, 1, 0, 0.0, &sD, 0, 0, &sD, 0, 0);
+//	d_print_strmat(n, n, &sD, 0, 0);
+//	return 0;
+
+//	double alpha = 1.0;
+//	kernel_dtrmm_nn_rl_4x4_gen_lib4(7, &alpha, sB.pA, 2, sA.pA, sA.cn, 1, sD.pA, sD.cn, 0, 4, 1, 4);
+//	kernel_dtrmm_nn_rl_4x4_gen_lib4(7, &alpha, sB.pA+sB.cn*4, 2, sA.pA, sA.cn, 1, sD.pA+sD.cn*4, sD.cn, 0, 4, 1, 4);
+//	kernel_dtrmm_nn_rl_4x4_lib4(4, &alpha, sB.pA, sA.pA, sA.cn+4*4, sD.pA+4*4);
+//	kernel_dtrmm_nn_rl_4x4_gen_lib4(3, &alpha, sB.pA+sB.cn*4+4*4, 2, sA.pA+sB.cn*4+4*4, sA.cn, 1, sD.pA+sD.cn*4+4*4, sD.cn, 0, 4, 0, 4);
+	dtrmm_rlnn_libstr(8, 8, 1.0, &sB, 0, 0, &sA, 3, 0, &sD, 2, 1);
+	d_print_strmat(n, n, &sD, 0, 0);
+	return 0;
+
+	dtrmv_lnn_libstr(8, 8, &sA, 0, 0, &sx0, 0, &sx0, 0);
+	dtrmv_lnn_libstr(8, 8, &sA, 0, 0, &sx1, 0, &sx1, 0);
+	dtrmv_lnn_libstr(8, 8, &sA, 0, 0, &sx2, 0, &sx2, 0);
+	dtrmv_lnn_libstr(8, 8, &sA, 0, 0, &sx3, 0, &sx3, 0);
+	dtrmv_lnn_libstr(8, 8, &sA, 0, 0, &sx4, 0, &sx4, 0);
+	dtrmv_lnn_libstr(8, 8, &sA, 0, 0, &sx5, 0, &sx5, 0);
+	dtrmv_lnn_libstr(8, 8, &sA, 0, 0, &sx6, 0, &sx6, 0);
+	dtrmv_lnn_libstr(8, 8, &sA, 0, 0, &sx7, 0, &sx7, 0);
+	dtrmv_lnn_libstr(8, 8, &sA, 0, 0, &sx8, 0, &sx8, 0);
+	dtrmv_lnn_libstr(8, 8, &sA, 0, 0, &sx9, 0, &sx9, 0);
+	d_print_tran_strvec(n, &sx0, 0);
+	d_print_tran_strvec(n, &sx1, 0);
+	d_print_tran_strvec(n, &sx2, 0);
+	d_print_tran_strvec(n, &sx3, 0);
+	d_print_tran_strvec(n, &sx4, 0);
+	d_print_tran_strvec(n, &sx5, 0);
+	d_print_tran_strvec(n, &sx6, 0);
+	d_print_tran_strvec(n, &sx7, 0);
+	d_print_tran_strvec(n, &sx8, 0);
+	d_print_tran_strvec(n, &sx9, 0);
+	return 0;
+
+	dgemv_t_libstr(2, 8, 1.0, &sA, 2, 0, &sx_n, 0, 0.0, &sy_n, 0, &sz_n, 0);
+	d_print_tran_strvec(n, &sz_n, 0);
+	return 0;
+
+	dgemm_nt_libstr(4, 8, 8, 1.0, &sB, 0, 0, &sA, 0, 0, 0.0, &sB, 0, 0, &sD, 3, 0);
+//	d_print_strmat(n, n, &sB, 0, 0);
+	d_print_strmat(n, n, &sD, 0, 0);
+	exit(1);
+
+	dpotrf_l_libstr(n, &sD, 0, 0, &sD, 0, 0);
+//	dgetrf_nopivot_libstr(n, n, &sD, 0, 0, &sD, 0, 0);
+//	dgetrf_libstr(n, n, &sD, 0, 0, &sD, 0, 0, ipiv);
+	d_print_strmat(n, n, &sD, 0, 0);
+#if defined(LA_HIGH_PERFORMANCE) | defined(LA_REFERENCE)
+	d_print_mat(1, n, sD.dA, 1);
+#endif
+	int_print_mat(1, n, ipiv, 1);
+	dtrsm_rltn_libstr(n, n, 1.0, &sD, 0, 0, &sB, 0, 0, &sE, 0, 0);
+	d_print_strmat(n, n, &sE, 0, 0);
+	exit(1);
+
+#if 1 // solve P L U X = P B
+	d_print_strmat(n, n, &sB, 0, 0);
+	drowpe_libstr(n, ipiv, &sB);
+	d_print_strmat(n, n, &sB, 0, 0);
+
+	dtrsm_llnu_libstr(n, n, 1.0, &sD, 0, 0, &sB, 0, 0, &sE, 0, 0);
+	d_print_strmat(n, n, &sE, 0, 0);
+	dtrsm_lunn_libstr(n, n, 1.0, &sD, 0, 0, &sE, 0, 0, &sE, 0, 0);
+	d_print_strmat(n, n, &sE, 0, 0);
+#else // solve X^T (P L U)^T = B^T P^T
+	d_print_strmat(n, n, &sB, 0, 0);
+	dcolpe_libstr(n, ipiv, &sB);
+	d_print_strmat(n, n, &sB, 0, 0);
+
+	dtrsm_rltu_libstr(n, n, 1.0, &sD, 0, 0, &sB, 0, 0, &sE, 0, 0);
+	d_print_strmat(n, n, &sE, 0, 0);
+	dtrsm_rutn_libstr(n, n, 1.0, &sD, 0, 0, &sE, 0, 0, &sE, 0, 0);
+	d_print_strmat(n, n, &sE, 0, 0);
+#endif
+
+//	d_print_strmat(n, n, &sA, 0, 0);
+//	d_print_strmat(n, n, &sB, 0, 0);
+//	d_print_strmat(n, n, &sD, 0, 0);
+//	d_print_strmat(n, n, &sE, 0, 0);
+
+//	d_cvt_strmat2mat(n, n, &sE, 0, 0, C, n);
+//	d_print_mat(n, n, C, n);
+
+	dtrtr_u_libstr(6, &sE, 2, 0, &sB, 1, 0);
+	d_print_strmat(n, n, &sB, 0, 0);
+
+	d_print_strmat(n, n, &sA, 0, 0);
+	dgemv_nt_libstr(6, n, 1.0, 1.0, &sA, 0, 0, &sx_n, 0, &sx_t, 0, 0.0, 0.0, &sy_n, 0, &sy_t, 0, &sz_n, 0, &sz_t, 0);
+//	dsymv_l_libstr(5, 5, 1.0, &sA, 0, 0, x_n, 0.0, y_n, z_n);
+	d_print_mat(1, n, z_n, 1);
+	d_print_mat(1, n, z_t, 1);
+
+
+
+
+//	for(ii=0; ii<sE.pm*sE.cn; ii++) sE.pA[ii] = 0.0;
+//	double alpha = 0.0;
+//	double beta = 1.0;
+//	kernel_dgemm_nt_4x4_gen_lib4(4, &alpha, sA.pA, sB.pA, &beta, 3, sA.pA, sA.cn, 0, sE.pA, sE.cn, 0, 4, 2, 2);
+//	d_print_strmat(n, n, &sE, 0, 0);
+
+	// free memory
+	free(A);
+	free(B);
+	free(C);
+	free(D);
+	free(ipiv);
+//	d_free_strmat(&sA);
+//	d_free_strmat(&sB);
+//	d_free_strmat(&sD);
+	v_free_align(memory_strmat);
+
+	return 0;
+
+	}
diff --git a/test_problems/test_s_strmat.c b/test_problems/test_s_strmat.c
new file mode 100644
index 0000000..456db87
--- /dev/null
+++ b/test_problems/test_s_strmat.c
@@ -0,0 +1,191 @@
+/**************************************************************************************************
+*                                                                                                 *
+* This file is part of BLASFEO.                                                                   *
+*                                                                                                 *
+* BLASFEO -- BLAS For Embedded Optimization.                                                      *
+* Copyright (C) 2016-2017 by Gianluca Frison.                                                     *
+* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl.              *
+* All rights reserved.                                                                            *
+*                                                                                                 *
+* HPMPC is free software; you can redistribute it and/or                                          *
+* modify it under the terms of the GNU Lesser General Public                                      *
+* License as published by the Free Software Foundation; either                                    *
+* version 2.1 of the License, or (at your option) any later version.                              *
+*                                                                                                 *
+* HPMPC is distributed in the hope that it will be useful,                                        *
+* but WITHOUT ANY WARRANTY; without even the implied warranty of                                  *
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.                                            *
+* See the GNU Lesser General Public License for more details.                                     *
+*                                                                                                 *
+* You should have received a copy of the GNU Lesser General Public                                *
+* License along with HPMPC; if not, write to the Free Software                                    *
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA                  *
+*                                                                                                 *
+* Author: Gianluca Frison, giaf (at) dtu.dk                                                       *
+*                          gianluca.frison (at) imtek.uni-freiburg.de                             *
+*                                                                                                 *
+**************************************************************************************************/
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <sys/time.h>
+
+#include "../include/blasfeo_common.h"
+#include "../include/blasfeo_i_aux_ext_dep.h"
+#include "../include/blasfeo_s_aux_ext_dep.h"
+#include "../include/blasfeo_s_aux.h"
+#include "../include/blasfeo_s_kernel.h"
+#include "../include/blasfeo_s_blas.h"
+
+
+int main()
+	{
+
+#if defined(LA_HIGH_PERFORMANCE)
+
+	printf("\nLA provided by HIGH_PERFORMANCE\n\n");
+
+#elif defined(LA_REFERENCE)
+
+	printf("\nLA provided by REFERENCE\n\n");
+
+#elif defined(LA_BLAS)
+
+	printf("\nLA provided by BLAS\n\n");
+
+#else
+
+	printf("\nLA provided by ???\n\n");
+	exit(2);
+
+#endif
+
+	int ii, jj;
+
+	int n = 16;
+
+	//
+	// matrices in column-major format
+	//
+	float *A; s_zeros(&A, n, n);
+	for(ii=0; ii<n*n; ii++) A[ii] = ii;
+//	for(jj=0; jj<n; jj++)
+//		for(ii=0; ii<jj; ii++)
+//			A[ii+n*jj] = 0.0/0.0;
+//	s_print_mat(n, n, A, n);
+
+	float *B; s_zeros(&B, n, n);
+	for(ii=0; ii<n; ii++) B[ii*(n+1)] = 1.0;
+//	s_print_mat(n, n, B, n);
+
+	float *D; s_zeros(&D, n, n);
+	for(ii=0; ii<n*n; ii++) D[ii] = -1.0;
+//	s_print_mat(n, n, B, n);
+
+
+	//
+	// matrices in matrix struct format
+	//
+
+	struct s_strmat sA;
+	s_allocate_strmat(n, n, &sA);
+	s_cvt_mat2strmat(n, n, A, n, &sA, 0, 0);
+	s_print_strmat(n, n, &sA, 0, 0);
+
+	struct s_strmat sB;
+	s_allocate_strmat(n, n, &sB);
+	s_cvt_mat2strmat(n, n, B, n, &sB, 0, 0);
+	s_print_strmat(n, n, &sB, 0, 0);
+
+	struct s_strmat sD;
+	s_allocate_strmat(n, n, &sD);
+	s_cvt_mat2strmat(n, n, D, n, &sD, 0, 0);
+
+	struct s_strvec sx;
+	s_allocate_strvec(n, &sx);
+	sx.pa[7] = 1.0;
+	s_print_tran_strvec(n, &sx, 0);
+
+	struct s_strvec sz0;
+	s_allocate_strvec(n, &sz0);
+
+	struct s_strvec sz1;
+	s_allocate_strvec(n, &sz1);
+
+	//
+	// tests
+	//
+
+	float alpha = 1.0;
+	float beta = 0.0;
+//	kernel_sgemm_nt_24x4_lib8(4, &alpha, sA.pA, sA.cn, sB.pA, &beta, sD.pA, sD.cn, sD.pA, sD.cn);
+//	kernel_sgemm_nt_16x4_lib8(4, &alpha, sA.pA, sA.cn, sB.pA, &beta, sD.pA, sD.cn, sD.pA, sD.cn);
+//	kernel_sgemm_nt_8x8_lib8(5, &alpha, sA.pA, sB.pA, &beta, sD.pA, sD.pA);
+//	kernel_sgemm_nt_8x4_lib8(5, &alpha, sA.pA, sB.pA, &beta, sD.pA, sD.pA);
+//	kernel_sgemm_nt_4x8_gen_lib8(8, &alpha, sA.pA, sB.pA, &beta, 0, sD.pA, sD.cn, 0, sD.pA, sD.cn, 0, 4, 0, 8);
+//	kernel_sgemm_nt_8x4_vs_lib8(8, &alpha, sA.pA, sB.pA, &beta, sD.pA, sD.pA, 7, 4);
+//	kernel_sgemm_nt_8x4_lib8(8, &alpha, sB.pA, sA.pA+4, &beta, sA.pA+4*8, sD.pA+4*8);
+//	kernel_sgemm_nn_16x4_lib8(4, &alpha, sA.pA, sA.cn, 0, sB.pA, sB.cn, &beta, sD.pA, sD.cn, sD.pA, sD.cn);
+//	kernel_sgemm_nt_12x4_lib4(4, &alpha, sA.pA, sA.cn, sB.pA, &beta, sD.pA, sD.cn, sD.pA, sD.cn);
+//	kernel_sgemm_nt_8x8_lib4(8, &alpha, sA.pA, sA.cn, sB.pA, sB.cn, &beta, sD.pA, sD.cn, sD.pA, sD.cn);
+//	kernel_sgemm_nt_8x4_lib4(2, &alpha, sA.pA, sA.cn, sB.pA, &beta, sD.pA, sD.cn, sD.pA, sD.cn);
+//	s_print_strmat(n, n, &sD, 0, 0);
+//	return 0;
+//	sgemm_nt_libstr(n, n, 5, 1.0, &sA, 0, 0, &sB, 0, 0, 0.0, &sB, 0, 0, &sD, 0, 0);
+//	ssyrk_ln_libstr(n, n, 1.0, &sA, 0, 0, &sB, 0, 0, 0.0, &sB, 0, 0, &sD, 0, 0);
+//	ssyrk_ln_mn_libstr(n, n, n, 1.0, &sA, 0, 0, &sB, 0, 0, 0.0, &sB, 0, 0, &sD, 0, 0);
+//	kernel_ssyrk_nt_l_8x8_lib8(n, &alpha, sA.pA, sA.pA, &beta, sB.pA, sD.pA);
+//	sgecp_libstr(16, 16, &sA, 2, 0, &sD, 1, 0);
+//	sgetr_libstr(16, 16, &sA, 2, 0, &sD, 2, 0);
+//	s_print_strmat(n, n, &sD, 0, 0);
+//	sgemv_n_libstr(6, 6, 1.0, &sA, 1, 0, &sx, 0, 0.0, &sz0, 0, &sz0, 0);
+//	sgemv_t_libstr(11, 8, 1.0, &sA, 0, 0, &sx, 0, 0.0, &sz0, 0, &sz0, 0);
+//	strmv_lnn_libstr(6, 6, &sA, 1, 0, &sx, 0, &sz0, 0);
+//	strmv_ltn_libstr(10, 10, &sA, 1, 0, &sx, 0, &sz0, 0);
+//	sA.pA[0] = 1.0;
+//	strsv_lnn_libstr(10, &sA, 0, 0, &sx, 0, &sz0, 0);
+//	for(ii=0; ii<8; ii++) sA.dA[ii] = 1.0/sgeex1_libstr(&sA, ii, ii);
+//	kernel_strsv_lt_inv_8_lib8(0, sA.pA, sA.cn, sA.dA, sx.pa, sx.pa, sz0.pa);
+//	kernel_strsv_lt_inv_8_vs_lib8(0, sA.pA, sA.cn, sA.dA, sx.pa, sx.pa, sz0.pa, 3);
+//	s_print_strmat(n, n, &sA, 0, 0);
+//	strsv_ltn_libstr(12, &sA, 0, 0, &sx, 0, &sz0, 0);
+//	strsv_ltn_mn_libstr(11, 3, &sA, 0, 0, &sx, 0, &sz0, 0);
+//	s_print_strmat(n, n, &sA, 0, 0);
+//	kernel_sgemv_nt_4_lib8(n, &alpha, &alpha, sA.pA, sA.cn, sx.pa, sx.pa, &beta, sz1.pa, sz0.pa, sz1.pa);
+//	kernel_sgemv_nt_4_vs_lib8(n, &alpha, &alpha, sA.pA, sA.cn, sx.pa, sx.pa, &beta, sz1.pa, sz0.pa, sz1.pa, 3);
+//	sgemv_nt_libstr(5, 2, alpha, alpha, &sA, 0, 0, &sx, 0, &sx, 0, beta, beta, &sz0, 0, &sz1, 0, &sz0, 0, &sz1, 0);
+//	ssymv_l_libstr(10, 10, alpha, &sA, 1, 0, &sx, 0, beta, &sz0, 0, &sz1, 0);
+//	s_print_tran_strvec(n, &sz0, 0);
+//	s_print_tran_strvec(n, &sz1, 0);
+//	return 0;
+//	sgesc_libstr(16, 9, 2.0, &sD, 0, 0);
+//	s_print_strmat(n, n, &sD, 0, 0);
+//	kernel_spotrf_nt_l_8x8_lib8(0, sD.pA, sD.pA, sD.pA, sD.pA, sx.pa);
+//	s_print_strmat(n, n, &sD, 0, 0);
+//	s_print_tran_strvec(n, &sx, 0);
+//	kernel_strsm_nt_rl_inv_8x8_lib8(0, sD.pA, sD.pA, sD.pA+8*sD.cn, sD.pA+8*sD.cn, sD.pA, sx.pa);
+//	s_print_strmat(n, n, &sD, 0, 0);
+//	kernel_spotrf_nt_l_8x8_lib8(8, sD.pA+8*sD.cn, sD.pA+8*sD.cn, sD.pA+8*sD.cn+8*8, sD.pA+8*sD.cn+8*8, sx.pa+8);
+//	spotrf_l_mn_libstr(23, 17, &sD, 0, 0, &sD, 0, 0);
+//	spotrf_l_libstr(n, &sD, 0, 0, &sD, 0, 0);
+//	kernel_strmm_nn_rl_8x4_lib8(3, &alpha, sB.pA, 7, sA.pA, sA.cn, sD.pA);
+	strmm_rlnn_libstr(12, 8, 1.0, &sA, 0, 0, &sB, 0, 0, &sD, 0, 0);
+	s_print_strmat(n, n, &sD, 0, 0);
+	return 0;
+
+
+
+	//
+	// free memory
+	//
+
+	free(A);
+	free(B);
+	free(D);
+	s_free_strmat(&sA);
+	s_free_strmat(&sB);
+	s_free_strmat(&sD);
+
+	return 0;
+	
+	}