Squashed 'third_party/blasfeo/' content from commit 2a828ca
Change-Id: If1c3caa4799b2d4eb287ef83fa17043587ef07a3
git-subtree-dir: third_party/blasfeo
git-subtree-split: 2a828ca5442108c4c58e4b42b061a0469043f6ea
diff --git a/test_problems/CMakeLists.txt b/test_problems/CMakeLists.txt
new file mode 100644
index 0000000..77becb1
--- /dev/null
+++ b/test_problems/CMakeLists.txt
@@ -0,0 +1,32 @@
+###################################################################################################
+# #
+# This file is part of HPIPM. #
+# #
+# HPIPM -- High Performance Interior Point Method. #
+# Copyright (C) 2017 by Gianluca Frison. #
+# Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl. #
+# All rights reserved. #
+# #
+# HPMPC is free software; you can redistribute it and/or #
+# modify it under the terms of the GNU Lesser General Public #
+# License as published by the Free Software Foundation; either #
+# version 2.1 of the License, or (at your option) any later version. #
+# #
+# HPMPC is distributed in the hope that it will be useful, #
+# but WITHOUT ANY WARRANTY; without even the implied warranty of #
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. #
+# See the GNU Lesser General Public License for more details. #
+# #
+# You should have received a copy of the GNU Lesser General Public #
+# License along with HPMPC; if not, write to the Free Software #
+# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA #
+# #
+# Author: Gianluca Frison, gianluca.frison (at) imtek.uni-freiburg.de #
+# #
+###################################################################################################
+
+add_executable(d_blas test_blas_d.c)
+target_link_libraries(d_blas blasfeo m)
+
+add_executable(s_blas test_blas_s.c)
+target_link_libraries(s_blas blasfeo m)
diff --git a/test_problems/Makefile b/test_problems/Makefile
new file mode 100644
index 0000000..f2e4741
--- /dev/null
+++ b/test_problems/Makefile
@@ -0,0 +1,67 @@
+###################################################################################################
+# #
+# This file is part of BLASFEO. #
+# #
+# BLASFEO -- BLAS For Embedded Optimization. #
+# Copyright (C) 2016-2017 by Gianluca Frison. #
+# Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl. #
+# All rights reserved. #
+# #
+# HPMPC is free software; you can redistribute it and/or #
+# modify it under the terms of the GNU Lesser General Public #
+# License as published by the Free Software Foundation; either #
+# version 2.1 of the License, or (at your option) any later version. #
+# #
+# HPMPC is distributed in the hope that it will be useful, #
+# but WITHOUT ANY WARRANTY; without even the implied warranty of #
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. #
+# See the GNU Lesser General Public License for more details. #
+# #
+# You should have received a copy of the GNU Lesser General Public #
+# License along with HPMPC; if not, write to the Free Software #
+# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA #
+# #
+# Author: Gianluca Frison, giaf (at) dtu.dk #
+# gianluca.frison (at) imtek.uni-freiburg.de #
+# #
+###################################################################################################
+
+include ../Makefile.rule
+
+ifeq ($(REF_BLAS), 0)
+LIBS = -lm
+endif
+ifeq ($(REF_BLAS), OPENBLAS)
+LIBS = /opt/openblas/lib/libopenblas.a -pthread -lgfortran -lm
+endif
+ifeq ($(REF_BLAS), BLIS)
+LIBS = /opt/netlib/liblapack.a /opt/blis/lib/libblis.a -lgfortran -lm -fopenmp
+endif
+ifeq ($(REF_BLAS), NETLIB)
+LIBS = /opt/netlib/liblapack.a /opt/netlib/libblas.a -lgfortran -lm
+endif
+ifeq ($(REF_BLAS), MKL)
+LIBS = -Wl,--start-group /opt/intel/mkl/lib/intel64/libmkl_gf_lp64.a /opt/intel/mkl/lib/intel64/libmkl_core.a /opt/intel/mkl/lib/intel64/libmkl_sequential.a -Wl,--end-group -ldl -lpthread -lm
+endif
+ifeq ($(REF_BLAS), ATLAS)
+LIBS = /opt/atlas/lib/liblapack.a /opt/atlas/lib/libcblas.a /opt/atlas/lib/libf77blas.a /opt/atlas/lib/libatlas.a -lgfortran -lm
+endif
+
+#ifneq ($(NUM_THREAD), 1)
+#LIBS += -pthread
+#endif
+
+OBJS_TEST = test_blas_d.o
+#OBJS_TEST = test_blas_s.o
+#OBJS_TEST = test_d_strmat.o
+#OBJS_TEST = test_s_strmat.o
+#OBJS_TEST = kernel_assembly.o test_assembly.o
+
+obj: $(OBJS_TEST)
+ $(CC) -o test.out $(OBJS_TEST) -L. libblasfeo.a $(LIBS) #-pg
+
+clean:
+ rm -f *.o
+ rm -f test.out
+ rm -f libblasfeo.a
+
diff --git a/test_problems/cpu_freq.h b/test_problems/cpu_freq.h
new file mode 100644
index 0000000..30320fc
--- /dev/null
+++ b/test_problems/cpu_freq.h
@@ -0,0 +1,31 @@
+/**************************************************************************************************
+* *
+* This file is part of BLASFEO. *
+* *
+* BLASFEO -- BLAS For Embedded Optimization. *
+* Copyright (C) 2016-2017 by Gianluca Frison. *
+* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl. *
+* All rights reserved. *
+* *
+* HPMPC is free software; you can redistribute it and/or *
+* modify it under the terms of the GNU Lesser General Public *
+* License as published by the Free Software Foundation; either *
+* version 2.1 of the License, or (at your option) any later version. *
+* *
+* HPMPC is distributed in the hope that it will be useful, *
+* but WITHOUT ANY WARRANTY; without even the implied warranty of *
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. *
+* See the GNU Lesser General Public License for more details. *
+* *
+* You should have received a copy of the GNU Lesser General Public *
+* License along with HPMPC; if not, write to the Free Software *
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA *
+* *
+* Author: Gianluca Frison, giaf (at) dtu.dk *
+* gianluca.frison (at) imtek.uni-freiburg.de *
+* *
+**************************************************************************************************/
+
+#ifndef GHZ_MAX
+#define GHZ_MAX 3.6
+#endif
diff --git a/test_problems/kernel_assembly.S b/test_problems/kernel_assembly.S
new file mode 100644
index 0000000..b393e0d
--- /dev/null
+++ b/test_problems/kernel_assembly.S
@@ -0,0 +1,633 @@
+#if defined(OS_LINUX) | defined(OS_MAC)
+
+//#define STACKSIZE 96
+#define STACKSIZE 64
+#define ARG1 %rdi
+#define ARG2 %rsi
+#define ARG3 %rdx
+#define ARG4 %rcx
+#define ARG5 %r8
+#define ARG6 %r9
+#define ARG7 STACKSIZE + 8(%rsp)
+#define ARG8 STACKSIZE + 16(%rsp)
+#define ARG9 STACKSIZE + 24(%rsp)
+#define ARG10 STACKSIZE + 32(%rsp)
+#define ARG11 STACKSIZE + 40(%rsp)
+#define ARG12 STACKSIZE + 48(%rsp)
+#define ARG13 STACKSIZE + 56(%rsp)
+#define ARG14 STACKSIZE + 64(%rsp)
+#define ARG15 STACKSIZE + 72(%rsp)
+#define ARG16 STACKSIZE + 80(%rsp)
+#define ARG17 STACKSIZE + 88(%rsp)
+#define ARG18 STACKSIZE + 96(%rsp)
+
+#elif defined(OS_WINDOWS)
+
+#define STACKSIZE 256
+#define ARG1 %rcx
+#define ARG2 %rdx
+#define ARG3 %r8
+#define ARG4 %r9
+#define ARG5 STACKSIZE + 40(%rsp)
+#define ARG6 STACKSIZE + 48(%rsp)
+#define ARG7 STACKSIZE + 56(%rsp)
+#define ARG8 STACKSIZE + 64(%rsp)
+#define ARG9 STACKSIZE + 72(%rsp)
+#define ARG10 STACKSIZE + 80(%rsp)
+#define ARG11 STACKSIZE + 88(%rsp)
+#define ARG12 STACKSIZE + 96(%rsp)
+#define ARG13 STACKSIZE + 104(%rsp)
+#define ARG14 STACKSIZE + 112(%rsp)
+#define ARG15 STACKSIZE + 120(%rsp)
+#define ARG16 STACKSIZE + 128(%rsp)
+#define ARG17 STACKSIZE + 136(%rsp)
+#define ARG18 STACKSIZE + 144(%rsp)
+
+#else
+
+#error wrong OS
+
+#endif
+
+
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ .text
+#elif defined(OS_MAC)
+ .section __TEXT,__text,regular,pure_instructions
+#endif
+
+
+
+// common inner routine with file scope
+//
+// input arguments:
+// r10d <- k
+// r11 <- A
+// r12 <- B
+// ymm0 <- [d00 d11 d22 d33]
+// ymm1 <- [d01 d10 d23 d32]
+// ymm2 <- [d03 d12 d21 d30]
+// ymm3 <- [d02 d13 d20 d31]
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+//
+// output arguments:
+// r10d <- 0
+// r11 <- A+4*k*sizeof(double)
+// r12 <- B+4*k*sizeof(double)
+// ymm0 <- [d00 d11 d22 d33]
+// ymm1 <- [d01 d10 d23 d32]
+// ymm2 <- [d03 d12 d21 d30]
+// ymm3 <- [d02 d13 d20 d31]
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=2
+ .macro INNER_KERNEL_DGEMM_ADD_NT_4X4_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_kernel_dgemm_add_nt_4x4_lib4, @function
+inner_kernel_dgemm_add_nt_4x4_lib4:
+#elif defined(OS_MAC)
+_inner_kernel_dgemm_add_nt_4x4_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_kernel_dgemm_add_nt_4x4_lib4; .scl 2; .type 32; .endef
+inner_kernel_dgemm_add_nt_4x4_lib4:
+#endif
+#endif
+
+ cmpl $0, %r10d
+ jle 2f // return
+
+ // prefetch
+ vmovapd 0(%r11), %ymm8 // A0[0]
+ vmovapd 0(%r12), %ymm12 // B[0]
+
+ cmpl $4, %r10d
+ jle 0f // consider clean-up loop
+
+ // main loop
+ .p2align 3
+1: // main loop
+
+ // unroll 0
+ vmovapd 32(%r12), %ymm13 // B[4]
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm0, %ymm15, %ymm0
+ vshufpd $0x5, %ymm12, %ymm12, %ymm14
+
+ vperm2f128 $0x1, %ymm14, %ymm14, %ymm12
+ vmulpd %ymm8, %ymm14, %ymm15
+ vaddpd %ymm1, %ymm15, %ymm1
+ vmovapd 32(%r11), %ymm10 // A0[4]
+
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm3, %ymm15, %ymm3
+ vshufpd $0x5, %ymm12, %ymm12, %ymm14
+
+ subl $4, %r10d
+ vmulpd %ymm8, %ymm14, %ymm15
+ vaddpd %ymm2, %ymm15, %ymm2
+
+ // unroll 1
+ vmovapd 64(%r12), %ymm12 // B[8]
+ vmulpd %ymm10, %ymm13, %ymm15
+ vaddpd %ymm0, %ymm15, %ymm0
+ vshufpd $0x5, %ymm13, %ymm13, %ymm14
+
+ vperm2f128 $0x1, %ymm14, %ymm14, %ymm13
+ vmulpd %ymm10, %ymm14, %ymm15
+ vaddpd %ymm1, %ymm15, %ymm1
+ vmovapd 64(%r11), %ymm8 // A0[8]
+
+ vmulpd %ymm10, %ymm13, %ymm15
+ vaddpd %ymm3, %ymm15, %ymm3
+ vshufpd $0x5, %ymm13, %ymm13, %ymm14
+
+ vmulpd %ymm10, %ymm14, %ymm15
+ vaddpd %ymm2, %ymm15, %ymm2
+
+ // unroll 2
+ vmovapd 96(%r12), %ymm13 // B[12]
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm0, %ymm15, %ymm0
+ vshufpd $0x5, %ymm12, %ymm12, %ymm14
+
+ vperm2f128 $0x1, %ymm14, %ymm14, %ymm12
+ vmulpd %ymm8, %ymm14, %ymm15
+ vaddpd %ymm1, %ymm15, %ymm1
+ vmovapd 96(%r11), %ymm10 // A0[12]
+
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm3, %ymm15, %ymm3
+ vshufpd $0x5, %ymm12, %ymm12, %ymm14
+ addq $128, %r12
+
+ vmulpd %ymm8, %ymm14, %ymm15
+ vaddpd %ymm2, %ymm15, %ymm2
+ addq $128, %r11
+
+
+ // unroll 3
+ vmovapd 0(%r12), %ymm12 // B[0]
+ vmulpd %ymm10, %ymm13, %ymm15
+ vaddpd %ymm0, %ymm15, %ymm0
+ vshufpd $0x5, %ymm13, %ymm13, %ymm14
+
+ vperm2f128 $0x1, %ymm14, %ymm14, %ymm13
+ vmulpd %ymm10, %ymm14, %ymm15
+ vaddpd %ymm1, %ymm15, %ymm1
+ vmovapd 0(%r11), %ymm8 // A0[0]
+
+ vmulpd %ymm10, %ymm13, %ymm15
+ vaddpd %ymm3, %ymm15, %ymm3
+ vshufpd $0x5, %ymm13, %ymm13, %ymm14
+
+ vmulpd %ymm10, %ymm14, %ymm15
+ vaddpd %ymm2, %ymm15, %ymm2
+
+ cmpl $4, %r10d
+ jg 1b // main loop
+
+
+0: // consider clean4-up
+
+ cmpl $3, %r10d
+ jle 4f // clean1
+
+ // unroll 0
+ vmovapd 32(%r12), %ymm13 // B[4]
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm0, %ymm15, %ymm0
+ vshufpd $0x5, %ymm12, %ymm12, %ymm14
+
+ vperm2f128 $0x1, %ymm14, %ymm14, %ymm12
+ vmulpd %ymm8, %ymm14, %ymm15
+ vaddpd %ymm1, %ymm15, %ymm1
+ vmovapd 32(%r11), %ymm10 // A0[4]
+
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm3, %ymm15, %ymm3
+ vshufpd $0x5, %ymm12, %ymm12, %ymm14
+
+ subl $4, %r10d
+ vmulpd %ymm8, %ymm14, %ymm15
+ vaddpd %ymm2, %ymm15, %ymm2
+
+ // unroll 1
+ vmovapd 64(%r12), %ymm12 // B[8]
+ vmulpd %ymm10, %ymm13, %ymm15
+ vaddpd %ymm0, %ymm15, %ymm0
+ vshufpd $0x5, %ymm13, %ymm13, %ymm14
+
+ vperm2f128 $0x1, %ymm14, %ymm14, %ymm13
+ vmulpd %ymm10, %ymm14, %ymm15
+ vaddpd %ymm1, %ymm15, %ymm1
+ vmovapd 64(%r11), %ymm8 // A0[8]
+
+ vmulpd %ymm10, %ymm13, %ymm15
+ vaddpd %ymm3, %ymm15, %ymm3
+ vshufpd $0x5, %ymm13, %ymm13, %ymm14
+
+ vmulpd %ymm10, %ymm14, %ymm15
+ vaddpd %ymm2, %ymm15, %ymm2
+
+ // unroll 2
+ vmovapd 96(%r12), %ymm13 // B[12]
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm0, %ymm15, %ymm0
+ vshufpd $0x5, %ymm12, %ymm12, %ymm14
+
+ vperm2f128 $0x1, %ymm14, %ymm14, %ymm12
+ vmulpd %ymm8, %ymm14, %ymm15
+ vaddpd %ymm1, %ymm15, %ymm1
+ vmovapd 96(%r11), %ymm10 // A0[12]
+
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm3, %ymm15, %ymm3
+ vshufpd $0x5, %ymm12, %ymm12, %ymm14
+ addq $128, %r12
+
+ vmulpd %ymm8, %ymm14, %ymm15
+ vaddpd %ymm2, %ymm15, %ymm2
+ addq $128, %r11
+
+
+ // unroll 3
+// vmovapd 0(%r12), %ymm12 // B[0]
+ vmulpd %ymm10, %ymm13, %ymm15
+ vaddpd %ymm0, %ymm15, %ymm0
+ vshufpd $0x5, %ymm13, %ymm13, %ymm14
+
+ vperm2f128 $0x1, %ymm14, %ymm14, %ymm13
+ vmulpd %ymm10, %ymm14, %ymm15
+ vaddpd %ymm1, %ymm15, %ymm1
+// vmovapd 0(%r11), %ymm8 // A0[0]
+
+ vmulpd %ymm10, %ymm13, %ymm15
+ vaddpd %ymm3, %ymm15, %ymm3
+ vshufpd $0x5, %ymm13, %ymm13, %ymm14
+
+ vmulpd %ymm10, %ymm14, %ymm15
+ vaddpd %ymm2, %ymm15, %ymm2
+
+
+// cmpl $3, %r10d
+ jmp 2f // return
+
+
+4: // consider clean1-up loop
+
+ cmpl $0, %r10d
+ jle 2f // return
+
+ // clean-up loop
+3: // clean up loop
+
+ vmovapd 0(%r12), %ymm12 // B[0]
+ vmovapd 0(%r11), %ymm8 // A0[0]
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm0, %ymm15, %ymm0
+ addq $32, %r11
+
+ vshufpd $0x5, %ymm12, %ymm12, %ymm14
+ vmulpd %ymm8, %ymm14, %ymm15
+ vaddpd %ymm1, %ymm15, %ymm1
+ addq $32, %r12
+
+ vperm2f128 $0x1, %ymm14, %ymm14, %ymm14
+ vmulpd %ymm8, %ymm14, %ymm15
+ vaddpd %ymm3, %ymm15, %ymm3
+ subl $1, %r10d
+
+ vshufpd $0x5, %ymm14, %ymm14, %ymm14
+ vmulpd %ymm8, %ymm14, %ymm15
+ vaddpd %ymm2, %ymm15, %ymm2
+
+ cmpl $0, %r10d
+ jg 3b // clean up loop
+
+
+2: // return
+
+#if MACRO_LEVEL>=2
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_kernel_dgemm_add_nt_4x4_lib4, .-inner_kernel_dgemm_add_nt_4x4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// blend for generic alpha and beta
+//
+// input arguments:
+// r10 <- alpha
+// r11 <- beta
+// r12 <- C
+// ymm0 <- [d00 d11 d22 d33]
+// ymm1 <- [d01 d10 d23 d32]
+// ymm2 <- [d03 d12 d21 d30]
+// ymm3 <- [d02 d13 d20 d31]
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10 <- alpha
+// r11 <- beta
+// r12 <- C
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=1
+ .macro INNER_BLEND_SCALE_AB_4X4_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_blend_scale_ab_4x4_lib4, @function
+inner_blend_scale_ab_4x4_lib4:
+#elif defined(OS_MAC)
+_inner_blend_scale_ab_4x4_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_blend_scale_ab_4x4_lib4; .scl 2; .type 32; .endef
+inner_blend_scale_ab_4x4_lib4:
+#endif
+#endif
+
+ // alpha
+ vbroadcastsd 0(%r10), %ymm15
+
+ vblendpd $0xa, %ymm1, %ymm0, %ymm8
+ vblendpd $0x5, %ymm1, %ymm0, %ymm9
+ vblendpd $0xa, %ymm3, %ymm2, %ymm10
+ vblendpd $0x5, %ymm3, %ymm2, %ymm11
+
+ vblendpd $0xc, %ymm10, %ymm8, %ymm0
+ vblendpd $0x3, %ymm10, %ymm8, %ymm2
+ vblendpd $0xc, %ymm11, %ymm9, %ymm1
+ vblendpd $0x3, %ymm11, %ymm9, %ymm3
+
+ vmulpd %ymm0, %ymm15, %ymm0
+ vmulpd %ymm1, %ymm15, %ymm1
+ vmulpd %ymm2, %ymm15, %ymm2
+ vmulpd %ymm3, %ymm15, %ymm3
+
+ // beta
+ vbroadcastsd 0(%r11), %ymm14
+
+ vxorpd %ymm15, %ymm15, %ymm15 // 0.0
+
+ vucomisd %xmm15, %xmm14 // beta==0.0 ?
+ je 0f // end
+
+ vmovupd 0(%r12), %ymm15
+ vmulpd %ymm15, %ymm14, %ymm15
+ vaddpd %ymm0, %ymm15, %ymm0
+ vmovupd 32(%r12), %ymm15
+ vmulpd %ymm15, %ymm14, %ymm15
+ vaddpd %ymm1, %ymm15, %ymm1
+ vmovupd 64(%r12), %ymm15
+ vmulpd %ymm15, %ymm14, %ymm15
+ vaddpd %ymm2, %ymm15, %ymm2
+ vmovupd 96(%r12), %ymm15
+ vmulpd %ymm15, %ymm14, %ymm15
+ vaddpd %ymm3, %ymm15, %ymm3
+
+0:
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_blend_scale_ab_4x4_lib4, .-inner_blend_scale_ab_4x4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// store n
+//
+// input arguments:
+// r10 <- D
+// ymm0 <- [d00 d11 d22 d33]
+// ymm1 <- [d01 d10 d23 d32]
+// ymm2 <- [d03 d12 d21 d30]
+// ymm3 <- [d02 d13 d20 d31]
+//
+// output arguments:
+// r10 <- D
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+
+#if MACRO_LEVEL>=1
+ .macro INNER_STORE_4X4_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_store_4x4_lib4, @function
+inner_store_4x4_lib4:
+#elif defined(OS_MAC)
+_inner_store_4x4_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_store_4x4_lib4; .scl 2; .type 32; .endef
+inner_store_4x4_lib4:
+#endif
+#endif
+
+ vmovupd %ymm0, 0(%r10)
+ vmovupd %ymm1, 32(%r10)
+ vmovupd %ymm2, 64(%r10)
+ vmovupd %ymm3, 96(%r10)
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_store_4x4_lib4, .-inner_store_4x4_lib4
+#endif
+#endif
+
+
+
+
+
+// 1 2 3 4 5 6 7
+// void kernel_dgemm_nt_4x4_lib4(int k, double *alpha, double *A, double *B, double *beta, double *C, double *D);
+
+ .p2align 4,,15
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ .globl kernel_dgemm_nt_4x4_lib4_test
+#if defined(OS_LINUX)
+ .type kernel_dgemm_nt_4x4_lib4_test, @function
+#else // OS_WINDOWS
+ .def kernel_dgemm_nt_4x4_lib4_test; .scl 2; .type 32; .endef
+#endif
+kernel_dgemm_nt_4x4_lib4_test:
+#elif defined(OS_MAC)
+ .globl _kernel_dgemm_nt_4x4_lib4_test
+_kernel_dgemm_nt_4x4_lib4_test:
+#endif
+
+ // prologue
+
+ subq $STACKSIZE, %rsp
+ movq %rbx, (%rsp)
+ movq %rbp, 8(%rsp)
+ movq %r12, 16(%rsp)
+ movq %r13, 24(%rsp)
+ movq %r14, 32(%rsp)
+ movq %r15, 40(%rsp)
+#if defined(OS_WINDOWS)
+ movq %rdi, 48(%rsp)
+ movq %rsi, 56(%rsp)
+ vmovups %xmm6, 64(%rsp)
+ vmovups %xmm7, 80(%rsp)
+ vmovups %xmm8, 96(%rsp)
+ vmovups %xmm9, 112(%rsp)
+ vmovups %xmm10, 128(%rsp)
+ vmovups %xmm11, 144(%rsp)
+ vmovups %xmm12, 160(%rsp)
+ vmovups %xmm13, 176(%rsp)
+ vmovups %xmm14, 192(%rsp)
+ vmovups %xmm15, 208(%rsp)
+#endif
+
+ vzeroupper
+
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+
+
+ // call inner dgemm kernel nt
+
+ movq ARG1, %r10 // k
+ movq ARG3, %r11 // A
+ movq ARG4, %r12 // B
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_ADD_NT_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_add_nt_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_add_nt_4x4_lib4
+#endif
+#endif
+
+
+ // call inner blend scale
+
+ movq ARG2, %r10 // alpha
+ movq ARG5, %r11 // beta
+ movq ARG6, %r12 // C
+
+#if MACRO_LEVEL>=1
+ INNER_BLEND_SCALE_AB_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_blend_scale_ab_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_blend_scale_ab_4x4_lib4
+#endif
+#endif
+
+
+ // store n
+
+ movq ARG7, %r10 // D
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_4x4_lib4
+#endif
+#endif
+
+// movq ARG6, %rax
+// movq STACKSIZE + 48(%rsp), %rax
+
+
+ // epilogue
+
+ vzeroupper
+
+ movq (%rsp), %rbx
+ movq 8(%rsp), %rbp
+ movq 16(%rsp), %r12
+ movq 24(%rsp), %r13
+ movq 32(%rsp), %r14
+ movq 40(%rsp), %r15
+#if defined(OS_WINDOWS)
+ movq 48(%rsp), %rdi
+ movq 56(%rsp), %rsi
+ vmovups 64(%rsp), %xmm6
+ vmovups 80(%rsp), %xmm7
+ vmovups 96(%rsp), %xmm8
+ vmovups 112(%rsp), %xmm9
+ vmovups 128(%rsp), %xmm10
+ vmovups 144(%rsp), %xmm11
+ vmovups 160(%rsp), %xmm12
+ vmovups 176(%rsp), %xmm13
+ vmovups 192(%rsp), %xmm14
+ vmovups 208(%rsp), %xmm15
+#endif
+ addq $STACKSIZE, %rsp
+
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dgemm_nt_4x4_lib4_test, .-kernel_dgemm_nt_4x4_lib4_test
+#endif
+
+
diff --git a/test_problems/results/dummy.txt b/test_problems/results/dummy.txt
new file mode 100644
index 0000000..e69de29
--- /dev/null
+++ b/test_problems/results/dummy.txt
diff --git a/test_problems/test_assembly.c b/test_problems/test_assembly.c
new file mode 100644
index 0000000..3a07a13
--- /dev/null
+++ b/test_problems/test_assembly.c
@@ -0,0 +1,59 @@
+#include <stdlib.h>
+#include <stdio.h>
+
+#include "../include/blasfeo_common.h"
+#include "../include/blasfeo_i_aux_ext_dep.h"
+#include "../include/blasfeo_d_aux_ext_dep.h"
+#include "../include/blasfeo_v_aux_ext_dep.h"
+#include "../include/blasfeo_d_aux.h"
+#include "../include/blasfeo_d_kernel.h"
+#include "../include/blasfeo_d_blas.h"
+
+int main()
+ {
+
+ printf("\ntest assembly\n");
+
+ int ii;
+
+ int n = 12;
+
+ double *A; d_zeros(&A, n, n);
+ for(ii=0; ii<n*n; ii++) A[ii] = ii;
+ d_print_mat(n, n, A, n);
+
+ double *B; d_zeros(&B, n, n);
+ for(ii=0; ii<n; ii++) B[ii*(n+1)] = 1.0;
+ d_print_mat(n, n, B, n);
+
+ struct d_strmat sA;
+ d_allocate_strmat(n, n, &sA);
+ d_cvt_mat2strmat(n, n, A, n, &sA, 0, 0);
+ d_print_strmat(n, n, &sA, 0, 0);
+
+ struct d_strmat sB;
+ d_allocate_strmat(n, n, &sB);
+ d_cvt_mat2strmat(n, n, B, n, &sB, 0, 0);
+ d_print_strmat(n, n, &sB, 0, 0);
+
+ struct d_strmat sD;
+ d_allocate_strmat(n, n, &sD);
+
+ struct d_strmat sC;
+ d_allocate_strmat(n, n, &sC);
+
+ double alpha = 1.0;
+ double beta = 0.0;
+ int ret = kernel_dgemm_nt_4x4_lib4_test(n, &alpha, sB.pA, sA.pA, &beta, sB.pA, sD.pA);
+ d_print_strmat(n, n, &sD, 0, 0);
+// printf("\n%ld %ld\n", (long long) n, ret);
+// printf("\n%ld %ld\n", (long long) &alpha, ret);
+// printf("\n%ld %ld\n", (long long) sA.pA, ret);
+// printf("\n%ld %ld\n", (long long) sB.pA, ret);
+// printf("\n%ld %ld\n", (long long) &beta, ret);
+// printf("\n%ld %ld\n", (long long) sC.pA, ret);
+// printf("\n%ld %ld\n", (long long) sD.pA, ret);
+
+ return 0;
+
+ }
diff --git a/test_problems/test_blas_d.c b/test_problems/test_blas_d.c
new file mode 100644
index 0000000..1e71494
--- /dev/null
+++ b/test_problems/test_blas_d.c
@@ -0,0 +1,480 @@
+/**************************************************************************************************
+* *
+* This file is part of BLASFEO. *
+* *
+* BLASFEO -- BLAS For Embedded Optimization. *
+* Copyright (C) 2016-2017 by Gianluca Frison. *
+* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl. *
+* All rights reserved. *
+* *
+* HPMPC is free software; you can redistribute it and/or *
+* modify it under the terms of the GNU Lesser General Public *
+* License as published by the Free Software Foundation; either *
+* version 2.1 of the License, or (at your option) any later version. *
+* *
+* HPMPC is distributed in the hope that it will be useful, *
+* but WITHOUT ANY WARRANTY; without even the implied warranty of *
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. *
+* See the GNU Lesser General Public License for more details. *
+* *
+* You should have received a copy of the GNU Lesser General Public *
+* License along with HPMPC; if not, write to the Free Software *
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA *
+* *
+* Author: Gianluca Frison, giaf (at) dtu.dk *
+* gianluca.frison (at) imtek.uni-freiburg.de *
+* *
+**************************************************************************************************/
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <sys/time.h>
+
+//#if defined(TARGET_X64_INTEL_SANDY_BRIDGE)
+//#include <xmmintrin.h> // needed to flush to zero sub-normals with _MM_SET_FLUSH_ZERO_MODE (_MM_FLUSH_ZERO_ON); in the main()
+//#endif
+
+#include "../include/blasfeo_common.h"
+#include "../include/blasfeo_d_aux_ext_dep.h"
+#include "../include/blasfeo_d_aux.h"
+#include "../include/blasfeo_i_aux_ext_dep.h"
+#include "../include/blasfeo_v_aux_ext_dep.h"
+#include "../include/blasfeo_d_kernel.h"
+#include "../include/blasfeo_d_blas.h"
+
+#ifndef D_PS
+#define D_PS 1
+#endif
+#ifndef D_NC
+#define D_NC 1
+#endif
+
+
+
+#if defined(REF_BLAS_OPENBLAS)
+void openblas_set_num_threads(int num_threads);
+#endif
+#if defined(REF_BLAS_BLIS)
+void omp_set_num_threads(int num_threads);
+#endif
+#if defined(REF_BLAS_MKL)
+#include "mkl.h"
+#endif
+
+
+
+#include "cpu_freq.h"
+
+
+
+int main()
+ {
+
+#if defined(REF_BLAS_OPENBLAS)
+ openblas_set_num_threads(1);
+#endif
+#if defined(REF_BLAS_BLIS)
+ omp_set_num_threads(1);
+#endif
+#if defined(REF_BLAS_MKL)
+ mkl_set_num_threads(1);
+#endif
+
+//#if defined(TARGET_X64_INTEL_SANDY_BRIDGE)
+// _MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_ON); // flush to zero subnormals !!! works only with one thread !!!
+//#endif
+
+ printf("\n");
+ printf("\n");
+ printf("\n");
+
+ printf("BLAS performance test - double precision\n");
+ printf("\n");
+
+ // maximum frequency of the processor
+ const float GHz_max = GHZ_MAX;
+ printf("Frequency used to compute theoretical peak: %5.1f GHz (edit test_param.h to modify this value).\n", GHz_max);
+ printf("\n");
+
+ // maximum flops per cycle, double precision
+#if defined(TARGET_X64_INTEL_HASWELL)
+ const float flops_max = 16;
+ printf("Testing BLAS version for AVX2 and FMA instruction sets, 64 bit (optimized for Intel Haswell): theoretical peak %5.1f Gflops\n", flops_max*GHz_max);
+#elif defined(TARGET_X64_INTEL_SANDY_BRIDGE)
+ const float flops_max = 8;
+ printf("Testing BLAS version for AVX instruction set, 64 bit (optimized for Intel Sandy Bridge): theoretical peak %5.1f Gflops\n", flops_max*GHz_max);
+#elif defined(TARGET_X64_INTEL_CORE)
+ const float flops_max = 4;
+ printf("Testing BLAS version for SSE3 instruction set, 64 bit (optimized for Intel Core): theoretical peak %5.1f Gflops\n", flops_max*GHz_max);
+#elif defined(TARGET_X64_AMD_BULLDOZER)
+ const float flops_max = 8;
+ printf("Testing BLAS version for SSE3 and FMA instruction set, 64 bit (optimized for AMD Bulldozer): theoretical peak %5.1f Gflops\n", flops_max*GHz_max);
+#elif defined(TARGET_ARMV8A_ARM_CORTEX_A57)
+ const float flops_max = 4;
+ printf("Testing BLAS version for NEONv2 instruction set, 64 bit (optimized for ARM Cortex A57): theoretical peak %5.1f Gflops\n", flops_max*GHz_max);
+#elif defined(TARGET_ARMV7A_ARM_CORTEX_A15)
+ const float flops_max = 2;
+ printf("Testing BLAS version for VFPv4 instruction set, 32 bit (optimized for ARM Cortex A15): theoretical peak %5.1f Gflops\n", flops_max*GHz_max);
+#elif defined(TARGET_GENERIC)
+ const float flops_max = 2;
+ printf("Testing BLAS version for generic scalar instruction set: theoretical peak %5.1f Gflops ???\n", flops_max*GHz_max);
+#endif
+
+// FILE *f;
+// f = fopen("./test_problems/results/test_blas.m", "w"); // a
+
+#if defined(TARGET_X64_INTEL_HASWELL)
+// fprintf(f, "C = 'd_x64_intel_haswell';\n");
+// fprintf(f, "\n");
+#elif defined(TARGET_X64_INTEL_SANDY_BRIDGE)
+// fprintf(f, "C = 'd_x64_intel_sandybridge';\n");
+// fprintf(f, "\n");
+#elif defined(TARGET_X64_INTEL_CORE)
+// fprintf(f, "C = 'd_x64_intel_core';\n");
+// fprintf(f, "\n");
+#elif defined(TARGET_X64_AMD_BULLDOZER)
+// fprintf(f, "C = 'd_x64_amd_bulldozer';\n");
+// fprintf(f, "\n");
+#elif defined(TARGET_ARMV8A_ARM_CORTEX_A57)
+// fprintf(f, "C = 'd_armv8a_arm_cortex_a57';\n");
+// fprintf(f, "\n");
+#elif defined(TARGET_ARMV7A_ARM_CORTEX_A15)
+// fprintf(f, "C = 'd_armv7a_arm_cortex_a15';\n");
+// fprintf(f, "\n");
+#elif defined(TARGET_GENERIC)
+// fprintf(f, "C = 'd_generic';\n");
+// fprintf(f, "\n");
+#endif
+
+// fprintf(f, "A = [%f %f];\n", GHz_max, flops_max);
+// fprintf(f, "\n");
+
+// fprintf(f, "B = [\n");
+
+
+
+ int i, j, rep, ll;
+
+ const int bsd = D_PS;
+ const int ncd = D_NC;
+
+/* int info = 0;*/
+
+ printf("\nn\t dgemm_blasfeo\t dgemm_blas\n");
+ printf("\nn\t Gflops\t %%\t Gflops\n\n");
+
+#if 1
+ int nn[] = {4, 8, 12, 16, 20, 24, 28, 32, 36, 40, 44, 48, 52, 56, 60, 64, 68, 72, 76, 80, 84, 88, 92, 96, 100, 104, 108, 112, 116, 120, 124, 128, 132, 136, 140, 144, 148, 152, 156, 160, 164, 168, 172, 176, 180, 184, 188, 192, 196, 200, 204, 208, 212, 216, 220, 224, 228, 232, 236, 240, 244, 248, 252, 256, 260, 264, 268, 272, 276, 280, 284, 288, 292, 296, 300, 304, 308, 312, 316, 320, 324, 328, 332, 336, 340, 344, 348, 352, 356, 360, 364, 368, 372, 376, 380, 384, 388, 392, 396, 400, 404, 408, 412, 416, 420, 424, 428, 432, 436, 440, 444, 448, 452, 456, 460, 500, 550, 600, 650, 700};
+ int nnrep[] = {10000, 10000, 10000, 10000, 10000, 10000, 10000, 10000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 400, 400, 400, 400, 400, 200, 200, 200, 200, 200, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 20, 20, 20, 20, 20, 20, 20, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 4, 4, 4, 4, 4};
+
+// for(ll=0; ll<24; ll++)
+ for(ll=0; ll<75; ll++)
+// for(ll=0; ll<115; ll++)
+// for(ll=0; ll<120; ll++)
+
+ {
+
+ int n = nn[ll];
+ int nrep = nnrep[ll];
+// int n = ll+1;
+// int nrep = nnrep[0];
+// n = n<12 ? 12 : n;
+// n = n<8 ? 8 : n;
+
+#else
+ int nn[] = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24};
+
+ for(ll=0; ll<24; ll++)
+
+ {
+
+ int n = nn[ll];
+ int nrep = 40000; //nnrep[ll];
+#endif
+
+ double *A; d_zeros(&A, n, n);
+ double *B; d_zeros(&B, n, n);
+ double *C; d_zeros(&C, n, n);
+ double *M; d_zeros(&M, n, n);
+
+ char c_n = 'n';
+ char c_l = 'l';
+ char c_r = 'r';
+ char c_t = 't';
+ char c_u = 'u';
+ int i_1 = 1;
+ int i_t;
+ double d_1 = 1;
+ double d_0 = 0;
+
+ for(i=0; i<n*n; i++)
+ A[i] = i;
+
+ for(i=0; i<n; i++)
+ B[i*(n+1)] = 1;
+
+ for(i=0; i<n*n; i++)
+ M[i] = 1;
+
+ int n2 = n*n;
+ double *B2; d_zeros(&B2, n, n);
+ for(i=0; i<n*n; i++)
+ B2[i] = 1e-15;
+ for(i=0; i<n; i++)
+ B2[i*(n+1)] = 1;
+
+ int pnd = ((n+bsd-1)/bsd)*bsd;
+ int cnd = ((n+ncd-1)/ncd)*ncd;
+ int cnd2 = 2*((n+ncd-1)/ncd)*ncd;
+
+ double *x; d_zeros_align(&x, pnd, 1);
+ double *y; d_zeros_align(&y, pnd, 1);
+ double *x2; d_zeros_align(&x2, pnd, 1);
+ double *y2; d_zeros_align(&y2, pnd, 1);
+ double *diag; d_zeros_align(&diag, pnd, 1);
+ int *ipiv; int_zeros(&ipiv, n, 1);
+
+ for(i=0; i<pnd; i++) x[i] = 1;
+ for(i=0; i<pnd; i++) x2[i] = 1;
+
+ // matrix struct
+#if 0
+ struct d_strmat sA; d_allocate_strmat(n+4, n+4, &sA);
+ struct d_strmat sB; d_allocate_strmat(n+4, n+4, &sB);
+ struct d_strmat sC; d_allocate_strmat(n+4, n+4, &sC);
+ struct d_strmat sD; d_allocate_strmat(n+4, n+4, &sD);
+ struct d_strmat sE; d_allocate_strmat(n+4, n+4, &sE);
+#else
+ struct d_strmat sA; d_allocate_strmat(n, n, &sA);
+ struct d_strmat sB; d_allocate_strmat(n, n, &sB);
+ struct d_strmat sB2; d_allocate_strmat(n, n, &sB2);
+ struct d_strmat sB3; d_allocate_strmat(n, n, &sB3);
+ struct d_strmat sC; d_allocate_strmat(n, n, &sC);
+ struct d_strmat sD; d_allocate_strmat(n, n, &sD);
+ struct d_strmat sE; d_allocate_strmat(n, n, &sE);
+#endif
+ struct d_strvec sx; d_allocate_strvec(n, &sx);
+ struct d_strvec sy; d_allocate_strvec(n, &sy);
+ struct d_strvec sz; d_allocate_strvec(n, &sz);
+
+ d_cvt_mat2strmat(n, n, A, n, &sA, 0, 0);
+ d_cvt_mat2strmat(n, n, B, n, &sB, 0, 0);
+ d_cvt_mat2strmat(n, n, B2, n, &sB2, 0, 0);
+ d_cvt_vec2strvec(n, x, &sx, 0);
+ int ii;
+ for(ii=0; ii<n; ii++)
+ {
+ DMATEL_LIBSTR(&sB3, ii, ii) = 1.0;
+// DMATEL_LIBSTR(&sB3, n-1, ii) = 1.0;
+ DMATEL_LIBSTR(&sB3, ii, n-1) = 1.0;
+ DVECEL_LIBSTR(&sx, ii) = 1.0;
+ }
+// d_print_strmat(n, n, &sB3, 0, 0);
+// if(n==20) return;
+
+ int qr_work_size = 0;//dgeqrf_work_size_libstr(n, n);
+ void *qr_work;
+ v_zeros_align(&qr_work, qr_work_size);
+
+ int lq_work_size = 0;//dgelqf_work_size_libstr(n, n);
+ void *lq_work;
+ v_zeros_align(&lq_work, lq_work_size);
+
+ // create matrix to pivot all the time
+// dgemm_nt_libstr(n, n, n, 1.0, &sA, 0, 0, &sA, 0, 0, 1.0, &sB, 0, 0, &sD, 0, 0);
+
+ double *dummy;
+
+ int info;
+
+ /* timing */
+ struct timeval tvm1, tv0, tv1, tv2, tv3, tv4, tv5, tv6, tv7, tv8, tv9, tv10, tv11, tv12, tv13, tv14, tv15, tv16;
+
+ /* warm up */
+ for(rep=0; rep<nrep; rep++)
+ {
+ dgemm_nt_libstr(n, n, n, 1.0, &sA, 0, 0, &sA, 0, 0, 1.0, &sB, 0, 0, &sC, 0, 0);
+ }
+
+ double alpha = 1.0;
+ double beta = 0.0;
+
+ gettimeofday(&tv0, NULL); // stop
+
+ for(rep=0; rep<nrep; rep++)
+ {
+
+// dgemm_nt_lib(n, n, n, 1.0, pA, cnd, pB, cnd, 0.0, pC, cnd, pC, cnd);
+// dgemm_nn_lib(n, n, n, 1.0, pA, cnd, pB, cnd, 0.0, pC, cnd, pC, cnd);
+// dsyrk_nt_l_lib(n, n, n, 1.0, pA, cnd, pB, cnd, 1.0, pC, cnd, pD, cnd);
+// dtrmm_nt_ru_lib(n, n, pA, cnd, pB, cnd, 0, pC, cnd, pD, cnd);
+// dpotrf_nt_l_lib(n, n, pB, cnd, pD, cnd, diag);
+// dsyrk_dpotrf_nt_l_lib(n, n, n, pA, cnd, pA, cnd, 1, pB, cnd, pD, cnd, diag);
+// dsyrk_nt_l_lib(n, n, n, pA, cnd, pA, cnd, 1, pB, cnd, pD, cnd);
+// dpotrf_nt_l_lib(n, n, pD, cnd, pD, cnd, diag);
+// dgetrf_nn_nopivot_lib(n, n, pB, cnd, pB, cnd, diag);
+// dgetrf_nn_lib(n, n, pB, cnd, pB, cnd, diag, ipiv);
+// dtrsm_nn_ll_one_lib(n, n, pD, cnd, pB, cnd, pB, cnd);
+// dtrsm_nn_lu_inv_lib(n, n, pD, cnd, diag, pB, cnd, pB, cnd);
+ }
+
+ gettimeofday(&tv1, NULL); // stop
+
+ for(rep=0; rep<nrep; rep++)
+ {
+// kernel_dgemm_nt_12x4_lib4(n, &alpha, sA.pA, sA.cn, sB.pA, &beta, sD.pA, sD.cn, sD.pA, sD.cn);
+// kernel_dgemm_nt_8x8_lib4(n, &alpha, sA.pA, sA.cn, sB.pA, sB.cn, &beta, sD.pA, sD.cn, sD.pA, sD.cn);
+// kernel_dsyrk_nt_l_8x8_lib4(n, &alpha, sA.pA, sA.cn, sB.pA, sB.cn, &beta, sD.pA, sD.cn, sD.pA, sD.cn);
+// kernel_dgemm_nt_8x4_lib4(n, &alpha, sA.pA, sA.cn, sB.pA, &beta, sD.pA, sD.cn, sD.pA, sD.cn);
+// kernel_dgemm_nt_4x8_lib4(n, &alpha, sA.pA, sB.pA, sB.cn, &beta, sD.pA, sD.pA);
+// kernel_dgemm_nt_4x4_lib4(n, &alpha, sA.pA, sB.pA, &beta, sD.pA, sD.pA);
+// kernel_dger4_12_sub_lib4(n, sA.pA, sA.cn, sB.pA, sD.pA, sD.cn);
+// kernel_dger4_sub_12r_lib4(n, sA.pA, sA.cn, sB.pA, sD.pA, sD.cn);
+// kernel_dger4_sub_8r_lib4(n, sA.pA, sA.cn, sB.pA, sD.pA, sD.cn);
+// kernel_dger12_add_4r_lib4(n, sA.pA, sB.pA, sB.cn, sD.pA);
+// kernel_dger8_add_4r_lib4(n, sA.pA, sB.pA, sB.cn, sD.pA);
+// kernel_dger4_sub_4r_lib4(n, sA.pA, sB.pA, sD.pA);
+// kernel_dger2_sub_4r_lib4(n, sA.pA, sB.pA, sD.pA);
+// kernel_dger4_sub_8c_lib4(n, sA.pA, sA.cn, sB.pA, sD.pA, sD.cn);
+// kernel_dger4_sub_4c_lib4(n, sA.pA, sA.cn, sB.pA, sD.pA, sD.cn);
+// kernel_dgemm_nn_4x12_lib4(n, &alpha, sA.pA, 0, sB.pA, sB.cn, &beta, sD.pA, sD.pA);
+// kernel_dgemm_nn_4x8_lib4(n, &alpha, sA.pA, 0, sB.pA, sB.cn, &beta, sD.pA, sD.pA);
+// kernel_dgemm_nn_4x4_lib4(n, &alpha, sA.pA, 0, sB.pA, sB.cn, &beta, sD.pA, sD.pA);
+
+ dgemm_nt_libstr(n, n, n, 1.0, &sA, 0, 0, &sB, 0, 0, 0.0, &sC, 0, 0, &sD, 0, 0);
+// dgemm_nn_libstr(n, n, n, 1.0, &sA, 0, 0, &sB, 0, 0, 0.0, &sC, 0, 0, &sD, 0, 0);
+// dsyrk_ln_libstr(n, n, 1.0, &sA, 0, 0, &sA, 0, 0, 0.0, &sC, 0, 0, &sD, 0, 0);
+// dsyrk_ln_mn_libstr(n, n, n, 1.0, &sA, 0, 0, &sA, 0, 0, 0.0, &sC, 0, 0, &sD, 0, 0);
+// dpotrf_l_mn_libstr(n, n, &sB, 0, 0, &sB, 0, 0);
+// dpotrf_l_libstr(n, &sB, 0, 0, &sB, 0, 0);
+// dgetrf_nopivot_libstr(n, n, &sB, 0, 0, &sB, 0, 0);
+// dgetrf_libstr(n, n, &sB, 0, 0, &sB, 0, 0, ipiv);
+// dgeqrf_libstr(n, n, &sC, 0, 0, &sD, 0, 0, qr_work);
+// dcolin_libstr(n, &sx, 0, &sB3, 0, n-1);
+// dgelqf_libstr(n, n, &sB3, 0, 0, &sB3, 0, 0, lq_work);
+// dtrmm_rlnn_libstr(n, n, 1.0, &sA, 0, 0, &sD, 0, 0, &sD, 0, 0); //
+// dtrmm_rutn_libstr(n, n, 1.0, &sA, 0, 0, &sB, 0, 0, &sD, 0, 0);
+// dtrsm_llnu_libstr(n, n, 1.0, &sD, 0, 0, &sB, 0, 0, &sB, 0, 0);
+// dtrsm_lunn_libstr(n, n, 1.0, &sD, 0, 0, &sB, 0, 0, &sB, 0, 0);
+// dtrsm_rltn_libstr(n, n, 1.0, &sB2, 0, 0, &sD, 0, 0, &sD, 0, 0); //
+// dtrsm_rltu_libstr(n, n, 1.0, &sD, 0, 0, &sB, 0, 0, &sB, 0, 0);
+// dtrsm_rutn_libstr(n, n, 1.0, &sD, 0, 0, &sB, 0, 0, &sB, 0, 0);
+// dgemv_n_libstr(n, n, 1.0, &sA, 0, 0, &sx, 0, 0.0, &sy, 0, &sz, 0);
+// dgemv_t_libstr(n, n, 1.0, &sA, 0, 0, &sx, 0, 0.0, &sy, 0, &sz, 0);
+// dsymv_l_libstr(n, n, 1.0, &sA, 0, 0, &sx, 0, 0.0, &sy, 0, &sz, 0);
+// dgemv_nt_libstr(n, n, 1.0, 1.0, &sA, 0, 0, &sx, 0, &sx, 0, 0.0, 0.0, &sy, 0, &sy, 0, &sz, 0, &sz, 0);
+ }
+
+// d_print_strmat(n, n, &sD, 0, 0);
+
+ gettimeofday(&tv2, NULL); // stop
+
+ for(rep=0; rep<nrep; rep++)
+ {
+#if defined(REF_BLAS_OPENBLAS) || defined(REF_BLAS_NETLIB) || defined(REF_BLAS_MKL)
+// dgemm_(&c_n, &c_t, &n, &n, &n, &d_1, A, &n, M, &n, &d_0, C, &n);
+// dpotrf_(&c_l, &n, B2, &n, &info);
+// dgemm_(&c_n, &c_n, &n, &n, &n, &d_1, A, &n, M, &n, &d_0, C, &n);
+// dsyrk_(&c_l, &c_n, &n, &n, &d_1, A, &n, &d_0, C, &n);
+// dtrmm_(&c_r, &c_u, &c_t, &c_n, &n, &n, &d_1, A, &n, C, &n);
+// dgetrf_(&n, &n, B2, &n, ipiv, &info);
+// dtrsm_(&c_l, &c_l, &c_n, &c_u, &n, &n, &d_1, B2, &n, B, &n);
+// dtrsm_(&c_l, &c_u, &c_n, &c_n, &n, &n, &d_1, B2, &n, B, &n);
+// dtrtri_(&c_l, &c_n, &n, B2, &n, &info);
+// dlauum_(&c_l, &n, B, &n, &info);
+// dgemv_(&c_n, &n, &n, &d_1, A, &n, x, &i_1, &d_0, y, &i_1);
+// dgemv_(&c_t, &n, &n, &d_1, A, &n, x2, &i_1, &d_0, y2, &i_1);
+// dtrmv_(&c_l, &c_n, &c_n, &n, B, &n, x, &i_1);
+// dtrsv_(&c_l, &c_n, &c_n, &n, B, &n, x, &i_1);
+// dsymv_(&c_l, &n, &d_1, A, &n, x, &i_1, &d_0, y, &i_1);
+
+// for(i=0; i<n; i++)
+// {
+// i_t = n-i;
+// dcopy_(&i_t, &B[i*(n+1)], &i_1, &C[i*(n+1)], &i_1);
+// }
+// dsyrk_(&c_l, &c_n, &n, &n, &d_1, A, &n, &d_1, C, &n);
+// dpotrf_(&c_l, &n, C, &n, &info);
+
+#endif
+
+#if defined(REF_BLAS_BLIS)
+// dgemm_(&c_n, &c_t, &n77, &n77, &n77, &d_1, A, &n77, B, &n77, &d_0, C, &n77);
+// dgemm_(&c_n, &c_n, &n77, &n77, &n77, &d_1, A, &n77, B, &n77, &d_0, C, &n77);
+// dsyrk_(&c_l, &c_n, &n77, &n77, &d_1, A, &n77, &d_0, C, &n77);
+// dtrmm_(&c_r, &c_u, &c_t, &c_n, &n77, &n77, &d_1, A, &n77, C, &n77);
+// dpotrf_(&c_l, &n77, B, &n77, &info);
+// dtrtri_(&c_l, &c_n, &n77, B, &n77, &info);
+// dlauum_(&c_l, &n77, B, &n77, &info);
+#endif
+ }
+
+ gettimeofday(&tv3, NULL); // stop
+
+ float Gflops_max = flops_max * GHz_max;
+
+// float flop_operation = 4*16.0*2*n; // kernel 12x4
+// float flop_operation = 3*16.0*2*n; // kernel 12x4
+// float flop_operation = 2*16.0*2*n; // kernel 8x4
+// float flop_operation = 1*16.0*2*n; // kernel 4x4
+// float flop_operation = 0.5*16.0*2*n; // kernel 2x4
+
+ float flop_operation = 2.0*n*n*n; // dgemm
+// float flop_operation = 1.0*n*n*n; // dsyrk dtrmm dtrsm
+// float flop_operation = 1.0/3.0*n*n*n; // dpotrf dtrtri
+// float flop_operation = 2.0/3.0*n*n*n; // dgetrf
+// float flop_operation = 4.0/3.0*n*n*n; // dgeqrf
+// float flop_operation = 2.0*n*n; // dgemv dsymv
+// float flop_operation = 1.0*n*n; // dtrmv dtrsv
+// float flop_operation = 4.0*n*n; // dgemv_nt
+
+// float flop_operation = 4.0/3.0*n*n*n; // dsyrk+dpotrf
+
+ float time_hpmpc = (float) (tv1.tv_sec-tv0.tv_sec)/(nrep+0.0)+(tv1.tv_usec-tv0.tv_usec)/(nrep*1e6);
+ float time_blasfeo = (float) (tv2.tv_sec-tv1.tv_sec)/(nrep+0.0)+(tv2.tv_usec-tv1.tv_usec)/(nrep*1e6);
+ float time_blas = (float) (tv3.tv_sec-tv2.tv_sec)/(nrep+0.0)+(tv3.tv_usec-tv2.tv_usec)/(nrep*1e6);
+
+ float Gflops_hpmpc = 1e-9*flop_operation/time_hpmpc;
+ float Gflops_blasfeo = 1e-9*flop_operation/time_blasfeo;
+ float Gflops_blas = 1e-9*flop_operation/time_blas;
+
+
+// printf("%d\t%7.2f\t%7.2f\t%7.2f\t%7.2f\t%7.2f\t%7.2f\n", n, Gflops_hpmpc, 100.0*Gflops_hpmpc/Gflops_max, Gflops_blasfeo, 100.0*Gflops_blasfeo/Gflops_max, Gflops_blas, 100.0*Gflops_blas/Gflops_max);
+// fprintf(f, "%d\t%7.2f\t%7.2f\t%7.2f\t%7.2f\t%7.2f\t%7.2f\n", n, Gflops_hpmpc, 100.0*Gflops_hpmpc/Gflops_max, Gflops_blasfeo, 100.0*Gflops_blasfeo/Gflops_max, Gflops_blas, 100.0*Gflops_blas/Gflops_max);
+ printf("%d\t%7.2f\t%7.2f\t%7.2f\t%7.2f\n", n, Gflops_blasfeo, 100.0*Gflops_blasfeo/Gflops_max, Gflops_blas, 100.0*Gflops_blas/Gflops_max);
+// fprintf(f, "%d\t%7.2f\t%7.2f\t%7.2f\t%7.2f\n", n, Gflops_blasfeo, 100.0*Gflops_blasfeo/Gflops_max, Gflops_blas, 100.0*Gflops_blas/Gflops_max);
+
+
+ d_free(A);
+ d_free(B);
+ d_free(B2);
+ d_free(M);
+ d_free_align(x);
+ d_free_align(y);
+ d_free_align(x2);
+ d_free_align(y2);
+ int_free(ipiv);
+ free(qr_work);
+ free(lq_work);
+
+ d_free_strmat(&sA);
+ d_free_strmat(&sB);
+ d_free_strmat(&sB2);
+ d_free_strmat(&sB3);
+ d_free_strmat(&sC);
+ d_free_strmat(&sD);
+ d_free_strmat(&sE);
+ d_free_strvec(&sx);
+ d_free_strvec(&sy);
+ d_free_strvec(&sz);
+
+ }
+
+ printf("\n");
+
+// fprintf(f, "];\n");
+// fclose(f);
+
+ return 0;
+
+ }
diff --git a/test_problems/test_blas_s.c b/test_problems/test_blas_s.c
new file mode 100644
index 0000000..3ea9f11
--- /dev/null
+++ b/test_problems/test_blas_s.c
@@ -0,0 +1,454 @@
+/**************************************************************************************************
+* *
+* This file is part of BLASFEO. *
+* *
+* BLASFEO -- BLAS For Embedded Optimization. *
+* Copyright (C) 2016-2017 by Gianluca Frison. *
+* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl. *
+* All rights reserved. *
+* *
+* HPMPC is free software; you can redistribute it and/or *
+* modify it under the terms of the GNU Lesser General Public *
+* License as published by the Free Software Foundation; either *
+* version 2.1 of the License, or (at your option) any later version. *
+* *
+* HPMPC is distributed in the hope that it will be useful, *
+* but WITHOUT ANY WARRANTY; without even the implied warranty of *
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. *
+* See the GNU Lesser General Public License for more details. *
+* *
+* You should have received a copy of the GNU Lesser General Public *
+* License along with HPMPC; if not, write to the Free Software *
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA *
+* *
+* Author: Gianluca Frison, giaf (at) dtu.dk *
+* gianluca.frison (at) imtek.uni-freiburg.de *
+* *
+**************************************************************************************************/
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <sys/time.h>
+
+
+#include "../include/blasfeo_common.h"
+#include "../include/blasfeo_s_aux_ext_dep.h"
+#include "../include/blasfeo_i_aux_ext_dep.h"
+#include "../include/blasfeo_s_aux.h"
+#include "../include/blasfeo_s_kernel.h"
+#include "../include/blasfeo_s_blas.h"
+
+#ifndef S_PS
+#define S_PS 1
+#endif
+#ifndef S_NC
+#define S_NC 1
+#endif
+
+
+
+#if defined(REF_BLAS_OPENBLAS)
+void openblas_set_num_threads(int num_threads);
+#endif
+#if defined(REF_BLAS_BLIS)
+void omp_set_num_threads(int num_threads);
+#endif
+#if defined(REF_BLAS_MKL)
+#include "mkl.h"
+#endif
+
+
+
+#include "cpu_freq.h"
+
+
+
+int main()
+ {
+
+#if defined(REF_BLAS_OPENBLAS)
+ openblas_set_num_threads(1);
+#endif
+#if defined(REF_BLAS_BLIS)
+ omp_set_num_threads(1);
+#endif
+#if defined(REF_BLAS_MKL)
+ mkl_set_num_threads(1);
+#endif
+
+ printf("\n");
+ printf("\n");
+ printf("\n");
+
+ printf("BLAS performance test - float precision\n");
+ printf("\n");
+
+ // maximum frequency of the processor
+ const float GHz_max = GHZ_MAX;
+ printf("Frequency used to compute theoretical peak: %5.1f GHz (edit test_param.h to modify this value).\n", GHz_max);
+ printf("\n");
+
+ // maximum flops per cycle, single precision
+ // maxumum memops (sustained load->store of floats) per cycle, single precision
+#if defined(TARGET_X64_INTEL_HASWELL)
+ const float flops_max = 32; // 2x256 bit fma
+ const float memops_max = 8; // 2x256 bit load + 1x256 bit store
+ printf("Testing BLAS version for AVX2 and FMA instruction sets, 64 bit (optimized for Intel Haswell): theoretical peak %5.1f Gflops\n", flops_max*GHz_max);
+#elif defined(TARGET_X64_INTEL_SANDY_BRIDGE)
+ const float flops_max = 16; // 1x256 bit mul + 1x256 bit add
+ const float memops_max = 4; // 1x256 bit load + 1x128 bit store
+ printf("Testing BLAS version for AVX instruction set, 64 bit (optimized for Intel Sandy Bridge): theoretical peak %5.1f Gflops\n", flops_max*GHz_max);
+#elif defined(TARGET_X64_INTEL_CORE)
+ const float flops_max = 8; // 1x128 bit mul + 1x128 bit add
+ const float memops_max = 4; // 1x128 bit load + 1x128 bit store;
+ printf("Testing BLAS version for SSE3 instruction set, 64 bit (optimized for Intel Core): theoretical peak %5.1f Gflops\n", flops_max*GHz_max);
+#elif defined(TARGET_X64_AMD_BULLDOZER)
+ const float flops_max = 16; // 2x128 bit fma
+ const float memops_max = 4; // 1x256 bit load + 1x128 bit store
+ printf("Testing BLAS version for SSE3 and FMA instruction set, 64 bit (optimized for AMD Bulldozer): theoretical peak %5.1f Gflops\n", flops_max*GHz_max);
+#elif defined(TARGET_ARMV8A_ARM_CORTEX_A57)
+ const float flops_max = 8; // 1x128 bit fma
+ const float memops_max = 4; // ???
+ printf("Testing BLAS version for VFPv4 instruction set, 32 bit (optimized for ARM Cortex A15): theoretical peak %5.1f Gflops\n", flops_max*GHz_max);
+#elif defined(TARGET_ARMV7A_ARM_CORTEX_A15)
+ const float flops_max = 8; // 1x128 bit fma
+ const float memops_max = 4; // ???
+ printf("Testing BLAS version for VFPv4 instruction set, 32 bit (optimized for ARM Cortex A15): theoretical peak %5.1f Gflops\n", flops_max*GHz_max);
+#elif defined(TARGET_GENERIC)
+ const float flops_max = 2; // 1x32 bit mul + 1x32 bit add ???
+ const float memops_max = 1; // ???
+ printf("Testing BLAS version for generic scalar instruction set: theoretical peak %5.1f Gflops ???\n", flops_max*GHz_max);
+#endif
+
+// FILE *f;
+// f = fopen("./test_problems/results/test_blas.m", "w"); // a
+
+#if defined(TARGET_X64_INTEL_HASWELL)
+// fprintf(f, "C = 's_x64_intel_haswell';\n");
+// fprintf(f, "\n");
+#elif defined(TARGET_X64_INTEL_SANDY_BRIDGE)
+// fprintf(f, "C = 's_x64_intel_sandybridge';\n");
+// fprintf(f, "\n");
+#elif defined(TARGET_X64_INTEL_CORE)
+// fprintf(f, "C = 's_x64_intel_core';\n");
+// fprintf(f, "\n");
+#elif defined(TARGET_X64_AMD_BULLDOZER)
+// fprintf(f, "C = 's_x64_amd_bulldozer';\n");
+// fprintf(f, "\n");
+#elif defined(TARGET_ARMV8A_ARM_CORTEX_A57)
+// fprintf(f, "C = 's_armv7a_arm_cortex_a15';\n");
+// fprintf(f, "\n");
+#elif defined(TARGET_ARMV7A_ARM_CORTEX_A15)
+// fprintf(f, "C = 's_armv7a_arm_cortex_a15';\n");
+// fprintf(f, "\n");
+#elif defined(TARGET_GENERIC)
+// fprintf(f, "C = 's_generic';\n");
+// fprintf(f, "\n");
+#endif
+
+// fprintf(f, "A = [%f %f];\n", GHz_max, flops_max);
+// fprintf(f, "\n");
+
+// fprintf(f, "B = [\n");
+
+
+
+ int i, j, rep, ll;
+
+ const int bss = S_PS;
+ const int ncs = S_NC;
+
+/* int info = 0;*/
+
+ printf("\nn\t sgemm_blasfeo\t sgemm_blas\n");
+ printf("\nn\t Gflops\t %%\t Gflops\t %%\n\n");
+
+#if 1
+ int nn[] = {4, 8, 12, 16, 20, 24, 28, 32, 36, 40, 44, 48, 52, 56, 60, 64, 68, 72, 76, 80, 84, 88, 92, 96, 100, 104, 108, 112, 116, 120, 124, 128, 132, 136, 140, 144, 148, 152, 156, 160, 164, 168, 172, 176, 180, 184, 188, 192, 196, 200, 204, 208, 212, 216, 220, 224, 228, 232, 236, 240, 244, 248, 252, 256, 260, 264, 268, 272, 276, 280, 284, 288, 292, 296, 300, 304, 308, 312, 316, 320, 324, 328, 332, 336, 340, 344, 348, 352, 356, 360, 364, 368, 372, 376, 380, 384, 388, 392, 396, 400, 404, 408, 412, 416, 420, 424, 428, 432, 436, 440, 444, 448, 452, 456, 460, 500, 550, 600, 650, 700};
+ int nnrep[] = {10000, 10000, 10000, 10000, 10000, 10000, 10000, 10000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 400, 400, 400, 400, 400, 200, 200, 200, 200, 200, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 20, 20, 20, 20, 20, 20, 20, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 4, 4, 4, 4, 4};
+
+// for(ll=0; ll<24; ll++)
+ for(ll=0; ll<75; ll++)
+// for(ll=0; ll<115; ll++)
+// for(ll=0; ll<120; ll++)
+
+ {
+
+ int n = nn[ll];
+ int nrep = nnrep[ll];
+// int n = ll+1;
+// int nrep = nnrep[0];
+// n = n<16 ? 16 : n;
+
+ int n2 = n*n;
+
+#else
+ int nn[] = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24};
+
+ for(ll=0; ll<24; ll++)
+
+ {
+
+ int n = nn[ll];
+ int nrep = 40000; //nnrep[ll];
+#endif
+
+ float *A; s_zeros(&A, n, n);
+ float *B; s_zeros(&B, n, n);
+ float *C; s_zeros(&C, n, n);
+ float *M; s_zeros(&M, n, n);
+
+ char c_n = 'n';
+ char c_l = 'l';
+ char c_r = 'r';
+ char c_t = 't';
+ char c_u = 'u';
+ int i_1 = 1;
+ int i_t;
+ float d_1 = 1;
+ float d_0 = 0;
+
+ for(i=0; i<n*n; i++)
+ A[i] = i;
+
+ for(i=0; i<n; i++)
+ B[i*(n+1)] = 1;
+
+ for(i=0; i<n*n; i++)
+ M[i] = 1;
+
+ float *B2; s_zeros(&B2, n, n);
+ for(i=0; i<n*n; i++)
+ B2[i] = 1e-15;
+ for(i=0; i<n; i++)
+ B2[i*(n+1)] = 1;
+
+ float *x; s_zeros(&x, n, 1);
+ float *y; s_zeros(&y, n, 1);
+ float *x2; s_zeros(&x2, n, 1);
+ float *y2; s_zeros(&y2, n, 1);
+ float *diag; s_zeros(&diag, n, 1);
+ int *ipiv; int_zeros(&ipiv, n, 1);
+
+// for(i=0; i<n; i++) x[i] = 1;
+// for(i=0; i<n; i++) x2[i] = 1;
+
+ // matrix struct
+#if 0
+ struct s_strmat sA; s_allocate_strmat(n+4, n+4, &sA);
+ struct s_strmat sB; s_allocate_strmat(n+4, n+4, &sB);
+ struct s_strmat sC; s_allocate_strmat(n+4, n+4, &sC);
+ struct s_strmat sD; s_allocate_strmat(n+4, n+4, &sD);
+ struct s_strmat sE; s_allocate_strmat(n+4, n+4, &sE);
+#else
+ struct s_strmat sA; s_allocate_strmat(n, n, &sA);
+ struct s_strmat sB; s_allocate_strmat(n, n, &sB);
+ struct s_strmat sC; s_allocate_strmat(n, n, &sC);
+ struct s_strmat sD; s_allocate_strmat(n, n, &sD);
+ struct s_strmat sE; s_allocate_strmat(n, n, &sE);
+#endif
+ struct s_strvec sx; s_allocate_strvec(n, &sx);
+ struct s_strvec sy; s_allocate_strvec(n, &sy);
+ struct s_strvec sz; s_allocate_strvec(n, &sz);
+
+ s_cvt_mat2strmat(n, n, A, n, &sA, 0, 0);
+ s_cvt_mat2strmat(n, n, B, n, &sB, 0, 0);
+ s_cvt_vec2strvec(n, x, &sx, 0);
+
+
+ // create matrix to pivot all the time
+// sgemm_nt_libstr(n, n, n, 1.0, &sA, 0, 0, &sA, 0, 0, 1.0, &sB, 0, 0, &sD, 0, 0);
+
+ float *dummy;
+
+ int info;
+
+ /* timing */
+ struct timeval tvm1, tv0, tv1, tv2, tv3, tv4, tv5, tv6, tv7, tv8, tv9, tv10, tv11, tv12, tv13, tv14, tv15, tv16;
+
+ /* warm up */
+ for(rep=0; rep<nrep; rep++)
+ {
+ sgemm_nt_libstr(n, n, n, 1.0, &sA, 0, 0, &sB, 0, 0, 0.0, &sC, 0, 0, &sD, 0, 0);
+ }
+
+ float alpha = 1.0;
+ float beta = 0.0;
+
+ gettimeofday(&tv0, NULL); // stop
+
+ gettimeofday(&tv1, NULL); // stop
+
+ for(rep=0; rep<nrep; rep++)
+ {
+// kernel_sgemm_nt_24x4_lib8(n, &alpha, sA.pA, sA.cn, sB.pA, &beta, sD.pA, sD.cn, sD.pA, sD.cn);
+// kernel_sgemm_nt_16x4_lib8(n, &alpha, sA.pA, sA.cn, sB.pA, &beta, sD.pA, sD.cn, sD.pA, sD.cn);
+// kernel_sgemm_nt_8x8_lib8(n, &alpha, sA.pA, sB.pA, &beta, sD.pA, sD.pA);
+// kernel_sgemm_nt_8x4_lib8(n, &alpha, sA.pA, sB.pA, &beta, sD.pA, sD.pA);
+// kernel_sgemm_nt_4x8_gen_lib8(n, &alpha, sA.pA, sB.pA, &beta, 0, sD.pA, sD.cn, 0, sD.pA, sD.cn, 0, 4, 0, 8);
+// kernel_sgemm_nt_4x8_vs_lib8(n, &alpha, sA.pA, sB.pA, &beta, sD.pA, sD.pA, 4, 8);
+// kernel_sgemm_nt_4x8_lib8(n, &alpha, sA.pA, sB.pA, &beta, sD.pA, sD.pA);
+// kernel_sgemm_nt_12x4_lib4(n, &alpha, sA.pA, sA.cn, sB.pA, &beta, sD.pA, sD.cn, sD.pA, sD.cn);
+// kernel_sgemm_nt_8x4_lib4(n, &alpha, sA.pA, sA.cn, sB.pA, &beta, sD.pA, sD.cn, sD.pA, sD.cn);
+// kernel_sgemm_nt_4x4_lib4(n, &alpha, sA.pA, sB.pA, &beta, sD.pA, sD.pA);
+// kernel_sgemm_nn_16x4_lib8(n, &alpha, sA.pA, sA.cn, 0, sB.pA, sB.cn, &beta, sD.pA, sD.cn, sD.pA, sD.cn);
+// kernel_sgemm_nn_8x8_lib8(n, &alpha, sA.pA, 0, sB.pA, sB.cn, &beta, sD.pA, sD.pA);
+// kernel_sgemm_nn_8x4_lib8(n, &alpha, sA.pA, 0, sB.pA, sB.cn, &beta, sD.pA, sD.pA);
+
+// sgemm_nt_libstr(n, n, n, 1.0, &sA, 0, 0, &sB, 0, 0, 0.0, &sC, 0, 0, &sD, 0, 0);
+// sgemm_nn_libstr(n, n, n, 1.0, &sA, 0, 0, &sB, 0, 0, 0.0, &sC, 0, 0, &sD, 0, 0);
+// ssyrk_ln_libstr(n, n, 1.0, &sA, 0, 0, &sA, 0, 0, 0.0, &sC, 0, 0, &sD, 0, 0);
+// spotrf_l_mn_libstr(n, n, &sB, 0, 0, &sB, 0, 0);
+ spotrf_l_libstr(n, &sB, 0, 0, &sB, 0, 0);
+// sgetr_libstr(n, n, &sA, 0, 0, &sB, 0, 0);
+// sgetrf_nopivot_libstr(n, n, &sB, 0, 0, &sB, 0, 0);
+// sgetrf_libstr(n, n, &sB, 0, 0, &sB, 0, 0, ipiv);
+// strmm_rlnn_libstr(n, n, 1.0, &sA, 0, 0, &sB, 0, 0, &sD, 0, 0);
+// strmm_rutn_libstr(n, n, 1.0, &sA, 0, 0, &sB, 0, 0, &sD, 0, 0);
+// strsm_llnu_libstr(n, n, 1.0, &sD, 0, 0, &sB, 0, 0, &sB, 0, 0);
+// strsm_lunn_libstr(n, n, 1.0, &sD, 0, 0, &sB, 0, 0, &sB, 0, 0);
+// strsm_rltn_libstr(n, n, 1.0, &sB, 0, 0, &sD, 0, 0, &sD, 0, 0);
+// strsm_rltu_libstr(n, n, 1.0, &sD, 0, 0, &sB, 0, 0, &sB, 0, 0);
+// strsm_rutn_libstr(n, n, 1.0, &sD, 0, 0, &sB, 0, 0, &sB, 0, 0);
+// sgemv_n_libstr(n, n, 1.0, &sA, 0, 0, &sx, 0, 0.0, &sy, 0, &sz, 0);
+// sgemv_t_libstr(n, n, 1.0, &sA, 0, 0, &sx, 0, 0.0, &sy, 0, &sz, 0);
+// ssymv_l_libstr(n, n, 1.0, &sA, 0, 0, &sx, 0, 0.0, &sy, 0, &sz, 0);
+// sgemv_nt_libstr(n, n, 1.0, 1.0, &sA, 0, 0, &sx, 0, &sx, 0, 0.0, 0.0, &sy, 0, &sy, 0, &sz, 0, &sz, 0);
+ }
+
+// d_print_strmat(n, n, &sD, 0, 0);
+
+ gettimeofday(&tv2, NULL); // stop
+
+ for(rep=0; rep<nrep; rep++)
+ {
+#if defined(REF_BLAS_OPENBLAS) || defined(REF_BLAS_NETLIB) || defined(REF_BLAS_MKL)
+// sgemm_(&c_n, &c_t, &n, &n, &n, &d_1, A, &n, M, &n, &d_0, C, &n);
+// sgemm_(&c_n, &c_n, &n, &n, &n, &d_1, A, &n, M, &n, &d_0, C, &n);
+// scopy_(&n2, A, &i_1, B, &i_1);
+// ssyrk_(&c_l, &c_n, &n, &n, &d_1, A, &n, &d_0, C, &n);
+// strmm_(&c_r, &c_u, &c_t, &c_n, &n, &n, &d_1, A, &n, C, &n);
+// spotrf_(&c_l, &n, B2, &n, &info);
+// sgetrf_(&n, &n, B2, &n, ipiv, &info);
+// strsm_(&c_l, &c_l, &c_n, &c_u, &n, &n, &d_1, B2, &n, B, &n);
+// strsm_(&c_l, &c_u, &c_n, &c_n, &n, &n, &d_1, B2, &n, B, &n);
+// strtri_(&c_l, &c_n, &n, B2, &n, &info);
+// slauum_(&c_l, &n, B, &n, &info);
+// sgemv_(&c_n, &n, &n, &d_1, A, &n, x, &i_1, &d_0, y, &i_1);
+// sgemv_(&c_t, &n, &n, &d_1, A, &n, x2, &i_1, &d_0, y2, &i_1);
+// strmv_(&c_l, &c_n, &c_n, &n, B, &n, x, &i_1);
+// strsv_(&c_l, &c_n, &c_n, &n, B, &n, x, &i_1);
+// ssymv_(&c_l, &n, &d_1, A, &n, x, &i_1, &d_0, y, &i_1);
+
+// for(i=0; i<n; i++)
+// {
+// i_t = n-i;
+// scopy_(&i_t, &B[i*(n+1)], &i_1, &C[i*(n+1)], &i_1);
+// }
+// ssyrk_(&c_l, &c_n, &n, &n, &d_1, A, &n, &d_1, C, &n);
+// spotrf_(&c_l, &n, C, &n, &info);
+
+#endif
+
+#if defined(REF_BLAS_BLIS)
+// sgemm_(&c_n, &c_t, &n77, &n77, &n77, &d_1, A, &n77, B, &n77, &d_0, C, &n77);
+// sgemm_(&c_n, &c_n, &n77, &n77, &n77, &d_1, A, &n77, B, &n77, &d_0, C, &n77);
+// ssyrk_(&c_l, &c_n, &n77, &n77, &d_1, A, &n77, &d_0, C, &n77);
+// strmm_(&c_r, &c_u, &c_t, &c_n, &n77, &n77, &d_1, A, &n77, C, &n77);
+// spotrf_(&c_l, &n77, B, &n77, &info);
+// strtri_(&c_l, &c_n, &n77, B, &n77, &info);
+// slauum_(&c_l, &n77, B, &n77, &info);
+#endif
+ }
+
+ gettimeofday(&tv3, NULL); // stop
+
+ // flops
+ if(1)
+ {
+
+ float Gflops_max = flops_max * GHz_max;
+
+// float flop_operation = 6*16.0*2*n; // kernel 24x4
+// float flop_operation = 4*16.0*2*n; // kernel 16x4
+// float flop_operation = 3*16.0*2*n; // kernel 12x4
+// float flop_operation = 2*16.0*2*n; // kernel 8x4
+// float flop_operation = 1*16.0*2*n; // kernel 4x4
+
+// float flop_operation = 2.0*n*n*n; // dgemm
+// float flop_operation = 1.0*n*n*n; // dsyrk dtrmm dtrsm
+ float flop_operation = 1.0/3.0*n*n*n; // dpotrf dtrtri
+// float flop_operation = 2.0/3.0*n*n*n; // dgetrf
+// float flop_operation = 2.0*n*n; // dgemv dsymv
+// float flop_operation = 1.0*n*n; // dtrmv dtrsv
+// float flop_operation = 4.0*n*n; // dgemv_nt
+// float flop_operation = 3*16.0*2*n; // kernel 12x4
+
+// float flop_operation = 4.0/3.0*n*n*n; // dsyrk+dpotrf
+
+ float time_hpmpc = (float) (tv1.tv_sec-tv0.tv_sec)/(nrep+0.0)+(tv1.tv_usec-tv0.tv_usec)/(nrep*1e6);
+ float time_blasfeo = (float) (tv2.tv_sec-tv1.tv_sec)/(nrep+0.0)+(tv2.tv_usec-tv1.tv_usec)/(nrep*1e6);
+ float time_blas = (float) (tv3.tv_sec-tv2.tv_sec)/(nrep+0.0)+(tv3.tv_usec-tv2.tv_usec)/(nrep*1e6);
+
+ float Gflops_hpmpc = 1e-9*flop_operation/time_hpmpc;
+ float Gflops_blasfeo = 1e-9*flop_operation/time_blasfeo;
+ float Gflops_blas = 1e-9*flop_operation/time_blas;
+
+
+ printf("%d\t%7.2f\t%7.2f\t%7.2f\t%7.2f\n", n, Gflops_blasfeo, 100.0*Gflops_blasfeo/Gflops_max, Gflops_blas, 100.0*Gflops_blas/Gflops_max);
+// fprintf(f, "%d\t%7.2f\t%7.2f\t%7.2f\t%7.2f\n", n, Gflops_blasfeo, 100.0*Gflops_blasfeo/Gflops_max, Gflops_blas, 100.0*Gflops_blas/Gflops_max);
+
+ }
+ // memops
+ else
+ {
+
+ float Gmemops_max = memops_max * GHz_max;
+
+ float memop_operation = 1.0*n*n; // dgecp
+
+ float time_hpmpc = (float) (tv1.tv_sec-tv0.tv_sec)/(nrep+0.0)+(tv1.tv_usec-tv0.tv_usec)/(nrep*1e6);
+ float time_blasfeo = (float) (tv2.tv_sec-tv1.tv_sec)/(nrep+0.0)+(tv2.tv_usec-tv1.tv_usec)/(nrep*1e6);
+ float time_blas = (float) (tv3.tv_sec-tv2.tv_sec)/(nrep+0.0)+(tv3.tv_usec-tv2.tv_usec)/(nrep*1e6);
+
+ float Gmemops_hpmpc = 1e-9*memop_operation/time_hpmpc;
+ float Gmemops_blasfeo = 1e-9*memop_operation/time_blasfeo;
+ float Gmemops_blas = 1e-9*memop_operation/time_blas;
+
+
+ printf("%d\t%7.2f\t%7.2f\t%7.2f\t%7.2f\n", n, Gmemops_blasfeo, 100.0*Gmemops_blasfeo/Gmemops_max, Gmemops_blas, 100.0*Gmemops_blas/Gmemops_max);
+// fprintf(f, "%d\t%7.2f\t%7.2f\t%7.2f\t%7.2f\n", n, Gmemops_blasfeo, 100.0*Gmemops_blasfeo/Gmemops_max, Gmemops_blas, 100.0*Gmemops_blas/Gmemops_max);
+
+ }
+
+
+ free(A);
+ free(B);
+ free(B2);
+ free(M);
+ free(x);
+ free(y);
+ free(x2);
+ free(y2);
+ free(ipiv);
+
+ s_free_strmat(&sA);
+ s_free_strmat(&sB);
+ s_free_strmat(&sC);
+ s_free_strmat(&sD);
+ s_free_strmat(&sE);
+ s_free_strvec(&sx);
+ s_free_strvec(&sy);
+ s_free_strvec(&sz);
+
+ }
+
+ printf("\n");
+
+// fprintf(f, "];\n");
+// fclose(f);
+
+ return 0;
+
+ }
+
diff --git a/test_problems/test_d_strmat.c b/test_problems/test_d_strmat.c
new file mode 100644
index 0000000..e06cf84
--- /dev/null
+++ b/test_problems/test_d_strmat.c
@@ -0,0 +1,512 @@
+/**************************************************************************************************
+* *
+* This file is part of BLASFEO. *
+* *
+* BLASFEO -- BLAS For Embedded Optimization. *
+* Copyright (C) 2016-2017 by Gianluca Frison. *
+* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl. *
+* All rights reserved. *
+* *
+* HPMPC is free software; you can redistribute it and/or *
+* modify it under the terms of the GNU Lesser General Public *
+* License as published by the Free Software Foundation; either *
+* version 2.1 of the License, or (at your option) any later version. *
+* *
+* HPMPC is distributed in the hope that it will be useful, *
+* but WITHOUT ANY WARRANTY; without even the implied warranty of *
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. *
+* See the GNU Lesser General Public License for more details. *
+* *
+* You should have received a copy of the GNU Lesser General Public *
+* License along with HPMPC; if not, write to the Free Software *
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA *
+* *
+* Author: Gianluca Frison, giaf (at) dtu.dk *
+* gianluca.frison (at) imtek.uni-freiburg.de *
+* *
+**************************************************************************************************/
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <sys/time.h>
+
+#include "../include/blasfeo_common.h"
+#include "../include/blasfeo_i_aux_ext_dep.h"
+#include "../include/blasfeo_d_aux_ext_dep.h"
+#include "../include/blasfeo_v_aux_ext_dep.h"
+#include "../include/blasfeo_d_aux.h"
+#include "../include/blasfeo_d_kernel.h"
+#include "../include/blasfeo_d_blas.h"
+
+
+int main()
+ {
+
+#if defined(LA_HIGH_PERFORMANCE)
+
+ printf("\nLA provided by HIGH_PERFORMANCE\n\n");
+
+#elif defined(LA_REFERENCE)
+
+ printf("\nLA provided by REFERENCE\n\n");
+
+#elif defined(LA_BLAS)
+
+ printf("\nLA provided by BLAS\n\n");
+
+#else
+
+ printf("\nLA provided by ???\n\n");
+ exit(2);
+
+#endif
+
+ int ii;
+
+ int n = 16;
+
+ //
+ // matrices in column-major format
+ //
+ double *A; d_zeros(&A, n, n);
+ for(ii=0; ii<n*n; ii++) A[ii] = ii;
+// d_print_mat(n, n, A, n);
+
+ double *B; d_zeros(&B, n, n);
+ for(ii=0; ii<n; ii++) B[ii*(n+1)] = 1.0;
+// d_print_mat(n, n, B, n);
+
+ double *C; d_zeros(&C, n, n);
+
+ double *D; d_zeros(&D, n, n);
+ for(ii=0; ii<n*n; ii++) D[ii] = -1;
+
+ double *x_n; d_zeros(&x_n, n, 1);
+// for(ii=0; ii<n; ii++) x_n[ii] = 1.0;
+ x_n[1] = 1.0;
+// x_n[1] = 1.0;
+// x_n[2] = 2.0;
+// x_n[3] = 3.0;
+ double *x_t; d_zeros(&x_t, n, 1);
+// for(ii=0; ii<n; ii++) x_n[ii] = 1.0;
+ x_t[0] = 1.0;
+ double *y_n; d_zeros(&y_n, n, 1);
+ double *y_t; d_zeros(&y_t, n, 1);
+ double *z_n; d_zeros(&z_n, n, 1);
+ double *z_t; d_zeros(&z_t, n, 1);
+
+ double *x0; d_zeros(&x0, n, 1); x0[0] = 1.0;
+ double *x1; d_zeros(&x1, n, 1); x1[1] = 1.0;
+ double *x2; d_zeros(&x2, n, 1); x2[2] = 1.0;
+ double *x3; d_zeros(&x3, n, 1); x3[3] = 1.0;
+ double *x4; d_zeros(&x4, n, 1); x4[4] = 1.0;
+ double *x5; d_zeros(&x5, n, 1); x5[5] = 1.0;
+ double *x6; d_zeros(&x6, n, 1); x6[6] = 1.0;
+ double *x7; d_zeros(&x7, n, 1); x7[7] = 1.0;
+ double *x8; d_zeros(&x8, n, 1); x8[8] = 1.0;
+ double *x9; d_zeros(&x9, n, 1); x9[9] = 1.0;
+
+ int *ipiv; int_zeros(&ipiv, n, 1);
+
+ //
+ // matrices in matrix struct format
+ //
+ int size_strmat = 5*d_size_strmat(n, n);
+ void *memory_strmat; v_zeros_align(&memory_strmat, size_strmat);
+ char *ptr_memory_strmat = (char *) memory_strmat;
+
+ struct d_strmat sA;
+// d_allocate_strmat(n, n, &sA);
+ d_create_strmat(n, n, &sA, ptr_memory_strmat);
+ ptr_memory_strmat += sA.memory_size;
+ d_cvt_mat2strmat(n, n, A, n, &sA, 0, 0);
+// d_cast_mat2strmat(A, &sA);
+ d_print_strmat(n, n, &sA, 0, 0);
+
+ struct d_strmat sB;
+// d_allocate_strmat(n, n, &sB);
+ d_create_strmat(n, n, &sB, ptr_memory_strmat);
+ ptr_memory_strmat += sB.memory_size;
+ d_cvt_mat2strmat(n, n, B, n, &sB, 0, 0);
+ d_print_strmat(n, n, &sB, 0, 0);
+
+ struct d_strmat sC;
+// d_allocate_strmat(n, n, &sC);
+ d_create_strmat(n, n, &sC, ptr_memory_strmat);
+ ptr_memory_strmat += sC.memory_size;
+
+ struct d_strmat sD;
+// d_allocate_strmat(n, n, &sD);
+ d_create_strmat(n, n, &sD, ptr_memory_strmat);
+ ptr_memory_strmat += sD.memory_size;
+ d_cvt_mat2strmat(n, n, D, n, &sD, 0, 0);
+
+ struct d_strmat sE;
+// d_allocate_strmat(n, n, &sE);
+ d_create_strmat(n, n, &sE, ptr_memory_strmat);
+ ptr_memory_strmat += sE.memory_size;
+
+ struct d_strvec sx_n;
+ d_allocate_strvec(n, &sx_n);
+ d_cvt_vec2strvec(n, x_n, &sx_n, 0);
+
+ struct d_strvec sx_t;
+ d_allocate_strvec(n, &sx_t);
+ d_cvt_vec2strvec(n, x_t, &sx_t, 0);
+
+ struct d_strvec sy_n;
+ d_allocate_strvec(n, &sy_n);
+ d_cvt_vec2strvec(n, y_n, &sy_n, 0);
+
+ struct d_strvec sy_t;
+ d_allocate_strvec(n, &sy_t);
+ d_cvt_vec2strvec(n, y_t, &sy_t, 0);
+
+ struct d_strvec sz_n;
+ d_allocate_strvec(n, &sz_n);
+ d_cvt_vec2strvec(n, z_n, &sz_n, 0);
+
+ struct d_strvec sz_t;
+ d_allocate_strvec(n, &sz_t);
+ d_cvt_vec2strvec(n, z_t, &sz_t, 0);
+
+ struct d_strvec sx0; d_create_strvec(n, &sx0, x0);
+ struct d_strvec sx1; d_create_strvec(n, &sx1, x1);
+ struct d_strvec sx2; d_create_strvec(n, &sx2, x2);
+ struct d_strvec sx3; d_create_strvec(n, &sx3, x3);
+ struct d_strvec sx4; d_create_strvec(n, &sx4, x4);
+ struct d_strvec sx5; d_create_strvec(n, &sx5, x5);
+ struct d_strvec sx6; d_create_strvec(n, &sx6, x6);
+ struct d_strvec sx7; d_create_strvec(n, &sx7, x7);
+ struct d_strvec sx8; d_create_strvec(n, &sx8, x8);
+ struct d_strvec sx9; d_create_strvec(n, &sx9, x9);
+
+ struct d_strvec sz0; d_allocate_strvec(n, &sz0);
+ struct d_strvec sz1; d_allocate_strvec(n, &sz1);
+ struct d_strvec sz2; d_allocate_strvec(n, &sz2);
+ struct d_strvec sz3; d_allocate_strvec(n, &sz3);
+ struct d_strvec sz4; d_allocate_strvec(n, &sz4);
+ struct d_strvec sz5; d_allocate_strvec(n, &sz5);
+ struct d_strvec sz6; d_allocate_strvec(n, &sz6);
+ struct d_strvec sz7; d_allocate_strvec(n, &sz7);
+ struct d_strvec sz8; d_allocate_strvec(n, &sz8);
+ struct d_strvec sz9; d_allocate_strvec(n, &sz9);
+
+ // tests
+ double *v; d_zeros(&v, n, 1);
+ double *vp; d_zeros(&vp, n, 1);
+ double *vm; d_zeros(&vm, n, 1);
+ double *m; d_zeros(&m, n, 1);
+ double *r; d_zeros(&r, n, 1);
+
+ for(ii=0; ii<n; ii++) v[ii] = ii; // x
+ for(ii=0; ii<n; ii++) vp[ii] = 8.0; // upper
+ for(ii=0; ii<n; ii++) vm[ii] = 3.0; // lower
+ for(ii=0; ii<n; ii++) r[ii] = 2*ii+1; // x
+
+ d_print_mat(1, n, v, 1);
+ d_print_mat(1, n, vp, 1);
+ d_print_mat(1, n, vm, 1);
+ d_print_mat(1, n, r, 1);
+
+ struct d_strvec sv; d_create_strvec(n, &sv, v);
+ struct d_strvec svp; d_create_strvec(n, &svp, vp);
+ struct d_strvec svm; d_create_strvec(n, &svm, vm);
+ struct d_strvec sm; d_create_strvec(n, &sm, m);
+ struct d_strvec sr; d_create_strvec(n, &sr, r);
+
+// d_print_tran_strvec(n, &sv, 0);
+// d_print_tran_strvec(n, &svp, 0);
+// d_print_tran_strvec(n, &svm, 0);
+// d_print_tran_strvec(n, &sm, 0);
+// d_print_tran_strvec(n, &sr, 0);
+
+// d_print_tran_strvec(n, &sm, 0);
+// DVECEL_LIBSTR(&sm, 0) = 0.0;
+// DVECEL_LIBSTR(&sm, 1) = 1.0;
+// DVECEL_LIBSTR(&sm, 2) = 2.0;
+// d_print_tran_strvec(n, &sm, 0);
+// return 0;
+
+ double alpha = 1.0;
+ double beta = 0.0;
+ kernel_dgemm_nt_4x4_gen_lib4(4, &alpha, sA.pA, sB.pA, &beta, 0, sD.pA, sA.cn, 0, sD.pA, sE.cn, 1, 3, 1, 3);
+ d_print_strmat(n, n, &sD, 0, 0);
+ return 0;
+ dtrmm_rlnn_libstr(8, 8, alpha, &sA, 3, 0, &sB, 0, 0, &sD, 0, 0);
+// dgemm_nn_libstr(8, 8, 8, alpha, &sB, 0, 0, &sA, 1, 0, beta, &sA, 0, 0, &sD, 0, 0);
+ d_print_strmat(n, n, &sD, 0, 0);
+ return 0;
+// dsyrk_ln_libstr(n, 15, n, 1.0, &sA, 0, 0, &sA, 0, 0, 1.0, &sB, 0, 0, &sD, 0, 0);
+// dpotrf_l_mn_libstr(n, 15, &sD, 0, 0, &sD, 0, 0);
+// dsyrk_dpotrf_ln_libstr(n, 15, n, &sA, 0, 0, &sA, 0, 0, &sB, 0, 0, &sD, 0, 0);
+// dtrmm_rlnn_libstr(n, n, alpha, &sA, 0, 0, &sB, 0, 0, &sD, 0, 0);
+// dgese_libstr(n, n, 0.0/0.0, &sD, 0, 0);
+// kernel_dgemm_nt_4x8_lib4(n, &alpha, sA.pA, sB.pA, sB.cn, &beta, sC.pA, sD.pA);
+// kernel_dgemm_nn_4x8_lib4(n, &alpha, sA.pA, 0, sB.pA, sB.cn, &beta, sC.pA, sD.pA);
+// kernel_dsyrk_nt_l_4x4_gen_lib4(n, &alpha, sA.pA, sB.pA, &beta, 0, sC.pA, sC.cn, 3, sD.pA, sD.cn, 0, 4, 0, 4);
+// kernel_dsyrk_nt_l_8x4_gen_lib4(n, &alpha, sA.pA, sA.cn, sB.pA, &beta, 0, sC.pA, sC.cn, 3, sD.pA, sD.cn, 0, 8, 0, 8);
+// dsyrk_ln_libstr(10, 10, n, 1.0, &sA, 0, 0, &sB, 0, 0, 0.0, &sC, 0, 0, &sD, 1, 0);
+// d_print_strmat(n, n, &sD, 0, 0);
+ dsymv_l_libstr(10, 10, alpha, &sA, 0, 0, &sx0, 0, beta, &sz0, 0, &sz0, 0);
+ dsymv_l_libstr(10, 10, alpha, &sA, 0, 0, &sx1, 0, beta, &sz1, 0, &sz1, 0);
+ dsymv_l_libstr(10, 10, alpha, &sA, 0, 0, &sx2, 0, beta, &sz2, 0, &sz2, 0);
+ dsymv_l_libstr(10, 10, alpha, &sA, 0, 0, &sx3, 0, beta, &sz3, 0, &sz3, 0);
+ dsymv_l_libstr(10, 10, alpha, &sA, 0, 0, &sx4, 0, beta, &sz4, 0, &sz4, 0);
+ dsymv_l_libstr(10, 10, alpha, &sA, 0, 0, &sx5, 0, beta, &sz5, 0, &sz5, 0);
+ dsymv_l_libstr(10, 10, alpha, &sA, 0, 0, &sx6, 0, beta, &sz6, 0, &sz6, 0);
+ dsymv_l_libstr(10, 10, alpha, &sA, 0, 0, &sx7, 0, beta, &sz7, 0, &sz7, 0);
+ dsymv_l_libstr(10, 10, alpha, &sA, 0, 0, &sx8, 0, beta, &sz8, 0, &sz8, 0);
+ dsymv_l_libstr(10, 10, alpha, &sA, 0, 0, &sx9, 0, beta, &sz9, 0, &sz9, 0);
+ d_print_tran_strvec(n, &sz0, 0);
+ d_print_tran_strvec(n, &sz1, 0);
+ d_print_tran_strvec(n, &sz2, 0);
+ d_print_tran_strvec(n, &sz3, 0);
+ d_print_tran_strvec(n, &sz4, 0);
+ d_print_tran_strvec(n, &sz5, 0);
+ d_print_tran_strvec(n, &sz6, 0);
+ d_print_tran_strvec(n, &sz7, 0);
+ d_print_tran_strvec(n, &sz8, 0);
+ d_print_tran_strvec(n, &sz9, 0);
+ return 0;
+
+// d_print_strmat(n, n, &sC, 0, 0);
+// dgese_libstr(n, n, 1.0, &sB, 0, 0);
+// kernel_dger4_sub_4_lib4(6, sB.pA, sA.pA, sC.pA);
+// kernel_dger4_sub_4_vs_lib4(6, sB.pA, sA.pA, sC.pA, 1);
+ return 0;
+
+// d_print_strmat(n, n, &sC, 0, 0);
+// dgese_libstr(n, n, 1.0, &sB, 0, 0);
+// kernel_dger4_sub_4_lib4(6, sB.pA, sA.pA, sC.pA);
+// kernel_dger4_sub_4_vs_lib4(6, sB.pA, sA.pA, sC.pA, 1);
+// kernel_dger4_sub_8_lib4(5, sB.pA, sB.cn, sA.pA, sC.pA, sC.cn);
+// kernel_dger4_sub_8_vs_lib4(5, sB.pA, sB.cn, sA.pA, sC.pA, sC.cn, 5);
+// kernel_dger4_sub_12_lib4(5, sB.pA, sB.cn, sA.pA, sC.pA, sC.cn);
+// kernel_dger4_sub_12_vs_lib4(5, sB.pA, sB.cn, sA.pA, sC.pA, sC.cn, 9);
+// kernel_dger4_sub_8c_lib4(9, sB.pA, sA.cn, sA.pA, sC.pA, sC.cn);
+// kernel_dger4_sub_4c_lib4(9, sB.pA, sA.cn, sA.pA, sC.pA, sC.cn);
+// d_print_strmat(n, n, &sC, 0, 0);
+// return 0;
+
+#if 1
+ dgemm_nt_libstr(n, n, n, 1.0, &sA, 0, 0, &sA, 0, 0, 1.0, &sB, 0, 0, &sC, 0, 0);
+#else
+ dgese_libstr(n, n, 0.1, &sC, 0, 0);
+ DMATEL_LIBSTR(&sC, 0, 0) = 1.0;
+// DMATEL_LIBSTR(&sC, 0, 1) = 1.0;
+ for(ii=1; ii<n-1; ii++)
+ {
+// DMATEL_LIBSTR(&sC, ii, ii-1) = 1.0;
+ DMATEL_LIBSTR(&sC, ii, ii) = 1.0;
+// DMATEL_LIBSTR(&sC, ii, ii+1) = 1.0;
+ }
+// DMATEL_LIBSTR(&sC, n-1, n-2) = 1.0;
+ DMATEL_LIBSTR(&sC, n-1, n-1) = 1.0;
+#endif
+ d_print_strmat(n, n, &sC, 0, 0);
+ dgese_libstr(n, n, 0.0/0.0, &sD, 0, 0);
+// d_print_strmat(n, n, &sA, 0, 0);
+// dgein1_libstr(12.0, &sA, 0, 0);
+// DMATEL_LIBSTR(&sA, 0, 0) = 12.0;
+// DMATEL_LIBSTR(&sA, 1, 0) = 6.0;
+// DMATEL_LIBSTR(&sA, 2, 0) = - 4.0;
+// DMATEL_LIBSTR(&sA, 0, 1) = - 51.0;
+// DMATEL_LIBSTR(&sA, 1, 1) = 167.0;
+// DMATEL_LIBSTR(&sA, 2, 1) = 24.0;
+// DMATEL_LIBSTR(&sA, 0, 2) = 4.0;
+// DMATEL_LIBSTR(&sA, 1, 2) = - 68.0;
+// DMATEL_LIBSTR(&sA, 2, 2) = - 41.0;
+// d_print_strmat(n, n, &sA, 0, 0);
+ d_print_strmat(n, n, &sC, 0, 0);
+// printf("\n%f\n", DGEEL_LIBSTR(&sA, 0, 0));
+// int qr_work_size = dgeqrf_work_size_libstr(n, n);
+ int qr_work_size = dgelqf_work_size_libstr(n, n);
+ void *qr_work;
+ v_zeros_align(&qr_work, qr_work_size);
+// dgeqrf_libstr(10, 10, &sC, 0, 0, &sD, 0, 0, qr_work);
+ dgelqf_libstr(17, 17, &sC, 0, 0, &sD, 0, 0, qr_work);
+// dgecp_libstr(10, 10, &sC, 0, 0, &sD, 0, 0);
+// kernel_dgeqrf_4_lib4(16, 12, sD.pA, sD.cn, sD.dA, qr_work);
+// d_print_strmat(n, n, &sA, 0, 0);
+// kernel_dgeqrf_vs_lib4(10, 16, 0, sD.pA+0, sD.cn, sD.dA);
+// kernel_dgelqf_vs_lib4(10, 10, 10, 0, sD.pA+0, sD.cn, sD.dA);
+ d_print_strmat(n, n, &sD, 0, 0);
+ free(qr_work);
+ return 0;
+
+// dveccl_mask_libstr(n, &svm, 0, &sv, 0, &svp, 0, &sv, 0, &sm, 0);
+// veccl_libstr(n, &svm, 0, &sv, 0, &svp, 0, &sv, 0);
+// d_print_tran_strvec(12, &sv, 0);
+// d_print_tran_strvec(12, &sm, 0);
+// dvecze_libstr(n, &sm, 0, &sr, 0, &sr, 0);
+// d_print_tran_strvec(12, &sr, 0);
+// return 0;
+
+// d_print_strmat(n, n, &sA, 0, 0);
+// dtrsv_unn_libstr(n, &sA, 1, 0, &sx0, 0, &sz0, 0);
+// d_print_tran_strvec(n, &sz0, 0);
+// dtrsv_unn_libstr(n, &sA, 1, 0, &sx1, 0, &sz1, 0);
+// d_print_tran_strvec(n, &sz1, 0);
+// dtrsv_unn_libstr(n, &sA, 1, 0, &sx2, 0, &sz2, 0);
+// d_print_tran_strvec(n, &sz2, 0);
+// dtrsv_unn_libstr(n, &sA, 1, 0, &sx3, 0, &sz3, 0);
+// d_print_tran_strvec(n, &sz3, 0);
+// return 0;
+
+// double alpha = 1.0;
+// double beta = 1.0;
+// kernel_dgemm_nt_4x12_vs_lib4(n, &alpha, sA.pA, sB.pA, sB.cn, &beta, sD.pA, sD.pA, 3, 10);
+// kernel_dgemm_nt_8x8u_vs_lib4(n, &alpha, sA.pA, sA.cn, sB.pA, sB.cn, &beta, sD.pA, sD.cn, sD.pA, sD.cn, 7, 6);
+ dgemm_nn_libstr(n, n, n, 1.0, &sA, 0, 0, &sA, 0, 0, 1.0, &sB, 0, 0, &sD, 0, 0);
+ d_print_strmat(n, n, &sD, 0, 0);
+ dpotrf_l_libstr(16, &sD, 0, 0, &sD, 0, 0);
+ d_print_strmat(n, n, &sD, 0, 0);
+ return 0;;
+
+// dmatse_libstr(n, n, 100.0, &sD, 0, 0);
+
+// for(ii=0; ii<n; ii++)
+// dvecin1_libstr(ii+1, &sx_n, ii);
+// d_print_tran_strvec(n, &sx_n, 0);
+// d_print_strmat(n, n, &sD, 0, 0);
+// // ddiain_libstr(4, -1.0, &sx_n, 1, &sD, 3, 2);
+// ddiaad_libstr(4, -1.0, &sx_n, 1, &sD, 3, 2);
+// d_print_strmat(n, n, &sD, 0, 0);
+// return 0;
+
+// d_print_tran_strvec(n, &sx_n, 0);
+// dgemm_l_diag_libstr(n, n, 1.0, &sx_n, 0, &sA, 0, 0, 0.0, &sD, 0, 0, &sD, 0, 0);
+// dgemm_r_diag_libstr(n, n, 1.0, &sA, 0, 0, &sx_n, 0, 0.0, &sD, 0, 0, &sD, 0, 0);
+// d_print_strmat(n, n, &sD, 0, 0);
+// exit(1);
+
+// dsetmat_libstr(n, n, 0.0, &sD, 0, 0);
+// dmatin1_libstr(2.0, &sD, 0, 0);
+// dmatin1_libstr(2.0, &sD, 1, 1);
+// dmatin1_libstr(2.0, &sD, 2, 2);
+// dmatin1_libstr(1.0, &sD, 1, 0);
+// dmatin1_libstr(1.0, &sD, 2, 1);
+// dmatin1_libstr(0.5, &sD, 2, 0);
+// d_print_strmat(n, n, &sD, 0, 0);
+// d_print_tran_strvec(n, &sx_n, 0);
+// dtrsv_lnn_libstr(n, n, &sD, 0, 0, &sx_n, 0, &sz_n, 0);
+// d_print_tran_strvec(n, &sz_n, 0);
+// exit(1);
+
+// dgemm_nt_libstr(8, 8, 8, 1.0, &sB, 0, 0, &sA, 1, 0, 0.0, &sD, 0, 0, &sD, 0, 0);
+// d_print_strmat(n, n, &sD, 0, 0);
+// return 0;
+
+// double alpha = 1.0;
+// kernel_dtrmm_nn_rl_4x4_gen_lib4(7, &alpha, sB.pA, 2, sA.pA, sA.cn, 1, sD.pA, sD.cn, 0, 4, 1, 4);
+// kernel_dtrmm_nn_rl_4x4_gen_lib4(7, &alpha, sB.pA+sB.cn*4, 2, sA.pA, sA.cn, 1, sD.pA+sD.cn*4, sD.cn, 0, 4, 1, 4);
+// kernel_dtrmm_nn_rl_4x4_lib4(4, &alpha, sB.pA, sA.pA, sA.cn+4*4, sD.pA+4*4);
+// kernel_dtrmm_nn_rl_4x4_gen_lib4(3, &alpha, sB.pA+sB.cn*4+4*4, 2, sA.pA+sB.cn*4+4*4, sA.cn, 1, sD.pA+sD.cn*4+4*4, sD.cn, 0, 4, 0, 4);
+ dtrmm_rlnn_libstr(8, 8, 1.0, &sB, 0, 0, &sA, 3, 0, &sD, 2, 1);
+ d_print_strmat(n, n, &sD, 0, 0);
+ return 0;
+
+ dtrmv_lnn_libstr(8, 8, &sA, 0, 0, &sx0, 0, &sx0, 0);
+ dtrmv_lnn_libstr(8, 8, &sA, 0, 0, &sx1, 0, &sx1, 0);
+ dtrmv_lnn_libstr(8, 8, &sA, 0, 0, &sx2, 0, &sx2, 0);
+ dtrmv_lnn_libstr(8, 8, &sA, 0, 0, &sx3, 0, &sx3, 0);
+ dtrmv_lnn_libstr(8, 8, &sA, 0, 0, &sx4, 0, &sx4, 0);
+ dtrmv_lnn_libstr(8, 8, &sA, 0, 0, &sx5, 0, &sx5, 0);
+ dtrmv_lnn_libstr(8, 8, &sA, 0, 0, &sx6, 0, &sx6, 0);
+ dtrmv_lnn_libstr(8, 8, &sA, 0, 0, &sx7, 0, &sx7, 0);
+ dtrmv_lnn_libstr(8, 8, &sA, 0, 0, &sx8, 0, &sx8, 0);
+ dtrmv_lnn_libstr(8, 8, &sA, 0, 0, &sx9, 0, &sx9, 0);
+ d_print_tran_strvec(n, &sx0, 0);
+ d_print_tran_strvec(n, &sx1, 0);
+ d_print_tran_strvec(n, &sx2, 0);
+ d_print_tran_strvec(n, &sx3, 0);
+ d_print_tran_strvec(n, &sx4, 0);
+ d_print_tran_strvec(n, &sx5, 0);
+ d_print_tran_strvec(n, &sx6, 0);
+ d_print_tran_strvec(n, &sx7, 0);
+ d_print_tran_strvec(n, &sx8, 0);
+ d_print_tran_strvec(n, &sx9, 0);
+ return 0;
+
+ dgemv_t_libstr(2, 8, 1.0, &sA, 2, 0, &sx_n, 0, 0.0, &sy_n, 0, &sz_n, 0);
+ d_print_tran_strvec(n, &sz_n, 0);
+ return 0;
+
+ dgemm_nt_libstr(4, 8, 8, 1.0, &sB, 0, 0, &sA, 0, 0, 0.0, &sB, 0, 0, &sD, 3, 0);
+// d_print_strmat(n, n, &sB, 0, 0);
+ d_print_strmat(n, n, &sD, 0, 0);
+ exit(1);
+
+ dpotrf_l_libstr(n, &sD, 0, 0, &sD, 0, 0);
+// dgetrf_nopivot_libstr(n, n, &sD, 0, 0, &sD, 0, 0);
+// dgetrf_libstr(n, n, &sD, 0, 0, &sD, 0, 0, ipiv);
+ d_print_strmat(n, n, &sD, 0, 0);
+#if defined(LA_HIGH_PERFORMANCE) | defined(LA_REFERENCE)
+ d_print_mat(1, n, sD.dA, 1);
+#endif
+ int_print_mat(1, n, ipiv, 1);
+ dtrsm_rltn_libstr(n, n, 1.0, &sD, 0, 0, &sB, 0, 0, &sE, 0, 0);
+ d_print_strmat(n, n, &sE, 0, 0);
+ exit(1);
+
+#if 1 // solve P L U X = P B
+ d_print_strmat(n, n, &sB, 0, 0);
+ drowpe_libstr(n, ipiv, &sB);
+ d_print_strmat(n, n, &sB, 0, 0);
+
+ dtrsm_llnu_libstr(n, n, 1.0, &sD, 0, 0, &sB, 0, 0, &sE, 0, 0);
+ d_print_strmat(n, n, &sE, 0, 0);
+ dtrsm_lunn_libstr(n, n, 1.0, &sD, 0, 0, &sE, 0, 0, &sE, 0, 0);
+ d_print_strmat(n, n, &sE, 0, 0);
+#else // solve X^T (P L U)^T = B^T P^T
+ d_print_strmat(n, n, &sB, 0, 0);
+ dcolpe_libstr(n, ipiv, &sB);
+ d_print_strmat(n, n, &sB, 0, 0);
+
+ dtrsm_rltu_libstr(n, n, 1.0, &sD, 0, 0, &sB, 0, 0, &sE, 0, 0);
+ d_print_strmat(n, n, &sE, 0, 0);
+ dtrsm_rutn_libstr(n, n, 1.0, &sD, 0, 0, &sE, 0, 0, &sE, 0, 0);
+ d_print_strmat(n, n, &sE, 0, 0);
+#endif
+
+// d_print_strmat(n, n, &sA, 0, 0);
+// d_print_strmat(n, n, &sB, 0, 0);
+// d_print_strmat(n, n, &sD, 0, 0);
+// d_print_strmat(n, n, &sE, 0, 0);
+
+// d_cvt_strmat2mat(n, n, &sE, 0, 0, C, n);
+// d_print_mat(n, n, C, n);
+
+ dtrtr_u_libstr(6, &sE, 2, 0, &sB, 1, 0);
+ d_print_strmat(n, n, &sB, 0, 0);
+
+ d_print_strmat(n, n, &sA, 0, 0);
+ dgemv_nt_libstr(6, n, 1.0, 1.0, &sA, 0, 0, &sx_n, 0, &sx_t, 0, 0.0, 0.0, &sy_n, 0, &sy_t, 0, &sz_n, 0, &sz_t, 0);
+// dsymv_l_libstr(5, 5, 1.0, &sA, 0, 0, x_n, 0.0, y_n, z_n);
+ d_print_mat(1, n, z_n, 1);
+ d_print_mat(1, n, z_t, 1);
+
+
+
+
+// for(ii=0; ii<sE.pm*sE.cn; ii++) sE.pA[ii] = 0.0;
+// double alpha = 0.0;
+// double beta = 1.0;
+// kernel_dgemm_nt_4x4_gen_lib4(4, &alpha, sA.pA, sB.pA, &beta, 3, sA.pA, sA.cn, 0, sE.pA, sE.cn, 0, 4, 2, 2);
+// d_print_strmat(n, n, &sE, 0, 0);
+
+ // free memory
+ free(A);
+ free(B);
+ free(C);
+ free(D);
+ free(ipiv);
+// d_free_strmat(&sA);
+// d_free_strmat(&sB);
+// d_free_strmat(&sD);
+ v_free_align(memory_strmat);
+
+ return 0;
+
+ }
diff --git a/test_problems/test_s_strmat.c b/test_problems/test_s_strmat.c
new file mode 100644
index 0000000..456db87
--- /dev/null
+++ b/test_problems/test_s_strmat.c
@@ -0,0 +1,191 @@
+/**************************************************************************************************
+* *
+* This file is part of BLASFEO. *
+* *
+* BLASFEO -- BLAS For Embedded Optimization. *
+* Copyright (C) 2016-2017 by Gianluca Frison. *
+* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl. *
+* All rights reserved. *
+* *
+* HPMPC is free software; you can redistribute it and/or *
+* modify it under the terms of the GNU Lesser General Public *
+* License as published by the Free Software Foundation; either *
+* version 2.1 of the License, or (at your option) any later version. *
+* *
+* HPMPC is distributed in the hope that it will be useful, *
+* but WITHOUT ANY WARRANTY; without even the implied warranty of *
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. *
+* See the GNU Lesser General Public License for more details. *
+* *
+* You should have received a copy of the GNU Lesser General Public *
+* License along with HPMPC; if not, write to the Free Software *
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA *
+* *
+* Author: Gianluca Frison, giaf (at) dtu.dk *
+* gianluca.frison (at) imtek.uni-freiburg.de *
+* *
+**************************************************************************************************/
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <sys/time.h>
+
+#include "../include/blasfeo_common.h"
+#include "../include/blasfeo_i_aux_ext_dep.h"
+#include "../include/blasfeo_s_aux_ext_dep.h"
+#include "../include/blasfeo_s_aux.h"
+#include "../include/blasfeo_s_kernel.h"
+#include "../include/blasfeo_s_blas.h"
+
+
+int main()
+ {
+
+#if defined(LA_HIGH_PERFORMANCE)
+
+ printf("\nLA provided by HIGH_PERFORMANCE\n\n");
+
+#elif defined(LA_REFERENCE)
+
+ printf("\nLA provided by REFERENCE\n\n");
+
+#elif defined(LA_BLAS)
+
+ printf("\nLA provided by BLAS\n\n");
+
+#else
+
+ printf("\nLA provided by ???\n\n");
+ exit(2);
+
+#endif
+
+ int ii, jj;
+
+ int n = 16;
+
+ //
+ // matrices in column-major format
+ //
+ float *A; s_zeros(&A, n, n);
+ for(ii=0; ii<n*n; ii++) A[ii] = ii;
+// for(jj=0; jj<n; jj++)
+// for(ii=0; ii<jj; ii++)
+// A[ii+n*jj] = 0.0/0.0;
+// s_print_mat(n, n, A, n);
+
+ float *B; s_zeros(&B, n, n);
+ for(ii=0; ii<n; ii++) B[ii*(n+1)] = 1.0;
+// s_print_mat(n, n, B, n);
+
+ float *D; s_zeros(&D, n, n);
+ for(ii=0; ii<n*n; ii++) D[ii] = -1.0;
+// s_print_mat(n, n, B, n);
+
+
+ //
+ // matrices in matrix struct format
+ //
+
+ struct s_strmat sA;
+ s_allocate_strmat(n, n, &sA);
+ s_cvt_mat2strmat(n, n, A, n, &sA, 0, 0);
+ s_print_strmat(n, n, &sA, 0, 0);
+
+ struct s_strmat sB;
+ s_allocate_strmat(n, n, &sB);
+ s_cvt_mat2strmat(n, n, B, n, &sB, 0, 0);
+ s_print_strmat(n, n, &sB, 0, 0);
+
+ struct s_strmat sD;
+ s_allocate_strmat(n, n, &sD);
+ s_cvt_mat2strmat(n, n, D, n, &sD, 0, 0);
+
+ struct s_strvec sx;
+ s_allocate_strvec(n, &sx);
+ sx.pa[7] = 1.0;
+ s_print_tran_strvec(n, &sx, 0);
+
+ struct s_strvec sz0;
+ s_allocate_strvec(n, &sz0);
+
+ struct s_strvec sz1;
+ s_allocate_strvec(n, &sz1);
+
+ //
+ // tests
+ //
+
+ float alpha = 1.0;
+ float beta = 0.0;
+// kernel_sgemm_nt_24x4_lib8(4, &alpha, sA.pA, sA.cn, sB.pA, &beta, sD.pA, sD.cn, sD.pA, sD.cn);
+// kernel_sgemm_nt_16x4_lib8(4, &alpha, sA.pA, sA.cn, sB.pA, &beta, sD.pA, sD.cn, sD.pA, sD.cn);
+// kernel_sgemm_nt_8x8_lib8(5, &alpha, sA.pA, sB.pA, &beta, sD.pA, sD.pA);
+// kernel_sgemm_nt_8x4_lib8(5, &alpha, sA.pA, sB.pA, &beta, sD.pA, sD.pA);
+// kernel_sgemm_nt_4x8_gen_lib8(8, &alpha, sA.pA, sB.pA, &beta, 0, sD.pA, sD.cn, 0, sD.pA, sD.cn, 0, 4, 0, 8);
+// kernel_sgemm_nt_8x4_vs_lib8(8, &alpha, sA.pA, sB.pA, &beta, sD.pA, sD.pA, 7, 4);
+// kernel_sgemm_nt_8x4_lib8(8, &alpha, sB.pA, sA.pA+4, &beta, sA.pA+4*8, sD.pA+4*8);
+// kernel_sgemm_nn_16x4_lib8(4, &alpha, sA.pA, sA.cn, 0, sB.pA, sB.cn, &beta, sD.pA, sD.cn, sD.pA, sD.cn);
+// kernel_sgemm_nt_12x4_lib4(4, &alpha, sA.pA, sA.cn, sB.pA, &beta, sD.pA, sD.cn, sD.pA, sD.cn);
+// kernel_sgemm_nt_8x8_lib4(8, &alpha, sA.pA, sA.cn, sB.pA, sB.cn, &beta, sD.pA, sD.cn, sD.pA, sD.cn);
+// kernel_sgemm_nt_8x4_lib4(2, &alpha, sA.pA, sA.cn, sB.pA, &beta, sD.pA, sD.cn, sD.pA, sD.cn);
+// s_print_strmat(n, n, &sD, 0, 0);
+// return 0;
+// sgemm_nt_libstr(n, n, 5, 1.0, &sA, 0, 0, &sB, 0, 0, 0.0, &sB, 0, 0, &sD, 0, 0);
+// ssyrk_ln_libstr(n, n, 1.0, &sA, 0, 0, &sB, 0, 0, 0.0, &sB, 0, 0, &sD, 0, 0);
+// ssyrk_ln_mn_libstr(n, n, n, 1.0, &sA, 0, 0, &sB, 0, 0, 0.0, &sB, 0, 0, &sD, 0, 0);
+// kernel_ssyrk_nt_l_8x8_lib8(n, &alpha, sA.pA, sA.pA, &beta, sB.pA, sD.pA);
+// sgecp_libstr(16, 16, &sA, 2, 0, &sD, 1, 0);
+// sgetr_libstr(16, 16, &sA, 2, 0, &sD, 2, 0);
+// s_print_strmat(n, n, &sD, 0, 0);
+// sgemv_n_libstr(6, 6, 1.0, &sA, 1, 0, &sx, 0, 0.0, &sz0, 0, &sz0, 0);
+// sgemv_t_libstr(11, 8, 1.0, &sA, 0, 0, &sx, 0, 0.0, &sz0, 0, &sz0, 0);
+// strmv_lnn_libstr(6, 6, &sA, 1, 0, &sx, 0, &sz0, 0);
+// strmv_ltn_libstr(10, 10, &sA, 1, 0, &sx, 0, &sz0, 0);
+// sA.pA[0] = 1.0;
+// strsv_lnn_libstr(10, &sA, 0, 0, &sx, 0, &sz0, 0);
+// for(ii=0; ii<8; ii++) sA.dA[ii] = 1.0/sgeex1_libstr(&sA, ii, ii);
+// kernel_strsv_lt_inv_8_lib8(0, sA.pA, sA.cn, sA.dA, sx.pa, sx.pa, sz0.pa);
+// kernel_strsv_lt_inv_8_vs_lib8(0, sA.pA, sA.cn, sA.dA, sx.pa, sx.pa, sz0.pa, 3);
+// s_print_strmat(n, n, &sA, 0, 0);
+// strsv_ltn_libstr(12, &sA, 0, 0, &sx, 0, &sz0, 0);
+// strsv_ltn_mn_libstr(11, 3, &sA, 0, 0, &sx, 0, &sz0, 0);
+// s_print_strmat(n, n, &sA, 0, 0);
+// kernel_sgemv_nt_4_lib8(n, &alpha, &alpha, sA.pA, sA.cn, sx.pa, sx.pa, &beta, sz1.pa, sz0.pa, sz1.pa);
+// kernel_sgemv_nt_4_vs_lib8(n, &alpha, &alpha, sA.pA, sA.cn, sx.pa, sx.pa, &beta, sz1.pa, sz0.pa, sz1.pa, 3);
+// sgemv_nt_libstr(5, 2, alpha, alpha, &sA, 0, 0, &sx, 0, &sx, 0, beta, beta, &sz0, 0, &sz1, 0, &sz0, 0, &sz1, 0);
+// ssymv_l_libstr(10, 10, alpha, &sA, 1, 0, &sx, 0, beta, &sz0, 0, &sz1, 0);
+// s_print_tran_strvec(n, &sz0, 0);
+// s_print_tran_strvec(n, &sz1, 0);
+// return 0;
+// sgesc_libstr(16, 9, 2.0, &sD, 0, 0);
+// s_print_strmat(n, n, &sD, 0, 0);
+// kernel_spotrf_nt_l_8x8_lib8(0, sD.pA, sD.pA, sD.pA, sD.pA, sx.pa);
+// s_print_strmat(n, n, &sD, 0, 0);
+// s_print_tran_strvec(n, &sx, 0);
+// kernel_strsm_nt_rl_inv_8x8_lib8(0, sD.pA, sD.pA, sD.pA+8*sD.cn, sD.pA+8*sD.cn, sD.pA, sx.pa);
+// s_print_strmat(n, n, &sD, 0, 0);
+// kernel_spotrf_nt_l_8x8_lib8(8, sD.pA+8*sD.cn, sD.pA+8*sD.cn, sD.pA+8*sD.cn+8*8, sD.pA+8*sD.cn+8*8, sx.pa+8);
+// spotrf_l_mn_libstr(23, 17, &sD, 0, 0, &sD, 0, 0);
+// spotrf_l_libstr(n, &sD, 0, 0, &sD, 0, 0);
+// kernel_strmm_nn_rl_8x4_lib8(3, &alpha, sB.pA, 7, sA.pA, sA.cn, sD.pA);
+ strmm_rlnn_libstr(12, 8, 1.0, &sA, 0, 0, &sB, 0, 0, &sD, 0, 0);
+ s_print_strmat(n, n, &sD, 0, 0);
+ return 0;
+
+
+
+ //
+ // free memory
+ //
+
+ free(A);
+ free(B);
+ free(D);
+ s_free_strmat(&sA);
+ s_free_strmat(&sB);
+ s_free_strmat(&sD);
+
+ return 0;
+
+ }