Squashed 'third_party/blasfeo/' content from commit 2a828ca
Change-Id: If1c3caa4799b2d4eb287ef83fa17043587ef07a3
git-subtree-dir: third_party/blasfeo
git-subtree-split: 2a828ca5442108c4c58e4b42b061a0469043f6ea
diff --git a/auxiliary/Makefile b/auxiliary/Makefile
new file mode 100644
index 0000000..d1242bd
--- /dev/null
+++ b/auxiliary/Makefile
@@ -0,0 +1,124 @@
+###################################################################################################
+# #
+# This file is part of BLASFEO. #
+# #
+# BLASFEO -- BLAS For Embedded Optimization. #
+# Copyright (C) 2016-2017 by Gianluca Frison. #
+# Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl. #
+# All rights reserved. #
+# #
+# HPMPC is free software; you can redistribute it and/or #
+# modify it under the terms of the GNU Lesser General Public #
+# License as published by the Free Software Foundation; either #
+# version 2.1 of the License, or (at your option) any later version. #
+# #
+# HPMPC is distributed in the hope that it will be useful, #
+# but WITHOUT ANY WARRANTY; without even the implied warranty of #
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. #
+# See the GNU Lesser General Public License for more details. #
+# #
+# You should have received a copy of the GNU Lesser General Public #
+# License along with HPMPC; if not, write to the Free Software #
+# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA #
+# #
+# Author: Gianluca Frison, giaf (at) dtu.dk #
+# gianluca.frison (at) imtek.uni-freiburg.de #
+# #
+###################################################################################################
+
+include ../Makefile.rule
+
+OBJS =
+
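+# Select the auxiliary-routine objects that match the linear algebra backend (LA)
+# and the target architecture (TARGET) configured in Makefile.rule.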
+ifeq ($(LA), HIGH_PERFORMANCE)
+
+ifeq ($(TARGET), X64_INTEL_HASWELL)
+OBJS += d_aux_lib4.o
+OBJS += s_aux_lib8.o
+OBJS += m_aux_lib48.o
+endif
+
+ifeq ($(TARGET), X64_INTEL_SANDY_BRIDGE)
+OBJS += d_aux_lib4.o
+OBJS += s_aux_lib8.o
+OBJS += m_aux_lib48.o
+endif
+
+ifeq ($(TARGET), X64_INTEL_CORE)
+OBJS += d_aux_lib4.o
+OBJS += s_aux_lib4.o
+OBJS += m_aux_lib44.o
+endif
+
+ifeq ($(TARGET), X64_AMD_BULLDOZER)
+OBJS += d_aux_lib4.o
+OBJS += s_aux_lib4.o
+OBJS += m_aux_lib44.o
+endif
+
+ifeq ($(TARGET), ARMV8A_ARM_CORTEX_A57)
+OBJS += d_aux_lib4.o
+OBJS += s_aux_lib4.o
+OBJS += m_aux_lib44.o
+endif
+
+ifeq ($(TARGET), ARMV7A_ARM_CORTEX_A15)
+OBJS += d_aux_lib4.o
+OBJS += s_aux_lib4.o
+OBJS += m_aux_lib44.o
+endif
+
+ifeq ($(TARGET), GENERIC)
+OBJS += d_aux_lib4.o
+OBJS += s_aux_lib4.o
+OBJS += m_aux_lib44.o
+endif
+
+else # LA_REFERENCE | LA_BLAS
+
+OBJS += d_aux_lib.o
+OBJS += s_aux_lib.o
+OBJS += m_aux_lib.o
+
+endif # LA choice
+
+ifeq ($(EXT_DEP), 1)
+# auxiliary routines with external dependencies (EXT_DEP)
+OBJS += d_aux_ext_dep_lib.o
+OBJS += s_aux_ext_dep_lib.o
+OBJS += v_aux_ext_dep_lib.o
+OBJS += i_aux_ext_dep_lib.o
+endif
+
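+# Besides the objects above, recurse into the subdirectories that hold the
+# target-specific kernel sources (avx2, avx, c99).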
+obj: $(OBJS)
+ifeq ($(TARGET), X64_INTEL_HASWELL)
+ ( cd avx2; $(MAKE) obj)
+ ( cd avx; $(MAKE) obj)
+ ( cd c99; $(MAKE) obj)
+endif
+ifeq ($(TARGET), X64_INTEL_SANDY_BRIDGE)
+ ( cd avx; $(MAKE) obj)
+ ( cd c99; $(MAKE) obj)
+endif
+ifeq ($(TARGET), X64_INTEL_CORE)
+ ( cd c99; $(MAKE) obj)
+endif
+ifeq ($(TARGET), X64_AMD_BULLDOZER)
+ ( cd c99; $(MAKE) obj)
+endif
+ifeq ($(TARGET), ARMV8A_ARM_CORTEX_A57)
+ ( cd c99; $(MAKE) obj)
+endif
+ifeq ($(TARGET), ARMV7A_ARM_CORTEX_A15)
+ ( cd c99; $(MAKE) obj)
+endif
+ifeq ($(TARGET), GENERIC)
+ ( cd c99; $(MAKE) obj)
+endif
+
+
+clean:
+ rm -f *.o
+	$(MAKE) -C avx2 clean
+	$(MAKE) -C avx clean
+	$(MAKE) -C c99 clean
diff --git a/auxiliary/avx/Makefile b/auxiliary/avx/Makefile
new file mode 100644
index 0000000..84e0154
--- /dev/null
+++ b/auxiliary/avx/Makefile
@@ -0,0 +1,50 @@
+###################################################################################################
+# #
+# This file is part of BLASFEO. #
+# #
+# BLASFEO -- BLAS For Embedded Optimization. #
+# Copyright (C) 2016-2017 by Gianluca Frison. #
+# Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl. #
+# All rights reserved. #
+# #
+# HPMPC is free software; you can redistribute it and/or #
+# modify it under the terms of the GNU Lesser General Public #
+# License as published by the Free Software Foundation; either #
+# version 2.1 of the License, or (at your option) any later version. #
+# #
+# HPMPC is distributed in the hope that it will be useful, #
+# but WITHOUT ANY WARRANTY; without even the implied warranty of #
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. #
+# See the GNU Lesser General Public License for more details. #
+# #
+# You should have received a copy of the GNU Lesser General Public #
+# License along with HPMPC; if not, write to the Free Software #
+# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA #
+# #
+# Author: Gianluca Frison, giaf (at) dtu.dk #
+# gianluca.frison (at) imtek.uni-freiburg.de #
+# #
+###################################################################################################
+
+include ../../Makefile.rule
+
+OBJS =
+
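+# The AVX copy/transpose kernels are only built for the high-performance backend
+# on AVX-capable targets (Haswell, Sandy Bridge).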
+ifeq ($(LA), HIGH_PERFORMANCE)
+
+ifeq ($(TARGET), X64_INTEL_HASWELL)
+OBJS += kernel_dgecp_lib4.o
+endif
+
+ifeq ($(TARGET), X64_INTEL_SANDY_BRIDGE)
+OBJS += kernel_dgecp_lib4.o kernel_dgetr_lib4.o
+endif
+
+else # LA_REFERENCE | LA_BLAS
+
+endif # LA choice
+
+obj: $(OBJS)
+
+clean:
+ rm -f *.o
diff --git a/auxiliary/avx/kernel_dgecp_lib4.c b/auxiliary/avx/kernel_dgecp_lib4.c
new file mode 100644
index 0000000..4bc8c9a
--- /dev/null
+++ b/auxiliary/avx/kernel_dgecp_lib4.c
@@ -0,0 +1,3024 @@
+/**************************************************************************************************
+* *
+* This file is part of BLASFEO. *
+* *
+* BLASFEO -- BLAS For Embedded Optimization. *
+* Copyright (C) 2016-2017 by Gianluca Frison. *
+* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl. *
+* All rights reserved. *
+* *
+* HPMPC is free software; you can redistribute it and/or *
+* modify it under the terms of the GNU Lesser General Public *
+* License as published by the Free Software Foundation; either *
+* version 2.1 of the License, or (at your option) any later version. *
+* *
+* HPMPC is distributed in the hope that it will be useful, *
+* but WITHOUT ANY WARRANTY; without even the implied warranty of *
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. *
+* See the GNU Lesser General Public License for more details. *
+* *
+* You should have received a copy of the GNU Lesser General Public *
+* License along with HPMPC; if not, write to the Free Software *
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA *
+* *
+* Author: Gianluca Frison, giaf (at) dtu.dk *
+* gianluca.frison (at) imtek.uni-freiburg.de *
+* *
+**************************************************************************************************/
+
+#include <mmintrin.h>
+#include <xmmintrin.h> // SSE
+#include <emmintrin.h> // SSE2
+#include <pmmintrin.h> // SSE3
+#include <smmintrin.h> // SSE4
+#include <immintrin.h> // AVX
+
+
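+// Storage note: matrices are kept in panel-major format with panels of bs=4
+// rows; element (i,j) within a panel sits at offset i+bs*j, and the next 4-row
+// panel of the same matrix starts bs*sda (resp. bs*sdb) doubles later.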
+
+// both A and B are aligned to 256-bit boundaries
+void kernel_dgecp_8_0_lib4(int tri, int kmax, double alpha, double *A0, int sda, double *B0, int sdb)
+ {
+
+ if(tri==1)
+ {
+ // A and C are lower triangular
+ // kmax+1 8-wide + end 7x7 triangle
+
+ kmax += 1;
+ }
+
+ if(kmax<=0)
+ return;
+
+ const int bs = 4;
+
+ double *A1 = A0 + bs*sda;
+ double *B1 = B0 + bs*sdb;
+
+ __m256d
+ alpha_0,
+ a_0;
+
+ __m128d
+ c_0;
+
+ int k;
+
+ alpha_0 = _mm256_broadcast_sd( &alpha );
+
+ for(k=0; k<kmax-3; k+=4)
+ {
+
+ a_0 = _mm256_load_pd( &A0[0+bs*0] );
+ a_0 = _mm256_mul_pd( alpha_0, a_0 );
+ _mm256_store_pd( &B0[0+bs*0], a_0 );
+
+ a_0 = _mm256_load_pd( &A0[0+bs*1] );
+ a_0 = _mm256_mul_pd( alpha_0, a_0 );
+ _mm256_store_pd( &B0[0+bs*1], a_0 );
+
+ a_0 = _mm256_load_pd( &A0[0+bs*2] );
+ a_0 = _mm256_mul_pd( alpha_0, a_0 );
+ _mm256_store_pd( &B0[0+bs*2], a_0 );
+
+ a_0 = _mm256_load_pd( &A0[0+bs*3] );
+ a_0 = _mm256_mul_pd( alpha_0, a_0 );
+ _mm256_store_pd( &B0[0+bs*3], a_0 );
+
+ A0 += 16;
+ B0 += 16;
+
+ a_0 = _mm256_load_pd( &A1[0+bs*0] );
+ a_0 = _mm256_mul_pd( alpha_0, a_0 );
+ _mm256_store_pd( &B1[0+bs*0], a_0 );
+
+ a_0 = _mm256_load_pd( &A1[0+bs*1] );
+ a_0 = _mm256_mul_pd( alpha_0, a_0 );
+ _mm256_store_pd( &B1[0+bs*1], a_0 );
+
+ a_0 = _mm256_load_pd( &A1[0+bs*2] );
+ a_0 = _mm256_mul_pd( alpha_0, a_0 );
+ _mm256_store_pd( &B1[0+bs*2], a_0 );
+
+ a_0 = _mm256_load_pd( &A1[0+bs*3] );
+ a_0 = _mm256_mul_pd( alpha_0, a_0 );
+ _mm256_store_pd( &B1[0+bs*3], a_0 );
+
+ A1 += 16;
+ B1 += 16;
+
+ }
+ for(; k<kmax; k++)
+ {
+
+ a_0 = _mm256_load_pd( &A0[0+bs*0] );
+ a_0 = _mm256_mul_pd( alpha_0, a_0 );
+ _mm256_store_pd( &B0[0+bs*0], a_0 );
+
+ A0 += 4;
+ B0 += 4;
+
+ a_0 = _mm256_load_pd( &A1[0+bs*0] );
+ a_0 = _mm256_mul_pd( alpha_0, a_0 );
+ _mm256_store_pd( &B1[0+bs*0], a_0 );
+
+ A1 += 4;
+ B1 += 4;
+
+ }
+
+ if(tri==1)
+ {
+ // 7x7 triangle
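+		// copy the strictly lower 7x7 triangle of the final diagonal block column
+		// by column, using 128-bit loads/stores for the short upper pieces and
+		// full 256-bit loads/stores for the lower panel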
+
+ c_0 = _mm_load_sd( &A0[1+0*bs] );
+ c_0 = _mm_mul_pd( _mm256_castpd256_pd128( alpha_0 ), c_0 );
+ _mm_store_sd( &B0[1+0*bs], c_0 );
+ c_0 = _mm_load_pd( &A0[2+0*bs] );
+ c_0 = _mm_mul_pd( _mm256_castpd256_pd128( alpha_0 ), c_0 );
+ _mm_store_pd( &B0[2+0*bs], c_0 );
+ a_0 = _mm256_load_pd( &A1[0+0*bs] );
+ a_0 = _mm256_mul_pd( alpha_0, a_0 );
+ _mm256_store_pd( &B1[0+0*bs], a_0 );
+
+ c_0 = _mm_load_pd( &A0[2+1*bs] );
+ c_0 = _mm_mul_pd( _mm256_castpd256_pd128( alpha_0 ), c_0 );
+ _mm_store_pd( &B0[2+1*bs], c_0 );
+ a_0 = _mm256_load_pd( &A1[0+1*bs] );
+ a_0 = _mm256_mul_pd( alpha_0, a_0 );
+ _mm256_store_pd( &B1[0+1*bs], a_0 );
+
+ c_0 = _mm_load_sd( &A0[3+2*bs] );
+ c_0 = _mm_mul_pd( _mm256_castpd256_pd128( alpha_0 ), c_0 );
+ _mm_store_sd( &B0[3+2*bs], c_0 );
+ a_0 = _mm256_load_pd( &A1[0+2*bs] );
+ a_0 = _mm256_mul_pd( alpha_0, a_0 );
+ _mm256_store_pd( &B1[0+2*bs], a_0 );
+
+ a_0 = _mm256_load_pd( &A1[0+3*bs] );
+ a_0 = _mm256_mul_pd( alpha_0, a_0 );
+ _mm256_store_pd( &B1[0+3*bs], a_0 );
+
+ c_0 = _mm_load_sd( &A1[1+4*bs] );
+ c_0 = _mm_mul_pd( _mm256_castpd256_pd128( alpha_0 ), c_0 );
+ _mm_store_sd( &B1[1+4*bs], c_0 );
+ c_0 = _mm_load_pd( &A1[2+4*bs] );
+ c_0 = _mm_mul_pd( _mm256_castpd256_pd128( alpha_0 ), c_0 );
+ _mm_store_pd( &B1[2+4*bs], c_0 );
+
+ c_0 = _mm_load_pd( &A1[2+5*bs] );
+ c_0 = _mm_mul_pd( _mm256_castpd256_pd128( alpha_0 ), c_0 );
+ _mm_store_pd( &B1[2+5*bs], c_0 );
+
+ c_0 = _mm_load_sd( &A1[3+6*bs] );
+ c_0 = _mm_mul_pd( _mm256_castpd256_pd128( alpha_0 ), c_0 );
+ _mm_store_sd( &B1[3+6*bs], c_0 );
+
+ }
+
+ }
+
+
+
+// both A and B are aligned to 256-bit boundaries, 1 element of A must be skipped
+void kernel_dgecp_8_1_lib4(int tri, int kmax, double alpha, double *A0, int sda, double *B0, int sdb)
+ {
+
+ if(tri==1)
+ {
+ // A and C are lower triangular
+ // kmax+1 8-wide + end 7x7 triangle
+
+ kmax += 1;
+ }
+
+ if(kmax<=0)
+ return;
+
+ const int bs = 4;
+
+ double *A1 = A0 + bs*sda;
+ double *A2 = A1 + bs*sda;
+ double *B1 = B0 + bs*sdb;
+
+ __m256d
+ alpha_0,
+ a_0, a_1, a_2,
+ b_0, b_1;
+
+ __m128d
+ c_0;
+
+ int k;
+
+ alpha_0 = _mm256_broadcast_sd( &alpha );
+
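+	// A is offset by 1 row w.r.t. the 4-row panels: for every column,
+	// permute2f128 joins the upper 128-bit lane of one panel with the lower lane
+	// of the next, and shuffle_pd then picks alternating doubles so that b_0/b_1
+	// hold the 8 rows of A shifted down by one row into B's panel alignment.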
+ for(k=0; k<kmax-3; k+=4)
+ {
+
+ a_2 = _mm256_load_pd( &A2[0+bs*0] );
+ a_1 = _mm256_load_pd( &A1[0+bs*0] );
+ a_0 = _mm256_load_pd( &A0[0+bs*0] );
+ a_2 = _mm256_permute2f128_pd( a_1, a_2, 0x21 );
+ b_0 = _mm256_permute2f128_pd( a_0, a_1, 0x21 );
+ b_1 = _mm256_shuffle_pd( a_1, a_2, 0x5 );
+ b_0 = _mm256_shuffle_pd( a_0, b_0, 0x5 );
+ b_0 = _mm256_mul_pd( alpha_0, b_0 );
+ b_1 = _mm256_mul_pd( alpha_0, b_1 );
+ _mm256_store_pd( &B1[0+bs*0], b_1 );
+ _mm256_store_pd( &B0[0+bs*0], b_0 );
+
+ a_2 = _mm256_load_pd( &A2[0+bs*1] );
+ a_1 = _mm256_load_pd( &A1[0+bs*1] );
+ a_0 = _mm256_load_pd( &A0[0+bs*1] );
+ a_2 = _mm256_permute2f128_pd( a_1, a_2, 0x21 );
+ b_0 = _mm256_permute2f128_pd( a_0, a_1, 0x21 );
+ b_1 = _mm256_shuffle_pd( a_1, a_2, 0x5 );
+ b_0 = _mm256_shuffle_pd( a_0, b_0, 0x5 );
+ b_0 = _mm256_mul_pd( alpha_0, b_0 );
+ b_1 = _mm256_mul_pd( alpha_0, b_1 );
+ _mm256_store_pd( &B1[0+bs*1], b_1 );
+ _mm256_store_pd( &B0[0+bs*1], b_0 );
+
+ a_2 = _mm256_load_pd( &A2[0+bs*2] );
+ a_1 = _mm256_load_pd( &A1[0+bs*2] );
+ a_0 = _mm256_load_pd( &A0[0+bs*2] );
+ a_2 = _mm256_permute2f128_pd( a_1, a_2, 0x21 );
+ b_0 = _mm256_permute2f128_pd( a_0, a_1, 0x21 );
+ b_1 = _mm256_shuffle_pd( a_1, a_2, 0x5 );
+ b_0 = _mm256_shuffle_pd( a_0, b_0, 0x5 );
+ b_0 = _mm256_mul_pd( alpha_0, b_0 );
+ b_1 = _mm256_mul_pd( alpha_0, b_1 );
+ _mm256_store_pd( &B1[0+bs*2], b_1 );
+ _mm256_store_pd( &B0[0+bs*2], b_0 );
+
+ a_2 = _mm256_load_pd( &A2[0+bs*3] );
+ a_1 = _mm256_load_pd( &A1[0+bs*3] );
+ a_0 = _mm256_load_pd( &A0[0+bs*3] );
+ a_2 = _mm256_permute2f128_pd( a_1, a_2, 0x21 );
+ b_0 = _mm256_permute2f128_pd( a_0, a_1, 0x21 );
+ b_1 = _mm256_shuffle_pd( a_1, a_2, 0x5 );
+ b_0 = _mm256_shuffle_pd( a_0, b_0, 0x5 );
+ b_0 = _mm256_mul_pd( alpha_0, b_0 );
+ b_1 = _mm256_mul_pd( alpha_0, b_1 );
+ _mm256_store_pd( &B1[0+bs*3], b_1 );
+ _mm256_store_pd( &B0[0+bs*3], b_0 );
+
+ A0 += 16;
+ A1 += 16;
+ A2 += 16;
+ B0 += 16;
+ B1 += 16;
+
+ }
+ for(; k<kmax; k++)
+ {
+
+ a_2 = _mm256_load_pd( &A2[0+bs*0] );
+ a_1 = _mm256_load_pd( &A1[0+bs*0] );
+ a_0 = _mm256_load_pd( &A0[0+bs*0] );
+ a_2 = _mm256_permute2f128_pd( a_1, a_2, 0x21 );
+ b_0 = _mm256_permute2f128_pd( a_0, a_1, 0x21 );
+ b_1 = _mm256_shuffle_pd( a_1, a_2, 0x5 );
+ b_0 = _mm256_shuffle_pd( a_0, b_0, 0x5 );
+ b_0 = _mm256_mul_pd( alpha_0, b_0 );
+ b_1 = _mm256_mul_pd( alpha_0, b_1 );
+ _mm256_store_pd( &B1[0+bs*0], b_1 );
+ _mm256_store_pd( &B0[0+bs*0], b_0 );
+
+ A0 += 4;
+ A1 += 4;
+ A2 += 4;
+ B0 += 4;
+ B1 += 4;
+
+ }
+
+ if(tri==1)
+ {
+ // 7x7 triangle
+
+ c_0 = _mm_load_pd( &A0[2+bs*0] );
+ c_0 = _mm_mul_pd( _mm256_castpd256_pd128( alpha_0 ), c_0 );
+ _mm_storeu_pd( &B0[1+bs*0], c_0 );
+ c_0 = _mm_load_sd( &A1[0+bs*0] );
+ c_0 = _mm_mul_pd( _mm256_castpd256_pd128( alpha_0 ), c_0 );
+ _mm_store_sd( &B0[3+bs*0], c_0 );
+ c_0 = _mm_load_sd( &A1[1+bs*0] );
+ c_0 = _mm_mul_pd( _mm256_castpd256_pd128( alpha_0 ), c_0 );
+ _mm_store_sd( &B1[0+bs*0], c_0 );
+ c_0 = _mm_load_pd( &A1[2+bs*0] );
+ c_0 = _mm_mul_pd( _mm256_castpd256_pd128( alpha_0 ), c_0 );
+ _mm_storeu_pd( &B1[1+bs*0], c_0 );
+ c_0 = _mm_load_sd( &A2[0+bs*0] );
+ c_0 = _mm_mul_pd( _mm256_castpd256_pd128( alpha_0 ), c_0 );
+ _mm_store_sd( &B1[3+bs*0], c_0 );
+
+ c_0 = _mm_load_sd( &A0[3+bs*1] );
+ c_0 = _mm_mul_pd( _mm256_castpd256_pd128( alpha_0 ), c_0 );
+ _mm_store_sd( &B0[2+bs*1], c_0 );
+ c_0 = _mm_load_sd( &A1[0+bs*1] );
+ c_0 = _mm_mul_pd( _mm256_castpd256_pd128( alpha_0 ), c_0 );
+ _mm_store_sd( &B0[3+bs*1], c_0 );
+ c_0 = _mm_load_sd( &A1[1+bs*1] );
+ c_0 = _mm_mul_pd( _mm256_castpd256_pd128( alpha_0 ), c_0 );
+ _mm_store_sd( &B1[0+bs*1], c_0 );
+ c_0 = _mm_load_pd( &A1[2+bs*1] );
+ c_0 = _mm_mul_pd( _mm256_castpd256_pd128( alpha_0 ), c_0 );
+ _mm_storeu_pd( &B1[1+bs*1], c_0 );
+ c_0 = _mm_load_sd( &A2[0+bs*1] );
+ c_0 = _mm_mul_pd( _mm256_castpd256_pd128( alpha_0 ), c_0 );
+ _mm_store_sd( &B1[3+bs*1], c_0 );
+
+ c_0 = _mm_load_sd( &A1[0+bs*2] );
+ c_0 = _mm_mul_pd( _mm256_castpd256_pd128( alpha_0 ), c_0 );
+ _mm_store_sd( &B0[3+bs*2], c_0 );
+ c_0 = _mm_load_sd( &A1[1+bs*2] );
+ c_0 = _mm_mul_pd( _mm256_castpd256_pd128( alpha_0 ), c_0 );
+ _mm_store_sd( &B1[0+bs*2], c_0 );
+ c_0 = _mm_load_pd( &A1[2+bs*2] );
+ c_0 = _mm_mul_pd( _mm256_castpd256_pd128( alpha_0 ), c_0 );
+ _mm_storeu_pd( &B1[1+bs*2], c_0 );
+ c_0 = _mm_load_sd( &A2[0+bs*2] );
+ c_0 = _mm_mul_pd( _mm256_castpd256_pd128( alpha_0 ), c_0 );
+ _mm_store_sd( &B1[3+bs*2], c_0 );
+
+ c_0 = _mm_load_sd( &A1[1+bs*3] );
+ c_0 = _mm_mul_pd( _mm256_castpd256_pd128( alpha_0 ), c_0 );
+ _mm_store_sd( &B1[0+bs*3], c_0 );
+ c_0 = _mm_load_pd( &A1[2+bs*3] );
+ c_0 = _mm_mul_pd( _mm256_castpd256_pd128( alpha_0 ), c_0 );
+ _mm_storeu_pd( &B1[1+bs*3], c_0 );
+ c_0 = _mm_load_sd( &A2[0+bs*3] );
+ c_0 = _mm_mul_pd( _mm256_castpd256_pd128( alpha_0 ), c_0 );
+ _mm_store_sd( &B1[3+bs*3], c_0 );
+
+ c_0 = _mm_load_pd( &A1[2+bs*4] );
+ c_0 = _mm_mul_pd( _mm256_castpd256_pd128( alpha_0 ), c_0 );
+ _mm_storeu_pd( &B1[1+bs*4], c_0 );
+ c_0 = _mm_load_sd( &A2[0+bs*4] );
+ c_0 = _mm_mul_pd( _mm256_castpd256_pd128( alpha_0 ), c_0 );
+ _mm_store_sd( &B1[3+bs*4], c_0 );
+
+ c_0 = _mm_load_sd( &A1[3+bs*5] );
+ c_0 = _mm_mul_pd( _mm256_castpd256_pd128( alpha_0 ), c_0 );
+ _mm_store_sd( &B1[2+bs*5], c_0 );
+ c_0 = _mm_load_sd( &A2[0+bs*5] );
+ c_0 = _mm_mul_pd( _mm256_castpd256_pd128( alpha_0 ), c_0 );
+ _mm_store_sd( &B1[3+bs*5], c_0 );
+
+		c_0 = _mm_load_sd( &A2[0+bs*6] );
+		c_0 = _mm_mul_pd( _mm256_castpd256_pd128( alpha_0 ), c_0 );
+		_mm_store_sd( &B1[3+bs*6], c_0 );
+
+ }
+ }
+
+
+
+// both A and B are aligned to 256-bit boundaries, 2 elements of A must be skipped
+void kernel_dgecp_8_2_lib4(int tri, int kmax, double alpha, double *A0, int sda, double *B0, int sdb)
+ {
+
+ if(tri==1)
+ {
+ // A and C are lower triangular
+ // kmax+1 8-wide + end 7x7 triangle
+
+ kmax += 1;
+ }
+
+ if(kmax<=0)
+ return;
+
+ const int bs = 4;
+
+ double *A1 = A0 + bs*sda;
+ double *A2 = A1 + bs*sda;
+ double *B1 = B0 + bs*sdb;
+
+ __m256d
+ alpha_0,
+ a_0, a_1, a_2,
+ b_0, b_1;
+
+ __m128d
+ c_0;
+
+ int k;
+
+ alpha_0 = _mm256_broadcast_sd( &alpha );
+
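+	// a 2-row offset is a whole 128-bit lane: permute2f128 alone realigns A
+	// across adjacent panels, no shuffle_pd is needed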
+ for(k=0; k<kmax-3; k+=4)
+ {
+
+ a_0 = _mm256_load_pd( &A0[0+bs*0] );
+ a_1 = _mm256_load_pd( &A1[0+bs*0] );
+ a_2 = _mm256_load_pd( &A2[0+bs*0] );
+ b_0 = _mm256_permute2f128_pd( a_0, a_1, 0x21 );
+ b_1 = _mm256_permute2f128_pd( a_1, a_2, 0x21 );
+ b_0 = _mm256_mul_pd( alpha_0, b_0 );
+ b_1 = _mm256_mul_pd( alpha_0, b_1 );
+ _mm256_store_pd( &B0[0+bs*0], b_0 );
+ _mm256_store_pd( &B1[0+bs*0], b_1 );
+
+ a_0 = _mm256_load_pd( &A0[0+bs*1] );
+ a_1 = _mm256_load_pd( &A1[0+bs*1] );
+ a_2 = _mm256_load_pd( &A2[0+bs*1] );
+ b_0 = _mm256_permute2f128_pd( a_0, a_1, 0x21 );
+ b_1 = _mm256_permute2f128_pd( a_1, a_2, 0x21 );
+ b_0 = _mm256_mul_pd( alpha_0, b_0 );
+ b_1 = _mm256_mul_pd( alpha_0, b_1 );
+ _mm256_store_pd( &B0[0+bs*1], b_0 );
+ _mm256_store_pd( &B1[0+bs*1], b_1 );
+
+ a_0 = _mm256_load_pd( &A0[0+bs*2] );
+ a_1 = _mm256_load_pd( &A1[0+bs*2] );
+ a_2 = _mm256_load_pd( &A2[0+bs*2] );
+ b_0 = _mm256_permute2f128_pd( a_0, a_1, 0x21 );
+ b_1 = _mm256_permute2f128_pd( a_1, a_2, 0x21 );
+ b_0 = _mm256_mul_pd( alpha_0, b_0 );
+ b_1 = _mm256_mul_pd( alpha_0, b_1 );
+ _mm256_store_pd( &B0[0+bs*2], b_0 );
+ _mm256_store_pd( &B1[0+bs*2], b_1 );
+
+ a_0 = _mm256_load_pd( &A0[0+bs*3] );
+ a_1 = _mm256_load_pd( &A1[0+bs*3] );
+ a_2 = _mm256_load_pd( &A2[0+bs*3] );
+ b_0 = _mm256_permute2f128_pd( a_0, a_1, 0x21 );
+ b_1 = _mm256_permute2f128_pd( a_1, a_2, 0x21 );
+ b_0 = _mm256_mul_pd( alpha_0, b_0 );
+ b_1 = _mm256_mul_pd( alpha_0, b_1 );
+ _mm256_store_pd( &B0[0+bs*3], b_0 );
+ _mm256_store_pd( &B1[0+bs*3], b_1 );
+
+ A0 += 16;
+ A1 += 16;
+ A2 += 16;
+ B0 += 16;
+ B1 += 16;
+
+ }
+ for(; k<kmax; k++)
+ {
+
+ a_0 = _mm256_load_pd( &A0[0+bs*0] );
+ a_1 = _mm256_load_pd( &A1[0+bs*0] );
+ a_2 = _mm256_load_pd( &A2[0+bs*0] );
+ b_0 = _mm256_permute2f128_pd( a_0, a_1, 0x21 );
+ b_1 = _mm256_permute2f128_pd( a_1, a_2, 0x21 );
+ b_0 = _mm256_mul_pd( alpha_0, b_0 );
+ b_1 = _mm256_mul_pd( alpha_0, b_1 );
+ _mm256_store_pd( &B0[0+bs*0], b_0 );
+ _mm256_store_pd( &B1[0+bs*0], b_1 );
+
+ A0 += 4;
+ A1 += 4;
+ A2 += 4;
+ B0 += 4;
+ B1 += 4;
+
+ }
+
+ if(tri==1)
+ {
+ // 7x7 triangle
+
+ c_0 = _mm_load_sd( &A0[3+bs*0] );
+ c_0 = _mm_mul_pd( _mm256_castpd256_pd128( alpha_0 ), c_0 );
+ _mm_store_sd( &B0[1+bs*0], c_0 );
+ c_0 = _mm_load_pd( &A1[0+bs*0] );
+ c_0 = _mm_mul_pd( _mm256_castpd256_pd128( alpha_0 ), c_0 );
+ _mm_store_pd( &B0[2+bs*0], c_0 );
+ c_0 = _mm_load_pd( &A1[2+bs*0] );
+ c_0 = _mm_mul_pd( _mm256_castpd256_pd128( alpha_0 ), c_0 );
+ _mm_store_pd( &B1[0+bs*0], c_0 );
+ c_0 = _mm_load_pd( &A2[0+bs*0] );
+ c_0 = _mm_mul_pd( _mm256_castpd256_pd128( alpha_0 ), c_0 );
+ _mm_store_pd( &B1[2+bs*0], c_0 );
+
+ c_0 = _mm_load_pd( &A1[0+bs*1] );
+ c_0 = _mm_mul_pd( _mm256_castpd256_pd128( alpha_0 ), c_0 );
+ _mm_store_pd( &B0[2+bs*1], c_0 );
+ c_0 = _mm_load_pd( &A1[2+bs*1] );
+ c_0 = _mm_mul_pd( _mm256_castpd256_pd128( alpha_0 ), c_0 );
+ _mm_store_pd( &B1[0+bs*1], c_0 );
+ c_0 = _mm_load_pd( &A2[0+bs*1] );
+ c_0 = _mm_mul_pd( _mm256_castpd256_pd128( alpha_0 ), c_0 );
+ _mm_store_pd( &B1[2+bs*1], c_0 );
+
+ c_0 = _mm_load_sd( &A1[1+bs*2] );
+ c_0 = _mm_mul_pd( _mm256_castpd256_pd128( alpha_0 ), c_0 );
+ _mm_store_sd( &B0[3+bs*2], c_0 );
+ c_0 = _mm_load_pd( &A1[2+bs*2] );
+ c_0 = _mm_mul_pd( _mm256_castpd256_pd128( alpha_0 ), c_0 );
+ _mm_store_pd( &B1[0+bs*2], c_0 );
+ c_0 = _mm_load_pd( &A2[0+bs*2] );
+ c_0 = _mm_mul_pd( _mm256_castpd256_pd128( alpha_0 ), c_0 );
+ _mm_store_pd( &B1[2+bs*2], c_0 );
+
+ c_0 = _mm_load_pd( &A1[2+bs*3] );
+ c_0 = _mm_mul_pd( _mm256_castpd256_pd128( alpha_0 ), c_0 );
+ _mm_store_pd( &B1[0+bs*3], c_0 );
+ c_0 = _mm_load_pd( &A2[0+bs*3] );
+ c_0 = _mm_mul_pd( _mm256_castpd256_pd128( alpha_0 ), c_0 );
+ _mm_store_pd( &B1[2+bs*3], c_0 );
+
+ c_0 = _mm_load_sd( &A1[3+bs*4] );
+ c_0 = _mm_mul_pd( _mm256_castpd256_pd128( alpha_0 ), c_0 );
+ _mm_store_sd( &B1[1+bs*4], c_0 );
+ c_0 = _mm_load_pd( &A2[0+bs*4] );
+ c_0 = _mm_mul_pd( _mm256_castpd256_pd128( alpha_0 ), c_0 );
+ _mm_store_pd( &B1[2+bs*4], c_0 );
+
+ c_0 = _mm_load_pd( &A2[0+bs*5] );
+ c_0 = _mm_mul_pd( _mm256_castpd256_pd128( alpha_0 ), c_0 );
+ _mm_store_pd( &B1[2+bs*5], c_0 );
+
+ c_0 = _mm_load_sd( &A2[1+bs*6] );
+ c_0 = _mm_mul_pd( _mm256_castpd256_pd128( alpha_0 ), c_0 );
+ _mm_store_sd( &B1[3+bs*6], c_0 );
+
+ }
+
+ }
+
+
+
+// both A and B are aligned to 256-bit boundaries, 3 elements of A must be skipped
+void kernel_dgecp_8_3_lib4(int tri, int kmax, double alpha, double *A0, int sda, double *B0, int sdb)
+ {
+
+ if(tri==1)
+ {
+ // A and C are lower triangular
+ // kmax+1 8-wide + end 7x7 triangle
+
+ kmax += 1;
+ }
+
+ if(kmax<=0)
+ return;
+
+ const int bs = 4;
+
+ double *A1 = A0 + bs*sda;
+ double *A2 = A1 + bs*sda;
+ double *B1 = B0 + bs*sdb;
+
+ __m256d
+ alpha_0,
+ a_0, a_1, a_2,
+ b_0, b_1;
+
+ __m128d
+ c_0;
+
+ int k;
+
+ alpha_0 = _mm256_broadcast_sd( &alpha );
+
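+	// 3-row offset: permute2f128 merges lanes of adjacent panels and shuffle_pd
+	// then selects the doubles so b_0/b_1 hold A shifted down by three rows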
+ for(k=0; k<kmax-3; k+=4)
+ {
+
+ a_0 = _mm256_load_pd( &A0[0+bs*0] );
+ a_1 = _mm256_load_pd( &A1[0+bs*0] );
+ a_2 = _mm256_load_pd( &A2[0+bs*0] );
+ a_0 = _mm256_permute2f128_pd( a_0, a_1, 0x21 );
+ b_1 = _mm256_permute2f128_pd( a_1, a_2, 0x21 );
+ b_0 = _mm256_shuffle_pd( a_0, a_1, 0x5 );
+ b_1 = _mm256_shuffle_pd( b_1, a_2, 0x5 );
+ b_0 = _mm256_mul_pd( alpha_0, b_0 );
+ b_1 = _mm256_mul_pd( alpha_0, b_1 );
+ _mm256_store_pd( &B0[0+bs*0], b_0 );
+ _mm256_store_pd( &B1[0+bs*0], b_1 );
+
+ a_0 = _mm256_load_pd( &A0[0+bs*1] );
+ a_1 = _mm256_load_pd( &A1[0+bs*1] );
+ a_2 = _mm256_load_pd( &A2[0+bs*1] );
+ a_0 = _mm256_permute2f128_pd( a_0, a_1, 0x21 );
+ b_1 = _mm256_permute2f128_pd( a_1, a_2, 0x21 );
+ b_0 = _mm256_shuffle_pd( a_0, a_1, 0x5 );
+ b_1 = _mm256_shuffle_pd( b_1, a_2, 0x5 );
+ b_0 = _mm256_mul_pd( alpha_0, b_0 );
+ b_1 = _mm256_mul_pd( alpha_0, b_1 );
+ _mm256_store_pd( &B0[0+bs*1], b_0 );
+ _mm256_store_pd( &B1[0+bs*1], b_1 );
+
+ a_0 = _mm256_load_pd( &A0[0+bs*2] );
+ a_1 = _mm256_load_pd( &A1[0+bs*2] );
+ a_2 = _mm256_load_pd( &A2[0+bs*2] );
+ a_0 = _mm256_permute2f128_pd( a_0, a_1, 0x21 );
+ b_1 = _mm256_permute2f128_pd( a_1, a_2, 0x21 );
+ b_0 = _mm256_shuffle_pd( a_0, a_1, 0x5 );
+ b_1 = _mm256_shuffle_pd( b_1, a_2, 0x5 );
+ b_0 = _mm256_mul_pd( alpha_0, b_0 );
+ b_1 = _mm256_mul_pd( alpha_0, b_1 );
+ _mm256_store_pd( &B0[0+bs*2], b_0 );
+ _mm256_store_pd( &B1[0+bs*2], b_1 );
+
+ a_0 = _mm256_load_pd( &A0[0+bs*3] );
+ a_1 = _mm256_load_pd( &A1[0+bs*3] );
+ a_2 = _mm256_load_pd( &A2[0+bs*3] );
+ a_0 = _mm256_permute2f128_pd( a_0, a_1, 0x21 );
+ b_1 = _mm256_permute2f128_pd( a_1, a_2, 0x21 );
+ b_0 = _mm256_shuffle_pd( a_0, a_1, 0x5 );
+ b_1 = _mm256_shuffle_pd( b_1, a_2, 0x5 );
+ b_0 = _mm256_mul_pd( alpha_0, b_0 );
+ b_1 = _mm256_mul_pd( alpha_0, b_1 );
+ _mm256_store_pd( &B0[0+bs*3], b_0 );
+ _mm256_store_pd( &B1[0+bs*3], b_1 );
+
+ A0 += 16;
+ A1 += 16;
+ A2 += 16;
+ B0 += 16;
+ B1 += 16;
+
+ }
+ for(; k<kmax; k++)
+ {
+
+ a_0 = _mm256_load_pd( &A0[0+bs*0] );
+ a_1 = _mm256_load_pd( &A1[0+bs*0] );
+ a_2 = _mm256_load_pd( &A2[0+bs*0] );
+ a_0 = _mm256_permute2f128_pd( a_0, a_1, 0x21 );
+ b_1 = _mm256_permute2f128_pd( a_1, a_2, 0x21 );
+ b_0 = _mm256_shuffle_pd( a_0, a_1, 0x5 );
+ b_1 = _mm256_shuffle_pd( b_1, a_2, 0x5 );
+ b_0 = _mm256_mul_pd( alpha_0, b_0 );
+ b_1 = _mm256_mul_pd( alpha_0, b_1 );
+ _mm256_store_pd( &B0[0+bs*0], b_0 );
+ _mm256_store_pd( &B1[0+bs*0], b_1 );
+
+ A0 += 4;
+ A1 += 4;
+ A2 += 4;
+ B0 += 4;
+ B1 += 4;
+
+ }
+
+ if(tri==1)
+ {
+ // 7x7 triangle
+
+ c_0 = _mm_load_pd( &A1[0+bs*0] );
+ c_0 = _mm_mul_pd( _mm256_castpd256_pd128( alpha_0 ), c_0 );
+ _mm_storeu_pd( &B0[1+bs*0], c_0 );
+ c_0 = _mm_load_sd( &A1[2+bs*0] );
+ c_0 = _mm_mul_pd( _mm256_castpd256_pd128( alpha_0 ), c_0 );
+ _mm_store_sd( &B0[3+bs*0], c_0 );
+ c_0 = _mm_load_sd( &A1[3+bs*0] );
+ c_0 = _mm_mul_pd( _mm256_castpd256_pd128( alpha_0 ), c_0 );
+ _mm_store_sd( &B1[0+bs*0], c_0 );
+ c_0 = _mm_load_pd( &A2[0+bs*0] );
+ c_0 = _mm_mul_pd( _mm256_castpd256_pd128( alpha_0 ), c_0 );
+ _mm_storeu_pd( &B1[1+bs*0], c_0 );
+ c_0 = _mm_load_sd( &A2[2+bs*0] );
+ c_0 = _mm_mul_pd( _mm256_castpd256_pd128( alpha_0 ), c_0 );
+ _mm_store_sd( &B1[3+bs*0], c_0 );
+
+ c_0 = _mm_load_sd( &A1[1+bs*1] );
+ c_0 = _mm_mul_pd( _mm256_castpd256_pd128( alpha_0 ), c_0 );
+ _mm_store_sd( &B0[2+bs*1], c_0 );
+ c_0 = _mm_load_sd( &A1[2+bs*1] );
+ c_0 = _mm_mul_pd( _mm256_castpd256_pd128( alpha_0 ), c_0 );
+ _mm_store_sd( &B0[3+bs*1], c_0 );
+ c_0 = _mm_load_sd( &A1[3+bs*1] );
+ c_0 = _mm_mul_pd( _mm256_castpd256_pd128( alpha_0 ), c_0 );
+ _mm_store_sd( &B1[0+bs*1], c_0 );
+ c_0 = _mm_load_pd( &A2[0+bs*1] );
+ c_0 = _mm_mul_pd( _mm256_castpd256_pd128( alpha_0 ), c_0 );
+ _mm_storeu_pd( &B1[1+bs*1], c_0 );
+ c_0 = _mm_load_sd( &A2[2+bs*1] );
+ c_0 = _mm_mul_pd( _mm256_castpd256_pd128( alpha_0 ), c_0 );
+ _mm_store_sd( &B1[3+bs*1], c_0 );
+
+ c_0 = _mm_load_sd( &A1[2+bs*2] );
+ c_0 = _mm_mul_pd( _mm256_castpd256_pd128( alpha_0 ), c_0 );
+ _mm_store_sd( &B0[3+bs*2], c_0 );
+ c_0 = _mm_load_sd( &A1[3+bs*2] );
+ c_0 = _mm_mul_pd( _mm256_castpd256_pd128( alpha_0 ), c_0 );
+ _mm_store_sd( &B1[0+bs*2], c_0 );
+ c_0 = _mm_load_pd( &A2[0+bs*2] );
+ c_0 = _mm_mul_pd( _mm256_castpd256_pd128( alpha_0 ), c_0 );
+ _mm_storeu_pd( &B1[1+bs*2], c_0 );
+ c_0 = _mm_load_sd( &A2[2+bs*2] );
+ c_0 = _mm_mul_pd( _mm256_castpd256_pd128( alpha_0 ), c_0 );
+ _mm_store_sd( &B1[3+bs*2], c_0 );
+
+ c_0 = _mm_load_sd( &A1[3+bs*3] );
+ c_0 = _mm_mul_pd( _mm256_castpd256_pd128( alpha_0 ), c_0 );
+ _mm_store_sd( &B1[0+bs*3], c_0 );
+ c_0 = _mm_load_pd( &A2[0+bs*3] );
+ c_0 = _mm_mul_pd( _mm256_castpd256_pd128( alpha_0 ), c_0 );
+ _mm_storeu_pd( &B1[1+bs*3], c_0 );
+ c_0 = _mm_load_sd( &A2[2+bs*3] );
+ c_0 = _mm_mul_pd( _mm256_castpd256_pd128( alpha_0 ), c_0 );
+ _mm_store_sd( &B1[3+bs*3], c_0 );
+
+ c_0 = _mm_load_pd( &A2[0+bs*4] );
+ c_0 = _mm_mul_pd( _mm256_castpd256_pd128( alpha_0 ), c_0 );
+ _mm_storeu_pd( &B1[1+bs*4], c_0 );
+ c_0 = _mm_load_sd( &A2[2+bs*4] );
+ c_0 = _mm_mul_pd( _mm256_castpd256_pd128( alpha_0 ), c_0 );
+ _mm_store_sd( &B1[3+bs*4], c_0 );
+
+ c_0 = _mm_load_sd( &A2[1+bs*5] );
+ c_0 = _mm_mul_pd( _mm256_castpd256_pd128( alpha_0 ), c_0 );
+ _mm_store_sd( &B1[2+bs*5], c_0 );
+ c_0 = _mm_load_sd( &A2[2+bs*5] );
+ c_0 = _mm_mul_pd( _mm256_castpd256_pd128( alpha_0 ), c_0 );
+ _mm_store_sd( &B1[3+bs*5], c_0 );
+
+ c_0 = _mm_load_sd( &A2[2+bs*6] );
+ c_0 = _mm_mul_pd( _mm256_castpd256_pd128( alpha_0 ), c_0 );
+ _mm_store_sd( &B1[3+bs*6], c_0 );
+
+ }
+
+ }
+
+
+
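+// The kernel_dgecp_{4,3,2,1}_* variants below handle strips of fewer than 8
+// rows; in the _X_Y suffix, X is the number of rows copied and Y the row
+// offset of A within its 4-row panel.
+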
+// both A and B are aligned to 256-bit boundaries
+void kernel_dgecp_4_0_lib4(int tri, int kmax, double alpha, double *A, double *B)
+ {
+
+ if(tri==1)
+ {
+ // A and C are lower triangular
+ // kmax+1 4-wide + end 3x3 triangle
+
+ kmax += 1;
+ }
+
+ if(kmax<=0)
+ return;
+
+ const int bs = 4;
+
+ __m256d
+ alpha_0,
+ a_0;
+
+ __m128d
+ c_0;
+
+ int k;
+
+ alpha_0 = _mm256_broadcast_sd( &alpha );
+
+ for(k=0; k<kmax-3; k+=4)
+ {
+
+ a_0 = _mm256_load_pd( &A[0+bs*0] );
+ a_0 = _mm256_mul_pd( alpha_0, a_0 );
+ _mm256_store_pd( &B[0+bs*0], a_0 );
+
+ a_0 = _mm256_load_pd( &A[0+bs*1] );
+ a_0 = _mm256_mul_pd( alpha_0, a_0 );
+ _mm256_store_pd( &B[0+bs*1], a_0 );
+
+ a_0 = _mm256_load_pd( &A[0+bs*2] );
+ a_0 = _mm256_mul_pd( alpha_0, a_0 );
+ _mm256_store_pd( &B[0+bs*2], a_0 );
+
+ a_0 = _mm256_load_pd( &A[0+bs*3] );
+ a_0 = _mm256_mul_pd( alpha_0, a_0 );
+ _mm256_store_pd( &B[0+bs*3], a_0 );
+
+ A += 16;
+ B += 16;
+
+ }
+ for(; k<kmax; k++)
+ {
+
+ a_0 = _mm256_load_pd( &A[0+bs*0] );
+ a_0 = _mm256_mul_pd( alpha_0, a_0 );
+ _mm256_store_pd( &B[0+bs*0], a_0 );
+
+ A += 4;
+ B += 4;
+
+ }
+
+ if(tri==1)
+ {
+ // 3x3 triangle
+
+ c_0 = _mm_load_sd( &A[1+bs*0] );
+ c_0 = _mm_mul_pd( _mm256_castpd256_pd128( alpha_0 ), c_0 );
+ _mm_store_sd( &B[1+bs*0], c_0 );
+ c_0 = _mm_load_pd( &A[2+bs*0] );
+ c_0 = _mm_mul_pd( _mm256_castpd256_pd128( alpha_0 ), c_0 );
+ _mm_store_pd( &B[2+bs*0], c_0 );
+
+ c_0 = _mm_load_pd( &A[2+bs*1] );
+ c_0 = _mm_mul_pd( _mm256_castpd256_pd128( alpha_0 ), c_0 );
+ _mm_store_pd( &B[2+bs*1], c_0 );
+
+ c_0 = _mm_load_sd( &A[3+bs*2] );
+ c_0 = _mm_mul_pd( _mm256_castpd256_pd128( alpha_0 ), c_0 );
+ _mm_store_sd( &B[3+bs*2], c_0 );
+
+ }
+
+ }
+
+
+
+// both A and B are aligned to 256-bit boundaries, 1 element of A must be skipped
+void kernel_dgecp_4_1_lib4(int tri, int kmax, double alpha, double *A0, int sda, double *B)
+ {
+
+ if(tri==1)
+ {
+ // A and C are lower triangular
+ // kmax+1 4-wide + end 3x3 triangle
+
+ kmax += 1;
+ }
+
+ if(kmax<=0)
+ return;
+
+ const int bs = 4;
+
+ double *A1 = A0 + bs*sda;
+
+ __m256d
+ alpha_0,
+ a_0, a_1,
+ b_0;
+
+ __m128d
+ c_0;
+
+ int k;
+
+ alpha_0 = _mm256_broadcast_sd( &alpha );
+
+ for(k=0; k<kmax-3; k+=4)
+ {
+
+ a_0 = _mm256_load_pd( &A0[0+bs*0] );
+ a_1 = _mm256_load_pd( &A1[0+bs*0] );
+ a_1 = _mm256_permute2f128_pd( a_0, a_1, 0x21 );
+ b_0 = _mm256_shuffle_pd( a_0, a_1, 0x5 );
+ b_0 = _mm256_mul_pd( alpha_0, b_0 );
+ _mm256_store_pd( &B[0+bs*0], b_0 );
+
+ a_0 = _mm256_load_pd( &A0[0+bs*1] );
+ a_1 = _mm256_load_pd( &A1[0+bs*1] );
+ a_1 = _mm256_permute2f128_pd( a_0, a_1, 0x21 );
+ b_0 = _mm256_shuffle_pd( a_0, a_1, 0x5 );
+ b_0 = _mm256_mul_pd( alpha_0, b_0 );
+ _mm256_store_pd( &B[0+bs*1], b_0 );
+
+ a_0 = _mm256_load_pd( &A0[0+bs*2] );
+ a_1 = _mm256_load_pd( &A1[0+bs*2] );
+ a_1 = _mm256_permute2f128_pd( a_0, a_1, 0x21 );
+ b_0 = _mm256_shuffle_pd( a_0, a_1, 0x5 );
+ b_0 = _mm256_mul_pd( alpha_0, b_0 );
+ _mm256_store_pd( &B[0+bs*2], b_0 );
+
+ a_0 = _mm256_load_pd( &A0[0+bs*3] );
+ a_1 = _mm256_load_pd( &A1[0+bs*3] );
+ a_1 = _mm256_permute2f128_pd( a_0, a_1, 0x21 );
+ b_0 = _mm256_shuffle_pd( a_0, a_1, 0x5 );
+ b_0 = _mm256_mul_pd( alpha_0, b_0 );
+ _mm256_store_pd( &B[0+bs*3], b_0 );
+
+ A0 += 16;
+ A1 += 16;
+ B += 16;
+
+ }
+ for(; k<kmax; k++)
+ {
+
+ a_0 = _mm256_load_pd( &A0[0+bs*0] );
+ a_1 = _mm256_load_pd( &A1[0+bs*0] );
+ a_1 = _mm256_permute2f128_pd( a_0, a_1, 0x21 );
+ b_0 = _mm256_shuffle_pd( a_0, a_1, 0x5 );
+ b_0 = _mm256_mul_pd( alpha_0, b_0 );
+ _mm256_store_pd( &B[0+bs*0], b_0 );
+
+ A0 += 4;
+ A1 += 4;
+ B += 4;
+
+ }
+
+ if(tri==1)
+ {
+ // 3x3 triangle
+
+ c_0 = _mm_load_pd( &A0[2+bs*0] );
+ c_0 = _mm_mul_pd( _mm256_castpd256_pd128( alpha_0 ), c_0 );
+ _mm_storeu_pd( &B[1+bs*0], c_0 );
+ c_0 = _mm_load_sd( &A1[0+bs*0] );
+ c_0 = _mm_mul_pd( _mm256_castpd256_pd128( alpha_0 ), c_0 );
+ _mm_store_sd( &B[3+bs*0], c_0 );
+
+ c_0 = _mm_load_sd( &A0[3+bs*1] );
+ c_0 = _mm_mul_pd( _mm256_castpd256_pd128( alpha_0 ), c_0 );
+ _mm_store_sd( &B[2+bs*1], c_0 );
+ c_0 = _mm_load_sd( &A1[0+bs*1] );
+ c_0 = _mm_mul_pd( _mm256_castpd256_pd128( alpha_0 ), c_0 );
+ _mm_store_sd( &B[3+bs*1], c_0 );
+
+ c_0 = _mm_load_sd( &A1[0+bs*2] );
+ c_0 = _mm_mul_pd( _mm256_castpd256_pd128( alpha_0 ), c_0 );
+ _mm_store_sd( &B[3+bs*2], c_0 );
+
+ }
+
+ }
+
+
+
+// both A and B are aligned to 256-bit boundaries, 2 elements of A must be skipped
+void kernel_dgecp_4_2_lib4(int tri, int kmax, double alpha, double *A0, int sda, double *B)
+ {
+
+ if(tri==1)
+ {
+ // A and C are lower triangular
+ // kmax+1 4-wide + end 3x3 triangle
+
+ kmax += 1;
+ }
+
+ if(kmax<=0)
+ return;
+
+ const int bs = 4;
+
+ double *A1 = A0 + bs*sda;
+
+ __m256d
+ alpha_0,
+ a_0, a_1,
+ b_0;
+
+ __m128d
+ c_0;
+
+ int k;
+
+ alpha_0 = _mm256_broadcast_sd( &alpha );
+
+ for(k=0; k<kmax-3; k+=4)
+ {
+
+ a_0 = _mm256_load_pd( &A0[0+bs*0] );
+ a_1 = _mm256_load_pd( &A1[0+bs*0] );
+ b_0 = _mm256_permute2f128_pd( a_0, a_1, 0x21 );
+ b_0 = _mm256_mul_pd( alpha_0, b_0 );
+ _mm256_store_pd( &B[0+bs*0], b_0 );
+
+ a_0 = _mm256_load_pd( &A0[0+bs*1] );
+ a_1 = _mm256_load_pd( &A1[0+bs*1] );
+ b_0 = _mm256_permute2f128_pd( a_0, a_1, 0x21 );
+ b_0 = _mm256_mul_pd( alpha_0, b_0 );
+ _mm256_store_pd( &B[0+bs*1], b_0 );
+
+ a_0 = _mm256_load_pd( &A0[0+bs*2] );
+ a_1 = _mm256_load_pd( &A1[0+bs*2] );
+ b_0 = _mm256_permute2f128_pd( a_0, a_1, 0x21 );
+ b_0 = _mm256_mul_pd( alpha_0, b_0 );
+ _mm256_store_pd( &B[0+bs*2], b_0 );
+
+ a_0 = _mm256_load_pd( &A0[0+bs*3] );
+ a_1 = _mm256_load_pd( &A1[0+bs*3] );
+ b_0 = _mm256_permute2f128_pd( a_0, a_1, 0x21 );
+ b_0 = _mm256_mul_pd( alpha_0, b_0 );
+ _mm256_store_pd( &B[0+bs*3], b_0 );
+
+ A0 += 16;
+ A1 += 16;
+ B += 16;
+
+ }
+ for(; k<kmax; k++)
+ {
+
+ a_0 = _mm256_load_pd( &A0[0+bs*0] );
+ a_1 = _mm256_load_pd( &A1[0+bs*0] );
+ b_0 = _mm256_permute2f128_pd( a_0, a_1, 0x21 );
+ b_0 = _mm256_mul_pd( alpha_0, b_0 );
+ _mm256_store_pd( &B[0+bs*0], b_0 );
+
+ A0 += 4;
+ A1 += 4;
+ B += 4;
+
+ }
+
+ if(tri==1)
+ {
+ // 3x3 triangle
+
+ c_0 = _mm_load_sd( &A0[3+bs*0] );
+ c_0 = _mm_mul_pd( _mm256_castpd256_pd128( alpha_0 ), c_0 );
+ _mm_store_sd( &B[1+bs*0], c_0 );
+ c_0 = _mm_load_pd( &A1[0+bs*0] );
+ c_0 = _mm_mul_pd( _mm256_castpd256_pd128( alpha_0 ), c_0 );
+ _mm_store_pd( &B[2+bs*0], c_0 );
+
+ c_0 = _mm_load_pd( &A1[0+bs*1] );
+ c_0 = _mm_mul_pd( _mm256_castpd256_pd128( alpha_0 ), c_0 );
+ _mm_store_pd( &B[2+bs*1], c_0 );
+
+ c_0 = _mm_load_sd( &A1[1+bs*2] );
+ c_0 = _mm_mul_pd( _mm256_castpd256_pd128( alpha_0 ), c_0 );
+ _mm_store_sd( &B[3+bs*2], c_0 );
+
+ }
+
+ }
+
+
+
+// both A and B are aligned to 256-bit boundaries, 3 elements of A must be skipped
+void kernel_dgecp_4_3_lib4(int tri, int kmax, double alpha, double *A0, int sda, double *B)
+ {
+
+ if(tri==1)
+ {
+ // A and C are lower triangular
+ // kmax+1 4-wide + end 3x3 triangle
+
+ kmax += 1;
+ }
+
+ if(kmax<=0)
+ return;
+
+ const int bs = 4;
+
+ double *A1 = A0 + bs*sda;
+
+ __m256d
+ alpha_0,
+ a_0, a_1,
+ b_0;
+
+ __m128d
+ c_0;
+
+ int k;
+
+ alpha_0 = _mm256_broadcast_sd( &alpha );
+
+ for(k=0; k<kmax-3; k+=4)
+ {
+
+ a_0 = _mm256_load_pd( &A0[0+bs*0] );
+ a_1 = _mm256_load_pd( &A1[0+bs*0] );
+ a_0 = _mm256_permute2f128_pd( a_0, a_1, 0x21 );
+ b_0 = _mm256_shuffle_pd( a_0, a_1, 0x5 );
+ b_0 = _mm256_mul_pd( alpha_0, b_0 );
+ _mm256_store_pd( &B[0+bs*0], b_0 );
+
+ a_0 = _mm256_load_pd( &A0[0+bs*1] );
+ a_1 = _mm256_load_pd( &A1[0+bs*1] );
+ a_0 = _mm256_permute2f128_pd( a_0, a_1, 0x21 );
+ b_0 = _mm256_shuffle_pd( a_0, a_1, 0x5 );
+ b_0 = _mm256_mul_pd( alpha_0, b_0 );
+ _mm256_store_pd( &B[0+bs*1], b_0 );
+
+ a_0 = _mm256_load_pd( &A0[0+bs*2] );
+ a_1 = _mm256_load_pd( &A1[0+bs*2] );
+ a_0 = _mm256_permute2f128_pd( a_0, a_1, 0x21 );
+ b_0 = _mm256_shuffle_pd( a_0, a_1, 0x5 );
+ b_0 = _mm256_mul_pd( alpha_0, b_0 );
+ _mm256_store_pd( &B[0+bs*2], b_0 );
+
+ a_0 = _mm256_load_pd( &A0[0+bs*3] );
+ a_1 = _mm256_load_pd( &A1[0+bs*3] );
+ a_0 = _mm256_permute2f128_pd( a_0, a_1, 0x21 );
+ b_0 = _mm256_shuffle_pd( a_0, a_1, 0x5 );
+ b_0 = _mm256_mul_pd( alpha_0, b_0 );
+ _mm256_store_pd( &B[0+bs*3], b_0 );
+
+ A0 += 16;
+ A1 += 16;
+ B += 16;
+
+ }
+ for(; k<kmax; k++)
+ {
+
+ a_0 = _mm256_load_pd( &A0[0+bs*0] );
+ a_1 = _mm256_load_pd( &A1[0+bs*0] );
+ a_0 = _mm256_permute2f128_pd( a_0, a_1, 0x21 );
+ b_0 = _mm256_shuffle_pd( a_0, a_1, 0x5 );
+ b_0 = _mm256_mul_pd( alpha_0, b_0 );
+ _mm256_store_pd( &B[0+bs*0], b_0 );
+
+ A0 += 4;
+ A1 += 4;
+ B += 4;
+
+ }
+
+ if(tri==1)
+ {
+ // 3x3 triangle
+
+ c_0 = _mm_load_pd( &A1[0+bs*0] );
+ c_0 = _mm_mul_pd( _mm256_castpd256_pd128( alpha_0 ), c_0 );
+ _mm_storeu_pd( &B[1+bs*0], c_0 );
+ c_0 = _mm_load_sd( &A1[2+bs*0] );
+ c_0 = _mm_mul_pd( _mm256_castpd256_pd128( alpha_0 ), c_0 );
+ _mm_store_sd( &B[3+bs*0], c_0 );
+
+ c_0 = _mm_load_sd( &A1[1+bs*1] );
+ c_0 = _mm_mul_pd( _mm256_castpd256_pd128( alpha_0 ), c_0 );
+ _mm_store_sd( &B[2+bs*1], c_0 );
+ c_0 = _mm_load_sd( &A1[2+bs*1] );
+ c_0 = _mm_mul_pd( _mm256_castpd256_pd128( alpha_0 ), c_0 );
+ _mm_store_sd( &B[3+bs*1], c_0 );
+
+ c_0 = _mm_load_sd( &A1[2+bs*2] );
+ c_0 = _mm_mul_pd( _mm256_castpd256_pd128( alpha_0 ), c_0 );
+ _mm_store_sd( &B[3+bs*2], c_0 );
+ }
+
+
+ }
+
+
+
+// both A and B are aligned to 64-bit boundaries
+void kernel_dgecp_3_0_lib4(int tri, int kmax, double alpha, double *A, double *B)
+ {
+
+ if(tri==1)
+ {
+ // A and C are lower triangular
+ // kmax+1 3-wide + end 2x2 triangle
+
+ kmax += 1;
+ }
+
+ if(kmax<=0)
+ return;
+
+ const int bs = 4;
+
+ __m128d
+ alpha_0,
+ a_0, a_1;
+
+ int k;
+
+ alpha_0 = _mm_loaddup_pd( &alpha );
+
+ for(k=0; k<kmax-3; k+=4)
+ {
+
+ a_0 = _mm_loadu_pd( &A[0+bs*0] );
+ a_1 = _mm_load_sd( &A[2+bs*0] );
+ a_0 = _mm_mul_pd( alpha_0, a_0 );
+ a_1 = _mm_mul_pd( alpha_0, a_1 );
+ _mm_storeu_pd( &B[0+bs*0], a_0 );
+ _mm_store_sd( &B[2+bs*0], a_1 );
+
+ a_0 = _mm_loadu_pd( &A[0+bs*1] );
+ a_1 = _mm_load_sd( &A[2+bs*1] );
+ a_0 = _mm_mul_pd( alpha_0, a_0 );
+ a_1 = _mm_mul_pd( alpha_0, a_1 );
+ _mm_storeu_pd( &B[0+bs*1], a_0 );
+ _mm_store_sd( &B[2+bs*1], a_1 );
+
+ a_0 = _mm_loadu_pd( &A[0+bs*2] );
+ a_1 = _mm_load_sd( &A[2+bs*2] );
+ a_0 = _mm_mul_pd( alpha_0, a_0 );
+ a_1 = _mm_mul_pd( alpha_0, a_1 );
+ _mm_storeu_pd( &B[0+bs*2], a_0 );
+ _mm_store_sd( &B[2+bs*2], a_1 );
+
+ a_0 = _mm_loadu_pd( &A[0+bs*3] );
+ a_1 = _mm_load_sd( &A[2+bs*3] );
+ a_0 = _mm_mul_pd( alpha_0, a_0 );
+ a_1 = _mm_mul_pd( alpha_0, a_1 );
+ _mm_storeu_pd( &B[0+bs*3], a_0 );
+ _mm_store_sd( &B[2+bs*3], a_1 );
+
+ A += 16;
+ B += 16;
+
+ }
+ for(; k<kmax; k++)
+ {
+
+ a_0 = _mm_loadu_pd( &A[0+bs*0] );
+ a_1 = _mm_load_sd( &A[2+bs*0] );
+ a_0 = _mm_mul_pd( alpha_0, a_0 );
+ a_1 = _mm_mul_pd( alpha_0, a_1 );
+ _mm_storeu_pd( &B[0+bs*0], a_0 );
+ _mm_store_sd( &B[2+bs*0], a_1 );
+
+ A += 4;
+ B += 4;
+
+ }
+
+ if(tri==1)
+ {
+ // 2x2 triangle
+
+ a_0 = _mm_loadu_pd( &A[1+bs*0] );
+ a_0 = _mm_mul_pd( alpha_0, a_0 );
+ _mm_storeu_pd( &B[1+bs*0], a_0 );
+
+ a_0 = _mm_load_sd( &A[2+bs*1] );
+ a_0 = _mm_mul_pd( alpha_0, a_0 );
+ _mm_store_sd( &B[2+bs*1], a_0 );
+
+ }
+
+ }
+
+
+
+// both A and B are aligned to 256-bit boundaries, 2 elements of A must be skipped
+void kernel_dgecp_3_2_lib4(int tri, int kmax, double alpha, double *A0, int sda, double *B)
+ {
+
+ if(tri==1)
+ {
+ // A and C are lower triangular
+ // kmax+1 3-wide + end 2x2 triangle
+
+ kmax += 1;
+ }
+
+ if(kmax<=0)
+ return;
+
+ const int bs = 4;
+
+ double *A1 = A0 + bs*sda;
+
+ __m128d
+ alpha_0,
+ a_0, a_1;
+
+ int k;
+
+ alpha_0 = _mm_loaddup_pd( &alpha );
+
+ for(k=0; k<kmax-3; k+=4)
+ {
+
+ a_0 = _mm_loadu_pd( &A0[2+bs*0] );
+ a_0 = _mm_mul_pd( alpha_0, a_0 );
+ _mm_storeu_pd( &B[0+bs*0], a_0 );
+ a_1 = _mm_load_sd( &A1[0+bs*0] );
+ a_1 = _mm_mul_pd( alpha_0, a_1 );
+ _mm_store_sd( &B[2+bs*0], a_1 );
+
+ a_0 = _mm_loadu_pd( &A0[2+bs*1] );
+ a_0 = _mm_mul_pd( alpha_0, a_0 );
+ _mm_storeu_pd( &B[0+bs*1], a_0 );
+ a_1 = _mm_load_sd( &A1[0+bs*1] );
+ a_1 = _mm_mul_pd( alpha_0, a_1 );
+ _mm_store_sd( &B[2+bs*1], a_1 );
+
+ a_0 = _mm_loadu_pd( &A0[2+bs*2] );
+ a_0 = _mm_mul_pd( alpha_0, a_0 );
+ _mm_storeu_pd( &B[0+bs*2], a_0 );
+ a_1 = _mm_load_sd( &A1[0+bs*2] );
+ a_1 = _mm_mul_pd( alpha_0, a_1 );
+ _mm_store_sd( &B[2+bs*2], a_1 );
+
+ a_0 = _mm_loadu_pd( &A0[2+bs*3] );
+ a_0 = _mm_mul_pd( alpha_0, a_0 );
+ _mm_storeu_pd( &B[0+bs*3], a_0 );
+ a_1 = _mm_load_sd( &A1[0+bs*3] );
+ a_1 = _mm_mul_pd( alpha_0, a_1 );
+ _mm_store_sd( &B[2+bs*3], a_1 );
+
+ A0 += 16;
+ A1 += 16;
+ B += 16;
+
+ }
+ for(; k<kmax; k++)
+ {
+
+ a_0 = _mm_loadu_pd( &A0[2+bs*0] );
+ a_0 = _mm_mul_pd( alpha_0, a_0 );
+ _mm_storeu_pd( &B[0+bs*0], a_0 );
+ a_1 = _mm_load_sd( &A1[0+bs*0] );
+ a_1 = _mm_mul_pd( alpha_0, a_1 );
+ _mm_store_sd( &B[2+bs*0], a_1 );
+
+ A0 += 4;
+ A1 += 4;
+ B += 4;
+
+ }
+
+ if(tri==1)
+ {
+ // 2x2 triangle
+
+ a_0 = _mm_load_sd( &A0[3+bs*0] );
+ a_0 = _mm_mul_pd( alpha_0, a_0 );
+ _mm_store_sd( &B[1+bs*0], a_0 );
+ a_0 = _mm_load_sd( &A1[0+bs*0] );
+ a_0 = _mm_mul_pd( alpha_0, a_0 );
+ _mm_store_sd( &B[2+bs*0], a_0 );
+
+ a_0 = _mm_load_sd( &A1[0+bs*1] );
+ a_0 = _mm_mul_pd( alpha_0, a_0 );
+ _mm_store_sd( &B[2+bs*1], a_0 );
+
+ }
+
+
+ }
+
+
+
+// both A and B are aligned to 256-bit boundaries, 3 elements of A must be skipped
+void kernel_dgecp_3_3_lib4(int tri, int kmax, double alpha, double *A0, int sda, double *B)
+ {
+
+ if(tri==1)
+ {
+ // A and C are lower triangular
+ // kmax+1 3-wide + end 2x2 triangle
+
+ kmax += 1;
+ }
+
+ if(kmax<=0)
+ return;
+
+ const int bs = 4;
+
+ double *A1 = A0 + bs*sda;
+
+ __m128d
+ alpha_0,
+ a_0, a_1;
+
+ int k;
+
+ alpha_0 = _mm_loaddup_pd( &alpha );
+
+ for(k=0; k<kmax-3; k+=4)
+ {
+
+ a_0 = _mm_load_sd( &A0[3+bs*0] );
+ a_0 = _mm_mul_pd( alpha_0, a_0 );
+ _mm_store_sd( &B[0+bs*0], a_0 );
+ a_1 = _mm_loadu_pd( &A1[0+bs*0] );
+ a_1 = _mm_mul_pd( alpha_0, a_1 );
+ _mm_storeu_pd( &B[1+bs*0], a_1 );
+
+ a_0 = _mm_load_sd( &A0[3+bs*1] );
+ a_0 = _mm_mul_pd( alpha_0, a_0 );
+ _mm_store_sd( &B[0+bs*1], a_0 );
+ a_1 = _mm_loadu_pd( &A1[0+bs*1] );
+ a_1 = _mm_mul_pd( alpha_0, a_1 );
+ _mm_storeu_pd( &B[1+bs*1], a_1 );
+
+ a_0 = _mm_load_sd( &A0[3+bs*2] );
+ a_0 = _mm_mul_pd( alpha_0, a_0 );
+ _mm_store_sd( &B[0+bs*2], a_0 );
+ a_1 = _mm_loadu_pd( &A1[0+bs*2] );
+ a_1 = _mm_mul_pd( alpha_0, a_1 );
+ _mm_storeu_pd( &B[1+bs*2], a_1 );
+
+ a_0 = _mm_load_sd( &A0[3+bs*3] );
+ a_0 = _mm_mul_pd( alpha_0, a_0 );
+ _mm_store_sd( &B[0+bs*3], a_0 );
+ a_1 = _mm_loadu_pd( &A1[0+bs*3] );
+ a_1 = _mm_mul_pd( alpha_0, a_1 );
+ _mm_storeu_pd( &B[1+bs*3], a_1 );
+
+ A0 += 16;
+ A1 += 16;
+ B += 16;
+
+ }
+ for(; k<kmax; k++)
+ {
+
+ a_0 = _mm_load_sd( &A0[3+bs*0] );
+ a_0 = _mm_mul_pd( alpha_0, a_0 );
+ _mm_store_sd( &B[0+bs*0], a_0 );
+ a_1 = _mm_loadu_pd( &A1[0+bs*0] );
+ a_1 = _mm_mul_pd( alpha_0, a_1 );
+ _mm_storeu_pd( &B[1+bs*0], a_1 );
+
+ A0 += 4;
+ A1 += 4;
+ B += 4;
+
+ }
+
+ if(tri==1)
+ {
+ // 2x2 triangle
+
+ a_0 = _mm_loadu_pd( &A1[0+bs*0] );
+ a_0 = _mm_mul_pd( alpha_0, a_0 );
+ _mm_storeu_pd( &B[1+bs*0], a_0 );
+
+ a_0 = _mm_load_sd( &A1[1+bs*1] );
+ a_0 = _mm_mul_pd( alpha_0, a_0 );
+ _mm_store_sd( &B[2+bs*1], a_0 );
+
+ }
+
+ }
+
+
+
+// both A and B are aligned to 64-bit boundaries
+void kernel_dgecp_2_0_lib4(int tri, int kmax, double alpha, double *A, double *B)
+ {
+
+ if(tri==1)
+ {
+ // A and C are lower triangular
+ // kmax+1 2-wide + end 1x1 triangle
+
+ kmax += 1;
+ }
+
+ if(kmax<=0)
+ return;
+
+ const int bs = 4;
+
+ __m128d
+ alpha_0,
+ a_0;
+
+ int k;
+
+ alpha_0 = _mm_loaddup_pd( &alpha );
+
+ for(k=0; k<kmax-3; k+=4)
+ {
+
+ a_0 = _mm_loadu_pd( &A[0+bs*0] );
+ a_0 = _mm_mul_pd( alpha_0, a_0 );
+ _mm_storeu_pd( &B[0+bs*0], a_0 );
+
+ a_0 = _mm_loadu_pd( &A[0+bs*1] );
+ a_0 = _mm_mul_pd( alpha_0, a_0 );
+ _mm_storeu_pd( &B[0+bs*1], a_0 );
+
+ a_0 = _mm_loadu_pd( &A[0+bs*2] );
+ a_0 = _mm_mul_pd( alpha_0, a_0 );
+ _mm_storeu_pd( &B[0+bs*2], a_0 );
+
+ a_0 = _mm_loadu_pd( &A[0+bs*3] );
+ a_0 = _mm_mul_pd( alpha_0, a_0 );
+ _mm_storeu_pd( &B[0+bs*3], a_0 );
+
+ A += 16;
+ B += 16;
+
+ }
+ for(; k<kmax; k++)
+ {
+
+ a_0 = _mm_loadu_pd( &A[0+bs*0] );
+ a_0 = _mm_mul_pd( alpha_0, a_0 );
+ _mm_storeu_pd( &B[0+bs*0], a_0 );
+
+ A += 4;
+ B += 4;
+
+ }
+
+ if(tri==1)
+ {
+ // 1x1 triangle
+
+ a_0 = _mm_load_sd( &A[1+bs*0] );
+ a_0 = _mm_mul_pd( alpha_0, a_0 );
+ _mm_store_sd( &B[1+bs*0], a_0 );
+
+ }
+
+ }
+
+
+
+// both A and B are aligned to 128-bit boundaries, 3 elements of A must be skipped
+void kernel_dgecp_2_3_lib4(int tri, int kmax, double alpha, double *A0, int sda, double *B)
+ {
+
+ if(tri==1)
+ {
+ // A and C are lower triangular
+ // kmax+1 2-wide + end 1x1 triangle
+
+ kmax += 1;
+ }
+
+ if(kmax<=0)
+ return;
+
+ const int bs = 4;
+
+ double *A1 = A0 + bs*sda;
+
+ __m128d
+ alpha_0,
+ a_0;
+
+ int k;
+
+ alpha_0 = _mm_loaddup_pd( &alpha );
+
+ for(k=0; k<kmax-3; k+=4)
+ {
+
+ a_0 = _mm_load_sd( &A0[3+bs*0] );
+ a_0 = _mm_loadh_pd( a_0, &A1[0+bs*0] );
+ a_0 = _mm_mul_pd( alpha_0, a_0 );
+ _mm_storeu_pd( &B[0+bs*0], a_0 );
+
+ a_0 = _mm_load_sd( &A0[3+bs*1] );
+ a_0 = _mm_loadh_pd( a_0, &A1[0+bs*1] );
+ a_0 = _mm_mul_pd( alpha_0, a_0 );
+ _mm_storeu_pd( &B[0+bs*1], a_0 );
+
+ a_0 = _mm_load_sd( &A0[3+bs*2] );
+ a_0 = _mm_loadh_pd( a_0, &A1[0+bs*2] );
+ a_0 = _mm_mul_pd( alpha_0, a_0 );
+ _mm_storeu_pd( &B[0+bs*2], a_0 );
+
+ a_0 = _mm_load_sd( &A0[3+bs*3] );
+ a_0 = _mm_loadh_pd( a_0, &A1[0+bs*3] );
+ a_0 = _mm_mul_pd( alpha_0, a_0 );
+ _mm_storeu_pd( &B[0+bs*3], a_0 );
+
+ A0 += 16;
+ A1 += 16;
+ B += 16;
+
+ }
+ for(; k<kmax; k++)
+ {
+
+ a_0 = _mm_load_sd( &A0[3+bs*0] );
+ a_0 = _mm_loadh_pd( a_0, &A1[0+bs*0] );
+ a_0 = _mm_mul_pd( alpha_0, a_0 );
+ _mm_storeu_pd( &B[0+bs*0], a_0 );
+
+ A0 += 4;
+ A1 += 4;
+ B += 4;
+
+ }
+
+ if(tri==1)
+ {
+ // 1x1 triangle
+
+ a_0 = _mm_load_sd( &A1[0+bs*0] );
+ a_0 = _mm_mul_pd( alpha_0, a_0 );
+ _mm_store_sd( &B[1+bs*0], a_0 );
+
+ }
+
+ }
+
+
+
+// both A and B are aligned to 64-bit boundaries
+void kernel_dgecp_1_0_lib4(int tri, int kmax, double alpha, double *A, double *B)
+ {
+
+ if(tri==1)
+ {
+ // A and C are lower triangular
+ // kmax+1 1-wide
+
+ kmax += 1;
+ }
+
+ if(kmax<=0)
+ return;
+
+ const int bs = 4;
+
+ __m128d
+ alpha_0,
+ a_0;
+
+ int k;
+
+ alpha_0 = _mm_loaddup_pd( &alpha );
+
+ for(k=0; k<kmax-3; k+=4)
+ {
+
+ a_0 = _mm_load_sd( &A[0+bs*0] );
+ a_0 = _mm_mul_pd( alpha_0, a_0 );
+ _mm_store_sd( &B[0+bs*0], a_0 );
+
+ a_0 = _mm_load_sd( &A[0+bs*1] );
+ a_0 = _mm_mul_pd( alpha_0, a_0 );
+ _mm_store_sd( &B[0+bs*1], a_0 );
+
+ a_0 = _mm_load_sd( &A[0+bs*2] );
+ a_0 = _mm_mul_pd( alpha_0, a_0 );
+ _mm_store_sd( &B[0+bs*2], a_0 );
+
+ a_0 = _mm_load_sd( &A[0+bs*3] );
+ a_0 = _mm_mul_pd( alpha_0, a_0 );
+ _mm_store_sd( &B[0+bs*3], a_0 );
+
+ A += 16;
+ B += 16;
+
+ }
+ for(; k<kmax; k++)
+ {
+
+ a_0 = _mm_load_sd( &A[0+bs*0] );
+ a_0 = _mm_mul_pd( alpha_0, a_0 );
+ _mm_store_sd( &B[0+bs*0], a_0 );
+
+ A += 4;
+ B += 4;
+
+ }
+
+ }
+
+
+
+
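+// The kernel_dgead_* routines below perform a scaled add, B += alpha*A, using
+// the same panel layouts and row-offset handling as the kernel_dgecp_* copy
+// routines above.
+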
+// both A and B are aligned to 256-bit boundaries
+void kernel_dgead_8_0_lib4(int kmax, double alpha, double *A0, int sda, double *B0, int sdb)
+ {
+
+ if(kmax<=0)
+ return;
+
+ const int bs = 4;
+
+ double *A1 = A0 + bs*sda;
+ double *B1 = B0 + bs*sdb;
+
+ __m256d
+ a_0, c_0, alpha_0;
+
+ int k;
+
+ alpha_0 = _mm256_broadcast_sd( &alpha );
+
+ for(k=0; k<kmax-3; k+=4)
+ {
+
+ a_0 = _mm256_load_pd( &A0[0+bs*0] );
+ c_0 = _mm256_load_pd( &B0[0+bs*0] );
+ a_0 = _mm256_mul_pd( alpha_0, a_0 );
+ a_0 = _mm256_add_pd( a_0, c_0 );
+ _mm256_store_pd( &B0[0+bs*0], a_0 );
+
+ a_0 = _mm256_load_pd( &A0[0+bs*1] );
+ c_0 = _mm256_load_pd( &B0[0+bs*1] );
+ a_0 = _mm256_mul_pd( alpha_0, a_0 );
+ a_0 = _mm256_add_pd( a_0, c_0 );
+ _mm256_store_pd( &B0[0+bs*1], a_0 );
+
+ a_0 = _mm256_load_pd( &A0[0+bs*2] );
+ c_0 = _mm256_load_pd( &B0[0+bs*2] );
+ a_0 = _mm256_mul_pd( alpha_0, a_0 );
+ a_0 = _mm256_add_pd( a_0, c_0 );
+ _mm256_store_pd( &B0[0+bs*2], a_0 );
+
+ a_0 = _mm256_load_pd( &A0[0+bs*3] );
+ c_0 = _mm256_load_pd( &B0[0+bs*3] );
+ a_0 = _mm256_mul_pd( alpha_0, a_0 );
+ a_0 = _mm256_add_pd( a_0, c_0 );
+ _mm256_store_pd( &B0[0+bs*3], a_0 );
+
+ A0 += 16;
+ B0 += 16;
+
+ a_0 = _mm256_load_pd( &A1[0+bs*0] );
+ c_0 = _mm256_load_pd( &B1[0+bs*0] );
+ a_0 = _mm256_mul_pd( alpha_0, a_0 );
+ a_0 = _mm256_add_pd( a_0, c_0 );
+ _mm256_store_pd( &B1[0+bs*0], a_0 );
+
+ a_0 = _mm256_load_pd( &A1[0+bs*1] );
+ c_0 = _mm256_load_pd( &B1[0+bs*1] );
+ a_0 = _mm256_mul_pd( alpha_0, a_0 );
+ a_0 = _mm256_add_pd( a_0, c_0 );
+ _mm256_store_pd( &B1[0+bs*1], a_0 );
+
+ a_0 = _mm256_load_pd( &A1[0+bs*2] );
+ c_0 = _mm256_load_pd( &B1[0+bs*2] );
+ a_0 = _mm256_mul_pd( alpha_0, a_0 );
+ a_0 = _mm256_add_pd( a_0, c_0 );
+ _mm256_store_pd( &B1[0+bs*2], a_0 );
+
+ a_0 = _mm256_load_pd( &A1[0+bs*3] );
+ c_0 = _mm256_load_pd( &B1[0+bs*3] );
+ a_0 = _mm256_mul_pd( alpha_0, a_0 );
+ a_0 = _mm256_add_pd( a_0, c_0 );
+ _mm256_store_pd( &B1[0+bs*3], a_0 );
+
+ A1 += 16;
+ B1 += 16;
+
+ }
+ for(; k<kmax; k++)
+ {
+
+ a_0 = _mm256_load_pd( &A0[0+bs*0] );
+ c_0 = _mm256_load_pd( &B0[0+bs*0] );
+ a_0 = _mm256_mul_pd( alpha_0, a_0 );
+ a_0 = _mm256_add_pd( a_0, c_0 );
+ _mm256_store_pd( &B0[0+bs*0], a_0 );
+
+ A0 += 4;
+ B0 += 4;
+
+ a_0 = _mm256_load_pd( &A1[0+bs*0] );
+ c_0 = _mm256_load_pd( &B1[0+bs*0] );
+ a_0 = _mm256_mul_pd( alpha_0, a_0 );
+ a_0 = _mm256_add_pd( a_0, c_0 );
+ _mm256_store_pd( &B1[0+bs*0], a_0 );
+
+ A1 += 4;
+ B1 += 4;
+
+ }
+
+ }
+
+
+
+// both A and B are aligned to 256-bit boundaries, 1 element of A must be skipped
+void kernel_dgead_8_1_lib4(int kmax, double alpha, double *A0, int sda, double *B0, int sdb)
+ {
+
+ if(kmax<=0)
+ return;
+
+ const int bs = 4;
+
+ double *A1 = A0 + bs*sda;
+ double *A2 = A1 + bs*sda;
+ double *B1 = B0 + bs*sdb;
+
+ __m256d
+ a_0, a_1, a_2,
+ b_0, b_1,
+ alpha_0, c_0, c_1;
+
+ int k;
+
+ alpha_0 = _mm256_broadcast_sd( &alpha );
+
+ for(k=0; k<kmax-3; k+=4)
+ {
+
+ a_2 = _mm256_load_pd( &A2[0+bs*0] );
+ a_1 = _mm256_load_pd( &A1[0+bs*0] );
+ a_0 = _mm256_load_pd( &A0[0+bs*0] );
+ a_2 = _mm256_permute2f128_pd( a_1, a_2, 0x21 );
+ b_0 = _mm256_permute2f128_pd( a_0, a_1, 0x21 );
+ b_1 = _mm256_shuffle_pd( a_1, a_2, 0x5 );
+ b_0 = _mm256_shuffle_pd( a_0, b_0, 0x5 );
+ c_1 = _mm256_load_pd( &B1[0+bs*0] );
+ c_0 = _mm256_load_pd( &B0[0+bs*0] );
+ b_1 = _mm256_mul_pd( alpha_0, b_1 );
+ b_0 = _mm256_mul_pd( alpha_0, b_0 );
+ b_1 = _mm256_add_pd ( c_1, b_1 );
+ b_0 = _mm256_add_pd ( c_0, b_0 );
+ _mm256_store_pd( &B1[0+bs*0], b_1 );
+ _mm256_store_pd( &B0[0+bs*0], b_0 );
+
+ a_2 = _mm256_load_pd( &A2[0+bs*1] );
+ a_1 = _mm256_load_pd( &A1[0+bs*1] );
+ a_0 = _mm256_load_pd( &A0[0+bs*1] );
+ a_2 = _mm256_permute2f128_pd( a_1, a_2, 0x21 );
+ b_0 = _mm256_permute2f128_pd( a_0, a_1, 0x21 );
+ b_1 = _mm256_shuffle_pd( a_1, a_2, 0x5 );
+ b_0 = _mm256_shuffle_pd( a_0, b_0, 0x5 );
+ c_1 = _mm256_load_pd( &B1[0+bs*1] );
+ c_0 = _mm256_load_pd( &B0[0+bs*1] );
+ b_1 = _mm256_mul_pd( alpha_0, b_1 );
+ b_0 = _mm256_mul_pd( alpha_0, b_0 );
+ b_1 = _mm256_add_pd ( c_1, b_1 );
+ b_0 = _mm256_add_pd ( c_0, b_0 );
+ _mm256_store_pd( &B1[0+bs*1], b_1 );
+ _mm256_store_pd( &B0[0+bs*1], b_0 );
+
+ a_2 = _mm256_load_pd( &A2[0+bs*2] );
+ a_1 = _mm256_load_pd( &A1[0+bs*2] );
+ a_0 = _mm256_load_pd( &A0[0+bs*2] );
+ a_2 = _mm256_permute2f128_pd( a_1, a_2, 0x21 );
+ b_0 = _mm256_permute2f128_pd( a_0, a_1, 0x21 );
+ b_1 = _mm256_shuffle_pd( a_1, a_2, 0x5 );
+ b_0 = _mm256_shuffle_pd( a_0, b_0, 0x5 );
+ c_1 = _mm256_load_pd( &B1[0+bs*2] );
+ c_0 = _mm256_load_pd( &B0[0+bs*2] );
+ b_1 = _mm256_mul_pd( alpha_0, b_1 );
+ b_0 = _mm256_mul_pd( alpha_0, b_0 );
+ b_1 = _mm256_add_pd ( c_1, b_1 );
+ b_0 = _mm256_add_pd ( c_0, b_0 );
+ _mm256_store_pd( &B1[0+bs*2], b_1 );
+ _mm256_store_pd( &B0[0+bs*2], b_0 );
+
+ a_2 = _mm256_load_pd( &A2[0+bs*3] );
+ a_1 = _mm256_load_pd( &A1[0+bs*3] );
+ a_0 = _mm256_load_pd( &A0[0+bs*3] );
+ a_2 = _mm256_permute2f128_pd( a_1, a_2, 0x21 );
+ b_0 = _mm256_permute2f128_pd( a_0, a_1, 0x21 );
+ b_1 = _mm256_shuffle_pd( a_1, a_2, 0x5 );
+ b_0 = _mm256_shuffle_pd( a_0, b_0, 0x5 );
+ c_1 = _mm256_load_pd( &B1[0+bs*3] );
+ c_0 = _mm256_load_pd( &B0[0+bs*3] );
+ b_1 = _mm256_mul_pd( alpha_0, b_1 );
+ b_0 = _mm256_mul_pd( alpha_0, b_0 );
+ b_1 = _mm256_add_pd ( c_1, b_1 );
+ b_0 = _mm256_add_pd ( c_0, b_0 );
+ _mm256_store_pd( &B1[0+bs*3], b_1 );
+ _mm256_store_pd( &B0[0+bs*3], b_0 );
+
+ A0 += 16;
+ A1 += 16;
+ A2 += 16;
+ B0 += 16;
+ B1 += 16;
+
+ }
+ for(; k<kmax; k++)
+ {
+
+ a_2 = _mm256_load_pd( &A2[0+bs*0] );
+ a_1 = _mm256_load_pd( &A1[0+bs*0] );
+ a_0 = _mm256_load_pd( &A0[0+bs*0] );
+ a_2 = _mm256_permute2f128_pd( a_1, a_2, 0x21 );
+ b_0 = _mm256_permute2f128_pd( a_0, a_1, 0x21 );
+ b_1 = _mm256_shuffle_pd( a_1, a_2, 0x5 );
+ b_0 = _mm256_shuffle_pd( a_0, b_0, 0x5 );
+ c_1 = _mm256_load_pd( &B1[0+bs*0] );
+ c_0 = _mm256_load_pd( &B0[0+bs*0] );
+ b_1 = _mm256_mul_pd( alpha_0, b_1 );
+ b_0 = _mm256_mul_pd( alpha_0, b_0 );
+ b_1 = _mm256_add_pd ( c_1, b_1 );
+ b_0 = _mm256_add_pd ( c_0, b_0 );
+ _mm256_store_pd( &B1[0+bs*0], b_1 );
+ _mm256_store_pd( &B0[0+bs*0], b_0 );
+
+ A0 += 4;
+ A1 += 4;
+ A2 += 4;
+ B0 += 4;
+ B1 += 4;
+
+ }
+
+ }
+
+
+
+// both A and B are aligned to 256-bit boundaries, 2 elements of A must be skipped
+void kernel_dgead_8_2_lib4(int kmax, double alpha, double *A0, int sda, double *B0, int sdb)
+ {
+
+ if(kmax<=0)
+ return;
+
+ const int bs = 4;
+
+ double *A1 = A0 + bs*sda;
+ double *A2 = A1 + bs*sda;
+ double *B1 = B0 + bs*sdb;
+
+ __m256d
+ a_0, a_1, a_2,
+ b_0, b_1,
+ alpha_0, c_0, c_1;
+
+ int k;
+
+ alpha_0 = _mm256_broadcast_sd( &alpha );
+
+ for(k=0; k<kmax-3; k+=4)
+ {
+
+ a_0 = _mm256_load_pd( &A0[0+bs*0] );
+ a_1 = _mm256_load_pd( &A1[0+bs*0] );
+ a_2 = _mm256_load_pd( &A2[0+bs*0] );
+ b_0 = _mm256_permute2f128_pd( a_0, a_1, 0x21 );
+ b_1 = _mm256_permute2f128_pd( a_1, a_2, 0x21 );
+ c_0 = _mm256_load_pd( &B0[0+bs*0] );
+ c_1 = _mm256_load_pd( &B1[0+bs*0] );
+ b_0 = _mm256_mul_pd( alpha_0, b_0 );
+ b_1 = _mm256_mul_pd( alpha_0, b_1 );
+ b_0 = _mm256_add_pd ( c_0, b_0 );
+ b_1 = _mm256_add_pd ( c_1, b_1 );
+ _mm256_store_pd( &B0[0+bs*0], b_0 );
+ _mm256_store_pd( &B1[0+bs*0], b_1 );
+
+ a_0 = _mm256_load_pd( &A0[0+bs*1] );
+ a_1 = _mm256_load_pd( &A1[0+bs*1] );
+ a_2 = _mm256_load_pd( &A2[0+bs*1] );
+ b_0 = _mm256_permute2f128_pd( a_0, a_1, 0x21 );
+ b_1 = _mm256_permute2f128_pd( a_1, a_2, 0x21 );
+ c_0 = _mm256_load_pd( &B0[0+bs*1] );
+ c_1 = _mm256_load_pd( &B1[0+bs*1] );
+ b_0 = _mm256_mul_pd( alpha_0, b_0 );
+ b_1 = _mm256_mul_pd( alpha_0, b_1 );
+ b_0 = _mm256_add_pd ( c_0, b_0 );
+ b_1 = _mm256_add_pd ( c_1, b_1 );
+ _mm256_store_pd( &B0[0+bs*1], b_0 );
+ _mm256_store_pd( &B1[0+bs*1], b_1 );
+
+ a_0 = _mm256_load_pd( &A0[0+bs*2] );
+ a_1 = _mm256_load_pd( &A1[0+bs*2] );
+ a_2 = _mm256_load_pd( &A2[0+bs*2] );
+ b_0 = _mm256_permute2f128_pd( a_0, a_1, 0x21 );
+ b_1 = _mm256_permute2f128_pd( a_1, a_2, 0x21 );
+ c_0 = _mm256_load_pd( &B0[0+bs*2] );
+ c_1 = _mm256_load_pd( &B1[0+bs*2] );
+ b_0 = _mm256_mul_pd( alpha_0, b_0 );
+ b_1 = _mm256_mul_pd( alpha_0, b_1 );
+ b_0 = _mm256_add_pd ( c_0, b_0 );
+ b_1 = _mm256_add_pd ( c_1, b_1 );
+ _mm256_store_pd( &B0[0+bs*2], b_0 );
+ _mm256_store_pd( &B1[0+bs*2], b_1 );
+
+ a_0 = _mm256_load_pd( &A0[0+bs*3] );
+ a_1 = _mm256_load_pd( &A1[0+bs*3] );
+ a_2 = _mm256_load_pd( &A2[0+bs*3] );
+ b_0 = _mm256_permute2f128_pd( a_0, a_1, 0x21 );
+ b_1 = _mm256_permute2f128_pd( a_1, a_2, 0x21 );
+ c_0 = _mm256_load_pd( &B0[0+bs*3] );
+ c_1 = _mm256_load_pd( &B1[0+bs*3] );
+ b_0 = _mm256_mul_pd( alpha_0, b_0 );
+ b_1 = _mm256_mul_pd( alpha_0, b_1 );
+ b_0 = _mm256_add_pd ( c_0, b_0 );
+ b_1 = _mm256_add_pd ( c_1, b_1 );
+ _mm256_store_pd( &B0[0+bs*3], b_0 );
+ _mm256_store_pd( &B1[0+bs*3], b_1 );
+
+ A0 += 16;
+ A1 += 16;
+ A2 += 16;
+ B0 += 16;
+ B1 += 16;
+
+ }
+ for(; k<kmax; k++)
+ {
+
+ a_0 = _mm256_load_pd( &A0[0+bs*0] );
+ a_1 = _mm256_load_pd( &A1[0+bs*0] );
+ a_2 = _mm256_load_pd( &A2[0+bs*0] );
+ b_0 = _mm256_permute2f128_pd( a_0, a_1, 0x21 );
+ b_1 = _mm256_permute2f128_pd( a_1, a_2, 0x21 );
+ c_0 = _mm256_load_pd( &B0[0+bs*0] );
+ c_1 = _mm256_load_pd( &B1[0+bs*0] );
+ b_0 = _mm256_mul_pd( alpha_0, b_0 );
+ b_1 = _mm256_mul_pd( alpha_0, b_1 );
+ b_0 = _mm256_add_pd ( c_0, b_0 );
+ b_1 = _mm256_add_pd ( c_1, b_1 );
+ _mm256_store_pd( &B0[0+bs*0], b_0 );
+ _mm256_store_pd( &B1[0+bs*0], b_1 );
+
+ A0 += 4;
+ A1 += 4;
+ A2 += 4;
+ B0 += 4;
+ B1 += 4;
+
+ }
+
+ }
+
+
+
+// both A and B are aligned to 256-bit boundaries, 3 elements of A must be skipped
+void kernel_dgead_8_3_lib4(int kmax, double alpha, double *A0, int sda, double *B0, int sdb)
+ {
+
+ if(kmax<=0)
+ return;
+
+ const int bs = 4;
+
+ double *A1 = A0 + bs*sda;
+ double *A2 = A1 + bs*sda;
+ double *B1 = B0 + bs*sdb;
+
+ __m256d
+ a_0, a_1, a_2,
+ b_0, b_1,
+ alpha_0, c_0, c_1;
+
+ int k;
+
+ alpha_0 = _mm256_broadcast_sd( &alpha );
+
+ for(k=0; k<kmax-3; k+=4)
+ {
+
+ a_0 = _mm256_load_pd( &A0[0+bs*0] );
+ a_1 = _mm256_load_pd( &A1[0+bs*0] );
+ a_2 = _mm256_load_pd( &A2[0+bs*0] );
+ a_0 = _mm256_permute2f128_pd( a_0, a_1, 0x21 );
+ b_1 = _mm256_permute2f128_pd( a_1, a_2, 0x21 );
+ b_0 = _mm256_shuffle_pd( a_0, a_1, 0x5 );
+ b_1 = _mm256_shuffle_pd( b_1, a_2, 0x5 );
+ c_0 = _mm256_load_pd( &B0[0+bs*0] );
+ c_1 = _mm256_load_pd( &B1[0+bs*0] );
+ b_0 = _mm256_mul_pd( alpha_0, b_0 );
+ b_1 = _mm256_mul_pd( alpha_0, b_1 );
+ b_0 = _mm256_add_pd ( c_0, b_0 );
+ b_1 = _mm256_add_pd ( c_1, b_1 );
+ _mm256_store_pd( &B0[0+bs*0], b_0 );
+ _mm256_store_pd( &B1[0+bs*0], b_1 );
+
+ a_0 = _mm256_load_pd( &A0[0+bs*1] );
+ a_1 = _mm256_load_pd( &A1[0+bs*1] );
+ a_2 = _mm256_load_pd( &A2[0+bs*1] );
+ a_0 = _mm256_permute2f128_pd( a_0, a_1, 0x21 );
+ b_1 = _mm256_permute2f128_pd( a_1, a_2, 0x21 );
+ b_0 = _mm256_shuffle_pd( a_0, a_1, 0x5 );
+ b_1 = _mm256_shuffle_pd( b_1, a_2, 0x5 );
+ c_0 = _mm256_load_pd( &B0[0+bs*1] );
+ c_1 = _mm256_load_pd( &B1[0+bs*1] );
+ b_0 = _mm256_mul_pd( alpha_0, b_0 );
+ b_1 = _mm256_mul_pd( alpha_0, b_1 );
+ b_0 = _mm256_add_pd ( c_0, b_0 );
+ b_1 = _mm256_add_pd ( c_1, b_1 );
+ _mm256_store_pd( &B0[0+bs*1], b_0 );
+ _mm256_store_pd( &B1[0+bs*1], b_1 );
+
+ a_0 = _mm256_load_pd( &A0[0+bs*2] );
+ a_1 = _mm256_load_pd( &A1[0+bs*2] );
+ a_2 = _mm256_load_pd( &A2[0+bs*2] );
+ a_0 = _mm256_permute2f128_pd( a_0, a_1, 0x21 );
+ b_1 = _mm256_permute2f128_pd( a_1, a_2, 0x21 );
+ b_0 = _mm256_shuffle_pd( a_0, a_1, 0x5 );
+ b_1 = _mm256_shuffle_pd( b_1, a_2, 0x5 );
+ c_0 = _mm256_load_pd( &B0[0+bs*2] );
+ c_1 = _mm256_load_pd( &B1[0+bs*2] );
+ b_0 = _mm256_mul_pd( alpha_0, b_0 );
+ b_1 = _mm256_mul_pd( alpha_0, b_1 );
+ b_0 = _mm256_add_pd ( c_0, b_0 );
+ b_1 = _mm256_add_pd ( c_1, b_1 );
+ _mm256_store_pd( &B0[0+bs*2], b_0 );
+ _mm256_store_pd( &B1[0+bs*2], b_1 );
+
+ a_0 = _mm256_load_pd( &A0[0+bs*3] );
+ a_1 = _mm256_load_pd( &A1[0+bs*3] );
+ a_2 = _mm256_load_pd( &A2[0+bs*3] );
+ a_0 = _mm256_permute2f128_pd( a_0, a_1, 0x21 );
+ b_1 = _mm256_permute2f128_pd( a_1, a_2, 0x21 );
+ b_0 = _mm256_shuffle_pd( a_0, a_1, 0x5 );
+ b_1 = _mm256_shuffle_pd( b_1, a_2, 0x5 );
+ c_0 = _mm256_load_pd( &B0[0+bs*3] );
+ c_1 = _mm256_load_pd( &B1[0+bs*3] );
+ b_0 = _mm256_mul_pd( alpha_0, b_0 );
+ b_1 = _mm256_mul_pd( alpha_0, b_1 );
+ b_0 = _mm256_add_pd ( c_0, b_0 );
+ b_1 = _mm256_add_pd ( c_1, b_1 );
+ _mm256_store_pd( &B0[0+bs*3], b_0 );
+ _mm256_store_pd( &B1[0+bs*3], b_1 );
+
+ A0 += 16;
+ A1 += 16;
+ A2 += 16;
+ B0 += 16;
+ B1 += 16;
+
+ }
+ for(; k<kmax; k++)
+ {
+
+ a_0 = _mm256_load_pd( &A0[0+bs*0] );
+ a_1 = _mm256_load_pd( &A1[0+bs*0] );
+ a_2 = _mm256_load_pd( &A2[0+bs*0] );
+ a_0 = _mm256_permute2f128_pd( a_0, a_1, 0x21 );
+ b_1 = _mm256_permute2f128_pd( a_1, a_2, 0x21 );
+ b_0 = _mm256_shuffle_pd( a_0, a_1, 0x5 );
+ b_1 = _mm256_shuffle_pd( b_1, a_2, 0x5 );
+ c_0 = _mm256_load_pd( &B0[0+bs*0] );
+ c_1 = _mm256_load_pd( &B1[0+bs*0] );
+ b_0 = _mm256_mul_pd( alpha_0, b_0 );
+ b_1 = _mm256_mul_pd( alpha_0, b_1 );
+ b_0 = _mm256_add_pd ( c_0, b_0 );
+ b_1 = _mm256_add_pd ( c_1, b_1 );
+ _mm256_store_pd( &B0[0+bs*0], b_0 );
+ _mm256_store_pd( &B1[0+bs*0], b_1 );
+
+ A0 += 4;
+ A1 += 4;
+ A2 += 4;
+ B0 += 4;
+ B1 += 4;
+
+ }
+
+ }
+
+
+
+// both A and B are aligned to 256-bit boundaries
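+// note: aligned case of the scaled add B <- B + alpha*A on one 4-row panel strip; a scalar
+// sketch of the same update (with bs=4, column-major within the panel) would be
+//   for(k=0; k<kmax; k++)
+//     for(i=0; i<4; i++)
+//       B[i+4*k] += alpha * A[i+4*k];
+// the misaligned variants below perform the same update after realigning A in registers.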
+void kernel_dgead_4_0_lib4(int kmax, double alpha, double *A, double *B)
+ {
+
+ if(kmax<=0)
+ return;
+
+ const int bs = 4;
+
+ __m256d
+ a_0, c_0, alpha_0;
+
+ int k;
+
+ alpha_0 = _mm256_broadcast_sd( &alpha );
+
+ for(k=0; k<kmax-3; k+=4)
+ {
+
+ a_0 = _mm256_load_pd( &A[0+bs*0] );
+ c_0 = _mm256_load_pd( &B[0+bs*0] );
+ a_0 = _mm256_mul_pd( alpha_0, a_0 );
+ a_0 = _mm256_add_pd( c_0, a_0 );
+ _mm256_store_pd( &B[0+bs*0], a_0 );
+
+ a_0 = _mm256_load_pd( &A[0+bs*1] );
+ c_0 = _mm256_load_pd( &B[0+bs*1] );
+ a_0 = _mm256_mul_pd( alpha_0, a_0 );
+ a_0 = _mm256_add_pd( c_0, a_0 );
+ _mm256_store_pd( &B[0+bs*1], a_0 );
+
+ a_0 = _mm256_load_pd( &A[0+bs*2] );
+ c_0 = _mm256_load_pd( &B[0+bs*2] );
+ a_0 = _mm256_mul_pd( alpha_0, a_0 );
+ a_0 = _mm256_add_pd( c_0, a_0 );
+ _mm256_store_pd( &B[0+bs*2], a_0 );
+
+ a_0 = _mm256_load_pd( &A[0+bs*3] );
+ c_0 = _mm256_load_pd( &B[0+bs*3] );
+ a_0 = _mm256_mul_pd( alpha_0, a_0 );
+ a_0 = _mm256_add_pd( c_0, a_0 );
+ _mm256_store_pd( &B[0+bs*3], a_0 );
+
+ A += 16;
+ B += 16;
+
+ }
+ for(; k<kmax; k++)
+ {
+
+ a_0 = _mm256_load_pd( &A[0+bs*0] );
+ c_0 = _mm256_load_pd( &B[0+bs*0] );
+ a_0 = _mm256_mul_pd( alpha_0, a_0 );
+ a_0 = _mm256_add_pd( c_0, a_0 );
+ _mm256_store_pd( &B[0+bs*0], a_0 );
+
+ A += 4;
+ B += 4;
+
+ }
+
+ }
+
+
+
+// both A and B are aligned to 256-bit boundaries, 1 element of A must be skipped
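+// note: as kernel_dgead_4_0_lib4, but the 4-row strip of A starts 1 row into panel A0; per
+// column the permute2f128(...,0x21) builds [A0r2 A0r3 A1r0 A1r1] and the shuffle_pd(...,0x5)
+// then yields [A0r1 A0r2 A0r3 A1r0], i.e. panel rows 1..4 of A gathered into one register,
+// aligned with B before the scaled add.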
+void kernel_dgead_4_1_lib4(int kmax, double alpha, double *A0, int sda, double *B)
+ {
+
+ if(kmax<=0)
+ return;
+
+ const int bs = 4;
+
+ double *A1 = A0 + bs*sda;
+
+ __m256d
+ a_0, a_1,
+ b_0,
+ alpha_0, c_0;
+
+ int k;
+
+	alpha_0 = _mm256_broadcast_sd( &alpha );
+
+ for(k=0; k<kmax-3; k+=4)
+ {
+
+ a_0 = _mm256_load_pd( &A0[0+bs*0] );
+ a_1 = _mm256_load_pd( &A1[0+bs*0] );
+ a_1 = _mm256_permute2f128_pd( a_0, a_1, 0x21 );
+ b_0 = _mm256_shuffle_pd( a_0, a_1, 0x5 );
+ c_0 = _mm256_load_pd( &B[0+bs*0] );
+ b_0 = _mm256_mul_pd( alpha_0, b_0 );
+ b_0 = _mm256_add_pd( c_0, b_0 );
+ _mm256_store_pd( &B[0+bs*0], b_0 );
+
+ a_0 = _mm256_load_pd( &A0[0+bs*1] );
+ a_1 = _mm256_load_pd( &A1[0+bs*1] );
+ a_1 = _mm256_permute2f128_pd( a_0, a_1, 0x21 );
+ b_0 = _mm256_shuffle_pd( a_0, a_1, 0x5 );
+ c_0 = _mm256_load_pd( &B[0+bs*1] );
+ b_0 = _mm256_mul_pd( alpha_0, b_0 );
+ b_0 = _mm256_add_pd( c_0, b_0 );
+ _mm256_store_pd( &B[0+bs*1], b_0 );
+
+ a_0 = _mm256_load_pd( &A0[0+bs*2] );
+ a_1 = _mm256_load_pd( &A1[0+bs*2] );
+ a_1 = _mm256_permute2f128_pd( a_0, a_1, 0x21 );
+ b_0 = _mm256_shuffle_pd( a_0, a_1, 0x5 );
+ c_0 = _mm256_load_pd( &B[0+bs*2] );
+ b_0 = _mm256_mul_pd( alpha_0, b_0 );
+ b_0 = _mm256_add_pd( c_0, b_0 );
+ _mm256_store_pd( &B[0+bs*2], b_0 );
+
+ a_0 = _mm256_load_pd( &A0[0+bs*3] );
+ a_1 = _mm256_load_pd( &A1[0+bs*3] );
+ a_1 = _mm256_permute2f128_pd( a_0, a_1, 0x21 );
+ b_0 = _mm256_shuffle_pd( a_0, a_1, 0x5 );
+ c_0 = _mm256_load_pd( &B[0+bs*3] );
+ b_0 = _mm256_mul_pd( alpha_0, b_0 );
+ b_0 = _mm256_add_pd( c_0, b_0 );
+ _mm256_store_pd( &B[0+bs*3], b_0 );
+
+ A0 += 16;
+ A1 += 16;
+ B += 16;
+
+ }
+ for(; k<kmax; k++)
+ {
+
+ a_0 = _mm256_load_pd( &A0[0+bs*0] );
+ a_1 = _mm256_load_pd( &A1[0+bs*0] );
+ a_1 = _mm256_permute2f128_pd( a_0, a_1, 0x21 );
+ b_0 = _mm256_shuffle_pd( a_0, a_1, 0x5 );
+ c_0 = _mm256_load_pd( &B[0+bs*0] );
+ b_0 = _mm256_mul_pd( alpha_0, b_0 );
+ b_0 = _mm256_add_pd( c_0, b_0 );
+ _mm256_store_pd( &B[0+bs*0], b_0 );
+
+ A0 += 4;
+ A1 += 4;
+ B += 4;
+
+ }
+
+ }
+
+
+
+// both A and B are aligned to 256-bit boundaries, 2 elements of A must be skipped
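+// note: here the strip of A starts 2 rows into panel A0, so a single lane swap
+// (permute2f128 with 0x21) is enough to gather panel rows 2..5 of A into one register
+// aligned with B.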
+void kernel_dgead_4_2_lib4(int kmax, double alpha, double *A0, int sda, double *B)
+ {
+
+ if(kmax<=0)
+ return;
+
+ const int bs = 4;
+
+ double *A1 = A0 + bs*sda;
+
+ __m256d
+ a_0, a_1,
+ b_0,
+ alpha_0, c_0;
+
+ int k;
+
+ alpha_0 = _mm256_broadcast_sd( &alpha );
+
+ for(k=0; k<kmax-3; k+=4)
+ {
+
+ a_0 = _mm256_load_pd( &A0[0+bs*0] );
+ a_1 = _mm256_load_pd( &A1[0+bs*0] );
+ b_0 = _mm256_permute2f128_pd( a_0, a_1, 0x21 );
+ c_0 = _mm256_load_pd( &B[0+bs*0] );
+ b_0 = _mm256_mul_pd( alpha_0, b_0 );
+ b_0 = _mm256_add_pd( c_0, b_0 );
+ _mm256_store_pd( &B[0+bs*0], b_0 );
+
+ a_0 = _mm256_load_pd( &A0[0+bs*1] );
+ a_1 = _mm256_load_pd( &A1[0+bs*1] );
+ b_0 = _mm256_permute2f128_pd( a_0, a_1, 0x21 );
+ c_0 = _mm256_load_pd( &B[0+bs*1] );
+ b_0 = _mm256_mul_pd( alpha_0, b_0 );
+ b_0 = _mm256_add_pd( c_0, b_0 );
+ _mm256_store_pd( &B[0+bs*1], b_0 );
+
+ a_0 = _mm256_load_pd( &A0[0+bs*2] );
+ a_1 = _mm256_load_pd( &A1[0+bs*2] );
+ b_0 = _mm256_permute2f128_pd( a_0, a_1, 0x21 );
+ c_0 = _mm256_load_pd( &B[0+bs*2] );
+ b_0 = _mm256_mul_pd( alpha_0, b_0 );
+ b_0 = _mm256_add_pd( c_0, b_0 );
+ _mm256_store_pd( &B[0+bs*2], b_0 );
+
+ a_0 = _mm256_load_pd( &A0[0+bs*3] );
+ a_1 = _mm256_load_pd( &A1[0+bs*3] );
+ b_0 = _mm256_permute2f128_pd( a_0, a_1, 0x21 );
+ c_0 = _mm256_load_pd( &B[0+bs*3] );
+ b_0 = _mm256_mul_pd( alpha_0, b_0 );
+ b_0 = _mm256_add_pd( c_0, b_0 );
+ _mm256_store_pd( &B[0+bs*3], b_0 );
+
+ A0 += 16;
+ A1 += 16;
+ B += 16;
+
+ }
+ for(; k<kmax; k++)
+ {
+
+ a_0 = _mm256_load_pd( &A0[0+bs*0] );
+ a_1 = _mm256_load_pd( &A1[0+bs*0] );
+ b_0 = _mm256_permute2f128_pd( a_0, a_1, 0x21 );
+ c_0 = _mm256_load_pd( &B[0+bs*0] );
+ b_0 = _mm256_mul_pd( alpha_0, b_0 );
+ b_0 = _mm256_add_pd( c_0, b_0 );
+ _mm256_store_pd( &B[0+bs*0], b_0 );
+
+ A0 += 4;
+ A1 += 4;
+ B += 4;
+
+ }
+
+ }
+
+
+
+// both A and B are aligned to 256-bit boundaries, 3 elements of A must be skipped
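+// note: here the strip of A starts 3 rows into panel A0; the permute2f128(...,0x21) gathers
+// panel rows 2..5 and the shuffle_pd(...,0x5) shifts by one more row, leaving rows 3..6 of
+// the A panels in b_0, aligned with B.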
+void kernel_dgead_4_3_lib4(int kmax, double alpha, double *A0, int sda, double *B)
+ {
+
+ if(kmax<=0)
+ return;
+
+ const int bs = 4;
+
+ double *A1 = A0 + bs*sda;
+
+ __m256d
+ a_0, a_1,
+ b_0,
+ alpha_0, c_0;
+
+ int k;
+
+ alpha_0 = _mm256_broadcast_sd( &alpha );
+
+ for(k=0; k<kmax-3; k+=4)
+ {
+
+ a_0 = _mm256_load_pd( &A0[0+bs*0] );
+ a_1 = _mm256_load_pd( &A1[0+bs*0] );
+ a_0 = _mm256_permute2f128_pd( a_0, a_1, 0x21 );
+ b_0 = _mm256_shuffle_pd( a_0, a_1, 0x5 );
+ c_0 = _mm256_load_pd( &B[0+bs*0] );
+ b_0 = _mm256_mul_pd( alpha_0, b_0 );
+ b_0 = _mm256_add_pd( c_0, b_0 );
+ _mm256_store_pd( &B[0+bs*0], b_0 );
+
+ a_0 = _mm256_load_pd( &A0[0+bs*1] );
+ a_1 = _mm256_load_pd( &A1[0+bs*1] );
+ a_0 = _mm256_permute2f128_pd( a_0, a_1, 0x21 );
+ b_0 = _mm256_shuffle_pd( a_0, a_1, 0x5 );
+ c_0 = _mm256_load_pd( &B[0+bs*1] );
+ b_0 = _mm256_mul_pd( alpha_0, b_0 );
+ b_0 = _mm256_add_pd( c_0, b_0 );
+ _mm256_store_pd( &B[0+bs*1], b_0 );
+
+ a_0 = _mm256_load_pd( &A0[0+bs*2] );
+ a_1 = _mm256_load_pd( &A1[0+bs*2] );
+ a_0 = _mm256_permute2f128_pd( a_0, a_1, 0x21 );
+ b_0 = _mm256_shuffle_pd( a_0, a_1, 0x5 );
+ c_0 = _mm256_load_pd( &B[0+bs*2] );
+ b_0 = _mm256_mul_pd( alpha_0, b_0 );
+ b_0 = _mm256_add_pd( c_0, b_0 );
+ _mm256_store_pd( &B[0+bs*2], b_0 );
+
+ a_0 = _mm256_load_pd( &A0[0+bs*3] );
+ a_1 = _mm256_load_pd( &A1[0+bs*3] );
+ a_0 = _mm256_permute2f128_pd( a_0, a_1, 0x21 );
+ b_0 = _mm256_shuffle_pd( a_0, a_1, 0x5 );
+ c_0 = _mm256_load_pd( &B[0+bs*3] );
+ b_0 = _mm256_mul_pd( alpha_0, b_0 );
+ b_0 = _mm256_add_pd( c_0, b_0 );
+ _mm256_store_pd( &B[0+bs*3], b_0 );
+
+ A0 += 16;
+ A1 += 16;
+ B += 16;
+
+ }
+ for(; k<kmax; k++)
+ {
+
+ a_0 = _mm256_load_pd( &A0[0+bs*0] );
+ a_1 = _mm256_load_pd( &A1[0+bs*0] );
+ a_0 = _mm256_permute2f128_pd( a_0, a_1, 0x21 );
+ b_0 = _mm256_shuffle_pd( a_0, a_1, 0x5 );
+ c_0 = _mm256_load_pd( &B[0+bs*0] );
+ b_0 = _mm256_mul_pd( alpha_0, b_0 );
+ b_0 = _mm256_add_pd( c_0, b_0 );
+ _mm256_store_pd( &B[0+bs*0], b_0 );
+
+ A0 += 4;
+ A1 += 4;
+ B += 4;
+
+ }
+
+ }
+
+
+
+// both A and B are aligned to 64-bit boundaries
+void kernel_dgead_3_0_lib4(int kmax, double alpha, double *A, double *B)
+ {
+
+ if(kmax<=0)
+ return;
+
+ const int bs = 4;
+
+ __m128d
+ a_0, a_1,
+ alpha_0, c_0, c_1;
+
+ int k;
+
+ alpha_0 = _mm_loaddup_pd( &alpha );
+
+ for(k=0; k<kmax-3; k+=4)
+ {
+
+ a_0 = _mm_loadu_pd( &A[0+bs*0] );
+ a_1 = _mm_load_sd( &A[2+bs*0] );
+ c_0 = _mm_loadu_pd( &B[0+bs*0] );
+ c_1 = _mm_load_sd( &B[2+bs*0] );
+ a_0 = _mm_mul_pd( alpha_0, a_0 );
+ a_1 = _mm_mul_sd( alpha_0, a_1 );
+ a_0 = _mm_add_pd( c_0, a_0 );
+ a_1 = _mm_add_sd( c_1, a_1 );
+ _mm_storeu_pd( &B[0+bs*0], a_0 );
+ _mm_store_sd( &B[2+bs*0], a_1 );
+
+ a_0 = _mm_loadu_pd( &A[0+bs*1] );
+ a_1 = _mm_load_sd( &A[2+bs*1] );
+ c_0 = _mm_loadu_pd( &B[0+bs*1] );
+ c_1 = _mm_load_sd( &B[2+bs*1] );
+ a_0 = _mm_mul_pd( alpha_0, a_0 );
+ a_1 = _mm_mul_sd( alpha_0, a_1 );
+ a_0 = _mm_add_pd( c_0, a_0 );
+ a_1 = _mm_add_sd( c_1, a_1 );
+ _mm_storeu_pd( &B[0+bs*1], a_0 );
+ _mm_store_sd( &B[2+bs*1], a_1 );
+
+ a_0 = _mm_loadu_pd( &A[0+bs*2] );
+ a_1 = _mm_load_sd( &A[2+bs*2] );
+ c_0 = _mm_loadu_pd( &B[0+bs*2] );
+ c_1 = _mm_load_sd( &B[2+bs*2] );
+ a_0 = _mm_mul_pd( alpha_0, a_0 );
+ a_1 = _mm_mul_sd( alpha_0, a_1 );
+ a_0 = _mm_add_pd( c_0, a_0 );
+ a_1 = _mm_add_sd( c_1, a_1 );
+ _mm_storeu_pd( &B[0+bs*2], a_0 );
+ _mm_store_sd( &B[2+bs*2], a_1 );
+
+ a_0 = _mm_loadu_pd( &A[0+bs*3] );
+ a_1 = _mm_load_sd( &A[2+bs*3] );
+ c_0 = _mm_loadu_pd( &B[0+bs*3] );
+ c_1 = _mm_load_sd( &B[2+bs*3] );
+ a_0 = _mm_mul_pd( alpha_0, a_0 );
+ a_1 = _mm_mul_sd( alpha_0, a_1 );
+ a_0 = _mm_add_pd( c_0, a_0 );
+ a_1 = _mm_add_sd( c_1, a_1 );
+ _mm_storeu_pd( &B[0+bs*3], a_0 );
+ _mm_store_sd( &B[2+bs*3], a_1 );
+
+ A += 16;
+ B += 16;
+
+ }
+ for(; k<kmax; k++)
+ {
+
+ a_0 = _mm_loadu_pd( &A[0+bs*0] );
+ a_1 = _mm_load_sd( &A[2+bs*0] );
+ c_0 = _mm_loadu_pd( &B[0+bs*0] );
+ c_1 = _mm_load_sd( &B[2+bs*0] );
+ a_0 = _mm_mul_pd( alpha_0, a_0 );
+ a_1 = _mm_mul_sd( alpha_0, a_1 );
+ a_0 = _mm_add_pd( c_0, a_0 );
+ a_1 = _mm_add_sd( c_1, a_1 );
+ _mm_storeu_pd( &B[0+bs*0], a_0 );
+ _mm_store_sd( &B[2+bs*0], a_1 );
+
+ A += 4;
+ B += 4;
+
+ }
+
+ }
+
+
+
+// both A and B are aligned to 256-bit boundaries, 2 elements of A must be skipped
+void kernel_dgead_3_2_lib4(int kmax, double alpha, double *A0, int sda, double *B)
+ {
+
+ if(kmax<=0)
+ return;
+
+ const int bs = 4;
+
+ double *A1 = A0 + bs*sda;
+
+ __m128d
+ a_0, a_1,
+ alpha_0, c_0, c_1;
+
+ int k;
+
+ alpha_0 = _mm_loaddup_pd( &alpha );
+
+ for(k=0; k<kmax-3; k+=4)
+ {
+
+ a_0 = _mm_loadu_pd( &A0[2+bs*0] );
+ a_1 = _mm_load_sd( &A1[0+bs*0] );
+ c_0 = _mm_loadu_pd( &B[0+bs*0] );
+ c_1 = _mm_load_sd( &B[2+bs*0] );
+ a_0 = _mm_mul_pd( alpha_0, a_0 );
+ a_1 = _mm_mul_sd( alpha_0, a_1 );
+ a_0 = _mm_add_pd( c_0, a_0 );
+ a_1 = _mm_add_sd( c_1, a_1 );
+ _mm_storeu_pd( &B[0+bs*0], a_0 );
+ _mm_store_sd( &B[2+bs*0], a_1 );
+
+ a_0 = _mm_loadu_pd( &A0[2+bs*1] );
+ a_1 = _mm_load_sd( &A1[0+bs*1] );
+ c_0 = _mm_loadu_pd( &B[0+bs*1] );
+ c_1 = _mm_load_sd( &B[2+bs*1] );
+ a_0 = _mm_mul_pd( alpha_0, a_0 );
+ a_1 = _mm_mul_sd( alpha_0, a_1 );
+ a_0 = _mm_add_pd( c_0, a_0 );
+ a_1 = _mm_add_sd( c_1, a_1 );
+ _mm_storeu_pd( &B[0+bs*1], a_0 );
+ _mm_store_sd( &B[2+bs*1], a_1 );
+
+ a_0 = _mm_loadu_pd( &A0[2+bs*2] );
+ a_1 = _mm_load_sd( &A1[0+bs*2] );
+ c_0 = _mm_loadu_pd( &B[0+bs*2] );
+ c_1 = _mm_load_sd( &B[2+bs*2] );
+ a_0 = _mm_mul_pd( alpha_0, a_0 );
+ a_1 = _mm_mul_sd( alpha_0, a_1 );
+ a_0 = _mm_add_pd( c_0, a_0 );
+ a_1 = _mm_add_sd( c_1, a_1 );
+ _mm_storeu_pd( &B[0+bs*2], a_0 );
+ _mm_store_sd( &B[2+bs*2], a_1 );
+
+ a_0 = _mm_loadu_pd( &A0[2+bs*3] );
+ a_1 = _mm_load_sd( &A1[0+bs*3] );
+ c_0 = _mm_loadu_pd( &B[0+bs*3] );
+ c_1 = _mm_load_sd( &B[2+bs*3] );
+ a_0 = _mm_mul_pd( alpha_0, a_0 );
+ a_1 = _mm_mul_sd( alpha_0, a_1 );
+ a_0 = _mm_add_pd( c_0, a_0 );
+ a_1 = _mm_add_sd( c_1, a_1 );
+ _mm_storeu_pd( &B[0+bs*3], a_0 );
+ _mm_store_sd( &B[2+bs*3], a_1 );
+
+ A0 += 16;
+ A1 += 16;
+ B += 16;
+
+ }
+ for(; k<kmax; k++)
+ {
+
+ a_0 = _mm_loadu_pd( &A0[2+bs*0] );
+ a_1 = _mm_load_sd( &A1[0+bs*0] );
+ c_0 = _mm_loadu_pd( &B[0+bs*0] );
+ c_1 = _mm_load_sd( &B[2+bs*0] );
+ a_0 = _mm_mul_pd( alpha_0, a_0 );
+ a_1 = _mm_mul_sd( alpha_0, a_1 );
+ a_0 = _mm_add_pd( c_0, a_0 );
+ a_1 = _mm_add_sd( c_1, a_1 );
+ _mm_storeu_pd( &B[0+bs*0], a_0 );
+ _mm_store_sd( &B[2+bs*0], a_1 );
+
+ A0 += 4;
+ A1 += 4;
+ B += 4;
+
+ }
+
+ }
+
+
+
+// both A and B are aligned to 256-bit boundaries, 3 elements of A must be skipped
+void kernel_dgead_3_3_lib4(int kmax, double alpha, double *A0, int sda, double *B)
+ {
+
+ if(kmax<=0)
+ return;
+
+ const int bs = 4;
+
+ double *A1 = A0 + bs*sda;
+
+ __m128d
+ a_0, a_1,
+ alpha_0, c_0, c_1;
+
+ int k;
+
+ alpha_0 = _mm_loaddup_pd( &alpha );
+
+ for(k=0; k<kmax-3; k+=4)
+ {
+
+ a_0 = _mm_load_sd( &A0[3+bs*0] );
+ a_1 = _mm_loadu_pd( &A1[0+bs*0] );
+ c_0 = _mm_load_sd( &B[0+bs*0] );
+ c_1 = _mm_loadu_pd( &B[1+bs*0] );
+ a_0 = _mm_mul_sd( alpha_0, a_0 );
+ a_1 = _mm_mul_pd( alpha_0, a_1 );
+ a_0 = _mm_add_sd( c_0, a_0 );
+ a_1 = _mm_add_pd( c_1, a_1 );
+ _mm_store_sd( &B[0+bs*0], a_0 );
+ _mm_storeu_pd( &B[1+bs*0], a_1 );
+
+ a_0 = _mm_load_sd( &A0[3+bs*1] );
+ a_1 = _mm_loadu_pd( &A1[0+bs*1] );
+ c_0 = _mm_load_sd( &B[0+bs*1] );
+ c_1 = _mm_loadu_pd( &B[1+bs*1] );
+ a_0 = _mm_mul_sd( alpha_0, a_0 );
+ a_1 = _mm_mul_pd( alpha_0, a_1 );
+ a_0 = _mm_add_sd( c_0, a_0 );
+ a_1 = _mm_add_pd( c_1, a_1 );
+ _mm_store_sd( &B[0+bs*1], a_0 );
+ _mm_storeu_pd( &B[1+bs*1], a_1 );
+
+ a_0 = _mm_load_sd( &A0[3+bs*2] );
+ a_1 = _mm_loadu_pd( &A1[0+bs*2] );
+ c_0 = _mm_load_sd( &B[0+bs*2] );
+ c_1 = _mm_loadu_pd( &B[1+bs*2] );
+ a_0 = _mm_mul_sd( alpha_0, a_0 );
+ a_1 = _mm_mul_pd( alpha_0, a_1 );
+ a_0 = _mm_add_sd( c_0, a_0 );
+ a_1 = _mm_add_pd( c_1, a_1 );
+ _mm_store_sd( &B[0+bs*2], a_0 );
+ _mm_storeu_pd( &B[1+bs*2], a_1 );
+
+ a_0 = _mm_load_sd( &A0[3+bs*3] );
+ a_1 = _mm_loadu_pd( &A1[0+bs*3] );
+ c_0 = _mm_load_sd( &B[0+bs*3] );
+ c_1 = _mm_loadu_pd( &B[1+bs*3] );
+ a_0 = _mm_mul_sd( alpha_0, a_0 );
+ a_1 = _mm_mul_pd( alpha_0, a_1 );
+ a_0 = _mm_add_sd( c_0, a_0 );
+ a_1 = _mm_add_pd( c_1, a_1 );
+ _mm_store_sd( &B[0+bs*3], a_0 );
+ _mm_storeu_pd( &B[1+bs*3], a_1 );
+
+ A0 += 16;
+ A1 += 16;
+ B += 16;
+
+ }
+ for(; k<kmax; k++)
+ {
+
+ a_0 = _mm_load_sd( &A0[3+bs*0] );
+ a_1 = _mm_loadu_pd( &A1[0+bs*0] );
+ c_0 = _mm_load_sd( &B[0+bs*0] );
+ c_1 = _mm_loadu_pd( &B[1+bs*0] );
+ a_0 = _mm_mul_sd( alpha_0, a_0 );
+ a_1 = _mm_mul_pd( alpha_0, a_1 );
+ a_0 = _mm_add_sd( c_0, a_0 );
+ a_1 = _mm_add_pd( c_1, a_1 );
+ _mm_store_sd( &B[0+bs*0], a_0 );
+ _mm_storeu_pd( &B[1+bs*0], a_1 );
+
+ A0 += 4;
+ A1 += 4;
+ B += 4;
+
+ }
+
+ }
+
+
+
+// both A and B are aligned to 64-bit boundaries
+void kernel_dgead_2_0_lib4(int kmax, double alpha, double *A, double *B)
+ {
+
+ if(kmax<=0)
+ return;
+
+ const int bs = 4;
+
+ __m128d
+ a_0, c_0, alpha_0;
+
+ int k;
+
+ alpha_0 = _mm_loaddup_pd( &alpha );
+
+ for(k=0; k<kmax-3; k+=4)
+ {
+
+ a_0 = _mm_loadu_pd( &A[0+bs*0] );
+ c_0 = _mm_loadu_pd( &B[0+bs*0] );
+ a_0 = _mm_mul_pd( alpha_0, a_0 );
+ a_0 = _mm_add_pd( c_0, a_0 );
+ _mm_storeu_pd( &B[0+bs*0], a_0 );
+
+ a_0 = _mm_loadu_pd( &A[0+bs*1] );
+ c_0 = _mm_loadu_pd( &B[0+bs*1] );
+ a_0 = _mm_mul_pd( alpha_0, a_0 );
+ a_0 = _mm_add_pd( c_0, a_0 );
+ _mm_storeu_pd( &B[0+bs*1], a_0 );
+
+ a_0 = _mm_loadu_pd( &A[0+bs*2] );
+ c_0 = _mm_loadu_pd( &B[0+bs*2] );
+ a_0 = _mm_mul_pd( alpha_0, a_0 );
+ a_0 = _mm_add_pd( c_0, a_0 );
+ _mm_storeu_pd( &B[0+bs*2], a_0 );
+
+ a_0 = _mm_loadu_pd( &A[0+bs*3] );
+ c_0 = _mm_loadu_pd( &B[0+bs*3] );
+ a_0 = _mm_mul_pd( alpha_0, a_0 );
+ a_0 = _mm_add_pd( c_0, a_0 );
+ _mm_storeu_pd( &B[0+bs*3], a_0 );
+
+ A += 16;
+ B += 16;
+
+ }
+ for(; k<kmax; k++)
+ {
+
+ a_0 = _mm_loadu_pd( &A[0+bs*0] );
+ c_0 = _mm_loadu_pd( &B[0+bs*0] );
+ a_0 = _mm_mul_pd( alpha_0, a_0 );
+ a_0 = _mm_add_pd( c_0, a_0 );
+ _mm_storeu_pd( &B[0+bs*0], a_0 );
+
+ A += 4;
+ B += 4;
+
+ }
+
+ }
+
+
+
+// both A and B are aligned to 128-bit boundaries, 3 elements of A must be skipped
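+// note: the 2-row strip of A straddles the panel boundary (row 3 of panel A0 and row 0 of
+// panel A1); _mm_load_sd / _mm_loadh_pd gather the two doubles into one xmm register before
+// the usual scaled add into B.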
+void kernel_dgead_2_3_lib4(int kmax, double alpha, double *A0, int sda, double *B)
+ {
+
+ if(kmax<=0)
+ return;
+
+ const int bs = 4;
+
+ double *A1 = A0 + bs*sda;
+
+ __m128d
+ a_0, c_0, alpha_0;
+
+ int k;
+
+ alpha_0 = _mm_loaddup_pd( &alpha );
+
+ for(k=0; k<kmax-3; k+=4)
+ {
+
+ a_0 = _mm_load_sd( &A0[3+bs*0] );
+ a_0 = _mm_loadh_pd( a_0, &A1[0+bs*0] );
+ c_0 = _mm_loadu_pd( &B[0+bs*0] );
+ a_0 = _mm_mul_pd( alpha_0, a_0 );
+ a_0 = _mm_add_pd( c_0, a_0 );
+ _mm_storeu_pd( &B[0+bs*0], a_0 );
+
+ a_0 = _mm_load_sd( &A0[3+bs*1] );
+ a_0 = _mm_loadh_pd( a_0, &A1[0+bs*1] );
+ c_0 = _mm_loadu_pd( &B[0+bs*1] );
+ a_0 = _mm_mul_pd( alpha_0, a_0 );
+ a_0 = _mm_add_pd( c_0, a_0 );
+ _mm_storeu_pd( &B[0+bs*1], a_0 );
+
+ a_0 = _mm_load_sd( &A0[3+bs*2] );
+ a_0 = _mm_loadh_pd( a_0, &A1[0+bs*2] );
+ c_0 = _mm_loadu_pd( &B[0+bs*2] );
+ a_0 = _mm_mul_pd( alpha_0, a_0 );
+ a_0 = _mm_add_pd( c_0, a_0 );
+ _mm_storeu_pd( &B[0+bs*2], a_0 );
+
+ a_0 = _mm_load_sd( &A0[3+bs*3] );
+ a_0 = _mm_loadh_pd( a_0, &A1[0+bs*3] );
+ c_0 = _mm_loadu_pd( &B[0+bs*3] );
+ a_0 = _mm_mul_pd( alpha_0, a_0 );
+ a_0 = _mm_add_pd( c_0, a_0 );
+ _mm_storeu_pd( &B[0+bs*3], a_0 );
+
+ A0 += 16;
+ A1 += 16;
+ B += 16;
+
+ }
+ for(; k<kmax; k++)
+ {
+
+ a_0 = _mm_load_sd( &A0[3+bs*0] );
+ a_0 = _mm_loadh_pd( a_0, &A1[0+bs*0] );
+ c_0 = _mm_loadu_pd( &B[0+bs*0] );
+ a_0 = _mm_mul_pd( alpha_0, a_0 );
+ a_0 = _mm_add_pd( c_0, a_0 );
+ _mm_storeu_pd( &B[0+bs*0], a_0 );
+
+ A0 += 4;
+ A1 += 4;
+ B += 4;
+
+ }
+
+ }
+
+
+
+// both A and B are aligned to 64-bit boundaries
+void kernel_dgead_1_0_lib4(int kmax, double alpha, double *A, double *B)
+ {
+
+ if(kmax<=0)
+ return;
+
+ const int bs = 4;
+
+ __m128d
+ a_0, c_0, alpha_0;
+
+ int k;
+
+ alpha_0 = _mm_load_sd( &alpha );
+
+ for(k=0; k<kmax-3; k+=4)
+ {
+
+ a_0 = _mm_load_sd( &A[0+bs*0] );
+ c_0 = _mm_load_sd( &B[0+bs*0] );
+ a_0 = _mm_mul_sd( alpha_0, a_0 );
+ a_0 = _mm_add_sd( c_0, a_0 );
+ _mm_store_sd( &B[0+bs*0], a_0 );
+
+ a_0 = _mm_load_sd( &A[0+bs*1] );
+ c_0 = _mm_load_sd( &B[0+bs*1] );
+ a_0 = _mm_mul_sd( alpha_0, a_0 );
+ a_0 = _mm_add_sd( c_0, a_0 );
+ _mm_store_sd( &B[0+bs*1], a_0 );
+
+ a_0 = _mm_load_sd( &A[0+bs*2] );
+ c_0 = _mm_load_sd( &B[0+bs*2] );
+ a_0 = _mm_mul_sd( alpha_0, a_0 );
+ a_0 = _mm_add_sd( c_0, a_0 );
+ _mm_store_sd( &B[0+bs*2], a_0 );
+
+ a_0 = _mm_load_sd( &A[0+bs*3] );
+ c_0 = _mm_load_sd( &B[0+bs*3] );
+ a_0 = _mm_mul_sd( alpha_0, a_0 );
+ a_0 = _mm_add_sd( c_0, a_0 );
+ _mm_store_sd( &B[0+bs*3], a_0 );
+
+ A += 16;
+ B += 16;
+
+ }
+ for(; k<kmax; k++)
+ {
+
+ a_0 = _mm_load_sd( &A[0+bs*0] );
+ c_0 = _mm_load_sd( &B[0+bs*0] );
+ a_0 = _mm_mul_sd( alpha_0, a_0 );
+ a_0 = _mm_add_sd( c_0, a_0 );
+ _mm_store_sd( &B[0+bs*0], a_0 );
+
+ A += 4;
+ B += 4;
+
+ }
+
+ }
+
+
+
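+// set all elements of a 4-row panel strip (kmax columns) to the scalar alpha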
+void kernel_dgeset_4_lib4(int kmax, double alpha, double *A)
+ {
+
+ int k;
+
+ __m256d
+ a0;
+
+ a0 = _mm256_broadcast_sd( &alpha );
+
+ for(k=0; k<kmax-3; k+=4)
+ {
+
+ _mm256_store_pd( &A[0], a0 );
+ _mm256_store_pd( &A[4], a0 );
+ _mm256_store_pd( &A[8], a0 );
+ _mm256_store_pd( &A[12], a0 );
+
+ A += 16;
+
+ }
+ for(; k<kmax; k++)
+ {
+
+ _mm256_store_pd( &A[0], a0 );
+
+ A += 4;
+
+ }
+
+ }
+
+
+// A lower triangular: set kmax full 4-wide columns, then the trailing 4x4 lower triangle
+void kernel_dtrset_4_lib4(int kmax, double alpha, double *A)
+ {
+
+ int k;
+
+ __m256d
+ a0;
+
+ a0 = _mm256_broadcast_sd( &alpha );
+
+ for(k=0; k<kmax-3; k+=4)
+ {
+
+ _mm256_store_pd( &A[0], a0 );
+ _mm256_store_pd( &A[4], a0 );
+ _mm256_store_pd( &A[8], a0 );
+ _mm256_store_pd( &A[12], a0 );
+
+ A += 16;
+
+ }
+ for(; k<kmax; k++)
+ {
+
+ _mm256_store_pd( &A[0], a0 );
+
+ A += 4;
+
+ }
+
+ // final 4x4 triangle
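+	// note: the stores below write only the lower-triangular entries of the trailing 4x4
+	// block: all of column 0 (indices 0-3), rows 1-3 of column 1 (5-7), rows 2-3 of
+	// column 2 (10-11) and row 3 of column 3 (15).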
+ _mm256_store_pd( &A[0], a0 );
+
+ _mm_store_sd( &A[5], _mm256_castpd256_pd128( a0 ) );
+ _mm_store_pd( &A[6], _mm256_castpd256_pd128( a0 ) );
+
+ _mm_store_pd( &A[10], _mm256_castpd256_pd128( a0 ) );
+
+ _mm_store_sd( &A[15], _mm256_castpd256_pd128( a0 ) );
+
+ }
+
+
+
diff --git a/auxiliary/avx/kernel_dgetr_lib4.c b/auxiliary/avx/kernel_dgetr_lib4.c
new file mode 100644
index 0000000..29d095b
--- /dev/null
+++ b/auxiliary/avx/kernel_dgetr_lib4.c
@@ -0,0 +1,490 @@
+/**************************************************************************************************
+* *
+* This file is part of BLASFEO. *
+* *
+* BLASFEO -- BLAS For Embedded Optimization. *
+* Copyright (C) 2016-2017 by Gianluca Frison. *
+* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl. *
+* All rights reserved. *
+* *
+* HPMPC is free software; you can redistribute it and/or *
+* modify it under the terms of the GNU Lesser General Public *
+* License as published by the Free Software Foundation; either *
+* version 2.1 of the License, or (at your option) any later version. *
+* *
+* HPMPC is distributed in the hope that it will be useful, *
+* but WITHOUT ANY WARRANTY; without even the implied warranty of *
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. *
+* See the GNU Lesser General Public License for more details. *
+* *
+* You should have received a copy of the GNU Lesser General Public *
+* License along with HPMPC; if not, write to the Free Software *
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA *
+* *
+* Author: Gianluca Frison, giaf (at) dtu.dk *
+* gianluca.frison (at) imtek.uni-freiburg.de *
+* *
+**************************************************************************************************/
+
+#include <mmintrin.h>
+#include <xmmintrin.h> // SSE
+#include <emmintrin.h> // SSE2
+#include <pmmintrin.h> // SSE3
+#include <smmintrin.h> // SSE4
+#include <immintrin.h> // AVX
+
+
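+// note on the data layout assumed by these kernels: matrices are stored in BLASFEO's
+// panel-major format, i.e. in horizontal panels of bs=4 rows; within a panel the elements
+// are column-major, so element (i,j) of a panel lives at p[i+bs*j], and the next panel
+// starts bs*sd doubles later (sd being the panel stride sda/sdc used below).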
+
+// transpose of general matrices, read along panels, write across panels
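+// note: in scalar terms each processed column j of A becomes row j of C scaled by alpha
+// (C[j + bs*i] = alpha * A[i + bs*j] within a 4x4 block). The main loops transpose 4x4
+// blocks in registers: unpacklo/unpackhi interleave pairs of columns within 128-bit lanes
+// and permute2f128 then recombines the lanes into full transposed rows. kna is the number
+// of rows of C left before its next panel boundary, handled one row at a time by the head
+// loop; the final cleanup loop handles the columns left over when kmax is not a multiple of 4.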
+void kernel_dgetr_4_lib4(int tri, int kmax, int kna, double alpha, double *A, double *C, int sdc)
+ {
+
+ if(tri==1)
+ {
+ // A is lower triangular, C is upper triangular
+ // kmax+1 4-wide + end 3x3 triangle
+
+ kmax += 1;
+ }
+
+ const int bs = 4;
+
+ __m256d
+ alph,
+ v0, v1, v2, v3,
+ v4, v5, v6, v7;
+
+ alph = _mm256_broadcast_sd( &alpha );
+
+ int k;
+
+ k = 0;
+
+ if(kmax<kna)
+ goto cleanup_loop;
+
+ if(kna>0)
+ {
+ for( ; k<kna; k++)
+ {
+ C[0+bs*0] = alpha * A[0+bs*0];
+ C[0+bs*1] = alpha * A[1+bs*0];
+ C[0+bs*2] = alpha * A[2+bs*0];
+ C[0+bs*3] = alpha * A[3+bs*0];
+
+ C += 1;
+ A += bs;
+ }
+ C += bs*(sdc-1);
+ }
+
+ for( ; k<kmax-7; k+=8)
+ {
+
+ v0 = _mm256_load_pd( &A[0+bs*0] ); // 00 10 20 30
+ v1 = _mm256_load_pd( &A[0+bs*1] ); // 01 11 21 31
+ v4 = _mm256_unpacklo_pd( v0, v1 ); // 00 01 20 21
+ v5 = _mm256_unpackhi_pd( v0, v1 ); // 10 11 30 31
+ v2 = _mm256_load_pd( &A[0+bs*2] ); // 02 12 22 32
+ v3 = _mm256_load_pd( &A[0+bs*3] ); // 03 13 23 33
+ v6 = _mm256_unpacklo_pd( v2, v3 ); // 02 03 22 23
+ v7 = _mm256_unpackhi_pd( v2, v3 ); // 12 13 32 33
+
+ A += bs*bs;
+
+ v0 = _mm256_permute2f128_pd( v4, v6, 0x20 ); // 00 01 02 03
+ v0 = _mm256_mul_pd( v0, alph );
+ _mm256_store_pd( &C[0+bs*0], v0 );
+ v2 = _mm256_permute2f128_pd( v4, v6, 0x31 ); // 20 21 22 23
+ v2 = _mm256_mul_pd( v2, alph );
+ _mm256_store_pd( &C[0+bs*2], v2 );
+ v1 = _mm256_permute2f128_pd( v5, v7, 0x20 ); // 10 11 12 13
+ v1 = _mm256_mul_pd( v1, alph );
+ _mm256_store_pd( &C[0+bs*1], v1 );
+ v3 = _mm256_permute2f128_pd( v5, v7, 0x31 ); // 30 31 32 33
+ v3 = _mm256_mul_pd( v3, alph );
+ _mm256_store_pd( &C[0+bs*3], v3 );
+
+ C += bs*sdc;
+
+ v0 = _mm256_load_pd( &A[0+bs*0] ); // 00 10 20 30
+ v1 = _mm256_load_pd( &A[0+bs*1] ); // 01 11 21 31
+ v4 = _mm256_unpacklo_pd( v0, v1 ); // 00 01 20 21
+ v5 = _mm256_unpackhi_pd( v0, v1 ); // 10 11 30 31
+ v2 = _mm256_load_pd( &A[0+bs*2] ); // 02 12 22 32
+ v3 = _mm256_load_pd( &A[0+bs*3] ); // 03 13 23 33
+ v6 = _mm256_unpacklo_pd( v2, v3 ); // 02 03 22 23
+ v7 = _mm256_unpackhi_pd( v2, v3 ); // 12 13 32 33
+
+ A += bs*bs;
+
+ v0 = _mm256_permute2f128_pd( v4, v6, 0x20 ); // 00 01 02 03
+ v0 = _mm256_mul_pd( v0, alph );
+ _mm256_store_pd( &C[0+bs*0], v0 );
+ v2 = _mm256_permute2f128_pd( v4, v6, 0x31 ); // 20 21 22 23
+ v2 = _mm256_mul_pd( v2, alph );
+ _mm256_store_pd( &C[0+bs*2], v2 );
+ v1 = _mm256_permute2f128_pd( v5, v7, 0x20 ); // 10 11 12 13
+ v1 = _mm256_mul_pd( v1, alph );
+ _mm256_store_pd( &C[0+bs*1], v1 );
+ v3 = _mm256_permute2f128_pd( v5, v7, 0x31 ); // 30 31 32 33
+ v3 = _mm256_mul_pd( v3, alph );
+ _mm256_store_pd( &C[0+bs*3], v3 );
+
+ C += bs*sdc;
+
+ }
+
+ for( ; k<kmax-3; k+=4)
+ {
+
+ v0 = _mm256_load_pd( &A[0+bs*0] ); // 00 10 20 30
+ v1 = _mm256_load_pd( &A[0+bs*1] ); // 01 11 21 31
+ v4 = _mm256_unpacklo_pd( v0, v1 ); // 00 01 20 21
+ v5 = _mm256_unpackhi_pd( v0, v1 ); // 10 11 30 31
+ v2 = _mm256_load_pd( &A[0+bs*2] ); // 02 12 22 32
+ v3 = _mm256_load_pd( &A[0+bs*3] ); // 03 13 23 33
+ v6 = _mm256_unpacklo_pd( v2, v3 ); // 02 03 22 23
+ v7 = _mm256_unpackhi_pd( v2, v3 ); // 12 13 32 33
+
+ A += bs*bs;
+
+ v0 = _mm256_permute2f128_pd( v4, v6, 0x20 ); // 00 01 02 03
+ v0 = _mm256_mul_pd( v0, alph );
+ _mm256_store_pd( &C[0+bs*0], v0 );
+ v2 = _mm256_permute2f128_pd( v4, v6, 0x31 ); // 20 21 22 23
+ v2 = _mm256_mul_pd( v2, alph );
+ _mm256_store_pd( &C[0+bs*2], v2 );
+ v1 = _mm256_permute2f128_pd( v5, v7, 0x20 ); // 10 11 12 13
+ v1 = _mm256_mul_pd( v1, alph );
+ _mm256_store_pd( &C[0+bs*1], v1 );
+ v3 = _mm256_permute2f128_pd( v5, v7, 0x31 ); // 30 31 32 33
+ v3 = _mm256_mul_pd( v3, alph );
+ _mm256_store_pd( &C[0+bs*3], v3 );
+
+ C += bs*sdc;
+
+ }
+
+ cleanup_loop:
+
+ for( ; k<kmax; k++)
+ {
+ C[0+bs*0] = alpha * A[0+bs*0];
+ C[0+bs*1] = alpha * A[1+bs*0];
+ C[0+bs*2] = alpha * A[2+bs*0];
+ C[0+bs*3] = alpha * A[3+bs*0];
+
+ C += 1;
+ A += bs;
+ }
+
+ if(tri==1)
+ {
+ // end 3x3 triangle
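+		// note: (bs-kna+kmax)%bs is the row offset reached inside the current panel of C,
+		// so this recomputes kna as the rows of C left in that panel; the branches below
+		// place the trailing 3x3 triangle across the panel boundary accordingly.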
+ kna = (bs-(bs-kna+kmax)%bs)%bs;
+
+ if(kna==1)
+ {
+ C[0+bs*1] = alpha * A[1+bs*0];
+ C[0+bs*2] = alpha * A[2+bs*0];
+ C[0+bs*3] = alpha * A[3+bs*0];
+ C[1+bs*(sdc+1)] = alpha * A[2+bs*1];
+ C[1+bs*(sdc+2)] = alpha * A[3+bs*1];
+ C[2+bs*(sdc+2)] = alpha * A[3+bs*2];
+ }
+ else if(kna==2)
+ {
+ C[0+bs*1] = alpha * A[1+bs*0];
+ C[0+bs*2] = alpha * A[2+bs*0];
+ C[0+bs*3] = alpha * A[3+bs*0];
+ C[1+bs*2] = alpha * A[2+bs*1];
+ C[1+bs*3] = alpha * A[3+bs*1];
+ C[2+bs*(sdc+2)] = alpha * A[3+bs*2];
+ }
+ else
+ {
+ C[0+bs*1] = alpha * A[1+bs*0];
+ C[0+bs*2] = alpha * A[2+bs*0];
+ C[0+bs*3] = alpha * A[3+bs*0];
+ C[1+bs*2] = alpha * A[2+bs*1];
+ C[1+bs*3] = alpha * A[3+bs*1];
+ C[2+bs*3] = alpha * A[3+bs*2];
+ }
+ }
+
+ }
+
+
+
+// transpose of general matrices, read along panels, write across panels
+void kernel_dgetr_3_lib4(int tri, int kmax, int kna, double alpha, double *A, double *C, int sdc)
+ {
+
+ if(tri==1)
+ {
+ // A is lower triangular, C is upper triangular
+ // kmax+1 3-wide + end 2x2 triangle
+
+ kmax += 1;
+ }
+
+ const int bs = 4;
+
+ int k;
+
+ k = 0;
+
+ if(kmax<kna)
+ goto cleanup_loop;
+
+ if(kna>0)
+ {
+ for( ; k<kna; k++)
+ {
+ C[0+bs*0] = alpha * A[0+bs*0];
+ C[0+bs*1] = alpha * A[1+bs*0];
+ C[0+bs*2] = alpha * A[2+bs*0];
+
+ C += 1;
+ A += bs;
+ }
+ C += bs*(sdc-1);
+ }
+
+ for( ; k<kmax-3; k+=4)
+ {
+ C[0+bs*0] = alpha * A[0+bs*0];
+ C[0+bs*1] = alpha * A[1+bs*0];
+ C[0+bs*2] = alpha * A[2+bs*0];
+
+ C[1+bs*0] = alpha * A[0+bs*1];
+ C[1+bs*1] = alpha * A[1+bs*1];
+ C[1+bs*2] = alpha * A[2+bs*1];
+
+ C[2+bs*0] = alpha * A[0+bs*2];
+ C[2+bs*1] = alpha * A[1+bs*2];
+ C[2+bs*2] = alpha * A[2+bs*2];
+
+ C[3+bs*0] = alpha * A[0+bs*3];
+ C[3+bs*1] = alpha * A[1+bs*3];
+ C[3+bs*2] = alpha * A[2+bs*3];
+
+ C += bs*sdc;
+ A += bs*bs;
+ }
+
+ cleanup_loop:
+
+ for( ; k<kmax; k++)
+ {
+ C[0+bs*0] = alpha * A[0+bs*0];
+ C[0+bs*1] = alpha * A[1+bs*0];
+ C[0+bs*2] = alpha * A[2+bs*0];
+
+ C += 1;
+ A += bs;
+ }
+
+ if(tri==1)
+ {
+ // end 2x2 triangle
+ kna = (bs-(bs-kna+kmax)%bs)%bs;
+
+ if(kna==1)
+ {
+ C[0+bs*1] = alpha * A[1+bs*0];
+ C[0+bs*2] = alpha * A[2+bs*0];
+ C[1+bs*(sdc+1)] = alpha * A[2+bs*1];
+ }
+ else
+ {
+ C[0+bs*1] = alpha * A[1+bs*0];
+ C[0+bs*2] = alpha * A[2+bs*0];
+ C[1+bs*2] = alpha * A[2+bs*1];
+ }
+ }
+
+ }
+
+
+
+// transpose of general matrices, read along panels, write across panels
+void kernel_dgetr_2_lib4(int tri, int kmax, int kna, double alpha, double *A, double *C, int sdc)
+ {
+
+ if(tri==1)
+ {
+ // A is lower triangular, C is upper triangular
+ // kmax+1 2-wide + end 1x1 triangle
+
+ kmax += 1;
+ }
+
+ const int bs = 4;
+
+ int k;
+
+ k = 0;
+
+ if(kmax<kna)
+ goto cleanup_loop;
+
+ if(kna>0)
+ {
+ for( ; k<kna; k++)
+ {
+ C[0+bs*0] = alpha * A[0+bs*0];
+ C[0+bs*1] = alpha * A[1+bs*0];
+
+ C += 1;
+ A += bs;
+ }
+ C += bs*(sdc-1);
+ }
+
+ for( ; k<kmax-3; k+=4)
+ {
+ C[0+bs*0] = alpha * A[0+bs*0];
+ C[0+bs*1] = alpha * A[1+bs*0];
+
+ C[1+bs*0] = alpha * A[0+bs*1];
+ C[1+bs*1] = alpha * A[1+bs*1];
+
+ C[2+bs*0] = alpha * A[0+bs*2];
+ C[2+bs*1] = alpha * A[1+bs*2];
+
+ C[3+bs*0] = alpha * A[0+bs*3];
+ C[3+bs*1] = alpha * A[1+bs*3];
+
+ C += bs*sdc;
+ A += bs*bs;
+ }
+
+ cleanup_loop:
+
+ for( ; k<kmax; k++)
+ {
+ C[0+bs*0] = alpha * A[0+bs*0];
+ C[0+bs*1] = alpha * A[1+bs*0];
+
+ C += 1;
+ A += bs;
+ }
+
+ if(tri==1)
+ {
+ // end 1x1 triangle
+ C[0+bs*1] = alpha * A[1+bs*0];
+ }
+
+ }
+
+
+
+// transpose of general matrices, read along panels, write across panels
+void kernel_dgetr_1_lib4(int tri, int kmax, int kna, double alpha, double *A, double *C, int sdc)
+ {
+
+ if(tri==1)
+ {
+ // A is lower triangular, C is upper triangular
+ // kmax+1 1-wide
+
+ kmax += 1;
+ }
+
+ const int bs = 4;
+
+ int k;
+
+ k = 0;
+
+ if(kmax<kna)
+ goto cleanup_loop;
+
+ if(kna>0)
+ {
+ for( ; k<kna; k++)
+ {
+ C[0+bs*0] = alpha * A[0+bs*0];
+
+ C += 1;
+ A += bs;
+ }
+ C += bs*(sdc-1);
+ }
+
+ for( ; k<kmax-3; k+=4)
+ {
+ C[0+bs*0] = alpha * A[0+bs*0];
+
+ C[1+bs*0] = alpha * A[0+bs*1];
+
+ C[2+bs*0] = alpha * A[0+bs*2];
+
+ C[3+bs*0] = alpha * A[0+bs*3];
+
+ C += bs*sdc;
+ A += bs*bs;
+ }
+
+ cleanup_loop:
+
+ for( ; k<kmax; k++)
+ {
+ C[0+bs*0] = alpha * A[0+bs*0];
+
+ C += 1;
+ A += bs;
+ }
+
+ }
+
+
+
+// transpose of general matrices, read across panels, write along panels
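+// note: inverse access pattern of kernel_dgetr_4_lib4 above: a 4-column strip of A is read
+// downwards panel by panel (A += ps*sda) and each transposed 4x4 block is written
+// contiguously along a single panel of B; no alpha scaling is applied here.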
+void kernel_dgetr_4_0_lib4(int kmax, double *A, int sda, double *B)
+ {
+ const int ps = 4;
+ __m256d
+ v0, v1, v2, v3, v4, v5, v6, v7;
+ int k;
+ for(k=0; k<kmax-3; k+=4)
+ {
+
+ v0 = _mm256_load_pd( &A[0+ps*0] ); // 00 10 20 30
+ v1 = _mm256_load_pd( &A[0+ps*1] ); // 01 11 21 31
+ v4 = _mm256_unpacklo_pd( v0, v1 ); // 00 01 20 21
+ v5 = _mm256_unpackhi_pd( v0, v1 ); // 10 11 30 31
+ v2 = _mm256_load_pd( &A[0+ps*2] ); // 02 12 22 32
+ v3 = _mm256_load_pd( &A[0+ps*3] ); // 03 13 23 33
+ v6 = _mm256_unpacklo_pd( v2, v3 ); // 02 03 22 23
+ v7 = _mm256_unpackhi_pd( v2, v3 ); // 12 13 32 33
+
+ v0 = _mm256_permute2f128_pd( v4, v6, 0x20 ); // 00 01 02 03
+ _mm256_store_pd( &B[0+ps*0], v0 );
+ v2 = _mm256_permute2f128_pd( v4, v6, 0x31 ); // 20 21 22 23
+ _mm256_store_pd( &B[0+ps*2], v2 );
+ v1 = _mm256_permute2f128_pd( v5, v7, 0x20 ); // 10 11 12 13
+ _mm256_store_pd( &B[0+ps*1], v1 );
+ v3 = _mm256_permute2f128_pd( v5, v7, 0x31 ); // 30 31 32 33
+ _mm256_store_pd( &B[0+ps*3], v3 );
+
+ A += ps*sda;
+ B += ps*ps;
+ }
+ for( ; k<kmax; k++)
+ {
+ //
+ B[0+ps*0] = A[0+ps*0];
+ B[1+ps*0] = A[0+ps*1];
+ B[2+ps*0] = A[0+ps*2];
+ B[3+ps*0] = A[0+ps*3];
+
+ A += 1;
+ B += ps;
+ }
+ return;
+ }
+
diff --git a/auxiliary/avx2/Makefile b/auxiliary/avx2/Makefile
new file mode 100644
index 0000000..463ebf5
--- /dev/null
+++ b/auxiliary/avx2/Makefile
@@ -0,0 +1,46 @@
+###################################################################################################
+# #
+# This file is part of BLASFEO. #
+# #
+# BLASFEO -- BLAS For Embedded Optimization. #
+# Copyright (C) 2016-2017 by Gianluca Frison. #
+# Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl. #
+# All rights reserved. #
+# #
+# HPMPC is free software; you can redistribute it and/or #
+# modify it under the terms of the GNU Lesser General Public #
+# License as published by the Free Software Foundation; either #
+# version 2.1 of the License, or (at your option) any later version. #
+# #
+# HPMPC is distributed in the hope that it will be useful, #
+# but WITHOUT ANY WARRANTY; without even the implied warranty of #
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. #
+# See the GNU Lesser General Public License for more details. #
+# #
+# You should have received a copy of the GNU Lesser General Public #
+# License along with HPMPC; if not, write to the Free Software #
+# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA #
+# #
+# Author: Gianluca Frison, giaf (at) dtu.dk #
+# gianluca.frison (at) imtek.uni-freiburg.de #
+# #
+###################################################################################################
+
+include ../../Makefile.rule
+
+OBJS =
+
+ifeq ($(LA), HIGH_PERFORMANCE)
+
+ifeq ($(TARGET), X64_INTEL_HASWELL)
+OBJS += kernel_dgetr_lib4.o
+endif
+
+else # LA_REFERENCE | LA_BLAS
+
+endif # LA choice
+
+obj: $(OBJS)
+
+clean:
+ rm -f *.o
diff --git a/auxiliary/avx2/kernel_dgetr_lib4.c b/auxiliary/avx2/kernel_dgetr_lib4.c
new file mode 100644
index 0000000..14d00ef
--- /dev/null
+++ b/auxiliary/avx2/kernel_dgetr_lib4.c
@@ -0,0 +1,756 @@
+/**************************************************************************************************
+* *
+* This file is part of BLASFEO. *
+* *
+* BLASFEO -- BLAS For Embedded Optimization. *
+* Copyright (C) 2016-2017 by Gianluca Frison. *
+* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl. *
+* All rights reserved. *
+* *
+* HPMPC is free software; you can redistribute it and/or *
+* modify it under the terms of the GNU Lesser General Public *
+* License as published by the Free Software Foundation; either *
+* version 2.1 of the License, or (at your option) any later version. *
+* *
+* HPMPC is distributed in the hope that it will be useful, *
+* but WITHOUT ANY WARRANTY; without even the implied warranty of *
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. *
+* See the GNU Lesser General Public License for more details. *
+* *
+* You should have received a copy of the GNU Lesser General Public *
+* License along with HPMPC; if not, write to the Free Software *
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA *
+* *
+* Author: Gianluca Frison, giaf (at) dtu.dk *
+* gianluca.frison (at) imtek.uni-freiburg.de *
+* *
+**************************************************************************************************/
+
+#include <mmintrin.h>
+#include <xmmintrin.h> // SSE
+#include <emmintrin.h> // SSE2
+#include <pmmintrin.h> // SSE3
+#include <smmintrin.h> // SSE4
+#include <immintrin.h> // AVX
+
+
+
+
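+// transpose of general matrices, read along panels, write across panels (8-wide variant)
+// note: two stacked 4-row panels of A (A0 and A1) are transposed into 8 columns of C per
+// step; each 4x4 block is transposed by gathering 128-bit column halves with _mm_load_pd +
+// _mm256_insertf128_pd and finishing with unpacklo/unpackhi, so no trailing permute2f128
+// pass is needed.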
+// TODO: handle the tri (triangular) argument, which this 8-wide kernel currently ignores
+void kernel_dgetr_8_lib4(int tri, int kmax, int kna, double alpha, double *A0, int sda, double *C, int sdc)
+ {
+
+ const int bs = 4;
+
+ double *A1 = A0 + bs*sda;
+
+ int k;
+
+ __m256d
+ alph,
+ v0, v1, v2, v3, v4, v5, v6, v7,
+ v8, v9, va, vb, vc, vd, ve, vf;
+
+ alph = _mm256_broadcast_sd( &alpha );
+
+ k = 0;
+
+ if(kmax<kna)
+ goto cleanup_loop;
+
+ if(kna>0)
+ {
+ for( ; k<kna; k++)
+ {
+ C[0+bs*0] = alpha * A0[0+bs*0];
+ C[0+bs*1] = alpha * A0[1+bs*0];
+ C[0+bs*2] = alpha * A0[2+bs*0];
+ C[0+bs*3] = alpha * A0[3+bs*0];
+
+ C[0+bs*4] = alpha * A1[0+bs*0];
+ C[0+bs*5] = alpha * A1[1+bs*0];
+ C[0+bs*6] = alpha * A1[2+bs*0];
+ C[0+bs*7] = alpha * A1[3+bs*0];
+
+ C += 1;
+ A0 += bs;
+ A1 += bs;
+ }
+ C += bs*(sdc-1);
+ }
+
+ for(; k<kmax-7; k+=8)
+ {
+
+ v0 = _mm256_insertf128_pd( _mm256_castpd128_pd256( _mm_load_pd( &A0[0+bs*0] ) ), _mm_load_pd( &A0[0+bs*2]) , 0x1 ); // 00 10 02 12
+ v1 = _mm256_insertf128_pd( _mm256_castpd128_pd256( _mm_load_pd( &A0[0+bs*1] ) ), _mm_load_pd( &A0[0+bs*3]) , 0x1 ); // 01 11 03 13
+ v2 = _mm256_insertf128_pd( _mm256_castpd128_pd256( _mm_load_pd( &A0[2+bs*0] ) ), _mm_load_pd( &A0[2+bs*2]) , 0x1 ); // 20 30 22 32
+ v3 = _mm256_insertf128_pd( _mm256_castpd128_pd256( _mm_load_pd( &A0[2+bs*1] ) ), _mm_load_pd( &A0[2+bs*3]) , 0x1 ); // 21 31 23 33
+
+ A0 += 4*bs;
+
+ v4 = _mm256_unpacklo_pd( v0, v1 ); // 00 01 02 03
+ v4 = _mm256_mul_pd( v4, alph );
+ _mm256_store_pd( &C[0+bs*0], v4 );
+ v5 = _mm256_unpackhi_pd( v0, v1 ); // 10 11 12 13
+ v5 = _mm256_mul_pd( v5, alph );
+ _mm256_store_pd( &C[0+bs*1], v5 );
+ v6 = _mm256_unpacklo_pd( v2, v3 ); // 20 21 22 23
+ v6 = _mm256_mul_pd( v6, alph );
+ _mm256_store_pd( &C[0+bs*2], v6 );
+ v7 = _mm256_unpackhi_pd( v2, v3 ); // 30 31 32 33
+ v7 = _mm256_mul_pd( v7, alph );
+ _mm256_store_pd( &C[0+bs*3], v7 );
+
+ v0 = _mm256_insertf128_pd( _mm256_castpd128_pd256( _mm_load_pd( &A1[0+bs*0] ) ), _mm_load_pd( &A1[0+bs*2]) , 0x1 ); // 00 10 02 12
+ v1 = _mm256_insertf128_pd( _mm256_castpd128_pd256( _mm_load_pd( &A1[0+bs*1] ) ), _mm_load_pd( &A1[0+bs*3]) , 0x1 ); // 01 11 03 13
+ v2 = _mm256_insertf128_pd( _mm256_castpd128_pd256( _mm_load_pd( &A1[2+bs*0] ) ), _mm_load_pd( &A1[2+bs*2]) , 0x1 ); // 20 30 22 32
+ v3 = _mm256_insertf128_pd( _mm256_castpd128_pd256( _mm_load_pd( &A1[2+bs*1] ) ), _mm_load_pd( &A1[2+bs*3]) , 0x1 ); // 21 31 23 33
+
+ A1 += 4*bs;
+
+ v4 = _mm256_unpacklo_pd( v0, v1 ); // 00 01 02 03
+ v4 = _mm256_mul_pd( v4, alph );
+ _mm256_store_pd( &C[0+bs*4], v4 );
+ v5 = _mm256_unpackhi_pd( v0, v1 ); // 10 11 12 13
+ v5 = _mm256_mul_pd( v5, alph );
+ _mm256_store_pd( &C[0+bs*5], v5 );
+ v6 = _mm256_unpacklo_pd( v2, v3 ); // 20 21 22 23
+ v6 = _mm256_mul_pd( v6, alph );
+ _mm256_store_pd( &C[0+bs*6], v6 );
+ v7 = _mm256_unpackhi_pd( v2, v3 ); // 30 31 32 33
+ v7 = _mm256_mul_pd( v7, alph );
+ _mm256_store_pd( &C[0+bs*7], v7 );
+
+ C += sdc*bs;
+
+ v0 = _mm256_insertf128_pd( _mm256_castpd128_pd256( _mm_load_pd( &A0[0+bs*0] ) ), _mm_load_pd( &A0[0+bs*2]) , 0x1 ); // 00 10 02 12
+ v1 = _mm256_insertf128_pd( _mm256_castpd128_pd256( _mm_load_pd( &A0[0+bs*1] ) ), _mm_load_pd( &A0[0+bs*3]) , 0x1 ); // 01 11 03 13
+ v2 = _mm256_insertf128_pd( _mm256_castpd128_pd256( _mm_load_pd( &A0[2+bs*0] ) ), _mm_load_pd( &A0[2+bs*2]) , 0x1 ); // 20 30 22 32
+ v3 = _mm256_insertf128_pd( _mm256_castpd128_pd256( _mm_load_pd( &A0[2+bs*1] ) ), _mm_load_pd( &A0[2+bs*3]) , 0x1 ); // 21 31 23 33
+
+ A0 += 4*bs;
+
+ v4 = _mm256_unpacklo_pd( v0, v1 ); // 00 01 02 03
+ v4 = _mm256_mul_pd( v4, alph );
+ _mm256_store_pd( &C[0+bs*0], v4 );
+ v5 = _mm256_unpackhi_pd( v0, v1 ); // 10 11 12 13
+ v5 = _mm256_mul_pd( v5, alph );
+ _mm256_store_pd( &C[0+bs*1], v5 );
+ v6 = _mm256_unpacklo_pd( v2, v3 ); // 20 21 22 23
+ v6 = _mm256_mul_pd( v6, alph );
+ _mm256_store_pd( &C[0+bs*2], v6 );
+ v7 = _mm256_unpackhi_pd( v2, v3 ); // 30 31 32 33
+ v7 = _mm256_mul_pd( v7, alph );
+ _mm256_store_pd( &C[0+bs*3], v7 );
+
+ v0 = _mm256_insertf128_pd( _mm256_castpd128_pd256( _mm_load_pd( &A1[0+bs*0] ) ), _mm_load_pd( &A1[0+bs*2]) , 0x1 ); // 00 10 02 12
+ v1 = _mm256_insertf128_pd( _mm256_castpd128_pd256( _mm_load_pd( &A1[0+bs*1] ) ), _mm_load_pd( &A1[0+bs*3]) , 0x1 ); // 01 11 03 13
+ v2 = _mm256_insertf128_pd( _mm256_castpd128_pd256( _mm_load_pd( &A1[2+bs*0] ) ), _mm_load_pd( &A1[2+bs*2]) , 0x1 ); // 20 30 22 32
+ v3 = _mm256_insertf128_pd( _mm256_castpd128_pd256( _mm_load_pd( &A1[2+bs*1] ) ), _mm_load_pd( &A1[2+bs*3]) , 0x1 ); // 21 31 23 33
+
+ A1 += 4*bs;
+
+ v4 = _mm256_unpacklo_pd( v0, v1 ); // 00 01 02 03
+ v4 = _mm256_mul_pd( v4, alph );
+ _mm256_store_pd( &C[0+bs*4], v4 );
+ v5 = _mm256_unpackhi_pd( v0, v1 ); // 10 11 12 13
+ v5 = _mm256_mul_pd( v5, alph );
+ _mm256_store_pd( &C[0+bs*5], v5 );
+ v6 = _mm256_unpacklo_pd( v2, v3 ); // 20 21 22 23
+ v6 = _mm256_mul_pd( v6, alph );
+ _mm256_store_pd( &C[0+bs*6], v6 );
+ v7 = _mm256_unpackhi_pd( v2, v3 ); // 30 31 32 33
+ v7 = _mm256_mul_pd( v7, alph );
+ _mm256_store_pd( &C[0+bs*7], v7 );
+
+ C += sdc*bs;
+
+ }
+
+ for(; k<kmax-3; k+=4)
+ {
+
+ v0 = _mm256_insertf128_pd( _mm256_castpd128_pd256( _mm_load_pd( &A0[0+bs*0] ) ), _mm_load_pd( &A0[0+bs*2]) , 0x1 ); // 00 10 02 12
+ v1 = _mm256_insertf128_pd( _mm256_castpd128_pd256( _mm_load_pd( &A0[0+bs*1] ) ), _mm_load_pd( &A0[0+bs*3]) , 0x1 ); // 01 11 03 13
+ v2 = _mm256_insertf128_pd( _mm256_castpd128_pd256( _mm_load_pd( &A0[2+bs*0] ) ), _mm_load_pd( &A0[2+bs*2]) , 0x1 ); // 20 30 22 32
+ v3 = _mm256_insertf128_pd( _mm256_castpd128_pd256( _mm_load_pd( &A0[2+bs*1] ) ), _mm_load_pd( &A0[2+bs*3]) , 0x1 ); // 21 31 23 33
+
+ A0 += 4*bs;
+
+ v4 = _mm256_unpacklo_pd( v0, v1 ); // 00 01 02 03
+ v4 = _mm256_mul_pd( v4, alph );
+ _mm256_store_pd( &C[0+bs*0], v4 );
+ v5 = _mm256_unpackhi_pd( v0, v1 ); // 10 11 12 13
+ v5 = _mm256_mul_pd( v5, alph );
+ _mm256_store_pd( &C[0+bs*1], v5 );
+ v6 = _mm256_unpacklo_pd( v2, v3 ); // 20 21 22 23
+ v6 = _mm256_mul_pd( v6, alph );
+ _mm256_store_pd( &C[0+bs*2], v6 );
+ v7 = _mm256_unpackhi_pd( v2, v3 ); // 30 31 32 33
+ v7 = _mm256_mul_pd( v7, alph );
+ _mm256_store_pd( &C[0+bs*3], v7 );
+
+ v0 = _mm256_insertf128_pd( _mm256_castpd128_pd256( _mm_load_pd( &A1[0+bs*0] ) ), _mm_load_pd( &A1[0+bs*2]) , 0x1 ); // 00 10 02 12
+ v1 = _mm256_insertf128_pd( _mm256_castpd128_pd256( _mm_load_pd( &A1[0+bs*1] ) ), _mm_load_pd( &A1[0+bs*3]) , 0x1 ); // 01 11 03 13
+ v2 = _mm256_insertf128_pd( _mm256_castpd128_pd256( _mm_load_pd( &A1[2+bs*0] ) ), _mm_load_pd( &A1[2+bs*2]) , 0x1 ); // 20 30 22 32
+ v3 = _mm256_insertf128_pd( _mm256_castpd128_pd256( _mm_load_pd( &A1[2+bs*1] ) ), _mm_load_pd( &A1[2+bs*3]) , 0x1 ); // 21 31 23 33
+
+ A1 += 4*bs;
+
+ v4 = _mm256_unpacklo_pd( v0, v1 ); // 00 01 02 03
+ v4 = _mm256_mul_pd( v4, alph );
+ _mm256_store_pd( &C[0+bs*4], v4 );
+ v5 = _mm256_unpackhi_pd( v0, v1 ); // 10 11 12 13
+ v5 = _mm256_mul_pd( v5, alph );
+ _mm256_store_pd( &C[0+bs*5], v5 );
+ v6 = _mm256_unpacklo_pd( v2, v3 ); // 20 21 22 23
+ v6 = _mm256_mul_pd( v6, alph );
+ _mm256_store_pd( &C[0+bs*6], v6 );
+ v7 = _mm256_unpackhi_pd( v2, v3 ); // 30 31 32 33
+ v7 = _mm256_mul_pd( v7, alph );
+ _mm256_store_pd( &C[0+bs*7], v7 );
+
+ C += sdc*bs;
+
+ }
+
+
+ cleanup_loop:
+
+ for( ; k<kmax; k++)
+ {
+ C[0+bs*0] = alpha * A0[0+bs*0];
+ C[0+bs*1] = alpha * A0[1+bs*0];
+ C[0+bs*2] = alpha * A0[2+bs*0];
+ C[0+bs*3] = alpha * A0[3+bs*0];
+
+ C[0+bs*4] = alpha * A1[0+bs*0];
+ C[0+bs*5] = alpha * A1[1+bs*0];
+ C[0+bs*6] = alpha * A1[2+bs*0];
+ C[0+bs*7] = alpha * A1[3+bs*0];
+
+ C += 1;
+ A0 += bs;
+ A1 += bs;
+ }
+
+ }
+
+
+
+// transpose of general matrices, read along panels, write across panels
+void kernel_dgetr_4_lib4(int tri, int kmax, int kna, double alpha, double *A, double *C, int sdc)
+ {
+
+ if(tri==1)
+ {
+ // A is lower triangular, C is upper triangular
+ // kmax+1 4-wide + end 3x3 triangle
+
+ kmax += 1;
+ }
+
+ const int bs = 4;
+
+ __m256d
+ alph,
+ v0, v1, v2, v3,
+ v4, v5, v6, v7;
+
+ alph = _mm256_broadcast_sd( &alpha );
+
+ int k;
+
+ k = 0;
+
+ if(kmax<kna)
+ goto cleanup_loop;
+
+ if(kna>0)
+ {
+ for( ; k<kna; k++)
+ {
+ C[0+bs*0] = alpha * A[0+bs*0];
+ C[0+bs*1] = alpha * A[1+bs*0];
+ C[0+bs*2] = alpha * A[2+bs*0];
+ C[0+bs*3] = alpha * A[3+bs*0];
+
+ C += 1;
+ A += bs;
+ }
+ C += bs*(sdc-1);
+ }
+
+ for( ; k<kmax-7; k+=8)
+ {
+
+#if 1
+
+ v0 = _mm256_insertf128_pd( _mm256_castpd128_pd256( _mm_load_pd( &A[0+bs*0] ) ), _mm_load_pd( &A[0+bs*2]) , 0x1 ); // 00 10 02 12
+ v1 = _mm256_insertf128_pd( _mm256_castpd128_pd256( _mm_load_pd( &A[0+bs*1] ) ), _mm_load_pd( &A[0+bs*3]) , 0x1 ); // 01 11 03 13
+ v2 = _mm256_insertf128_pd( _mm256_castpd128_pd256( _mm_load_pd( &A[2+bs*0] ) ), _mm_load_pd( &A[2+bs*2]) , 0x1 ); // 20 30 22 32
+ v3 = _mm256_insertf128_pd( _mm256_castpd128_pd256( _mm_load_pd( &A[2+bs*1] ) ), _mm_load_pd( &A[2+bs*3]) , 0x1 ); // 21 31 23 33
+
+ A += 4*bs;
+
+ v4 = _mm256_unpacklo_pd( v0, v1 ); // 00 01 02 03
+ v4 = _mm256_mul_pd( v4, alph );
+ _mm256_store_pd( &C[0+bs*0], v4 );
+ v5 = _mm256_unpackhi_pd( v0, v1 ); // 10 11 12 13
+ v5 = _mm256_mul_pd( v5, alph );
+ _mm256_store_pd( &C[0+bs*1], v5 );
+ v6 = _mm256_unpacklo_pd( v2, v3 ); // 20 21 22 23
+ v6 = _mm256_mul_pd( v6, alph );
+ _mm256_store_pd( &C[0+bs*2], v6 );
+ v7 = _mm256_unpackhi_pd( v2, v3 ); // 30 31 32 33
+ v7 = _mm256_mul_pd( v7, alph );
+ _mm256_store_pd( &C[0+bs*3], v7 );
+
+ C += sdc*bs;
+
+ v0 = _mm256_insertf128_pd( _mm256_castpd128_pd256( _mm_load_pd( &A[0+bs*0] ) ), _mm_load_pd( &A[0+bs*2]) , 0x1 );
+ v1 = _mm256_insertf128_pd( _mm256_castpd128_pd256( _mm_load_pd( &A[0+bs*1] ) ), _mm_load_pd( &A[0+bs*3]) , 0x1 );
+ v2 = _mm256_insertf128_pd( _mm256_castpd128_pd256( _mm_load_pd( &A[2+bs*0] ) ), _mm_load_pd( &A[2+bs*2]) , 0x1 );
+ v3 = _mm256_insertf128_pd( _mm256_castpd128_pd256( _mm_load_pd( &A[2+bs*1] ) ), _mm_load_pd( &A[2+bs*3]) , 0x1 );
+
+ A += 4*bs;
+
+ v4 = _mm256_unpacklo_pd( v0, v1 ); // 00 01 02 03
+ v4 = _mm256_mul_pd( v4, alph );
+ _mm256_store_pd( &C[0+bs*0], v4 );
+ v5 = _mm256_unpackhi_pd( v0, v1 ); // 10 11 12 13
+ v5 = _mm256_mul_pd( v5, alph );
+ _mm256_store_pd( &C[0+bs*1], v5 );
+ v6 = _mm256_unpacklo_pd( v2, v3 ); // 20 21 22 23
+ v6 = _mm256_mul_pd( v6, alph );
+ _mm256_store_pd( &C[0+bs*2], v6 );
+ v7 = _mm256_unpackhi_pd( v2, v3 ); // 30 31 32 33
+ v7 = _mm256_mul_pd( v7, alph );
+ _mm256_store_pd( &C[0+bs*3], v7 );
+
+ C += sdc*bs;
+
+#else // TODO alpha: disabled variant, does not apply the alpha scaling yet
+
+ v0 = _mm256_load_pd( &A[0+bs*0] ); // 00 10 20 30
+ v1 = _mm256_load_pd( &A[0+bs*1] ); // 01 11 21 31
+ v4 = _mm256_unpacklo_pd( v0, v1 ); // 00 01 20 21
+ v5 = _mm256_unpackhi_pd( v0, v1 ); // 10 11 30 31
+ v2 = _mm256_load_pd( &A[0+bs*2] ); // 02 12 22 32
+ v3 = _mm256_load_pd( &A[0+bs*3] ); // 03 13 23 33
+ v6 = _mm256_unpacklo_pd( v2, v3 ); // 02 03 22 23
+ v7 = _mm256_unpackhi_pd( v2, v3 ); // 12 13 32 33
+
+ A += bs*bs;
+
+ v0 = _mm256_permute2f128_pd( v4, v6, 0x20 ); // 00 01 02 03
+ _mm256_store_pd( &C[0+bs*0], v0 );
+ v2 = _mm256_permute2f128_pd( v4, v6, 0x31 ); // 20 21 22 23
+ _mm256_store_pd( &C[0+bs*2], v2 );
+ v1 = _mm256_permute2f128_pd( v5, v7, 0x20 ); // 10 11 12 13
+ _mm256_store_pd( &C[0+bs*1], v1 );
+ v3 = _mm256_permute2f128_pd( v5, v7, 0x31 ); // 30 31 32 33
+ _mm256_store_pd( &C[0+bs*3], v3 );
+
+ C += bs*sdc;
+
+ v0 = _mm256_load_pd( &A[0+bs*0] ); // 00 10 20 30
+ v1 = _mm256_load_pd( &A[0+bs*1] ); // 01 11 21 31
+ v4 = _mm256_unpacklo_pd( v0, v1 ); // 00 01 20 21
+ v5 = _mm256_unpackhi_pd( v0, v1 ); // 10 11 30 31
+ v2 = _mm256_load_pd( &A[0+bs*2] ); // 02 12 22 32
+ v3 = _mm256_load_pd( &A[0+bs*3] ); // 03 13 23 33
+ v6 = _mm256_unpacklo_pd( v2, v3 ); // 02 03 22 23
+ v7 = _mm256_unpackhi_pd( v2, v3 ); // 12 13 32 33
+
+ A += bs*bs;
+
+ v0 = _mm256_permute2f128_pd( v4, v6, 0x20 ); // 00 01 02 03
+ _mm256_store_pd( &C[0+bs*0], v0 );
+ v2 = _mm256_permute2f128_pd( v4, v6, 0x31 ); // 20 21 22 23
+ _mm256_store_pd( &C[0+bs*2], v2 );
+ v1 = _mm256_permute2f128_pd( v5, v7, 0x20 ); // 10 11 12 13
+ _mm256_store_pd( &C[0+bs*1], v1 );
+ v3 = _mm256_permute2f128_pd( v5, v7, 0x31 ); // 30 31 32 33
+ _mm256_store_pd( &C[0+bs*3], v3 );
+
+ C += bs*sdc;
+
+#endif
+
+ }
+
+ for( ; k<kmax-3; k+=4)
+ {
+
+#if 1
+
+ v0 = _mm256_insertf128_pd( _mm256_castpd128_pd256( _mm_load_pd( &A[0+bs*0] ) ), _mm_load_pd( &A[0+bs*2]) , 0x1 ); // 00 10 02 12
+ v1 = _mm256_insertf128_pd( _mm256_castpd128_pd256( _mm_load_pd( &A[0+bs*1] ) ), _mm_load_pd( &A[0+bs*3]) , 0x1 ); // 01 11 03 13
+ v2 = _mm256_insertf128_pd( _mm256_castpd128_pd256( _mm_load_pd( &A[2+bs*0] ) ), _mm_load_pd( &A[2+bs*2]) , 0x1 ); // 20 30 22 32
+ v3 = _mm256_insertf128_pd( _mm256_castpd128_pd256( _mm_load_pd( &A[2+bs*1] ) ), _mm_load_pd( &A[2+bs*3]) , 0x1 ); // 21 31 23 33
+
+ A += 4*bs;
+
+ v4 = _mm256_unpacklo_pd( v0, v1 ); // 00 01 02 03
+ v4 = _mm256_mul_pd( v4, alph );
+ _mm256_store_pd( &C[0+bs*0], v4 );
+ v5 = _mm256_unpackhi_pd( v0, v1 ); // 10 11 12 13
+ v5 = _mm256_mul_pd( v5, alph );
+ _mm256_store_pd( &C[0+bs*1], v5 );
+ v6 = _mm256_unpacklo_pd( v2, v3 ); // 20 21 22 23
+ v6 = _mm256_mul_pd( v6, alph );
+ _mm256_store_pd( &C[0+bs*2], v6 );
+ v7 = _mm256_unpackhi_pd( v2, v3 ); // 30 31 32 33
+ v7 = _mm256_mul_pd( v7, alph );
+ _mm256_store_pd( &C[0+bs*3], v7 );
+
+ C += sdc*bs;
+
+#else
+
+ v0 = _mm256_load_pd( &A[0+bs*0] ); // 00 10 20 30
+ v1 = _mm256_load_pd( &A[0+bs*1] ); // 01 11 21 31
+ v4 = _mm256_unpacklo_pd( v0, v1 ); // 00 01 20 21
+ v5 = _mm256_unpackhi_pd( v0, v1 ); // 10 11 30 31
+ v2 = _mm256_load_pd( &A[0+bs*2] ); // 02 12 22 32
+ v3 = _mm256_load_pd( &A[0+bs*3] ); // 03 13 23 33
+ v6 = _mm256_unpacklo_pd( v2, v3 ); // 02 03 22 23
+ v7 = _mm256_unpackhi_pd( v2, v3 ); // 12 13 32 33
+
+ A += bs*bs;
+
+ v0 = _mm256_permute2f128_pd( v4, v6, 0x20 ); // 00 01 02 03
+ _mm256_store_pd( &C[0+bs*0], v0 );
+ v2 = _mm256_permute2f128_pd( v4, v6, 0x31 ); // 20 21 22 23
+ _mm256_store_pd( &C[0+bs*2], v2 );
+ v1 = _mm256_permute2f128_pd( v5, v7, 0x20 ); // 10 11 12 13
+ _mm256_store_pd( &C[0+bs*1], v1 );
+ v3 = _mm256_permute2f128_pd( v5, v7, 0x31 ); // 30 31 32 33
+ _mm256_store_pd( &C[0+bs*3], v3 );
+
+ C += bs*sdc;
+
+#endif
+
+ }
+
+ cleanup_loop:
+
+ for( ; k<kmax; k++)
+ {
+ C[0+bs*0] = alpha * A[0+bs*0];
+ C[0+bs*1] = alpha * A[1+bs*0];
+ C[0+bs*2] = alpha * A[2+bs*0];
+ C[0+bs*3] = alpha * A[3+bs*0];
+
+ C += 1;
+ A += bs;
+ }
+
+ if(tri==1)
+ {
+ // end 3x3 triangle
+ kna = (bs-(bs-kna+kmax)%bs)%bs;
+
+ if(kna==1)
+ {
+ C[0+bs*1] = alpha * A[1+bs*0];
+ C[0+bs*2] = alpha * A[2+bs*0];
+ C[0+bs*3] = alpha * A[3+bs*0];
+ C[1+bs*(sdc+1)] = alpha * A[2+bs*1];
+ C[1+bs*(sdc+2)] = alpha * A[3+bs*1];
+ C[2+bs*(sdc+2)] = alpha * A[3+bs*2];
+ }
+ else if(kna==2)
+ {
+ C[0+bs*1] = alpha * A[1+bs*0];
+ C[0+bs*2] = alpha * A[2+bs*0];
+ C[0+bs*3] = alpha * A[3+bs*0];
+ C[1+bs*2] = alpha * A[2+bs*1];
+ C[1+bs*3] = alpha * A[3+bs*1];
+ C[2+bs*(sdc+2)] = alpha * A[3+bs*2];
+ }
+ else
+ {
+ C[0+bs*1] = alpha * A[1+bs*0];
+ C[0+bs*2] = alpha * A[2+bs*0];
+ C[0+bs*3] = alpha * A[3+bs*0];
+ C[1+bs*2] = alpha * A[2+bs*1];
+ C[1+bs*3] = alpha * A[3+bs*1];
+ C[2+bs*3] = alpha * A[3+bs*2];
+ }
+ }
+
+ }
+
+
+
+// transpose of general matrices, read along panels, write across panels
+void kernel_dgetr_3_lib4(int tri, int kmax, int kna, double alpha, double *A, double *C, int sdc)
+ {
+
+ if(tri==1)
+ {
+ // A is lower triangular, C is upper triangular
+ // kmax+1 3-wide + end 2x2 triangle
+
+ kmax += 1;
+ }
+
+ const int bs = 4;
+
+ int k;
+
+ k = 0;
+
+ if(kmax<kna)
+ goto cleanup_loop;
+
+ if(kna>0)
+ {
+ for( ; k<kna; k++)
+ {
+ C[0+bs*0] = alpha * A[0+bs*0];
+ C[0+bs*1] = alpha * A[1+bs*0];
+ C[0+bs*2] = alpha * A[2+bs*0];
+
+ C += 1;
+ A += bs;
+ }
+ C += bs*(sdc-1);
+ }
+
+ for( ; k<kmax-3; k+=4)
+ {
+ C[0+bs*0] = alpha * A[0+bs*0];
+ C[0+bs*1] = alpha * A[1+bs*0];
+ C[0+bs*2] = alpha * A[2+bs*0];
+
+ C[1+bs*0] = alpha * A[0+bs*1];
+ C[1+bs*1] = alpha * A[1+bs*1];
+ C[1+bs*2] = alpha * A[2+bs*1];
+
+ C[2+bs*0] = alpha * A[0+bs*2];
+ C[2+bs*1] = alpha * A[1+bs*2];
+ C[2+bs*2] = alpha * A[2+bs*2];
+
+ C[3+bs*0] = alpha * A[0+bs*3];
+ C[3+bs*1] = alpha * A[1+bs*3];
+ C[3+bs*2] = alpha * A[2+bs*3];
+
+ C += bs*sdc;
+ A += bs*bs;
+ }
+
+ cleanup_loop:
+
+ for( ; k<kmax; k++)
+ {
+ C[0+bs*0] = alpha * A[0+bs*0];
+ C[0+bs*1] = alpha * A[1+bs*0];
+ C[0+bs*2] = alpha * A[2+bs*0];
+
+ C += 1;
+ A += bs;
+ }
+
+ if(tri==1)
+ {
+ // end 2x2 triangle
+ kna = (bs-(bs-kna+kmax)%bs)%bs;
+
+ if(kna==1)
+ {
+ C[0+bs*1] = alpha * A[1+bs*0];
+ C[0+bs*2] = alpha * A[2+bs*0];
+ C[1+bs*(sdc+1)] = alpha * A[2+bs*1];
+ }
+ else
+ {
+ C[0+bs*1] = alpha * A[1+bs*0];
+ C[0+bs*2] = alpha * A[2+bs*0];
+ C[1+bs*2] = alpha * A[2+bs*1];
+ }
+ }
+
+ }
+
+
+
+// transpose of general matrices, read along panels, write across panels
+void kernel_dgetr_2_lib4(int tri, int kmax, int kna, double alpha, double *A, double *C, int sdc)
+ {
+
+ if(tri==1)
+ {
+ // A is lower triangular, C is upper triangular
+ // kmax+1 2-wide + end 1x1 triangle
+
+ kmax += 1;
+ }
+
+ const int bs = 4;
+
+ int k;
+
+ k = 0;
+
+ if(kmax<kna)
+ goto cleanup_loop;
+
+ if(kna>0)
+ {
+ for( ; k<kna; k++)
+ {
+ C[0+bs*0] = alpha * A[0+bs*0];
+ C[0+bs*1] = alpha * A[1+bs*0];
+
+ C += 1;
+ A += bs;
+ }
+ C += bs*(sdc-1);
+ }
+
+ for( ; k<kmax-3; k+=4)
+ {
+ C[0+bs*0] = alpha * A[0+bs*0];
+ C[0+bs*1] = alpha * A[1+bs*0];
+
+ C[1+bs*0] = alpha * A[0+bs*1];
+ C[1+bs*1] = alpha * A[1+bs*1];
+
+ C[2+bs*0] = alpha * A[0+bs*2];
+ C[2+bs*1] = alpha * A[1+bs*2];
+
+ C[3+bs*0] = alpha * A[0+bs*3];
+ C[3+bs*1] = alpha * A[1+bs*3];
+
+ C += bs*sdc;
+ A += bs*bs;
+ }
+
+ cleanup_loop:
+
+ for( ; k<kmax; k++)
+ {
+ C[0+bs*0] = alpha * A[0+bs*0];
+ C[0+bs*1] = alpha * A[1+bs*0];
+
+ C += 1;
+ A += bs;
+ }
+
+ if(tri==1)
+ {
+ // end 1x1 triangle
+ C[0+bs*1] = alpha * A[1+bs*0];
+ }
+
+ }
+
+
+
+// transpose of general matrices, read along panels, write across panels
+void kernel_dgetr_1_lib4(int tri, int kmax, int kna, double alpha, double *A, double *C, int sdc)
+ {
+
+ if(tri==1)
+ {
+ // A is lower triangular, C is upper triangular
+ // kmax+1 1-wide
+
+ kmax += 1;
+ }
+
+ const int bs = 4;
+
+ int k;
+
+ k = 0;
+
+ if(kmax<kna)
+ goto cleanup_loop;
+
+ if(kna>0)
+ {
+ for( ; k<kna; k++)
+ {
+ C[0+bs*0] = alpha * A[0+bs*0];
+
+ C += 1;
+ A += bs;
+ }
+ C += bs*(sdc-1);
+ }
+
+ for( ; k<kmax-3; k+=4)
+ {
+ C[0+bs*0] = alpha * A[0+bs*0];
+
+ C[1+bs*0] = alpha * A[0+bs*1];
+
+ C[2+bs*0] = alpha * A[0+bs*2];
+
+ C[3+bs*0] = alpha * A[0+bs*3];
+
+ C += bs*sdc;
+ A += bs*bs;
+ }
+
+ cleanup_loop:
+
+ for( ; k<kmax; k++)
+ {
+ C[0+bs*0] = alpha * A[0+bs*0];
+
+ C += 1;
+ A += bs;
+ }
+
+ }
+
+
+
+// transpose of general matrices, read across panels, write along panels
+void kernel_dgetr_4_0_lib4(int kmax, double *A, int sda, double *B)
+ {
+ const int ps = 4;
+ __m256d
+ v0, v1, v2, v3, v4, v5, v6, v7;
+ int k;
+ for(k=0; k<kmax-3; k+=4)
+ {
+
+ v0 = _mm256_insertf128_pd( _mm256_castpd128_pd256( _mm_load_pd( &A[0+ps*0] ) ), _mm_load_pd( &A[0+ps*2]) , 0x1 ); // 00 10 02 12
+ v1 = _mm256_insertf128_pd( _mm256_castpd128_pd256( _mm_load_pd( &A[0+ps*1] ) ), _mm_load_pd( &A[0+ps*3]) , 0x1 ); // 01 11 03 13
+ v2 = _mm256_insertf128_pd( _mm256_castpd128_pd256( _mm_load_pd( &A[2+ps*0] ) ), _mm_load_pd( &A[2+ps*2]) , 0x1 ); // 20 30 22 32
+ v3 = _mm256_insertf128_pd( _mm256_castpd128_pd256( _mm_load_pd( &A[2+ps*1] ) ), _mm_load_pd( &A[2+ps*3]) , 0x1 ); // 21 31 23 33
+
+ v4 = _mm256_unpacklo_pd( v0, v1 ); // 00 01 02 03
+ _mm256_store_pd( &B[0+ps*0], v4 );
+ v5 = _mm256_unpackhi_pd( v0, v1 ); // 10 11 12 13
+ _mm256_store_pd( &B[0+ps*1], v5 );
+ v6 = _mm256_unpacklo_pd( v2, v3 ); // 20 21 22 23
+ _mm256_store_pd( &B[0+ps*2], v6 );
+ v7 = _mm256_unpackhi_pd( v2, v3 ); // 30 31 32 33
+ _mm256_store_pd( &B[0+ps*3], v7 );
+
+ A += ps*sda;
+ B += ps*ps;
+ }
+ for( ; k<kmax; k++)
+ {
+ //
+ B[0+ps*0] = A[0+ps*0];
+ B[1+ps*0] = A[0+ps*1];
+ B[2+ps*0] = A[0+ps*2];
+ B[3+ps*0] = A[0+ps*3];
+
+ A += 1;
+ B += ps;
+ }
+ return;
+ }
+
diff --git a/auxiliary/c99/Makefile b/auxiliary/c99/Makefile
new file mode 100644
index 0000000..6e9ea7b
--- /dev/null
+++ b/auxiliary/c99/Makefile
@@ -0,0 +1,77 @@
+###################################################################################################
+# #
+# This file is part of BLASFEO. #
+# #
+# BLASFEO -- BLAS For Embedded Optimization. #
+# Copyright (C) 2016-2017 by Gianluca Frison. #
+# Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl. #
+# All rights reserved. #
+# #
+# HPMPC is free software; you can redistribute it and/or #
+# modify it under the terms of the GNU Lesser General Public #
+# License as published by the Free Software Foundation; either #
+# version 2.1 of the License, or (at your option) any later version. #
+# #
+# HPMPC is distributed in the hope that it will be useful, #
+# but WITHOUT ANY WARRANTY; without even the implied warranty of #
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. #
+# See the GNU Lesser General Public License for more details. #
+# #
+# You should have received a copy of the GNU Lesser General Public #
+# License along with HPMPC; if not, write to the Free Software #
+# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA #
+# #
+# Author: Gianluca Frison, giaf (at) dtu.dk #
+# gianluca.frison (at) imtek.uni-freiburg.de #
+# #
+###################################################################################################
+
+include ../../Makefile.rule
+
+OBJS =
+
+ifeq ($(LA), HIGH_PERFORMANCE)
+
+ifeq ($(TARGET), X64_INTEL_HASWELL)
+OBJS +=
+OBJS += kernel_sgetr_lib4.o
+endif
+
+ifeq ($(TARGET), X64_INTEL_SANDY_BRIDGE)
+OBJS +=
+OBJS += kernel_sgetr_lib4.o
+endif
+
+ifeq ($(TARGET), X64_INTEL_CORE)
+OBJS += kernel_dgecp_lib4.o kernel_dgetr_lib4.o
+OBJS += kernel_sgetr_lib4.o
+endif
+
+ifeq ($(TARGET), X64_AMD_BULLDOZER)
+OBJS += kernel_dgecp_lib4.o kernel_dgetr_lib4.o
+OBJS += kernel_sgetr_lib4.o
+endif
+
+ifeq ($(TARGET), ARMV8A_ARM_CORTEX_A57)
+OBJS += kernel_dgecp_lib4.o kernel_dgetr_lib4.o
+OBJS += kernel_sgetr_lib4.o
+endif
+
+ifeq ($(TARGET), ARMV7A_ARM_CORTEX_A15)
+OBJS += kernel_dgecp_lib4.o kernel_dgetr_lib4.o
+OBJS += kernel_sgetr_lib4.o
+endif
+
+ifeq ($(TARGET), GENERIC)
+OBJS += kernel_dgecp_lib4.o kernel_dgetr_lib4.o
+OBJS += kernel_sgetr_lib4.o
+endif
+
+else # LA_REFERENCE | LA_BLAS
+
+endif # LA choice
+
+obj: $(OBJS)
+
+clean:
+ rm -f *.o
diff --git a/auxiliary/c99/kernel_dgecp_lib4.c b/auxiliary/c99/kernel_dgecp_lib4.c
new file mode 100644
index 0000000..e883072
--- /dev/null
+++ b/auxiliary/c99/kernel_dgecp_lib4.c
@@ -0,0 +1,1261 @@
+/**************************************************************************************************
+* *
+* This file is part of BLASFEO. *
+* *
+* BLASFEO -- BLAS For Embedded Optimization. *
+* Copyright (C) 2016-2017 by Gianluca Frison. *
+* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl. *
+* All rights reserved. *
+* *
+* HPMPC is free software; you can redistribute it and/or *
+* modify it under the terms of the GNU Lesser General Public *
+* License as published by the Free Software Foundation; either *
+* version 2.1 of the License, or (at your option) any later version. *
+* *
+* HPMPC is distributed in the hope that it will be useful, *
+* but WITHOUT ANY WARRANTY; without even the implied warranty of *
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. *
+* See the GNU Lesser General Public License for more details. *
+* *
+* You should have received a copy of the GNU Lesser General Public *
+* License along with HPMPC; if not, write to the Free Software *
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA *
+* *
+* Author: Gianluca Frison, giaf (at) dtu.dk *
+* gianluca.frison (at) imtek.uni-freiburg.de *
+* *
+**************************************************************************************************/
+
+
+
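+// These kernels work on the panel-major ("lib4") storage format: the matrix is
+// stored in horizontal panels of bs=4 rows, element (i,j) within a panel sits at
+// offset i+bs*j, and consecutive panels are bs*sda doubles apart.
+// kernel_dgecp_X_Y copies a strip of X rows with B = alpha*A, kernel_dgead_X_Y
+// adds it with B += alpha*A; Y is the number of leading rows of A's first panel
+// to skip (the "elements of A must be skipped" mentioned in the comments below).
+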
+// both A and B are aligned to 256-bit boundaries
+void kernel_dgecp_4_0_lib4(int tri, int kmax, double alpha, double *A, double *B)
+ {
+
+ if(tri==1)
+ {
+ // A and C are lower triangular
+ // kmax+1 4-wide + end 3x3 triangle
+
+ kmax += 1;
+ }
+
+ if(kmax<=0)
+ return;
+
+ const int bs = 4;
+
+ int k;
+
+ for(k=0; k<kmax-3; k+=4)
+ {
+ B[0+bs*0] = alpha*A[0+bs*0];
+ B[1+bs*0] = alpha*A[1+bs*0];
+ B[2+bs*0] = alpha*A[2+bs*0];
+ B[3+bs*0] = alpha*A[3+bs*0];
+
+ B[0+bs*1] = alpha*A[0+bs*1];
+ B[1+bs*1] = alpha*A[1+bs*1];
+ B[2+bs*1] = alpha*A[2+bs*1];
+ B[3+bs*1] = alpha*A[3+bs*1];
+
+ B[0+bs*2] = alpha*A[0+bs*2];
+ B[1+bs*2] = alpha*A[1+bs*2];
+ B[2+bs*2] = alpha*A[2+bs*2];
+ B[3+bs*2] = alpha*A[3+bs*2];
+
+ B[0+bs*3] = alpha*A[0+bs*3];
+ B[1+bs*3] = alpha*A[1+bs*3];
+ B[2+bs*3] = alpha*A[2+bs*3];
+ B[3+bs*3] = alpha*A[3+bs*3];
+
+ A += 16;
+ B += 16;
+
+ }
+ for(; k<kmax; k++)
+ {
+
+ B[0+bs*0] = alpha*A[0+bs*0];
+ B[1+bs*0] = alpha*A[1+bs*0];
+ B[2+bs*0] = alpha*A[2+bs*0];
+ B[3+bs*0] = alpha*A[3+bs*0];
+
+ A += 4;
+ B += 4;
+
+ }
+
+ if(tri==1)
+ {
+ // 3x3 triangle
+
+ B[1+bs*0] = alpha*A[1+bs*0];
+ B[2+bs*0] = alpha*A[2+bs*0];
+ B[3+bs*0] = alpha*A[3+bs*0];
+
+ B[2+bs*1] = alpha*A[2+bs*1];
+ B[3+bs*1] = alpha*A[3+bs*1];
+
+ B[3+bs*2] = alpha*A[3+bs*2];
+
+ }
+
+ }
+
+
+
+// both A and B are aligned to 256-bit boundaries, 1 element of A must be skipped
+void kernel_dgecp_4_1_lib4(int tri, int kmax, double alpha, double *A0, int sda, double *B)
+ {
+
+ if(tri==1)
+ {
+ // A and C are lower triangular
+ // kmax+1 4-wide + end 3x3 triangle
+
+ kmax += 1;
+ }
+
+ if(kmax<=0)
+ return;
+
+ const int bs = 4;
+
+ double *A1 = A0 + bs*sda;
+
+ int k;
+
+ for(k=0; k<kmax-3; k+=4)
+ {
+
+ B[0+bs*0] = alpha*A0[1+bs*0];
+ B[1+bs*0] = alpha*A0[2+bs*0];
+ B[2+bs*0] = alpha*A0[3+bs*0];
+ B[3+bs*0] = alpha*A1[0+bs*0];
+
+ B[0+bs*1] = alpha*A0[1+bs*1];
+ B[1+bs*1] = alpha*A0[2+bs*1];
+ B[2+bs*1] = alpha*A0[3+bs*1];
+ B[3+bs*1] = alpha*A1[0+bs*1];
+
+ B[0+bs*2] = alpha*A0[1+bs*2];
+ B[1+bs*2] = alpha*A0[2+bs*2];
+ B[2+bs*2] = alpha*A0[3+bs*2];
+ B[3+bs*2] = alpha*A1[0+bs*2];
+
+ B[0+bs*3] = alpha*A0[1+bs*3];
+ B[1+bs*3] = alpha*A0[2+bs*3];
+ B[2+bs*3] = alpha*A0[3+bs*3];
+ B[3+bs*3] = alpha*A1[0+bs*3];
+
+ A0 += 16;
+ A1 += 16;
+ B += 16;
+
+ }
+ for(; k<kmax; k++)
+ {
+
+ B[0+bs*0] = alpha*A0[1+bs*0];
+ B[1+bs*0] = alpha*A0[2+bs*0];
+ B[2+bs*0] = alpha*A0[3+bs*0];
+ B[3+bs*0] = alpha*A1[0+bs*0];
+
+ A0 += 4;
+ A1 += 4;
+ B += 4;
+
+ }
+
+ if(tri==1)
+ {
+ // 3x3 triangle
+
+ B[1+0*bs] = alpha*A0[2+0*bs];
+ B[2+0*bs] = alpha*A0[3+0*bs];
+ B[3+0*bs] = alpha*A1[0+0*bs];
+
+ B[2+1*bs] = alpha*A0[3+1*bs];
+ B[3+1*bs] = alpha*A1[0+1*bs];
+
+ B[3+2*bs] = alpha*A1[0+2*bs];
+
+ }
+
+ }
+
+
+
+// both A and B are aligned to 256-bit boundaries, 2 elements of A must be skipped
+void kernel_dgecp_4_2_lib4(int tri, int kmax, double alpha, double *A0, int sda, double *B)
+ {
+
+ if(tri==1)
+ {
+ // A and C are lower triangular
+ // kmax+1 4-wide + end 3x3 triangle
+
+ kmax += 1;
+ }
+
+ if(kmax<=0)
+ return;
+
+ const int bs = 4;
+
+ double *A1 = A0 + bs*sda;
+
+ int k;
+
+ for(k=0; k<kmax-3; k+=4)
+ {
+
+ B[0+bs*0] = alpha*A0[2+bs*0];
+ B[1+bs*0] = alpha*A0[3+bs*0];
+ B[2+bs*0] = alpha*A1[0+bs*0];
+ B[3+bs*0] = alpha*A1[1+bs*0];
+
+ B[0+bs*1] = alpha*A0[2+bs*1];
+ B[1+bs*1] = alpha*A0[3+bs*1];
+ B[2+bs*1] = alpha*A1[0+bs*1];
+ B[3+bs*1] = alpha*A1[1+bs*1];
+
+ B[0+bs*2] = alpha*A0[2+bs*2];
+ B[1+bs*2] = alpha*A0[3+bs*2];
+ B[2+bs*2] = alpha*A1[0+bs*2];
+ B[3+bs*2] = alpha*A1[1+bs*2];
+
+ B[0+bs*3] = alpha*A0[2+bs*3];
+ B[1+bs*3] = alpha*A0[3+bs*3];
+ B[2+bs*3] = alpha*A1[0+bs*3];
+ B[3+bs*3] = alpha*A1[1+bs*3];
+
+ A0 += 16;
+ A1 += 16;
+ B += 16;
+
+ }
+ for(; k<kmax; k++)
+ {
+
+ B[0+bs*0] = alpha*A0[2+bs*0];
+ B[1+bs*0] = alpha*A0[3+bs*0];
+ B[2+bs*0] = alpha*A1[0+bs*0];
+ B[3+bs*0] = alpha*A1[1+bs*0];
+
+ A0 += 4;
+ A1 += 4;
+ B += 4;
+
+ }
+
+ if(tri==1)
+ {
+		// 3x3 triangle
+
+ B[1+bs*0] = alpha*A0[3+bs*0];
+ B[2+bs*0] = alpha*A1[0+bs*0];
+ B[3+bs*0] = alpha*A1[1+bs*0];
+
+ B[2+bs*1] = alpha*A1[0+bs*1];
+ B[3+bs*1] = alpha*A1[1+bs*1];
+
+ B[3+bs*2] = alpha*A1[1+bs*2];
+
+ }
+
+ }
+
+
+
+// both A and B are aligned to 256-bit boundaries, 3 elements of A must be skipped
+void kernel_dgecp_4_3_lib4(int tri, int kmax, double alpha, double *A0, int sda, double *B)
+ {
+
+ if(tri==1)
+ {
+ // A and C are lower triangular
+ // kmax+1 4-wide + end 3x3 triangle
+
+ kmax += 1;
+ }
+
+ if(kmax<=0)
+ return;
+
+ const int bs = 4;
+
+ double *A1 = A0 + bs*sda;
+
+ int k;
+
+ for(k=0; k<kmax-3; k+=4)
+ {
+
+ B[0+bs*0] = alpha*A0[3+bs*0];
+ B[1+bs*0] = alpha*A1[0+bs*0];
+ B[2+bs*0] = alpha*A1[1+bs*0];
+ B[3+bs*0] = alpha*A1[2+bs*0];
+
+ B[0+bs*1] = alpha*A0[3+bs*1];
+ B[1+bs*1] = alpha*A1[0+bs*1];
+ B[2+bs*1] = alpha*A1[1+bs*1];
+ B[3+bs*1] = alpha*A1[2+bs*1];
+
+ B[0+bs*2] = alpha*A0[3+bs*2];
+ B[1+bs*2] = alpha*A1[0+bs*2];
+ B[2+bs*2] = alpha*A1[1+bs*2];
+ B[3+bs*2] = alpha*A1[2+bs*2];
+
+ B[0+bs*3] = alpha*A0[3+bs*3];
+ B[1+bs*3] = alpha*A1[0+bs*3];
+ B[2+bs*3] = alpha*A1[1+bs*3];
+ B[3+bs*3] = alpha*A1[2+bs*3];
+
+ A0 += 16;
+ A1 += 16;
+ B += 16;
+
+ }
+ for(; k<kmax; k++)
+ {
+
+ B[0+bs*0] = alpha*A0[3+bs*0];
+ B[1+bs*0] = alpha*A1[0+bs*0];
+ B[2+bs*0] = alpha*A1[1+bs*0];
+ B[3+bs*0] = alpha*A1[2+bs*0];
+
+ A0 += 4;
+ A1 += 4;
+ B += 4;
+
+ }
+
+ if(tri==1)
+ {
+ // 3x3 triangle
+
+ B[1+bs*0] = alpha*A1[0+bs*0];
+ B[2+bs*0] = alpha*A1[1+bs*0];
+ B[3+bs*0] = alpha*A1[2+bs*0];
+
+ B[2+bs*1] = alpha*A1[1+bs*1];
+ B[3+bs*1] = alpha*A1[2+bs*1];
+
+ B[3+bs*2] = alpha*A1[2+bs*2];
+
+ }
+
+ }
+
+
+
+// both A and B are aligned to 64-bit boundaries
+void kernel_dgecp_3_0_lib4(int tri, int kmax, double alpha, double *A, double *B)
+ {
+
+ if(tri==1)
+ {
+ // A and C are lower triangular
+ // kmax+1 3-wide + end 2x2 triangle
+
+ kmax += 1;
+ }
+
+ if(kmax<=0)
+ return;
+
+ const int bs = 4;
+
+ int k;
+
+ for(k=0; k<kmax-3; k+=4)
+ {
+ B[0+bs*0] = alpha*A[0+bs*0];
+ B[1+bs*0] = alpha*A[1+bs*0];
+ B[2+bs*0] = alpha*A[2+bs*0];
+
+ B[0+bs*1] = alpha*A[0+bs*1];
+ B[1+bs*1] = alpha*A[1+bs*1];
+ B[2+bs*1] = alpha*A[2+bs*1];
+
+ B[0+bs*2] = alpha*A[0+bs*2];
+ B[1+bs*2] = alpha*A[1+bs*2];
+ B[2+bs*2] = alpha*A[2+bs*2];
+
+ B[0+bs*3] = alpha*A[0+bs*3];
+ B[1+bs*3] = alpha*A[1+bs*3];
+ B[2+bs*3] = alpha*A[2+bs*3];
+
+ A += 16;
+ B += 16;
+
+ }
+ for(; k<kmax; k++)
+ {
+
+ B[0+bs*0] = alpha*A[0+bs*0];
+ B[1+bs*0] = alpha*A[1+bs*0];
+ B[2+bs*0] = alpha*A[2+bs*0];
+
+ A += 4;
+ B += 4;
+
+ }
+
+ if(tri==1)
+ {
+ // 2x2 triangle
+
+ B[1+bs*0] = alpha*A[1+bs*0];
+ B[2+bs*0] = alpha*A[2+bs*0];
+
+ B[2+bs*1] = alpha*A[2+bs*1];
+
+ }
+
+ }
+
+
+
+// both A and B are aligned to 256-bit boundaries, 2 elements of A must be skipped
+void kernel_dgecp_3_2_lib4(int tri, int kmax, double alpha, double *A0, int sda, double *B)
+ {
+
+ if(tri==1)
+ {
+ // A and C are lower triangular
+ // kmax+1 3-wide + end 2x2 triangle
+
+ kmax += 1;
+ }
+
+ if(kmax<=0)
+ return;
+
+ const int bs = 4;
+
+ double *A1 = A0 + bs*sda;
+
+ int k;
+
+ for(k=0; k<kmax-3; k+=4)
+ {
+
+ B[0+bs*0] = alpha*A0[2+bs*0];
+ B[1+bs*0] = alpha*A0[3+bs*0];
+ B[2+bs*0] = alpha*A1[0+bs*0];
+
+ B[0+bs*1] = alpha*A0[2+bs*1];
+ B[1+bs*1] = alpha*A0[3+bs*1];
+ B[2+bs*1] = alpha*A1[0+bs*1];
+
+ B[0+bs*2] = alpha*A0[2+bs*2];
+ B[1+bs*2] = alpha*A0[3+bs*2];
+ B[2+bs*2] = alpha*A1[0+bs*2];
+
+ B[0+bs*3] = alpha*A0[2+bs*3];
+ B[1+bs*3] = alpha*A0[3+bs*3];
+ B[2+bs*3] = alpha*A1[0+bs*3];
+
+ A0 += 16;
+ A1 += 16;
+ B += 16;
+
+ }
+ for(; k<kmax; k++)
+ {
+
+ B[0+bs*0] = alpha*A0[2+bs*0];
+ B[1+bs*0] = alpha*A0[3+bs*0];
+ B[2+bs*0] = alpha*A1[0+bs*0];
+
+ A0 += 4;
+ A1 += 4;
+ B += 4;
+
+ }
+
+ if(tri==1)
+ {
+ // 2x2 triangle
+
+ B[1+bs*0] = alpha*A0[3+bs*0];
+ B[2+bs*0] = alpha*A1[0+bs*0];
+
+ B[2+bs*1] = alpha*A1[0+bs*1];
+
+ }
+
+ }
+
+
+
+// both A and B are aligned to 256-bit boundaries, 3 elements of A must be skipped
+void kernel_dgecp_3_3_lib4(int tri, int kmax, double alpha, double *A0, int sda, double *B)
+ {
+
+ if(tri==1)
+ {
+ // A and C are lower triangular
+ // kmax+1 3-wide + end 2x2 triangle
+
+ kmax += 1;
+ }
+
+ if(kmax<=0)
+ return;
+
+ const int bs = 4;
+
+ double *A1 = A0 + bs*sda;
+
+ int k;
+
+ for(k=0; k<kmax-3; k+=4)
+ {
+
+ B[0+bs*0] = alpha*A0[3+bs*0];
+ B[1+bs*0] = alpha*A1[0+bs*0];
+ B[2+bs*0] = alpha*A1[1+bs*0];
+
+ B[0+bs*1] = alpha*A0[3+bs*1];
+ B[1+bs*1] = alpha*A1[0+bs*1];
+ B[2+bs*1] = alpha*A1[1+bs*1];
+
+ B[0+bs*2] = alpha*A0[3+bs*2];
+ B[1+bs*2] = alpha*A1[0+bs*2];
+ B[2+bs*2] = alpha*A1[1+bs*2];
+
+ B[0+bs*3] = alpha*A0[3+bs*3];
+ B[1+bs*3] = alpha*A1[0+bs*3];
+ B[2+bs*3] = alpha*A1[1+bs*3];
+
+ A0 += 16;
+ A1 += 16;
+ B += 16;
+
+ }
+ for(; k<kmax; k++)
+ {
+
+ B[0+bs*0] = alpha*A0[3+bs*0];
+ B[1+bs*0] = alpha*A1[0+bs*0];
+ B[2+bs*0] = alpha*A1[1+bs*0];
+
+ A0 += 4;
+ A1 += 4;
+ B += 4;
+
+ }
+
+ if(tri==1)
+ {
+ // 2x2 triangle
+
+ B[1+bs*0] = alpha*A1[0+bs*0];
+ B[2+bs*0] = alpha*A1[1+bs*0];
+
+ B[2+bs*1] = alpha*A1[1+bs*1];
+
+ }
+
+ }
+
+
+
+// both A and B are aligned to 64-bit boundaries
+void kernel_dgecp_2_0_lib4(int tri, int kmax, double alpha, double *A, double *B)
+ {
+
+ if(tri==1)
+ {
+ // A and C are lower triangular
+ // kmax+1 2-wide + end 1x1 triangle
+
+ kmax += 1;
+ }
+
+ if(kmax<=0)
+ return;
+
+ const int bs = 4;
+
+ int k;
+
+ for(k=0; k<kmax-3; k+=4)
+ {
+ B[0+bs*0] = alpha*A[0+bs*0];
+ B[1+bs*0] = alpha*A[1+bs*0];
+
+ B[0+bs*1] = alpha*A[0+bs*1];
+ B[1+bs*1] = alpha*A[1+bs*1];
+
+ B[0+bs*2] = alpha*A[0+bs*2];
+ B[1+bs*2] = alpha*A[1+bs*2];
+
+ B[0+bs*3] = alpha*A[0+bs*3];
+ B[1+bs*3] = alpha*A[1+bs*3];
+
+ A += 16;
+ B += 16;
+
+ }
+ for(; k<kmax; k++)
+ {
+
+ B[0+bs*0] = alpha*A[0+bs*0];
+ B[1+bs*0] = alpha*A[1+bs*0];
+
+ A += 4;
+ B += 4;
+
+ }
+
+ if(tri==1)
+ {
+ // 1x1 triangle
+
+ B[1+bs*0] = alpha*A[1+bs*0];
+
+ }
+
+ }
+
+
+
+// both A and B are aligned to 128-bit boundaries, 3 elements of A must be skipped
+void kernel_dgecp_2_3_lib4(int tri, int kmax, double alpha, double *A0, int sda, double *B)
+ {
+
+ if(tri==1)
+ {
+ // A and C are lower triangular
+ // kmax+1 2-wide + end 1x1 triangle
+
+ kmax += 1;
+ }
+
+ if(kmax<=0)
+ return;
+
+ const int bs = 4;
+
+ double *A1 = A0 + bs*sda;
+
+ int k;
+
+ for(k=0; k<kmax-3; k+=4)
+ {
+
+ B[0+bs*0] = alpha*A0[3+bs*0];
+ B[1+bs*0] = alpha*A1[0+bs*0];
+
+ B[0+bs*1] = alpha*A0[3+bs*1];
+ B[1+bs*1] = alpha*A1[0+bs*1];
+
+ B[0+bs*2] = alpha*A0[3+bs*2];
+ B[1+bs*2] = alpha*A1[0+bs*2];
+
+ B[0+bs*3] = alpha*A0[3+bs*3];
+ B[1+bs*3] = alpha*A1[0+bs*3];
+
+ A0 += 16;
+ A1 += 16;
+ B += 16;
+
+ }
+ for(; k<kmax; k++)
+ {
+
+ B[0+bs*0] = alpha*A0[3+bs*0];
+ B[1+bs*0] = alpha*A1[0+bs*0];
+
+ A0 += 4;
+ A1 += 4;
+ B += 4;
+
+ }
+
+ if(tri==1)
+ {
+ // 1x1 triangle
+
+ B[1+bs*0] = alpha*A1[0+bs*0];
+
+ }
+
+ }
+
+
+
+// both A and B are aligned to 64-bit boundaries
+void kernel_dgecp_1_0_lib4(int tri, int kmax, double alpha, double *A, double *B)
+ {
+
+ if(tri==1)
+ {
+ // A and C are lower triangular
+ // kmax+1 1-wide
+
+ kmax += 1;
+ }
+
+ if(kmax<=0)
+ return;
+
+ const int bs = 4;
+
+ int k;
+
+ for(k=0; k<kmax-3; k+=4)
+ {
+ B[0+bs*0] = alpha*A[0+bs*0];
+
+ B[0+bs*1] = alpha*A[0+bs*1];
+
+ B[0+bs*2] = alpha*A[0+bs*2];
+
+ B[0+bs*3] = alpha*A[0+bs*3];
+
+ A += 16;
+ B += 16;
+
+ }
+ for(; k<kmax; k++)
+ {
+
+ B[0+bs*0] = alpha*A[0+bs*0];
+
+ A += 4;
+ B += 4;
+
+ }
+
+ }
+
+
+
+
+// both A and B are aligned to 256-bit boundaries
+void kernel_dgead_4_0_lib4(int kmax, double alpha, double *A, double *B)
+ {
+
+ if(kmax<=0)
+ return;
+
+ const int bs = 4;
+
+ int k;
+
+ for(k=0; k<kmax-3; k+=4)
+ {
+ B[0+bs*0] += alpha * A[0+bs*0];
+ B[1+bs*0] += alpha * A[1+bs*0];
+ B[2+bs*0] += alpha * A[2+bs*0];
+ B[3+bs*0] += alpha * A[3+bs*0];
+
+ B[0+bs*1] += alpha * A[0+bs*1];
+ B[1+bs*1] += alpha * A[1+bs*1];
+ B[2+bs*1] += alpha * A[2+bs*1];
+ B[3+bs*1] += alpha * A[3+bs*1];
+
+ B[0+bs*2] += alpha * A[0+bs*2];
+ B[1+bs*2] += alpha * A[1+bs*2];
+ B[2+bs*2] += alpha * A[2+bs*2];
+ B[3+bs*2] += alpha * A[3+bs*2];
+
+ B[0+bs*3] += alpha * A[0+bs*3];
+ B[1+bs*3] += alpha * A[1+bs*3];
+ B[2+bs*3] += alpha * A[2+bs*3];
+ B[3+bs*3] += alpha * A[3+bs*3];
+
+ A += 16;
+ B += 16;
+
+ }
+ for(; k<kmax; k++)
+ {
+
+ B[0+bs*0] += alpha * A[0+bs*0];
+ B[1+bs*0] += alpha * A[1+bs*0];
+ B[2+bs*0] += alpha * A[2+bs*0];
+ B[3+bs*0] += alpha * A[3+bs*0];
+
+ A += 4;
+ B += 4;
+
+ }
+
+ }
+
+
+
+// both A and B are aligned to 256-bit boundaries, 1 element of A must be skipped
+void kernel_dgead_4_1_lib4(int kmax, double alpha, double *A0, int sda, double *B)
+ {
+
+ if(kmax<=0)
+ return;
+
+ const int bs = 4;
+
+ double *A1 = A0 + bs*sda;
+
+ int k;
+
+ for(k=0; k<kmax-3; k+=4)
+ {
+
+ B[0+bs*0] += alpha * A0[1+bs*0];
+ B[1+bs*0] += alpha * A0[2+bs*0];
+ B[2+bs*0] += alpha * A0[3+bs*0];
+ B[3+bs*0] += alpha * A1[0+bs*0];
+
+ B[0+bs*1] += alpha * A0[1+bs*1];
+ B[1+bs*1] += alpha * A0[2+bs*1];
+ B[2+bs*1] += alpha * A0[3+bs*1];
+ B[3+bs*1] += alpha * A1[0+bs*1];
+
+ B[0+bs*2] += alpha * A0[1+bs*2];
+ B[1+bs*2] += alpha * A0[2+bs*2];
+ B[2+bs*2] += alpha * A0[3+bs*2];
+ B[3+bs*2] += alpha * A1[0+bs*2];
+
+ B[0+bs*3] += alpha * A0[1+bs*3];
+ B[1+bs*3] += alpha * A0[2+bs*3];
+ B[2+bs*3] += alpha * A0[3+bs*3];
+ B[3+bs*3] += alpha * A1[0+bs*3];
+
+ A0 += 16;
+ A1 += 16;
+ B += 16;
+
+ }
+ for(; k<kmax; k++)
+ {
+
+ B[0+bs*0] += alpha * A0[1+bs*0];
+ B[1+bs*0] += alpha * A0[2+bs*0];
+ B[2+bs*0] += alpha * A0[3+bs*0];
+ B[3+bs*0] += alpha * A1[0+bs*0];
+
+ A0 += 4;
+ A1 += 4;
+ B += 4;
+
+ }
+
+ }
+
+
+
+// both A and B are aligned to 256-bit boundaries, 2 elements of A must be skipped
+void kernel_dgead_4_2_lib4(int kmax, double alpha, double *A0, int sda, double *B)
+ {
+
+ if(kmax<=0)
+ return;
+
+ const int bs = 4;
+
+ double *A1 = A0 + bs*sda;
+
+ int k;
+
+ for(k=0; k<kmax-3; k+=4)
+ {
+
+ B[0+bs*0] += alpha * A0[2+bs*0];
+ B[1+bs*0] += alpha * A0[3+bs*0];
+ B[2+bs*0] += alpha * A1[0+bs*0];
+ B[3+bs*0] += alpha * A1[1+bs*0];
+
+ B[0+bs*1] += alpha * A0[2+bs*1];
+ B[1+bs*1] += alpha * A0[3+bs*1];
+ B[2+bs*1] += alpha * A1[0+bs*1];
+ B[3+bs*1] += alpha * A1[1+bs*1];
+
+ B[0+bs*2] += alpha * A0[2+bs*2];
+ B[1+bs*2] += alpha * A0[3+bs*2];
+ B[2+bs*2] += alpha * A1[0+bs*2];
+ B[3+bs*2] += alpha * A1[1+bs*2];
+
+ B[0+bs*3] += alpha * A0[2+bs*3];
+ B[1+bs*3] += alpha * A0[3+bs*3];
+ B[2+bs*3] += alpha * A1[0+bs*3];
+ B[3+bs*3] += alpha * A1[1+bs*3];
+
+ A0 += 16;
+ A1 += 16;
+ B += 16;
+
+ }
+ for(; k<kmax; k++)
+ {
+
+ B[0+bs*0] += alpha * A0[2+bs*0];
+ B[1+bs*0] += alpha * A0[3+bs*0];
+ B[2+bs*0] += alpha * A1[0+bs*0];
+ B[3+bs*0] += alpha * A1[1+bs*0];
+
+ A0 += 4;
+ A1 += 4;
+ B += 4;
+
+ }
+
+ }
+
+
+
+// both A and B are aligned to 256-bit boundaries, 3 elements of A must be skipped
+void kernel_dgead_4_3_lib4(int kmax, double alpha, double *A0, int sda, double *B)
+ {
+
+ if(kmax<=0)
+ return;
+
+ const int bs = 4;
+
+ double *A1 = A0 + bs*sda;
+
+ int k;
+
+ for(k=0; k<kmax-3; k+=4)
+ {
+
+ B[0+bs*0] += alpha * A0[3+bs*0];
+ B[1+bs*0] += alpha * A1[0+bs*0];
+ B[2+bs*0] += alpha * A1[1+bs*0];
+ B[3+bs*0] += alpha * A1[2+bs*0];
+
+ B[0+bs*1] += alpha * A0[3+bs*1];
+ B[1+bs*1] += alpha * A1[0+bs*1];
+ B[2+bs*1] += alpha * A1[1+bs*1];
+ B[3+bs*1] += alpha * A1[2+bs*1];
+
+ B[0+bs*2] += alpha * A0[3+bs*2];
+ B[1+bs*2] += alpha * A1[0+bs*2];
+ B[2+bs*2] += alpha * A1[1+bs*2];
+ B[3+bs*2] += alpha * A1[2+bs*2];
+
+ B[0+bs*3] += alpha * A0[3+bs*3];
+ B[1+bs*3] += alpha * A1[0+bs*3];
+ B[2+bs*3] += alpha * A1[1+bs*3];
+ B[3+bs*3] += alpha * A1[2+bs*3];
+
+ A0 += 16;
+ A1 += 16;
+ B += 16;
+
+ }
+ for(; k<kmax; k++)
+ {
+
+ B[0+bs*0] += alpha * A0[3+bs*0];
+ B[1+bs*0] += alpha * A1[0+bs*0];
+ B[2+bs*0] += alpha * A1[1+bs*0];
+ B[3+bs*0] += alpha * A1[2+bs*0];
+
+ A0 += 4;
+ A1 += 4;
+ B += 4;
+
+ }
+
+ }
+
+
+
+// both A and B are aligned to 64-bit boundaries
+void kernel_dgead_3_0_lib4(int kmax, double alpha, double *A, double *B)
+ {
+
+ if(kmax<=0)
+ return;
+
+ const int bs = 4;
+
+ int k;
+
+ for(k=0; k<kmax-3; k+=4)
+ {
+ B[0+bs*0] += alpha * A[0+bs*0];
+ B[1+bs*0] += alpha * A[1+bs*0];
+ B[2+bs*0] += alpha * A[2+bs*0];
+
+ B[0+bs*1] += alpha * A[0+bs*1];
+ B[1+bs*1] += alpha * A[1+bs*1];
+ B[2+bs*1] += alpha * A[2+bs*1];
+
+ B[0+bs*2] += alpha * A[0+bs*2];
+ B[1+bs*2] += alpha * A[1+bs*2];
+ B[2+bs*2] += alpha * A[2+bs*2];
+
+ B[0+bs*3] += alpha * A[0+bs*3];
+ B[1+bs*3] += alpha * A[1+bs*3];
+ B[2+bs*3] += alpha * A[2+bs*3];
+
+ A += 16;
+ B += 16;
+
+ }
+ for(; k<kmax; k++)
+ {
+
+ B[0+bs*0] += alpha * A[0+bs*0];
+ B[1+bs*0] += alpha * A[1+bs*0];
+ B[2+bs*0] += alpha * A[2+bs*0];
+
+ A += 4;
+ B += 4;
+
+ }
+
+ }
+
+
+
+// both A and B are aligned to 256-bit boundaries, 2 elements of A must be skipped
+void kernel_dgead_3_2_lib4(int kmax, double alpha, double *A0, int sda, double *B)
+ {
+
+ if(kmax<=0)
+ return;
+
+ const int bs = 4;
+
+ double *A1 = A0 + bs*sda;
+
+ int k;
+
+ for(k=0; k<kmax-3; k+=4)
+ {
+
+ B[0+bs*0] += alpha * A0[2+bs*0];
+ B[1+bs*0] += alpha * A0[3+bs*0];
+ B[2+bs*0] += alpha * A1[0+bs*0];
+
+ B[0+bs*1] += alpha * A0[2+bs*1];
+ B[1+bs*1] += alpha * A0[3+bs*1];
+ B[2+bs*1] += alpha * A1[0+bs*1];
+
+ B[0+bs*2] += alpha * A0[2+bs*2];
+ B[1+bs*2] += alpha * A0[3+bs*2];
+ B[2+bs*2] += alpha * A1[0+bs*2];
+
+ B[0+bs*3] += alpha * A0[2+bs*3];
+ B[1+bs*3] += alpha * A0[3+bs*3];
+ B[2+bs*3] += alpha * A1[0+bs*3];
+
+ A0 += 16;
+ A1 += 16;
+ B += 16;
+
+ }
+ for(; k<kmax; k++)
+ {
+
+ B[0+bs*0] += alpha * A0[2+bs*0];
+ B[1+bs*0] += alpha * A0[3+bs*0];
+ B[2+bs*0] += alpha * A1[0+bs*0];
+
+ A0 += 4;
+ A1 += 4;
+ B += 4;
+
+ }
+
+ }
+
+
+
+// both A and B are aligned to 256-bit boundaries, 3 elements of A must be skipped
+void kernel_dgead_3_3_lib4(int kmax, double alpha, double *A0, int sda, double *B)
+ {
+
+ if(kmax<=0)
+ return;
+
+ const int bs = 4;
+
+ double *A1 = A0 + bs*sda;
+
+ int k;
+
+ for(k=0; k<kmax-3; k+=4)
+ {
+
+ B[0+bs*0] += alpha * A0[3+bs*0];
+ B[1+bs*0] += alpha * A1[0+bs*0];
+ B[2+bs*0] += alpha * A1[1+bs*0];
+
+ B[0+bs*1] += alpha * A0[3+bs*1];
+ B[1+bs*1] += alpha * A1[0+bs*1];
+ B[2+bs*1] += alpha * A1[1+bs*1];
+
+ B[0+bs*2] += alpha * A0[3+bs*2];
+ B[1+bs*2] += alpha * A1[0+bs*2];
+ B[2+bs*2] += alpha * A1[1+bs*2];
+
+ B[0+bs*3] += alpha * A0[3+bs*3];
+ B[1+bs*3] += alpha * A1[0+bs*3];
+ B[2+bs*3] += alpha * A1[1+bs*3];
+
+ A0 += 16;
+ A1 += 16;
+ B += 16;
+
+ }
+ for(; k<kmax; k++)
+ {
+
+ B[0+bs*0] += alpha * A0[3+bs*0];
+ B[1+bs*0] += alpha * A1[0+bs*0];
+ B[2+bs*0] += alpha * A1[1+bs*0];
+
+ A0 += 4;
+ A1 += 4;
+ B += 4;
+
+ }
+
+ }
+
+
+
+// both A and B are aligned to 64-bit boundaries
+void kernel_dgead_2_0_lib4(int kmax, double alpha, double *A, double *B)
+ {
+
+ if(kmax<=0)
+ return;
+
+ const int bs = 4;
+
+ int k;
+
+ for(k=0; k<kmax-3; k+=4)
+ {
+ B[0+bs*0] += alpha * A[0+bs*0];
+ B[1+bs*0] += alpha * A[1+bs*0];
+
+ B[0+bs*1] += alpha * A[0+bs*1];
+ B[1+bs*1] += alpha * A[1+bs*1];
+
+ B[0+bs*2] += alpha * A[0+bs*2];
+ B[1+bs*2] += alpha * A[1+bs*2];
+
+ B[0+bs*3] += alpha * A[0+bs*3];
+ B[1+bs*3] += alpha * A[1+bs*3];
+
+ A += 16;
+ B += 16;
+
+ }
+ for(; k<kmax; k++)
+ {
+
+ B[0+bs*0] += alpha * A[0+bs*0];
+ B[1+bs*0] += alpha * A[1+bs*0];
+
+ A += 4;
+ B += 4;
+
+ }
+
+ }
+
+
+
+// both A and B are aligned to 128-bit boundaries, 3 elements of A must be skipped
+void kernel_dgead_2_3_lib4(int kmax, double alpha, double *A0, int sda, double *B)
+ {
+
+ if(kmax<=0)
+ return;
+
+ const int bs = 4;
+
+ double *A1 = A0 + bs*sda;
+
+ int k;
+
+ for(k=0; k<kmax-3; k+=4)
+ {
+
+ B[0+bs*0] += alpha * A0[3+bs*0];
+ B[1+bs*0] += alpha * A1[0+bs*0];
+
+ B[0+bs*1] += alpha * A0[3+bs*1];
+ B[1+bs*1] += alpha * A1[0+bs*1];
+
+ B[0+bs*2] += alpha * A0[3+bs*2];
+ B[1+bs*2] += alpha * A1[0+bs*2];
+
+ B[0+bs*3] += alpha * A0[3+bs*3];
+ B[1+bs*3] += alpha * A1[0+bs*3];
+
+ A0 += 16;
+ A1 += 16;
+ B += 16;
+
+ }
+ for(; k<kmax; k++)
+ {
+
+ B[0+bs*0] += alpha * A0[3+bs*0];
+ B[1+bs*0] += alpha * A1[0+bs*0];
+
+ A0 += 4;
+ A1 += 4;
+ B += 4;
+
+ }
+
+ }
+
+
+
+// both A and B are aligned to 64-bit boundaries
+void kernel_dgead_1_0_lib4(int kmax, double alpha, double *A, double *B)
+ {
+
+ if(kmax<=0)
+ return;
+
+ const int bs = 4;
+
+ int k;
+
+ for(k=0; k<kmax-3; k+=4)
+ {
+ B[0+bs*0] += alpha * A[0+bs*0];
+
+ B[0+bs*1] += alpha * A[0+bs*1];
+
+ B[0+bs*2] += alpha * A[0+bs*2];
+
+ B[0+bs*3] += alpha * A[0+bs*3];
+
+ A += 16;
+ B += 16;
+
+ }
+ for(; k<kmax; k++)
+ {
+
+ B[0+bs*0] += alpha * A[0+bs*0];
+
+ A += 4;
+ B += 4;
+
+ }
+
+ }
+
+
+
+
diff --git a/auxiliary/c99/kernel_dgetr_lib4.c b/auxiliary/c99/kernel_dgetr_lib4.c
new file mode 100644
index 0000000..7d62277
--- /dev/null
+++ b/auxiliary/c99/kernel_dgetr_lib4.c
@@ -0,0 +1,414 @@
+/**************************************************************************************************
+* *
+* This file is part of BLASFEO. *
+* *
+* BLASFEO -- BLAS For Embedded Optimization. *
+* Copyright (C) 2016-2017 by Gianluca Frison. *
+* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl. *
+* All rights reserved. *
+* *
+* HPMPC is free software; you can redistribute it and/or *
+* modify it under the terms of the GNU Lesser General Public *
+* License as published by the Free Software Foundation; either *
+* version 2.1 of the License, or (at your option) any later version. *
+* *
+* HPMPC is distributed in the hope that it will be useful, *
+* but WITHOUT ANY WARRANTY; without even the implied warranty of *
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. *
+* See the GNU Lesser General Public License for more details. *
+* *
+* You should have received a copy of the GNU Lesser General Public *
+* License along with HPMPC; if not, write to the Free Software *
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA *
+* *
+* Author: Gianluca Frison, giaf (at) dtu.dk *
+* gianluca.frison (at) imtek.uni-freiburg.de *
+* *
+**************************************************************************************************/
+
+
+
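+// These kernels transpose a strip of N rows (N = 4, 3, 2, 1) of a panel-major
+// matrix: A is read along its panel (one column per iteration) and the result is
+// written across the panels of C (one row per iteration, panel stride sdc).
+// kna is the number of rows of C left in its current panel before the first
+// panel boundary, so that the writes can be realigned after kna iterations.
+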
+// transpose of a general matrix, read along panels, write across panels
+void kernel_dgetr_4_lib4(int tri, int kmax, int kna, double alpha, double *A, double *C, int sdc)
+ {
+
+ if(tri==1)
+ {
+ // A is lower triangular, C is upper triangular
+ // kmax+1 4-wide + end 3x3 triangle
+
+ kmax += 1;
+ }
+
+ const int bs = 4;
+
+ int k;
+
+ k = 0;
+
+ if(kmax<kna)
+ goto cleanup_loop;
+
+ if(kna>0)
+ {
+ for( ; k<kna; k++)
+ {
+ C[0+bs*0] = alpha * A[0+bs*0];
+ C[0+bs*1] = alpha * A[1+bs*0];
+ C[0+bs*2] = alpha * A[2+bs*0];
+ C[0+bs*3] = alpha * A[3+bs*0];
+
+ C += 1;
+ A += bs;
+ }
+ C += bs*(sdc-1);
+ }
+
+ for( ; k<kmax-3; k+=4)
+ {
+ C[0+bs*0] = alpha * A[0+bs*0];
+ C[0+bs*1] = alpha * A[1+bs*0];
+ C[0+bs*2] = alpha * A[2+bs*0];
+ C[0+bs*3] = alpha * A[3+bs*0];
+
+ C[1+bs*0] = alpha * A[0+bs*1];
+ C[1+bs*1] = alpha * A[1+bs*1];
+ C[1+bs*2] = alpha * A[2+bs*1];
+ C[1+bs*3] = alpha * A[3+bs*1];
+
+ C[2+bs*0] = alpha * A[0+bs*2];
+ C[2+bs*1] = alpha * A[1+bs*2];
+ C[2+bs*2] = alpha * A[2+bs*2];
+ C[2+bs*3] = alpha * A[3+bs*2];
+
+ C[3+bs*0] = alpha * A[0+bs*3];
+ C[3+bs*1] = alpha * A[1+bs*3];
+ C[3+bs*2] = alpha * A[2+bs*3];
+ C[3+bs*3] = alpha * A[3+bs*3];
+
+ C += bs*sdc;
+ A += bs*bs;
+ }
+
+ cleanup_loop:
+
+ for( ; k<kmax; k++)
+ {
+ C[0+bs*0] = alpha * A[0+bs*0];
+ C[0+bs*1] = alpha * A[1+bs*0];
+ C[0+bs*2] = alpha * A[2+bs*0];
+ C[0+bs*3] = alpha * A[3+bs*0];
+
+ C += 1;
+ A += bs;
+ }
+
+ if(tri==1)
+ {
+ // end 3x3 triangle
+ kna = (bs-(bs-kna+kmax)%bs)%bs;
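+		// kna now holds the number of rows of C still left in the current panel:
+		// when the trailing 3x3 triangle crosses a panel boundary, the branches
+		// below offset the affected writes into the next panel via sdc.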
+
+ if(kna==1)
+ {
+ C[0+bs*1] = alpha * A[1+bs*0];
+ C[0+bs*2] = alpha * A[2+bs*0];
+ C[0+bs*3] = alpha * A[3+bs*0];
+ C[1+bs*(sdc+1)] = alpha * A[2+bs*1];
+ C[1+bs*(sdc+2)] = alpha * A[3+bs*1];
+ C[2+bs*(sdc+2)] = alpha * A[3+bs*2];
+ }
+ else if(kna==2)
+ {
+ C[0+bs*1] = alpha * A[1+bs*0];
+ C[0+bs*2] = alpha * A[2+bs*0];
+ C[0+bs*3] = alpha * A[3+bs*0];
+ C[1+bs*2] = alpha * A[2+bs*1];
+ C[1+bs*3] = alpha * A[3+bs*1];
+ C[2+bs*(sdc+2)] = alpha * A[3+bs*2];
+ }
+ else
+ {
+ C[0+bs*1] = alpha * A[1+bs*0];
+ C[0+bs*2] = alpha * A[2+bs*0];
+ C[0+bs*3] = alpha * A[3+bs*0];
+ C[1+bs*2] = alpha * A[2+bs*1];
+ C[1+bs*3] = alpha * A[3+bs*1];
+ C[2+bs*3] = alpha * A[3+bs*2];
+ }
+ }
+
+ }
+
+
+
+// transpose of a general matrix, read along panels, write across panels
+void kernel_dgetr_3_lib4(int tri, int kmax, int kna, double alpha, double *A, double *C, int sdc)
+ {
+
+ if(tri==1)
+ {
+ // A is lower triangular, C is upper triangular
+ // kmax+1 3-wide + end 2x2 triangle
+
+ kmax += 1;
+ }
+
+ const int bs = 4;
+
+ int k;
+
+ k = 0;
+
+ if(kmax<kna)
+ goto cleanup_loop;
+
+ if(kna>0)
+ {
+ for( ; k<kna; k++)
+ {
+ C[0+bs*0] = alpha * A[0+bs*0];
+ C[0+bs*1] = alpha * A[1+bs*0];
+ C[0+bs*2] = alpha * A[2+bs*0];
+
+ C += 1;
+ A += bs;
+ }
+ C += bs*(sdc-1);
+ }
+
+ for( ; k<kmax-3; k+=4)
+ {
+ C[0+bs*0] = alpha * A[0+bs*0];
+ C[0+bs*1] = alpha * A[1+bs*0];
+ C[0+bs*2] = alpha * A[2+bs*0];
+
+ C[1+bs*0] = alpha * A[0+bs*1];
+ C[1+bs*1] = alpha * A[1+bs*1];
+ C[1+bs*2] = alpha * A[2+bs*1];
+
+ C[2+bs*0] = alpha * A[0+bs*2];
+ C[2+bs*1] = alpha * A[1+bs*2];
+ C[2+bs*2] = alpha * A[2+bs*2];
+
+ C[3+bs*0] = alpha * A[0+bs*3];
+ C[3+bs*1] = alpha * A[1+bs*3];
+ C[3+bs*2] = alpha * A[2+bs*3];
+
+ C += bs*sdc;
+ A += bs*bs;
+ }
+
+ cleanup_loop:
+
+ for( ; k<kmax; k++)
+ {
+ C[0+bs*0] = alpha * A[0+bs*0];
+ C[0+bs*1] = alpha * A[1+bs*0];
+ C[0+bs*2] = alpha * A[2+bs*0];
+
+ C += 1;
+ A += bs;
+ }
+
+ if(tri==1)
+ {
+ // end 2x2 triangle
+ kna = (bs-(bs-kna+kmax)%bs)%bs;
+
+ if(kna==1)
+ {
+ C[0+bs*1] = alpha * A[1+bs*0];
+ C[0+bs*2] = alpha * A[2+bs*0];
+ C[1+bs*(sdc+1)] = alpha * A[2+bs*1];
+ }
+ else
+ {
+ C[0+bs*1] = alpha * A[1+bs*0];
+ C[0+bs*2] = alpha * A[2+bs*0];
+ C[1+bs*2] = alpha * A[2+bs*1];
+ }
+ }
+
+ }
+
+
+
+// transpose of a general matrix, read along panels, write across panels
+void kernel_dgetr_2_lib4(int tri, int kmax, int kna, double alpha, double *A, double *C, int sdc)
+ {
+
+ if(tri==1)
+ {
+ // A is lower triangular, C is upper triangular
+ // kmax+1 2-wide + end 1x1 triangle
+
+ kmax += 1;
+ }
+
+ const int bs = 4;
+
+ int k;
+
+ k = 0;
+
+ if(kmax<kna)
+ goto cleanup_loop;
+
+ if(kna>0)
+ {
+ for( ; k<kna; k++)
+ {
+ C[0+bs*0] = alpha * A[0+bs*0];
+ C[0+bs*1] = alpha * A[1+bs*0];
+
+ C += 1;
+ A += bs;
+ }
+ C += bs*(sdc-1);
+ }
+
+ for( ; k<kmax-3; k+=4)
+ {
+ C[0+bs*0] = alpha * A[0+bs*0];
+ C[0+bs*1] = alpha * A[1+bs*0];
+
+ C[1+bs*0] = alpha * A[0+bs*1];
+ C[1+bs*1] = alpha * A[1+bs*1];
+
+ C[2+bs*0] = alpha * A[0+bs*2];
+ C[2+bs*1] = alpha * A[1+bs*2];
+
+ C[3+bs*0] = alpha * A[0+bs*3];
+ C[3+bs*1] = alpha * A[1+bs*3];
+
+ C += bs*sdc;
+ A += bs*bs;
+ }
+
+ cleanup_loop:
+
+ for( ; k<kmax; k++)
+ {
+ C[0+bs*0] = alpha * A[0+bs*0];
+ C[0+bs*1] = alpha * A[1+bs*0];
+
+ C += 1;
+ A += bs;
+ }
+
+ if(tri==1)
+ {
+ // end 1x1 triangle
+ C[0+bs*1] = alpha * A[1+bs*0];
+ }
+
+ }
+
+
+
+// transpose of a general matrix, read along panels, write across panels
+void kernel_dgetr_1_lib4(int tri, int kmax, int kna, double alpha, double *A, double *C, int sdc)
+ {
+
+ if(tri==1)
+ {
+ // A is lower triangular, C is upper triangular
+ // kmax+1 1-wide
+
+ kmax += 1;
+ }
+
+ const int bs = 4;
+
+ int k;
+
+ k = 0;
+
+ if(kmax<kna)
+ goto cleanup_loop;
+
+ if(kna>0)
+ {
+ for( ; k<kna; k++)
+ {
+ C[0+bs*0] = alpha * A[0+bs*0];
+
+ C += 1;
+ A += bs;
+ }
+ C += bs*(sdc-1);
+ }
+
+ for( ; k<kmax-3; k+=4)
+ {
+ C[0+bs*0] = alpha * A[0+bs*0];
+
+ C[1+bs*0] = alpha * A[0+bs*1];
+
+ C[2+bs*0] = alpha * A[0+bs*2];
+
+ C[3+bs*0] = alpha * A[0+bs*3];
+
+ C += bs*sdc;
+ A += bs*bs;
+ }
+
+ cleanup_loop:
+
+ for( ; k<kmax; k++)
+ {
+ C[0+bs*0] = alpha * A[0+bs*0];
+
+ C += 1;
+ A += bs;
+ }
+
+ }
+
+
+
+// transpose of a general matrix, read across panels, write along panels
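+// Unlike the kernels above there is no kna handling here: A and B are assumed
+// panel-aligned, a 4-column strip of A is read downwards across its panels
+// (stride sda) and written transposed along a single panel of B, one 4x4 block
+// per iteration of the main loop.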
+void kernel_dgetr_4_0_lib4(int kmax, double *A, int sda, double *B)
+ {
+ const int ps = 4;
+ int k;
+ for(k=0; k<kmax-3; k+=4)
+ {
+ //
+ B[0+ps*0] = A[0+ps*0];
+ B[0+ps*1] = A[1+ps*0];
+ B[0+ps*2] = A[2+ps*0];
+ B[0+ps*3] = A[3+ps*0];
+ //
+ B[1+ps*0] = A[0+ps*1];
+ B[1+ps*1] = A[1+ps*1];
+ B[1+ps*2] = A[2+ps*1];
+ B[1+ps*3] = A[3+ps*1];
+ //
+ B[2+ps*0] = A[0+ps*2];
+ B[2+ps*1] = A[1+ps*2];
+ B[2+ps*2] = A[2+ps*2];
+ B[2+ps*3] = A[3+ps*2];
+ //
+ B[3+ps*0] = A[0+ps*3];
+ B[3+ps*1] = A[1+ps*3];
+ B[3+ps*2] = A[2+ps*3];
+ B[3+ps*3] = A[3+ps*3];
+
+ A += ps*sda;
+ B += ps*ps;
+ }
+ for( ; k<kmax; k++)
+ {
+ //
+ B[0+ps*0] = A[0+ps*0];
+ B[1+ps*0] = A[0+ps*1];
+ B[2+ps*0] = A[0+ps*2];
+ B[3+ps*0] = A[0+ps*3];
+
+ A += 1;
+ B += ps;
+ }
+ return;
+ }
+
diff --git a/auxiliary/c99/kernel_sgetr_lib4.c b/auxiliary/c99/kernel_sgetr_lib4.c
new file mode 100644
index 0000000..4cf6fa2
--- /dev/null
+++ b/auxiliary/c99/kernel_sgetr_lib4.c
@@ -0,0 +1,370 @@
+/**************************************************************************************************
+* *
+* This file is part of BLASFEO. *
+* *
+* BLASFEO -- BLAS For Embedded Optimization. *
+* Copyright (C) 2016-2017 by Gianluca Frison. *
+* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl. *
+* All rights reserved. *
+* *
+* HPMPC is free software; you can redistribute it and/or *
+* modify it under the terms of the GNU Lesser General Public *
+* License as published by the Free Software Foundation; either *
+* version 2.1 of the License, or (at your option) any later version. *
+* *
+* HPMPC is distributed in the hope that it will be useful, *
+* but WITHOUT ANY WARRANTY; without even the implied warranty of *
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. *
+* See the GNU Lesser General Public License for more details. *
+* *
+* You should have received a copy of the GNU Lesser General Public *
+* License along with HPMPC; if not, write to the Free Software *
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA *
+* *
+* Author: Gianluca Frison, giaf (at) dtu.dk *
+* gianluca.frison (at) imtek.uni-freiburg.de *
+* *
+**************************************************************************************************/
+
+
+
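+// Single-precision counterparts of the kernels in kernel_dgetr_lib4.c: the panel
+// size (bs = 4), the kna alignment handling and the loop structure are the same,
+// only the element type is float.
+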
+// transpose of a general matrix, read along panels, write across panels
+void kernel_sgetr_4_lib4(int tri, int kmax, int kna, float alpha, float *A, float *C, int sdc)
+ {
+
+ if(tri==1)
+ {
+ // A is lower triangular, C is upper triangular
+ // kmax+1 4-wide + end 3x3 triangle
+
+ kmax += 1;
+ }
+
+ const int bs = 4;
+
+ int k;
+
+ k = 0;
+
+ if(kmax<kna)
+ goto cleanup_loop;
+
+ if(kna>0)
+ {
+ for( ; k<kna; k++)
+ {
+ C[0+bs*0] = alpha * A[0+bs*0];
+ C[0+bs*1] = alpha * A[1+bs*0];
+ C[0+bs*2] = alpha * A[2+bs*0];
+ C[0+bs*3] = alpha * A[3+bs*0];
+
+ C += 1;
+ A += bs;
+ }
+ C += bs*(sdc-1);
+ }
+
+ for( ; k<kmax-3; k+=4)
+ {
+ C[0+bs*0] = alpha * A[0+bs*0];
+ C[0+bs*1] = alpha * A[1+bs*0];
+ C[0+bs*2] = alpha * A[2+bs*0];
+ C[0+bs*3] = alpha * A[3+bs*0];
+
+ C[1+bs*0] = alpha * A[0+bs*1];
+ C[1+bs*1] = alpha * A[1+bs*1];
+ C[1+bs*2] = alpha * A[2+bs*1];
+ C[1+bs*3] = alpha * A[3+bs*1];
+
+ C[2+bs*0] = alpha * A[0+bs*2];
+ C[2+bs*1] = alpha * A[1+bs*2];
+ C[2+bs*2] = alpha * A[2+bs*2];
+ C[2+bs*3] = alpha * A[3+bs*2];
+
+ C[3+bs*0] = alpha * A[0+bs*3];
+ C[3+bs*1] = alpha * A[1+bs*3];
+ C[3+bs*2] = alpha * A[2+bs*3];
+ C[3+bs*3] = alpha * A[3+bs*3];
+
+ C += bs*sdc;
+ A += bs*bs;
+ }
+
+ cleanup_loop:
+
+ for( ; k<kmax; k++)
+ {
+ C[0+bs*0] = alpha * A[0+bs*0];
+ C[0+bs*1] = alpha * A[1+bs*0];
+ C[0+bs*2] = alpha * A[2+bs*0];
+ C[0+bs*3] = alpha * A[3+bs*0];
+
+ C += 1;
+ A += bs;
+ }
+
+ if(tri==1)
+ {
+ // end 3x3 triangle
+ kna = (bs-(bs-kna+kmax)%bs)%bs;
+
+ if(kna==1)
+ {
+ C[0+bs*1] = alpha * A[1+bs*0];
+ C[0+bs*2] = alpha * A[2+bs*0];
+ C[0+bs*3] = alpha * A[3+bs*0];
+ C[1+bs*(sdc+1)] = alpha * A[2+bs*1];
+ C[1+bs*(sdc+2)] = alpha * A[3+bs*1];
+ C[2+bs*(sdc+2)] = alpha * A[3+bs*2];
+ }
+ else if(kna==2)
+ {
+ C[0+bs*1] = alpha * A[1+bs*0];
+ C[0+bs*2] = alpha * A[2+bs*0];
+ C[0+bs*3] = alpha * A[3+bs*0];
+ C[1+bs*2] = alpha * A[2+bs*1];
+ C[1+bs*3] = alpha * A[3+bs*1];
+ C[2+bs*(sdc+2)] = alpha * A[3+bs*2];
+ }
+ else
+ {
+ C[0+bs*1] = alpha * A[1+bs*0];
+ C[0+bs*2] = alpha * A[2+bs*0];
+ C[0+bs*3] = alpha * A[3+bs*0];
+ C[1+bs*2] = alpha * A[2+bs*1];
+ C[1+bs*3] = alpha * A[3+bs*1];
+ C[2+bs*3] = alpha * A[3+bs*2];
+ }
+ }
+
+ }
+
+
+
+// transpose of a general matrix, read along panels, write across panels
+void kernel_sgetr_3_lib4(int tri, int kmax, int kna, float alpha, float *A, float *C, int sdc)
+ {
+
+ if(tri==1)
+ {
+ // A is lower triangular, C is upper triangular
+ // kmax+1 3-wide + end 2x2 triangle
+
+ kmax += 1;
+ }
+
+ const int bs = 4;
+
+ int k;
+
+ k = 0;
+
+ if(kmax<kna)
+ goto cleanup_loop;
+
+ if(kna>0)
+ {
+ for( ; k<kna; k++)
+ {
+ C[0+bs*0] = alpha * A[0+bs*0];
+ C[0+bs*1] = alpha * A[1+bs*0];
+ C[0+bs*2] = alpha * A[2+bs*0];
+
+ C += 1;
+ A += bs;
+ }
+ C += bs*(sdc-1);
+ }
+
+ for( ; k<kmax-3; k+=4)
+ {
+ C[0+bs*0] = alpha * A[0+bs*0];
+ C[0+bs*1] = alpha * A[1+bs*0];
+ C[0+bs*2] = alpha * A[2+bs*0];
+
+ C[1+bs*0] = alpha * A[0+bs*1];
+ C[1+bs*1] = alpha * A[1+bs*1];
+ C[1+bs*2] = alpha * A[2+bs*1];
+
+ C[2+bs*0] = alpha * A[0+bs*2];
+ C[2+bs*1] = alpha * A[1+bs*2];
+ C[2+bs*2] = alpha * A[2+bs*2];
+
+ C[3+bs*0] = alpha * A[0+bs*3];
+ C[3+bs*1] = alpha * A[1+bs*3];
+ C[3+bs*2] = alpha * A[2+bs*3];
+
+ C += bs*sdc;
+ A += bs*bs;
+ }
+
+ cleanup_loop:
+
+ for( ; k<kmax; k++)
+ {
+ C[0+bs*0] = alpha * A[0+bs*0];
+ C[0+bs*1] = alpha * A[1+bs*0];
+ C[0+bs*2] = alpha * A[2+bs*0];
+
+ C += 1;
+ A += bs;
+ }
+
+ if(tri==1)
+ {
+ // end 2x2 triangle
+ kna = (bs-(bs-kna+kmax)%bs)%bs;
+
+ if(kna==1)
+ {
+ C[0+bs*1] = alpha * A[1+bs*0];
+ C[0+bs*2] = alpha * A[2+bs*0];
+ C[1+bs*(sdc+1)] = alpha * A[2+bs*1];
+ }
+ else
+ {
+ C[0+bs*1] = alpha * A[1+bs*0];
+ C[0+bs*2] = alpha * A[2+bs*0];
+ C[1+bs*2] = alpha * A[2+bs*1];
+ }
+ }
+
+ }
+
+
+
+// transpose of a general matrix, read along panels, write across panels
+void kernel_sgetr_2_lib4(int tri, int kmax, int kna, float alpha, float *A, float *C, int sdc)
+ {
+
+ if(tri==1)
+ {
+ // A is lower triangular, C is upper triangular
+ // kmax+1 2-wide + end 1x1 triangle
+
+ kmax += 1;
+ }
+
+ const int bs = 4;
+
+ int k;
+
+ k = 0;
+
+ if(kmax<kna)
+ goto cleanup_loop;
+
+ if(kna>0)
+ {
+ for( ; k<kna; k++)
+ {
+ C[0+bs*0] = alpha * A[0+bs*0];
+ C[0+bs*1] = alpha * A[1+bs*0];
+
+ C += 1;
+ A += bs;
+ }
+ C += bs*(sdc-1);
+ }
+
+ for( ; k<kmax-3; k+=4)
+ {
+ C[0+bs*0] = alpha * A[0+bs*0];
+ C[0+bs*1] = alpha * A[1+bs*0];
+
+ C[1+bs*0] = alpha * A[0+bs*1];
+ C[1+bs*1] = alpha * A[1+bs*1];
+
+ C[2+bs*0] = alpha * A[0+bs*2];
+ C[2+bs*1] = alpha * A[1+bs*2];
+
+ C[3+bs*0] = alpha * A[0+bs*3];
+ C[3+bs*1] = alpha * A[1+bs*3];
+
+ C += bs*sdc;
+ A += bs*bs;
+ }
+
+ cleanup_loop:
+
+ for( ; k<kmax; k++)
+ {
+ C[0+bs*0] = alpha * A[0+bs*0];
+ C[0+bs*1] = alpha * A[1+bs*0];
+
+ C += 1;
+ A += bs;
+ }
+
+ if(tri==1)
+ {
+ // end 1x1 triangle
+ C[0+bs*1] = alpha * A[1+bs*0];
+ }
+
+ }
+
+
+
+// transpose of a general matrix, read along panels, write across panels
+void kernel_sgetr_1_lib4(int tri, int kmax, int kna, float alpha, float *A, float *C, int sdc)
+ {
+
+ if(tri==1)
+ {
+ // A is lower triangular, C is upper triangular
+ // kmax+1 1-wide
+
+ kmax += 1;
+ }
+
+ const int bs = 4;
+
+ int k;
+
+ k = 0;
+
+ if(kmax<kna)
+ goto cleanup_loop;
+
+ if(kna>0)
+ {
+ for( ; k<kna; k++)
+ {
+ C[0+bs*0] = alpha * A[0+bs*0];
+
+ C += 1;
+ A += bs;
+ }
+ C += bs*(sdc-1);
+ }
+
+ for( ; k<kmax-3; k+=4)
+ {
+ C[0+bs*0] = alpha * A[0+bs*0];
+
+ C[1+bs*0] = alpha * A[0+bs*1];
+
+ C[2+bs*0] = alpha * A[0+bs*2];
+
+ C[3+bs*0] = alpha * A[0+bs*3];
+
+ C += bs*sdc;
+ A += bs*bs;
+ }
+
+ cleanup_loop:
+
+ for( ; k<kmax; k++)
+ {
+ C[0+bs*0] = alpha * A[0+bs*0];
+
+ C += 1;
+ A += bs;
+ }
+
+ }
+
+
+
+
diff --git a/auxiliary/d_aux_ext_dep_lib.c b/auxiliary/d_aux_ext_dep_lib.c
new file mode 100644
index 0000000..c12da10
--- /dev/null
+++ b/auxiliary/d_aux_ext_dep_lib.c
@@ -0,0 +1,632 @@
+/**************************************************************************************************
+* *
+* This file is part of BLASFEO. *
+* *
+* BLASFEO -- BLAS For Embedded Optimization. *
+* Copyright (C) 2016-2017 by Gianluca Frison. *
+* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl. *
+* All rights reserved. *
+* *
+* HPMPC is free software; you can redistribute it and/or *
+* modify it under the terms of the GNU Lesser General Public *
+* License as published by the Free Software Foundation; either *
+* version 2.1 of the License, or (at your option) any later version. *
+* *
+* HPMPC is distributed in the hope that it will be useful, *
+* but WITHOUT ANY WARRANTY; without even the implied warranty of *
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. *
+* See the GNU Lesser General Public License for more details. *
+* *
+* You should have received a copy of the GNU Lesser General Public *
+* License along with HPMPC; if not, write to the Free Software *
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA *
+* *
+* Author: Gianluca Frison, giaf (at) dtu.dk *
+* gianluca.frison (at) imtek.uni-freiburg.de *
+* *
+**************************************************************************************************/
+
+#include <stdlib.h>
+#include <stdio.h>
+#if 0
+#include <malloc.h>
+#endif
+
+#include "../include/blasfeo_common.h"
+
+
+
+#if ! defined(OS_WINDOWS)
+int posix_memalign(void **memptr, size_t alignment, size_t size);
+#endif
+
+
+
+/* creates a zero matrix */
+void d_zeros(double **pA, int row, int col)
+ {
+ *pA = malloc((row*col)*sizeof(double));
+ double *A = *pA;
+ int i;
+ for(i=0; i<row*col; i++) A[i] = 0.0;
+ }
+
+
+
+/* creates a zero matrix aligned to a cache line */
+void d_zeros_align(double **pA, int row, int col)
+ {
+#if defined(OS_WINDOWS)
+ *pA = (double *) _aligned_malloc( (row*col)*sizeof(double), 64 );
+#else
+ void *temp;
+ int err = posix_memalign(&temp, 64, (row*col)*sizeof(double));
+ if(err!=0)
+ {
+		printf("Memory allocation error\n");
+ exit(1);
+ }
+ *pA = temp;
+#endif
+ double *A = *pA;
+ int i;
+ for(i=0; i<row*col; i++) A[i] = 0.0;
+ }
+
+
+
+/* frees matrix */
+void d_free(double *pA)
+ {
+ free( pA );
+ }
+
+
+
+/* frees aligned matrix */
+void d_free_align(double *pA)
+ {
+#if defined(OS_WINDOWS)
+ _aligned_free( pA );
+#else
+ free( pA );
+#endif
+ }
+
+
+
+/* prints a matrix in column-major format */
+void d_print_mat(int m, int n, double *A, int lda)
+ {
+ int i, j;
+ for(i=0; i<m; i++)
+ {
+ for(j=0; j<n; j++)
+ {
+ printf("%9.5f ", A[i+lda*j]);
+ }
+ printf("\n");
+ }
+ printf("\n");
+ }
+
+
+
+/* prints the transpose of a matrix in column-major format */
+void d_print_tran_mat(int row, int col, double *A, int lda)
+ {
+ int i, j;
+ for(j=0; j<col; j++)
+ {
+ for(i=0; i<row; i++)
+ {
+ printf("%9.5f ", A[i+lda*j]);
+ }
+ printf("\n");
+ }
+ printf("\n");
+ }
+
+
+
+/* prints a matrix to a file in column-major format */
+void d_print_to_file_mat(FILE *file, int row, int col, double *A, int lda)
+ {
+ int i, j;
+ for(i=0; i<row; i++)
+ {
+ for(j=0; j<col; j++)
+ {
+ fprintf(file, "%9.5f ", A[i+lda*j]);
+ }
+ fprintf(file, "\n");
+ }
+ fprintf(file, "\n");
+ }
+
+
+
+/* prints the transpose of a matrix to a file in column-major format */
+void d_print_tran_to_file_mat(FILE *file, int row, int col, double *A, int lda)
+ {
+ int i, j;
+ for(j=0; j<col; j++)
+ {
+ for(i=0; i<row; i++)
+ {
+ fprintf(file, "%9.5f ", A[i+lda*j]);
+ }
+ fprintf(file, "\n");
+ }
+ fprintf(file, "\n");
+ }
+
+
+
+/* prints a matrix in column-major format (exponential notation) */
+void d_print_e_mat(int m, int n, double *A, int lda)
+ {
+ int i, j;
+ for(i=0; i<m; i++)
+ {
+ for(j=0; j<n; j++)
+ {
+ printf("%1.15e\t", A[i+lda*j]);
+ }
+ printf("\n");
+ }
+ printf("\n");
+ }
+
+
+
+/* prints the transpose of a matrix in column-major format (exponential notation) */
+void d_print_e_tran_mat(int row, int col, double *A, int lda)
+ {
+ int i, j;
+ for(j=0; j<col; j++)
+ {
+ for(i=0; i<row; i++)
+ {
+ printf("%e\t", A[i+lda*j]);
+ }
+ printf("\n");
+ }
+ printf("\n");
+ }
+
+
+
+/****************************
+* new interface
+****************************/
+
+#if defined(LA_HIGH_PERFORMANCE)
+
+
+
+#include "../include/blasfeo_block_size.h"
+
+
+
+// create a matrix structure for a matrix of size m*n by dynamically allocating the memory
+void d_allocate_strmat(int m, int n, struct d_strmat *sA)
+ {
+ const int bs = D_PS;
+ int nc = D_NC;
+ int al = bs*nc;
+ sA->m = m;
+ sA->n = n;
+ int pm = (m+bs-1)/bs*bs;
+ int cn = (n+nc-1)/nc*nc;
+ sA->pm = pm;
+ sA->cn = cn;
+ d_zeros_align(&(sA->pA), sA->pm, sA->cn);
+ int tmp = m<n ? (m+al-1)/al*al : (n+al-1)/al*al; // al(min(m,n)) // XXX max ???
+ d_zeros_align(&(sA->dA), tmp, 1);
+ sA->use_dA = 0;
+ sA->memory_size = (pm*cn+tmp)*sizeof(double);
+ return;
+ }
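+
+// A minimal caller-side sketch (shown only as a comment; hypothetical user code):
+//
+//	struct d_strmat sA;
+//	d_allocate_strmat(5, 5, &sA);    // allocate and zero 5x5 panel-major storage
+//	d_print_strmat(5, 5, &sA, 0, 0); // print the (zero) matrix
+//	d_free_strmat(&sA);              // release the dynamically allocated memory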
+
+
+
+// free memory of a matrix structure
+void d_free_strmat(struct d_strmat *sA)
+ {
+ d_free_align(sA->pA);
+ d_free_align(sA->dA);
+ return;
+ }
+
+
+
+// create a vector structure for a vector of size m by dynamically allocating the memory
+void d_allocate_strvec(int m, struct d_strvec *sa)
+ {
+ const int bs = D_PS;
+// int nc = D_NC;
+// int al = bs*nc;
+ sa->m = m;
+ int pm = (m+bs-1)/bs*bs;
+ sa->pm = pm;
+ d_zeros_align(&(sa->pa), sa->pm, 1);
+ sa->memory_size = pm*sizeof(double);
+ return;
+ }
+
+
+
+// free memory of a vector structure
+void d_free_strvec(struct d_strvec *sa)
+ {
+ d_free_align(sa->pa);
+ return;
+ }
+
+
+
+// print a matrix structure
+void d_print_strmat(int m, int n, struct d_strmat *sA, int ai, int aj)
+ {
+ const int bs = D_PS;
+ int sda = sA->cn;
+ double *pA = sA->pA + aj*bs + ai/bs*bs*sda + ai%bs;
+ int ii, i, j, tmp;
+ ii = 0;
+ if(ai%bs>0)
+ {
+ tmp = bs-ai%bs;
+ tmp = m<tmp ? m : tmp;
+ for(i=0; i<tmp; i++)
+ {
+ for(j=0; j<n; j++)
+ {
+ printf("%9.5f ", pA[i+bs*j]);
+ }
+ printf("\n");
+ }
+ pA += tmp + bs*(sda-1);
+ m -= tmp;
+ }
+ for( ; ii<m-(bs-1); ii+=bs)
+ {
+ for(i=0; i<bs; i++)
+ {
+ for(j=0; j<n; j++)
+ {
+ printf("%9.5f ", pA[i+bs*j+sda*ii]);
+ }
+ printf("\n");
+ }
+ }
+ if(ii<m)
+ {
+ tmp = m-ii;
+ for(i=0; i<tmp; i++)
+ {
+ for(j=0; j<n; j++)
+ {
+ printf("%9.5f ", pA[i+bs*j+sda*ii]);
+ }
+ printf("\n");
+ }
+ }
+ printf("\n");
+ return;
+ }
+
+
+
+// print a vector structure
+void d_print_strvec(int m, struct d_strvec *sa, int ai)
+ {
+ double *pa = sa->pa + ai;
+ d_print_mat(m, 1, pa, m);
+ return;
+ }
+
+
+
+// print the transpose of a vector structure
+void d_print_tran_strvec(int m, struct d_strvec *sa, int ai)
+ {
+ double *pa = sa->pa + ai;
+ d_print_mat(1, m, pa, 1);
+ return;
+ }
+
+
+
+// print a matrix structure
+void d_print_to_file_strmat(FILE * file, int m, int n, struct d_strmat *sA, int ai, int aj)
+ {
+ const int bs = D_PS;
+ int sda = sA->cn;
+ double *pA = sA->pA + aj*bs + ai/bs*bs*sda + ai%bs;
+ int ii, i, j, tmp;
+ ii = 0;
+ if(ai%bs>0)
+ {
+ tmp = bs-ai%bs;
+ tmp = m<tmp ? m : tmp;
+ for(i=0; i<tmp; i++)
+ {
+ for(j=0; j<n; j++)
+ {
+ fprintf(file, "%9.5f ", pA[i+bs*j]);
+ }
+ fprintf(file, "\n");
+ }
+ pA += tmp + bs*(sda-1);
+ m -= tmp;
+ }
+ for( ; ii<m-(bs-1); ii+=bs)
+ {
+ for(i=0; i<bs; i++)
+ {
+ for(j=0; j<n; j++)
+ {
+ fprintf(file, "%9.5f ", pA[i+bs*j+sda*ii]);
+ }
+ fprintf(file, "\n");
+ }
+ }
+ if(ii<m)
+ {
+ tmp = m-ii;
+ for(i=0; i<tmp; i++)
+ {
+ for(j=0; j<n; j++)
+ {
+ fprintf(file, "%9.5f ", pA[i+bs*j+sda*ii]);
+ }
+ fprintf(file, "\n");
+ }
+ }
+ fprintf(file, "\n");
+ return;
+ }
+
+
+
+// print a vector structure
+void d_print_to_file_strvec(FILE * file, int m, struct d_strvec *sa, int ai)
+ {
+ double *pa = sa->pa + ai;
+ d_print_to_file_mat(file, m, 1, pa, m);
+ return;
+ }
+
+
+
+// print the transpose of a vector structure
+void d_print_tran_to_file_strvec(FILE * file, int m, struct d_strvec *sa, int ai)
+ {
+ double *pa = sa->pa + ai;
+ d_print_to_file_mat(file, 1, m, pa, 1);
+ return;
+ }
+
+
+
+// print a matrix structure
+void d_print_e_strmat(int m, int n, struct d_strmat *sA, int ai, int aj)
+ {
+ const int bs = D_PS;
+ int sda = sA->cn;
+ double *pA = sA->pA + aj*bs + ai/bs*bs*sda + ai%bs;
+ int ii, i, j, tmp;
+ ii = 0;
+ if(ai%bs>0)
+ {
+ tmp = bs-ai%bs;
+ tmp = m<tmp ? m : tmp;
+ for(i=0; i<tmp; i++)
+ {
+ for(j=0; j<n; j++)
+ {
+ printf("%e\t", pA[i+bs*j]);
+ }
+ printf("\n");
+ }
+ pA += tmp + bs*(sda-1);
+ m -= tmp;
+ }
+ for( ; ii<m-(bs-1); ii+=bs)
+ {
+ for(i=0; i<bs; i++)
+ {
+ for(j=0; j<n; j++)
+ {
+ printf("%e\t", pA[i+bs*j+sda*ii]);
+ }
+ printf("\n");
+ }
+ }
+ if(ii<m)
+ {
+ tmp = m-ii;
+ for(i=0; i<tmp; i++)
+ {
+ for(j=0; j<n; j++)
+ {
+ printf("%e\t", pA[i+bs*j+sda*ii]);
+ }
+ printf("\n");
+ }
+ }
+ printf("\n");
+ return;
+ }
+
+
+
+// print a vector structure
+void d_print_e_strvec(int m, struct d_strvec *sa, int ai)
+ {
+ double *pa = sa->pa + ai;
+ d_print_e_mat(m, 1, pa, m);
+ return;
+ }
+
+
+
+// print the transpose of a vector structure
+void d_print_e_tran_strvec(int m, struct d_strvec *sa, int ai)
+ {
+ double *pa = sa->pa + ai;
+ d_print_e_mat(1, m, pa, 1);
+ return;
+ }
+
+
+
+#elif defined(LA_BLAS) | defined(LA_REFERENCE)
+
+
+
+// create a matrix structure for a matrix of size m*n
+void d_allocate_strmat(int m, int n, struct d_strmat *sA)
+ {
+ sA->m = m;
+ sA->n = n;
+ d_zeros(&(sA->pA), sA->m, sA->n);
+ int tmp = m<n ? m : n; // al(min(m,n)) // XXX max ???
+ d_zeros(&(sA->dA), tmp, 1);
+ sA->memory_size = (m*n+tmp)*sizeof(double);
+ return;
+ }
+
+
+
+// free memory of a matrix structure
+void d_free_strmat(struct d_strmat *sA)
+ {
+ free(sA->pA);
+ free(sA->dA);
+ return;
+ }
+
+
+
+// create a vector structure for a vector of size m
+void d_allocate_strvec(int m, struct d_strvec *sa)
+ {
+ sa->m = m;
+ d_zeros(&(sa->pa), sa->m, 1);
+ sa->memory_size = m*sizeof(double);
+ return;
+ }
+
+
+
+// free memory of a vector structure
+void d_free_strvec(struct d_strvec *sa)
+ {
+ free(sa->pa);
+ return;
+ }
+
+
+
+// print a matrix structure
+void d_print_strmat(int m, int n, struct d_strmat *sA, int ai, int aj)
+ {
+ int lda = sA->m;
+ double *pA = sA->pA + ai + aj*lda;
+ d_print_mat(m, n, pA, lda);
+ return;
+ }
+
+
+
+// print a vector structure
+void d_print_strvec(int m, struct d_strvec *sa, int ai)
+ {
+ double *pa = sa->pa + ai;
+ d_print_mat(m, 1, pa, m);
+ return;
+ }
+
+
+
+// print the transpose of a vector structure
+void d_print_tran_strvec(int m, struct d_strvec *sa, int ai)
+ {
+ double *pa = sa->pa + ai;
+ d_print_mat(1, m, pa, 1);
+ return;
+ }
+
+
+
+// print a matrix structure
+void d_print_to_file_strmat(FILE *file, int m, int n, struct d_strmat *sA, int ai, int aj)
+ {
+ int lda = sA->m;
+ double *pA = sA->pA + ai + aj*lda;
+ d_print_to_file_mat(file, m, n, pA, lda);
+ return;
+ }
+
+
+
+// print a vector structure
+void d_print_to_file_strvec(FILE *file, int m, struct d_strvec *sa, int ai)
+ {
+ double *pa = sa->pa + ai;
+ d_print_to_file_mat(file, m, 1, pa, m);
+ return;
+ }
+
+
+
+// print the transpose of a vector structure
+void d_print_to_file_tran_strvec(FILE *file, int m, struct d_strvec *sa, int ai)
+ {
+ double *pa = sa->pa + ai;
+ d_print_to_file_mat(file, 1, m, pa, 1);
+ return;
+ }
+
+
+
+// print a matrix structure
+void d_print_e_strmat(int m, int n, struct d_strmat *sA, int ai, int aj)
+ {
+ int lda = sA->m;
+ double *pA = sA->pA + ai + aj*lda;
+ d_print_e_mat(m, n, pA, lda);
+ return;
+ }
+
+
+
+// print a vector structure
+void d_print_e_strvec(int m, struct d_strvec *sa, int ai)
+ {
+ double *pa = sa->pa + ai;
+ d_print_e_mat(m, 1, pa, m);
+ return;
+ }
+
+
+
+// print the transpose of a vector structure
+void d_print_e_tran_strvec(int m, struct d_strvec *sa, int ai)
+ {
+ double *pa = sa->pa + ai;
+ d_print_e_mat(1, m, pa, 1);
+ return;
+ }
+
+
+
+#else
+
+#error : wrong LA choice
+
+#endif
+
+
diff --git a/auxiliary/d_aux_lib.c b/auxiliary/d_aux_lib.c
new file mode 100644
index 0000000..6f1f5d1
--- /dev/null
+++ b/auxiliary/d_aux_lib.c
@@ -0,0 +1,982 @@
+/**************************************************************************************************
+* *
+* This file is part of BLASFEO. *
+* *
+* BLASFEO -- BLAS For Embedded Optimization. *
+* Copyright (C) 2016-2017 by Gianluca Frison. *
+* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl. *
+* All rights reserved. *
+* *
+* HPMPC is free software; you can redistribute it and/or *
+* modify it under the terms of the GNU Lesser General Public *
+* License as published by the Free Software Foundation; either *
+* version 2.1 of the License, or (at your option) any later version. *
+* *
+* HPMPC is distributed in the hope that it will be useful, *
+* but WITHOUT ANY WARRANTY; without even the implied warranty of *
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. *
+* See the GNU Lesser General Public License for more details. *
+* *
+* You should have received a copy of the GNU Lesser General Public *
+* License along with HPMPC; if not, write to the Free Software *
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA *
+* *
+* Author: Gianluca Frison, giaf (at) dtu.dk *
+* gianluca.frison (at) imtek.uni-freiburg.de *
+* *
+**************************************************************************************************/
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <math.h>
+
+#include "../include/blasfeo_common.h"
+
+
+
+#if defined(LA_REFERENCE) | defined(LA_BLAS)
+
+
+
+// return memory size (in bytes) needed for a strmat
+int d_size_strmat(int m, int n)
+ {
+ int tmp = m<n ? m : n; // al(min(m,n)) // XXX max ???
+ int size = (m*n+tmp)*sizeof(double);
+ return size;
+ }
+
+
+
+// return memory size (in bytes) needed for the diagonal of a strmat
+int d_size_diag_strmat(int m, int n)
+ {
+ int size = 0;
+ int tmp = m<n ? m : n; // al(min(m,n)) // XXX max ???
+ size = tmp*sizeof(double);
+ return size;
+ }
+
+
+
+// create a matrix structure for a matrix of size m*n by using memory passed by a pointer
+void d_create_strmat(int m, int n, struct d_strmat *sA, void *memory)
+ {
+ sA->m = m;
+ sA->n = n;
+ double *ptr = (double *) memory;
+ sA->pA = ptr;
+ ptr += m*n;
+ int tmp = m<n ? m : n; // al(min(m,n)) // XXX max ???
+ sA->dA = ptr;
+ ptr += tmp;
+ sA->use_dA = 0;
+ sA->memory_size = (m*n+tmp)*sizeof(double);
+ return;
+ }
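+
+// A minimal caller-side sketch (shown only as a comment; hypothetical user code)
+// of creating a strmat on memory owned by the caller:
+//
+//	void *mem = malloc(d_size_strmat(m, n));
+//	struct d_strmat sA;
+//	d_create_strmat(m, n, &sA, mem);
+//	// ... use sA ...
+//	free(mem);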
+
+
+
+// return memory size (in bytes) needed for a strvec
+int d_size_strvec(int m)
+ {
+ int size = m*sizeof(double);
+ return size;
+ }
+
+
+
+// create a vector structure for a vector of size m by using memory passed by a pointer
+void d_create_strvec(int m, struct d_strvec *sa, void *memory)
+ {
+ sa->m = m;
+ double *ptr = (double *) memory;
+ sa->pa = ptr;
+// ptr += m * n;
+ sa->memory_size = m*sizeof(double);
+ return;
+ }
+
+
+
+// convert a matrix into a matrix structure
+void d_cvt_mat2strmat(int m, int n, double *A, int lda, struct d_strmat *sA, int ai, int aj)
+ {
+ int ii, jj;
+ int lda2 = sA->m;
+ double *pA = sA->pA + ai + aj*lda2;
+ for(jj=0; jj<n; jj++)
+ {
+ ii = 0;
+ for(; ii<m-3; ii+=4)
+ {
+ pA[ii+0+jj*lda2] = A[ii+0+jj*lda];
+ pA[ii+1+jj*lda2] = A[ii+1+jj*lda];
+ pA[ii+2+jj*lda2] = A[ii+2+jj*lda];
+ pA[ii+3+jj*lda2] = A[ii+3+jj*lda];
+ }
+ for(; ii<m; ii++)
+ {
+ pA[ii+0+jj*lda2] = A[ii+0+jj*lda];
+ }
+ }
+ return;
+ }
+
+
+
+// convert and transpose a matrix into a matrix structure
+void d_cvt_tran_mat2strmat(int m, int n, double *A, int lda, struct d_strmat *sA, int ai, int aj)
+ {
+ int ii, jj;
+ int lda2 = sA->m;
+ double *pA = sA->pA + ai + aj*lda2;
+ for(jj=0; jj<n; jj++)
+ {
+ ii = 0;
+ for(; ii<m-3; ii+=4)
+ {
+ pA[jj+(ii+0)*lda2] = A[ii+0+jj*lda];
+ pA[jj+(ii+1)*lda2] = A[ii+1+jj*lda];
+ pA[jj+(ii+2)*lda2] = A[ii+2+jj*lda];
+ pA[jj+(ii+3)*lda2] = A[ii+3+jj*lda];
+ }
+ for(; ii<m; ii++)
+ {
+ pA[jj+(ii+0)*lda2] = A[ii+0+jj*lda];
+ }
+ }
+ return;
+ }
+
+
+
+// convert a vector into a vector structure
+void d_cvt_vec2strvec(int m, double *a, struct d_strvec *sa, int ai)
+ {
+ double *pa = sa->pa + ai;
+ int ii;
+ for(ii=0; ii<m; ii++)
+ pa[ii] = a[ii];
+ return;
+ }
+
+
+
+// convert a matrix structure into a matrix
+void d_cvt_strmat2mat(int m, int n, struct d_strmat *sA, int ai, int aj, double *A, int lda)
+ {
+ int ii, jj;
+ int lda2 = sA->m;
+ double *pA = sA->pA + ai + aj*lda2;
+ for(jj=0; jj<n; jj++)
+ {
+ ii = 0;
+ for(; ii<m-3; ii+=4)
+ {
+ A[ii+0+jj*lda] = pA[ii+0+jj*lda2];
+ A[ii+1+jj*lda] = pA[ii+1+jj*lda2];
+ A[ii+2+jj*lda] = pA[ii+2+jj*lda2];
+ A[ii+3+jj*lda] = pA[ii+3+jj*lda2];
+ }
+ for(; ii<m; ii++)
+ {
+ A[ii+0+jj*lda] = pA[ii+0+jj*lda2];
+ }
+ }
+ return;
+ }
+
+
+
+// convert and transpose a matrix structure into a matrix
+void d_cvt_tran_strmat2mat(int m, int n, struct d_strmat *sA, int ai, int aj, double *A, int lda)
+ {
+ int ii, jj;
+ int lda2 = sA->m;
+ double *pA = sA->pA + ai + aj*lda2;
+ for(jj=0; jj<n; jj++)
+ {
+ ii = 0;
+ for(; ii<m-3; ii+=4)
+ {
+ A[jj+(ii+0)*lda] = pA[ii+0+jj*lda2];
+ A[jj+(ii+1)*lda] = pA[ii+1+jj*lda2];
+ A[jj+(ii+2)*lda] = pA[ii+2+jj*lda2];
+ A[jj+(ii+3)*lda] = pA[ii+3+jj*lda2];
+ }
+ for(; ii<m; ii++)
+ {
+ A[jj+(ii+0)*lda] = pA[ii+0+jj*lda2];
+ }
+ }
+ return;
+ }
+
+
+
+// convert a vector structure into a vector
+void d_cvt_strvec2vec(int m, struct d_strvec *sa, int ai, double *a)
+ {
+ double *pa = sa->pa + ai;
+ int ii;
+ for(ii=0; ii<m; ii++)
+ a[ii] = pa[ii];
+ return;
+ }
+
+
+
+// cast a matrix into a matrix structure
+void d_cast_mat2strmat(double *A, struct d_strmat *sA)
+ {
+ sA->pA = A;
+ return;
+ }
+
+
+
+// cast a matrix into the diagonal of a matrix structure
+void d_cast_diag_mat2strmat(double *dA, struct d_strmat *sA)
+ {
+ sA->dA = dA;
+ return;
+ }
+
+
+
+// cast a vector into a vector structure
+void d_cast_vec2vecmat(double *a, struct d_strvec *sa)
+ {
+ sa->pa = a;
+ return;
+ }
+
+
+
+// insert element into strmat
+void dgein1_libstr(double a, struct d_strmat *sA, int ai, int aj)
+ {
+ int lda = sA->m;
+ double *pA = sA->pA + ai + aj*lda;
+ pA[0] = a;
+ return;
+ }
+
+
+
+// extract element from strmat
+double dgeex1_libstr(struct d_strmat *sA, int ai, int aj)
+ {
+ int lda = sA->m;
+ double *pA = sA->pA + ai + aj*lda;
+ return pA[0];
+ }
+
+
+
+// insert element into strvec
+void dvecin1_libstr(double a, struct d_strvec *sx, int xi)
+ {
+ double *x = sx->pa + xi;
+ x[0] = a;
+ return;
+ }
+
+
+
+// extract element from strvec
+double dvecex1_libstr(struct d_strvec *sx, int xi)
+ {
+ double *x = sx->pa + xi;
+ return x[0];
+ }
+
+
+
+// set all elements of a strmat to a value
+void dgese_libstr(int m, int n, double alpha, struct d_strmat *sA, int ai, int aj)
+ {
+ int lda = sA->m;
+ double *pA = sA->pA + ai + aj*lda;
+ int ii, jj;
+ for(jj=0; jj<n; jj++)
+ {
+ for(ii=0; ii<m; ii++)
+ {
+ pA[ii+lda*jj] = alpha;
+ }
+ }
+ return;
+ }
+
+
+
+// set all elements of a strvec to a value
+void dvecse_libstr(int m, double alpha, struct d_strvec *sx, int xi)
+ {
+ double *x = sx->pa + xi;
+ int ii;
+ for(ii=0; ii<m; ii++)
+ x[ii] = alpha;
+ return;
+ }
+
+
+
+// insert a vector into diagonal
+void ddiain_libstr(int kmax, double alpha, struct d_strvec *sx, int xi, struct d_strmat *sA, int ai, int aj)
+ {
+ int lda = sA->m;
+ double *pA = sA->pA + ai + aj*lda;
+ double *x = sx->pa + xi;
+ int ii;
+ for(ii=0; ii<kmax; ii++)
+ pA[ii*(lda+1)] = alpha*x[ii];
+ return;
+ }
+
+
+
+// add scalar to diagonal
+void ddiare_libstr(int kmax, double alpha, struct d_strmat *sA, int ai, int aj)
+ {
+ int lda = sA->m;
+ double *pA = sA->pA + ai + aj*lda;
+ int ii;
+ for(ii=0; ii<kmax; ii++)
+ pA[ii*(lda+1)] += alpha;
+ return;
+ }
+
+
+
+// extract a row into a vector
+void drowex_libstr(int kmax, double alpha, struct d_strmat *sA, int ai, int aj, struct d_strvec *sx, int xi)
+ {
+ int lda = sA->m;
+ double *pA = sA->pA + ai + aj*lda;
+ double *x = sx->pa + xi;
+ int ii;
+ for(ii=0; ii<kmax; ii++)
+ x[ii] = alpha*pA[ii*lda];
+ return;
+ }
+
+
+
+// insert a vector into a row
+void drowin_libstr(int kmax, double alpha, struct d_strvec *sx, int xi, struct d_strmat *sA, int ai, int aj)
+ {
+ int lda = sA->m;
+ double *pA = sA->pA + ai + aj*lda;
+ double *x = sx->pa + xi;
+ int ii;
+ for(ii=0; ii<kmax; ii++)
+ pA[ii*lda] = alpha*x[ii];
+ return;
+ }
+
+
+
+// add a vector to a row
+void drowad_libstr(int kmax, double alpha, struct d_strvec *sx, int xi, struct d_strmat *sA, int ai, int aj)
+ {
+ int lda = sA->m;
+ double *pA = sA->pA + ai + aj*lda;
+ double *x = sx->pa + xi;
+ int ii;
+ for(ii=0; ii<kmax; ii++)
+ pA[ii*lda] += alpha*x[ii];
+ return;
+ }
+
+
+
+// swap two rows of a matrix struct
+void drowsw_libstr(int kmax, struct d_strmat *sA, int ai, int aj, struct d_strmat *sC, int ci, int cj)
+ {
+ int lda = sA->m;
+ double *pA = sA->pA + ai + aj*lda;
+ int ldc = sC->m;
+	double *pC = sC->pA + ci + cj*ldc;
+ int ii;
+ double tmp;
+ for(ii=0; ii<kmax; ii++)
+ {
+ tmp = pA[ii*lda];
+ pA[ii*lda] = pC[ii*ldc];
+ pC[ii*ldc] = tmp;
+ }
+ return;
+ }
+
+
+
+// permute the rows of a matrix struct
+void drowpe_libstr(int kmax, int *ipiv, struct d_strmat *sA)
+ {
+ int ii;
+ for(ii=0; ii<kmax; ii++)
+ {
+ if(ipiv[ii]!=ii)
+ drowsw_libstr(sA->n, sA, ii, 0, sA, ipiv[ii], 0);
+ }
+ return;
+ }
+
+
+
+// extract vector from column
+void dcolex_libstr(int kmax, struct d_strmat *sA, int ai, int aj, struct d_strvec *sx, int xi)
+ {
+ int lda = sA->m;
+ double *pA = sA->pA + ai + aj*lda;
+ double *x = sx->pa + xi;
+ int ii;
+ for(ii=0; ii<kmax; ii++)
+ x[ii] = pA[ii];
+ return;
+ }
+
+
+
+// insert a vector into a column
+void dcolin_libstr(int kmax, struct d_strvec *sx, int xi, struct d_strmat *sA, int ai, int aj)
+ {
+ int lda = sA->m;
+ double *pA = sA->pA + ai + aj*lda;
+ double *x = sx->pa + xi;
+ int ii;
+ for(ii=0; ii<kmax; ii++)
+ pA[ii] = x[ii];
+ return;
+ }
+
+
+
+// swap two cols of a matrix struct
+void dcolsw_libstr(int kmax, struct d_strmat *sA, int ai, int aj, struct d_strmat *sC, int ci, int cj)
+ {
+ int lda = sA->m;
+ double *pA = sA->pA + ai + aj*lda;
+ int ldc = sC->m;
+	double *pC = sC->pA + ci + cj*ldc;
+ int ii;
+ double tmp;
+ for(ii=0; ii<kmax; ii++)
+ {
+ tmp = pA[ii];
+ pA[ii] = pC[ii];
+ pC[ii] = tmp;
+ }
+ return;
+ }
+
+
+
+// permute the cols of a matrix struct
+void dcolpe_libstr(int kmax, int *ipiv, struct d_strmat *sA)
+ {
+ int ii;
+ for(ii=0; ii<kmax; ii++)
+ {
+ if(ipiv[ii]!=ii)
+ dcolsw_libstr(sA->m, sA, 0, ii, sA, 0, ipiv[ii]);
+ }
+ return;
+ }
+
+
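+/*
+Usage sketch for the row-permutation helpers above: drowpe_libstr swaps row ii with row
+ipiv[ii] (0-based indices) for ii = 0..kmax-1, skipping fixed points. A minimal sketch,
+assuming the strmat creation helpers d_size_strmat / d_create_strmat with the usual
+signatures and plain malloc() for the memory:
+
+	double A[9] = {1, 2, 3,   // column 0
+	               4, 5, 6,   // column 1
+	               7, 8, 9};  // column 2, column-major with lda = 3
+	int ipiv[3] = {2, 1, 2};  // swap rows 0 and 2; rows 1 and 2 stay in place
+	struct d_strmat sA;
+	void *mem = malloc(d_size_strmat(3, 3));
+	d_create_strmat(3, 3, &sA, mem);
+	d_cvt_mat2strmat(3, 3, A, 3, &sA, 0, 0);
+	drowpe_libstr(3, ipiv, &sA);   // rows (1,4,7) and (3,6,9) are exchanged
+	free(mem);
+*/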
+
+// copy a generic strmat into a generic strmat
+void dgecp_libstr(int m, int n, struct d_strmat *sA, int ai, int aj, struct d_strmat *sC, int ci, int cj)
+ {
+ int lda = sA->m;
+ double *pA = sA->pA + ai + aj*lda;
+ int ldc = sC->m;
+ double *pC = sC->pA + ci + cj*ldc;
+ int ii, jj;
+ for(jj=0; jj<n; jj++)
+ {
+ ii = 0;
+ for(; ii<m-3; ii+=4)
+ {
+ pC[ii+0+jj*ldc] = pA[ii+0+jj*lda];
+ pC[ii+1+jj*ldc] = pA[ii+1+jj*lda];
+ pC[ii+2+jj*ldc] = pA[ii+2+jj*lda];
+ pC[ii+3+jj*ldc] = pA[ii+3+jj*lda];
+ }
+ for(; ii<m; ii++)
+ {
+ pC[ii+0+jj*ldc] = pA[ii+0+jj*lda];
+ }
+ }
+ return;
+ }
+
+
+
+// scale a generic strmat
+void dgesc_libstr(int m, int n, double alpha, struct d_strmat *sA, int ai, int aj)
+ {
+ int lda = sA->m;
+ double *pA = sA->pA + ai + aj*lda;
+ int ii, jj;
+ for(jj=0; jj<n; jj++)
+ {
+ ii = 0;
+ for(; ii<m-3; ii+=4)
+ {
+ pA[ii+0+jj*lda] *= alpha;
+ pA[ii+1+jj*lda] *= alpha;
+ pA[ii+2+jj*lda] *= alpha;
+ pA[ii+3+jj*lda] *= alpha;
+ }
+ for(; ii<m; ii++)
+ {
+ pA[ii+0+jj*lda] *= alpha;
+ }
+ }
+ return;
+ }
+
+
+
+// copy a strvec into a strvec
+void dveccp_libstr(int m, struct d_strvec *sa, int ai, struct d_strvec *sc, int ci)
+ {
+ double *pa = sa->pa + ai;
+ double *pc = sc->pa + ci;
+ int ii;
+ ii = 0;
+ for(; ii<m-3; ii+=4)
+ {
+ pc[ii+0] = pa[ii+0];
+ pc[ii+1] = pa[ii+1];
+ pc[ii+2] = pa[ii+2];
+ pc[ii+3] = pa[ii+3];
+ }
+ for(; ii<m; ii++)
+ {
+ pc[ii+0] = pa[ii+0];
+ }
+ return;
+ }
+
+
+
+// scale a strvec
+void dvecsc_libstr(int m, double alpha, struct d_strvec *sa, int ai)
+ {
+ double *pa = sa->pa + ai;
+ int ii;
+ ii = 0;
+ for(; ii<m-3; ii+=4)
+ {
+ pa[ii+0] *= alpha;
+ pa[ii+1] *= alpha;
+ pa[ii+2] *= alpha;
+ pa[ii+3] *= alpha;
+ }
+ for(; ii<m; ii++)
+ {
+ pa[ii+0] *= alpha;
+ }
+ return;
+ }
+
+
+
+// copy a lower triangular strmat into a lower triangular strmat
+void dtrcp_l_libstr(int m, struct d_strmat *sA, int ai, int aj, struct d_strmat *sC, int ci, int cj)
+ {
+ int lda = sA->m;
+ double *pA = sA->pA + ai + aj*lda;
+ int ldc = sC->m;
+ double *pC = sC->pA + ci + cj*ldc;
+ int ii, jj;
+ for(jj=0; jj<m; jj++)
+ {
+ ii = jj;
+ for(; ii<m; ii++)
+ {
+ pC[ii+0+jj*ldc] = pA[ii+0+jj*lda];
+ }
+ }
+ return;
+ }
+
+
+
+// scale and add a generic strmat into a generic strmat
+void dgead_libstr(int m, int n, double alpha, struct d_strmat *sA, int ai, int aj, struct d_strmat *sC, int ci, int cj)
+ {
+ int lda = sA->m;
+ double *pA = sA->pA + ai + aj*lda;
+ int ldc = sC->m;
+ double *pC = sC->pA + ci + cj*ldc;
+ int ii, jj;
+ for(jj=0; jj<n; jj++)
+ {
+ ii = 0;
+ for(; ii<m-3; ii+=4)
+ {
+ pC[ii+0+jj*ldc] += alpha*pA[ii+0+jj*lda];
+ pC[ii+1+jj*ldc] += alpha*pA[ii+1+jj*lda];
+ pC[ii+2+jj*ldc] += alpha*pA[ii+2+jj*lda];
+ pC[ii+3+jj*ldc] += alpha*pA[ii+3+jj*lda];
+ }
+ for(; ii<m; ii++)
+ {
+ pC[ii+0+jj*ldc] += alpha*pA[ii+0+jj*lda];
+ }
+ }
+ return;
+ }
+
+
+
+// scales and adds a strvec into a strvec
+void dvecad_libstr(int m, double alpha, struct d_strvec *sa, int ai, struct d_strvec *sc, int ci)
+ {
+ double *pa = sa->pa + ai;
+ double *pc = sc->pa + ci;
+ int ii;
+ ii = 0;
+ for(; ii<m-3; ii+=4)
+ {
+ pc[ii+0] += alpha*pa[ii+0];
+ pc[ii+1] += alpha*pa[ii+1];
+ pc[ii+2] += alpha*pa[ii+2];
+ pc[ii+3] += alpha*pa[ii+3];
+ }
+ for(; ii<m; ii++)
+ {
+ pc[ii+0] += alpha*pa[ii+0];
+ }
+ return;
+ }
+
+
+
+// copy and transpose a generic strmat into a generic strmat
+void dgetr_libstr(int m, int n, struct d_strmat *sA, int ai, int aj, struct d_strmat *sC, int ci, int cj)
+ {
+ int lda = sA->m;
+ double *pA = sA->pA + ai + aj*lda;
+ int ldc = sC->m;
+ double *pC = sC->pA + ci + cj*ldc;
+ int ii, jj;
+ for(jj=0; jj<n; jj++)
+ {
+ ii = 0;
+ for(; ii<m-3; ii+=4)
+ {
+ pC[jj+(ii+0)*ldc] = pA[ii+0+jj*lda];
+ pC[jj+(ii+1)*ldc] = pA[ii+1+jj*lda];
+ pC[jj+(ii+2)*ldc] = pA[ii+2+jj*lda];
+ pC[jj+(ii+3)*ldc] = pA[ii+3+jj*lda];
+ }
+ for(; ii<m; ii++)
+ {
+ pC[jj+(ii+0)*ldc] = pA[ii+0+jj*lda];
+ }
+ }
+ return;
+ }
+
+
+
+// copy and transpose a lower triangular strmat into an upper triangular strmat
+void dtrtr_l_libstr(int m, struct d_strmat *sA, int ai, int aj, struct d_strmat *sC, int ci, int cj)
+ {
+ int lda = sA->m;
+ double *pA = sA->pA + ai + aj*lda;
+ int ldc = sC->m;
+ double *pC = sC->pA + ci + cj*ldc;
+ int ii, jj;
+ for(jj=0; jj<m; jj++)
+ {
+ ii = jj;
+ for(; ii<m; ii++)
+ {
+ pC[jj+(ii+0)*ldc] = pA[ii+0+jj*lda];
+ }
+ }
+ return;
+ }
+
+
+
+// copy and transpose an upper triangular strmat into a lower triangular strmat
+void dtrtr_u_libstr(int m, struct d_strmat *sA, int ai, int aj, struct d_strmat *sC, int ci, int cj)
+ {
+ int lda = sA->m;
+ double *pA = sA->pA + ai + aj*lda;
+ int ldc = sC->m;
+ double *pC = sC->pA + ci + cj*ldc;
+ int ii, jj;
+ for(jj=0; jj<m; jj++)
+ {
+ ii = 0;
+ for(; ii<=jj; ii++)
+ {
+ pC[jj+(ii+0)*ldc] = pA[ii+0+jj*lda];
+ }
+ }
+ return;
+ }
+
+
+
+// insert a strvec to the diagonal of a strmat, sparse formulation
+void ddiain_sp_libstr(int kmax, double alpha, struct d_strvec *sx, int xi, int *idx, struct d_strmat *sD, int di, int dj)
+ {
+ double *x = sx->pa + xi;
+ int ldd = sD->m;
+ double *pD = sD->pA + di + dj*ldd;
+ int ii, jj;
+ for(jj=0; jj<kmax; jj++)
+ {
+ ii = idx[jj];
+ pD[ii*(ldd+1)] = alpha * x[jj];
+ }
+ return;
+ }
+
+
+
+// extract a vector from diagonal
+void ddiaex_libstr(int kmax, double alpha, struct d_strmat *sA, int ai, int aj, struct d_strvec *sx, int xi)
+ {
+ int lda = sA->m;
+ double *pA = sA->pA + ai + aj*lda;
+ double *x = sx->pa + xi;
+ int ii;
+ for(ii=0; ii<kmax; ii++)
+ x[ii] = alpha*pA[ii*(lda+1)];
+ return;
+ }
+
+
+
+// extract the diagonal of a strmat into a strvec, sparse formulation
+void ddiaex_sp_libstr(int kmax, double alpha, int *idx, struct d_strmat *sD, int di, int dj, struct d_strvec *sx, int xi)
+ {
+ double *x = sx->pa + xi;
+ int ldd = sD->m;
+ double *pD = sD->pA + di + dj*ldd;
+ int ii, jj;
+ for(jj=0; jj<kmax; jj++)
+ {
+ ii = idx[jj];
+ x[jj] = alpha * pD[ii*(ldd+1)];
+ }
+ return;
+ }
+
+
+
+// add a vector to diagonal
+void ddiaad_libstr(int kmax, double alpha, struct d_strvec *sx, int xi, struct d_strmat *sA, int ai, int aj)
+ {
+ int lda = sA->m;
+ double *pA = sA->pA + ai + aj*lda;
+ double *x = sx->pa + xi;
+ int ii;
+ for(ii=0; ii<kmax; ii++)
+ pA[ii*(lda+1)] += alpha*x[ii];
+ return;
+ }
+
+
+
+// add scaled strvec to diagonal of strmat, sparse formulation
+void ddiaad_sp_libstr(int kmax, double alpha, struct d_strvec *sx, int xi, int *idx, struct d_strmat *sD, int di, int dj)
+ {
+ double *x = sx->pa + xi;
+ int ldd = sD->m;
+ double *pD = sD->pA + di + dj*ldd;
+ int ii, jj;
+ for(jj=0; jj<kmax; jj++)
+ {
+ ii = idx[jj];
+ pD[ii*(ldd+1)] += alpha * x[jj];
+ }
+ return;
+ }
+
+
+
+// add scaled strvec to another strvec and insert to diagonal of strmat, sparse formulation
+void ddiaadin_sp_libstr(int kmax, double alpha, struct d_strvec *sx, int xi, struct d_strvec *sy, int yi, int *idx, struct d_strmat *sD, int di, int dj)
+ {
+ double *x = sx->pa + xi;
+ double *y = sy->pa + yi;
+ int ldd = sD->m;
+ double *pD = sD->pA + di + dj*ldd;
+ int ii, jj;
+ for(jj=0; jj<kmax; jj++)
+ {
+ ii = idx[jj];
+ pD[ii*(ldd+1)] = y[jj] + alpha * x[jj];
+ }
+ return;
+ }
+
+
+
+// add scaled strvec to row of strmat, sparse formulation
+void drowad_sp_libstr(int kmax, double alpha, struct d_strvec *sx, int xi, int *idx, struct d_strmat *sD, int di, int dj)
+ {
+ double *x = sx->pa + xi;
+ int ldd = sD->m;
+ double *pD = sD->pA + di + dj*ldd;
+ int ii, jj;
+ for(jj=0; jj<kmax; jj++)
+ {
+ ii = idx[jj];
+ pD[ii*ldd] += alpha * x[jj];
+ }
+ return;
+ }
+
+
+
+
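+// add a scaled strvec to a strvec, sparse formulation (scatter): z[idx[ii]] += alpha*x[ii]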
+void dvecad_sp_libstr(int m, double alpha, struct d_strvec *sx, int xi, int *idx, struct d_strvec *sz, int zi)
+ {
+ double *x = sx->pa + xi;
+ double *z = sz->pa + zi;
+ int ii;
+ for(ii=0; ii<m; ii++)
+ z[idx[ii]] += alpha * x[ii];
+ return;
+ }
+
+
+
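+// insert a scaled strvec into a strvec, sparse formulation (scatter): z[idx[ii]] = alpha*x[ii]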
+void dvecin_sp_libstr(int m, double alpha, struct d_strvec *sx, int xi, int *idx, struct d_strvec *sz, int zi)
+ {
+ double *x = sx->pa + xi;
+ double *z = sz->pa + zi;
+ int ii;
+ for(ii=0; ii<m; ii++)
+ z[idx[ii]] = alpha * x[ii];
+ return;
+ }
+
+
+
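+// extract a scaled strvec from a strvec, sparse formulation (gather): z[ii] = alpha*x[idx[ii]]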
+void dvecex_sp_libstr(int m, double alpha, int *idx, struct d_strvec *sx, int xi, struct d_strvec *sz, int zi)
+ {
+ double *x = sx->pa + xi;
+ double *z = sz->pa + zi;
+ int ii;
+ for(ii=0; ii<m; ii++)
+ z[ii] = alpha * x[idx[ii]];
+ return;
+ }
+
+
+// clip without mask return
+void dveccl_libstr(int m, struct d_strvec *sxm, int xim, struct d_strvec *sx, int xi, struct d_strvec *sxp, int xip, struct d_strvec *sz, int zi)
+ {
+ double *xm = sxm->pa + xim;
+ double *x = sx->pa + xi;
+ double *xp = sxp->pa + xip;
+ double *z = sz->pa + zi;
+ int ii;
+ for(ii=0; ii<m; ii++)
+ {
+ if(x[ii]>=xp[ii])
+ {
+ z[ii] = xp[ii];
+ }
+ else if(x[ii]<=xm[ii])
+ {
+ z[ii] = xm[ii];
+ }
+ else
+ {
+ z[ii] = x[ii];
+ }
+ }
+ return;
+ }
+
+
+
+// clip with mask return
+void dveccl_mask_libstr(int m, struct d_strvec *sxm, int xim, struct d_strvec *sx, int xi, struct d_strvec *sxp, int xip, struct d_strvec *sz, int zi, struct d_strvec *sm, int mi)
+ {
+ double *xm = sxm->pa + xim;
+ double *x = sx->pa + xi;
+ double *xp = sxp->pa + xip;
+ double *z = sz->pa + zi;
+ double *mask = sm->pa + mi;
+ int ii;
+ for(ii=0; ii<m; ii++)
+ {
+ if(x[ii]>=xp[ii])
+ {
+ z[ii] = xp[ii];
+ mask[ii] = 1.0;
+ }
+ else if(x[ii]<=xm[ii])
+ {
+ z[ii] = xm[ii];
+ mask[ii] = -1.0;
+ }
+ else
+ {
+ z[ii] = x[ii];
+ mask[ii] = 0.0;
+ }
+ }
+ return;
+ }
+
+
+// zero out components using mask
+void dvecze_libstr(int m, struct d_strvec *sm, int mi, struct d_strvec *sv, int vi, struct d_strvec *se, int ei)
+ {
+ double *mask = sm->pa + mi;
+ double *v = sv->pa + vi;
+ double *e = se->pa + ei;
+ int ii;
+ for(ii=0; ii<m; ii++)
+ {
+ if(mask[ii]==0)
+ {
+ e[ii] = v[ii];
+ }
+ else
+ {
+ e[ii] = 0;
+ }
+ }
+ return;
+ }
+
+
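+/*
+Usage sketch for the clipping helpers above: dveccl_mask_libstr clips x into [xm, xp] and
+records in the mask which bound (if any) is active; dvecze_libstr then zeroes the entries
+of another vector wherever a bound is active. A minimal sketch; the malloc()-based
+allocation is only for illustration:
+
+	int n = 4;
+	double lo[4] = {-1, -1, -1, -1}, up[4] = {1, 1, 1, 1}, xin[4] = {-2.0, 0.5, 3.0, 0.0};
+	struct d_strvec sxm, sxp, sx, sz, sm, se;
+	char *mem = (char *) malloc(6*d_size_strvec(n)), *ptr = mem;
+	d_create_strvec(n, &sxm, ptr); ptr += d_size_strvec(n);
+	d_create_strvec(n, &sxp, ptr); ptr += d_size_strvec(n);
+	d_create_strvec(n, &sx,  ptr); ptr += d_size_strvec(n);
+	d_create_strvec(n, &sz,  ptr); ptr += d_size_strvec(n);
+	d_create_strvec(n, &sm,  ptr); ptr += d_size_strvec(n);
+	d_create_strvec(n, &se,  ptr);
+	d_cvt_vec2strvec(n, lo,  &sxm, 0);
+	d_cvt_vec2strvec(n, up,  &sxp, 0);
+	d_cvt_vec2strvec(n, xin, &sx,  0);
+	// z = clip(x, [xm, xp]); mask = -1 / 0 / +1 for lower bound / no bound / upper bound active
+	dveccl_mask_libstr(n, &sxm, 0, &sx, 0, &sxp, 0, &sz, 0, &sm, 0);
+	// e[ii] = z[ii] where mask[ii] == 0, e[ii] = 0 where a bound is active
+	dvecze_libstr(n, &sm, 0, &sz, 0, &se, 0);
+	free(mem);
+*/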
+
+void dvecnrm_inf_libstr(int m, struct d_strvec *sx, int xi, double *ptr_norm)
+ {
+ int ii;
+ double *x = sx->pa + xi;
+ double norm = 0.0;
+ for(ii=0; ii<m; ii++)
+ norm = fmax(norm, fabs(x[ii]));
+ *ptr_norm = norm;
+ return;
+ }
+
+
+
+#else
+
+#error : wrong LA choice
+
+#endif
diff --git a/auxiliary/d_aux_lib4.c b/auxiliary/d_aux_lib4.c
new file mode 100644
index 0000000..152aed1
--- /dev/null
+++ b/auxiliary/d_aux_lib4.c
@@ -0,0 +1,3609 @@
+/**************************************************************************************************
+* *
+* This file is part of BLASFEO. *
+* *
+* BLASFEO -- BLAS For Embedded Optimization. *
+* Copyright (C) 2016-2017 by Gianluca Frison. *
+* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl. *
+* All rights reserved. *
+* *
+* HPMPC is free software; you can redistribute it and/or *
+* modify it under the terms of the GNU Lesser General Public *
+* License as published by the Free Software Foundation; either *
+* version 2.1 of the License, or (at your option) any later version. *
+* *
+* HPMPC is distributed in the hope that it will be useful, *
+* but WITHOUT ANY WARRANTY; without even the implied warranty of *
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. *
+* See the GNU Lesser General Public License for more details. *
+* *
+* You should have received a copy of the GNU Lesser General Public *
+* License along with HPMPC; if not, write to the Free Software *
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA *
+* *
+* Author: Gianluca Frison, giaf (at) dtu.dk *
+* gianluca.frison (at) imtek.uni-freiburg.de *
+* *
+**************************************************************************************************/
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <math.h>
+
+#if defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE)
+#include <mmintrin.h>
+#include <xmmintrin.h> // SSE
+#include <emmintrin.h> // SSE2
+#include <pmmintrin.h> // SSE3
+#include <smmintrin.h> // SSE4
+#include <immintrin.h> // AVX
+#endif
+
+#include "../include/blasfeo_common.h"
+#include "../include/blasfeo_block_size.h"
+#include "../include/blasfeo_d_kernel.h"
+
+
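+// Note on the packed ("panel-major") layout used in this file: a matrix is stored in
+// horizontal panels of bs = 4 rows; within a panel each column occupies 4 consecutive
+// doubles, and consecutive panels are 4*sda doubles apart (sda is the panel stride,
+// measured in columns). Element (i,j) of a packed matrix pA is therefore found at
+//   pA[(i/4)*4*sda + i%4 + j*4]
+// (this is the indexing used by the *_libsp routines below). The offsetA/offsetB
+// arguments of the copy/add routines give the row offset of the first row inside its
+// 4-row panel (only offset%4 is used); the code first rewinds the pointers to the top
+// of the panel (A -= offA).
+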
+
+// copies a packed matrix into a packed matrix
+// TODO remove alpha
+void dgecp_lib(int m, int n, double alpha, int offsetA, double *A, int sda, int offsetB, double *B, int sdb)
+ {
+
+ if(m<=0 || n<=0)
+ return;
+
+ const int bs = 4;
+
+ int mna, ii;
+
+ int offA = offsetA%bs;
+ int offB = offsetB%bs;
+
+ // A at the beginning of the block
+ A -= offA;
+
+	// B at the beginning of the block
+ B -= offB;
+
+ // same alignment
+ if(offA==offB)
+ {
+ ii = 0;
+ // clean up at the beginning
+ mna = (4-offB)%bs;
+ if(mna>0)
+ {
+ if(m<mna) // mna<=3 ==> m = { 1, 2 }
+ {
+ if(m==1)
+ {
+ kernel_dgecp_1_0_lib4(0, n, alpha, A+offA, B+offB);
+ return;
+ }
+ else //if(m==2 && mna==3)
+ {
+ kernel_dgecp_2_0_lib4(0, n, alpha, A+offA, B+offB);
+ return;
+ }
+ }
+ if(mna==1)
+ {
+ kernel_dgecp_1_0_lib4(0, n, alpha, A+offA, B+offB);
+ A += 4*sda;
+ B += 4*sdb;
+ ii += 1;
+ }
+ else if(mna==2)
+ {
+ kernel_dgecp_2_0_lib4(0, n, alpha, A+offA, B+offB);
+ A += 4*sda;
+ B += 4*sdb;
+ ii += 2;
+ }
+ else // if(mna==3)
+ {
+ kernel_dgecp_3_0_lib4(0, n, alpha, A+offA, B+offB);
+ A += 4*sda;
+ B += 4*sdb;
+ ii += 3;
+ }
+ }
+ // main loop
+#if defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_HASWELL)
+ for(; ii<m-7; ii+=8)
+ {
+ kernel_dgecp_8_0_lib4(0, n, alpha, A, sda, B, sdb);
+ A += 8*sda;
+ B += 8*sdb;
+ }
+#endif
+ for(; ii<m-3; ii+=4)
+ {
+ kernel_dgecp_4_0_lib4(0, n, alpha, A, B);
+ A += 4*sda;
+ B += 4*sdb;
+ }
+ // clean up at the end
+ if(ii<m)
+ {
+ if(m-ii==1)
+ kernel_dgecp_1_0_lib4(0, n, alpha, A, B);
+ else if(m-ii==2)
+ kernel_dgecp_2_0_lib4(0, n, alpha, A, B);
+ else // if(m-ii==3)
+ kernel_dgecp_3_0_lib4(0, n, alpha, A, B);
+ }
+ }
+ // skip one element of A
+ else if(offA==(offB+1)%bs)
+ {
+ ii = 0;
+ // clean up at the beginning
+ mna = (4-offB)%bs;
+ if(mna>0)
+ {
+ if(m<mna) // mna<=3 ==> m = { 1, 2 }
+ {
+ if(m==1)
+ {
+ kernel_dgecp_1_0_lib4(0, n, alpha, A+offA, B+offB);
+ return;
+ }
+ else //if(m==2 && mna==3)
+ {
+ kernel_dgecp_2_0_lib4(0, n, alpha, A+offA, B+offB);
+ return;
+ }
+ }
+ if(mna==1)
+ {
+ kernel_dgecp_1_0_lib4(0, n, alpha, A+offA, B+offB);
+ //A += 4*sda;
+ B += 4*sdb;
+ ii += 1;
+ }
+ else if(mna==2)
+ {
+ kernel_dgecp_2_3_lib4(0, n, alpha, A, sda, B+2);
+ A += 4*sda;
+ B += 4*sdb;
+ ii += 2;
+ }
+ else // if(mna==3)
+ {
+ kernel_dgecp_3_2_lib4(0, n, alpha, A, sda, B+1);
+ A += 4*sda;
+ B += 4*sdb;
+ ii += 3;
+ }
+ }
+ // main loop
+#if defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_HASWELL)
+ for( ; ii<m-7; ii+=8)
+ {
+ kernel_dgecp_8_1_lib4(0, n, alpha, A, sda, B, sdb);
+ A += 8*sda;
+ B += 8*sdb;
+ }
+#endif
+ for( ; ii<m-3; ii+=4)
+ {
+ kernel_dgecp_4_1_lib4(0, n, alpha, A, sda, B);
+ A += 4*sda;
+ B += 4*sdb;
+ }
+ // clean up at the end
+ if(ii<m)
+ {
+ if(m-ii==1)
+ kernel_dgecp_1_0_lib4(0, n, alpha, A+1, B);
+ else if(m-ii==2)
+ kernel_dgecp_2_0_lib4(0, n, alpha, A+1, B);
+ else // if(m-ii==3)
+ kernel_dgecp_3_0_lib4(0, n, alpha, A+1, B);
+ }
+ }
+ // skip 2 elements of A
+ else if(offA==(offB+2)%bs)
+ {
+ ii = 0;
+ // clean up at the beginning
+ mna = (4-offB)%bs;
+ if(mna>0)
+ {
+ if(m<mna)
+ {
+ if(m==1)
+ {
+ kernel_dgecp_1_0_lib4(0, n, alpha, A+offA, B+offB);
+ return;
+ }
+ else // if(m==2 && mna==3)
+ {
+ kernel_dgecp_2_3_lib4(0, n, alpha, A, sda, B+1);
+ return;
+ }
+ }
+ if(mna==1)
+ {
+ kernel_dgecp_1_0_lib4(0, n, alpha, A+1, B+3);
+ // A += 4*sda;
+ B += 4*sdb;
+ ii += 1;
+ }
+ else if(mna==2)
+ {
+ kernel_dgecp_2_0_lib4(0, n, alpha, A, B+2);
+ // A += 4*sda;
+ B += 4*sdb;
+ ii += 2;
+ }
+ else // if(mna==3)
+ {
+ kernel_dgecp_3_3_lib4(0, n, alpha, A, sda, B+1);
+ A += 4*sda;
+ B += 4*sdb;
+ ii += 3;
+ }
+ }
+ // main loop
+#if defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_HASWELL)
+ for(; ii<m-7; ii+=8)
+ {
+ kernel_dgecp_8_2_lib4(0, n, alpha, A, sda, B, sdb);
+ A += 8*sda;
+ B += 8*sdb;
+ }
+#endif
+ for(; ii<m-3; ii+=4)
+ {
+ kernel_dgecp_4_2_lib4(0, n, alpha, A, sda, B);
+ A += 4*sda;
+ B += 4*sdb;
+ }
+ // clean up at the end
+ if(ii<m)
+ {
+ if(m-ii==1)
+ kernel_dgecp_1_0_lib4(0, n, alpha, A+2, B);
+ else if(m-ii==2)
+ kernel_dgecp_2_0_lib4(0, n, alpha, A+2, B);
+ else // if(m-ii==3)
+ kernel_dgecp_3_2_lib4(0, n, alpha, A, sda, B);
+ }
+ }
+ // skip 3 elements of A
+ else // if(offA==(offB+3)%bs)
+ {
+ ii = 0;
+ // clean up at the beginning
+ mna = (4-offB)%bs;
+ if(mna>0)
+ {
+ if(m<mna)
+ {
+ if(m==1)
+ {
+ kernel_dgecp_1_0_lib4(0, n, alpha, A+offA, B+offB);
+ return;
+ }
+ else // if(m==2 && mna==3)
+ {
+ kernel_dgecp_2_0_lib4(0, n, alpha, A+offA, B+offB);
+ return;
+ }
+ }
+ if(mna==1)
+ {
+ kernel_dgecp_1_0_lib4(0, n, alpha, A+offA, B+offB);
+ // A += 4*sda;
+ B += 4*sdb;
+ ii += 1;
+ }
+ else if(mna==2)
+ {
+ kernel_dgecp_2_0_lib4(0, n, alpha, A+offA, B+offB);
+ // A += 4*sda;
+ B += 4*sdb;
+ ii += 2;
+ }
+ else // if(mna==3)
+ {
+ kernel_dgecp_3_0_lib4(0, n, alpha, A+offA, B+offB);
+ // A += 4*sda;
+ B += 4*sdb;
+ ii += 3;
+ }
+ }
+ // main loop
+#if defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_HASWELL)
+ for(; ii<m-7; ii+=8)
+ {
+ kernel_dgecp_8_3_lib4(0, n, alpha, A, sda, B, sdb);
+ A += 8*sda;
+ B += 8*sdb;
+ }
+#endif
+ for(; ii<m-3; ii+=4)
+ {
+ kernel_dgecp_4_3_lib4(0, n, alpha, A, sda, B);
+ A += 4*sda;
+ B += 4*sdb;
+ }
+ // clean up at the end
+ if(ii<m)
+ {
+ if(m-ii==1)
+ kernel_dgecp_1_0_lib4(0, n, alpha, A+3, B);
+ else if(m-ii==2)
+ kernel_dgecp_2_3_lib4(0, n, alpha, A, sda, B);
+ else // if(m-ii==3)
+ kernel_dgecp_3_3_lib4(0, n, alpha, A, sda, B);
+ }
+ }
+
+ }
+
+
+
+// copies a lower triangular packed matrix into a lower triangular packed matrix
+void dtrcp_l_lib(int m, double alpha, int offsetA, double *A, int sda, int offsetB, double *B, int sdb)
+ {
+
+ if(m<=0)
+ return;
+
+ int n = m;
+
+ const int bs = 4;
+
+ int mna, ii;
+
+ int offA = offsetA%bs;
+ int offB = offsetB%bs;
+
+ // A at the beginning of the block
+ A -= offA;
+
+	// B at the beginning of the block
+ B -= offB;
+
+ // same alignment
+ if(offA==offB)
+ {
+ ii = 0;
+ // clean up at the beginning
+ mna = (4-offB)%bs;
+ if(mna>0)
+ {
+ if(m<mna) // mna<=3 ==> m = { 1, 2 }
+ {
+ if(m==1)
+ {
+ kernel_dgecp_1_0_lib4(1, ii, alpha, A+offA, B+offB);
+ return;
+ }
+ else //if(m==2 && mna==3)
+ {
+ kernel_dgecp_2_0_lib4(1, ii, alpha, A+offA, B+offB);
+ return;
+ }
+ }
+ if(mna==1)
+ {
+ kernel_dgecp_1_0_lib4(1, ii, alpha, A+offA, B+offB);
+ A += 4*sda;
+ B += 4*sdb;
+ ii += 1;
+ }
+ else if(mna==2)
+ {
+ kernel_dgecp_2_0_lib4(1, ii, alpha, A+offA, B+offB);
+ A += 4*sda;
+ B += 4*sdb;
+ ii += 2;
+ }
+ else // if(mna==3)
+ {
+ kernel_dgecp_3_0_lib4(1, ii, alpha, A+offA, B+offB);
+ A += 4*sda;
+ B += 4*sdb;
+ ii += 3;
+ }
+ }
+ // main loop
+#if defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_HASWELL)
+ for(; ii<m-7; ii+=8)
+ {
+ kernel_dgecp_8_0_lib4(1, ii, alpha, A, sda, B, sdb);
+ A += 8*sda;
+ B += 8*sdb;
+ }
+#endif
+ for(; ii<m-3; ii+=4)
+ {
+ kernel_dgecp_4_0_lib4(1, ii, alpha, A, B);
+ A += 4*sda;
+ B += 4*sdb;
+ }
+ // clean up at the end
+ if(ii<m)
+ {
+ if(m-ii==1)
+ kernel_dgecp_1_0_lib4(1, ii, alpha, A, B);
+ else if(m-ii==2)
+ kernel_dgecp_2_0_lib4(1, ii, alpha, A, B);
+ else // if(m-ii==3)
+ kernel_dgecp_3_0_lib4(1, ii, alpha, A, B);
+ }
+ }
+ // skip one element of A
+ else if(offA==(offB+1)%bs)
+ {
+ ii = 0;
+ // clean up at the beginning
+ mna = (4-offB)%bs;
+ if(mna>0)
+ {
+ if(m<mna) // mna<=3 ==> m = { 1, 2 }
+ {
+ if(m==1)
+ {
+ kernel_dgecp_1_0_lib4(1, ii, alpha, A+offA, B+offB);
+ return;
+ }
+ else //if(m==2 && mna==3)
+ {
+ kernel_dgecp_2_0_lib4(1, ii, alpha, A+offA, B+offB);
+ return;
+ }
+ }
+ if(mna==1)
+ {
+ kernel_dgecp_1_0_lib4(1, ii, alpha, A+offA, B+offB);
+ //A += 4*sda;
+ B += 4*sdb;
+ ii += 1;
+ }
+ else if(mna==2)
+ {
+ kernel_dgecp_2_3_lib4(1, ii, alpha, A, sda, B+2);
+ A += 4*sda;
+ B += 4*sdb;
+ ii += 2;
+ }
+ else // if(mna==3)
+ {
+ kernel_dgecp_3_2_lib4(1, ii, alpha, A, sda, B+1);
+ A += 4*sda;
+ B += 4*sdb;
+ ii += 3;
+ }
+ }
+ // main loop
+#if defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_HASWELL)
+ for( ; ii<m-7; ii+=8)
+ {
+ kernel_dgecp_8_1_lib4(1, ii, alpha, A, sda, B, sdb);
+ A += 8*sda;
+ B += 8*sdb;
+ }
+#endif
+ for( ; ii<m-3; ii+=4)
+ {
+ kernel_dgecp_4_1_lib4(1, ii, alpha, A, sda, B);
+ A += 4*sda;
+ B += 4*sdb;
+ }
+ // clean up at the end
+ if(ii<m)
+ {
+ if(m-ii==1)
+ kernel_dgecp_1_0_lib4(1, ii, alpha, A+1, B);
+ else if(m-ii==2)
+ kernel_dgecp_2_0_lib4(1, ii, alpha, A+1, B);
+ else // if(m-ii==3)
+ kernel_dgecp_3_0_lib4(1, ii, alpha, A+1, B);
+ }
+ }
+ // skip 2 elements of A
+ else if(offA==(offB+2)%bs)
+ {
+ ii = 0;
+ // clean up at the beginning
+ mna = (4-offB)%bs;
+ if(mna>0)
+ {
+ if(m<mna)
+ {
+ if(m==1)
+ {
+ kernel_dgecp_1_0_lib4(1, ii, alpha, A+offA, B+offB);
+ return;
+ }
+ else // if(m==2 && mna==3)
+ {
+ kernel_dgecp_2_3_lib4(1, ii, alpha, A, sda, B+1);
+ return;
+ }
+ }
+ if(mna==1)
+ {
+ kernel_dgecp_1_0_lib4(1, ii, alpha, A+1, B+3);
+ // A += 4*sda;
+ B += 4*sdb;
+ ii += 1;
+ }
+ else if(mna==2)
+ {
+ kernel_dgecp_2_0_lib4(1, ii, alpha, A, B+2);
+ // A += 4*sda;
+ B += 4*sdb;
+ ii += 2;
+ }
+ else // if(mna==3)
+ {
+ kernel_dgecp_3_3_lib4(1, ii, alpha, A, sda, B+1);
+ A += 4*sda;
+ B += 4*sdb;
+ ii += 3;
+ }
+ }
+ // main loop
+#if defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_HASWELL)
+ for(; ii<m-7; ii+=8)
+ {
+ kernel_dgecp_8_2_lib4(1, ii, alpha, A, sda, B, sdb);
+ A += 8*sda;
+ B += 8*sdb;
+ }
+#endif
+ for(; ii<m-3; ii+=4)
+ {
+ kernel_dgecp_4_2_lib4(1, ii, alpha, A, sda, B);
+ A += 4*sda;
+ B += 4*sdb;
+ }
+ // clean up at the end
+ if(ii<m)
+ {
+ if(m-ii==1)
+ kernel_dgecp_1_0_lib4(1, ii, alpha, A+2, B);
+ else if(m-ii==2)
+ kernel_dgecp_2_0_lib4(1, ii, alpha, A+2, B);
+ else // if(m-ii==3)
+ kernel_dgecp_3_2_lib4(1, ii, alpha, A, sda, B);
+ }
+ }
+ // skip 3 elements of A
+ else // if(offA==(offB+3)%bs)
+ {
+ ii = 0;
+ // clean up at the beginning
+ mna = (4-offB)%bs;
+ if(mna>0)
+ {
+ if(m<mna)
+ {
+ if(m==1)
+ {
+ kernel_dgecp_1_0_lib4(1, ii, alpha, A+offA, B+offB);
+ return;
+ }
+ else // if(m==2 && mna==3)
+ {
+ kernel_dgecp_2_0_lib4(1, ii, alpha, A+offA, B+offB);
+ return;
+ }
+ }
+ if(mna==1)
+ {
+ kernel_dgecp_1_0_lib4(1, ii, alpha, A+offA, B+offB);
+ // A += 4*sda;
+ B += 4*sdb;
+ ii += 1;
+ }
+ else if(mna==2)
+ {
+ kernel_dgecp_2_0_lib4(1, ii, alpha, A+offA, B+offB);
+ // A += 4*sda;
+ B += 4*sdb;
+ ii += 2;
+ }
+ else // if(mna==3)
+ {
+ kernel_dgecp_3_0_lib4(1, ii, alpha, A+offA, B+offB);
+ // A += 4*sda;
+ B += 4*sdb;
+ ii += 3;
+ }
+ }
+ // main loop
+#if defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_HASWELL)
+ for(; ii<m-7; ii+=8)
+ {
+ kernel_dgecp_8_3_lib4(1, ii, alpha, A, sda, B, sdb);
+ A += 8*sda;
+ B += 8*sdb;
+ }
+#endif
+ for(; ii<m-3; ii+=4)
+ {
+ kernel_dgecp_4_3_lib4(1, ii, alpha, A, sda, B);
+ A += 4*sda;
+ B += 4*sdb;
+ }
+ // clean up at the end
+ if(ii<m)
+ {
+ if(m-ii==1)
+ kernel_dgecp_1_0_lib4(1, ii, alpha, A+3, B);
+ else if(m-ii==2)
+ kernel_dgecp_2_3_lib4(1, ii, alpha, A, sda, B);
+ else // if(m-ii==3)
+ kernel_dgecp_3_3_lib4(1, ii, alpha, A, sda, B);
+ }
+ }
+
+ }
+
+
+
+// scales and adds a packed matrix into a packed matrix: B = B + alpha*A
+void dgead_lib(int m, int n, double alpha, int offsetA, double *A, int sda, int offsetB, double *B, int sdb)
+ {
+
+ if(m<=0 || n<=0)
+ return;
+
+ const int bs = 4;
+
+ int mna, ii;
+
+ int offA = offsetA%bs;
+ int offB = offsetB%bs;
+
+ // A at the beginning of the block
+ A -= offA;
+
+	// B at the beginning of the block
+ B -= offB;
+
+ // same alignment
+ if(offA==offB)
+ {
+ ii = 0;
+ // clean up at the beginning
+ mna = (4-offB)%bs;
+ if(mna>0)
+ {
+ if(m<mna) // mna<=3 ==> m = { 1, 2 }
+ {
+ if(m==1)
+ {
+ kernel_dgead_1_0_lib4(n, alpha, A+offA, B+offB);
+ return;
+ }
+ else //if(m==2 && mna==3)
+ {
+ kernel_dgead_2_0_lib4(n, alpha, A+offA, B+offB);
+ return;
+ }
+ }
+ if(mna==1)
+ {
+ kernel_dgead_1_0_lib4(n, alpha, A+offA, B+offB);
+ A += 4*sda;
+ B += 4*sdb;
+ ii += 1;
+ }
+ else if(mna==2)
+ {
+ kernel_dgead_2_0_lib4(n, alpha, A+offA, B+offB);
+ A += 4*sda;
+ B += 4*sdb;
+ ii += 2;
+ }
+ else // if(mna==3)
+ {
+ kernel_dgead_3_0_lib4(n, alpha, A+offA, B+offB);
+ A += 4*sda;
+ B += 4*sdb;
+ ii += 3;
+ }
+ }
+ // main loop
+#if defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_HASWELL)
+ for(; ii<m-7; ii+=8)
+ {
+ kernel_dgead_8_0_lib4(n, alpha, A, sda, B, sdb);
+ A += 8*sda;
+ B += 8*sdb;
+ }
+#endif
+ for(; ii<m-3; ii+=4)
+ {
+ kernel_dgead_4_0_lib4(n, alpha, A, B);
+ A += 4*sda;
+ B += 4*sdb;
+ }
+ // clean up at the end
+ if(ii<m)
+ {
+ if(m-ii==1)
+ kernel_dgead_1_0_lib4(n, alpha, A, B);
+ else if(m-ii==2)
+ kernel_dgead_2_0_lib4(n, alpha, A, B);
+ else // if(m-ii==3)
+ kernel_dgead_3_0_lib4(n, alpha, A, B);
+ }
+ }
+ // skip one element of A
+ else if(offA==(offB+1)%bs)
+ {
+ ii = 0;
+ // clean up at the beginning
+ mna = (4-offB)%bs;
+ if(mna>0)
+ {
+ if(m<mna) // mna<=3 ==> m = { 1, 2 }
+ {
+ if(m==1)
+ {
+ kernel_dgead_1_0_lib4(n, alpha, A+offA, B+offB);
+ return;
+ }
+ else //if(m==2 && mna==3)
+ {
+ kernel_dgead_2_0_lib4(n, alpha, A+offA, B+offB);
+ return;
+ }
+ }
+ if(mna==1)
+ {
+ kernel_dgead_1_0_lib4(n, alpha, A+offA, B+offB);
+ //A += 4*sda;
+ B += 4*sdb;
+ ii += 1;
+ }
+ else if(mna==2)
+ {
+ kernel_dgead_2_3_lib4(n, alpha, A, sda, B+2);
+ A += 4*sda;
+ B += 4*sdb;
+ ii += 2;
+ }
+ else // if(mna==3)
+ {
+ kernel_dgead_3_2_lib4(n, alpha, A, sda, B+1);
+ A += 4*sda;
+ B += 4*sdb;
+ ii += 3;
+ }
+ }
+ // main loop
+#if defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_HASWELL)
+ for( ; ii<m-7; ii+=8)
+ {
+ kernel_dgead_8_1_lib4(n, alpha, A, sda, B, sdb);
+ A += 8*sda;
+ B += 8*sdb;
+ }
+#endif
+ for( ; ii<m-3; ii+=4)
+ {
+ kernel_dgead_4_1_lib4(n, alpha, A, sda, B);
+ A += 4*sda;
+ B += 4*sdb;
+ }
+ // clean up at the end
+ if(ii<m)
+ {
+ if(m-ii==1)
+ kernel_dgead_1_0_lib4(n, alpha, A+1, B);
+ else if(m-ii==2)
+ kernel_dgead_2_0_lib4(n, alpha, A+1, B);
+ else // if(m-ii==3)
+ kernel_dgead_3_0_lib4(n, alpha, A+1, B);
+ }
+ }
+ // skip 2 elements of A
+ else if(offA==(offB+2)%bs)
+ {
+ ii = 0;
+ // clean up at the beginning
+ mna = (4-offB)%bs;
+ if(mna>0)
+ {
+ if(m<mna)
+ {
+ if(m==1)
+ {
+ kernel_dgead_1_0_lib4(n, alpha, A+offA, B+offB);
+ return;
+ }
+ else // if(m==2 && mna==3)
+ {
+ kernel_dgead_2_3_lib4(n, alpha, A, sda, B+1);
+ return;
+ }
+ }
+ if(mna==1)
+ {
+ kernel_dgead_1_0_lib4(n, alpha, A+1, B+3);
+ // A += 4*sda;
+ B += 4*sdb;
+ ii += 1;
+ }
+ else if(mna==2)
+ {
+ kernel_dgead_2_0_lib4(n, alpha, A, B+2);
+ // A += 4*sda;
+ B += 4*sdb;
+ ii += 2;
+ }
+ else // if(mna==3)
+ {
+ kernel_dgead_3_3_lib4(n, alpha, A, sda, B+1);
+ A += 4*sda;
+ B += 4*sdb;
+ ii += 3;
+ }
+ }
+ // main loop
+#if defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_HASWELL)
+ for(; ii<m-7; ii+=8)
+ {
+ kernel_dgead_8_2_lib4(n, alpha, A, sda, B, sdb);
+ A += 8*sda;
+ B += 8*sdb;
+ }
+#endif
+ for(; ii<m-3; ii+=4)
+ {
+ kernel_dgead_4_2_lib4(n, alpha, A, sda, B);
+ A += 4*sda;
+ B += 4*sdb;
+ }
+ // clean up at the end
+ if(ii<m)
+ {
+ if(m-ii==1)
+ kernel_dgead_1_0_lib4(n, alpha, A+2, B);
+ else if(m-ii==2)
+ kernel_dgead_2_0_lib4(n, alpha, A+2, B);
+ else // if(m-ii==3)
+ kernel_dgead_3_2_lib4(n, alpha, A, sda, B);
+ }
+ }
+ // skip 3 elements of A
+ else // if(offA==(offB+3)%bs)
+ {
+ ii = 0;
+ // clean up at the beginning
+ mna = (4-offB)%bs;
+ if(mna>0)
+ {
+ if(m<mna)
+ {
+ if(m==1)
+ {
+ kernel_dgead_1_0_lib4(n, alpha, A+offA, B+offB);
+ return;
+ }
+ else // if(m==2 && mna==3)
+ {
+ kernel_dgead_2_0_lib4(n, alpha, A+offA, B+offB);
+ return;
+ }
+ }
+ if(mna==1)
+ {
+ kernel_dgead_1_0_lib4(n, alpha, A+offA, B+offB);
+ // A += 4*sda;
+ B += 4*sdb;
+ ii += 1;
+ }
+ else if(mna==2)
+ {
+ kernel_dgead_2_0_lib4(n, alpha, A+offA, B+offB);
+ // A += 4*sda;
+ B += 4*sdb;
+ ii += 2;
+ }
+ else // if(mna==3)
+ {
+ kernel_dgead_3_0_lib4(n, alpha, A+offA, B+offB);
+ // A += 4*sda;
+ B += 4*sdb;
+ ii += 3;
+ }
+ }
+ // main loop
+#if defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_HASWELL)
+ for(; ii<m-7; ii+=8)
+ {
+ kernel_dgead_8_3_lib4(n, alpha, A, sda, B, sdb);
+ A += 8*sda;
+ B += 8*sdb;
+ }
+#endif
+ for(; ii<m-3; ii+=4)
+ {
+ kernel_dgead_4_3_lib4(n, alpha, A, sda, B);
+ A += 4*sda;
+ B += 4*sdb;
+ }
+ // clean up at the end
+ if(ii<m)
+ {
+ if(m-ii==1)
+ kernel_dgead_1_0_lib4(n, alpha, A+3, B);
+ else if(m-ii==2)
+ kernel_dgead_2_3_lib4(n, alpha, A, sda, B);
+ else // if(m-ii==3)
+ kernel_dgead_3_3_lib4(n, alpha, A, sda, B);
+ }
+ }
+
+ }
+
+
+
+// scales and adds a strvec into a strvec
+void dvecad_libstr(int m, double alpha, struct d_strvec *sa, int ai, struct d_strvec *sc, int ci)
+ {
+ double *pa = sa->pa + ai;
+ double *pc = sc->pa + ci;
+ int ii;
+ ii = 0;
+ for(; ii<m-3; ii+=4)
+ {
+ pc[ii+0] += alpha*pa[ii+0];
+ pc[ii+1] += alpha*pa[ii+1];
+ pc[ii+2] += alpha*pa[ii+2];
+ pc[ii+3] += alpha*pa[ii+3];
+ }
+ for(; ii<m; ii++)
+ {
+ pc[ii+0] += alpha*pa[ii+0];
+ }
+ return;
+ }
+
+
+
+// transpose a general matrix; m and n refer to the original (non-transposed) matrix
+void dgetr_lib(int m, int n, double alpha, int offsetA, double *pA, int sda, int offsetC, double *pC, int sdc)
+ {
+
+/*
+
+m = 5
+n = 3
+offsetA = 1
+offsetC = 2
+
+A =
+ x x x
+ -
+ x x x
+ x x x
+ x x x
+ x x x
+
+C =
+ x x x x x
+ x x x x x
+ -
+ x x x x x
+
+*/
+
+ if(m<=0 || n<=0)
+ return;
+
+ const int bs = 4;
+
+ int mna = (bs-offsetA%bs)%bs;
+ mna = m<mna ? m : mna;
+ int nna = (bs-offsetC%bs)%bs;
+ nna = n<nna ? n : nna;
+
+ int ii;
+
+ ii = 0;
+
+ if(mna>0)
+ {
+ if(mna==1)
+ kernel_dgetr_1_lib4(0, n, nna, alpha, pA, pC, sdc);
+ else if(mna==2)
+ kernel_dgetr_2_lib4(0, n, nna, alpha, pA, pC, sdc);
+ else //if(mna==3)
+ kernel_dgetr_3_lib4(0, n, nna, alpha, pA, pC, sdc);
+ ii += mna;
+ pA += mna + bs*(sda-1);
+ pC += mna*bs;
+ }
+#if defined(TARGET_X64_INTEL_HASWELL)
+ for( ; ii<m-7; ii+=8)
+ {
+ kernel_dgetr_8_lib4(0, n, nna, alpha, pA, sda, pC, sdc);
+ pA += 2*bs*sda;
+ pC += 2*bs*bs;
+ }
+#endif
+ for( ; ii<m-3; ii+=4)
+// for( ; ii<m; ii+=4)
+ {
+ kernel_dgetr_4_lib4(0, n, nna, alpha, pA, pC, sdc);
+ pA += bs*sda;
+ pC += bs*bs;
+ }
+
+ // clean-up at the end using smaller kernels
+ if(ii==m)
+ return;
+
+ if(m-ii==1)
+ kernel_dgetr_1_lib4(0, n, nna, alpha, pA, pC, sdc);
+ else if(m-ii==2)
+ kernel_dgetr_2_lib4(0, n, nna, alpha, pA, pC, sdc);
+ else if(m-ii==3)
+ kernel_dgetr_3_lib4(0, n, nna, alpha, pA, pC, sdc);
+
+ return;
+
+ }
+
+
+
+// transpose lower triangular matrix
+void dtrtr_l_lib(int m, double alpha, int offsetA, double *pA, int sda, int offsetC, double *pC, int sdc)
+ {
+
+/*
+
+A =
+ x
+ x x
+ x x x
+ x x x x
+
+ x x x x x
+ x x x x x x
+ x x x x x x x
+ x x x x x x x x
+
+C =
+ x x x x x x x x
+
+ x x x x x x x
+ x x x x x x
+ x x x x x
+ x x x x
+
+ x x x
+ x x
+ x
+
+*/
+
+ int n = m;
+
+ if(m<=0 || n<=0)
+ return;
+
+ const int bs = 4;
+
+ int mna = (bs-offsetA%bs)%bs;
+ mna = m<mna ? m : mna;
+ int nna = (bs-offsetC%bs)%bs;
+ nna = n<nna ? n : nna;
+
+ int ii;
+
+ ii = 0;
+
+ if(mna>0)
+ {
+ if(mna==1)
+ {
+ pC[0] = alpha * pA[0];
+ }
+ else if(mna==2)
+ {
+ if(nna==1)
+ {
+ pC[0+bs*0] = alpha * pA[0+bs*0];
+ pC[0+bs*1] = alpha * pA[1+bs*0];
+ pC[1+bs*(0+sdc)] = alpha * pA[1+bs*1];
+ }
+ else
+ {
+ pC[0+bs*0] = alpha * pA[0+bs*0];
+ pC[0+bs*1] = alpha * pA[1+bs*0];
+ pC[1+bs*1] = alpha * pA[1+bs*1];
+ }
+ }
+ else //if(mna==3)
+ {
+ if(nna==1)
+ {
+ pC[0+bs*0] = alpha * pA[0+bs*0];
+ pC[0+bs*1] = alpha * pA[1+bs*0];
+ pC[0+bs*2] = alpha * pA[2+bs*0];
+ pC[1+bs*(0+sdc)] = alpha * pA[1+bs*1];
+ pC[1+bs*(1+sdc)] = alpha * pA[2+bs*1];
+ pC[2+bs*(1+sdc)] = alpha * pA[2+bs*2];
+ }
+ else if(nna==2)
+ {
+ pC[0+bs*0] = alpha * pA[0+bs*0];
+ pC[0+bs*1] = alpha * pA[1+bs*0];
+ pC[0+bs*2] = alpha * pA[2+bs*0];
+ pC[1+bs*1] = alpha * pA[1+bs*1];
+ pC[1+bs*2] = alpha * pA[2+bs*1];
+ pC[2+bs*(1+sdc)] = alpha * pA[2+bs*2];
+ }
+ else
+ {
+ pC[0+bs*0] = alpha * pA[0+bs*0];
+ pC[0+bs*1] = alpha * pA[1+bs*0];
+ pC[0+bs*2] = alpha * pA[2+bs*0];
+ pC[1+bs*1] = alpha * pA[1+bs*1];
+ pC[1+bs*2] = alpha * pA[2+bs*1];
+ pC[2+bs*2] = alpha * pA[2+bs*2];
+ }
+ }
+ ii += mna;
+ pA += mna + bs*(sda-1);
+ pC += mna*bs;
+ }
+#if 0 //defined(TARGET_X64_INTEL_HASWELL)
+ for( ; ii<m-7; ii+=8)
+ {
+ kernel_dgetr_8_lib4(1, n, nna, alpha, pA, sda, pC, sdc);
+ pA += 2*bs*sda;
+ pC += 2*bs*bs;
+ }
+#endif
+ for( ; ii<m-3; ii+=4)
+ {
+ kernel_dgetr_4_lib4(1, ii, nna, alpha, pA, pC, sdc);
+ pA += bs*sda;
+ pC += bs*bs;
+ }
+
+ // clean-up at the end using smaller kernels
+ if(ii==m)
+ return;
+
+ if(m-ii==1)
+ kernel_dgetr_1_lib4(1, ii, nna, alpha, pA, pC, sdc);
+ else if(m-ii==2)
+ kernel_dgetr_2_lib4(1, ii, nna, alpha, pA, pC, sdc);
+ else if(m-ii==3)
+ kernel_dgetr_3_lib4(1, ii, nna, alpha, pA, pC, sdc);
+
+ return;
+
+ }
+
+
+
+// transpose an aligned upper triangular matrix into an aligned lower triangular matrix
+void dtrtr_u_lib(int m, double alpha, int offsetA, double *pA, int sda, int offsetC, double *pC, int sdc)
+ {
+
+/*
+
+A =
+ x x x x x x x x
+ x x x x x x x
+
+ x x x x x x
+ x x x x x
+ x x x x
+ x x x
+ x x
+ x
+
+C =
+ x
+
+ x x
+ x x x
+ x x x x
+ x x x x x
+ x x x x x x
+ x x x x x x x
+ x x x x x x x x
+
+*/
+
+ int n = m;
+
+ if(m<=0 || n<=0)
+ return;
+
+ const int bs = 4;
+
+ int mna = (bs-offsetA%bs)%bs;
+ mna = m<mna ? m : mna;
+ int nna = (bs-offsetC%bs)%bs;
+ nna = n<nna ? n : nna;
+ int tna = nna;
+
+ int ii;
+
+ ii = 0;
+
+ if(mna>0)
+ {
+ if(mna==1)
+ {
+ kernel_dgetr_1_lib4(0, n, nna, alpha, pA, pC, sdc);
+ if(nna!=1)
+ {
+// pC[0+bs*0] = alpha * pA[0+bs*0];
+ pA += 1*bs;
+ pC += 1;
+ tna = (bs-(offsetC+1)%bs)%bs;
+ }
+ else //if(nna==1)
+ {
+// pC[0+bs*0] = alpha * pA[0+bs*0];
+ pA += 1*bs;
+ pC += 1 + (sdc-1)*bs;
+ tna = 0; //(bs-(offsetC+1)%bs)%bs;
+ }
+// kernel_dgetr_1_lib4(0, n-1, tna, alpha, pA, pC, sdc);
+ }
+ else if(mna==2)
+ {
+ if(nna==0 || nna==3)
+ {
+ pC[0+bs*0] = alpha * pA[0+bs*0];
+ pC[1+bs*0] = alpha * pA[0+bs*1];
+ pC[1+bs*1] = alpha * pA[1+bs*1];
+ pA += 2*bs;
+ pC += 2;
+ tna = (bs-(offsetC+2)%bs)%bs;
+ kernel_dgetr_2_lib4(0, n-2, tna, alpha, pA, pC, sdc);
+ }
+ else if(nna==1)
+ {
+ pC[0+bs*0] = alpha * pA[0+bs*0];
+ pA += 1*bs;
+ pC += 1 + (sdc-1)*bs;
+// pC[0+bs*0] = alpha * pA[0+bs*0];
+// pC[0+bs*1] = alpha * pA[1+bs*0];
+ kernel_dgetr_2_lib4(0, n-1, 0, alpha, pA, pC, sdc);
+ pA += 1*bs;
+ pC += 1;
+ tna = 3; //(bs-(offsetC+2)%bs)%bs;
+// kernel_dgetr_2_lib4(0, n-2, tna, alpha, pA, pC, sdc);
+ }
+ else if(nna==2)
+ {
+ pC[0+bs*0] = alpha * pA[0+bs*0];
+ pC[1+bs*0] = alpha * pA[0+bs*1];
+ pC[1+bs*1] = alpha * pA[1+bs*1];
+ pA += 2*bs;
+ pC += 2 + (sdc-1)*bs;
+ tna = 0; //(bs-(offsetC+2)%bs)%bs;
+ kernel_dgetr_2_lib4(0, n-2, tna, alpha, pA, pC, sdc);
+ }
+ }
+ else //if(mna==3)
+ {
+ if(nna==0)
+ {
+ pC[0+bs*0] = alpha * pA[0+bs*0];
+ pC[1+bs*0] = alpha * pA[0+bs*1];
+ pC[1+bs*1] = alpha * pA[1+bs*1];
+ pC[2+bs*0] = alpha * pA[0+bs*2];
+ pC[2+bs*1] = alpha * pA[1+bs*2];
+ pC[2+bs*2] = alpha * pA[2+bs*2];
+ pA += 3*bs;
+ pC += 3;
+ tna = 1;
+ kernel_dgetr_3_lib4(0, n-3, tna, alpha, pA, pC, sdc);
+ }
+ else if(nna==1)
+ {
+ pC[0+bs*0] = alpha * pA[0+bs*0];
+ pA += bs;
+ pC += 1 + (sdc-1)*bs;
+ pC[0+bs*0] = alpha * pA[0+bs*0];
+ pC[0+bs*1] = alpha * pA[1+bs*0];
+ pC[1+bs*0] = alpha * pA[0+bs*1];
+ pC[1+bs*1] = alpha * pA[1+bs*1];
+ pC[1+bs*2] = alpha * pA[2+bs*1];
+ pA += 2*bs;
+ pC += 2;
+ tna = 2;
+ kernel_dgetr_3_lib4(0, n-3, tna, alpha, pA, pC, sdc);
+ }
+ else if(nna==2)
+ {
+ pC[0+bs*0] = alpha * pA[0+bs*0];
+ pC[1+bs*0] = alpha * pA[0+bs*1];
+ pC[1+bs*1] = alpha * pA[1+bs*1];
+ pA += 2*bs;
+ pC += 2 + (sdc-1)*bs;
+// pC[0+bs*0] = alpha * pA[0+bs*0];
+// pC[0+bs*1] = alpha * pA[1+bs*0];
+// pC[0+bs*2] = alpha * pA[2+bs*0];
+ kernel_dgetr_3_lib4(0, n-2, 0, alpha, pA, pC, sdc);
+ pA += 1*bs;
+ pC += 1;
+ tna = 3;
+// kernel_dgetr_3_lib4(0, n-3, tna, alpha, pA, pC, sdc);
+ }
+ else //if(nna==3)
+ {
+ pC[0+bs*0] = alpha * pA[0+bs*0];
+ pC[1+bs*0] = alpha * pA[0+bs*1];
+ pC[1+bs*1] = alpha * pA[1+bs*1];
+ pC[2+bs*0] = alpha * pA[0+bs*2];
+ pC[2+bs*1] = alpha * pA[1+bs*2];
+ pC[2+bs*2] = alpha * pA[2+bs*2];
+ pA += 3*bs;
+ pC += 3 + (sdc-1)*bs;
+ tna = 0;
+ kernel_dgetr_3_lib4(0, n-3, tna, alpha, pA, pC, sdc);
+ }
+ }
+ ii += mna;
+ pA += mna + bs*(sda-1);
+ pC += mna*bs;
+ }
+#if 0 //defined(TARGET_X64_AVX2)
+ for( ; ii<m-7; ii+=8)
+ {
+ kernel_dgetr_8_lib4(0, n, nna, alpha, pA, sda, pC, sdc);
+ pA += 2*bs*sda;
+ pC += 2*bs*bs;
+ }
+#endif
+ for( ; ii<m-3; ii+=4)
+ {
+ if(tna==0)
+ {
+ pC[0+bs*0] = alpha * pA[0+bs*0];
+ pC[1+bs*0] = alpha * pA[0+bs*1];
+ pC[1+bs*1] = alpha * pA[1+bs*1];
+ pC[2+bs*0] = alpha * pA[0+bs*2];
+ pC[2+bs*1] = alpha * pA[1+bs*2];
+ pC[2+bs*2] = alpha * pA[2+bs*2];
+ pC[3+bs*0] = alpha * pA[0+bs*3];
+ pC[3+bs*1] = alpha * pA[1+bs*3];
+ pC[3+bs*2] = alpha * pA[2+bs*3];
+ pC[3+bs*3] = alpha * pA[3+bs*3];
+ pA += 4*bs;
+ pC += sdc*bs;
+ kernel_dgetr_4_lib4(0, n-ii-4, 0, alpha, pA, pC, sdc);
+ }
+ else if(tna==1)
+ {
+ pC[0+bs*0] = alpha * pA[0+bs*0];
+ pA += bs;
+ pC += 1 + (sdc-1)*bs;
+ pC[0+bs*0] = alpha * pA[0+bs*0];
+ pC[0+bs*1] = alpha * pA[1+bs*0];
+ pC[1+bs*0] = alpha * pA[0+bs*1];
+ pC[1+bs*1] = alpha * pA[1+bs*1];
+ pC[1+bs*2] = alpha * pA[2+bs*1];
+ pC[2+bs*0] = alpha * pA[0+bs*2];
+ pC[2+bs*1] = alpha * pA[1+bs*2];
+ pC[2+bs*2] = alpha * pA[2+bs*2];
+ pC[2+bs*3] = alpha * pA[3+bs*2];
+ pA += 3*bs;
+ pC += 3;
+ kernel_dgetr_4_lib4(0, n-ii-4, 1, alpha, pA, pC, sdc);
+ }
+ else if(tna==2)
+ {
+ pC[0+bs*0] = alpha * pA[0+bs*0];
+ pC[1+bs*0] = alpha * pA[0+bs*1];
+ pC[1+bs*1] = alpha * pA[1+bs*1];
+ pA += 2*bs;
+ pC += 2 + (sdc-1)*bs;
+ pC[0+bs*0] = alpha * pA[0+bs*0];
+ pC[0+bs*1] = alpha * pA[1+bs*0];
+ pC[0+bs*2] = alpha * pA[2+bs*0];
+ pC[1+bs*0] = alpha * pA[0+bs*1];
+ pC[1+bs*1] = alpha * pA[1+bs*1];
+ pC[1+bs*2] = alpha * pA[2+bs*1];
+ pC[1+bs*3] = alpha * pA[3+bs*1];
+ pA += 2*bs;
+ pC += 2;
+ kernel_dgetr_4_lib4(0, n-ii-4, 2, alpha, pA, pC, sdc);
+ }
+ else //if(tna==3)
+ {
+ pC[0+bs*0] = alpha * pA[0+bs*0];
+ pC[1+bs*0] = alpha * pA[0+bs*1];
+ pC[1+bs*1] = alpha * pA[1+bs*1];
+ pC[2+bs*0] = alpha * pA[0+bs*2];
+ pC[2+bs*1] = alpha * pA[1+bs*2];
+ pC[2+bs*2] = alpha * pA[2+bs*2];
+ pA += 3*bs;
+ pC += 3 + (sdc-1)*bs;
+ kernel_dgetr_4_lib4(0, n-ii-3, 0, alpha, pA, pC, sdc);
+// pC[0+bs*0] = alpha * pA[0+bs*0];
+// pC[0+bs*1] = alpha * pA[1+bs*0];
+// pC[0+bs*2] = alpha * pA[2+bs*0];
+// pC[0+bs*3] = alpha * pA[3+bs*0];
+ pA += bs;
+ pC += 1;
+// kernel_dgetr_4_lib4(0, n-ii-4, tna, alpha, pA, pC, sdc);
+ }
+ pA += bs*sda;
+ pC += bs*bs;
+ }
+
+ // clean-up at the end
+ if(ii==m)
+ return;
+
+ if(m-ii==1)
+ {
+ pC[0+bs*0] = alpha * pA[0+bs*0];
+ }
+ else if(m-ii==2)
+ {
+ if(tna!=1)
+ {
+ pC[0+bs*0] = alpha * pA[0+bs*0];
+ pC[1+bs*0] = alpha * pA[0+bs*1];
+ pC[1+bs*1] = alpha * pA[1+bs*1];
+ }
+ else //if(tna==1)
+ {
+ pC[0+bs*0] = alpha * pA[0+bs*0];
+ pA += bs;
+ pC += 1 + (sdc-1)*bs;
+ pC[0+bs*0] = alpha * pA[0+bs*0];
+ pC[0+bs*1] = alpha * pA[1+bs*0];
+ }
+ }
+ else if(m-ii==3)
+ {
+ if(tna==0 || tna==3)
+ {
+ pC[0+bs*0] = alpha * pA[0+bs*0];
+ pC[1+bs*0] = alpha * pA[0+bs*1];
+ pC[1+bs*1] = alpha * pA[1+bs*1];
+ pC[2+bs*0] = alpha * pA[0+bs*2];
+ pC[2+bs*1] = alpha * pA[1+bs*2];
+ pC[2+bs*2] = alpha * pA[2+bs*2];
+ }
+ else if(tna==1)
+ {
+ pC[0+bs*0] = alpha * pA[0+bs*0];
+ pA += bs;
+ pC += 1 + (sdc-1)*bs;
+ pC[0+bs*0] = alpha * pA[0+bs*0];
+ pC[0+bs*1] = alpha * pA[1+bs*0];
+	pC[1+bs*0] = alpha * pA[0+bs*1];
+ pC[1+bs*1] = alpha * pA[1+bs*1];
+ pC[1+bs*2] = alpha * pA[2+bs*1];
+ }
+ else //if(tna==2)
+ {
+ pC[0+bs*0] = alpha * pA[0+bs*0];
+ pC[1+bs*0] = alpha * pA[0+bs*1];
+ pC[1+bs*1] = alpha * pA[1+bs*1];
+ pA += 2*bs;
+ pC += 2 + (sdc-1)*bs;
+ pC[0+bs*0] = alpha * pA[0+bs*0];
+ pC[0+bs*1] = alpha * pA[1+bs*0];
+ pC[0+bs*2] = alpha * pA[2+bs*0];
+ }
+ }
+
+ return;
+
+ }
+
+
+
+// regularize diagonal
+void ddiareg_lib(int kmax, double reg, int offset, double *pD, int sdd)
+ {
+
+ const int bs = 4;
+
+ int kna = (bs-offset%bs)%bs;
+ kna = kmax<kna ? kmax : kna;
+
+ int jj, ll;
+
+ if(kna>0)
+ {
+ for(ll=0; ll<kna; ll++)
+ {
+ pD[ll+bs*ll] += reg;
+ }
+ pD += kna + bs*(sdd-1) + kna*bs;
+ kmax -= kna;
+ }
+ for(jj=0; jj<kmax-3; jj+=4)
+ {
+ pD[jj*sdd+(jj+0)*bs+0] += reg;
+ pD[jj*sdd+(jj+1)*bs+1] += reg;
+ pD[jj*sdd+(jj+2)*bs+2] += reg;
+ pD[jj*sdd+(jj+3)*bs+3] += reg;
+ }
+ for(ll=0; ll<kmax-jj; ll++)
+ {
+ pD[jj*sdd+(jj+ll)*bs+ll] += reg;
+ }
+
+ }
+
+
+
+// insert sqrt of vector to diagonal
+void ddiain_sqrt_lib(int kmax, double *x, int offset, double *pD, int sdd)
+ {
+
+ const int bs = 4;
+
+ int kna = (bs-offset%bs)%bs;
+ kna = kmax<kna ? kmax : kna;
+
+ int jj, ll;
+
+ if(kna>0)
+ {
+ for(ll=0; ll<kna; ll++)
+ {
+ pD[ll+bs*ll] = sqrt(x[ll]);
+ }
+ pD += kna + bs*(sdd-1) + kna*bs;
+ x += kna;
+ kmax -= kna;
+ }
+ for(jj=0; jj<kmax-3; jj+=4)
+ {
+ pD[jj*sdd+(jj+0)*bs+0] = sqrt(x[jj+0]);
+ pD[jj*sdd+(jj+1)*bs+1] = sqrt(x[jj+1]);
+ pD[jj*sdd+(jj+2)*bs+2] = sqrt(x[jj+2]);
+ pD[jj*sdd+(jj+3)*bs+3] = sqrt(x[jj+3]);
+ }
+ for(ll=0; ll<kmax-jj; ll++)
+ {
+ pD[jj*sdd+(jj+ll)*bs+ll] = sqrt(x[jj+ll]);
+ }
+
+ }
+
+
+
+// extract diagonal to vector
+void ddiaex_lib(int kmax, double alpha, int offset, double *pD, int sdd, double *x)
+ {
+
+ const int bs = 4;
+
+ int kna = (bs-offset%bs)%bs;
+ kna = kmax<kna ? kmax : kna;
+
+ int jj, ll;
+
+ if(kna>0)
+ {
+ for(ll=0; ll<kna; ll++)
+ {
+ x[ll] = alpha * pD[ll+bs*ll];
+ }
+ pD += kna + bs*(sdd-1) + kna*bs;
+ x += kna;
+ kmax -= kna;
+ }
+ for(jj=0; jj<kmax-3; jj+=4)
+ {
+ x[jj+0] = alpha * pD[jj*sdd+(jj+0)*bs+0];
+ x[jj+1] = alpha * pD[jj*sdd+(jj+1)*bs+1];
+ x[jj+2] = alpha * pD[jj*sdd+(jj+2)*bs+2];
+ x[jj+3] = alpha * pD[jj*sdd+(jj+3)*bs+3];
+ }
+ for(ll=0; ll<kmax-jj; ll++)
+ {
+ x[jj+ll] = alpha * pD[jj*sdd+(jj+ll)*bs+ll];
+ }
+
+ }
+
+
+
+// add scaled vector to diagonal
+void ddiaad_lib(int kmax, double alpha, double *x, int offset, double *pD, int sdd)
+ {
+
+ const int bs = 4;
+
+ int kna = (bs-offset%bs)%bs;
+ kna = kmax<kna ? kmax : kna;
+
+ int jj, ll;
+
+ if(kna>0)
+ {
+ for(ll=0; ll<kna; ll++)
+ {
+ pD[ll+bs*ll] += alpha * x[ll];
+ }
+ pD += kna + bs*(sdd-1) + kna*bs;
+ x += kna;
+ kmax -= kna;
+ }
+ for(jj=0; jj<kmax-3; jj+=4)
+ {
+ pD[jj*sdd+(jj+0)*bs+0] += alpha * x[jj+0];
+ pD[jj*sdd+(jj+1)*bs+1] += alpha * x[jj+1];
+ pD[jj*sdd+(jj+2)*bs+2] += alpha * x[jj+2];
+ pD[jj*sdd+(jj+3)*bs+3] += alpha * x[jj+3];
+ }
+ for(ll=0; ll<kmax-jj; ll++)
+ {
+ pD[jj*sdd+(jj+ll)*bs+ll] += alpha * x[jj+ll];
+ }
+
+ }
+
+
+
+// insert vector to diagonal, sparse formulation
+void ddiain_libsp(int kmax, int *idx, double alpha, double *x, double *pD, int sdd)
+ {
+
+ const int bs = 4;
+
+ int ii, jj;
+
+ for(jj=0; jj<kmax; jj++)
+ {
+ ii = idx[jj];
+ pD[ii/bs*bs*sdd+ii%bs+ii*bs] = alpha * x[jj];
+ }
+
+ }
+
+
+
+// extract diagonal to vector, sparse formulation
+void ddiaex_libsp(int kmax, int *idx, double alpha, double *pD, int sdd, double *x)
+ {
+
+ const int bs = 4;
+
+ int ii, jj;
+
+ for(jj=0; jj<kmax; jj++)
+ {
+ ii = idx[jj];
+ x[jj] = alpha * pD[ii/bs*bs*sdd+ii%bs+ii*bs];
+ }
+
+ }
+
+
+
+// add scaled vector to diagonal, sparse formulation
+void ddiaad_libsp(int kmax, int *idx, double alpha, double *x, double *pD, int sdd)
+ {
+
+ const int bs = 4;
+
+ int ii, jj;
+
+ for(jj=0; jj<kmax; jj++)
+ {
+ ii = idx[jj];
+ pD[ii/bs*bs*sdd+ii%bs+ii*bs] += alpha * x[jj];
+ }
+
+ }
+
+
+
+// add scaled vector to another vector and insert to diagonal, sparse formulation
+void ddiaadin_libsp(int kmax, int *idx, double alpha, double *x, double *y, double *pD, int sdd)
+ {
+
+ const int bs = 4;
+
+ int ii, jj;
+
+ for(jj=0; jj<kmax; jj++)
+ {
+ ii = idx[jj];
+ pD[ii/bs*bs*sdd+ii%bs+ii*bs] = y[jj] + alpha * x[jj];
+ }
+
+ }
+
+
+
+// insert vector to row
+void drowin_lib(int kmax, double alpha, double *x, double *pD)
+ {
+
+ const int bs = 4;
+
+ int jj, ll;
+
+ for(jj=0; jj<kmax-3; jj+=4)
+ {
+ pD[(jj+0)*bs] = alpha*x[jj+0];
+ pD[(jj+1)*bs] = alpha*x[jj+1];
+ pD[(jj+2)*bs] = alpha*x[jj+2];
+ pD[(jj+3)*bs] = alpha*x[jj+3];
+ }
+ for(; jj<kmax; jj++)
+ {
+ pD[(jj)*bs] = alpha*x[jj];
+ }
+
+ }
+
+
+
+// extract row to vector
+void drowex_lib(int kmax, double alpha, double *pD, double *x)
+ {
+
+ const int bs = 4;
+
+ int jj, ll;
+
+ for(jj=0; jj<kmax-3; jj+=4)
+ {
+ x[jj+0] = alpha*pD[(jj+0)*bs];
+ x[jj+1] = alpha*pD[(jj+1)*bs];
+ x[jj+2] = alpha*pD[(jj+2)*bs];
+ x[jj+3] = alpha*pD[(jj+3)*bs];
+ }
+ for(; jj<kmax; jj++)
+ {
+ x[jj] = alpha*pD[(jj)*bs];
+ }
+
+ }
+
+
+
+// add scaled vector to row
+void drowad_lib(int kmax, double alpha, double *x, double *pD)
+ {
+
+ const int bs = 4;
+
+ int jj, ll;
+
+ for(jj=0; jj<kmax-3; jj+=4)
+ {
+ pD[(jj+0)*bs] += alpha * x[jj+0];
+ pD[(jj+1)*bs] += alpha * x[jj+1];
+ pD[(jj+2)*bs] += alpha * x[jj+2];
+ pD[(jj+3)*bs] += alpha * x[jj+3];
+ }
+ for(; jj<kmax; jj++)
+ {
+ pD[(jj)*bs] += alpha * x[jj];
+ }
+
+ }
+
+
+
+// insert vector to row, sparse formulation
+void drowin_libsp(int kmax, double alpha, int *idx, double *x, double *pD)
+ {
+
+ const int bs = 4;
+
+ int ii, jj;
+
+ for(jj=0; jj<kmax; jj++)
+ {
+ ii = idx[jj];
+ pD[ii*bs] = alpha*x[jj];
+ }
+
+ }
+
+
+
+// add scaled vector to row, sparse formulation
+void drowad_libsp(int kmax, int *idx, double alpha, double *x, double *pD)
+ {
+
+ const int bs = 4;
+
+ int ii, jj;
+
+ for(jj=0; jj<kmax; jj++)
+ {
+ ii = idx[jj];
+ pD[ii*bs] += alpha * x[jj];
+ }
+
+ }
+
+
+
+// add scaled vector to another vector and insert to row, sparse formulation
+void drowadin_libsp(int kmax, int *idx, double alpha, double *x, double *y, double *pD)
+ {
+
+ const int bs = 4;
+
+ int ii, jj;
+
+ for(jj=0; jj<kmax; jj++)
+ {
+ ii = idx[jj];
+ pD[ii*bs] = y[jj] + alpha * x[jj];
+ }
+
+ }
+
+
+
+// swap two rows
+void drowsw_lib(int kmax, double *pA, double *pC)
+ {
+
+ const int bs = 4;
+
+ int ii;
+ double tmp;
+
+ for(ii=0; ii<kmax-3; ii+=4)
+ {
+ tmp = pA[0+bs*0];
+ pA[0+bs*0] = pC[0+bs*0];
+ pC[0+bs*0] = tmp;
+ tmp = pA[0+bs*1];
+ pA[0+bs*1] = pC[0+bs*1];
+ pC[0+bs*1] = tmp;
+ tmp = pA[0+bs*2];
+ pA[0+bs*2] = pC[0+bs*2];
+ pC[0+bs*2] = tmp;
+ tmp = pA[0+bs*3];
+ pA[0+bs*3] = pC[0+bs*3];
+ pC[0+bs*3] = tmp;
+ pA += 4*bs;
+ pC += 4*bs;
+ }
+ for( ; ii<kmax; ii++)
+ {
+ tmp = pA[0+bs*0];
+ pA[0+bs*0] = pC[0+bs*0];
+ pC[0+bs*0] = tmp;
+ pA += 1*bs;
+ pC += 1*bs;
+ }
+
+ }
+
+
+
+// extract vector from column
+void dcolex_lib(int kmax, int offset, double *pD, int sdd, double *x)
+ {
+
+ const int bs = 4;
+
+ int kna = (bs-offset%bs)%bs;
+ kna = kmax<kna ? kmax : kna;
+
+ int jj, ll;
+
+ if(kna>0)
+ {
+ for(ll=0; ll<kna; ll++)
+ {
+ x[ll] = pD[ll];
+ }
+ pD += kna + bs*(sdd-1);
+ x += kna;
+ kmax -= kna;
+ }
+ for(jj=0; jj<kmax-3; jj+=4)
+ {
+ x[jj+0] = pD[jj*sdd+0];
+ x[jj+1] = pD[jj*sdd+1];
+ x[jj+2] = pD[jj*sdd+2];
+ x[jj+3] = pD[jj*sdd+3];
+ }
+ for(ll=0; ll<kmax-jj; ll++)
+ {
+ x[jj+ll] = pD[jj*sdd+ll];
+ }
+
+ }
+
+
+
+// insert vector to column
+void dcolin_lib(int kmax, double *x, int offset, double *pD, int sdd)
+ {
+
+ const int bs = 4;
+
+ int kna = (bs-offset%bs)%bs;
+ kna = kmax<kna ? kmax : kna;
+
+ int jj, ll;
+
+ if(kna>0)
+ {
+ for(ll=0; ll<kna; ll++)
+ {
+ pD[ll] = x[ll];
+ }
+ pD += kna + bs*(sdd-1);
+ x += kna;
+ kmax -= kna;
+ }
+ for(jj=0; jj<kmax-3; jj+=4)
+ {
+ pD[jj*sdd+0] = x[jj+0];
+ pD[jj*sdd+1] = x[jj+1];
+ pD[jj*sdd+2] = x[jj+2];
+ pD[jj*sdd+3] = x[jj+3];
+ }
+ for(ll=0; ll<kmax-jj; ll++)
+ {
+ pD[jj*sdd+ll] = x[jj+ll];
+ }
+
+ }
+
+
+
+// add scaled vector to column
+void dcolad_lib(int kmax, double alpha, double *x, int offset, double *pD, int sdd)
+ {
+
+ const int bs = 4;
+
+ int kna = (bs-offset%bs)%bs;
+ kna = kmax<kna ? kmax : kna;
+
+ int jj, ll;
+
+ if(kna>0)
+ {
+ for(ll=0; ll<kna; ll++)
+ {
+ pD[ll] += alpha * x[ll];
+ }
+ pD += kna + bs*(sdd-1);
+ x += kna;
+ kmax -= kna;
+ }
+ for(jj=0; jj<kmax-3; jj+=4)
+ {
+ pD[jj*sdd+0] += alpha * x[jj+0];
+ pD[jj*sdd+1] += alpha * x[jj+1];
+ pD[jj*sdd+2] += alpha * x[jj+2];
+ pD[jj*sdd+3] += alpha * x[jj+3];
+ }
+ for(ll=0; ll<kmax-jj; ll++)
+ {
+ pD[jj*sdd+ll] += alpha * x[jj+ll];
+ }
+
+ }
+
+
+
+// insert vector to column, sparse formulation
+void dcolin_libsp(int kmax, int *idx, double *x, double *pD, int sdd)
+ {
+
+ const int bs = 4;
+
+ int ii, jj;
+
+ for(jj=0; jj<kmax; jj++)
+ {
+ ii = idx[jj];
+ pD[ii/bs*bs*sdd+ii%bs] = x[jj];
+ }
+
+ }
+
+
+
+// add scaled vector to column, sparse formulation
+void dcolad_libsp(int kmax, double alpha, int *idx, double *x, double *pD, int sdd)
+ {
+
+ const int bs = 4;
+
+ int ii, jj;
+
+ for(jj=0; jj<kmax; jj++)
+ {
+ ii = idx[jj];
+ pD[ii/bs*bs*sdd+ii%bs] += alpha * x[jj];
+ }
+
+ }
+
+
+
+// swaps two cols
+void dcolsw_lib(int kmax, int offsetA, double *pA, int sda, int offsetC, double *pC, int sdc)
+ {
+
+ const int bs = 4;
+
+ int ii;
+
+ double tmp;
+
+ if(offsetA==offsetC)
+ {
+ if(offsetA>0)
+ {
+ ii = 0;
+ for(; ii<bs-offsetA; ii++)
+ {
+ tmp = pA[0+bs*0];
+ pA[0+bs*0] = pC[0+bs*0];
+ pC[0+bs*0] = tmp;
+ pA += 1;
+ pC += 1;
+ }
+ pA += bs*(sda-1);
+ pC += bs*(sdc-1);
+ kmax -= bs-offsetA;
+ }
+ ii = 0;
+ for(; ii<kmax-3; ii+=4)
+ {
+ tmp = pA[0+bs*0];
+ pA[0+bs*0] = pC[0+bs*0];
+ pC[0+bs*0] = tmp;
+ tmp = pA[1+bs*0];
+ pA[1+bs*0] = pC[1+bs*0];
+ pC[1+bs*0] = tmp;
+ tmp = pA[2+bs*0];
+ pA[2+bs*0] = pC[2+bs*0];
+ pC[2+bs*0] = tmp;
+ tmp = pA[3+bs*0];
+ pA[3+bs*0] = pC[3+bs*0];
+ pC[3+bs*0] = tmp;
+ pA += bs*sda;
+ pC += bs*sdc;
+ }
+ for(; ii<kmax; ii++)
+ {
+ tmp = pA[0+bs*0];
+ pA[0+bs*0] = pC[0+bs*0];
+ pC[0+bs*0] = tmp;
+ pA += 1;
+ pC += 1;
+ }
+ }
+ else
+ {
+ printf("\ndcolsw: feature not implemented yet: offsetA!=offsetC\n\n");
+ exit(1);
+ }
+
+ return;
+
+ }
+
+
+
+// insert vector to vector, sparse formulation
+void dvecin_libsp(int kmax, int *idx, double *x, double *y)
+ {
+
+ int jj;
+
+ for(jj=0; jj<kmax; jj++)
+ {
+ y[idx[jj]] = x[jj];
+ }
+
+ }
+
+
+
+// adds vector to vector, sparse formulation
+void dvecad_libsp(int kmax, int *idx, double alpha, double *x, double *y)
+ {
+
+ int jj;
+
+ for(jj=0; jj<kmax; jj++)
+ {
+ y[idx[jj]] += alpha * x[jj];
+ }
+
+ }
+
+
+
+/****************************
+* new interface
+****************************/
+
+
+
+#if defined(LA_HIGH_PERFORMANCE)
+
+
+
+// return the memory size (in bytes) needed for a strmat
+int d_size_strmat(int m, int n)
+ {
+ const int bs = 4;
+ int nc = D_NC;
+ int al = bs*nc;
+ int pm = (m+bs-1)/bs*bs;
+ int cn = (n+nc-1)/nc*nc;
+ int tmp = m<n ? (m+al-1)/al*al : (n+al-1)/al*al; // al(min(m,n)) // XXX max ???
+ int memory_size = (pm*cn+tmp)*sizeof(double);
+ return memory_size;
+ }
+
+
+
+// return the memory size (in bytes) needed for the diagonal of a strmat
+int d_size_diag_strmat(int m, int n)
+ {
+ const int bs = 4;
+ int nc = D_NC;
+ int al = bs*nc;
+ int tmp = m<n ? (m+al-1)/al*al : (n+al-1)/al*al; // al(min(m,n)) // XXX max ???
+ int memory_size = tmp*sizeof(double);
+ return memory_size;
+ }
+
+
+
+// create a matrix structure for a matrix of size m*n by using memory passed by a pointer
+void d_create_strmat(int m, int n, struct d_strmat *sA, void *memory)
+ {
+ const int bs = 4;
+ int nc = D_NC;
+ int al = bs*nc;
+ sA->m = m;
+ sA->n = n;
+ int pm = (m+bs-1)/bs*bs;
+ int cn = (n+nc-1)/nc*nc;
+ sA->pm = pm;
+ sA->cn = cn;
+ double *ptr = (double *) memory;
+ sA->pA = ptr;
+ ptr += pm*cn;
+ int tmp = m<n ? (m+al-1)/al*al : (n+al-1)/al*al; // al(min(m,n)) // XXX max ???
+ sA->dA = ptr;
+ ptr += tmp;
+ sA->use_dA = 0;
+ sA->memory_size = (pm*cn+tmp)*sizeof(double);
+ return;
+ }
+
+
+
+// return memory size (in bytes) needed for a strvec
+int d_size_strvec(int m)
+ {
+ const int bs = 4;
+// int nc = D_NC;
+// int al = bs*nc;
+ int pm = (m+bs-1)/bs*bs;
+ int memory_size = pm*sizeof(double);
+ return memory_size;
+ }
+
+
+
+// create a vector structure for a vector of size m by using memory passed by a pointer
+void d_create_strvec(int m, struct d_strvec *sa, void *memory)
+ {
+ const int bs = 4;
+// int nc = D_NC;
+// int al = bs*nc;
+ sa->m = m;
+ int pm = (m+bs-1)/bs*bs;
+ sa->pm = pm;
+ double *ptr = (double *) memory;
+ sa->pa = ptr;
+// ptr += pm;
+ sa->memory_size = pm*sizeof(double);
+ return;
+ }
+
+
+
+// convert a matrix into a matrix structure
+void d_cvt_mat2strmat(int m, int n, double *A, int lda, struct d_strmat *sA, int ai, int aj)
+ {
+ const int bs = 4;
+ int sda = sA->cn;
+ double *pA = sA->pA + aj*bs + ai/bs*bs*sda + ai%bs;
+ int i, ii, j, jj, m0, m1, m2;
+ double *B, *pB;
+#if defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE)
+ __m256d
+ tmp;
+#endif
+ m0 = (bs-ai%bs)%bs;
+ if(m0>m)
+ m0 = m;
+ m1 = m - m0;
+ jj = 0;
+ for( ; jj<n-3; jj+=4)
+ {
+ B = A + jj*lda;
+ pB = pA + jj*bs;
+ ii = 0;
+ if(m0>0)
+ {
+ for( ; ii<m0; ii++)
+ {
+ pB[ii+bs*0] = B[ii+lda*0];
+ pB[ii+bs*1] = B[ii+lda*1];
+ pB[ii+bs*2] = B[ii+lda*2];
+ pB[ii+bs*3] = B[ii+lda*3];
+ }
+ B += m0;
+ pB += m0 + bs*(sda-1);
+ }
+#if defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE)
+ for( ; ii<m-3; ii+=4)
+ {
+ tmp = _mm256_loadu_pd( &B[0+lda*0] );
+ _mm256_store_pd( &pB[0+bs*0], tmp );
+ tmp = _mm256_loadu_pd( &B[0+lda*1] );
+ _mm256_store_pd( &pB[0+bs*1], tmp );
+ tmp = _mm256_loadu_pd( &B[0+lda*2] );
+ _mm256_store_pd( &pB[0+bs*2], tmp );
+ tmp = _mm256_loadu_pd( &B[0+lda*3] );
+ _mm256_store_pd( &pB[0+bs*3], tmp );
+ // update
+ B += 4;
+ pB += bs*sda;
+ }
+#else
+ for( ; ii<m-3; ii+=4)
+ {
+ // col 0
+ pB[0+bs*0] = B[0+lda*0];
+ pB[1+bs*0] = B[1+lda*0];
+ pB[2+bs*0] = B[2+lda*0];
+ pB[3+bs*0] = B[3+lda*0];
+ // col 1
+ pB[0+bs*1] = B[0+lda*1];
+ pB[1+bs*1] = B[1+lda*1];
+ pB[2+bs*1] = B[2+lda*1];
+ pB[3+bs*1] = B[3+lda*1];
+ // col 2
+ pB[0+bs*2] = B[0+lda*2];
+ pB[1+bs*2] = B[1+lda*2];
+ pB[2+bs*2] = B[2+lda*2];
+ pB[3+bs*2] = B[3+lda*2];
+ // col 3
+ pB[0+bs*3] = B[0+lda*3];
+ pB[1+bs*3] = B[1+lda*3];
+ pB[2+bs*3] = B[2+lda*3];
+ pB[3+bs*3] = B[3+lda*3];
+ // update
+ B += 4;
+ pB += bs*sda;
+ }
+#endif
+ for( ; ii<m; ii++)
+ {
+ // col 0
+ pB[0+bs*0] = B[0+lda*0];
+ // col 1
+ pB[0+bs*1] = B[0+lda*1];
+ // col 2
+ pB[0+bs*2] = B[0+lda*2];
+ // col 3
+ pB[0+bs*3] = B[0+lda*3];
+ // update
+ B += 1;
+ pB += 1;
+ }
+ }
+ for( ; jj<n; jj++)
+ {
+
+ B = A + jj*lda;
+ pB = pA + jj*bs;
+
+ ii = 0;
+ if(m0>0)
+ {
+ for( ; ii<m0; ii++)
+ {
+ pB[ii+bs*0] = B[ii+lda*0];
+ }
+ B += m0;
+ pB += m0 + bs*(sda-1);
+ }
+ for( ; ii<m-3; ii+=4)
+ {
+ // col 0
+ pB[0+bs*0] = B[0+lda*0];
+ pB[1+bs*0] = B[1+lda*0];
+ pB[2+bs*0] = B[2+lda*0];
+ pB[3+bs*0] = B[3+lda*0];
+ // update
+ B += 4;
+ pB += bs*sda;
+ }
+ for( ; ii<m; ii++)
+ {
+ // col 0
+ pB[0+bs*0] = B[0+lda*0];
+ // update
+ B += 1;
+ pB += 1;
+ }
+ }
+ return;
+ }
+
+
+
+// convert and transpose a matrix into a matrix structure
+void d_cvt_tran_mat2strmat(int m, int n, double *A, int lda, struct d_strmat *sA, int ai, int aj)
+ {
+ const int bs = 4;
+ int sda = sA->cn;
+ double *pA = sA->pA + aj*bs + ai/bs*bs*sda + ai%bs;
+ int i, ii, j, m0, m1, m2;
+ double *B, *pB;
+#if defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE)
+ __m256d
+ v0, v1, v2, v3,
+ v4, v5, v6, v7;
+#endif
+ m0 = (bs-ai%bs)%bs;
+ if(m0>n)
+ m0 = n;
+ m1 = n - m0;
+ ii = 0;
+ if(m0>0)
+ {
+ for(j=0; j<m; j++)
+ {
+ for(i=0; i<m0; i++)
+ {
+ pA[i+j*bs+ii*sda] = A[j+(i+ii)*lda];
+ }
+ }
+ A += m0*lda;
+ pA += m0 + bs*(sda-1);
+ }
+ ii = 0;
+ for(; ii<m1-3; ii+=bs)
+ {
+ j=0;
+ B = A + ii*lda;
+ pB = pA + ii*sda;
+#if defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE)
+ for(; j<m-3; j+=4)
+ {
+ v0 = _mm256_loadu_pd( &B[0+0*lda] ); // 00 10 20 30
+ v1 = _mm256_loadu_pd( &B[0+1*lda] ); // 01 11 21 31
+ v4 = _mm256_unpacklo_pd( v0, v1 ); // 00 01 20 21
+ v5 = _mm256_unpackhi_pd( v0, v1 ); // 10 11 30 31
+ v2 = _mm256_loadu_pd( &B[0+2*lda] ); // 02 12 22 32
+ v3 = _mm256_loadu_pd( &B[0+3*lda] ); // 03 13 23 33
+ v6 = _mm256_unpacklo_pd( v2, v3 ); // 02 03 22 23
+ v7 = _mm256_unpackhi_pd( v2, v3 ); // 12 13 32 33
+
+ B += 4;
+
+ v0 = _mm256_permute2f128_pd( v4, v6, 0x20 ); // 00 01 02 03
+ _mm256_store_pd( &pB[0+bs*0], v0 );
+ v2 = _mm256_permute2f128_pd( v4, v6, 0x31 ); // 20 21 22 23
+ _mm256_store_pd( &pB[0+bs*2], v2 );
+ v1 = _mm256_permute2f128_pd( v5, v7, 0x20 ); // 10 11 12 13
+ _mm256_store_pd( &pB[0+bs*1], v1 );
+ v3 = _mm256_permute2f128_pd( v5, v7, 0x31 ); // 30 31 32 33
+ _mm256_store_pd( &pB[0+bs*3], v3 );
+
+ pB += 4*bs;
+ }
+#else
+ for(; j<m-3; j+=4)
+ {
+ // unroll 0
+ pB[0+0*bs] = B[0+0*lda];
+ pB[1+0*bs] = B[0+1*lda];
+ pB[2+0*bs] = B[0+2*lda];
+ pB[3+0*bs] = B[0+3*lda];
+ // unroll 1
+ pB[0+1*bs] = B[1+0*lda];
+ pB[1+1*bs] = B[1+1*lda];
+ pB[2+1*bs] = B[1+2*lda];
+ pB[3+1*bs] = B[1+3*lda];
+ // unroll 2
+ pB[0+2*bs] = B[2+0*lda];
+ pB[1+2*bs] = B[2+1*lda];
+ pB[2+2*bs] = B[2+2*lda];
+ pB[3+2*bs] = B[2+3*lda];
+ // unroll 3
+ pB[0+3*bs] = B[3+0*lda];
+ pB[1+3*bs] = B[3+1*lda];
+ pB[2+3*bs] = B[3+2*lda];
+ pB[3+3*bs] = B[3+3*lda];
+ B += 4;
+ pB += 4*bs;
+ }
+#endif
+ for(; j<m; j++)
+ {
+ // unroll 0
+ pB[0+0*bs] = B[0+0*lda];
+ pB[1+0*bs] = B[0+1*lda];
+ pB[2+0*bs] = B[0+2*lda];
+ pB[3+0*bs] = B[0+3*lda];
+ B += 1;
+ pB += 1*bs;
+ }
+ }
+ if(ii<m1)
+ {
+ m2 = m1-ii;
+ if(bs<m2) m2 = bs;
+ for(j=0; j<m; j++)
+ {
+ for(i=0; i<m2; i++)
+ {
+ pA[i+j*bs+ii*sda] = A[j+(i+ii)*lda];
+ }
+ }
+ }
+ return;
+ }
+
+
+
+// convert a vector into a vector structure
+void d_cvt_vec2strvec(int m, double *a, struct d_strvec *sa, int ai)
+ {
+ double *pa = sa->pa + ai;
+ int ii;
+ for(ii=0; ii<m; ii++)
+ pa[ii] = a[ii];
+ return;
+ }
+
+
+
+// convert a matrix structure into a matrix
+void d_cvt_strmat2mat(int m, int n, struct d_strmat *sA, int ai, int aj, double *A, int lda)
+ {
+ const int bs = 4;
+ int sda = sA->cn;
+ double *pA = sA->pA + aj*bs + ai/bs*bs*sda + ai%bs;
+ int i, ii, jj;
+ int m0 = (bs-ai%bs)%bs;
+ double *ptr_pA;
+ jj=0;
+ for(; jj<n-3; jj+=4)
+ {
+ ptr_pA = pA + jj*bs;
+ ii = 0;
+ if(m0>0)
+ {
+ for(; ii<m0; ii++)
+ {
+ // unroll 0
+ A[ii+lda*(jj+0)] = ptr_pA[0+bs*0];
+ // unroll 1
+ A[ii+lda*(jj+1)] = ptr_pA[0+bs*1];
+ // unroll 2
+ A[ii+lda*(jj+2)] = ptr_pA[0+bs*2];
+ // unroll 3
+ A[ii+lda*(jj+3)] = ptr_pA[0+bs*3];
+ ptr_pA++;
+ }
+ ptr_pA += (sda-1)*bs;
+ }
+ for(; ii<m-bs+1; ii+=bs)
+ {
+ // unroll 0
+ A[0+ii+lda*(jj+0)] = ptr_pA[0+bs*0];
+ A[1+ii+lda*(jj+0)] = ptr_pA[1+bs*0];
+ A[2+ii+lda*(jj+0)] = ptr_pA[2+bs*0];
+ A[3+ii+lda*(jj+0)] = ptr_pA[3+bs*0];
+		// unroll 1
+ A[0+ii+lda*(jj+1)] = ptr_pA[0+bs*1];
+ A[1+ii+lda*(jj+1)] = ptr_pA[1+bs*1];
+ A[2+ii+lda*(jj+1)] = ptr_pA[2+bs*1];
+ A[3+ii+lda*(jj+1)] = ptr_pA[3+bs*1];
+		// unroll 2
+ A[0+ii+lda*(jj+2)] = ptr_pA[0+bs*2];
+ A[1+ii+lda*(jj+2)] = ptr_pA[1+bs*2];
+ A[2+ii+lda*(jj+2)] = ptr_pA[2+bs*2];
+ A[3+ii+lda*(jj+2)] = ptr_pA[3+bs*2];
+		// unroll 3
+ A[0+ii+lda*(jj+3)] = ptr_pA[0+bs*3];
+ A[1+ii+lda*(jj+3)] = ptr_pA[1+bs*3];
+ A[2+ii+lda*(jj+3)] = ptr_pA[2+bs*3];
+ A[3+ii+lda*(jj+3)] = ptr_pA[3+bs*3];
+ ptr_pA += sda*bs;
+ }
+ for(; ii<m; ii++)
+ {
+ // unroll 0
+ A[ii+lda*(jj+0)] = ptr_pA[0+bs*0];
+ // unroll 1
+ A[ii+lda*(jj+1)] = ptr_pA[0+bs*1];
+ // unroll 2
+ A[ii+lda*(jj+2)] = ptr_pA[0+bs*2];
+ // unroll 3
+ A[ii+lda*(jj+3)] = ptr_pA[0+bs*3];
+ ptr_pA++;
+ }
+ }
+ for(; jj<n; jj++)
+ {
+ ptr_pA = pA + jj*bs;
+ ii = 0;
+ if(m0>0)
+ {
+ for(; ii<m0; ii++)
+ {
+ A[ii+lda*jj] = ptr_pA[0];
+ ptr_pA++;
+ }
+ ptr_pA += (sda-1)*bs;
+ }
+ for(; ii<m-bs+1; ii+=bs)
+ {
+ A[0+ii+lda*jj] = ptr_pA[0];
+ A[1+ii+lda*jj] = ptr_pA[1];
+ A[2+ii+lda*jj] = ptr_pA[2];
+ A[3+ii+lda*jj] = ptr_pA[3];
+ ptr_pA += sda*bs;
+ }
+ for(; ii<m; ii++)
+ {
+ A[ii+lda*jj] = ptr_pA[0];
+ ptr_pA++;
+ }
+ }
+ return;
+ }
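+
+
+
+// Illustrative usage sketch (compiled out, not part of the library): round trip
+// between a standard column-major matrix and the packed strmat storage. The
+// arrays A and B and the helper name are hypothetical; A is assumed to hold the
+// input data, and malloc/free are assumed to be available.
+#if 0
+static void d_example_cvt_round_trip()
+	{
+	int m = 6, n = 5;
+	double A[6*5], B[6*5]; // column-major with leading dimension lda = m
+	struct d_strmat sA;
+	void *mem = malloc(d_size_strmat(m, n));
+	d_create_strmat(m, n, &sA, mem);
+	d_cvt_mat2strmat(m, n, A, m, &sA, 0, 0); // pack A into sA at offset (0,0)
+	d_cvt_strmat2mat(m, n, &sA, 0, 0, B, m); // unpack again: B now equals A
+	free(mem);
+	}
+#endif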
+
+
+
+// convert and transpose a matrix structure into a matrix
+void d_cvt_tran_strmat2mat(int m, int n, struct d_strmat *sA, int ai, int aj, double *A, int lda)
+ {
+ const int bs = 4;
+ int sda = sA->cn;
+ double *pA = sA->pA + aj*bs + ai/bs*bs*sda + ai%bs;
+ int i, ii, jj;
+ int m0 = (bs-ai%bs)%bs;
+ double *ptr_pA;
+ jj=0;
+ for(; jj<n-3; jj+=4)
+ {
+ ptr_pA = pA + jj*bs;
+ ii = 0;
+ if(m0>0)
+ {
+ for(; ii<m0; ii++)
+ {
+ // unroll 0
+ A[jj+0+lda*ii] = ptr_pA[0+bs*0];
+ // unroll 1
+ A[jj+1+lda*ii] = ptr_pA[0+bs*1];
+ // unroll 2
+ A[jj+2+lda*ii] = ptr_pA[0+bs*2];
+ // unroll 3
+ A[jj+3+lda*ii] = ptr_pA[0+bs*3];
+ ptr_pA++;
+ }
+ ptr_pA += (sda-1)*bs;
+ }
+ for(; ii<m-bs+1; ii+=bs)
+ {
+ // unroll 0
+ A[jj+0+lda*(ii+0)] = ptr_pA[0+bs*0];
+ A[jj+0+lda*(ii+1)] = ptr_pA[1+bs*0];
+ A[jj+0+lda*(ii+2)] = ptr_pA[2+bs*0];
+ A[jj+0+lda*(ii+3)] = ptr_pA[3+bs*0];
+ // unroll 1
+ A[jj+1+lda*(ii+0)] = ptr_pA[0+bs*1];
+ A[jj+1+lda*(ii+1)] = ptr_pA[1+bs*1];
+ A[jj+1+lda*(ii+2)] = ptr_pA[2+bs*1];
+ A[jj+1+lda*(ii+3)] = ptr_pA[3+bs*1];
+ // unroll 2
+ A[jj+2+lda*(ii+0)] = ptr_pA[0+bs*2];
+ A[jj+2+lda*(ii+1)] = ptr_pA[1+bs*2];
+ A[jj+2+lda*(ii+2)] = ptr_pA[2+bs*2];
+ A[jj+2+lda*(ii+3)] = ptr_pA[3+bs*2];
+ // unroll 3
+ A[jj+3+lda*(ii+0)] = ptr_pA[0+bs*3];
+ A[jj+3+lda*(ii+1)] = ptr_pA[1+bs*3];
+ A[jj+3+lda*(ii+2)] = ptr_pA[2+bs*3];
+ A[jj+3+lda*(ii+3)] = ptr_pA[3+bs*3];
+ ptr_pA += sda*bs;
+ }
+ for(; ii<m; ii++)
+ {
+ // unroll 0
+ A[jj+0+lda*ii] = ptr_pA[0+bs*0];
+ // unroll 1
+ A[jj+1+lda*ii] = ptr_pA[0+bs*1];
+ // unroll 2
+ A[jj+2+lda*ii] = ptr_pA[0+bs*2];
+ // unroll 3
+ A[jj+3+lda*ii] = ptr_pA[0+bs*3];
+ ptr_pA++;
+ }
+ }
+ for(; jj<n; jj++)
+ {
+ ptr_pA = pA + jj*bs;
+ ii = 0;
+ if(m0>0)
+ {
+ for(; ii<m0; ii++)
+ {
+ A[jj+lda*ii] = ptr_pA[0];
+ ptr_pA++;
+ }
+ ptr_pA += (sda-1)*bs;
+ }
+ for(; ii<m-bs+1; ii+=bs)
+ {
+ i=0;
+ for(; i<bs; i++)
+ {
+ A[jj+lda*(i+ii)] = ptr_pA[0];
+ ptr_pA++;
+ }
+ ptr_pA += (sda-1)*bs;
+ }
+ for(; ii<m; ii++)
+ {
+ A[jj+lda*ii] = ptr_pA[0];
+ ptr_pA++;
+ }
+ }
+ return;
+ }
+
+
+
+// convert a vector structure into a vector
+void d_cvt_strvec2vec(int m, struct d_strvec *sa, int ai, double *a)
+ {
+ double *pa = sa->pa + ai;
+ int ii;
+ for(ii=0; ii<m; ii++)
+ a[ii] = pa[ii];
+ return;
+ }
+
+
+
+// cast a matrix into a matrix structure
+void d_cast_mat2strmat(double *A, struct d_strmat *sA)
+ {
+ sA->pA = A;
+ return;
+ }
+
+
+
+// cast a matrix into the diagonal of a matrix structure
+void d_cast_diag_mat2strmat(double *dA, struct d_strmat *sA)
+ {
+ sA->dA = dA;
+ return;
+ }
+
+
+
+// cast a vector into a vector structure
+void d_cast_vec2vecmat(double *a, struct d_strvec *sa)
+ {
+ sa->pa = a;
+ return;
+ }
+
+
+
+// insert element into strmat
+void dgein1_libstr(double a, struct d_strmat *sA, int ai, int aj)
+ {
+ const int bs = 4;
+ int sda = sA->cn;
+ double *pA = sA->pA + ai/bs*bs*sda + ai%bs + aj*bs;
+ pA[0] = a;
+ return;
+ }
+
+
+
+// extract element from strmat
+double dgeex1_libstr(struct d_strmat *sA, int ai, int aj)
+ {
+ const int bs = 4;
+ int sda = sA->cn;
+ double *pA = sA->pA + ai/bs*bs*sda + ai%bs + aj*bs;
+ return pA[0];
+ }
+
+
+
+// insert element into strvec
+void dvecin1_libstr(double a, struct d_strvec *sx, int xi)
+ {
+ const int bs = 4;
+ double *x = sx->pa + xi;
+ x[0] = a;
+ return;
+ }
+
+
+
+// extract element from strvec
+double dvecex1_libstr(struct d_strvec *sx, int xi)
+ {
+ const int bs = 4;
+ double *x = sx->pa + xi;
+ return x[0];
+ }
+
+
+
+// set all elements of a strmat to a value
+void dgese_libstr(int m, int n, double alpha, struct d_strmat *sA, int ai, int aj)
+ {
+ const int bs = 4;
+ int sda = sA->cn;
+ double *pA = sA->pA + ai%bs + ai/bs*bs*sda + aj*bs;
+ int m0 = m<(bs-ai%bs)%bs ? m : (bs-ai%bs)%bs;
+ int ii, jj;
+ if(m0>0)
+ {
+ for(ii=0; ii<m0; ii++)
+ {
+ for(jj=0; jj<n; jj++)
+ {
+ pA[jj*bs] = alpha;
+ }
+ pA += 1;
+ }
+ pA += bs*(sda-1);
+ m -= m0;
+ }
+ for(ii=0; ii<m-3; ii+=4)
+ {
+ for(jj=0; jj<n; jj++)
+ {
+ pA[0+jj*bs] = alpha;
+ pA[1+jj*bs] = alpha;
+ pA[2+jj*bs] = alpha;
+ pA[3+jj*bs] = alpha;
+ }
+ pA += bs*sda;
+ }
+ for( ; ii<m; ii++)
+ {
+ for(jj=0; jj<n; jj++)
+ {
+ pA[jj*bs] = alpha;
+ }
+ pA += 1;
+ }
+ return;
+ }
+
+
+
+// set all elements of a strvec to a value
+void dvecse_libstr(int m, double alpha, struct d_strvec *sx, int xi)
+ {
+ double *x = sx->pa + xi;
+ int ii;
+ for(ii=0; ii<m; ii++)
+ x[ii] = alpha;
+ return;
+ }
+
+
+
+// insert a vector into diagonal
+void ddiain_libstr(int kmax, double alpha, struct d_strvec *sx, int xi, struct d_strmat *sA, int ai, int aj)
+ {
+ const int bs = 4;
+ int sda = sA->cn;
+ double *pA = sA->pA + ai/bs*bs*sda + ai%bs + aj*bs;
+ double *x = sx->pa + xi;
+ int offsetA = ai%bs;
+
+ int kna = (bs-offsetA%bs)%bs;
+ kna = kmax<kna ? kmax : kna;
+
+ int jj, ll;
+
+ if(kna>0)
+ {
+ for(ll=0; ll<kna; ll++)
+ {
+ pA[ll+bs*ll] = alpha*x[ll];
+ }
+ pA += kna + bs*(sda-1) + kna*bs;
+ x += kna;
+ kmax -= kna;
+ }
+ for(jj=0; jj<kmax-3; jj+=4)
+ {
+ pA[jj*sda+(jj+0)*bs+0] = alpha*x[jj+0];
+ pA[jj*sda+(jj+1)*bs+1] = alpha*x[jj+1];
+ pA[jj*sda+(jj+2)*bs+2] = alpha*x[jj+2];
+ pA[jj*sda+(jj+3)*bs+3] = alpha*x[jj+3];
+ }
+ for(ll=0; ll<kmax-jj; ll++)
+ {
+ pA[jj*sda+(jj+ll)*bs+ll] = alpha*x[jj+ll];
+ }
+ return;
+ }
+
+
+
+// add scalar to diagonal
+void ddiare_libstr(int kmax, double alpha, struct d_strmat *sA, int ai, int aj)
+ {
+ const int bs = 4;
+ int sda = sA->cn;
+ double *pA = sA->pA + ai/bs*bs*sda + ai%bs + aj*bs;
+ int offsetA = ai%bs;
+
+ int kna = (bs-offsetA%bs)%bs;
+ kna = kmax<kna ? kmax : kna;
+
+ int jj, ll;
+
+ if(kna>0)
+ {
+ for(ll=0; ll<kna; ll++)
+ {
+ pA[ll+bs*ll] += alpha;
+ }
+ pA += kna + bs*(sda-1) + kna*bs;
+ kmax -= kna;
+ }
+ for(jj=0; jj<kmax-3; jj+=4)
+ {
+ pA[jj*sda+(jj+0)*bs+0] += alpha;
+ pA[jj*sda+(jj+1)*bs+1] += alpha;
+ pA[jj*sda+(jj+2)*bs+2] += alpha;
+ pA[jj*sda+(jj+3)*bs+3] += alpha;
+ }
+ for(ll=0; ll<kmax-jj; ll++)
+ {
+ pA[jj*sda+(jj+ll)*bs+ll] += alpha;
+ }
+ return;
+ }
+
+
+
+// swap two rows of a matrix struct
+void drowsw_libstr(int kmax, struct d_strmat *sA, int ai, int aj, struct d_strmat *sC, int ci, int cj)
+ {
+ const int bs = 4;
+ int sda = sA->cn;
+ double *pA = sA->pA + ai/bs*bs*sda + ai%bs + aj*bs;
+ int sdc = sC->cn;
+ double *pC = sC->pA + ci/bs*bs*sdc + ci%bs + cj*bs;
+ drowsw_lib(kmax, pA, pC);
+ return;
+ }
+
+
+
+// permute the rows of a matrix struct
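+// (ipiv is applied as a sequence of swaps: row ii is exchanged with row ipiv[ii]
+// for ii = 0,...,kmax-1, e.g. the zero-based pivot vector of an LU factorization)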
+void drowpe_libstr(int kmax, int *ipiv, struct d_strmat *sA)
+ {
+ int ii;
+ for(ii=0; ii<kmax; ii++)
+ {
+ if(ipiv[ii]!=ii)
+ drowsw_libstr(sA->n, sA, ii, 0, sA, ipiv[ii], 0);
+ }
+ return;
+ }
+
+
+// extract a row into a vector
+void drowex_libstr(int kmax, double alpha, struct d_strmat *sA, int ai, int aj, struct d_strvec *sx, int xi)
+ {
+ const int bs = 4;
+ int sda = sA->cn;
+ double *pA = sA->pA + ai/bs*bs*sda + ai%bs + aj*bs;
+ double *x = sx->pa + xi;
+ drowex_lib(kmax, alpha, pA, x);
+ return;
+ }
+
+
+
+// insert a vector into a row
+void drowin_libstr(int kmax, double alpha, struct d_strvec *sx, int xi, struct d_strmat *sA, int ai, int aj)
+ {
+ const int bs = 4;
+ int sda = sA->cn;
+ double *pA = sA->pA + ai/bs*bs*sda + ai%bs + aj*bs;
+ double *x = sx->pa + xi;
+ drowin_lib(kmax, alpha, x, pA);
+ return;
+ }
+
+
+
+// add a vector to a row
+void drowad_libstr(int kmax, double alpha, struct d_strvec *sx, int xi, struct d_strmat *sA, int ai, int aj)
+ {
+ const int bs = 4;
+ int sda = sA->cn;
+ double *pA = sA->pA + ai/bs*bs*sda + ai%bs + aj*bs;
+ double *x = sx->pa + xi;
+ drowad_lib(kmax, alpha, x, pA);
+ return;
+ }
+
+
+
+// extract vector from column
+void dcolex_libstr(int kmax, struct d_strmat *sA, int ai, int aj, struct d_strvec *sx, int xi)
+ {
+ const int bs = 4;
+ int sda = sA->cn;
+ double *pA = sA->pA + ai/bs*bs*sda + ai%bs + aj*bs;
+ double *x = sx->pa + xi;
+ dcolex_lib(kmax, ai%bs, pA, sda, x);
+ return;
+ }
+
+
+
+
+// insert a vector as a column
+void dcolin_libstr(int kmax, struct d_strvec *sx, int xi, struct d_strmat *sA, int ai, int aj)
+ {
+ const int bs = 4;
+ int sda = sA->cn;
+ double *pA = sA->pA + ai/bs*bs*sda + ai%bs + aj*bs;
+ double *x = sx->pa + xi;
+ dcolin_lib(kmax, x, ai%bs, pA, sda);
+ return;
+ }
+
+
+
+
+// swap two cols of a matrix struct
+void dcolsw_libstr(int kmax, struct d_strmat *sA, int ai, int aj, struct d_strmat *sC, int ci, int cj)
+ {
+ const int bs = 4;
+ int sda = sA->cn;
+ double *pA = sA->pA + ai/bs*bs*sda + ai%bs + aj*bs;
+ int sdc = sC->cn;
+ double *pC = sC->pA + ci/bs*bs*sdc + ci%bs + cj*bs;
+ dcolsw_lib(kmax, ai%bs, pA, sda, ci%bs, pC, sdc);
+ return;
+ }
+
+
+
+// permute the cols of a matrix struct
+void dcolpe_libstr(int kmax, int *ipiv, struct d_strmat *sA)
+ {
+ int ii;
+ for(ii=0; ii<kmax; ii++)
+ {
+ if(ipiv[ii]!=ii)
+ dcolsw_libstr(sA->m, sA, 0, ii, sA, 0, ipiv[ii]);
+ }
+ return;
+ }
+
+
+
+// copy a generic strmat into a generic strmat
+void dgecp_libstr(int m, int n, struct d_strmat *sA, int ai, int aj, struct d_strmat *sC, int ci, int cj)
+ {
+ const int bs = 4;
+ int sda = sA->cn;
+ double *pA = sA->pA + ai/bs*bs*sda + ai%bs + aj*bs;
+ int sdc = sC->cn;
+ double *pC = sC->pA + ci/bs*bs*sdc + ci%bs + cj*bs;
+ dgecp_lib(m, n, 1.0, ai%bs, pA, sda, ci%bs, pC, sdc);
+ return;
+ }
+
+
+
+// scale a generic strmat
+void dgesc_libstr(int m, int n, double alpha, struct d_strmat *sA, int ai, int aj)
+ {
+ const int bs = 4;
+ int sda = sA->cn;
+ double *pA = sA->pA + ai/bs*bs*sda + ai%bs + aj*bs;
+ dgecp_lib(m, n, alpha, ai%bs, pA, sda, ai%bs, pA, sda);
+ return;
+ }
+
+
+
+// copy a strvec into a strvec
+void dveccp_libstr(int m, struct d_strvec *sa, int ai, struct d_strvec *sc, int ci)
+ {
+ double *pa = sa->pa + ai;
+ double *pc = sc->pa + ci;
+ int ii;
+ ii = 0;
+ for(; ii<m-3; ii+=4)
+ {
+ pc[ii+0] = pa[ii+0];
+ pc[ii+1] = pa[ii+1];
+ pc[ii+2] = pa[ii+2];
+ pc[ii+3] = pa[ii+3];
+ }
+ for(; ii<m; ii++)
+ {
+ pc[ii+0] = pa[ii+0];
+ }
+ return;
+ }
+
+
+
+// scale a strvec
+void dvecsc_libstr(int m, double alpha, struct d_strvec *sa, int ai)
+ {
+ double *pa = sa->pa + ai;
+ int ii;
+ ii = 0;
+ for(; ii<m-3; ii+=4)
+ {
+ pa[ii+0] *= alpha;
+ pa[ii+1] *= alpha;
+ pa[ii+2] *= alpha;
+ pa[ii+3] *= alpha;
+ }
+ for(; ii<m; ii++)
+ {
+ pa[ii+0] *= alpha;
+ }
+ return;
+ }
+
+
+
+// copy a lower triangular strmat into a lower triangular strmat
+void dtrcp_l_libstr(int m, struct d_strmat *sA, int ai, int aj, struct d_strmat *sC, int ci, int cj)
+ {
+ const int bs = 4;
+ int sda = sA->cn;
+ double *pA = sA->pA + ai/bs*bs*sda + ai%bs + aj*bs;
+ int sdc = sC->cn;
+ double *pC = sC->pA + ci/bs*bs*sdc + ci%bs + cj*bs;
+ dtrcp_l_lib(m, 1.0, ai%bs, pA, sda, ci%bs, pC, sdc);
+ return;
+ }
+
+
+
+// scale and add a generic strmat into a generic strmat
+void dgead_libstr(int m, int n, double alpha, struct d_strmat *sA, int ai, int aj, struct d_strmat *sC, int ci, int cj)
+ {
+ const int bs = 4;
+ int sda = sA->cn;
+ double *pA = sA->pA + ai/bs*bs*sda + ai%bs + aj*bs;
+ int sdc = sC->cn;
+ double *pC = sC->pA + ci/bs*bs*sdc + ci%bs + cj*bs;
+ dgead_lib(m, n, alpha, ai%bs, pA, sda, ci%bs, pC, sdc);
+ return;
+ }
+
+
+
+// copy and transpose a generic strmat into a generic strmat
+void dgetr_libstr(int m, int n, struct d_strmat *sA, int ai, int aj, struct d_strmat *sC, int ci, int cj)
+ {
+ const int bs = 4;
+ int sda = sA->cn;
+ double *pA = sA->pA + ai/bs*bs*sda + ai%bs + aj*bs;
+ int sdc = sC->cn;
+ double *pC = sC->pA + ci/bs*bs*sdc + ci%bs + cj*bs;
+ dgetr_lib(m, n, 1.0, ai%bs, pA, sda, ci%bs, pC, sdc); // TODO remove alpha !!!
+ return;
+ }
+
+
+
+// copy and transpose a lower triangular strmat into an upper triangular strmat
+void dtrtr_l_libstr(int m, struct d_strmat *sA, int ai, int aj, struct d_strmat *sC, int ci, int cj)
+ {
+ const int bs = 4;
+ int sda = sA->cn;
+ double *pA = sA->pA + ai/bs*bs*sda + ai%bs + aj*bs;
+ int sdc = sC->cn;
+ double *pC = sC->pA + ci/bs*bs*sdc + ci%bs + cj*bs;
+ dtrtr_l_lib(m, 1.0, ai%bs, pA, sda, ci%bs, pC, sdc); // TODO remove alpha !!!
+ return;
+ }
+
+
+
+// copy and transpose an upper triangular strmat into a lower triangular strmat
+void dtrtr_u_libstr(int m, struct d_strmat *sA, int ai, int aj, struct d_strmat *sC, int ci, int cj)
+ {
+ const int bs = 4;
+ int sda = sA->cn;
+ double *pA = sA->pA + ai/bs*bs*sda + ai%bs + aj*bs;
+ int sdc = sC->cn;
+ double *pC = sC->pA + ci/bs*bs*sdc + ci%bs + cj*bs;
+ dtrtr_u_lib(m, 1.0, ai%bs, pA, sda, ci%bs, pC, sdc); // TODO remove alpha !!!
+ return;
+ }
+
+
+
+// insert a strvec into the diagonal of a strmat, sparse formulation
+void ddiain_sp_libstr(int kmax, double alpha, struct d_strvec *sx, int xi, int *idx, struct d_strmat *sD, int di, int dj)
+ {
+ const int bs = 4;
+ double *x = sx->pa + xi;
+ int sdd = sD->cn;
+ double *pD = sD->pA;
+ int ii, jj;
+ for(jj=0; jj<kmax; jj++)
+ {
+ ii = idx[jj];
+ pD[(ii+di)/bs*bs*sdd+(ii+di)%bs+(ii+dj)*bs] = alpha * x[jj];
+ }
+ return;
+ }
+
+
+
+// extract a vector from diagonal
+void ddiaex_libstr(int kmax, double alpha, struct d_strmat *sA, int ai, int aj, struct d_strvec *sx, int xi)
+ {
+ const int bs = 4;
+ int sda = sA->cn;
+ double *pA = sA->pA + ai/bs*bs*sda + ai%bs + aj*bs;
+ double *x = sx->pa + xi;
+ int offsetA = ai%bs;
+
+ int kna = (bs-offsetA%bs)%bs;
+ kna = kmax<kna ? kmax : kna;
+
+ int jj, ll;
+
+ if(kna>0)
+ {
+ for(ll=0; ll<kna; ll++)
+ {
+ x[ll] = alpha*pA[ll+bs*ll];
+ }
+ pA += kna + bs*(sda-1) + kna*bs;
+ x += kna;
+ kmax -= kna;
+ }
+ for(jj=0; jj<kmax-3; jj+=4)
+ {
+ x[jj+0] = alpha*pA[jj*sda+(jj+0)*bs+0];
+ x[jj+1] = alpha*pA[jj*sda+(jj+1)*bs+1];
+ x[jj+2] = alpha*pA[jj*sda+(jj+2)*bs+2];
+ x[jj+3] = alpha*pA[jj*sda+(jj+3)*bs+3];
+ }
+ for(ll=0; ll<kmax-jj; ll++)
+ {
+ x[jj+ll] = alpha*pA[jj*sda+(jj+ll)*bs+ll];
+ }
+ return;
+ }
+
+
+
+// extract the diagonal of a strmat to a strvec, sparse formulation
+void ddiaex_sp_libstr(int kmax, double alpha, int *idx, struct d_strmat *sD, int di, int dj, struct d_strvec *sx, int xi)
+ {
+ const int bs = 4;
+ double *x = sx->pa + xi;
+ int sdd = sD->cn;
+ double *pD = sD->pA;
+ int ii, jj;
+ for(jj=0; jj<kmax; jj++)
+ {
+ ii = idx[jj];
+ x[jj] = alpha * pD[(ii+di)/bs*bs*sdd+(ii+di)%bs+(ii+dj)*bs];
+ }
+ return;
+ }
+
+
+
+// add a vector to diagonal
+void ddiaad_libstr(int kmax, double alpha, struct d_strvec *sx, int xi, struct d_strmat *sA, int ai, int aj)
+ {
+ const int bs = 4;
+ int sda = sA->cn;
+ double *pA = sA->pA + ai/bs*bs*sda + ai%bs + aj*bs;
+ double *x = sx->pa + xi;
+ int offsetA = ai%bs;
+
+ int kna = (bs-offsetA%bs)%bs;
+ kna = kmax<kna ? kmax : kna;
+
+ int jj, ll;
+
+ if(kna>0)
+ {
+ for(ll=0; ll<kna; ll++)
+ {
+ pA[ll+bs*ll] += alpha*x[ll];
+ }
+ pA += kna + bs*(sda-1) + kna*bs;
+ x += kna;
+ kmax -= kna;
+ }
+ for(jj=0; jj<kmax-3; jj+=4)
+ {
+ pA[jj*sda+(jj+0)*bs+0] += alpha*x[jj+0];
+ pA[jj*sda+(jj+1)*bs+1] += alpha*x[jj+1];
+ pA[jj*sda+(jj+2)*bs+2] += alpha*x[jj+2];
+ pA[jj*sda+(jj+3)*bs+3] += alpha*x[jj+3];
+ }
+ for(ll=0; ll<kmax-jj; ll++)
+ {
+ pA[jj*sda+(jj+ll)*bs+ll] += alpha*x[jj+ll];
+ }
+ return;
+ }
+
+
+
+// add scaled strvec to diagonal of strmat, sparse formulation
+void ddiaad_sp_libstr(int kmax, double alpha, struct d_strvec *sx, int xi, int *idx, struct d_strmat *sD, int di, int dj)
+ {
+ const int bs = 4;
+ double *x = sx->pa + xi;
+ int sdd = sD->cn;
+ double *pD = sD->pA;
+ int ii, jj;
+ for(jj=0; jj<kmax; jj++)
+ {
+ ii = idx[jj];
+ pD[(ii+di)/bs*bs*sdd+(ii+di)%bs+(ii+dj)*bs] += alpha * x[jj];
+ }
+ return;
+ }
+
+
+
+// add scaled strvec to another strvec and insert to diagonal of strmat, sparse formulation
+void ddiaadin_sp_libstr(int kmax, double alpha, struct d_strvec *sx, int xi, struct d_strvec *sy, int yi, int *idx, struct d_strmat *sD, int di, int dj)
+ {
+ const int bs = 4;
+ double *x = sx->pa + xi;
+ double *y = sy->pa + yi;
+ int sdd = sD->cn;
+ double *pD = sD->pA;
+ int ii, jj;
+ for(jj=0; jj<kmax; jj++)
+ {
+ ii = idx[jj];
+ pD[(ii+di)/bs*bs*sdd+(ii+di)%bs+(ii+dj)*bs] = y[jj] + alpha * x[jj];
+ }
+ return;
+ }
+
+
+
+// add scaled strvec to row of strmat, sparse formulation
+void drowad_sp_libstr(int kmax, double alpha, struct d_strvec *sx, int xi, int *idx, struct d_strmat *sD, int di, int dj)
+ {
+ const int bs = 4;
+ double *x = sx->pa + xi;
+ int sdd = sD->cn;
+ double *pD = sD->pA + di/bs*bs*sdd + di%bs + dj*bs;
+ drowad_libsp(kmax, idx, alpha, x, pD);
+ return;
+ }
+
+
+
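+// add scaled strvec to strvec, sparse formulation (scatter: z[idx[ii]] += alpha*x[ii])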
+void dvecad_sp_libstr(int m, double alpha, struct d_strvec *sx, int xi, int *idx, struct d_strvec *sz, int zi)
+ {
+ double *x = sx->pa + xi;
+ double *z = sz->pa + zi;
+ int ii;
+ for(ii=0; ii<m; ii++)
+ z[idx[ii]] += alpha * x[ii];
+ return;
+ }
+
+
+
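+// insert scaled strvec into strvec, sparse formulation (scatter: z[idx[ii]] = alpha*x[ii])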
+void dvecin_sp_libstr(int m, double alpha, struct d_strvec *sx, int xi, int *idx, struct d_strvec *sz, int zi)
+ {
+ double *x = sx->pa + xi;
+ double *z = sz->pa + zi;
+ int ii;
+ for(ii=0; ii<m; ii++)
+ z[idx[ii]] = alpha * x[ii];
+ return;
+ }
+
+
+
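+// extract scaled strvec from strvec, sparse formulation (gather: z[ii] = alpha*x[idx[ii]])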
+void dvecex_sp_libstr(int m, double alpha, int *idx, struct d_strvec *sx, int xi, struct d_strvec *sz, int zi)
+ {
+ double *x = sx->pa + xi;
+ double *z = sz->pa + zi;
+ int ii;
+ for(ii=0; ii<m; ii++)
+ z[ii] = alpha * x[idx[ii]];
+ return;
+ }
+
+
+
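+// clip the entries of a strvec elementwise between lower bounds xm and upper bounds xp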
+void dveccl_libstr(int m, struct d_strvec *sxm, int xim, struct d_strvec *sx, int xi, struct d_strvec *sxp, int xip, struct d_strvec *sz, int zi)
+ {
+
+ double *xm = sxm->pa + xim;
+ double *x = sx->pa + xi;
+ double *xp = sxp->pa + xip;
+ double *z = sz->pa + zi;
+
+ int ii;
+
+#if defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE)
+ double d0;
+
+ __m256d
+ xm0, x0, xp0, z0, tmp0, tmp1, ones, mones, mask1, mask2;
+
+ ones = _mm256_set_pd( 1.0, 1.0, 1.0, 1.0 );
+ mones = _mm256_set_pd( -1.0, -1.0, -1.0, -1.0 );
+ mask1 = _mm256_set_pd( 3.5, 2.5, 1.5, 0.5 );
+
+ for(ii=0; ii<m-3; ii+=4)
+ {
+ x0 = _mm256_loadu_pd( &x[ii] );
+ xp0 = _mm256_loadu_pd( &xp[ii] );
+ xm0 = _mm256_loadu_pd( &xm[ii] );
+ tmp0 = _mm256_cmp_pd( xp0, x0, 0x2 );
+ tmp1 = _mm256_cmp_pd( x0, xm0, 0x2 );
+ z0 = _mm256_blendv_pd( x0, xp0, tmp0 );
+ z0 = _mm256_blendv_pd( z0, xm0, tmp1 );
+ _mm256_storeu_pd( &z[ii], z0 );
+ }
+ if(ii<m)
+ {
+ d0 = (double) m-ii;
+ mask2 = _mm256_broadcast_sd( &d0 );
+ mask2 = _mm256_sub_pd( mask1, mask2 );
+ x0 = _mm256_loadu_pd( &x[ii] );
+ xp0 = _mm256_loadu_pd( &xp[ii] );
+ xm0 = _mm256_loadu_pd( &xm[ii] );
+ tmp0 = _mm256_cmp_pd( xp0, x0, 0x2 );
+ tmp1 = _mm256_cmp_pd( x0, xm0, 0x2 );
+ z0 = _mm256_blendv_pd( x0, xp0, tmp0 );
+ z0 = _mm256_blendv_pd( z0, xm0, tmp1 );
+ _mm256_maskstore_pd( &z[ii], _mm256_castpd_si256( mask2 ), z0 );
+ }
+#else
+ for(ii=0; ii<m; ii++)
+ {
+ if(x[ii]>=xp[ii])
+ {
+ z[ii] = xp[ii];
+ }
+ else if(x[ii]<=xm[ii])
+ {
+ z[ii] = xm[ii];
+ }
+ else
+ {
+ z[ii] = x[ii];
+ }
+ }
+#endif
+
+ return;
+
+ }
+
+
+
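+// clip the entries of a strvec elementwise between xm and xp, writing a mask:
+// 1.0 where clipped at the upper bound, -1.0 at the lower bound, 0.0 otherwise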
+void dveccl_mask_libstr(int m, struct d_strvec *sxm, int xim, struct d_strvec *sx, int xi, struct d_strvec *sxp, int xip, struct d_strvec *sz, int zi, struct d_strvec *sm, int mi)
+ {
+
+ double *xm = sxm->pa + xim;
+ double *x = sx->pa + xi;
+ double *xp = sxp->pa + xip;
+ double *z = sz->pa + zi;
+ double *mask = sm->pa + mi;
+
+ int ii;
+
+#if defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE)
+ double d0;
+
+ __m256d
+ xm0, x0, xp0, z0, mask0, tmp0, tmp1, ones, mones, mask1, mask2;
+
+ ones = _mm256_set_pd( 1.0, 1.0, 1.0, 1.0 );
+ mones = _mm256_set_pd( -1.0, -1.0, -1.0, -1.0 );
+ mask1 = _mm256_set_pd( 3.5, 2.5, 1.5, 0.5 );
+
+ for(ii=0; ii<m-3; ii+=4)
+ {
+ mask0 = _mm256_setzero_pd();
+ x0 = _mm256_loadu_pd( &x[ii] );
+ xp0 = _mm256_loadu_pd( &xp[ii] );
+ xm0 = _mm256_loadu_pd( &xm[ii] );
+ tmp0 = _mm256_cmp_pd( xp0, x0, 0x2 );
+ tmp1 = _mm256_cmp_pd( x0, xm0, 0x2 );
+ z0 = _mm256_blendv_pd( x0, xp0, tmp0 );
+ z0 = _mm256_blendv_pd( z0, xm0, tmp1 );
+ mask0 = _mm256_blendv_pd( mask0, ones, tmp0 );
+ mask0 = _mm256_blendv_pd( mask0, mones, tmp1 );
+ _mm256_storeu_pd( &z[ii], z0 );
+ _mm256_storeu_pd( &mask[ii], mask0 );
+ }
+ if(ii<m)
+ {
+ d0 = (double) m-ii;
+ mask2 = _mm256_broadcast_sd( &d0 );
+ mask2 = _mm256_sub_pd( mask1, mask2 );
+ mask0 = _mm256_setzero_pd();
+ x0 = _mm256_loadu_pd( &x[ii] );
+ xp0 = _mm256_loadu_pd( &xp[ii] );
+ xm0 = _mm256_loadu_pd( &xm[ii] );
+ tmp0 = _mm256_cmp_pd( xp0, x0, 0x2 );
+ tmp1 = _mm256_cmp_pd( x0, xm0, 0x2 );
+ z0 = _mm256_blendv_pd( x0, xp0, tmp0 );
+ z0 = _mm256_blendv_pd( z0, xm0, tmp1 );
+ mask0 = _mm256_blendv_pd( mask0, ones, tmp0 );
+ mask0 = _mm256_blendv_pd( mask0, mones, tmp1 );
+ _mm256_maskstore_pd( &z[ii], _mm256_castpd_si256( mask2 ), z0 );
+ _mm256_maskstore_pd( &mask[ii], _mm256_castpd_si256( mask2 ), mask0 );
+ }
+#else
+ for(ii=0; ii<m; ii++)
+ {
+ if(x[ii]>=xp[ii])
+ {
+ z[ii] = xp[ii];
+ mask[ii] = 1.0;
+ }
+ else if(x[ii]<=xm[ii])
+ {
+ z[ii] = xm[ii];
+ mask[ii] = -1.0;
+ }
+ else
+ {
+ z[ii] = x[ii];
+ mask[ii] = 0.0;
+ }
+ }
+#endif
+
+ return;
+
+ }
+
+
+
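+// copy the entries of v into e where the mask is 0.0, and set e to 0.0 elsewhere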
+void dvecze_libstr(int m, struct d_strvec *sm, int mi, struct d_strvec *sv, int vi, struct d_strvec *se, int ei)
+ {
+ double *mask = sm->pa + mi;
+ double *v = sv->pa + vi;
+ double *e = se->pa + ei;
+
+ int ii;
+
+#if defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE)
+ double d0;
+
+ __m256d
+ mask0, mask1, mask2, mask3, fives, zeros, e0, v0;
+
+ fives = _mm256_set_pd( 0.5, 0.5, 0.5, 0.5 );
+ zeros = _mm256_setzero_pd();
+ mask3 = _mm256_set_pd( 3.5, 2.5, 1.5, 0.5 );
+
+ for(ii=0; ii<m-3; ii+=4)
+ {
+ v0 = _mm256_loadu_pd( &v[ii] );
+ mask0 = _mm256_loadu_pd( &mask[ii] );
+ mask1 = mask0;
+ mask0 = _mm256_sub_pd( mask0, fives);
+ mask1 = _mm256_add_pd( mask1, fives);
+ mask0 = _mm256_xor_pd( mask0, mask1);
+ e0 = _mm256_blendv_pd( zeros, v0, mask0 );
+ _mm256_storeu_pd( &e[ii], e0 );
+ }
+ if(ii<m)
+ {
+ d0 = (double) m-ii;
+ mask2 = _mm256_broadcast_sd( &d0 );
+ mask2 = _mm256_sub_pd( mask3, mask2 );
+ v0 = _mm256_loadu_pd( &v[ii] );
+ mask0 = _mm256_loadu_pd( &mask[ii] );
+ mask1 = mask0;
+ mask0 = _mm256_sub_pd( mask0, fives);
+ mask1 = _mm256_add_pd( mask1, fives);
+ mask0 = _mm256_xor_pd( mask0, mask1);
+ e0 = _mm256_blendv_pd( zeros, v0, mask0 );
+ _mm256_maskstore_pd( &e[ii], _mm256_castpd_si256( mask2 ), e0 );
+ }
+#else
+ for(ii=0; ii<m; ii++)
+ {
+ if(mask[ii]==0)
+ {
+ e[ii] = v[ii];
+ }
+ else
+ {
+ e[ii] = 0;
+ }
+ }
+#endif
+
+ }
+
+
+
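+// compute the infinity norm of a strvec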
+void dvecnrm_inf_libstr(int m, struct d_strvec *sx, int xi, double *ptr_norm)
+ {
+ int ii;
+ double *x = sx->pa + xi;
+ double norm = 0.0;
+ for(ii=0; ii<m; ii++)
+ norm = fmax(norm, fabs(x[ii]));
+ *ptr_norm = norm;
+ return;
+ }
+
+
+
+#else
+
+#error : wrong LA choice
+
+#endif
diff --git a/auxiliary/i_aux_ext_dep_lib.c b/auxiliary/i_aux_ext_dep_lib.c
new file mode 100644
index 0000000..1ca2292
--- /dev/null
+++ b/auxiliary/i_aux_ext_dep_lib.c
@@ -0,0 +1,111 @@
+/**************************************************************************************************
+* *
+* This file is part of BLASFEO. *
+* *
+* BLASFEO -- BLAS For Embedded Optimization. *
+* Copyright (C) 2016-2017 by Gianluca Frison. *
+* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl. *
+* All rights reserved. *
+* *
+* HPMPC is free software; you can redistribute it and/or *
+* modify it under the terms of the GNU Lesser General Public *
+* License as published by the Free Software Foundation; either *
+* version 2.1 of the License, or (at your option) any later version. *
+* *
+* HPMPC is distributed in the hope that it will be useful, *
+* but WITHOUT ANY WARRANTY; without even the implied warranty of *
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. *
+* See the GNU Lesser General Public License for more details. *
+* *
+* You should have received a copy of the GNU Lesser General Public *
+* License along with HPMPC; if not, write to the Free Software *
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA *
+* *
+* Author: Gianluca Frison, giaf (at) dtu.dk *
+* gianluca.frison (at) imtek.uni-freiburg.de *
+* *
+**************************************************************************************************/
+
+#include <stdlib.h>
+#include <stdio.h>
+#if 0
+#include <malloc.h>
+#endif
+
+#if ! defined(OS_WINDOWS)
+int posix_memalign(void **memptr, size_t alignment, size_t size);
+#endif
+
+
+
+/* creates a zero matrix */
+void int_zeros(int **pA, int row, int col)
+ {
+ void *temp = malloc((row*col)*sizeof(int));
+ *pA = temp;
+ int *A = *pA;
+ int i;
+ for(i=0; i<row*col; i++) A[i] = 0;
+ }
+
+
+
+/* creates a zero matrix aligned to a cache line */
+void int_zeros_align(int **pA, int row, int col)
+ {
+#if defined(OS_WINDOWS)
+ *pA = (int *) _aligned_malloc( (row*col)*sizeof(int), 64 );
+#else
+ void *temp;
+ int err = posix_memalign(&temp, 64, (row*col)*sizeof(int));
+ if(err!=0)
+ {
+		printf("Memory allocation error\n");
+ exit(1);
+ }
+ *pA = temp;
+#endif
+ int *A = *pA;
+ int i;
+	for(i=0; i<row*col; i++) A[i] = 0;
+ }
+
+
+
+/* frees matrix */
+void int_free(int *pA)
+ {
+ free( pA );
+ }
+
+
+
+/* frees aligned matrix */
+void int_free_align(int *pA)
+ {
+#if defined(OS_WINDOWS)
+ _aligned_free( pA );
+#else
+ free( pA );
+#endif
+ }
+
+
+
+/* prints a matrix in column-major format */
+void int_print_mat(int row, int col, int *A, int lda)
+ {
+ int i, j;
+ for(i=0; i<row; i++)
+ {
+ for(j=0; j<col; j++)
+ {
+ printf("%d ", A[i+lda*j]);
+ }
+ printf("\n");
+ }
+ printf("\n");
+ }
+
+
+
diff --git a/auxiliary/m_aux_lib.c b/auxiliary/m_aux_lib.c
new file mode 100644
index 0000000..30cb333
--- /dev/null
+++ b/auxiliary/m_aux_lib.c
@@ -0,0 +1,112 @@
+/**************************************************************************************************
+* *
+* This file is part of BLASFEO. *
+* *
+* BLASFEO -- BLAS For Embedded Optimization. *
+* Copyright (C) 2016-2017 by Gianluca Frison. *
+* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl. *
+* All rights reserved. *
+* *
+* HPMPC is free software; you can redistribute it and/or *
+* modify it under the terms of the GNU Lesser General Public *
+* License as published by the Free Software Foundation; either *
+* version 2.1 of the License, or (at your option) any later version. *
+* *
+* HPMPC is distributed in the hope that it will be useful, *
+* but WITHOUT ANY WARRANTY; without even the implied warranty of *
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. *
+* See the GNU Lesser General Public License for more details. *
+* *
+* You should have received a copy of the GNU Lesser General Public *
+* License along with HPMPC; if not, write to the Free Software *
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA *
+* *
+* Author: Gianluca Frison, giaf (at) dtu.dk *
+* gianluca.frison (at) imtek.uni-freiburg.de *
+* *
+**************************************************************************************************/
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <math.h>
+
+#include "../include/blasfeo_common.h"
+
+
+
+#if defined(LA_REFERENCE) | defined(LA_BLAS)
+
+
+
+
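+// convert a double-precision strvec into a single-precision strvec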
+void m_cvt_d2s_strvec(int m, struct d_strvec *vd, int vdi, struct s_strvec *vs, int vsi)
+ {
+ double *pd = vd->pa+vdi;
+ float *ps = vs->pa+vsi;
+ int ii;
+ for(ii=0; ii<m; ii++)
+ {
+ ps[ii] = (float) pd[ii];
+ }
+ return;
+ }
+
+
+
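+// convert a single-precision strvec into a double-precision strvec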
+void m_cvt_s2d_strvec(int m, struct s_strvec *vs, int vsi, struct d_strvec *vd, int vdi)
+ {
+ double *pd = vd->pa+vdi;
+ float *ps = vs->pa+vsi;
+ int ii;
+ for(ii=0; ii<m; ii++)
+ {
+ pd[ii] = (double) ps[ii];
+ }
+ return;
+ }
+
+
+
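+// convert a double-precision strmat into a single-precision strmat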
+void m_cvt_d2s_strmat(int m, int n, struct d_strmat *Md, int mid, int nid, struct s_strmat *Ms, int mis, int nis)
+ {
+ int lda = Md->m;
+ int ldb = Ms->m;
+ double *pA = Md->pA+mid+nid*lda;
+ float *pB = Ms->pA+mis+nis*ldb;
+ int ii, jj;
+ for(jj=0; jj<n; jj++)
+ {
+ for(ii=0; ii<m; ii++)
+ {
+ pB[ii+jj*ldb] = (float) pA[ii+jj*lda];
+ }
+ }
+ return;
+ }
+
+
+
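+// convert a single-precision strmat into a double-precision strmat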
+void m_cvt_s2d_strmat(int m, int n, struct s_strmat *Ms, int mis, int nis, struct d_strmat *Md, int mid, int nid)
+ {
+ int lda = Ms->m;
+ int ldb = Md->m;
+ float *pA = Ms->pA+mis+nis*lda;
+ double *pB = Md->pA+mid+nid*ldb;
+ int ii, jj;
+ for(jj=0; jj<n; jj++)
+ {
+ for(ii=0; ii<m; ii++)
+ {
+ pB[ii+jj*ldb] = (double) pA[ii+jj*lda];
+ }
+ }
+ return;
+ }
+
+
+
+#else
+
+#error : wrong LA choice
+
+#endif
diff --git a/auxiliary/m_aux_lib44.c b/auxiliary/m_aux_lib44.c
new file mode 100644
index 0000000..a17d545
--- /dev/null
+++ b/auxiliary/m_aux_lib44.c
@@ -0,0 +1,93 @@
+/**************************************************************************************************
+* *
+* This file is part of BLASFEO. *
+* *
+* BLASFEO -- BLAS For Embedded Optimization. *
+* Copyright (C) 2016-2017 by Gianluca Frison. *
+* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl. *
+* All rights reserved. *
+* *
+* HPMPC is free software; you can redistribute it and/or *
+* modify it under the terms of the GNU Lesser General Public *
+* License as published by the Free Software Foundation; either *
+* version 2.1 of the License, or (at your option) any later version. *
+* *
+* HPMPC is distributed in the hope that it will be useful, *
+* but WITHOUT ANY WARRANTY; without even the implied warranty of *
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. *
+* See the GNU Lesser General Public License for more details. *
+* *
+* You should have received a copy of the GNU Lesser General Public *
+* License along with HPMPC; if not, write to the Free Software *
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA *
+* *
+* Author: Gianluca Frison, giaf (at) dtu.dk *
+* gianluca.frison (at) imtek.uni-freiburg.de *
+* *
+**************************************************************************************************/
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <math.h>
+
+#include "../include/blasfeo_common.h"
+
+
+
+#if defined(LA_HIGH_PERFORMANCE)
+
+
+
+
+void m_cvt_d2s_strvec(int m, struct d_strvec *vd, int vdi, struct s_strvec *vs, int vsi)
+ {
+ double *pd = vd->pa+vdi;
+ float *ps = vs->pa+vsi;
+ int ii;
+ for(ii=0; ii<m; ii++)
+ {
+ ps[ii] = (float) pd[ii];
+ }
+ return;
+ }
+
+
+
+void m_cvt_s2d_strvec(int m, struct s_strvec *vs, int vsi, struct d_strvec *vd, int vdi)
+ {
+ double *pd = vd->pa+vdi;
+ float *ps = vs->pa+vsi;
+ int ii;
+ for(ii=0; ii<m; ii++)
+ {
+ pd[ii] = (double) ps[ii];
+ }
+ return;
+ }
+
+
+
+void m_cvt_d2s_strmat(int m, int n, struct d_strmat *Md, int mid, int nid, struct s_strmat *Ms, int mis, int nis)
+ {
+	printf("\nm_cvt_d2s_strmat: feature not implemented yet\n\n");
+ exit(1);
+ return;
+ }
+
+
+
+void m_cvt_s2d_strmat(int m, int n, struct s_strmat *Ms, int mis, int nis, struct d_strmat *Md, int mid, int nid)
+ {
+	printf("\nm_cvt_s2d_strmat: feature not implemented yet\n\n");
+ exit(1);
+ return;
+ }
+
+
+
+#else
+
+#error : wrong LA choice
+
+#endif
+
diff --git a/auxiliary/m_aux_lib48.c b/auxiliary/m_aux_lib48.c
new file mode 100644
index 0000000..e9fdcd2
--- /dev/null
+++ b/auxiliary/m_aux_lib48.c
@@ -0,0 +1,153 @@
+/**************************************************************************************************
+* *
+* This file is part of BLASFEO. *
+* *
+* BLASFEO -- BLAS For Embedded Optimization. *
+* Copyright (C) 2016-2017 by Gianluca Frison. *
+* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl. *
+* All rights reserved. *
+* *
+* HPMPC is free software; you can redistribute it and/or *
+* modify it under the terms of the GNU Lesser General Public *
+* License as published by the Free Software Foundation; either *
+* version 2.1 of the License, or (at your option) any later version. *
+* *
+* HPMPC is distributed in the hope that it will be useful, *
+* but WITHOUT ANY WARRANTY; without even the implied warranty of *
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. *
+* See the GNU Lesser General Public License for more details. *
+* *
+* You should have received a copy of the GNU Lesser General Public *
+* License along with HPMPC; if not, write to the Free Software *
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA *
+* *
+* Author: Gianluca Frison, giaf (at) dtu.dk *
+* gianluca.frison (at) imtek.uni-freiburg.de *
+* *
+**************************************************************************************************/
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <math.h>
+
+#include "../include/blasfeo_common.h"
+
+
+
+#if defined(LA_HIGH_PERFORMANCE)
+
+
+
+
+void m_cvt_d2s_strvec(int m, struct d_strvec *vd, int vdi, struct s_strvec *vs, int vsi)
+ {
+ double *pd = vd->pa+vdi;
+ float *ps = vs->pa+vsi;
+ int ii;
+ for(ii=0; ii<m; ii++)
+ {
+ ps[ii] = (float) pd[ii];
+ }
+ return;
+ }
+
+
+
+void m_cvt_s2d_strvec(int m, struct s_strvec *vs, int vsi, struct d_strvec *vd, int vdi)
+ {
+ double *pd = vd->pa+vdi;
+ float *ps = vs->pa+vsi;
+ int ii;
+ for(ii=0; ii<m; ii++)
+ {
+ pd[ii] = (double) ps[ii];
+ }
+ return;
+ }
+
+
+
+void m_cvt_d2s_strmat(int m, int n, struct d_strmat *Md, int mid, int nid, struct s_strmat *Ms, int mis, int nis)
+ {
+//	printf("\nm_cvt_d2s_strmat: feature not implemented yet\n\n");
+// exit(1);
+ if(mid!=0 | mis!=0)
+ {
+		printf("\nm_cvt_d2s_strmat: feature not implemented yet: mid=%d, mis=%d\n\n", mid, mis);
+ exit(1);
+ }
+ const int psd = 4;
+ const int pss = 8;
+ const int sdd = Md->cn;
+ double *D0 = Md->pA + nid*psd;
+ double *D1;
+ const int sds = Ms->cn;
+ float *S = Ms->pA + nis*pss;
+ int ii, jj, ll;
+ for(ii=0; ii<m-7; ii+=8)
+ {
+ D1 = D0 + psd*sdd;
+ for(jj=0; jj<n; jj++)
+ {
+ S[0+jj*pss] = (float) D0[0+jj*psd];
+ S[1+jj*pss] = (float) D0[1+jj*psd];
+ S[2+jj*pss] = (float) D0[2+jj*psd];
+ S[3+jj*pss] = (float) D0[3+jj*psd];
+ S[4+jj*pss] = (float) D1[0+jj*psd];
+ S[5+jj*pss] = (float) D1[1+jj*psd];
+ S[6+jj*pss] = (float) D1[2+jj*psd];
+ S[7+jj*pss] = (float) D1[3+jj*psd];
+ }
+ D0 += 8*sdd;
+ S += 8*sds;
+ }
+ if(m-ii>0)
+ {
+ if(m-ii<4)
+ {
+ for(jj=0; jj<n; jj++)
+ {
+ for(ll=0; ll<m-ii; ll++)
+ {
+ S[ll+jj*pss] = (float) D0[ll+jj*psd];
+ }
+ }
+ return;
+ }
+ else
+ {
+ D1 = D0 + psd*sdd;
+ for(jj=0; jj<n; jj++)
+ {
+ S[0+jj*pss] = (float) D0[0+jj*psd];
+ S[1+jj*pss] = (float) D0[1+jj*psd];
+ S[2+jj*pss] = (float) D0[2+jj*psd];
+ S[3+jj*pss] = (float) D0[3+jj*psd];
+ for(ll=0; ll<m-ii-4; ll++)
+ {
+ S[4+ll+jj*pss] = (float) D1[ll+jj*psd];
+ }
+ }
+ }
+ }
+ return;
+ }
+
+
+
+void m_cvt_s2d_strmat(int m, int n, struct s_strmat *Ms, int mis, int nis, struct d_strmat *Md, int mid, int nid)
+ {
+	printf("\nm_cvt_s2d_strmat: feature not implemented yet\n\n");
+ exit(1);
+ return;
+ }
+
+
+
+#else
+
+#error : wrong LA choice
+
+#endif
+
+
diff --git a/auxiliary/s_aux_ext_dep_lib.c b/auxiliary/s_aux_ext_dep_lib.c
new file mode 100644
index 0000000..85f7ebc
--- /dev/null
+++ b/auxiliary/s_aux_ext_dep_lib.c
@@ -0,0 +1,633 @@
+/**************************************************************************************************
+* *
+* This file is part of BLASFEO. *
+* *
+* BLASFEO -- BLAS For Embedded Optimization. *
+* Copyright (C) 2016-2017 by Gianluca Frison. *
+* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl. *
+* All rights reserved. *
+* *
+* HPMPC is free software; you can redistribute it and/or *
+* modify it under the terms of the GNU Lesser General Public *
+* License as published by the Free Software Foundation; either *
+* version 2.1 of the License, or (at your option) any later version. *
+* *
+* HPMPC is distributed in the hope that it will be useful, *
+* but WITHOUT ANY WARRANTY; without even the implied warranty of *
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. *
+* See the GNU Lesser General Public License for more details. *
+* *
+* You should have received a copy of the GNU Lesser General Public *
+* License along with HPMPC; if not, write to the Free Software *
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA *
+* *
+* Author: Gianluca Frison, giaf (at) dtu.dk *
+* gianluca.frison (at) imtek.uni-freiburg.de *
+* *
+**************************************************************************************************/
+
+#include <stdlib.h>
+#include <stdio.h>
+#if 0
+#include <malloc.h>
+#endif
+
+#include "../include/blasfeo_common.h"
+
+
+
+#if ! defined(OS_WINDOWS)
+int posix_memalign(void **memptr, size_t alignment, size_t size);
+#endif
+
+
+
+/* creates a zero matrix */
+void s_zeros(float **pA, int row, int col)
+ {
+ *pA = malloc((row*col)*sizeof(float));
+ float *A = *pA;
+ int i;
+ for(i=0; i<row*col; i++) A[i] = 0.0;
+ }
+
+
+
+/* creates a zero matrix aligned to a cache line */
+void s_zeros_align(float **pA, int row, int col)
+ {
+#if defined(OS_WINDOWS)
+ *pA = (float *) _aligned_malloc( (row*col)*sizeof(float), 64 );
+#else
+ void *temp;
+ int err = posix_memalign(&temp, 64, (row*col)*sizeof(float));
+ if(err!=0)
+ {
+		printf("Memory allocation error\n");
+ exit(1);
+ }
+ *pA = temp;
+#endif
+ float *A = *pA;
+ int i;
+ for(i=0; i<row*col; i++) A[i] = 0.0;
+ }
+
+
+
+/* frees matrix */
+void s_free(float *pA)
+ {
+ free( pA );
+ }
+
+
+
+/* frees aligned matrix */
+void s_free_align(float *pA)
+ {
+#if defined(OS_WINDOWS)
+ _aligned_free( pA );
+#else
+ free( pA );
+#endif
+ }
+
+
+
+/* prints a matrix in column-major format */
+void s_print_mat(int m, int n, float *A, int lda)
+ {
+ int i, j;
+ for(i=0; i<m; i++)
+ {
+ for(j=0; j<n; j++)
+ {
+ printf("%9.5f ", A[i+lda*j]);
+ }
+ printf("\n");
+ }
+ printf("\n");
+ }
+
+
+
+/* prints the transpose of a matrix in column-major format */
+void s_print_tran_mat(int row, int col, float *A, int lda)
+ {
+ int i, j;
+ for(j=0; j<col; j++)
+ {
+ for(i=0; i<row; i++)
+ {
+ printf("%9.5f ", A[i+lda*j]);
+ }
+ printf("\n");
+ }
+ printf("\n");
+ }
+
+
+
+/* prints a matrix in column-major format to a file */
+void s_print_to_file_mat(FILE *file, int row, int col, float *A, int lda)
+ {
+ int i, j;
+ for(i=0; i<row; i++)
+ {
+ for(j=0; j<col; j++)
+ {
+ fprintf(file, "%9.5f ", A[i+lda*j]);
+ }
+ fprintf(file, "\n");
+ }
+ fprintf(file, "\n");
+ }
+
+
+
+/* prints the transpose of a matrix in column-major format to a file */
+void s_print_tran_to_file_mat(FILE *file, int row, int col, float *A, int lda)
+ {
+ int i, j;
+ for(j=0; j<col; j++)
+ {
+ for(i=0; i<row; i++)
+ {
+ fprintf(file, "%9.5f ", A[i+lda*j]);
+ }
+ fprintf(file, "\n");
+ }
+ fprintf(file, "\n");
+ }
+
+
+
+/* prints a matrix in column-major format (exponential notation) */
+void s_print_e_mat(int m, int n, float *A, int lda)
+ {
+ int i, j;
+ for(i=0; i<m; i++)
+ {
+ for(j=0; j<n; j++)
+ {
+ printf("%e\t", A[i+lda*j]);
+ }
+ printf("\n");
+ }
+ printf("\n");
+ }
+
+
+
+/* prints the transpose of a matrix in column-major format (exponential notation) */
+void s_print_e_tran_mat(int row, int col, float *A, int lda)
+ {
+ int i, j;
+ for(j=0; j<col; j++)
+ {
+ for(i=0; i<row; i++)
+ {
+ printf("%e\t", A[i+lda*j]);
+ }
+ printf("\n");
+ }
+ printf("\n");
+ }
+
+
+
+/****************************
+* new interface
+****************************/
+
+#if defined(LA_HIGH_PERFORMANCE)
+
+
+
+#include "../include/blasfeo_block_size.h"
+
+
+
+// create a matrix structure for a matrix of size m*n by dynamically allocating the memory
+void s_allocate_strmat(int m, int n, struct s_strmat *sA)
+ {
+ const int bs = S_PS;
+ int nc = S_NC;
+ int al = bs*nc;
+ sA->m = m;
+ sA->n = n;
+ int pm = (m+bs-1)/bs*bs;
+ int cn = (n+nc-1)/nc*nc;
+ sA->pm = pm;
+ sA->cn = cn;
+ s_zeros_align(&(sA->pA), sA->pm, sA->cn);
+ int tmp = m<n ? (m+al-1)/al*al : (n+al-1)/al*al; // al(min(m,n)) // XXX max ???
+ s_zeros_align(&(sA->dA), tmp, 1);
+ sA->use_dA = 0;
+ sA->memory_size = (pm*cn+tmp)*sizeof(float);
+ return;
+ }
+
+
+
+// free memory of a matrix structure
+void s_free_strmat(struct s_strmat *sA)
+ {
+ s_free_align(sA->pA);
+ s_free_align(sA->dA);
+ return;
+ }
+
+
+
+// create a vector structure for a vector of size m by dynamically allocating the memory
+void s_allocate_strvec(int m, struct s_strvec *sa)
+ {
+ const int bs = S_PS;
+// int nc = S_NC;
+// int al = bs*nc;
+ sa->m = m;
+ int pm = (m+bs-1)/bs*bs;
+ sa->pm = pm;
+ s_zeros_align(&(sa->pa), sa->pm, 1);
+ sa->memory_size = pm*sizeof(float);
+ return;
+ }
+
+
+
+// free memory of a vector structure
+void s_free_strvec(struct s_strvec *sa)
+ {
+ s_free_align(sa->pa);
+ return;
+ }
+
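+
+
+// Illustrative usage sketch (compiled out, not part of the library): dynamically
+// allocate a strmat, print its zero-initialized content and free it again. The
+// helper name is hypothetical.
+#if 0
+static void s_example_allocate_print_free()
+	{
+	struct s_strmat sA;
+	s_allocate_strmat(4, 4, &sA); // allocates aligned, zero-initialized memory
+	s_print_strmat(4, 4, &sA, 0, 0);
+	s_free_strmat(&sA);
+	}
+#endif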
+
+
+// print a matrix structure
+void s_print_strmat(int m, int n, struct s_strmat *sA, int ai, int aj)
+ {
+ const int bs = S_PS;
+ int sda = sA->cn;
+ float *pA = sA->pA + aj*bs + ai/bs*bs*sda + ai%bs;
+ int ii, i, j, tmp;
+ ii = 0;
+ if(ai%bs>0)
+ {
+ tmp = bs-ai%bs;
+ tmp = m<tmp ? m : tmp;
+ for(i=0; i<tmp; i++)
+ {
+ for(j=0; j<n; j++)
+ {
+ printf("%9.5f ", pA[i+bs*j]);
+ }
+ printf("\n");
+ }
+ pA += tmp + bs*(sda-1);
+ m -= tmp;
+ }
+ for( ; ii<m-(bs-1); ii+=bs)
+ {
+ for(i=0; i<bs; i++)
+ {
+ for(j=0; j<n; j++)
+ {
+ printf("%9.5f ", pA[i+bs*j+sda*ii]);
+ }
+ printf("\n");
+ }
+ }
+ if(ii<m)
+ {
+ tmp = m-ii;
+ for(i=0; i<tmp; i++)
+ {
+ for(j=0; j<n; j++)
+ {
+ printf("%9.5f ", pA[i+bs*j+sda*ii]);
+ }
+ printf("\n");
+ }
+ }
+ printf("\n");
+ return;
+ }
+
+
+
+// print a vector structure
+void s_print_strvec(int m, struct s_strvec *sa, int ai)
+ {
+ float *pa = sa->pa + ai;
+ s_print_mat(m, 1, pa, m);
+ return;
+ }
+
+
+
+// print the transpose of a vector structure
+void s_print_tran_strvec(int m, struct s_strvec *sa, int ai)
+ {
+ float *pa = sa->pa + ai;
+ s_print_mat(1, m, pa, 1);
+ return;
+ }
+
+
+
+// print a matrix structure to a file
+void s_print_to_file_strmat(FILE * file, int m, int n, struct s_strmat *sA, int ai, int aj)
+ {
+ const int bs = S_PS;
+ int sda = sA->cn;
+ float *pA = sA->pA + aj*bs + ai/bs*bs*sda + ai%bs;
+ int ii, i, j, tmp;
+ ii = 0;
+ if(ai%bs>0)
+ {
+ tmp = bs-ai%bs;
+ tmp = m<tmp ? m : tmp;
+ for(i=0; i<tmp; i++)
+ {
+ for(j=0; j<n; j++)
+ {
+ fprintf(file, "%9.5f ", pA[i+bs*j]);
+ }
+ fprintf(file, "\n");
+ }
+ pA += tmp + bs*(sda-1);
+ m -= tmp;
+ }
+ for( ; ii<m-(bs-1); ii+=bs)
+ {
+ for(i=0; i<bs; i++)
+ {
+ for(j=0; j<n; j++)
+ {
+ fprintf(file, "%9.5f ", pA[i+bs*j+sda*ii]);
+ }
+ fprintf(file, "\n");
+ }
+ }
+ if(ii<m)
+ {
+ tmp = m-ii;
+ for(i=0; i<tmp; i++)
+ {
+ for(j=0; j<n; j++)
+ {
+ fprintf(file, "%9.5f ", pA[i+bs*j+sda*ii]);
+ }
+ fprintf(file, "\n");
+ }
+ }
+ fprintf(file, "\n");
+ return;
+ }
+
+
+
+// print a vector structure to a file
+void s_print_to_file_strvec(FILE * file, int m, struct s_strvec *sa, int ai)
+ {
+ float *pa = sa->pa + ai;
+ s_print_to_file_mat(file, m, 1, pa, m);
+ return;
+ }
+
+
+
+// print the transpose of a vector structure to a file
+void s_print_tran_to_file_strvec(FILE * file, int m, struct s_strvec *sa, int ai)
+ {
+ float *pa = sa->pa + ai;
+ s_print_to_file_mat(file, 1, m, pa, 1);
+ return;
+ }
+
+
+
+// print a matrix structure
+void s_print_e_strmat(int m, int n, struct s_strmat *sA, int ai, int aj)
+ {
+ const int bs = S_PS;
+ int sda = sA->cn;
+ float *pA = sA->pA + aj*bs + ai/bs*bs*sda + ai%bs;
+ int ii, i, j, tmp;
+ ii = 0;
+ if(ai%bs>0)
+ {
+ tmp = bs-ai%bs;
+ tmp = m<tmp ? m : tmp;
+ for(i=0; i<tmp; i++)
+ {
+ for(j=0; j<n; j++)
+ {
+ printf("%e\t", pA[i+bs*j]);
+ }
+ printf("\n");
+ }
+ pA += tmp + bs*(sda-1);
+ m -= tmp;
+ }
+ for( ; ii<m-(bs-1); ii+=bs)
+ {
+ for(i=0; i<bs; i++)
+ {
+ for(j=0; j<n; j++)
+ {
+ printf("%e\t", pA[i+bs*j+sda*ii]);
+ }
+ printf("\n");
+ }
+ }
+ if(ii<m)
+ {
+ tmp = m-ii;
+ for(i=0; i<tmp; i++)
+ {
+ for(j=0; j<n; j++)
+ {
+ printf("%e\t", pA[i+bs*j+sda*ii]);
+ }
+ printf("\n");
+ }
+ }
+ printf("\n");
+ return;
+ }
+
+
+
+// print a vector structure
+void s_print_e_strvec(int m, struct s_strvec *sa, int ai)
+ {
+ float *pa = sa->pa + ai;
+ s_print_e_mat(m, 1, pa, m);
+ return;
+ }
+
+
+
+// print the transpose of a vector structure
+void s_print_e_tran_strvec(int m, struct s_strvec *sa, int ai)
+ {
+ float *pa = sa->pa + ai;
+ s_print_e_mat(1, m, pa, 1);
+ return;
+ }
+
+
+
+#elif defined(LA_BLAS) | defined(LA_REFERENCE)
+
+
+
+// create a matrix structure for a matrix of size m*n by dynamically allocating the memory
+void s_allocate_strmat(int m, int n, struct s_strmat *sA)
+ {
+ sA->m = m;
+ sA->n = n;
+ s_zeros(&(sA->pA), sA->m, sA->n);
+ int tmp = m<n ? m : n; // al(min(m,n)) // XXX max ???
+ s_zeros(&(sA->dA), tmp, 1);
+ sA->memory_size = (m*n+tmp)*sizeof(float);
+ return;
+ }
+
+
+
+// free memory of a matrix structure
+void s_free_strmat(struct s_strmat *sA)
+ {
+ free(sA->pA);
+ free(sA->dA);
+ return;
+ }
+
+
+
+// create a vector structure for a vector of size m by dynamically allocating the memory
+void s_allocate_strvec(int m, struct s_strvec *sa)
+ {
+ sa->m = m;
+ s_zeros(&(sa->pa), sa->m, 1);
+ sa->memory_size = m*sizeof(float);
+ return;
+ }
+
+
+
+// free memory of a vector structure
+void s_free_strvec(struct s_strvec *sa)
+ {
+ free(sa->pa);
+ return;
+ }
+
+
+
+// print a matrix structure
+void s_print_strmat(int m, int n, struct s_strmat *sA, int ai, int aj)
+ {
+ int lda = sA->m;
+ float *pA = sA->pA + ai + aj*lda;
+ s_print_mat(m, n, pA, lda);
+ return;
+ }
+
+
+
+// print a vector structure
+void s_print_strvec(int m, struct s_strvec *sa, int ai)
+ {
+ float *pa = sa->pa + ai;
+ s_print_mat(m, 1, pa, m);
+ return;
+ }
+
+
+
+// print the transpose of a vector structure
+void s_print_tran_strvec(int m, struct s_strvec *sa, int ai)
+ {
+ float *pa = sa->pa + ai;
+ s_print_mat(1, m, pa, 1);
+ return;
+ }
+
+
+
+// print a matrix structure to a file
+void s_print_to_file_strmat(FILE *file, int m, int n, struct s_strmat *sA, int ai, int aj)
+ {
+ int lda = sA->m;
+ float *pA = sA->pA + ai + aj*lda;
+ s_print_to_file_mat(file, m, n, pA, lda);
+ return;
+ }
+
+
+
+// print a vector structure to a file
+void s_print_to_file_strvec(FILE *file, int m, struct s_strvec *sa, int ai)
+ {
+ float *pa = sa->pa + ai;
+ s_print_to_file_mat(file, m, 1, pa, m);
+ return;
+ }
+
+
+
+// print the transpose of a vector structure to a file
+void s_print_to_file_tran_strvec(FILE *file, int m, struct s_strvec *sa, int ai)
+ {
+ float *pa = sa->pa + ai;
+ s_print_to_file_mat(file, 1, m, pa, 1);
+ return;
+ }
+
+
+
+// print a matrix structure
+void s_print_e_strmat(int m, int n, struct s_strmat *sA, int ai, int aj)
+ {
+ int lda = sA->m;
+ float *pA = sA->pA + ai + aj*lda;
+ s_print_e_mat(m, n, pA, lda);
+ return;
+ }
+
+
+
+// print a vector structure
+void s_print_e_strvec(int m, struct s_strvec *sa, int ai)
+ {
+ float *pa = sa->pa + ai;
+ s_print_e_mat(m, 1, pa, m);
+ return;
+ }
+
+
+
+// print the transpose of a vector structure
+void s_print_e_tran_strvec(int m, struct s_strvec *sa, int ai)
+ {
+ float *pa = sa->pa + ai;
+ s_print_e_mat(1, m, pa, 1);
+ return;
+ }
+
+
+
+#else
+
+#error : wrong LA choice
+
+#endif
+
+
+
diff --git a/auxiliary/s_aux_lib.c b/auxiliary/s_aux_lib.c
new file mode 100644
index 0000000..978eb9a
--- /dev/null
+++ b/auxiliary/s_aux_lib.c
@@ -0,0 +1,956 @@
+/**************************************************************************************************
+* *
+* This file is part of BLASFEO. *
+* *
+* BLASFEO -- BLAS For Embedded Optimization. *
+* Copyright (C) 2016-2017 by Gianluca Frison. *
+* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl. *
+* All rights reserved. *
+* *
+* HPMPC is free software; you can redistribute it and/or *
+* modify it under the terms of the GNU Lesser General Public *
+* License as published by the Free Software Foundation; either *
+* version 2.1 of the License, or (at your option) any later version. *
+* *
+* HPMPC is distributed in the hope that it will be useful, *
+* but WITHOUT ANY WARRANTY; without even the implied warranty of *
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. *
+* See the GNU Lesser General Public License for more details. *
+* *
+* You should have received a copy of the GNU Lesser General Public *
+* License along with HPMPC; if not, write to the Free Software *
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA *
+* *
+* Author: Gianluca Frison, giaf (at) dtu.dk *
+* gianluca.frison (at) imtek.uni-freiburg.de *
+* *
+**************************************************************************************************/
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <math.h>
+
+#include "../include/blasfeo_common.h"
+
+
+
+#if defined(LA_REFERENCE) | defined(LA_BLAS)
+
+
+
+// return memory size (in bytes) needed for a strmat
+int s_size_strmat(int m, int n)
+ {
+ int tmp = m<n ? m : n; // al(min(m,n)) // XXX max ???
+ int size = (m*n+tmp)*sizeof(float);
+ return size;
+ }
+
+
+
+// return memory size (in bytes) needed for the diagonal of a strmat
+int s_size_diag_strmat(int m, int n)
+ {
+ int size = 0;
+ int tmp = m<n ? m : n; // al(min(m,n)) // XXX max ???
+ size = tmp*sizeof(float);
+ return size;
+ }
+
+
+
+// create a matrix structure for a matrix of size m*n by using memory passed by a pointer
+void s_create_strmat(int m, int n, struct s_strmat *sA, void *memory)
+ {
+ sA->m = m;
+ sA->n = n;
+ float *ptr = (float *) memory;
+ sA->pA = ptr;
+ ptr += m*n;
+ int tmp = m<n ? m : n; // al(min(m,n)) // XXX max ???
+ sA->dA = ptr;
+ ptr += tmp;
+ sA->use_dA = 0;
+ sA->memory_size = (m*n+tmp)*sizeof(float);
+ return;
+ }
+
+
+
+// return memory size (in bytes) needed for a strvec
+int s_size_strvec(int m)
+ {
+ int size = m*sizeof(float);
+ return size;
+ }
+
+
+
+// create a vector structure for a vector of size m by using memory passed by a pointer
+void s_create_strvec(int m, struct s_strvec *sa, void *memory)
+ {
+ sa->m = m;
+ float *ptr = (float *) memory;
+ sa->pa = ptr;
+// ptr += m * n;
+ sa->memory_size = m*sizeof(float);
+ return;
+ }
+
+
+
+// convert a matrix into a matrix structure
+void s_cvt_mat2strmat(int m, int n, float *A, int lda, struct s_strmat *sA, int ai, int aj)
+ {
+ int ii, jj;
+ int lda2 = sA->m;
+ float *pA = sA->pA + ai + aj*lda2;
+ for(jj=0; jj<n; jj++)
+ {
+ ii = 0;
+ for(; ii<m-3; ii+=4)
+ {
+ pA[ii+0+jj*lda2] = A[ii+0+jj*lda];
+ pA[ii+1+jj*lda2] = A[ii+1+jj*lda];
+ pA[ii+2+jj*lda2] = A[ii+2+jj*lda];
+ pA[ii+3+jj*lda2] = A[ii+3+jj*lda];
+ }
+ for(; ii<m; ii++)
+ {
+ pA[ii+0+jj*lda2] = A[ii+0+jj*lda];
+ }
+ }
+ return;
+ }
+
+
+
+// convert and transpose a matrix into a matrix structure
+void s_cvt_tran_mat2strmat(int m, int n, float *A, int lda, struct s_strmat *sA, int ai, int aj)
+ {
+ int ii, jj;
+ int lda2 = sA->m;
+ float *pA = sA->pA + ai + aj*lda2;
+ for(jj=0; jj<n; jj++)
+ {
+ ii = 0;
+ for(; ii<m-3; ii+=4)
+ {
+ pA[jj+(ii+0)*lda2] = A[ii+0+jj*lda];
+ pA[jj+(ii+1)*lda2] = A[ii+1+jj*lda];
+ pA[jj+(ii+2)*lda2] = A[ii+2+jj*lda];
+ pA[jj+(ii+3)*lda2] = A[ii+3+jj*lda];
+ }
+ for(; ii<m; ii++)
+ {
+ pA[jj+(ii+0)*lda2] = A[ii+0+jj*lda];
+ }
+ }
+ return;
+ }
+
+
+
+// convert a vector into a vector structure
+void s_cvt_vec2strvec(int m, float *a, struct s_strvec *sa, int ai)
+ {
+ float *pa = sa->pa + ai;
+ int ii;
+ for(ii=0; ii<m; ii++)
+ pa[ii] = a[ii];
+ return;
+ }
+
+
+
+// convert a matrix structure into a matrix
+void s_cvt_strmat2mat(int m, int n, struct s_strmat *sA, int ai, int aj, float *A, int lda)
+ {
+ int ii, jj;
+ int lda2 = sA->m;
+ float *pA = sA->pA + ai + aj*lda2;
+ for(jj=0; jj<n; jj++)
+ {
+ ii = 0;
+ for(; ii<m-3; ii+=4)
+ {
+ A[ii+0+jj*lda] = pA[ii+0+jj*lda2];
+ A[ii+1+jj*lda] = pA[ii+1+jj*lda2];
+ A[ii+2+jj*lda] = pA[ii+2+jj*lda2];
+ A[ii+3+jj*lda] = pA[ii+3+jj*lda2];
+ }
+ for(; ii<m; ii++)
+ {
+ A[ii+0+jj*lda] = pA[ii+0+jj*lda2];
+ }
+ }
+ return;
+ }
+
+
+
+// convert and transpose a matrix structure into a matrix
+void s_cvt_tran_strmat2mat(int m, int n, struct s_strmat *sA, int ai, int aj, float *A, int lda)
+ {
+ int ii, jj;
+ int lda2 = sA->m;
+ float *pA = sA->pA + ai + aj*lda2;
+ for(jj=0; jj<n; jj++)
+ {
+ ii = 0;
+ for(; ii<m-3; ii+=4)
+ {
+ A[jj+(ii+0)*lda] = pA[ii+0+jj*lda2];
+ A[jj+(ii+1)*lda] = pA[ii+1+jj*lda2];
+ A[jj+(ii+2)*lda] = pA[ii+2+jj*lda2];
+ A[jj+(ii+3)*lda] = pA[ii+3+jj*lda2];
+ }
+ for(; ii<m; ii++)
+ {
+ A[jj+(ii+0)*lda] = pA[ii+0+jj*lda2];
+ }
+ }
+ return;
+ }
+
+
+
+// convert a vector structure into a vector
+void s_cvt_strvec2vec(int m, struct s_strvec *sa, int ai, float *a)
+ {
+ float *pa = sa->pa + ai;
+ int ii;
+ for(ii=0; ii<m; ii++)
+ a[ii] = pa[ii];
+ return;
+ }
+
+
+
+// cast a matrix into a matrix structure
+void s_cast_mat2strmat(float *A, struct s_strmat *sA)
+ {
+ sA->pA = A;
+ return;
+ }
+
+
+
+// cast a matrix into the diagonal of a matrix structure
+void s_cast_diag_mat2strmat(float *dA, struct s_strmat *sA)
+ {
+ sA->dA = dA;
+ return;
+ }
+
+
+
+// cast a vector into a vector structure
+void s_cast_vec2vecmat(float *a, struct s_strvec *sa)
+ {
+ sa->pa = a;
+ return;
+ }
+
+
+
+// insert element into strmat
+void sgein1_libstr(float a, struct s_strmat *sA, int ai, int aj)
+ {
+ int lda = sA->m;
+ float *pA = sA->pA + ai + aj*lda;
+ pA[0] = a;
+ return;
+ }
+
+
+
+// extract element from strmat
+float sgeex1_libstr(struct s_strmat *sA, int ai, int aj)
+ {
+ int lda = sA->m;
+ float *pA = sA->pA + ai + aj*lda;
+ return pA[0];
+ }
+
+
+
+// insert element into strvec
+void svecin1_libstr(float a, struct s_strvec *sx, int xi)
+ {
+ float *x = sx->pa + xi;
+ x[0] = a;
+ return;
+ }
+
+
+
+// extract element from strvec
+float svecex1_libstr(struct s_strvec *sx, int xi)
+ {
+ float *x = sx->pa + xi;
+ return x[0];
+ }
+
+
+
+// set all elements of a strmat to a value
+void sgese_libstr(int m, int n, float alpha, struct s_strmat *sA, int ai, int aj)
+ {
+ int lda = sA->m;
+ float *pA = sA->pA + ai + aj*lda;
+ int ii, jj;
+ for(jj=0; jj<n; jj++)
+ {
+ for(ii=0; ii<m; ii++)
+ {
+ pA[ii+lda*jj] = alpha;
+ }
+ }
+ return;
+ }
+
+
+
+// set all elements of a strvec to a value
+void svecse_libstr(int m, float alpha, struct s_strvec *sx, int xi)
+ {
+ float *x = sx->pa + xi;
+ int ii;
+ for(ii=0; ii<m; ii++)
+ x[ii] = alpha;
+ return;
+ }
+
+
+
+// extract diagonal to vector
+void sdiaex_libstr(int kmax, float alpha, struct s_strmat *sA, int ai, int aj, struct s_strvec *sx, int xi)
+ {
+ int lda = sA->m;
+ float *pA = sA->pA + ai + aj*lda;
+ float *x = sx->pa + xi;
+ int ii;
+ for(ii=0; ii<kmax; ii++)
+ x[ii] = alpha*pA[ii*(lda+1)];
+ return;
+ }
+
+
+
+// insert a vector into diagonal
+void sdiain_libstr(int kmax, float alpha, struct s_strvec *sx, int xi, struct s_strmat *sA, int ai, int aj)
+ {
+ int lda = sA->m;
+ float *pA = sA->pA + ai + aj*lda;
+ float *x = sx->pa + xi;
+ int ii;
+ for(ii=0; ii<kmax; ii++)
+ pA[ii*(lda+1)] = alpha*x[ii];
+ return;
+ }
+
+
+
+// extract a row into a vector
+void srowex_libstr(int kmax, float alpha, struct s_strmat *sA, int ai, int aj, struct s_strvec *sx, int xi)
+ {
+ int lda = sA->m;
+ float *pA = sA->pA + ai + aj*lda;
+ float *x = sx->pa + xi;
+ int ii;
+ for(ii=0; ii<kmax; ii++)
+ x[ii] = alpha*pA[ii*lda];
+ return;
+ }
+
+
+
+// insert a vector into a row
+void srowin_libstr(int kmax, float alpha, struct s_strvec *sx, int xi, struct s_strmat *sA, int ai, int aj)
+ {
+ int lda = sA->m;
+ float *pA = sA->pA + ai + aj*lda;
+ float *x = sx->pa + xi;
+ int ii;
+ for(ii=0; ii<kmax; ii++)
+ pA[ii*lda] = alpha*x[ii];
+ return;
+ }
+
+
+
+// add a vector to a row
+void srowad_libstr(int kmax, float alpha, struct s_strvec *sx, int xi, struct s_strmat *sA, int ai, int aj)
+ {
+ int lda = sA->m;
+ float *pA = sA->pA + ai + aj*lda;
+ float *x = sx->pa + xi;
+ int ii;
+ for(ii=0; ii<kmax; ii++)
+ pA[ii*lda] += alpha*x[ii];
+ return;
+ }
+
+
+
+// swap two rows of a matrix struct
+void srowsw_libstr(int kmax, struct s_strmat *sA, int ai, int aj, struct s_strmat *sC, int ci, int cj)
+ {
+ int lda = sA->m;
+ float *pA = sA->pA + ai + aj*lda;
+ int ldc = sC->m;
+	float *pC = sC->pA + ci + cj*ldc;
+ int ii;
+ float tmp;
+ for(ii=0; ii<kmax; ii++)
+ {
+ tmp = pA[ii*lda];
+ pA[ii*lda] = pC[ii*ldc];
+ pC[ii*ldc] = tmp;
+ }
+ return;
+ }
+
+
+
+// permute the rows of a matrix struct
+void srowpe_libstr(int kmax, int *ipiv, struct s_strmat *sA)
+ {
+ int ii;
+ for(ii=0; ii<kmax; ii++)
+ {
+ if(ipiv[ii]!=ii)
+ srowsw_libstr(sA->n, sA, ii, 0, sA, ipiv[ii], 0);
+ }
+ return;
+ }
+
+
+
+// insert a vector into a col
+void scolin_libstr(int kmax, struct s_strvec *sx, int xi, struct s_strmat *sA, int ai, int aj)
+ {
+ int lda = sA->m;
+ float *pA = sA->pA + ai + aj*lda;
+ float *x = sx->pa + xi;
+ int ii;
+ for(ii=0; ii<kmax; ii++)
+ pA[ii] = x[ii];
+ return;
+ }
+
+
+
+// swap two cols of a matrix struct
+void scolsw_libstr(int kmax, struct s_strmat *sA, int ai, int aj, struct s_strmat *sC, int ci, int cj)
+ {
+ int lda = sA->m;
+ float *pA = sA->pA + ai + aj*lda;
+ int ldc = sC->m;
+	float *pC = sC->pA + ci + cj*ldc;
+ int ii;
+ float tmp;
+ for(ii=0; ii<kmax; ii++)
+ {
+ tmp = pA[ii];
+ pA[ii] = pC[ii];
+ pC[ii] = tmp;
+ }
+ return;
+ }
+
+
+
+// permute the cols of a matrix struct
+void scolpe_libstr(int kmax, int *ipiv, struct s_strmat *sA)
+ {
+ int ii;
+ for(ii=0; ii<kmax; ii++)
+ {
+ if(ipiv[ii]!=ii)
+ scolsw_libstr(sA->m, sA, 0, ii, sA, 0, ipiv[ii]);
+ }
+ return;
+ }
+
+
+
+// copy a generic strmat into a generic strmat
+void sgecp_libstr(int m, int n, struct s_strmat *sA, int ai, int aj, struct s_strmat *sC, int ci, int cj)
+ {
+ int lda = sA->m;
+ float *pA = sA->pA + ai + aj*lda;
+ int ldc = sC->m;
+ float *pC = sC->pA + ci + cj*ldc;
+ int ii, jj;
+ for(jj=0; jj<n; jj++)
+ {
+ ii = 0;
+ for(; ii<m-3; ii+=4)
+ {
+ pC[ii+0+jj*ldc] = pA[ii+0+jj*lda];
+ pC[ii+1+jj*ldc] = pA[ii+1+jj*lda];
+ pC[ii+2+jj*ldc] = pA[ii+2+jj*lda];
+ pC[ii+3+jj*ldc] = pA[ii+3+jj*lda];
+ }
+ for(; ii<m; ii++)
+ {
+ pC[ii+0+jj*ldc] = pA[ii+0+jj*lda];
+ }
+ }
+ return;
+ }
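+
+
+
+// Usage sketch (illustrative only): the (ai,aj) and (ci,cj) arguments select a
+// submatrix inside the strmat, so copying the trailing (m-1)x(n-1) block of a
+// hypothetical, already created sA into the top-left corner of sC reads:
+//
+//	sgecp_libstr(m-1, n-1, &sA, 1, 1, &sC, 0, 0);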
+
+
+
+// scale a generic strmat
+void sgesc_libstr(int m, int n, float alpha, struct s_strmat *sA, int ai, int aj)
+ {
+ int lda = sA->m;
+ float *pA = sA->pA + ai + aj*lda;
+ int ii, jj;
+ for(jj=0; jj<n; jj++)
+ {
+ ii = 0;
+ for(; ii<m-3; ii+=4)
+ {
+ pA[ii+0+jj*lda] *= alpha;
+ pA[ii+1+jj*lda] *= alpha;
+ pA[ii+2+jj*lda] *= alpha;
+ pA[ii+3+jj*lda] *= alpha;
+ }
+ for(; ii<m; ii++)
+ {
+ pA[ii+0+jj*lda] *= alpha;
+ }
+ }
+ return;
+ }
+
+
+
+// copy a strvec into a strvec
+void sveccp_libstr(int m, struct s_strvec *sa, int ai, struct s_strvec *sc, int ci)
+ {
+ float *pa = sa->pa + ai;
+ float *pc = sc->pa + ci;
+ int ii;
+ ii = 0;
+ for(; ii<m-3; ii+=4)
+ {
+ pc[ii+0] = pa[ii+0];
+ pc[ii+1] = pa[ii+1];
+ pc[ii+2] = pa[ii+2];
+ pc[ii+3] = pa[ii+3];
+ }
+ for(; ii<m; ii++)
+ {
+ pc[ii+0] = pa[ii+0];
+ }
+ return;
+ }
+
+
+
+// scale a strvec
+void svecsc_libstr(int m, float alpha, struct s_strvec *sa, int ai)
+ {
+ float *pa = sa->pa + ai;
+ int ii;
+ ii = 0;
+ for(; ii<m-3; ii+=4)
+ {
+ pa[ii+0] *= alpha;
+ pa[ii+1] *= alpha;
+ pa[ii+2] *= alpha;
+ pa[ii+3] *= alpha;
+ }
+ for(; ii<m; ii++)
+ {
+ pa[ii+0] *= alpha;
+ }
+ return;
+ }
+
+
+
+// copy a lower triangular strmat into a lower triangular strmat
+void strcp_l_libstr(int m, struct s_strmat *sA, int ai, int aj, struct s_strmat *sC, int ci, int cj)
+ {
+ int lda = sA->m;
+ float *pA = sA->pA + ai + aj*lda;
+ int ldc = sC->m;
+ float *pC = sC->pA + ci + cj*ldc;
+ int ii, jj;
+ for(jj=0; jj<m; jj++)
+ {
+ ii = jj;
+ for(; ii<m; ii++)
+ {
+ pC[ii+0+jj*ldc] = pA[ii+0+jj*lda];
+ }
+ }
+ return;
+ }
+
+
+
+// scale and add a generic strmat into a generic strmat
+void sgead_libstr(int m, int n, float alpha, struct s_strmat *sA, int ai, int aj, struct s_strmat *sC, int ci, int cj)
+ {
+ int lda = sA->m;
+ float *pA = sA->pA + ai + aj*lda;
+ int ldc = sC->m;
+ float *pC = sC->pA + ci + cj*ldc;
+ int ii, jj;
+ for(jj=0; jj<n; jj++)
+ {
+ ii = 0;
+ for(; ii<m-3; ii+=4)
+ {
+ pC[ii+0+jj*ldc] += alpha*pA[ii+0+jj*lda];
+ pC[ii+1+jj*ldc] += alpha*pA[ii+1+jj*lda];
+ pC[ii+2+jj*ldc] += alpha*pA[ii+2+jj*lda];
+ pC[ii+3+jj*ldc] += alpha*pA[ii+3+jj*lda];
+ }
+ for(; ii<m; ii++)
+ {
+ pC[ii+0+jj*ldc] += alpha*pA[ii+0+jj*lda];
+ }
+ }
+ return;
+ }
+
+
+
+// scale and add a strvec into another strvec
+void svecad_libstr(int m, float alpha, struct s_strvec *sa, int ai, struct s_strvec *sc, int ci)
+ {
+ float *pa = sa->pa + ai;
+ float *pc = sc->pa + ci;
+ int ii;
+ ii = 0;
+ for(; ii<m-3; ii+=4)
+ {
+ pc[ii+0] += alpha*pa[ii+0];
+ pc[ii+1] += alpha*pa[ii+1];
+ pc[ii+2] += alpha*pa[ii+2];
+ pc[ii+3] += alpha*pa[ii+3];
+ }
+ for(; ii<m; ii++)
+ {
+ pc[ii+0] += alpha*pa[ii+0];
+ }
+ return;
+ }
+
+
+
+// copy and transpose a generic strmat into a generic strmat
+void sgetr_libstr(int m, int n, struct s_strmat *sA, int ai, int aj, struct s_strmat *sC, int ci, int cj)
+ {
+ int lda = sA->m;
+ float *pA = sA->pA + ai + aj*lda;
+ int ldc = sC->m;
+ float *pC = sC->pA + ci + cj*ldc;
+ int ii, jj;
+ for(jj=0; jj<n; jj++)
+ {
+ ii = 0;
+ for(; ii<m-3; ii+=4)
+ {
+ pC[jj+(ii+0)*ldc] = pA[ii+0+jj*lda];
+ pC[jj+(ii+1)*ldc] = pA[ii+1+jj*lda];
+ pC[jj+(ii+2)*ldc] = pA[ii+2+jj*lda];
+ pC[jj+(ii+3)*ldc] = pA[ii+3+jj*lda];
+ }
+ for(; ii<m; ii++)
+ {
+ pC[jj+(ii+0)*ldc] = pA[ii+0+jj*lda];
+ }
+ }
+ return;
+ }
+
+
+
+// copy and transpose a lower triangular strmat into an upper triangular strmat
+void strtr_l_libstr(int m, struct s_strmat *sA, int ai, int aj, struct s_strmat *sC, int ci, int cj)
+ {
+ int lda = sA->m;
+ float *pA = sA->pA + ai + aj*lda;
+ int ldc = sC->m;
+ float *pC = sC->pA + ci + cj*ldc;
+ int ii, jj;
+ for(jj=0; jj<m; jj++)
+ {
+ ii = jj;
+ for(; ii<m; ii++)
+ {
+ pC[jj+(ii+0)*ldc] = pA[ii+0+jj*lda];
+ }
+ }
+ return;
+ }
+
+
+
+// copy and transpose an upper triangular strmat into a lower triangular strmat
+void strtr_u_libstr(int m, struct s_strmat *sA, int ai, int aj, struct s_strmat *sC, int ci, int cj)
+ {
+ int lda = sA->m;
+ float *pA = sA->pA + ai + aj*lda;
+ int ldc = sC->m;
+ float *pC = sC->pA + ci + cj*ldc;
+ int ii, jj;
+ for(jj=0; jj<m; jj++)
+ {
+ ii = 0;
+ for(; ii<=jj; ii++)
+ {
+ pC[jj+(ii+0)*ldc] = pA[ii+0+jj*lda];
+ }
+ }
+ return;
+ }
+
+
+
+// insert a strvec to the diagonal of a strmat, sparse formulation
+void sdiain_sp_libstr(int kmax, float alpha, struct s_strvec *sx, int xi, int *idx, struct s_strmat *sD, int di, int dj)
+ {
+ float *x = sx->pa + xi;
+ int ldd = sD->m;
+ float *pD = sD->pA + di + dj*ldd;
+ int ii, jj;
+ for(jj=0; jj<kmax; jj++)
+ {
+ ii = idx[jj];
+ pD[ii*(ldd+1)] = alpha * x[jj];
+ }
+ return;
+ }
+
+
+
+// extract the diagonal of a strmat into a strvec, sparse formulation
+void sdiaex_sp_libstr(int kmax, float alpha, int *idx, struct s_strmat *sD, int di, int dj, struct s_strvec *sx, int xi)
+ {
+ float *x = sx->pa + xi;
+ int ldd = sD->m;
+ float *pD = sD->pA + di + dj*ldd;
+ int ii, jj;
+ for(jj=0; jj<kmax; jj++)
+ {
+ ii = idx[jj];
+ x[jj] = alpha * pD[ii*(ldd+1)];
+ }
+ return;
+ }
+
+
+
+// add a vector to diagonal
+void sdiaad_libstr(int kmax, float alpha, struct s_strvec *sx, int xi, struct s_strmat *sA, int ai, int aj)
+ {
+ int lda = sA->m;
+ float *pA = sA->pA + ai + aj*lda;
+ float *x = sx->pa + xi;
+ int ii;
+ for(ii=0; ii<kmax; ii++)
+ pA[ii*(lda+1)] += alpha*x[ii];
+ return;
+ }
+
+
+
+// add a scaled strvec to the diagonal of a strmat, sparse formulation
+void sdiaad_sp_libstr(int kmax, float alpha, struct s_strvec *sx, int xi, int *idx, struct s_strmat *sD, int di, int dj)
+ {
+ float *x = sx->pa + xi;
+ int ldd = sD->m;
+ float *pD = sD->pA + di + dj*ldd;
+ int ii, jj;
+ for(jj=0; jj<kmax; jj++)
+ {
+ ii = idx[jj];
+ pD[ii*(ldd+1)] += alpha * x[jj];
+ }
+ return;
+ }
+
+
+
+// add scaled strvec to another strvec and insert to diagonal of strmat, sparse formulation
+void sdiaadin_sp_libstr(int kmax, float alpha, struct s_strvec *sx, int xi, struct s_strvec *sy, int yi, int *idx, struct s_strmat *sD, int di, int dj)
+ {
+ float *x = sx->pa + xi;
+ float *y = sy->pa + yi;
+ int ldd = sD->m;
+ float *pD = sD->pA + di + dj*ldd;
+ int ii, jj;
+ for(jj=0; jj<kmax; jj++)
+ {
+ ii = idx[jj];
+ pD[ii*(ldd+1)] = y[jj] + alpha * x[jj];
+ }
+ return;
+ }
+
+
+
+// add scaled strvec to row of strmat, sparse formulation
+void srowad_sp_libstr(int kmax, float alpha, struct s_strvec *sx, int xi, int *idx, struct s_strmat *sD, int di, int dj)
+ {
+ float *x = sx->pa + xi;
+ int ldd = sD->m;
+ float *pD = sD->pA + di + dj*ldd;
+ int ii, jj;
+ for(jj=0; jj<kmax; jj++)
+ {
+ ii = idx[jj];
+ pD[ii*ldd] += alpha * x[jj];
+ }
+ return;
+ }
+
+
+
+
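+// add a scaled strvec to a strvec, sparse formulation (scatter-add: z[idx[ii]] += alpha*x[ii])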
+void svecad_sp_libstr(int m, float alpha, struct s_strvec *sx, int xi, int *idx, struct s_strvec *sz, int zi)
+ {
+ float *x = sx->pa + xi;
+ float *z = sz->pa + zi;
+ int ii;
+ for(ii=0; ii<m; ii++)
+ z[idx[ii]] += alpha * x[ii];
+ return;
+ }
+
+
+
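+// insert a scaled strvec into a strvec, sparse formulation (scatter: z[idx[ii]] = alpha*x[ii])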
+void svecin_sp_libstr(int m, float alpha, struct s_strvec *sx, int xi, int *idx, struct s_strvec *sz, int zi)
+ {
+ float *x = sx->pa + xi;
+ float *z = sz->pa + zi;
+ int ii;
+ for(ii=0; ii<m; ii++)
+ z[idx[ii]] = alpha * x[ii];
+ return;
+ }
+
+
+
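+// extract a scaled strvec from a strvec, sparse formulation (gather: z[ii] = alpha*x[idx[ii]])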
+void svecex_sp_libstr(int m, float alpha, int *idx, struct s_strvec *sx, int xi, struct s_strvec *sz, int zi)
+ {
+ float *x = sx->pa + xi;
+ float *z = sz->pa + zi;
+ int ii;
+ for(ii=0; ii<m; ii++)
+ z[ii] = alpha * x[idx[ii]];
+ return;
+ }
+
+
+// clip without mask return
+void sveccl_libstr(int m, struct s_strvec *sxm, int xim, struct s_strvec *sx, int xi, struct s_strvec *sxp, int xip, struct s_strvec *sz, int zi)
+ {
+ float *xm = sxm->pa + xim;
+ float *x = sx->pa + xi;
+ float *xp = sxp->pa + xip;
+ float *z = sz->pa + zi;
+ int ii;
+ for(ii=0; ii<m; ii++)
+ {
+ if(x[ii]>=xp[ii])
+ {
+ z[ii] = xp[ii];
+ }
+ else if(x[ii]<=xm[ii])
+ {
+ z[ii] = xm[ii];
+ }
+ else
+ {
+ z[ii] = x[ii];
+ }
+ }
+ return;
+ }
+
+
+
+// clip with mask return (mask: 1.0 if clipped at the upper bound, -1.0 if clipped at the lower bound, 0.0 otherwise)
+void sveccl_mask_libstr(int m, struct s_strvec *sxm, int xim, struct s_strvec *sx, int xi, struct s_strvec *sxp, int xip, struct s_strvec *sz, int zi, struct s_strvec *sm, int mi)
+ {
+ float *xm = sxm->pa + xim;
+ float *x = sx->pa + xi;
+ float *xp = sxp->pa + xip;
+ float *z = sz->pa + zi;
+ float *mask = sm->pa + mi;
+ int ii;
+ for(ii=0; ii<m; ii++)
+ {
+ if(x[ii]>=xp[ii])
+ {
+ z[ii] = xp[ii];
+ mask[ii] = 1.0;
+ }
+ else if(x[ii]<=xm[ii])
+ {
+ z[ii] = xm[ii];
+ mask[ii] = -1.0;
+ }
+ else
+ {
+ z[ii] = x[ii];
+ mask[ii] = 0.0;
+ }
+ }
+ return;
+ }
+
+
+// zero out components using mask
+void svecze_libstr(int m, struct s_strvec *sm, int mi, struct s_strvec *sv, int vi, struct s_strvec *se, int ei)
+ {
+ float *mask = sm->pa + mi;
+ float *v = sv->pa + vi;
+ float *e = se->pa + ei;
+ int ii;
+ for(ii=0; ii<m; ii++)
+ {
+ if(mask[ii]==0)
+ {
+ e[ii] = v[ii];
+ }
+ else
+ {
+ e[ii] = 0;
+ }
+ }
+ return;
+ }
+
+
+
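+// compute the infinity norm of a strvec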
+void svecnrm_inf_libstr(int m, struct s_strvec *sx, int xi, float *ptr_norm)
+ {
+ int ii;
+ float *x = sx->pa + xi;
+ float norm = 0.0;
+ for(ii=0; ii<m; ii++)
+ norm = fmax(norm, fabs(x[ii]));
+ *ptr_norm = norm;
+ return;
+ }
+
+
+
+#else
+
+#error : wrong LA choice
+
+#endif
+
diff --git a/auxiliary/s_aux_lib4.c b/auxiliary/s_aux_lib4.c
new file mode 100644
index 0000000..12acc47
--- /dev/null
+++ b/auxiliary/s_aux_lib4.c
@@ -0,0 +1,3107 @@
+/**************************************************************************************************
+* *
+* This file is part of BLASFEO. *
+* *
+* BLASFEO -- BLAS For Embedded Optimization. *
+* Copyright (C) 2016-2017 by Gianluca Frison. *
+* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl. *
+* All rights reserved. *
+* *
+* HPMPC is free software; you can redistribute it and/or *
+* modify it under the terms of the GNU Lesser General Public *
+* License as published by the Free Software Foundation; either *
+* version 2.1 of the License, or (at your option) any later version. *
+* *
+* HPMPC is distributed in the hope that it will be useful, *
+* but WITHOUT ANY WARRANTY; without even the implied warranty of *
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. *
+* See the GNU Lesser General Public License for more details. *
+* *
+* You should have received a copy of the GNU Lesser General Public *
+* License along with HPMPC; if not, write to the Free Software *
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA *
+* *
+* Author: Gianluca Frison, giaf (at) dtu.dk *
+* gianluca.frison (at) imtek.uni-freiburg.de *
+* *
+**************************************************************************************************/
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <math.h>
+
+#include "../include/blasfeo_common.h"
+#include "../include/blasfeo_block_size.h"
+#include "../include/blasfeo_s_kernel.h"
+
+
+
+// scale and add a strvec into another strvec (alpha passed by pointer)
+void svecad_libstr(int m, float *alphap, struct s_strvec *sa, int ai, struct s_strvec *sc, int ci)
+ {
+ float alpha = alphap[0];
+ float *pa = sa->pa + ai;
+ float *pc = sc->pa + ci;
+ int ii;
+ ii = 0;
+ for(; ii<m-3; ii+=4)
+ {
+ pc[ii+0] += alpha*pa[ii+0];
+ pc[ii+1] += alpha*pa[ii+1];
+ pc[ii+2] += alpha*pa[ii+2];
+ pc[ii+3] += alpha*pa[ii+3];
+ }
+ for(; ii<m; ii++)
+ {
+ pc[ii+0] += alpha*pa[ii+0];
+ }
+ return;
+ }
+
+
+
+// transpose general matrix; m and n refer to the original matrix
+void sgetr_lib(int m, int n, float alpha, int offsetA, float *pA, int sda, int offsetC, float *pC, int sdc)
+ {
+
+/*
+
+m = 5
+n = 3
+offsetA = 1
+offsetC = 2
+
+A =
+ x x x
+ -
+ x x x
+ x x x
+ x x x
+ x x x
+
+C =
+ x x x x x
+ x x x x x
+ -
+ x x x x x
+
+*/
+
+ if(m<=0 || n<=0)
+ return;
+
+ const int bs = 4;
+
+ int mna = (bs-offsetA%bs)%bs;
+ mna = m<mna ? m : mna;
+ int nna = (bs-offsetC%bs)%bs;
+ nna = n<nna ? n : nna;
+
+ int ii;
+
+ ii = 0;
+
+ if(mna>0)
+ {
+ if(mna==1)
+ kernel_sgetr_1_lib4(0, n, nna, alpha, pA, pC, sdc);
+ else if(mna==2)
+ kernel_sgetr_2_lib4(0, n, nna, alpha, pA, pC, sdc);
+ else //if(mna==3)
+ kernel_sgetr_3_lib4(0, n, nna, alpha, pA, pC, sdc);
+ ii += mna;
+ pA += mna + bs*(sda-1);
+ pC += mna*bs;
+ }
+ for( ; ii<m-3; ii+=4)
+// for( ; ii<m; ii+=4)
+ {
+ kernel_sgetr_4_lib4(0, n, nna, alpha, pA, pC, sdc);
+ pA += bs*sda;
+ pC += bs*bs;
+ }
+
+ // clean-up at the end using smaller kernels
+ if(ii==m)
+ return;
+
+ if(m-ii==1)
+ kernel_sgetr_1_lib4(0, n, nna, alpha, pA, pC, sdc);
+ else if(m-ii==2)
+ kernel_sgetr_2_lib4(0, n, nna, alpha, pA, pC, sdc);
+ else if(m-ii==3)
+ kernel_sgetr_3_lib4(0, n, nna, alpha, pA, pC, sdc);
+
+ return;
+
+ }
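+
+
+
+// Reference sketch (illustrative only, not used by the library): what
+// sgetr_lib computes, written without the kernel calls. Element (i,j) of a
+// panel-major matrix with row offset off inside its first panel and panel
+// stride sd lives at p[(off+i)/bs*bs*sd + (off+i)%bs + j*bs - off].
+#if 0
+static void sgetr_ref(int m, int n, float alpha, int offsetA, float *pA, int sda, int offsetC, float *pC, int sdc)
+	{
+	const int bs = 4;
+	int i, j;
+	for(j=0; j<n; j++)
+		{
+		for(i=0; i<m; i++)
+			{
+			// C(j,i) = alpha * A(i,j), both stored panel-major
+			pC[(offsetC+j)/bs*bs*sdc + (offsetC+j)%bs + i*bs - offsetC] = alpha * pA[(offsetA+i)/bs*bs*sda + (offsetA+i)%bs + j*bs - offsetA];
+			}
+		}
+	return;
+	}
+#endif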
+
+
+
+// transpose lower triangular matrix
+void strtr_l_lib(int m, float alpha, int offsetA, float *pA, int sda, int offsetC, float *pC, int sdc)
+ {
+
+/*
+
+A =
+ x
+ x x
+ x x x
+ x x x x
+
+ x x x x x
+ x x x x x x
+ x x x x x x x
+ x x x x x x x x
+
+C =
+ x x x x x x x x
+
+ x x x x x x x
+ x x x x x x
+ x x x x x
+ x x x x
+
+ x x x
+ x x
+ x
+
+*/
+
+ int n = m;
+
+ if(m<=0 || n<=0)
+ return;
+
+ const int bs = 4;
+
+ int mna = (bs-offsetA%bs)%bs;
+ mna = m<mna ? m : mna;
+ int nna = (bs-offsetC%bs)%bs;
+ nna = n<nna ? n : nna;
+
+ int ii;
+
+ ii = 0;
+
+ if(mna>0)
+ {
+ if(mna==1)
+ {
+ pC[0] = alpha * pA[0];
+ }
+ else if(mna==2)
+ {
+ if(nna==1)
+ {
+ pC[0+bs*0] = alpha * pA[0+bs*0];
+ pC[0+bs*1] = alpha * pA[1+bs*0];
+ pC[1+bs*(0+sdc)] = alpha * pA[1+bs*1];
+ }
+ else
+ {
+ pC[0+bs*0] = alpha * pA[0+bs*0];
+ pC[0+bs*1] = alpha * pA[1+bs*0];
+ pC[1+bs*1] = alpha * pA[1+bs*1];
+ }
+ }
+ else //if(mna==3)
+ {
+ if(nna==1)
+ {
+ pC[0+bs*0] = alpha * pA[0+bs*0];
+ pC[0+bs*1] = alpha * pA[1+bs*0];
+ pC[0+bs*2] = alpha * pA[2+bs*0];
+ pC[1+bs*(0+sdc)] = alpha * pA[1+bs*1];
+ pC[1+bs*(1+sdc)] = alpha * pA[2+bs*1];
+ pC[2+bs*(1+sdc)] = alpha * pA[2+bs*2];
+ }
+ else if(nna==2)
+ {
+ pC[0+bs*0] = alpha * pA[0+bs*0];
+ pC[0+bs*1] = alpha * pA[1+bs*0];
+ pC[0+bs*2] = alpha * pA[2+bs*0];
+ pC[1+bs*1] = alpha * pA[1+bs*1];
+ pC[1+bs*2] = alpha * pA[2+bs*1];
+ pC[2+bs*(1+sdc)] = alpha * pA[2+bs*2];
+ }
+ else
+ {
+ pC[0+bs*0] = alpha * pA[0+bs*0];
+ pC[0+bs*1] = alpha * pA[1+bs*0];
+ pC[0+bs*2] = alpha * pA[2+bs*0];
+ pC[1+bs*1] = alpha * pA[1+bs*1];
+ pC[1+bs*2] = alpha * pA[2+bs*1];
+ pC[2+bs*2] = alpha * pA[2+bs*2];
+ }
+ }
+ ii += mna;
+ pA += mna + bs*(sda-1);
+ pC += mna*bs;
+ }
+ for( ; ii<m-3; ii+=4)
+ {
+ kernel_sgetr_4_lib4(1, ii, nna, alpha, pA, pC, sdc);
+ pA += bs*sda;
+ pC += bs*bs;
+ }
+
+ // clean-up at the end using smaller kernels
+ if(ii==m)
+ return;
+
+ if(m-ii==1)
+ kernel_sgetr_1_lib4(1, ii, nna, alpha, pA, pC, sdc);
+ else if(m-ii==2)
+ kernel_sgetr_2_lib4(1, ii, nna, alpha, pA, pC, sdc);
+ else if(m-ii==3)
+ kernel_sgetr_3_lib4(1, ii, nna, alpha, pA, pC, sdc);
+
+ return;
+
+ }
+
+
+
+// transpose an aligned upper triangular matrix into an aligned lower triangular matrix
+void strtr_u_lib(int m, float alpha, int offsetA, float *pA, int sda, int offsetC, float *pC, int sdc)
+ {
+
+/*
+
+A =
+ x x x x x x x x
+ x x x x x x x
+
+ x x x x x x
+ x x x x x
+ x x x x
+ x x x
+ x x
+ x
+
+C =
+ x
+
+ x x
+ x x x
+ x x x x
+ x x x x x
+ x x x x x x
+ x x x x x x x
+ x x x x x x x x
+
+*/
+
+ int n = m;
+
+ if(m<=0 || n<=0)
+ return;
+
+ const int bs = 4;
+
+ int mna = (bs-offsetA%bs)%bs;
+ mna = m<mna ? m : mna;
+ int nna = (bs-offsetC%bs)%bs;
+ nna = n<nna ? n : nna;
+ int tna = nna;
+
+ int ii;
+
+ ii = 0;
+
+ if(mna>0)
+ {
+ if(mna==1)
+ {
+ kernel_sgetr_1_lib4(0, n, nna, alpha, pA, pC, sdc);
+ if(nna!=1)
+ {
+// pC[0+bs*0] = alpha * pA[0+bs*0];
+ pA += 1*bs;
+ pC += 1;
+ tna = (bs-(offsetC+1)%bs)%bs;
+ }
+ else //if(nna==1)
+ {
+// pC[0+bs*0] = alpha * pA[0+bs*0];
+ pA += 1*bs;
+ pC += 1 + (sdc-1)*bs;
+ tna = 0; //(bs-(offsetC+1)%bs)%bs;
+ }
+// kernel_sgetr_1_lib4(0, n-1, tna, alpha, pA, pC, sdc);
+ }
+ else if(mna==2)
+ {
+ if(nna==0 || nna==3)
+ {
+ pC[0+bs*0] = alpha * pA[0+bs*0];
+ pC[1+bs*0] = alpha * pA[0+bs*1];
+ pC[1+bs*1] = alpha * pA[1+bs*1];
+ pA += 2*bs;
+ pC += 2;
+ tna = (bs-(offsetC+2)%bs)%bs;
+ kernel_sgetr_2_lib4(0, n-2, tna, alpha, pA, pC, sdc);
+ }
+ else if(nna==1)
+ {
+ pC[0+bs*0] = alpha * pA[0+bs*0];
+ pA += 1*bs;
+ pC += 1 + (sdc-1)*bs;
+// pC[0+bs*0] = alpha * pA[0+bs*0];
+// pC[0+bs*1] = alpha * pA[1+bs*0];
+ kernel_sgetr_2_lib4(0, n-1, 0, alpha, pA, pC, sdc);
+ pA += 1*bs;
+ pC += 1;
+ tna = 3; //(bs-(offsetC+2)%bs)%bs;
+// kernel_sgetr_2_lib4(0, n-2, tna, alpha, pA, pC, sdc);
+ }
+ else if(nna==2)
+ {
+ pC[0+bs*0] = alpha * pA[0+bs*0];
+ pC[1+bs*0] = alpha * pA[0+bs*1];
+ pC[1+bs*1] = alpha * pA[1+bs*1];
+ pA += 2*bs;
+ pC += 2 + (sdc-1)*bs;
+ tna = 0; //(bs-(offsetC+2)%bs)%bs;
+ kernel_sgetr_2_lib4(0, n-2, tna, alpha, pA, pC, sdc);
+ }
+ }
+ else //if(mna==3)
+ {
+ if(nna==0)
+ {
+ pC[0+bs*0] = alpha * pA[0+bs*0];
+ pC[1+bs*0] = alpha * pA[0+bs*1];
+ pC[1+bs*1] = alpha * pA[1+bs*1];
+ pC[2+bs*0] = alpha * pA[0+bs*2];
+ pC[2+bs*1] = alpha * pA[1+bs*2];
+ pC[2+bs*2] = alpha * pA[2+bs*2];
+ pA += 3*bs;
+ pC += 3;
+ tna = 1;
+ kernel_sgetr_3_lib4(0, n-3, tna, alpha, pA, pC, sdc);
+ }
+ else if(nna==1)
+ {
+ pC[0+bs*0] = alpha * pA[0+bs*0];
+ pA += bs;
+ pC += 1 + (sdc-1)*bs;
+ pC[0+bs*0] = alpha * pA[0+bs*0];
+ pC[0+bs*1] = alpha * pA[1+bs*0];
+ pC[1+bs*0] = alpha * pA[0+bs*1];
+ pC[1+bs*1] = alpha * pA[1+bs*1];
+ pC[1+bs*2] = alpha * pA[2+bs*1];
+ pA += 2*bs;
+ pC += 2;
+ tna = 2;
+ kernel_sgetr_3_lib4(0, n-3, tna, alpha, pA, pC, sdc);
+ }
+ else if(nna==2)
+ {
+ pC[0+bs*0] = alpha * pA[0+bs*0];
+ pC[1+bs*0] = alpha * pA[0+bs*1];
+ pC[1+bs*1] = alpha * pA[1+bs*1];
+ pA += 2*bs;
+ pC += 2 + (sdc-1)*bs;
+// pC[0+bs*0] = alpha * pA[0+bs*0];
+// pC[0+bs*1] = alpha * pA[1+bs*0];
+// pC[0+bs*2] = alpha * pA[2+bs*0];
+ kernel_sgetr_3_lib4(0, n-2, 0, alpha, pA, pC, sdc);
+ pA += 1*bs;
+ pC += 1;
+ tna = 3;
+// kernel_sgetr_3_lib4(0, n-3, tna, alpha, pA, pC, sdc);
+ }
+ else //if(nna==3)
+ {
+ pC[0+bs*0] = alpha * pA[0+bs*0];
+ pC[1+bs*0] = alpha * pA[0+bs*1];
+ pC[1+bs*1] = alpha * pA[1+bs*1];
+ pC[2+bs*0] = alpha * pA[0+bs*2];
+ pC[2+bs*1] = alpha * pA[1+bs*2];
+ pC[2+bs*2] = alpha * pA[2+bs*2];
+ pA += 3*bs;
+ pC += 3 + (sdc-1)*bs;
+ tna = 0;
+ kernel_sgetr_3_lib4(0, n-3, tna, alpha, pA, pC, sdc);
+ }
+ }
+ ii += mna;
+ pA += mna + bs*(sda-1);
+ pC += mna*bs;
+ }
+ for( ; ii<m-3; ii+=4)
+ {
+ if(tna==0)
+ {
+ pC[0+bs*0] = alpha * pA[0+bs*0];
+ pC[1+bs*0] = alpha * pA[0+bs*1];
+ pC[1+bs*1] = alpha * pA[1+bs*1];
+ pC[2+bs*0] = alpha * pA[0+bs*2];
+ pC[2+bs*1] = alpha * pA[1+bs*2];
+ pC[2+bs*2] = alpha * pA[2+bs*2];
+ pC[3+bs*0] = alpha * pA[0+bs*3];
+ pC[3+bs*1] = alpha * pA[1+bs*3];
+ pC[3+bs*2] = alpha * pA[2+bs*3];
+ pC[3+bs*3] = alpha * pA[3+bs*3];
+ pA += 4*bs;
+ pC += sdc*bs;
+ kernel_sgetr_4_lib4(0, n-ii-4, 0, alpha, pA, pC, sdc);
+ }
+ else if(tna==1)
+ {
+ pC[0+bs*0] = alpha * pA[0+bs*0];
+ pA += bs;
+ pC += 1 + (sdc-1)*bs;
+ pC[0+bs*0] = alpha * pA[0+bs*0];
+ pC[0+bs*1] = alpha * pA[1+bs*0];
+ pC[1+bs*0] = alpha * pA[0+bs*1];
+ pC[1+bs*1] = alpha * pA[1+bs*1];
+ pC[1+bs*2] = alpha * pA[2+bs*1];
+ pC[2+bs*0] = alpha * pA[0+bs*2];
+ pC[2+bs*1] = alpha * pA[1+bs*2];
+ pC[2+bs*2] = alpha * pA[2+bs*2];
+ pC[2+bs*3] = alpha * pA[3+bs*2];
+ pA += 3*bs;
+ pC += 3;
+ kernel_sgetr_4_lib4(0, n-ii-4, 1, alpha, pA, pC, sdc);
+ }
+ else if(tna==2)
+ {
+ pC[0+bs*0] = alpha * pA[0+bs*0];
+ pC[1+bs*0] = alpha * pA[0+bs*1];
+ pC[1+bs*1] = alpha * pA[1+bs*1];
+ pA += 2*bs;
+ pC += 2 + (sdc-1)*bs;
+ pC[0+bs*0] = alpha * pA[0+bs*0];
+ pC[0+bs*1] = alpha * pA[1+bs*0];
+ pC[0+bs*2] = alpha * pA[2+bs*0];
+ pC[1+bs*0] = alpha * pA[0+bs*1];
+ pC[1+bs*1] = alpha * pA[1+bs*1];
+ pC[1+bs*2] = alpha * pA[2+bs*1];
+ pC[1+bs*3] = alpha * pA[3+bs*1];
+ pA += 2*bs;
+ pC += 2;
+ kernel_sgetr_4_lib4(0, n-ii-4, 2, alpha, pA, pC, sdc);
+ }
+ else //if(tna==3)
+ {
+ pC[0+bs*0] = alpha * pA[0+bs*0];
+ pC[1+bs*0] = alpha * pA[0+bs*1];
+ pC[1+bs*1] = alpha * pA[1+bs*1];
+ pC[2+bs*0] = alpha * pA[0+bs*2];
+ pC[2+bs*1] = alpha * pA[1+bs*2];
+ pC[2+bs*2] = alpha * pA[2+bs*2];
+ pA += 3*bs;
+ pC += 3 + (sdc-1)*bs;
+ kernel_sgetr_4_lib4(0, n-ii-3, 0, alpha, pA, pC, sdc);
+// pC[0+bs*0] = alpha * pA[0+bs*0];
+// pC[0+bs*1] = alpha * pA[1+bs*0];
+// pC[0+bs*2] = alpha * pA[2+bs*0];
+// pC[0+bs*3] = alpha * pA[3+bs*0];
+ pA += bs;
+ pC += 1;
+// kernel_sgetr_4_lib4(0, n-ii-4, tna, alpha, pA, pC, sdc);
+ }
+ pA += bs*sda;
+ pC += bs*bs;
+ }
+
+ // clean-up at the end
+ if(ii==m)
+ return;
+
+ if(m-ii==1)
+ {
+ pC[0+bs*0] = alpha * pA[0+bs*0];
+ }
+ else if(m-ii==2)
+ {
+ if(tna!=1)
+ {
+ pC[0+bs*0] = alpha * pA[0+bs*0];
+ pC[1+bs*0] = alpha * pA[0+bs*1];
+ pC[1+bs*1] = alpha * pA[1+bs*1];
+ }
+ else //if(tna==1)
+ {
+ pC[0+bs*0] = alpha * pA[0+bs*0];
+ pA += bs;
+ pC += 1 + (sdc-1)*bs;
+ pC[0+bs*0] = alpha * pA[0+bs*0];
+ pC[0+bs*1] = alpha * pA[1+bs*0];
+ }
+ }
+ else if(m-ii==3)
+ {
+ if(tna==0 || tna==3)
+ {
+ pC[0+bs*0] = alpha * pA[0+bs*0];
+ pC[1+bs*0] = alpha * pA[0+bs*1];
+ pC[1+bs*1] = alpha * pA[1+bs*1];
+ pC[2+bs*0] = alpha * pA[0+bs*2];
+ pC[2+bs*1] = alpha * pA[1+bs*2];
+ pC[2+bs*2] = alpha * pA[2+bs*2];
+ }
+ else if(tna==1)
+ {
+ pC[0+bs*0] = alpha * pA[0+bs*0];
+ pA += bs;
+ pC += 1 + (sdc-1)*bs;
+ pC[0+bs*0] = alpha * pA[0+bs*0];
+ pC[0+bs*1] = alpha * pA[1+bs*0];
+			pC[1+bs*0] = alpha * pA[0+bs*1];
+ pC[1+bs*1] = alpha * pA[1+bs*1];
+ pC[1+bs*2] = alpha * pA[2+bs*1];
+ }
+ else //if(tna==2)
+ {
+ pC[0+bs*0] = alpha * pA[0+bs*0];
+ pC[1+bs*0] = alpha * pA[0+bs*1];
+ pC[1+bs*1] = alpha * pA[1+bs*1];
+ pA += 2*bs;
+ pC += 2 + (sdc-1)*bs;
+ pC[0+bs*0] = alpha * pA[0+bs*0];
+ pC[0+bs*1] = alpha * pA[1+bs*0];
+ pC[0+bs*2] = alpha * pA[2+bs*0];
+ }
+ }
+
+ return;
+
+ }
+
+
+
+// regularize diagonal
+void sdiareg_lib(int kmax, float reg, int offset, float *pD, int sdd)
+ {
+
+ const int bs = 4;
+
+ int kna = (bs-offset%bs)%bs;
+ kna = kmax<kna ? kmax : kna;
+
+ int jj, ll;
+
+ if(kna>0)
+ {
+ for(ll=0; ll<kna; ll++)
+ {
+ pD[ll+bs*ll] += reg;
+ }
+ pD += kna + bs*(sdd-1) + kna*bs;
+ kmax -= kna;
+ }
+ for(jj=0; jj<kmax-3; jj+=4)
+ {
+ pD[jj*sdd+(jj+0)*bs+0] += reg;
+ pD[jj*sdd+(jj+1)*bs+1] += reg;
+ pD[jj*sdd+(jj+2)*bs+2] += reg;
+ pD[jj*sdd+(jj+3)*bs+3] += reg;
+ }
+ for(ll=0; ll<kmax-jj; ll++)
+ {
+ pD[jj*sdd+(jj+ll)*bs+ll] += reg;
+ }
+
+ }
+
+
+
+// insert vector to diagonal
+void sdiain_lib(int kmax, float alpha, float *x, int offset, float *pD, int sdd)
+ {
+
+ const int bs = 4;
+
+ int kna = (bs-offset%bs)%bs;
+ kna = kmax<kna ? kmax : kna;
+
+ int jj, ll;
+
+ if(kna>0)
+ {
+ for(ll=0; ll<kna; ll++)
+ {
+ pD[ll+bs*ll] = alpha*x[ll];
+ }
+ pD += kna + bs*(sdd-1) + kna*bs;
+ x += kna;
+ kmax -= kna;
+ }
+ for(jj=0; jj<kmax-3; jj+=4)
+ {
+ pD[jj*sdd+(jj+0)*bs+0] = alpha*x[jj+0];
+ pD[jj*sdd+(jj+1)*bs+1] = alpha*x[jj+1];
+ pD[jj*sdd+(jj+2)*bs+2] = alpha*x[jj+2];
+ pD[jj*sdd+(jj+3)*bs+3] = alpha*x[jj+3];
+ }
+ for(ll=0; ll<kmax-jj; ll++)
+ {
+ pD[jj*sdd+(jj+ll)*bs+ll] = alpha*x[jj+ll];
+ }
+
+ }
+
+
+
+// insert sqrt of vector to diagonal
+void sdiain_sqrt_lib(int kmax, float *x, int offset, float *pD, int sdd)
+ {
+
+ const int bs = 4;
+
+ int kna = (bs-offset%bs)%bs;
+ kna = kmax<kna ? kmax : kna;
+
+ int jj, ll;
+
+ if(kna>0)
+ {
+ for(ll=0; ll<kna; ll++)
+ {
+ pD[ll+bs*ll] = sqrt(x[ll]);
+ }
+ pD += kna + bs*(sdd-1) + kna*bs;
+ x += kna;
+ kmax -= kna;
+ }
+ for(jj=0; jj<kmax-3; jj+=4)
+ {
+ pD[jj*sdd+(jj+0)*bs+0] = sqrt(x[jj+0]);
+ pD[jj*sdd+(jj+1)*bs+1] = sqrt(x[jj+1]);
+ pD[jj*sdd+(jj+2)*bs+2] = sqrt(x[jj+2]);
+ pD[jj*sdd+(jj+3)*bs+3] = sqrt(x[jj+3]);
+ }
+ for(ll=0; ll<kmax-jj; ll++)
+ {
+ pD[jj*sdd+(jj+ll)*bs+ll] = sqrt(x[jj+ll]);
+ }
+
+ }
+
+
+
+// extract diagonal to vector
+void sdiaex_lib(int kmax, float alpha, int offset, float *pD, int sdd, float *x)
+ {
+
+ const int bs = 4;
+
+ int kna = (bs-offset%bs)%bs;
+ kna = kmax<kna ? kmax : kna;
+
+ int jj, ll;
+
+ if(kna>0)
+ {
+ for(ll=0; ll<kna; ll++)
+ {
+ x[ll] = alpha * pD[ll+bs*ll];
+ }
+ pD += kna + bs*(sdd-1) + kna*bs;
+ x += kna;
+ kmax -= kna;
+ }
+ for(jj=0; jj<kmax-3; jj+=4)
+ {
+ x[jj+0] = alpha * pD[jj*sdd+(jj+0)*bs+0];
+ x[jj+1] = alpha * pD[jj*sdd+(jj+1)*bs+1];
+ x[jj+2] = alpha * pD[jj*sdd+(jj+2)*bs+2];
+ x[jj+3] = alpha * pD[jj*sdd+(jj+3)*bs+3];
+ }
+ for(ll=0; ll<kmax-jj; ll++)
+ {
+ x[jj+ll] = alpha * pD[jj*sdd+(jj+ll)*bs+ll];
+ }
+
+ }
+
+
+
+// add scaled vector to diagonal
+void sdiaad_lib(int kmax, float alpha, float *x, int offset, float *pD, int sdd)
+ {
+
+ const int bs = 4;
+
+ int kna = (bs-offset%bs)%bs;
+ kna = kmax<kna ? kmax : kna;
+
+ int jj, ll;
+
+ if(kna>0)
+ {
+ for(ll=0; ll<kna; ll++)
+ {
+ pD[ll+bs*ll] += alpha * x[ll];
+ }
+ pD += kna + bs*(sdd-1) + kna*bs;
+ x += kna;
+ kmax -= kna;
+ }
+ for(jj=0; jj<kmax-3; jj+=4)
+ {
+ pD[jj*sdd+(jj+0)*bs+0] += alpha * x[jj+0];
+ pD[jj*sdd+(jj+1)*bs+1] += alpha * x[jj+1];
+ pD[jj*sdd+(jj+2)*bs+2] += alpha * x[jj+2];
+ pD[jj*sdd+(jj+3)*bs+3] += alpha * x[jj+3];
+ }
+ for(ll=0; ll<kmax-jj; ll++)
+ {
+ pD[jj*sdd+(jj+ll)*bs+ll] += alpha * x[jj+ll];
+ }
+
+ }
+
+
+
+// insert vector to diagonal, sparse formulation
+void sdiain_libsp(int kmax, int *idx, float alpha, float *x, float *pD, int sdd)
+ {
+
+ const int bs = 4;
+
+ int ii, jj;
+
+ for(jj=0; jj<kmax; jj++)
+ {
+ ii = idx[jj];
+ pD[ii/bs*bs*sdd+ii%bs+ii*bs] = alpha * x[jj];
+ }
+
+ }
+
+
+
+// extract diagonal to vector, sparse formulation
+void sdiaex_libsp(int kmax, int *idx, float alpha, float *pD, int sdd, float *x)
+ {
+
+ const int bs = 4;
+
+ int ii, jj;
+
+ for(jj=0; jj<kmax; jj++)
+ {
+ ii = idx[jj];
+ x[jj] = alpha * pD[ii/bs*bs*sdd+ii%bs+ii*bs];
+ }
+
+ }
+
+
+
+// add scaled vector to diagonal, sparse formulation
+void sdiaad_libsp(int kmax, int *idx, float alpha, float *x, float *pD, int sdd)
+ {
+
+ const int bs = 4;
+
+ int ii, jj;
+
+ for(jj=0; jj<kmax; jj++)
+ {
+ ii = idx[jj];
+ pD[ii/bs*bs*sdd+ii%bs+ii*bs] += alpha * x[jj];
+ }
+
+ }
+
+
+
+// add scaled vector to another vector and insert to diagonal, sparse formulation
+void sdiaadin_libsp(int kmax, int *idx, float alpha, float *x, float *y, float *pD, int sdd)
+ {
+
+ const int bs = 4;
+
+ int ii, jj;
+
+ for(jj=0; jj<kmax; jj++)
+ {
+ ii = idx[jj];
+ pD[ii/bs*bs*sdd+ii%bs+ii*bs] = y[jj] + alpha * x[jj];
+ }
+
+ }
+
+
+
+// insert vector to row
+void srowin_lib(int kmax, float alpha, float *x, float *pD)
+ {
+
+ const int bs = 4;
+
+ int jj, ll;
+
+ for(jj=0; jj<kmax-3; jj+=4)
+ {
+ pD[(jj+0)*bs] = alpha*x[jj+0];
+ pD[(jj+1)*bs] = alpha*x[jj+1];
+ pD[(jj+2)*bs] = alpha*x[jj+2];
+ pD[(jj+3)*bs] = alpha*x[jj+3];
+ }
+ for(; jj<kmax; jj++)
+ {
+ pD[(jj)*bs] = alpha*x[jj];
+ }
+
+ }
+
+
+
+// extract row to vector
+void srowex_lib(int kmax, float alpha, float *pD, float *x)
+ {
+
+ const int bs = 4;
+
+ int jj, ll;
+
+ for(jj=0; jj<kmax-3; jj+=4)
+ {
+ x[jj+0] = alpha*pD[(jj+0)*bs];
+ x[jj+1] = alpha*pD[(jj+1)*bs];
+ x[jj+2] = alpha*pD[(jj+2)*bs];
+ x[jj+3] = alpha*pD[(jj+3)*bs];
+ }
+ for(; jj<kmax; jj++)
+ {
+ x[jj] = alpha*pD[(jj)*bs];
+ }
+
+ }
+
+
+
+// add scaled vector to row
+void srowad_lib(int kmax, float alpha, float *x, float *pD)
+ {
+
+ const int bs = 4;
+
+ int jj, ll;
+
+ for(jj=0; jj<kmax-3; jj+=4)
+ {
+ pD[(jj+0)*bs] += alpha * x[jj+0];
+ pD[(jj+1)*bs] += alpha * x[jj+1];
+ pD[(jj+2)*bs] += alpha * x[jj+2];
+ pD[(jj+3)*bs] += alpha * x[jj+3];
+ }
+ for(; jj<kmax; jj++)
+ {
+ pD[(jj)*bs] += alpha * x[jj];
+ }
+
+ }
+
+
+
+// insert vector to row, sparse formulation
+void srowin_libsp(int kmax, float alpha, int *idx, float *x, float *pD)
+ {
+
+ const int bs = 4;
+
+ int ii, jj;
+
+ for(jj=0; jj<kmax; jj++)
+ {
+ ii = idx[jj];
+ pD[ii*bs] = alpha*x[jj];
+ }
+
+ }
+
+
+
+// add scaled vector to row, sparse formulation
+void srowad_libsp(int kmax, int *idx, float alpha, float *x, float *pD)
+ {
+
+ const int bs = 4;
+
+ int ii, jj;
+
+ for(jj=0; jj<kmax; jj++)
+ {
+ ii = idx[jj];
+ pD[ii*bs] += alpha * x[jj];
+ }
+
+ }
+
+
+
+// add scaled vector to another vector and insert to row, sparse formulation
+void srowadin_libsp(int kmax, int *idx, float alpha, float *x, float *y, float *pD)
+ {
+
+ const int bs = 4;
+
+ int ii, jj;
+
+ for(jj=0; jj<kmax; jj++)
+ {
+ ii = idx[jj];
+ pD[ii*bs] = y[jj] + alpha * x[jj];
+ }
+
+ }
+
+
+
+// swap two rows
+void srowsw_lib(int kmax, float *pA, float *pC)
+ {
+
+ const int bs = 4;
+
+ int ii;
+ float tmp;
+
+ for(ii=0; ii<kmax-3; ii+=4)
+ {
+ tmp = pA[0+bs*0];
+ pA[0+bs*0] = pC[0+bs*0];
+ pC[0+bs*0] = tmp;
+ tmp = pA[0+bs*1];
+ pA[0+bs*1] = pC[0+bs*1];
+ pC[0+bs*1] = tmp;
+ tmp = pA[0+bs*2];
+ pA[0+bs*2] = pC[0+bs*2];
+ pC[0+bs*2] = tmp;
+ tmp = pA[0+bs*3];
+ pA[0+bs*3] = pC[0+bs*3];
+ pC[0+bs*3] = tmp;
+ pA += 4*bs;
+ pC += 4*bs;
+ }
+ for( ; ii<kmax; ii++)
+ {
+ tmp = pA[0+bs*0];
+ pA[0+bs*0] = pC[0+bs*0];
+ pC[0+bs*0] = tmp;
+ pA += 1*bs;
+ pC += 1*bs;
+ }
+
+ }
+
+
+
+// insert vector to column
+void scolin_lib(int kmax, float *x, int offset, float *pD, int sdd)
+ {
+
+ const int bs = 4;
+
+ int kna = (bs-offset%bs)%bs;
+ kna = kmax<kna ? kmax : kna;
+
+ int jj, ll;
+
+ if(kna>0)
+ {
+ for(ll=0; ll<kna; ll++)
+ {
+ pD[ll] = x[ll];
+ }
+ pD += kna + bs*(sdd-1);
+ x += kna;
+ kmax -= kna;
+ }
+ for(jj=0; jj<kmax-3; jj+=4)
+ {
+ pD[jj*sdd+0] = x[jj+0];
+ pD[jj*sdd+1] = x[jj+1];
+ pD[jj*sdd+2] = x[jj+2];
+ pD[jj*sdd+3] = x[jj+3];
+ }
+ for(ll=0; ll<kmax-jj; ll++)
+ {
+ pD[jj*sdd+ll] = x[jj+ll];
+ }
+
+ }
+
+
+
+// add scaled vector to column
+void scolad_lib(int kmax, float alpha, float *x, int offset, float *pD, int sdd)
+ {
+
+ const int bs = 4;
+
+ int kna = (bs-offset%bs)%bs;
+ kna = kmax<kna ? kmax : kna;
+
+ int jj, ll;
+
+ if(kna>0)
+ {
+ for(ll=0; ll<kna; ll++)
+ {
+ pD[ll] += alpha * x[ll];
+ }
+ pD += kna + bs*(sdd-1);
+ x += kna;
+ kmax -= kna;
+ }
+ for(jj=0; jj<kmax-3; jj+=4)
+ {
+ pD[jj*sdd+0] += alpha * x[jj+0];
+ pD[jj*sdd+1] += alpha * x[jj+1];
+ pD[jj*sdd+2] += alpha * x[jj+2];
+ pD[jj*sdd+3] += alpha * x[jj+3];
+ }
+ for(ll=0; ll<kmax-jj; ll++)
+ {
+ pD[jj*sdd+ll] += alpha * x[jj+ll];
+ }
+
+ }
+
+
+
+// insert vector to column, sparse formulation
+void scolin_libsp(int kmax, int *idx, float *x, float *pD, int sdd)
+ {
+
+ const int bs = 4;
+
+ int ii, jj;
+
+ for(jj=0; jj<kmax; jj++)
+ {
+ ii = idx[jj];
+ pD[ii/bs*bs*sdd+ii%bs] = x[jj];
+ }
+
+ }
+
+
+
+// add scaled vector to column, sparse formulation
+void scolad_libsp(int kmax, float alpha, int *idx, float *x, float *pD, int sdd)
+ {
+
+ const int bs = 4;
+
+ int ii, jj;
+
+ for(jj=0; jj<kmax; jj++)
+ {
+ ii = idx[jj];
+ pD[ii/bs*bs*sdd+ii%bs] += alpha * x[jj];
+ }
+
+ }
+
+
+
+// swaps two cols
+void scolsw_lib(int kmax, int offsetA, float *pA, int sda, int offsetC, float *pC, int sdc)
+ {
+
+ const int bs = 4;
+
+ int ii;
+
+ float tmp;
+
+ if(offsetA==offsetC)
+ {
+ if(offsetA>0)
+ {
+ ii = 0;
+ for(; ii<bs-offsetA; ii++)
+ {
+ tmp = pA[0+bs*0];
+ pA[0+bs*0] = pC[0+bs*0];
+ pC[0+bs*0] = tmp;
+ pA += 1;
+ pC += 1;
+ }
+ pA += bs*(sda-1);
+ pC += bs*(sdc-1);
+ kmax -= bs-offsetA;
+ }
+ ii = 0;
+ for(; ii<kmax-3; ii+=4)
+ {
+ tmp = pA[0+bs*0];
+ pA[0+bs*0] = pC[0+bs*0];
+ pC[0+bs*0] = tmp;
+ tmp = pA[1+bs*0];
+ pA[1+bs*0] = pC[1+bs*0];
+ pC[1+bs*0] = tmp;
+ tmp = pA[2+bs*0];
+ pA[2+bs*0] = pC[2+bs*0];
+ pC[2+bs*0] = tmp;
+ tmp = pA[3+bs*0];
+ pA[3+bs*0] = pC[3+bs*0];
+ pC[3+bs*0] = tmp;
+ pA += bs*sda;
+ pC += bs*sdc;
+ }
+ for(; ii<kmax; ii++)
+ {
+ tmp = pA[0+bs*0];
+ pA[0+bs*0] = pC[0+bs*0];
+ pC[0+bs*0] = tmp;
+ pA += 1;
+ pC += 1;
+ }
+ }
+ else
+ {
+ printf("\nscolsw: feature not implemented yet: offsetA!=offsetC\n\n");
+ exit(1);
+ }
+
+ return;
+
+ }
+
+
+
+// insert vector to vector, sparse formulation
+void svecin_libsp(int kmax, int *idx, float *x, float *y)
+ {
+
+ int jj;
+
+ for(jj=0; jj<kmax; jj++)
+ {
+ y[idx[jj]] = x[jj];
+ }
+
+ }
+
+
+
+// adds vector to vector, sparse formulation
+void svecad_libsp(int kmax, int *idx, float alpha, float *x, float *y)
+ {
+
+ int jj;
+
+ for(jj=0; jj<kmax; jj++)
+ {
+ y[idx[jj]] += alpha * x[jj];
+ }
+
+ }
+
+
+
+/****************************
+* new interface
+****************************/
+
+
+
+#if defined(LA_HIGH_PERFORMANCE)
+
+
+
+// return the memory size (in bytes) needed for a strmat
+int s_size_strmat(int m, int n)
+ {
+ const int bs = 4;
+ int nc = S_NC;
+ int al = bs*nc;
+ int pm = (m+bs-1)/bs*bs;
+ int cn = (n+nc-1)/nc*nc;
+ int tmp = m<n ? (m+al-1)/al*al : (n+al-1)/al*al; // al(min(m,n)) // XXX max ???
+ int memory_size = (pm*cn+tmp)*sizeof(float);
+ return memory_size;
+ }
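+
+
+
+// Note on the size computation above: pm rounds m up to a multiple of the
+// panel height bs, cn rounds n up to a multiple of S_NC, and tmp reserves
+// space for the diagonal backup dA, i.e. min(m,n) rounded up to a multiple of
+// bs*S_NC; the total is (pm*cn+tmp) floats (e.g. with bs=4, m=10 gives pm=12).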
+
+
+
+// return the memory size (in bytes) needed for the diagonal of a strmat
+int s_size_diag_strmat(int m, int n)
+ {
+ const int bs = 4;
+ int nc = S_NC;
+ int al = bs*nc;
+ int tmp = m<n ? (m+al-1)/al*al : (n+al-1)/al*al; // al(min(m,n)) // XXX max ???
+ int memory_size = tmp*sizeof(float);
+ return memory_size;
+ }
+
+
+
+// create a matrix structure for a matrix of size m*n by using memory passed by a pointer
+void s_create_strmat(int m, int n, struct s_strmat *sA, void *memory)
+ {
+ const int bs = 4;
+ int nc = S_NC;
+ int al = bs*nc;
+ sA->m = m;
+ sA->n = n;
+ int pm = (m+bs-1)/bs*bs;
+ int cn = (n+nc-1)/nc*nc;
+ sA->pm = pm;
+ sA->cn = cn;
+ float *ptr = (float *) memory;
+ sA->pA = ptr;
+ ptr += pm*cn;
+ int tmp = m<n ? (m+al-1)/al*al : (n+al-1)/al*al; // al(min(m,n)) // XXX max ???
+ sA->dA = ptr;
+ ptr += tmp;
+ sA->use_dA = 0;
+ sA->memory_size = (pm*cn+tmp)*sizeof(float);
+ return;
+ }
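+
+
+
+// Addressing sketch (illustrative only): after s_create_strmat, element (i,j)
+// of the panel-major matrix is reached with the same index arithmetic used by
+// the _libstr routines below (bs = 4 is the panel height, sA->cn the panel
+// stride).
+#if 0
+static float s_example_get(struct s_strmat *sA, int i, int j)
+	{
+	const int bs = 4;
+	int sda = sA->cn;
+	// panel i/bs, row i%bs inside the panel, column j
+	return sA->pA[i/bs*bs*sda + i%bs + j*bs];
+	}
+#endif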
+
+
+
+// return memory size (in bytes) needed for a strvec
+int s_size_strvec(int m)
+ {
+ const int bs = 4;
+// int nc = S_NC;
+// int al = bs*nc;
+ int pm = (m+bs-1)/bs*bs;
+ int memory_size = pm*sizeof(float);
+ return memory_size;
+ }
+
+
+
+// create a vector structure for a vector of size m by using memory passed by a pointer
+void s_create_strvec(int m, struct s_strvec *sa, void *memory)
+ {
+ const int bs = 4;
+// int nc = S_NC;
+// int al = bs*nc;
+ sa->m = m;
+ int pm = (m+bs-1)/bs*bs;
+ sa->pm = pm;
+ float *ptr = (float *) memory;
+ sa->pa = ptr;
+// ptr += pm;
+ sa->memory_size = pm*sizeof(float);
+ return;
+ }
+
+
+
+// convert a matrix into a matrix structure
+void s_cvt_mat2strmat(int m, int n, float *A, int lda, struct s_strmat *sA, int ai, int aj)
+ {
+ const int bs = 4;
+ int sda = sA->cn;
+ float *pA = sA->pA + aj*bs + ai/bs*bs*sda + ai%bs;
+ int i, ii, j, jj, m0, m1, m2;
+ float *B, *pB;
+ m0 = (bs-ai%bs)%bs;
+ if(m0>m)
+ m0 = m;
+ m1 = m - m0;
+ jj = 0;
+ for( ; jj<n-3; jj+=4)
+ {
+ B = A + jj*lda;
+ pB = pA + jj*bs;
+ ii = 0;
+ if(m0>0)
+ {
+ for( ; ii<m0; ii++)
+ {
+ pB[ii+bs*0] = B[ii+lda*0];
+ pB[ii+bs*1] = B[ii+lda*1];
+ pB[ii+bs*2] = B[ii+lda*2];
+ pB[ii+bs*3] = B[ii+lda*3];
+ }
+ B += m0;
+ pB += m0 + bs*(sda-1);
+ }
+ for( ; ii<m-3; ii+=4)
+ {
+ // col 0
+ pB[0+bs*0] = B[0+lda*0];
+ pB[1+bs*0] = B[1+lda*0];
+ pB[2+bs*0] = B[2+lda*0];
+ pB[3+bs*0] = B[3+lda*0];
+ // col 1
+ pB[0+bs*1] = B[0+lda*1];
+ pB[1+bs*1] = B[1+lda*1];
+ pB[2+bs*1] = B[2+lda*1];
+ pB[3+bs*1] = B[3+lda*1];
+ // col 2
+ pB[0+bs*2] = B[0+lda*2];
+ pB[1+bs*2] = B[1+lda*2];
+ pB[2+bs*2] = B[2+lda*2];
+ pB[3+bs*2] = B[3+lda*2];
+ // col 3
+ pB[0+bs*3] = B[0+lda*3];
+ pB[1+bs*3] = B[1+lda*3];
+ pB[2+bs*3] = B[2+lda*3];
+ pB[3+bs*3] = B[3+lda*3];
+ // update
+ B += 4;
+ pB += bs*sda;
+ }
+ for( ; ii<m; ii++)
+ {
+ // col 0
+ pB[0+bs*0] = B[0+lda*0];
+ // col 1
+ pB[0+bs*1] = B[0+lda*1];
+ // col 2
+ pB[0+bs*2] = B[0+lda*2];
+ // col 3
+ pB[0+bs*3] = B[0+lda*3];
+ // update
+ B += 1;
+ pB += 1;
+ }
+ }
+ for( ; jj<n; jj++)
+ {
+
+ B = A + jj*lda;
+ pB = pA + jj*bs;
+
+ ii = 0;
+ if(m0>0)
+ {
+ for( ; ii<m0; ii++)
+ {
+ pB[ii+bs*0] = B[ii+lda*0];
+ }
+ B += m0;
+ pB += m0 + bs*(sda-1);
+ }
+ for( ; ii<m-3; ii+=4)
+ {
+ // col 0
+ pB[0+bs*0] = B[0+lda*0];
+ pB[1+bs*0] = B[1+lda*0];
+ pB[2+bs*0] = B[2+lda*0];
+ pB[3+bs*0] = B[3+lda*0];
+ // update
+ B += 4;
+ pB += bs*sda;
+ }
+ for( ; ii<m; ii++)
+ {
+ // col 0
+ pB[0+bs*0] = B[0+lda*0];
+ // update
+ B += 1;
+ pB += 1;
+ }
+ }
+ return;
+ }
+
+
+
+// convert and transpose a matrix into a matrix structure
+void s_cvt_tran_mat2strmat(int m, int n, float *A, int lda, struct s_strmat *sA, int ai, int aj)
+ {
+ const int bs = 4;
+ int sda = sA->cn;
+ float *pA = sA->pA + aj*bs + ai/bs*bs*sda + ai%bs;
+ int i, ii, j, m0, m1, m2;
+ float *B, *pB;
+ m0 = (bs-ai%bs)%bs;
+ if(m0>n)
+ m0 = n;
+ m1 = n - m0;
+ ii = 0;
+ if(m0>0)
+ {
+ for(j=0; j<m; j++)
+ {
+ for(i=0; i<m0; i++)
+ {
+ pA[i+j*bs+ii*sda] = A[j+(i+ii)*lda];
+ }
+ }
+ A += m0*lda;
+ pA += m0 + bs*(sda-1);
+ }
+ ii = 0;
+ for(; ii<m1-3; ii+=bs)
+ {
+ j=0;
+ B = A + ii*lda;
+ pB = pA + ii*sda;
+ for(; j<m-3; j+=4)
+ {
+ // unroll 0
+ pB[0+0*bs] = B[0+0*lda];
+ pB[1+0*bs] = B[0+1*lda];
+ pB[2+0*bs] = B[0+2*lda];
+ pB[3+0*bs] = B[0+3*lda];
+ // unroll 1
+ pB[0+1*bs] = B[1+0*lda];
+ pB[1+1*bs] = B[1+1*lda];
+ pB[2+1*bs] = B[1+2*lda];
+ pB[3+1*bs] = B[1+3*lda];
+ // unroll 2
+ pB[0+2*bs] = B[2+0*lda];
+ pB[1+2*bs] = B[2+1*lda];
+ pB[2+2*bs] = B[2+2*lda];
+ pB[3+2*bs] = B[2+3*lda];
+ // unroll 3
+ pB[0+3*bs] = B[3+0*lda];
+ pB[1+3*bs] = B[3+1*lda];
+ pB[2+3*bs] = B[3+2*lda];
+ pB[3+3*bs] = B[3+3*lda];
+ B += 4;
+ pB += 4*bs;
+ }
+ for(; j<m; j++)
+ {
+ // unroll 0
+ pB[0+0*bs] = B[0+0*lda];
+ pB[1+0*bs] = B[0+1*lda];
+ pB[2+0*bs] = B[0+2*lda];
+ pB[3+0*bs] = B[0+3*lda];
+ B += 1;
+ pB += 1*bs;
+ }
+ }
+ if(ii<m1)
+ {
+ m2 = m1-ii;
+ if(bs<m2) m2 = bs;
+ for(j=0; j<m; j++)
+ {
+ for(i=0; i<m2; i++)
+ {
+ pA[i+j*bs+ii*sda] = A[j+(i+ii)*lda];
+ }
+ }
+ }
+ return;
+ }
+
+
+
+// convert a vector into a vector structure
+void s_cvt_vec2strvec(int m, float *a, struct s_strvec *sa, int ai)
+ {
+ float *pa = sa->pa + ai;
+ int ii;
+ for(ii=0; ii<m; ii++)
+ pa[ii] = a[ii];
+ return;
+ }
+
+
+
+// convert a matrix structure into a matrix
+void s_cvt_strmat2mat(int m, int n, struct s_strmat *sA, int ai, int aj, float *A, int lda)
+ {
+ const int bs = 4;
+ int sda = sA->cn;
+ float *pA = sA->pA + aj*bs + ai/bs*bs*sda + ai%bs;
+ int i, ii, jj;
+ int m0 = (bs-ai%bs)%bs;
+ float *ptr_pA;
+ jj=0;
+ for(; jj<n-3; jj+=4)
+ {
+ ptr_pA = pA + jj*bs;
+ ii = 0;
+ if(m0>0)
+ {
+ for(; ii<m0; ii++)
+ {
+ // unroll 0
+ A[ii+lda*(jj+0)] = ptr_pA[0+bs*0];
+ // unroll 1
+ A[ii+lda*(jj+1)] = ptr_pA[0+bs*1];
+ // unroll 2
+ A[ii+lda*(jj+2)] = ptr_pA[0+bs*2];
+ // unroll 3
+ A[ii+lda*(jj+3)] = ptr_pA[0+bs*3];
+ ptr_pA++;
+ }
+ ptr_pA += (sda-1)*bs;
+ }
+ for(; ii<m-bs+1; ii+=bs)
+ {
+ // unroll 0
+ A[0+ii+lda*(jj+0)] = ptr_pA[0+bs*0];
+ A[1+ii+lda*(jj+0)] = ptr_pA[1+bs*0];
+ A[2+ii+lda*(jj+0)] = ptr_pA[2+bs*0];
+ A[3+ii+lda*(jj+0)] = ptr_pA[3+bs*0];
+ // unroll 0
+ A[0+ii+lda*(jj+1)] = ptr_pA[0+bs*1];
+ A[1+ii+lda*(jj+1)] = ptr_pA[1+bs*1];
+ A[2+ii+lda*(jj+1)] = ptr_pA[2+bs*1];
+ A[3+ii+lda*(jj+1)] = ptr_pA[3+bs*1];
+ // unroll 0
+ A[0+ii+lda*(jj+2)] = ptr_pA[0+bs*2];
+ A[1+ii+lda*(jj+2)] = ptr_pA[1+bs*2];
+ A[2+ii+lda*(jj+2)] = ptr_pA[2+bs*2];
+ A[3+ii+lda*(jj+2)] = ptr_pA[3+bs*2];
+ // unroll 0
+ A[0+ii+lda*(jj+3)] = ptr_pA[0+bs*3];
+ A[1+ii+lda*(jj+3)] = ptr_pA[1+bs*3];
+ A[2+ii+lda*(jj+3)] = ptr_pA[2+bs*3];
+ A[3+ii+lda*(jj+3)] = ptr_pA[3+bs*3];
+ ptr_pA += sda*bs;
+ }
+ for(; ii<m; ii++)
+ {
+ // unroll 0
+ A[ii+lda*(jj+0)] = ptr_pA[0+bs*0];
+ // unroll 1
+ A[ii+lda*(jj+1)] = ptr_pA[0+bs*1];
+ // unroll 2
+ A[ii+lda*(jj+2)] = ptr_pA[0+bs*2];
+ // unroll 3
+ A[ii+lda*(jj+3)] = ptr_pA[0+bs*3];
+ ptr_pA++;
+ }
+ }
+ for(; jj<n; jj++)
+ {
+ ptr_pA = pA + jj*bs;
+ ii = 0;
+ if(m0>0)
+ {
+ for(; ii<m0; ii++)
+ {
+ A[ii+lda*jj] = ptr_pA[0];
+ ptr_pA++;
+ }
+ ptr_pA += (sda-1)*bs;
+ }
+ for(; ii<m-bs+1; ii+=bs)
+ {
+ A[0+ii+lda*jj] = ptr_pA[0];
+ A[1+ii+lda*jj] = ptr_pA[1];
+ A[2+ii+lda*jj] = ptr_pA[2];
+ A[3+ii+lda*jj] = ptr_pA[3];
+ ptr_pA += sda*bs;
+ }
+ for(; ii<m; ii++)
+ {
+ A[ii+lda*jj] = ptr_pA[0];
+ ptr_pA++;
+ }
+ }
+ return;
+ }
+
+
+
+// convert and transpose a matrix structure into a matrix
+void s_cvt_tran_strmat2mat(int m, int n, struct s_strmat *sA, int ai, int aj, float *A, int lda)
+ {
+ const int bs = 4;
+ int sda = sA->cn;
+ float *pA = sA->pA + aj*bs + ai/bs*bs*sda + ai%bs;
+ int i, ii, jj;
+ int m0 = (bs-ai%bs)%bs;
+ float *ptr_pA;
+ jj=0;
+ for(; jj<n-3; jj+=4)
+ {
+ ptr_pA = pA + jj*bs;
+ ii = 0;
+ if(m0>0)
+ {
+ for(; ii<m0; ii++)
+ {
+ // unroll 0
+ A[jj+0+lda*ii] = ptr_pA[0+bs*0];
+ // unroll 1
+ A[jj+1+lda*ii] = ptr_pA[0+bs*1];
+ // unroll 2
+ A[jj+2+lda*ii] = ptr_pA[0+bs*2];
+ // unroll 3
+ A[jj+3+lda*ii] = ptr_pA[0+bs*3];
+ ptr_pA++;
+ }
+ ptr_pA += (sda-1)*bs;
+ }
+ for(; ii<m-bs+1; ii+=bs)
+ {
+ // unroll 0
+ A[jj+0+lda*(ii+0)] = ptr_pA[0+bs*0];
+ A[jj+0+lda*(ii+1)] = ptr_pA[1+bs*0];
+ A[jj+0+lda*(ii+2)] = ptr_pA[2+bs*0];
+ A[jj+0+lda*(ii+3)] = ptr_pA[3+bs*0];
+ // unroll 1
+ A[jj+1+lda*(ii+0)] = ptr_pA[0+bs*1];
+ A[jj+1+lda*(ii+1)] = ptr_pA[1+bs*1];
+ A[jj+1+lda*(ii+2)] = ptr_pA[2+bs*1];
+ A[jj+1+lda*(ii+3)] = ptr_pA[3+bs*1];
+ // unroll 2
+ A[jj+2+lda*(ii+0)] = ptr_pA[0+bs*2];
+ A[jj+2+lda*(ii+1)] = ptr_pA[1+bs*2];
+ A[jj+2+lda*(ii+2)] = ptr_pA[2+bs*2];
+ A[jj+2+lda*(ii+3)] = ptr_pA[3+bs*2];
+ // unroll 3
+ A[jj+3+lda*(ii+0)] = ptr_pA[0+bs*3];
+ A[jj+3+lda*(ii+1)] = ptr_pA[1+bs*3];
+ A[jj+3+lda*(ii+2)] = ptr_pA[2+bs*3];
+ A[jj+3+lda*(ii+3)] = ptr_pA[3+bs*3];
+ ptr_pA += sda*bs;
+ }
+ for(; ii<m; ii++)
+ {
+ // unroll 0
+ A[jj+0+lda*ii] = ptr_pA[0+bs*0];
+ // unroll 1
+ A[jj+1+lda*ii] = ptr_pA[0+bs*1];
+ // unroll 2
+ A[jj+2+lda*ii] = ptr_pA[0+bs*2];
+ // unroll 3
+ A[jj+3+lda*ii] = ptr_pA[0+bs*3];
+ ptr_pA++;
+ }
+ }
+ for(; jj<n; jj++)
+ {
+ ptr_pA = pA + jj*bs;
+ ii = 0;
+ if(m0>0)
+ {
+ for(; ii<m0; ii++)
+ {
+ A[jj+lda*ii] = ptr_pA[0];
+ ptr_pA++;
+ }
+ ptr_pA += (sda-1)*bs;
+ }
+ for(; ii<m-bs+1; ii+=bs)
+ {
+ i=0;
+ for(; i<bs; i++)
+ {
+ A[jj+lda*(i+ii)] = ptr_pA[0];
+ ptr_pA++;
+ }
+ ptr_pA += (sda-1)*bs;
+ }
+ for(; ii<m; ii++)
+ {
+ A[jj+lda*ii] = ptr_pA[0];
+ ptr_pA++;
+ }
+ }
+ return;
+ }
+
+
+
+// convert a vector structure into a vector
+void s_cvt_strvec2vec(int m, struct s_strvec *sa, int ai, float *a)
+ {
+ float *pa = sa->pa + ai;
+ int ii;
+ for(ii=0; ii<m; ii++)
+ a[ii] = pa[ii];
+ return;
+ }
+
+
+
+// cast a matrix into a matrix structure
+void s_cast_mat2strmat(float *A, struct s_strmat *sA)
+ {
+ sA->pA = A;
+ return;
+ }
+
+
+
+// cast a matrix into the diagonal of a matrix structure
+void s_cast_diag_mat2strmat(float *dA, struct s_strmat *sA)
+ {
+ sA->dA = dA;
+ return;
+ }
+
+
+
+// cast a vector into a vector structure
+void s_cast_vec2vecmat(float *a, struct s_strvec *sa)
+ {
+ sa->pa = a;
+ return;
+ }
+
+
+
+// insert element into strmat
+void sgein1_libstr(float a, struct s_strmat *sA, int ai, int aj)
+ {
+ const int bs = 4;
+ int sda = sA->cn;
+ float *pA = sA->pA + ai/bs*bs*sda + ai%bs + aj*bs;
+ pA[0] = a;
+ return;
+ }
+
+
+
+// extract element from strmat
+float sgeex1_libstr(struct s_strmat *sA, int ai, int aj)
+ {
+ const int bs = 4;
+ int sda = sA->cn;
+ float *pA = sA->pA + ai/bs*bs*sda + ai%bs + aj*bs;
+ return pA[0];
+ }
+
+
+
+// insert element into strvec
+void svecin1_libstr(float a, struct s_strvec *sx, int xi)
+ {
+ const int bs = 4;
+ float *x = sx->pa + xi;
+ x[0] = a;
+ return;
+ }
+
+
+
+// extract element from strvec
+float svecex1_libstr(struct s_strvec *sx, int xi)
+ {
+ const int bs = 4;
+ float *x = sx->pa + xi;
+ return x[0];
+ }
+
+
+
+// set all elements of a strmat to a value
+void sgese_libstr(int m, int n, float alpha, struct s_strmat *sA, int ai, int aj)
+ {
+ const int bs = 4;
+ int sda = sA->cn;
+ float *pA = sA->pA + ai%bs + ai/bs*bs*sda + aj*bs;
+ int m0 = m<(bs-ai%bs)%bs ? m : (bs-ai%bs)%bs;
+ int ii, jj;
+ if(m0>0)
+ {
+ for(ii=0; ii<m0; ii++)
+ {
+ for(jj=0; jj<n; jj++)
+ {
+ pA[jj*bs] = alpha;
+ }
+ pA += 1;
+ }
+ pA += bs*(sda-1);
+ m -= m0;
+ }
+ for(ii=0; ii<m-3; ii+=4)
+ {
+ for(jj=0; jj<n; jj++)
+ {
+ pA[0+jj*bs] = alpha;
+ pA[1+jj*bs] = alpha;
+ pA[2+jj*bs] = alpha;
+ pA[3+jj*bs] = alpha;
+ }
+ pA += bs*sda;
+ }
+ for( ; ii<m; ii++)
+ {
+ for(jj=0; jj<n; jj++)
+ {
+ pA[jj*bs] = alpha;
+ }
+ pA += 1;
+ }
+ return;
+ }
+
+
+
+// set all elements of a strvec to a value
+void svecse_libstr(int m, float alpha, struct s_strvec *sx, int xi)
+ {
+ float *x = sx->pa + xi;
+ int ii;
+ for(ii=0; ii<m; ii++)
+ x[ii] = alpha;
+ return;
+ }
+
+
+
+// extract diagonal to vector
+void sdiaex_libstr(int kmax, float alpha, struct s_strmat *sA, int ai, int aj, struct s_strvec *sx, int xi)
+ {
+ const int bs = 4;
+ int sda = sA->cn;
+ float *pA = sA->pA + ai/bs*bs*sda + ai%bs + aj*bs;
+ float *x = sx->pa + xi;
+ sdiaex_lib(kmax, alpha, ai%bs, pA, sda, x);
+ return;
+ }
+
+
+
+// insert a vector into diagonal
+void sdiain_libstr(int kmax, float alpha, struct s_strvec *sx, int xi, struct s_strmat *sA, int ai, int aj)
+ {
+ const int bs = 4;
+ int sda = sA->cn;
+ float *pA = sA->pA + ai/bs*bs*sda + ai%bs + aj*bs;
+ float *x = sx->pa + xi;
+ sdiain_lib(kmax, alpha, x, ai%bs, pA, sda);
+ return;
+ }
+
+
+
+// swap two rows of a matrix struct
+void srowsw_libstr(int kmax, struct s_strmat *sA, int ai, int aj, struct s_strmat *sC, int ci, int cj)
+ {
+ const int bs = 4;
+ int sda = sA->cn;
+ float *pA = sA->pA + ai/bs*bs*sda + ai%bs + aj*bs;
+ int sdc = sC->cn;
+ float *pC = sC->pA + ci/bs*bs*sdc + ci%bs + cj*bs;
+ srowsw_lib(kmax, pA, pC);
+ return;
+ }
+
+
+
+// permute the rows of a matrix struct
+void srowpe_libstr(int kmax, int *ipiv, struct s_strmat *sA)
+ {
+ int ii;
+ for(ii=0; ii<kmax; ii++)
+ {
+ if(ipiv[ii]!=ii)
+ srowsw_libstr(sA->n, sA, ii, 0, sA, ipiv[ii], 0);
+ }
+ return;
+ }
+
+
+// extract a row into a vector
+void srowex_libstr(int kmax, float alpha, struct s_strmat *sA, int ai, int aj, struct s_strvec *sx, int xi)
+ {
+ const int bs = 4;
+ int sda = sA->cn;
+ float *pA = sA->pA + ai/bs*bs*sda + ai%bs + aj*bs;
+ float *x = sx->pa + xi;
+ srowex_lib(kmax, alpha, pA, x);
+ return;
+ }
+
+
+
+// insert a vector into a row
+void srowin_libstr(int kmax, float alpha, struct s_strvec *sx, int xi, struct s_strmat *sA, int ai, int aj)
+ {
+ const int bs = 4;
+ int sda = sA->cn;
+ float *pA = sA->pA + ai/bs*bs*sda + ai%bs + aj*bs;
+ float *x = sx->pa + xi;
+ srowin_lib(kmax, alpha, x, pA);
+ return;
+ }
+
+
+
+// add a vector to a row
+void srowad_libstr(int kmax, float alpha, struct s_strvec *sx, int xi, struct s_strmat *sA, int ai, int aj)
+ {
+ const int bs = 4;
+ int sda = sA->cn;
+ float *pA = sA->pA + ai/bs*bs*sda + ai%bs + aj*bs;
+ float *x = sx->pa + xi;
+ srowad_lib(kmax, alpha, x, pA);
+ return;
+ }
+
+
+
+// swap two cols of a matrix struct
+void scolsw_libstr(int kmax, struct s_strmat *sA, int ai, int aj, struct s_strmat *sC, int ci, int cj)
+ {
+ const int bs = 4;
+ int sda = sA->cn;
+ float *pA = sA->pA + ai/bs*bs*sda + ai%bs + aj*bs;
+ int sdc = sC->cn;
+ float *pC = sC->pA + ci/bs*bs*sdc + ci%bs + cj*bs;
+ scolsw_lib(kmax, ai%bs, pA, sda, ci%bs, pC, sdc);
+ return;
+ }
+
+
+
+// permute the cols of a matrix struct
+void scolpe_libstr(int kmax, int *ipiv, struct s_strmat *sA)
+ {
+ int ii;
+ for(ii=0; ii<kmax; ii++)
+ {
+ if(ipiv[ii]!=ii)
+ scolsw_libstr(sA->m, sA, 0, ii, sA, 0, ipiv[ii]);
+ }
+ return;
+ }
+
+
+
+// scale a generic strmat
+void sgesc_libstr(int m, int n, float alpha, struct s_strmat *sA, int ai, int aj)
+ {
+
+ if(m<=0 | n<=0)
+ return;
+
+#if defined(DIM_CHECK)
+ // non-negative size
+ if(m<0) printf("\n****** sgesc_libstr : m<0 : %d<0 *****\n", m);
+ if(n<0) printf("\n****** sgesc_libstr : n<0 : %d<0 *****\n", n);
+ // non-negative offset
+ if(ai<0) printf("\n****** sgesc_libstr : ai<0 : %d<0 *****\n", ai);
+ if(aj<0) printf("\n****** sgesc_libstr : aj<0 : %d<0 *****\n", aj);
+ // inside matrix
+ // A: m x n
+ if(ai+m > sA->m) printf("\n***** sgesc_libstr : ai+m > row(A) : %d+%d > %d *****\n", ai, m, sA->m);
+ if(aj+n > sA->n) printf("\n***** sgesc_libstr : aj+n > col(A) : %d+%d > %d *****\n", aj, n, sA->n);
+#endif
+
+ const int bs = 4;
+
+ int mna, ii;
+
+ int sda = sA->cn;
+ float *pA = sA->pA + ai/bs*bs*sda + aj*bs;
+ int offA = ai%bs;
+
+ // same alignment
+ ii = 0;
+ // clean up at the beginning
+ mna = (4-offA)%bs;
+ if(mna>0)
+ {
+ if(m<mna) // mna<=3 ==> m = { 1, 2 }
+ {
+ if(m==1)
+ {
+ kernel_sgesc_1_lib4(n, &alpha, pA+offA);
+ return;
+ }
+ else //if(m==2 && mna==3)
+ {
+ kernel_sgesc_2_lib4(n, &alpha, pA+offA);
+ return;
+ }
+ }
+ if(mna==1)
+ {
+ kernel_sgesc_1_lib4(n, &alpha, pA+offA);
+ pA += 4*sda;
+ ii += 1;
+ }
+ else if(mna==2)
+ {
+ kernel_sgesc_2_lib4(n, &alpha, pA+offA);
+ pA += 4*sda;
+ ii += 2;
+ }
+ else // if(mna==3)
+ {
+ kernel_sgesc_3_lib4(n, &alpha, pA+offA);
+ pA += 4*sda;
+ ii += 3;
+ }
+ }
+ // main loop
+ for(; ii<m-3; ii+=4)
+ {
+ kernel_sgesc_4_lib4(n, &alpha, pA);
+ pA += 4*sda;
+ }
+ // clean up at the end
+ if(ii<m)
+ {
+ if(m-ii==1)
+ kernel_sgesc_1_lib4(n, &alpha, pA);
+ else if(m-ii==2)
+ kernel_sgesc_2_lib4(n, &alpha, pA);
+ else // if(m-ii==3)
+ kernel_sgesc_3_lib4(n, &alpha, pA);
+ }
+
+ return;
+
+ }
+
+
+
+// copy a generic strmat into a generic strmat
+void sgecp_libstr(int m, int n, struct s_strmat *sA, int ai, int aj, struct s_strmat *sB, int bi, int bj)
+ {
+
+ if(m<=0 | n<=0)
+ return;
+
+#if defined(DIM_CHECK)
+ // non-negative size
+ if(m<0) printf("\n****** sgecp_libstr : m<0 : %d<0 *****\n", m);
+ if(n<0) printf("\n****** sgecp_libstr : n<0 : %d<0 *****\n", n);
+ // non-negative offset
+ if(ai<0) printf("\n****** sgecp_libstr : ai<0 : %d<0 *****\n", ai);
+ if(aj<0) printf("\n****** sgecp_libstr : aj<0 : %d<0 *****\n", aj);
+ if(bi<0) printf("\n****** sgecp_libstr : bi<0 : %d<0 *****\n", bi);
+ if(bj<0) printf("\n****** sgecp_libstr : bj<0 : %d<0 *****\n", bj);
+ // inside matrix
+ // A: m x n
+ if(ai+m > sA->m) printf("\n***** sgecp_libstr : ai+m > row(A) : %d+%d > %d *****\n", ai, m, sA->m);
+ if(aj+n > sA->n) printf("\n***** sgecp_libstr : aj+n > col(A) : %d+%d > %d *****\n", aj, n, sA->n);
+ // B: m x n
+ if(bi+m > sB->m) printf("\n***** sgecp_libstr : bi+m > row(B) : %d+%d > %d *****\n", bi, m, sB->m);
+ if(bj+n > sB->n) printf("\n***** sgecp_libstr : bj+n > col(B) : %d+%d > %d *****\n", bj, n, sB->n);
+#endif
+
+ const int bs = 4;
+
+ int mna, ii;
+
+ int sda = sA->cn;
+ int sdb = sB->cn;
+ float *pA = sA->pA + ai/bs*bs*sda + aj*bs;
+ float *pB = sB->pA + bi/bs*bs*sdb + bj*bs;
+ int offA = ai%bs;
+ int offB = bi%bs;
+
+ // same alignment
+ if(offA==offB)
+ {
+ ii = 0;
+ // clean up at the beginning
+ mna = (4-offB)%bs;
+ if(mna>0)
+ {
+ if(m<mna) // mna<=3 ==> m = { 1, 2 }
+ {
+ if(m==1)
+ {
+ kernel_sgecp_1_0_lib4(n, pA+offA, pB+offB);
+ return;
+ }
+ else //if(m==2 && mna==3)
+ {
+ kernel_sgecp_2_0_lib4(n, pA+offA, pB+offB);
+ return;
+ }
+ }
+ if(mna==1)
+ {
+ kernel_sgecp_1_0_lib4(n, pA+offA, pB+offB);
+ pA += 4*sda;
+ pB += 4*sdb;
+ ii += 1;
+ }
+ else if(mna==2)
+ {
+ kernel_sgecp_2_0_lib4(n, pA+offA, pB+offB);
+ pA += 4*sda;
+ pB += 4*sdb;
+ ii += 2;
+ }
+ else // if(mna==3)
+ {
+ kernel_sgecp_3_0_lib4(n, pA+offA, pB+offB);
+ pA += 4*sda;
+ pB += 4*sdb;
+ ii += 3;
+ }
+ }
+ // main loop
+ for(; ii<m-3; ii+=4)
+ {
+ kernel_sgecp_4_0_lib4(n, pA, pB);
+ pA += 4*sda;
+ pB += 4*sdb;
+ }
+ // clean up at the end
+ if(ii<m)
+ {
+ if(m-ii==1)
+ kernel_sgecp_1_0_lib4(n, pA, pB);
+ else if(m-ii==2)
+ kernel_sgecp_2_0_lib4(n, pA, pB);
+ else // if(m-ii==3)
+ kernel_sgecp_3_0_lib4(n, pA, pB);
+ }
+ }
+ // skip one element of pA
+ else if(offA==(offB+1)%bs)
+ {
+ ii = 0;
+ // clean up at the beginning
+ mna = (4-offB)%bs;
+ if(mna>0)
+ {
+ if(m<mna) // mna<=3 ==> m = { 1, 2 }
+ {
+ if(m==1)
+ {
+ kernel_sgecp_1_0_lib4(n, pA+offA, pB+offB);
+ return;
+ }
+ else //if(m==2 && mna==3)
+ {
+ kernel_sgecp_2_0_lib4(n, pA+offA, pB+offB);
+ return;
+ }
+ }
+ if(mna==1)
+ {
+ kernel_sgecp_1_0_lib4(n, pA+offA, pB+offB);
+ //pA += 4*sda;
+ pB += 4*sdb;
+ ii += 1;
+ }
+ else if(mna==2)
+ {
+ kernel_sgecp_2_3_lib4(n, pA, sda, pB+2);
+ pA += 4*sda;
+ pB += 4*sdb;
+ ii += 2;
+ }
+ else // if(mna==3)
+ {
+ kernel_sgecp_3_2_lib4(n, pA, sda, pB+1);
+ pA += 4*sda;
+ pB += 4*sdb;
+ ii += 3;
+ }
+ }
+ // main loop
+ for( ; ii<m-3; ii+=4)
+ {
+ kernel_sgecp_4_1_lib4(n, pA, sda, pB);
+ pA += 4*sda;
+ pB += 4*sdb;
+ }
+ // clean up at the end
+ if(ii<m)
+ {
+ if(m-ii==1)
+ kernel_sgecp_1_0_lib4(n, pA+1, pB);
+ else if(m-ii==2)
+ kernel_sgecp_2_0_lib4(n, pA+1, pB);
+ else // if(m-ii==3)
+ kernel_sgecp_3_0_lib4(n, pA+1, pB);
+ }
+ }
+ // skip 2 elements of pA
+ else if(offA==(offB+2)%bs)
+ {
+ ii = 0;
+ // clean up at the beginning
+ mna = (4-offB)%bs;
+ if(mna>0)
+ {
+ if(m<mna)
+ {
+ if(m==1)
+ {
+ kernel_sgecp_1_0_lib4(n, pA+offA, pB+offB);
+ return;
+ }
+ else // if(m==2 && mna==3)
+ {
+ kernel_sgecp_2_3_lib4(n, pA, sda, pB+1);
+ return;
+ }
+ }
+ if(mna==1)
+ {
+ kernel_sgecp_1_0_lib4(n, pA+1, pB+3);
+ // pA += 4*sda;
+ pB += 4*sdb;
+ ii += 1;
+ }
+ else if(mna==2)
+ {
+ kernel_sgecp_2_0_lib4(n, pA, pB+2);
+ // pA += 4*sda;
+ pB += 4*sdb;
+ ii += 2;
+ }
+ else // if(mna==3)
+ {
+ kernel_sgecp_3_3_lib4(n, pA, sda, pB+1);
+ pA += 4*sda;
+ pB += 4*sdb;
+ ii += 3;
+ }
+ }
+ // main loop
+ for(; ii<m-3; ii+=4)
+ {
+ kernel_sgecp_4_2_lib4(n, pA, sda, pB);
+ pA += 4*sda;
+ pB += 4*sdb;
+ }
+ // clean up at the end
+ if(ii<m)
+ {
+ if(m-ii==1)
+ kernel_sgecp_1_0_lib4(n, pA+2, pB);
+ else if(m-ii==2)
+ kernel_sgecp_2_0_lib4(n, pA+2, pB);
+ else // if(m-ii==3)
+ kernel_sgecp_3_2_lib4(n, pA, sda, pB);
+ }
+ }
+ // skip 3 elements of pA
+ else // if(offA==(offB+3)%bs)
+ {
+ ii = 0;
+ // clean up at the beginning
+ mna = (4-offB)%bs;
+ if(mna>0)
+ {
+ if(m<mna)
+ {
+ if(m==1)
+ {
+ kernel_sgecp_1_0_lib4(n, pA+offA, pB+offB);
+ return;
+ }
+ else // if(m==2 && mna==3)
+ {
+ kernel_sgecp_2_0_lib4(n, pA+offA, pB+offB);
+ return;
+ }
+ }
+ if(mna==1)
+ {
+ kernel_sgecp_1_0_lib4(n, pA+offA, pB+offB);
+ // pA += 4*sda;
+ pB += 4*sdb;
+ ii += 1;
+ }
+ else if(mna==2)
+ {
+ kernel_sgecp_2_0_lib4(n, pA+offA, pB+offB);
+ // pA += 4*sda;
+ pB += 4*sdb;
+ ii += 2;
+ }
+ else // if(mna==3)
+ {
+ kernel_sgecp_3_0_lib4(n, pA+offA, pB+offB);
+ // pA += 4*sda;
+ pB += 4*sdb;
+ ii += 3;
+ }
+ }
+ // main loop
+ for(; ii<m-3; ii+=4)
+ {
+ kernel_sgecp_4_3_lib4(n, pA, sda, pB);
+ pA += 4*sda;
+ pB += 4*sdb;
+ }
+ // clean up at the end
+ if(ii<m)
+ {
+ if(m-ii==1)
+ kernel_sgecp_1_0_lib4(n, pA+3, pB);
+ else if(m-ii==2)
+ kernel_sgecp_2_3_lib4(n, pA, sda, pB);
+ else // if(m-ii==3)
+ kernel_sgecp_3_3_lib4(n, pA, sda, pB);
+ }
+ }
+
+ return;
+
+ }
+
+
+
+// scale a strvec
+void svecsc_libstr(int m, float alpha, struct s_strvec *sa, int ai)
+ {
+ float *pa = sa->pa + ai;
+ int ii;
+ ii = 0;
+ for(; ii<m-3; ii+=4)
+ {
+ pa[ii+0] *= alpha;
+ pa[ii+1] *= alpha;
+ pa[ii+2] *= alpha;
+ pa[ii+3] *= alpha;
+ }
+ for(; ii<m; ii++)
+ {
+ pa[ii+0] *= alpha;
+ }
+ return;
+ }
+
+
+
+// copy a strvec into a strvec
+void sveccp_libstr(int m, struct s_strvec *sa, int ai, struct s_strvec *sc, int ci)
+ {
+ float *pa = sa->pa + ai;
+ float *pc = sc->pa + ci;
+ int ii;
+ ii = 0;
+ for(; ii<m-3; ii+=4)
+ {
+ pc[ii+0] = pa[ii+0];
+ pc[ii+1] = pa[ii+1];
+ pc[ii+2] = pa[ii+2];
+ pc[ii+3] = pa[ii+3];
+ }
+ for(; ii<m; ii++)
+ {
+ pc[ii+0] = pa[ii+0];
+ }
+ return;
+ }
+
+
+
+// copy a lower triangular strmat into a lower triangular strmat
+void strcp_l_libstr(int m, struct s_strmat *sA, int ai, int aj, struct s_strmat *sB, int bi, int bj)
+ {
+
+ if(m<=0)
+ return;
+
+ const int bs = 4;
+
+ int sda = sA->cn;
+ int sdb = sB->cn;
+ float *pA = sA->pA + ai/bs*bs*sda + aj*bs;
+ float *pB = sB->pA + bi/bs*bs*sdb + bj*bs;
+ int offA = ai%bs;
+ int offB = bi%bs;
+
+ int ii, mna;
+
+ // same alignment
+ if(offA==offB)
+ {
+ ii = 0;
+ // clean up at the beginning
+ mna = (4-offB)%bs;
+ if(mna>0)
+ {
+ if(m<mna) // mna<=3 ==> m = { 1, 2 }
+ {
+ if(m==1)
+ {
+ kernel_strcp_l_1_0_lib4(ii, pA+offA, pB+offB);
+ return;
+ }
+ else //if(m==2 && mna==3)
+ {
+ kernel_strcp_l_2_0_lib4(ii, pA+offA, pB+offB);
+ return;
+ }
+ }
+ if(mna==1)
+ {
+ kernel_strcp_l_1_0_lib4(ii, pA+offA, pB+offB);
+ pA += 4*sda;
+ pB += 4*sdb;
+ ii += 1;
+ }
+ else if(mna==2)
+ {
+ kernel_strcp_l_2_0_lib4(ii, pA+offA, pB+offB);
+ pA += 4*sda;
+ pB += 4*sdb;
+ ii += 2;
+ }
+ else // if(mna==3)
+ {
+ kernel_strcp_l_3_0_lib4(ii, pA+offA, pB+offB);
+ pA += 4*sda;
+ pB += 4*sdb;
+ ii += 3;
+ }
+ }
+ // main loop
+ for(; ii<m-3; ii+=4)
+ {
+ kernel_strcp_l_4_0_lib4(ii, pA, pB);
+ pA += 4*sda;
+ pB += 4*sdb;
+ }
+ // clean up at the end
+ if(ii<m)
+ {
+ if(m-ii==1)
+ kernel_strcp_l_1_0_lib4(ii, pA, pB);
+ else if(m-ii==2)
+ kernel_strcp_l_2_0_lib4(ii, pA, pB);
+ else // if(m-ii==3)
+ kernel_strcp_l_3_0_lib4(ii, pA, pB);
+ }
+ }
+ // skip one element of pA
+ else if(offA==(offB+1)%bs)
+ {
+ ii = 0;
+ // clean up at the beginning
+ mna = (4-offB)%bs;
+ if(mna>0)
+ {
+ if(m<mna) // mna<=3 ==> m = { 1, 2 }
+ {
+ if(m==1)
+ {
+ kernel_strcp_l_1_0_lib4(ii, pA+offA, pB+offB);
+ return;
+ }
+ else //if(m==2 && mna==3)
+ {
+ kernel_strcp_l_2_0_lib4(ii, pA+offA, pB+offB);
+ return;
+ }
+ }
+ if(mna==1)
+ {
+ kernel_strcp_l_1_0_lib4(ii, pA+offA, pB+offB);
+ //pA += 4*sda;
+ pB += 4*sdb;
+ ii += 1;
+ }
+ else if(mna==2)
+ {
+ kernel_strcp_l_2_3_lib4(ii, pA, sda, pB+2);
+ pA += 4*sda;
+ pB += 4*sdb;
+ ii += 2;
+ }
+ else // if(mna==3)
+ {
+ kernel_strcp_l_3_2_lib4(ii, pA, sda, pB+1);
+ pA += 4*sda;
+ pB += 4*sdb;
+ ii += 3;
+ }
+ }
+ // main loop
+ for( ; ii<m-3; ii+=4)
+ {
+ kernel_strcp_l_4_1_lib4(ii, pA, sda, pB);
+ pA += 4*sda;
+ pB += 4*sdb;
+ }
+ // clean up at the end
+ if(ii<m)
+ {
+ if(m-ii==1)
+ kernel_strcp_l_1_0_lib4(ii, pA+1, pB);
+ else if(m-ii==2)
+ kernel_strcp_l_2_0_lib4(ii, pA+1, pB);
+ else // if(m-ii==3)
+ kernel_strcp_l_3_0_lib4(ii, pA+1, pB);
+ }
+ }
+ // skip 2 elements of pA
+ else if(offA==(offB+2)%bs)
+ {
+ ii = 0;
+ // clean up at the beginning
+ mna = (4-offB)%bs;
+ if(mna>0)
+ {
+ if(m<mna)
+ {
+ if(m==1)
+ {
+ kernel_strcp_l_1_0_lib4(ii, pA+offA, pB+offB);
+ return;
+ }
+ else // if(m==2 && mna==3)
+ {
+ kernel_strcp_l_2_3_lib4(ii, pA, sda, pB+1);
+ return;
+ }
+ }
+ if(mna==1)
+ {
+ kernel_strcp_l_1_0_lib4(ii, pA+1, pB+3);
+ // pA += 4*sda;
+ pB += 4*sdb;
+ ii += 1;
+ }
+ else if(mna==2)
+ {
+ kernel_strcp_l_2_0_lib4(ii, pA, pB+2);
+ // pA += 4*sda;
+ pB += 4*sdb;
+ ii += 2;
+ }
+ else // if(mna==3)
+ {
+ kernel_strcp_l_3_3_lib4(ii, pA, sda, pB+1);
+ pA += 4*sda;
+ pB += 4*sdb;
+ ii += 3;
+ }
+ }
+ // main loop
+ for(; ii<m-3; ii+=4)
+ {
+ kernel_strcp_l_4_2_lib4(ii, pA, sda, pB);
+ pA += 4*sda;
+ pB += 4*sdb;
+ }
+ // clean up at the end
+ if(ii<m)
+ {
+ if(m-ii==1)
+ kernel_strcp_l_1_0_lib4(ii, pA+2, pB);
+ else if(m-ii==2)
+ kernel_strcp_l_2_0_lib4(ii, pA+2, pB);
+ else // if(m-ii==3)
+ kernel_strcp_l_3_2_lib4(ii, pA, sda, pB);
+ }
+ }
+ // skip 3 elements of pA
+ else // if(offA==(offB+3)%bs)
+ {
+ ii = 0;
+ // clean up at the beginning
+ mna = (4-offB)%bs;
+ if(mna>0)
+ {
+ if(m<mna)
+ {
+ if(m==1)
+ {
+ kernel_strcp_l_1_0_lib4(ii, pA+offA, pB+offB);
+ return;
+ }
+ else // if(m==2 && mna==3)
+ {
+ kernel_strcp_l_2_0_lib4(ii, pA+offA, pB+offB);
+ return;
+ }
+ }
+ if(mna==1)
+ {
+ kernel_strcp_l_1_0_lib4(ii, pA+offA, pB+offB);
+ // pA += 4*sda;
+ pB += 4*sdb;
+ ii += 1;
+ }
+ else if(mna==2)
+ {
+ kernel_strcp_l_2_0_lib4(ii, pA+offA, pB+offB);
+ // pA += 4*sda;
+ pB += 4*sdb;
+ ii += 2;
+ }
+ else // if(mna==3)
+ {
+ kernel_strcp_l_3_0_lib4(ii, pA+offA, pB+offB);
+ // pA += 4*sda;
+ pB += 4*sdb;
+ ii += 3;
+ }
+ }
+ // main loop
+ for(; ii<m-3; ii+=4)
+ {
+ kernel_strcp_l_4_3_lib4(ii, pA, sda, pB);
+ pA += 4*sda;
+ pB += 4*sdb;
+ }
+ // clean up at the end
+ if(ii<m)
+ {
+ if(m-ii==1)
+ kernel_strcp_l_1_0_lib4(ii, pA+3, pB);
+ else if(m-ii==2)
+ kernel_strcp_l_2_3_lib4(ii, pA, sda, pB);
+ else // if(m-ii==3)
+ kernel_strcp_l_3_3_lib4(ii, pA, sda, pB);
+ }
+ }
+
+ return;
+
+ }
+
+
+
+// scale and add a generic strmat into a generic strmat
+void sgead_libstr(int m, int n, float alpha, struct s_strmat *sA, int ai, int aj, struct s_strmat *sB, int bi, int bj)
+ {
+
+ if(m<=0 || n<=0)
+ return;
+ const int bs = 4;
+
+ int sda = sA->cn;
+ int sdb = sB->cn;
+ float *pA = sA->pA + ai/bs*bs*sda + aj*bs;
+ float *pB = sB->pA + bi/bs*bs*sdb + bj*bs;
+ int offA = ai%bs;
+ int offB = bi%bs;
+
+ int ii, mna;
+
+ // same alignment
+ if(offA==offB)
+ {
+ ii = 0;
+ // clean up at the beginning
+ mna = (4-offB)%bs;
+ if(mna>0)
+ {
+ if(m<mna) // mna<=3 ==> m = { 1, 2 }
+ {
+ if(m==1)
+ {
+ kernel_sgead_1_0_lib4(n, &alpha, pA+offA, pB+offB);
+ return;
+ }
+ else //if(m==2 && mna==3)
+ {
+ kernel_sgead_2_0_lib4(n, &alpha, pA+offA, pB+offB);
+ return;
+ }
+ }
+ if(mna==1)
+ {
+ kernel_sgead_1_0_lib4(n, &alpha, pA+offA, pB+offB);
+ pA += 4*sda;
+ pB += 4*sdb;
+ ii += 1;
+ }
+ else if(mna==2)
+ {
+ kernel_sgead_2_0_lib4(n, &alpha, pA+offA, pB+offB);
+ pA += 4*sda;
+ pB += 4*sdb;
+ ii += 2;
+ }
+ else // if(mna==3)
+ {
+ kernel_sgead_3_0_lib4(n, &alpha, pA+offA, pB+offB);
+ pA += 4*sda;
+ pB += 4*sdb;
+ ii += 3;
+ }
+ }
+ // main loop
+ for(; ii<m-3; ii+=4)
+ {
+ kernel_sgead_4_0_lib4(n, &alpha, pA, pB);
+ pA += 4*sda;
+ pB += 4*sdb;
+ }
+ // clean up at the end
+ if(ii<m)
+ {
+ if(m-ii==1)
+ kernel_sgead_1_0_lib4(n, &alpha, pA, pB);
+ else if(m-ii==2)
+ kernel_sgead_2_0_lib4(n, &alpha, pA, pB);
+ else // if(m-ii==3)
+ kernel_sgead_3_0_lib4(n, &alpha, pA, pB);
+ }
+ }
+ // skip one element of pA
+ else if(offA==(offB+1)%bs)
+ {
+ ii = 0;
+ // clean up at the beginning
+ mna = (4-offB)%bs;
+ if(mna>0)
+ {
+ if(m<mna) // mna<=3 ==> m = { 1, 2 }
+ {
+ if(m==1)
+ {
+ kernel_sgead_1_0_lib4(n, &alpha, pA+offA, pB+offB);
+ return;
+ }
+ else //if(m==2 && mna==3)
+ {
+ kernel_sgead_2_0_lib4(n, &alpha, pA+offA, pB+offB);
+ return;
+ }
+ }
+ if(mna==1)
+ {
+ kernel_sgead_1_0_lib4(n, &alpha, pA+offA, pB+offB);
+ //pA += 4*sda;
+ pB += 4*sdb;
+ ii += 1;
+ }
+ else if(mna==2)
+ {
+ kernel_sgead_2_3_lib4(n, &alpha, pA, sda, pB+2);
+ pA += 4*sda;
+ pB += 4*sdb;
+ ii += 2;
+ }
+ else // if(mna==3)
+ {
+ kernel_sgead_3_2_lib4(n, &alpha, pA, sda, pB+1);
+ pA += 4*sda;
+ pB += 4*sdb;
+ ii += 3;
+ }
+ }
+ // main loop
+ for( ; ii<m-3; ii+=4)
+ {
+ kernel_sgead_4_1_lib4(n, &alpha, pA, sda, pB);
+ pA += 4*sda;
+ pB += 4*sdb;
+ }
+ // clean up at the end
+ if(ii<m)
+ {
+ if(m-ii==1)
+ kernel_sgead_1_0_lib4(n, &alpha, pA+1, pB);
+ else if(m-ii==2)
+ kernel_sgead_2_0_lib4(n, &alpha, pA+1, pB);
+ else // if(m-ii==3)
+ kernel_sgead_3_0_lib4(n, &alpha, pA+1, pB);
+ }
+ }
+ // skip 2 elements of pA
+ else if(offA==(offB+2)%bs)
+ {
+ ii = 0;
+ // clean up at the beginning
+ mna = (4-offB)%bs;
+ if(mna>0)
+ {
+ if(m<mna)
+ {
+ if(m==1)
+ {
+ kernel_sgead_1_0_lib4(n, &alpha, pA+offA, pB+offB);
+ return;
+ }
+ else // if(m==2 && mna==3)
+ {
+ kernel_sgead_2_3_lib4(n, &alpha, pA, sda, pB+1);
+ return;
+ }
+ }
+ if(mna==1)
+ {
+ kernel_sgead_1_0_lib4(n, &alpha, pA+1, pB+3);
+ // pA += 4*sda;
+ pB += 4*sdb;
+ ii += 1;
+ }
+ else if(mna==2)
+ {
+ kernel_sgead_2_0_lib4(n, &alpha, pA, pB+2);
+ // pA += 4*sda;
+ pB += 4*sdb;
+ ii += 2;
+ }
+ else // if(mna==3)
+ {
+ kernel_sgead_3_3_lib4(n, &alpha, pA, sda, pB+1);
+ pA += 4*sda;
+ pB += 4*sdb;
+ ii += 3;
+ }
+ }
+ // main loop
+ for(; ii<m-3; ii+=4)
+ {
+ kernel_sgead_4_2_lib4(n, &alpha, pA, sda, pB);
+ pA += 4*sda;
+ pB += 4*sdb;
+ }
+ // clean up at the end
+ if(ii<m)
+ {
+ if(m-ii==1)
+ kernel_sgead_1_0_lib4(n, &alpha, pA+2, pB);
+ else if(m-ii==2)
+ kernel_sgead_2_0_lib4(n, &alpha, pA+2, pB);
+ else // if(m-ii==3)
+ kernel_sgead_3_2_lib4(n, &alpha, pA, sda, pB);
+ }
+ }
+ // skip 3 elements of pA
+ else // if(offA==(offB+3)%bs)
+ {
+ ii = 0;
+ // clean up at the beginning
+ mna = (4-offB)%bs;
+ if(mna>0)
+ {
+ if(m<mna)
+ {
+ if(m==1)
+ {
+ kernel_sgead_1_0_lib4(n, &alpha, pA+offA, pB+offB);
+ return;
+ }
+ else // if(m==2 && mna==3)
+ {
+ kernel_sgead_2_0_lib4(n, &alpha, pA+offA, pB+offB);
+ return;
+ }
+ }
+ if(mna==1)
+ {
+ kernel_sgead_1_0_lib4(n, &alpha, pA+offA, pB+offB);
+ // pA += 4*sda;
+ pB += 4*sdb;
+ ii += 1;
+ }
+ else if(mna==2)
+ {
+ kernel_sgead_2_0_lib4(n, &alpha, pA+offA, pB+offB);
+ // pA += 4*sda;
+ pB += 4*sdb;
+ ii += 2;
+ }
+ else // if(mna==3)
+ {
+ kernel_sgead_3_0_lib4(n, &alpha, pA+offA, pB+offB);
+ // pA += 4*sda;
+ pB += 4*sdb;
+ ii += 3;
+ }
+ }
+ // main loop
+ for(; ii<m-3; ii+=4)
+ {
+ kernel_sgead_4_3_lib4(n, &alpha, pA, sda, pB);
+ pA += 4*sda;
+ pB += 4*sdb;
+ }
+ // clean up at the end
+ if(ii<m)
+ {
+ if(m-ii==1)
+ kernel_sgead_1_0_lib4(n, &alpha, pA+3, pB);
+ else if(m-ii==2)
+ kernel_sgead_2_3_lib4(n, &alpha, pA, sda, pB);
+ else // if(m-ii==3)
+ kernel_sgead_3_3_lib4(n, &alpha, pA, sda, pB);
+ }
+ }
+
+ return;
+
+ }
+
+
+
+// copy and transpose a generic strmat into a generic strmat
+void sgetr_libstr(int m, int n, struct s_strmat *sA, int ai, int aj, struct s_strmat *sC, int ci, int cj)
+ {
+ const int bs = 4;
+ int sda = sA->cn;
+ float *pA = sA->pA + ai/bs*bs*sda + ai%bs + aj*bs;
+ int sdc = sC->cn;
+ float *pC = sC->pA + ci/bs*bs*sdc + ci%bs + cj*bs;
+ sgetr_lib(m, n, 1.0, ai%bs, pA, sda, ci%bs, pC, sdc); // TODO remove alpha !!!
+ return;
+ }
+
+
+
+// copy and transpose a lower triangular strmat into an upper triangular strmat
+void strtr_l_libstr(int m, struct s_strmat *sA, int ai, int aj, struct s_strmat *sC, int ci, int cj)
+ {
+ const int bs = 4;
+ int sda = sA->cn;
+ float *pA = sA->pA + ai/bs*bs*sda + ai%bs + aj*bs;
+ int sdc = sC->cn;
+ float *pC = sC->pA + ci/bs*bs*sdc + ci%bs + cj*bs;
+ strtr_l_lib(m, 1.0, ai%bs, pA, sda, ci%bs, pC, sdc); // TODO remove alpha !!!
+ return;
+ }
+
+
+
+// copy and transpose an upper triangular strmat into a lower triangular strmat
+void strtr_u_libstr(int m, struct s_strmat *sA, int ai, int aj, struct s_strmat *sC, int ci, int cj)
+ {
+ const int bs = 4;
+ int sda = sA->cn;
+ float *pA = sA->pA + ai/bs*bs*sda + ai%bs + aj*bs;
+ int sdc = sC->cn;
+ float *pC = sC->pA + ci/bs*bs*sdc + ci%bs + cj*bs;
+ strtr_u_lib(m, 1.0, ai%bs, pA, sda, ci%bs, pC, sdc); // TODO remove alpha !!!
+ return;
+ }
+
+
+
+// insert a strvec to diagonal of strmat, sparse formulation
+void sdiain_sp_libstr(int kmax, float alpha, struct s_strvec *sx, int xi, int *idx, struct s_strmat *sD, int di, int dj)
+ {
+ const int bs = 4;
+ float *x = sx->pa + xi;
+ int sdd = sD->cn;
+ float *pD = sD->pA;
+ int ii, jj;
+ for(jj=0; jj<kmax; jj++)
+ {
+ ii = idx[jj];
+ pD[(ii+di)/bs*bs*sdd+(ii+di)%bs+(ii+dj)*bs] = alpha * x[jj];
+ }
+ return;
+ }
+
+
+
+// extract the diagonal of a strmat to a strvec, sparse formulation
+void sdiaex_sp_libstr(int kmax, float alpha, int *idx, struct s_strmat *sD, int di, int dj, struct s_strvec *sx, int xi)
+ {
+ const int bs = 4;
+ float *x = sx->pa + xi;
+ int sdd = sD->cn;
+ float *pD = sD->pA;
+ int ii, jj;
+ for(jj=0; jj<kmax; jj++)
+ {
+ ii = idx[jj];
+ x[jj] = alpha * pD[(ii+di)/bs*bs*sdd+(ii+di)%bs+(ii+dj)*bs];
+ }
+ return;
+ }
+
+
+
+// add scaled strvec to diagonal of strmat, sparse formulation
+void sdiaad_sp_libstr(int kmax, float alpha, struct s_strvec *sx, int xi, int *idx, struct s_strmat *sD, int di, int dj)
+ {
+ const int bs = 4;
+ float *x = sx->pa + xi;
+ int sdd = sD->cn;
+ float *pD = sD->pA;
+ int ii, jj;
+ for(jj=0; jj<kmax; jj++)
+ {
+ ii = idx[jj];
+ pD[(ii+di)/bs*bs*sdd+(ii+di)%bs+(ii+dj)*bs] += alpha * x[jj];
+ }
+ return;
+ }
+
+
+
+// add scaled strvec to another strvec and insert to diagonal of strmat, sparse formulation
+void sdiaadin_sp_libstr(int kmax, float alpha, struct s_strvec *sx, int xi, struct s_strvec *sy, int yi, int *idx, struct s_strmat *sD, int di, int dj)
+ {
+ const int bs = 4;
+ float *x = sx->pa + xi;
+ float *y = sy->pa + yi;
+ int sdd = sD->cn;
+ float *pD = sD->pA;
+ int ii, jj;
+ for(jj=0; jj<kmax; jj++)
+ {
+ ii = idx[jj];
+ pD[(ii+di)/bs*bs*sdd+(ii+di)%bs+(ii+dj)*bs] = y[jj] + alpha * x[jj];
+ }
+ return;
+ }
+
+
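+// usage sketch for the sparse-formulation routines above (the idx and kmax
+// values are illustrative assumptions, not part of the API): with kmax=3 and
+// idx = {0, 2, 5}, sdiain_sp_libstr writes alpha*x[0], alpha*x[1], alpha*x[2]
+// into the diagonal entries D(di+0,dj+0), D(di+2,dj+2) and D(di+5,dj+5) of sD.
+
+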
+
+// add scaled strvec to row of strmat, sparse formulation
+void srowad_sp_libstr(int kmax, float alpha, struct s_strvec *sx, int xi, int *idx, struct s_strmat *sD, int di, int dj)
+ {
+ const int bs = 4;
+ float *x = sx->pa + xi;
+ int sdd = sD->cn;
+ float *pD = sD->pA + di/bs*bs*sdd + di%bs + dj*bs;
+ srowad_libsp(kmax, idx, alpha, x, pD);
+ return;
+ }
+
+
+
+// add scaled strvec to strvec, sparse formulation
+void svecad_sp_libstr(int kmax, float alpha, struct s_strvec *sx, int xi, int *idx, struct s_strvec *sy, int yi)
+ {
+ float *x = sx->pa + xi;
+ float *y = sy->pa + yi;
+ svecad_libsp(kmax, idx, alpha, x, y);
+ return;
+ }
+
+
+
+void svecin_sp_libstr(int m, float alpha, struct s_strvec *sx, int xi, int *idx, struct s_strvec *sz, int zi)
+ {
+ float *x = sx->pa + xi;
+ float *z = sz->pa + zi;
+ int ii;
+ for(ii=0; ii<m; ii++)
+ z[idx[ii]] = alpha * x[ii];
+ return;
+ }
+
+
+
+void svecex_sp_libstr(int m, float alpha, int *idx, struct s_strvec *sx, int xi, struct s_strvec *sz, int zi)
+ {
+ float *x = sx->pa + xi;
+ float *z = sz->pa + zi;
+ int ii;
+ for(ii=0; ii<m; ii++)
+ z[ii] = alpha * x[idx[ii]];
+ return;
+ }
+
+
+
+void svecnrm_inf_libstr(int m, struct s_strvec *sx, int xi, float *ptr_norm)
+ {
+ int ii;
+ float *x = sx->pa + xi;
+ float norm = 0.0;
+ for(ii=0; ii<m; ii++)
+ norm = fmax(norm, fabs(x[ii]));
+ *ptr_norm = norm;
+ return;
+ }
+
+
+#else
+
+#error : wrong LA choice
+
+#endif
+
+
+
diff --git a/auxiliary/s_aux_lib8.c b/auxiliary/s_aux_lib8.c
new file mode 100644
index 0000000..94ba22d
--- /dev/null
+++ b/auxiliary/s_aux_lib8.c
@@ -0,0 +1,2647 @@
+/**************************************************************************************************
+* *
+* This file is part of BLASFEO. *
+* *
+* BLASFEO -- BLAS For Embedded Optimization. *
+* Copyright (C) 2016-2017 by Gianluca Frison. *
+* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl. *
+* All rights reserved. *
+* *
+* HPMPC is free software; you can redistribute it and/or *
+* modify it under the terms of the GNU Lesser General Public *
+* License as published by the Free Software Foundation; either *
+* version 2.1 of the License, or (at your option) any later version. *
+* *
+* HPMPC is distributed in the hope that it will be useful, *
+* but WITHOUT ANY WARRANTY; without even the implied warranty of *
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. *
+* See the GNU Lesser General Public License for more details. *
+* *
+* You should have received a copy of the GNU Lesser General Public *
+* License along with HPMPC; if not, write to the Free Software *
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA *
+* *
+* Author: Gianluca Frison, giaf (at) dtu.dk *
+* gianluca.frison (at) imtek.uni-freiburg.de *
+* *
+**************************************************************************************************/
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <math.h>
+
+#include "../include/blasfeo_common.h"
+#include "../include/blasfeo_block_size.h"
+#include "../include/blasfeo_s_kernel.h"
+
+
+
+// copies a lower triangular packed matrix into a lower triangular packed matrix
+void strcp_l_lib(int m, int offsetA, float *A, int sda, int offsetB, float *B, int sdb)
+ {
+ printf("\nstrcp_l_lib: feature not implemented yet\n");
+ exit(1);
+ }
+
+
+
+// scales and adds a strvec into a strvec
+void svecad_libstr(int m, float alpha, struct s_strvec *sa, int ai, struct s_strvec *sc, int ci)
+ {
+ float *pa = sa->pa + ai;
+ float *pc = sc->pa + ci;
+ int ii;
+ ii = 0;
+ for(; ii<m-3; ii+=4)
+ {
+ pc[ii+0] += alpha*pa[ii+0];
+ pc[ii+1] += alpha*pa[ii+1];
+ pc[ii+2] += alpha*pa[ii+2];
+ pc[ii+3] += alpha*pa[ii+3];
+ }
+ for(; ii<m; ii++)
+ {
+ pc[ii+0] += alpha*pa[ii+0];
+ }
+ return;
+ }
+
+
+
+// transpose lower triangular matrix
+void strtr_l_lib(int m, float alpha, int offsetA, float *pA, int sda, int offsetC, float *pC, int sdc)
+ {
+ printf("\nstrtr_l_lib: feature not implemented yet\n");
+ exit(1);
+ }
+
+
+
+// transpose an aligned upper triangular matrix into an aligned lower triangular matrix
+void strtr_u_lib(int m, float alpha, int offsetA, float *pA, int sda, int offsetC, float *pC, int sdc)
+ {
+ printf("\nstrtr_u_lib: feature not implemented yet\n");
+ exit(1);
+ }
+
+
+
+// regularize diagonal
+void sdiareg_lib(int kmax, float reg, int offset, float *pD, int sdd)
+ {
+
+ const int bs = 8;
+
+ int kna = (bs-offset%bs)%bs;
+ kna = kmax<kna ? kmax : kna;
+
+ float *pD2;
+
+ int jj, ll;
+
+ if(kna>0)
+ {
+ for(ll=0; ll<kna; ll++)
+ {
+ pD[ll+bs*ll] += reg;
+ }
+ pD += kna + bs*(sdd-1) + kna*bs;
+ kmax -= kna;
+ }
+ pD2 = pD;
+ for(jj=0; jj<kmax-7; jj+=8)
+ {
+ pD2[0+0*bs] += reg;
+ pD2[1+1*bs] += reg;
+ pD2[2+2*bs] += reg;
+ pD2[3+3*bs] += reg;
+ pD2[4+4*bs] += reg;
+ pD2[5+5*bs] += reg;
+ pD2[6+6*bs] += reg;
+ pD2[7+7*bs] += reg;
+ pD2 += bs*sdd+bs*bs;
+ }
+ for(ll=0; ll<kmax-jj; ll++)
+ {
+ pD[jj*sdd+(jj+ll)*bs+ll] += reg;
+ }
+
+ }
+
+
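+// note on the kna clean-up used in the lib8 routines above and below:
+// kna = (bs-offset%bs)%bs is the number of leading diagonal elements needed
+// to reach the next 8-row panel boundary; they are updated one by one, after
+// which the pointer advances by kna + bs*(sdd-1) + kna*bs to the diagonal
+// entry at the start of the next panel. worked example (an illustrative
+// assumption): offset=3, bs=8 gives kna=5, so five scalar updates run before
+// the unrolled 8-at-a-time loop takes over.
+
+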
+
+// insert vector to diagonal
+void sdiain_lib(int kmax, float alpha, float *x, int offset, float *pD, int sdd)
+ {
+
+ const int bs = 8;
+
+ int kna = (bs-offset%bs)%bs;
+ kna = kmax<kna ? kmax : kna;
+
+ float *pD2, *x2;
+
+ int jj, ll;
+
+ if(kna>0)
+ {
+ for(ll=0; ll<kna; ll++)
+ {
+ pD[ll+bs*ll] = alpha*x[ll];
+ }
+ pD += kna + bs*(sdd-1) + kna*bs;
+ x += kna;
+ kmax -= kna;
+ }
+ pD2 = pD;
+ x2 = x;
+ for(jj=0; jj<kmax-7; jj+=8)
+ {
+ pD2[0+bs*0] = alpha*x2[0];
+ pD2[1+bs*1] = alpha*x2[1];
+ pD2[2+bs*2] = alpha*x2[2];
+ pD2[3+bs*3] = alpha*x2[3];
+ pD2[4+bs*4] = alpha*x2[4];
+ pD2[5+bs*5] = alpha*x2[5];
+ pD2[6+bs*6] = alpha*x2[6];
+ pD2[7+bs*7] = alpha*x2[7];
+ pD2 += bs*sdd+bs*bs;
+ x2 += bs;
+ }
+ for(ll=0; ll<kmax-jj; ll++)
+ {
+ pD[jj*sdd+(jj+ll)*bs+ll] = alpha*x[jj+ll];
+ }
+
+ }
+
+
+
+// insert sqrt of vector to diagonal
+void sdiain_sqrt_lib(int kmax, float *x, int offset, float *pD, int sdd)
+ {
+
+ const int bs = 8;
+
+ int kna = (bs-offset%bs)%bs;
+ kna = kmax<kna ? kmax : kna;
+
+ float *pD2, *x2;
+
+ int jj, ll;
+
+ if(kna>0)
+ {
+ for(ll=0; ll<kna; ll++)
+ {
+ pD[ll+bs*ll] = sqrt(x[ll]);
+ }
+ pD += kna + bs*(sdd-1) + kna*bs;
+ x += kna;
+ kmax -= kna;
+ }
+ pD2 = pD;
+ x2 = x;
+ for(jj=0; jj<kmax-7; jj+=8)
+ {
+ pD2[0+bs*0] = sqrt(x2[0]);
+ pD2[1+bs*1] = sqrt(x2[1]);
+ pD2[2+bs*2] = sqrt(x2[2]);
+ pD2[3+bs*3] = sqrt(x2[3]);
+ pD2[4+bs*4] = sqrt(x2[4]);
+ pD2[5+bs*5] = sqrt(x2[5]);
+ pD2[6+bs*6] = sqrt(x2[6]);
+ pD2[7+bs*7] = sqrt(x2[7]);
+ pD2 += bs*sdd+bs*bs;
+ x2 += bs;
+ }
+ for(ll=0; ll<kmax-jj; ll++)
+ {
+ pD[jj*sdd+(jj+ll)*bs+ll] = sqrt(x[jj+ll]);
+ }
+
+ }
+
+
+
+// extract diagonal to vector
+void sdiaex_lib(int kmax, float alpha, int offset, float *pD, int sdd, float *x)
+ {
+
+ const int bs = 8;
+
+ int kna = (bs-offset%bs)%bs;
+ kna = kmax<kna ? kmax : kna;
+
+ float *pD2, *x2;
+
+ int jj, ll;
+
+ if(kna>0)
+ {
+ for(ll=0; ll<kna; ll++)
+ {
+ x[ll] = alpha * pD[ll+bs*ll];
+ }
+ pD += kna + bs*(sdd-1) + kna*bs;
+ x += kna;
+ kmax -= kna;
+ }
+ pD2 = pD;
+ x2 = x;
+ for(jj=0; jj<kmax-7; jj+=8)
+ {
+ x2[0] = alpha * pD2[0+bs*0];
+ x2[1] = alpha * pD2[1+bs*1];
+ x2[2] = alpha * pD2[2+bs*2];
+ x2[3] = alpha * pD2[3+bs*3];
+ x2[4] = alpha * pD2[4+bs*4];
+ x2[5] = alpha * pD2[5+bs*5];
+ x2[6] = alpha * pD2[6+bs*6];
+ x2[7] = alpha * pD2[7+bs*7];
+ pD2 += bs*sdd+bs*bs;
+ x2 += bs;
+ }
+ for(ll=0; ll<kmax-jj; ll++)
+ {
+ x[jj+ll] = alpha * pD[jj*sdd+(jj+ll)*bs+ll];
+ }
+
+ }
+
+
+
+// add scaled vector to diagonal
+void sdiaad_lib(int kmax, float alpha, float *x, int offset, float *pD, int sdd)
+ {
+
+ const int bs = 8;
+
+ int kna = (bs-offset%bs)%bs;
+ kna = kmax<kna ? kmax : kna;
+
+ float *pD2, *x2;
+
+ int jj, ll;
+
+ if(kna>0)
+ {
+ for(ll=0; ll<kna; ll++)
+ {
+ pD[ll+bs*ll] += alpha * x[ll];
+ }
+ pD += kna + bs*(sdd-1) + kna*bs;
+ x += kna;
+ kmax -= kna;
+ }
+ pD2 = pD;
+ x2 = x;
+ for(jj=0; jj<kmax-7; jj+=8)
+ {
+ pD2[0+bs*0] += alpha * x2[0];
+ pD2[1+bs*1] += alpha * x2[1];
+ pD2[2+bs*2] += alpha * x2[2];
+ pD2[3+bs*3] += alpha * x2[3];
+ pD2[4+bs*4] += alpha * x2[4];
+ pD2[5+bs*5] += alpha * x2[5];
+ pD2[6+bs*6] += alpha * x2[6];
+ pD2[7+bs*7] += alpha * x2[7];
+ pD2 += bs*sdd+bs*bs;
+ x2 += bs;
+ }
+ for(ll=0; ll<kmax-jj; ll++)
+ {
+ pD[jj*sdd+(jj+ll)*bs+ll] += alpha * x[jj+ll];
+ }
+ return;
+ }
+
+
+
+// insert vector to diagonal, sparse formulation
+void sdiain_libsp(int kmax, int *idx, float alpha, float *x, float *pD, int sdd)
+ {
+
+ const int bs = 8;
+
+ int ii, jj;
+
+ for(jj=0; jj<kmax; jj++)
+ {
+ ii = idx[jj];
+ pD[ii/bs*bs*sdd+ii%bs+ii*bs] = alpha * x[jj];
+ }
+ return;
+ }
+
+
+
+// extract diagonal to vector, sparse formulation
+void sdiaex_libsp(int kmax, int *idx, float alpha, float *pD, int sdd, float *x)
+ {
+
+ const int bs = 8;
+
+ int ii, jj;
+
+ for(jj=0; jj<kmax; jj++)
+ {
+ ii = idx[jj];
+ x[jj] = alpha * pD[ii/bs*bs*sdd+ii%bs+ii*bs];
+ }
+ return;
+ }
+
+
+
+// add scaled vector to diagonal, sparse formulation
+void sdiaad_libsp(int kmax, int *idx, float alpha, float *x, float *pD, int sdd)
+ {
+
+ const int bs = 8;
+
+ int ii, jj;
+
+ for(jj=0; jj<kmax; jj++)
+ {
+ ii = idx[jj];
+ pD[ii/bs*bs*sdd+ii%bs+ii*bs] += alpha * x[jj];
+ }
+ return;
+ }
+
+
+
+// add scaled vector to another vector and insert to diagonal, sparse formulation
+void sdiaadin_libsp(int kmax, int *idx, float alpha, float *x, float *y, float *pD, int sdd)
+ {
+
+ const int bs = 8;
+
+ int ii, jj;
+
+ for(jj=0; jj<kmax; jj++)
+ {
+ ii = idx[jj];
+ pD[ii/bs*bs*sdd+ii%bs+ii*bs] = y[jj] + alpha * x[jj];
+ }
+ return;
+ }
+
+
+
+// insert vector to row
+void srowin_lib(int kmax, float alpha, float *x, float *pD)
+ {
+
+ const int bs = 8;
+
+ int jj, ll;
+
+ for(jj=0; jj<kmax-3; jj+=4)
+ {
+ pD[0*bs] = alpha * x[0];
+ pD[1*bs] = alpha * x[1];
+ pD[2*bs] = alpha * x[2];
+ pD[3*bs] = alpha * x[3];
+ pD += 4*bs;
+ x += 4;
+ }
+ for(ll=0; ll<kmax-jj; ll++)
+ {
+ pD[ll*bs] = alpha*x[ll];
+ }
+ return;
+ }
+
+
+
+// extract row to vector
+void srowex_lib(int kmax, float alpha, float *pD, float *x)
+ {
+
+ const int bs = 8;
+
+ int jj, ll;
+
+ for(jj=0; jj<kmax-3; jj+=4)
+ {
+ x[0] = alpha * pD[0*bs];
+ x[1] = alpha * pD[1*bs];
+ x[2] = alpha * pD[2*bs];
+ x[3] = alpha * pD[3*bs];
+ pD += 4*bs;
+ x += 4;
+ }
+ for(ll=0; ll<kmax-jj; ll++)
+ {
+ x[ll] = alpha*pD[ll*bs];
+ }
+ return;
+ }
+
+
+
+// add scaled vector to row
+void srowad_lib(int kmax, float alpha, float *x, float *pD)
+ {
+
+ const int bs = 8;
+
+ int jj, ll;
+
+ for(jj=0; jj<kmax-3; jj+=4)
+ {
+ pD[0*bs] += alpha * x[0];
+ pD[1*bs] += alpha * x[1];
+ pD[2*bs] += alpha * x[2];
+ pD[3*bs] += alpha * x[3];
+ pD += 4*bs;
+ x += 4;
+ }
+ for(ll=0; ll<kmax-jj; ll++)
+ {
+ pD[ll*bs] += alpha * x[ll];
+ }
+ return;
+ }
+
+
+
+// insert vector to row, sparse formulation
+void srowin_libsp(int kmax, float alpha, int *idx, float *x, float *pD)
+ {
+
+ const int bs = 8;
+
+ int ii, jj;
+
+ for(jj=0; jj<kmax; jj++)
+ {
+ ii = idx[jj];
+ pD[ii*bs] = alpha*x[jj];
+ }
+ return;
+ }
+
+
+
+// add scaled vector to row, sparse formulation
+void srowad_libsp(int kmax, int *idx, float alpha, float *x, float *pD)
+ {
+
+ const int bs = 8;
+
+ int ii, jj;
+
+ for(jj=0; jj<kmax; jj++)
+ {
+ ii = idx[jj];
+ pD[ii*bs] += alpha * x[jj];
+ }
+ return;
+ }
+
+
+
+// add scaled vector to another vector and insert to row, sparse formulation
+void srowadin_libsp(int kmax, int *idx, float alpha, float *x, float *y, float *pD)
+ {
+
+ const int bs = 8;
+
+ int ii, jj;
+
+ for(jj=0; jj<kmax; jj++)
+ {
+ ii = idx[jj];
+ pD[ii*bs] = y[jj] + alpha * x[jj];
+ }
+ return;
+ }
+
+
+
+// swap two rows
+void srowsw_lib(int kmax, float *pA, float *pC)
+ {
+
+ const int bs = 8;
+
+ int ii;
+ float tmp;
+
+ for(ii=0; ii<kmax-3; ii+=4)
+ {
+ tmp = pA[0+bs*0];
+ pA[0+bs*0] = pC[0+bs*0];
+ pC[0+bs*0] = tmp;
+ tmp = pA[0+bs*1];
+ pA[0+bs*1] = pC[0+bs*1];
+ pC[0+bs*1] = tmp;
+ tmp = pA[0+bs*2];
+ pA[0+bs*2] = pC[0+bs*2];
+ pC[0+bs*2] = tmp;
+ tmp = pA[0+bs*3];
+ pA[0+bs*3] = pC[0+bs*3];
+ pC[0+bs*3] = tmp;
+ pA += 4*bs;
+ pC += 4*bs;
+ }
+ for( ; ii<kmax; ii++)
+ {
+ tmp = pA[0+bs*0];
+ pA[0+bs*0] = pC[0+bs*0];
+ pC[0+bs*0] = tmp;
+ pA += 1*bs;
+ pC += 1*bs;
+ }
+ return;
+ }
+
+
+
+// insert vector to column
+void scolin_lib(int kmax, float *x, int offset, float *pD, int sdd)
+ {
+
+ const int bs = 8;
+
+ int kna = (bs-offset%bs)%bs;
+ kna = kmax<kna ? kmax : kna;
+
+ int jj, ll;
+
+ if(kna>0)
+ {
+ for(ll=0; ll<kna; ll++)
+ {
+ pD[ll] = x[ll];
+ }
+ pD += kna + bs*(sdd-1);
+ x += kna;
+ kmax -= kna;
+ }
+ for(jj=0; jj<kmax-7; jj+=8)
+ {
+ pD[0] = x[0];
+ pD[1] = x[1];
+ pD[2] = x[2];
+ pD[3] = x[3];
+ pD[4] = x[4];
+ pD[5] = x[5];
+ pD[6] = x[6];
+ pD[7] = x[7];
+ pD += bs*sdd;
+ x += bs;
+ }
+ for(ll=0; ll<kmax-jj; ll++)
+ {
+ pD[ll] = x[ll];
+ }
+
+ }
+
+
+
+// add scaled vector to column
+void scolad_lib(int kmax, float alpha, float *x, int offset, float *pD, int sdd)
+ {
+
+ const int bs = 8;
+
+ int kna = (bs-offset%bs)%bs;
+ kna = kmax<kna ? kmax : kna;
+
+ int jj, ll;
+
+ if(kna>0)
+ {
+ for(ll=0; ll<kna; ll++)
+ {
+ pD[ll] += alpha * x[ll];
+ }
+ pD += kna + bs*(sdd-1);
+ x += kna;
+ kmax -= kna;
+ }
+ for(jj=0; jj<kmax-7; jj+=8)
+ {
+ pD[0] += alpha * x[0];
+ pD[1] += alpha * x[1];
+ pD[2] += alpha * x[2];
+ pD[3] += alpha * x[3];
+ pD[4] += alpha * x[4];
+ pD[5] += alpha * x[5];
+ pD[6] += alpha * x[6];
+ pD[7] += alpha * x[7];
+ pD += bs*sdd;
+ x += bs;
+ }
+ for(ll=0; ll<kmax-jj; ll++)
+ {
+ pD[ll] += alpha * x[ll];
+ }
+
+ }
+
+
+
+// insert vector to column, sparse formulation
+void scolin_libsp(int kmax, int *idx, float *x, float *pD, int sdd)
+ {
+
+ const int bs = 8;
+
+ int ii, jj;
+
+ for(jj=0; jj<kmax; jj++)
+ {
+ ii = idx[jj];
+ pD[ii/bs*bs*sdd+ii%bs] = x[jj];
+ }
+
+ }
+
+
+
+// add scaled vector to column, sparse formulation
+void scolad_libsp(int kmax, float alpha, int *idx, float *x, float *pD, int sdd)
+ {
+
+ const int bs = 8;
+
+ int ii, jj;
+
+ for(jj=0; jj<kmax; jj++)
+ {
+ ii = idx[jj];
+ pD[ii/bs*bs*sdd+ii%bs] += alpha * x[jj];
+ }
+
+ }
+
+
+
+// swaps two cols
+void scolsw_lib(int kmax, int offsetA, float *pA, int sda, int offsetC, float *pC, int sdc)
+ {
+
+ const int bs = 8;
+
+ int ii;
+
+ float tmp;
+
+ if(offsetA==offsetC)
+ {
+ if(offsetA>0)
+ {
+ ii = 0;
+ for(; ii<bs-offsetA; ii++)
+ {
+ tmp = pA[0+bs*0];
+ pA[0+bs*0] = pC[0+bs*0];
+ pC[0+bs*0] = tmp;
+ pA += 1;
+ pC += 1;
+ }
+ pA += bs*(sda-1);
+ pC += bs*(sdc-1);
+ kmax -= bs-offsetA;
+ }
+ ii = 0;
+ for(; ii<kmax-7; ii+=8)
+ {
+ tmp = pA[0+bs*0];
+ pA[0+bs*0] = pC[0+bs*0];
+ pC[0+bs*0] = tmp;
+ tmp = pA[1+bs*0];
+ pA[1+bs*0] = pC[1+bs*0];
+ pC[1+bs*0] = tmp;
+ tmp = pA[2+bs*0];
+ pA[2+bs*0] = pC[2+bs*0];
+ pC[2+bs*0] = tmp;
+ tmp = pA[3+bs*0];
+ pA[3+bs*0] = pC[3+bs*0];
+ pC[3+bs*0] = tmp;
+ tmp = pA[4+bs*0];
+ pA[4+bs*0] = pC[4+bs*0];
+ pC[4+bs*0] = tmp;
+ tmp = pA[5+bs*0];
+ pA[5+bs*0] = pC[5+bs*0];
+ pC[5+bs*0] = tmp;
+ tmp = pA[6+bs*0];
+ pA[6+bs*0] = pC[6+bs*0];
+ pC[6+bs*0] = tmp;
+ tmp = pA[7+bs*0];
+ pA[7+bs*0] = pC[7+bs*0];
+ pC[7+bs*0] = tmp;
+ pA += bs*sda;
+ pC += bs*sdc;
+ }
+ for(; ii<kmax; ii++)
+ {
+ tmp = pA[0+bs*0];
+ pA[0+bs*0] = pC[0+bs*0];
+ pC[0+bs*0] = tmp;
+ pA += 1;
+ pC += 1;
+ }
+ }
+ else
+ {
+ printf("\nscolsw: feature not implemented yet: offsetA!=offsetC\n\n");
+ exit(1);
+ }
+
+ return;
+
+ }
+
+
+
+// insert vector to vector, sparse formulation
+void svecin_libsp(int kmax, int *idx, float *x, float *y)
+ {
+
+ int jj;
+
+ for(jj=0; jj<kmax; jj++)
+ {
+ y[idx[jj]] = x[jj];
+ }
+
+ }
+
+
+
+// add scaled vector to vector, sparse formulation
+void svecad_libsp(int kmax, int *idx, float alpha, float *x, float *y)
+ {
+
+ int jj;
+
+ for(jj=0; jj<kmax; jj++)
+ {
+ y[idx[jj]] += alpha * x[jj];
+ }
+
+ }
+
+
+
+/****************************
+* new interface
+****************************/
+
+
+
+#if defined(LA_HIGH_PERFORMANCE)
+
+
+
+// return the memory size (in bytes) needed for a strmat
+int s_size_strmat(int m, int n)
+ {
+ const int bs = 8;
+ int nc = S_NC;
+ int al = bs*nc;
+ int pm = (m+bs-1)/bs*bs;
+ int cn = (n+nc-1)/nc*nc;
+ int tmp = m<n ? (m+al-1)/al*al : (n+al-1)/al*al; // al(min(m,n)) // XXX max ???
+ int memory_size = (pm*cn+tmp)*sizeof(float);
+ return memory_size;
+ }
+
+
+
+// return the memory size (in bytes) needed for the diagonal of a strmat
+int s_size_diag_strmat(int m, int n)
+ {
+ const int bs = 8;
+ int nc = S_NC;
+ int al = bs*nc;
+ int tmp = m<n ? (m+al-1)/al*al : (n+al-1)/al*al; // al(min(m,n)) // XXX max ???
+ int memory_size = tmp*sizeof(float);
+ return memory_size;
+ }
+
+
+
+// create a matrix structure for a matrix of size m*n by using memory passed by a pointer
+void s_create_strmat(int m, int n, struct s_strmat *sA, void *memory)
+ {
+ const int bs = 8;
+ int nc = S_NC;
+ int al = bs*nc;
+ sA->m = m;
+ sA->n = n;
+ int pm = (m+bs-1)/bs*bs;
+ int cn = (n+nc-1)/nc*nc;
+ sA->pm = pm;
+ sA->cn = cn;
+ float *ptr = (float *) memory;
+ sA->pA = ptr;
+ ptr += pm*cn;
+ int tmp = m<n ? (m+al-1)/al*al : (n+al-1)/al*al; // al(min(m,n)) // XXX max ???
+ sA->dA = ptr;
+ ptr += tmp;
+ sA->use_dA = 0;
+ sA->memory_size = (pm*cn+tmp)*sizeof(float);
+ return;
+ }
+
+
+
+// return memory size (in bytes) needed for a strvec
+int s_size_strvec(int m)
+ {
+ const int bs = 8;
+// int nc = S_NC;
+// int al = bs*nc;
+ int pm = (m+bs-1)/bs*bs;
+ int memory_size = pm*sizeof(float);
+ return memory_size;
+ }
+
+
+
+// create a vector structure for a vector of size m by using memory passed by a pointer
+void s_create_strvec(int m, struct s_strvec *sa, void *memory)
+ {
+ const int bs = 8;
+// int nc = S_NC;
+// int al = bs*nc;
+ sa->m = m;
+ int pm = (m+bs-1)/bs*bs;
+ sa->pm = pm;
+ float *ptr = (float *) memory;
+ sa->pa = ptr;
+// ptr += pm;
+ sa->memory_size = pm*sizeof(float);
+ return;
+ }
+
+
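+// usage sketch for the creation routines above (a minimal example; the
+// variable names are illustrative and plain malloc is shown only for
+// brevity, the buffer should meet the library's alignment requirements):
+//
+// struct s_strmat sA;
+// struct s_strvec sx;
+// void *mem_A = malloc(s_size_strmat(m, n));
+// void *mem_x = malloc(s_size_strvec(m));
+// s_create_strmat(m, n, &sA, mem_A); // sA now wraps a panel-major buffer
+// s_create_strvec(m, &sx, mem_x); // sx wraps a padded contiguous buffer
+// ...
+// free(mem_A);
+// free(mem_x);
+
+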
+
+// convert a matrix into a matrix structure
+void s_cvt_mat2strmat(int m, int n, float *A, int lda, struct s_strmat *sA, int ai, int aj)
+ {
+ const int bs = 8;
+ int sda = sA->cn;
+ float *pA = sA->pA + aj*bs + ai/bs*bs*sda + ai%bs;
+ int i, ii, j, jj, m0, m1, m2;
+ float *B, *pB;
+ m0 = (bs-ai%bs)%bs;
+ if(m0>m)
+ m0 = m;
+ m1 = m - m0;
+ jj = 0;
+ for( ; jj<n-3; jj+=4)
+ {
+ B = A + jj*lda;
+ pB = pA + jj*bs;
+ ii = 0;
+ if(m0>0)
+ {
+ for( ; ii<m0; ii++)
+ {
+ pB[ii+bs*0] = B[ii+lda*0];
+ pB[ii+bs*1] = B[ii+lda*1];
+ pB[ii+bs*2] = B[ii+lda*2];
+ pB[ii+bs*3] = B[ii+lda*3];
+ }
+ B += m0;
+ pB += m0 + bs*(sda-1);
+ }
+ for( ; ii<m-7; ii+=8)
+ {
+ // unroll 0
+ pB[0+bs*0] = B[0+lda*0];
+ pB[1+bs*0] = B[1+lda*0];
+ pB[2+bs*0] = B[2+lda*0];
+ pB[3+bs*0] = B[3+lda*0];
+ pB[4+bs*0] = B[4+lda*0];
+ pB[5+bs*0] = B[5+lda*0];
+ pB[6+bs*0] = B[6+lda*0];
+ pB[7+bs*0] = B[7+lda*0];
+ // unroll 1
+ pB[0+bs*1] = B[0+lda*1];
+ pB[1+bs*1] = B[1+lda*1];
+ pB[2+bs*1] = B[2+lda*1];
+ pB[3+bs*1] = B[3+lda*1];
+ pB[4+bs*1] = B[4+lda*1];
+ pB[5+bs*1] = B[5+lda*1];
+ pB[6+bs*1] = B[6+lda*1];
+ pB[7+bs*1] = B[7+lda*1];
+ // unroll 2
+ pB[0+bs*2] = B[0+lda*2];
+ pB[1+bs*2] = B[1+lda*2];
+ pB[2+bs*2] = B[2+lda*2];
+ pB[3+bs*2] = B[3+lda*2];
+ pB[4+bs*2] = B[4+lda*2];
+ pB[5+bs*2] = B[5+lda*2];
+ pB[6+bs*2] = B[6+lda*2];
+ pB[7+bs*2] = B[7+lda*2];
+ // unroll 3
+ pB[0+bs*3] = B[0+lda*3];
+ pB[1+bs*3] = B[1+lda*3];
+ pB[2+bs*3] = B[2+lda*3];
+ pB[3+bs*3] = B[3+lda*3];
+ pB[4+bs*3] = B[4+lda*3];
+ pB[5+bs*3] = B[5+lda*3];
+ pB[6+bs*3] = B[6+lda*3];
+ pB[7+bs*3] = B[7+lda*3];
+ // update
+ B += 8;
+ pB += bs*sda;
+ }
+ for( ; ii<m; ii++)
+ {
+ // col 0
+ pB[0+bs*0] = B[0+lda*0];
+ // col 1
+ pB[0+bs*1] = B[0+lda*1];
+ // col 2
+ pB[0+bs*2] = B[0+lda*2];
+ // col 3
+ pB[0+bs*3] = B[0+lda*3];
+ // update
+ B += 1;
+ pB += 1;
+ }
+ }
+ for( ; jj<n; jj++)
+ {
+
+ B = A + jj*lda;
+ pB = pA + jj*bs;
+
+ ii = 0;
+ if(m0>0)
+ {
+ for( ; ii<m0; ii++)
+ {
+ pB[ii+bs*0] = B[ii+lda*0];
+ }
+ B += m0;
+ pB += m0 + bs*(sda-1);
+ }
+ for( ; ii<m-7; ii+=8)
+ {
+ // col 0
+ pB[0+bs*0] = B[0+lda*0];
+ pB[1+bs*0] = B[1+lda*0];
+ pB[2+bs*0] = B[2+lda*0];
+ pB[3+bs*0] = B[3+lda*0];
+ pB[4+bs*0] = B[4+lda*0];
+ pB[5+bs*0] = B[5+lda*0];
+ pB[6+bs*0] = B[6+lda*0];
+ pB[7+bs*0] = B[7+lda*0];
+ // update
+ B += 8;
+ pB += bs*sda;
+ }
+ for( ; ii<m; ii++)
+ {
+ // col 0
+ pB[0+bs*0] = B[0+lda*0];
+ // update
+ B += 1;
+ pB += 1;
+ }
+ }
+ return;
+ }
+
+
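+// usage sketch for the conversion above (illustrative assumption: A is an
+// m x n column-major array with leading dimension lda, and sA was created
+// with s_create_strmat as in the sketch earlier in this file):
+//
+// s_cvt_mat2strmat(m, n, A, lda, &sA, 0, 0); // pack A into panel-major sA
+// // ... operate on sA ...
+// s_cvt_strmat2mat(m, n, &sA, 0, 0, A, lda); // unpack back to column-major
+
+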
+
+// convert and transpose a matrix into a matrix structure
+void s_cvt_tran_mat2strmat(int m, int n, float *A, int lda, struct s_strmat *sA, int ai, int aj)
+ {
+ const int bs = 8;
+ int sda = sA->cn;
+ float *pA = sA->pA + aj*bs + ai/bs*bs*sda + ai%bs;
+ int i, ii, j, m0, m1, m2;
+ float *B, *pB;
+ m0 = (bs-ai%bs)%bs;
+ if(m0>n)
+ m0 = n;
+ m1 = n - m0;
+ ii = 0;
+ if(m0>0)
+ {
+ for(j=0; j<m; j++)
+ {
+ for(i=0; i<m0; i++)
+ {
+ pA[i+j*bs+ii*sda] = A[j+(i+ii)*lda];
+ }
+ }
+ A += m0*lda;
+ pA += m0 + bs*(sda-1);
+ }
+ ii = 0;
+ for(; ii<m1-7; ii+=bs)
+ {
+ j=0;
+ B = A + ii*lda;
+ pB = pA + ii*sda;
+ for(; j<m-3; j+=4)
+ {
+ // unroll 0
+ pB[0+0*bs] = B[0+0*lda];
+ pB[1+0*bs] = B[0+1*lda];
+ pB[2+0*bs] = B[0+2*lda];
+ pB[3+0*bs] = B[0+3*lda];
+ pB[4+0*bs] = B[0+4*lda];
+ pB[5+0*bs] = B[0+5*lda];
+ pB[6+0*bs] = B[0+6*lda];
+ pB[7+0*bs] = B[0+7*lda];
+ // unroll 1
+ pB[0+1*bs] = B[1+0*lda];
+ pB[1+1*bs] = B[1+1*lda];
+ pB[2+1*bs] = B[1+2*lda];
+ pB[3+1*bs] = B[1+3*lda];
+ pB[4+1*bs] = B[1+4*lda];
+ pB[5+1*bs] = B[1+5*lda];
+ pB[6+1*bs] = B[1+6*lda];
+ pB[7+1*bs] = B[1+7*lda];
+ // unroll 2
+ pB[0+2*bs] = B[2+0*lda];
+ pB[1+2*bs] = B[2+1*lda];
+ pB[2+2*bs] = B[2+2*lda];
+ pB[3+2*bs] = B[2+3*lda];
+ pB[4+2*bs] = B[2+4*lda];
+ pB[5+2*bs] = B[2+5*lda];
+ pB[6+2*bs] = B[2+6*lda];
+ pB[7+2*bs] = B[2+7*lda];
+ // unroll 3
+ pB[0+3*bs] = B[3+0*lda];
+ pB[1+3*bs] = B[3+1*lda];
+ pB[2+3*bs] = B[3+2*lda];
+ pB[3+3*bs] = B[3+3*lda];
+ pB[4+3*bs] = B[3+4*lda];
+ pB[5+3*bs] = B[3+5*lda];
+ pB[6+3*bs] = B[3+6*lda];
+ pB[7+3*bs] = B[3+7*lda];
+ B += 4;
+ pB += 4*bs;
+ }
+ for(; j<m; j++)
+ {
+ // unroll 0
+ pB[0+0*bs] = B[0+0*lda];
+ pB[1+0*bs] = B[0+1*lda];
+ pB[2+0*bs] = B[0+2*lda];
+ pB[3+0*bs] = B[0+3*lda];
+ pB[4+0*bs] = B[0+4*lda];
+ pB[5+0*bs] = B[0+5*lda];
+ pB[6+0*bs] = B[0+6*lda];
+ pB[7+0*bs] = B[0+7*lda];
+ B += 1;
+ pB += 1*bs;
+ }
+ }
+ if(ii<m1)
+ {
+ m2 = m1-ii;
+ if(bs<m2) m2 = bs;
+ for(j=0; j<m; j++)
+ {
+ for(i=0; i<m2; i++)
+ {
+ pA[i+j*bs+ii*sda] = A[j+(i+ii)*lda];
+ }
+ }
+ }
+ return;
+ }
+
+
+
+// convert a vector into a vector structure
+void s_cvt_vec2strvec(int m, float *a, struct s_strvec *sa, int ai)
+ {
+ float *pa = sa->pa + ai;
+ int ii;
+ for(ii=0; ii<m; ii++)
+ pa[ii] = a[ii];
+ return;
+ }
+
+
+
+// convert a matrix structure into a matrix
+void s_cvt_strmat2mat(int m, int n, struct s_strmat *sA, int ai, int aj, float *A, int lda)
+ {
+ const int bs = 8;
+ int sda = sA->cn;
+ float *pA = sA->pA + aj*bs + ai/bs*bs*sda + ai%bs;
+ int i, ii, jj;
+ int m0 = (bs-ai%bs)%bs;
+ float *ptr_pA;
+ jj=0;
+ for(; jj<n-3; jj+=4)
+ {
+ ptr_pA = pA + jj*bs;
+ ii = 0;
+ if(m0>0)
+ {
+ for(; ii<m0; ii++)
+ {
+ // unroll 0
+ A[ii+lda*(jj+0)] = ptr_pA[0+bs*0];
+ // unroll 1
+ A[ii+lda*(jj+1)] = ptr_pA[0+bs*1];
+ // unroll 2
+ A[ii+lda*(jj+2)] = ptr_pA[0+bs*2];
+ // unroll 3
+ A[ii+lda*(jj+3)] = ptr_pA[0+bs*3];
+ ptr_pA++;
+ }
+ ptr_pA += (sda-1)*bs;
+ }
+ // TODO update A !!!!!
+ for(; ii<m-bs+1; ii+=bs)
+ {
+ // unroll 0
+ A[0+ii+lda*(jj+0)] = ptr_pA[0+bs*0];
+ A[1+ii+lda*(jj+0)] = ptr_pA[1+bs*0];
+ A[2+ii+lda*(jj+0)] = ptr_pA[2+bs*0];
+ A[3+ii+lda*(jj+0)] = ptr_pA[3+bs*0];
+ A[4+ii+lda*(jj+0)] = ptr_pA[4+bs*0];
+ A[5+ii+lda*(jj+0)] = ptr_pA[5+bs*0];
+ A[6+ii+lda*(jj+0)] = ptr_pA[6+bs*0];
+ A[7+ii+lda*(jj+0)] = ptr_pA[7+bs*0];
+ // unroll 1
+ A[0+ii+lda*(jj+1)] = ptr_pA[0+bs*1];
+ A[1+ii+lda*(jj+1)] = ptr_pA[1+bs*1];
+ A[2+ii+lda*(jj+1)] = ptr_pA[2+bs*1];
+ A[3+ii+lda*(jj+1)] = ptr_pA[3+bs*1];
+ A[4+ii+lda*(jj+1)] = ptr_pA[4+bs*1];
+ A[5+ii+lda*(jj+1)] = ptr_pA[5+bs*1];
+ A[6+ii+lda*(jj+1)] = ptr_pA[6+bs*1];
+ A[7+ii+lda*(jj+1)] = ptr_pA[7+bs*1];
+ // unroll 2
+ A[0+ii+lda*(jj+2)] = ptr_pA[0+bs*2];
+ A[1+ii+lda*(jj+2)] = ptr_pA[1+bs*2];
+ A[2+ii+lda*(jj+2)] = ptr_pA[2+bs*2];
+ A[3+ii+lda*(jj+2)] = ptr_pA[3+bs*2];
+ A[4+ii+lda*(jj+2)] = ptr_pA[4+bs*2];
+ A[5+ii+lda*(jj+2)] = ptr_pA[5+bs*2];
+ A[6+ii+lda*(jj+2)] = ptr_pA[6+bs*2];
+ A[7+ii+lda*(jj+2)] = ptr_pA[7+bs*2];
+ // unroll 3
+ A[0+ii+lda*(jj+3)] = ptr_pA[0+bs*3];
+ A[1+ii+lda*(jj+3)] = ptr_pA[1+bs*3];
+ A[2+ii+lda*(jj+3)] = ptr_pA[2+bs*3];
+ A[3+ii+lda*(jj+3)] = ptr_pA[3+bs*3];
+ A[4+ii+lda*(jj+3)] = ptr_pA[4+bs*3];
+ A[5+ii+lda*(jj+3)] = ptr_pA[5+bs*3];
+ A[6+ii+lda*(jj+3)] = ptr_pA[6+bs*3];
+ A[7+ii+lda*(jj+3)] = ptr_pA[7+bs*3];
+ ptr_pA += sda*bs;
+ }
+ for(; ii<m; ii++)
+ {
+ // unroll 0
+ A[ii+lda*(jj+0)] = ptr_pA[0+bs*0];
+ // unroll 1
+ A[ii+lda*(jj+1)] = ptr_pA[0+bs*1];
+ // unroll 2
+ A[ii+lda*(jj+2)] = ptr_pA[0+bs*2];
+ // unroll 3
+ A[ii+lda*(jj+3)] = ptr_pA[0+bs*3];
+ ptr_pA++;
+ }
+ }
+ for(; jj<n; jj++)
+ {
+ ptr_pA = pA + jj*bs;
+ ii = 0;
+ if(m0>0)
+ {
+ for(; ii<m0; ii++)
+ {
+ A[ii+lda*jj] = ptr_pA[0];
+ ptr_pA++;
+ }
+ ptr_pA += (sda-1)*bs;
+ }
+ for(; ii<m-bs+1; ii+=bs)
+ {
+ A[0+ii+lda*(jj+0)] = ptr_pA[0+bs*0];
+ A[1+ii+lda*(jj+0)] = ptr_pA[1+bs*0];
+ A[2+ii+lda*(jj+0)] = ptr_pA[2+bs*0];
+ A[3+ii+lda*(jj+0)] = ptr_pA[3+bs*0];
+ A[4+ii+lda*(jj+0)] = ptr_pA[4+bs*0];
+ A[5+ii+lda*(jj+0)] = ptr_pA[5+bs*0];
+ A[6+ii+lda*(jj+0)] = ptr_pA[6+bs*0];
+ A[7+ii+lda*(jj+0)] = ptr_pA[7+bs*0];
+ ptr_pA += sda*bs;
+ }
+ for(; ii<m; ii++)
+ {
+ A[ii+lda*jj] = ptr_pA[0];
+ ptr_pA++;
+ }
+ }
+ return;
+ }
+
+
+
+// convert and transpose a matrix structure into a matrix
+void s_cvt_tran_strmat2mat(int m, int n, struct s_strmat *sA, int ai, int aj, float *A, int lda)
+ {
+ const int bs = 8;
+ int sda = sA->cn;
+ float *pA = sA->pA + aj*bs + ai/bs*bs*sda + ai%bs;
+ int i, ii, jj;
+ int m0 = (bs-ai%bs)%bs;
+ float *ptr_pA;
+ jj=0;
+ for(; jj<n-3; jj+=4)
+ {
+ ptr_pA = pA + jj*bs;
+ ii = 0;
+ if(m0>0)
+ {
+ for(; ii<m0; ii++)
+ {
+ // unroll 0
+ A[jj+0+lda*ii] = ptr_pA[0+bs*0];
+ // unroll 1
+ A[jj+1+lda*ii] = ptr_pA[0+bs*1];
+ // unroll 2
+ A[jj+2+lda*ii] = ptr_pA[0+bs*2];
+ // unroll 3
+ A[jj+3+lda*ii] = ptr_pA[0+bs*3];
+ ptr_pA++;
+ }
+ ptr_pA += (sda-1)*bs;
+ }
+ // TODO update A !!!!!
+ for(; ii<m-bs+1; ii+=bs)
+ {
+ // unroll 0
+ A[jj+0+lda*(ii+0)] = ptr_pA[0+bs*0];
+ A[jj+0+lda*(ii+1)] = ptr_pA[1+bs*0];
+ A[jj+0+lda*(ii+2)] = ptr_pA[2+bs*0];
+ A[jj+0+lda*(ii+3)] = ptr_pA[3+bs*0];
+ A[jj+0+lda*(ii+4)] = ptr_pA[4+bs*0];
+ A[jj+0+lda*(ii+5)] = ptr_pA[5+bs*0];
+ A[jj+0+lda*(ii+6)] = ptr_pA[6+bs*0];
+ A[jj+0+lda*(ii+7)] = ptr_pA[7+bs*0];
+ // unroll 1
+ A[jj+1+lda*(ii+0)] = ptr_pA[0+bs*1];
+ A[jj+1+lda*(ii+1)] = ptr_pA[1+bs*1];
+ A[jj+1+lda*(ii+2)] = ptr_pA[2+bs*1];
+ A[jj+1+lda*(ii+3)] = ptr_pA[3+bs*1];
+ A[jj+1+lda*(ii+4)] = ptr_pA[4+bs*1];
+ A[jj+1+lda*(ii+5)] = ptr_pA[5+bs*1];
+ A[jj+1+lda*(ii+6)] = ptr_pA[6+bs*1];
+ A[jj+1+lda*(ii+7)] = ptr_pA[7+bs*1];
+ // unroll 2
+ A[jj+2+lda*(ii+0)] = ptr_pA[0+bs*2];
+ A[jj+2+lda*(ii+1)] = ptr_pA[1+bs*2];
+ A[jj+2+lda*(ii+2)] = ptr_pA[2+bs*2];
+ A[jj+2+lda*(ii+3)] = ptr_pA[3+bs*2];
+ A[jj+2+lda*(ii+4)] = ptr_pA[4+bs*2];
+ A[jj+2+lda*(ii+5)] = ptr_pA[5+bs*2];
+ A[jj+2+lda*(ii+6)] = ptr_pA[6+bs*2];
+ A[jj+2+lda*(ii+7)] = ptr_pA[7+bs*2];
+ // unroll 3
+ A[jj+3+lda*(ii+0)] = ptr_pA[0+bs*3];
+ A[jj+3+lda*(ii+1)] = ptr_pA[1+bs*3];
+ A[jj+3+lda*(ii+2)] = ptr_pA[2+bs*3];
+ A[jj+3+lda*(ii+3)] = ptr_pA[3+bs*3];
+ A[jj+3+lda*(ii+4)] = ptr_pA[4+bs*3];
+ A[jj+3+lda*(ii+5)] = ptr_pA[5+bs*3];
+ A[jj+3+lda*(ii+6)] = ptr_pA[6+bs*3];
+ A[jj+3+lda*(ii+7)] = ptr_pA[7+bs*3];
+ ptr_pA += sda*bs;
+ }
+ for(; ii<m; ii++)
+ {
+ // unroll 0
+ A[jj+0+lda*ii] = ptr_pA[0+bs*0];
+ // unroll 1
+ A[jj+1+lda*ii] = ptr_pA[0+bs*1];
+ // unroll 2
+ A[jj+2+lda*ii] = ptr_pA[0+bs*2];
+ // unroll 3
+ A[jj+3+lda*ii] = ptr_pA[0+bs*3];
+ ptr_pA++;
+ }
+ }
+ for(; jj<n; jj++)
+ {
+ ptr_pA = pA + jj*bs;
+ ii = 0;
+ if(m0>0)
+ {
+ for(; ii<m0; ii++)
+ {
+ A[jj+lda*ii] = ptr_pA[0];
+ ptr_pA++;
+ }
+ ptr_pA += (sda-1)*bs;
+ }
+ for(; ii<m-bs+1; ii+=bs)
+ {
+ i=0;
+ // TODO update A !!!!!
+ // TODO unroll !!!!!!
+ for(; i<bs; i++)
+ {
+ A[jj+lda*(i+ii)] = ptr_pA[0];
+ ptr_pA++;
+ }
+ ptr_pA += (sda-1)*bs;
+ }
+ for(; ii<m; ii++)
+ {
+ A[jj+lda*ii] = ptr_pA[0];
+ ptr_pA++;
+ }
+ }
+ return;
+ }
+
+
+
+// convert a vector structure into a vector
+void s_cvt_strvec2vec(int m, struct s_strvec *sa, int ai, float *a)
+ {
+ float *pa = sa->pa + ai;
+ int ii;
+ for(ii=0; ii<m; ii++)
+ a[ii] = pa[ii];
+ return;
+ }
+
+
+
+// cast a matrix into a matrix structure
+void s_cast_mat2strmat(float *A, struct s_strmat *sA)
+ {
+ sA->pA = A;
+ return;
+ }
+
+
+
+// cast a matrix into the diagonal of a matrix structure
+void s_cast_diag_mat2strmat(float *dA, struct s_strmat *sA)
+ {
+ sA->dA = dA;
+ return;
+ }
+
+
+
+// cast a vector into a vector structure
+void s_cast_vec2vecmat(float *a, struct s_strvec *sa)
+ {
+ sa->pa = a;
+ return;
+ }
+
+
+
+// insert element into strmat
+void sgein1_libstr(float a, struct s_strmat *sA, int ai, int aj)
+ {
+ const int bs = 8;
+ int sda = sA->cn;
+ float *pA = sA->pA + ai/bs*bs*sda + ai%bs + aj*bs;
+ pA[0] = a;
+ return;
+ }
+
+
+
+// extract element from strmat
+float sgeex1_libstr(struct s_strmat *sA, int ai, int aj)
+ {
+ const int bs = 8;
+ int sda = sA->cn;
+ float *pA = sA->pA + ai/bs*bs*sda + ai%bs + aj*bs;
+ return pA[0];
+ }
+
+
+
+// insert element into strvec
+void svecin1_libstr(float a, struct s_strvec *sx, int xi)
+ {
+ const int bs = 8;
+ float *x = sx->pa + xi;
+ x[0] = a;
+ return;
+ }
+
+
+
+// extract element from strvec
+float svecex1_libstr(struct s_strvec *sx, int xi)
+ {
+ const int bs = 8;
+ float *x = sx->pa + xi;
+ return x[0];
+ }
+
+
+
+// set all elements of a strmat to a value
+void sgese_libstr(int m, int n, float alpha, struct s_strmat *sA, int ai, int aj)
+ {
+ const int bs = 8;
+ int sda = sA->cn;
+ float *pA = sA->pA + ai%bs + ai/bs*bs*sda + aj*bs;
+ int m0 = m<(bs-ai%bs)%bs ? m : (bs-ai%bs)%bs;
+ int ii, jj;
+ if(m0>0)
+ {
+ for(ii=0; ii<m0; ii++)
+ {
+ for(jj=0; jj<n; jj++)
+ {
+ pA[jj*bs] = alpha;
+ }
+ pA += 1;
+ }
+ pA += bs*(sda-1);
+ m -= m0;
+ }
+ for(ii=0; ii<m-7; ii+=8)
+ {
+ for(jj=0; jj<n; jj++)
+ {
+ pA[0+jj*bs] = alpha;
+ pA[1+jj*bs] = alpha;
+ pA[2+jj*bs] = alpha;
+ pA[3+jj*bs] = alpha;
+ pA[4+jj*bs] = alpha;
+ pA[5+jj*bs] = alpha;
+ pA[6+jj*bs] = alpha;
+ pA[7+jj*bs] = alpha;
+ }
+ pA += bs*sda;
+ }
+ for( ; ii<m; ii++)
+ {
+ for(jj=0; jj<n; jj++)
+ {
+ pA[jj*bs] = alpha;
+ }
+ pA += 1;
+ }
+ return;
+ }
+
+
+
+// set all elements of a strvec to a value
+void svecse_libstr(int m, float alpha, struct s_strvec *sx, int xi)
+ {
+ float *x = sx->pa + xi;
+ int ii;
+ for(ii=0; ii<m; ii++)
+ x[ii] = alpha;
+ return;
+ }
+
+
+
+// extract diagonal to vector
+void sdiaex_libstr(int kmax, float alpha, struct s_strmat *sA, int ai, int aj, struct s_strvec *sx, int xi)
+ {
+ const int bs = 8;
+ int sda = sA->cn;
+ float *pA = sA->pA + ai/bs*bs*sda + ai%bs + aj*bs;
+ float *x = sx->pa + xi;
+ sdiaex_lib(kmax, alpha, ai%bs, pA, sda, x);
+ return;
+ }
+
+
+
+// insert a vector into diagonal
+void sdiain_libstr(int kmax, float alpha, struct s_strvec *sx, int xi, struct s_strmat *sA, int ai, int aj)
+ {
+ const int bs = 8;
+ int sda = sA->cn;
+ float *pA = sA->pA + ai/bs*bs*sda + ai%bs + aj*bs;
+ float *x = sx->pa + xi;
+ sdiain_lib(kmax, alpha, x, ai%bs, pA, sda);
+ return;
+ }
+
+
+
+// swap two rows of a matrix struct
+void srowsw_libstr(int kmax, struct s_strmat *sA, int ai, int aj, struct s_strmat *sC, int ci, int cj)
+ {
+ const int bs = 8;
+ int sda = sA->cn;
+ float *pA = sA->pA + ai/bs*bs*sda + ai%bs + aj*bs;
+ int sdc = sC->cn;
+ float *pC = sC->pA + ci/bs*bs*sdc + ci%bs + cj*bs;
+ srowsw_lib(kmax, pA, pC);
+ return;
+ }
+
+
+
+// permute the rows of a matrix struct
+void srowpe_libstr(int kmax, int *ipiv, struct s_strmat *sA)
+ {
+ int ii;
+ for(ii=0; ii<kmax; ii++)
+ {
+ if(ipiv[ii]!=ii)
+ srowsw_libstr(sA->n, sA, ii, 0, sA, ipiv[ii], 0);
+ }
+ return;
+ }
+
+
+
+// extract a row into a vector
+void srowex_libstr(int kmax, float alpha, struct s_strmat *sA, int ai, int aj, struct s_strvec *sx, int xi)
+ {
+ const int bs = 8;
+ int sda = sA->cn;
+ float *pA = sA->pA + ai/bs*bs*sda + ai%bs + aj*bs;
+ float *x = sx->pa + xi;
+ srowex_lib(kmax, alpha, pA, x);
+ return;
+ }
+
+
+
+// insert a vector into a row
+void srowin_libstr(int kmax, float alpha, struct s_strvec *sx, int xi, struct s_strmat *sA, int ai, int aj)
+ {
+ const int bs = 8;
+ int sda = sA->cn;
+ float *pA = sA->pA + ai/bs*bs*sda + ai%bs + aj*bs;
+ float *x = sx->pa + xi;
+ srowin_lib(kmax, alpha, x, pA);
+ return;
+ }
+
+
+
+// add a scaled vector to a row
+void srowad_libstr(int kmax, float alpha, struct s_strvec *sx, int xi, struct s_strmat *sA, int ai, int aj)
+ {
+ const int bs = 8;
+ int sda = sA->cn;
+ float *pA = sA->pA + ai/bs*bs*sda + ai%bs + aj*bs;
+ float *x = sx->pa + xi;
+ srowad_lib(kmax, alpha, x, pA);
+ return;
+ }
+
+
+
+// swap two cols of a matrix struct
+void scolsw_libstr(int kmax, struct s_strmat *sA, int ai, int aj, struct s_strmat *sC, int ci, int cj)
+ {
+ const int bs = 8;
+ int sda = sA->cn;
+ float *pA = sA->pA + ai/bs*bs*sda + ai%bs + aj*bs;
+ int sdc = sC->cn;
+ float *pC = sC->pA + ci/bs*bs*sdc + ci%bs + cj*bs;
+ scolsw_lib(kmax, ai%bs, pA, sda, ci%bs, pC, sdc);
+ return;
+ }
+
+
+
+// permute the cols of a matrix struct
+void scolpe_libstr(int kmax, int *ipiv, struct s_strmat *sA)
+ {
+ int ii;
+ for(ii=0; ii<kmax; ii++)
+ {
+ if(ipiv[ii]!=ii)
+ scolsw_libstr(sA->m, sA, 0, ii, sA, 0, ipiv[ii]);
+ }
+ return;
+ }
+
+
+
+// scale a generic strmat
+void sgesc_libstr(int m, int n, float alpha, struct s_strmat *sA, int ai, int aj)
+ {
+
+ // early return
+ if(m==0 | n==0)
+ return;
+
+#if defined(DIM_CHECK)
+ // non-negative size
+ if(m<0) printf("\n****** sgesc_libstr : m<0 : %d<0 *****\n", m);
+ if(n<0) printf("\n****** sgesc_libstr : n<0 : %d<0 *****\n", n);
+ // non-negative offset
+ if(ai<0) printf("\n****** sgesc_libstr : ai<0 : %d<0 *****\n", ai);
+ if(aj<0) printf("\n****** sgesc_libstr : aj<0 : %d<0 *****\n", aj);
+ // inside matrix
+ // A: m x n
+ if(ai+m > sA->m) printf("\n***** sgesc_libstr : ai+m > row(A) : %d+%d > %d *****\n", ai, m, sA->m);
+ if(aj+n > sA->n) printf("\n***** sgesc_libstr : aj+n > col(A) : %d+%d > %d *****\n", aj, n, sA->n);
+#endif
+
+ const int bs = 8;
+
+ int sda = sA->cn;
+ float *pA = sA->pA + ai/bs*bs*sda + aj*bs;
+ int offsetA = ai%bs;
+
+ int ii, mna;
+
+ if(offsetA>0)
+ {
+ mna = bs-offsetA;
+ mna = m<mna ? m : mna;
+ kernel_sgesc_8_gen_lib8(n, &alpha, &pA[offsetA], mna);
+ m -= mna;
+ pA += 8*sda;
+ }
+ ii = 0;
+ for( ; ii<m-7; ii+=8)
+ {
+ kernel_sgesc_8_lib8(n, &alpha, &pA[0]);
+ pA += 8*sda;
+ }
+ if(ii<m)
+ {
+ kernel_sgesc_8_gen_lib8(n, &alpha, &pA[0], m-ii);
+ }
+
+ return;
+
+ }
+
+
+
+// copy a generic strmat into a generic strmat
+void sgecp_libstr(int m, int n, struct s_strmat *sA, int ai, int aj, struct s_strmat *sB, int bi, int bj)
+ {
+
+ // early return
+ if(m==0 | n==0)
+ return;
+
+#if defined(DIM_CHECK)
+ // non-negative size
+ if(m<0) printf("\n****** sgecp_libstr : m<0 : %d<0 *****\n", m);
+ if(n<0) printf("\n****** sgecp_libstr : n<0 : %d<0 *****\n", n);
+ // non-negative offset
+ if(ai<0) printf("\n****** sgecp_libstr : ai<0 : %d<0 *****\n", ai);
+ if(aj<0) printf("\n****** sgecp_libstr : aj<0 : %d<0 *****\n", aj);
+ if(bi<0) printf("\n****** sgecp_libstr : bi<0 : %d<0 *****\n", bi);
+ if(bj<0) printf("\n****** sgecp_libstr : bj<0 : %d<0 *****\n", bj);
+ // inside matrix
+ // A: m x n
+ if(ai+m > sA->m) printf("\n***** sgecp_libstr : ai+m > row(A) : %d+%d > %d *****\n", ai, m, sA->m);
+ if(aj+n > sA->n) printf("\n***** sgecp_libstr : aj+n > col(A) : %d+%d > %d *****\n", aj, n, sA->n);
+ // B: m x n
+ if(bi+m > sB->m) printf("\n***** sgecp_libstr : bi+m > row(B) : %d+%d > %d *****\n", bi, m, sB->m);
+ if(bj+n > sB->n) printf("\n***** sgecp_libstr : bj+n > col(B) : %d+%d > %d *****\n", bj, n, sB->n);
+#endif
+
+ const int bs = 8;
+
+ int sda = sA->cn;
+ int sdb = sB->cn;
+ float *pA = sA->pA + ai/bs*bs*sda + aj*bs;
+ float *pB = sB->pA + bi/bs*bs*sdb + bj*bs;
+ int offsetA = ai%bs;
+ int offsetB = bi%bs;
+
+ int ii, mna;
+
+#if 1
+ if(offsetB>0)
+ {
+ if(offsetB>offsetA)
+ {
+ mna = bs-offsetB;
+ mna = m<mna ? m : mna;
+ kernel_sgecp_8_0_gen_lib8(n, &pA[offsetA], &pB[offsetB], mna);
+ m -= mna;
+ //pA += 8*sda;
+ pB += 8*sdb;
+ }
+ else
+ {
+ if(offsetA==0)
+ {
+ mna = bs-offsetB;
+ mna = m<mna ? m : mna;
+ kernel_sgecp_8_0_gen_lib8(n, &pA[0], &pB[offsetB], mna);
+ m -= mna;
+ pA += 8*sda;
+ pB += 8*sdb;
+ }
+ else if(offsetA==1)
+ {
+ mna = bs-offsetB;
+ mna = m<mna ? m : mna;
+ kernel_sgecp_8_1_gen_lib8(n, &pA[0], sda, &pB[offsetB], mna);
+ m -= mna;
+ pA += 8*sda;
+ pB += 8*sdb;
+ }
+ else if(offsetA==2)
+ {
+ mna = bs-offsetB;
+ mna = m<mna ? m : mna;
+ kernel_sgecp_8_2_gen_lib8(n, &pA[0], sda, &pB[offsetB], mna);
+ m -= mna;
+ pA += 8*sda;
+ pB += 8*sdb;
+ }
+ else if(offsetA==3)
+ {
+ mna = bs-offsetB;
+ mna = m<mna ? m : mna;
+ kernel_sgecp_8_3_gen_lib8(n, &pA[0], sda, &pB[offsetB], mna);
+ m -= mna;
+ pA += 8*sda;
+ pB += 8*sdb;
+ }
+ else if(offsetA==4)
+ {
+ mna = bs-offsetB;
+ mna = m<mna ? m : mna;
+ kernel_sgecp_8_4_gen_lib8(n, &pA[0], sda, &pB[offsetB], mna);
+ m -= mna;
+ pA += 8*sda;
+ pB += 8*sdb;
+ }
+ else if(offsetA==5)
+ {
+ mna = bs-offsetB;
+ mna = m<mna ? m : mna;
+ kernel_sgecp_8_5_gen_lib8(n, &pA[0], sda, &pB[offsetB], mna);
+ m -= mna;
+ pA += 8*sda;
+ pB += 8*sdb;
+ }
+ else if(offsetA==6)
+ {
+ mna = bs-offsetB;
+ mna = m<mna ? m : mna;
+ kernel_sgecp_8_6_gen_lib8(n, &pA[0], sda, &pB[offsetB], mna);
+ m -= mna;
+ pA += 8*sda;
+ pB += 8*sdb;
+ }
+ else if(offsetA==7)
+ {
+ mna = bs-offsetB;
+ mna = m<mna ? m : mna;
+ kernel_sgecp_8_7_gen_lib8(n, &pA[0], sda, &pB[offsetB], mna);
+ m -= mna;
+ pA += 8*sda;
+ pB += 8*sdb;
+ }
+ }
+ }
+#endif
+
+ // same alignment
+ if(offsetA==offsetB)
+ {
+ ii = 0;
+ for( ; ii<m-7; ii+=8)
+ {
+ kernel_sgecp_8_0_lib8(n, pA, pB);
+ pA += 8*sda;
+ pB += 8*sdb;
+ }
+ if(ii<m)
+ {
+ kernel_sgecp_8_0_gen_lib8(n, pA, pB, m-ii);
+ }
+ return;
+ }
+ // XXX different alignment: search tree ???
+ // skip one element of A
+ else if(offsetA==(offsetB+1)%bs)
+ {
+ ii = 0;
+ for( ; ii<m-7; ii+=8)
+ {
+ kernel_sgecp_8_1_lib8(n, pA, sda, pB);
+ pA += 8*sda;
+ pB += 8*sdb;
+ }
+ if(ii<m)
+ {
+ kernel_sgecp_8_1_gen_lib8(n, pA, sda, pB, m-ii);
+ }
+ }
+ // skip two elements of A
+ else if(offsetA==(offsetB+2)%bs)
+ {
+ ii = 0;
+ for( ; ii<m-7; ii+=8)
+ {
+ kernel_sgecp_8_2_lib8(n, pA, sda, pB);
+ pA += 8*sda;
+ pB += 8*sdb;
+ }
+ if(ii<m)
+ {
+ kernel_sgecp_8_2_gen_lib8(n, pA, sda, pB, m-ii);
+ }
+ return;
+ }
+ // skip three elements of A
+ else if(offsetA==(offsetB+3)%bs)
+ {
+ ii = 0;
+ for( ; ii<m-7; ii+=8)
+ {
+ kernel_sgecp_8_3_lib8(n, pA, sda, pB);
+ pA += 8*sda;
+ pB += 8*sdb;
+ }
+ if(ii<m)
+ {
+ kernel_sgecp_8_3_gen_lib8(n, pA, sda, pB, m-ii);
+ }
+ return;
+ }
+ // skip four elements of A
+ else if(offsetA==(offsetB+4)%bs)
+ {
+ ii = 0;
+ for( ; ii<m-7; ii+=8)
+ {
+ kernel_sgecp_8_4_lib8(n, pA, sda, pB);
+ pA += 8*sda;
+ pB += 8*sdb;
+ }
+ if(ii<m)
+ {
+ kernel_sgecp_8_4_gen_lib8(n, pA, sda, pB, m-ii);
+ }
+ return;
+ }
+ // skip five elements of A
+ else if(offsetA==(offsetB+5)%bs)
+ {
+ ii = 0;
+ for( ; ii<m-7; ii+=8)
+ {
+ kernel_sgecp_8_5_lib8(n, pA, sda, pB);
+ pA += 8*sda;
+ pB += 8*sdb;
+ }
+ if(ii<m)
+ {
+ kernel_sgecp_8_5_gen_lib8(n, pA, sda, pB, m-ii);
+ }
+ return;
+ }
+ // skip six elements of A
+ else if(offsetA==(offsetB+6)%bs)
+ {
+ ii = 0;
+ for( ; ii<m-7; ii+=8)
+ {
+ kernel_sgecp_8_6_lib8(n, pA, sda, pB);
+ pA += 8*sda;
+ pB += 8*sdb;
+ }
+ if(ii<m)
+ {
+ kernel_sgecp_8_6_gen_lib8(n, pA, sda, pB, m-ii);
+ }
+ return;
+ }
+ // skip seven elements of A
+ else //if(offsetA==(offsetB+7)%bs)
+ {
+ ii = 0;
+ for( ; ii<m-7; ii+=8)
+ {
+ kernel_sgecp_8_7_lib8(n, pA, sda, pB);
+ pA += 8*sda;
+ pB += 8*sdb;
+ }
+ if(ii<m)
+ {
+ kernel_sgecp_8_7_gen_lib8(n, pA, sda, pB, m-ii);
+ }
+ return;
+ }
+
+ return;
+
+ }
+
+
+
+// scale a strvec
+void svecsc_libstr(int m, float alpha, struct s_strvec *sa, int ai)
+ {
+ float *pa = sa->pa + ai;
+ int ii;
+ ii = 0;
+ for(; ii<m-3; ii+=4)
+ {
+ pa[ii+0] *= alpha;
+ pa[ii+1] *= alpha;
+ pa[ii+2] *= alpha;
+ pa[ii+3] *= alpha;
+ }
+ for(; ii<m; ii++)
+ {
+ pa[ii+0] *= alpha;
+ }
+ return;
+ }
+
+
+
+// copy a strvec into a strvec
+void sveccp_libstr(int m, struct s_strvec *sa, int ai, struct s_strvec *sc, int ci)
+ {
+ float *pa = sa->pa + ai;
+ float *pc = sc->pa + ci;
+ int ii;
+ ii = 0;
+ for(; ii<m-3; ii+=4)
+ {
+ pc[ii+0] = pa[ii+0];
+ pc[ii+1] = pa[ii+1];
+ pc[ii+2] = pa[ii+2];
+ pc[ii+3] = pa[ii+3];
+ }
+ for(; ii<m; ii++)
+ {
+ pc[ii+0] = pa[ii+0];
+ }
+ return;
+ }
+
+
+
+// copy a lower triangular strmat into a lower triangular strmat
+void strcp_l_libstr(int m, struct s_strmat *sA, int ai, int aj, struct s_strmat *sC, int ci, int cj)
+ {
+ const int bs = 8;
+ int sda = sA->cn;
+ float *pA = sA->pA + ai/bs*bs*sda + ai%bs + aj*bs;
+ int sdc = sC->cn;
+ float *pC = sC->pA + ci/bs*bs*sdc + ci%bs + cj*bs;
+ strcp_l_lib(m, ai%bs, pA, sda, ci%bs, pC, sdc);
+ // XXX uses full matrix copy !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
+// sgecp_libstr(m, m, sA, ai, aj, sC, ci, cj);
+ return;
+ }
+
+
+
+// scale and add a generic strmat into a generic strmat
+void sgead_libstr(int m, int n, float alpha, struct s_strmat *sA, int ai, int aj, struct s_strmat *sB, int bi, int bj)
+ {
+
+ // early return
+ if(m==0 | n==0)
+ return;
+
+#if defined(DIM_CHECK)
+ // non-negative size
+ if(m<0) printf("\n****** sgead_libstr : m<0 : %d<0 *****\n", m);
+ if(n<0) printf("\n****** sgead_libstr : n<0 : %d<0 *****\n", n);
+ // non-negative offset
+ if(ai<0) printf("\n****** sgead_libstr : ai<0 : %d<0 *****\n", ai);
+ if(aj<0) printf("\n****** sgead_libstr : aj<0 : %d<0 *****\n", aj);
+ if(bi<0) printf("\n****** sgead_libstr : bi<0 : %d<0 *****\n", bi);
+ if(bj<0) printf("\n****** sgead_libstr : bj<0 : %d<0 *****\n", bj);
+ // inside matrix
+ // A: m x n
+ if(ai+m > sA->m) printf("\n***** sgead_libstr : ai+m > row(A) : %d+%d > %d *****\n", ai, m, sA->m);
+ if(aj+n > sA->n) printf("\n***** sgead_libstr : aj+n > col(A) : %d+%d > %d *****\n", aj, n, sA->n);
+ // B: m x n
+ if(bi+m > sB->m) printf("\n***** sgead_libstr : bi+m > row(B) : %d+%d > %d *****\n", bi, m, sB->m);
+ if(bj+n > sB->n) printf("\n***** sgead_libstr : bj+n > col(B) : %d+%d > %d *****\n", bj, n, sB->n);
+#endif
+
+ const int bs = 8;
+
+ int sda = sA->cn;
+ int sdb = sB->cn;
+ float *pA = sA->pA + ai/bs*bs*sda + aj*bs;
+ float *pB = sB->pA + bi/bs*bs*sdb + bj*bs;
+ int offsetA = ai%bs;
+ int offsetB = bi%bs;
+
+ int ii, mna;
+
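+	// if B does not start at a panel boundary (offsetB>0), first process the leading
+	// bs-offsetB rows with a _gen kernel so the main loops below work on full 8-row panels of B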
+#if 1
+ if(offsetB>0)
+ {
+ if(offsetB>offsetA)
+ {
+ mna = bs-offsetB;
+ mna = m<mna ? m : mna;
+ kernel_sgead_8_0_gen_lib8(n, &alpha, &pA[offsetA], &pB[offsetB], mna);
+ m -= mna;
+ //pA += 8*sda;
+ pB += 8*sdb;
+ }
+ else
+ {
+ if(offsetA==0)
+ {
+ mna = bs-offsetB;
+ mna = m<mna ? m : mna;
+ kernel_sgead_8_0_gen_lib8(n, &alpha, &pA[0], &pB[offsetB], mna);
+ m -= mna;
+ pA += 8*sda;
+ pB += 8*sdb;
+ }
+ else if(offsetA==1)
+ {
+ mna = bs-offsetB;
+ mna = m<mna ? m : mna;
+ kernel_sgead_8_1_gen_lib8(n, &alpha, &pA[0], sda, &pB[offsetB], mna);
+ m -= mna;
+ pA += 8*sda;
+ pB += 8*sdb;
+ }
+ else if(offsetA==2)
+ {
+ mna = bs-offsetB;
+ mna = m<mna ? m : mna;
+ kernel_sgead_8_2_gen_lib8(n, &alpha, &pA[0], sda, &pB[offsetB], mna);
+ m -= mna;
+ pA += 8*sda;
+ pB += 8*sdb;
+ }
+ else if(offsetA==3)
+ {
+ mna = bs-offsetB;
+ mna = m<mna ? m : mna;
+ kernel_sgead_8_3_gen_lib8(n, &alpha, &pA[0], sda, &pB[offsetB], mna);
+ m -= mna;
+ pA += 8*sda;
+ pB += 8*sdb;
+ }
+ else if(offsetA==4)
+ {
+ mna = bs-offsetB;
+ mna = m<mna ? m : mna;
+ kernel_sgead_8_4_gen_lib8(n, &alpha, &pA[0], sda, &pB[offsetB], mna);
+ m -= mna;
+ pA += 8*sda;
+ pB += 8*sdb;
+ }
+ else if(offsetA==5)
+ {
+ mna = bs-offsetB;
+ mna = m<mna ? m : mna;
+ kernel_sgead_8_5_gen_lib8(n, &alpha, &pA[0], sda, &pB[offsetB], mna);
+ m -= mna;
+ pA += 8*sda;
+ pB += 8*sdb;
+ }
+ else if(offsetA==6)
+ {
+ mna = bs-offsetB;
+ mna = m<mna ? m : mna;
+ kernel_sgead_8_6_gen_lib8(n, &alpha, &pA[0], sda, &pB[offsetB], mna);
+ m -= mna;
+ pA += 8*sda;
+ pB += 8*sdb;
+ }
+ else if(offsetA==7)
+ {
+ mna = bs-offsetB;
+ mna = m<mna ? m : mna;
+ kernel_sgead_8_7_gen_lib8(n, &alpha, &pA[0], sda, &pB[offsetB], mna);
+ m -= mna;
+ pA += 8*sda;
+ pB += 8*sdb;
+ }
+ }
+ }
+#endif
+
+ // same alignment
+ if(offsetA==offsetB)
+ {
+ ii = 0;
+ for( ; ii<m-7; ii+=8)
+ {
+ kernel_sgead_8_0_lib8(n, &alpha, pA, pB);
+ pA += 8*sda;
+ pB += 8*sdb;
+ }
+ if(ii<m)
+ {
+ kernel_sgead_8_0_gen_lib8(n, &alpha, pA, pB, m-ii);
+ }
+ return;
+ }
+ // XXX different alignment: search tree ???
+ // skip one element of A
+ else if(offsetA==(offsetB+1)%bs)
+ {
+ ii = 0;
+ for( ; ii<m-7; ii+=8)
+ {
+ kernel_sgead_8_1_lib8(n, &alpha, pA, sda, pB);
+ pA += 8*sda;
+ pB += 8*sdb;
+ }
+ if(ii<m)
+ {
+ kernel_sgead_8_1_gen_lib8(n, &alpha, pA, sda, pB, m-ii);
+ }
+	return;
+	}
+ // skip two elements of A
+ else if(offsetA==(offsetB+2)%bs)
+ {
+ ii = 0;
+ for( ; ii<m-7; ii+=8)
+ {
+ kernel_sgead_8_2_lib8(n, &alpha, pA, sda, pB);
+ pA += 8*sda;
+ pB += 8*sdb;
+ }
+ if(ii<m)
+ {
+ kernel_sgead_8_2_gen_lib8(n, &alpha, pA, sda, pB, m-ii);
+ }
+ return;
+ }
+ // skip three elements of A
+ else if(offsetA==(offsetB+3)%bs)
+ {
+ ii = 0;
+ for( ; ii<m-7; ii+=8)
+ {
+ kernel_sgead_8_3_lib8(n, &alpha, pA, sda, pB);
+ pA += 8*sda;
+ pB += 8*sdb;
+ }
+ if(ii<m)
+ {
+ kernel_sgead_8_3_gen_lib8(n, &alpha, pA, sda, pB, m-ii);
+ }
+ return;
+ }
+ // skip four elements of A
+ else if(offsetA==(offsetB+4)%bs)
+ {
+ ii = 0;
+ for( ; ii<m-7; ii+=8)
+ {
+ kernel_sgead_8_4_lib8(n, &alpha, pA, sda, pB);
+ pA += 8*sda;
+ pB += 8*sdb;
+ }
+ if(ii<m)
+ {
+ kernel_sgead_8_4_gen_lib8(n, &alpha, pA, sda, pB, m-ii);
+ }
+ return;
+ }
+ // skip five elements of A
+ else if(offsetA==(offsetB+5)%bs)
+ {
+ ii = 0;
+ for( ; ii<m-7; ii+=8)
+ {
+ kernel_sgead_8_5_lib8(n, &alpha, pA, sda, pB);
+ pA += 8*sda;
+ pB += 8*sdb;
+ }
+ if(ii<m)
+ {
+ kernel_sgead_8_5_gen_lib8(n, &alpha, pA, sda, pB, m-ii);
+ }
+ return;
+ }
+ // skip six elements of A
+ else if(offsetA==(offsetB+6)%bs)
+ {
+ ii = 0;
+ for( ; ii<m-7; ii+=8)
+ {
+ kernel_sgead_8_6_lib8(n, &alpha, pA, sda, pB);
+ pA += 8*sda;
+ pB += 8*sdb;
+ }
+ if(ii<m)
+ {
+ kernel_sgead_8_6_gen_lib8(n, &alpha, pA, sda, pB, m-ii);
+ }
+ return;
+ }
+ // skip seven elements of A
+ else //if(offsetA==(offsetB+7)%bs)
+ {
+ ii = 0;
+ for( ; ii<m-7; ii+=8)
+ {
+ kernel_sgead_8_7_lib8(n, &alpha, pA, sda, pB);
+ pA += 8*sda;
+ pB += 8*sdb;
+ }
+ if(ii<m)
+ {
+ kernel_sgead_8_7_gen_lib8(n, &alpha, pA, sda, pB, m-ii);
+ }
+ return;
+ }
+
+ return;
+
+ }
+
+
+
+// copy and transpose a generic strmat into a generic strmat
+void sgetr_libstr(int m, int n, struct s_strmat *sA, int ai, int aj, struct s_strmat *sB, int bi, int bj)
+ {
+
+ // early return
+ if(m==0 | n==0)
+ return;
+
+#if defined(DIM_CHECK)
+ // non-negative size
+ if(m<0) printf("\n****** sgetr_libstr : m<0 : %d<0 *****\n", m);
+ if(n<0) printf("\n****** sgetr_libstr : n<0 : %d<0 *****\n", n);
+ // non-negative offset
+ if(ai<0) printf("\n****** sgetr_libstr : ai<0 : %d<0 *****\n", ai);
+ if(aj<0) printf("\n****** sgetr_libstr : aj<0 : %d<0 *****\n", aj);
+ if(bi<0) printf("\n****** sgetr_libstr : bi<0 : %d<0 *****\n", bi);
+ if(bj<0) printf("\n****** sgetr_libstr : bj<0 : %d<0 *****\n", bj);
+ // inside matrix
+ // A: m x n
+ if(ai+m > sA->m) printf("\n***** sgetr_libstr : ai+m > row(A) : %d+%d > %d *****\n", ai, m, sA->m);
+ if(aj+n > sA->n) printf("\n***** sgetr_libstr : aj+n > col(A) : %d+%d > %d *****\n", aj, n, sA->n);
+ // B: n x m
+ if(bi+n > sB->m) printf("\n***** sgetr_libstr : bi+n > row(B) : %d+%d > %d *****\n", bi, n, sB->m);
+ if(bj+m > sB->n) printf("\n***** sgetr_libstr : bj+m > col(B) : %d+%d > %d *****\n", bj, m, sB->n);
+#endif
+
+ const int bs = 8;
+
+ int sda = sA->cn;
+ int sdb = sB->cn;
+ float *pA = sA->pA + ai/bs*bs*sda + aj*bs;
+ float *pB = sB->pA + bi/bs*bs*sdb + bj*bs;
+ int offsetA = ai%bs;
+ int offsetB = bi%bs;
+
+ int ii, nna;
+
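+	// dispatch on offsetA (row offset of A inside its panel): each kernel_sgetr_8_X_lib8 call
+	// transposes an 8-column slab of A into an 8-row panel of B; a leading _gen call aligns B
+	// when offsetB>0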
+ if(offsetA==0)
+ {
+ if(offsetB>0)
+ {
+ nna = bs-offsetB;
+ nna = n<nna ? n : nna;
+ kernel_sgetr_8_0_gen_lib8(m, &pA[0], sda, &pB[offsetB], nna);
+ n -= nna;
+ pA += nna*bs;
+ pB += 8*sdb;
+ }
+ for(ii=0; ii<n-7; ii+=8)
+ {
+ kernel_sgetr_8_0_lib8(m, &pA[ii*bs], sda, &pB[ii*sdb]);
+ }
+ if(ii<n)
+ {
+ kernel_sgetr_8_0_gen_lib8(m, &pA[ii*bs], sda, &pB[ii*sdb], n-ii);
+ }
+ }
+	// TODO log search for offsetA>0 ???
+ else if(offsetA==1)
+ {
+ if(offsetB>0)
+ {
+ nna = bs-offsetB;
+ nna = n<nna ? n : nna;
+ kernel_sgetr_8_1_gen_lib8(m, &pA[0], sda, &pB[offsetB], nna);
+ n -= nna;
+ pA += nna*bs;
+ pB += 8*sdb;
+ }
+ for(ii=0; ii<n-7; ii+=8)
+ {
+ kernel_sgetr_8_1_lib8(m, &pA[ii*bs], sda, &pB[ii*sdb]);
+ }
+ if(ii<n)
+ {
+ kernel_sgetr_8_1_gen_lib8(m, &pA[ii*bs], sda, &pB[ii*sdb], n-ii);
+ }
+ }
+ else if(offsetA==2)
+ {
+ ii = 0;
+ if(offsetB>0)
+ {
+ nna = bs-offsetB;
+ nna = n<nna ? n : nna;
+ kernel_sgetr_8_2_gen_lib8(m, &pA[0], sda, &pB[offsetB], nna);
+ n -= nna;
+ pA += nna*bs;
+ pB += 8*sdb;
+ }
+ for( ; ii<n-7; ii+=8)
+ {
+ kernel_sgetr_8_2_lib8(m, &pA[ii*bs], sda, &pB[ii*sdb]);
+ }
+ if(ii<n)
+ {
+ kernel_sgetr_8_2_gen_lib8(m, &pA[ii*bs], sda, &pB[ii*sdb], n-ii);
+ }
+ }
+ else if(offsetA==3)
+ {
+ ii = 0;
+ if(offsetB>0)
+ {
+ nna = bs-offsetB;
+ nna = n<nna ? n : nna;
+ kernel_sgetr_8_3_gen_lib8(m, &pA[0], sda, &pB[offsetB], nna);
+ n -= nna;
+ pA += nna*bs;
+ pB += 8*sdb;
+ }
+ for( ; ii<n-7; ii+=8)
+ {
+ kernel_sgetr_8_3_lib8(m, &pA[ii*bs], sda, &pB[ii*sdb]);
+ }
+ if(ii<n)
+ {
+ kernel_sgetr_8_3_gen_lib8(m, &pA[ii*bs], sda, &pB[ii*sdb], n-ii);
+ }
+ }
+ else if(offsetA==4)
+ {
+ ii = 0;
+ if(offsetB>0)
+ {
+ nna = bs-offsetB;
+ nna = n<nna ? n : nna;
+ kernel_sgetr_8_4_gen_lib8(m, &pA[0], sda, &pB[offsetB], nna);
+ n -= nna;
+ pA += nna*bs;
+ pB += 8*sdb;
+ }
+ for( ; ii<n-7; ii+=8)
+ {
+ kernel_sgetr_8_4_lib8(m, &pA[ii*bs], sda, &pB[ii*sdb]);
+ }
+ if(ii<n)
+ {
+ kernel_sgetr_8_4_gen_lib8(m, &pA[ii*bs], sda, &pB[ii*sdb], n-ii);
+ }
+ }
+ else if(offsetA==5)
+ {
+ ii = 0;
+ if(offsetB>0)
+ {
+ nna = bs-offsetB;
+ nna = n<nna ? n : nna;
+ kernel_sgetr_8_5_gen_lib8(m, &pA[0], sda, &pB[offsetB], nna);
+ n -= nna;
+ pA += nna*bs;
+ pB += 8*sdb;
+ }
+ for( ; ii<n-7; ii+=8)
+ {
+ kernel_sgetr_8_5_lib8(m, &pA[ii*bs], sda, &pB[ii*sdb]);
+ }
+ if(ii<n)
+ {
+ kernel_sgetr_8_5_gen_lib8(m, &pA[ii*bs], sda, &pB[ii*sdb], n-ii);
+ }
+ }
+ else if(offsetA==6)
+ {
+ ii = 0;
+ if(offsetB>0)
+ {
+ nna = bs-offsetB;
+ nna = n<nna ? n : nna;
+ kernel_sgetr_8_6_gen_lib8(m, &pA[0], sda, &pB[offsetB], nna);
+ n -= nna;
+ pA += nna*bs;
+ pB += 8*sdb;
+ }
+ for( ; ii<n-7; ii+=8)
+ {
+ kernel_sgetr_8_6_lib8(m, &pA[ii*bs], sda, &pB[ii*sdb]);
+ }
+ if(ii<n)
+ {
+ kernel_sgetr_8_6_gen_lib8(m, &pA[ii*bs], sda, &pB[ii*sdb], n-ii);
+ }
+ }
+ else if(offsetA==7)
+ {
+ ii = 0;
+ if(offsetB>0)
+ {
+ nna = bs-offsetB;
+ nna = n<nna ? n : nna;
+ kernel_sgetr_8_7_gen_lib8(m, &pA[0], sda, &pB[offsetB], nna);
+ n -= nna;
+ pA += nna*bs;
+ pB += 8*sdb;
+ }
+ for( ; ii<n-7; ii+=8)
+ {
+ kernel_sgetr_8_7_lib8(m, &pA[ii*bs], sda, &pB[ii*sdb]);
+ }
+ if(ii<n)
+ {
+ kernel_sgetr_8_7_gen_lib8(m, &pA[ii*bs], sda, &pB[ii*sdb], n-ii);
+ }
+ }
+
+ return;
+
+ }
+
+
+
+// copy and transpose a lower triangular strmat into an upper triangular strmat
+void strtr_l_libstr(int m, struct s_strmat *sA, int ai, int aj, struct s_strmat *sC, int ci, int cj)
+ {
+ const int bs = 8;
+ int sda = sA->cn;
+ float *pA = sA->pA + ai/bs*bs*sda + ai%bs + aj*bs;
+ int sdc = sC->cn;
+ float *pC = sC->pA + ci/bs*bs*sdc + ci%bs + cj*bs;
+ strtr_l_lib(m, 1.0, ai%bs, pA, sda, ci%bs, pC, sdc); // TODO remove alpha !!!
+ return;
+ }
+
+
+
+// copy and transpose an upper triangular strmat into a lower triangular strmat
+void strtr_u_libstr(int m, struct s_strmat *sA, int ai, int aj, struct s_strmat *sC, int ci, int cj)
+ {
+ const int bs = 8;
+ int sda = sA->cn;
+ float *pA = sA->pA + ai/bs*bs*sda + ai%bs + aj*bs;
+ int sdc = sC->cn;
+ float *pC = sC->pA + ci/bs*bs*sdc + ci%bs + cj*bs;
+ strtr_u_lib(m, 1.0, ai%bs, pA, sda, ci%bs, pC, sdc); // TODO remove alpha !!!
+ return;
+ }
+
+
+
+// insert a scaled strvec into the diagonal of a strmat, sparse formulation
+void sdiain_sp_libstr(int kmax, float alpha, struct s_strvec *sx, int xi, int *idx, struct s_strmat *sD, int di, int dj)
+ {
+ const int bs = 8;
+ float *x = sx->pa + xi;
+ int sdd = sD->cn;
+ float *pD = sD->pA;
+ int ii, jj;
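+	// panel-major (lib8) addressing: element (i,j) of D is stored at pD[(i/bs)*bs*sdd + i%bs + j*bs]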
+ for(jj=0; jj<kmax; jj++)
+ {
+ ii = idx[jj];
+ pD[(ii+di)/bs*bs*sdd+(ii+di)%bs+(ii+dj)*bs] = alpha * x[jj];
+ }
+ return;
+ }
+
+
+
+// extract and scale the diagonal of a strmat into a strvec, sparse formulation
+void sdiaex_sp_libstr(int kmax, float alpha, int *idx, struct s_strmat *sD, int di, int dj, struct s_strvec *sx, int xi)
+ {
+ const int bs = 8;
+ float *x = sx->pa + xi;
+ int sdd = sD->cn;
+ float *pD = sD->pA;
+ int ii, jj;
+ for(jj=0; jj<kmax; jj++)
+ {
+ ii = idx[jj];
+ x[jj] = alpha * pD[(ii+di)/bs*bs*sdd+(ii+di)%bs+(ii+dj)*bs];
+ }
+ return;
+ }
+
+
+
+// add scaled strvec to diagonal of strmat, sparse formulation
+void sdiaad_sp_libstr(int kmax, float alpha, struct s_strvec *sx, int xi, int *idx, struct s_strmat *sD, int di, int dj)
+ {
+ const int bs = 8;
+ float *x = sx->pa + xi;
+ int sdd = sD->cn;
+ float *pD = sD->pA;
+ int ii, jj;
+ for(jj=0; jj<kmax; jj++)
+ {
+ ii = idx[jj];
+ pD[(ii+di)/bs*bs*sdd+(ii+di)%bs+(ii+dj)*bs] += alpha * x[jj];
+ }
+ return;
+ }
+
+
+
+// add scaled strvec to another strvec and insert to diagonal of strmat, sparse formulation
+void sdiaadin_sp_libstr(int kmax, float alpha, struct s_strvec *sx, int xi, struct s_strvec *sy, int yi, int *idx, struct s_strmat *sD, int di, int dj)
+ {
+ const int bs = 8;
+ float *x = sx->pa + xi;
+ float *y = sy->pa + yi;
+ int sdd = sD->cn;
+ float *pD = sD->pA;
+ int ii, jj;
+ for(jj=0; jj<kmax; jj++)
+ {
+ ii = idx[jj];
+ pD[(ii+di)/bs*bs*sdd+(ii+di)%bs+(ii+dj)*bs] = y[jj] + alpha * x[jj];
+ }
+ return;
+ }
+
+
+
+// add scaled strvec to row of strmat, sparse formulation
+void srowad_sp_libstr(int kmax, float alpha, struct s_strvec *sx, int xi, int *idx, struct s_strmat *sD, int di, int dj)
+ {
+ const int bs = 8;
+ float *x = sx->pa + xi;
+ int sdd = sD->cn;
+ float *pD = sD->pA + di/bs*bs*sdd + di%bs + dj*bs;
+ srowad_libsp(kmax, idx, alpha, x, pD);
+ return;
+ }
+
+
+
+// add a scaled strvec to a strvec, sparse formulation
+void svecad_sp_libstr(int kmax, float alpha, struct s_strvec *sx, int xi, int *idx, struct s_strvec *sy, int yi)
+ {
+ float *x = sx->pa + xi;
+ float *y = sy->pa + yi;
+ svecad_libsp(kmax, idx, alpha, x, y);
+ return;
+ }
+
+
+
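+// insert a scaled strvec into a strvec, sparse formulation (z[idx[ii]] = alpha*x[ii])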
+void svecin_sp_libstr(int m, float alpha, struct s_strvec *sx, int xi, int *idx, struct s_strvec *sz, int zi)
+ {
+ float *x = sx->pa + xi;
+ float *z = sz->pa + zi;
+ int ii;
+ for(ii=0; ii<m; ii++)
+ z[idx[ii]] = alpha * x[ii];
+ return;
+ }
+
+
+
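+// extract a scaled strvec from a strvec, sparse formulation (z[ii] = alpha*x[idx[ii]])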
+void svecex_sp_libstr(int m, float alpha, int *idx, struct s_strvec *sx, int xi, struct s_strvec *sz, int zi)
+ {
+ float *x = sx->pa + xi;
+ float *z = sz->pa + zi;
+ int ii;
+ for(ii=0; ii<m; ii++)
+ z[ii] = alpha * x[idx[ii]];
+ return;
+ }
+
+
+
+#else
+
+#error : wrong LA choice
+
+#endif
+
+
+
+
diff --git a/auxiliary/v_aux_ext_dep_lib.c b/auxiliary/v_aux_ext_dep_lib.c
new file mode 100644
index 0000000..3bf5f90
--- /dev/null
+++ b/auxiliary/v_aux_ext_dep_lib.c
@@ -0,0 +1,138 @@
+/**************************************************************************************************
+* *
+* This file is part of BLASFEO. *
+* *
+* BLASFEO -- BLAS For Embedded Optimization. *
+* Copyright (C) 2016-2017 by Gianluca Frison. *
+* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl. *
+* All rights reserved. *
+* *
+* HPMPC is free software; you can redistribute it and/or *
+* modify it under the terms of the GNU Lesser General Public *
+* License as published by the Free Software Foundation; either *
+* version 2.1 of the License, or (at your option) any later version. *
+* *
+* HPMPC is distributed in the hope that it will be useful, *
+* but WITHOUT ANY WARRANTY; without even the implied warranty of *
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. *
+* See the GNU Lesser General Public License for more details. *
+* *
+* You should have received a copy of the GNU Lesser General Public *
+* License along with HPMPC; if not, write to the Free Software *
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA *
+* *
+* Author: Gianluca Frison, giaf (at) dtu.dk *
+* gianluca.frison (at) imtek.uni-freiburg.de *
+* *
+**************************************************************************************************/
+
+#include <stdlib.h>
+#include <stdio.h>
+#if defined(OS_WINDOWS)
+#include <malloc.h>	// _aligned_malloc / _aligned_free
+#endif
+
+
+
+/* allocates and zero-initializes a buffer of the given size in bytes */
+void v_zeros(void **ptrA, int size)
+ {
+ *ptrA = (void *) malloc(size);
+ char *A = *ptrA;
+ int i;
+ for(i=0; i<size; i++) A[i] = 0;
+ }
+
+
+
+/* allocates and zero-initializes a 64-byte (cache-line) aligned buffer of the given size in bytes */
+void v_zeros_align(void **ptrA, int size)
+ {
+#if defined(OS_WINDOWS)
+ *ptrA = _aligned_malloc( size, 64 );
+#else
+ int err = posix_memalign(ptrA, 64, size);
+ if(err!=0)
+ {
+		printf("Memory allocation error\n");
+ exit(1);
+ }
+#endif
+ char *A = *ptrA;
+ int i;
+ for(i=0; i<size; i++) A[i] = 0;
+ }
+
+
+
+/* frees matrix */
+void v_free(void *pA)
+ {
+ free( pA );
+ }
+
+
+
+/* frees aligned matrix */
+void v_free_align(void *pA)
+ {
+#if defined(OS_WINDOWS)
+ _aligned_free( pA );
+#else
+ free( pA );
+#endif
+ }
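+
+/* minimal usage sketch (illustrative only, not part of this file):
+*	void *work;
+*	v_zeros_align(&work, 64*64*sizeof(float));	// 64-byte aligned, zero-initialized scratch
+*	// ... use work ...
+*	v_free_align(work);	// aligned allocations must be released with v_free_align
+*/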
+
+
+
+/* allocates and zero-initializes a char buffer of the given size in bytes */
+void c_zeros(char **ptrA, int size)
+ {
+ *ptrA = malloc(size);
+ char *A = *ptrA;
+ int i;
+ for(i=0; i<size; i++) A[i] = 0;
+ }
+
+
+
+/* allocates and zero-initializes a 64-byte (cache-line) aligned char buffer of the given size in bytes */
+void c_zeros_align(char **ptrA, int size)
+ {
+#if defined(OS_WINDOWS)
+ *ptrA = _aligned_malloc( size, 64 );
+#else
+ void *temp;
+ int err = posix_memalign(&temp, 64, size);
+ if(err!=0)
+ {
+		printf("Memory allocation error\n");
+ exit(1);
+ }
+ *ptrA = temp;
+#endif
+ char *A = *ptrA;
+ int i;
+ for(i=0; i<size; i++) A[i] = 0;
+ }
+
+
+
+/* frees matrix */
+void c_free(char *pA)
+ {
+ free( pA );
+ }
+
+
+
+/* frees aligned matrix */
+void c_free_align(char *pA)
+ {
+#if defined(OS_WINDOWS)
+ _aligned_free( pA );
+#else
+ free( pA );
+#endif
+ }
+