Squashed 'third_party/blasfeo/' content from commit 2a828ca

Change-Id: If1c3caa4799b2d4eb287ef83fa17043587ef07a3
git-subtree-dir: third_party/blasfeo
git-subtree-split: 2a828ca5442108c4c58e4b42b061a0469043f6ea
diff --git a/auxiliary/Makefile b/auxiliary/Makefile
new file mode 100644
index 0000000..d1242bd
--- /dev/null
+++ b/auxiliary/Makefile
@@ -0,0 +1,124 @@
+###################################################################################################
+#                                                                                                 #
+# This file is part of BLASFEO.                                                                   #
+#                                                                                                 #
+# BLASFEO -- BLAS For Embedded Optimization.                                                      #
+# Copyright (C) 2016-2017 by Gianluca Frison.                                                     #
+# Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl.              #
+# All rights reserved.                                                                            #
+#                                                                                                 #
+# BLASFEO is free software; you can redistribute it and/or                                        #
+# modify it under the terms of the GNU Lesser General Public                                      #
+# License as published by the Free Software Foundation; either                                    #
+# version 2.1 of the License, or (at your option) any later version.                              #
+#                                                                                                 #
+# BLASFEO is distributed in the hope that it will be useful,                                      #
+# but WITHOUT ANY WARRANTY; without even the implied warranty of                                  #
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.                                            #
+# See the GNU Lesser General Public License for more details.                                     #
+#                                                                                                 #
+# You should have received a copy of the GNU Lesser General Public                                #
+# License along with BLASFEO; if not, write to the Free Software                                  #
+# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA                  #
+#                                                                                                 #
+# Author: Gianluca Frison, giaf (at) dtu.dk                                                       #
+#                          gianluca.frison (at) imtek.uni-freiburg.de                             #
+#                                                                                                 #
+###################################################################################################
+
+include ../Makefile.rule
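+
+# LA (linear algebra implementation) and TARGET (architecture) are defined in
+# ../Makefile.rule and select which auxiliary kernels are compiled below; as
+# usual for make, they can also be overridden on the command line, e.g.
+#   make LA=HIGH_PERFORMANCE TARGET=X64_INTEL_HASWELL obj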
+
+OBJS = 
+
+ifeq ($(LA), HIGH_PERFORMANCE)
+
+ifeq ($(TARGET), X64_INTEL_HASWELL)
+OBJS += d_aux_lib4.o
+OBJS += s_aux_lib8.o
+OBJS += m_aux_lib48.o
+endif
+
+ifeq ($(TARGET), X64_INTEL_SANDY_BRIDGE)
+OBJS += d_aux_lib4.o
+OBJS += s_aux_lib8.o
+OBJS += m_aux_lib48.o
+endif
+
+ifeq ($(TARGET), X64_INTEL_CORE)
+OBJS += d_aux_lib4.o
+OBJS += s_aux_lib4.o
+OBJS += m_aux_lib44.o
+endif
+
+ifeq ($(TARGET), X64_AMD_BULLDOZER)
+OBJS += d_aux_lib4.o
+OBJS += s_aux_lib4.o
+OBJS += m_aux_lib44.o
+endif
+
+ifeq ($(TARGET), ARMV8A_ARM_CORTEX_A57)
+OBJS += d_aux_lib4.o
+OBJS += s_aux_lib4.o
+OBJS += m_aux_lib44.o
+endif
+
+ifeq ($(TARGET), ARMV7A_ARM_CORTEX_A15)
+OBJS += d_aux_lib4.o
+OBJS += s_aux_lib4.o
+OBJS += m_aux_lib44.o
+endif
+
+ifeq ($(TARGET), GENERIC)
+OBJS += d_aux_lib4.o
+OBJS += s_aux_lib4.o
+OBJS += m_aux_lib44.o
+endif
+
+else # LA_REFERENCE | LA_BLAS
+
+OBJS += d_aux_lib.o
+OBJS += s_aux_lib.o
+OBJS += m_aux_lib.o
+
+endif # LA choice
+
+ifeq ($(EXT_DEP), 1)
+# objects with external dependencies (typically memory allocation and printing helpers)
+OBJS += d_aux_ext_dep_lib.o
+OBJS += s_aux_ext_dep_lib.o
+OBJS += v_aux_ext_dep_lib.o
+OBJS += i_aux_ext_dep_lib.o 
+endif
+
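+# build the generic objects, then recurse into the instruction-set specific
+# subdirectories (avx2, avx, c99) that the selected TARGET can use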
+obj: $(OBJS)
+ifeq ($(TARGET), X64_INTEL_HASWELL)
+	( cd avx2; $(MAKE) obj)
+	( cd avx; $(MAKE) obj)
+	( cd c99; $(MAKE) obj)
+endif
+ifeq ($(TARGET), X64_INTEL_SANDY_BRIDGE)
+	( cd avx; $(MAKE) obj)
+	( cd c99; $(MAKE) obj)
+endif
+ifeq ($(TARGET), X64_INTEL_CORE)
+	( cd c99; $(MAKE) obj)
+endif
+ifeq ($(TARGET), X64_AMD_BULLDOZER)
+	( cd c99; $(MAKE) obj)
+endif
+ifeq ($(TARGET), ARMV8A_ARM_CORTEX_A57)
+	( cd c99; $(MAKE) obj)
+endif
+ifeq ($(TARGET), ARMV7A_ARM_CORTEX_A15)
+	( cd c99; $(MAKE) obj)
+endif
+ifeq ($(TARGET), GENERIC)
+	( cd c99; $(MAKE) obj)
+endif
+
+
+clean:
+	rm -f *.o
+	make -C avx2 clean
+	make -C avx clean
+	make -C c99 clean
diff --git a/auxiliary/avx/Makefile b/auxiliary/avx/Makefile
new file mode 100644
index 0000000..84e0154
--- /dev/null
+++ b/auxiliary/avx/Makefile
@@ -0,0 +1,50 @@
+###################################################################################################
+#                                                                                                 #
+# This file is part of BLASFEO.                                                                   #
+#                                                                                                 #
+# BLASFEO -- BLAS For Embedded Optimization.                                                      #
+# Copyright (C) 2016-2017 by Gianluca Frison.                                                     #
+# Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl.              #
+# All rights reserved.                                                                            #
+#                                                                                                 #
+# BLASFEO is free software; you can redistribute it and/or                                        #
+# modify it under the terms of the GNU Lesser General Public                                      #
+# License as published by the Free Software Foundation; either                                    #
+# version 2.1 of the License, or (at your option) any later version.                              #
+#                                                                                                 #
+# BLASFEO is distributed in the hope that it will be useful,                                      #
+# but WITHOUT ANY WARRANTY; without even the implied warranty of                                  #
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.                                            #
+# See the GNU Lesser General Public License for more details.                                     #
+#                                                                                                 #
+# You should have received a copy of the GNU Lesser General Public                                #
+# License along with BLASFEO; if not, write to the Free Software                                  #
+# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA                  #
+#                                                                                                 #
+# Author: Gianluca Frison, giaf (at) dtu.dk                                                       #
+#                          gianluca.frison (at) imtek.uni-freiburg.de                             #
+#                                                                                                 #
+###################################################################################################
+
+include ../../Makefile.rule
+
+OBJS = 
+
+ifeq ($(LA), HIGH_PERFORMANCE)
+
+ifeq ($(TARGET), X64_INTEL_HASWELL)
+OBJS += kernel_dgecp_lib4.o 
+endif
+
+ifeq ($(TARGET), X64_INTEL_SANDY_BRIDGE)
+OBJS += kernel_dgecp_lib4.o kernel_dgetr_lib4.o
+endif
+
+else # LA_REFERENCE | LA_BLAS
+
+endif # LA choice
+
+obj: $(OBJS)
+
+clean:
+	rm -f *.o
diff --git a/auxiliary/avx/kernel_dgecp_lib4.c b/auxiliary/avx/kernel_dgecp_lib4.c
new file mode 100644
index 0000000..4bc8c9a
--- /dev/null
+++ b/auxiliary/avx/kernel_dgecp_lib4.c
@@ -0,0 +1,3024 @@
+/**************************************************************************************************
+*                                                                                                 *
+* This file is part of BLASFEO.                                                                   *
+*                                                                                                 *
+* BLASFEO -- BLAS For Embedded Optimization.                                                      *
+* Copyright (C) 2016-2017 by Gianluca Frison.                                                     *
+* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl.              *
+* All rights reserved.                                                                            *
+*                                                                                                 *
+* BLASFEO is free software; you can redistribute it and/or                                        *
+* modify it under the terms of the GNU Lesser General Public                                      *
+* License as published by the Free Software Foundation; either                                    *
+* version 2.1 of the License, or (at your option) any later version.                              *
+*                                                                                                 *
+* BLASFEO is distributed in the hope that it will be useful,                                      *
+* but WITHOUT ANY WARRANTY; without even the implied warranty of                                  *
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.                                            *
+* See the GNU Lesser General Public License for more details.                                     *
+*                                                                                                 *
+* You should have received a copy of the GNU Lesser General Public                                *
+* License along with BLASFEO; if not, write to the Free Software                                  *
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA                  *
+*                                                                                                 *
+* Author: Gianluca Frison, giaf (at) dtu.dk                                                       *
+*                          gianluca.frison (at) imtek.uni-freiburg.de                             *
+*                                                                                                 *
+**************************************************************************************************/
+
+#include <mmintrin.h>
+#include <xmmintrin.h>  // SSE
+#include <emmintrin.h>  // SSE2
+#include <pmmintrin.h>  // SSE3
+#include <smmintrin.h>  // SSE4
+#include <immintrin.h>  // AVX
+
+
+
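+// Layout note (a sketch of the lib4 packed format as used by these kernels):
+// matrices are stored in panels of bs=4 rows, column-major within each panel,
+// so A[i+bs*j] addresses row i, column j of the current panel, and the next
+// panel starts bs*sda (resp. bs*sdb) doubles later. The kernel_dgecp_* below
+// compute B = alpha*A, the kernel_dgead_* compute B += alpha*A.
+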
+// both A and B are aligned to 256-bit boundaries
+void kernel_dgecp_8_0_lib4(int tri, int kmax, double alpha, double *A0, int sda,  double *B0, int sdb)
+	{
+
+	if(tri==1)
+		{
+		// A and B are lower triangular
+		// copy kmax+1 full 8-wide columns, then the trailing 7x7 triangle
+
+		kmax += 1;
+		}
+
+	if(kmax<=0)
+		return;
+	
+	const int bs = 4;
+
+	double *A1 = A0 + bs*sda;
+	double *B1 = B0 + bs*sdb;
+
+	__m256d
+		alpha_0,
+		a_0;
+	
+	__m128d
+		c_0;
+	
+	int k;
+
+	alpha_0 = _mm256_broadcast_sd( &alpha );
+
+	for(k=0; k<kmax-3; k+=4)
+		{
+
+		a_0 = _mm256_load_pd( &A0[0+bs*0] );
+		a_0 = _mm256_mul_pd( alpha_0, a_0 );
+		_mm256_store_pd( &B0[0+bs*0], a_0 );
+
+		a_0 = _mm256_load_pd( &A0[0+bs*1] );
+		a_0 = _mm256_mul_pd( alpha_0, a_0 );
+		_mm256_store_pd( &B0[0+bs*1], a_0 );
+
+		a_0 = _mm256_load_pd( &A0[0+bs*2] );
+		a_0 = _mm256_mul_pd( alpha_0, a_0 );
+		_mm256_store_pd( &B0[0+bs*2], a_0 );
+
+		a_0 = _mm256_load_pd( &A0[0+bs*3] );
+		a_0 = _mm256_mul_pd( alpha_0, a_0 );
+		_mm256_store_pd( &B0[0+bs*3], a_0 );
+
+		A0 += 16;
+		B0 += 16;
+
+		a_0 = _mm256_load_pd( &A1[0+bs*0] );
+		a_0 = _mm256_mul_pd( alpha_0, a_0 );
+		_mm256_store_pd( &B1[0+bs*0], a_0 );
+
+		a_0 = _mm256_load_pd( &A1[0+bs*1] );
+		a_0 = _mm256_mul_pd( alpha_0, a_0 );
+		_mm256_store_pd( &B1[0+bs*1], a_0 );
+
+		a_0 = _mm256_load_pd( &A1[0+bs*2] );
+		a_0 = _mm256_mul_pd( alpha_0, a_0 );
+		_mm256_store_pd( &B1[0+bs*2], a_0 );
+
+		a_0 = _mm256_load_pd( &A1[0+bs*3] );
+		a_0 = _mm256_mul_pd( alpha_0, a_0 );
+		_mm256_store_pd( &B1[0+bs*3], a_0 );
+
+		A1 += 16;
+		B1 += 16;
+
+		}
+	for(; k<kmax; k++)
+		{
+
+		a_0 = _mm256_load_pd( &A0[0+bs*0] );
+		a_0 = _mm256_mul_pd( alpha_0, a_0 );
+		_mm256_store_pd( &B0[0+bs*0], a_0 );
+
+		A0 += 4;
+		B0 += 4;
+
+		a_0 = _mm256_load_pd( &A1[0+bs*0] );
+		a_0 = _mm256_mul_pd( alpha_0, a_0 );
+		_mm256_store_pd( &B1[0+bs*0], a_0 );
+
+		A1 += 4;
+		B1 += 4;
+
+		}
+	
+	if(tri==1)
+		{
+		// 7x7 triangle 
+
+		c_0 = _mm_load_sd( &A0[1+0*bs] );
+		c_0 = _mm_mul_pd( _mm256_castpd256_pd128( alpha_0 ), c_0 );
+		_mm_store_sd( &B0[1+0*bs], c_0 );
+		c_0 = _mm_load_pd( &A0[2+0*bs] );
+		c_0 = _mm_mul_pd( _mm256_castpd256_pd128( alpha_0 ), c_0 );
+		_mm_store_pd( &B0[2+0*bs], c_0 );
+		a_0 = _mm256_load_pd( &A1[0+0*bs] );
+		a_0 = _mm256_mul_pd( alpha_0, a_0 );
+		_mm256_store_pd( &B1[0+0*bs], a_0 );
+
+		c_0 = _mm_load_pd( &A0[2+1*bs] );
+		c_0 = _mm_mul_pd( _mm256_castpd256_pd128( alpha_0 ), c_0 );
+		_mm_store_pd( &B0[2+1*bs], c_0 );
+		a_0 = _mm256_load_pd( &A1[0+1*bs] );
+		a_0 = _mm256_mul_pd( alpha_0, a_0 );
+		_mm256_store_pd( &B1[0+1*bs], a_0 );
+
+		c_0 = _mm_load_sd( &A0[3+2*bs] );
+		c_0 = _mm_mul_pd( _mm256_castpd256_pd128( alpha_0 ), c_0 );
+		_mm_store_sd( &B0[3+2*bs], c_0 );
+		a_0 = _mm256_load_pd( &A1[0+2*bs] );
+		a_0 = _mm256_mul_pd( alpha_0, a_0 );
+		_mm256_store_pd( &B1[0+2*bs], a_0 );
+
+		a_0 = _mm256_load_pd( &A1[0+3*bs] );
+		a_0 = _mm256_mul_pd( alpha_0, a_0 );
+		_mm256_store_pd( &B1[0+3*bs], a_0 );
+
+		c_0 = _mm_load_sd( &A1[1+4*bs] );
+		c_0 = _mm_mul_pd( _mm256_castpd256_pd128( alpha_0 ), c_0 );
+		_mm_store_sd( &B1[1+4*bs], c_0 );
+		c_0 = _mm_load_pd( &A1[2+4*bs] );
+		c_0 = _mm_mul_pd( _mm256_castpd256_pd128( alpha_0 ), c_0 );
+		_mm_store_pd( &B1[2+4*bs], c_0 );
+
+		c_0 = _mm_load_pd( &A1[2+5*bs] );
+		c_0 = _mm_mul_pd( _mm256_castpd256_pd128( alpha_0 ), c_0 );
+		_mm_store_pd( &B1[2+5*bs], c_0 );
+
+		c_0 = _mm_load_sd( &A1[3+6*bs] );
+		c_0 = _mm_mul_pd( _mm256_castpd256_pd128( alpha_0 ), c_0 );
+		_mm_store_sd( &B1[3+6*bs], c_0 );
+
+		}
+
+	}
+
+
+
+// both A and B are aligned to 256-bit boundaries, 1 element of A must be skipped
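+// The 1-row shift across the 4-row panels is done in registers:
+// _mm256_permute2f128_pd(x, y, 0x21) forms [ x_hi | y_lo ], and
+// _mm256_shuffle_pd(..., ..., 0x5) then interleaves the odd/even doubles,
+// so that rows 1..4 of A land in rows 0..3 of the output column.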
+void kernel_dgecp_8_1_lib4(int tri, int kmax, double alpha, double *A0, int sda, double *B0, int sdb)
+	{
+
+	if(tri==1)
+		{
+		// A and B are lower triangular
+		// copy kmax+1 full 8-wide columns, then the trailing 7x7 triangle
+
+		kmax += 1;
+		}
+
+	if(kmax<=0)
+		return;
+
+	const int bs = 4;
+
+	double *A1 = A0 + bs*sda;
+	double *A2 = A1 + bs*sda;
+	double *B1 = B0 + bs*sdb;
+
+	__m256d
+		alpha_0,
+		a_0, a_1, a_2,
+		b_0, b_1;
+	
+	__m128d
+		c_0;
+	
+	int k;
+
+	alpha_0 = _mm256_broadcast_sd( &alpha );
+
+	for(k=0; k<kmax-3; k+=4)
+		{
+
+		a_2 = _mm256_load_pd( &A2[0+bs*0] );
+		a_1 = _mm256_load_pd( &A1[0+bs*0] );
+		a_0 = _mm256_load_pd( &A0[0+bs*0] );
+		a_2 = _mm256_permute2f128_pd( a_1, a_2, 0x21 );
+		b_0 = _mm256_permute2f128_pd( a_0, a_1, 0x21 );
+		b_1 = _mm256_shuffle_pd( a_1, a_2, 0x5 );
+		b_0 = _mm256_shuffle_pd( a_0, b_0, 0x5 );
+		b_0 = _mm256_mul_pd( alpha_0, b_0 );
+		b_1 = _mm256_mul_pd( alpha_0, b_1 );
+		_mm256_store_pd( &B1[0+bs*0], b_1 );
+		_mm256_store_pd( &B0[0+bs*0], b_0 );
+
+		a_2 = _mm256_load_pd( &A2[0+bs*1] );
+		a_1 = _mm256_load_pd( &A1[0+bs*1] );
+		a_0 = _mm256_load_pd( &A0[0+bs*1] );
+		a_2 = _mm256_permute2f128_pd( a_1, a_2, 0x21 );
+		b_0 = _mm256_permute2f128_pd( a_0, a_1, 0x21 );
+		b_1 = _mm256_shuffle_pd( a_1, a_2, 0x5 );
+		b_0 = _mm256_shuffle_pd( a_0, b_0, 0x5 );
+		b_0 = _mm256_mul_pd( alpha_0, b_0 );
+		b_1 = _mm256_mul_pd( alpha_0, b_1 );
+		_mm256_store_pd( &B1[0+bs*1], b_1 );
+		_mm256_store_pd( &B0[0+bs*1], b_0 );
+
+		a_2 = _mm256_load_pd( &A2[0+bs*2] );
+		a_1 = _mm256_load_pd( &A1[0+bs*2] );
+		a_0 = _mm256_load_pd( &A0[0+bs*2] );
+		a_2 = _mm256_permute2f128_pd( a_1, a_2, 0x21 );
+		b_0 = _mm256_permute2f128_pd( a_0, a_1, 0x21 );
+		b_1 = _mm256_shuffle_pd( a_1, a_2, 0x5 );
+		b_0 = _mm256_shuffle_pd( a_0, b_0, 0x5 );
+		b_0 = _mm256_mul_pd( alpha_0, b_0 );
+		b_1 = _mm256_mul_pd( alpha_0, b_1 );
+		_mm256_store_pd( &B1[0+bs*2], b_1 );
+		_mm256_store_pd( &B0[0+bs*2], b_0 );
+
+		a_2 = _mm256_load_pd( &A2[0+bs*3] );
+		a_1 = _mm256_load_pd( &A1[0+bs*3] );
+		a_0 = _mm256_load_pd( &A0[0+bs*3] );
+		a_2 = _mm256_permute2f128_pd( a_1, a_2, 0x21 );
+		b_0 = _mm256_permute2f128_pd( a_0, a_1, 0x21 );
+		b_1 = _mm256_shuffle_pd( a_1, a_2, 0x5 );
+		b_0 = _mm256_shuffle_pd( a_0, b_0, 0x5 );
+		b_0 = _mm256_mul_pd( alpha_0, b_0 );
+		b_1 = _mm256_mul_pd( alpha_0, b_1 );
+		_mm256_store_pd( &B1[0+bs*3], b_1 );
+		_mm256_store_pd( &B0[0+bs*3], b_0 );
+
+		A0 += 16;
+		A1 += 16;
+		A2 += 16;
+		B0 += 16;
+		B1 += 16;
+
+		}
+	for(; k<kmax; k++)
+		{
+
+		a_2 = _mm256_load_pd( &A2[0+bs*0] );
+		a_1 = _mm256_load_pd( &A1[0+bs*0] );
+		a_0 = _mm256_load_pd( &A0[0+bs*0] );
+		a_2 = _mm256_permute2f128_pd( a_1, a_2, 0x21 );
+		b_0 = _mm256_permute2f128_pd( a_0, a_1, 0x21 );
+		b_1 = _mm256_shuffle_pd( a_1, a_2, 0x5 );
+		b_0 = _mm256_shuffle_pd( a_0, b_0, 0x5 );
+		b_0 = _mm256_mul_pd( alpha_0, b_0 );
+		b_1 = _mm256_mul_pd( alpha_0, b_1 );
+		_mm256_store_pd( &B1[0+bs*0], b_1 );
+		_mm256_store_pd( &B0[0+bs*0], b_0 );
+
+		A0 += 4;
+		A1 += 4;
+		A2 += 4;
+		B0 += 4;
+		B1 += 4;
+
+		}
+
+	if(tri==1)
+		{
+		// 7x7 triangle
+
+		c_0 = _mm_load_pd( &A0[2+bs*0] );
+		c_0 = _mm_mul_pd( _mm256_castpd256_pd128( alpha_0 ), c_0 );
+		_mm_storeu_pd( &B0[1+bs*0], c_0 );
+		c_0 = _mm_load_sd( &A1[0+bs*0] );
+		c_0 = _mm_mul_pd( _mm256_castpd256_pd128( alpha_0 ), c_0 );
+		_mm_store_sd( &B0[3+bs*0], c_0 );
+		c_0 = _mm_load_sd( &A1[1+bs*0] );
+		c_0 = _mm_mul_pd( _mm256_castpd256_pd128( alpha_0 ), c_0 );
+		_mm_store_sd( &B1[0+bs*0], c_0 );
+		c_0 = _mm_load_pd( &A1[2+bs*0] );
+		c_0 = _mm_mul_pd( _mm256_castpd256_pd128( alpha_0 ), c_0 );
+		_mm_storeu_pd( &B1[1+bs*0], c_0 );
+		c_0 = _mm_load_sd( &A2[0+bs*0] );
+		c_0 = _mm_mul_pd( _mm256_castpd256_pd128( alpha_0 ), c_0 );
+		_mm_store_sd( &B1[3+bs*0], c_0 );
+
+		c_0 = _mm_load_sd( &A0[3+bs*1] );
+		c_0 = _mm_mul_pd( _mm256_castpd256_pd128( alpha_0 ), c_0 );
+		_mm_store_sd( &B0[2+bs*1], c_0 );
+		c_0 = _mm_load_sd( &A1[0+bs*1] );
+		c_0 = _mm_mul_pd( _mm256_castpd256_pd128( alpha_0 ), c_0 );
+		_mm_store_sd( &B0[3+bs*1], c_0 );
+		c_0 = _mm_load_sd( &A1[1+bs*1] );
+		c_0 = _mm_mul_pd( _mm256_castpd256_pd128( alpha_0 ), c_0 );
+		_mm_store_sd( &B1[0+bs*1], c_0 );
+		c_0 = _mm_load_pd( &A1[2+bs*1] );
+		c_0 = _mm_mul_pd( _mm256_castpd256_pd128( alpha_0 ), c_0 );
+		_mm_storeu_pd( &B1[1+bs*1], c_0 );
+		c_0 = _mm_load_sd( &A2[0+bs*1] );
+		c_0 = _mm_mul_pd( _mm256_castpd256_pd128( alpha_0 ), c_0 );
+		_mm_store_sd( &B1[3+bs*1], c_0 );
+
+		c_0 = _mm_load_sd( &A1[0+bs*2] );
+		c_0 = _mm_mul_pd( _mm256_castpd256_pd128( alpha_0 ), c_0 );
+		_mm_store_sd( &B0[3+bs*2], c_0 );
+		c_0 = _mm_load_sd( &A1[1+bs*2] );
+		c_0 = _mm_mul_pd( _mm256_castpd256_pd128( alpha_0 ), c_0 );
+		_mm_store_sd( &B1[0+bs*2], c_0 );
+		c_0 = _mm_load_pd( &A1[2+bs*2] );
+		c_0 = _mm_mul_pd( _mm256_castpd256_pd128( alpha_0 ), c_0 );
+		_mm_storeu_pd( &B1[1+bs*2], c_0 );
+		c_0 = _mm_load_sd( &A2[0+bs*2] );
+		c_0 = _mm_mul_pd( _mm256_castpd256_pd128( alpha_0 ), c_0 );
+		_mm_store_sd( &B1[3+bs*2], c_0 );
+
+		c_0 = _mm_load_sd( &A1[1+bs*3] );
+		c_0 = _mm_mul_pd( _mm256_castpd256_pd128( alpha_0 ), c_0 );
+		_mm_store_sd( &B1[0+bs*3], c_0 );
+		c_0 = _mm_load_pd( &A1[2+bs*3] );
+		c_0 = _mm_mul_pd( _mm256_castpd256_pd128( alpha_0 ), c_0 );
+		_mm_storeu_pd( &B1[1+bs*3], c_0 );
+		c_0 = _mm_load_sd( &A2[0+bs*3] );
+		c_0 = _mm_mul_pd( _mm256_castpd256_pd128( alpha_0 ), c_0 );
+		_mm_store_sd( &B1[3+bs*3], c_0 );
+
+		c_0 = _mm_load_pd( &A1[2+bs*4] );
+		c_0 = _mm_mul_pd( _mm256_castpd256_pd128( alpha_0 ), c_0 );
+		_mm_storeu_pd( &B1[1+bs*4], c_0 );
+		c_0 = _mm_load_sd( &A2[0+bs*4] );
+		c_0 = _mm_mul_pd( _mm256_castpd256_pd128( alpha_0 ), c_0 );
+		_mm_store_sd( &B1[3+bs*4], c_0 );
+
+		c_0 = _mm_load_sd( &A1[3+bs*5] );
+		c_0 = _mm_mul_pd( _mm256_castpd256_pd128( alpha_0 ), c_0 );
+		_mm_store_sd( &B1[2+bs*5], c_0 );
+		c_0 = _mm_load_sd( &A2[0+bs*5] );
+		c_0 = _mm_mul_pd( _mm256_castpd256_pd128( alpha_0 ), c_0 );
+		_mm_store_sd( &B1[3+bs*5], c_0 );
+
+		c_0 = _mm_load_sd( &A2[0+bs*6] );
+		c_0 = _mm_mul_pd( _mm256_castpd256_pd128( alpha_0 ), c_0 );
+		_mm_store_sd( &B1[3+bs*6], c_0 );
+
+		}
+	}
+
+
+
+// both A and B are aligned to 256-bit boundaries, 2 elements of A must be skipped
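+// A 2-row shift is exactly one 128-bit lane, so a single
+// _mm256_permute2f128_pd(x, y, 0x21) = [ x_hi | y_lo ] per output register
+// is enough; no in-lane shuffle is needed.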
+void kernel_dgecp_8_2_lib4(int tri, int kmax, double alpha, double *A0, int sda, double *B0, int sdb)
+	{
+
+	if(tri==1)
+		{
+		// A and B are lower triangular
+		// copy kmax+1 full 8-wide columns, then the trailing 7x7 triangle
+
+		kmax += 1;
+		}
+
+	if(kmax<=0)
+		return;
+
+	const int bs = 4;
+
+	double *A1 = A0 + bs*sda;
+	double *A2 = A1 + bs*sda;
+	double *B1 = B0 + bs*sdb;
+
+	__m256d
+		alpha_0,
+		a_0, a_1, a_2,
+		b_0, b_1;
+	
+	__m128d
+		c_0;
+	
+	int k;
+
+	alpha_0 = _mm256_broadcast_sd( &alpha );
+
+	for(k=0; k<kmax-3; k+=4)
+		{
+
+		a_0 = _mm256_load_pd( &A0[0+bs*0] );
+		a_1 = _mm256_load_pd( &A1[0+bs*0] );
+		a_2 = _mm256_load_pd( &A2[0+bs*0] );
+		b_0 = _mm256_permute2f128_pd( a_0, a_1, 0x21 );
+		b_1 = _mm256_permute2f128_pd( a_1, a_2, 0x21 );
+		b_0 = _mm256_mul_pd( alpha_0, b_0 );
+		b_1 = _mm256_mul_pd( alpha_0, b_1 );
+		_mm256_store_pd( &B0[0+bs*0], b_0 );
+		_mm256_store_pd( &B1[0+bs*0], b_1 );
+
+		a_0 = _mm256_load_pd( &A0[0+bs*1] );
+		a_1 = _mm256_load_pd( &A1[0+bs*1] );
+		a_2 = _mm256_load_pd( &A2[0+bs*1] );
+		b_0 = _mm256_permute2f128_pd( a_0, a_1, 0x21 );
+		b_1 = _mm256_permute2f128_pd( a_1, a_2, 0x21 );
+		b_0 = _mm256_mul_pd( alpha_0, b_0 );
+		b_1 = _mm256_mul_pd( alpha_0, b_1 );
+		_mm256_store_pd( &B0[0+bs*1], b_0 );
+		_mm256_store_pd( &B1[0+bs*1], b_1 );
+
+		a_0 = _mm256_load_pd( &A0[0+bs*2] );
+		a_1 = _mm256_load_pd( &A1[0+bs*2] );
+		a_2 = _mm256_load_pd( &A2[0+bs*2] );
+		b_0 = _mm256_permute2f128_pd( a_0, a_1, 0x21 );
+		b_1 = _mm256_permute2f128_pd( a_1, a_2, 0x21 );
+		b_0 = _mm256_mul_pd( alpha_0, b_0 );
+		b_1 = _mm256_mul_pd( alpha_0, b_1 );
+		_mm256_store_pd( &B0[0+bs*2], b_0 );
+		_mm256_store_pd( &B1[0+bs*2], b_1 );
+
+		a_0 = _mm256_load_pd( &A0[0+bs*3] );
+		a_1 = _mm256_load_pd( &A1[0+bs*3] );
+		a_2 = _mm256_load_pd( &A2[0+bs*3] );
+		b_0 = _mm256_permute2f128_pd( a_0, a_1, 0x21 );
+		b_1 = _mm256_permute2f128_pd( a_1, a_2, 0x21 );
+		b_0 = _mm256_mul_pd( alpha_0, b_0 );
+		b_1 = _mm256_mul_pd( alpha_0, b_1 );
+		_mm256_store_pd( &B0[0+bs*3], b_0 );
+		_mm256_store_pd( &B1[0+bs*3], b_1 );
+
+		A0 += 16;
+		A1 += 16;
+		A2 += 16;
+		B0 += 16;
+		B1 += 16;
+
+		}
+	for(; k<kmax; k++)
+		{
+
+		a_0 = _mm256_load_pd( &A0[0+bs*0] );
+		a_1 = _mm256_load_pd( &A1[0+bs*0] );
+		a_2 = _mm256_load_pd( &A2[0+bs*0] );
+		b_0 = _mm256_permute2f128_pd( a_0, a_1, 0x21 );
+		b_1 = _mm256_permute2f128_pd( a_1, a_2, 0x21 );
+		b_0 = _mm256_mul_pd( alpha_0, b_0 );
+		b_1 = _mm256_mul_pd( alpha_0, b_1 );
+		_mm256_store_pd( &B0[0+bs*0], b_0 );
+		_mm256_store_pd( &B1[0+bs*0], b_1 );
+
+		A0 += 4;
+		A1 += 4;
+		A2 += 4;
+		B0 += 4;
+		B1 += 4;
+
+		}
+
+	if(tri==1)
+		{
+		// 7x7 triangle 
+
+		c_0 = _mm_load_sd( &A0[3+bs*0] );
+		c_0 = _mm_mul_pd( _mm256_castpd256_pd128( alpha_0 ), c_0 );
+		_mm_store_sd( &B0[1+bs*0], c_0 );
+		c_0 = _mm_load_pd( &A1[0+bs*0] );
+		c_0 = _mm_mul_pd( _mm256_castpd256_pd128( alpha_0 ), c_0 );
+		_mm_store_pd( &B0[2+bs*0], c_0 );
+		c_0 = _mm_load_pd( &A1[2+bs*0] );
+		c_0 = _mm_mul_pd( _mm256_castpd256_pd128( alpha_0 ), c_0 );
+		_mm_store_pd( &B1[0+bs*0], c_0 );
+		c_0 = _mm_load_pd( &A2[0+bs*0] );
+		c_0 = _mm_mul_pd( _mm256_castpd256_pd128( alpha_0 ), c_0 );
+		_mm_store_pd( &B1[2+bs*0], c_0 );
+
+		c_0 = _mm_load_pd( &A1[0+bs*1] );
+		c_0 = _mm_mul_pd( _mm256_castpd256_pd128( alpha_0 ), c_0 );
+		_mm_store_pd( &B0[2+bs*1], c_0 );
+		c_0 = _mm_load_pd( &A1[2+bs*1] );
+		c_0 = _mm_mul_pd( _mm256_castpd256_pd128( alpha_0 ), c_0 );
+		_mm_store_pd( &B1[0+bs*1], c_0 );
+		c_0 = _mm_load_pd( &A2[0+bs*1] );
+		c_0 = _mm_mul_pd( _mm256_castpd256_pd128( alpha_0 ), c_0 );
+		_mm_store_pd( &B1[2+bs*1], c_0 );
+
+		c_0 = _mm_load_sd( &A1[1+bs*2] );
+		c_0 = _mm_mul_pd( _mm256_castpd256_pd128( alpha_0 ), c_0 );
+		_mm_store_sd( &B0[3+bs*2], c_0 );
+		c_0 = _mm_load_pd( &A1[2+bs*2] );
+		c_0 = _mm_mul_pd( _mm256_castpd256_pd128( alpha_0 ), c_0 );
+		_mm_store_pd( &B1[0+bs*2], c_0 );
+		c_0 = _mm_load_pd( &A2[0+bs*2] );
+		c_0 = _mm_mul_pd( _mm256_castpd256_pd128( alpha_0 ), c_0 );
+		_mm_store_pd( &B1[2+bs*2], c_0 );
+
+		c_0 = _mm_load_pd( &A1[2+bs*3] );
+		c_0 = _mm_mul_pd( _mm256_castpd256_pd128( alpha_0 ), c_0 );
+		_mm_store_pd( &B1[0+bs*3], c_0 );
+		c_0 = _mm_load_pd( &A2[0+bs*3] );
+		c_0 = _mm_mul_pd( _mm256_castpd256_pd128( alpha_0 ), c_0 );
+		_mm_store_pd( &B1[2+bs*3], c_0 );
+
+		c_0 = _mm_load_sd( &A1[3+bs*4] );
+		c_0 = _mm_mul_pd( _mm256_castpd256_pd128( alpha_0 ), c_0 );
+		_mm_store_sd( &B1[1+bs*4], c_0 );
+		c_0 = _mm_load_pd( &A2[0+bs*4] );
+		c_0 = _mm_mul_pd( _mm256_castpd256_pd128( alpha_0 ), c_0 );
+		_mm_store_pd( &B1[2+bs*4], c_0 );
+
+		c_0 = _mm_load_pd( &A2[0+bs*5] );
+		c_0 = _mm_mul_pd( _mm256_castpd256_pd128( alpha_0 ), c_0 );
+		_mm_store_pd( &B1[2+bs*5], c_0 );
+
+		c_0 = _mm_load_sd( &A2[1+bs*6] );
+		c_0 = _mm_mul_pd( _mm256_castpd256_pd128( alpha_0 ), c_0 );
+		_mm_store_sd( &B1[3+bs*6], c_0 );
+
+		}
+
+	}
+
+
+
+// both A and B are aligned to 256-bit boundaries, 3 elements of A must be skipped
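+// The 3-row shift mirrors the 1-row case: swap lanes first with
+// _mm256_permute2f128_pd, then _mm256_shuffle_pd picks the other odd/even
+// phase, so that rows 3..6 of A land in rows 0..3 of the output column.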
+void kernel_dgecp_8_3_lib4(int tri, int kmax, double alpha, double *A0, int sda, double *B0, int sdb)
+	{
+
+	if(tri==1)
+		{
+		// A and B are lower triangular
+		// copy kmax+1 full 8-wide columns, then the trailing 7x7 triangle
+
+		kmax += 1;
+		}
+
+	if(kmax<=0)
+		return;
+
+	const int bs = 4;
+
+	double *A1 = A0 + bs*sda;
+	double *A2 = A1 + bs*sda;
+	double *B1 = B0 + bs*sdb;
+
+	__m256d
+		alpha_0,
+		a_0, a_1, a_2,
+		b_0, b_1;
+	
+	__m128d
+		c_0;
+	
+	int k;
+
+	alpha_0 = _mm256_broadcast_sd( &alpha );
+
+	for(k=0; k<kmax-3; k+=4)
+		{
+
+		a_0 = _mm256_load_pd( &A0[0+bs*0] );
+		a_1 = _mm256_load_pd( &A1[0+bs*0] );
+		a_2 = _mm256_load_pd( &A2[0+bs*0] );
+		a_0 = _mm256_permute2f128_pd( a_0, a_1, 0x21 );
+		b_1 = _mm256_permute2f128_pd( a_1, a_2, 0x21 );
+		b_0 = _mm256_shuffle_pd( a_0, a_1, 0x5 );
+		b_1 = _mm256_shuffle_pd( b_1, a_2, 0x5 );
+		b_0 = _mm256_mul_pd( alpha_0, b_0 );
+		b_1 = _mm256_mul_pd( alpha_0, b_1 );
+		_mm256_store_pd( &B0[0+bs*0], b_0 );
+		_mm256_store_pd( &B1[0+bs*0], b_1 );
+
+		a_0 = _mm256_load_pd( &A0[0+bs*1] );
+		a_1 = _mm256_load_pd( &A1[0+bs*1] );
+		a_2 = _mm256_load_pd( &A2[0+bs*1] );
+		a_0 = _mm256_permute2f128_pd( a_0, a_1, 0x21 );
+		b_1 = _mm256_permute2f128_pd( a_1, a_2, 0x21 );
+		b_0 = _mm256_shuffle_pd( a_0, a_1, 0x5 );
+		b_1 = _mm256_shuffle_pd( b_1, a_2, 0x5 );
+		b_0 = _mm256_mul_pd( alpha_0, b_0 );
+		b_1 = _mm256_mul_pd( alpha_0, b_1 );
+		_mm256_store_pd( &B0[0+bs*1], b_0 );
+		_mm256_store_pd( &B1[0+bs*1], b_1 );
+
+		a_0 = _mm256_load_pd( &A0[0+bs*2] );
+		a_1 = _mm256_load_pd( &A1[0+bs*2] );
+		a_2 = _mm256_load_pd( &A2[0+bs*2] );
+		a_0 = _mm256_permute2f128_pd( a_0, a_1, 0x21 );
+		b_1 = _mm256_permute2f128_pd( a_1, a_2, 0x21 );
+		b_0 = _mm256_shuffle_pd( a_0, a_1, 0x5 );
+		b_1 = _mm256_shuffle_pd( b_1, a_2, 0x5 );
+		b_0 = _mm256_mul_pd( alpha_0, b_0 );
+		b_1 = _mm256_mul_pd( alpha_0, b_1 );
+		_mm256_store_pd( &B0[0+bs*2], b_0 );
+		_mm256_store_pd( &B1[0+bs*2], b_1 );
+
+		a_0 = _mm256_load_pd( &A0[0+bs*3] );
+		a_1 = _mm256_load_pd( &A1[0+bs*3] );
+		a_2 = _mm256_load_pd( &A2[0+bs*3] );
+		a_0 = _mm256_permute2f128_pd( a_0, a_1, 0x21 );
+		b_1 = _mm256_permute2f128_pd( a_1, a_2, 0x21 );
+		b_0 = _mm256_shuffle_pd( a_0, a_1, 0x5 );
+		b_1 = _mm256_shuffle_pd( b_1, a_2, 0x5 );
+		b_0 = _mm256_mul_pd( alpha_0, b_0 );
+		b_1 = _mm256_mul_pd( alpha_0, b_1 );
+		_mm256_store_pd( &B0[0+bs*3], b_0 );
+		_mm256_store_pd( &B1[0+bs*3], b_1 );
+
+		A0 += 16;
+		A1 += 16;
+		A2 += 16;
+		B0 += 16;
+		B1 += 16;
+
+		}
+	for(; k<kmax; k++)
+		{
+
+		a_0 = _mm256_load_pd( &A0[0+bs*0] );
+		a_1 = _mm256_load_pd( &A1[0+bs*0] );
+		a_2 = _mm256_load_pd( &A2[0+bs*0] );
+		a_0 = _mm256_permute2f128_pd( a_0, a_1, 0x21 );
+		b_1 = _mm256_permute2f128_pd( a_1, a_2, 0x21 );
+		b_0 = _mm256_shuffle_pd( a_0, a_1, 0x5 );
+		b_1 = _mm256_shuffle_pd( b_1, a_2, 0x5 );
+		b_0 = _mm256_mul_pd( alpha_0, b_0 );
+		b_1 = _mm256_mul_pd( alpha_0, b_1 );
+		_mm256_store_pd( &B0[0+bs*0], b_0 );
+		_mm256_store_pd( &B1[0+bs*0], b_1 );
+
+		A0 += 4;
+		A1 += 4;
+		A2 += 4;
+		B0 += 4;
+		B1 += 4;
+
+		}
+
+	if(tri==1)
+		{
+		// 7x7 triangle 
+
+		c_0 = _mm_load_pd( &A1[0+bs*0] );
+		c_0 = _mm_mul_pd( _mm256_castpd256_pd128( alpha_0 ), c_0 );
+		_mm_storeu_pd( &B0[1+bs*0], c_0 );
+		c_0 = _mm_load_sd( &A1[2+bs*0] );
+		c_0 = _mm_mul_pd( _mm256_castpd256_pd128( alpha_0 ), c_0 );
+		_mm_store_sd( &B0[3+bs*0], c_0 );
+		c_0 = _mm_load_sd( &A1[3+bs*0] );
+		c_0 = _mm_mul_pd( _mm256_castpd256_pd128( alpha_0 ), c_0 );
+		_mm_store_sd( &B1[0+bs*0], c_0 );
+		c_0 = _mm_load_pd( &A2[0+bs*0] );
+		c_0 = _mm_mul_pd( _mm256_castpd256_pd128( alpha_0 ), c_0 );
+		_mm_storeu_pd( &B1[1+bs*0], c_0 );
+		c_0 = _mm_load_sd( &A2[2+bs*0] );
+		c_0 = _mm_mul_pd( _mm256_castpd256_pd128( alpha_0 ), c_0 );
+		_mm_store_sd( &B1[3+bs*0], c_0 );
+
+		c_0 = _mm_load_sd( &A1[1+bs*1] );
+		c_0 = _mm_mul_pd( _mm256_castpd256_pd128( alpha_0 ), c_0 );
+		_mm_store_sd( &B0[2+bs*1], c_0 );
+		c_0 = _mm_load_sd( &A1[2+bs*1] );
+		c_0 = _mm_mul_pd( _mm256_castpd256_pd128( alpha_0 ), c_0 );
+		_mm_store_sd( &B0[3+bs*1], c_0 );
+		c_0 = _mm_load_sd( &A1[3+bs*1] );
+		c_0 = _mm_mul_pd( _mm256_castpd256_pd128( alpha_0 ), c_0 );
+		_mm_store_sd( &B1[0+bs*1], c_0 );
+		c_0 = _mm_load_pd( &A2[0+bs*1] );
+		c_0 = _mm_mul_pd( _mm256_castpd256_pd128( alpha_0 ), c_0 );
+		_mm_storeu_pd( &B1[1+bs*1], c_0 );
+		c_0 = _mm_load_sd( &A2[2+bs*1] );
+		c_0 = _mm_mul_pd( _mm256_castpd256_pd128( alpha_0 ), c_0 );
+		_mm_store_sd( &B1[3+bs*1], c_0 );
+
+		c_0 = _mm_load_sd( &A1[2+bs*2] );
+		c_0 = _mm_mul_pd( _mm256_castpd256_pd128( alpha_0 ), c_0 );
+		_mm_store_sd( &B0[3+bs*2], c_0 );
+		c_0 = _mm_load_sd( &A1[3+bs*2] );
+		c_0 = _mm_mul_pd( _mm256_castpd256_pd128( alpha_0 ), c_0 );
+		_mm_store_sd( &B1[0+bs*2], c_0 );
+		c_0 = _mm_load_pd( &A2[0+bs*2] );
+		c_0 = _mm_mul_pd( _mm256_castpd256_pd128( alpha_0 ), c_0 );
+		_mm_storeu_pd( &B1[1+bs*2], c_0 );
+		c_0 = _mm_load_sd( &A2[2+bs*2] );
+		c_0 = _mm_mul_pd( _mm256_castpd256_pd128( alpha_0 ), c_0 );
+		_mm_store_sd( &B1[3+bs*2], c_0 );
+
+		c_0 = _mm_load_sd( &A1[3+bs*3] );
+		c_0 = _mm_mul_pd( _mm256_castpd256_pd128( alpha_0 ), c_0 );
+		_mm_store_sd( &B1[0+bs*3], c_0 );
+		c_0 = _mm_load_pd( &A2[0+bs*3] );
+		c_0 = _mm_mul_pd( _mm256_castpd256_pd128( alpha_0 ), c_0 );
+		_mm_storeu_pd( &B1[1+bs*3], c_0 );
+		c_0 = _mm_load_sd( &A2[2+bs*3] );
+		c_0 = _mm_mul_pd( _mm256_castpd256_pd128( alpha_0 ), c_0 );
+		_mm_store_sd( &B1[3+bs*3], c_0 );
+
+		c_0 = _mm_load_pd( &A2[0+bs*4] );
+		c_0 = _mm_mul_pd( _mm256_castpd256_pd128( alpha_0 ), c_0 );
+		_mm_storeu_pd( &B1[1+bs*4], c_0 );
+		c_0 = _mm_load_sd( &A2[2+bs*4] );
+		c_0 = _mm_mul_pd( _mm256_castpd256_pd128( alpha_0 ), c_0 );
+		_mm_store_sd( &B1[3+bs*4], c_0 );
+
+		c_0 = _mm_load_sd( &A2[1+bs*5] );
+		c_0 = _mm_mul_pd( _mm256_castpd256_pd128( alpha_0 ), c_0 );
+		_mm_store_sd( &B1[2+bs*5], c_0 );
+		c_0 = _mm_load_sd( &A2[2+bs*5] );
+		c_0 = _mm_mul_pd( _mm256_castpd256_pd128( alpha_0 ), c_0 );
+		_mm_store_sd( &B1[3+bs*5], c_0 );
+
+		c_0 = _mm_load_sd( &A2[2+bs*6] );
+		c_0 = _mm_mul_pd( _mm256_castpd256_pd128( alpha_0 ), c_0 );
+		_mm_store_sd( &B1[3+bs*6], c_0 );
+
+		}
+
+	}
+
+
+
+// both A and B are aligned to 256-bit boundaries
+void kernel_dgecp_4_0_lib4(int tri, int kmax, double alpha, double *A, double *B)
+	{
+
+	if(tri==1)
+		{
+		// A and B are lower triangular
+		// copy kmax+1 full 4-wide columns, then the trailing 3x3 triangle
+
+		kmax += 1;
+		}
+
+	if(kmax<=0)
+		return;
+
+	const int bs = 4;
+
+	__m256d
+		alpha_0,
+		a_0;
+	
+	__m128d
+		c_0;
+	
+	int k;
+
+	alpha_0 = _mm256_broadcast_sd( &alpha );
+
+	for(k=0; k<kmax-3; k+=4)
+		{
+
+		a_0 = _mm256_load_pd( &A[0+bs*0] );
+		a_0 = _mm256_mul_pd( alpha_0, a_0 );
+		_mm256_store_pd( &B[0+bs*0], a_0 );
+
+		a_0 = _mm256_load_pd( &A[0+bs*1] );
+		a_0 = _mm256_mul_pd( alpha_0, a_0 );
+		_mm256_store_pd( &B[0+bs*1], a_0 );
+
+		a_0 = _mm256_load_pd( &A[0+bs*2] );
+		a_0 = _mm256_mul_pd( alpha_0, a_0 );
+		_mm256_store_pd( &B[0+bs*2], a_0 );
+
+		a_0 = _mm256_load_pd( &A[0+bs*3] );
+		a_0 = _mm256_mul_pd( alpha_0, a_0 );
+		_mm256_store_pd( &B[0+bs*3], a_0 );
+
+		A += 16;
+		B += 16;
+
+		}
+	for(; k<kmax; k++)
+		{
+
+		a_0 = _mm256_load_pd( &A[0+bs*0] );
+		a_0 = _mm256_mul_pd( alpha_0, a_0 );
+		_mm256_store_pd( &B[0+bs*0], a_0 );
+
+		A += 4;
+		B += 4;
+
+		}
+	
+	if(tri==1)
+		{
+		// 3x3 triangle
+
+		c_0 = _mm_load_sd( &A[1+bs*0] );
+		c_0 = _mm_mul_pd( _mm256_castpd256_pd128( alpha_0 ), c_0 );
+		_mm_store_sd( &B[1+bs*0], c_0 );
+		c_0 = _mm_load_pd( &A[2+bs*0] );
+		c_0 = _mm_mul_pd( _mm256_castpd256_pd128( alpha_0 ), c_0 );
+		_mm_store_pd( &B[2+bs*0], c_0 );
+
+		c_0 = _mm_load_pd( &A[2+bs*1] );
+		c_0 = _mm_mul_pd( _mm256_castpd256_pd128( alpha_0 ), c_0 );
+		_mm_store_pd( &B[2+bs*1], c_0 );
+
+		c_0 = _mm_load_sd( &A[3+bs*2] );
+		c_0 = _mm_mul_pd( _mm256_castpd256_pd128( alpha_0 ), c_0 );
+		_mm_store_sd( &B[3+bs*2], c_0 );
+
+		}
+
+	}
+
+
+
+// both A and B are aligned to 256-bit boundaries, 1 element of A must be skipped
+void kernel_dgecp_4_1_lib4(int tri, int kmax, double alpha, double *A0, int sda, double *B)
+	{
+
+	if(tri==1)
+		{
+		// A and B are lower triangular
+		// copy kmax+1 full 4-wide columns, then the trailing 3x3 triangle
+
+		kmax += 1;
+		}
+
+	if(kmax<=0)
+		return;
+
+	const int bs = 4;
+
+	double *A1 = A0 + bs*sda;
+
+	__m256d
+		alpha_0,
+		a_0, a_1,
+		b_0;
+	
+	__m128d
+		c_0;
+	
+	int k;
+
+	alpha_0 = _mm256_broadcast_sd( &alpha );
+
+	for(k=0; k<kmax-3; k+=4)
+		{
+
+		a_0 = _mm256_load_pd( &A0[0+bs*0] );
+		a_1 = _mm256_load_pd( &A1[0+bs*0] );
+		a_1 = _mm256_permute2f128_pd( a_0, a_1, 0x21 );
+		b_0 = _mm256_shuffle_pd( a_0, a_1, 0x5 );
+		b_0 = _mm256_mul_pd( alpha_0, b_0 );
+		_mm256_store_pd( &B[0+bs*0], b_0 );
+
+		a_0 = _mm256_load_pd( &A0[0+bs*1] );
+		a_1 = _mm256_load_pd( &A1[0+bs*1] );
+		a_1 = _mm256_permute2f128_pd( a_0, a_1, 0x21 );
+		b_0 = _mm256_shuffle_pd( a_0, a_1, 0x5 );
+		b_0 = _mm256_mul_pd( alpha_0, b_0 );
+		_mm256_store_pd( &B[0+bs*1], b_0 );
+
+		a_0 = _mm256_load_pd( &A0[0+bs*2] );
+		a_1 = _mm256_load_pd( &A1[0+bs*2] );
+		a_1 = _mm256_permute2f128_pd( a_0, a_1, 0x21 );
+		b_0 = _mm256_shuffle_pd( a_0, a_1, 0x5 );
+		b_0 = _mm256_mul_pd( alpha_0, b_0 );
+		_mm256_store_pd( &B[0+bs*2], b_0 );
+
+		a_0 = _mm256_load_pd( &A0[0+bs*3] );
+		a_1 = _mm256_load_pd( &A1[0+bs*3] );
+		a_1 = _mm256_permute2f128_pd( a_0, a_1, 0x21 );
+		b_0 = _mm256_shuffle_pd( a_0, a_1, 0x5 );
+		b_0 = _mm256_mul_pd( alpha_0, b_0 );
+		_mm256_store_pd( &B[0+bs*3], b_0 );
+
+		A0 += 16;
+		A1 += 16;
+		B  += 16;
+
+		}
+	for(; k<kmax; k++)
+		{
+
+		a_0 = _mm256_load_pd( &A0[0+bs*0] );
+		a_1 = _mm256_load_pd( &A1[0+bs*0] );
+		a_1 = _mm256_permute2f128_pd( a_0, a_1, 0x21 );
+		b_0 = _mm256_shuffle_pd( a_0, a_1, 0x5 );
+		b_0 = _mm256_mul_pd( alpha_0, b_0 );
+		_mm256_store_pd( &B[0+bs*0], b_0 );
+
+		A0 += 4;
+		A1 += 4;
+		B  += 4;
+
+		}
+	
+	if(tri==1)
+		{
+		// 3x3 triangle
+
+		c_0 = _mm_load_pd( &A0[2+bs*0] );
+		c_0 = _mm_mul_pd( _mm256_castpd256_pd128( alpha_0 ), c_0 );
+		_mm_storeu_pd( &B[1+bs*0], c_0 );
+		c_0 = _mm_load_sd( &A1[0+bs*0] );
+		c_0 = _mm_mul_pd( _mm256_castpd256_pd128( alpha_0 ), c_0 );
+		_mm_store_sd( &B[3+bs*0], c_0 );
+
+		c_0 = _mm_load_sd( &A0[3+bs*1] );
+		c_0 = _mm_mul_pd( _mm256_castpd256_pd128( alpha_0 ), c_0 );
+		_mm_store_sd( &B[2+bs*1], c_0 );
+		c_0 = _mm_load_sd( &A1[0+bs*1] );
+		c_0 = _mm_mul_pd( _mm256_castpd256_pd128( alpha_0 ), c_0 );
+		_mm_store_sd( &B[3+bs*1], c_0 );
+
+		c_0 = _mm_load_sd( &A1[0+bs*2] );
+		c_0 = _mm_mul_pd( _mm256_castpd256_pd128( alpha_0 ), c_0 );
+		_mm_store_sd( &B[3+bs*2], c_0 );
+
+		}
+
+	}
+
+
+
+// both A and B are aligned to 256-bit boundaries, 2 elements of A must be skipped
+void kernel_dgecp_4_2_lib4(int tri, int kmax, double alpha, double *A0, int sda, double *B)
+	{
+
+	if(tri==1)
+		{
+		// A and B are lower triangular
+		// copy kmax+1 full 4-wide columns, then the trailing 3x3 triangle
+
+		kmax += 1;
+		}
+
+	if(kmax<=0)
+		return;
+
+	const int bs = 4;
+
+	double *A1 = A0 + bs*sda;
+
+	__m256d
+		alpha_0,
+		a_0, a_1,
+		b_0;
+	
+	__m128d
+		c_0;
+	
+	int k;
+
+	alpha_0 = _mm256_broadcast_sd( &alpha );
+
+	for(k=0; k<kmax-3; k+=4)
+		{
+
+		a_0 = _mm256_load_pd( &A0[0+bs*0] );
+		a_1 = _mm256_load_pd( &A1[0+bs*0] );
+		b_0 = _mm256_permute2f128_pd( a_0, a_1, 0x21 );
+		b_0 = _mm256_mul_pd( alpha_0, b_0 );
+		_mm256_store_pd( &B[0+bs*0], b_0 );
+
+		a_0 = _mm256_load_pd( &A0[0+bs*1] );
+		a_1 = _mm256_load_pd( &A1[0+bs*1] );
+		b_0 = _mm256_permute2f128_pd( a_0, a_1, 0x21 );
+		b_0 = _mm256_mul_pd( alpha_0, b_0 );
+		_mm256_store_pd( &B[0+bs*1], b_0 );
+
+		a_0 = _mm256_load_pd( &A0[0+bs*2] );
+		a_1 = _mm256_load_pd( &A1[0+bs*2] );
+		b_0 = _mm256_permute2f128_pd( a_0, a_1, 0x21 );
+		b_0 = _mm256_mul_pd( alpha_0, b_0 );
+		_mm256_store_pd( &B[0+bs*2], b_0 );
+
+		a_0 = _mm256_load_pd( &A0[0+bs*3] );
+		a_1 = _mm256_load_pd( &A1[0+bs*3] );
+		b_0 = _mm256_permute2f128_pd( a_0, a_1, 0x21 );
+		b_0 = _mm256_mul_pd( alpha_0, b_0 );
+		_mm256_store_pd( &B[0+bs*3], b_0 );
+
+		A0 += 16;
+		A1 += 16;
+		B  += 16;
+
+		}
+	for(; k<kmax; k++)
+		{
+
+		a_0 = _mm256_load_pd( &A0[0+bs*0] );
+		a_1 = _mm256_load_pd( &A1[0+bs*0] );
+		b_0 = _mm256_permute2f128_pd( a_0, a_1, 0x21 );
+		b_0 = _mm256_mul_pd( alpha_0, b_0 );
+		_mm256_store_pd( &B[0+bs*0], b_0 );
+
+		A0 += 4;
+		A1 += 4;
+		B  += 4;
+
+		}
+	
+	if(tri==1)
+		{
+		// 3x3 triangle
+
+		c_0 = _mm_load_sd( &A0[3+bs*0] );
+		c_0 = _mm_mul_pd( _mm256_castpd256_pd128( alpha_0 ), c_0 );
+		_mm_store_sd( &B[1+bs*0], c_0 );
+		c_0 = _mm_load_pd( &A1[0+bs*0] );
+		c_0 = _mm_mul_pd( _mm256_castpd256_pd128( alpha_0 ), c_0 );
+		_mm_store_pd( &B[2+bs*0], c_0 );
+
+		c_0 = _mm_load_pd( &A1[0+bs*1] );
+		c_0 = _mm_mul_pd( _mm256_castpd256_pd128( alpha_0 ), c_0 );
+		_mm_store_pd( &B[2+bs*1], c_0 );
+
+		c_0 = _mm_load_sd( &A1[1+bs*2] );
+		c_0 = _mm_mul_pd( _mm256_castpd256_pd128( alpha_0 ), c_0 );
+		_mm_store_sd( &B[3+bs*2], c_0 );
+
+		}
+
+	}
+
+
+
+// both A and B are aligned to 256-bit boundaries, 3 elements of A must be skipped
+void kernel_dgecp_4_3_lib4(int tri, int kmax, double alpha, double *A0, int sda, double *B)
+	{
+
+	if(tri==1)
+		{
+		// A and B are lower triangular
+		// copy kmax+1 full 4-wide columns, then the trailing 3x3 triangle
+
+		kmax += 1;
+		}
+
+	if(kmax<=0)
+		return;
+
+	const int bs = 4;
+
+	double *A1 = A0 + bs*sda;
+
+	__m256d
+		alpha_0,
+		a_0, a_1,
+		b_0;
+	
+	__m128d
+		c_0;
+	
+	int k;
+
+	alpha_0 = _mm256_broadcast_sd( &alpha );
+
+	for(k=0; k<kmax-3; k+=4)
+		{
+
+		a_0 = _mm256_load_pd( &A0[0+bs*0] );
+		a_1 = _mm256_load_pd( &A1[0+bs*0] );
+		a_0 = _mm256_permute2f128_pd( a_0, a_1, 0x21 );
+		b_0 = _mm256_shuffle_pd( a_0, a_1, 0x5 );
+		b_0 = _mm256_mul_pd( alpha_0, b_0 );
+		_mm256_store_pd( &B[0+bs*0], b_0 );
+
+		a_0 = _mm256_load_pd( &A0[0+bs*1] );
+		a_1 = _mm256_load_pd( &A1[0+bs*1] );
+		a_0 = _mm256_permute2f128_pd( a_0, a_1, 0x21 );
+		b_0 = _mm256_shuffle_pd( a_0, a_1, 0x5 );
+		b_0 = _mm256_mul_pd( alpha_0, b_0 );
+		_mm256_store_pd( &B[0+bs*1], b_0 );
+
+		a_0 = _mm256_load_pd( &A0[0+bs*2] );
+		a_1 = _mm256_load_pd( &A1[0+bs*2] );
+		a_0 = _mm256_permute2f128_pd( a_0, a_1, 0x21 );
+		b_0 = _mm256_shuffle_pd( a_0, a_1, 0x5 );
+		b_0 = _mm256_mul_pd( alpha_0, b_0 );
+		_mm256_store_pd( &B[0+bs*2], b_0 );
+
+		a_0 = _mm256_load_pd( &A0[0+bs*3] );
+		a_1 = _mm256_load_pd( &A1[0+bs*3] );
+		a_0 = _mm256_permute2f128_pd( a_0, a_1, 0x21 );
+		b_0 = _mm256_shuffle_pd( a_0, a_1, 0x5 );
+		b_0 = _mm256_mul_pd( alpha_0, b_0 );
+		_mm256_store_pd( &B[0+bs*3], b_0 );
+
+		A0 += 16;
+		A1 += 16;
+		B  += 16;
+
+		}
+	for(; k<kmax; k++)
+		{
+
+		a_0 = _mm256_load_pd( &A0[0+bs*0] );
+		a_1 = _mm256_load_pd( &A1[0+bs*0] );
+		a_0 = _mm256_permute2f128_pd( a_0, a_1, 0x21 );
+		b_0 = _mm256_shuffle_pd( a_0, a_1, 0x5 );
+		b_0 = _mm256_mul_pd( alpha_0, b_0 );
+		_mm256_store_pd( &B[0+bs*0], b_0 );
+
+		A0 += 4;
+		A1 += 4;
+		B  += 4;
+
+		}
+	
+	if(tri==1)
+		{
+		// 3x3 triangle
+
+		c_0 = _mm_load_pd( &A1[0+bs*0] );
+		c_0 = _mm_mul_pd( _mm256_castpd256_pd128( alpha_0 ), c_0 );
+		_mm_storeu_pd( &B[1+bs*0], c_0 );
+		c_0 = _mm_load_sd( &A1[2+bs*0] );
+		c_0 = _mm_mul_pd( _mm256_castpd256_pd128( alpha_0 ), c_0 );
+		_mm_store_sd( &B[3+bs*0], c_0 );
+
+		c_0 = _mm_load_sd( &A1[1+bs*1] );
+		c_0 = _mm_mul_pd( _mm256_castpd256_pd128( alpha_0 ), c_0 );
+		_mm_store_sd( &B[2+bs*1], c_0 );
+		c_0 = _mm_load_sd( &A1[2+bs*1] );
+		c_0 = _mm_mul_pd( _mm256_castpd256_pd128( alpha_0 ), c_0 );
+		_mm_store_sd( &B[3+bs*1], c_0 );
+
+		c_0 = _mm_load_sd( &A1[2+bs*2] );
+		c_0 = _mm_mul_pd( _mm256_castpd256_pd128( alpha_0 ), c_0 );
+		_mm_store_sd( &B[3+bs*2], c_0 );
+		}
+
+
+	}
+
+
+
+// both A and B are aligned to 64-bit boundaries
+void kernel_dgecp_3_0_lib4(int tri, int kmax, double alpha, double *A, double *B)
+	{
+
+	if(tri==1)
+		{
+		// A and B are lower triangular
+		// copy kmax+1 full 3-wide columns, then the trailing 2x2 triangle
+
+		kmax += 1;
+		}
+
+	if(kmax<=0)
+		return;
+
+	const int bs = 4;
+
+	__m128d
+		alpha_0,
+		a_0, a_1;
+	
+	int k;
+
+	alpha_0 = _mm_loaddup_pd( &alpha );
+
+	for(k=0; k<kmax-3; k+=4)
+		{
+
+		a_0 = _mm_loadu_pd( &A[0+bs*0] );
+		a_1 = _mm_load_sd( &A[2+bs*0] );
+		a_0 = _mm_mul_pd( alpha_0, a_0 );
+		a_1 = _mm_mul_pd( alpha_0, a_1 );
+		_mm_storeu_pd( &B[0+bs*0], a_0 );
+		_mm_store_sd( &B[2+bs*0], a_1 );
+
+		a_0 = _mm_loadu_pd( &A[0+bs*1] );
+		a_1 = _mm_load_sd( &A[2+bs*1] );
+		a_0 = _mm_mul_pd( alpha_0, a_0 );
+		a_1 = _mm_mul_pd( alpha_0, a_1 );
+		_mm_storeu_pd( &B[0+bs*1], a_0 );
+		_mm_store_sd( &B[2+bs*1], a_1 );
+
+		a_0 = _mm_loadu_pd( &A[0+bs*2] );
+		a_1 = _mm_load_sd( &A[2+bs*2] );
+		a_0 = _mm_mul_pd( alpha_0, a_0 );
+		a_1 = _mm_mul_pd( alpha_0, a_1 );
+		_mm_storeu_pd( &B[0+bs*2], a_0 );
+		_mm_store_sd( &B[2+bs*2], a_1 );
+
+		a_0 = _mm_loadu_pd( &A[0+bs*3] );
+		a_1 = _mm_load_sd( &A[2+bs*3] );
+		a_0 = _mm_mul_pd( alpha_0, a_0 );
+		a_1 = _mm_mul_pd( alpha_0, a_1 );
+		_mm_storeu_pd( &B[0+bs*3], a_0 );
+		_mm_store_sd( &B[2+bs*3], a_1 );
+
+		A += 16;
+		B += 16;
+
+		}
+	for(; k<kmax; k++)
+		{
+
+		a_0 = _mm_loadu_pd( &A[0+bs*0] );
+		a_1 = _mm_load_sd( &A[2+bs*0] );
+		a_0 = _mm_mul_pd( alpha_0, a_0 );
+		a_1 = _mm_mul_pd( alpha_0, a_1 );
+		_mm_storeu_pd( &B[0+bs*0], a_0 );
+		_mm_store_sd( &B[2+bs*0], a_1 );
+
+		A += 4;
+		B += 4;
+
+		}
+	
+	if(tri==1)
+		{
+		// 2x2 triangle
+
+		a_0 = _mm_loadu_pd( &A[1+bs*0] );
+		a_0 = _mm_mul_pd( alpha_0, a_0 );
+		_mm_storeu_pd( &B[1+bs*0], a_0 );
+
+		a_0 = _mm_load_sd( &A[2+bs*1] );
+		a_0 = _mm_mul_pd( alpha_0, a_0 );
+		_mm_store_sd( &B[2+bs*1], a_0 );
+
+		}
+
+	}
+
+
+
+// both A and B are aligned to 256-bit boundaries, 2 elements of A must be skipped
+void kernel_dgecp_3_2_lib4(int tri, int kmax, double alpha, double *A0, int sda, double *B)
+	{
+
+	if(tri==1)
+		{
+		// A and B are lower triangular
+		// copy kmax+1 full 3-wide columns, then the trailing 2x2 triangle
+
+		kmax += 1;
+		}
+
+	if(kmax<=0)
+		return;
+
+	const int bs = 4;
+
+	double *A1 = A0 + bs*sda;
+
+	__m128d
+		alpha_0,
+		a_0, a_1;
+	
+	int k;
+
+	alpha_0 = _mm_loaddup_pd( &alpha );
+
+	for(k=0; k<kmax-3; k+=4)
+		{
+
+		a_0 = _mm_loadu_pd( &A0[2+bs*0] );
+		a_0 = _mm_mul_pd( alpha_0, a_0 );
+		_mm_storeu_pd( &B[0+bs*0], a_0 );
+		a_1 = _mm_load_sd( &A1[0+bs*0] );
+		a_1 = _mm_mul_pd( alpha_0, a_1 );
+		_mm_store_sd( &B[2+bs*0], a_1 );
+
+		a_0 = _mm_loadu_pd( &A0[2+bs*1] );
+		a_0 = _mm_mul_pd( alpha_0, a_0 );
+		_mm_storeu_pd( &B[0+bs*1], a_0 );
+		a_1 = _mm_load_sd( &A1[0+bs*1] );
+		a_1 = _mm_mul_pd( alpha_0, a_1 );
+		_mm_store_sd( &B[2+bs*1], a_1 );
+
+		a_0 = _mm_loadu_pd( &A0[2+bs*2] );
+		a_0 = _mm_mul_pd( alpha_0, a_0 );
+		_mm_storeu_pd( &B[0+bs*2], a_0 );
+		a_1 = _mm_load_sd( &A1[0+bs*2] );
+		a_1 = _mm_mul_pd( alpha_0, a_1 );
+		_mm_store_sd( &B[2+bs*2], a_1 );
+
+		a_0 = _mm_loadu_pd( &A0[2+bs*3] );
+		a_0 = _mm_mul_pd( alpha_0, a_0 );
+		_mm_storeu_pd( &B[0+bs*3], a_0 );
+		a_1 = _mm_load_sd( &A1[0+bs*3] );
+		a_1 = _mm_mul_pd( alpha_0, a_1 );
+		_mm_store_sd( &B[2+bs*3], a_1 );
+
+		A0 += 16;
+		A1 += 16;
+		B  += 16;
+
+		}
+	for(; k<kmax; k++)
+		{
+
+		a_0 = _mm_loadu_pd( &A0[2+bs*0] );
+		a_0 = _mm_mul_pd( alpha_0, a_0 );
+		_mm_storeu_pd( &B[0+bs*0], a_0 );
+		a_1 = _mm_load_sd( &A1[0+bs*0] );
+		a_1 = _mm_mul_pd( alpha_0, a_1 );
+		_mm_store_sd( &B[2+bs*0], a_1 );
+
+		A0 += 4;
+		A1 += 4;
+		B  += 4;
+
+		}
+	
+	if(tri==1)
+		{
+		// 2x2 triangle
+
+		a_0 = _mm_load_sd( &A0[3+bs*0] );
+		a_0 = _mm_mul_pd( alpha_0, a_0 );
+		_mm_store_sd( &B[1+bs*0], a_0 );
+		a_0 = _mm_load_sd( &A1[0+bs*0] );
+		a_0 = _mm_mul_pd( alpha_0, a_0 );
+		_mm_store_sd( &B[2+bs*0], a_0 );
+
+		a_0 = _mm_load_sd( &A1[0+bs*1] );
+		a_0 = _mm_mul_pd( alpha_0, a_0 );
+		_mm_store_sd( &B[2+bs*1], a_0 );
+
+		}
+
+
+	}
+
+
+
+// both A and B are aligned to 256-bit boundaries, 3 elements of A must be skipped
+void kernel_dgecp_3_3_lib4(int tri, int kmax, double alpha, double *A0, int sda, double *B)
+	{
+
+	if(tri==1)
+		{
+		// A and B are lower triangular
+		// copy kmax+1 full 3-wide columns, then the trailing 2x2 triangle
+
+		kmax += 1;
+		}
+
+	if(kmax<=0)
+		return;
+
+	const int bs = 4;
+
+	double *A1 = A0 + bs*sda;
+
+	__m128d
+		alpha_0,
+		a_0, a_1;
+	
+	int k;
+
+	alpha_0 = _mm_loaddup_pd( &alpha );
+
+	for(k=0; k<kmax-3; k+=4)
+		{
+
+		a_0 = _mm_load_sd( &A0[3+bs*0] );
+		a_0 = _mm_mul_pd( alpha_0, a_0 );
+		_mm_store_sd( &B[0+bs*0], a_0 );
+		a_1 = _mm_loadu_pd( &A1[0+bs*0] );
+		a_1 = _mm_mul_pd( alpha_0, a_1 );
+		_mm_storeu_pd( &B[1+bs*0], a_1 );
+
+		a_0 = _mm_load_sd( &A0[3+bs*1] );
+		a_0 = _mm_mul_pd( alpha_0, a_0 );
+		_mm_store_sd( &B[0+bs*1], a_0 );
+		a_1 = _mm_loadu_pd( &A1[0+bs*1] );
+		a_1 = _mm_mul_pd( alpha_0, a_1 );
+		_mm_storeu_pd( &B[1+bs*1], a_1 );
+
+		a_0 = _mm_load_sd( &A0[3+bs*2] );
+		a_0 = _mm_mul_pd( alpha_0, a_0 );
+		_mm_store_sd( &B[0+bs*2], a_0 );
+		a_1 = _mm_loadu_pd( &A1[0+bs*2] );
+		a_1 = _mm_mul_pd( alpha_0, a_1 );
+		_mm_storeu_pd( &B[1+bs*2], a_1 );
+
+		a_0 = _mm_load_sd( &A0[3+bs*3] );
+		a_0 = _mm_mul_pd( alpha_0, a_0 );
+		_mm_store_sd( &B[0+bs*3], a_0 );
+		a_1 = _mm_loadu_pd( &A1[0+bs*3] );
+		a_1 = _mm_mul_pd( alpha_0, a_1 );
+		_mm_storeu_pd( &B[1+bs*3], a_1 );
+
+		A0 += 16;
+		A1 += 16;
+		B  += 16;
+
+		}
+	for(; k<kmax; k++)
+		{
+
+		a_0 = _mm_load_sd( &A0[3+bs*0] );
+		a_0 = _mm_mul_pd( alpha_0, a_0 );
+		_mm_store_sd( &B[0+bs*0], a_0 );
+		a_1 = _mm_loadu_pd( &A1[0+bs*0] );
+		a_1 = _mm_mul_pd( alpha_0, a_1 );
+		_mm_storeu_pd( &B[1+bs*0], a_1 );
+
+		A0 += 4;
+		A1 += 4;
+		B  += 4;
+
+		}
+	
+	if(tri==1)
+		{
+		// 2x2 triangle
+
+		a_0 = _mm_loadu_pd( &A1[0+bs*0] );
+		a_0 = _mm_mul_pd( alpha_0, a_0 );
+		_mm_storeu_pd( &B[1+bs*0], a_0 );
+
+		a_0 = _mm_load_sd( &A1[1+bs*1] );
+		a_0 = _mm_mul_pd( alpha_0, a_0 );
+		_mm_store_sd( &B[2+bs*1], a_0 );
+
+		}
+
+	}
+
+
+
+// both A and B are aligned to 64-bit boundaries
+void kernel_dgecp_2_0_lib4(int tri, int kmax, double alpha, double *A, double *B)
+	{
+
+	if(tri==1)
+		{
+		// A and B are lower triangular
+		// copy kmax+1 full 2-wide columns, then the trailing 1x1 triangle
+
+		kmax += 1;
+		}
+
+	if(kmax<=0)
+		return;
+
+	const int bs = 4;
+
+	__m128d
+		alpha_0,
+		a_0;
+	
+	int k;
+
+	alpha_0 = _mm_loaddup_pd( &alpha );
+
+	for(k=0; k<kmax-3; k+=4)
+		{
+
+		a_0 = _mm_loadu_pd( &A[0+bs*0] );
+		a_0 = _mm_mul_pd( alpha_0, a_0 );
+		_mm_storeu_pd( &B[0+bs*0], a_0 );
+
+		a_0 = _mm_loadu_pd( &A[0+bs*1] );
+		a_0 = _mm_mul_pd( alpha_0, a_0 );
+		_mm_storeu_pd( &B[0+bs*1], a_0 );
+
+		a_0 = _mm_loadu_pd( &A[0+bs*2] );
+		a_0 = _mm_mul_pd( alpha_0, a_0 );
+		_mm_storeu_pd( &B[0+bs*2], a_0 );
+
+		a_0 = _mm_loadu_pd( &A[0+bs*3] );
+		a_0 = _mm_mul_pd( alpha_0, a_0 );
+		_mm_storeu_pd( &B[0+bs*3], a_0 );
+
+		A += 16;
+		B += 16;
+
+		}
+	for(; k<kmax; k++)
+		{
+
+		a_0 = _mm_loadu_pd( &A[0+bs*0] );
+		a_0 = _mm_mul_pd( alpha_0, a_0 );
+		_mm_storeu_pd( &B[0+bs*0], a_0 );
+
+		A += 4;
+		B += 4;
+
+		}
+	
+	if(tri==1)
+		{
+		// 1x1 triangle
+
+		a_0 = _mm_load_sd( &A[1+bs*0] );
+		a_0 = _mm_mul_pd( alpha_0, a_0 );
+		_mm_store_sd( &B[1+bs*0], a_0 );
+
+		}
+
+	}
+
+
+
+// both A and B are aligned to 128-bit boundaries, 3 elements of A must be skipped
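+// The two rows to copy straddle a panel boundary here, so each column is
+// gathered with _mm_load_sd (row 3 of the first panel) plus _mm_loadh_pd
+// (row 0 of the next panel) into a single xmm register.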
+void kernel_dgecp_2_3_lib4(int tri, int kmax, double alpha, double *A0, int sda, double *B)
+	{
+
+	if(tri==1)
+		{
+		// A and B are lower triangular
+		// copy kmax+1 full 2-wide columns, then the trailing 1x1 triangle
+
+		kmax += 1;
+		}
+
+	if(kmax<=0)
+		return;
+
+	const int bs = 4;
+
+	double *A1 = A0 + bs*sda;
+
+	__m128d
+		alpha_0,
+		a_0;
+	
+	int k;
+
+	alpha_0 = _mm_loaddup_pd( &alpha );
+
+	for(k=0; k<kmax-3; k+=4)
+		{
+
+		a_0 = _mm_load_sd( &A0[3+bs*0] );
+		a_0 = _mm_loadh_pd( a_0, &A1[0+bs*0] );
+		a_0 = _mm_mul_pd( alpha_0, a_0 );
+		_mm_storeu_pd( &B[0+bs*0], a_0 );
+
+		a_0 = _mm_load_sd( &A0[3+bs*1] );
+		a_0 = _mm_loadh_pd( a_0, &A1[0+bs*1] );
+		a_0 = _mm_mul_pd( alpha_0, a_0 );
+		_mm_storeu_pd( &B[0+bs*1], a_0 );
+
+		a_0 = _mm_load_sd( &A0[3+bs*2] );
+		a_0 = _mm_loadh_pd( a_0, &A1[0+bs*2] );
+		a_0 = _mm_mul_pd( alpha_0, a_0 );
+		_mm_storeu_pd( &B[0+bs*2], a_0 );
+
+		a_0 = _mm_load_sd( &A0[3+bs*3] );
+		a_0 = _mm_loadh_pd( a_0, &A1[0+bs*3] );
+		a_0 = _mm_mul_pd( alpha_0, a_0 );
+		_mm_storeu_pd( &B[0+bs*3], a_0 );
+
+		A0 += 16;
+		A1 += 16;
+		B  += 16;
+
+		}
+	for(; k<kmax; k++)
+		{
+
+		a_0 = _mm_load_sd( &A0[3+bs*0] );
+		a_0 = _mm_loadh_pd( a_0, &A1[0+bs*0] );
+		a_0 = _mm_mul_pd( alpha_0, a_0 );
+		_mm_storeu_pd( &B[0+bs*0], a_0 );
+
+		A0 += 4;
+		A1 += 4;
+		B  += 4;
+
+		}
+	
+	if(tri==1)
+		{
+		// 1x1 triangle
+
+		a_0 = _mm_load_sd( &A1[0+bs*0] );
+		a_0 = _mm_mul_pd( alpha_0, a_0 );
+		_mm_store_sd( &B[1+bs*0], a_0 );
+
+		}
+
+	}
+
+
+
+// both A and B are aligned to 64-bit boundaries
+void kernel_dgecp_1_0_lib4(int tri, int kmax, double alpha, double *A, double *B)
+	{
+
+	if(tri==1)
+		{
+		// A and B are lower triangular
+		// copy kmax+1 full 1-wide columns (no trailing triangle)
+
+		kmax += 1;
+		}
+
+	if(kmax<=0)
+		return;
+
+	const int bs = 4;
+
+	__m128d
+		alpha_0,
+		a_0;
+	
+	int k;
+
+	alpha_0 = _mm_loaddup_pd( &alpha );
+
+	for(k=0; k<kmax-3; k+=4)
+		{
+
+		a_0 = _mm_load_sd( &A[0+bs*0] );
+		a_0 = _mm_mul_pd( alpha_0, a_0 );
+		_mm_store_sd( &B[0+bs*0], a_0 );
+
+		a_0 = _mm_load_sd( &A[0+bs*1] );
+		a_0 = _mm_mul_pd( alpha_0, a_0 );
+		_mm_store_sd( &B[0+bs*1], a_0 );
+
+		a_0 = _mm_load_sd( &A[0+bs*2] );
+		a_0 = _mm_mul_pd( alpha_0, a_0 );
+		_mm_store_sd( &B[0+bs*2], a_0 );
+
+		a_0 = _mm_load_sd( &A[0+bs*3] );
+		a_0 = _mm_mul_pd( alpha_0, a_0 );
+		_mm_store_sd( &B[0+bs*3], a_0 );
+
+		A += 16;
+		B += 16;
+
+		}
+	for(; k<kmax; k++)
+		{
+
+		a_0 = _mm_load_sd( &A[0+bs*0] );
+		a_0 = _mm_mul_pd( alpha_0, a_0 );
+		_mm_store_sd( &B[0+bs*0], a_0 );
+
+		A += 4;
+		B += 4;
+
+		}
+
+	}
+
+
+
+
+// both A and B are aligned to 256-bit boundaries
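+// The kernel_dgead_* below are the read-modify-write variants of the
+// kernel_dgecp_* above: they load the destination, compute B += alpha*A and
+// store back, reusing the same panel-shift tricks for the unaligned cases.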
+void kernel_dgead_8_0_lib4(int kmax, double alpha, double *A0, int sda,  double *B0, int sdb)
+	{
+
+	if(kmax<=0)
+		return;
+	
+	const int bs = 4;
+
+	double *A1 = A0 + bs*sda;
+	double *B1 = B0 + bs*sdb;
+
+	__m256d
+		a_0, c_0, alpha_0;
+	
+	int k;
+
+	alpha_0 = _mm256_broadcast_sd( &alpha );
+
+	for(k=0; k<kmax-3; k+=4)
+		{
+
+		a_0 = _mm256_load_pd( &A0[0+bs*0] );
+		c_0 = _mm256_load_pd( &B0[0+bs*0] );
+		a_0 = _mm256_mul_pd( alpha_0, a_0 );
+		a_0 = _mm256_add_pd( a_0, c_0 );
+		_mm256_store_pd( &B0[0+bs*0], a_0 );
+
+		a_0 = _mm256_load_pd( &A0[0+bs*1] );
+		c_0 = _mm256_load_pd( &B0[0+bs*1] );
+		a_0 = _mm256_mul_pd( alpha_0, a_0 );
+		a_0 = _mm256_add_pd( a_0, c_0 );
+		_mm256_store_pd( &B0[0+bs*1], a_0 );
+
+		a_0 = _mm256_load_pd( &A0[0+bs*2] );
+		c_0 = _mm256_load_pd( &B0[0+bs*2] );
+		a_0 = _mm256_mul_pd( alpha_0, a_0 );
+		a_0 = _mm256_add_pd( a_0, c_0 );
+		_mm256_store_pd( &B0[0+bs*2], a_0 );
+
+		a_0 = _mm256_load_pd( &A0[0+bs*3] );
+		c_0 = _mm256_load_pd( &B0[0+bs*3] );
+		a_0 = _mm256_mul_pd( alpha_0, a_0 );
+		a_0 = _mm256_add_pd( a_0, c_0 );
+		_mm256_store_pd( &B0[0+bs*3], a_0 );
+
+		A0 += 16;
+		B0 += 16;
+
+		a_0 = _mm256_load_pd( &A1[0+bs*0] );
+		c_0 = _mm256_load_pd( &B1[0+bs*0] );
+		a_0 = _mm256_mul_pd( alpha_0, a_0 );
+		a_0 = _mm256_add_pd( a_0, c_0 );
+		_mm256_store_pd( &B1[0+bs*0], a_0 );
+
+		a_0 = _mm256_load_pd( &A1[0+bs*1] );
+		c_0 = _mm256_load_pd( &B1[0+bs*1] );
+		a_0 = _mm256_mul_pd( alpha_0, a_0 );
+		a_0 = _mm256_add_pd( a_0, c_0 );
+		_mm256_store_pd( &B1[0+bs*1], a_0 );
+
+		a_0 = _mm256_load_pd( &A1[0+bs*2] );
+		c_0 = _mm256_load_pd( &B1[0+bs*2] );
+		a_0 = _mm256_mul_pd( alpha_0, a_0 );
+		a_0 = _mm256_add_pd( a_0, c_0 );
+		_mm256_store_pd( &B1[0+bs*2], a_0 );
+
+		a_0 = _mm256_load_pd( &A1[0+bs*3] );
+		c_0 = _mm256_load_pd( &B1[0+bs*3] );
+		a_0 = _mm256_mul_pd( alpha_0, a_0 );
+		a_0 = _mm256_add_pd( a_0, c_0 );
+		_mm256_store_pd( &B1[0+bs*3], a_0 );
+
+		A1 += 16;
+		B1 += 16;
+
+		}
+	for(; k<kmax; k++)
+		{
+
+		a_0 = _mm256_load_pd( &A0[0+bs*0] );
+		c_0 = _mm256_load_pd( &B0[0+bs*0] );
+		a_0 = _mm256_mul_pd( alpha_0, a_0 );
+		a_0 = _mm256_add_pd( a_0, c_0 );
+		_mm256_store_pd( &B0[0+bs*0], a_0 );
+
+		A0 += 4;
+		B0 += 4;
+
+		a_0 = _mm256_load_pd( &A1[0+bs*0] );
+		c_0 = _mm256_load_pd( &B1[0+bs*0] );
+		a_0 = _mm256_mul_pd( alpha_0, a_0 );
+		a_0 = _mm256_add_pd( a_0, c_0 );
+		_mm256_store_pd( &B1[0+bs*0], a_0 );
+
+		A1 += 4;
+		B1 += 4;
+
+		}
+
+	}
+
+
+
+// both A and B are aligned to 256-bit boundaries, 1 element of A must be skipped
+void kernel_dgead_8_1_lib4(int kmax, double alpha, double *A0, int sda, double *B0, int sdb)
+	{
+
+	if(kmax<=0)
+		return;
+
+	const int bs = 4;
+
+	double *A1 = A0 + bs*sda;
+	double *A2 = A1 + bs*sda;
+	double *B1 = B0 + bs*sdb;
+
+	__m256d
+		a_0, a_1, a_2,
+		b_0, b_1,
+		alpha_0, c_0, c_1;
+	
+	int k;
+
+	alpha_0 = _mm256_broadcast_sd( &alpha );
+
+	for(k=0; k<kmax-3; k+=4)
+		{
+
+		a_2 = _mm256_load_pd( &A2[0+bs*0] );
+		a_1 = _mm256_load_pd( &A1[0+bs*0] );
+		a_0 = _mm256_load_pd( &A0[0+bs*0] );
+		a_2 = _mm256_permute2f128_pd( a_1, a_2, 0x21 );
+		b_0 = _mm256_permute2f128_pd( a_0, a_1, 0x21 );
+		b_1 = _mm256_shuffle_pd( a_1, a_2, 0x5 );
+		b_0 = _mm256_shuffle_pd( a_0, b_0, 0x5 );
+		c_1 = _mm256_load_pd( &B1[0+bs*0] );
+		c_0 = _mm256_load_pd( &B0[0+bs*0] );
+		b_1 = _mm256_mul_pd( alpha_0, b_1 );
+		b_0 = _mm256_mul_pd( alpha_0, b_0 );
+		b_1 = _mm256_add_pd ( c_1, b_1 );
+		b_0 = _mm256_add_pd ( c_0, b_0 );
+		_mm256_store_pd( &B1[0+bs*0], b_1 );
+		_mm256_store_pd( &B0[0+bs*0], b_0 );
+
+		a_2 = _mm256_load_pd( &A2[0+bs*1] );
+		a_1 = _mm256_load_pd( &A1[0+bs*1] );
+		a_0 = _mm256_load_pd( &A0[0+bs*1] );
+		a_2 = _mm256_permute2f128_pd( a_1, a_2, 0x21 );
+		b_0 = _mm256_permute2f128_pd( a_0, a_1, 0x21 );
+		b_1 = _mm256_shuffle_pd( a_1, a_2, 0x5 );
+		b_0 = _mm256_shuffle_pd( a_0, b_0, 0x5 );
+		c_1 = _mm256_load_pd( &B1[0+bs*1] );
+		c_0 = _mm256_load_pd( &B0[0+bs*1] );
+		b_1 = _mm256_mul_pd( alpha_0, b_1 );
+		b_0 = _mm256_mul_pd( alpha_0, b_0 );
+		b_1 = _mm256_add_pd ( c_1, b_1 );
+		b_0 = _mm256_add_pd ( c_0, b_0 );
+		_mm256_store_pd( &B1[0+bs*1], b_1 );
+		_mm256_store_pd( &B0[0+bs*1], b_0 );
+
+		a_2 = _mm256_load_pd( &A2[0+bs*2] );
+		a_1 = _mm256_load_pd( &A1[0+bs*2] );
+		a_0 = _mm256_load_pd( &A0[0+bs*2] );
+		a_2 = _mm256_permute2f128_pd( a_1, a_2, 0x21 );
+		b_0 = _mm256_permute2f128_pd( a_0, a_1, 0x21 );
+		b_1 = _mm256_shuffle_pd( a_1, a_2, 0x5 );
+		b_0 = _mm256_shuffle_pd( a_0, b_0, 0x5 );
+		c_1 = _mm256_load_pd( &B1[0+bs*2] );
+		c_0 = _mm256_load_pd( &B0[0+bs*2] );
+		b_1 = _mm256_mul_pd( alpha_0, b_1 );
+		b_0 = _mm256_mul_pd( alpha_0, b_0 );
+		b_1 = _mm256_add_pd ( c_1, b_1 );
+		b_0 = _mm256_add_pd ( c_0, b_0 );
+		_mm256_store_pd( &B1[0+bs*2], b_1 );
+		_mm256_store_pd( &B0[0+bs*2], b_0 );
+
+		a_2 = _mm256_load_pd( &A2[0+bs*3] );
+		a_1 = _mm256_load_pd( &A1[0+bs*3] );
+		a_0 = _mm256_load_pd( &A0[0+bs*3] );
+		a_2 = _mm256_permute2f128_pd( a_1, a_2, 0x21 );
+		b_0 = _mm256_permute2f128_pd( a_0, a_1, 0x21 );
+		b_1 = _mm256_shuffle_pd( a_1, a_2, 0x5 );
+		b_0 = _mm256_shuffle_pd( a_0, b_0, 0x5 );
+		c_1 = _mm256_load_pd( &B1[0+bs*3] );
+		c_0 = _mm256_load_pd( &B0[0+bs*3] );
+		b_1 = _mm256_mul_pd( alpha_0, b_1 );
+		b_0 = _mm256_mul_pd( alpha_0, b_0 );
+		b_1 = _mm256_add_pd ( c_1, b_1 );
+		b_0 = _mm256_add_pd ( c_0, b_0 );
+		_mm256_store_pd( &B1[0+bs*3], b_1 );
+		_mm256_store_pd( &B0[0+bs*3], b_0 );
+
+		A0 += 16;
+		A1 += 16;
+		A2 += 16;
+		B0 += 16;
+		B1 += 16;
+
+		}
+	for(; k<kmax; k++)
+		{
+
+		a_2 = _mm256_load_pd( &A2[0+bs*0] );
+		a_1 = _mm256_load_pd( &A1[0+bs*0] );
+		a_0 = _mm256_load_pd( &A0[0+bs*0] );
+		a_2 = _mm256_permute2f128_pd( a_1, a_2, 0x21 );
+		b_0 = _mm256_permute2f128_pd( a_0, a_1, 0x21 );
+		b_1 = _mm256_shuffle_pd( a_1, a_2, 0x5 );
+		b_0 = _mm256_shuffle_pd( a_0, b_0, 0x5 );
+		c_1 = _mm256_load_pd( &B1[0+bs*0] );
+		c_0 = _mm256_load_pd( &B0[0+bs*0] );
+		b_1 = _mm256_mul_pd( alpha_0, b_1 );
+		b_0 = _mm256_mul_pd( alpha_0, b_0 );
+		b_1 = _mm256_add_pd ( c_1, b_1 );
+		b_0 = _mm256_add_pd ( c_0, b_0 );
+		_mm256_store_pd( &B1[0+bs*0], b_1 );
+		_mm256_store_pd( &B0[0+bs*0], b_0 );
+
+		A0 += 4;
+		A1 += 4;
+		A2 += 4;
+		B0 += 4;
+		B1 += 4;
+
+		}
+
+	}
+
+
+
+// both A and B are aligned to 256-bit boundaries, 2 elements of A must be skipped
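+// with a 2-row offset the shift is a whole 128-bit lane, so a single
+// _mm256_permute2f128_pd( x, y, 0x21 ) = { x2, x3, y0, y1 } suffices and
+// no shuffle is needed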
+void kernel_dgead_8_2_lib4(int kmax, double alpha, double *A0, int sda, double *B0, int sdb)
+	{
+
+	if(kmax<=0)
+		return;
+
+	const int bs = 4;
+
+	double *A1 = A0 + bs*sda;
+	double *A2 = A1 + bs*sda;
+	double *B1 = B0 + bs*sdb;
+
+	__m256d
+		a_0, a_1, a_2,
+		b_0, b_1,
+		alpha_0, c_0, c_1;
+	
+	int k;
+
+	alpha_0 = _mm256_broadcast_sd( &alpha );
+
+	for(k=0; k<kmax-3; k+=4)
+		{
+
+		a_0 = _mm256_load_pd( &A0[0+bs*0] );
+		a_1 = _mm256_load_pd( &A1[0+bs*0] );
+		a_2 = _mm256_load_pd( &A2[0+bs*0] );
+		b_0 = _mm256_permute2f128_pd( a_0, a_1, 0x21 );
+		b_1 = _mm256_permute2f128_pd( a_1, a_2, 0x21 );
+		c_0 = _mm256_load_pd( &B0[0+bs*0] );
+		c_1 = _mm256_load_pd( &B1[0+bs*0] );
+		b_0 = _mm256_mul_pd( alpha_0, b_0 );
+		b_1 = _mm256_mul_pd( alpha_0, b_1 );
+		b_0 = _mm256_add_pd ( c_0, b_0 );
+		b_1 = _mm256_add_pd ( c_1, b_1 );
+		_mm256_store_pd( &B0[0+bs*0], b_0 );
+		_mm256_store_pd( &B1[0+bs*0], b_1 );
+
+		a_0 = _mm256_load_pd( &A0[0+bs*1] );
+		a_1 = _mm256_load_pd( &A1[0+bs*1] );
+		a_2 = _mm256_load_pd( &A2[0+bs*1] );
+		b_0 = _mm256_permute2f128_pd( a_0, a_1, 0x21 );
+		b_1 = _mm256_permute2f128_pd( a_1, a_2, 0x21 );
+		c_0 = _mm256_load_pd( &B0[0+bs*1] );
+		c_1 = _mm256_load_pd( &B1[0+bs*1] );
+		b_0 = _mm256_mul_pd( alpha_0, b_0 );
+		b_1 = _mm256_mul_pd( alpha_0, b_1 );
+		b_0 = _mm256_add_pd ( c_0, b_0 );
+		b_1 = _mm256_add_pd ( c_1, b_1 );
+		_mm256_store_pd( &B0[0+bs*1], b_0 );
+		_mm256_store_pd( &B1[0+bs*1], b_1 );
+
+		a_0 = _mm256_load_pd( &A0[0+bs*2] );
+		a_1 = _mm256_load_pd( &A1[0+bs*2] );
+		a_2 = _mm256_load_pd( &A2[0+bs*2] );
+		b_0 = _mm256_permute2f128_pd( a_0, a_1, 0x21 );
+		b_1 = _mm256_permute2f128_pd( a_1, a_2, 0x21 );
+		c_0 = _mm256_load_pd( &B0[0+bs*2] );
+		c_1 = _mm256_load_pd( &B1[0+bs*2] );
+		b_0 = _mm256_mul_pd( alpha_0, b_0 );
+		b_1 = _mm256_mul_pd( alpha_0, b_1 );
+		b_0 = _mm256_add_pd ( c_0, b_0 );
+		b_1 = _mm256_add_pd ( c_1, b_1 );
+		_mm256_store_pd( &B0[0+bs*2], b_0 );
+		_mm256_store_pd( &B1[0+bs*2], b_1 );
+
+		a_0 = _mm256_load_pd( &A0[0+bs*3] );
+		a_1 = _mm256_load_pd( &A1[0+bs*3] );
+		a_2 = _mm256_load_pd( &A2[0+bs*3] );
+		b_0 = _mm256_permute2f128_pd( a_0, a_1, 0x21 );
+		b_1 = _mm256_permute2f128_pd( a_1, a_2, 0x21 );
+		c_0 = _mm256_load_pd( &B0[0+bs*3] );
+		c_1 = _mm256_load_pd( &B1[0+bs*3] );
+		b_0 = _mm256_mul_pd( alpha_0, b_0 );
+		b_1 = _mm256_mul_pd( alpha_0, b_1 );
+		b_0 = _mm256_add_pd ( c_0, b_0 );
+		b_1 = _mm256_add_pd ( c_1, b_1 );
+		_mm256_store_pd( &B0[0+bs*3], b_0 );
+		_mm256_store_pd( &B1[0+bs*3], b_1 );
+
+		A0 += 16;
+		A1 += 16;
+		A2 += 16;
+		B0 += 16;
+		B1 += 16;
+
+		}
+	for(; k<kmax; k++)
+		{
+
+		a_0 = _mm256_load_pd( &A0[0+bs*0] );
+		a_1 = _mm256_load_pd( &A1[0+bs*0] );
+		a_2 = _mm256_load_pd( &A2[0+bs*0] );
+		b_0 = _mm256_permute2f128_pd( a_0, a_1, 0x21 );
+		b_1 = _mm256_permute2f128_pd( a_1, a_2, 0x21 );
+		c_0 = _mm256_load_pd( &B0[0+bs*0] );
+		c_1 = _mm256_load_pd( &B1[0+bs*0] );
+		b_0 = _mm256_mul_pd( alpha_0, b_0 );
+		b_1 = _mm256_mul_pd( alpha_0, b_1 );
+		b_0 = _mm256_add_pd ( c_0, b_0 );
+		b_1 = _mm256_add_pd ( c_1, b_1 );
+		_mm256_store_pd( &B0[0+bs*0], b_0 );
+		_mm256_store_pd( &B1[0+bs*0], b_1 );
+
+		A0 += 4;
+		A1 += 4;
+		A2 += 4;
+		B0 += 4;
+		B1 += 4;
+
+		}
+
+	}
+
+
+
+// both A and B are aligned to 256-bit boundaries, 3 elements of A must be skipped
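+// the 3-row offset mirrors the 1-row case: the permute yields
+// { x2, x3, y0, y1 } and the shuffle then selects { x3, y0, y1, y2 },
+// i.e. the source rows shifted down by three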
+void kernel_dgead_8_3_lib4(int kmax, double alpha, double *A0, int sda, double *B0, int sdb)
+	{
+
+	if(kmax<=0)
+		return;
+
+	const int bs = 4;
+
+	double *A1 = A0 + bs*sda;
+	double *A2 = A1 + bs*sda;
+	double *B1 = B0 + bs*sdb;
+
+	__m256d
+		a_0, a_1, a_2,
+		b_0, b_1,
+		alpha_0, c_0, c_1;
+	
+	int k;
+
+	alpha_0 = _mm256_broadcast_sd( &alpha );
+
+	for(k=0; k<kmax-3; k+=4)
+		{
+
+		a_0 = _mm256_load_pd( &A0[0+bs*0] );
+		a_1 = _mm256_load_pd( &A1[0+bs*0] );
+		a_2 = _mm256_load_pd( &A2[0+bs*0] );
+		a_0 = _mm256_permute2f128_pd( a_0, a_1, 0x21 );
+		b_1 = _mm256_permute2f128_pd( a_1, a_2, 0x21 );
+		b_0 = _mm256_shuffle_pd( a_0, a_1, 0x5 );
+		b_1 = _mm256_shuffle_pd( b_1, a_2, 0x5 );
+		c_0 = _mm256_load_pd( &B0[0+bs*0] );
+		c_1 = _mm256_load_pd( &B1[0+bs*0] );
+		b_0 = _mm256_mul_pd( alpha_0, b_0 );
+		b_1 = _mm256_mul_pd( alpha_0, b_1 );
+		b_0 = _mm256_add_pd ( c_0, b_0 );
+		b_1 = _mm256_add_pd ( c_1, b_1 );
+		_mm256_store_pd( &B0[0+bs*0], b_0 );
+		_mm256_store_pd( &B1[0+bs*0], b_1 );
+
+		a_0 = _mm256_load_pd( &A0[0+bs*1] );
+		a_1 = _mm256_load_pd( &A1[0+bs*1] );
+		a_2 = _mm256_load_pd( &A2[0+bs*1] );
+		a_0 = _mm256_permute2f128_pd( a_0, a_1, 0x21 );
+		b_1 = _mm256_permute2f128_pd( a_1, a_2, 0x21 );
+		b_0 = _mm256_shuffle_pd( a_0, a_1, 0x5 );
+		b_1 = _mm256_shuffle_pd( b_1, a_2, 0x5 );
+		c_0 = _mm256_load_pd( &B0[0+bs*1] );
+		c_1 = _mm256_load_pd( &B1[0+bs*1] );
+		b_0 = _mm256_mul_pd( alpha_0, b_0 );
+		b_1 = _mm256_mul_pd( alpha_0, b_1 );
+		b_0 = _mm256_add_pd ( c_0, b_0 );
+		b_1 = _mm256_add_pd ( c_1, b_1 );
+		_mm256_store_pd( &B0[0+bs*1], b_0 );
+		_mm256_store_pd( &B1[0+bs*1], b_1 );
+
+		a_0 = _mm256_load_pd( &A0[0+bs*2] );
+		a_1 = _mm256_load_pd( &A1[0+bs*2] );
+		a_2 = _mm256_load_pd( &A2[0+bs*2] );
+		a_0 = _mm256_permute2f128_pd( a_0, a_1, 0x21 );
+		b_1 = _mm256_permute2f128_pd( a_1, a_2, 0x21 );
+		b_0 = _mm256_shuffle_pd( a_0, a_1, 0x5 );
+		b_1 = _mm256_shuffle_pd( b_1, a_2, 0x5 );
+		c_0 = _mm256_load_pd( &B0[0+bs*2] );
+		c_1 = _mm256_load_pd( &B1[0+bs*2] );
+		b_0 = _mm256_mul_pd( alpha_0, b_0 );
+		b_1 = _mm256_mul_pd( alpha_0, b_1 );
+		b_0 = _mm256_add_pd ( c_0, b_0 );
+		b_1 = _mm256_add_pd ( c_1, b_1 );
+		_mm256_store_pd( &B0[0+bs*2], b_0 );
+		_mm256_store_pd( &B1[0+bs*2], b_1 );
+
+		a_0 = _mm256_load_pd( &A0[0+bs*3] );
+		a_1 = _mm256_load_pd( &A1[0+bs*3] );
+		a_2 = _mm256_load_pd( &A2[0+bs*3] );
+		a_0 = _mm256_permute2f128_pd( a_0, a_1, 0x21 );
+		b_1 = _mm256_permute2f128_pd( a_1, a_2, 0x21 );
+		b_0 = _mm256_shuffle_pd( a_0, a_1, 0x5 );
+		b_1 = _mm256_shuffle_pd( b_1, a_2, 0x5 );
+		c_0 = _mm256_load_pd( &B0[0+bs*3] );
+		c_1 = _mm256_load_pd( &B1[0+bs*3] );
+		b_0 = _mm256_mul_pd( alpha_0, b_0 );
+		b_1 = _mm256_mul_pd( alpha_0, b_1 );
+		b_0 = _mm256_add_pd ( c_0, b_0 );
+		b_1 = _mm256_add_pd ( c_1, b_1 );
+		_mm256_store_pd( &B0[0+bs*3], b_0 );
+		_mm256_store_pd( &B1[0+bs*3], b_1 );
+
+		A0 += 16;
+		A1 += 16;
+		A2 += 16;
+		B0 += 16;
+		B1 += 16;
+
+		}
+	for(; k<kmax; k++)
+		{
+
+		a_0 = _mm256_load_pd( &A0[0+bs*0] );
+		a_1 = _mm256_load_pd( &A1[0+bs*0] );
+		a_2 = _mm256_load_pd( &A2[0+bs*0] );
+		a_0 = _mm256_permute2f128_pd( a_0, a_1, 0x21 );
+		b_1 = _mm256_permute2f128_pd( a_1, a_2, 0x21 );
+		b_0 = _mm256_shuffle_pd( a_0, a_1, 0x5 );
+		b_1 = _mm256_shuffle_pd( b_1, a_2, 0x5 );
+		c_0 = _mm256_load_pd( &B0[0+bs*0] );
+		c_1 = _mm256_load_pd( &B1[0+bs*0] );
+		b_0 = _mm256_mul_pd( alpha_0, b_0 );
+		b_1 = _mm256_mul_pd( alpha_0, b_1 );
+		b_0 = _mm256_add_pd ( c_0, b_0 );
+		b_1 = _mm256_add_pd ( c_1, b_1 );
+		_mm256_store_pd( &B0[0+bs*0], b_0 );
+		_mm256_store_pd( &B1[0+bs*0], b_1 );
+
+		A0 += 4;
+		A1 += 4;
+		A2 += 4;
+		B0 += 4;
+		B1 += 4;
+
+		}
+
+	}
+
+
+
+// both A and B are aligned to 256-bit boundaries
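+// aligned case: plain B += alpha*A on a 4-row panel; a scalar reference
+// (for illustration only) of what the intrinsics below compute:
+//   for(k=0; k<kmax; k++)
+//     for(i=0; i<4; i++)
+//       B[i+bs*k] += alpha * A[i+bs*k];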
+void kernel_dgead_4_0_lib4(int kmax, double alpha, double *A, double *B)
+	{
+
+	if(kmax<=0)
+		return;
+
+	const int bs = 4;
+
+	__m256d
+		a_0, c_0, alpha_0;
+	
+	int k;
+
+	alpha_0 = _mm256_broadcast_sd( &alpha );
+
+	for(k=0; k<kmax-3; k+=4)
+		{
+
+		a_0 = _mm256_load_pd( &A[0+bs*0] );
+		c_0 = _mm256_load_pd( &B[0+bs*0] );
+		a_0 = _mm256_mul_pd( alpha_0, a_0 );
+		a_0 = _mm256_add_pd( c_0, a_0 );
+		_mm256_store_pd( &B[0+bs*0], a_0 );
+
+		a_0 = _mm256_load_pd( &A[0+bs*1] );
+		c_0 = _mm256_load_pd( &B[0+bs*1] );
+		a_0 = _mm256_mul_pd( alpha_0, a_0 );
+		a_0 = _mm256_add_pd( c_0, a_0 );
+		_mm256_store_pd( &B[0+bs*1], a_0 );
+
+		a_0 = _mm256_load_pd( &A[0+bs*2] );
+		c_0 = _mm256_load_pd( &B[0+bs*2] );
+		a_0 = _mm256_mul_pd( alpha_0, a_0 );
+		a_0 = _mm256_add_pd( c_0, a_0 );
+		_mm256_store_pd( &B[0+bs*2], a_0 );
+
+		a_0 = _mm256_load_pd( &A[0+bs*3] );
+		c_0 = _mm256_load_pd( &B[0+bs*3] );
+		a_0 = _mm256_mul_pd( alpha_0, a_0 );
+		a_0 = _mm256_add_pd( c_0, a_0 );
+		_mm256_store_pd( &B[0+bs*3], a_0 );
+
+		A += 16;
+		B += 16;
+
+		}
+	for(; k<kmax; k++)
+		{
+
+		a_0 = _mm256_load_pd( &A[0+bs*0] );
+		c_0 = _mm256_load_pd( &B[0+bs*0] );
+		a_0 = _mm256_mul_pd( alpha_0, a_0 );
+		a_0 = _mm256_add_pd( c_0, a_0 );
+		_mm256_store_pd( &B[0+bs*0], a_0 );
+
+		A += 4;
+		B += 4;
+
+		}
+
+	}
+
+
+
+// both A and B are aligned to 256-bit boundaries, 1 element of A must be skipped
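+// same 1-row shift as in kernel_dgead_8_1_lib4 above, applied to a single
+// 4-row output panel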
+void kernel_dgead_4_1_lib4(int kmax, double alpha, double *A0, int sda, double *B)
+	{
+
+	if(kmax<=0)
+		return;
+
+	const int bs = 4;
+
+	double *A1 = A0 + bs*sda;
+
+	__m256d
+		a_0, a_1,
+		b_0,
+		alpha_0, c_0;
+	
+	int k;
+
+	alpha_0 = _mm256_broadcast_sd( &alpha );
+
+	for(k=0; k<kmax-3; k+=4)
+		{
+
+		a_0 = _mm256_load_pd( &A0[0+bs*0] );
+		a_1 = _mm256_load_pd( &A1[0+bs*0] );
+		a_1 = _mm256_permute2f128_pd( a_0, a_1, 0x21 );
+		b_0 = _mm256_shuffle_pd( a_0, a_1, 0x5 );
+		c_0 = _mm256_load_pd( &B[0+bs*0] );
+		b_0 = _mm256_mul_pd( alpha_0, b_0 );
+		b_0 = _mm256_add_pd( c_0, b_0 );
+		_mm256_store_pd( &B[0+bs*0], b_0 );
+
+		a_0 = _mm256_load_pd( &A0[0+bs*1] );
+		a_1 = _mm256_load_pd( &A1[0+bs*1] );
+		a_1 = _mm256_permute2f128_pd( a_0, a_1, 0x21 );
+		b_0 = _mm256_shuffle_pd( a_0, a_1, 0x5 );
+		c_0 = _mm256_load_pd( &B[0+bs*1] );
+		b_0 = _mm256_mul_pd( alpha_0, b_0 );
+		b_0 = _mm256_add_pd( c_0, b_0 );
+		_mm256_store_pd( &B[0+bs*1], b_0 );
+
+		a_0 = _mm256_load_pd( &A0[0+bs*2] );
+		a_1 = _mm256_load_pd( &A1[0+bs*2] );
+		a_1 = _mm256_permute2f128_pd( a_0, a_1, 0x21 );
+		b_0 = _mm256_shuffle_pd( a_0, a_1, 0x5 );
+		c_0 = _mm256_load_pd( &B[0+bs*2] );
+		b_0 = _mm256_mul_pd( alpha_0, b_0 );
+		b_0 = _mm256_add_pd( c_0, b_0 );
+		_mm256_store_pd( &B[0+bs*2], b_0 );
+
+		a_0 = _mm256_load_pd( &A0[0+bs*3] );
+		a_1 = _mm256_load_pd( &A1[0+bs*3] );
+		a_1 = _mm256_permute2f128_pd( a_0, a_1, 0x21 );
+		b_0 = _mm256_shuffle_pd( a_0, a_1, 0x5 );
+		c_0 = _mm256_load_pd( &B[0+bs*3] );
+		b_0 = _mm256_mul_pd( alpha_0, b_0 );
+		b_0 = _mm256_add_pd( c_0, b_0 );
+		_mm256_store_pd( &B[0+bs*3], b_0 );
+
+		A0 += 16;
+		A1 += 16;
+		B  += 16;
+
+		}
+	for(; k<kmax; k++)
+		{
+
+		a_0 = _mm256_load_pd( &A0[0+bs*0] );
+		a_1 = _mm256_load_pd( &A1[0+bs*0] );
+		a_1 = _mm256_permute2f128_pd( a_0, a_1, 0x21 );
+		b_0 = _mm256_shuffle_pd( a_0, a_1, 0x5 );
+		c_0 = _mm256_load_pd( &B[0+bs*0] );
+		b_0 = _mm256_mul_pd( alpha_0, b_0 );
+		b_0 = _mm256_add_pd( c_0, b_0 );
+		_mm256_store_pd( &B[0+bs*0], b_0 );
+
+		A0 += 4;
+		A1 += 4;
+		B  += 4;
+
+		}
+
+	}
+
+
+
+// both A and B are aligned to 256-bit boundaries, 2 elements of A must be skipped
+void kernel_dgead_4_2_lib4(int kmax, double alpha, double *A0, int sda, double *B)
+	{
+
+	if(kmax<=0)
+		return;
+
+	const int bs = 4;
+
+	double *A1 = A0 + bs*sda;
+
+	__m256d
+		a_0, a_1,
+		b_0,
+		alpha_0, c_0;
+	
+	int k;
+
+	alpha_0 = _mm256_broadcast_sd( &alpha );
+
+	for(k=0; k<kmax-3; k+=4)
+		{
+
+		a_0 = _mm256_load_pd( &A0[0+bs*0] );
+		a_1 = _mm256_load_pd( &A1[0+bs*0] );
+		b_0 = _mm256_permute2f128_pd( a_0, a_1, 0x21 );
+		c_0 = _mm256_load_pd( &B[0+bs*0] );
+		b_0 = _mm256_mul_pd( alpha_0, b_0 );
+		b_0 = _mm256_add_pd( c_0, b_0 );
+		_mm256_store_pd( &B[0+bs*0], b_0 );
+
+		a_0 = _mm256_load_pd( &A0[0+bs*1] );
+		a_1 = _mm256_load_pd( &A1[0+bs*1] );
+		b_0 = _mm256_permute2f128_pd( a_0, a_1, 0x21 );
+		c_0 = _mm256_load_pd( &B[0+bs*1] );
+		b_0 = _mm256_mul_pd( alpha_0, b_0 );
+		b_0 = _mm256_add_pd( c_0, b_0 );
+		_mm256_store_pd( &B[0+bs*1], b_0 );
+
+		a_0 = _mm256_load_pd( &A0[0+bs*2] );
+		a_1 = _mm256_load_pd( &A1[0+bs*2] );
+		b_0 = _mm256_permute2f128_pd( a_0, a_1, 0x21 );
+		c_0 = _mm256_load_pd( &B[0+bs*2] );
+		b_0 = _mm256_mul_pd( alpha_0, b_0 );
+		b_0 = _mm256_add_pd( c_0, b_0 );
+		_mm256_store_pd( &B[0+bs*2], b_0 );
+
+		a_0 = _mm256_load_pd( &A0[0+bs*3] );
+		a_1 = _mm256_load_pd( &A1[0+bs*3] );
+		b_0 = _mm256_permute2f128_pd( a_0, a_1, 0x21 );
+		c_0 = _mm256_load_pd( &B[0+bs*3] );
+		b_0 = _mm256_mul_pd( alpha_0, b_0 );
+		b_0 = _mm256_add_pd( c_0, b_0 );
+		_mm256_store_pd( &B[0+bs*3], b_0 );
+
+		A0 += 16;
+		A1 += 16;
+		B  += 16;
+
+		}
+	for(; k<kmax; k++)
+		{
+
+		a_0 = _mm256_load_pd( &A0[0+bs*0] );
+		a_1 = _mm256_load_pd( &A1[0+bs*0] );
+		b_0 = _mm256_permute2f128_pd( a_0, a_1, 0x21 );
+		c_0 = _mm256_load_pd( &B[0+bs*0] );
+		b_0 = _mm256_mul_pd( alpha_0, b_0 );
+		b_0 = _mm256_add_pd( c_0, b_0 );
+		_mm256_store_pd( &B[0+bs*0], b_0 );
+
+		A0 += 4;
+		A1 += 4;
+		B  += 4;
+
+		}
+
+	}
+
+
+
+// both A and B are aligned to 256-bit boundaries, 3 elements of A must be skipped
+void kernel_dgead_4_3_lib4(int kmax, double alpha, double *A0, int sda, double *B)
+	{
+
+	if(kmax<=0)
+		return;
+
+	const int bs = 4;
+
+	double *A1 = A0 + bs*sda;
+
+	__m256d
+		a_0, a_1,
+		b_0,
+		alpha_0, c_0;
+	
+	int k;
+
+	alpha_0 = _mm256_broadcast_sd( &alpha );
+
+	for(k=0; k<kmax-3; k+=4)
+		{
+
+		a_0 = _mm256_load_pd( &A0[0+bs*0] );
+		a_1 = _mm256_load_pd( &A1[0+bs*0] );
+		a_0 = _mm256_permute2f128_pd( a_0, a_1, 0x21 );
+		b_0 = _mm256_shuffle_pd( a_0, a_1, 0x5 );
+		c_0 = _mm256_load_pd( &B[0+bs*0] );
+		b_0 = _mm256_mul_pd( alpha_0, b_0 );
+		b_0 = _mm256_add_pd( c_0, b_0 );
+		_mm256_store_pd( &B[0+bs*0], b_0 );
+
+		a_0 = _mm256_load_pd( &A0[0+bs*1] );
+		a_1 = _mm256_load_pd( &A1[0+bs*1] );
+		a_0 = _mm256_permute2f128_pd( a_0, a_1, 0x21 );
+		b_0 = _mm256_shuffle_pd( a_0, a_1, 0x5 );
+		c_0 = _mm256_load_pd( &B[0+bs*1] );
+		b_0 = _mm256_mul_pd( alpha_0, b_0 );
+		b_0 = _mm256_add_pd( c_0, b_0 );
+		_mm256_store_pd( &B[0+bs*1], b_0 );
+
+		a_0 = _mm256_load_pd( &A0[0+bs*2] );
+		a_1 = _mm256_load_pd( &A1[0+bs*2] );
+		a_0 = _mm256_permute2f128_pd( a_0, a_1, 0x21 );
+		b_0 = _mm256_shuffle_pd( a_0, a_1, 0x5 );
+		c_0 = _mm256_load_pd( &B[0+bs*2] );
+		b_0 = _mm256_mul_pd( alpha_0, b_0 );
+		b_0 = _mm256_add_pd( c_0, b_0 );
+		_mm256_store_pd( &B[0+bs*2], b_0 );
+
+		a_0 = _mm256_load_pd( &A0[0+bs*3] );
+		a_1 = _mm256_load_pd( &A1[0+bs*3] );
+		a_0 = _mm256_permute2f128_pd( a_0, a_1, 0x21 );
+		b_0 = _mm256_shuffle_pd( a_0, a_1, 0x5 );
+		c_0 = _mm256_load_pd( &B[0+bs*3] );
+		b_0 = _mm256_mul_pd( alpha_0, b_0 );
+		b_0 = _mm256_add_pd( c_0, b_0 );
+		_mm256_store_pd( &B[0+bs*3], b_0 );
+
+		A0 += 16;
+		A1 += 16;
+		B  += 16;
+
+		}
+	for(; k<kmax; k++)
+		{
+
+		a_0 = _mm256_load_pd( &A0[0+bs*0] );
+		a_1 = _mm256_load_pd( &A1[0+bs*0] );
+		a_0 = _mm256_permute2f128_pd( a_0, a_1, 0x21 );
+		b_0 = _mm256_shuffle_pd( a_0, a_1, 0x5 );
+		c_0 = _mm256_load_pd( &B[0+bs*0] );
+		b_0 = _mm256_mul_pd( alpha_0, b_0 );
+		b_0 = _mm256_add_pd( c_0, b_0 );
+		_mm256_store_pd( &B[0+bs*0], b_0 );
+
+		A0 += 4;
+		A1 += 4;
+		B  += 4;
+
+		}
+
+	}
+
+
+
+// both A and B are aligned to 64-bit boundaries
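+// 3 rows are handled as a 128-bit pair (rows 0-1) plus a scalar (row 2)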
+void kernel_dgead_3_0_lib4(int kmax, double alpha, double *A, double *B)
+	{
+
+	if(kmax<=0)
+		return;
+
+	const int bs = 4;
+
+	__m128d
+		a_0, a_1,
+		alpha_0, c_0, c_1;
+	
+	int k;
+
+	alpha_0 = _mm_loaddup_pd( &alpha );
+
+	for(k=0; k<kmax-3; k+=4)
+		{
+
+		a_0 = _mm_loadu_pd( &A[0+bs*0] );
+		a_1 = _mm_load_sd( &A[2+bs*0] );
+		c_0 = _mm_loadu_pd( &B[0+bs*0] );
+		c_1 = _mm_load_sd( &B[2+bs*0] );
+		a_0 = _mm_mul_pd( alpha_0, a_0 );
+		a_1 = _mm_mul_sd( alpha_0, a_1 );
+		a_0 = _mm_add_pd( c_0, a_0 );
+		a_1 = _mm_add_sd( c_1, a_1 );
+		_mm_storeu_pd( &B[0+bs*0], a_0 );
+		_mm_store_sd( &B[2+bs*0], a_1 );
+
+		a_0 = _mm_loadu_pd( &A[0+bs*1] );
+		a_1 = _mm_load_sd( &A[2+bs*1] );
+		c_0 = _mm_loadu_pd( &B[0+bs*1] );
+		c_1 = _mm_load_sd( &B[2+bs*1] );
+		a_0 = _mm_mul_pd( alpha_0, a_0 );
+		a_1 = _mm_mul_sd( alpha_0, a_1 );
+		a_0 = _mm_add_pd( c_0, a_0 );
+		a_1 = _mm_add_sd( c_1, a_1 );
+		_mm_storeu_pd( &B[0+bs*1], a_0 );
+		_mm_store_sd( &B[2+bs*1], a_1 );
+
+		a_0 = _mm_loadu_pd( &A[0+bs*2] );
+		a_1 = _mm_load_sd( &A[2+bs*2] );
+		c_0 = _mm_loadu_pd( &B[0+bs*2] );
+		c_1 = _mm_load_sd( &B[2+bs*2] );
+		a_0 = _mm_mul_pd( alpha_0, a_0 );
+		a_1 = _mm_mul_sd( alpha_0, a_1 );
+		a_0 = _mm_add_pd( c_0, a_0 );
+		a_1 = _mm_add_sd( c_1, a_1 );
+		_mm_storeu_pd( &B[0+bs*2], a_0 );
+		_mm_store_sd( &B[2+bs*2], a_1 );
+
+		a_0 = _mm_loadu_pd( &A[0+bs*3] );
+		a_1 = _mm_load_sd( &A[2+bs*3] );
+		c_0 = _mm_loadu_pd( &B[0+bs*3] );
+		c_1 = _mm_load_sd( &B[2+bs*3] );
+		a_0 = _mm_mul_pd( alpha_0, a_0 );
+		a_1 = _mm_mul_sd( alpha_0, a_1 );
+		a_0 = _mm_add_pd( c_0, a_0 );
+		a_1 = _mm_add_sd( c_1, a_1 );
+		_mm_storeu_pd( &B[0+bs*3], a_0 );
+		_mm_store_sd( &B[2+bs*3], a_1 );
+
+		A += 16;
+		B += 16;
+
+		}
+	for(; k<kmax; k++)
+		{
+
+		a_0 = _mm_loadu_pd( &A[0+bs*0] );
+		a_1 = _mm_load_sd( &A[2+bs*0] );
+		c_0 = _mm_loadu_pd( &B[0+bs*0] );
+		c_1 = _mm_load_sd( &B[2+bs*0] );
+		a_0 = _mm_mul_pd( alpha_0, a_0 );
+		a_1 = _mm_mul_sd( alpha_0, a_1 );
+		a_0 = _mm_add_pd( c_0, a_0 );
+		a_1 = _mm_add_sd( c_1, a_1 );
+		_mm_storeu_pd( &B[0+bs*0], a_0 );
+		_mm_store_sd( &B[2+bs*0], a_1 );
+
+		A += 4;
+		B += 4;
+
+		}
+
+	}
+
+
+
+// both A and B are aligned to 256-bit boundaries, 2 elements of A must be skipped
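+// rows 2-3 of the upper panel plus row 0 of the lower one form the three
+// output rows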
+void kernel_dgead_3_2_lib4(int kmax, double alpha, double *A0, int sda, double *B)
+	{
+
+	if(kmax<=0)
+		return;
+
+	const int bs = 4;
+
+	double *A1 = A0 + bs*sda;
+
+	__m128d
+		a_0, a_1,
+		alpha_0, c_0, c_1;
+	
+	int k;
+
+	alpha_0 = _mm_loaddup_pd( &alpha );
+
+	for(k=0; k<kmax-3; k+=4)
+		{
+
+		a_0 = _mm_loadu_pd( &A0[2+bs*0] );
+		a_1 = _mm_load_sd( &A1[0+bs*0] );
+		c_0 = _mm_loadu_pd( &B[0+bs*0] );
+		c_1 = _mm_load_sd( &B[2+bs*0] );
+		a_0 = _mm_mul_pd( alpha_0, a_0 );
+		a_1 = _mm_mul_sd( alpha_0, a_1 );
+		a_0 = _mm_add_pd( c_0, a_0 );
+		a_1 = _mm_add_sd( c_1, a_1 );
+		_mm_storeu_pd( &B[0+bs*0], a_0 );
+		_mm_store_sd( &B[2+bs*0], a_1 );
+
+		a_0 = _mm_loadu_pd( &A0[2+bs*1] );
+		a_1 = _mm_load_sd( &A1[0+bs*1] );
+		c_0 = _mm_loadu_pd( &B[0+bs*1] );
+		c_1 = _mm_load_sd( &B[2+bs*1] );
+		a_0 = _mm_mul_pd( alpha_0, a_0 );
+		a_1 = _mm_mul_sd( alpha_0, a_1 );
+		a_0 = _mm_add_pd( c_0, a_0 );
+		a_1 = _mm_add_sd( c_1, a_1 );
+		_mm_storeu_pd( &B[0+bs*1], a_0 );
+		_mm_store_sd( &B[2+bs*1], a_1 );
+
+		a_0 = _mm_loadu_pd( &A0[2+bs*2] );
+		a_1 = _mm_load_sd( &A1[0+bs*2] );
+		c_0 = _mm_loadu_pd( &B[0+bs*2] );
+		c_1 = _mm_load_sd( &B[2+bs*2] );
+		a_0 = _mm_mul_pd( alpha_0, a_0 );
+		a_1 = _mm_mul_sd( alpha_0, a_1 );
+		a_0 = _mm_add_pd( c_0, a_0 );
+		a_1 = _mm_add_sd( c_1, a_1 );
+		_mm_storeu_pd( &B[0+bs*2], a_0 );
+		_mm_store_sd( &B[2+bs*2], a_1 );
+
+		a_0 = _mm_loadu_pd( &A0[2+bs*3] );
+		a_1 = _mm_load_sd( &A1[0+bs*3] );
+		c_0 = _mm_loadu_pd( &B[0+bs*3] );
+		c_1 = _mm_load_sd( &B[2+bs*3] );
+		a_0 = _mm_mul_pd( alpha_0, a_0 );
+		a_1 = _mm_mul_sd( alpha_0, a_1 );
+		a_0 = _mm_add_pd( c_0, a_0 );
+		a_1 = _mm_add_sd( c_1, a_1 );
+		_mm_storeu_pd( &B[0+bs*3], a_0 );
+		_mm_store_sd( &B[2+bs*3], a_1 );
+
+		A0 += 16;
+		A1 += 16;
+		B  += 16;
+
+		}
+	for(; k<kmax; k++)
+		{
+
+		a_0 = _mm_loadu_pd( &A0[2+bs*0] );
+		a_1 = _mm_load_sd( &A1[0+bs*0] );
+		c_0 = _mm_loadu_pd( &B[0+bs*0] );
+		c_1 = _mm_load_sd( &B[2+bs*0] );
+		a_0 = _mm_mul_pd( alpha_0, a_0 );
+		a_1 = _mm_mul_sd( alpha_0, a_1 );
+		a_0 = _mm_add_pd( c_0, a_0 );
+		a_1 = _mm_add_sd( c_1, a_1 );
+		_mm_storeu_pd( &B[0+bs*0], a_0 );
+		_mm_store_sd( &B[2+bs*0], a_1 );
+
+		A0 += 4;
+		A1 += 4;
+		B  += 4;
+
+		}
+
+	}
+
+
+
+// both A and B are aligned to 256-bit boundaries, 3 elements of A must be skipped
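+// row 3 of the upper panel plus rows 0-1 of the lower one form the three
+// output rows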
+void kernel_dgead_3_3_lib4(int kmax, double alpha, double *A0, int sda, double *B)
+	{
+
+	if(kmax<=0)
+		return;
+
+	const int bs = 4;
+
+	double *A1 = A0 + bs*sda;
+
+	__m128d
+		a_0, a_1,
+		alpha_0, c_0, c_1;
+	
+	int k;
+
+	alpha_0 = _mm_loaddup_pd( &alpha );
+
+	for(k=0; k<kmax-3; k+=4)
+		{
+
+		a_0 = _mm_load_sd( &A0[3+bs*0] );
+		a_1 = _mm_loadu_pd( &A1[0+bs*0] );
+		c_0 = _mm_load_sd( &B[0+bs*0] );
+		c_1 = _mm_loadu_pd( &B[1+bs*0] );
+		a_0 = _mm_mul_sd( alpha_0, a_0 );
+		a_1 = _mm_mul_pd( alpha_0, a_1 );
+		a_0 = _mm_add_sd( c_0, a_0 );
+		a_1 = _mm_add_pd( c_1, a_1 );
+		_mm_store_sd( &B[0+bs*0], a_0 );
+		_mm_storeu_pd( &B[1+bs*0], a_1 );
+
+		a_0 = _mm_load_sd( &A0[3+bs*1] );
+		a_1 = _mm_loadu_pd( &A1[0+bs*1] );
+		c_0 = _mm_load_sd( &B[0+bs*1] );
+		c_1 = _mm_loadu_pd( &B[1+bs*1] );
+		a_0 = _mm_mul_sd( alpha_0, a_0 );
+		a_1 = _mm_mul_pd( alpha_0, a_1 );
+		a_0 = _mm_add_sd( c_0, a_0 );
+		a_1 = _mm_add_pd( c_1, a_1 );
+		_mm_store_sd( &B[0+bs*1], a_0 );
+		_mm_storeu_pd( &B[1+bs*1], a_1 );
+
+		a_0 = _mm_load_sd( &A0[3+bs*2] );
+		a_1 = _mm_loadu_pd( &A1[0+bs*2] );
+		c_0 = _mm_load_sd( &B[0+bs*2] );
+		c_1 = _mm_loadu_pd( &B[1+bs*2] );
+		a_0 = _mm_mul_sd( alpha_0, a_0 );
+		a_1 = _mm_mul_pd( alpha_0, a_1 );
+		a_0 = _mm_add_sd( c_0, a_0 );
+		a_1 = _mm_add_pd( c_1, a_1 );
+		_mm_store_sd( &B[0+bs*2], a_0 );
+		_mm_storeu_pd( &B[1+bs*2], a_1 );
+
+		a_0 = _mm_load_sd( &A0[3+bs*3] );
+		a_1 = _mm_loadu_pd( &A1[0+bs*3] );
+		c_0 = _mm_load_sd( &B[0+bs*3] );
+		c_1 = _mm_loadu_pd( &B[1+bs*3] );
+		a_0 = _mm_mul_sd( alpha_0, a_0 );
+		a_1 = _mm_mul_pd( alpha_0, a_1 );
+		a_0 = _mm_add_sd( c_0, a_0 );
+		a_1 = _mm_add_pd( c_1, a_1 );
+		_mm_store_sd( &B[0+bs*3], a_0 );
+		_mm_storeu_pd( &B[1+bs*3], a_1 );
+
+		A0 += 16;
+		A1 += 16;
+		B  += 16;
+
+		}
+	for(; k<kmax; k++)
+		{
+
+		a_0 = _mm_load_sd( &A0[3+bs*0] );
+		a_1 = _mm_loadu_pd( &A1[0+bs*0] );
+		c_0 = _mm_load_sd( &B[0+bs*0] );
+		c_1 = _mm_loadu_pd( &B[1+bs*0] );
+		a_0 = _mm_mul_sd( alpha_0, a_0 );
+		a_1 = _mm_mul_pd( alpha_0, a_1 );
+		a_0 = _mm_add_sd( c_0, a_0 );
+		a_1 = _mm_add_pd( c_1, a_1 );
+		_mm_store_sd( &B[0+bs*0], a_0 );
+		_mm_storeu_pd( &B[1+bs*0], a_1 );
+
+		A0 += 4;
+		A1 += 4;
+		B  += 4;
+
+		}
+
+	}
+
+
+
+// both A and B are aligned to 64-bit boundaries
+void kernel_dgead_2_0_lib4(int kmax, double alpha, double *A, double *B)
+	{
+
+	if(kmax<=0)
+		return;
+
+	const int bs = 4;
+
+	__m128d
+		a_0, c_0, alpha_0;
+	
+	int k;
+
+	alpha_0 = _mm_loaddup_pd( &alpha );
+
+	for(k=0; k<kmax-3; k+=4)
+		{
+
+		a_0 = _mm_loadu_pd( &A[0+bs*0] );
+		c_0 = _mm_loadu_pd( &B[0+bs*0] );
+		a_0 = _mm_mul_pd( alpha_0, a_0 );
+		a_0 = _mm_add_pd( c_0, a_0 );
+		_mm_storeu_pd( &B[0+bs*0], a_0 );
+
+		a_0 = _mm_loadu_pd( &A[0+bs*1] );
+		c_0 = _mm_loadu_pd( &B[0+bs*1] );
+		a_0 = _mm_mul_pd( alpha_0, a_0 );
+		a_0 = _mm_add_pd( c_0, a_0 );
+		_mm_storeu_pd( &B[0+bs*1], a_0 );
+
+		a_0 = _mm_loadu_pd( &A[0+bs*2] );
+		c_0 = _mm_loadu_pd( &B[0+bs*2] );
+		a_0 = _mm_mul_pd( alpha_0, a_0 );
+		a_0 = _mm_add_pd( c_0, a_0 );
+		_mm_storeu_pd( &B[0+bs*2], a_0 );
+
+		a_0 = _mm_loadu_pd( &A[0+bs*3] );
+		c_0 = _mm_loadu_pd( &B[0+bs*3] );
+		a_0 = _mm_mul_pd( alpha_0, a_0 );
+		a_0 = _mm_add_pd( c_0, a_0 );
+		_mm_storeu_pd( &B[0+bs*3], a_0 );
+
+		A += 16;
+		B += 16;
+
+		}
+	for(; k<kmax; k++)
+		{
+
+		a_0 = _mm_loadu_pd( &A[0+bs*0] );
+		c_0 = _mm_loadu_pd( &B[0+bs*0] );
+		a_0 = _mm_mul_pd( alpha_0, a_0 );
+		a_0 = _mm_add_pd( c_0, a_0 );
+		_mm_storeu_pd( &B[0+bs*0], a_0 );
+
+		A += 4;
+		B += 4;
+
+		}
+
+	}
+
+
+
+// both A and B are aligned to 128-bit boundaries, 3 elements of A must be skipped
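+// the two output rows straddle a panel boundary: _mm_load_sd picks row 3
+// of the upper panel and _mm_loadh_pd appends row 0 of the lower one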
+void kernel_dgead_2_3_lib4(int kmax, double alpha, double *A0, int sda, double *B)
+	{
+
+	if(kmax<=0)
+		return;
+
+	const int bs = 4;
+
+	double *A1 = A0 + bs*sda;
+
+	__m128d
+		a_0, c_0, alpha_0;
+	
+	int k;
+
+	alpha_0 = _mm_loaddup_pd( &alpha );
+
+	for(k=0; k<kmax-3; k+=4)
+		{
+
+		a_0 = _mm_load_sd( &A0[3+bs*0] );
+		a_0 = _mm_loadh_pd( a_0, &A1[0+bs*0] );
+		c_0 = _mm_loadu_pd( &B[0+bs*0] );
+		a_0 = _mm_mul_pd( alpha_0, a_0 );
+		a_0 = _mm_add_pd( c_0, a_0 );
+		_mm_storeu_pd( &B[0+bs*0], a_0 );
+
+		a_0 = _mm_load_sd( &A0[3+bs*1] );
+		a_0 = _mm_loadh_pd( a_0, &A1[0+bs*1] );
+		c_0 = _mm_loadu_pd( &B[0+bs*1] );
+		a_0 = _mm_mul_pd( alpha_0, a_0 );
+		a_0 = _mm_add_pd( c_0, a_0 );
+		_mm_storeu_pd( &B[0+bs*1], a_0 );
+
+		a_0 = _mm_load_sd( &A0[3+bs*2] );
+		a_0 = _mm_loadh_pd( a_0, &A1[0+bs*2] );
+		c_0 = _mm_loadu_pd( &B[0+bs*2] );
+		a_0 = _mm_mul_pd( alpha_0, a_0 );
+		a_0 = _mm_add_pd( c_0, a_0 );
+		_mm_storeu_pd( &B[0+bs*2], a_0 );
+
+		a_0 = _mm_load_sd( &A0[3+bs*3] );
+		a_0 = _mm_loadh_pd( a_0, &A1[0+bs*3] );
+		c_0 = _mm_loadu_pd( &B[0+bs*3] );
+		a_0 = _mm_mul_pd( alpha_0, a_0 );
+		a_0 = _mm_add_pd( c_0, a_0 );
+		_mm_storeu_pd( &B[0+bs*3], a_0 );
+
+		A0 += 16;
+		A1 += 16;
+		B  += 16;
+
+		}
+	for(; k<kmax; k++)
+		{
+
+		a_0 = _mm_load_sd( &A0[3+bs*0] );
+		a_0 = _mm_loadh_pd( a_0, &A1[0+bs*0] );
+		c_0 = _mm_loadu_pd( &B[0+bs*0] );
+		a_0 = _mm_mul_pd( alpha_0, a_0 );
+		a_0 = _mm_add_pd( c_0, a_0 );
+		_mm_storeu_pd( &B[0+bs*0], a_0 );
+
+		A0 += 4;
+		A1 += 4;
+		B  += 4;
+
+		}
+
+	}
+
+
+
+// both A and B are aligned to 64-bit boundaries
+void kernel_dgead_1_0_lib4(int kmax, double alpha, double *A, double *B)
+	{
+
+	if(kmax<=0)
+		return;
+
+	const int bs = 4;
+
+	__m128d
+		a_0, c_0, alpha_0;
+	
+	int k;
+
+	alpha_0 = _mm_load_sd( &alpha );
+
+	for(k=0; k<kmax-3; k+=4)
+		{
+
+		a_0 = _mm_load_sd( &A[0+bs*0] );
+		c_0 = _mm_load_sd( &B[0+bs*0] );
+		a_0 = _mm_mul_sd( alpha_0, a_0 );
+		a_0 = _mm_add_sd( c_0, a_0 );
+		_mm_store_sd( &B[0+bs*0], a_0 );
+
+		a_0 = _mm_load_sd( &A[0+bs*1] );
+		c_0 = _mm_load_sd( &B[0+bs*1] );
+		a_0 = _mm_mul_sd( alpha_0, a_0 );
+		a_0 = _mm_add_sd( c_0, a_0 );
+		_mm_store_sd( &B[0+bs*1], a_0 );
+
+		a_0 = _mm_load_sd( &A[0+bs*2] );
+		c_0 = _mm_load_sd( &B[0+bs*2] );
+		a_0 = _mm_mul_sd( alpha_0, a_0 );
+		a_0 = _mm_add_sd( c_0, a_0 );
+		_mm_store_sd( &B[0+bs*2], a_0 );
+
+		a_0 = _mm_load_sd( &A[0+bs*3] );
+		c_0 = _mm_load_sd( &B[0+bs*3] );
+		a_0 = _mm_mul_sd( alpha_0, a_0 );
+		a_0 = _mm_add_sd( c_0, a_0 );
+		_mm_store_sd( &B[0+bs*3], a_0 );
+
+		A += 16;
+		B += 16;
+
+		}
+	for(; k<kmax; k++)
+		{
+
+		a_0 = _mm_load_sd( &A[0+bs*0] );
+		c_0 = _mm_load_sd( &B[0+bs*0] );
+		a_0 = _mm_mul_sd( alpha_0, a_0 );
+		a_0 = _mm_add_sd( c_0, a_0 );
+		_mm_store_sd( &B[0+bs*0], a_0 );
+
+		A += 4;
+		B += 4;
+
+		}
+
+	}
+
+
+
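+// set all elements of a 4-row panel to the constant alpha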
+void kernel_dgeset_4_lib4(int kmax, double alpha, double *A)
+	{
+
+	int k;
+
+	__m256d 
+		a0;
+
+	a0 = _mm256_broadcast_sd( &alpha );
+
+	for(k=0; k<kmax-3; k+=4)
+		{
+
+		_mm256_store_pd( &A[0], a0 );
+		_mm256_store_pd( &A[4], a0 );
+		_mm256_store_pd( &A[8], a0 );
+		_mm256_store_pd( &A[12], a0 );
+
+		A += 16;
+
+		}	
+	for(; k<kmax; k++)
+		{
+
+		_mm256_store_pd( &A[0], a0 );
+
+		A += 4;
+
+		}
+	
+	}
+
+
+// set a lower triangular panel to alpha: kmax full 4-wide columns, then
+// the final 4x4 lower triangle
+void kernel_dtrset_4_lib4(int kmax, double alpha, double *A)
+	{
+
+	int k;
+
+	__m256d 
+		a0;
+
+	a0 = _mm256_broadcast_sd( &alpha );
+
+	for(k=0; k<kmax-3; k+=4)
+		{
+
+		_mm256_store_pd( &A[0], a0 );
+		_mm256_store_pd( &A[4], a0 );
+		_mm256_store_pd( &A[8], a0 );
+		_mm256_store_pd( &A[12], a0 );
+
+		A += 16;
+
+		}	
+	for(; k<kmax; k++)
+		{
+
+		_mm256_store_pd( &A[0], a0 );
+
+		A += 4;
+
+		}
+	
+	// final 4x4 triangle
+	_mm256_store_pd( &A[0], a0 );
+
+	_mm_store_sd( &A[5], _mm256_castpd256_pd128( a0 ) );
+	_mm_store_pd( &A[6], _mm256_castpd256_pd128( a0 ) );
+	
+	_mm_store_pd( &A[10], _mm256_castpd256_pd128( a0 ) );
+
+	_mm_store_sd( &A[15], _mm256_castpd256_pd128( a0 ) );
+
+	}
+
+
+
diff --git a/auxiliary/avx/kernel_dgetr_lib4.c b/auxiliary/avx/kernel_dgetr_lib4.c
new file mode 100644
index 0000000..29d095b
--- /dev/null
+++ b/auxiliary/avx/kernel_dgetr_lib4.c
@@ -0,0 +1,490 @@
+/**************************************************************************************************
+*                                                                                                 *
+* This file is part of BLASFEO.                                                                   *
+*                                                                                                 *
+* BLASFEO -- BLAS For Embedded Optimization.                                                      *
+* Copyright (C) 2016-2017 by Gianluca Frison.                                                     *
+* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl.              *
+* All rights reserved.                                                                            *
+*                                                                                                 *
+* HPMPC is free software; you can redistribute it and/or                                          *
+* modify it under the terms of the GNU Lesser General Public                                      *
+* License as published by the Free Software Foundation; either                                    *
+* version 2.1 of the License, or (at your option) any later version.                              *
+*                                                                                                 *
+* HPMPC is distributed in the hope that it will be useful,                                        *
+* but WITHOUT ANY WARRANTY; without even the implied warranty of                                  *
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.                                            *
+* See the GNU Lesser General Public License for more details.                                     *
+*                                                                                                 *
+* You should have received a copy of the GNU Lesser General Public                                *
+* License along with HPMPC; if not, write to the Free Software                                    *
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA                  *
+*                                                                                                 *
+* Author: Gianluca Frison, giaf (at) dtu.dk                                                       *
+*                          gianluca.frison (at) imtek.uni-freiburg.de                             *
+*                                                                                                 *
+**************************************************************************************************/
+
+#include <mmintrin.h>
+#include <xmmintrin.h>  // SSE
+#include <emmintrin.h>  // SSE2
+#include <pmmintrin.h>  // SSE3
+#include <smmintrin.h>  // SSE4
+#include <immintrin.h>  // AVX
+
+
+
+// transpose of a general matrix, read along panels, write across panels
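+// kna is the number of rows of C still in the current panel: they are
+// written one at a time before the vectorized path, which transposes each
+// 4x4 block with unpacklo/unpackhi pairs followed by permute2f128 lane
+// swaps (see the inline lane comments below)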
+void kernel_dgetr_4_lib4(int tri, int kmax, int kna, double alpha, double *A, double *C, int sdc)
+	{
+
+	if(tri==1)
+		{
+		// A is lower triangular, C is upper triangular
+		// kmax+1 4-wide + end 3x3 triangle
+
+		kmax += 1;
+		}
+
+	const int bs = 4;
+	
+	__m256d
+		alph,
+		v0, v1, v2, v3,
+		v4, v5, v6, v7;
+	
+	alph = _mm256_broadcast_sd( &alpha );
+	
+	int k;
+
+	k = 0;
+
+	if(kmax<kna)
+		goto cleanup_loop;
+
+	if(kna>0)
+		{
+		for( ; k<kna; k++)
+			{
+			C[0+bs*0] = alpha * A[0+bs*0];
+			C[0+bs*1] = alpha * A[1+bs*0];
+			C[0+bs*2] = alpha * A[2+bs*0];
+			C[0+bs*3] = alpha * A[3+bs*0];
+
+			C += 1;
+			A += bs;
+			}
+		C += bs*(sdc-1);
+		}
+
+	for( ; k<kmax-7; k+=8)
+		{
+
+		v0 = _mm256_load_pd( &A[0+bs*0] ); // 00 10 20 30
+		v1 = _mm256_load_pd( &A[0+bs*1] ); // 01 11 21 31
+		v4 = _mm256_unpacklo_pd( v0, v1 ); // 00 01 20 21
+		v5 = _mm256_unpackhi_pd( v0, v1 ); // 10 11 30 31
+		v2 = _mm256_load_pd( &A[0+bs*2] ); // 02 12 22 32
+		v3 = _mm256_load_pd( &A[0+bs*3] ); // 03 13 23 33
+		v6 = _mm256_unpacklo_pd( v2, v3 ); // 02 03 22 23
+		v7 = _mm256_unpackhi_pd( v2, v3 ); // 12 13 32 33
+		
+		A += bs*bs;
+
+		v0 = _mm256_permute2f128_pd( v4, v6, 0x20 ); // 00 01 02 03
+		v0 = _mm256_mul_pd( v0, alph );
+		_mm256_store_pd( &C[0+bs*0], v0 );
+		v2 = _mm256_permute2f128_pd( v4, v6, 0x31 ); // 20 21 22 23
+		v2 = _mm256_mul_pd( v2, alph );
+		_mm256_store_pd( &C[0+bs*2], v2 );
+		v1 = _mm256_permute2f128_pd( v5, v7, 0x20 ); // 10 11 12 13
+		v1 = _mm256_mul_pd( v1, alph );
+		_mm256_store_pd( &C[0+bs*1], v1 );
+		v3 = _mm256_permute2f128_pd( v5, v7, 0x31 ); // 30 31 32 33
+		v3 = _mm256_mul_pd( v3, alph );
+		_mm256_store_pd( &C[0+bs*3], v3 );
+
+		C += bs*sdc;
+
+		v0 = _mm256_load_pd( &A[0+bs*0] ); // 00 10 20 30
+		v1 = _mm256_load_pd( &A[0+bs*1] ); // 01 11 21 31
+		v4 = _mm256_unpacklo_pd( v0, v1 ); // 00 01 20 21
+		v5 = _mm256_unpackhi_pd( v0, v1 ); // 10 11 30 31
+		v2 = _mm256_load_pd( &A[0+bs*2] ); // 02 12 22 32
+		v3 = _mm256_load_pd( &A[0+bs*3] ); // 03 13 23 33
+		v6 = _mm256_unpacklo_pd( v2, v3 ); // 02 03 22 23
+		v7 = _mm256_unpackhi_pd( v2, v3 ); // 12 13 32 33
+		
+		A += bs*bs;
+
+		v0 = _mm256_permute2f128_pd( v4, v6, 0x20 ); // 00 01 02 03
+		v0 = _mm256_mul_pd( v0, alph );
+		_mm256_store_pd( &C[0+bs*0], v0 );
+		v2 = _mm256_permute2f128_pd( v4, v6, 0x31 ); // 20 21 22 23
+		v2 = _mm256_mul_pd( v2, alph );
+		_mm256_store_pd( &C[0+bs*2], v2 );
+		v1 = _mm256_permute2f128_pd( v5, v7, 0x20 ); // 10 11 12 13
+		v1 = _mm256_mul_pd( v1, alph );
+		_mm256_store_pd( &C[0+bs*1], v1 );
+		v3 = _mm256_permute2f128_pd( v5, v7, 0x31 ); // 30 31 32 33
+		v3 = _mm256_mul_pd( v3, alph );
+		_mm256_store_pd( &C[0+bs*3], v3 );
+
+		C += bs*sdc;
+
+		}
+
+	for( ; k<kmax-3; k+=4)
+		{
+
+		v0 = _mm256_load_pd( &A[0+bs*0] ); // 00 10 20 30
+		v1 = _mm256_load_pd( &A[0+bs*1] ); // 01 11 21 31
+		v4 = _mm256_unpacklo_pd( v0, v1 ); // 00 01 20 21
+		v5 = _mm256_unpackhi_pd( v0, v1 ); // 10 11 30 31
+		v2 = _mm256_load_pd( &A[0+bs*2] ); // 02 12 22 32
+		v3 = _mm256_load_pd( &A[0+bs*3] ); // 03 13 23 33
+		v6 = _mm256_unpacklo_pd( v2, v3 ); // 02 03 22 23
+		v7 = _mm256_unpackhi_pd( v2, v3 ); // 12 13 32 33
+		
+		A += bs*bs;
+
+		v0 = _mm256_permute2f128_pd( v4, v6, 0x20 ); // 00 01 02 03
+		v0 = _mm256_mul_pd( v0, alph );
+		_mm256_store_pd( &C[0+bs*0], v0 );
+		v2 = _mm256_permute2f128_pd( v4, v6, 0x31 ); // 20 21 22 23
+		v2 = _mm256_mul_pd( v2, alph );
+		_mm256_store_pd( &C[0+bs*2], v2 );
+		v1 = _mm256_permute2f128_pd( v5, v7, 0x20 ); // 10 11 12 13
+		v1 = _mm256_mul_pd( v1, alph );
+		_mm256_store_pd( &C[0+bs*1], v1 );
+		v3 = _mm256_permute2f128_pd( v5, v7, 0x31 ); // 30 31 32 33
+		v3 = _mm256_mul_pd( v3, alph );
+		_mm256_store_pd( &C[0+bs*3], v3 );
+
+		C += bs*sdc;
+
+		}
+
+	cleanup_loop:
+
+	for( ; k<kmax; k++)
+		{
+		C[0+bs*0] = alpha * A[0+bs*0];
+		C[0+bs*1] = alpha * A[1+bs*0];
+		C[0+bs*2] = alpha * A[2+bs*0];
+		C[0+bs*3] = alpha * A[3+bs*0];
+
+		C += 1;
+		A += bs;
+		}
+
+	if(tri==1)
+		{
+		// end 3x3 triangle
+		kna = (bs-(bs-kna+kmax)%bs)%bs;
+
+		if(kna==1)
+			{
+			C[0+bs*1] = alpha * A[1+bs*0];
+			C[0+bs*2] = alpha * A[2+bs*0];
+			C[0+bs*3] = alpha * A[3+bs*0];
+			C[1+bs*(sdc+1)] = alpha * A[2+bs*1];
+			C[1+bs*(sdc+2)] = alpha * A[3+bs*1];
+			C[2+bs*(sdc+2)] = alpha * A[3+bs*2];
+			}
+		else if(kna==2)
+			{
+			C[0+bs*1] = alpha * A[1+bs*0];
+			C[0+bs*2] = alpha * A[2+bs*0];
+			C[0+bs*3] = alpha * A[3+bs*0];
+			C[1+bs*2] = alpha * A[2+bs*1];
+			C[1+bs*3] = alpha * A[3+bs*1];
+			C[2+bs*(sdc+2)] = alpha * A[3+bs*2];
+			}
+		else
+			{
+			C[0+bs*1] = alpha * A[1+bs*0];
+			C[0+bs*2] = alpha * A[2+bs*0];
+			C[0+bs*3] = alpha * A[3+bs*0];
+			C[1+bs*2] = alpha * A[2+bs*1];
+			C[1+bs*3] = alpha * A[3+bs*1];
+			C[2+bs*3] = alpha * A[3+bs*2];
+			}
+		}
+
+	}
+
+
+
+// transpose of a general matrix, read along panels, write across panels
+void kernel_dgetr_3_lib4(int tri, int kmax, int kna, double alpha, double *A, double *C, int sdc)
+	{
+
+	if(tri==1)
+		{
+		// A is lower triangular, C is upper triangular
+		// kmax+1 3-wide + end 2x2 triangle
+
+		kmax += 1;
+		}
+
+	const int bs = 4;
+	
+	int k;
+
+	k = 0;
+
+	if(kmax<kna)
+		goto cleanup_loop;
+
+	if(kna>0)
+		{
+		for( ; k<kna; k++)
+			{
+			C[0+bs*0] = alpha * A[0+bs*0];
+			C[0+bs*1] = alpha * A[1+bs*0];
+			C[0+bs*2] = alpha * A[2+bs*0];
+
+			C += 1;
+			A += bs;
+			}
+		C += bs*(sdc-1);
+		}
+	
+	for( ; k<kmax-3; k+=4)
+		{
+		C[0+bs*0] = alpha * A[0+bs*0];
+		C[0+bs*1] = alpha * A[1+bs*0];
+		C[0+bs*2] = alpha * A[2+bs*0];
+
+		C[1+bs*0] = alpha * A[0+bs*1];
+		C[1+bs*1] = alpha * A[1+bs*1];
+		C[1+bs*2] = alpha * A[2+bs*1];
+
+		C[2+bs*0] = alpha * A[0+bs*2];
+		C[2+bs*1] = alpha * A[1+bs*2];
+		C[2+bs*2] = alpha * A[2+bs*2];
+
+		C[3+bs*0] = alpha * A[0+bs*3];
+		C[3+bs*1] = alpha * A[1+bs*3];
+		C[3+bs*2] = alpha * A[2+bs*3];
+
+		C += bs*sdc;
+		A += bs*bs;
+		}
+	
+	cleanup_loop:
+
+	for( ; k<kmax; k++)
+		{
+		C[0+bs*0] = alpha * A[0+bs*0];
+		C[0+bs*1] = alpha * A[1+bs*0];
+		C[0+bs*2] = alpha * A[2+bs*0];
+
+		C += 1;
+		A += bs;
+		}
+
+	if(tri==1)
+		{
+		// end 2x2 triangle
+		kna = (bs-(bs-kna+kmax)%bs)%bs;
+
+		if(kna==1)
+			{
+			C[0+bs*1] = alpha * A[1+bs*0];
+			C[0+bs*2] = alpha * A[2+bs*0];
+			C[1+bs*(sdc+1)] = alpha * A[2+bs*1];
+			}
+		else
+			{
+			C[0+bs*1] = alpha * A[1+bs*0];
+			C[0+bs*2] = alpha * A[2+bs*0];
+			C[1+bs*2] = alpha * A[2+bs*1];
+			}
+		}
+
+	}
+
+
+
+// transpose of a general matrix, read along panels, write across panels
+void kernel_dgetr_2_lib4(int tri, int kmax, int kna, double alpha, double *A, double *C, int sdc)
+	{
+
+	if(tri==1)
+		{
+		// A is lower triangular, C is upper triangular
+		// kmax+1 2-wide + end 1x1 triangle
+
+		kmax += 1;
+		}
+
+	const int bs = 4;
+	
+	int k;
+
+	k = 0;
+
+	if(kmax<kna)
+		goto cleanup_loop;
+
+	if(kna>0)
+		{
+		for( ; k<kna; k++)
+			{
+			C[0+bs*0] = alpha * A[0+bs*0];
+			C[0+bs*1] = alpha * A[1+bs*0];
+
+			C += 1;
+			A += bs;
+			}
+		C += bs*(sdc-1);
+		}
+	
+	for( ; k<kmax-3; k+=4)
+		{
+		C[0+bs*0] = alpha * A[0+bs*0];
+		C[0+bs*1] = alpha * A[1+bs*0];
+
+		C[1+bs*0] = alpha * A[0+bs*1];
+		C[1+bs*1] = alpha * A[1+bs*1];
+
+		C[2+bs*0] = alpha * A[0+bs*2];
+		C[2+bs*1] = alpha * A[1+bs*2];
+
+		C[3+bs*0] = alpha * A[0+bs*3];
+		C[3+bs*1] = alpha * A[1+bs*3];
+
+		C += bs*sdc;
+		A += bs*bs;
+		}
+	
+	cleanup_loop:
+
+	for( ; k<kmax; k++)
+		{
+		C[0+bs*0] = alpha * A[0+bs*0];
+		C[0+bs*1] = alpha * A[1+bs*0];
+
+		C += 1;
+		A += bs;
+		}
+	
+	if(tri==1)
+		{
+		// end 1x1 triangle
+		C[0+bs*1] = alpha * A[1+bs*0];
+		}
+
+	}
+
+
+
+// transpose of a general matrix, read along panels, write across panels
+void kernel_dgetr_1_lib4(int tri, int kmax, int kna, double alpha, double *A, double *C, int sdc)
+	{
+
+	if(tri==1)
+		{
+		// A is lower triangular, C is upper triangular
+		// kmax+1 1-wide
+
+		kmax += 1;
+		}
+
+	const int bs = 4;
+	
+	int k;
+
+	k = 0;
+
+	if(kmax<kna)
+		goto cleanup_loop;
+
+	if(kna>0)
+		{
+		for( ; k<kna; k++)
+			{
+			C[0+bs*0] = alpha * A[0+bs*0];
+
+			C += 1;
+			A += bs;
+			}
+		C += bs*(sdc-1);
+		}
+	
+	for( ; k<kmax-3; k+=4)
+		{
+		C[0+bs*0] = alpha * A[0+bs*0];
+
+		C[1+bs*0] = alpha * A[0+bs*1];
+
+		C[2+bs*0] = alpha * A[0+bs*2];
+
+		C[3+bs*0] = alpha * A[0+bs*3];
+
+		C += bs*sdc;
+		A += bs*bs;
+		}
+	
+	cleanup_loop:
+
+	for( ; k<kmax; k++)
+		{
+		C[0+bs*0] = alpha * A[0+bs*0];
+
+		C += 1;
+		A += bs;
+		}
+
+	}
+
+
+
+// transpose of a general matrix, read across panels, write along panels
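+// no alpha scaling or triangular handling here: one 4x4 transpose per
+// iteration, stepping A down one panel (ps*sda) and B along by one
+// 4x4 block (ps*ps)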
+void kernel_dgetr_4_0_lib4(int kmax, double *A, int sda, double *B)
+	{
+	const int ps = 4;
+	__m256d
+		v0, v1, v2, v3, v4, v5, v6, v7;
+	int k;
+	for(k=0; k<kmax-3; k+=4)
+		{
+
+		v0 = _mm256_load_pd( &A[0+ps*0] ); // 00 10 20 30
+		v1 = _mm256_load_pd( &A[0+ps*1] ); // 01 11 21 31
+		v4 = _mm256_unpacklo_pd( v0, v1 ); // 00 01 20 21
+		v5 = _mm256_unpackhi_pd( v0, v1 ); // 10 11 30 31
+		v2 = _mm256_load_pd( &A[0+ps*2] ); // 02 12 22 32
+		v3 = _mm256_load_pd( &A[0+ps*3] ); // 03 13 23 33
+		v6 = _mm256_unpacklo_pd( v2, v3 ); // 02 03 22 23
+		v7 = _mm256_unpackhi_pd( v2, v3 ); // 12 13 32 33
+
+		v0 = _mm256_permute2f128_pd( v4, v6, 0x20 ); // 00 01 02 03
+		_mm256_store_pd( &B[0+ps*0], v0 );
+		v2 = _mm256_permute2f128_pd( v4, v6, 0x31 ); // 20 21 22 23
+		_mm256_store_pd( &B[0+ps*2], v2 );
+		v1 = _mm256_permute2f128_pd( v5, v7, 0x20 ); // 10 11 12 13
+		_mm256_store_pd( &B[0+ps*1], v1 );
+		v3 = _mm256_permute2f128_pd( v5, v7, 0x31 ); // 30 31 32 33
+		_mm256_store_pd( &B[0+ps*3], v3 );
+
+		A += ps*sda;
+		B += ps*ps;
+		}
+	for( ; k<kmax; k++)
+		{
+		// one remaining row of A becomes one column of B
+		B[0+ps*0] = A[0+ps*0];
+		B[1+ps*0] = A[0+ps*1];
+		B[2+ps*0] = A[0+ps*2];
+		B[3+ps*0] = A[0+ps*3];
+
+		A += 1;
+		B += ps;
+		}
+	return;
+	}
+
diff --git a/auxiliary/avx2/Makefile b/auxiliary/avx2/Makefile
new file mode 100644
index 0000000..463ebf5
--- /dev/null
+++ b/auxiliary/avx2/Makefile
@@ -0,0 +1,46 @@
+###################################################################################################
+#                                                                                                 #
+# This file is part of BLASFEO.                                                                   #
+#                                                                                                 #
+# BLASFEO -- BLAS For Embedded Optimization.                                                      #
+# Copyright (C) 2016-2017 by Gianluca Frison.                                                     #
+# Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl.              #
+# All rights reserved.                                                                            #
+#                                                                                                 #
+# HPMPC is free software; you can redistribute it and/or                                          #
+# modify it under the terms of the GNU Lesser General Public                                      #
+# License as published by the Free Software Foundation; either                                    #
+# version 2.1 of the License, or (at your option) any later version.                              #
+#                                                                                                 #
+# HPMPC is distributed in the hope that it will be useful,                                        #
+# but WITHOUT ANY WARRANTY; without even the implied warranty of                                  #
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.                                            #
+# See the GNU Lesser General Public License for more details.                                     #
+#                                                                                                 #
+# You should have received a copy of the GNU Lesser General Public                                #
+# License along with HPMPC; if not, write to the Free Software                                    #
+# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA                  #
+#                                                                                                 #
+# Author: Gianluca Frison, giaf (at) dtu.dk                                                       #
+#                          gianluca.frison (at) imtek.uni-freiburg.de                             #
+#                                                                                                 #
+###################################################################################################
+
+include ../../Makefile.rule
+
+OBJS = 
+
+ifeq ($(LA), HIGH_PERFORMANCE)
+
+ifeq ($(TARGET), X64_INTEL_HASWELL)
+OBJS += kernel_dgetr_lib4.o
+endif
+
+else # LA_REFERENCE | LA_BLAS
+
+endif # LA choice
+
+obj: $(OBJS)
+
+clean:
+	rm -f *.o
diff --git a/auxiliary/avx2/kernel_dgetr_lib4.c b/auxiliary/avx2/kernel_dgetr_lib4.c
new file mode 100644
index 0000000..14d00ef
--- /dev/null
+++ b/auxiliary/avx2/kernel_dgetr_lib4.c
@@ -0,0 +1,756 @@
+/**************************************************************************************************
+*                                                                                                 *
+* This file is part of BLASFEO.                                                                   *
+*                                                                                                 *
+* BLASFEO -- BLAS For Embedded Optimization.                                                      *
+* Copyright (C) 2016-2017 by Gianluca Frison.                                                     *
+* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl.              *
+* All rights reserved.                                                                            *
+*                                                                                                 *
+* HPMPC is free software; you can redistribute it and/or                                          *
+* modify it under the terms of the GNU Lesser General Public                                      *
+* License as published by the Free Software Foundation; either                                    *
+* version 2.1 of the License, or (at your option) any later version.                              *
+*                                                                                                 *
+* HPMPC is distributed in the hope that it will be useful,                                        *
+* but WITHOUT ANY WARRANTY; without even the implied warranty of                                  *
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.                                            *
+* See the GNU Lesser General Public License for more details.                                     *
+*                                                                                                 *
+* You should have received a copy of the GNU Lesser General Public                                *
+* License along with HPMPC; if not, write to the Free Software                                    *
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA                  *
+*                                                                                                 *
+* Author: Gianluca Frison, giaf (at) dtu.dk                                                       *
+*                          gianluca.frison (at) imtek.uni-freiburg.de                             *
+*                                                                                                 *
+**************************************************************************************************/
+
+#include <mmintrin.h>
+#include <xmmintrin.h>  // SSE
+#include <emmintrin.h>  // SSE2
+#include <pmmintrin.h>  // SSE3
+#include <smmintrin.h>  // SSE4
+#include <immintrin.h>  // AVX
+
+
+
+
+// transpose of a general matrix, read along panels, write across panels
+// TODO: the triangular (tri==1) case is not handled yet
+void kernel_dgetr_8_lib4(int tri, int kmax, int kna, double alpha, double *A0, int sda, double *C, int sdc)
+	{
+
+	const int bs = 4;
+	
+	double *A1 = A0 + bs*sda;
+
+	int k;
+
+	__m256d
+		alph,
+		v0, v1, v2, v3, v4, v5, v6, v7;
+	
+	alph = _mm256_broadcast_sd( &alpha );
+	
+	k = 0;
+
+	if(kmax<kna)
+		goto cleanup_loop;
+
+	if(kna>0)
+		{
+		for( ; k<kna; k++)
+			{
+			C[0+bs*0] = alpha * A0[0+bs*0];
+			C[0+bs*1] = alpha * A0[1+bs*0];
+			C[0+bs*2] = alpha * A0[2+bs*0];
+			C[0+bs*3] = alpha * A0[3+bs*0];
+
+			C[0+bs*4] = alpha * A1[0+bs*0];
+			C[0+bs*5] = alpha * A1[1+bs*0];
+			C[0+bs*6] = alpha * A1[2+bs*0];
+			C[0+bs*7] = alpha * A1[3+bs*0];
+
+			C  += 1;
+			A0 += bs;
+			A1 += bs;
+			}
+		C += bs*(sdc-1);
+		}
+	
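+	// each 4x4 block is loaded as four 128-bit halves glued together with
+	// _mm256_insertf128_pd, so a single unpacklo/unpackhi pair completes
+	// the transpose without a cross-lane permute afterwards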
+	for(; k<kmax-7; k+=8)
+		{
+
+		v0 = _mm256_insertf128_pd( _mm256_castpd128_pd256( _mm_load_pd( &A0[0+bs*0] ) ), _mm_load_pd( &A0[0+bs*2]) , 0x1 ); // 00 10 02 12
+		v1 = _mm256_insertf128_pd( _mm256_castpd128_pd256( _mm_load_pd( &A0[0+bs*1] ) ), _mm_load_pd( &A0[0+bs*3]) , 0x1 ); // 01 11 03 13
+		v2 = _mm256_insertf128_pd( _mm256_castpd128_pd256( _mm_load_pd( &A0[2+bs*0] ) ), _mm_load_pd( &A0[2+bs*2]) , 0x1 ); // 20 30 22 32
+		v3 = _mm256_insertf128_pd( _mm256_castpd128_pd256( _mm_load_pd( &A0[2+bs*1] ) ), _mm_load_pd( &A0[2+bs*3]) , 0x1 ); // 21 31 23 33
+		
+		A0 += 4*bs;
+
+		v4 = _mm256_unpacklo_pd( v0, v1 ); // 00 01 02 03
+		v4 = _mm256_mul_pd( v4, alph );
+		_mm256_store_pd( &C[0+bs*0], v4 );
+		v5 = _mm256_unpackhi_pd( v0, v1 ); // 10 11 12 13
+		v5 = _mm256_mul_pd( v5, alph );
+		_mm256_store_pd( &C[0+bs*1], v5 );
+		v6 = _mm256_unpacklo_pd( v2, v3 ); // 20 21 22 23
+		v6 = _mm256_mul_pd( v6, alph );
+		_mm256_store_pd( &C[0+bs*2], v6 );
+		v7 = _mm256_unpackhi_pd( v2, v3 ); // 30 31 32 33
+		v7 = _mm256_mul_pd( v7, alph );
+		_mm256_store_pd( &C[0+bs*3], v7 );
+
+		v0 = _mm256_insertf128_pd( _mm256_castpd128_pd256( _mm_load_pd( &A1[0+bs*0] ) ), _mm_load_pd( &A1[0+bs*2]) , 0x1 ); // 00 10 02 12
+		v1 = _mm256_insertf128_pd( _mm256_castpd128_pd256( _mm_load_pd( &A1[0+bs*1] ) ), _mm_load_pd( &A1[0+bs*3]) , 0x1 ); // 01 11 03 13
+		v2 = _mm256_insertf128_pd( _mm256_castpd128_pd256( _mm_load_pd( &A1[2+bs*0] ) ), _mm_load_pd( &A1[2+bs*2]) , 0x1 ); // 20 30 22 32
+		v3 = _mm256_insertf128_pd( _mm256_castpd128_pd256( _mm_load_pd( &A1[2+bs*1] ) ), _mm_load_pd( &A1[2+bs*3]) , 0x1 ); // 21 31 23 33
+
+		A1 += 4*bs;
+
+		v4 = _mm256_unpacklo_pd( v0, v1 ); // 00 01 02 03
+		v4 = _mm256_mul_pd( v4, alph );
+		_mm256_store_pd( &C[0+bs*4], v4 );
+		v5 = _mm256_unpackhi_pd( v0, v1 ); // 10 11 12 13
+		v5 = _mm256_mul_pd( v5, alph );
+		_mm256_store_pd( &C[0+bs*5], v5 );
+		v6 = _mm256_unpacklo_pd( v2, v3 ); // 20 21 22 23
+		v6 = _mm256_mul_pd( v6, alph );
+		_mm256_store_pd( &C[0+bs*6], v6 );
+		v7 = _mm256_unpackhi_pd( v2, v3 ); // 30 31 32 33
+		v7 = _mm256_mul_pd( v7, alph );
+		_mm256_store_pd( &C[0+bs*7], v7 );
+
+		C += sdc*bs;
+
+		v0 = _mm256_insertf128_pd( _mm256_castpd128_pd256( _mm_load_pd( &A0[0+bs*0] ) ), _mm_load_pd( &A0[0+bs*2]) , 0x1 ); // 00 10 02 12
+		v1 = _mm256_insertf128_pd( _mm256_castpd128_pd256( _mm_load_pd( &A0[0+bs*1] ) ), _mm_load_pd( &A0[0+bs*3]) , 0x1 ); // 01 11 03 13
+		v2 = _mm256_insertf128_pd( _mm256_castpd128_pd256( _mm_load_pd( &A0[2+bs*0] ) ), _mm_load_pd( &A0[2+bs*2]) , 0x1 ); // 20 30 22 32
+		v3 = _mm256_insertf128_pd( _mm256_castpd128_pd256( _mm_load_pd( &A0[2+bs*1] ) ), _mm_load_pd( &A0[2+bs*3]) , 0x1 ); // 21 31 23 33
+		
+		A0 += 4*bs;
+
+		v4 = _mm256_unpacklo_pd( v0, v1 ); // 00 01 02 03
+		v4 = _mm256_mul_pd( v4, alph );
+		_mm256_store_pd( &C[0+bs*0], v4 );
+		v5 = _mm256_unpackhi_pd( v0, v1 ); // 10 11 12 13
+		v5 = _mm256_mul_pd( v5, alph );
+		_mm256_store_pd( &C[0+bs*1], v5 );
+		v6 = _mm256_unpacklo_pd( v2, v3 ); // 20 21 22 23
+		v6 = _mm256_mul_pd( v6, alph );
+		_mm256_store_pd( &C[0+bs*2], v6 );
+		v7 = _mm256_unpackhi_pd( v2, v3 ); // 30 31 32 33
+		v7 = _mm256_mul_pd( v7, alph );
+		_mm256_store_pd( &C[0+bs*3], v7 );
+
+		v0 = _mm256_insertf128_pd( _mm256_castpd128_pd256( _mm_load_pd( &A1[0+bs*0] ) ), _mm_load_pd( &A1[0+bs*2]) , 0x1 ); // 00 10 02 12
+		v1 = _mm256_insertf128_pd( _mm256_castpd128_pd256( _mm_load_pd( &A1[0+bs*1] ) ), _mm_load_pd( &A1[0+bs*3]) , 0x1 ); // 01 11 03 13
+		v2 = _mm256_insertf128_pd( _mm256_castpd128_pd256( _mm_load_pd( &A1[2+bs*0] ) ), _mm_load_pd( &A1[2+bs*2]) , 0x1 ); // 20 30 22 32
+		v3 = _mm256_insertf128_pd( _mm256_castpd128_pd256( _mm_load_pd( &A1[2+bs*1] ) ), _mm_load_pd( &A1[2+bs*3]) , 0x1 ); // 21 31 23 33
+
+		A1 += 4*bs;
+
+		v4 = _mm256_unpacklo_pd( v0, v1 ); // 00 01 02 03
+		v4 = _mm256_mul_pd( v4, alph );
+		_mm256_store_pd( &C[0+bs*4], v4 );
+		v5 = _mm256_unpackhi_pd( v0, v1 ); // 10 11 12 13
+		v5 = _mm256_mul_pd( v5, alph );
+		_mm256_store_pd( &C[0+bs*5], v5 );
+		v6 = _mm256_unpacklo_pd( v2, v3 ); // 20 21 22 23
+		v6 = _mm256_mul_pd( v6, alph );
+		_mm256_store_pd( &C[0+bs*6], v6 );
+		v7 = _mm256_unpackhi_pd( v2, v3 ); // 30 31 32 33
+		v7 = _mm256_mul_pd( v7, alph );
+		_mm256_store_pd( &C[0+bs*7], v7 );
+
+		C += sdc*bs;
+
+		}
+
+	for(; k<kmax-3; k+=4)
+		{
+
+		v0 = _mm256_insertf128_pd( _mm256_castpd128_pd256( _mm_load_pd( &A0[0+bs*0] ) ), _mm_load_pd( &A0[0+bs*2]) , 0x1 ); // 00 10 02 12
+		v1 = _mm256_insertf128_pd( _mm256_castpd128_pd256( _mm_load_pd( &A0[0+bs*1] ) ), _mm_load_pd( &A0[0+bs*3]) , 0x1 ); // 01 11 03 13
+		v2 = _mm256_insertf128_pd( _mm256_castpd128_pd256( _mm_load_pd( &A0[2+bs*0] ) ), _mm_load_pd( &A0[2+bs*2]) , 0x1 ); // 20 30 22 32
+		v3 = _mm256_insertf128_pd( _mm256_castpd128_pd256( _mm_load_pd( &A0[2+bs*1] ) ), _mm_load_pd( &A0[2+bs*3]) , 0x1 ); // 21 31 23 33
+		
+		A0 += 4*bs;
+
+		v4 = _mm256_unpacklo_pd( v0, v1 ); // 00 01 02 03
+		v4 = _mm256_mul_pd( v4, alph );
+		_mm256_store_pd( &C[0+bs*0], v4 );
+		v5 = _mm256_unpackhi_pd( v0, v1 ); // 10 11 12 13
+		v5 = _mm256_mul_pd( v5, alph );
+		_mm256_store_pd( &C[0+bs*1], v5 );
+		v6 = _mm256_unpacklo_pd( v2, v3 ); // 20 21 22 23
+		v6 = _mm256_mul_pd( v6, alph );
+		_mm256_store_pd( &C[0+bs*2], v6 );
+		v7 = _mm256_unpackhi_pd( v2, v3 ); // 30 31 32 33
+		v7 = _mm256_mul_pd( v7, alph );
+		_mm256_store_pd( &C[0+bs*3], v7 );
+
+		v0 = _mm256_insertf128_pd( _mm256_castpd128_pd256( _mm_load_pd( &A1[0+bs*0] ) ), _mm_load_pd( &A1[0+bs*2]) , 0x1 ); // 00 10 02 12
+		v1 = _mm256_insertf128_pd( _mm256_castpd128_pd256( _mm_load_pd( &A1[0+bs*1] ) ), _mm_load_pd( &A1[0+bs*3]) , 0x1 ); // 01 11 03 13
+		v2 = _mm256_insertf128_pd( _mm256_castpd128_pd256( _mm_load_pd( &A1[2+bs*0] ) ), _mm_load_pd( &A1[2+bs*2]) , 0x1 ); // 20 30 22 32
+		v3 = _mm256_insertf128_pd( _mm256_castpd128_pd256( _mm_load_pd( &A1[2+bs*1] ) ), _mm_load_pd( &A1[2+bs*3]) , 0x1 ); // 21 31 23 33
+
+		A1 += 4*bs;
+
+		v4 = _mm256_unpacklo_pd( v0, v1 ); // 00 01 02 03
+		v4 = _mm256_mul_pd( v4, alph );
+		_mm256_store_pd( &C[0+bs*4], v4 );
+		v5 = _mm256_unpackhi_pd( v0, v1 ); // 10 11 12 13
+		v5 = _mm256_mul_pd( v5, alph );
+		_mm256_store_pd( &C[0+bs*5], v5 );
+		v6 = _mm256_unpacklo_pd( v2, v3 ); // 20 21 22 23
+		v6 = _mm256_mul_pd( v6, alph );
+		_mm256_store_pd( &C[0+bs*6], v6 );
+		v7 = _mm256_unpackhi_pd( v2, v3 ); // 30 31 32 33
+		v7 = _mm256_mul_pd( v7, alph );
+		_mm256_store_pd( &C[0+bs*7], v7 );
+
+		C += sdc*bs;
+
+		}
+
+	
+	cleanup_loop:
+
+	for( ; k<kmax; k++)
+		{
+		C[0+bs*0] = alpha * A0[0+bs*0];
+		C[0+bs*1] = alpha * A0[1+bs*0];
+		C[0+bs*2] = alpha * A0[2+bs*0];
+		C[0+bs*3] = alpha * A0[3+bs*0];
+
+		C[0+bs*4] = alpha * A1[0+bs*0];
+		C[0+bs*5] = alpha * A1[1+bs*0];
+		C[0+bs*6] = alpha * A1[2+bs*0];
+		C[0+bs*7] = alpha * A1[3+bs*0];
+
+		C  += 1;
+		A0 += bs;
+		A1 += bs;
+		}
+
+	}
+
+
+
+// transpose of a general matrix, read along panels, write across panels
+void kernel_dgetr_4_lib4(int tri, int kmax, int kna, double alpha, double *A, double *C, int sdc)
+	{
+
+	if(tri==1)
+		{
+		// A is lower triangular, C is upper triangular
+		// kmax+1 4-wide + end 3x3 triangle
+
+		kmax += 1;
+		}
+
+	const int bs = 4;
+	
+	__m256d
+		alph,
+		v0, v1, v2, v3,
+		v4, v5, v6, v7;
+	
+	alph = _mm256_broadcast_sd( &alpha );
+	
+	int k;
+
+	k = 0;
+
+	if(kmax<kna)
+		goto cleanup_loop;
+
+	if(kna>0)
+		{
+		for( ; k<kna; k++)
+			{
+			C[0+bs*0] = alpha * A[0+bs*0];
+			C[0+bs*1] = alpha * A[1+bs*0];
+			C[0+bs*2] = alpha * A[2+bs*0];
+			C[0+bs*3] = alpha * A[3+bs*0];
+
+			C += 1;
+			A += bs;
+			}
+		C += bs*(sdc-1);
+		}
+
+	for( ; k<kmax-7; k+=8)
+		{
+
+#if 1
+
+		v0 = _mm256_insertf128_pd( _mm256_castpd128_pd256( _mm_load_pd( &A[0+bs*0] ) ), _mm_load_pd( &A[0+bs*2]) , 0x1 ); // 00 10 02 12
+		v1 = _mm256_insertf128_pd( _mm256_castpd128_pd256( _mm_load_pd( &A[0+bs*1] ) ), _mm_load_pd( &A[0+bs*3]) , 0x1 ); // 01 11 03 13
+		v2 = _mm256_insertf128_pd( _mm256_castpd128_pd256( _mm_load_pd( &A[2+bs*0] ) ), _mm_load_pd( &A[2+bs*2]) , 0x1 ); // 20 30 22 32
+		v3 = _mm256_insertf128_pd( _mm256_castpd128_pd256( _mm_load_pd( &A[2+bs*1] ) ), _mm_load_pd( &A[2+bs*3]) , 0x1 ); // 21 31 23 33
+		
+		A += 4*bs;
+
+		v4 = _mm256_unpacklo_pd( v0, v1 ); // 00 01 02 03
+		v4 = _mm256_mul_pd( v4, alph );
+		_mm256_store_pd( &C[0+bs*0], v4 );
+		v5 = _mm256_unpackhi_pd( v0, v1 ); // 10 11 12 13
+		v5 = _mm256_mul_pd( v5, alph );
+		_mm256_store_pd( &C[0+bs*1], v5 );
+		v6 = _mm256_unpacklo_pd( v2, v3 ); // 20 21 22 23
+		v6 = _mm256_mul_pd( v6, alph );
+		_mm256_store_pd( &C[0+bs*2], v6 );
+		v7 = _mm256_unpackhi_pd( v2, v3 ); // 30 31 32 33
+		v7 = _mm256_mul_pd( v7, alph );
+		_mm256_store_pd( &C[0+bs*3], v7 );
+
+		C += sdc*bs;
+
+		v0 = _mm256_insertf128_pd( _mm256_castpd128_pd256( _mm_load_pd( &A[0+bs*0] ) ), _mm_load_pd( &A[0+bs*2]) , 0x1 );
+		v1 = _mm256_insertf128_pd( _mm256_castpd128_pd256( _mm_load_pd( &A[0+bs*1] ) ), _mm_load_pd( &A[0+bs*3]) , 0x1 );
+		v2 = _mm256_insertf128_pd( _mm256_castpd128_pd256( _mm_load_pd( &A[2+bs*0] ) ), _mm_load_pd( &A[2+bs*2]) , 0x1 );
+		v3 = _mm256_insertf128_pd( _mm256_castpd128_pd256( _mm_load_pd( &A[2+bs*1] ) ), _mm_load_pd( &A[2+bs*3]) , 0x1 );
+		
+		A += 4*bs;
+
+		v4 = _mm256_unpacklo_pd( v0, v1 ); // 00 01 02 03
+		v4 = _mm256_mul_pd( v4, alph );
+		_mm256_store_pd( &C[0+bs*0], v4 );
+		v5 = _mm256_unpackhi_pd( v0, v1 ); // 10 11 12 13
+		v5 = _mm256_mul_pd( v5, alph );
+		_mm256_store_pd( &C[0+bs*1], v5 );
+		v6 = _mm256_unpacklo_pd( v2, v3 ); // 20 21 22 23
+		v6 = _mm256_mul_pd( v6, alph );
+		_mm256_store_pd( &C[0+bs*2], v6 );
+		v7 = _mm256_unpackhi_pd( v2, v3 ); // 30 31 32 33
+		v7 = _mm256_mul_pd( v7, alph );
+		_mm256_store_pd( &C[0+bs*3], v7 );
+
+		C += sdc*bs;
+
+#else // TODO alpha
+
+		v0 = _mm256_load_pd( &A[0+bs*0] ); // 00 10 20 30
+		v1 = _mm256_load_pd( &A[0+bs*1] ); // 01 11 21 31
+		v4 = _mm256_unpacklo_pd( v0, v1 ); // 00 01 20 21
+		v5 = _mm256_unpackhi_pd( v0, v1 ); // 10 11 30 31
+		v2 = _mm256_load_pd( &A[0+bs*2] ); // 02 12 22 32
+		v3 = _mm256_load_pd( &A[0+bs*3] ); // 03 13 23 33
+		v6 = _mm256_unpacklo_pd( v2, v3 ); // 02 03 22 23
+		v7 = _mm256_unpackhi_pd( v2, v3 ); // 12 13 32 33
+		
+		A += bs*bs;
+
+		v0 = _mm256_permute2f128_pd( v4, v6, 0x20 ); // 00 01 02 03
+		_mm256_store_pd( &C[0+bs*0], v0 );
+		v2 = _mm256_permute2f128_pd( v4, v6, 0x31 ); // 20 21 22 23
+		_mm256_store_pd( &C[0+bs*2], v2 );
+		v1 = _mm256_permute2f128_pd( v5, v7, 0x20 ); // 10 11 12 13
+		_mm256_store_pd( &C[0+bs*1], v1 );
+		v3 = _mm256_permute2f128_pd( v5, v7, 0x31 ); // 30 31 32 33
+		_mm256_store_pd( &C[0+bs*3], v3 );
+
+		C += bs*sdc;
+
+		v0 = _mm256_load_pd( &A[0+bs*0] ); // 00 10 20 30
+		v1 = _mm256_load_pd( &A[0+bs*1] ); // 01 11 21 31
+		v4 = _mm256_unpacklo_pd( v0, v1 ); // 00 01 20 21
+		v5 = _mm256_unpackhi_pd( v0, v1 ); // 10 11 30 31
+		v2 = _mm256_load_pd( &A[0+bs*2] ); // 02 12 22 32
+		v3 = _mm256_load_pd( &A[0+bs*3] ); // 03 13 23 33
+		v6 = _mm256_unpacklo_pd( v2, v3 ); // 02 03 22 23
+		v7 = _mm256_unpackhi_pd( v2, v3 ); // 12 13 32 33
+		
+		A += bs*bs;
+
+		v0 = _mm256_permute2f128_pd( v4, v6, 0x20 ); // 00 01 02 03
+		_mm256_store_pd( &C[0+bs*0], v0 );
+		v2 = _mm256_permute2f128_pd( v4, v6, 0x31 ); // 20 21 22 23
+		_mm256_store_pd( &C[0+bs*2], v2 );
+		v1 = _mm256_permute2f128_pd( v5, v7, 0x20 ); // 10 11 12 13
+		_mm256_store_pd( &C[0+bs*1], v1 );
+		v3 = _mm256_permute2f128_pd( v5, v7, 0x31 ); // 30 31 32 33
+		_mm256_store_pd( &C[0+bs*3], v3 );
+
+		C += bs*sdc;
+
+#endif
+
+		}
+
+	for( ; k<kmax-3; k+=4)
+		{
+
+#if 1
+
+		v0 = _mm256_insertf128_pd( _mm256_castpd128_pd256( _mm_load_pd( &A[0+bs*0] ) ), _mm_load_pd( &A[0+bs*2]) , 0x1 ); // 00 10 02 12
+		v1 = _mm256_insertf128_pd( _mm256_castpd128_pd256( _mm_load_pd( &A[0+bs*1] ) ), _mm_load_pd( &A[0+bs*3]) , 0x1 ); // 01 11 03 13
+		v2 = _mm256_insertf128_pd( _mm256_castpd128_pd256( _mm_load_pd( &A[2+bs*0] ) ), _mm_load_pd( &A[2+bs*2]) , 0x1 ); // 20 30 22 32
+		v3 = _mm256_insertf128_pd( _mm256_castpd128_pd256( _mm_load_pd( &A[2+bs*1] ) ), _mm_load_pd( &A[2+bs*3]) , 0x1 ); // 21 31 23 33
+		
+		A += 4*bs;
+
+		v4 = _mm256_unpacklo_pd( v0, v1 ); // 00 01 02 03
+		v4 = _mm256_mul_pd( v4, alph );
+		_mm256_store_pd( &C[0+bs*0], v4 );
+		v5 = _mm256_unpackhi_pd( v0, v1 ); // 10 11 12 13
+		v5 = _mm256_mul_pd( v5, alph );
+		_mm256_store_pd( &C[0+bs*1], v5 );
+		v6 = _mm256_unpacklo_pd( v2, v3 ); // 20 21 22 23
+		v6 = _mm256_mul_pd( v6, alph );
+		_mm256_store_pd( &C[0+bs*2], v6 );
+		v7 = _mm256_unpackhi_pd( v2, v3 ); // 30 31 32 33
+		v7 = _mm256_mul_pd( v7, alph );
+		_mm256_store_pd( &C[0+bs*3], v7 );
+
+		C += sdc*bs;
+
+#else
+
+		v0 = _mm256_load_pd( &A[0+bs*0] ); // 00 10 20 30
+		v1 = _mm256_load_pd( &A[0+bs*1] ); // 01 11 21 31
+		v4 = _mm256_unpacklo_pd( v0, v1 ); // 00 01 20 21
+		v5 = _mm256_unpackhi_pd( v0, v1 ); // 10 11 30 31
+		v2 = _mm256_load_pd( &A[0+bs*2] ); // 02 12 22 32
+		v3 = _mm256_load_pd( &A[0+bs*3] ); // 03 13 23 33
+		v6 = _mm256_unpacklo_pd( v2, v3 ); // 02 03 22 23
+		v7 = _mm256_unpackhi_pd( v2, v3 ); // 12 13 32 33
+		
+		A += bs*bs;
+
+		v0 = _mm256_permute2f128_pd( v4, v6, 0x20 ); // 00 01 02 03
+		_mm256_store_pd( &C[0+bs*0], v0 );
+		v2 = _mm256_permute2f128_pd( v4, v6, 0x31 ); // 20 21 22 23
+		_mm256_store_pd( &C[0+bs*2], v2 );
+		v1 = _mm256_permute2f128_pd( v5, v7, 0x20 ); // 10 11 12 13
+		_mm256_store_pd( &C[0+bs*1], v1 );
+		v3 = _mm256_permute2f128_pd( v5, v7, 0x31 ); // 30 31 32 33
+		_mm256_store_pd( &C[0+bs*3], v3 );
+
+		C += bs*sdc;
+
+#endif
+
+		}
+
+	cleanup_loop:
+
+	for( ; k<kmax; k++)
+		{
+		C[0+bs*0] = alpha * A[0+bs*0];
+		C[0+bs*1] = alpha * A[1+bs*0];
+		C[0+bs*2] = alpha * A[2+bs*0];
+		C[0+bs*3] = alpha * A[3+bs*0];
+
+		C += 1;
+		A += bs;
+		}
+
+	if(tri==1)
+		{
+		// end 3x3 triangle
+		// reuse kna as the number of rows of C left in the current panel
+		kna = (bs-(bs-kna+kmax)%bs)%bs;
+
+		if(kna==1)
+			{
+			C[0+bs*1] = alpha * A[1+bs*0];
+			C[0+bs*2] = alpha * A[2+bs*0];
+			C[0+bs*3] = alpha * A[3+bs*0];
+			C[1+bs*(sdc+1)] = alpha * A[2+bs*1];
+			C[1+bs*(sdc+2)] = alpha * A[3+bs*1];
+			C[2+bs*(sdc+2)] = alpha * A[3+bs*2];
+			}
+		else if(kna==2)
+			{
+			C[0+bs*1] = alpha * A[1+bs*0];
+			C[0+bs*2] = alpha * A[2+bs*0];
+			C[0+bs*3] = alpha * A[3+bs*0];
+			C[1+bs*2] = alpha * A[2+bs*1];
+			C[1+bs*3] = alpha * A[3+bs*1];
+			C[2+bs*(sdc+2)] = alpha * A[3+bs*2];
+			}
+		else
+			{
+			C[0+bs*1] = alpha * A[1+bs*0];
+			C[0+bs*2] = alpha * A[2+bs*0];
+			C[0+bs*3] = alpha * A[3+bs*0];
+			C[1+bs*2] = alpha * A[2+bs*1];
+			C[1+bs*3] = alpha * A[3+bs*1];
+			C[2+bs*3] = alpha * A[3+bs*2];
+			}
+		}
+
+	}
+
+
+
+// transpose of a general matrix, read along panels, write across panels
+void kernel_dgetr_3_lib4(int tri, int kmax, int kna, double alpha, double *A, double *C, int sdc)
+	{
+
+	if(tri==1)
+		{
+		// A is lower triangular, C is upper triangular
+		// kmax+1 3-wide + end 2x2 triangle
+
+		kmax += 1;
+		}
+
+	const int bs = 4;
+	
+	int k;
+
+	k = 0;
+
+	if(kmax<kna)
+		goto cleanup_loop;
+
+	if(kna>0)
+		{
+		for( ; k<kna; k++)
+			{
+			C[0+bs*0] = alpha * A[0+bs*0];
+			C[0+bs*1] = alpha * A[1+bs*0];
+			C[0+bs*2] = alpha * A[2+bs*0];
+
+			C += 1;
+			A += bs;
+			}
+		C += bs*(sdc-1);
+		}
+	
+	for( ; k<kmax-3; k+=4)
+		{
+		C[0+bs*0] = alpha * A[0+bs*0];
+		C[0+bs*1] = alpha * A[1+bs*0];
+		C[0+bs*2] = alpha * A[2+bs*0];
+
+		C[1+bs*0] = alpha * A[0+bs*1];
+		C[1+bs*1] = alpha * A[1+bs*1];
+		C[1+bs*2] = alpha * A[2+bs*1];
+
+		C[2+bs*0] = alpha * A[0+bs*2];
+		C[2+bs*1] = alpha * A[1+bs*2];
+		C[2+bs*2] = alpha * A[2+bs*2];
+
+		C[3+bs*0] = alpha * A[0+bs*3];
+		C[3+bs*1] = alpha * A[1+bs*3];
+		C[3+bs*2] = alpha * A[2+bs*3];
+
+		C += bs*sdc;
+		A += bs*bs;
+		}
+	
+	cleanup_loop:
+
+	for( ; k<kmax; k++)
+		{
+		C[0+bs*0] = alpha * A[0+bs*0];
+		C[0+bs*1] = alpha * A[1+bs*0];
+		C[0+bs*2] = alpha * A[2+bs*0];
+
+		C += 1;
+		A += bs;
+		}
+
+	if(tri==1)
+		{
+		// end 2x2 triangle
+		kna = (bs-(bs-kna+kmax)%bs)%bs;
+
+		if(kna==1)
+			{
+			C[0+bs*1] = alpha * A[1+bs*0];
+			C[0+bs*2] = alpha * A[2+bs*0];
+			C[1+bs*(sdc+1)] = alpha * A[2+bs*1];
+			}
+		else
+			{
+			C[0+bs*1] = alpha * A[1+bs*0];
+			C[0+bs*2] = alpha * A[2+bs*0];
+			C[1+bs*2] = alpha * A[2+bs*1];
+			}
+		}
+
+	}
+
+
+
+// transpose of a general matrix, read along panels, write across panels
+void kernel_dgetr_2_lib4(int tri, int kmax, int kna, double alpha, double *A, double *C, int sdc)
+	{
+
+	if(tri==1)
+		{
+		// A is lower triangular, C is upper triangular
+		// kmax+1 2-wide + end 1x1 triangle
+
+		kmax += 1;
+		}
+
+	const int bs = 4;
+	
+	int k;
+
+	k = 0;
+
+	if(kmax<kna)
+		goto cleanup_loop;
+
+	if(kna>0)
+		{
+		for( ; k<kna; k++)
+			{
+			C[0+bs*0] = alpha * A[0+bs*0];
+			C[0+bs*1] = alpha * A[1+bs*0];
+
+			C += 1;
+			A += bs;
+			}
+		C += bs*(sdc-1);
+		}
+	
+	for( ; k<kmax-3; k+=4)
+		{
+		C[0+bs*0] = alpha * A[0+bs*0];
+		C[0+bs*1] = alpha * A[1+bs*0];
+
+		C[1+bs*0] = alpha * A[0+bs*1];
+		C[1+bs*1] = alpha * A[1+bs*1];
+
+		C[2+bs*0] = alpha * A[0+bs*2];
+		C[2+bs*1] = alpha * A[1+bs*2];
+
+		C[3+bs*0] = alpha * A[0+bs*3];
+		C[3+bs*1] = alpha * A[1+bs*3];
+
+		C += bs*sdc;
+		A += bs*bs;
+		}
+	
+	cleanup_loop:
+
+	for( ; k<kmax; k++)
+		{
+		C[0+bs*0] = alpha * A[0+bs*0];
+		C[0+bs*1] = alpha * A[1+bs*0];
+
+		C += 1;
+		A += bs;
+		}
+	
+	if(tri==1)
+		{
+		// end 1x1 triangle
+		C[0+bs*1] = alpha * A[1+bs*0];
+		}
+
+	}
+
+
+
+// transpose of a general matrix, read along panels, write across panels
+void kernel_dgetr_1_lib4(int tri, int kmax, int kna, double alpha, double *A, double *C, int sdc)
+	{
+
+	if(tri==1)
+		{
+		// A is lower triangular, C is upper triangular
+		// kmax+1 1-wide
+
+		kmax += 1;
+		}
+
+	const int bs = 4;
+	
+	int k;
+
+	k = 0;
+
+	if(kmax<kna)
+		goto cleanup_loop;
+
+	if(kna>0)
+		{
+		for( ; k<kna; k++)
+			{
+			C[0+bs*0] = alpha * A[0+bs*0];
+
+			C += 1;
+			A += bs;
+			}
+		C += bs*(sdc-1);
+		}
+	
+	for( ; k<kmax-3; k+=4)
+		{
+		C[0+bs*0] = alpha * A[0+bs*0];
+
+		C[1+bs*0] = alpha * A[0+bs*1];
+
+		C[2+bs*0] = alpha * A[0+bs*2];
+
+		C[3+bs*0] = alpha * A[0+bs*3];
+
+		C += bs*sdc;
+		A += bs*bs;
+		}
+	
+	cleanup_loop:
+
+	for( ; k<kmax; k++)
+		{
+		C[0+bs*0] = alpha * A[0+bs*0];
+
+		C += 1;
+		A += bs;
+		}
+
+	}
+
+
+
+// transpose of a general matrix, read across panels, write along panels
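+//
+// Unlike kernel_dgetr_4_lib4 above, this variant reads 4 rows across panels
+// of A (panel stride sda) and writes along a single panel of B, with no
+// scaling, no triangular option and no alignment cleanup: both matrices are
+// assumed to start at a panel boundary.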
+void kernel_dgetr_4_0_lib4(int kmax, double *A, int sda, double *B)
+	{
+	const int ps = 4;
+	__m256d
+		v0, v1, v2, v3, v4, v5, v6, v7;
+	int k;
+	for(k=0; k<kmax-3; k+=4)
+		{
+
+		v0 = _mm256_insertf128_pd( _mm256_castpd128_pd256( _mm_load_pd( &A[0+ps*0] ) ), _mm_load_pd( &A[0+ps*2]) , 0x1 ); // 00 10 02 12
+		v1 = _mm256_insertf128_pd( _mm256_castpd128_pd256( _mm_load_pd( &A[0+ps*1] ) ), _mm_load_pd( &A[0+ps*3]) , 0x1 ); // 01 11 03 13
+		v2 = _mm256_insertf128_pd( _mm256_castpd128_pd256( _mm_load_pd( &A[2+ps*0] ) ), _mm_load_pd( &A[2+ps*2]) , 0x1 ); // 20 30 22 32
+		v3 = _mm256_insertf128_pd( _mm256_castpd128_pd256( _mm_load_pd( &A[2+ps*1] ) ), _mm_load_pd( &A[2+ps*3]) , 0x1 ); // 21 31 23 33
+		
+		v4 = _mm256_unpacklo_pd( v0, v1 ); // 00 01 02 03
+		_mm256_store_pd( &B[0+ps*0], v4 );
+		v5 = _mm256_unpackhi_pd( v0, v1 ); // 10 11 12 13
+		_mm256_store_pd( &B[0+ps*1], v5 );
+		v6 = _mm256_unpacklo_pd( v2, v3 ); // 20 21 22 23
+		_mm256_store_pd( &B[0+ps*2], v6 );
+		v7 = _mm256_unpackhi_pd( v2, v3 ); // 30 31 32 33
+		_mm256_store_pd( &B[0+ps*3], v7 );
+
+		A += ps*sda;
+		B += ps*ps;
+		}
+	for( ; k<kmax; k++)
+		{
+		//
+		B[0+ps*0] = A[0+ps*0];
+		B[1+ps*0] = A[0+ps*1];
+		B[2+ps*0] = A[0+ps*2];
+		B[3+ps*0] = A[0+ps*3];
+
+		A += 1;
+		B += ps;
+		}
+	return;
+	}
+
diff --git a/auxiliary/c99/Makefile b/auxiliary/c99/Makefile
new file mode 100644
index 0000000..6e9ea7b
--- /dev/null
+++ b/auxiliary/c99/Makefile
@@ -0,0 +1,77 @@
+###################################################################################################
+#                                                                                                 #
+# This file is part of BLASFEO.                                                                   #
+#                                                                                                 #
+# BLASFEO -- BLAS For Embedded Optimization.                                                      #
+# Copyright (C) 2016-2017 by Gianluca Frison.                                                     #
+# Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl.              #
+# All rights reserved.                                                                            #
+#                                                                                                 #
+# HPMPC is free software; you can redistribute it and/or                                          #
+# modify it under the terms of the GNU Lesser General Public                                      #
+# License as published by the Free Software Foundation; either                                    #
+# version 2.1 of the License, or (at your option) any later version.                              #
+#                                                                                                 #
+# HPMPC is distributed in the hope that it will be useful,                                        #
+# but WITHOUT ANY WARRANTY; without even the implied warranty of                                  #
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.                                            #
+# See the GNU Lesser General Public License for more details.                                     #
+#                                                                                                 #
+# You should have received a copy of the GNU Lesser General Public                                #
+# License along with HPMPC; if not, write to the Free Software                                    #
+# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA                  #
+#                                                                                                 #
+# Author: Gianluca Frison, giaf (at) dtu.dk                                                       #
+#                          gianluca.frison (at) imtek.uni-freiburg.de                             #
+#                                                                                                 #
+###################################################################################################
+
+include ../../Makefile.rule
+
+OBJS = 
+
+ifeq ($(LA), HIGH_PERFORMANCE)
+
+ifeq ($(TARGET), X64_INTEL_HASWELL)
+OBJS += 
+OBJS += kernel_sgetr_lib4.o
+endif
+
+ifeq ($(TARGET), X64_INTEL_SANDY_BRIDGE)
+OBJS += 
+OBJS += kernel_sgetr_lib4.o
+endif
+
+ifeq ($(TARGET), X64_INTEL_CORE)
+OBJS += kernel_dgecp_lib4.o kernel_dgetr_lib4.o
+OBJS += kernel_sgetr_lib4.o
+endif
+
+ifeq ($(TARGET), X64_AMD_BULLDOZER)
+OBJS += kernel_dgecp_lib4.o kernel_dgetr_lib4.o
+OBJS += kernel_sgetr_lib4.o
+endif
+
+ifeq ($(TARGET), ARMV8A_ARM_CORTEX_A57)
+OBJS += kernel_dgecp_lib4.o kernel_dgetr_lib4.o
+OBJS += kernel_sgetr_lib4.o
+endif
+
+ifeq ($(TARGET), ARMV7A_ARM_CORTEX_A15)
+OBJS += kernel_dgecp_lib4.o kernel_dgetr_lib4.o
+OBJS += kernel_sgetr_lib4.o
+endif
+
+ifeq ($(TARGET), GENERIC)
+OBJS += kernel_dgecp_lib4.o kernel_dgetr_lib4.o
+OBJS += kernel_sgetr_lib4.o
+endif
+
+else # LA_REFERENCE | LA_BLAS
+
+endif # LA choice
+
+obj: $(OBJS)
+
+clean:
+	rm -f *.o
diff --git a/auxiliary/c99/kernel_dgecp_lib4.c b/auxiliary/c99/kernel_dgecp_lib4.c
new file mode 100644
index 0000000..e883072
--- /dev/null
+++ b/auxiliary/c99/kernel_dgecp_lib4.c
@@ -0,0 +1,1261 @@
+/**************************************************************************************************
+*                                                                                                 *
+* This file is part of BLASFEO.                                                                   *
+*                                                                                                 *
+* BLASFEO -- BLAS For Embedded Optimization.                                                      *
+* Copyright (C) 2016-2017 by Gianluca Frison.                                                     *
+* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl.              *
+* All rights reserved.                                                                            *
+*                                                                                                 *
+* HPMPC is free software; you can redistribute it and/or                                          *
+* modify it under the terms of the GNU Lesser General Public                                      *
+* License as published by the Free Software Foundation; either                                    *
+* version 2.1 of the License, or (at your option) any later version.                              *
+*                                                                                                 *
+* HPMPC is distributed in the hope that it will be useful,                                        *
+* but WITHOUT ANY WARRANTY; without even the implied warranty of                                  *
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.                                            *
+* See the GNU Lesser General Public License for more details.                                     *
+*                                                                                                 *
+* You should have received a copy of the GNU Lesser General Public                                *
+* License along with HPMPC; if not, write to the Free Software                                    *
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA                  *
+*                                                                                                 *
+* Author: Gianluca Frison, giaf (at) dtu.dk                                                       *
+*                          gianluca.frison (at) imtek.uni-freiburg.de                             *
+*                                                                                                 *
+**************************************************************************************************/
+
+
+
+// both A and B are aligned to 256-bit boundaries
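+//
+// The dgecp kernels copy a 4-row strip of a panel-major matrix, B = alpha*A,
+// one 4x4 block per iteration of the unrolled loop. With tri==1 the kernel
+// copies kmax+1 full 4-wide columns and then the strictly lower 3x3 triangle
+// of the following three columns, so a lower triangular block is copied
+// without touching the upper part.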
+void kernel_dgecp_4_0_lib4(int tri, int kmax, double alpha, double *A, double *B)
+	{
+
+	if(tri==1)
+		{
+		// A and C are lower triangular
+		// kmax+1 4-wide + end 3x3 triangle
+
+		kmax += 1;
+		}
+
+	if(kmax<=0)
+		return;
+
+	const int bs = 4;
+
+	int k;
+
+	for(k=0; k<kmax-3; k+=4)
+		{
+		B[0+bs*0] = alpha*A[0+bs*0];
+		B[1+bs*0] = alpha*A[1+bs*0];
+		B[2+bs*0] = alpha*A[2+bs*0];
+		B[3+bs*0] = alpha*A[3+bs*0];
+
+		B[0+bs*1] = alpha*A[0+bs*1];
+		B[1+bs*1] = alpha*A[1+bs*1];
+		B[2+bs*1] = alpha*A[2+bs*1];
+		B[3+bs*1] = alpha*A[3+bs*1];
+
+		B[0+bs*2] = alpha*A[0+bs*2];
+		B[1+bs*2] = alpha*A[1+bs*2];
+		B[2+bs*2] = alpha*A[2+bs*2];
+		B[3+bs*2] = alpha*A[3+bs*2];
+
+		B[0+bs*3] = alpha*A[0+bs*3];
+		B[1+bs*3] = alpha*A[1+bs*3];
+		B[2+bs*3] = alpha*A[2+bs*3];
+		B[3+bs*3] = alpha*A[3+bs*3];
+
+		A += 16;
+		B += 16;
+
+		}
+	for(; k<kmax; k++)
+		{
+
+		B[0+bs*0] = alpha*A[0+bs*0];
+		B[1+bs*0] = alpha*A[1+bs*0];
+		B[2+bs*0] = alpha*A[2+bs*0];
+		B[3+bs*0] = alpha*A[3+bs*0];
+
+		A += 4;
+		B += 4;
+
+		}
+	
+	if(tri==1)
+		{
+		// 3x3 triangle
+
+		B[1+bs*0] = alpha*A[1+bs*0];
+		B[2+bs*0] = alpha*A[2+bs*0];
+		B[3+bs*0] = alpha*A[3+bs*0];
+
+		B[2+bs*1] = alpha*A[2+bs*1];
+		B[3+bs*1] = alpha*A[3+bs*1];
+
+		B[3+bs*2] = alpha*A[3+bs*2];
+
+		}
+
+	}
+
+
+
+// both A and B are aligned to 256-bit boundaries, 1 element of A must be skipped
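+//
+// In the _4_x variants the suffix x is the row offset of the source inside
+// its panel: each output column gathers from two consecutive panels, A0 (its
+// last 4-x rows) and A1 = A0 + bs*sda (its first x rows).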
+void kernel_dgecp_4_1_lib4(int tri, int kmax, double alpha, double *A0, int sda, double *B)
+	{
+
+	if(tri==1)
+		{
+		// A and C are lower triangular
+		// kmax+1 4-wide + end 3x3 triangle
+
+		kmax += 1;
+		}
+
+	if(kmax<=0)
+		return;
+
+	const int bs = 4;
+
+	double *A1 = A0 + bs*sda;
+
+	int k;
+
+	for(k=0; k<kmax-3; k+=4)
+		{
+
+		B[0+bs*0] = alpha*A0[1+bs*0];
+		B[1+bs*0] = alpha*A0[2+bs*0];
+		B[2+bs*0] = alpha*A0[3+bs*0];
+		B[3+bs*0] = alpha*A1[0+bs*0];
+
+		B[0+bs*1] = alpha*A0[1+bs*1];
+		B[1+bs*1] = alpha*A0[2+bs*1];
+		B[2+bs*1] = alpha*A0[3+bs*1];
+		B[3+bs*1] = alpha*A1[0+bs*1];
+
+		B[0+bs*2] = alpha*A0[1+bs*2];
+		B[1+bs*2] = alpha*A0[2+bs*2];
+		B[2+bs*2] = alpha*A0[3+bs*2];
+		B[3+bs*2] = alpha*A1[0+bs*2];
+
+		B[0+bs*3] = alpha*A0[1+bs*3];
+		B[1+bs*3] = alpha*A0[2+bs*3];
+		B[2+bs*3] = alpha*A0[3+bs*3];
+		B[3+bs*3] = alpha*A1[0+bs*3];
+
+		A0 += 16;
+		A1 += 16;
+		B  += 16;
+
+		}
+	for(; k<kmax; k++)
+		{
+
+		B[0+bs*0] = alpha*A0[1+bs*0];
+		B[1+bs*0] = alpha*A0[2+bs*0];
+		B[2+bs*0] = alpha*A0[3+bs*0];
+		B[3+bs*0] = alpha*A1[0+bs*0];
+
+		A0 += 4;
+		A1 += 4;
+		B  += 4;
+
+		}
+	
+	if(tri==1)
+		{
+		// 3x3 triangle
+
+		B[1+bs*0] = alpha*A0[2+bs*0];
+		B[2+bs*0] = alpha*A0[3+bs*0];
+		B[3+bs*0] = alpha*A1[0+bs*0];
+
+		B[2+bs*1] = alpha*A0[3+bs*1];
+		B[3+bs*1] = alpha*A1[0+bs*1];
+
+		B[3+bs*2] = alpha*A1[0+bs*2];
+
+		}
+
+	}
+
+
+
+// both A and B are aligned to 256-bit boundaries, 2 elements of A must be skipped
+void kernel_dgecp_4_2_lib4(int tri, int kmax, double alpha, double *A0, int sda, double *B)
+	{
+
+	if(tri==1)
+		{
+		// A and C are lower triangular
+		// kmax+1 4-wide + end 3x3 triangle
+
+		kmax += 1;
+		}
+
+	if(kmax<=0)
+		return;
+
+	const int bs = 4;
+
+	double *A1 = A0 + bs*sda;
+
+	int k;
+
+	for(k=0; k<kmax-3; k+=4)
+		{
+
+		B[0+bs*0] = alpha*A0[2+bs*0];
+		B[1+bs*0] = alpha*A0[3+bs*0];
+		B[2+bs*0] = alpha*A1[0+bs*0];
+		B[3+bs*0] = alpha*A1[1+bs*0];
+
+		B[0+bs*1] = alpha*A0[2+bs*1];
+		B[1+bs*1] = alpha*A0[3+bs*1];
+		B[2+bs*1] = alpha*A1[0+bs*1];
+		B[3+bs*1] = alpha*A1[1+bs*1];
+
+		B[0+bs*2] = alpha*A0[2+bs*2];
+		B[1+bs*2] = alpha*A0[3+bs*2];
+		B[2+bs*2] = alpha*A1[0+bs*2];
+		B[3+bs*2] = alpha*A1[1+bs*2];
+
+		B[0+bs*3] = alpha*A0[2+bs*3];
+		B[1+bs*3] = alpha*A0[3+bs*3];
+		B[2+bs*3] = alpha*A1[0+bs*3];
+		B[3+bs*3] = alpha*A1[1+bs*3];
+
+		A0 += 16;
+		A1 += 16;
+		B  += 16;
+
+		}
+	for(; k<kmax; k++)
+		{
+
+		B[0+bs*0] = alpha*A0[2+bs*0];
+		B[1+bs*0] = alpha*A0[3+bs*0];
+		B[2+bs*0] = alpha*A1[0+bs*0];
+		B[3+bs*0] = alpha*A1[1+bs*0];
+
+		A0 += 4;
+		A1 += 4;
+		B  += 4;
+
+		}
+	
+	if(tri==1)
+		{
+		// 3x3 triangle
+
+		B[1+bs*0] = alpha*A0[3+bs*0];
+		B[2+bs*0] = alpha*A1[0+bs*0];
+		B[3+bs*0] = alpha*A1[1+bs*0];
+
+		B[2+bs*1] = alpha*A1[0+bs*1];
+		B[3+bs*1] = alpha*A1[1+bs*1];
+
+		B[3+bs*2] = alpha*A1[1+bs*2];
+
+		}
+
+	}
+
+
+
+// both A and B are aligned to 256-bit boundaries, 3 elements of A must be skipped
+void kernel_dgecp_4_3_lib4(int tri, int kmax, double alpha, double *A0, int sda, double *B)
+	{
+
+	if(tri==1)
+		{
+		// A and C are lower triangular
+		// kmax+1 4-wide + end 3x3 triangle
+
+		kmax += 1;
+		}
+
+	if(kmax<=0)
+		return;
+
+	const int bs = 4;
+
+	double *A1 = A0 + bs*sda;
+
+	int k;
+
+	for(k=0; k<kmax-3; k+=4)
+		{
+
+		B[0+bs*0] = alpha*A0[3+bs*0];
+		B[1+bs*0] = alpha*A1[0+bs*0];
+		B[2+bs*0] = alpha*A1[1+bs*0];
+		B[3+bs*0] = alpha*A1[2+bs*0];
+
+		B[0+bs*1] = alpha*A0[3+bs*1];
+		B[1+bs*1] = alpha*A1[0+bs*1];
+		B[2+bs*1] = alpha*A1[1+bs*1];
+		B[3+bs*1] = alpha*A1[2+bs*1];
+
+		B[0+bs*2] = alpha*A0[3+bs*2];
+		B[1+bs*2] = alpha*A1[0+bs*2];
+		B[2+bs*2] = alpha*A1[1+bs*2];
+		B[3+bs*2] = alpha*A1[2+bs*2];
+
+		B[0+bs*3] = alpha*A0[3+bs*3];
+		B[1+bs*3] = alpha*A1[0+bs*3];
+		B[2+bs*3] = alpha*A1[1+bs*3];
+		B[3+bs*3] = alpha*A1[2+bs*3];
+
+		A0 += 16;
+		A1 += 16;
+		B  += 16;
+
+		}
+	for(; k<kmax; k++)
+		{
+
+		B[0+bs*0] = alpha*A0[3+bs*0];
+		B[1+bs*0] = alpha*A1[0+bs*0];
+		B[2+bs*0] = alpha*A1[1+bs*0];
+		B[3+bs*0] = alpha*A1[2+bs*0];
+
+		A0 += 4;
+		A1 += 4;
+		B  += 4;
+
+		}
+	
+	if(tri==1)
+		{
+		// 3x3 triangle
+
+		B[1+bs*0] = alpha*A1[0+bs*0];
+		B[2+bs*0] = alpha*A1[1+bs*0];
+		B[3+bs*0] = alpha*A1[2+bs*0];
+
+		B[2+bs*1] = alpha*A1[1+bs*1];
+		B[3+bs*1] = alpha*A1[2+bs*1];
+
+		B[3+bs*2] = alpha*A1[2+bs*2];
+
+		}
+
+	}
+
+
+
+// both A and B are aligned to 64-bit boundaries
+void kernel_dgecp_3_0_lib4(int tri, int kmax, double alpha, double *A, double *B)
+	{
+
+	if(tri==1)
+		{
+		// A and C are lower triangular
+		// kmax+1 3-wide + end 2x2 triangle
+
+		kmax += 1;
+		}
+
+	if(kmax<=0)
+		return;
+
+	const int bs = 4;
+
+	int k;
+
+	for(k=0; k<kmax-3; k+=4)
+		{
+		B[0+bs*0] = alpha*A[0+bs*0];
+		B[1+bs*0] = alpha*A[1+bs*0];
+		B[2+bs*0] = alpha*A[2+bs*0];
+
+		B[0+bs*1] = alpha*A[0+bs*1];
+		B[1+bs*1] = alpha*A[1+bs*1];
+		B[2+bs*1] = alpha*A[2+bs*1];
+
+		B[0+bs*2] = alpha*A[0+bs*2];
+		B[1+bs*2] = alpha*A[1+bs*2];
+		B[2+bs*2] = alpha*A[2+bs*2];
+
+		B[0+bs*3] = alpha*A[0+bs*3];
+		B[1+bs*3] = alpha*A[1+bs*3];
+		B[2+bs*3] = alpha*A[2+bs*3];
+
+		A += 16;
+		B += 16;
+
+		}
+	for(; k<kmax; k++)
+		{
+
+		B[0+bs*0] = alpha*A[0+bs*0];
+		B[1+bs*0] = alpha*A[1+bs*0];
+		B[2+bs*0] = alpha*A[2+bs*0];
+
+		A += 4;
+		B += 4;
+
+		}
+	
+	if(tri==1)
+		{
+		// 2x2 triangle
+
+		B[1+bs*0] = alpha*A[1+bs*0];
+		B[2+bs*0] = alpha*A[2+bs*0];
+
+		B[2+bs*1] = alpha*A[2+bs*1];
+
+		}
+
+	}
+
+
+
+// both A and B are aligned to 256-bit boundaries, 2 elements of A must be skipped
+void kernel_dgecp_3_2_lib4(int tri, int kmax, double alpha, double *A0, int sda, double *B)
+	{
+
+	if(tri==1)
+		{
+		// A and C are lower triangular
+		// kmax+1 3-wide + end 2x2 triangle
+
+		kmax += 1;
+		}
+
+	if(kmax<=0)
+		return;
+
+	const int bs = 4;
+
+	double *A1 = A0 + bs*sda;
+
+	int k;
+
+	for(k=0; k<kmax-3; k+=4)
+		{
+
+		B[0+bs*0] = alpha*A0[2+bs*0];
+		B[1+bs*0] = alpha*A0[3+bs*0];
+		B[2+bs*0] = alpha*A1[0+bs*0];
+
+		B[0+bs*1] = alpha*A0[2+bs*1];
+		B[1+bs*1] = alpha*A0[3+bs*1];
+		B[2+bs*1] = alpha*A1[0+bs*1];
+
+		B[0+bs*2] = alpha*A0[2+bs*2];
+		B[1+bs*2] = alpha*A0[3+bs*2];
+		B[2+bs*2] = alpha*A1[0+bs*2];
+
+		B[0+bs*3] = alpha*A0[2+bs*3];
+		B[1+bs*3] = alpha*A0[3+bs*3];
+		B[2+bs*3] = alpha*A1[0+bs*3];
+
+		A0 += 16;
+		A1 += 16;
+		B  += 16;
+
+		}
+	for(; k<kmax; k++)
+		{
+
+		B[0+bs*0] = alpha*A0[2+bs*0];
+		B[1+bs*0] = alpha*A0[3+bs*0];
+		B[2+bs*0] = alpha*A1[0+bs*0];
+
+		A0 += 4;
+		A1 += 4;
+		B  += 4;
+
+		}
+	
+	if(tri==1)
+		{
+		// 2x2 triangle
+
+		B[1+bs*0] = alpha*A0[3+bs*0];
+		B[2+bs*0] = alpha*A1[0+bs*0];
+
+		B[2+bs*1] = alpha*A1[0+bs*1];
+
+		}
+
+	}
+
+
+
+// both A and B are aligned to 256-bit boundaries, 3 elements of A must be skipped
+void kernel_dgecp_3_3_lib4(int tri, int kmax, double alpha, double *A0, int sda, double *B)
+	{
+
+	if(tri==1)
+		{
+		// A and C are lower triangular
+		// kmax+1 3-wide + end 2x2 triangle
+
+		kmax += 1;
+		}
+
+	if(kmax<=0)
+		return;
+
+	const int bs = 4;
+
+	double *A1 = A0 + bs*sda;
+
+	int k;
+
+	for(k=0; k<kmax-3; k+=4)
+		{
+
+		B[0+bs*0] = alpha*A0[3+bs*0];
+		B[1+bs*0] = alpha*A1[0+bs*0];
+		B[2+bs*0] = alpha*A1[1+bs*0];
+
+		B[0+bs*1] = alpha*A0[3+bs*1];
+		B[1+bs*1] = alpha*A1[0+bs*1];
+		B[2+bs*1] = alpha*A1[1+bs*1];
+
+		B[0+bs*2] = alpha*A0[3+bs*2];
+		B[1+bs*2] = alpha*A1[0+bs*2];
+		B[2+bs*2] = alpha*A1[1+bs*2];
+
+		B[0+bs*3] = alpha*A0[3+bs*3];
+		B[1+bs*3] = alpha*A1[0+bs*3];
+		B[2+bs*3] = alpha*A1[1+bs*3];
+
+		A0 += 16;
+		A1 += 16;
+		B  += 16;
+
+		}
+	for(; k<kmax; k++)
+		{
+
+		B[0+bs*0] = alpha*A0[3+bs*0];
+		B[1+bs*0] = alpha*A1[0+bs*0];
+		B[2+bs*0] = alpha*A1[1+bs*0];
+
+		A0 += 4;
+		A1 += 4;
+		B  += 4;
+
+		}
+	
+	if(tri==1)
+		{
+		// 2x2 triangle
+
+		B[1+bs*0] = alpha*A1[0+bs*0];
+		B[2+bs*0] = alpha*A1[1+bs*0];
+
+		B[2+bs*1] = alpha*A1[1+bs*1];
+
+		}
+
+	}
+
+
+
+// both A and B are aligned to 64-bit boundaries
+void kernel_dgecp_2_0_lib4(int tri, int kmax, double alpha, double *A, double *B)
+	{
+
+	if(tri==1)
+		{
+		// A and C are lower triangular
+		// kmax+1 2-wide + end 1x1 triangle
+
+		kmax += 1;
+		}
+
+	if(kmax<=0)
+		return;
+
+	const int bs = 4;
+
+	int k;
+
+	for(k=0; k<kmax-3; k+=4)
+		{
+		B[0+bs*0] = alpha*A[0+bs*0];
+		B[1+bs*0] = alpha*A[1+bs*0];
+
+		B[0+bs*1] = alpha*A[0+bs*1];
+		B[1+bs*1] = alpha*A[1+bs*1];
+
+		B[0+bs*2] = alpha*A[0+bs*2];
+		B[1+bs*2] = alpha*A[1+bs*2];
+
+		B[0+bs*3] = alpha*A[0+bs*3];
+		B[1+bs*3] = alpha*A[1+bs*3];
+
+		A += 16;
+		B += 16;
+
+		}
+	for(; k<kmax; k++)
+		{
+
+		B[0+bs*0] = alpha*A[0+bs*0];
+		B[1+bs*0] = alpha*A[1+bs*0];
+
+		A += 4;
+		B += 4;
+
+		}
+	
+	if(tri==1)
+		{
+		// 1x1 triangle
+
+		B[1+bs*0] = alpha*A[1+bs*0];
+
+		}
+
+	}
+
+
+
+// both A and B are aligned to 128-bit boundaries, 3 elements of A must be skipped
+void kernel_dgecp_2_3_lib4(int tri, int kmax, double alpha, double *A0, int sda, double *B)
+	{
+
+	if(tri==1)
+		{
+		// A and C are lower triangular
+		// kmax+1 2-wide + end 1x1 triangle
+
+		kmax += 1;
+		}
+
+	if(kmax<=0)
+		return;
+
+	const int bs = 4;
+
+	double *A1 = A0 + bs*sda;
+
+	int k;
+
+	for(k=0; k<kmax-3; k+=4)
+		{
+
+		B[0+bs*0] = alpha*A0[3+bs*0];
+		B[1+bs*0] = alpha*A1[0+bs*0];
+
+		B[0+bs*1] = alpha*A0[3+bs*1];
+		B[1+bs*1] = alpha*A1[0+bs*1];
+
+		B[0+bs*2] = alpha*A0[3+bs*2];
+		B[1+bs*2] = alpha*A1[0+bs*2];
+
+		B[0+bs*3] = alpha*A0[3+bs*3];
+		B[1+bs*3] = alpha*A1[0+bs*3];
+
+		A0 += 16;
+		A1 += 16;
+		B  += 16;
+
+		}
+	for(; k<kmax; k++)
+		{
+
+		B[0+bs*0] = alpha*A0[3+bs*0];
+		B[1+bs*0] = alpha*A1[0+bs*0];
+
+		A0 += 4;
+		A1 += 4;
+		B  += 4;
+
+		}
+	
+	if(tri==1)
+		{
+		// 1x1 triangle
+
+		B[1+bs*0] = alpha*A1[0+bs*0];
+
+		}
+
+	}
+
+
+
+// both A and B are aligned to 64-bit boundaries
+void kernel_dgecp_1_0_lib4(int tri, int kmax, double alpha, double *A, double *B)
+	{
+
+	if(tri==1)
+		{
+		// A and C are lower triangular
+		// kmax+1 1-wide
+
+		kmax += 1;
+		}
+
+	if(kmax<=0)
+		return;
+
+	const int bs = 4;
+
+	int k;
+
+	for(k=0; k<kmax-3; k+=4)
+		{
+		B[0+bs*0] = alpha*A[0+bs*0];
+
+		B[0+bs*1] = alpha*A[0+bs*1];
+
+		B[0+bs*2] = alpha*A[0+bs*2];
+
+		B[0+bs*3] = alpha*A[0+bs*3];
+
+		A += 16;
+		B += 16;
+
+		}
+	for(; k<kmax; k++)
+		{
+
+		B[0+bs*0] = alpha*A[0+bs*0];
+
+		A += 4;
+		B += 4;
+
+		}
+
+	}
+
+
+
+
+// both A and B are aligned to 256-bit boundaries
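+//
+// The dgead kernels accumulate instead of copy, B += alpha*A, with the same
+// 4-column unrolling and _x_y offset naming as the dgecp kernels above;
+// there is no triangular variant.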
+void kernel_dgead_4_0_lib4(int kmax, double alpha, double *A, double *B)
+	{
+
+	if(kmax<=0)
+		return;
+
+	const int bs = 4;
+
+	int k;
+
+	for(k=0; k<kmax-3; k+=4)
+		{
+		B[0+bs*0] += alpha * A[0+bs*0];
+		B[1+bs*0] += alpha * A[1+bs*0];
+		B[2+bs*0] += alpha * A[2+bs*0];
+		B[3+bs*0] += alpha * A[3+bs*0];
+
+		B[0+bs*1] += alpha * A[0+bs*1];
+		B[1+bs*1] += alpha * A[1+bs*1];
+		B[2+bs*1] += alpha * A[2+bs*1];
+		B[3+bs*1] += alpha * A[3+bs*1];
+
+		B[0+bs*2] += alpha * A[0+bs*2];
+		B[1+bs*2] += alpha * A[1+bs*2];
+		B[2+bs*2] += alpha * A[2+bs*2];
+		B[3+bs*2] += alpha * A[3+bs*2];
+
+		B[0+bs*3] += alpha * A[0+bs*3];
+		B[1+bs*3] += alpha * A[1+bs*3];
+		B[2+bs*3] += alpha * A[2+bs*3];
+		B[3+bs*3] += alpha * A[3+bs*3];
+
+		A += 16;
+		B += 16;
+
+		}
+	for(; k<kmax; k++)
+		{
+
+		B[0+bs*0] += alpha * A[0+bs*0];
+		B[1+bs*0] += alpha * A[1+bs*0];
+		B[2+bs*0] += alpha * A[2+bs*0];
+		B[3+bs*0] += alpha * A[3+bs*0];
+
+		A += 4;
+		B += 4;
+
+		}
+
+	}
+
+
+
+// both A and B are aligned to 256-bit boundaries, 1 element of A must be skipped
+void kernel_dgead_4_1_lib4(int kmax, double alpha, double *A0, int sda, double *B)
+	{
+
+	if(kmax<=0)
+		return;
+
+	const int bs = 4;
+
+	double *A1 = A0 + bs*sda;
+
+	int k;
+
+	for(k=0; k<kmax-3; k+=4)
+		{
+
+		B[0+bs*0] += alpha * A0[1+bs*0];
+		B[1+bs*0] += alpha * A0[2+bs*0];
+		B[2+bs*0] += alpha * A0[3+bs*0];
+		B[3+bs*0] += alpha * A1[0+bs*0];
+
+		B[0+bs*1] += alpha * A0[1+bs*1];
+		B[1+bs*1] += alpha * A0[2+bs*1];
+		B[2+bs*1] += alpha * A0[3+bs*1];
+		B[3+bs*1] += alpha * A1[0+bs*1];
+
+		B[0+bs*2] += alpha * A0[1+bs*2];
+		B[1+bs*2] += alpha * A0[2+bs*2];
+		B[2+bs*2] += alpha * A0[3+bs*2];
+		B[3+bs*2] += alpha * A1[0+bs*2];
+
+		B[0+bs*3] += alpha * A0[1+bs*3];
+		B[1+bs*3] += alpha * A0[2+bs*3];
+		B[2+bs*3] += alpha * A0[3+bs*3];
+		B[3+bs*3] += alpha * A1[0+bs*3];
+
+		A0 += 16;
+		A1 += 16;
+		B  += 16;
+
+		}
+	for(; k<kmax; k++)
+		{
+
+		B[0+bs*0] += alpha * A0[1+bs*0];
+		B[1+bs*0] += alpha * A0[2+bs*0];
+		B[2+bs*0] += alpha * A0[3+bs*0];
+		B[3+bs*0] += alpha * A1[0+bs*0];
+
+		A0 += 4;
+		A1 += 4;
+		B  += 4;
+
+		}
+
+	}
+
+
+
+// both A and B are aligned to 256-bit boundaries, 2 elements of A must be skipped
+void kernel_dgead_4_2_lib4(int kmax, double alpha, double *A0, int sda, double *B)
+	{
+
+	if(kmax<=0)
+		return;
+
+	const int bs = 4;
+
+	double *A1 = A0 + bs*sda;
+
+	int k;
+
+	for(k=0; k<kmax-3; k+=4)
+		{
+
+		B[0+bs*0] += alpha * A0[2+bs*0];
+		B[1+bs*0] += alpha * A0[3+bs*0];
+		B[2+bs*0] += alpha * A1[0+bs*0];
+		B[3+bs*0] += alpha * A1[1+bs*0];
+
+		B[0+bs*1] += alpha * A0[2+bs*1];
+		B[1+bs*1] += alpha * A0[3+bs*1];
+		B[2+bs*1] += alpha * A1[0+bs*1];
+		B[3+bs*1] += alpha * A1[1+bs*1];
+
+		B[0+bs*2] += alpha * A0[2+bs*2];
+		B[1+bs*2] += alpha * A0[3+bs*2];
+		B[2+bs*2] += alpha * A1[0+bs*2];
+		B[3+bs*2] += alpha * A1[1+bs*2];
+
+		B[0+bs*3] += alpha * A0[2+bs*3];
+		B[1+bs*3] += alpha * A0[3+bs*3];
+		B[2+bs*3] += alpha * A1[0+bs*3];
+		B[3+bs*3] += alpha * A1[1+bs*3];
+
+		A0 += 16;
+		A1 += 16;
+		B  += 16;
+
+		}
+	for(; k<kmax; k++)
+		{
+
+		B[0+bs*0] += alpha * A0[2+bs*0];
+		B[1+bs*0] += alpha * A0[3+bs*0];
+		B[2+bs*0] += alpha * A1[0+bs*0];
+		B[3+bs*0] += alpha * A1[1+bs*0];
+
+		A0 += 4;
+		A1 += 4;
+		B  += 4;
+
+		}
+
+	}
+
+
+
+// both A and B are aligned to 256-bit boundaries, 3 elements of A must be skipped
+void kernel_dgead_4_3_lib4(int kmax, double alpha, double *A0, int sda, double *B)
+	{
+
+	if(kmax<=0)
+		return;
+
+	const int bs = 4;
+
+	double *A1 = A0 + bs*sda;
+
+	int k;
+
+	for(k=0; k<kmax-3; k+=4)
+		{
+
+		B[0+bs*0] += alpha * A0[3+bs*0];
+		B[1+bs*0] += alpha * A1[0+bs*0];
+		B[2+bs*0] += alpha * A1[1+bs*0];
+		B[3+bs*0] += alpha * A1[2+bs*0];
+
+		B[0+bs*1] += alpha * A0[3+bs*1];
+		B[1+bs*1] += alpha * A1[0+bs*1];
+		B[2+bs*1] += alpha * A1[1+bs*1];
+		B[3+bs*1] += alpha * A1[2+bs*1];
+
+		B[0+bs*2] += alpha * A0[3+bs*2];
+		B[1+bs*2] += alpha * A1[0+bs*2];
+		B[2+bs*2] += alpha * A1[1+bs*2];
+		B[3+bs*2] += alpha * A1[2+bs*2];
+
+		B[0+bs*3] += alpha * A0[3+bs*3];
+		B[1+bs*3] += alpha * A1[0+bs*3];
+		B[2+bs*3] += alpha * A1[1+bs*3];
+		B[3+bs*3] += alpha * A1[2+bs*3];
+
+		A0 += 16;
+		A1 += 16;
+		B  += 16;
+
+		}
+	for(; k<kmax; k++)
+		{
+
+		B[0+bs*0] += alpha * A0[3+bs*0];
+		B[1+bs*0] += alpha * A1[0+bs*0];
+		B[2+bs*0] += alpha * A1[1+bs*0];
+		B[3+bs*0] += alpha * A1[2+bs*0];
+
+		A0 += 4;
+		A1 += 4;
+		B  += 4;
+
+		}
+
+	}
+
+
+
+// both A and B are aligned to 64-bit boundaries
+void kernel_dgead_3_0_lib4(int kmax, double alpha, double *A, double *B)
+	{
+
+	if(kmax<=0)
+		return;
+
+	const int bs = 4;
+
+	int k;
+
+	for(k=0; k<kmax-3; k+=4)
+		{
+		B[0+bs*0] += alpha * A[0+bs*0];
+		B[1+bs*0] += alpha * A[1+bs*0];
+		B[2+bs*0] += alpha * A[2+bs*0];
+
+		B[0+bs*1] += alpha * A[0+bs*1];
+		B[1+bs*1] += alpha * A[1+bs*1];
+		B[2+bs*1] += alpha * A[2+bs*1];
+
+		B[0+bs*2] += alpha * A[0+bs*2];
+		B[1+bs*2] += alpha * A[1+bs*2];
+		B[2+bs*2] += alpha * A[2+bs*2];
+
+		B[0+bs*3] += alpha * A[0+bs*3];
+		B[1+bs*3] += alpha * A[1+bs*3];
+		B[2+bs*3] += alpha * A[2+bs*3];
+
+		A += 16;
+		B += 16;
+
+		}
+	for(; k<kmax; k++)
+		{
+
+		B[0+bs*0] += alpha * A[0+bs*0];
+		B[1+bs*0] += alpha * A[1+bs*0];
+		B[2+bs*0] += alpha * A[2+bs*0];
+
+		A += 4;
+		B += 4;
+
+		}
+
+	}
+
+
+
+// both A and B are aligned to 256-bit boundaries, 2 elements of A must be skipped
+void kernel_dgead_3_2_lib4(int kmax, double alpha, double *A0, int sda, double *B)
+	{
+
+	if(kmax<=0)
+		return;
+
+	const int bs = 4;
+
+	double *A1 = A0 + bs*sda;
+
+	int k;
+
+	for(k=0; k<kmax-3; k+=4)
+		{
+
+		B[0+bs*0] += alpha * A0[2+bs*0];
+		B[1+bs*0] += alpha * A0[3+bs*0];
+		B[2+bs*0] += alpha * A1[0+bs*0];
+
+		B[0+bs*1] += alpha * A0[2+bs*1];
+		B[1+bs*1] += alpha * A0[3+bs*1];
+		B[2+bs*1] += alpha * A1[0+bs*1];
+
+		B[0+bs*2] += alpha * A0[2+bs*2];
+		B[1+bs*2] += alpha * A0[3+bs*2];
+		B[2+bs*2] += alpha * A1[0+bs*2];
+
+		B[0+bs*3] += alpha * A0[2+bs*3];
+		B[1+bs*3] += alpha * A0[3+bs*3];
+		B[2+bs*3] += alpha * A1[0+bs*3];
+
+		A0 += 16;
+		A1 += 16;
+		B  += 16;
+
+		}
+	for(; k<kmax; k++)
+		{
+
+		B[0+bs*0] += alpha * A0[2+bs*0];
+		B[1+bs*0] += alpha * A0[3+bs*0];
+		B[2+bs*0] += alpha * A1[0+bs*0];
+
+		A0 += 4;
+		A1 += 4;
+		B  += 4;
+
+		}
+
+	}
+
+
+
+// both A and B are aligned to 256-bit boundaries, 3 elements of A must be skipped
+void kernel_dgead_3_3_lib4(int kmax, double alpha, double *A0, int sda, double *B)
+	{
+
+	if(kmax<=0)
+		return;
+
+	const int bs = 4;
+
+	double *A1 = A0 + bs*sda;
+
+	int k;
+
+	for(k=0; k<kmax-3; k+=4)
+		{
+
+		B[0+bs*0] += alpha * A0[3+bs*0];
+		B[1+bs*0] += alpha * A1[0+bs*0];
+		B[2+bs*0] += alpha * A1[1+bs*0];
+
+		B[0+bs*1] += alpha * A0[3+bs*1];
+		B[1+bs*1] += alpha * A1[0+bs*1];
+		B[2+bs*1] += alpha * A1[1+bs*1];
+
+		B[0+bs*2] += alpha * A0[3+bs*2];
+		B[1+bs*2] += alpha * A1[0+bs*2];
+		B[2+bs*2] += alpha * A1[1+bs*2];
+
+		B[0+bs*3] += alpha * A0[3+bs*3];
+		B[1+bs*3] += alpha * A1[0+bs*3];
+		B[2+bs*3] += alpha * A1[1+bs*3];
+
+		A0 += 16;
+		A1 += 16;
+		B  += 16;
+
+		}
+	for(; k<kmax; k++)
+		{
+
+		B[0+bs*0] += alpha * A0[3+bs*0];
+		B[1+bs*0] += alpha * A1[0+bs*0];
+		B[2+bs*0] += alpha * A1[1+bs*0];
+
+		A0 += 4;
+		A1 += 4;
+		B  += 4;
+
+		}
+
+	}
+
+
+
+// both A and B are aligned to 64-bit boundaries
+void kernel_dgead_2_0_lib4(int kmax, double alpha, double *A, double *B)
+	{
+
+	if(kmax<=0)
+		return;
+
+	const int bs = 4;
+
+	int k;
+
+	for(k=0; k<kmax-3; k+=4)
+		{
+		B[0+bs*0] += alpha * A[0+bs*0];
+		B[1+bs*0] += alpha * A[1+bs*0];
+
+		B[0+bs*1] += alpha * A[0+bs*1];
+		B[1+bs*1] += alpha * A[1+bs*1];
+
+		B[0+bs*2] += alpha * A[0+bs*2];
+		B[1+bs*2] += alpha * A[1+bs*2];
+
+		B[0+bs*3] += alpha * A[0+bs*3];
+		B[1+bs*3] += alpha * A[1+bs*3];
+
+		A += 16;
+		B += 16;
+
+		}
+	for(; k<kmax; k++)
+		{
+
+		B[0+bs*0] += alpha * A[0+bs*0];
+		B[1+bs*0] += alpha * A[1+bs*0];
+
+		A += 4;
+		B += 4;
+
+		}
+
+	}
+
+
+
+// both A and B are aligned to 128-bit boundaries, 3 elements of A must be skipped
+void kernel_dgead_2_3_lib4(int kmax, double alpha, double *A0, int sda, double *B)
+	{
+
+	if(kmax<=0)
+		return;
+
+	const int bs = 4;
+
+	double *A1 = A0 + bs*sda;
+
+	int k;
+
+	for(k=0; k<kmax-3; k+=4)
+		{
+
+		B[0+bs*0] += alpha * A0[3+bs*0];
+		B[1+bs*0] += alpha * A1[0+bs*0];
+
+		B[0+bs*1] += alpha * A0[3+bs*1];
+		B[1+bs*1] += alpha * A1[0+bs*1];
+
+		B[0+bs*2] += alpha * A0[3+bs*2];
+		B[1+bs*2] += alpha * A1[0+bs*2];
+
+		B[0+bs*3] += alpha * A0[3+bs*3];
+		B[1+bs*3] += alpha * A1[0+bs*3];
+
+		A0 += 16;
+		A1 += 16;
+		B  += 16;
+
+		}
+	for(; k<kmax; k++)
+		{
+
+		B[0+bs*0] += alpha * A0[3+bs*0];
+		B[1+bs*0] += alpha * A1[0+bs*0];
+
+		A0 += 4;
+		A1 += 4;
+		B  += 4;
+
+		}
+
+	}
+
+
+
+// both A and B are aligned to 64-bit boundaries
+void kernel_dgead_1_0_lib4(int kmax, double alpha, double *A, double *B)
+	{
+
+	if(kmax<=0)
+		return;
+
+	const int bs = 4;
+
+	int k;
+
+	for(k=0; k<kmax-3; k+=4)
+		{
+		B[0+bs*0] += alpha * A[0+bs*0];
+
+		B[0+bs*1] += alpha * A[0+bs*1];
+
+		B[0+bs*2] += alpha * A[0+bs*2];
+
+		B[0+bs*3] += alpha * A[0+bs*3];
+
+		A += 16;
+		B += 16;
+
+		}
+	for(; k<kmax; k++)
+		{
+
+		B[0+bs*0] += alpha * A[0+bs*0];
+
+		A += 4;
+		B += 4;
+
+		}
+
+	}
+
+
+
+
diff --git a/auxiliary/c99/kernel_dgetr_lib4.c b/auxiliary/c99/kernel_dgetr_lib4.c
new file mode 100644
index 0000000..7d62277
--- /dev/null
+++ b/auxiliary/c99/kernel_dgetr_lib4.c
@@ -0,0 +1,414 @@
+/**************************************************************************************************
+*                                                                                                 *
+* This file is part of BLASFEO.                                                                   *
+*                                                                                                 *
+* BLASFEO -- BLAS For Embedded Optimization.                                                      *
+* Copyright (C) 2016-2017 by Gianluca Frison.                                                     *
+* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl.              *
+* All rights reserved.                                                                            *
+*                                                                                                 *
+* HPMPC is free software; you can redistribute it and/or                                          *
+* modify it under the terms of the GNU Lesser General Public                                      *
+* License as published by the Free Software Foundation; either                                    *
+* version 2.1 of the License, or (at your option) any later version.                              *
+*                                                                                                 *
+* HPMPC is distributed in the hope that it will be useful,                                        *
+* but WITHOUT ANY WARRANTY; without even the implied warranty of                                  *
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.                                            *
+* See the GNU Lesser General Public License for more details.                                     *
+*                                                                                                 *
+* You should have received a copy of the GNU Lesser General Public                                *
+* License along with HPMPC; if not, write to the Free Software                                    *
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA                  *
+*                                                                                                 *
+* Author: Gianluca Frison, giaf (at) dtu.dk                                                       *
+*                          gianluca.frison (at) imtek.uni-freiburg.de                             *
+*                                                                                                 *
+**************************************************************************************************/
+
+
+
+// transpose of a general matrix, read along panels, write across panels
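+//
+// Portable C version of the transpose kernel: kna rows of C are written one
+// at a time until C reaches a panel boundary, the main loop then transposes
+// one 4x4 block per iteration, and the cleanup loop (also the entry point
+// when kmax<kna) handles the remaining rows scalarly.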
+void kernel_dgetr_4_lib4(int tri, int kmax, int kna, double alpha, double *A, double *C, int sdc)
+	{
+
+	if(tri==1)
+		{
+		// A is lower triangular, C is upper triangular
+		// kmax+1 4-wide + end 3x3 triangle
+
+		kmax += 1;
+		}
+
+	const int bs = 4;
+	
+	int k;
+
+	k = 0;
+
+	if(kmax<kna)
+		goto cleanup_loop;
+
+	if(kna>0)
+		{
+		for( ; k<kna; k++)
+			{
+			C[0+bs*0] = alpha * A[0+bs*0];
+			C[0+bs*1] = alpha * A[1+bs*0];
+			C[0+bs*2] = alpha * A[2+bs*0];
+			C[0+bs*3] = alpha * A[3+bs*0];
+
+			C += 1;
+			A += bs;
+			}
+		C += bs*(sdc-1);
+		}
+	
+	for( ; k<kmax-3; k+=4)
+		{
+		C[0+bs*0] = alpha * A[0+bs*0];
+		C[0+bs*1] = alpha * A[1+bs*0];
+		C[0+bs*2] = alpha * A[2+bs*0];
+		C[0+bs*3] = alpha * A[3+bs*0];
+
+		C[1+bs*0] = alpha * A[0+bs*1];
+		C[1+bs*1] = alpha * A[1+bs*1];
+		C[1+bs*2] = alpha * A[2+bs*1];
+		C[1+bs*3] = alpha * A[3+bs*1];
+
+		C[2+bs*0] = alpha * A[0+bs*2];
+		C[2+bs*1] = alpha * A[1+bs*2];
+		C[2+bs*2] = alpha * A[2+bs*2];
+		C[2+bs*3] = alpha * A[3+bs*2];
+
+		C[3+bs*0] = alpha * A[0+bs*3];
+		C[3+bs*1] = alpha * A[1+bs*3];
+		C[3+bs*2] = alpha * A[2+bs*3];
+		C[3+bs*3] = alpha * A[3+bs*3];
+
+		C += bs*sdc;
+		A += bs*bs;
+		}
+
+	cleanup_loop:
+
+	for( ; k<kmax; k++)
+		{
+		C[0+bs*0] = alpha * A[0+bs*0];
+		C[0+bs*1] = alpha * A[1+bs*0];
+		C[0+bs*2] = alpha * A[2+bs*0];
+		C[0+bs*3] = alpha * A[3+bs*0];
+
+		C += 1;
+		A += bs;
+		}
+
+	if(tri==1)
+		{
+		// end 3x3 triangle
+		kna = (bs-(bs-kna+kmax)%bs)%bs;
+
+		if(kna==1)
+			{
+			C[0+bs*1] = alpha * A[1+bs*0];
+			C[0+bs*2] = alpha * A[2+bs*0];
+			C[0+bs*3] = alpha * A[3+bs*0];
+			C[1+bs*(sdc+1)] = alpha * A[2+bs*1];
+			C[1+bs*(sdc+2)] = alpha * A[3+bs*1];
+			C[2+bs*(sdc+2)] = alpha * A[3+bs*2];
+			}
+		else if(kna==2)
+			{
+			C[0+bs*1] = alpha * A[1+bs*0];
+			C[0+bs*2] = alpha * A[2+bs*0];
+			C[0+bs*3] = alpha * A[3+bs*0];
+			C[1+bs*2] = alpha * A[2+bs*1];
+			C[1+bs*3] = alpha * A[3+bs*1];
+			C[2+bs*(sdc+2)] = alpha * A[3+bs*2];
+			}
+		else
+			{
+			C[0+bs*1] = alpha * A[1+bs*0];
+			C[0+bs*2] = alpha * A[2+bs*0];
+			C[0+bs*3] = alpha * A[3+bs*0];
+			C[1+bs*2] = alpha * A[2+bs*1];
+			C[1+bs*3] = alpha * A[3+bs*1];
+			C[2+bs*3] = alpha * A[3+bs*2];
+			}
+		}
+
+	}
+
+
+
+// transpose of a general matrix, read along panels, write across panels
+void kernel_dgetr_3_lib4(int tri, int kmax, int kna, double alpha, double *A, double *C, int sdc)
+	{
+
+	if(tri==1)
+		{
+		// A is lower triangular, C is upper triangular
+		// kmax+1 3-wide + end 2x2 triangle
+
+		kmax += 1;
+		}
+
+	const int bs = 4;
+	
+	int k;
+
+	k = 0;
+
+	if(kmax<kna)
+		goto cleanup_loop;
+
+	if(kna>0)
+		{
+		for( ; k<kna; k++)
+			{
+			C[0+bs*0] = alpha * A[0+bs*0];
+			C[0+bs*1] = alpha * A[1+bs*0];
+			C[0+bs*2] = alpha * A[2+bs*0];
+
+			C += 1;
+			A += bs;
+			}
+		C += bs*(sdc-1);
+		}
+	
+	for( ; k<kmax-3; k+=4)
+		{
+		C[0+bs*0] = alpha * A[0+bs*0];
+		C[0+bs*1] = alpha * A[1+bs*0];
+		C[0+bs*2] = alpha * A[2+bs*0];
+
+		C[1+bs*0] = alpha * A[0+bs*1];
+		C[1+bs*1] = alpha * A[1+bs*1];
+		C[1+bs*2] = alpha * A[2+bs*1];
+
+		C[2+bs*0] = alpha * A[0+bs*2];
+		C[2+bs*1] = alpha * A[1+bs*2];
+		C[2+bs*2] = alpha * A[2+bs*2];
+
+		C[3+bs*0] = alpha * A[0+bs*3];
+		C[3+bs*1] = alpha * A[1+bs*3];
+		C[3+bs*2] = alpha * A[2+bs*3];
+
+		C += bs*sdc;
+		A += bs*bs;
+		}
+	
+	cleanup_loop:
+
+	for( ; k<kmax; k++)
+		{
+		C[0+bs*0] = alpha * A[0+bs*0];
+		C[0+bs*1] = alpha * A[1+bs*0];
+		C[0+bs*2] = alpha * A[2+bs*0];
+
+		C += 1;
+		A += bs;
+		}
+
+	if(tri==1)
+		{
+		// end 2x2 triangle
+		kna = (bs-(bs-kna+kmax)%bs)%bs;
+
+		if(kna==1)
+			{
+			C[0+bs*1] = alpha * A[1+bs*0];
+			C[0+bs*2] = alpha * A[2+bs*0];
+			C[1+bs*(sdc+1)] = alpha * A[2+bs*1];
+			}
+		else
+			{
+			C[0+bs*1] = alpha * A[1+bs*0];
+			C[0+bs*2] = alpha * A[2+bs*0];
+			C[1+bs*2] = alpha * A[2+bs*1];
+			}
+		}
+
+	}
+
+
+
+// transpose of a general matrix, read along panels, write across panels
+void kernel_dgetr_2_lib4(int tri, int kmax, int kna, double alpha, double *A, double *C, int sdc)
+	{
+
+	if(tri==1)
+		{
+		// A is lower triangular, C is upper triangular
+		// kmax+1 2-wide + end 1x1 triangle
+
+		kmax += 1;
+		}
+
+	const int bs = 4;
+	
+	int k;
+
+	k = 0;
+
+	if(kmax<kna)
+		goto cleanup_loop;
+
+	if(kna>0)
+		{
+		for( ; k<kna; k++)
+			{
+			C[0+bs*0] = alpha * A[0+bs*0];
+			C[0+bs*1] = alpha * A[1+bs*0];
+
+			C += 1;
+			A += bs;
+			}
+		C += bs*(sdc-1);
+		}
+	
+	for( ; k<kmax-3; k+=4)
+		{
+		C[0+bs*0] = alpha * A[0+bs*0];
+		C[0+bs*1] = alpha * A[1+bs*0];
+
+		C[1+bs*0] = alpha * A[0+bs*1];
+		C[1+bs*1] = alpha * A[1+bs*1];
+
+		C[2+bs*0] = alpha * A[0+bs*2];
+		C[2+bs*1] = alpha * A[1+bs*2];
+
+		C[3+bs*0] = alpha * A[0+bs*3];
+		C[3+bs*1] = alpha * A[1+bs*3];
+
+		C += bs*sdc;
+		A += bs*bs;
+		}
+	
+	cleanup_loop:
+
+	for( ; k<kmax; k++)
+		{
+		C[0+bs*0] = alpha * A[0+bs*0];
+		C[0+bs*1] = alpha * A[1+bs*0];
+
+		C += 1;
+		A += bs;
+		}
+	
+	if(tri==1)
+		{
+		// end 1x1 triangle
+		C[0+bs*1] = alpha * A[1+bs*0];
+		}
+
+	}
+
+
+
+// transpose of a general matrix, read along panels, write across panels
+void kernel_dgetr_1_lib4(int tri, int kmax, int kna, double alpha, double *A, double *C, int sdc)
+	{
+
+	if(tri==1)
+		{
+		// A is lower triangular, C is upper triangular
+		// kmax+1 1-wide
+
+		kmax += 1;
+		}
+
+	const int bs = 4;
+	
+	int k;
+
+	k = 0;
+
+	if(kmax<kna)
+		goto cleanup_loop;
+
+	if(kna>0)
+		{
+		for( ; k<kna; k++)
+			{
+			C[0+bs*0] = alpha * A[0+bs*0];
+
+			C += 1;
+			A += bs;
+			}
+		C += bs*(sdc-1);
+		}
+	
+	for( ; k<kmax-3; k+=4)
+		{
+		C[0+bs*0] = alpha * A[0+bs*0];
+
+		C[1+bs*0] = alpha * A[0+bs*1];
+
+		C[2+bs*0] = alpha * A[0+bs*2];
+
+		C[3+bs*0] = alpha * A[0+bs*3];
+
+		C += bs*sdc;
+		A += bs*bs;
+		}
+	
+	cleanup_loop:
+
+	for( ; k<kmax; k++)
+		{
+		C[0+bs*0] = alpha * A[0+bs*0];
+
+		C += 1;
+		A += bs;
+		}
+
+	}
+
+
+
+// transpose of a general matrix, read across panels, write along panels
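+//
+// Plain C counterpart of the vectorized kernel_dgetr_4_0_lib4: both A and B
+// are assumed to start at a panel boundary, so no kna cleanup is needed.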
+void kernel_dgetr_4_0_lib4(int kmax, double *A, int sda, double *B)
+	{
+	const int ps = 4;
+	int k;
+	for(k=0; k<kmax-3; k+=4)
+		{
+		//
+		B[0+ps*0] = A[0+ps*0];
+		B[0+ps*1] = A[1+ps*0];
+		B[0+ps*2] = A[2+ps*0];
+		B[0+ps*3] = A[3+ps*0];
+		//
+		B[1+ps*0] = A[0+ps*1];
+		B[1+ps*1] = A[1+ps*1];
+		B[1+ps*2] = A[2+ps*1];
+		B[1+ps*3] = A[3+ps*1];
+		//
+		B[2+ps*0] = A[0+ps*2];
+		B[2+ps*1] = A[1+ps*2];
+		B[2+ps*2] = A[2+ps*2];
+		B[2+ps*3] = A[3+ps*2];
+		//
+		B[3+ps*0] = A[0+ps*3];
+		B[3+ps*1] = A[1+ps*3];
+		B[3+ps*2] = A[2+ps*3];
+		B[3+ps*3] = A[3+ps*3];
+
+		A += ps*sda;
+		B += ps*ps;
+		}
+	for( ; k<kmax; k++)
+		{
+		//
+		B[0+ps*0] = A[0+ps*0];
+		B[1+ps*0] = A[0+ps*1];
+		B[2+ps*0] = A[0+ps*2];
+		B[3+ps*0] = A[0+ps*3];
+
+		A += 1;
+		B += ps;
+		}
+	return;
+	}
+
diff --git a/auxiliary/c99/kernel_sgetr_lib4.c b/auxiliary/c99/kernel_sgetr_lib4.c
new file mode 100644
index 0000000..4cf6fa2
--- /dev/null
+++ b/auxiliary/c99/kernel_sgetr_lib4.c
@@ -0,0 +1,370 @@
+/**************************************************************************************************
+*                                                                                                 *
+* This file is part of BLASFEO.                                                                   *
+*                                                                                                 *
+* BLASFEO -- BLAS For Embedded Optimization.                                                      *
+* Copyright (C) 2016-2017 by Gianluca Frison.                                                     *
+* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl.              *
+* All rights reserved.                                                                            *
+*                                                                                                 *
+* HPMPC is free software; you can redistribute it and/or                                          *
+* modify it under the terms of the GNU Lesser General Public                                      *
+* License as published by the Free Software Foundation; either                                    *
+* version 2.1 of the License, or (at your option) any later version.                              *
+*                                                                                                 *
+* HPMPC is distributed in the hope that it will be useful,                                        *
+* but WITHOUT ANY WARRANTY; without even the implied warranty of                                  *
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.                                            *
+* See the GNU Lesser General Public License for more details.                                     *
+*                                                                                                 *
+* You should have received a copy of the GNU Lesser General Public                                *
+* License along with HPMPC; if not, write to the Free Software                                    *
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA                  *
+*                                                                                                 *
+* Author: Gianluca Frison, giaf (at) dtu.dk                                                       *
+*                          gianluca.frison (at) imtek.uni-freiburg.de                             *
+*                                                                                                 *
+**************************************************************************************************/
+
+
+
+// transpose of a general matrix, read along panels, write across panels
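+//
+// (single-precision mirror of kernel_dgetr_4_lib4: the indexing logic is
+// identical, only the element type changes)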
+void kernel_sgetr_4_lib4(int tri, int kmax, int kna, float alpha, float *A, float *C, int sdc)
+	{
+
+	if(tri==1)
+		{
+		// A is lower triangular, C is upper triangular
+		// kmax+1 4-wide + end 3x3 triangle
+
+		kmax += 1;
+		}
+
+	const int bs = 4;
+	
+	int k;
+
+	k = 0;
+
+	if(kmax<kna)
+		goto cleanup_loop;
+
+	if(kna>0)
+		{
+		for( ; k<kna; k++)
+			{
+			C[0+bs*0] = alpha * A[0+bs*0];
+			C[0+bs*1] = alpha * A[1+bs*0];
+			C[0+bs*2] = alpha * A[2+bs*0];
+			C[0+bs*3] = alpha * A[3+bs*0];
+
+			C += 1;
+			A += bs;
+			}
+		C += bs*(sdc-1);
+		}
+	
+	for( ; k<kmax-3; k+=4)
+		{
+		C[0+bs*0] = alpha * A[0+bs*0];
+		C[0+bs*1] = alpha * A[1+bs*0];
+		C[0+bs*2] = alpha * A[2+bs*0];
+		C[0+bs*3] = alpha * A[3+bs*0];
+
+		C[1+bs*0] = alpha * A[0+bs*1];
+		C[1+bs*1] = alpha * A[1+bs*1];
+		C[1+bs*2] = alpha * A[2+bs*1];
+		C[1+bs*3] = alpha * A[3+bs*1];
+
+		C[2+bs*0] = alpha * A[0+bs*2];
+		C[2+bs*1] = alpha * A[1+bs*2];
+		C[2+bs*2] = alpha * A[2+bs*2];
+		C[2+bs*3] = alpha * A[3+bs*2];
+
+		C[3+bs*0] = alpha * A[0+bs*3];
+		C[3+bs*1] = alpha * A[1+bs*3];
+		C[3+bs*2] = alpha * A[2+bs*3];
+		C[3+bs*3] = alpha * A[3+bs*3];
+
+		C += bs*sdc;
+		A += bs*bs;
+		}
+
+	cleanup_loop:
+
+	for( ; k<kmax; k++)
+		{
+		C[0+bs*0] = alpha * A[0+bs*0];
+		C[0+bs*1] = alpha * A[1+bs*0];
+		C[0+bs*2] = alpha * A[2+bs*0];
+		C[0+bs*3] = alpha * A[3+bs*0];
+
+		C += 1;
+		A += bs;
+		}
+
+	if(tri==1)
+		{
+		// end 3x3 triangle
+		kna = (bs-(bs-kna+kmax)%bs)%bs;
+
+		if(kna==1)
+			{
+			C[0+bs*1] = alpha * A[1+bs*0];
+			C[0+bs*2] = alpha * A[2+bs*0];
+			C[0+bs*3] = alpha * A[3+bs*0];
+			C[1+bs*(sdc+1)] = alpha * A[2+bs*1];
+			C[1+bs*(sdc+2)] = alpha * A[3+bs*1];
+			C[2+bs*(sdc+2)] = alpha * A[3+bs*2];
+			}
+		else if(kna==2)
+			{
+			C[0+bs*1] = alpha * A[1+bs*0];
+			C[0+bs*2] = alpha * A[2+bs*0];
+			C[0+bs*3] = alpha * A[3+bs*0];
+			C[1+bs*2] = alpha * A[2+bs*1];
+			C[1+bs*3] = alpha * A[3+bs*1];
+			C[2+bs*(sdc+2)] = alpha * A[3+bs*2];
+			}
+		else
+			{
+			C[0+bs*1] = alpha * A[1+bs*0];
+			C[0+bs*2] = alpha * A[2+bs*0];
+			C[0+bs*3] = alpha * A[3+bs*0];
+			C[1+bs*2] = alpha * A[2+bs*1];
+			C[1+bs*3] = alpha * A[3+bs*1];
+			C[2+bs*3] = alpha * A[3+bs*2];
+			}
+		}
+
+	}
+
+
+
+// transpose of a general matrix, read along panels, write across panels
+void kernel_sgetr_3_lib4(int tri, int kmax, int kna, float alpha, float *A, float *C, int sdc)
+	{
+
+	if(tri==1)
+		{
+		// A is lower triangular, C is upper triangular
+		// kmax+1 3-wide + end 2x2 triangle
+
+		kmax += 1;
+		}
+
+	const int bs = 4;
+	
+	int k;
+
+	k = 0;
+
+	if(kmax<kna)
+		goto cleanup_loop;
+
+	if(kna>0)
+		{
+		for( ; k<kna; k++)
+			{
+			C[0+bs*0] = alpha * A[0+bs*0];
+			C[0+bs*1] = alpha * A[1+bs*0];
+			C[0+bs*2] = alpha * A[2+bs*0];
+
+			C += 1;
+			A += bs;
+			}
+		C += bs*(sdc-1);
+		}
+	
+	for( ; k<kmax-3; k+=4)
+		{
+		C[0+bs*0] = alpha * A[0+bs*0];
+		C[0+bs*1] = alpha * A[1+bs*0];
+		C[0+bs*2] = alpha * A[2+bs*0];
+
+		C[1+bs*0] = alpha * A[0+bs*1];
+		C[1+bs*1] = alpha * A[1+bs*1];
+		C[1+bs*2] = alpha * A[2+bs*1];
+
+		C[2+bs*0] = alpha * A[0+bs*2];
+		C[2+bs*1] = alpha * A[1+bs*2];
+		C[2+bs*2] = alpha * A[2+bs*2];
+
+		C[3+bs*0] = alpha * A[0+bs*3];
+		C[3+bs*1] = alpha * A[1+bs*3];
+		C[3+bs*2] = alpha * A[2+bs*3];
+
+		C += bs*sdc;
+		A += bs*bs;
+		}
+	
+	cleanup_loop:
+
+	for( ; k<kmax; k++)
+		{
+		C[0+bs*0] = alpha * A[0+bs*0];
+		C[0+bs*1] = alpha * A[1+bs*0];
+		C[0+bs*2] = alpha * A[2+bs*0];
+
+		C += 1;
+		A += bs;
+		}
+
+	if(tri==1)
+		{
+		// end 2x2 triangle
+		kna = (bs-(bs-kna+kmax)%bs)%bs;
+
+		if(kna==1)
+			{
+			C[0+bs*1] = alpha * A[1+bs*0];
+			C[0+bs*2] = alpha * A[2+bs*0];
+			C[1+bs*(sdc+1)] = alpha * A[2+bs*1];
+			}
+		else
+			{
+			C[0+bs*1] = alpha * A[1+bs*0];
+			C[0+bs*2] = alpha * A[2+bs*0];
+			C[1+bs*2] = alpha * A[2+bs*1];
+			}
+		}
+
+	}
+
+
+
+// transpose of a general matrix, read along panels, write across panels
+void kernel_sgetr_2_lib4(int tri, int kmax, int kna, float alpha, float *A, float *C, int sdc)
+	{
+
+	if(tri==1)
+		{
+		// A is lower triangular, C is upper triangular
+		// kmax+1 2-wide + end 1x1 triangle
+
+		kmax += 1;
+		}
+
+	const int bs = 4;
+	
+	int k;
+
+	k = 0;
+
+	if(kmax<kna)
+		goto cleanup_loop;
+
+	if(kna>0)
+		{
+		for( ; k<kna; k++)
+			{
+			C[0+bs*0] = alpha * A[0+bs*0];
+			C[0+bs*1] = alpha * A[1+bs*0];
+
+			C += 1;
+			A += bs;
+			}
+		C += bs*(sdc-1);
+		}
+	
+	for( ; k<kmax-3; k+=4)
+		{
+		C[0+bs*0] = alpha * A[0+bs*0];
+		C[0+bs*1] = alpha * A[1+bs*0];
+
+		C[1+bs*0] = alpha * A[0+bs*1];
+		C[1+bs*1] = alpha * A[1+bs*1];
+
+		C[2+bs*0] = alpha * A[0+bs*2];
+		C[2+bs*1] = alpha * A[1+bs*2];
+
+		C[3+bs*0] = alpha * A[0+bs*3];
+		C[3+bs*1] = alpha * A[1+bs*3];
+
+		C += bs*sdc;
+		A += bs*bs;
+		}
+	
+	cleanup_loop:
+
+	for( ; k<kmax; k++)
+		{
+		C[0+bs*0] = alpha * A[0+bs*0];
+		C[0+bs*1] = alpha * A[1+bs*0];
+
+		C += 1;
+		A += bs;
+		}
+	
+	if(tri==1)
+		{
+		// end 1x1 triangle
+		C[0+bs*1] = alpha * A[1+bs*0];
+		}
+
+	}
+
+
+
+// transpose of a general matrix, read along panels, write across panels
+void kernel_sgetr_1_lib4(int tri, int kmax, int kna, float alpha, float *A, float *C, int sdc)
+	{
+
+	if(tri==1)
+		{
+		// A is lower triangular, C is upper triangular
+		// kmax+1 1-wide
+
+		kmax += 1;
+		}
+
+	const int bs = 4;
+	
+	int k;
+
+	k = 0;
+
+	if(kmax<kna)
+		goto cleanup_loop;
+
+	if(kna>0)
+		{
+		for( ; k<kna; k++)
+			{
+			C[0+bs*0] = alpha * A[0+bs*0];
+
+			C += 1;
+			A += bs;
+			}
+		C += bs*(sdc-1);
+		}
+	
+	for( ; k<kmax-3; k+=4)
+		{
+		C[0+bs*0] = alpha * A[0+bs*0];
+
+		C[1+bs*0] = alpha * A[0+bs*1];
+
+		C[2+bs*0] = alpha * A[0+bs*2];
+
+		C[3+bs*0] = alpha * A[0+bs*3];
+
+		C += bs*sdc;
+		A += bs*bs;
+		}
+	
+	cleanup_loop:
+
+	for( ; k<kmax; k++)
+		{
+		C[0+bs*0] = alpha * A[0+bs*0];
+
+		C += 1;
+		A += bs;
+		}
+
+	}
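+
+
+
+/*
+Note on the kernel_sgetr_x_lib4 family above (explanatory sketch, not library
+code): each kernel transposes a strip of x rows of the packed, panel-major
+(bs=4) matrix A into x columns of C. kmax is the strip length, kna is the
+number of rows left in the current panel of C before a panel boundary (hence
+the initial cleanup loop and the jump C += bs*(sdc-1)), and sdc is the panel
+stride of C. Inside the main loop the element A[r+bs*c] (row r of the strip,
+column c within the 4-wide block) lands at C[c+bs*r], i.e. row and column
+indices are exchanged within each 4x4 block, while C advances by bs*sdc and
+A by bs*bs per block. With tri==1 the strip belongs to a lower triangular
+matrix, so one extra column plus a small end triangle is written.
+*/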
+
+
+
+
diff --git a/auxiliary/d_aux_ext_dep_lib.c b/auxiliary/d_aux_ext_dep_lib.c
new file mode 100644
index 0000000..c12da10
--- /dev/null
+++ b/auxiliary/d_aux_ext_dep_lib.c
@@ -0,0 +1,632 @@
+/**************************************************************************************************
+*                                                                                                 *
+* This file is part of BLASFEO.                                                                   *
+*                                                                                                 *
+* BLASFEO -- BLAS For Embedded Optimization.                                                      *
+* Copyright (C) 2016-2017 by Gianluca Frison.                                                     *
+* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl.              *
+* All rights reserved.                                                                            *
+*                                                                                                 *
+* HPMPC is free software; you can redistribute it and/or                                          *
+* modify it under the terms of the GNU Lesser General Public                                      *
+* License as published by the Free Software Foundation; either                                    *
+* version 2.1 of the License, or (at your option) any later version.                              *
+*                                                                                                 *
+* HPMPC is distributed in the hope that it will be useful,                                        *
+* but WITHOUT ANY WARRANTY; without even the implied warranty of                                  *
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.                                            *
+* See the GNU Lesser General Public License for more details.                                     *
+*                                                                                                 *
+* You should have received a copy of the GNU Lesser General Public                                *
+* License along with HPMPC; if not, write to the Free Software                                    *
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA                  *
+*                                                                                                 *
+* Author: Gianluca Frison, giaf (at) dtu.dk                                                       *
+*                          gianluca.frison (at) imtek.uni-freiburg.de                             *
+*                                                                                                 *
+**************************************************************************************************/
+
+#include <stdlib.h>
+#include <stdio.h>
+#if 0
+#include <malloc.h>
+#endif
+
+#include "../include/blasfeo_common.h"
+
+
+
+#if ! defined(OS_WINDOWS)
+int posix_memalign(void **memptr, size_t alignment, size_t size);
+#endif
+
+
+
+/* creates a zero matrix */
+void d_zeros(double **pA, int row, int col)
+	{
+	*pA = malloc((row*col)*sizeof(double));
+	double *A = *pA;
+	int i;
+	for(i=0; i<row*col; i++) A[i] = 0.0;
+	}
+
+
+
+/* creates a zero matrix aligned to a cache line */
+void d_zeros_align(double **pA, int row, int col)
+	{
+#if defined(OS_WINDOWS)
+	*pA = (double *) _aligned_malloc( (row*col)*sizeof(double), 64 );
+#else
+	void *temp;
+	int err = posix_memalign(&temp, 64, (row*col)*sizeof(double));
+	if(err!=0)
+		{
+		printf("Memory allocation error\n");
+		exit(1);
+		}
+	*pA = temp;
+#endif
+	double *A = *pA;
+	int i;
+	for(i=0; i<row*col; i++) A[i] = 0.0;
+	}
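+
+
+
+/*
+Usage sketch (illustrative only, not library code): allocating and releasing
+matrices with the helpers in this file. Buffers from d_zeros_align must be
+released with d_free_align, since on Windows they come from _aligned_malloc.
+
+	double *A, *B;
+	d_zeros(&A, 4, 4);        // zero-initialized 4x4, plain malloc
+	d_zeros_align(&B, 8, 8);  // zero-initialized 8x8, 64-byte aligned
+	// ... use A and B as column-major arrays ...
+	d_free(A);
+	d_free_align(B);
+*/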
+
+
+
+/* frees matrix */
+void d_free(double *pA)
+	{
+	free( pA );
+	}
+
+
+
+/* frees aligned matrix */
+void d_free_align(double *pA)
+	{
+#if defined(OS_WINDOWS)
+	_aligned_free( pA );
+#else
+	free( pA );
+#endif
+	}
+
+
+
+/* prints a matrix in column-major format */
+void d_print_mat(int m, int n, double *A, int lda)
+	{
+	int i, j;
+	for(i=0; i<m; i++)
+		{
+		for(j=0; j<n; j++)
+			{
+			printf("%9.5f ", A[i+lda*j]);
+			}
+		printf("\n");
+		}
+	printf("\n");
+	}	
+
+
+
+/* prints the transpose of a matrix in column-major format */
+void d_print_tran_mat(int row, int col, double *A, int lda)
+	{
+	int i, j;
+	for(j=0; j<col; j++)
+		{
+		for(i=0; i<row; i++)
+			{
+			printf("%9.5f ", A[i+lda*j]);
+			}
+		printf("\n");
+		}
+	printf("\n");
+	}	
+
+
+
+/* prints a matrix in column-major format to a file */
+void d_print_to_file_mat(FILE *file, int row, int col, double *A, int lda)
+	{
+	int i, j;
+	for(i=0; i<row; i++)
+		{
+		for(j=0; j<col; j++)
+			{
+			fprintf(file, "%9.5f ", A[i+lda*j]);
+			}
+		fprintf(file, "\n");
+		}
+	fprintf(file, "\n");
+	}	
+
+
+
+/* prints the transpose of a matrix in column-major format to a file */
+void d_print_tran_to_file_mat(FILE *file, int row, int col, double *A, int lda)
+	{
+	int i, j;
+	for(j=0; j<col; j++)
+		{
+		for(i=0; i<row; i++)
+			{
+			fprintf(file, "%9.5f ", A[i+lda*j]);
+			}
+		fprintf(file, "\n");
+		}
+	fprintf(file, "\n");
+	}	
+
+
+
+/* prints a matrix in column-major format (exponential notation) */
+void d_print_e_mat(int m, int n, double *A, int lda)
+	{
+	int i, j;
+	for(i=0; i<m; i++)
+		{
+		for(j=0; j<n; j++)
+			{
+			printf("%1.15e\t", A[i+lda*j]);
+			}
+		printf("\n");
+		}
+	printf("\n");
+	}	
+
+
+
+/* prints the transpose of a matrix in column-major format (exponential notation) */
+void d_print_e_tran_mat(int row, int col, double *A, int lda)
+	{
+	int i, j;
+	for(j=0; j<col; j++)
+		{
+		for(i=0; i<row; i++)
+			{
+			printf("%e\t", A[i+lda*j]);
+			}
+		printf("\n");
+		}
+	printf("\n");
+	}	
+
+
+
+/****************************
+* new interface
+****************************/
+
+#if defined(LA_HIGH_PERFORMANCE)
+
+
+
+#include "../include/blasfeo_block_size.h"
+
+
+
+// create a matrix structure for a matrix of size m*n by dynamically allocating the memory
+void d_allocate_strmat(int m, int n, struct d_strmat *sA)
+	{
+	const int bs = D_PS;
+	int nc = D_NC;
+	int al = bs*nc;
+	sA->m = m;
+	sA->n = n;
+	int pm = (m+bs-1)/bs*bs;
+	int cn = (n+nc-1)/nc*nc;
+	sA->pm = pm;
+	sA->cn = cn;
+	d_zeros_align(&(sA->pA), sA->pm, sA->cn);
+	int tmp = m<n ? (m+al-1)/al*al : (n+al-1)/al*al; // al(min(m,n)) // XXX max ???
+	d_zeros_align(&(sA->dA), tmp, 1);
+	sA->use_dA = 0;
+	sA->memory_size = (pm*cn+tmp)*sizeof(double);
+	return;
+	}
+
+
+
+// free memory of a matrix structure
+void d_free_strmat(struct d_strmat *sA)
+	{
+	d_free_align(sA->pA);
+	d_free_align(sA->dA);
+	return;
+	}
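+
+
+
+/*
+Usage sketch (illustrative only): typical strmat lifecycle with the
+dynamically allocating interface. The backing memory is zero-initialized by
+d_zeros_align, and both pA and dA are released by d_free_strmat.
+
+	struct d_strmat sA;
+	d_allocate_strmat(6, 6, &sA);     // panel-major storage, zeroed
+	// ... fill sA, e.g. converting from a column-major array ...
+	d_print_strmat(6, 6, &sA, 0, 0);  // print the whole matrix
+	d_free_strmat(&sA);
+*/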
+
+
+
+// create a vector structure for a vector of size m by dynamically allocating the memory
+void d_allocate_strvec(int m, struct d_strvec *sa)
+	{
+	const int bs = D_PS;
+//	int nc = D_NC;
+//	int al = bs*nc;
+	sa->m = m;
+	int pm = (m+bs-1)/bs*bs;
+	sa->pm = pm;
+	d_zeros_align(&(sa->pa), sa->pm, 1);
+	sa->memory_size = pm*sizeof(double);
+	return;
+	}
+
+
+
+// free memory of a vector structure
+void d_free_strvec(struct d_strvec *sa)
+	{
+	d_free_align(sa->pa);
+	return;
+	}
+
+
+
+// print a matrix structure
+void d_print_strmat(int m, int n, struct d_strmat *sA, int ai, int aj)
+	{
+	const int bs = D_PS;
+	int sda = sA->cn;
+	double *pA = sA->pA + aj*bs + ai/bs*bs*sda + ai%bs;
+	int ii, i, j, tmp;
+	ii = 0;
+	if(ai%bs>0)
+		{
+		tmp = bs-ai%bs;
+		tmp = m<tmp ? m : tmp;
+		for(i=0; i<tmp; i++)
+			{
+			for(j=0; j<n; j++)
+				{
+				printf("%9.5f ", pA[i+bs*j]);
+				}
+			printf("\n");
+			}
+		pA += tmp + bs*(sda-1);
+		m -= tmp;
+		}
+	for( ; ii<m-(bs-1); ii+=bs)
+		{
+		for(i=0; i<bs; i++)
+			{
+			for(j=0; j<n; j++)
+				{
+				printf("%9.5f ", pA[i+bs*j+sda*ii]);
+				}
+			printf("\n");
+			}
+		}
+	if(ii<m)
+		{
+		tmp = m-ii;
+		for(i=0; i<tmp; i++)
+			{
+			for(j=0; j<n; j++)
+				{
+				printf("%9.5f ", pA[i+bs*j+sda*ii]);
+				}
+			printf("\n");
+			}
+		}
+	printf("\n");
+	return;
+	}
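+
+
+
+/*
+Note on panel-major addressing (explanatory, with made-up numbers): with
+panel height bs = D_PS and panel stride sda = sA->cn, element (ai,aj) of a
+strmat lives at
+
+	pA + aj*bs + ai/bs*bs*sda + ai%bs
+
+i.e. columns advance by bs inside a panel, full panels of bs rows advance by
+bs*sda, and ai%bs selects the row inside the panel. For bs=4, ai=6, aj=2 this
+gives pA + 8 + 4*sda + 2. The printing routine above walks the same way: a
+partial first panel, then whole panels, then a partial last panel.
+*/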
+
+
+
+// print a vector structure
+void d_print_strvec(int m, struct d_strvec *sa, int ai)
+	{
+	double *pa = sa->pa + ai;
+	d_print_mat(m, 1, pa, m);
+	return;
+	}
+
+
+
+// print the transpose of a vector structure
+void d_print_tran_strvec(int m, struct d_strvec *sa, int ai)
+	{
+	double *pa = sa->pa + ai;
+	d_print_mat(1, m, pa, 1);
+	return;
+	}
+
+
+
+// print a matrix structure to a file
+void d_print_to_file_strmat(FILE * file, int m, int n, struct d_strmat *sA, int ai, int aj)
+	{
+	const int bs = D_PS;
+	int sda = sA->cn;
+	double *pA = sA->pA + aj*bs + ai/bs*bs*sda + ai%bs;
+	int ii, i, j, tmp;
+	ii = 0;
+	if(ai%bs>0)
+		{
+		tmp = bs-ai%bs;
+		tmp = m<tmp ? m : tmp;
+		for(i=0; i<tmp; i++)
+			{
+			for(j=0; j<n; j++)
+				{
+				fprintf(file, "%9.5f ", pA[i+bs*j]);
+				}
+			fprintf(file, "\n");
+			}
+		pA += tmp + bs*(sda-1);
+		m -= tmp;
+		}
+	for( ; ii<m-(bs-1); ii+=bs)
+		{
+		for(i=0; i<bs; i++)
+			{
+			for(j=0; j<n; j++)
+				{
+				fprintf(file, "%9.5f ", pA[i+bs*j+sda*ii]);
+				}
+			fprintf(file, "\n");
+			}
+		}
+	if(ii<m)
+		{
+		tmp = m-ii;
+		for(i=0; i<tmp; i++)
+			{
+			for(j=0; j<n; j++)
+				{
+				fprintf(file, "%9.5f ", pA[i+bs*j+sda*ii]);
+				}
+			fprintf(file, "\n");
+			}
+		}
+	fprintf(file, "\n");
+	return;
+	}
+
+
+
+// print a vector structure to a file
+void d_print_to_file_strvec(FILE * file, int m, struct d_strvec *sa, int ai)
+	{
+	double *pa = sa->pa + ai;
+	d_print_to_file_mat(file, m, 1, pa, m);
+	return;
+	}
+
+
+
+// print the transpose of a vector structure to a file
+void d_print_tran_to_file_strvec(FILE * file, int m, struct d_strvec *sa, int ai)
+	{
+	double *pa = sa->pa + ai;
+	d_print_to_file_mat(file, 1, m, pa, 1);
+	return;
+	}
+
+
+
+// print a matrix structure (exponential notation)
+void d_print_e_strmat(int m, int n, struct d_strmat *sA, int ai, int aj)
+	{
+	const int bs = D_PS;
+	int sda = sA->cn;
+	double *pA = sA->pA + aj*bs + ai/bs*bs*sda + ai%bs;
+	int ii, i, j, tmp;
+	ii = 0;
+	if(ai%bs>0)
+		{
+		tmp = bs-ai%bs;
+		tmp = m<tmp ? m : tmp;
+		for(i=0; i<tmp; i++)
+			{
+			for(j=0; j<n; j++)
+				{
+				printf("%e\t", pA[i+bs*j]);
+				}
+			printf("\n");
+			}
+		pA += tmp + bs*(sda-1);
+		m -= tmp;
+		}
+	for( ; ii<m-(bs-1); ii+=bs)
+		{
+		for(i=0; i<bs; i++)
+			{
+			for(j=0; j<n; j++)
+				{
+				printf("%e\t", pA[i+bs*j+sda*ii]);
+				}
+			printf("\n");
+			}
+		}
+	if(ii<m)
+		{
+		tmp = m-ii;
+		for(i=0; i<tmp; i++)
+			{
+			for(j=0; j<n; j++)
+				{
+				printf("%e\t", pA[i+bs*j+sda*ii]);
+				}
+			printf("\n");
+			}
+		}
+	printf("\n");
+	return;
+	}
+
+
+
+// print a vector structure (exponential notation)
+void d_print_e_strvec(int m, struct d_strvec *sa, int ai)
+	{
+	double *pa = sa->pa + ai;
+	d_print_e_mat(m, 1, pa, m);
+	return;
+	}
+
+
+
+// print the transpose of a vector structure (exponential notation)
+void d_print_e_tran_strvec(int m, struct d_strvec *sa, int ai)
+	{
+	double *pa = sa->pa + ai;
+	d_print_e_mat(1, m, pa, 1);
+	return;
+	}
+
+
+
+#elif defined(LA_BLAS) || defined(LA_REFERENCE)
+
+
+
+// create a matrix structure for a matrix of size m*n
+void d_allocate_strmat(int m, int n, struct d_strmat *sA)
+	{
+	sA->m = m;
+	sA->n = n;
+	d_zeros(&(sA->pA), sA->m, sA->n);
+	int tmp = m<n ? m : n; // al(min(m,n)) // XXX max ???
+	d_zeros(&(sA->dA), tmp, 1);
+	sA->memory_size = (m*n+tmp)*sizeof(double);
+	return;
+	}
+
+
+
+// free memory of a matrix structure
+void d_free_strmat(struct d_strmat *sA)
+	{
+	free(sA->pA);
+	free(sA->dA);
+	return;
+	}
+
+
+
+// create a vector structure for a vector of size m
+void d_allocate_strvec(int m, struct d_strvec *sa)
+	{
+	sa->m = m;
+	d_zeros(&(sa->pa), sa->m, 1);
+	sa->memory_size = m*sizeof(double);
+	return;
+	}
+
+
+
+// free memory of a vector structure
+void d_free_strvec(struct d_strvec *sa)
+	{
+	free(sa->pa);
+	return;
+	}
+
+
+
+// print a matrix structure
+void d_print_strmat(int m, int n, struct d_strmat *sA, int ai, int aj)
+	{
+	int lda = sA->m;
+	double *pA = sA->pA + ai + aj*lda;
+	d_print_mat(m, n, pA, lda);
+	return;
+	}
+
+
+
+// print a vector structure
+void d_print_strvec(int m, struct d_strvec *sa, int ai)
+	{
+	double *pa = sa->pa + ai;
+	d_print_mat(m, 1, pa, m);
+	return;
+	}
+
+
+
+// print the transpose of a vector structure
+void d_print_tran_strvec(int m, struct d_strvec *sa, int ai)
+	{
+	double *pa = sa->pa + ai;
+	d_print_mat(1, m, pa, 1);
+	return;
+	}
+
+
+
+// print a matrix structure to a file
+void d_print_to_file_strmat(FILE *file, int m, int n, struct d_strmat *sA, int ai, int aj)
+	{
+	int lda = sA->m;
+	double *pA = sA->pA + ai + aj*lda;
+	d_print_to_file_mat(file, m, n, pA, lda);
+	return;
+	}
+
+
+
+// print a vector structure to a file
+void d_print_to_file_strvec(FILE *file, int m, struct d_strvec *sa, int ai)
+	{
+	double *pa = sa->pa + ai;
+	d_print_to_file_mat(file, m, 1, pa, m);
+	return;
+	}
+
+
+
+// print the transpose of a vector structure to a file
+void d_print_to_file_tran_strvec(FILE *file, int m, struct d_strvec *sa, int ai)
+	{
+	double *pa = sa->pa + ai;
+	d_print_to_file_mat(file, 1, m, pa, 1);
+	return;
+	}
+
+
+
+// print a matrix structure (exponential notation)
+void d_print_e_strmat(int m, int n, struct d_strmat *sA, int ai, int aj)
+	{
+	int lda = sA->m;
+	double *pA = sA->pA + ai + aj*lda;
+	d_print_e_mat(m, n, pA, lda);
+	return;
+	}
+
+
+
+// print a vector structure (exponential notation)
+void d_print_e_strvec(int m, struct d_strvec *sa, int ai)
+	{
+	double *pa = sa->pa + ai;
+	d_print_e_mat(m, 1, pa, m);
+	return;
+	}
+
+
+
+// print the transpose of a vector structure (exponential notation)
+void d_print_e_tran_strvec(int m, struct d_strvec *sa, int ai)
+	{
+	double *pa = sa->pa + ai;
+	d_print_e_mat(1, m, pa, 1);
+	return;
+	}
+
+
+
+#else
+
+#error : wrong LA choice
+
+#endif
+
+
diff --git a/auxiliary/d_aux_lib.c b/auxiliary/d_aux_lib.c
new file mode 100644
index 0000000..6f1f5d1
--- /dev/null
+++ b/auxiliary/d_aux_lib.c
@@ -0,0 +1,982 @@
+/**************************************************************************************************
+*                                                                                                 *
+* This file is part of BLASFEO.                                                                   *
+*                                                                                                 *
+* BLASFEO -- BLAS For Embedded Optimization.                                                      *
+* Copyright (C) 2016-2017 by Gianluca Frison.                                                     *
+* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl.              *
+* All rights reserved.                                                                            *
+*                                                                                                 *
+* HPMPC is free software; you can redistribute it and/or                                          *
+* modify it under the terms of the GNU Lesser General Public                                      *
+* License as published by the Free Software Foundation; either                                    *
+* version 2.1 of the License, or (at your option) any later version.                              *
+*                                                                                                 *
+* HPMPC is distributed in the hope that it will be useful,                                        *
+* but WITHOUT ANY WARRANTY; without even the implied warranty of                                  *
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.                                            *
+* See the GNU Lesser General Public License for more details.                                     *
+*                                                                                                 *
+* You should have received a copy of the GNU Lesser General Public                                *
+* License along with HPMPC; if not, write to the Free Software                                    *
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA                  *
+*                                                                                                 *
+* Author: Gianluca Frison, giaf (at) dtu.dk                                                       *
+*                          gianluca.frison (at) imtek.uni-freiburg.de                             *
+*                                                                                                 *
+**************************************************************************************************/
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <math.h>
+
+#include "../include/blasfeo_common.h"
+
+
+
+#if defined(LA_REFERENCE) || defined(LA_BLAS)
+
+
+
+// return memory size (in bytes) needed for a strmat
+int d_size_strmat(int m, int n)
+	{
+	int tmp = m<n ? m : n; // al(min(m,n)) // XXX max ???
+	int size = (m*n+tmp)*sizeof(double);
+	return size;
+	}
+
+
+
+// return memory size (in bytes) needed for the diagonal of a strmat
+int d_size_diag_strmat(int m, int n)
+	{
+	int size = 0;
+	int tmp = m<n ? m : n; // al(min(m,n)) // XXX max ???
+	size = tmp*sizeof(double);
+	return size;
+	}
+
+
+
+// create a matrix structure for a matrix of size m*n by using memory passed by a pointer
+void d_create_strmat(int m, int n, struct d_strmat *sA, void *memory)
+	{
+	sA->m = m;
+	sA->n = n;
+	double *ptr = (double *) memory;
+	sA->pA = ptr;
+	ptr += m*n;
+	int tmp = m<n ? m : n; // al(min(m,n)) // XXX max ???
+	sA->dA = ptr;
+	ptr += tmp;
+	sA->use_dA = 0;
+	sA->memory_size = (m*n+tmp)*sizeof(double);
+	return;
+	}
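+
+
+
+/*
+Usage sketch (illustrative only): creating a strmat on user-managed memory by
+pairing d_size_strmat with d_create_strmat. Note that, unlike the allocating
+interface, the memory is not zero-initialized here.
+
+	struct d_strmat sA;
+	void *mem = malloc(d_size_strmat(8, 8));
+	d_create_strmat(8, 8, &sA, mem);
+	// ... use sA ...
+	free(mem);
+*/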
+
+
+
+// return memory size (in bytes) needed for a strvec
+int d_size_strvec(int m)
+	{
+	int size = m*sizeof(double);
+	return size;
+	}
+
+
+
+// create a vector structure for a vector of size m by using memory passed by a pointer
+void d_create_strvec(int m, struct d_strvec *sa, void *memory)
+	{
+	sa->m = m;
+	double *ptr = (double *) memory;
+	sa->pa = ptr;
+//	ptr += m * n;
+	sa->memory_size = m*sizeof(double);
+	return;
+	}
+
+
+
+// convert a matrix into a matrix structure
+void d_cvt_mat2strmat(int m, int n, double *A, int lda, struct d_strmat *sA, int ai, int aj)
+	{
+	int ii, jj;
+	int lda2 = sA->m;
+	double *pA = sA->pA + ai + aj*lda2;
+	for(jj=0; jj<n; jj++)
+		{
+		ii = 0;
+		for(; ii<m-3; ii+=4)
+			{
+			pA[ii+0+jj*lda2] = A[ii+0+jj*lda];
+			pA[ii+1+jj*lda2] = A[ii+1+jj*lda];
+			pA[ii+2+jj*lda2] = A[ii+2+jj*lda];
+			pA[ii+3+jj*lda2] = A[ii+3+jj*lda];
+			}
+		for(; ii<m; ii++)
+			{
+			pA[ii+0+jj*lda2] = A[ii+0+jj*lda];
+			}
+		}
+	return;
+	}
+
+
+
+// convert and transpose a matrix into a matrix structure
+void d_cvt_tran_mat2strmat(int m, int n, double *A, int lda, struct d_strmat *sA, int ai, int aj)
+	{
+	int ii, jj;
+	int lda2 = sA->m;
+	double *pA = sA->pA + ai + aj*lda2;
+	for(jj=0; jj<n; jj++)
+		{
+		ii = 0;
+		for(; ii<m-3; ii+=4)
+			{
+			pA[jj+(ii+0)*lda2] = A[ii+0+jj*lda];
+			pA[jj+(ii+1)*lda2] = A[ii+1+jj*lda];
+			pA[jj+(ii+2)*lda2] = A[ii+2+jj*lda];
+			pA[jj+(ii+3)*lda2] = A[ii+3+jj*lda];
+			}
+		for(; ii<m; ii++)
+			{
+			pA[jj+(ii+0)*lda2] = A[ii+0+jj*lda];
+			}
+		}
+	return;
+	}
+
+
+
+// convert a vector into a vector structure
+void d_cvt_vec2strvec(int m, double *a, struct d_strvec *sa, int ai)
+	{
+	double *pa = sa->pa + ai;
+	int ii;
+	for(ii=0; ii<m; ii++)
+		pa[ii] = a[ii];
+	return;
+	}
+
+
+
+// convert a matrix structure into a matrix
+void d_cvt_strmat2mat(int m, int n, struct d_strmat *sA, int ai, int aj, double *A, int lda)
+	{
+	int ii, jj;
+	int lda2 = sA->m;
+	double *pA = sA->pA + ai + aj*lda2;
+	for(jj=0; jj<n; jj++)
+		{
+		ii = 0;
+		for(; ii<m-3; ii+=4)
+			{
+			A[ii+0+jj*lda] = pA[ii+0+jj*lda2];
+			A[ii+1+jj*lda] = pA[ii+1+jj*lda2];
+			A[ii+2+jj*lda] = pA[ii+2+jj*lda2];
+			A[ii+3+jj*lda] = pA[ii+3+jj*lda2];
+			}
+		for(; ii<m; ii++)
+			{
+			A[ii+0+jj*lda] = pA[ii+0+jj*lda2];
+			}
+		}
+	return;
+	}
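+
+
+
+/*
+Usage sketch (illustrative only): round trip between a column-major array and
+a strmat with the conversion routines above, for a strmat sA with m>=3, n>=2.
+
+	double A[3*2] = {1, 2, 3, 4, 5, 6};       // 3x2, column-major, lda=3
+	double B[3*2];
+	d_cvt_mat2strmat(3, 2, A, 3, &sA, 0, 0);  // copy A into sA at (0,0)
+	d_cvt_strmat2mat(3, 2, &sA, 0, 0, B, 3);  // copy it back out
+	// B now holds the same 6 entries as A
+*/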
+
+
+
+// convert and transpose a matrix structure into a matrix
+void d_cvt_tran_strmat2mat(int m, int n, struct d_strmat *sA, int ai, int aj, double *A, int lda)
+	{
+	int ii, jj;
+	int lda2 = sA->m;
+	double *pA = sA->pA + ai + aj*lda2;
+	for(jj=0; jj<n; jj++)
+		{
+		ii = 0;
+		for(; ii<m-3; ii+=4)
+			{
+			A[jj+(ii+0)*lda] = pA[ii+0+jj*lda2];
+			A[jj+(ii+1)*lda] = pA[ii+1+jj*lda2];
+			A[jj+(ii+2)*lda] = pA[ii+2+jj*lda2];
+			A[jj+(ii+3)*lda] = pA[ii+3+jj*lda2];
+			}
+		for(; ii<m; ii++)
+			{
+			A[jj+(ii+0)*lda] = pA[ii+0+jj*lda2];
+			}
+		}
+	return;
+	}
+
+
+
+// convert a vector structure into a vector
+void d_cvt_strvec2vec(int m, struct d_strvec *sa, int ai, double *a)
+	{
+	double *pa = sa->pa + ai;
+	int ii;
+	for(ii=0; ii<m; ii++)
+		a[ii] = pa[ii];
+	return;
+	}
+
+
+
+// cast a matrix into a matrix structure
+void d_cast_mat2strmat(double *A, struct d_strmat *sA)
+	{
+	sA->pA = A;
+	return;
+	}
+
+
+
+// cast a matrix into the diagonal of a matrix structure
+void d_cast_diag_mat2strmat(double *dA, struct d_strmat *sA)
+	{
+	sA->dA = dA;
+	return;
+	}
+
+
+
+// cast a vector into a vector structure
+void d_cast_vec2vecmat(double *a, struct d_strvec *sa)
+	{
+	sa->pa = a;
+	return;
+	}
+
+
+
+// insert element into strmat
+void dgein1_libstr(double a, struct d_strmat *sA, int ai, int aj)
+	{
+	int lda = sA->m;
+	double *pA = sA->pA + ai + aj*lda;
+	pA[0] = a;
+	return;
+	}
+
+
+
+// extract element from strmat
+double dgeex1_libstr(struct d_strmat *sA, int ai, int aj)
+	{
+	int lda = sA->m;
+	double *pA = sA->pA + ai + aj*lda;
+	return pA[0];
+	}
+
+
+
+// insert element into strvec
+void dvecin1_libstr(double a, struct d_strvec *sx, int xi)
+	{
+	double *x = sx->pa + xi;
+	x[0] = a;
+	return;
+	}
+
+
+
+// extract element from strvec
+double dvecex1_libstr(struct d_strvec *sx, int xi)
+	{
+	double *x = sx->pa + xi;
+	return x[0];
+	}
+
+
+
+// set all elements of a strmat to a value
+void dgese_libstr(int m, int n, double alpha, struct d_strmat *sA, int ai, int aj)
+	{
+	int lda = sA->m;
+	double *pA = sA->pA + ai + aj*lda;
+	int ii, jj;
+	for(jj=0; jj<n; jj++)
+		{
+		for(ii=0; ii<m; ii++)
+			{
+			pA[ii+lda*jj] = alpha;
+			}
+		}
+	return;
+	}
+
+
+
+// set all elements of a strvec to a value
+void dvecse_libstr(int m, double alpha, struct d_strvec *sx, int xi)
+	{
+	double *x = sx->pa + xi;
+	int ii;
+	for(ii=0; ii<m; ii++)
+		x[ii] = alpha;
+	return;
+	}
+
+
+
+// insert a vector into diagonal
+void ddiain_libstr(int kmax, double alpha, struct d_strvec *sx, int xi, struct d_strmat *sA, int ai, int aj)
+	{
+	int lda = sA->m;
+	double *pA = sA->pA + ai + aj*lda;
+	double *x = sx->pa + xi;
+	int ii;
+	for(ii=0; ii<kmax; ii++)
+		pA[ii*(lda+1)] = alpha*x[ii];
+	return;
+	}
+
+
+
+// add scalar to diagonal
+void ddiare_libstr(int kmax, double alpha, struct d_strmat *sA, int ai, int aj)
+	{
+	int lda = sA->m;
+	double *pA = sA->pA + ai + aj*lda;
+	int ii;
+	for(ii=0; ii<kmax; ii++)
+		pA[ii*(lda+1)] += alpha;
+	return;
+	}
+
+
+
+// extract a row into a vector
+void drowex_libstr(int kmax, double alpha, struct d_strmat *sA, int ai, int aj, struct d_strvec *sx, int xi)
+	{
+	int lda = sA->m;
+	double *pA = sA->pA + ai + aj*lda;
+	double *x = sx->pa + xi;
+	int ii;
+	for(ii=0; ii<kmax; ii++)
+		x[ii] = alpha*pA[ii*lda];
+	return;
+	}
+
+
+
+// insert a vector into a row
+void drowin_libstr(int kmax, double alpha, struct d_strvec *sx, int xi, struct d_strmat *sA, int ai, int aj)
+	{
+	int lda = sA->m;
+	double *pA = sA->pA + ai + aj*lda;
+	double *x = sx->pa + xi;
+	int ii;
+	for(ii=0; ii<kmax; ii++)
+		pA[ii*lda] = alpha*x[ii];
+	return;
+	}
+
+
+
+// add a vector to a row
+void drowad_libstr(int kmax, double alpha, struct d_strvec *sx, int xi, struct d_strmat *sA, int ai, int aj)
+	{
+	int lda = sA->m;
+	double *pA = sA->pA + ai + aj*lda;
+	double *x = sx->pa + xi;
+	int ii;
+	for(ii=0; ii<kmax; ii++)
+		pA[ii*lda] += alpha*x[ii];
+	return;
+	}
+
+
+
+// swap two rows of a matrix struct
+void drowsw_libstr(int kmax, struct d_strmat *sA, int ai, int aj, struct d_strmat *sC, int ci, int cj)
+	{
+	int lda = sA->m;
+	double *pA = sA->pA + ai + aj*lda;
+	int ldc = sC->m;
+	double *pC = sC->pA + ci + cj*ldc;
+	int ii;
+	double tmp;
+	for(ii=0; ii<kmax; ii++)
+		{
+		tmp = pA[ii*lda];
+		pA[ii*lda] = pC[ii*ldc];
+		pC[ii*ldc] = tmp;
+		}
+	return;
+	}
+
+
+
+// permute the rows of a matrix struct
+void drowpe_libstr(int kmax, int *ipiv, struct d_strmat *sA)
+	{
+	int ii;
+	for(ii=0; ii<kmax; ii++)
+		{
+		if(ipiv[ii]!=ii)
+			drowsw_libstr(sA->n, sA, ii, 0, sA, ipiv[ii], 0);
+		}
+	return;
+	}
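+
+
+
+/*
+Note on drowpe_libstr (explanatory): ipiv is a pivot vector in the style of
+LAPACK's getrf (0-based here), ipiv[ii] being the row swapped with row ii;
+applying the swaps in order ii = 0..kmax-1 reproduces the row permutation of,
+e.g., an LU factorization with partial pivoting. For instance, ipiv = {2, 2, 2}
+on a 3-row matrix first swaps rows 0 and 2, then rows 1 and 2, and finally
+leaves row 2 in place.
+*/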
+
+
+
+// extract vector from column
+void dcolex_libstr(int kmax, struct d_strmat *sA, int ai, int aj, struct d_strvec *sx, int xi)
+	{
+	int lda = sA->m;
+	double *pA = sA->pA + ai + aj*lda;
+	double *x = sx->pa + xi;
+	int ii;
+	for(ii=0; ii<kmax; ii++)
+		x[ii] = pA[ii];
+	return;
+	}
+
+
+
+// insert a vector into a column
+void dcolin_libstr(int kmax, struct d_strvec *sx, int xi, struct d_strmat *sA, int ai, int aj)
+	{
+	int lda = sA->m;
+	double *pA = sA->pA + ai + aj*lda;
+	double *x = sx->pa + xi;
+	int ii;
+	for(ii=0; ii<kmax; ii++)
+		pA[ii] = x[ii];
+	return;
+	}
+
+
+
+// swap two cols of a matrix struct
+void dcolsw_libstr(int kmax, struct d_strmat *sA, int ai, int aj, struct d_strmat *sC, int ci, int cj)
+	{
+	int lda = sA->m;
+	double *pA = sA->pA + ai + aj*lda;
+	int ldc = sC->m;
+	double *pC = sC->pA + ci + cj*ldc;
+	int ii;
+	double tmp;
+	for(ii=0; ii<kmax; ii++)
+		{
+		tmp = pA[ii];
+		pA[ii] = pC[ii];
+		pC[ii] = tmp;
+		}
+	return;
+	}
+
+
+
+// permute the cols of a matrix struct
+void dcolpe_libstr(int kmax, int *ipiv, struct d_strmat *sA)
+	{
+	int ii;
+	for(ii=0; ii<kmax; ii++)
+		{
+		if(ipiv[ii]!=ii)
+			dcolsw_libstr(sA->m, sA, 0, ii, sA, 0, ipiv[ii]);
+		}
+	return;
+	}
+
+
+
+// copy a generic strmat into a generic strmat
+void dgecp_libstr(int m, int n, struct d_strmat *sA, int ai, int aj, struct d_strmat *sC, int ci, int cj)
+	{
+	int lda = sA->m;
+	double *pA = sA->pA + ai + aj*lda;
+	int ldc = sC->m;
+	double *pC = sC->pA + ci + cj*ldc;
+	int ii, jj;
+	for(jj=0; jj<n; jj++)
+		{
+		ii = 0;
+		for(; ii<m-3; ii+=4)
+			{
+			pC[ii+0+jj*ldc] = pA[ii+0+jj*lda];
+			pC[ii+1+jj*ldc] = pA[ii+1+jj*lda];
+			pC[ii+2+jj*ldc] = pA[ii+2+jj*lda];
+			pC[ii+3+jj*ldc] = pA[ii+3+jj*lda];
+			}
+		for(; ii<m; ii++)
+			{
+			pC[ii+0+jj*ldc] = pA[ii+0+jj*lda];
+			}
+		}
+	return;
+	}
+
+
+
+// scale a generic strmat
+void dgesc_libstr(int m, int n, double alpha, struct d_strmat *sA, int ai, int aj)
+	{
+	int lda = sA->m;
+	double *pA = sA->pA + ai + aj*lda;
+	int ii, jj;
+	for(jj=0; jj<n; jj++)
+		{
+		ii = 0;
+		for(; ii<m-3; ii+=4)
+			{
+			pA[ii+0+jj*lda] *= alpha;
+			pA[ii+1+jj*lda] *= alpha;
+			pA[ii+2+jj*lda] *= alpha;
+			pA[ii+3+jj*lda] *= alpha;
+			}
+		for(; ii<m; ii++)
+			{
+			pA[ii+0+jj*lda] *= alpha;
+			}
+		}
+	return;
+	}
+
+
+
+// copy a strvec into a strvec
+void dveccp_libstr(int m, struct d_strvec *sa, int ai, struct d_strvec *sc, int ci)
+	{
+	double *pa = sa->pa + ai;
+	double *pc = sc->pa + ci;
+	int ii;
+	ii = 0;
+	for(; ii<m-3; ii+=4)
+		{
+		pc[ii+0] = pa[ii+0];
+		pc[ii+1] = pa[ii+1];
+		pc[ii+2] = pa[ii+2];
+		pc[ii+3] = pa[ii+3];
+		}
+	for(; ii<m; ii++)
+		{
+		pc[ii+0] = pa[ii+0];
+		}
+	return;
+	}
+
+
+
+// scale a strvec
+void dvecsc_libstr(int m, double alpha, struct d_strvec *sa, int ai)
+	{
+	double *pa = sa->pa + ai;
+	int ii;
+	ii = 0;
+	for(; ii<m-3; ii+=4)
+		{
+		pa[ii+0] *= alpha;
+		pa[ii+1] *= alpha;
+		pa[ii+2] *= alpha;
+		pa[ii+3] *= alpha;
+		}
+	for(; ii<m; ii++)
+		{
+		pa[ii+0] *= alpha;
+		}
+	return;
+	}
+
+
+
+// copy a lower triangular strmat into a lower triangular strmat
+void dtrcp_l_libstr(int m, struct d_strmat *sA, int ai, int aj, struct d_strmat *sC, int ci, int cj)
+	{
+	int lda = sA->m;
+	double *pA = sA->pA + ai + aj*lda;
+	int ldc = sC->m;
+	double *pC = sC->pA + ci + cj*ldc;
+	int ii, jj;
+	for(jj=0; jj<m; jj++)
+		{
+		ii = jj;
+		for(; ii<m; ii++)
+			{
+			pC[ii+0+jj*ldc] = pA[ii+0+jj*lda];
+			}
+		}
+	return;
+	}
+
+
+
+// scale and add a generic strmat into a generic strmat
+void dgead_libstr(int m, int n, double alpha, struct d_strmat *sA, int ai, int aj, struct d_strmat *sC, int ci, int cj)
+	{
+	int lda = sA->m;
+	double *pA = sA->pA + ai + aj*lda;
+	int ldc = sC->m;
+	double *pC = sC->pA + ci + cj*ldc;
+	int ii, jj;
+	for(jj=0; jj<n; jj++)
+		{
+		ii = 0;
+		for(; ii<m-3; ii+=4)
+			{
+			pC[ii+0+jj*ldc] += alpha*pA[ii+0+jj*lda];
+			pC[ii+1+jj*ldc] += alpha*pA[ii+1+jj*lda];
+			pC[ii+2+jj*ldc] += alpha*pA[ii+2+jj*lda];
+			pC[ii+3+jj*ldc] += alpha*pA[ii+3+jj*lda];
+			}
+		for(; ii<m; ii++)
+			{
+			pC[ii+0+jj*ldc] += alpha*pA[ii+0+jj*lda];
+			}
+		}
+	return;
+	}
+
+
+
+// scales and adds a strvec into a strvec
+void dvecad_libstr(int m, double alpha, struct d_strvec *sa, int ai, struct d_strvec *sc, int ci)
+	{
+	double *pa = sa->pa + ai;
+	double *pc = sc->pa + ci;
+	int ii;
+	ii = 0;
+	for(; ii<m-3; ii+=4)
+		{
+		pc[ii+0] += alpha*pa[ii+0];
+		pc[ii+1] += alpha*pa[ii+1];
+		pc[ii+2] += alpha*pa[ii+2];
+		pc[ii+3] += alpha*pa[ii+3];
+		}
+	for(; ii<m; ii++)
+		{
+		pc[ii+0] += alpha*pa[ii+0];
+		}
+	return;
+	}
+
+
+
+// copy and transpose a generic strmat into a generic strmat
+void dgetr_libstr(int m, int n, struct d_strmat *sA, int ai, int aj, struct d_strmat *sC, int ci, int cj)
+	{
+	int lda = sA->m;
+	double *pA = sA->pA + ai + aj*lda;
+	int ldc = sC->m;
+	double *pC = sC->pA + ci + cj*ldc;
+	int ii, jj;
+	for(jj=0; jj<n; jj++)
+		{
+		ii = 0;
+		for(; ii<m-3; ii+=4)
+			{
+			pC[jj+(ii+0)*ldc] = pA[ii+0+jj*lda];
+			pC[jj+(ii+1)*ldc] = pA[ii+1+jj*lda];
+			pC[jj+(ii+2)*ldc] = pA[ii+2+jj*lda];
+			pC[jj+(ii+3)*ldc] = pA[ii+3+jj*lda];
+			}
+		for(; ii<m; ii++)
+			{
+			pC[jj+(ii+0)*ldc] = pA[ii+0+jj*lda];
+			}
+		}
+	return;
+	}
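+
+
+
+/*
+Usage sketch (illustrative only): transposing an m x n block of sA into sC.
+The (ai,aj) and (ci,cj) arguments select the top-left corners, so the n x m
+result is written starting at (ci,cj) of sC.
+
+	dgetr_libstr(3, 2, &sA, 0, 0, &sC, 0, 0);  // sC(0:1,0:2) = sA(0:2,0:1)^T
+*/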
+
+
+
+// copy and transpose a lower triangular strmat into an upper triangular strmat
+void dtrtr_l_libstr(int m, struct d_strmat *sA, int ai, int aj, struct d_strmat *sC, int ci, int cj)
+	{
+	int lda = sA->m;
+	double *pA = sA->pA + ai + aj*lda;
+	int ldc = sC->m;
+	double *pC = sC->pA + ci + cj*ldc;
+	int ii, jj;
+	for(jj=0; jj<m; jj++)
+		{
+		ii = jj;
+		for(; ii<m; ii++)
+			{
+			pC[jj+(ii+0)*ldc] = pA[ii+0+jj*lda];
+			}
+		}
+	return;
+	}
+
+
+
+// copy and transpose an upper triangular strmat into a lower triangular strmat
+void dtrtr_u_libstr(int m, struct d_strmat *sA, int ai, int aj, struct d_strmat *sC, int ci, int cj)
+	{
+	int lda = sA->m;
+	double *pA = sA->pA + ai + aj*lda;
+	int ldc = sC->m;
+	double *pC = sC->pA + ci + cj*ldc;
+	int ii, jj;
+	for(jj=0; jj<m; jj++)
+		{
+		ii = 0;
+		for(; ii<=jj; ii++)
+			{
+			pC[jj+(ii+0)*ldc] = pA[ii+0+jj*lda];
+			}
+		}
+	return;
+	}
+
+
+
+// insert a strvec to the diagonal of a strmat, sparse formulation
+void ddiain_sp_libstr(int kmax, double alpha, struct d_strvec *sx, int xi, int *idx, struct d_strmat *sD, int di, int dj)
+	{
+	double *x = sx->pa + xi;
+	int ldd = sD->m;
+	double *pD = sD->pA + di + dj*ldd;
+	int ii, jj;
+	for(jj=0; jj<kmax; jj++)
+		{
+		ii = idx[jj];
+		pD[ii*(ldd+1)] = alpha * x[jj];
+		}
+	return;
+	}
+
+
+
+// extract a vector from diagonal
+void ddiaex_libstr(int kmax, double alpha, struct d_strmat *sA, int ai, int aj, struct d_strvec *sx, int xi)
+	{
+	int lda = sA->m;
+	double *pA = sA->pA + ai + aj*lda;
+	double *x = sx->pa + xi;
+	int ii;
+	for(ii=0; ii<kmax; ii++)
+		x[ii] = alpha*pA[ii*(lda+1)];
+	return;
+	}
+
+
+
+// extract the diagonal of a strmat into a strvec, sparse formulation
+void ddiaex_sp_libstr(int kmax, double alpha, int *idx, struct d_strmat *sD, int di, int dj, struct d_strvec *sx, int xi)
+	{
+	double *x = sx->pa + xi;
+	int ldd = sD->m;
+	double *pD = sD->pA + di + dj*ldd;
+	int ii, jj;
+	for(jj=0; jj<kmax; jj++)
+		{
+		ii = idx[jj];
+		x[jj] = alpha * pD[ii*(ldd+1)];
+		}
+	return;
+	}
+
+
+
+// add a vector to diagonal
+void ddiaad_libstr(int kmax, double alpha, struct d_strvec *sx, int xi, struct d_strmat *sA, int ai, int aj)
+	{
+	int lda = sA->m;
+	double *pA = sA->pA + ai + aj*lda;
+	double *x = sx->pa + xi;
+	int ii;
+	for(ii=0; ii<kmax; ii++)
+		pA[ii*(lda+1)] += alpha*x[ii];
+	return;
+	}
+
+
+
+// add scaled strvec to diagonal of strmat, sparse formulation
+void ddiaad_sp_libstr(int kmax, double alpha, struct d_strvec *sx, int xi, int *idx, struct d_strmat *sD, int di, int dj)
+	{
+	double *x = sx->pa + xi;
+	int ldd = sD->m;
+	double *pD = sD->pA + di + dj*ldd;
+	int ii, jj;
+	for(jj=0; jj<kmax; jj++)
+		{
+		ii = idx[jj];
+		pD[ii*(ldd+1)] += alpha * x[jj];
+		}
+	return;
+	}
+
+
+
+// add scaled strvec to another strvec and insert to diagonal of strmat, sparse formulation
+void ddiaadin_sp_libstr(int kmax, double alpha, struct d_strvec *sx, int xi, struct d_strvec *sy, int yi, int *idx, struct d_strmat *sD, int di, int dj)
+	{
+	double *x = sx->pa + xi;
+	double *y = sy->pa + yi;
+	int ldd = sD->m;
+	double *pD = sD->pA + di + dj*ldd;
+	int ii, jj;
+	for(jj=0; jj<kmax; jj++)
+		{
+		ii = idx[jj];
+		pD[ii*(ldd+1)] = y[jj] + alpha * x[jj];
+		}
+	return;
+	}
+
+
+
+// add scaled strvec to row of strmat, sparse formulation
+void drowad_sp_libstr(int kmax, double alpha, struct d_strvec *sx, int xi, int *idx, struct d_strmat *sD, int di, int dj)
+	{
+	double *x = sx->pa + xi;
+	int ldd = sD->m;
+	double *pD = sD->pA + di + dj*ldd;
+	int ii, jj;
+	for(jj=0; jj<kmax; jj++)
+		{
+		ii = idx[jj];
+		pD[ii*ldd] += alpha * x[jj];
+		}
+	return;
+	}
+
+
+
+// add scaled strvec to strvec, sparse formulation
+void dvecad_sp_libstr(int m, double alpha, struct d_strvec *sx, int xi, int *idx, struct d_strvec *sz, int zi)
+	{
+	double *x = sx->pa + xi;
+	double *z = sz->pa + zi;
+	int ii;
+	for(ii=0; ii<m; ii++)
+		z[idx[ii]] += alpha * x[ii];
+	return;
+	}
+
+
+
+// insert scaled strvec into strvec, sparse formulation
+void dvecin_sp_libstr(int m, double alpha, struct d_strvec *sx, int xi, int *idx, struct d_strvec *sz, int zi)
+	{
+	double *x = sx->pa + xi;
+	double *z = sz->pa + zi;
+	int ii;
+	for(ii=0; ii<m; ii++)
+		z[idx[ii]] = alpha * x[ii];
+	return;
+	}
+
+
+
+// extract scaled strvec from strvec, sparse formulation
+void dvecex_sp_libstr(int m, double alpha, int *idx, struct d_strvec *sx, int xi, struct d_strvec *sz, int zi)
+	{
+	double *x = sx->pa + xi;
+	double *z = sz->pa + zi;
+	int ii;
+	for(ii=0; ii<m; ii++)
+		z[ii] = alpha * x[idx[ii]];
+	return;
+	}
+
+
+// clip without mask return
+void dveccl_libstr(int m, struct d_strvec *sxm, int xim, struct d_strvec *sx, int xi, struct d_strvec *sxp, int xip, struct d_strvec *sz, int zi)
+	{
+	double *xm = sxm->pa + xim;
+	double *x  = sx->pa + xi;
+	double *xp = sxp->pa + xip;
+	double *z  = sz->pa + zi;
+	int ii;
+	for(ii=0; ii<m; ii++)
+		{
+		if(x[ii]>=xp[ii])
+			{
+			z[ii] = xp[ii];
+			}
+		else if(x[ii]<=xm[ii])
+			{
+			z[ii] = xm[ii];
+			}
+		else
+			{
+			z[ii] = x[ii];
+			}
+		}
+	return;
+	}
+
+
+
+// clip with mask return
+void dveccl_mask_libstr(int m, struct d_strvec *sxm, int xim, struct d_strvec *sx, int xi, struct d_strvec *sxp, int xip, struct d_strvec *sz, int zi, struct d_strvec *sm, int mi)
+	{
+	double *xm = sxm->pa + xim;
+	double *x  = sx->pa + xi;
+	double *xp = sxp->pa + xip;
+	double *z  = sz->pa + zi;
+	double *mask  = sm->pa + mi;
+	int ii;
+	for(ii=0; ii<m; ii++)
+		{
+		if(x[ii]>=xp[ii])
+			{
+			z[ii] = xp[ii];
+			mask[ii] = 1.0;
+			}
+		else if(x[ii]<=xm[ii])
+			{
+			z[ii] = xm[ii];
+			mask[ii] = -1.0;
+			}
+		else
+			{
+			z[ii] = x[ii];
+			mask[ii] = 0.0;
+			}
+		}
+	return;
+	}
+
+
+// zero out components using mask
+void dvecze_libstr(int m, struct d_strvec *sm, int mi, struct d_strvec *sv, int vi, struct d_strvec *se, int ei)
+	{
+	double *mask = sm->pa + mi;
+	double *v = sv->pa + vi;
+	double *e = se->pa + ei;
+	int ii;
+	for(ii=0; ii<m; ii++)
+		{
+		if(mask[ii]==0)
+			{
+			e[ii] = v[ii];
+			}
+		else
+			{
+			e[ii] = 0;
+			}
+		}
+	return;
+	}
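+
+
+
+/*
+Usage sketch (illustrative only): clipping a vector to [xm, xp] while
+recording which bound was active, then using the returned mask to zero the
+clipped components of another vector.
+
+	dveccl_mask_libstr(m, &sxm, 0, &sx, 0, &sxp, 0, &sz, 0, &smask, 0);
+	// smask: 1.0 at the upper bound, -1.0 at the lower bound, 0.0 inside
+	dvecze_libstr(m, &smask, 0, &sv, 0, &se, 0);
+	// se[ii] = sv[ii] where smask[ii]==0, and 0 elsewhere
+*/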
+
+
+
+// compute the infinity norm of a strvec
+void dvecnrm_inf_libstr(int m, struct d_strvec *sx, int xi, double *ptr_norm)
+	{
+	int ii;
+	double *x = sx->pa + xi;
+	double norm = 0.0;
+	for(ii=0; ii<m; ii++)
+		norm = fmax(norm, fabs(x[ii]));
+	*ptr_norm = norm;
+	return;
+	}
+
+
+
+#else
+
+#error : wrong LA choice
+
+#endif
diff --git a/auxiliary/d_aux_lib4.c b/auxiliary/d_aux_lib4.c
new file mode 100644
index 0000000..152aed1
--- /dev/null
+++ b/auxiliary/d_aux_lib4.c
@@ -0,0 +1,3609 @@
+/**************************************************************************************************
+*                                                                                                 *
+* This file is part of BLASFEO.                                                                   *
+*                                                                                                 *
+* BLASFEO -- BLAS For Embedded Optimization.                                                      *
+* Copyright (C) 2016-2017 by Gianluca Frison.                                                     *
+* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl.              *
+* All rights reserved.                                                                            *
+*                                                                                                 *
+* HPMPC is free software; you can redistribute it and/or                                          *
+* modify it under the terms of the GNU Lesser General Public                                      *
+* License as published by the Free Software Foundation; either                                    *
+* version 2.1 of the License, or (at your option) any later version.                              *
+*                                                                                                 *
+* HPMPC is distributed in the hope that it will be useful,                                        *
+* but WITHOUT ANY WARRANTY; without even the implied warranty of                                  *
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.                                            *
+* See the GNU Lesser General Public License for more details.                                     *
+*                                                                                                 *
+* You should have received a copy of the GNU Lesser General Public                                *
+* License along with HPMPC; if not, write to the Free Software                                    *
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA                  *
+*                                                                                                 *
+* Author: Gianluca Frison, giaf (at) dtu.dk                                                       *
+*                          gianluca.frison (at) imtek.uni-freiburg.de                             *
+*                                                                                                 *
+**************************************************************************************************/
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <math.h>
+
+#if defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE)
+#include <mmintrin.h>
+#include <xmmintrin.h>  // SSE
+#include <emmintrin.h>  // SSE2
+#include <pmmintrin.h>  // SSE3
+#include <smmintrin.h>  // SSE4
+#include <immintrin.h>  // AVX
+#endif
+
+#include "../include/blasfeo_common.h"
+#include "../include/blasfeo_block_size.h"
+#include "../include/blasfeo_d_kernel.h"
+
+
+
+// copies a packed matrix into a packed matrix
+// TODO remove alpha !!!
+void dgecp_lib(int m, int n, double alpha, int offsetA, double *A, int sda, int offsetB, double *B, int sdb)
+	{
+
+	if(m<=0 || n<=0)
+		return;
+
+	const int bs = 4;
+
+	int mna, ii;
+
+	int offA = offsetA%bs;
+	int offB = offsetB%bs;
+
+	// A at the beginning of the block
+	A -= offA;
+
+	// B at the beginning of the block
+	B -= offB;
+
+	// same alignment
+	if(offA==offB)
+		{
+		ii = 0;
+		// clean up at the beginning
+		mna = (4-offB)%bs;
+		if(mna>0)
+			{
+			if(m<mna) // mna<=3  ==>  m = { 1, 2 }
+				{
+				if(m==1)
+					{
+					kernel_dgecp_1_0_lib4(0, n, alpha, A+offA, B+offB);
+					return;
+					}
+				else //if(m==2 && mna==3)
+					{
+					kernel_dgecp_2_0_lib4(0, n, alpha, A+offA, B+offB);
+					return;
+					}
+				}
+			if(mna==1)
+				{
+				kernel_dgecp_1_0_lib4(0, n, alpha, A+offA, B+offB);
+				A += 4*sda;
+				B += 4*sdb;
+				ii += 1;
+				}
+			else if(mna==2)
+				{
+				kernel_dgecp_2_0_lib4(0, n, alpha, A+offA, B+offB);
+				A += 4*sda;
+				B += 4*sdb;
+				ii += 2;
+				}
+			else // if(mna==3)
+				{
+				kernel_dgecp_3_0_lib4(0, n, alpha, A+offA, B+offB);
+				A += 4*sda;
+				B += 4*sdb;
+				ii += 3;
+				}
+			}
+		// main loop
+#if defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_HASWELL)
+		for(; ii<m-7; ii+=8)
+			{
+			kernel_dgecp_8_0_lib4(0, n, alpha, A, sda, B, sdb);
+			A += 8*sda;
+			B += 8*sdb;
+			}
+#endif
+		for(; ii<m-3; ii+=4)
+			{
+			kernel_dgecp_4_0_lib4(0, n, alpha, A, B);
+			A += 4*sda;
+			B += 4*sdb;
+			}
+		// clean up at the end
+		if(ii<m)
+			{
+			if(m-ii==1)
+				kernel_dgecp_1_0_lib4(0, n, alpha, A, B);
+			else if(m-ii==2)
+				kernel_dgecp_2_0_lib4(0, n, alpha, A, B);
+			else // if(m-ii==3)
+				kernel_dgecp_3_0_lib4(0, n, alpha, A, B);
+			}
+		}
+	// skip one element of A
+	else if(offA==(offB+1)%bs)
+		{
+		ii = 0;
+		// clean up at the beginning
+		mna = (4-offB)%bs;
+		if(mna>0)
+			{
+			if(m<mna) // mna<=3  ==>  m = { 1, 2 }
+				{
+				if(m==1)
+					{
+					kernel_dgecp_1_0_lib4(0, n, alpha, A+offA, B+offB);
+					return;
+					}
+				else //if(m==2 && mna==3)
+					{
+					kernel_dgecp_2_0_lib4(0, n, alpha, A+offA, B+offB);
+					return;
+					}
+				}
+			if(mna==1)
+				{
+				kernel_dgecp_1_0_lib4(0, n, alpha, A+offA, B+offB);
+				//A += 4*sda;
+				B += 4*sdb;
+				ii += 1;
+				}
+			else if(mna==2)
+				{
+				kernel_dgecp_2_3_lib4(0, n, alpha, A, sda, B+2);
+				A += 4*sda;
+				B += 4*sdb;
+				ii += 2;
+				}
+			else // if(mna==3)
+				{
+				kernel_dgecp_3_2_lib4(0, n, alpha, A, sda, B+1);
+				A += 4*sda;
+				B += 4*sdb;
+				ii += 3;
+				}
+			}
+		// main loop
+#if defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_HASWELL)
+		for( ; ii<m-7; ii+=8)
+			{
+			kernel_dgecp_8_1_lib4(0, n, alpha, A, sda, B, sdb);
+			A += 8*sda;
+			B += 8*sdb;
+			}
+#endif
+		for( ; ii<m-3; ii+=4)
+			{
+			kernel_dgecp_4_1_lib4(0, n, alpha, A, sda, B);
+			A += 4*sda;
+			B += 4*sdb;
+			}
+		// clean up at the end
+		if(ii<m)
+			{
+			if(m-ii==1)
+				kernel_dgecp_1_0_lib4(0, n, alpha, A+1, B);
+			else if(m-ii==2)
+				kernel_dgecp_2_0_lib4(0, n, alpha, A+1, B);
+			else // if(m-ii==3)
+				kernel_dgecp_3_0_lib4(0, n, alpha, A+1, B);
+			}
+		}
+	// skip 2 elements of A
+	else if(offA==(offB+2)%bs)
+		{
+		ii = 0;
+		// clean up at the beginning
+		mna = (4-offB)%bs;
+		if(mna>0)
+			{
+			if(m<mna)
+				{
+				if(m==1)
+					{
+					kernel_dgecp_1_0_lib4(0, n, alpha, A+offA, B+offB);
+					return;
+					}
+				else // if(m==2 && mna==3)
+					{
+					kernel_dgecp_2_3_lib4(0, n, alpha, A, sda, B+1);
+					return;
+					}
+				}
+			if(mna==1)
+				{
+				kernel_dgecp_1_0_lib4(0, n, alpha, A+1, B+3);
+				// A += 4*sda;
+				B += 4*sdb;
+				ii += 1;
+				}
+			else if(mna==2)
+				{
+				kernel_dgecp_2_0_lib4(0, n, alpha, A, B+2);
+				// A += 4*sda;
+				B += 4*sdb;
+				ii += 2;
+				}
+			else // if(mna==3)
+				{
+				kernel_dgecp_3_3_lib4(0, n, alpha, A, sda, B+1);
+				A += 4*sda;
+				B += 4*sdb;
+				ii += 3;
+				}
+			}
+		// main loop
+#if defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_HASWELL)
+		for(; ii<m-7; ii+=8)
+			{
+			kernel_dgecp_8_2_lib4(0, n, alpha, A, sda, B, sdb);
+			A += 8*sda;
+			B += 8*sdb;
+			}
+#endif
+		for(; ii<m-3; ii+=4)
+			{
+			kernel_dgecp_4_2_lib4(0, n, alpha, A, sda, B);
+			A += 4*sda;
+			B += 4*sdb;
+			}
+		// clean up at the end
+		if(ii<m)
+			{
+			if(m-ii==1)
+				kernel_dgecp_1_0_lib4(0, n, alpha, A+2, B);
+			else if(m-ii==2)
+				kernel_dgecp_2_0_lib4(0, n, alpha, A+2, B);
+			else // if(m-ii==3)
+				kernel_dgecp_3_2_lib4(0, n, alpha, A, sda, B);
+			}
+		}
+	// skip 3 elements of A
+	else // if(offA==(offB+3)%bs)
+		{
+		ii = 0;
+		// clean up at the beginning
+		mna = (4-offB)%bs;
+		if(mna>0)
+			{
+			if(m<mna)
+				{
+				if(m==1)
+					{
+					kernel_dgecp_1_0_lib4(0, n, alpha, A+offA, B+offB);
+					return;
+					}
+				else // if(m==2 && mna==3)
+					{
+					kernel_dgecp_2_0_lib4(0, n, alpha, A+offA, B+offB);
+					return;
+					}
+				}
+			if(mna==1)
+				{
+				kernel_dgecp_1_0_lib4(0, n, alpha, A+offA, B+offB);
+				// A += 4*sda;
+				B += 4*sdb;
+				ii += 1;
+				}
+			else if(mna==2)
+				{
+				kernel_dgecp_2_0_lib4(0, n, alpha, A+offA, B+offB);
+				// A += 4*sda;
+				B += 4*sdb;
+				ii += 2;
+				}
+			else // if(mna==3)
+				{
+				kernel_dgecp_3_0_lib4(0, n, alpha, A+offA, B+offB);
+				// A += 4*sda;
+				B += 4*sdb;
+				ii += 3;
+				}
+			}
+		// main loop
+#if defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_HASWELL)
+		for(; ii<m-7; ii+=8)
+			{
+			kernel_dgecp_8_3_lib4(0, n, alpha, A, sda, B, sdb);
+			A += 8*sda;
+			B += 8*sdb;
+			}
+#endif
+		for(; ii<m-3; ii+=4)
+			{
+			kernel_dgecp_4_3_lib4(0, n, alpha, A, sda, B);
+			A += 4*sda;
+			B += 4*sdb;
+			}
+		// clean up at the end
+		if(ii<m)
+			{
+			if(m-ii==1)
+				kernel_dgecp_1_0_lib4(0, n, alpha, A+3, B);
+			else if(m-ii==2)
+				kernel_dgecp_2_3_lib4(0, n, alpha, A, sda, B);
+			else // if(m-ii==3)
+				kernel_dgecp_3_3_lib4(0, n, alpha, A, sda, B);
+			}
+		}
+
+	}
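+
+
+
+/*
+Note on the dispatch above (explanatory): offA and offB are the row offsets
+of A and B inside their 4-row panels, and the four branches handle the four
+values of (offA-offB) mod 4. When the offsets match, rows can be copied panel
+by panel; otherwise each 4-row block of B straddles two panels of A, which is
+what the kernel_dgecp_x_y_lib4 variants (y = 1,2,3) resolve. For example,
+offsetA=5 and offsetB=2 give offA=1 and offB=2, so offA==(offB+3)%4 holds and
+the "skip 3 elements of A" branch runs.
+*/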
+
+
+
+// copies a lower triangular packed matrix into a lower triangular packed matrix
+void dtrcp_l_lib(int m, double alpha, int offsetA, double *A, int sda, int offsetB, double *B, int sdb)
+	{
+
+	if(m<=0)
+		return;
+
+	int n = m;
+
+	const int bs = 4;
+
+	int mna, ii;
+
+	int offA = offsetA%bs;
+	int offB = offsetB%bs;
+
+	// A at the beginning of the block
+	A -= offA;
+
+	// B at the beginning of the block
+	B -= offB;
+
+	// same alignment
+	if(offA==offB)
+		{
+		ii = 0;
+		// clean up at the beginning
+		mna = (4-offB)%bs;
+		if(mna>0)
+			{
+			if(m<mna) // mna<=3  ==>  m = { 1, 2 }
+				{
+				if(m==1)
+					{
+					kernel_dgecp_1_0_lib4(1, ii, alpha, A+offA, B+offB);
+					return;
+					}
+				else //if(m==2 && mna==3)
+					{
+					kernel_dgecp_2_0_lib4(1, ii, alpha, A+offA, B+offB);
+					return;
+					}
+				}
+			if(mna==1)
+				{
+				kernel_dgecp_1_0_lib4(1, ii, alpha, A+offA, B+offB);
+				A += 4*sda;
+				B += 4*sdb;
+				ii += 1;
+				}
+			else if(mna==2)
+				{
+				kernel_dgecp_2_0_lib4(1, ii, alpha, A+offA, B+offB);
+				A += 4*sda;
+				B += 4*sdb;
+				ii += 2;
+				}
+			else // if(mna==3)
+				{
+				kernel_dgecp_3_0_lib4(1, ii, alpha, A+offA, B+offB);
+				A += 4*sda;
+				B += 4*sdb;
+				ii += 3;
+				}
+			}
+		// main loop
+#if defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_HASWELL)
+		for(; ii<m-7; ii+=8)
+			{
+			kernel_dgecp_8_0_lib4(1, ii, alpha, A, sda, B, sdb);
+			A += 8*sda;
+			B += 8*sdb;
+			}
+#endif
+		for(; ii<m-3; ii+=4)
+			{
+			kernel_dgecp_4_0_lib4(1, ii, alpha, A, B);
+			A += 4*sda;
+			B += 4*sdb;
+			}
+		// clean up at the end
+		if(ii<m)
+			{
+			if(m-ii==1)
+				kernel_dgecp_1_0_lib4(1, ii, alpha, A, B);
+			else if(m-ii==2)
+				kernel_dgecp_2_0_lib4(1, ii, alpha, A, B);
+			else // if(m-ii==3)
+				kernel_dgecp_3_0_lib4(1, ii, alpha, A, B);
+			}
+		}
+	// skip one element of A
+	else if(offA==(offB+1)%bs)
+		{
+		ii = 0;
+		// clean up at the beginning
+		mna = (4-offB)%bs;
+		if(mna>0)
+			{
+			if(m<mna) // mna<=3  ==>  m = { 1, 2 }
+				{
+				if(m==1)
+					{
+					kernel_dgecp_1_0_lib4(1, ii, alpha, A+offA, B+offB);
+					return;
+					}
+				else //if(m==2 && mna==3)
+					{
+					kernel_dgecp_2_0_lib4(1, ii, alpha, A+offA, B+offB);
+					return;
+					}
+				}
+			if(mna==1)
+				{
+				kernel_dgecp_1_0_lib4(1, ii, alpha, A+offA, B+offB);
+				//A += 4*sda;
+				B += 4*sdb;
+				ii += 1;
+				}
+			else if(mna==2)
+				{
+				kernel_dgecp_2_3_lib4(1, ii, alpha, A, sda, B+2);
+				A += 4*sda;
+				B += 4*sdb;
+				ii += 2;
+				}
+			else // if(mna==3)
+				{
+				kernel_dgecp_3_2_lib4(1, ii, alpha, A, sda, B+1);
+				A += 4*sda;
+				B += 4*sdb;
+				ii += 3;
+				}
+			}
+		// main loop
+#if defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_HASWELL)
+		for( ; ii<m-7; ii+=8)
+			{
+			kernel_dgecp_8_1_lib4(1, ii, alpha, A, sda, B, sdb);
+			A += 8*sda;
+			B += 8*sdb;
+			}
+#endif
+		for( ; ii<m-3; ii+=4)
+			{
+			kernel_dgecp_4_1_lib4(1, ii, alpha, A, sda, B);
+			A += 4*sda;
+			B += 4*sdb;
+			}
+		// clean up at the end
+		if(ii<m)
+			{
+			if(m-ii==1)
+				kernel_dgecp_1_0_lib4(1, ii, alpha, A+1, B);
+			else if(m-ii==2)
+				kernel_dgecp_2_0_lib4(1, ii, alpha, A+1, B);
+			else // if(m-ii==3)
+				kernel_dgecp_3_0_lib4(1, ii, alpha, A+1, B);
+			}
+		}
+	// skip 2 elements of A
+	else if(offA==(offB+2)%bs)
+		{
+		ii = 0;
+		// clean up at the beginning
+		mna = (4-offB)%bs;
+		if(mna>0)
+			{
+			if(m<mna)
+				{
+				if(m==1)
+					{
+					kernel_dgecp_1_0_lib4(1, ii, alpha, A+offA, B+offB);
+					return;
+					}
+				else // if(m==2 && mna==3)
+					{
+					kernel_dgecp_2_3_lib4(1, ii, alpha, A, sda, B+1);
+					return;
+					}
+				}
+			if(mna==1)
+				{
+				kernel_dgecp_1_0_lib4(1, ii, alpha, A+1, B+3);
+				// A += 4*sda;
+				B += 4*sdb;
+				ii += 1;
+				}
+			else if(mna==2)
+				{
+				kernel_dgecp_2_0_lib4(1, ii, alpha, A, B+2);
+				// A += 4*sda;
+				B += 4*sdb;
+				ii += 2;
+				}
+			else // if(mna==3)
+				{
+				kernel_dgecp_3_3_lib4(1, ii, alpha, A, sda, B+1);
+				A += 4*sda;
+				B += 4*sdb;
+				ii += 3;
+				}
+			}
+		// main loop
+#if defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_HASWELL)
+		for(; ii<m-7; ii+=8)
+			{
+			kernel_dgecp_8_2_lib4(1, ii, alpha, A, sda, B, sdb);
+			A += 8*sda;
+			B += 8*sdb;
+			}
+#endif
+		for(; ii<m-3; ii+=4)
+			{
+			kernel_dgecp_4_2_lib4(1, ii, alpha, A, sda, B);
+			A += 4*sda;
+			B += 4*sdb;
+			}
+		// clean up at the end
+		if(ii<m)
+			{
+			if(m-ii==1)
+				kernel_dgecp_1_0_lib4(1, ii, alpha, A+2, B);
+			else if(m-ii==2)
+				kernel_dgecp_2_0_lib4(1, ii, alpha, A+2, B);
+			else // if(m-ii==3)
+				kernel_dgecp_3_2_lib4(1, ii, alpha, A, sda, B);
+			}
+		}
+	// skip 3 elements of A
+	else // if(offA==(offB+3)%bs)
+		{
+		ii = 0;
+		// clean up at the beginning
+		mna = (4-offB)%bs;
+		if(mna>0)
+			{
+			if(m<mna)
+				{
+				if(m==1)
+					{
+					kernel_dgecp_1_0_lib4(1, ii, alpha, A+offA, B+offB);
+					return;
+					}
+				else // if(m==2 && mna==3)
+					{
+					kernel_dgecp_2_0_lib4(1, ii, alpha, A+offA, B+offB);
+					return;
+					}
+				}
+			if(mna==1)
+				{
+				kernel_dgecp_1_0_lib4(1, ii, alpha, A+offA, B+offB);
+				// A += 4*sda;
+				B += 4*sdb;
+				ii += 1;
+				}
+			else if(mna==2)
+				{
+				kernel_dgecp_2_0_lib4(1, ii, alpha, A+offA, B+offB);
+				// A += 4*sda;
+				B += 4*sdb;
+				ii += 2;
+				}
+			else // if(mna==3)
+				{
+				kernel_dgecp_3_0_lib4(1, ii, alpha, A+offA, B+offB);
+				// A += 4*sda;
+				B += 4*sdb;
+				ii += 3;
+				}
+			}
+		// main loop
+#if defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_HASWELL)
+		for(; ii<m-7; ii+=8)
+			{
+			kernel_dgecp_8_3_lib4(1, ii, alpha, A, sda, B, sdb);
+			A += 8*sda;
+			B += 8*sdb;
+			}
+#endif
+		for(; ii<m-3; ii+=4)
+			{
+			kernel_dgecp_4_3_lib4(1, ii, alpha, A, sda, B);
+			A += 4*sda;
+			B += 4*sdb;
+			}
+		// clean up at the end
+		if(ii<m)
+			{
+			if(m-ii==1)
+				kernel_dgecp_1_0_lib4(1, ii, alpha, A+3, B);
+			else if(m-ii==2)
+				kernel_dgecp_2_3_lib4(1, ii, alpha, A, sda, B);
+			else // if(m-ii==3)
+				kernel_dgecp_3_3_lib4(1, ii, alpha, A, sda, B);
+			}
+		}
+
+	}
+
+
+
+// scales and adds a packed matrix into a packed matrix: B = B + alpha*A
+void dgead_lib(int m, int n, double alpha, int offsetA, double *A, int sda, int offsetB, double *B, int sdb)
+	{
+
+	if(m<=0 || n<=0)
+		return;
+
+	const int bs = 4;
+
+	int mna, ii;
+
+	int offA = offsetA%bs;
+	int offB = offsetB%bs;
+
+	// A at the beginning of the block
+	A -= offA;
+
+	// B at the beginning of the block
+	B -= offB;
+
+	// same alignment
+	if(offA==offB)
+		{
+		ii = 0;
+		// clean up at the beginning
+		mna = (4-offB)%bs;
+		if(mna>0)
+			{
+			if(m<mna) // mna<=3  ==>  m = { 1, 2 }
+				{
+				if(m==1)
+					{
+					kernel_dgead_1_0_lib4(n, alpha, A+offA, B+offB);
+					return;
+					}
+				else //if(m==2 && mna==3)
+					{
+					kernel_dgead_2_0_lib4(n, alpha, A+offA, B+offB);
+					return;
+					}
+				}
+			if(mna==1)
+				{
+				kernel_dgead_1_0_lib4(n, alpha, A+offA, B+offB);
+				A += 4*sda;
+				B += 4*sdb;
+				ii += 1;
+				}
+			else if(mna==2)
+				{
+				kernel_dgead_2_0_lib4(n, alpha, A+offA, B+offB);
+				A += 4*sda;
+				B += 4*sdb;
+				ii += 2;
+				}
+			else // if(mna==3)
+				{
+				kernel_dgead_3_0_lib4(n, alpha, A+offA, B+offB);
+				A += 4*sda;
+				B += 4*sdb;
+				ii += 3;
+				}
+			}
+		// main loop
+#if defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_HASWELL)
+		for(; ii<m-7; ii+=8)
+			{
+			kernel_dgead_8_0_lib4(n, alpha, A, sda, B, sdb);
+			A += 8*sda;
+			B += 8*sdb;
+			}
+#endif
+		for(; ii<m-3; ii+=4)
+			{
+			kernel_dgead_4_0_lib4(n, alpha, A, B);
+			A += 4*sda;
+			B += 4*sdb;
+			}
+		// clean up at the end
+		if(ii<m)
+			{
+			if(m-ii==1)
+				kernel_dgead_1_0_lib4(n, alpha, A, B);
+			else if(m-ii==2)
+				kernel_dgead_2_0_lib4(n, alpha, A, B);
+			else // if(m-ii==3)
+				kernel_dgead_3_0_lib4(n, alpha, A, B);
+			}
+		}
+	// skip one element of A
+	else if(offA==(offB+1)%bs)
+		{
+		ii = 0;
+		// clean up at the beginning
+		mna = (4-offB)%bs;
+		if(mna>0)
+			{
+			if(m<mna) // mna<=3  ==>  m = { 1, 2 }
+				{
+				if(m==1)
+					{
+					kernel_dgead_1_0_lib4(n, alpha, A+offA, B+offB);
+					return;
+					}
+				else //if(m==2 && mna==3)
+					{
+					kernel_dgead_2_0_lib4(n, alpha, A+offA, B+offB);
+					return;
+					}
+				}
+			if(mna==1)
+				{
+				kernel_dgead_1_0_lib4(n, alpha, A+offA, B+offB);
+				//A += 4*sda;
+				B += 4*sdb;
+				ii += 1;
+				}
+			else if(mna==2)
+				{
+				kernel_dgead_2_3_lib4(n, alpha, A, sda, B+2);
+				A += 4*sda;
+				B += 4*sdb;
+				ii += 2;
+				}
+			else // if(mna==3)
+				{
+				kernel_dgead_3_2_lib4(n, alpha, A, sda, B+1);
+				A += 4*sda;
+				B += 4*sdb;
+				ii += 3;
+				}
+			}
+		// main loop
+#if defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_HASWELL)
+		for( ; ii<m-7; ii+=8)
+			{
+			kernel_dgead_8_1_lib4(n, alpha, A, sda, B, sdb);
+			A += 8*sda;
+			B += 8*sdb;
+			}
+#endif
+		for( ; ii<m-3; ii+=4)
+			{
+			kernel_dgead_4_1_lib4(n, alpha, A, sda, B);
+			A += 4*sda;
+			B += 4*sdb;
+			}
+		// clean up at the end
+		if(ii<m)
+			{
+			if(m-ii==1)
+				kernel_dgead_1_0_lib4(n, alpha, A+1, B);
+			else if(m-ii==2)
+				kernel_dgead_2_0_lib4(n, alpha, A+1, B);
+			else // if(m-ii==3)
+				kernel_dgead_3_0_lib4(n, alpha, A+1, B);
+			}
+		}
+	// skip 2 elements of A
+	else if(offA==(offB+2)%bs)
+		{
+		ii = 0;
+		// clean up at the beginning
+		mna = (4-offB)%bs;
+		if(mna>0)
+			{
+			if(m<mna)
+				{
+				if(m==1)
+					{
+					kernel_dgead_1_0_lib4(n, alpha, A+offA, B+offB);
+					return;
+					}
+				else // if(m==2 && mna==3)
+					{
+					kernel_dgead_2_3_lib4(n, alpha, A, sda, B+1);
+					return;
+					}
+				}
+			if(mna==1)
+				{
+				kernel_dgead_1_0_lib4(n, alpha, A+1, B+3);
+				// A += 4*sda;
+				B += 4*sdb;
+				ii += 1;
+				}
+			else if(mna==2)
+				{
+				kernel_dgead_2_0_lib4(n, alpha, A, B+2);
+				// A += 4*sda;
+				B += 4*sdb;
+				ii += 2;
+				}
+			else // if(mna==3)
+				{
+				kernel_dgead_3_3_lib4(n, alpha, A, sda, B+1);
+				A += 4*sda;
+				B += 4*sdb;
+				ii += 3;
+				}
+			}
+		// main loop
+#if defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_HASWELL)
+		for(; ii<m-7; ii+=8)
+			{
+			kernel_dgead_8_2_lib4(n, alpha, A, sda, B, sdb);
+			A += 8*sda;
+			B += 8*sdb;
+			}
+#endif
+		for(; ii<m-3; ii+=4)
+			{
+			kernel_dgead_4_2_lib4(n, alpha, A, sda, B);
+			A += 4*sda;
+			B += 4*sdb;
+			}
+		// clean up at the end
+		if(ii<m)
+			{
+			if(m-ii==1)
+				kernel_dgead_1_0_lib4(n, alpha, A+2, B);
+			else if(m-ii==2)
+				kernel_dgead_2_0_lib4(n, alpha, A+2, B);
+			else // if(m-ii==3)
+				kernel_dgead_3_2_lib4(n, alpha, A, sda, B);
+			}
+		}
+	// skip 3 elements of A
+	else // if(offA==(offB+3)%bs)
+		{
+		ii = 0;
+		// clean up at the beginning
+		mna = (4-offB)%bs;
+		if(mna>0)
+			{
+			if(m<mna)
+				{
+				if(m==1)
+					{
+					kernel_dgead_1_0_lib4(n, alpha, A+offA, B+offB);
+					return;
+					}
+				else // if(m==2 && mna==3)
+					{
+					kernel_dgead_2_0_lib4(n, alpha, A+offA, B+offB);
+					return;
+					}
+				}
+			if(mna==1)
+				{
+				kernel_dgead_1_0_lib4(n, alpha, A+offA, B+offB);
+				// A += 4*sda;
+				B += 4*sdb;
+				ii += 1;
+				}
+			else if(mna==2)
+				{
+				kernel_dgead_2_0_lib4(n, alpha, A+offA, B+offB);
+				// A += 4*sda;
+				B += 4*sdb;
+				ii += 2;
+				}
+			else // if(mna==3)
+				{
+				kernel_dgead_3_0_lib4(n, alpha, A+offA, B+offB);
+				// A += 4*sda;
+				B += 4*sdb;
+				ii += 3;
+				}
+			}
+		// main loop
+#if defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_HASWELL)
+		for(; ii<m-7; ii+=8)
+			{
+			kernel_dgead_8_3_lib4(n, alpha, A, sda, B, sdb);
+			A += 8*sda;
+			B += 8*sdb;
+			}
+#endif
+		for(; ii<m-3; ii+=4)
+			{
+			kernel_dgead_4_3_lib4(n, alpha, A, sda, B);
+			A += 4*sda;
+			B += 4*sdb;
+			}
+		// clean up at the end
+		if(ii<m)
+			{
+			if(m-ii==1)
+				kernel_dgead_1_0_lib4(n, alpha, A+3, B);
+			else if(m-ii==2)
+				kernel_dgead_2_3_lib4(n, alpha, A, sda, B);
+			else // if(m-ii==3)
+				kernel_dgead_3_3_lib4(n, alpha, A, sda, B);
+			}
+		}
+
+	}
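+
+// (all four offset cases above share the same structure: a short head loop
+// realigns B to a panel boundary, an 8-row kernel then covers the aligned
+// body on the AVX targets, the 4-row kernel covers the rest, and the
+// 1/2/3-row kernels handle the remaining rows at the end)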
+
+
+
+// scale and add a strvec into another strvec
+void dvecad_libstr(int m, double alpha, struct d_strvec *sa, int ai, struct d_strvec *sc, int ci)
+	{
+	double *pa = sa->pa + ai;
+	double *pc = sc->pa + ci;
+	int ii;
+	ii = 0;
+	for(; ii<m-3; ii+=4)
+		{
+		pc[ii+0] += alpha*pa[ii+0];
+		pc[ii+1] += alpha*pa[ii+1];
+		pc[ii+2] += alpha*pa[ii+2];
+		pc[ii+3] += alpha*pa[ii+3];
+		}
+	for(; ii<m; ii++)
+		{
+		pc[ii+0] += alpha*pa[ii+0];
+		}
+	return;
+	}
+
+
+
+// transpose general matrix; m and n refer to the original matrix
+void dgetr_lib(int m, int n, double alpha, int offsetA, double *pA, int sda, int offsetC, double *pC, int sdc)
+	{
+
+/*
+
+m = 5
+n = 3
+offsetA = 1
+offsetC = 2
+
+A =
+ x x x
+ -
+ x x x
+ x x x
+ x x x
+ x x x
+
+C =
+ x x x x x
+ x x x x x
+ -
+ x x x x x
+
+*/
+
+	if(m<=0 || n<=0)
+		return;
+
+	const int bs = 4;
+
+	int mna = (bs-offsetA%bs)%bs;
+	mna = m<mna ? m : mna;
+	int nna = (bs-offsetC%bs)%bs;
+	nna = n<nna ? n : nna;
+
+	int ii;
+
+	ii = 0;
+
+	if(mna>0)
+		{
+		if(mna==1)
+			kernel_dgetr_1_lib4(0, n, nna, alpha, pA, pC, sdc);
+		else if(mna==2)
+			kernel_dgetr_2_lib4(0, n, nna, alpha, pA, pC, sdc);
+		else //if(mna==3)
+			kernel_dgetr_3_lib4(0, n, nna, alpha, pA, pC, sdc);
+		ii += mna;
+		pA += mna + bs*(sda-1);
+		pC += mna*bs;
+		}
+#if defined(TARGET_X64_INTEL_HASWELL)
+	for( ; ii<m-7; ii+=8)
+		{
+		kernel_dgetr_8_lib4(0, n, nna, alpha, pA, sda, pC, sdc);
+		pA += 2*bs*sda;
+		pC += 2*bs*bs;
+		}
+#endif
+	for( ; ii<m-3; ii+=4)
+//	for( ; ii<m; ii+=4)
+		{
+		kernel_dgetr_4_lib4(0, n, nna, alpha, pA, pC, sdc);
+		pA += bs*sda;
+		pC += bs*bs;
+		}
+
+	// clean-up at the end using smaller kernels
+	if(ii==m)
+		return;
+
+	if(m-ii==1)
+		kernel_dgetr_1_lib4(0, n, nna, alpha, pA, pC, sdc);
+	else if(m-ii==2)
+		kernel_dgetr_2_lib4(0, n, nna, alpha, pA, pC, sdc);
+	else if(m-ii==3)
+		kernel_dgetr_3_lib4(0, n, nna, alpha, pA, pC, sdc);
+
+	return;
+
+	}
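+
+// (mna, the misalignment of A, is consumed up front by the 1/2/3-row
+// kernels; nna, the misalignment of C, is instead forwarded to every kernel
+// call so that each transposed row can be split at the panel boundary of C)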
+
+
+
+// transpose lower triangular matrix
+void dtrtr_l_lib(int m, double alpha, int offsetA, double *pA, int sda, int offsetC, double *pC, int sdc)
+	{
+
+/*
+
+A =
+ x
+ x x
+ x x x
+ x x x x
+
+ x x x x x
+ x x x x x x
+ x x x x x x x
+ x x x x x x x x
+
+C =
+ x x x x x x x x
+
+   x x x x x x x
+     x x x x x x
+       x x x x x
+         x x x x
+
+           x x x
+             x x
+               x
+
+*/
+
+	int n = m;
+
+	if(m<=0 || n<=0)
+		return;
+
+	const int bs = 4;
+
+	int mna = (bs-offsetA%bs)%bs;
+	mna = m<mna ? m : mna;
+	int nna = (bs-offsetC%bs)%bs;
+	nna = n<nna ? n : nna;
+
+	int ii;
+
+	ii = 0;
+
+	if(mna>0)
+		{
+		if(mna==1)
+			{
+			pC[0] = alpha * pA[0];
+			}
+		else if(mna==2)
+			{
+			if(nna==1)
+				{
+				pC[0+bs*0] = alpha * pA[0+bs*0];
+				pC[0+bs*1] = alpha * pA[1+bs*0];
+				pC[1+bs*(0+sdc)] = alpha * pA[1+bs*1];
+				}
+			else
+				{
+				pC[0+bs*0] = alpha * pA[0+bs*0];
+				pC[0+bs*1] = alpha * pA[1+bs*0];
+				pC[1+bs*1] = alpha * pA[1+bs*1];
+				}
+			}
+		else //if(mna==3)
+			{
+			if(nna==1)
+				{
+				pC[0+bs*0] = alpha * pA[0+bs*0];
+				pC[0+bs*1] = alpha * pA[1+bs*0];
+				pC[0+bs*2] = alpha * pA[2+bs*0];
+				pC[1+bs*(0+sdc)] = alpha * pA[1+bs*1];
+				pC[1+bs*(1+sdc)] = alpha * pA[2+bs*1];
+				pC[2+bs*(1+sdc)] = alpha * pA[2+bs*2];
+				}
+			else if(nna==2)
+				{
+				pC[0+bs*0] = alpha * pA[0+bs*0];
+				pC[0+bs*1] = alpha * pA[1+bs*0];
+				pC[0+bs*2] = alpha * pA[2+bs*0];
+				pC[1+bs*1] = alpha * pA[1+bs*1];
+				pC[1+bs*2] = alpha * pA[2+bs*1];
+				pC[2+bs*(1+sdc)] = alpha * pA[2+bs*2];
+				}
+			else
+				{
+				pC[0+bs*0] = alpha * pA[0+bs*0];
+				pC[0+bs*1] = alpha * pA[1+bs*0];
+				pC[0+bs*2] = alpha * pA[2+bs*0];
+				pC[1+bs*1] = alpha * pA[1+bs*1];
+				pC[1+bs*2] = alpha * pA[2+bs*1];
+				pC[2+bs*2] = alpha * pA[2+bs*2];
+				}
+			}
+		ii += mna;
+		pA += mna + bs*(sda-1);
+		pC += mna*bs;
+		}
+#if 0 //defined(TARGET_X64_INTEL_HASWELL)
+	for( ; ii<m-7; ii+=8)
+		{
+		kernel_dgetr_8_lib4(1, n, nna, alpha, pA, sda, pC, sdc);
+		pA += 2*bs*sda;
+		pC += 2*bs*bs;
+		}
+#endif
+	for( ; ii<m-3; ii+=4)
+		{
+		kernel_dgetr_4_lib4(1, ii, nna, alpha, pA, pC, sdc);
+		pA += bs*sda;
+		pC += bs*bs;
+		}
+
+	// clean-up at the end using smaller kernels
+	if(ii==m)
+		return;
+
+	if(m-ii==1)
+		kernel_dgetr_1_lib4(1, ii, nna, alpha, pA, pC, sdc);
+	else if(m-ii==2)
+		kernel_dgetr_2_lib4(1, ii, nna, alpha, pA, pC, sdc);
+	else if(m-ii==3)
+		kernel_dgetr_3_lib4(1, ii, nna, alpha, pA, pC, sdc);
+
+	return;
+
+	}
+
+
+
+// transpose an aligned upper triangular matrix into an aligned lower triangular matrix
+void dtrtr_u_lib(int m, double alpha, int offsetA, double *pA, int sda, int offsetC, double *pC, int sdc)
+	{
+
+/*
+
+A =
+ x x x x x x x x
+   x x x x x x x
+
+     x x x x x x
+       x x x x x
+         x x x x
+           x x x
+             x x
+               x
+
+C =
+ x
+
+ x x
+ x x x
+ x x x x
+ x x x x x
+ x x x x x x
+ x x x x x x x
+ x x x x x x x x
+
+*/
+
+	int n = m;
+
+	if(m<=0 || n<=0)
+		return;
+
+	const int bs = 4;
+
+	int mna = (bs-offsetA%bs)%bs;
+	mna = m<mna ? m : mna;
+	int nna = (bs-offsetC%bs)%bs;
+	nna = n<nna ? n : nna;
+	int tna = nna;
+
+	int ii;
+
+	ii = 0;
+
+	if(mna>0)
+		{
+		if(mna==1)
+			{
+			kernel_dgetr_1_lib4(0, n, nna, alpha, pA, pC, sdc);
+			if(nna!=1)
+				{
+//				pC[0+bs*0] = alpha * pA[0+bs*0];
+				pA += 1*bs;
+				pC += 1;
+				tna = (bs-(offsetC+1)%bs)%bs;
+				}
+			else //if(nna==1)
+				{
+//				pC[0+bs*0] = alpha * pA[0+bs*0];
+				pA += 1*bs;
+				pC += 1 + (sdc-1)*bs;
+				tna = 0; //(bs-(offsetC+1)%bs)%bs;
+				}
+//			kernel_dgetr_1_lib4(0, n-1, tna, alpha, pA, pC, sdc);
+			}
+		else if(mna==2)
+			{
+			if(nna==0 || nna==3)
+				{
+				pC[0+bs*0] = alpha * pA[0+bs*0];
+				pC[1+bs*0] = alpha * pA[0+bs*1];
+				pC[1+bs*1] = alpha * pA[1+bs*1];
+				pA += 2*bs;
+				pC += 2;
+				tna = (bs-(offsetC+2)%bs)%bs;
+				kernel_dgetr_2_lib4(0, n-2, tna, alpha, pA, pC, sdc);
+				}
+			else if(nna==1)
+				{
+				pC[0+bs*0] = alpha * pA[0+bs*0];
+				pA += 1*bs;
+				pC += 1 + (sdc-1)*bs;
+//				pC[0+bs*0] = alpha * pA[0+bs*0];
+//				pC[0+bs*1] = alpha * pA[1+bs*0];
+				kernel_dgetr_2_lib4(0, n-1, 0, alpha, pA, pC, sdc);
+				pA += 1*bs;
+				pC += 1;
+				tna = 3; //(bs-(offsetC+2)%bs)%bs;
+//				kernel_dgetr_2_lib4(0, n-2, tna, alpha, pA, pC, sdc);
+				}
+			else if(nna==2)
+				{
+				pC[0+bs*0] = alpha * pA[0+bs*0];
+				pC[1+bs*0] = alpha * pA[0+bs*1];
+				pC[1+bs*1] = alpha * pA[1+bs*1];
+				pA += 2*bs;
+				pC += 2 + (sdc-1)*bs;
+				tna = 0; //(bs-(offsetC+2)%bs)%bs;
+				kernel_dgetr_2_lib4(0, n-2, tna, alpha, pA, pC, sdc);
+				}
+			}
+		else //if(mna==3)
+			{
+			if(nna==0)
+				{
+				pC[0+bs*0] = alpha * pA[0+bs*0];
+				pC[1+bs*0] = alpha * pA[0+bs*1];
+				pC[1+bs*1] = alpha * pA[1+bs*1];
+				pC[2+bs*0] = alpha * pA[0+bs*2];
+				pC[2+bs*1] = alpha * pA[1+bs*2];
+				pC[2+bs*2] = alpha * pA[2+bs*2];
+				pA += 3*bs;
+				pC += 3;
+				tna = 1;
+				kernel_dgetr_3_lib4(0, n-3, tna, alpha, pA, pC, sdc);
+				}
+			else if(nna==1)
+				{
+				pC[0+bs*0] = alpha * pA[0+bs*0];
+				pA += bs;
+				pC += 1 + (sdc-1)*bs;
+				pC[0+bs*0] = alpha * pA[0+bs*0];
+				pC[0+bs*1] = alpha * pA[1+bs*0];
+				pC[1+bs*0] = alpha * pA[0+bs*1];
+				pC[1+bs*1] = alpha * pA[1+bs*1];
+				pC[1+bs*2] = alpha * pA[2+bs*1];
+				pA += 2*bs;
+				pC += 2;
+				tna = 2;
+				kernel_dgetr_3_lib4(0, n-3, tna, alpha, pA, pC, sdc);
+				}
+			else if(nna==2)
+				{
+				pC[0+bs*0] = alpha * pA[0+bs*0];
+				pC[1+bs*0] = alpha * pA[0+bs*1];
+				pC[1+bs*1] = alpha * pA[1+bs*1];
+				pA += 2*bs;
+				pC += 2 + (sdc-1)*bs;
+//				pC[0+bs*0] = alpha * pA[0+bs*0];
+//				pC[0+bs*1] = alpha * pA[1+bs*0];
+//				pC[0+bs*2] = alpha * pA[2+bs*0];
+				kernel_dgetr_3_lib4(0, n-2, 0, alpha, pA, pC, sdc);
+				pA += 1*bs;
+				pC += 1;
+				tna = 3;
+//				kernel_dgetr_3_lib4(0, n-3, tna, alpha, pA, pC, sdc);
+				}
+			else //if(nna==3)
+				{
+				pC[0+bs*0] = alpha * pA[0+bs*0];
+				pC[1+bs*0] = alpha * pA[0+bs*1];
+				pC[1+bs*1] = alpha * pA[1+bs*1];
+				pC[2+bs*0] = alpha * pA[0+bs*2];
+				pC[2+bs*1] = alpha * pA[1+bs*2];
+				pC[2+bs*2] = alpha * pA[2+bs*2];
+				pA += 3*bs;
+				pC += 3 + (sdc-1)*bs;
+				tna = 0;
+				kernel_dgetr_3_lib4(0, n-3, tna, alpha, pA, pC, sdc);
+				}
+			}
+		ii += mna;
+		pA += mna + bs*(sda-1);
+		pC += mna*bs;
+		}
+#if 0 //defined(TARGET_X64_AVX2)
+	for( ; ii<m-7; ii+=8)
+		{
+		kernel_dgetr_8_lib4(0, n, nna, alpha, pA, sda, pC, sdc);
+		pA += 2*bs*sda;
+		pC += 2*bs*bs;
+		}
+#endif
+	for( ; ii<m-3; ii+=4)
+		{
+		if(tna==0)
+			{
+			pC[0+bs*0] = alpha * pA[0+bs*0];
+			pC[1+bs*0] = alpha * pA[0+bs*1];
+			pC[1+bs*1] = alpha * pA[1+bs*1];
+			pC[2+bs*0] = alpha * pA[0+bs*2];
+			pC[2+bs*1] = alpha * pA[1+bs*2];
+			pC[2+bs*2] = alpha * pA[2+bs*2];
+			pC[3+bs*0] = alpha * pA[0+bs*3];
+			pC[3+bs*1] = alpha * pA[1+bs*3];
+			pC[3+bs*2] = alpha * pA[2+bs*3];
+			pC[3+bs*3] = alpha * pA[3+bs*3];
+			pA += 4*bs;
+			pC += sdc*bs;
+			kernel_dgetr_4_lib4(0, n-ii-4, 0, alpha, pA, pC, sdc);
+			}
+		else if(tna==1)
+			{
+			pC[0+bs*0] = alpha * pA[0+bs*0];
+			pA += bs;
+			pC += 1 + (sdc-1)*bs;
+			pC[0+bs*0] = alpha * pA[0+bs*0];
+			pC[0+bs*1] = alpha * pA[1+bs*0];
+			pC[1+bs*0] = alpha * pA[0+bs*1];
+			pC[1+bs*1] = alpha * pA[1+bs*1];
+			pC[1+bs*2] = alpha * pA[2+bs*1];
+			pC[2+bs*0] = alpha * pA[0+bs*2];
+			pC[2+bs*1] = alpha * pA[1+bs*2];
+			pC[2+bs*2] = alpha * pA[2+bs*2];
+			pC[2+bs*3] = alpha * pA[3+bs*2];
+			pA += 3*bs;
+			pC += 3;
+			kernel_dgetr_4_lib4(0, n-ii-4, 1, alpha, pA, pC, sdc);
+			}
+		else if(tna==2)
+			{
+			pC[0+bs*0] = alpha * pA[0+bs*0];
+			pC[1+bs*0] = alpha * pA[0+bs*1];
+			pC[1+bs*1] = alpha * pA[1+bs*1];
+			pA += 2*bs;
+			pC += 2 + (sdc-1)*bs;
+			pC[0+bs*0] = alpha * pA[0+bs*0];
+			pC[0+bs*1] = alpha * pA[1+bs*0];
+			pC[0+bs*2] = alpha * pA[2+bs*0];
+			pC[1+bs*0] = alpha * pA[0+bs*1];
+			pC[1+bs*1] = alpha * pA[1+bs*1];
+			pC[1+bs*2] = alpha * pA[2+bs*1];
+			pC[1+bs*3] = alpha * pA[3+bs*1];
+			pA += 2*bs;
+			pC += 2;
+			kernel_dgetr_4_lib4(0, n-ii-4, 2, alpha, pA, pC, sdc);
+			}
+		else //if(tna==3)
+			{
+			pC[0+bs*0] = alpha * pA[0+bs*0];
+			pC[1+bs*0] = alpha * pA[0+bs*1];
+			pC[1+bs*1] = alpha * pA[1+bs*1];
+			pC[2+bs*0] = alpha * pA[0+bs*2];
+			pC[2+bs*1] = alpha * pA[1+bs*2];
+			pC[2+bs*2] = alpha * pA[2+bs*2];
+			pA += 3*bs;
+			pC += 3 + (sdc-1)*bs;
+			kernel_dgetr_4_lib4(0, n-ii-3, 0, alpha, pA, pC, sdc);
+//			pC[0+bs*0] = alpha * pA[0+bs*0];
+//			pC[0+bs*1] = alpha * pA[1+bs*0];
+//			pC[0+bs*2] = alpha * pA[2+bs*0];
+//			pC[0+bs*3] = alpha * pA[3+bs*0];
+			pA += bs;
+			pC += 1;
+//			kernel_dgetr_4_lib4(0, n-ii-4, tna, alpha, pA, pC, sdc);
+			}
+		pA += bs*sda;
+		pC += bs*bs;
+		}
+
+	// clean-up at the end
+	if(ii==m)
+		return;
+
+	if(m-ii==1)
+		{
+		pC[0+bs*0] = alpha * pA[0+bs*0];
+		}
+	else if(m-ii==2)
+		{
+		if(tna!=1)
+			{
+			pC[0+bs*0] = alpha * pA[0+bs*0];
+			pC[1+bs*0] = alpha * pA[0+bs*1];
+			pC[1+bs*1] = alpha * pA[1+bs*1];
+			}
+		else //if(tna==1)
+			{
+			pC[0+bs*0] = alpha * pA[0+bs*0];
+			pA += bs;
+			pC += 1 + (sdc-1)*bs;
+			pC[0+bs*0] = alpha * pA[0+bs*0];
+			pC[0+bs*1] = alpha * pA[1+bs*0];
+			}
+		}
+	else if(m-ii==3)
+		{
+		if(tna==0 || tna==3)
+			{
+			pC[0+bs*0] = alpha * pA[0+bs*0];
+			pC[1+bs*0] = alpha * pA[0+bs*1];
+			pC[1+bs*1] = alpha * pA[1+bs*1];
+			pC[2+bs*0] = alpha * pA[0+bs*2];
+			pC[2+bs*1] = alpha * pA[1+bs*2];
+			pC[2+bs*2] = alpha * pA[2+bs*2];
+			}
+		else if(tna==1)
+			{
+			pC[0+bs*0] = alpha * pA[0+bs*0];
+			pA += bs;
+			pC += 1 + (sdc-1)*bs;
+			pC[0+bs*0] = alpha * pA[0+bs*0];
+			pC[0+bs*1] = alpha * pA[1+bs*0];
+			pC[1+bs*0] = alpha * pA[0+bs*1];
+			pC[1+bs*1] = alpha * pA[1+bs*1];
+			pC[1+bs*2] = alpha * pA[2+bs*1];
+			}
+		else //if(tna==2)
+			{
+			pC[0+bs*0] = alpha * pA[0+bs*0];
+			pC[1+bs*0] = alpha * pA[0+bs*1];
+			pC[1+bs*1] = alpha * pA[1+bs*1];
+			pA += 2*bs;
+			pC += 2 + (sdc-1)*bs;
+			pC[0+bs*0] = alpha * pA[0+bs*0];
+			pC[0+bs*1] = alpha * pA[1+bs*0];
+			pC[0+bs*2] = alpha * pA[2+bs*0];
+			}
+		}
+
+	return;
+
+	}
+
+
+
+// regularize diagonal
+void ddiareg_lib(int kmax, double reg, int offset, double *pD, int sdd)
+	{
+
+	const int bs = 4;
+
+	int kna = (bs-offset%bs)%bs;
+	kna = kmax<kna ? kmax : kna;
+
+	int jj, ll;
+
+	if(kna>0)
+		{
+		for(ll=0; ll<kna; ll++)
+			{
+			pD[ll+bs*ll] += reg;
+			}
+		pD += kna + bs*(sdd-1) + kna*bs;
+		kmax -= kna;
+		}
+	for(jj=0; jj<kmax-3; jj+=4)
+		{
+		pD[jj*sdd+(jj+0)*bs+0] += reg;
+		pD[jj*sdd+(jj+1)*bs+1] += reg;
+		pD[jj*sdd+(jj+2)*bs+2] += reg;
+		pD[jj*sdd+(jj+3)*bs+3] += reg;
+		}
+	for(ll=0; ll<kmax-jj; ll++)
+		{
+		pD[jj*sdd+(jj+ll)*bs+ll] += reg;
+		}
+
+	}
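+
+// (consecutive diagonal elements within a panel sit bs+1 doubles apart, one
+// row down plus one bs-wide column to the right, which is what the
+// pD[jj*sdd+(jj+k)*bs+k] indexing in the unrolled loop above encodes)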
+
+
+
+// insert sqrt of vector to diagonal
+void ddiain_sqrt_lib(int kmax, double *x, int offset, double *pD, int sdd)
+	{
+
+	const int bs = 4;
+
+	int kna = (bs-offset%bs)%bs;
+	kna = kmax<kna ? kmax : kna;
+
+	int jj, ll;
+
+	if(kna>0)
+		{
+		for(ll=0; ll<kna; ll++)
+			{
+			pD[ll+bs*ll] = sqrt(x[ll]);
+			}
+		pD += kna + bs*(sdd-1) + kna*bs;
+		x  += kna;
+		kmax -= kna;
+		}
+	for(jj=0; jj<kmax-3; jj+=4)
+		{
+		pD[jj*sdd+(jj+0)*bs+0] = sqrt(x[jj+0]);
+		pD[jj*sdd+(jj+1)*bs+1] = sqrt(x[jj+1]);
+		pD[jj*sdd+(jj+2)*bs+2] = sqrt(x[jj+2]);
+		pD[jj*sdd+(jj+3)*bs+3] = sqrt(x[jj+3]);
+		}
+	for(ll=0; ll<kmax-jj; ll++)
+		{
+		pD[jj*sdd+(jj+ll)*bs+ll] = sqrt(x[jj+ll]);
+		}
+
+	}
+
+
+
+// extract diagonal to vector
+void ddiaex_lib(int kmax, double alpha, int offset, double *pD, int sdd, double *x)
+	{
+
+	const int bs = 4;
+
+	int kna = (bs-offset%bs)%bs;
+	kna = kmax<kna ? kmax : kna;
+
+	int jj, ll;
+
+	if(kna>0)
+		{
+		for(ll=0; ll<kna; ll++)
+			{
+			x[ll] = alpha * pD[ll+bs*ll];
+			}
+		pD += kna + bs*(sdd-1) + kna*bs;
+		x  += kna;
+		kmax -= kna;
+		}
+	for(jj=0; jj<kmax-3; jj+=4)
+		{
+		x[jj+0] = alpha * pD[jj*sdd+(jj+0)*bs+0];
+		x[jj+1] = alpha * pD[jj*sdd+(jj+1)*bs+1];
+		x[jj+2] = alpha * pD[jj*sdd+(jj+2)*bs+2];
+		x[jj+3] = alpha * pD[jj*sdd+(jj+3)*bs+3];
+		}
+	for(ll=0; ll<kmax-jj; ll++)
+		{
+		x[jj+ll] = alpha * pD[jj*sdd+(jj+ll)*bs+ll];
+		}
+
+	}
+
+
+
+// add scaled vector to diagonal
+void ddiaad_lib(int kmax, double alpha, double *x, int offset, double *pD, int sdd)
+	{
+
+	const int bs = 4;
+
+	int kna = (bs-offset%bs)%bs;
+	kna = kmax<kna ? kmax : kna;
+
+	int jj, ll;
+
+	if(kna>0)
+		{
+		for(ll=0; ll<kna; ll++)
+			{
+			pD[ll+bs*ll] += alpha * x[ll];
+			}
+		pD += kna + bs*(sdd-1) + kna*bs;
+		x  += kna;
+		kmax -= kna;
+		}
+	for(jj=0; jj<kmax-3; jj+=4)
+		{
+		pD[jj*sdd+(jj+0)*bs+0] += alpha * x[jj+0];
+		pD[jj*sdd+(jj+1)*bs+1] += alpha * x[jj+1];
+		pD[jj*sdd+(jj+2)*bs+2] += alpha * x[jj+2];
+		pD[jj*sdd+(jj+3)*bs+3] += alpha * x[jj+3];
+		}
+	for(ll=0; ll<kmax-jj; ll++)
+		{
+		pD[jj*sdd+(jj+ll)*bs+ll] += alpha * x[jj+ll];
+		}
+
+	}
+
+
+
+// insert vector to diagonal, sparse formulation
+void ddiain_libsp(int kmax, int *idx, double alpha, double *x, double *pD, int sdd)
+	{
+
+	const int bs = 4;
+
+	int ii, jj;
+
+	for(jj=0; jj<kmax; jj++)
+		{
+		ii = idx[jj];
+		pD[ii/bs*bs*sdd+ii%bs+ii*bs] = alpha * x[jj];
+		}
+
+	}
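+
+// (pD[ii/bs*bs*sdd+ii%bs+ii*bs] addresses element (ii,ii) in panel-major
+// storage: start of the panel holding row ii, plus the row within the
+// panel, plus the column offset)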
+
+
+
+// extract diagonal to vector, sparse formulation
+void ddiaex_libsp(int kmax, int *idx, double alpha, double *pD, int sdd, double *x)
+	{
+
+	const int bs = 4;
+
+	int ii, jj;
+
+	for(jj=0; jj<kmax; jj++)
+		{
+		ii = idx[jj];
+		x[jj] = alpha * pD[ii/bs*bs*sdd+ii%bs+ii*bs];
+		}
+
+	}
+
+
+
+// add scaled vector to diagonal, sparse formulation
+void ddiaad_libsp(int kmax, int *idx, double alpha, double *x, double *pD, int sdd)
+	{
+
+	const int bs = 4;
+
+	int ii, jj;
+
+	for(jj=0; jj<kmax; jj++)
+		{
+		ii = idx[jj];
+		pD[ii/bs*bs*sdd+ii%bs+ii*bs] += alpha * x[jj];
+		}
+
+	}
+
+
+
+// add scaled vector to another vector and insert to diagonal, sparse formulation
+void ddiaadin_libsp(int kmax, int *idx, double alpha, double *x, double *y, double *pD, int sdd)
+	{
+
+	const int bs = 4;
+
+	int ii, jj;
+
+	for(jj=0; jj<kmax; jj++)
+		{
+		ii = idx[jj];
+		pD[ii/bs*bs*sdd+ii%bs+ii*bs] = y[jj] + alpha * x[jj];
+		}
+
+	}
+
+
+
+// insert vector to row
+void drowin_lib(int kmax, double alpha, double *x, double *pD)
+	{
+
+	const int bs = 4;
+
+	int jj, ll;
+
+	for(jj=0; jj<kmax-3; jj+=4)
+		{
+		pD[(jj+0)*bs] = alpha*x[jj+0];
+		pD[(jj+1)*bs] = alpha*x[jj+1];
+		pD[(jj+2)*bs] = alpha*x[jj+2];
+		pD[(jj+3)*bs] = alpha*x[jj+3];
+		}
+	for(; jj<kmax; jj++)
+		{
+		pD[(jj)*bs] = alpha*x[jj];
+		}
+
+	}
+
+
+
+// extract row to vector
+void drowex_lib(int kmax, double alpha, double *pD, double *x)
+	{
+
+	const int bs = 4;
+
+	int jj, ll;
+
+	for(jj=0; jj<kmax-3; jj+=4)
+		{
+		x[jj+0] = alpha*pD[(jj+0)*bs];
+		x[jj+1] = alpha*pD[(jj+1)*bs];
+		x[jj+2] = alpha*pD[(jj+2)*bs];
+		x[jj+3] = alpha*pD[(jj+3)*bs];
+		}
+	for(; jj<kmax; jj++)
+		{
+		x[jj] = alpha*pD[(jj)*bs];
+		}
+
+	}
+
+
+
+// add scaled vector to row
+void drowad_lib(int kmax, double alpha, double *x, double *pD)
+	{
+
+	const int bs = 4;
+
+	int jj, ll;
+
+	for(jj=0; jj<kmax-3; jj+=4)
+		{
+		pD[(jj+0)*bs] += alpha * x[jj+0];
+		pD[(jj+1)*bs] += alpha * x[jj+1];
+		pD[(jj+2)*bs] += alpha * x[jj+2];
+		pD[(jj+3)*bs] += alpha * x[jj+3];
+		}
+	for(; jj<kmax; jj++)
+		{
+		pD[(jj)*bs] += alpha * x[jj];
+		}
+
+	}
+
+
+
+// insert vector to row, sparse formulation
+void drowin_libsp(int kmax, double alpha, int *idx, double *x, double *pD)
+	{
+
+	const int bs = 4;
+
+	int ii, jj;
+
+	for(jj=0; jj<kmax; jj++)
+		{
+		ii = idx[jj];
+		pD[ii*bs] = alpha*x[jj];
+		}
+
+	}
+
+
+
+// add scaled vector to row, sparse formulation
+void drowad_libsp(int kmax, int *idx, double alpha, double *x, double *pD)
+	{
+
+	const int bs = 4;
+
+	int ii, jj;
+
+	for(jj=0; jj<kmax; jj++)
+		{
+		ii = idx[jj];
+		pD[ii*bs] += alpha * x[jj];
+		}
+
+	}
+
+
+
+// add scaled vector to another vector and insert to row, sparse formulation
+void drowadin_libsp(int kmax, int *idx, double alpha, double *x, double *y, double *pD)
+	{
+
+	const int bs = 4;
+
+	int ii, jj;
+
+	for(jj=0; jj<kmax; jj++)
+		{
+		ii = idx[jj];
+		pD[ii*bs] = y[jj] + alpha * x[jj];
+		}
+
+	}
+
+
+
+// swap two rows
+void drowsw_lib(int kmax, double *pA, double *pC)
+	{
+
+	const int bs = 4;
+
+	int ii;
+	double tmp;
+
+	for(ii=0; ii<kmax-3; ii+=4)
+		{
+		tmp = pA[0+bs*0];
+		pA[0+bs*0] = pC[0+bs*0];
+		pC[0+bs*0] = tmp;
+		tmp = pA[0+bs*1];
+		pA[0+bs*1] = pC[0+bs*1];
+		pC[0+bs*1] = tmp;
+		tmp = pA[0+bs*2];
+		pA[0+bs*2] = pC[0+bs*2];
+		pC[0+bs*2] = tmp;
+		tmp = pA[0+bs*3];
+		pA[0+bs*3] = pC[0+bs*3];
+		pC[0+bs*3] = tmp;
+		pA += 4*bs;
+		pC += 4*bs;
+		}
+	for( ; ii<kmax; ii++)
+		{
+		tmp = pA[0+bs*0];
+		pA[0+bs*0] = pC[0+bs*0];
+		pC[0+bs*0] = tmp;
+		pA += 1*bs;
+		pC += 1*bs;
+		}
+
+	}
+
+
+
+// extract vector from column
+void dcolex_lib(int kmax, int offset, double *pD, int sdd, double *x)
+	{
+
+	const int bs = 4;
+
+	int kna = (bs-offset%bs)%bs;
+	kna = kmax<kna ? kmax : kna;
+
+	int jj, ll;
+
+	if(kna>0)
+		{
+		for(ll=0; ll<kna; ll++)
+			{
+			x[ll] = pD[ll];
+			}
+		pD += kna + bs*(sdd-1);
+		x  += kna;
+		kmax -= kna;
+		}
+	for(jj=0; jj<kmax-3; jj+=4)
+		{
+		x[jj+0] = pD[jj*sdd+0];
+		x[jj+1] = pD[jj*sdd+1];
+		x[jj+2] = pD[jj*sdd+2];
+		x[jj+3] = pD[jj*sdd+3];
+		}
+	for(ll=0; ll<kmax-jj; ll++)
+		{
+		x[jj+ll] = pD[jj*sdd+ll];
+		}
+
+	}
+
+
+
+// insert vector to column
+void dcolin_lib(int kmax, double *x, int offset, double *pD, int sdd)
+	{
+
+	const int bs = 4;
+
+	int kna = (bs-offset%bs)%bs;
+	kna = kmax<kna ? kmax : kna;
+
+	int jj, ll;
+
+	if(kna>0)
+		{
+		for(ll=0; ll<kna; ll++)
+			{
+			pD[ll] = x[ll];
+			}
+		pD += kna + bs*(sdd-1);
+		x  += kna;
+		kmax -= kna;
+		}
+	for(jj=0; jj<kmax-3; jj+=4)
+		{
+		pD[jj*sdd+0] = x[jj+0];
+		pD[jj*sdd+1] = x[jj+1];
+		pD[jj*sdd+2] = x[jj+2];
+		pD[jj*sdd+3] = x[jj+3];
+		}
+	for(ll=0; ll<kmax-jj; ll++)
+		{
+		pD[jj*sdd+ll] = x[jj+ll];
+		}
+
+	}
+
+
+
+// add scaled vector to column
+void dcolad_lib(int kmax, double alpha, double *x, int offset, double *pD, int sdd)
+	{
+
+	const int bs = 4;
+
+	int kna = (bs-offset%bs)%bs;
+	kna = kmax<kna ? kmax : kna;
+
+	int jj, ll;
+
+	if(kna>0)
+		{
+		for(ll=0; ll<kna; ll++)
+			{
+			pD[ll] += alpha * x[ll];
+			}
+		pD += kna + bs*(sdd-1);
+		x  += kna;
+		kmax -= kna;
+		}
+	for(jj=0; jj<kmax-3; jj+=4)
+		{
+		pD[jj*sdd+0] += alpha * x[jj+0];
+		pD[jj*sdd+1] += alpha * x[jj+1];
+		pD[jj*sdd+2] += alpha * x[jj+2];
+		pD[jj*sdd+3] += alpha * x[jj+3];
+		}
+	for(ll=0; ll<kmax-jj; ll++)
+		{
+		pD[jj*sdd+ll] += alpha * x[jj+ll];
+		}
+
+	}
+
+
+
+// insert vector to column, sparse formulation
+void dcolin_libsp(int kmax, int *idx, double *x, double *pD, int sdd)
+	{
+
+	const int bs = 4;
+
+	int ii, jj;
+
+	for(jj=0; jj<kmax; jj++)
+		{
+		ii = idx[jj];
+		pD[ii/bs*bs*sdd+ii%bs] = x[jj];
+		}
+
+	}
+
+
+
+// add scaled vector to column, sparse formulation
+void dcolad_libsp(int kmax, double alpha, int *idx, double *x, double *pD, int sdd)
+	{
+
+	const int bs = 4;
+
+	int ii, jj;
+
+	for(jj=0; jj<kmax; jj++)
+		{
+		ii = idx[jj];
+		pD[ii/bs*bs*sdd+ii%bs] += alpha * x[jj];
+		}
+
+	}
+
+
+
+// swap two cols
+void dcolsw_lib(int kmax, int offsetA, double *pA, int sda, int offsetC, double *pC, int sdc)
+	{
+
+	const int bs = 4;
+
+	int ii;
+
+	double tmp;
+
+	if(offsetA==offsetC)
+		{
+		if(offsetA>0)
+			{
+			ii = 0;
+			for(; ii<bs-offsetA; ii++)
+				{
+				tmp = pA[0+bs*0];
+				pA[0+bs*0] = pC[0+bs*0];
+				pC[0+bs*0] = tmp;
+				pA += 1;
+				pC += 1;
+				}
+			pA += bs*(sda-1);
+			pC += bs*(sdc-1);
+			kmax -= bs-offsetA;
+			}
+		ii = 0;
+		for(; ii<kmax-3; ii+=4)
+			{
+			tmp = pA[0+bs*0];
+			pA[0+bs*0] = pC[0+bs*0];
+			pC[0+bs*0] = tmp;
+			tmp = pA[1+bs*0];
+			pA[1+bs*0] = pC[1+bs*0];
+			pC[1+bs*0] = tmp;
+			tmp = pA[2+bs*0];
+			pA[2+bs*0] = pC[2+bs*0];
+			pC[2+bs*0] = tmp;
+			tmp = pA[3+bs*0];
+			pA[3+bs*0] = pC[3+bs*0];
+			pC[3+bs*0] = tmp;
+			pA += bs*sda;
+			pC += bs*sdc;
+			}
+		for(; ii<kmax; ii++)
+			{
+			tmp = pA[0+bs*0];
+			pA[0+bs*0] = pC[0+bs*0];
+			pC[0+bs*0] = tmp;
+			pA += 1;
+			pC += 1;
+			}
+		}
+	else
+		{
+		printf("\ndcolsw: feature not implemented yet: offsetA!=offsetC\n\n");
+		exit(1);
+		}
+
+	return;
+
+	}
+
+
+
+// insert vector to vector, sparse formulation
+void dvecin_libsp(int kmax, int *idx, double *x, double *y)
+	{
+
+	int jj;
+
+	for(jj=0; jj<kmax; jj++)
+		{
+		y[idx[jj]] = x[jj];
+		}
+
+	}
+
+
+
+// adds vector to vector, sparse formulation
+void dvecad_libsp(int kmax, int *idx, double alpha, double *x, double *y)
+	{
+
+	int jj;
+
+	for(jj=0; jj<kmax; jj++)
+		{
+		y[idx[jj]] += alpha * x[jj];
+		}
+
+	}
+
+
+
+/****************************
+* new interface
+****************************/
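+
+// all routines below assume the BLASFEO panel-major layout: the matrix is
+// stored in horizontal panels of bs=4 rows, the rows of each panel being
+// interleaved column by column, so that element (i,j) of a strmat with
+// panel stride sda (= cn) lives at pA[i/bs*bs*sda + i%bs + j*bs]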
+
+
+
+#if defined(LA_HIGH_PERFORMANCE)
+
+
+
+// return the memory size (in bytes) needed for a strmat
+int d_size_strmat(int m, int n)
+	{
+	const int bs = 4;
+	int nc = D_NC;
+	int al = bs*nc;
+	int pm = (m+bs-1)/bs*bs;
+	int cn = (n+nc-1)/nc*nc;
+	int tmp = m<n ? (m+al-1)/al*al : (n+al-1)/al*al; // al(min(m,n)) // XXX max ???
+	int memory_size = (pm*cn+tmp)*sizeof(double);
+	return memory_size;
+	}
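+
+// (sizing example, assuming for illustration D_NC==4, i.e. al==16: for
+// m=5, n=7 one gets pm=8, cn=8, tmp=16, hence (8*8+16)*8 = 640 bytes; the
+// extra tmp doubles provide the dA scratch area set up by d_create_strmat)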
+
+
+
+// return the memory size (in bytes) needed for the diagonal of a strmat
+int d_size_diag_strmat(int m, int n)
+	{
+	const int bs = 4;
+	int nc = D_NC;
+	int al = bs*nc;
+	int tmp = m<n ? (m+al-1)/al*al : (n+al-1)/al*al; // al(min(m,n)) // XXX max ???
+	int memory_size = tmp*sizeof(double);
+	return memory_size;
+	}
+
+
+
+// create a matrix structure for a matrix of size m*n by using memory passed by a pointer
+void d_create_strmat(int m, int n, struct d_strmat *sA, void *memory)
+	{
+	const int bs = 4;
+	int nc = D_NC;
+	int al = bs*nc;
+	sA->m = m;
+	sA->n = n;
+	int pm = (m+bs-1)/bs*bs;
+	int cn = (n+nc-1)/nc*nc;
+	sA->pm = pm;
+	sA->cn = cn;
+	double *ptr = (double *) memory;
+	sA->pA = ptr;
+	ptr += pm*cn;
+	int tmp = m<n ? (m+al-1)/al*al : (n+al-1)/al*al; // al(min(m,n)) // XXX max ???
+	sA->dA = ptr;
+	ptr += tmp;
+	sA->use_dA = 0;
+	sA->memory_size = (pm*cn+tmp)*sizeof(double);
+	return;
+	}
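+
+// typical usage (an illustrative sketch, not itself part of the API; an
+// aligned allocation is used since the SIMD packing paths rely on 32-byte
+// aligned panels):
+//
+//   struct d_strmat sA;
+//   void *mem;
+//   posix_memalign(&mem, 64, d_size_strmat(m, n));
+//   d_create_strmat(m, n, &sA, mem);
+//   d_cvt_mat2strmat(m, n, A, m, &sA, 0, 0); // pack a column-major A
+//   ...
+//   free(mem);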
+
+
+
+// return memory size (in bytes) needed for a strvec
+int d_size_strvec(int m)
+	{
+	const int bs = 4;
+//	int nc = D_NC;
+//	int al = bs*nc;
+	int pm = (m+bs-1)/bs*bs;
+	int memory_size = pm*sizeof(double);
+	return memory_size;
+	}
+
+
+
+// create a vector structure for a vector of size m by using memory passed by a pointer
+void d_create_strvec(int m, struct d_strvec *sa, void *memory)
+	{
+	const int bs = 4;
+//	int nc = D_NC;
+//	int al = bs*nc;
+	sa->m = m;
+	int pm = (m+bs-1)/bs*bs;
+	sa->pm = pm;
+	double *ptr = (double *) memory;
+	sa->pa = ptr;
+//	ptr += pm;
+	sa->memory_size = pm*sizeof(double);
+	return;
+	}
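+
+// (usage mirrors d_create_strmat: allocate d_size_strvec(m) bytes, pass the
+// buffer in, then fill it with d_cvt_vec2strvec())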
+
+
+
+// convert a matrix into a matrix structure
+void d_cvt_mat2strmat(int m, int n, double *A, int lda, struct d_strmat *sA, int ai, int aj)
+	{
+	const int bs = 4;
+	int sda = sA->cn;
+	double *pA = sA->pA + aj*bs + ai/bs*bs*sda + ai%bs;
+	int i, ii, j, jj, m0, m1, m2;
+	double 	*B, *pB;
+#if defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE)
+	__m256d
+		tmp;
+#endif
+	m0 = (bs-ai%bs)%bs;
+	if(m0>m)
+		m0 = m;
+	m1 = m - m0;
+	jj = 0;
+	for( ; jj<n-3; jj+=4)
+		{
+		B  =  A + jj*lda;
+		pB = pA + jj*bs;
+		ii = 0;
+		if(m0>0)
+			{
+			for( ; ii<m0; ii++)
+				{
+				pB[ii+bs*0] = B[ii+lda*0];
+				pB[ii+bs*1] = B[ii+lda*1];
+				pB[ii+bs*2] = B[ii+lda*2];
+				pB[ii+bs*3] = B[ii+lda*3];
+				}
+			B  += m0;
+			pB += m0 + bs*(sda-1);
+			}
+#if defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE)
+		for( ; ii<m-3; ii+=4)
+			{
+			tmp = _mm256_loadu_pd( &B[0+lda*0] );
+			_mm256_store_pd( &pB[0+bs*0], tmp );
+			tmp = _mm256_loadu_pd( &B[0+lda*1] );
+			_mm256_store_pd( &pB[0+bs*1], tmp );
+			tmp = _mm256_loadu_pd( &B[0+lda*2] );
+			_mm256_store_pd( &pB[0+bs*2], tmp );
+			tmp = _mm256_loadu_pd( &B[0+lda*3] );
+			_mm256_store_pd( &pB[0+bs*3], tmp );
+			// update
+			B  += 4;
+			pB += bs*sda;
+			}
+#else
+		for( ; ii<m-3; ii+=4)
+			{
+			// col 0
+			pB[0+bs*0] = B[0+lda*0];
+			pB[1+bs*0] = B[1+lda*0];
+			pB[2+bs*0] = B[2+lda*0];
+			pB[3+bs*0] = B[3+lda*0];
+			// col 1
+			pB[0+bs*1] = B[0+lda*1];
+			pB[1+bs*1] = B[1+lda*1];
+			pB[2+bs*1] = B[2+lda*1];
+			pB[3+bs*1] = B[3+lda*1];
+			// col 2
+			pB[0+bs*2] = B[0+lda*2];
+			pB[1+bs*2] = B[1+lda*2];
+			pB[2+bs*2] = B[2+lda*2];
+			pB[3+bs*2] = B[3+lda*2];
+			// col 3
+			pB[0+bs*3] = B[0+lda*3];
+			pB[1+bs*3] = B[1+lda*3];
+			pB[2+bs*3] = B[2+lda*3];
+			pB[3+bs*3] = B[3+lda*3];
+			// update
+			B  += 4;
+			pB += bs*sda;
+			}
+#endif
+		for( ; ii<m; ii++)
+			{
+			// col 0
+			pB[0+bs*0] = B[0+lda*0];
+			// col 1
+			pB[0+bs*1] = B[0+lda*1];
+			// col 2
+			pB[0+bs*2] = B[0+lda*2];
+			// col 3
+			pB[0+bs*3] = B[0+lda*3];
+			// update
+			B  += 1;
+			pB += 1;
+			}
+		}
+	for( ; jj<n; jj++)
+		{
+
+		B  =  A + jj*lda;
+		pB = pA + jj*bs;
+
+		ii = 0;
+		if(m0>0)
+			{
+			for( ; ii<m0; ii++)
+				{
+				pB[ii+bs*0] = B[ii+lda*0];
+				}
+			B  += m0;
+			pB += m0 + bs*(sda-1);
+			}
+		for( ; ii<m-3; ii+=4)
+			{
+			// col 0
+			pB[0+bs*0] = B[0+lda*0];
+			pB[1+bs*0] = B[1+lda*0];
+			pB[2+bs*0] = B[2+lda*0];
+			pB[3+bs*0] = B[3+lda*0];
+			// update
+			B  += 4;
+			pB += bs*sda;
+			}
+		for( ; ii<m; ii++)
+			{
+			// col 0
+			pB[0+bs*0] = B[0+lda*0];
+			// update
+			B  += 1;
+			pB += 1;
+			}
+		}
+	return;
+	}
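+
+// (the AVX path above reads the user's column-major array with unaligned
+// loads but writes the panels with aligned stores: after the m0 head loop
+// every pB is a multiple of bs=4 doubles, i.e. 32 bytes, past sA->pA, so
+// the strmat buffer itself is assumed to be 32-byte aligned)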
+
+
+
+// convert and transpose a matrix into a matrix structure
+void d_cvt_tran_mat2strmat(int m, int n, double *A, int lda, struct d_strmat *sA, int ai, int aj)
+	{
+	const int bs = 4;
+	int sda = sA->cn;
+	double *pA = sA->pA + aj*bs + ai/bs*bs*sda + ai%bs;
+	int i, ii, j, m0, m1, m2;
+	double 	*B, *pB;
+#if defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE)
+	__m256d
+		v0, v1, v2, v3,
+		v4, v5, v6, v7;
+#endif
+	m0 = (bs-ai%bs)%bs;
+	if(m0>n)
+		m0 = n;
+	m1 = n - m0;
+	ii = 0;
+	if(m0>0)
+		{
+		for(j=0; j<m; j++)
+			{
+			for(i=0; i<m0; i++)
+				{
+				pA[i+j*bs+ii*sda] = A[j+(i+ii)*lda];
+				}
+			}
+		A  += m0*lda;
+		pA += m0 + bs*(sda-1);
+		}
+	ii = 0;
+	for(; ii<m1-3; ii+=bs)
+		{
+		j=0;
+		B  = A + ii*lda;
+		pB = pA + ii*sda;
+#if defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE)
+		for(; j<m-3; j+=4)
+			{
+			v0 = _mm256_loadu_pd( &B[0+0*lda] ); // 00 10 20 30
+			v1 = _mm256_loadu_pd( &B[0+1*lda] ); // 01 11 21 31
+			v4 = _mm256_unpacklo_pd( v0, v1 ); // 00 01 20 21
+			v5 = _mm256_unpackhi_pd( v0, v1 ); // 10 11 30 31
+			v2 = _mm256_loadu_pd( &B[0+2*lda] ); // 02 12 22 32
+			v3 = _mm256_loadu_pd( &B[0+3*lda] ); // 03 13 23 33
+			v6 = _mm256_unpacklo_pd( v2, v3 ); // 02 03 22 23
+			v7 = _mm256_unpackhi_pd( v2, v3 ); // 12 13 32 33
+
+			B += 4;
+
+			v0 = _mm256_permute2f128_pd( v4, v6, 0x20 ); // 00 01 02 03
+			_mm256_store_pd( &pB[0+bs*0], v0 );
+			v2 = _mm256_permute2f128_pd( v4, v6, 0x31 ); // 20 21 22 23
+			_mm256_store_pd( &pB[0+bs*2], v2 );
+			v1 = _mm256_permute2f128_pd( v5, v7, 0x20 ); // 10 11 12 13
+			_mm256_store_pd( &pB[0+bs*1], v1 );
+			v3 = _mm256_permute2f128_pd( v5, v7, 0x31 ); // 30 31 32 33
+			_mm256_store_pd( &pB[0+bs*3], v3 );
+
+			pB += 4*bs;
+			}
+#else
+		for(; j<m-3; j+=4)
+			{
+			// unroll 0
+			pB[0+0*bs] = B[0+0*lda];
+			pB[1+0*bs] = B[0+1*lda];
+			pB[2+0*bs] = B[0+2*lda];
+			pB[3+0*bs] = B[0+3*lda];
+			// unroll 1
+			pB[0+1*bs] = B[1+0*lda];
+			pB[1+1*bs] = B[1+1*lda];
+			pB[2+1*bs] = B[1+2*lda];
+			pB[3+1*bs] = B[1+3*lda];
+			// unroll 2
+			pB[0+2*bs] = B[2+0*lda];
+			pB[1+2*bs] = B[2+1*lda];
+			pB[2+2*bs] = B[2+2*lda];
+			pB[3+2*bs] = B[2+3*lda];
+			// unroll 3
+			pB[0+3*bs] = B[3+0*lda];
+			pB[1+3*bs] = B[3+1*lda];
+			pB[2+3*bs] = B[3+2*lda];
+			pB[3+3*bs] = B[3+3*lda];
+			B  += 4;
+			pB += 4*bs;
+			}
+#endif
+		for(; j<m; j++)
+			{
+			// unroll 0
+			pB[0+0*bs] = B[0+0*lda];
+			pB[1+0*bs] = B[0+1*lda];
+			pB[2+0*bs] = B[0+2*lda];
+			pB[3+0*bs] = B[0+3*lda];
+			B  += 1;
+			pB += 1*bs;
+			}
+		}
+	if(ii<m1)
+		{
+		m2 = m1-ii;
+		if(bs<m2) m2 = bs;
+		for(j=0; j<m; j++)
+			{
+			for(i=0; i<m2; i++)
+				{
+				pA[i+j*bs+ii*sda] = A[j+(i+ii)*lda];
+				}
+			}
+		}
+	return;
+	}
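+
+// (the AVX path transposes each 4x4 tile entirely in registers: the
+// unpacklo/unpackhi pairs interleave elements within the 128-bit lanes and
+// the permute2f128 steps then exchange the lanes, eight shuffles per tile)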
+
+
+
+// convert a vector into a vector structure
+void d_cvt_vec2strvec(int m, double *a, struct d_strvec *sa, int ai)
+	{
+	double *pa = sa->pa + ai;
+	int ii;
+	for(ii=0; ii<m; ii++)
+		pa[ii] = a[ii];
+	return;
+	}
+
+
+
+// convert a matrix structure into a matrix
+void d_cvt_strmat2mat(int m, int n, struct d_strmat *sA, int ai, int aj, double *A, int lda)
+	{
+	const int bs = 4;
+	int sda = sA->cn;
+	double *pA = sA->pA + aj*bs + ai/bs*bs*sda + ai%bs;
+	int i, ii, jj;
+	int m0 = (bs-ai%bs)%bs;
+	double *ptr_pA;
+	jj=0;
+	for(; jj<n-3; jj+=4)
+		{
+		ptr_pA = pA + jj*bs;
+		ii = 0;
+		if(m0>0)
+			{
+			for(; ii<m0; ii++)
+				{
+				// unroll 0
+				A[ii+lda*(jj+0)] = ptr_pA[0+bs*0];
+				// unroll 1
+				A[ii+lda*(jj+1)] = ptr_pA[0+bs*1];
+				// unroll 2
+				A[ii+lda*(jj+2)] = ptr_pA[0+bs*2];
+				// unroll 3
+				A[ii+lda*(jj+3)] = ptr_pA[0+bs*3];
+				ptr_pA++;
+				}
+			ptr_pA += (sda-1)*bs;
+			}
+		for(; ii<m-bs+1; ii+=bs)
+			{
+			// unroll 0
+			A[0+ii+lda*(jj+0)] = ptr_pA[0+bs*0];
+			A[1+ii+lda*(jj+0)] = ptr_pA[1+bs*0];
+			A[2+ii+lda*(jj+0)] = ptr_pA[2+bs*0];
+			A[3+ii+lda*(jj+0)] = ptr_pA[3+bs*0];
+			// unroll 1
+			A[0+ii+lda*(jj+1)] = ptr_pA[0+bs*1];
+			A[1+ii+lda*(jj+1)] = ptr_pA[1+bs*1];
+			A[2+ii+lda*(jj+1)] = ptr_pA[2+bs*1];
+			A[3+ii+lda*(jj+1)] = ptr_pA[3+bs*1];
+			// unroll 2
+			A[0+ii+lda*(jj+2)] = ptr_pA[0+bs*2];
+			A[1+ii+lda*(jj+2)] = ptr_pA[1+bs*2];
+			A[2+ii+lda*(jj+2)] = ptr_pA[2+bs*2];
+			A[3+ii+lda*(jj+2)] = ptr_pA[3+bs*2];
+			// unroll 3
+			A[0+ii+lda*(jj+3)] = ptr_pA[0+bs*3];
+			A[1+ii+lda*(jj+3)] = ptr_pA[1+bs*3];
+			A[2+ii+lda*(jj+3)] = ptr_pA[2+bs*3];
+			A[3+ii+lda*(jj+3)] = ptr_pA[3+bs*3];
+			ptr_pA += sda*bs;
+			}
+		for(; ii<m; ii++)
+			{
+			// unroll 0
+			A[ii+lda*(jj+0)] = ptr_pA[0+bs*0];
+			// unroll 1
+			A[ii+lda*(jj+1)] = ptr_pA[0+bs*1];
+			// unroll 2
+			A[ii+lda*(jj+2)] = ptr_pA[0+bs*2];
+			// unroll 3
+			A[ii+lda*(jj+3)] = ptr_pA[0+bs*3];
+			ptr_pA++;
+			}
+		}
+	for(; jj<n; jj++)
+		{
+		ptr_pA = pA + jj*bs;
+		ii = 0;
+		if(m0>0)
+			{
+			for(; ii<m0; ii++)
+				{
+				A[ii+lda*jj] = ptr_pA[0];
+				ptr_pA++;
+				}
+			ptr_pA += (sda-1)*bs;
+			}
+		for(; ii<m-bs+1; ii+=bs)
+			{
+			A[0+ii+lda*jj] = ptr_pA[0];
+			A[1+ii+lda*jj] = ptr_pA[1];
+			A[2+ii+lda*jj] = ptr_pA[2];
+			A[3+ii+lda*jj] = ptr_pA[3];
+			ptr_pA += sda*bs;
+			}
+		for(; ii<m; ii++)
+			{
+			A[ii+lda*jj] = ptr_pA[0];
+			ptr_pA++;
+			}
+		}
+	return;
+	}
+
+
+
+// convert and transpose a matrix structure into a matrix
+void d_cvt_tran_strmat2mat(int m, int n, struct d_strmat *sA, int ai, int aj, double *A, int lda)
+	{
+	const int bs = 4;
+	int sda = sA->cn;
+	double *pA = sA->pA + aj*bs + ai/bs*bs*sda + ai%bs;
+	int i, ii, jj;
+	int m0 = (bs-ai%bs)%bs;
+	double *ptr_pA;
+	jj=0;
+	for(; jj<n-3; jj+=4)
+		{
+		ptr_pA = pA + jj*bs;
+		ii = 0;
+		if(m0>0)
+			{
+			for(; ii<m0; ii++)
+				{
+				// unroll 0
+				A[jj+0+lda*ii] = ptr_pA[0+bs*0];
+				// unroll 1
+				A[jj+1+lda*ii] = ptr_pA[0+bs*1];
+				// unroll 2
+				A[jj+2+lda*ii] = ptr_pA[0+bs*2];
+				// unroll 3
+				A[jj+3+lda*ii] = ptr_pA[0+bs*3];
+				ptr_pA++;
+				}
+			ptr_pA += (sda-1)*bs;
+			}
+		for(; ii<m-bs+1; ii+=bs)
+			{
+			// unroll 0
+			A[jj+0+lda*(ii+0)] = ptr_pA[0+bs*0];
+			A[jj+0+lda*(ii+1)] = ptr_pA[1+bs*0];
+			A[jj+0+lda*(ii+2)] = ptr_pA[2+bs*0];
+			A[jj+0+lda*(ii+3)] = ptr_pA[3+bs*0];
+			// unroll 1
+			A[jj+1+lda*(ii+0)] = ptr_pA[0+bs*1];
+			A[jj+1+lda*(ii+1)] = ptr_pA[1+bs*1];
+			A[jj+1+lda*(ii+2)] = ptr_pA[2+bs*1];
+			A[jj+1+lda*(ii+3)] = ptr_pA[3+bs*1];
+			// unroll 2
+			A[jj+2+lda*(ii+0)] = ptr_pA[0+bs*2];
+			A[jj+2+lda*(ii+1)] = ptr_pA[1+bs*2];
+			A[jj+2+lda*(ii+2)] = ptr_pA[2+bs*2];
+			A[jj+2+lda*(ii+3)] = ptr_pA[3+bs*2];
+			// unroll 3
+			A[jj+3+lda*(ii+0)] = ptr_pA[0+bs*3];
+			A[jj+3+lda*(ii+1)] = ptr_pA[1+bs*3];
+			A[jj+3+lda*(ii+2)] = ptr_pA[2+bs*3];
+			A[jj+3+lda*(ii+3)] = ptr_pA[3+bs*3];
+			ptr_pA += sda*bs;
+			}
+		for(; ii<m; ii++)
+			{
+			// unroll 0
+			A[jj+0+lda*ii] = ptr_pA[0+bs*0];
+			// unroll 1
+			A[jj+1+lda*ii] = ptr_pA[0+bs*1];
+			// unroll 2
+			A[jj+2+lda*ii] = ptr_pA[0+bs*2];
+			// unroll 3
+			A[jj+3+lda*ii] = ptr_pA[0+bs*3];
+			ptr_pA++;
+			}
+		}
+	for(; jj<n; jj++)
+		{
+		ptr_pA = pA + jj*bs;
+		ii = 0;
+		if(m0>0)
+			{
+			for(; ii<m0; ii++)
+				{
+				A[jj+lda*ii] = ptr_pA[0];
+				ptr_pA++;
+				}
+			ptr_pA += (sda-1)*bs;
+			}
+		for(; ii<m-bs+1; ii+=bs)
+			{
+			i=0;
+			for(; i<bs; i++)
+				{
+				A[jj+lda*(i+ii)] = ptr_pA[0];
+				ptr_pA++;
+				}
+			ptr_pA += (sda-1)*bs;
+			}
+		for(; ii<m; ii++)
+			{
+			A[jj+lda*ii] = ptr_pA[0];
+			ptr_pA++;
+			}
+		}
+	return;
+	}
+
+
+
+// convert a vector structure into a vector
+void d_cvt_strvec2vec(int m, struct d_strvec *sa, int ai, double *a)
+	{
+	double *pa = sa->pa + ai;
+	int ii;
+	for(ii=0; ii<m; ii++)
+		a[ii] = pa[ii];
+	return;
+	}
+
+
+
+// cast a matrix into a matrix structure
+void d_cast_mat2strmat(double *A, struct d_strmat *sA)
+	{
+	sA->pA = A;
+	return;
+	}
+
+
+
+// cast a matrix into the diagonal of a matrix structure
+void d_cast_diag_mat2strmat(double *dA, struct d_strmat *sA)
+	{
+	sA->dA = dA;
+	return;
+	}
+
+
+
+// cast a vector into a vector structure
+void d_cast_vec2vecmat(double *a, struct d_strvec *sa)
+	{
+	sa->pa = a;
+	return;
+	}
+
+
+
+// insert element into strmat
+void dgein1_libstr(double a, struct d_strmat *sA, int ai, int aj)
+	{
+	const int bs = 4;
+	int sda = sA->cn;
+	double *pA = sA->pA + ai/bs*bs*sda + ai%bs + aj*bs;
+	pA[0] = a;
+	return;
+	}
+
+
+
+// extract element from strmat
+double dgeex1_libstr(struct d_strmat *sA, int ai, int aj)
+	{
+	const int bs = 4;
+	int sda = sA->cn;
+	double *pA = sA->pA + ai/bs*bs*sda + ai%bs + aj*bs;
+	return pA[0];
+	}
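+
+// (addressing example: with bs=4, element (ai,aj)=(6,2) lives at offset
+// 6/4*4*sda + 6%4 + 2*4 = 4*sda + 10 doubles from sA->pA)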
+
+
+
+// insert element into strvec
+void dvecin1_libstr(double a, struct d_strvec *sx, int xi)
+	{
+	const int bs = 4;
+	double *x = sx->pa + xi;
+	x[0] = a;
+	return;
+	}
+
+
+
+// extract element from strvec
+double dvecex1_libstr(struct d_strvec *sx, int xi)
+	{
+	const int bs = 4;
+	double *x = sx->pa + xi;
+	return x[0];
+	}
+
+
+
+// set all elements of a strmat to a value
+void dgese_libstr(int m, int n, double alpha, struct d_strmat *sA, int ai, int aj)
+	{
+	const int bs = 4;
+	int sda = sA->cn;
+	double *pA = sA->pA + ai%bs + ai/bs*bs*sda + aj*bs;
+	int m0 = m<(bs-ai%bs)%bs ? m : (bs-ai%bs)%bs;
+	int ii, jj;
+	if(m0>0)
+		{
+		for(ii=0; ii<m0; ii++)
+			{
+			for(jj=0; jj<n; jj++)
+				{
+				pA[jj*bs] = alpha;
+				}
+			pA += 1;
+			}
+		pA += bs*(sda-1);
+		m -= m0;
+		}
+	for(ii=0; ii<m-3; ii+=4)
+		{
+		for(jj=0; jj<n; jj++)
+			{
+			pA[0+jj*bs] = alpha;
+			pA[1+jj*bs] = alpha;
+			pA[2+jj*bs] = alpha;
+			pA[3+jj*bs] = alpha;
+			}
+		pA += bs*sda;
+		}
+	for( ; ii<m; ii++)
+		{
+		for(jj=0; jj<n; jj++)
+			{
+			pA[jj*bs] = alpha;
+			}
+		pA += 1;
+		}
+	return;
+	}
+
+
+
+// set all elements of a strvec to a value
+void dvecse_libstr(int m, double alpha, struct d_strvec *sx, int xi)
+	{
+	double *x = sx->pa + xi;
+	int ii;
+	for(ii=0; ii<m; ii++)
+		x[ii] = alpha;
+	return;
+	}
+
+
+
+// insert a vector into diagonal
+void ddiain_libstr(int kmax, double alpha, struct d_strvec *sx, int xi, struct d_strmat *sA, int ai, int aj)
+	{
+	const int bs = 4;
+	int sda = sA->cn;
+	double *pA = sA->pA + ai/bs*bs*sda + ai%bs + aj*bs;
+	double *x = sx->pa + xi;
+	int offsetA = ai%bs;
+
+	int kna = (bs-offsetA%bs)%bs;
+	kna = kmax<kna ? kmax : kna;
+
+	int jj, ll;
+
+	if(kna>0)
+		{
+		for(ll=0; ll<kna; ll++)
+			{
+			pA[ll+bs*ll] = alpha*x[ll];
+			}
+		pA += kna + bs*(sda-1) + kna*bs;
+		x  += kna;
+		kmax -= kna;
+		}
+	for(jj=0; jj<kmax-3; jj+=4)
+		{
+		pA[jj*sda+(jj+0)*bs+0] = alpha*x[jj+0];
+		pA[jj*sda+(jj+1)*bs+1] = alpha*x[jj+1];
+		pA[jj*sda+(jj+2)*bs+2] = alpha*x[jj+2];
+		pA[jj*sda+(jj+3)*bs+3] = alpha*x[jj+3];
+		}
+	for(ll=0; ll<kmax-jj; ll++)
+		{
+		pA[jj*sda+(jj+ll)*bs+ll] = alpha*x[jj+ll];
+		}
+	return;
+	}
+
+
+
+// add scalar to diagonal
+void ddiare_libstr(int kmax, double alpha, struct d_strmat *sA, int ai, int aj)
+	{
+	const int bs = 4;
+	int sda = sA->cn;
+	double *pA = sA->pA + ai/bs*bs*sda + ai%bs + aj*bs;
+	int offsetA = ai%bs;
+
+	int kna = (bs-offsetA%bs)%bs;
+	kna = kmax<kna ? kmax : kna;
+
+	int jj, ll;
+
+	if(kna>0)
+		{
+		for(ll=0; ll<kna; ll++)
+			{
+			pA[ll+bs*ll] += alpha;
+			}
+		pA += kna + bs*(sda-1) + kna*bs;
+		kmax -= kna;
+		}
+	for(jj=0; jj<kmax-3; jj+=4)
+		{
+		pA[jj*sda+(jj+0)*bs+0] += alpha;
+		pA[jj*sda+(jj+1)*bs+1] += alpha;
+		pA[jj*sda+(jj+2)*bs+2] += alpha;
+		pA[jj*sda+(jj+3)*bs+3] += alpha;
+		}
+	for(ll=0; ll<kmax-jj; ll++)
+		{
+		pA[jj*sda+(jj+ll)*bs+ll] += alpha;
+		}
+	return;
+	}
+
+
+
+// swap two rows of a matrix struct
+void drowsw_libstr(int kmax, struct d_strmat *sA, int ai, int aj, struct d_strmat *sC, int ci, int cj)
+	{
+	const int bs = 4;
+	int sda = sA->cn;
+	double *pA = sA->pA + ai/bs*bs*sda + ai%bs + aj*bs;
+	int sdc = sC->cn;
+	double *pC = sC->pA + ci/bs*bs*sdc + ci%bs + cj*bs;
+	drowsw_lib(kmax, pA, pC);
+	return;
+	}
+
+
+
+// permute the rows of a matrix struct
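+// (ipiv is zero-based: row ii is swapped with row ipiv[ii] for increasing
+// ii, matching e.g. the pivot vector returned by an LU factorization)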
+void drowpe_libstr(int kmax, int *ipiv, struct d_strmat *sA)
+	{
+	int ii;
+	for(ii=0; ii<kmax; ii++)
+		{
+		if(ipiv[ii]!=ii)
+			drowsw_libstr(sA->n, sA, ii, 0, sA, ipiv[ii], 0);
+		}
+	return;
+	}
+
+
+
+// extract a row into a vector
+void drowex_libstr(int kmax, double alpha, struct d_strmat *sA, int ai, int aj, struct d_strvec *sx, int xi)
+	{
+	const int bs = 4;
+	int sda = sA->cn;
+	double *pA = sA->pA + ai/bs*bs*sda + ai%bs + aj*bs;
+	double *x = sx->pa + xi;
+	drowex_lib(kmax, alpha, pA, x);
+	return;
+	}
+
+
+
+// insert a vector into a row
+void drowin_libstr(int kmax, double alpha, struct d_strvec *sx, int xi, struct d_strmat *sA, int ai, int aj)
+	{
+	const int bs = 4;
+	int sda = sA->cn;
+	double *pA = sA->pA + ai/bs*bs*sda + ai%bs + aj*bs;
+	double *x = sx->pa + xi;
+	drowin_lib(kmax, alpha, x, pA);
+	return;
+	}
+
+
+
+// add a vector to a row
+void drowad_libstr(int kmax, double alpha, struct d_strvec *sx, int xi, struct d_strmat *sA, int ai, int aj)
+	{
+	const int bs = 4;
+	int sda = sA->cn;
+	double *pA = sA->pA + ai/bs*bs*sda + ai%bs + aj*bs;
+	double *x = sx->pa + xi;
+	drowad_lib(kmax, alpha, x, pA);
+	return;
+	}
+
+
+
+// extract vector from column
+void dcolex_libstr(int kmax, struct d_strmat *sA, int ai, int aj, struct d_strvec *sx, int xi)
+	{
+	const int bs = 4;
+	int sda = sA->cn;
+	double *pA = sA->pA + ai/bs*bs*sda + ai%bs + aj*bs;
+	double *x = sx->pa + xi;
+	dcolex_lib(kmax, ai%bs, pA, sda, x);
+	return;
+	}
+
+
+
+// insert a vector as a column
+void dcolin_libstr(int kmax, struct d_strvec *sx, int xi, struct d_strmat *sA, int ai, int aj)
+	{
+	const int bs = 4;
+	int sda = sA->cn;
+	double *pA = sA->pA + ai/bs*bs*sda + ai%bs + aj*bs;
+	double *x = sx->pa + xi;
+	dcolin_lib(kmax, x, ai%bs, pA, sda);
+	return;
+	}
+
+
+
+// swap two cols of a matrix struct
+void dcolsw_libstr(int kmax, struct d_strmat *sA, int ai, int aj, struct d_strmat *sC, int ci, int cj)
+	{
+	const int bs = 4;
+	int sda = sA->cn;
+	double *pA = sA->pA + ai/bs*bs*sda + ai%bs + aj*bs;
+	int sdc = sC->cn;
+	double *pC = sC->pA + ci/bs*bs*sdc + ci%bs + cj*bs;
+	dcolsw_lib(kmax, ai%bs, pA, sda, ci%bs, pC, sdc);
+	return;
+	}
+
+
+
+// permute the cols of a matrix struct
+void dcolpe_libstr(int kmax, int *ipiv, struct d_strmat *sA)
+	{
+	int ii;
+	for(ii=0; ii<kmax; ii++)
+		{
+		if(ipiv[ii]!=ii)
+			dcolsw_libstr(sA->m, sA, 0, ii, sA, 0, ipiv[ii]);
+		}
+	return;
+	}
+
+
+
+// copy a generic strmat into a generic strmat
+void dgecp_libstr(int m, int n, struct d_strmat *sA, int ai, int aj, struct d_strmat *sC, int ci, int cj)
+	{
+	const int bs = 4;
+	int sda = sA->cn;
+	double *pA = sA->pA + ai/bs*bs*sda + ai%bs + aj*bs;
+	int sdc = sC->cn;
+	double *pC = sC->pA + ci/bs*bs*sdc + ci%bs + cj*bs;
+	dgecp_lib(m, n, 1.0, ai%bs, pA, sda, ci%bs, pC, sdc);
+	return;
+	}
+
+
+
+// scale a generic strmat
+void dgesc_libstr(int m, int n, double alpha, struct d_strmat *sA, int ai, int aj)
+	{
+	const int bs = 4;
+	int sda = sA->cn;
+	double *pA = sA->pA + ai/bs*bs*sda + ai%bs + aj*bs;
+	dgecp_lib(m, n, alpha, ai%bs, pA, sda, ai%bs, pA, sda);
+	return;
+	}
+
+
+
+// copy a strvec into a strvec
+void dveccp_libstr(int m, struct d_strvec *sa, int ai, struct d_strvec *sc, int ci)
+	{
+	double *pa = sa->pa + ai;
+	double *pc = sc->pa + ci;
+	int ii;
+	ii = 0;
+	for(; ii<m-3; ii+=4)
+		{
+		pc[ii+0] = pa[ii+0];
+		pc[ii+1] = pa[ii+1];
+		pc[ii+2] = pa[ii+2];
+		pc[ii+3] = pa[ii+3];
+		}
+	for(; ii<m; ii++)
+		{
+		pc[ii+0] = pa[ii+0];
+		}
+	return;
+	}
+
+
+
+// scale a strvec
+void dvecsc_libstr(int m, double alpha, struct d_strvec *sa, int ai)
+	{
+	double *pa = sa->pa + ai;
+	int ii;
+	ii = 0;
+	for(; ii<m-3; ii+=4)
+		{
+		pa[ii+0] *= alpha;
+		pa[ii+1] *= alpha;
+		pa[ii+2] *= alpha;
+		pa[ii+3] *= alpha;
+		}
+	for(; ii<m; ii++)
+		{
+		pa[ii+0] *= alpha;
+		}
+	return;
+	}
+
+
+
+// copy a lower triangular strmat into a lower triangular strmat
+void dtrcp_l_libstr(int m, struct d_strmat *sA, int ai, int aj, struct d_strmat *sC, int ci, int cj)
+	{
+	const int bs = 4;
+	int sda = sA->cn;
+	double *pA = sA->pA + ai/bs*bs*sda + ai%bs + aj*bs;
+	int sdc = sC->cn;
+	double *pC = sC->pA + ci/bs*bs*sdc + ci%bs + cj*bs;
+	dtrcp_l_lib(m, 1.0, ai%bs, pA, sda, ci%bs, pC, sdc);
+	return;
+	}
+
+
+
+// scale and add a generic strmat into a generic strmat
+void dgead_libstr(int m, int n, double alpha, struct d_strmat *sA, int ai, int aj, struct d_strmat *sC, int ci, int cj)
+	{
+	const int bs = 4;
+	int sda = sA->cn;
+	double *pA = sA->pA + ai/bs*bs*sda + ai%bs + aj*bs;
+	int sdc = sC->cn;
+	double *pC = sC->pA + ci/bs*bs*sdc + ci%bs + cj*bs;
+	dgead_lib(m, n, alpha, ai%bs, pA, sda, ci%bs, pC, sdc);
+	return;
+	}
+
+
+
+// copy and transpose a generic strmat into a generic strmat
+void dgetr_libstr(int m, int n, struct d_strmat *sA, int ai, int aj, struct d_strmat *sC, int ci, int cj)
+	{
+	const int bs = 4;
+	int sda = sA->cn;
+	double *pA = sA->pA + ai/bs*bs*sda + ai%bs + aj*bs;
+	int sdc = sC->cn;
+	double *pC = sC->pA + ci/bs*bs*sdc + ci%bs + cj*bs;
+	dgetr_lib(m, n, 1.0, ai%bs, pA, sda, ci%bs, pC, sdc); // TODO remove alpha !!!
+	return;
+	}
+
+
+
+// copy and transpose a lower triangular strmat into an upper triangular strmat
+void dtrtr_l_libstr(int m, struct d_strmat *sA, int ai, int aj, struct d_strmat *sC, int ci, int cj)
+	{
+	const int bs = 4;
+	int sda = sA->cn;
+	double *pA = sA->pA + ai/bs*bs*sda + ai%bs + aj*bs;
+	int sdc = sC->cn;
+	double *pC = sC->pA + ci/bs*bs*sdc + ci%bs + cj*bs;
+	dtrtr_l_lib(m, 1.0, ai%bs, pA, sda, ci%bs, pC, sdc); // TODO remove alpha !!!
+	return;
+	}
+
+
+
+// copy and transpose an upper triangular strmat into a lower triangular strmat
+void dtrtr_u_libstr(int m, struct d_strmat *sA, int ai, int aj, struct d_strmat *sC, int ci, int cj)
+	{
+	const int bs = 4;
+	int sda = sA->cn;
+	double *pA = sA->pA + ai/bs*bs*sda + ai%bs + aj*bs;
+	int sdc = sC->cn;
+	double *pC = sC->pA + ci/bs*bs*sdc + ci%bs + cj*bs;
+	dtrtr_u_lib(m, 1.0, ai%bs, pA, sda, ci%bs, pC, sdc); // TODO remove alpha !!!
+	return;
+	}
+
+
+
+// insert a strvec to diagonal of strmat, sparse formulation
+void ddiain_sp_libstr(int kmax, double alpha, struct d_strvec *sx, int xi, int *idx, struct d_strmat *sD, int di, int dj)
+	{
+	const int bs = 4;
+	double *x = sx->pa + xi;
+	int sdd = sD->cn;
+	double *pD = sD->pA;
+	int ii, jj;
+	for(jj=0; jj<kmax; jj++)
+		{
+		ii = idx[jj];
+		pD[(ii+di)/bs*bs*sdd+(ii+di)%bs+(ii+dj)*bs] = alpha * x[jj];
+		}
+	return;
+	}
+
+
+
+// extract a vector from diagonal
+void ddiaex_libstr(int kmax, double alpha, struct d_strmat *sA, int ai, int aj, struct d_strvec *sx, int xi)
+	{
+	const int bs = 4;
+	int sda = sA->cn;
+	double *pA = sA->pA + ai/bs*bs*sda + ai%bs + aj*bs;
+	double *x = sx->pa + xi;
+	int offsetA = ai%bs;
+
+	int kna = (bs-offsetA%bs)%bs;
+	kna = kmax<kna ? kmax : kna;
+
+	int jj, ll;
+
+	if(kna>0)
+		{
+		for(ll=0; ll<kna; ll++)
+			{
+			x[ll] = alpha*pA[ll+bs*ll];
+			}
+		pA += kna + bs*(sda-1) + kna*bs;
+		x  += kna;
+		kmax -= kna;
+		}
+	for(jj=0; jj<kmax-3; jj+=4)
+		{
+		x[jj+0] = alpha*pA[jj*sda+(jj+0)*bs+0];
+		x[jj+1] = alpha*pA[jj*sda+(jj+1)*bs+1];
+		x[jj+2] = alpha*pA[jj*sda+(jj+2)*bs+2];
+		x[jj+3] = alpha*pA[jj*sda+(jj+3)*bs+3];
+		}
+	for(ll=0; ll<kmax-jj; ll++)
+		{
+		x[jj+ll] = alpha*pA[jj*sda+(jj+ll)*bs+ll];
+		}
+	return;
+	}
+
+
+
+// extract the diagonal of a strmat to a strvec, sparse formulation
+void ddiaex_sp_libstr(int kmax, double alpha, int *idx, struct d_strmat *sD, int di, int dj, struct d_strvec *sx, int xi)
+	{
+	const int bs = 4;
+	double *x = sx->pa + xi;
+	int sdd = sD->cn;
+	double *pD = sD->pA;
+	int ii, jj;
+	for(jj=0; jj<kmax; jj++)
+		{
+		ii = idx[jj];
+		x[jj] = alpha * pD[(ii+di)/bs*bs*sdd+(ii+di)%bs+(ii+dj)*bs];
+		}
+	return;
+	}
+
+
+
+// add a vector to diagonal
+void ddiaad_libstr(int kmax, double alpha, struct d_strvec *sx, int xi, struct d_strmat *sA, int ai, int aj)
+	{
+	const int bs = 4;
+	int sda = sA->cn;
+	double *pA = sA->pA + ai/bs*bs*sda + ai%bs + aj*bs;
+	double *x = sx->pa + xi;
+	int offsetA = ai%bs;
+
+	int kna = (bs-offsetA%bs)%bs;
+	kna = kmax<kna ? kmax : kna;
+
+	int jj, ll;
+
+	if(kna>0)
+		{
+		for(ll=0; ll<kna; ll++)
+			{
+			pA[ll+bs*ll] += alpha*x[ll];
+			}
+		pA += kna + bs*(sda-1) + kna*bs;
+		x  += kna;
+		kmax -= kna;
+		}
+	for(jj=0; jj<kmax-3; jj+=4)
+		{
+		pA[jj*sda+(jj+0)*bs+0] += alpha*x[jj+0];
+		pA[jj*sda+(jj+1)*bs+1] += alpha*x[jj+1];
+		pA[jj*sda+(jj+2)*bs+2] += alpha*x[jj+2];
+		pA[jj*sda+(jj+3)*bs+3] += alpha*x[jj+3];
+		}
+	for(ll=0; ll<kmax-jj; ll++)
+		{
+		pA[jj*sda+(jj+ll)*bs+ll] += alpha*x[jj+ll];
+		}
+	return;
+	}
+
+
+
+// add scaled strvec to diagonal of strmat, sparse formulation
+void ddiaad_sp_libstr(int kmax, double alpha, struct d_strvec *sx, int xi, int *idx, struct d_strmat *sD, int di, int dj)
+	{
+	const int bs = 4;
+	double *x = sx->pa + xi;
+	int sdd = sD->cn;
+	double *pD = sD->pA;
+	int ii, jj;
+	for(jj=0; jj<kmax; jj++)
+		{
+		ii = idx[jj];
+		pD[(ii+di)/bs*bs*sdd+(ii+di)%bs+(ii+dj)*bs] += alpha * x[jj];
+		}
+	return;
+	}
+
+
+
+// add scaled strvec to another strvec and insert to diagonal of strmat, sparse formulation
+void ddiaadin_sp_libstr(int kmax, double alpha, struct d_strvec *sx, int xi, struct d_strvec *sy, int yi, int *idx, struct d_strmat *sD, int di, int dj)
+	{
+	const int bs = 4;
+	double *x = sx->pa + xi;
+	double *y = sy->pa + yi;
+	int sdd = sD->cn;
+	double *pD = sD->pA;
+	int ii, jj;
+	for(jj=0; jj<kmax; jj++)
+		{
+		ii = idx[jj];
+		pD[(ii+di)/bs*bs*sdd+(ii+di)%bs+(ii+dj)*bs] = y[jj] + alpha * x[jj];
+		}
+	return;
+	}
+
+
+
+// add scaled strvec to row of strmat, sparse formulation
+void drowad_sp_libstr(int kmax, double alpha, struct d_strvec *sx, int xi, int *idx, struct d_strmat *sD, int di, int dj)
+	{
+	const int bs = 4;
+	double *x = sx->pa + xi;
+	int sdd = sD->cn;
+	double *pD = sD->pA + di/bs*bs*sdd + di%bs + dj*bs;
+	drowad_libsp(kmax, idx, alpha, x, pD);
+	return;
+	}
+
+
+
+void dvecad_sp_libstr(int m, double alpha, struct d_strvec *sx, int xi, int *idx, struct d_strvec *sz, int zi)
+	{
+	double *x = sx->pa + xi;
+	double *z = sz->pa + zi;
+	int ii;
+	for(ii=0; ii<m; ii++)
+		z[idx[ii]] += alpha * x[ii];
+	return;
+	}
+
+
+
+void dvecin_sp_libstr(int m, double alpha, struct d_strvec *sx, int xi, int *idx, struct d_strvec *sz, int zi)
+	{
+	double *x = sx->pa + xi;
+	double *z = sz->pa + zi;
+	int ii;
+	for(ii=0; ii<m; ii++)
+		z[idx[ii]] = alpha * x[ii];
+	return;
+	}
+
+
+
+void dvecex_sp_libstr(int m, double alpha, int *idx, struct d_strvec *sx, int xi, struct d_strvec *sz, int zi)
+	{
+	double *x = sx->pa + xi;
+	double *z = sz->pa + zi;
+	int ii;
+	for(ii=0; ii<m; ii++)
+		z[ii] = alpha * x[idx[ii]];
+	return;
+	}
+
+
+
+void dveccl_libstr(int m, struct d_strvec *sxm, int xim, struct d_strvec *sx, int xi, struct d_strvec *sxp, int xip, struct d_strvec *sz, int zi)
+	{
+
+	double *xm = sxm->pa + xim;
+	double *x  = sx->pa + xi;
+	double *xp = sxp->pa + xip;
+	double *z  = sz->pa + zi;
+
+	int ii;
+
+#if defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE)
+	double d0;
+
+	__m256d
+		xm0, x0, xp0, z0, tmp0, tmp1, ones, mones, mask1, mask2;
+
+	ones = _mm256_set_pd( 1.0, 1.0, 1.0, 1.0 );
+	mones = _mm256_set_pd( -1.0, -1.0, -1.0, -1.0 );
+	mask1 = _mm256_set_pd( 3.5, 2.5, 1.5, 0.5 );
+
+	for(ii=0; ii<m-3; ii+=4)
+		{
+		x0  = _mm256_loadu_pd( &x[ii] );
+		xp0 = _mm256_loadu_pd( &xp[ii] );
+		xm0 = _mm256_loadu_pd( &xm[ii] );
+		tmp0 = _mm256_cmp_pd( xp0, x0, 0x2 );
+		tmp1 = _mm256_cmp_pd( x0, xm0, 0x2 );
+		z0 = _mm256_blendv_pd( x0, xp0, tmp0 );
+		z0 = _mm256_blendv_pd( z0, xm0, tmp1 );
+		_mm256_storeu_pd( &z[ii], z0 );
+		}
+	if(ii<m)
+		{
+		d0 = (double) m-ii;
+		mask2 = _mm256_broadcast_sd( &d0 );
+		mask2 = _mm256_sub_pd( mask1, mask2 );
+		x0  = _mm256_loadu_pd( &x[ii] );
+		xp0 = _mm256_loadu_pd( &xp[ii] );
+		xm0 = _mm256_loadu_pd( &xm[ii] );
+		tmp0 = _mm256_cmp_pd( xp0, x0, 0x2 );
+		tmp1 = _mm256_cmp_pd( x0, xm0, 0x2 );
+		z0 = _mm256_blendv_pd( x0, xp0, tmp0 );
+		z0 = _mm256_blendv_pd( z0, xm0, tmp1 );
+		_mm256_maskstore_pd( &z[ii], _mm256_castpd_si256( mask2 ), z0 );
+		}
+#else
+	for(ii=0; ii<m; ii++)
+		{
+		if(x[ii]>=xp[ii])
+			{
+			z[ii] = xp[ii];
+			}
+		else if(x[ii]<=xm[ii])
+			{
+			z[ii] = xm[ii];
+			}
+		else
+			{
+			z[ii] = x[ii];
+			}
+		}
+#endif
+
+	return;
+
+	}
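+
+// (the vector path implements the same clip as the scalar fallback: the LE
+// compares produce per-lane masks and two blends select xp, xm or x; for
+// the tail, subtracting the remaining count m-ii from {0.5,1.5,2.5,3.5}
+// leaves exactly the first m-ii lanes negative, and maskstore writes only
+// the lanes whose sign bit is set)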
+
+
+
+void dveccl_mask_libstr(int m, struct d_strvec *sxm, int xim, struct d_strvec *sx, int xi, struct d_strvec *sxp, int xip, struct d_strvec *sz, int zi, struct d_strvec *sm, int mi)
+	{
+
+	double *xm = sxm->pa + xim;
+	double *x  = sx->pa + xi;
+	double *xp = sxp->pa + xip;
+	double *z  = sz->pa + zi;
+	double *mask  = sm->pa + mi;
+
+	int ii;
+
+#if defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE)
+	double d0;
+
+	__m256d
+		xm0, x0, xp0, z0, mask0, tmp0, tmp1, ones, mones, mask1, mask2;
+
+	ones = _mm256_set_pd( 1.0, 1.0, 1.0, 1.0 );
+	mones = _mm256_set_pd( -1.0, -1.0, -1.0, -1.0 );
+	mask1 = _mm256_set_pd( 3.5, 2.5, 1.5, 0.5 );
+
+	for(ii=0; ii<m-3; ii+=4)
+		{
+		mask0 = _mm256_setzero_pd();
+		x0  = _mm256_loadu_pd( &x[ii] );
+		xp0 = _mm256_loadu_pd( &xp[ii] );
+		xm0 = _mm256_loadu_pd( &xm[ii] );
+		tmp0 = _mm256_cmp_pd( xp0, x0, 0x2 );
+		tmp1 = _mm256_cmp_pd( x0, xm0, 0x2 );
+		z0 = _mm256_blendv_pd( x0, xp0, tmp0 );
+		z0 = _mm256_blendv_pd( z0, xm0, tmp1 );
+		mask0 = _mm256_blendv_pd( mask0, ones, tmp0 );
+		mask0 = _mm256_blendv_pd( mask0, mones, tmp1 );
+		_mm256_storeu_pd( &z[ii], z0 );
+		_mm256_storeu_pd( &mask[ii], mask0 );
+		}
+	if(ii<m)
+		{
+		d0 = (double) m-ii;
+		mask2 = _mm256_broadcast_sd( &d0 );
+		mask2 = _mm256_sub_pd( mask1, mask2 );
+		mask0 = _mm256_setzero_pd();
+		x0  = _mm256_loadu_pd( &x[ii] );
+		xp0 = _mm256_loadu_pd( &xp[ii] );
+		xm0 = _mm256_loadu_pd( &xm[ii] );
+		tmp0 = _mm256_cmp_pd( xp0, x0, 0x2 );
+		tmp1 = _mm256_cmp_pd( x0, xm0, 0x2 );
+		z0 = _mm256_blendv_pd( x0, xp0, tmp0 );
+		z0 = _mm256_blendv_pd( z0, xm0, tmp1 );
+		mask0 = _mm256_blendv_pd( mask0, ones, tmp0 );
+		mask0 = _mm256_blendv_pd( mask0, mones, tmp1 );
+		_mm256_maskstore_pd( &z[ii], _mm256_castpd_si256( mask2 ), z0 );
+		_mm256_maskstore_pd( &mask[ii], _mm256_castpd_si256( mask2 ), mask0 );
+		}
+#else
+	for(ii=0; ii<m; ii++)
+		{
+		if(x[ii]>=xp[ii])
+			{
+			z[ii] = xp[ii];
+			mask[ii] = 1.0;
+			}
+		else if(x[ii]<=xm[ii])
+			{
+			z[ii] = xm[ii];
+			mask[ii] = -1.0;
+			}
+		else
+			{
+			z[ii] = x[ii];
+			mask[ii] = 0.0;
+			}
+		}
+#endif
+
+	return;
+
+	}
+
+
+
+void dvecze_libstr(int m, struct d_strvec *sm, int mi, struct d_strvec *sv, int vi, struct d_strvec *se, int ei)
+	{
+	double *mask = sm->pa + mi;
+	double *v = sv->pa + vi;
+	double *e = se->pa + ei;
+
+	int ii;
+
+#if defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE)
+	double d0;
+
+	__m256d
+		mask0, mask1, mask2, mask3, fives, zeros, e0, v0;
+
+	fives = _mm256_set_pd( 0.5, 0.5, 0.5, 0.5 );
+	zeros = _mm256_setzero_pd();
+	mask3 = _mm256_set_pd( 3.5, 2.5, 1.5, 0.5 );
+
+	for(ii=0; ii<m-3; ii+=4)
+		{
+		v0 = _mm256_loadu_pd( &v[ii] );
+		mask0 = _mm256_loadu_pd( &mask[ii] );
+		mask1 = mask0;
+		mask0 = _mm256_sub_pd( mask0, fives);
+		mask1 = _mm256_add_pd( mask1, fives);
+		mask0 = _mm256_xor_pd( mask0, mask1);
+		e0 = _mm256_blendv_pd( zeros, v0, mask0 );
+		_mm256_storeu_pd( &e[ii], e0 );
+		}
+	if(ii<m)
+		{
+		d0 = (double) m-ii;
+		mask2 = _mm256_broadcast_sd( &d0 );
+		mask2 = _mm256_sub_pd( mask3, mask2 );
+		v0 = _mm256_loadu_pd( &v[ii] );
+		mask0 = _mm256_loadu_pd( &mask[ii] );
+		mask1 = mask0;
+		mask0 = _mm256_sub_pd( mask0, fives);
+		mask1 = _mm256_add_pd( mask1, fives);
+		mask0 = _mm256_xor_pd( mask0, mask1);
+		e0 = _mm256_blendv_pd( zeros, v0, mask0 );
+		_mm256_maskstore_pd( &e[ii], _mm256_castpd_si256( mask2 ), e0 );
+		}
+#else
+	for(ii=0; ii<m; ii++)
+		{
+		if(mask[ii]==0)
+			{
+			e[ii] = v[ii];
+			}
+		else
+			{
+			e[ii] = 0;
+			}
+		}
+#endif
+
+	}
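+
+// (the mask entries are -1.0, 0.0 or 1.0 as written by dveccl_mask_libstr;
+// mask-0.5 and mask+0.5 differ in sign bit only where mask is 0, so their
+// xor sets the sign bit exactly in the mask==0 lanes, which blendv then
+// fills from v while all other lanes are zeroed)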
+
+
+
+void dvecnrm_inf_libstr(int m, struct d_strvec *sx, int xi, double *ptr_norm)
+	{
+	int ii;
+	double *x = sx->pa + xi;
+	double norm = 0.0;
+	for(ii=0; ii<m; ii++)
+		norm = fmax(norm, fabs(x[ii]));
+	*ptr_norm = norm;
+	return;
+	}
+
+
+
+#else
+
+#error : wrong LA choice
+
+#endif
diff --git a/auxiliary/i_aux_ext_dep_lib.c b/auxiliary/i_aux_ext_dep_lib.c
new file mode 100644
index 0000000..1ca2292
--- /dev/null
+++ b/auxiliary/i_aux_ext_dep_lib.c
@@ -0,0 +1,111 @@
+/**************************************************************************************************
+*                                                                                                 *
+* This file is part of BLASFEO.                                                                   *
+*                                                                                                 *
+* BLASFEO -- BLAS For Embedded Optimization.                                                      *
+* Copyright (C) 2016-2017 by Gianluca Frison.                                                     *
+* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl.              *
+* All rights reserved.                                                                            *
+*                                                                                                 *
+* HPMPC is free software; you can redistribute it and/or                                          *
+* modify it under the terms of the GNU Lesser General Public                                      *
+* License as published by the Free Software Foundation; either                                    *
+* version 2.1 of the License, or (at your option) any later version.                              *
+*                                                                                                 *
+* HPMPC is distributed in the hope that it will be useful,                                        *
+* but WITHOUT ANY WARRANTY; without even the implied warranty of                                  *
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.                                            *
+* See the GNU Lesser General Public License for more details.                                     *
+*                                                                                                 *
+* You should have received a copy of the GNU Lesser General Public                                *
+* License along with HPMPC; if not, write to the Free Software                                    *
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA                  *
+*                                                                                                 *
+* Author: Gianluca Frison, giaf (at) dtu.dk                                                       *
+*                          gianluca.frison (at) imtek.uni-freiburg.de                             *
+*                                                                                                 *
+**************************************************************************************************/
+
+#include <stdlib.h>
+#include <stdio.h>
+#if 0
+#include <malloc.h>
+#endif
+
+#if ! defined(OS_WINDOWS)
+int posix_memalign(void **memptr, size_t alignment, size_t size);
+#endif
+
+
+
+/* creates a zero matrix */
+void int_zeros(int **pA, int row, int col)
+	{
+	void *temp = malloc((row*col)*sizeof(int));
+	*pA = temp;
+	int *A = *pA;
+	int i;
+	for(i=0; i<row*col; i++) A[i] = 0;
+	}
+
+
+
+/* creates a zero matrix aligned to a cache line */
+void int_zeros_align(int **pA, int row, int col)
+	{
+#if defined(OS_WINDOWS)
+	*pA = (int *) _aligned_malloc( (row*col)*sizeof(int), 64 );
+#else
+	void *temp;
+	int err = posix_memalign(&temp, 64, (row*col)*sizeof(int));
+	if(err!=0)
+		{
+		printf("Memory allocation error");
+		exit(1);
+		}
+	*pA = temp;
+#endif
+	int *A = *pA;
+	int i;
+	for(i=0; i<row*col; i++) A[i] = 0;
+	}
+
+
+
+/* frees matrix */
+void int_free(int *pA)
+	{
+	free( pA );
+	}
+
+
+
+/* frees aligned matrix */
+void int_free_align(int *pA)
+	{
+#if defined(OS_WINDOWS)
+	_aligned_free( pA );
+#else
+	free( pA );
+#endif
+	}
+
+
+
+/* prints a matrix in column-major format */
+void int_print_mat(int row, int col, int *A, int lda)
+	{
+	int i, j;
+	for(i=0; i<row; i++)
+		{
+		for(j=0; j<col; j++)
+			{
+			printf("%d ", A[i+lda*j]);
+			}
+		printf("\n");
+		}
+	printf("\n");
+	}	
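+/* usage sketch (illustrative only, not part of the library): allocate,
+   print and free an integer work matrix with the helpers above
+
+	int *ipiv;
+	int_zeros(&ipiv, 4, 1);        // 4x1 zero-initialized integer matrix
+	int_print_mat(4, 1, ipiv, 4);  // column-major print, lda = number of rows
+	int_free(ipiv);
+*/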
+
+
+
diff --git a/auxiliary/m_aux_lib.c b/auxiliary/m_aux_lib.c
new file mode 100644
index 0000000..30cb333
--- /dev/null
+++ b/auxiliary/m_aux_lib.c
@@ -0,0 +1,112 @@
+/**************************************************************************************************
+*                                                                                                 *
+* This file is part of BLASFEO.                                                                   *
+*                                                                                                 *
+* BLASFEO -- BLAS For Embedded Optimization.                                                      *
+* Copyright (C) 2016-2017 by Gianluca Frison.                                                     *
+* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl.              *
+* All rights reserved.                                                                            *
+*                                                                                                 *
+* HPMPC is free software; you can redistribute it and/or                                          *
+* modify it under the terms of the GNU Lesser General Public                                      *
+* License as published by the Free Software Foundation; either                                    *
+* version 2.1 of the License, or (at your option) any later version.                              *
+*                                                                                                 *
+* HPMPC is distributed in the hope that it will be useful,                                        *
+* but WITHOUT ANY WARRANTY; without even the implied warranty of                                  *
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.                                            *
+* See the GNU Lesser General Public License for more details.                                     *
+*                                                                                                 *
+* You should have received a copy of the GNU Lesser General Public                                *
+* License along with HPMPC; if not, write to the Free Software                                    *
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA                  *
+*                                                                                                 *
+* Author: Gianluca Frison, giaf (at) dtu.dk                                                       *
+*                          gianluca.frison (at) imtek.uni-freiburg.de                             *
+*                                                                                                 *
+**************************************************************************************************/
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <math.h>
+
+#include "../include/blasfeo_common.h"
+
+
+
+#if defined(LA_REFERENCE) | defined(LA_BLAS)
+
+
+
+
+void m_cvt_d2s_strvec(int m, struct d_strvec *vd, int vdi, struct s_strvec *vs, int vsi)
+	{
+	double *pd = vd->pa+vdi;
+	float *ps = vs->pa+vsi;
+	int ii;
+	for(ii=0; ii<m; ii++)
+		{
+		ps[ii] = (float) pd[ii];
+		}
+	return;
+	}
+
+
+
+void m_cvt_s2d_strvec(int m, struct s_strvec *vs, int vsi, struct d_strvec *vd, int vdi)
+	{
+	double *pd = vd->pa+vdi;
+	float *ps = vs->pa+vsi;
+	int ii;
+	for(ii=0; ii<m; ii++)
+		{
+		pd[ii] = (double) ps[ii];
+		}
+	return;
+	}
+
+
+
+void m_cvt_d2s_strmat(int m, int n, struct d_strmat *Md, int mid, int nid, struct s_strmat *Ms, int mis, int nis)
+	{
+	int lda = Md->m;
+	int ldb = Ms->m;
+	double *pA = Md->pA+mid+nid*lda;
+	float *pB = Ms->pA+mis+nis*ldb;
+	int ii, jj;
+	for(jj=0; jj<n; jj++)
+		{
+		for(ii=0; ii<m; ii++)
+			{
+			pB[ii+jj*ldb] = (float) pA[ii+jj*lda];
+			}
+		}
+	return;
+	}
+
+
+
+void m_cvt_s2d_strmat(int m, int n, struct s_strmat *Ms, int mis, int nis, struct d_strmat *Md, int mid, int nid)
+	{
+	int lda = Ms->m;
+	int ldb = Md->m;
+	float *pA = Ms->pA+mis+nis*lda;
+	double *pB = Md->pA+mid+nid*ldb;
+	int ii, jj;
+	for(jj=0; jj<n; jj++)
+		{
+		for(ii=0; ii<m; ii++)
+			{
+			pB[ii+jj*ldb] = (double) pA[ii+jj*lda];
+			}
+		}
+	return;
+	}
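+// usage sketch (illustrative only): round trip between double and single
+// precision vectors, assuming sd and ss were created with matching sizes m
+//
+//	m_cvt_d2s_strvec(m, &sd, 0, &ss, 0);  // double -> float, may lose precision
+//	m_cvt_s2d_strvec(m, &ss, 0, &sd, 0);  // float -> double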
+
+
+
+#else
+
+#error : wrong LA choice
+
+#endif
diff --git a/auxiliary/m_aux_lib44.c b/auxiliary/m_aux_lib44.c
new file mode 100644
index 0000000..a17d545
--- /dev/null
+++ b/auxiliary/m_aux_lib44.c
@@ -0,0 +1,93 @@
+/**************************************************************************************************
+*                                                                                                 *
+* This file is part of BLASFEO.                                                                   *
+*                                                                                                 *
+* BLASFEO -- BLAS For Embedded Optimization.                                                      *
+* Copyright (C) 2016-2017 by Gianluca Frison.                                                     *
+* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl.              *
+* All rights reserved.                                                                            *
+*                                                                                                 *
+* HPMPC is free software; you can redistribute it and/or                                          *
+* modify it under the terms of the GNU Lesser General Public                                      *
+* License as published by the Free Software Foundation; either                                    *
+* version 2.1 of the License, or (at your option) any later version.                              *
+*                                                                                                 *
+* HPMPC is distributed in the hope that it will be useful,                                        *
+* but WITHOUT ANY WARRANTY; without even the implied warranty of                                  *
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.                                            *
+* See the GNU Lesser General Public License for more details.                                     *
+*                                                                                                 *
+* You should have received a copy of the GNU Lesser General Public                                *
+* License along with HPMPC; if not, write to the Free Software                                    *
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA                  *
+*                                                                                                 *
+* Author: Gianluca Frison, giaf (at) dtu.dk                                                       *
+*                          gianluca.frison (at) imtek.uni-freiburg.de                             *
+*                                                                                                 *
+**************************************************************************************************/
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <math.h>
+
+#include "../include/blasfeo_common.h"
+
+
+
+#if defined(LA_HIGH_PERFORMANCE)
+
+
+
+
+void m_cvt_d2s_strvec(int m, struct d_strvec *vd, int vdi, struct s_strvec *vs, int vsi)
+	{
+	double *pd = vd->pa+vdi;
+	float *ps = vs->pa+vsi;
+	int ii;
+	for(ii=0; ii<m; ii++)
+		{
+		ps[ii] = (float) pd[ii];
+		}
+	return;
+	}
+
+
+
+void m_cvt_s2d_strvec(int m, struct s_strvec *vs, int vsi, struct d_strvec *vd, int vdi)
+	{
+	double *pd = vd->pa+vdi;
+	float *ps = vs->pa+vsi;
+	int ii;
+	for(ii=0; ii<m; ii++)
+		{
+		pd[ii] = (double) ps[ii];
+		}
+	return;
+	}
+
+
+
+void m_cvt_d2s_strmat(int m, int n, struct d_strmat *Md, int mid, int nid, struct s_strmat *Ms, int mis, int nis)
+	{
+	printf("\nm_cvt_d2s_strmat: feature not implmeneted yet\n\n");
+	exit(1);
+	return;
+	}
+
+
+
+void m_cvt_s2d_strmat(int m, int n, struct s_strmat *Ms, int mis, int nis, struct d_strmat *Md, int mid, int nid)
+	{
+	printf("\nm_cvt_s2d_strmat: feature not implmeneted yet\n\n");
+	exit(1);
+	return;
+	}
+
+
+
+#else
+
+#error : wrong LA choice
+
+#endif
+
diff --git a/auxiliary/m_aux_lib48.c b/auxiliary/m_aux_lib48.c
new file mode 100644
index 0000000..e9fdcd2
--- /dev/null
+++ b/auxiliary/m_aux_lib48.c
@@ -0,0 +1,153 @@
+/**************************************************************************************************
+*                                                                                                 *
+* This file is part of BLASFEO.                                                                   *
+*                                                                                                 *
+* BLASFEO -- BLAS For Embedded Optimization.                                                      *
+* Copyright (C) 2016-2017 by Gianluca Frison.                                                     *
+* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl.              *
+* All rights reserved.                                                                            *
+*                                                                                                 *
+* HPMPC is free software; you can redistribute it and/or                                          *
+* modify it under the terms of the GNU Lesser General Public                                      *
+* License as published by the Free Software Foundation; either                                    *
+* version 2.1 of the License, or (at your option) any later version.                              *
+*                                                                                                 *
+* HPMPC is distributed in the hope that it will be useful,                                        *
+* but WITHOUT ANY WARRANTY; without even the implied warranty of                                  *
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.                                            *
+* See the GNU Lesser General Public License for more details.                                     *
+*                                                                                                 *
+* You should have received a copy of the GNU Lesser General Public                                *
+* License along with HPMPC; if not, write to the Free Software                                    *
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA                  *
+*                                                                                                 *
+* Author: Gianluca Frison, giaf (at) dtu.dk                                                       *
+*                          gianluca.frison (at) imtek.uni-freiburg.de                             *
+*                                                                                                 *
+**************************************************************************************************/
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <math.h>
+
+#include "../include/blasfeo_common.h"
+
+
+
+#if defined(LA_HIGH_PERFORMANCE)
+
+
+
+
+void m_cvt_d2s_strvec(int m, struct d_strvec *vd, int vdi, struct s_strvec *vs, int vsi)
+	{
+	double *pd = vd->pa+vdi;
+	float *ps = vs->pa+vsi;
+	int ii;
+	for(ii=0; ii<m; ii++)
+		{
+		ps[ii] = (float) pd[ii];
+		}
+	return;
+	}
+
+
+
+void m_cvt_s2d_strvec(int m, struct s_strvec *vs, int vsi, struct d_strvec *vd, int vdi)
+	{
+	double *pd = vd->pa+vdi;
+	float *ps = vs->pa+vsi;
+	int ii;
+	for(ii=0; ii<m; ii++)
+		{
+		pd[ii] = (double) ps[ii];
+		}
+	return;
+	}
+
+
+
+void m_cvt_d2s_strmat(int m, int n, struct d_strmat *Md, int mid, int nid, struct s_strmat *Ms, int mis, int nis)
+	{
+	if(mid!=0 || mis!=0)
+		{
+		printf("\nm_cvt_d2s_strmat: feature not implemented yet: mid=%d, mis=%d\n\n", mid, mis);
+		exit(1);
+		}
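+	// doubles are stored in 4-row panels (psd) and floats in 8-row panels
+	// (pss), so each float panel gathers two consecutive double panels D0, D1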
+	const int psd = 4;
+	const int pss = 8;
+	const int sdd = Md->cn;
+	double *D0 = Md->pA + nid*psd;
+	double *D1;
+	const int sds = Ms->cn;
+	float *S = Ms->pA + nis*pss;
+	int ii, jj, ll;
+	for(ii=0; ii<m-7; ii+=8)
+		{
+		D1 = D0 + psd*sdd;
+		for(jj=0; jj<n; jj++)
+			{
+			S[0+jj*pss] = (float) D0[0+jj*psd];
+			S[1+jj*pss] = (float) D0[1+jj*psd];
+			S[2+jj*pss] = (float) D0[2+jj*psd];
+			S[3+jj*pss] = (float) D0[3+jj*psd];
+			S[4+jj*pss] = (float) D1[0+jj*psd];
+			S[5+jj*pss] = (float) D1[1+jj*psd];
+			S[6+jj*pss] = (float) D1[2+jj*psd];
+			S[7+jj*pss] = (float) D1[3+jj*psd];
+			}
+		D0 += 8*sdd;
+		S  += 8*sds;
+		}
+	if(m-ii>0)
+		{
+		if(m-ii<4)
+			{
+			for(jj=0; jj<n; jj++)
+				{
+				for(ll=0; ll<m-ii; ll++)
+					{
+					S[ll+jj*pss] = (float) D0[ll+jj*psd];
+					}
+				}
+			return;
+			}
+		else
+			{
+			D1 = D0 + psd*sdd;
+			for(jj=0; jj<n; jj++)
+				{
+				S[0+jj*pss] = (float) D0[0+jj*psd];
+				S[1+jj*pss] = (float) D0[1+jj*psd];
+				S[2+jj*pss] = (float) D0[2+jj*psd];
+				S[3+jj*pss] = (float) D0[3+jj*psd];
+				for(ll=0; ll<m-ii-4; ll++)
+					{
+					S[4+ll+jj*pss] = (float) D1[ll+jj*psd];
+					}
+				}
+			}
+		}
+	return;
+	}
+
+
+
+void m_cvt_s2d_strmat(int m, int n, struct s_strmat *Ms, int mis, int nis, struct d_strmat *Md, int mid, int nid)
+	{
+	printf("\nm_cvt_s2d_strmat: feature not implmeneted yet\n\n");
+	exit(1);
+	return;
+	}
+
+
+
+#else
+
+#error : wrong LA choice
+
+#endif
+
+
diff --git a/auxiliary/s_aux_ext_dep_lib.c b/auxiliary/s_aux_ext_dep_lib.c
new file mode 100644
index 0000000..85f7ebc
--- /dev/null
+++ b/auxiliary/s_aux_ext_dep_lib.c
@@ -0,0 +1,633 @@
+/**************************************************************************************************
+*                                                                                                 *
+* This file is part of BLASFEO.                                                                   *
+*                                                                                                 *
+* BLASFEO -- BLAS For Embedded Optimization.                                                      *
+* Copyright (C) 2016-2017 by Gianluca Frison.                                                     *
+* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl.              *
+* All rights reserved.                                                                            *
+*                                                                                                 *
+* HPMPC is free software; you can redistribute it and/or                                          *
+* modify it under the terms of the GNU Lesser General Public                                      *
+* License as published by the Free Software Foundation; either                                    *
+* version 2.1 of the License, or (at your option) any later version.                              *
+*                                                                                                 *
+* HPMPC is distributed in the hope that it will be useful,                                        *
+* but WITHOUT ANY WARRANTY; without even the implied warranty of                                  *
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.                                            *
+* See the GNU Lesser General Public License for more details.                                     *
+*                                                                                                 *
+* You should have received a copy of the GNU Lesser General Public                                *
+* License along with HPMPC; if not, write to the Free Software                                    *
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA                  *
+*                                                                                                 *
+* Author: Gianluca Frison, giaf (at) dtu.dk                                                       *
+*                          gianluca.frison (at) imtek.uni-freiburg.de                             *
+*                                                                                                 *
+**************************************************************************************************/
+
+#include <stdlib.h>
+#include <stdio.h>
+#if 0
+#include <malloc.h>
+#endif
+
+#include "../include/blasfeo_common.h"
+
+
+
+#if ! defined(OS_WINDOWS)
+int posix_memalign(void **memptr, size_t alignment, size_t size);
+#endif
+
+
+
+/* creates a zero matrix */
+void s_zeros(float **pA, int row, int col)
+	{
+	*pA = malloc((row*col)*sizeof(float));
+	float *A = *pA;
+	int i;
+	for(i=0; i<row*col; i++) A[i] = 0.0;
+	}
+
+
+
+/* creates a zero matrix aligned to a cache line */
+void s_zeros_align(float **pA, int row, int col)
+	{
+#if defined(OS_WINDOWS)
+	*pA = (float *) _aligned_malloc( (row*col)*sizeof(float), 64 );
+#else
+	void *temp;
+	int err = posix_memalign(&temp, 64, (row*col)*sizeof(float));
+	if(err!=0)
+		{
+		printf("Memory allocation error");
+		exit(1);
+		}
+	*pA = temp;
+#endif
+	float *A = *pA;
+	int i;
+	for(i=0; i<row*col; i++) A[i] = 0.0;
+	}
+
+
+
+/* frees matrix */
+void s_free(float *pA)
+	{
+	free( pA );
+	}
+
+
+
+/* frees aligned matrix */
+void s_free_align(float *pA)
+	{
+#if defined(OS_WINDOWS)
+	_aligned_free( pA );
+#else
+	free( pA );
+#endif
+	}
+
+
+
+/* prints a matrix in column-major format */
+void s_print_mat(int m, int n, float *A, int lda)
+	{
+	int i, j;
+	for(i=0; i<m; i++)
+		{
+		for(j=0; j<n; j++)
+			{
+			printf("%9.5f ", A[i+lda*j]);
+			}
+		printf("\n");
+		}
+	printf("\n");
+	}	
+
+
+
+/* prints the transpose of a matrix in column-major format */
+void s_print_tran_mat(int row, int col, float *A, int lda)
+	{
+	int i, j;
+	for(j=0; j<col; j++)
+		{
+		for(i=0; i<row; i++)
+			{
+			printf("%9.5f ", A[i+lda*j]);
+			}
+		printf("\n");
+		}
+	printf("\n");
+	}	
+
+
+
+/* prints a matrix in column-major format to a file */
+void s_print_to_file_mat(FILE *file, int row, int col, float *A, int lda)
+	{
+	int i, j;
+	for(i=0; i<row; i++)
+		{
+		for(j=0; j<col; j++)
+			{
+			fprintf(file, "%9.5f ", A[i+lda*j]);
+			}
+		fprintf(file, "\n");
+		}
+	fprintf(file, "\n");
+	}	
+
+
+
+/* prints the transpose of a matrix in column-major format to a file */
+void s_print_tran_to_file_mat(FILE *file, int row, int col, float *A, int lda)
+	{
+	int i, j;
+	for(j=0; j<col; j++)
+		{
+		for(i=0; i<row; i++)
+			{
+			fprintf(file, "%9.5f ", A[i+lda*j]);
+			}
+		fprintf(file, "\n");
+		}
+	fprintf(file, "\n");
+	}	
+
+
+
+/* prints a matrix in column-major format (exponential notation) */
+void s_print_e_mat(int m, int n, float *A, int lda)
+	{
+	int i, j;
+	for(i=0; i<m; i++)
+		{
+		for(j=0; j<n; j++)
+			{
+			printf("%e\t", A[i+lda*j]);
+			}
+		printf("\n");
+		}
+	printf("\n");
+	}	
+
+
+
+/* prints the transpose of a matrix in column-major format (exponential notation) */
+void s_print_e_tran_mat(int row, int col, float *A, int lda)
+	{
+	int i, j;
+	for(j=0; j<col; j++)
+		{
+		for(i=0; i<row; i++)
+			{
+			printf("%e\t", A[i+lda*j]);
+			}
+		printf("\n");
+		}
+	printf("\n");
+	}	
+
+
+
+/****************************
+* new interface
+****************************/
+
+#if defined(LA_HIGH_PERFORMANCE)
+
+
+
+#include "../include/blasfeo_block_size.h"
+
+
+
+// create a matrix structure for a matrix of size m*n by dynamically allocating the memory
+void s_allocate_strmat(int m, int n, struct s_strmat *sA)
+	{
+	const int bs = S_PS;
+	int nc = S_NC;
+	int al = bs*nc;
+	sA->m = m;
+	sA->n = n;
+	int pm = (m+bs-1)/bs*bs;
+	int cn = (n+nc-1)/nc*nc;
+	sA->pm = pm;
+	sA->cn = cn;
+	s_zeros_align(&(sA->pA), sA->pm, sA->cn);
+	int tmp = m<n ? (m+al-1)/al*al : (n+al-1)/al*al; // al(min(m,n)) // XXX max ???
+	s_zeros_align(&(sA->dA), tmp, 1);
+	sA->use_dA = 0;
+	sA->memory_size = (pm*cn+tmp)*sizeof(float);
+	return;
+	}
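+// usage sketch (illustrative only): dynamic allocation pairs with
+// s_free_strmat below; pm and cn round the sizes up to full panels
+//
+//	struct s_strmat sA;
+//	s_allocate_strmat(5, 3, &sA);    // zero-initialized, cache-aligned
+//	s_print_strmat(5, 3, &sA, 0, 0);
+//	s_free_strmat(&sA);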
+
+
+
+// free memory of a matrix structure
+void s_free_strmat(struct s_strmat *sA)
+	{
+	s_free_align(sA->pA);
+	s_free_align(sA->dA);
+	return;
+	}
+
+
+
+// create a vector structure for a vector of size m by dynamically allocating the memory
+void s_allocate_strvec(int m, struct s_strvec *sa)
+	{
+	const int bs = S_PS;
+//	int nc = S_NC;
+//	int al = bs*nc;
+	sa->m = m;
+	int pm = (m+bs-1)/bs*bs;
+	sa->pm = pm;
+	s_zeros_align(&(sa->pa), sa->pm, 1);
+	sa->memory_size = pm*sizeof(float);
+	return;
+	}
+
+
+
+// free memory of a vector structure
+void s_free_strvec(struct s_strvec *sa)
+	{
+	s_free_align(sa->pa);
+	return;
+	}
+
+
+
+// print a matrix structure
+void s_print_strmat(int m, int n, struct s_strmat *sA, int ai, int aj)
+	{
+	const int bs = S_PS;
+	int sda = sA->cn;
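+	// panel-major addressing: aj*bs steps across columns within a panel,
+	// ai/bs*bs*sda skips whole panels of sda columns, ai%bs selects the
+	// row inside the panel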
+	float *pA = sA->pA + aj*bs + ai/bs*bs*sda + ai%bs;
+	int ii, i, j, tmp;
+	ii = 0;
+	if(ai%bs>0)
+		{
+		tmp = bs-ai%bs;
+		tmp = m<tmp ? m : tmp;
+		for(i=0; i<tmp; i++)
+			{
+			for(j=0; j<n; j++)
+				{
+				printf("%9.5f ", pA[i+bs*j]);
+				}
+			printf("\n");
+			}
+		pA += tmp + bs*(sda-1);
+		m -= tmp;
+		}
+	for( ; ii<m-(bs-1); ii+=bs)
+		{
+		for(i=0; i<bs; i++)
+			{
+			for(j=0; j<n; j++)
+				{
+				printf("%9.5f ", pA[i+bs*j+sda*ii]);
+				}
+			printf("\n");
+			}
+		}
+	if(ii<m)
+		{
+		tmp = m-ii;
+		for(i=0; i<tmp; i++)
+			{
+			for(j=0; j<n; j++)
+				{
+				printf("%9.5f ", pA[i+bs*j+sda*ii]);
+				}
+			printf("\n");
+			}
+		}
+	printf("\n");
+	return;
+	}
+
+
+
+// print a vector structure
+void s_print_strvec(int m, struct s_strvec *sa, int ai)
+	{
+	float *pa = sa->pa + ai;
+	s_print_mat(m, 1, pa, m);
+	return;
+	}
+
+
+
+// print the transpose of a vector structure
+void s_print_tran_strvec(int m, struct s_strvec *sa, int ai)
+	{
+	float *pa = sa->pa + ai;
+	s_print_mat(1, m, pa, 1);
+	return;
+	}
+
+
+
+// print a matrix structure
+void s_print_to_file_strmat(FILE * file, int m, int n, struct s_strmat *sA, int ai, int aj)
+	{
+	const int bs = S_PS;
+	int sda = sA->cn;
+	float *pA = sA->pA + aj*bs + ai/bs*bs*sda + ai%bs;
+	int ii, i, j, tmp;
+	ii = 0;
+	if(ai%bs>0)
+		{
+		tmp = bs-ai%bs;
+		tmp = m<tmp ? m : tmp;
+		for(i=0; i<tmp; i++)
+			{
+			for(j=0; j<n; j++)
+				{
+				fprintf(file, "%9.5f ", pA[i+bs*j]);
+				}
+			fprintf(file, "\n");
+			}
+		pA += tmp + bs*(sda-1);
+		m -= tmp;
+		}
+	for( ; ii<m-(bs-1); ii+=bs)
+		{
+		for(i=0; i<bs; i++)
+			{
+			for(j=0; j<n; j++)
+				{
+				fprintf(file, "%9.5f ", pA[i+bs*j+sda*ii]);
+				}
+			fprintf(file, "\n");
+			}
+		}
+	if(ii<m)
+		{
+		tmp = m-ii;
+		for(i=0; i<tmp; i++)
+			{
+			for(j=0; j<n; j++)
+				{
+				fprintf(file, "%9.5f ", pA[i+bs*j+sda*ii]);
+				}
+			fprintf(file, "\n");
+			}
+		}
+	fprintf(file, "\n");
+	return;
+	}
+
+
+
+// print a vector structure
+void s_print_to_file_strvec(FILE * file, int m, struct s_strvec *sa, int ai)
+	{
+	float *pa = sa->pa + ai;
+	s_print_to_file_mat(file, m, 1, pa, m);
+	return;
+	}
+
+
+
+// print the transpose of a vector structure
+void s_print_tran_to_file_strvec(FILE * file, int m, struct s_strvec *sa, int ai)
+	{
+	float *pa = sa->pa + ai;
+	s_print_to_file_mat(file, 1, m, pa, 1);
+	return;
+	}
+
+
+
+// print a matrix structure
+void s_print_e_strmat(int m, int n, struct s_strmat *sA, int ai, int aj)
+	{
+	const int bs = S_PS;
+	int sda = sA->cn;
+	float *pA = sA->pA + aj*bs + ai/bs*bs*sda + ai%bs;
+	int ii, i, j, tmp;
+	ii = 0;
+	if(ai%bs>0)
+		{
+		tmp = bs-ai%bs;
+		tmp = m<tmp ? m : tmp;
+		for(i=0; i<tmp; i++)
+			{
+			for(j=0; j<n; j++)
+				{
+				printf("%e\t", pA[i+bs*j]);
+				}
+			printf("\n");
+			}
+		pA += tmp + bs*(sda-1);
+		m -= tmp;
+		}
+	for( ; ii<m-(bs-1); ii+=bs)
+		{
+		for(i=0; i<bs; i++)
+			{
+			for(j=0; j<n; j++)
+				{
+				printf("%e\t", pA[i+bs*j+sda*ii]);
+				}
+			printf("\n");
+			}
+		}
+	if(ii<m)
+		{
+		tmp = m-ii;
+		for(i=0; i<tmp; i++)
+			{
+			for(j=0; j<n; j++)
+				{
+				printf("%e\t", pA[i+bs*j+sda*ii]);
+				}
+			printf("\n");
+			}
+		}
+	printf("\n");
+	return;
+	}
+
+
+
+// print a vector structure
+void s_print_e_strvec(int m, struct s_strvec *sa, int ai)
+	{
+	float *pa = sa->pa + ai;
+	s_print_e_mat(m, 1, pa, m);
+	return;
+	}
+
+
+
+// print the transpose of a vector structure
+void s_print_e_tran_strvec(int m, struct s_strvec *sa, int ai)
+	{
+	float *pa = sa->pa + ai;
+	s_print_e_mat(1, m, pa, 1);
+	return;
+	}
+
+
+
+#elif defined(LA_BLAS) | defined(LA_REFERENCE)
+
+
+
+// create a matrix structure for a matrix of size m*n
+void s_allocate_strmat(int m, int n, struct s_strmat *sA)
+	{
+	sA->m = m;
+	sA->n = n;
+	s_zeros(&(sA->pA), sA->m, sA->n);
+	int tmp = m<n ? m : n; // al(min(m,n)) // XXX max ???
+	s_zeros(&(sA->dA), tmp, 1);
+	sA->memory_size = (m*n+tmp)*sizeof(float);
+	return;
+	}
+
+
+
+// free memory of a matrix structure
+void s_free_strmat(struct s_strmat *sA)
+	{
+	free(sA->pA);
+	free(sA->dA);
+	return;
+	}
+
+
+
+// create a vector structure for a vector of size m
+void s_allocate_strvec(int m, struct s_strvec *sa)
+	{
+	sa->m = m;
+	s_zeros(&(sa->pa), sa->m, 1);
+	sa->memory_size = m*sizeof(float);
+	return;
+	}
+
+
+
+// free memory of a vector structure
+void s_free_strvec(struct s_strvec *sa)
+	{
+	free(sa->pa);
+	return;
+	}
+
+
+
+// print a matrix structure
+void s_print_strmat(int m, int n, struct s_strmat *sA, int ai, int aj)
+	{
+	int lda = sA->m;
+	float *pA = sA->pA + ai + aj*lda;
+	s_print_mat(m, n, pA, lda);
+	return;
+	}
+
+
+
+// print a vector structure
+void s_print_strvec(int m, struct s_strvec *sa, int ai)
+	{
+	float *pa = sa->pa + ai;
+	s_print_mat(m, 1, pa, m);
+	return;
+	}
+
+
+
+// print the transpose of a vector structure
+void s_print_tran_strvec(int m, struct s_strvec *sa, int ai)
+	{
+	float *pa = sa->pa + ai;
+	s_print_mat(1, m, pa, 1);
+	return;
+	}
+
+
+
+// print a matrix structure
+void s_print_to_file_strmat(FILE *file, int m, int n, struct s_strmat *sA, int ai, int aj)
+	{
+	int lda = sA->m;
+	float *pA = sA->pA + ai + aj*lda;
+	s_print_to_file_mat(file, m, n, pA, lda);
+	return;
+	}
+
+
+
+// print a vector structure
+void s_print_to_file_strvec(FILE *file, int m, struct s_strvec *sa, int ai)
+	{
+	float *pa = sa->pa + ai;
+	s_print_to_file_mat(file, m, 1, pa, m);
+	return;
+	}
+
+
+
+// print the transpose of a vector structure
+void s_print_to_file_tran_strvec(FILE *file, int m, struct s_strvec *sa, int ai)
+	{
+	float *pa = sa->pa + ai;
+	s_print_to_file_mat(file, 1, m, pa, 1);
+	return;
+	}
+
+
+
+// print a matrix structure
+void s_print_e_strmat(int m, int n, struct s_strmat *sA, int ai, int aj)
+	{
+	int lda = sA->m;
+	float *pA = sA->pA + ai + aj*lda;
+	s_print_e_mat(m, n, pA, lda);
+	return;
+	}
+
+
+
+// print a vector structure
+void s_print_e_strvec(int m, struct s_strvec *sa, int ai)
+	{
+	float *pa = sa->pa + ai;
+	s_print_e_mat(m, 1, pa, m);
+	return;
+	}
+
+
+
+// print the transpose of a vector structure
+void s_print_e_tran_strvec(int m, struct s_strvec *sa, int ai)
+	{
+	float *pa = sa->pa + ai;
+	s_print_e_mat(1, m, pa, 1);
+	return;
+	}
+
+
+
+#else
+
+#error : wrong LA choice
+
+#endif
+
+
+
diff --git a/auxiliary/s_aux_lib.c b/auxiliary/s_aux_lib.c
new file mode 100644
index 0000000..978eb9a
--- /dev/null
+++ b/auxiliary/s_aux_lib.c
@@ -0,0 +1,956 @@
+/**************************************************************************************************
+*                                                                                                 *
+* This file is part of BLASFEO.                                                                   *
+*                                                                                                 *
+* BLASFEO -- BLAS For Embedded Optimization.                                                      *
+* Copyright (C) 2016-2017 by Gianluca Frison.                                                     *
+* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl.              *
+* All rights reserved.                                                                            *
+*                                                                                                 *
+* HPMPC is free software; you can redistribute it and/or                                          *
+* modify it under the terms of the GNU Lesser General Public                                      *
+* License as published by the Free Software Foundation; either                                    *
+* version 2.1 of the License, or (at your option) any later version.                              *
+*                                                                                                 *
+* HPMPC is distributed in the hope that it will be useful,                                        *
+* but WITHOUT ANY WARRANTY; without even the implied warranty of                                  *
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.                                            *
+* See the GNU Lesser General Public License for more details.                                     *
+*                                                                                                 *
+* You should have received a copy of the GNU Lesser General Public                                *
+* License along with HPMPC; if not, write to the Free Software                                    *
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA                  *
+*                                                                                                 *
+* Author: Gianluca Frison, giaf (at) dtu.dk                                                       *
+*                          gianluca.frison (at) imtek.uni-freiburg.de                             *
+*                                                                                                 *
+**************************************************************************************************/
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <math.h>
+
+#include "../include/blasfeo_common.h"
+
+
+
+#if defined(LA_REFERENCE) | defined(LA_BLAS)
+
+
+
+// return memory size (in bytes) needed for a strmat
+int s_size_strmat(int m, int n)
+	{
+	int tmp = m<n ? m : n; // al(min(m,n)) // XXX max ???
+	int size = (m*n+tmp)*sizeof(float);
+	return size;
+	}
+
+
+
+// return memory size (in bytes) needed for the diagonal of a strmat
+int s_size_diag_strmat(int m, int n)
+	{
+	int size = 0;
+	int tmp = m<n ? m : n; // al(min(m,n)) // XXX max ???
+	size = tmp*sizeof(float);
+	return size;
+	}
+
+
+
+// create a matrix structure for a matrix of size m*n by using memory passed by a pointer
+void s_create_strmat(int m, int n, struct s_strmat *sA, void *memory)
+	{
+	sA->m = m;
+	sA->n = n;
+	float *ptr = (float *) memory;
+	sA->pA = ptr;
+	ptr += m*n;
+	int tmp = m<n ? m : n; // al(min(m,n)) // XXX max ???
+	sA->dA = ptr;
+	ptr += tmp;
+	sA->use_dA = 0;
+	sA->memory_size = (m*n+tmp)*sizeof(float);
+	return;
+	}
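+// usage sketch (illustrative only): pairing s_size_strmat with
+// s_create_strmat to place a matrix structure in user-provided memory
+//
+//	void *mem = malloc(s_size_strmat(m, n));
+//	struct s_strmat sA;
+//	s_create_strmat(m, n, &sA, mem);
+//	// ... use sA ...
+//	free(mem);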
+
+
+
+// return memory size (in bytes) needed for a strvec
+int s_size_strvec(int m)
+	{
+	int size = m*sizeof(float);
+	return size;
+	}
+
+
+
+// create a vector structure for a vector of size m by using memory passed by a pointer
+void s_create_strvec(int m, struct s_strvec *sa, void *memory)
+	{
+	sa->m = m;
+	float *ptr = (float *) memory;
+	sa->pa = ptr;
+//	ptr += m * n;
+	sa->memory_size = m*sizeof(float);
+	return;
+	}
+
+
+
+// convert a matrix into a matrix structure
+void s_cvt_mat2strmat(int m, int n, float *A, int lda, struct s_strmat *sA, int ai, int aj)
+	{
+	int ii, jj;
+	int lda2 = sA->m;
+	float *pA = sA->pA + ai + aj*lda2;
+	for(jj=0; jj<n; jj++)
+		{
+		ii = 0;
+		for(; ii<m-3; ii+=4)
+			{
+			pA[ii+0+jj*lda2] = A[ii+0+jj*lda];
+			pA[ii+1+jj*lda2] = A[ii+1+jj*lda];
+			pA[ii+2+jj*lda2] = A[ii+2+jj*lda];
+			pA[ii+3+jj*lda2] = A[ii+3+jj*lda];
+			}
+		for(; ii<m; ii++)
+			{
+			pA[ii+0+jj*lda2] = A[ii+0+jj*lda];
+			}
+		}
+	return;
+	}
+
+
+
+// convert and transpose a matrix into a matrix structure
+void s_cvt_tran_mat2strmat(int m, int n, float *A, int lda, struct s_strmat *sA, int ai, int aj)
+	{
+	int ii, jj;
+	int lda2 = sA->m;
+	float *pA = sA->pA + ai + aj*lda2;
+	for(jj=0; jj<n; jj++)
+		{
+		ii = 0;
+		for(; ii<m-3; ii+=4)
+			{
+			pA[jj+(ii+0)*lda2] = A[ii+0+jj*lda];
+			pA[jj+(ii+1)*lda2] = A[ii+1+jj*lda];
+			pA[jj+(ii+2)*lda2] = A[ii+2+jj*lda];
+			pA[jj+(ii+3)*lda2] = A[ii+3+jj*lda];
+			}
+		for(; ii<m; ii++)
+			{
+			pA[jj+(ii+0)*lda2] = A[ii+0+jj*lda];
+			}
+		}
+	return;
+	}
+
+
+
+// convert a vector into a vector structure
+void s_cvt_vec2strvec(int m, float *a, struct s_strvec *sa, int ai)
+	{
+	float *pa = sa->pa + ai;
+	int ii;
+	for(ii=0; ii<m; ii++)
+		pa[ii] = a[ii];
+	return;
+	}
+
+
+
+// convert a matrix structure into a matrix
+void s_cvt_strmat2mat(int m, int n, struct s_strmat *sA, int ai, int aj, float *A, int lda)
+	{
+	int ii, jj;
+	int lda2 = sA->m;
+	float *pA = sA->pA + ai + aj*lda2;
+	for(jj=0; jj<n; jj++)
+		{
+		ii = 0;
+		for(; ii<m-3; ii+=4)
+			{
+			A[ii+0+jj*lda] = pA[ii+0+jj*lda2];
+			A[ii+1+jj*lda] = pA[ii+1+jj*lda2];
+			A[ii+2+jj*lda] = pA[ii+2+jj*lda2];
+			A[ii+3+jj*lda] = pA[ii+3+jj*lda2];
+			}
+		for(; ii<m; ii++)
+			{
+			A[ii+0+jj*lda] = pA[ii+0+jj*lda2];
+			}
+		}
+	return;
+	}
+
+
+
+// convert and transpose a matrix structure into a matrix
+void s_cvt_tran_strmat2mat(int m, int n, struct s_strmat *sA, int ai, int aj, float *A, int lda)
+	{
+	int ii, jj;
+	int lda2 = sA->m;
+	float *pA = sA->pA + ai + aj*lda2;
+	for(jj=0; jj<n; jj++)
+		{
+		ii = 0;
+		for(; ii<m-3; ii+=4)
+			{
+			A[jj+(ii+0)*lda] = pA[ii+0+jj*lda2];
+			A[jj+(ii+1)*lda] = pA[ii+1+jj*lda2];
+			A[jj+(ii+2)*lda] = pA[ii+2+jj*lda2];
+			A[jj+(ii+3)*lda] = pA[ii+3+jj*lda2];
+			}
+		for(; ii<m; ii++)
+			{
+			A[jj+(ii+0)*lda] = pA[ii+0+jj*lda2];
+			}
+		}
+	return;
+	}
+
+
+
+// convert a vector structure into a vector
+void s_cvt_strvec2vec(int m, struct s_strvec *sa, int ai, float *a)
+	{
+	float *pa = sa->pa + ai;
+	int ii;
+	for(ii=0; ii<m; ii++)
+		a[ii] = pa[ii];
+	return;
+	}
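+// usage sketch (illustrative only): round trip between a column-major array
+// and a matrix structure, assuming sA was created with size at least m*n
+//
+//	s_cvt_mat2strmat(m, n, A, lda, &sA, 0, 0);  // copy A into the structure
+//	s_cvt_strmat2mat(m, n, &sA, 0, 0, B, ldb); // copy it back out into B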
+
+
+
+// cast a matrix into a matrix structure
+void s_cast_mat2strmat(float *A, struct s_strmat *sA)
+	{
+	sA->pA = A;
+	return;
+	}
+
+
+
+// cast a matrix into the diagonal of a matrix structure
+void s_cast_diag_mat2strmat(float *dA, struct s_strmat *sA)
+	{
+	sA->dA = dA;
+	return;
+	}
+
+
+
+// cast a vector into a vector structure
+void s_cast_vec2vecmat(float *a, struct s_strvec *sa)
+	{
+	sa->pa = a;
+	return;
+	}
+
+
+
+// insert element into strmat
+void sgein1_libstr(float a, struct s_strmat *sA, int ai, int aj)
+	{
+	int lda = sA->m;
+	float *pA = sA->pA + ai + aj*lda;
+	pA[0] = a;
+	return;
+	}
+
+
+
+// extract element from strmat
+float sgeex1_libstr(struct s_strmat *sA, int ai, int aj)
+	{
+	int lda = sA->m;
+	float *pA = sA->pA + ai + aj*lda;
+	return pA[0];
+	}
+
+
+
+// insert element into strvec
+void svecin1_libstr(float a, struct s_strvec *sx, int xi)
+	{
+	float *x = sx->pa + xi;
+	x[0] = a;
+	return;
+	}
+
+
+
+// extract element from strvec
+float svecex1_libstr(struct s_strvec *sx, int xi)
+	{
+	float *x = sx->pa + xi;
+	return x[0];
+	}
+
+
+
+// set all elements of a strmat to a value
+void sgese_libstr(int m, int n, float alpha, struct s_strmat *sA, int ai, int aj)
+	{
+	int lda = sA->m;
+	float *pA = sA->pA + ai + aj*lda;
+	int ii, jj;
+	for(jj=0; jj<n; jj++)
+		{
+		for(ii=0; ii<m; ii++)
+			{
+			pA[ii+lda*jj] = alpha;
+			}
+		}
+	return;
+	}
+
+
+
+// set all elements of a strvec to a value
+void svecse_libstr(int m, float alpha, struct s_strvec *sx, int xi)
+	{
+	float *x = sx->pa + xi;
+	int ii;
+	for(ii=0; ii<m; ii++)
+		x[ii] = alpha;
+	return;
+	}
+
+
+
+// extract diagonal to vector
+void sdiaex_libstr(int kmax, float alpha, struct s_strmat *sA, int ai, int aj, struct s_strvec *sx, int xi)
+	{
+	int lda = sA->m;
+	float *pA = sA->pA + ai + aj*lda;
+	float *x = sx->pa + xi;
+	int ii;
+	for(ii=0; ii<kmax; ii++)
+		x[ii] = alpha*pA[ii*(lda+1)];
+	return;
+	}
+
+
+
+// insert a vector into diagonal
+void sdiain_libstr(int kmax, float alpha, struct s_strvec *sx, int xi, struct s_strmat *sA, int ai, int aj)
+	{
+	int lda = sA->m;
+	float *pA = sA->pA + ai + aj*lda;
+	float *x = sx->pa + xi;
+	int ii;
+	for(ii=0; ii<kmax; ii++)
+		pA[ii*(lda+1)] = alpha*x[ii];
+	return;
+	}
+
+
+
+// extract a row into a vector
+void srowex_libstr(int kmax, float alpha, struct s_strmat *sA, int ai, int aj, struct s_strvec *sx, int xi)
+	{
+	int lda = sA->m;
+	float *pA = sA->pA + ai + aj*lda;
+	float *x = sx->pa + xi;
+	int ii;
+	for(ii=0; ii<kmax; ii++)
+		x[ii] = alpha*pA[ii*lda];
+	return;
+	}
+
+
+
+// insert a vector into a row
+void srowin_libstr(int kmax, float alpha, struct s_strvec *sx, int xi, struct s_strmat *sA, int ai, int aj)
+	{
+	int lda = sA->m;
+	float *pA = sA->pA + ai + aj*lda;
+	float *x = sx->pa + xi;
+	int ii;
+	for(ii=0; ii<kmax; ii++)
+		pA[ii*lda] = alpha*x[ii];
+	return;
+	}
+
+
+
+// add a vector to a row
+void srowad_libstr(int kmax, float alpha, struct s_strvec *sx, int xi, struct s_strmat *sA, int ai, int aj)
+	{
+	int lda = sA->m;
+	float *pA = sA->pA + ai + aj*lda;
+	float *x = sx->pa + xi;
+	int ii;
+	for(ii=0; ii<kmax; ii++)
+		pA[ii*lda] += alpha*x[ii];
+	return;
+	}
+
+
+
+// swap two rows of a matrix struct
+void srowsw_libstr(int kmax, struct s_strmat *sA, int ai, int aj, struct s_strmat *sC, int ci, int cj)
+	{
+	int lda = sA->m;
+	float *pA = sA->pA + ai + aj*lda;
+	int ldc = sC->m;
+	float *pC = sC->pA + ci + cj*ldc;
+	int ii;
+	float tmp;
+	for(ii=0; ii<kmax; ii++)
+		{
+		tmp = pA[ii*lda];
+		pA[ii*lda] = pC[ii*ldc];
+		pC[ii*ldc] = tmp;
+		}
+	return;
+	}
+
+
+
+// permute the rows of a matrix struct
+void srowpe_libstr(int kmax, int *ipiv, struct s_strmat *sA)
+	{
+	int ii;
+	for(ii=0; ii<kmax; ii++)
+		{
+		if(ipiv[ii]!=ii)
+			srowsw_libstr(sA->n, sA, ii, 0, sA, ipiv[ii], 0);
+		}
+	return;
+	}
+
+
+
+// insert a vector into a column
+void scolin_libstr(int kmax, struct s_strvec *sx, int xi, struct s_strmat *sA, int ai, int aj)
+	{
+	int lda = sA->m;
+	float *pA = sA->pA + ai + aj*lda;
+	float *x = sx->pa + xi;
+	int ii;
+	for(ii=0; ii<kmax; ii++)
+		pA[ii] = x[ii];
+	return;
+	}
+
+
+
+// swap two cols of a matrix struct
+void scolsw_libstr(int kmax, struct s_strmat *sA, int ai, int aj, struct s_strmat *sC, int ci, int cj)
+	{
+	int lda = sA->m;
+	float *pA = sA->pA + ai + aj*lda;
+	int ldc = sC->m;
+	float *pC = sC->pA + ci + cj*ldc;
+	int ii;
+	float tmp;
+	for(ii=0; ii<kmax; ii++)
+		{
+		tmp = pA[ii];
+		pA[ii] = pC[ii];
+		pC[ii] = tmp;
+		}
+	return;
+	}
+
+
+
+// permute the cols of a matrix struct
+void scolpe_libstr(int kmax, int *ipiv, struct s_strmat *sA)
+	{
+	int ii;
+	for(ii=0; ii<kmax; ii++)
+		{
+		if(ipiv[ii]!=ii)
+			scolsw_libstr(sA->m, sA, 0, ii, sA, 0, ipiv[ii]);
+		}
+	return;
+	}
+
+
+
+// copy a generic strmat into a generic strmat
+void sgecp_libstr(int m, int n, struct s_strmat *sA, int ai, int aj, struct s_strmat *sC, int ci, int cj)
+	{
+	int lda = sA->m;
+	float *pA = sA->pA + ai + aj*lda;
+	int ldc = sC->m;
+	float *pC = sC->pA + ci + cj*ldc;
+	int ii, jj;
+	for(jj=0; jj<n; jj++)
+		{
+		ii = 0;
+		for(; ii<m-3; ii+=4)
+			{
+			pC[ii+0+jj*ldc] = pA[ii+0+jj*lda];
+			pC[ii+1+jj*ldc] = pA[ii+1+jj*lda];
+			pC[ii+2+jj*ldc] = pA[ii+2+jj*lda];
+			pC[ii+3+jj*ldc] = pA[ii+3+jj*lda];
+			}
+		for(; ii<m; ii++)
+			{
+			pC[ii+0+jj*ldc] = pA[ii+0+jj*lda];
+			}
+		}
+	return;
+	}
+
+
+
+// scale a generic strmat
+void sgesc_libstr(int m, int n, float alpha, struct s_strmat *sA, int ai, int aj)
+	{
+	int lda = sA->m;
+	float *pA = sA->pA + ai + aj*lda;
+	int ii, jj;
+	for(jj=0; jj<n; jj++)
+		{
+		ii = 0;
+		for(; ii<m-3; ii+=4)
+			{
+			pA[ii+0+jj*lda] *= alpha;
+			pA[ii+1+jj*lda] *= alpha;
+			pA[ii+2+jj*lda] *= alpha;
+			pA[ii+3+jj*lda] *= alpha;
+			}
+		for(; ii<m; ii++)
+			{
+			pA[ii+0+jj*lda] *= alpha;
+			}
+		}
+	return;
+	}
+
+
+
+// copy a strvec into a strvec
+void sveccp_libstr(int m, struct s_strvec *sa, int ai, struct s_strvec *sc, int ci)
+	{
+	float *pa = sa->pa + ai;
+	float *pc = sc->pa + ci;
+	int ii;
+	ii = 0;
+	for(; ii<m-3; ii+=4)
+		{
+		pc[ii+0] = pa[ii+0];
+		pc[ii+1] = pa[ii+1];
+		pc[ii+2] = pa[ii+2];
+		pc[ii+3] = pa[ii+3];
+		}
+	for(; ii<m; ii++)
+		{
+		pc[ii+0] = pa[ii+0];
+		}
+	return;
+	}
+
+
+
+// scale a strvec
+void svecsc_libstr(int m, float alpha, struct s_strvec *sa, int ai)
+	{
+	float *pa = sa->pa + ai;
+	int ii;
+	ii = 0;
+	for(; ii<m-3; ii+=4)
+		{
+		pa[ii+0] *= alpha;
+		pa[ii+1] *= alpha;
+		pa[ii+2] *= alpha;
+		pa[ii+3] *= alpha;
+		}
+	for(; ii<m; ii++)
+		{
+		pa[ii+0] *= alpha;
+		}
+	return;
+	}
+
+
+
+// copy a lower triangular strmat into a lower triangular strmat
+void strcp_l_libstr(int m, struct s_strmat *sA, int ai, int aj, struct s_strmat *sC, int ci, int cj)
+	{
+	int lda = sA->m;
+	float *pA = sA->pA + ai + aj*lda;
+	int ldc = sC->m;
+	float *pC = sC->pA + ci + cj*ldc;
+	int ii, jj;
+	for(jj=0; jj<m; jj++)
+		{
+		ii = jj;
+		for(; ii<m; ii++)
+			{
+			pC[ii+0+jj*ldc] = pA[ii+0+jj*lda];
+			}
+		}
+	return;
+	}
+
+
+
+// scale and add a generic strmat into a generic strmat
+void sgead_libstr(int m, int n, float alpha, struct s_strmat *sA, int ai, int aj, struct s_strmat *sC, int ci, int cj)
+	{
+	int lda = sA->m;
+	float *pA = sA->pA + ai + aj*lda;
+	int ldc = sC->m;
+	float *pC = sC->pA + ci + cj*ldc;
+	int ii, jj;
+	for(jj=0; jj<n; jj++)
+		{
+		ii = 0;
+		for(; ii<m-3; ii+=4)
+			{
+			pC[ii+0+jj*ldc] += alpha*pA[ii+0+jj*lda];
+			pC[ii+1+jj*ldc] += alpha*pA[ii+1+jj*lda];
+			pC[ii+2+jj*ldc] += alpha*pA[ii+2+jj*lda];
+			pC[ii+3+jj*ldc] += alpha*pA[ii+3+jj*lda];
+			}
+		for(; ii<m; ii++)
+			{
+			pC[ii+0+jj*ldc] += alpha*pA[ii+0+jj*lda];
+			}
+		}
+	return;
+	}
+
+
+
+// scales and adds a strvec into a strvec
+void svecad_libstr(int m, float alpha, struct s_strvec *sa, int ai, struct s_strvec *sc, int ci)
+	{
+	float *pa = sa->pa + ai;
+	float *pc = sc->pa + ci;
+	int ii;
+	ii = 0;
+	for(; ii<m-3; ii+=4)
+		{
+		pc[ii+0] += alpha*pa[ii+0];
+		pc[ii+1] += alpha*pa[ii+1];
+		pc[ii+2] += alpha*pa[ii+2];
+		pc[ii+3] += alpha*pa[ii+3];
+		}
+	for(; ii<m; ii++)
+		{
+		pc[ii+0] += alpha*pa[ii+0];
+		}
+	return;
+	}
+
+
+
+// copy and transpose a generic strmat into a generic strmat
+void sgetr_libstr(int m, int n, struct s_strmat *sA, int ai, int aj, struct s_strmat *sC, int ci, int cj)
+	{
+	int lda = sA->m;
+	float *pA = sA->pA + ai + aj*lda;
+	int ldc = sC->m;
+	float *pC = sC->pA + ci + cj*ldc;
+	int ii, jj;
+	for(jj=0; jj<n; jj++)
+		{
+		ii = 0;
+		for(; ii<m-3; ii+=4)
+			{
+			pC[jj+(ii+0)*ldc] = pA[ii+0+jj*lda];
+			pC[jj+(ii+1)*ldc] = pA[ii+1+jj*lda];
+			pC[jj+(ii+2)*ldc] = pA[ii+2+jj*lda];
+			pC[jj+(ii+3)*ldc] = pA[ii+3+jj*lda];
+			}
+		for(; ii<m; ii++)
+			{
+			pC[jj+(ii+0)*ldc] = pA[ii+0+jj*lda];
+			}
+		}
+	return;
+	}
+
+
+
+// copy and transpose a lower triangular strmat into an upper triangular strmat
+void strtr_l_libstr(int m, struct s_strmat *sA, int ai, int aj, struct s_strmat *sC, int ci, int cj)
+	{
+	int lda = sA->m;
+	float *pA = sA->pA + ai + aj*lda;
+	int ldc = sC->m;
+	float *pC = sC->pA + ci + cj*ldc;
+	int ii, jj;
+	for(jj=0; jj<m; jj++)
+		{
+		ii = jj;
+		for(; ii<m; ii++)
+			{
+			pC[jj+(ii+0)*ldc] = pA[ii+0+jj*lda];
+			}
+		}
+	return;
+	}
+
+
+
+// copy and transpose an upper triangular strmat into a lower triangular strmat
+void strtr_u_libstr(int m, struct s_strmat *sA, int ai, int aj, struct s_strmat *sC, int ci, int cj)
+	{
+	int lda = sA->m;
+	float *pA = sA->pA + ai + aj*lda;
+	int ldc = sC->m;
+	float *pC = sC->pA + ci + cj*ldc;
+	int ii, jj;
+	for(jj=0; jj<m; jj++)
+		{
+		ii = 0;
+		for(; ii<=jj; ii++)
+			{
+			pC[jj+(ii+0)*ldc] = pA[ii+0+jj*lda];
+			}
+		}
+	return;
+	}
+
+
+
+// insert a strvec to the diagonal of a strmat, sparse formulation
+void sdiain_sp_libstr(int kmax, float alpha, struct s_strvec *sx, int xi, int *idx, struct s_strmat *sD, int di, int dj)
+	{
+	float *x = sx->pa + xi;
+	int ldd = sD->m;
+	float *pD = sD->pA + di + dj*ldd;
+	int ii, jj;
+	for(jj=0; jj<kmax; jj++)
+		{
+		ii = idx[jj];
+		pD[ii*(ldd+1)] = alpha * x[jj];
+		}
+	return;
+	}
+
+
+
+// extract the diagonal of a strmat into a strvec, sparse formulation
+void sdiaex_sp_libstr(int kmax, float alpha, int *idx, struct s_strmat *sD, int di, int dj, struct s_strvec *sx, int xi)
+	{
+	float *x = sx->pa + xi;
+	int ldd = sD->m;
+	float *pD = sD->pA + di + dj*ldd;
+	int ii, jj;
+	for(jj=0; jj<kmax; jj++)
+		{
+		ii = idx[jj];
+		x[jj] = alpha * pD[ii*(ldd+1)];
+		}
+	return;
+	}
+
+
+
+// add a vector to diagonal
+void sdiaad_libstr(int kmax, float alpha, struct s_strvec *sx, int xi, struct s_strmat *sA, int ai, int aj)
+	{
+	int lda = sA->m;
+	float *pA = sA->pA + ai + aj*lda;
+	float *x = sx->pa + xi;
+	int ii;
+	for(ii=0; ii<kmax; ii++)
+		pA[ii*(lda+1)] += alpha*x[ii];
+	return;
+	}
+
+
+
+// add a scaled strvec to the diagonal of a strmat, sparse formulation
+void sdiaad_sp_libstr(int kmax, float alpha, struct s_strvec *sx, int xi, int *idx, struct s_strmat *sD, int di, int dj)
+	{
+	float *x = sx->pa + xi;
+	int ldd = sD->m;
+	float *pD = sD->pA + di + dj*ldd;
+	int ii, jj;
+	for(jj=0; jj<kmax; jj++)
+		{
+		ii = idx[jj];
+		pD[ii*(ldd+1)] += alpha * x[jj];
+		}
+	return;
+	}
+
+
+
+// add scaled strvec to another strvec and insert to diagonal of strmat, sparse formulation
+void sdiaadin_sp_libstr(int kmax, float alpha, struct s_strvec *sx, int xi, struct s_strvec *sy, int yi, int *idx, struct s_strmat *sD, int di, int dj)
+	{
+	float *x = sx->pa + xi;
+	float *y = sy->pa + yi;
+	int ldd = sD->m;
+	float *pD = sD->pA + di + dj*ldd;
+	int ii, jj;
+	for(jj=0; jj<kmax; jj++)
+		{
+		ii = idx[jj];
+		pD[ii*(ldd+1)] = y[jj] + alpha * x[jj];
+		}
+	return;
+	}
+
+
+
+// add scaled strvec to row of strmat, sparse formulation
+void srowad_sp_libstr(int kmax, float alpha, struct s_strvec *sx, int xi, int *idx, struct s_strmat *sD, int di, int dj)
+	{
+	float *x = sx->pa + xi;
+	int ldd = sD->m;
+	float *pD = sD->pA + di + dj*ldd;
+	int ii, jj;
+	for(jj=0; jj<kmax; jj++)
+		{
+		ii = idx[jj];
+		pD[ii*ldd] += alpha * x[jj];
+		}
+	return;
+	}
+
+
+
+
+void svecad_sp_libstr(int m, float alpha, struct s_strvec *sx, int xi, int *idx, struct s_strvec *sz, int zi)
+	{
+	float *x = sx->pa + xi;
+	float *z = sz->pa + zi;
+	int ii;
+	for(ii=0; ii<m; ii++)
+		z[idx[ii]] += alpha * x[ii];
+	return;
+	}
+
+
+
+void svecin_sp_libstr(int m, float alpha, struct s_strvec *sx, int xi, int *idx, struct s_strvec *sz, int zi)
+	{
+	float *x = sx->pa + xi;
+	float *z = sz->pa + zi;
+	int ii;
+	for(ii=0; ii<m; ii++)
+		z[idx[ii]] = alpha * x[ii];
+	return;
+	}
+
+
+
+void svecex_sp_libstr(int m, float alpha, int *idx, struct s_strvec *sx, int xi, struct s_strvec *sz, int zi)
+	{
+	float *x = sx->pa + xi;
+	float *z = sz->pa + zi;
+	int ii;
+	for(ii=0; ii<m; ii++)
+		z[ii] = alpha * x[idx[ii]];
+	return;
+	}
+
+
+// clip without mask return
+void sveccl_libstr(int m, struct s_strvec *sxm, int xim, struct s_strvec *sx, int xi, struct s_strvec *sxp, int xip, struct s_strvec *sz, int zi)
+	{
+	float *xm = sxm->pa + xim;
+	float *x  = sx->pa + xi;
+	float *xp = sxp->pa + xip;
+	float *z  = sz->pa + zi;
+	int ii;
+	for(ii=0; ii<m; ii++)
+		{
+		if(x[ii]>=xp[ii])
+			{
+			z[ii] = xp[ii];
+			}
+		else if(x[ii]<=xm[ii])
+			{
+			z[ii] = xm[ii];
+			}
+		else
+			{
+			z[ii] = x[ii];
+			}
+		}
+	return;
+	}
+
+
+
+// clip with mask return
+void sveccl_mask_libstr(int m, struct s_strvec *sxm, int xim, struct s_strvec *sx, int xi, struct s_strvec *sxp, int xip, struct s_strvec *sz, int zi, struct s_strvec *sm, int mi)
+	{
+	float *xm = sxm->pa + xim;
+	float *x  = sx->pa + xi;
+	float *xp = sxp->pa + xip;
+	float *z  = sz->pa + zi;
+	float *mask  = sm->pa + mi;
+	int ii;
+	for(ii=0; ii<m; ii++)
+		{
+		if(x[ii]>=xp[ii])
+			{
+			z[ii] = xp[ii];
+			mask[ii] = 1.0;
+			}
+		else if(x[ii]<=xm[ii])
+			{
+			z[ii] = xm[ii];
+			mask[ii] = -1.0;
+			}
+		else
+			{
+			z[ii] = x[ii];
+			mask[ii] = 0.0;
+			}
+		}
+	return;
+	}
+
+
+// zero out components using mask
+void svecze_libstr(int m, struct s_strvec *sm, int mi, struct s_strvec *sv, int vi, struct s_strvec *se, int ei)
+	{
+	float *mask = sm->pa + mi;
+	float *v = sv->pa + vi;
+	float *e = se->pa + ei;
+	int ii;
+	for(ii=0; ii<m; ii++)
+		{
+		if(mask[ii]==0)
+			{
+			e[ii] = v[ii];
+			}
+		else
+			{
+			e[ii] = 0;
+			}
+		}
+	return;
+	}
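+// usage sketch (illustrative only): clip x into [xm, xp] while recording
+// which bound is active in each component, then zero the clipped components
+// of another vector v
+//
+//	sveccl_mask_libstr(m, &sxm, 0, &sx, 0, &sxp, 0, &sz, 0, &smask, 0);
+//	svecze_libstr(m, &smask, 0, &sv, 0, &se, 0);  // e[ii]=v[ii] iff mask[ii]==0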
+
+
+
+void svecnrm_inf_libstr(int m, struct s_strvec *sx, int xi, float *ptr_norm)
+	{
+	int ii;
+	float *x = sx->pa + xi;
+	float norm = 0.0;
+	for(ii=0; ii<m; ii++)
+		norm = fmax(norm, fabs(x[ii]));
+	*ptr_norm = norm;
+	return;
+	}
+
+
+
+#else
+
+#error : wrong LA choice
+
+#endif
+
diff --git a/auxiliary/s_aux_lib4.c b/auxiliary/s_aux_lib4.c
new file mode 100644
index 0000000..12acc47
--- /dev/null
+++ b/auxiliary/s_aux_lib4.c
@@ -0,0 +1,3107 @@
+/**************************************************************************************************
+*                                                                                                 *
+* This file is part of BLASFEO.                                                                   *
+*                                                                                                 *
+* BLASFEO -- BLAS For Embedded Optimization.                                                      *
+* Copyright (C) 2016-2017 by Gianluca Frison.                                                     *
+* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl.              *
+* All rights reserved.                                                                            *
+*                                                                                                 *
+* HPMPC is free software; you can redistribute it and/or                                          *
+* modify it under the terms of the GNU Lesser General Public                                      *
+* License as published by the Free Software Foundation; either                                    *
+* version 2.1 of the License, or (at your option) any later version.                              *
+*                                                                                                 *
+* HPMPC is distributed in the hope that it will be useful,                                        *
+* but WITHOUT ANY WARRANTY; without even the implied warranty of                                  *
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.                                            *
+* See the GNU Lesser General Public License for more details.                                     *
+*                                                                                                 *
+* You should have received a copy of the GNU Lesser General Public                                *
+* License along with HPMPC; if not, write to the Free Software                                    *
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA                  *
+*                                                                                                 *
+* Author: Gianluca Frison, giaf (at) dtu.dk                                                       *
+*                          gianluca.frison (at) imtek.uni-freiburg.de                             *
+*                                                                                                 *
+**************************************************************************************************/
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <math.h>
+
+#include "../include/blasfeo_common.h"
+#include "../include/blasfeo_block_size.h"
+#include "../include/blasfeo_s_kernel.h"
+
+
+
+// scales and adds a strvec into a strvec
+void svecad_libstr(int m, float *alphap, struct s_strvec *sa, int ai, struct s_strvec *sc, int ci)
+	{
+	float alpha = alphap[0];
+	float *pa = sa->pa + ai;
+	float *pc = sc->pa + ci;
+	int ii;
+	ii = 0;
+	for(; ii<m-3; ii+=4)
+		{
+		pc[ii+0] += alpha*pa[ii+0];
+		pc[ii+1] += alpha*pa[ii+1];
+		pc[ii+2] += alpha*pa[ii+2];
+		pc[ii+3] += alpha*pa[ii+3];
+		}
+	for(; ii<m; ii++)
+		{
+		pc[ii+0] += alpha*pa[ii+0];
+		}
+	return;
+	}
+
+
+
+// transpose general matrix; m and n refer to the original matrix
+void sgetr_lib(int m, int n, float alpha, int offsetA, float *pA, int sda, int offsetC, float *pC, int sdc)
+	{
+
+/*
+
+m = 5
+n = 3
+offsetA = 1
+offsetC = 2
+
+A = 
+ x x x
+ -
+ x x x
+ x x x
+ x x x
+ x x x
+
+C =
+ x x x x x
+ x x x x x
+ -
+ x x x x x
+
+*/
+
+	if(m<=0 || n<=0)
+		return;
+
+	const int bs = 4;
+
+	int mna = (bs-offsetA%bs)%bs;
+	mna = m<mna ? m : mna;
+	int nna = (bs-offsetC%bs)%bs;
+	nna = n<nna ? n : nna;
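+	// mna/nna count the rows needed to reach the next 4-row panel boundary
+	// in A and C, so the main loop runs on fully aligned panels only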
+	
+	int ii;
+
+	ii = 0;
+
+	if(mna>0)
+		{
+		if(mna==1)
+			kernel_sgetr_1_lib4(0, n, nna, alpha, pA, pC, sdc);
+		else if(mna==2)
+			kernel_sgetr_2_lib4(0, n, nna, alpha, pA, pC, sdc);
+		else //if(mna==3)
+			kernel_sgetr_3_lib4(0, n, nna, alpha, pA, pC, sdc);
+		ii += mna;
+		pA += mna + bs*(sda-1);
+		pC += mna*bs;
+		}
+	for( ; ii<m-3; ii+=4)
+//	for( ; ii<m; ii+=4)
+		{
+		kernel_sgetr_4_lib4(0, n, nna, alpha, pA, pC, sdc);
+		pA += bs*sda;
+		pC += bs*bs;
+		}
+
+	// clean-up at the end using smaller kernels
+	if(ii==m)
+		return;
+	
+	if(m-ii==1)
+		kernel_sgetr_1_lib4(0, n, nna, alpha, pA, pC, sdc);
+	else if(m-ii==2)
+		kernel_sgetr_2_lib4(0, n, nna, alpha, pA, pC, sdc);
+	else if(m-ii==3)
+		kernel_sgetr_3_lib4(0, n, nna, alpha, pA, pC, sdc);
+		
+	return;
+	
+	}
+
+
+
+// transpose lower triangular matrix
+void strtr_l_lib(int m, float alpha, int offsetA, float *pA, int sda, int offsetC, float *pC, int sdc)
+	{
+
+/*
+
+A = 
+ x
+ x x
+ x x x
+ x x x x
+  
+ x x x x x
+ x x x x x x
+ x x x x x x x
+ x x x x x x x x
+
+C =
+ x x x x x x x x
+  
+   x x x x x x x
+     x x x x x x
+       x x x x x
+         x x x x
+
+           x x x
+             x x
+               x
+
+*/
+
+	int n = m;
+
+	if(m<=0 || n<=0)
+		return;
+
+	const int bs = 4;
+
+	int mna = (bs-offsetA%bs)%bs;
+	mna = m<mna ? m : mna;
+	int nna = (bs-offsetC%bs)%bs;
+	nna = n<nna ? n : nna;
+	
+	int ii;
+
+	ii = 0;
+
+	if(mna>0)
+		{
+		if(mna==1)
+			{
+			pC[0] = alpha * pA[0];
+			}
+		else if(mna==2)
+			{
+			if(nna==1)
+				{
+				pC[0+bs*0] = alpha * pA[0+bs*0];
+				pC[0+bs*1] = alpha * pA[1+bs*0];
+				pC[1+bs*(0+sdc)] = alpha * pA[1+bs*1];
+				}
+			else
+				{
+				pC[0+bs*0] = alpha * pA[0+bs*0];
+				pC[0+bs*1] = alpha * pA[1+bs*0];
+				pC[1+bs*1] = alpha * pA[1+bs*1];
+				}
+			}
+		else //if(mna==3)
+			{
+			if(nna==1)
+				{
+				pC[0+bs*0] = alpha * pA[0+bs*0];
+				pC[0+bs*1] = alpha * pA[1+bs*0];
+				pC[0+bs*2] = alpha * pA[2+bs*0];
+				pC[1+bs*(0+sdc)] = alpha * pA[1+bs*1];
+				pC[1+bs*(1+sdc)] = alpha * pA[2+bs*1];
+				pC[2+bs*(1+sdc)] = alpha * pA[2+bs*2];
+				}
+			else if(nna==2)
+				{
+				pC[0+bs*0] = alpha * pA[0+bs*0];
+				pC[0+bs*1] = alpha * pA[1+bs*0];
+				pC[0+bs*2] = alpha * pA[2+bs*0];
+				pC[1+bs*1] = alpha * pA[1+bs*1];
+				pC[1+bs*2] = alpha * pA[2+bs*1];
+				pC[2+bs*(1+sdc)] = alpha * pA[2+bs*2];
+				}
+			else
+				{
+				pC[0+bs*0] = alpha * pA[0+bs*0];
+				pC[0+bs*1] = alpha * pA[1+bs*0];
+				pC[0+bs*2] = alpha * pA[2+bs*0];
+				pC[1+bs*1] = alpha * pA[1+bs*1];
+				pC[1+bs*2] = alpha * pA[2+bs*1];
+				pC[2+bs*2] = alpha * pA[2+bs*2];
+				}
+			}
+		ii += mna;
+		pA += mna + bs*(sda-1);
+		pC += mna*bs;
+		}
+	for( ; ii<m-3; ii+=4)
+		{
+		kernel_sgetr_4_lib4(1, ii, nna, alpha, pA, pC, sdc);
+		pA += bs*sda;
+		pC += bs*bs;
+		}
+	
+	// clean-up at the end using smaller kernels
+	if(ii==m)
+		return;
+	
+	if(m-ii==1)
+		kernel_sgetr_1_lib4(1, ii, nna, alpha, pA, pC, sdc);
+	else if(m-ii==2)
+		kernel_sgetr_2_lib4(1, ii, nna, alpha, pA, pC, sdc);
+	else if(m-ii==3)
+		kernel_sgetr_3_lib4(1, ii, nna, alpha, pA, pC, sdc);
+		
+	return;
+
+	}
+
+
+
+// transpose an aligned upper triangular matrix into an aligned lower triangular matrix
+void strtr_u_lib(int m, float alpha, int offsetA, float *pA, int sda, int offsetC, float *pC, int sdc)
+	{
+
+/*
+
+A = 
+ x x x x x x x x
+   x x x x x x x
+
+     x x x x x x
+       x x x x x
+         x x x x
+           x x x
+             x x
+               x
+
+C = 
+ x
+
+ x x
+ x x x
+ x x x x
+ x x x x x
+ x x x x x x
+ x x x x x x x
+ x x x x x x x x
+
+*/
+
+	int n = m;
+
+	if(m<=0 || n<=0)
+		return;
+
+	const int bs = 4;
+
+	int mna = (bs-offsetA%bs)%bs;
+	mna = m<mna ? m : mna;
+	int nna = (bs-offsetC%bs)%bs;
+	nna = n<nna ? n : nna;
+	int tna = nna;
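+	// tna tracks the distance from the current position in C to the next
+	// panel boundary as pC advances along the diagonal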
+	
+	int ii;
+
+	ii = 0;
+
+	if(mna>0)
+		{
+		if(mna==1)
+			{
+			kernel_sgetr_1_lib4(0, n, nna, alpha, pA, pC, sdc);
+			if(nna!=1)
+				{
+//				pC[0+bs*0] = alpha * pA[0+bs*0];
+				pA += 1*bs;
+				pC += 1;
+				tna = (bs-(offsetC+1)%bs)%bs;
+				}
+			else //if(nna==1)
+				{
+//				pC[0+bs*0] = alpha * pA[0+bs*0];
+				pA += 1*bs;
+				pC += 1 + (sdc-1)*bs;
+				tna = 0; //(bs-(offsetC+1)%bs)%bs;
+				}
+//			kernel_sgetr_1_lib4(0, n-1, tna, alpha, pA, pC, sdc);
+			}
+		else if(mna==2)
+			{
+			if(nna==0 || nna==3)
+				{
+				pC[0+bs*0] = alpha * pA[0+bs*0];
+				pC[1+bs*0] = alpha * pA[0+bs*1];
+				pC[1+bs*1] = alpha * pA[1+bs*1];
+				pA += 2*bs;
+				pC += 2;
+				tna = (bs-(offsetC+2)%bs)%bs;
+				kernel_sgetr_2_lib4(0, n-2, tna, alpha, pA, pC, sdc);
+				}
+			else if(nna==1)
+				{
+				pC[0+bs*0] = alpha * pA[0+bs*0];
+				pA += 1*bs;
+				pC += 1 + (sdc-1)*bs;
+//				pC[0+bs*0] = alpha * pA[0+bs*0];
+//				pC[0+bs*1] = alpha * pA[1+bs*0];
+				kernel_sgetr_2_lib4(0, n-1, 0, alpha, pA, pC, sdc);
+				pA += 1*bs;
+				pC += 1;
+				tna = 3; //(bs-(offsetC+2)%bs)%bs;
+//				kernel_sgetr_2_lib4(0, n-2, tna, alpha, pA, pC, sdc);
+				}
+			else if(nna==2)
+				{
+				pC[0+bs*0] = alpha * pA[0+bs*0];
+				pC[1+bs*0] = alpha * pA[0+bs*1];
+				pC[1+bs*1] = alpha * pA[1+bs*1];
+				pA += 2*bs;
+				pC += 2 + (sdc-1)*bs;
+				tna = 0; //(bs-(offsetC+2)%bs)%bs;
+				kernel_sgetr_2_lib4(0, n-2, tna, alpha, pA, pC, sdc);
+				}
+			}
+		else //if(mna==3)
+			{
+			if(nna==0)
+				{
+				pC[0+bs*0] = alpha * pA[0+bs*0];
+				pC[1+bs*0] = alpha * pA[0+bs*1];
+				pC[1+bs*1] = alpha * pA[1+bs*1];
+				pC[2+bs*0] = alpha * pA[0+bs*2];
+				pC[2+bs*1] = alpha * pA[1+bs*2];
+				pC[2+bs*2] = alpha * pA[2+bs*2];
+				pA += 3*bs;
+				pC += 3;
+				tna = 1;
+				kernel_sgetr_3_lib4(0, n-3, tna, alpha, pA, pC, sdc);
+				}
+			else if(nna==1)
+				{
+				pC[0+bs*0] = alpha * pA[0+bs*0];
+				pA += bs;
+				pC += 1 + (sdc-1)*bs;
+				pC[0+bs*0] = alpha * pA[0+bs*0];
+				pC[0+bs*1] = alpha * pA[1+bs*0];
+				pC[1+bs*0] = alpha * pA[0+bs*1];
+				pC[1+bs*1] = alpha * pA[1+bs*1];
+				pC[1+bs*2] = alpha * pA[2+bs*1];
+				pA += 2*bs;
+				pC += 2;
+				tna = 2;
+				kernel_sgetr_3_lib4(0, n-3, tna, alpha, pA, pC, sdc);
+				}
+			else if(nna==2)
+				{
+				pC[0+bs*0] = alpha * pA[0+bs*0];
+				pC[1+bs*0] = alpha * pA[0+bs*1];
+				pC[1+bs*1] = alpha * pA[1+bs*1];
+				pA += 2*bs;
+				pC += 2 + (sdc-1)*bs;
+//				pC[0+bs*0] = alpha * pA[0+bs*0];
+//				pC[0+bs*1] = alpha * pA[1+bs*0];
+//				pC[0+bs*2] = alpha * pA[2+bs*0];
+				kernel_sgetr_3_lib4(0, n-2, 0, alpha, pA, pC, sdc);
+				pA += 1*bs;
+				pC += 1;
+				tna = 3;
+//				kernel_sgetr_3_lib4(0, n-3, tna, alpha, pA, pC, sdc);
+				}
+			else //if(nna==3)
+				{
+				pC[0+bs*0] = alpha * pA[0+bs*0];
+				pC[1+bs*0] = alpha * pA[0+bs*1];
+				pC[1+bs*1] = alpha * pA[1+bs*1];
+				pC[2+bs*0] = alpha * pA[0+bs*2];
+				pC[2+bs*1] = alpha * pA[1+bs*2];
+				pC[2+bs*2] = alpha * pA[2+bs*2];
+				pA += 3*bs;
+				pC += 3 + (sdc-1)*bs;
+				tna = 0;
+				kernel_sgetr_3_lib4(0, n-3, tna, alpha, pA, pC, sdc);
+				}
+			}
+		ii += mna;
+		pA += mna + bs*(sda-1);
+		pC += mna*bs;
+		}
+	for( ; ii<m-3; ii+=4)
+		{
+		if(tna==0)
+			{
+			pC[0+bs*0] = alpha * pA[0+bs*0];
+			pC[1+bs*0] = alpha * pA[0+bs*1];
+			pC[1+bs*1] = alpha * pA[1+bs*1];
+			pC[2+bs*0] = alpha * pA[0+bs*2];
+			pC[2+bs*1] = alpha * pA[1+bs*2];
+			pC[2+bs*2] = alpha * pA[2+bs*2];
+			pC[3+bs*0] = alpha * pA[0+bs*3];
+			pC[3+bs*1] = alpha * pA[1+bs*3];
+			pC[3+bs*2] = alpha * pA[2+bs*3];
+			pC[3+bs*3] = alpha * pA[3+bs*3];
+			pA += 4*bs;
+			pC += sdc*bs;
+			kernel_sgetr_4_lib4(0, n-ii-4, 0, alpha, pA, pC, sdc);
+			}
+		else if(tna==1)
+			{
+			pC[0+bs*0] = alpha * pA[0+bs*0];
+			pA += bs;
+			pC += 1 + (sdc-1)*bs;
+			pC[0+bs*0] = alpha * pA[0+bs*0];
+			pC[0+bs*1] = alpha * pA[1+bs*0];
+			pC[1+bs*0] = alpha * pA[0+bs*1];
+			pC[1+bs*1] = alpha * pA[1+bs*1];
+			pC[1+bs*2] = alpha * pA[2+bs*1];
+			pC[2+bs*0] = alpha * pA[0+bs*2];
+			pC[2+bs*1] = alpha * pA[1+bs*2];
+			pC[2+bs*2] = alpha * pA[2+bs*2];
+			pC[2+bs*3] = alpha * pA[3+bs*2];
+			pA += 3*bs;
+			pC += 3;
+			kernel_sgetr_4_lib4(0, n-ii-4, 1, alpha, pA, pC, sdc);
+			}
+		else if(tna==2)
+			{
+			pC[0+bs*0] = alpha * pA[0+bs*0];
+			pC[1+bs*0] = alpha * pA[0+bs*1];
+			pC[1+bs*1] = alpha * pA[1+bs*1];
+			pA += 2*bs;
+			pC += 2 + (sdc-1)*bs;
+			pC[0+bs*0] = alpha * pA[0+bs*0];
+			pC[0+bs*1] = alpha * pA[1+bs*0];
+			pC[0+bs*2] = alpha * pA[2+bs*0];
+			pC[1+bs*0] = alpha * pA[0+bs*1];
+			pC[1+bs*1] = alpha * pA[1+bs*1];
+			pC[1+bs*2] = alpha * pA[2+bs*1];
+			pC[1+bs*3] = alpha * pA[3+bs*1];
+			pA += 2*bs;
+			pC += 2;
+			kernel_sgetr_4_lib4(0, n-ii-4, 2, alpha, pA, pC, sdc);
+			}
+		else //if(tna==3)
+			{
+			pC[0+bs*0] = alpha * pA[0+bs*0];
+			pC[1+bs*0] = alpha * pA[0+bs*1];
+			pC[1+bs*1] = alpha * pA[1+bs*1];
+			pC[2+bs*0] = alpha * pA[0+bs*2];
+			pC[2+bs*1] = alpha * pA[1+bs*2];
+			pC[2+bs*2] = alpha * pA[2+bs*2];
+			pA += 3*bs;
+			pC += 3 + (sdc-1)*bs;
+			kernel_sgetr_4_lib4(0, n-ii-3, 0, alpha, pA, pC, sdc);
+//			pC[0+bs*0] = alpha * pA[0+bs*0];
+//			pC[0+bs*1] = alpha * pA[1+bs*0];
+//			pC[0+bs*2] = alpha * pA[2+bs*0];
+//			pC[0+bs*3] = alpha * pA[3+bs*0];
+			pA += bs;
+			pC += 1;
+//			kernel_sgetr_4_lib4(0, n-ii-4, tna, alpha, pA, pC, sdc);
+			}
+		pA += bs*sda;
+		pC += bs*bs;
+		}
+
+	// clean-up at the end
+	if(ii==m)
+		return;
+	
+	if(m-ii==1)
+		{
+		pC[0+bs*0] = alpha * pA[0+bs*0];
+		}
+	else if(m-ii==2)
+		{
+		if(tna!=1)
+			{
+			pC[0+bs*0] = alpha * pA[0+bs*0];
+			pC[1+bs*0] = alpha * pA[0+bs*1];
+			pC[1+bs*1] = alpha * pA[1+bs*1];
+			}
+		else //if(tna==1)
+			{
+			pC[0+bs*0] = alpha * pA[0+bs*0];
+			pA += bs;
+			pC += 1 + (sdc-1)*bs;
+			pC[0+bs*0] = alpha * pA[0+bs*0];
+			pC[0+bs*1] = alpha * pA[1+bs*0];
+			}
+		}
+	else if(m-ii==3)
+		{
+		if(tna==0 || tna==3)
+			{
+			pC[0+bs*0] = alpha * pA[0+bs*0];
+			pC[1+bs*0] = alpha * pA[0+bs*1];
+			pC[1+bs*1] = alpha * pA[1+bs*1];
+			pC[2+bs*0] = alpha * pA[0+bs*2];
+			pC[2+bs*1] = alpha * pA[1+bs*2];
+			pC[2+bs*2] = alpha * pA[2+bs*2];
+			}
+		else if(tna==1)
+			{
+			pC[0+bs*0] = alpha * pA[0+bs*0];
+			pA += bs;
+			pC += 1 + (sdc-1)*bs;
+			pC[0+bs*0] = alpha * pA[0+bs*0];
+			pC[0+bs*1] = alpha * pA[1+bs*0];
+			pC[1+bs*0] = alpha * pA[0+bs*1];
+			pC[1+bs*1] = alpha * pA[1+bs*1];
+			pC[1+bs*2] = alpha * pA[2+bs*1];
+			}
+		else //if(tna==2)
+			{
+			pC[0+bs*0] = alpha * pA[0+bs*0];
+			pC[1+bs*0] = alpha * pA[0+bs*1];
+			pC[1+bs*1] = alpha * pA[1+bs*1];
+			pA += 2*bs;
+			pC += 2 + (sdc-1)*bs;
+			pC[0+bs*0] = alpha * pA[0+bs*0];
+			pC[0+bs*1] = alpha * pA[1+bs*0];
+			pC[0+bs*2] = alpha * pA[2+bs*0];
+			}
+		}
+		
+	return;
+
+	}
+
+
+
+// regularize diagonal 
+void sdiareg_lib(int kmax, float reg, int offset, float *pD, int sdd)
+	{
+
+	const int bs = 4;
+
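+	// kna: diagonal elements from the given offset up to the next panel boundary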
+	int kna = (bs-offset%bs)%bs;
+	kna = kmax<kna ? kmax : kna;
+
+	int jj, ll;
+
+	if(kna>0)
+		{
+		for(ll=0; ll<kna; ll++)
+			{
+			pD[ll+bs*ll] += reg;
+			}
+		pD += kna + bs*(sdd-1) + kna*bs;
+		kmax -= kna;
+		}
+	for(jj=0; jj<kmax-3; jj+=4)
+		{
+		pD[jj*sdd+(jj+0)*bs+0] += reg;
+		pD[jj*sdd+(jj+1)*bs+1] += reg;
+		pD[jj*sdd+(jj+2)*bs+2] += reg;
+		pD[jj*sdd+(jj+3)*bs+3] += reg;
+		}
+	for(ll=0; ll<kmax-jj; ll++)
+		{
+		pD[jj*sdd+(jj+ll)*bs+ll] += reg;
+		}
+	
+	}
+
+
+
+// insert vector to diagonal 
+void sdiain_lib(int kmax, float alpha, float *x, int offset, float *pD, int sdd)
+	{
+
+	const int bs = 4;
+
+	int kna = (bs-offset%bs)%bs;
+	kna = kmax<kna ? kmax : kna;
+
+	int jj, ll;
+
+	if(kna>0)
+		{
+		for(ll=0; ll<kna; ll++)
+			{
+			pD[ll+bs*ll] = alpha*x[ll];
+			}
+		pD += kna + bs*(sdd-1) + kna*bs;
+		x  += kna;
+		kmax -= kna;
+		}
+	for(jj=0; jj<kmax-3; jj+=4)
+		{
+		pD[jj*sdd+(jj+0)*bs+0] = alpha*x[jj+0];
+		pD[jj*sdd+(jj+1)*bs+1] = alpha*x[jj+1];
+		pD[jj*sdd+(jj+2)*bs+2] = alpha*x[jj+2];
+		pD[jj*sdd+(jj+3)*bs+3] = alpha*x[jj+3];
+		}
+	for(ll=0; ll<kmax-jj; ll++)
+		{
+		pD[jj*sdd+(jj+ll)*bs+ll] = alpha*x[jj+ll];
+		}
+	
+	}
+
+
+
+// insert sqrt of vector to diagonal 
+void sdiain_sqrt_lib(int kmax, float *x, int offset, float *pD, int sdd)
+	{
+
+	const int bs = 4;
+
+	int kna = (bs-offset%bs)%bs;
+	kna = kmax<kna ? kmax : kna;
+
+	int jj, ll;
+
+	if(kna>0)
+		{
+		for(ll=0; ll<kna; ll++)
+			{
+			pD[ll+bs*ll] = sqrtf(x[ll]);
+			}
+		pD += kna + bs*(sdd-1) + kna*bs;
+		x  += kna;
+		kmax -= kna;
+		}
+	for(jj=0; jj<kmax-3; jj+=4)
+		{
+		pD[jj*sdd+(jj+0)*bs+0] = sqrtf(x[jj+0]);
+		pD[jj*sdd+(jj+1)*bs+1] = sqrtf(x[jj+1]);
+		pD[jj*sdd+(jj+2)*bs+2] = sqrtf(x[jj+2]);
+		pD[jj*sdd+(jj+3)*bs+3] = sqrtf(x[jj+3]);
+		}
+	for(ll=0; ll<kmax-jj; ll++)
+		{
+		pD[jj*sdd+(jj+ll)*bs+ll] = sqrtf(x[jj+ll]);
+		}
+	
+	}
+
+
+
+// extract diagonal to vector 
+void sdiaex_lib(int kmax, float alpha, int offset, float *pD, int sdd, float *x)
+	{
+
+	const int bs = 4;
+
+	int kna = (bs-offset%bs)%bs;
+	kna = kmax<kna ? kmax : kna;
+
+	int jj, ll;
+
+	if(kna>0)
+		{
+		for(ll=0; ll<kna; ll++)
+			{
+			x[ll] = alpha * pD[ll+bs*ll];
+			}
+		pD += kna + bs*(sdd-1) + kna*bs;
+		x  += kna;
+		kmax -= kna;
+		}
+	for(jj=0; jj<kmax-3; jj+=4)
+		{
+		x[jj+0] = alpha * pD[jj*sdd+(jj+0)*bs+0];
+		x[jj+1] = alpha * pD[jj*sdd+(jj+1)*bs+1];
+		x[jj+2] = alpha * pD[jj*sdd+(jj+2)*bs+2];
+		x[jj+3] = alpha * pD[jj*sdd+(jj+3)*bs+3];
+		}
+	for(ll=0; ll<kmax-jj; ll++)
+		{
+		x[jj+ll] = alpha * pD[jj*sdd+(jj+ll)*bs+ll];
+		}
+	
+	}
+
+
+
+// add scaled vector to diagonal 
+void sdiaad_lib(int kmax, float alpha, float *x, int offset, float *pD, int sdd)
+	{
+
+	const int bs = 4;
+
+	int kna = (bs-offset%bs)%bs;
+	kna = kmax<kna ? kmax : kna;
+
+	int jj, ll;
+
+	if(kna>0)
+		{
+		for(ll=0; ll<kna; ll++)
+			{
+			pD[ll+bs*ll] += alpha * x[ll];
+			}
+		pD += kna + bs*(sdd-1) + kna*bs;
+		x  += kna;
+		kmax -= kna;
+		}
+	for(jj=0; jj<kmax-3; jj+=4)
+		{
+		pD[jj*sdd+(jj+0)*bs+0] += alpha * x[jj+0];
+		pD[jj*sdd+(jj+1)*bs+1] += alpha * x[jj+1];
+		pD[jj*sdd+(jj+2)*bs+2] += alpha * x[jj+2];
+		pD[jj*sdd+(jj+3)*bs+3] += alpha * x[jj+3];
+		}
+	for(ll=0; ll<kmax-jj; ll++)
+		{
+		pD[jj*sdd+(jj+ll)*bs+ll] += alpha * x[jj+ll];
+		}
+	
+	}
+
+
+
+// insert vector to diagonal, sparse formulation 
+void sdiain_libsp(int kmax, int *idx, float alpha, float *x, float *pD, int sdd)
+	{
+
+	const int bs = 4;
+
+	int ii, jj;
+
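+	// idx[jj] is the zero-based diagonal index receiving x[jj]; element
+	// (ii,ii) of the panel-major matrix lives at ii/bs*bs*sdd + ii%bs + ii*bs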
+	for(jj=0; jj<kmax; jj++)
+		{
+		ii = idx[jj];
+		pD[ii/bs*bs*sdd+ii%bs+ii*bs] = alpha * x[jj];
+		}
+	
+	}
+
+
+
+// extract diagonal to vector, sparse formulation 
+void sdiaex_libsp(int kmax, int *idx, float alpha, float *pD, int sdd, float *x)
+	{
+
+	const int bs = 4;
+
+	int ii, jj;
+
+	for(jj=0; jj<kmax; jj++)
+		{
+		ii = idx[jj];
+		x[jj] = alpha * pD[ii/bs*bs*sdd+ii%bs+ii*bs];
+		}
+	
+	}
+
+
+
+// add scaled vector to diagonal, sparse formulation 
+void sdiaad_libsp(int kmax, int *idx, float alpha, float *x, float *pD, int sdd)
+	{
+
+	const int bs = 4;
+
+	int ii, jj;
+
+	for(jj=0; jj<kmax; jj++)
+		{
+		ii = idx[jj];
+		pD[ii/bs*bs*sdd+ii%bs+ii*bs] += alpha * x[jj];
+		}
+	
+	}
+
+
+
+// add scaled vector to another vector and insert to diagonal, sparse formulation 
+void sdiaadin_libsp(int kmax, int *idx, float alpha, float *x, float *y, float *pD, int sdd)
+	{
+
+	const int bs = 4;
+
+	int ii, jj;
+
+	for(jj=0; jj<kmax; jj++)
+		{
+		ii = idx[jj];
+		pD[ii/bs*bs*sdd+ii%bs+ii*bs] = y[jj] + alpha * x[jj];
+		}
+	
+	}
+
+
+
+// insert vector to row 
+void srowin_lib(int kmax, float alpha, float *x, float *pD)
+	{
+	
+	const int bs = 4;
+
+	int jj, ll;
+
+	for(jj=0; jj<kmax-3; jj+=4)
+		{
+		pD[(jj+0)*bs] = alpha*x[jj+0];
+		pD[(jj+1)*bs] = alpha*x[jj+1];
+		pD[(jj+2)*bs] = alpha*x[jj+2];
+		pD[(jj+3)*bs] = alpha*x[jj+3];
+		}
+	for(; jj<kmax; jj++)
+		{
+		pD[(jj)*bs] = alpha*x[jj];
+		}
+	
+	}
+
+
+
+// extract row to vector
+void srowex_lib(int kmax, float alpha, float *pD, float *x)
+	{
+	
+	const int bs = 4;
+
+	int jj, ll;
+
+	for(jj=0; jj<kmax-3; jj+=4)
+		{
+		x[jj+0] = alpha*pD[(jj+0)*bs];
+		x[jj+1] = alpha*pD[(jj+1)*bs];
+		x[jj+2] = alpha*pD[(jj+2)*bs];
+		x[jj+3] = alpha*pD[(jj+3)*bs];
+		}
+	for(; jj<kmax; jj++)
+		{
+		x[jj] = alpha*pD[(jj)*bs];
+		}
+	
+	}
+
+
+
+// add scaled vector to row 
+void srowad_lib(int kmax, float alpha, float *x, float *pD)
+	{
+
+	const int bs = 4;
+
+	int jj, ll;
+
+	for(jj=0; jj<kmax-3; jj+=4)
+		{
+		pD[(jj+0)*bs] += alpha * x[jj+0];
+		pD[(jj+1)*bs] += alpha * x[jj+1];
+		pD[(jj+2)*bs] += alpha * x[jj+2];
+		pD[(jj+3)*bs] += alpha * x[jj+3];
+		}
+	for(; jj<kmax; jj++)
+		{
+		pD[(jj)*bs] += alpha * x[jj];
+		}
+	
+	}
+
+
+
+// insert vector to row, sparse formulation 
+void srowin_libsp(int kmax, float alpha, int *idx, float *x, float *pD)
+	{
+
+	const int bs = 4;
+
+	int ii, jj;
+
+	for(jj=0; jj<kmax; jj++)
+		{
+		ii = idx[jj];
+		pD[ii*bs] = alpha*x[jj];
+		}
+	
+	}
+
+
+
+// add scaled vector to row, sparse formulation 
+void srowad_libsp(int kmax, int *idx, float alpha, float *x, float *pD)
+	{
+
+	const int bs = 4;
+
+	int ii, jj;
+
+	for(jj=0; jj<kmax; jj++)
+		{
+		ii = idx[jj];
+		pD[ii*bs] += alpha * x[jj];
+		}
+	
+	}
+
+
+
+// add scaled vector to another vector and insert to row, sparse formulation 
+void srowadin_libsp(int kmax, int *idx, float alpha, float *x, float *y, float *pD)
+	{
+
+	const int bs = 4;
+
+	int ii, jj;
+
+	for(jj=0; jj<kmax; jj++)
+		{
+		ii = idx[jj];
+		pD[ii*bs] = y[jj] + alpha * x[jj];
+		}
+	
+	}
+
+
+
+// swap two rows
+void srowsw_lib(int kmax, float *pA, float *pC)
+	{
+
+	const int bs = 4;
+
+	int ii;
+	float tmp;
+
+	for(ii=0; ii<kmax-3; ii+=4)
+		{
+		tmp = pA[0+bs*0];
+		pA[0+bs*0] = pC[0+bs*0];
+		pC[0+bs*0] = tmp;
+		tmp = pA[0+bs*1];
+		pA[0+bs*1] = pC[0+bs*1];
+		pC[0+bs*1] = tmp;
+		tmp = pA[0+bs*2];
+		pA[0+bs*2] = pC[0+bs*2];
+		pC[0+bs*2] = tmp;
+		tmp = pA[0+bs*3];
+		pA[0+bs*3] = pC[0+bs*3];
+		pC[0+bs*3] = tmp;
+		pA += 4*bs;
+		pC += 4*bs;
+		}
+	for( ; ii<kmax; ii++)
+		{
+		tmp = pA[0+bs*0];
+		pA[0+bs*0] = pC[0+bs*0];
+		pC[0+bs*0] = tmp;
+		pA += 1*bs;
+		pC += 1*bs;
+		}
+	
+	}
+
+
+
+// insert vector to column 
+void scolin_lib(int kmax, float *x, int offset, float *pD, int sdd)
+	{
+
+	const int bs = 4;
+
+	int kna = (bs-offset%bs)%bs;
+	kna = kmax<kna ? kmax : kna;
+
+	int jj, ll;
+
+	if(kna>0)
+		{
+		for(ll=0; ll<kna; ll++)
+			{
+			pD[ll] = x[ll];
+			}
+		pD += kna + bs*(sdd-1);
+		x  += kna;
+		kmax -= kna;
+		}
+	for(jj=0; jj<kmax-3; jj+=4)
+		{
+		pD[jj*sdd+0] = x[jj+0];
+		pD[jj*sdd+1] = x[jj+1];
+		pD[jj*sdd+2] = x[jj+2];
+		pD[jj*sdd+3] = x[jj+3];
+		}
+	for(ll=0; ll<kmax-jj; ll++)
+		{
+		pD[jj*sdd+ll] = x[jj+ll];
+		}
+	
+	}
+
+
+
+// add scaled vector to column 
+void scolad_lib(int kmax, float alpha, float *x, int offset, float *pD, int sdd)
+	{
+
+	const int bs = 4;
+
+	int kna = (bs-offset%bs)%bs;
+	kna = kmax<kna ? kmax : kna;
+
+	int jj, ll;
+
+	if(kna>0)
+		{
+		for(ll=0; ll<kna; ll++)
+			{
+			pD[ll] += alpha * x[ll];
+			}
+		pD += kna + bs*(sdd-1);
+		x  += kna;
+		kmax -= kna;
+		}
+	for(jj=0; jj<kmax-3; jj+=4)
+		{
+		pD[jj*sdd+0] += alpha * x[jj+0];
+		pD[jj*sdd+1] += alpha * x[jj+1];
+		pD[jj*sdd+2] += alpha * x[jj+2];
+		pD[jj*sdd+3] += alpha * x[jj+3];
+		}
+	for(ll=0; ll<kmax-jj; ll++)
+		{
+		pD[jj*sdd+ll] += alpha * x[jj+ll];
+		}
+	
+	}
+
+
+
+// insert vector to column, sparse formulation
+void scolin_libsp(int kmax, int *idx, float *x, float *pD, int sdd)
+	{
+
+	const int bs = 4;
+
+	int ii, jj;
+
+	for(jj=0; jj<kmax; jj++)
+		{
+		ii = idx[jj];
+		pD[ii/bs*bs*sdd+ii%bs] = x[jj];
+		}
+	
+	}
+
+
+
+// add scaled vector to column, sparse formulation
+void scolad_libsp(int kmax, float alpha, int *idx, float *x, float *pD, int sdd)
+	{
+
+	const int bs = 4;
+
+	int ii, jj;
+
+	for(jj=0; jj<kmax; jj++)
+		{
+		ii = idx[jj];
+		pD[ii/bs*bs*sdd+ii%bs] += alpha * x[jj];
+		}
+	
+	}
+
+
+
+// swaps two cols
+void scolsw_lib(int kmax, int offsetA, float *pA, int sda, int offsetC, float *pC, int sdc)
+	{
+
+	const int bs = 4;
+
+	int ii;
+
+	float tmp;
+
+	if(offsetA==offsetC)
+		{
+		if(offsetA>0)
+			{
+			ii = 0;
+			for(; ii<bs-offsetA; ii++)
+				{
+				tmp = pA[0+bs*0];
+				pA[0+bs*0] = pC[0+bs*0];
+				pC[0+bs*0] = tmp;
+				pA += 1;
+				pC += 1;
+				}
+			pA += bs*(sda-1);
+			pC += bs*(sdc-1);
+			kmax -= bs-offsetA;
+			}
+		ii = 0;
+		for(; ii<kmax-3; ii+=4)
+			{
+			tmp = pA[0+bs*0];
+			pA[0+bs*0] = pC[0+bs*0];
+			pC[0+bs*0] = tmp;
+			tmp = pA[1+bs*0];
+			pA[1+bs*0] = pC[1+bs*0];
+			pC[1+bs*0] = tmp;
+			tmp = pA[2+bs*0];
+			pA[2+bs*0] = pC[2+bs*0];
+			pC[2+bs*0] = tmp;
+			tmp = pA[3+bs*0];
+			pA[3+bs*0] = pC[3+bs*0];
+			pC[3+bs*0] = tmp;
+			pA += bs*sda;
+			pC += bs*sdc;
+			}
+		for(; ii<kmax; ii++)
+			{
+			tmp = pA[0+bs*0];
+			pA[0+bs*0] = pC[0+bs*0];
+			pC[0+bs*0] = tmp;
+			pA += 1;
+			pC += 1;
+			}
+		}
+	else
+		{
+		printf("\nscolsw: feature not implemented yet: offsetA!=offsetC\n\n");
+		exit(1);
+		}
+
+	return;
+
+	}
+
+
+
+// insert vector to vector, sparse formulation
+void svecin_libsp(int kmax, int *idx, float *x, float *y)
+	{
+
+	int jj;
+
+	for(jj=0; jj<kmax; jj++)
+		{
+		y[idx[jj]] = x[jj];
+		}
+	
+	}
+
+
+
+// adds vector to vector, sparse formulation
+void svecad_libsp(int kmax, int *idx, float alpha, float *x, float *y)
+	{
+
+	int jj;
+
+	for(jj=0; jj<kmax; jj++)
+		{
+		y[idx[jj]] += alpha * x[jj];
+		}
+	
+	}
+
+
+
+/****************************
+* new interface
+****************************/
+
+
+
+#if defined(LA_HIGH_PERFORMANCE)
+
+
+
+// return the memory size (in bytes) needed for a strmat
+int s_size_strmat(int m, int n)
+	{
+	const int bs = 4;
+	int nc = S_NC;
+	int al = bs*nc;
+	int pm = (m+bs-1)/bs*bs;
+	int cn = (n+nc-1)/nc*nc;
+	int tmp = m<n ? (m+al-1)/al*al : (n+al-1)/al*al; // al(min(m,n)) // XXX max ???
+	int memory_size = (pm*cn+tmp)*sizeof(float);
+	return memory_size;
+	}
+
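+// worked example: with bs = 4 and assuming S_NC == 4 (the actual value comes
+// from blasfeo_block_size.h), m = 6, n = 5 gives pm = 8, cn = 8, tmp = 16,
+// i.e. (8*8+16)*sizeof(float) = 320 bytes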
+
+
+// return the memory size (in bytes) needed for the diagonal of a strmat
+int s_size_diag_strmat(int m, int n)
+	{
+	const int bs = 4;
+	int nc = S_NC;
+	int al = bs*nc;
+	int tmp = m<n ? (m+al-1)/al*al : (n+al-1)/al*al; // al(min(m,n)) // XXX max ???
+	int memory_size = tmp*sizeof(float);
+	return memory_size;
+	}
+
+
+
+// create a matrix structure for a matrix of size m*n by using memory passed by a pointer
+void s_create_strmat(int m, int n, struct s_strmat *sA, void *memory)
+	{
+	const int bs = 4;
+	int nc = S_NC;
+	int al = bs*nc;
+	sA->m = m;
+	sA->n = n;
+	int pm = (m+bs-1)/bs*bs;
+	int cn = (n+nc-1)/nc*nc;
+	sA->pm = pm;
+	sA->cn = cn;
+	float *ptr = (float *) memory;
+	sA->pA = ptr;
+	ptr += pm*cn;
+	int tmp = m<n ? (m+al-1)/al*al : (n+al-1)/al*al; // al(min(m,n)) // XXX max ???
+	sA->dA = ptr;
+	ptr += tmp;
+	sA->use_dA = 0;
+	sA->memory_size = (pm*cn+tmp)*sizeof(float);
+	return;
+	}
+
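+/*
+ * Minimal usage sketch (m, n and the column-major array A are hypothetical
+ * caller variables; error checking omitted):
+ *
+ *     struct s_strmat sA;
+ *     void *mem = malloc(s_size_strmat(m, n));
+ *     s_create_strmat(m, n, &sA, mem);
+ *     s_cvt_mat2strmat(m, n, A, m, &sA, 0, 0); // pack A, lda==m
+ *     // ... operate on sA ...
+ *     free(mem);
+ */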
+
+
+// return memory size (in bytes) needed for a strvec
+int s_size_strvec(int m)
+	{
+	const int bs = 4;
+//	int nc = S_NC;
+//	int al = bs*nc;
+	int pm = (m+bs-1)/bs*bs;
+	int memory_size = pm*sizeof(float);
+	return memory_size;
+	}
+
+
+
+// create a vector structure for a vector of size m by using memory passed by a pointer
+void s_create_strvec(int m, struct s_strvec *sa, void *memory)
+	{
+	const int bs = 4;
+//	int nc = S_NC;
+//	int al = bs*nc;
+	sa->m = m;
+	int pm = (m+bs-1)/bs*bs;
+	sa->pm = pm;
+	float *ptr = (float *) memory;
+	sa->pa = ptr;
+//	ptr += pm;
+	sa->memory_size = pm*sizeof(float);
+	return;
+	}
+
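+/*
+ * The strvec case mirrors the matrix one (m and the plain array x are
+ * hypothetical caller variables):
+ *
+ *     struct s_strvec sx;
+ *     void *mem = malloc(s_size_strvec(m));
+ *     s_create_strvec(m, &sx, mem);
+ *     s_cvt_vec2strvec(m, x, &sx, 0);
+ */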
+
+
+// convert a matrix into a matrix structure
+void s_cvt_mat2strmat(int m, int n, float *A, int lda, struct s_strmat *sA, int ai, int aj)
+	{
+	const int bs = 4;
+	int sda = sA->cn;
+	float *pA = sA->pA + aj*bs + ai/bs*bs*sda + ai%bs;
+	int i, ii, j, jj, m0, m1, m2;
+	float *B, *pB;
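+	// m0: rows from ai up to the next panel boundary; they are packed first,
+	// then full 4-row panels, then any trailing rows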
+	m0 = (bs-ai%bs)%bs;
+	if(m0>m)
+		m0 = m;
+	m1 = m - m0;
+	jj = 0;
+	for( ; jj<n-3; jj+=4)
+		{
+		B  =  A + jj*lda;
+		pB = pA + jj*bs;
+		ii = 0;
+		if(m0>0)
+			{
+			for( ; ii<m0; ii++)
+				{
+				pB[ii+bs*0] = B[ii+lda*0];
+				pB[ii+bs*1] = B[ii+lda*1];
+				pB[ii+bs*2] = B[ii+lda*2];
+				pB[ii+bs*3] = B[ii+lda*3];
+				}
+			B  += m0;
+			pB += m0 + bs*(sda-1);
+			}
+		for( ; ii<m-3; ii+=4)
+			{
+			// col 0
+			pB[0+bs*0] = B[0+lda*0];
+			pB[1+bs*0] = B[1+lda*0];
+			pB[2+bs*0] = B[2+lda*0];
+			pB[3+bs*0] = B[3+lda*0];
+			// col 1
+			pB[0+bs*1] = B[0+lda*1];
+			pB[1+bs*1] = B[1+lda*1];
+			pB[2+bs*1] = B[2+lda*1];
+			pB[3+bs*1] = B[3+lda*1];
+			// col 2
+			pB[0+bs*2] = B[0+lda*2];
+			pB[1+bs*2] = B[1+lda*2];
+			pB[2+bs*2] = B[2+lda*2];
+			pB[3+bs*2] = B[3+lda*2];
+			// col 3
+			pB[0+bs*3] = B[0+lda*3];
+			pB[1+bs*3] = B[1+lda*3];
+			pB[2+bs*3] = B[2+lda*3];
+			pB[3+bs*3] = B[3+lda*3];
+			// update
+			B  += 4;
+			pB += bs*sda;
+			}
+		for( ; ii<m; ii++)
+			{
+			// col 0
+			pB[0+bs*0] = B[0+lda*0];
+			// col 1
+			pB[0+bs*1] = B[0+lda*1];
+			// col 2
+			pB[0+bs*2] = B[0+lda*2];
+			// col 3
+			pB[0+bs*3] = B[0+lda*3];
+			// update
+			B  += 1;
+			pB += 1;
+			}
+		}
+	for( ; jj<n; jj++)
+		{
+
+		B  =  A + jj*lda;
+		pB = pA + jj*bs;
+
+		ii = 0;
+		if(m0>0)
+			{
+			for( ; ii<m0; ii++)
+				{
+				pB[ii+bs*0] = B[ii+lda*0];
+				}
+			B  += m0;
+			pB += m0 + bs*(sda-1);
+			}
+		for( ; ii<m-3; ii+=4)
+			{
+			// col 0
+			pB[0+bs*0] = B[0+lda*0];
+			pB[1+bs*0] = B[1+lda*0];
+			pB[2+bs*0] = B[2+lda*0];
+			pB[3+bs*0] = B[3+lda*0];
+			// update
+			B  += 4;
+			pB += bs*sda;
+			}
+		for( ; ii<m; ii++)
+			{
+			// col 0
+			pB[0+bs*0] = B[0+lda*0];
+			// update
+			B  += 1;
+			pB += 1;
+			}
+		}
+	return;
+	}
+
+
+
+// convert and transpose a matrix into a matrix structure
+void s_cvt_tran_mat2strmat(int m, int n, float *A, int lda, struct s_strmat *sA, int ai, int aj)
+	{
+	const int bs = 4;
+	int sda = sA->cn;
+	float *pA = sA->pA + aj*bs + ai/bs*bs*sda + ai%bs;
+	int i, ii, j, m0, m1, m2;
+	float 	*B, *pB;
+	m0 = (bs-ai%bs)%bs;
+	if(m0>n)
+		m0 = n;
+	m1 = n - m0;
+	ii = 0;
+	if(m0>0)
+		{
+		for(j=0; j<m; j++)
+			{
+			for(i=0; i<m0; i++)
+				{
+				pA[i+j*bs+ii*sda] = A[j+(i+ii)*lda];
+				}
+			}
+		A  += m0*lda;
+		pA += m0 + bs*(sda-1);
+		}
+	ii = 0;
+	for(; ii<m1-3; ii+=bs)
+		{
+		j=0;
+		B  = A + ii*lda;
+		pB = pA + ii*sda;
+		for(; j<m-3; j+=4)
+			{
+			// unroll 0
+			pB[0+0*bs] = B[0+0*lda];
+			pB[1+0*bs] = B[0+1*lda];
+			pB[2+0*bs] = B[0+2*lda];
+			pB[3+0*bs] = B[0+3*lda];
+			// unroll 1
+			pB[0+1*bs] = B[1+0*lda];
+			pB[1+1*bs] = B[1+1*lda];
+			pB[2+1*bs] = B[1+2*lda];
+			pB[3+1*bs] = B[1+3*lda];
+			// unroll 2
+			pB[0+2*bs] = B[2+0*lda];
+			pB[1+2*bs] = B[2+1*lda];
+			pB[2+2*bs] = B[2+2*lda];
+			pB[3+2*bs] = B[2+3*lda];
+			// unroll 3
+			pB[0+3*bs] = B[3+0*lda];
+			pB[1+3*bs] = B[3+1*lda];
+			pB[2+3*bs] = B[3+2*lda];
+			pB[3+3*bs] = B[3+3*lda];
+			B  += 4;
+			pB += 4*bs;
+			}
+		for(; j<m; j++)
+			{
+			// unroll 0
+			pB[0+0*bs] = B[0+0*lda];
+			pB[1+0*bs] = B[0+1*lda];
+			pB[2+0*bs] = B[0+2*lda];
+			pB[3+0*bs] = B[0+3*lda];
+			B  += 1;
+			pB += 1*bs;
+			}
+		}
+	if(ii<m1)
+		{
+		m2 = m1-ii;
+		if(bs<m2) m2 = bs;
+		for(j=0; j<m; j++)
+			{
+			for(i=0; i<m2; i++)
+				{
+				pA[i+j*bs+ii*sda] = A[j+(i+ii)*lda];
+				}
+			}
+		}
+	return;
+	}
+
+
+
+// convert a vector into a vector structure
+void s_cvt_vec2strvec(int m, float *a, struct s_strvec *sa, int ai)
+	{
+	float *pa = sa->pa + ai;
+	int ii;
+	for(ii=0; ii<m; ii++)
+		pa[ii] = a[ii];
+	return;
+	}
+
+
+
+// convert a matrix structure into a matrix
+void s_cvt_strmat2mat(int m, int n, struct s_strmat *sA, int ai, int aj, float *A, int lda)
+	{
+	const int bs = 4;
+	int sda = sA->cn;
+	float *pA = sA->pA + aj*bs + ai/bs*bs*sda + ai%bs;
+	int i, ii, jj;
+	int m0 = (bs-ai%bs)%bs;
+	float *ptr_pA;
+	jj=0;
+	for(; jj<n-3; jj+=4)
+		{
+		ptr_pA = pA + jj*bs;
+		ii = 0;
+		if(m0>0)
+			{
+			for(; ii<m0; ii++)
+				{
+				// unroll 0
+				A[ii+lda*(jj+0)] = ptr_pA[0+bs*0];
+				// unroll 1
+				A[ii+lda*(jj+1)] = ptr_pA[0+bs*1];
+				// unroll 2
+				A[ii+lda*(jj+2)] = ptr_pA[0+bs*2];
+				// unroll 3
+				A[ii+lda*(jj+3)] = ptr_pA[0+bs*3];
+				ptr_pA++;
+				}
+			ptr_pA += (sda-1)*bs;
+			}
+		for(; ii<m-bs+1; ii+=bs)
+			{
+			// unroll 0
+			A[0+ii+lda*(jj+0)] = ptr_pA[0+bs*0];
+			A[1+ii+lda*(jj+0)] = ptr_pA[1+bs*0];
+			A[2+ii+lda*(jj+0)] = ptr_pA[2+bs*0];
+			A[3+ii+lda*(jj+0)] = ptr_pA[3+bs*0];
+			// unroll 1
+			A[0+ii+lda*(jj+1)] = ptr_pA[0+bs*1];
+			A[1+ii+lda*(jj+1)] = ptr_pA[1+bs*1];
+			A[2+ii+lda*(jj+1)] = ptr_pA[2+bs*1];
+			A[3+ii+lda*(jj+1)] = ptr_pA[3+bs*1];
+			// unroll 2
+			A[0+ii+lda*(jj+2)] = ptr_pA[0+bs*2];
+			A[1+ii+lda*(jj+2)] = ptr_pA[1+bs*2];
+			A[2+ii+lda*(jj+2)] = ptr_pA[2+bs*2];
+			A[3+ii+lda*(jj+2)] = ptr_pA[3+bs*2];
+			// unroll 3
+			A[0+ii+lda*(jj+3)] = ptr_pA[0+bs*3];
+			A[1+ii+lda*(jj+3)] = ptr_pA[1+bs*3];
+			A[2+ii+lda*(jj+3)] = ptr_pA[2+bs*3];
+			A[3+ii+lda*(jj+3)] = ptr_pA[3+bs*3];
+			ptr_pA += sda*bs;
+			}
+		for(; ii<m; ii++)
+			{
+			// unroll 0
+			A[ii+lda*(jj+0)] = ptr_pA[0+bs*0];
+			// unroll 1
+			A[ii+lda*(jj+1)] = ptr_pA[0+bs*1];
+			// unroll 2
+			A[ii+lda*(jj+2)] = ptr_pA[0+bs*2];
+			// unroll 3
+			A[ii+lda*(jj+3)] = ptr_pA[0+bs*3];
+			ptr_pA++;
+			}
+		}
+	for(; jj<n; jj++)
+		{
+		ptr_pA = pA + jj*bs;
+		ii = 0;
+		if(m0>0)
+			{
+			for(; ii<m0; ii++)
+				{
+				A[ii+lda*jj] = ptr_pA[0];
+				ptr_pA++;
+				}
+			ptr_pA += (sda-1)*bs;
+			}
+		for(; ii<m-bs+1; ii+=bs)
+			{
+			A[0+ii+lda*jj] = ptr_pA[0];
+			A[1+ii+lda*jj] = ptr_pA[1];
+			A[2+ii+lda*jj] = ptr_pA[2];
+			A[3+ii+lda*jj] = ptr_pA[3];
+			ptr_pA += sda*bs;
+			}
+		for(; ii<m; ii++)
+			{
+			A[ii+lda*jj] = ptr_pA[0];
+			ptr_pA++;
+			}
+		}
+	return;
+	}
+
+
+
+// convert and transpose a matrix structure into a matrix
+void s_cvt_tran_strmat2mat(int m, int n, struct s_strmat *sA, int ai, int aj, float *A, int lda)
+	{
+	const int bs = 4;
+	int sda = sA->cn;
+	float *pA = sA->pA + aj*bs + ai/bs*bs*sda + ai%bs;
+	int i, ii, jj;
+	int m0 = (bs-ai%bs)%bs;
+	float *ptr_pA;
+	jj=0;
+	for(; jj<n-3; jj+=4)
+		{
+		ptr_pA = pA + jj*bs;
+		ii = 0;
+		if(m0>0)
+			{
+			for(; ii<m0; ii++)
+				{
+				// unroll 0
+				A[jj+0+lda*ii] = ptr_pA[0+bs*0];
+				// unroll 1
+				A[jj+1+lda*ii] = ptr_pA[0+bs*1];
+				// unroll 2
+				A[jj+2+lda*ii] = ptr_pA[0+bs*2];
+				// unroll 3
+				A[jj+3+lda*ii] = ptr_pA[0+bs*3];
+				ptr_pA++;
+				}
+			ptr_pA += (sda-1)*bs;
+			}
+		for(; ii<m-bs+1; ii+=bs)
+			{
+			// unroll 0
+			A[jj+0+lda*(ii+0)] = ptr_pA[0+bs*0];
+			A[jj+0+lda*(ii+1)] = ptr_pA[1+bs*0];
+			A[jj+0+lda*(ii+2)] = ptr_pA[2+bs*0];
+			A[jj+0+lda*(ii+3)] = ptr_pA[3+bs*0];
+			// unroll 1
+			A[jj+1+lda*(ii+0)] = ptr_pA[0+bs*1];
+			A[jj+1+lda*(ii+1)] = ptr_pA[1+bs*1];
+			A[jj+1+lda*(ii+2)] = ptr_pA[2+bs*1];
+			A[jj+1+lda*(ii+3)] = ptr_pA[3+bs*1];
+			// unroll 2
+			A[jj+2+lda*(ii+0)] = ptr_pA[0+bs*2];
+			A[jj+2+lda*(ii+1)] = ptr_pA[1+bs*2];
+			A[jj+2+lda*(ii+2)] = ptr_pA[2+bs*2];
+			A[jj+2+lda*(ii+3)] = ptr_pA[3+bs*2];
+			// unroll 3
+			A[jj+3+lda*(ii+0)] = ptr_pA[0+bs*3];
+			A[jj+3+lda*(ii+1)] = ptr_pA[1+bs*3];
+			A[jj+3+lda*(ii+2)] = ptr_pA[2+bs*3];
+			A[jj+3+lda*(ii+3)] = ptr_pA[3+bs*3];
+			ptr_pA += sda*bs;
+			}
+		for(; ii<m; ii++)
+			{
+			// unroll 0
+			A[jj+0+lda*ii] = ptr_pA[0+bs*0];
+			// unroll 1
+			A[jj+1+lda*ii] = ptr_pA[0+bs*1];
+			// unroll 2
+			A[jj+2+lda*ii] = ptr_pA[0+bs*2];
+			// unroll 3
+			A[jj+3+lda*ii] = ptr_pA[0+bs*3];
+			ptr_pA++;
+			}
+		}
+	for(; jj<n; jj++)
+		{
+		ptr_pA = pA + jj*bs;
+		ii = 0;
+		if(m0>0)
+			{
+			for(; ii<m0; ii++)
+				{
+				A[jj+lda*ii] = ptr_pA[0];
+				ptr_pA++;
+				}
+			ptr_pA += (sda-1)*bs;
+			}
+		for(; ii<m-bs+1; ii+=bs)
+			{
+			i=0;
+			for(; i<bs; i++)
+				{
+				A[jj+lda*(i+ii)] = ptr_pA[0];
+				ptr_pA++;
+				}
+			ptr_pA += (sda-1)*bs;
+			}
+		for(; ii<m; ii++)
+			{
+			A[jj+lda*ii] = ptr_pA[0];
+			ptr_pA++;
+			}
+		}
+	return;
+	}
+
+
+
+// convert a vector structure into a vector 
+void s_cvt_strvec2vec(int m, struct s_strvec *sa, int ai, float *a)
+	{
+	float *pa = sa->pa + ai;
+	int ii;
+	for(ii=0; ii<m; ii++)
+		a[ii] = pa[ii];
+	return;
+	}
+
+
+
+// cast a matrix into a matrix structure
+void s_cast_mat2strmat(float *A, struct s_strmat *sA)
+	{
+	sA->pA = A;
+	return;
+	}
+
+
+
+// cast a matrix into the diagonal of a matrix structure
+void s_cast_diag_mat2strmat(float *dA, struct s_strmat *sA)
+	{
+	sA->dA = dA;
+	return;
+	}
+
+
+
+// cast a vector into a vector structure
+void s_cast_vec2vecmat(float *a, struct s_strvec *sa)
+	{
+	sa->pa = a;
+	return;
+	}
+
+
+
+// insert element into strmat
+void sgein1_libstr(float a, struct s_strmat *sA, int ai, int aj)
+	{
+	const int bs = 4;
+	int sda = sA->cn;
+	float *pA = sA->pA + ai/bs*bs*sda + ai%bs + aj*bs;
+	pA[0] = a;
+	return;
+	}
+
+
+
+// extract element from strmat
+float sgeex1_libstr(struct s_strmat *sA, int ai, int aj)
+	{
+	const int bs = 4;
+	int sda = sA->cn;
+	float *pA = sA->pA + ai/bs*bs*sda + ai%bs + aj*bs;
+	return pA[0];
+	}
+
+
+
+// insert element into strvec
+void svecin1_libstr(float a, struct s_strvec *sx, int xi)
+	{
+	const int bs = 4;
+	float *x = sx->pa + xi;
+	x[0] = a;
+	return;
+	}
+
+
+
+// extract element from strvec
+float svecex1_libstr(struct s_strvec *sx, int xi)
+	{
+	const int bs = 4;
+	float *x = sx->pa + xi;
+	return x[0];
+	}
+
+
+
+// set all elements of a strmat to a value
+void sgese_libstr(int m, int n, float alpha, struct s_strmat *sA, int ai, int aj)
+	{
+	const int bs = 4;
+	int sda = sA->cn;
+	float *pA = sA->pA + ai%bs + ai/bs*bs*sda + aj*bs;
+	int m0 = m<(bs-ai%bs)%bs ? m : (bs-ai%bs)%bs;
+	int ii, jj;
+	if(m0>0)
+		{
+		for(ii=0; ii<m0; ii++)
+			{
+			for(jj=0; jj<n; jj++)
+				{
+				pA[jj*bs] = alpha;
+				}
+			pA += 1;
+			}
+		pA += bs*(sda-1);
+		m -= m0;
+		}
+	for(ii=0; ii<m-3; ii+=4)
+		{
+		for(jj=0; jj<n; jj++)
+			{
+			pA[0+jj*bs] = alpha;
+			pA[1+jj*bs] = alpha;
+			pA[2+jj*bs] = alpha;
+			pA[3+jj*bs] = alpha;
+			}
+		pA += bs*sda;
+		}
+	for( ; ii<m; ii++)
+		{
+		for(jj=0; jj<n; jj++)
+			{
+			pA[jj*bs] = alpha;
+			}
+		pA += 1;
+		}
+	return;
+	}
+
+
+
+// set all elements of a strvec to a value
+void svecse_libstr(int m, float alpha, struct s_strvec *sx, int xi)
+	{
+	float *x = sx->pa + xi;
+	int ii;
+	for(ii=0; ii<m; ii++)
+		x[ii] = alpha;
+	return;
+	}
+
+
+
+// extract diagonal to vector
+void sdiaex_libstr(int kmax, float alpha, struct s_strmat *sA, int ai, int aj, struct s_strvec *sx, int xi)
+	{
+	const int bs = 4;
+	int sda = sA->cn;
+	float *pA = sA->pA + ai/bs*bs*sda + ai%bs + aj*bs;
+	float *x = sx->pa + xi;
+	sdiaex_lib(kmax, alpha, ai%bs, pA, sda, x);
+	return;
+	}
+
+
+
+// insert a vector into diagonal
+void sdiain_libstr(int kmax, float alpha, struct s_strvec *sx, int xi, struct s_strmat *sA, int ai, int aj)
+	{
+	const int bs = 4;
+	int sda = sA->cn;
+	float *pA = sA->pA + ai/bs*bs*sda + ai%bs + aj*bs;
+	float *x = sx->pa + xi;
+	sdiain_lib(kmax, alpha, x, ai%bs, pA, sda);
+	return;
+	}
+
+
+
+// swap two rows of a matrix struct
+void srowsw_libstr(int kmax, struct s_strmat *sA, int ai, int aj, struct s_strmat *sC, int ci, int cj)
+	{
+	const int bs = 4;
+	int sda = sA->cn;
+	float *pA = sA->pA + ai/bs*bs*sda + ai%bs + aj*bs;
+	int sdc = sC->cn;
+	float *pC = sC->pA + ci/bs*bs*sdc + ci%bs + cj*bs;
+	srowsw_lib(kmax, pA, pC);
+	return;
+	}
+
+
+
+// permute the rows of a matrix struct
+void srowpe_libstr(int kmax, int *ipiv, struct s_strmat *sA)
+	{
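+	// the permutation is applied sequentially: row ii is swapped with row
+	// ipiv[ii] (zero-based), for ii = 0, ..., kmax-1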
+	int ii;
+	for(ii=0; ii<kmax; ii++)
+		{
+		if(ipiv[ii]!=ii)
+			srowsw_libstr(sA->n, sA, ii, 0, sA, ipiv[ii], 0);
+		}
+	return;
+	}
+
+
+
+// extract a row into a vector
+void srowex_libstr(int kmax, float alpha, struct s_strmat *sA, int ai, int aj, struct s_strvec *sx, int xi)
+	{
+	const int bs = 4;
+	int sda = sA->cn;
+	float *pA = sA->pA + ai/bs*bs*sda + ai%bs + aj*bs;
+	float *x = sx->pa + xi;
+	srowex_lib(kmax, alpha, pA, x);
+	return;
+	}
+
+
+
+// insert a vector into a row
+void srowin_libstr(int kmax, float alpha, struct s_strvec *sx, int xi, struct s_strmat *sA, int ai, int aj)
+	{
+	const int bs = 4;
+	int sda = sA->cn;
+	float *pA = sA->pA + ai/bs*bs*sda + ai%bs + aj*bs;
+	float *x = sx->pa + xi;
+	srowin_lib(kmax, alpha, x, pA);
+	return;
+	}
+
+
+
+// add a vector to a row
+void srowad_libstr(int kmax, float alpha, struct s_strvec *sx, int xi, struct s_strmat *sA, int ai, int aj)
+	{
+	const int bs = 4;
+	int sda = sA->cn;
+	float *pA = sA->pA + ai/bs*bs*sda + ai%bs + aj*bs;
+	float *x = sx->pa + xi;
+	srowad_lib(kmax, alpha, x, pA);
+	return;
+	}
+
+
+
+// swap two cols of a matrix struct
+void scolsw_libstr(int kmax, struct s_strmat *sA, int ai, int aj, struct s_strmat *sC, int ci, int cj)
+	{
+	const int bs = 4;
+	int sda = sA->cn;
+	float *pA = sA->pA + ai/bs*bs*sda + ai%bs + aj*bs;
+	int sdc = sC->cn;
+	float *pC = sC->pA + ci/bs*bs*sdc + ci%bs + cj*bs;
+	scolsw_lib(kmax, ai%bs, pA, sda, ci%bs, pC, sdc);
+	return;
+	}
+
+
+
+// permute the cols of a matrix struct
+void scolpe_libstr(int kmax, int *ipiv, struct s_strmat *sA)
+	{
+	int ii;
+	for(ii=0; ii<kmax; ii++)
+		{
+		if(ipiv[ii]!=ii)
+			scolsw_libstr(sA->m, sA, 0, ii, sA, 0, ipiv[ii]);
+		}
+	return;
+	}
+
+
+
+// scale a generic strmat
+void sgesc_libstr(int m, int n, float alpha, struct s_strmat *sA, int ai, int aj)
+	{
+
+	if(m<=0 || n<=0)
+		return;
+
+#if defined(DIM_CHECK)
+	// non-negative size
+	if(m<0) printf("\n****** sgesc_libstr : m<0 : %d<0 *****\n", m);
+	if(n<0) printf("\n****** sgesc_libstr : n<0 : %d<0 *****\n", n);
+	// non-negative offset
+	if(ai<0) printf("\n****** sgesc_libstr : ai<0 : %d<0 *****\n", ai);
+	if(aj<0) printf("\n****** sgesc_libstr : aj<0 : %d<0 *****\n", aj);
+	// inside matrix
+	// A: m x n
+	if(ai+m > sA->m) printf("\n***** sgesc_libstr : ai+m > row(A) : %d+%d > %d *****\n", ai, m, sA->m);
+	if(aj+n > sA->n) printf("\n***** sgesc_libstr : aj+n > col(A) : %d+%d > %d *****\n", aj, n, sA->n);
+#endif
+
+	const int bs = 4;
+
+	int mna, ii;
+
+	int sda = sA->cn;
+	float *pA = sA->pA + ai/bs*bs*sda + aj*bs;
+	int offA = ai%bs;
+
+	// same alignment
+	ii = 0;
+	// clean up at the beginning
+	mna = (4-offA)%bs;
+	if(mna>0)
+		{
+		if(m<mna) // mna<=3  ==>  m = { 1, 2 }
+			{
+			if(m==1)
+				{
+				kernel_sgesc_1_lib4(n, &alpha, pA+offA);
+				return;
+				}
+			else //if(m==2 && mna==3)
+				{
+				kernel_sgesc_2_lib4(n, &alpha, pA+offA);
+				return;
+				}
+			}
+		if(mna==1)
+			{
+			kernel_sgesc_1_lib4(n, &alpha, pA+offA);
+			pA += 4*sda;
+			ii += 1;
+			}
+		else if(mna==2)
+			{
+			kernel_sgesc_2_lib4(n, &alpha, pA+offA);
+			pA += 4*sda;
+			ii += 2;
+			}
+		else // if(mna==3)
+			{
+			kernel_sgesc_3_lib4(n, &alpha, pA+offA);
+			pA += 4*sda;
+			ii += 3;
+			}
+		}
+	// main loop
+	for(; ii<m-3; ii+=4)
+		{
+		kernel_sgesc_4_lib4(n, &alpha, pA);
+		pA += 4*sda;
+		}
+	// clean up at the end
+	if(ii<m)
+		{
+		if(m-ii==1)
+			kernel_sgesc_1_lib4(n, &alpha, pA);
+		else if(m-ii==2)
+			kernel_sgesc_2_lib4(n, &alpha, pA);
+		else // if(m-ii==3)
+			kernel_sgesc_3_lib4(n, &alpha, pA);
+		}
+
+	return;
+
+	}
+
+
+
+// copy a generic strmat into a generic strmat
+void sgecp_libstr(int m, int n, struct s_strmat *sA, int ai, int aj, struct s_strmat *sB, int bi, int bj)
+	{
+
+	if(m<=0 || n<=0)
+		return;
+
+#if defined(DIM_CHECK)
+	// non-negative size
+	if(m<0) printf("\n****** sgecp_libstr : m<0 : %d<0 *****\n", m);
+	if(n<0) printf("\n****** sgecp_libstr : n<0 : %d<0 *****\n", n);
+	// non-negative offset
+	if(ai<0) printf("\n****** sgecp_libstr : ai<0 : %d<0 *****\n", ai);
+	if(aj<0) printf("\n****** sgecp_libstr : aj<0 : %d<0 *****\n", aj);
+	if(bi<0) printf("\n****** sgecp_libstr : bi<0 : %d<0 *****\n", bi);
+	if(bj<0) printf("\n****** sgecp_libstr : bj<0 : %d<0 *****\n", bj);
+	// inside matrix
+	// A: m x n
+	if(ai+m > sA->m) printf("\n***** sgecp_libstr : ai+m > row(A) : %d+%d > %d *****\n", ai, m, sA->m);
+	if(aj+n > sA->n) printf("\n***** sgecp_libstr : aj+n > col(A) : %d+%d > %d *****\n", aj, n, sA->n);
+	// B: m x n
+	if(bi+m > sB->m) printf("\n***** sgecp_libstr : bi+m > row(B) : %d+%d > %d *****\n", bi, m, sB->m);
+	if(bj+n > sB->n) printf("\n***** sgecp_libstr : bj+n > col(B) : %d+%d > %d *****\n", bj, n, sB->n);
+#endif
+
+	const int bs = 4;
+
+	int mna, ii;
+
+	int sda = sA->cn;
+	int sdb = sB->cn;
+	float *pA = sA->pA + ai/bs*bs*sda + aj*bs;
+	float *pB = sB->pA + bi/bs*bs*sdb + bj*bs;
+	int offA = ai%bs;
+	int offB = bi%bs;
+
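+	// dispatch on the relative row misalignment offA-offB (mod bs): equal
+	// offsets use the aligned *_0 kernels, while the three shifted cases use
+	// kernels that assemble each output panel from two consecutive A panels
+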
+	// same alignment
+	if(offA==offB)
+		{
+		ii = 0;
+		// clean up at the beginning
+		mna = (4-offB)%bs;
+		if(mna>0)
+			{
+			if(m<mna) // mna<=3  ==>  m = { 1, 2 }
+				{
+				if(m==1)
+					{
+					kernel_sgecp_1_0_lib4(n, pA+offA, pB+offB);
+					return;
+					}
+				else //if(m==2 && mna==3)
+					{
+					kernel_sgecp_2_0_lib4(n, pA+offA, pB+offB);
+					return;
+					}
+				}
+			if(mna==1)
+				{
+				kernel_sgecp_1_0_lib4(n, pA+offA, pB+offB);
+				pA += 4*sda;
+				pB += 4*sdb;
+				ii += 1;
+				}
+			else if(mna==2)
+				{
+				kernel_sgecp_2_0_lib4(n, pA+offA, pB+offB);
+				pA += 4*sda;
+				pB += 4*sdb;
+				ii += 2;
+				}
+			else // if(mna==3)
+				{
+				kernel_sgecp_3_0_lib4(n, pA+offA, pB+offB);
+				pA += 4*sda;
+				pB += 4*sdb;
+				ii += 3;
+				}
+			}
+		// main loop
+		for(; ii<m-3; ii+=4)
+			{
+			kernel_sgecp_4_0_lib4(n, pA, pB);
+			pA += 4*sda;
+			pB += 4*sdb;
+			}
+		// clean up at the end
+		if(ii<m)
+			{
+			if(m-ii==1)
+				kernel_sgecp_1_0_lib4(n, pA, pB);
+			else if(m-ii==2)
+				kernel_sgecp_2_0_lib4(n, pA, pB);
+			else // if(m-ii==3)
+				kernel_sgecp_3_0_lib4(n, pA, pB);
+			}
+		}
+	// skip one element of pA
+	else if(offA==(offB+1)%bs)
+		{
+		ii = 0;
+		// clean up at the beginning
+		mna = (4-offB)%bs;
+		if(mna>0)
+			{
+			if(m<mna) // mna<=3  ==>  m = { 1, 2 }
+				{
+				if(m==1)
+					{
+					kernel_sgecp_1_0_lib4(n, pA+offA, pB+offB);
+					return;
+					}
+				else //if(m==2 && mna==3)
+					{
+					kernel_sgecp_2_0_lib4(n, pA+offA, pB+offB);
+					return;
+					}
+				}
+			if(mna==1)
+				{
+				kernel_sgecp_1_0_lib4(n, pA+offA, pB+offB);
+				//pA += 4*sda;
+				pB += 4*sdb;
+				ii += 1;
+				}
+			else if(mna==2)
+				{
+				kernel_sgecp_2_3_lib4(n, pA, sda, pB+2);
+				pA += 4*sda;
+				pB += 4*sdb;
+				ii += 2;
+				}
+			else // if(mna==3)
+				{
+				kernel_sgecp_3_2_lib4(n, pA, sda, pB+1);
+				pA += 4*sda;
+				pB += 4*sdb;
+				ii += 3;
+				}
+			}
+		// main loop
+		for( ; ii<m-3; ii+=4)
+			{
+			kernel_sgecp_4_1_lib4(n, pA, sda, pB);
+			pA += 4*sda;
+			pB += 4*sdb;
+			}
+		// clean up at the end
+		if(ii<m)
+			{
+			if(m-ii==1)
+				kernel_sgecp_1_0_lib4(n, pA+1, pB);
+			else if(m-ii==2)
+				kernel_sgecp_2_0_lib4(n, pA+1, pB);
+			else // if(m-ii==3)
+				kernel_sgecp_3_0_lib4(n, pA+1, pB);
+			}
+		}
+	// skip 2 elements of pA
+	else if(offA==(offB+2)%bs)
+		{
+		ii = 0;
+		// clean up at the beginning
+		mna = (4-offB)%bs;
+		if(mna>0)
+			{
+			if(m<mna)
+				{
+				if(m==1)
+					{
+					kernel_sgecp_1_0_lib4(n, pA+offA, pB+offB);
+					return;
+					}
+				else // if(m==2 && mna==3)
+					{
+					kernel_sgecp_2_3_lib4(n, pA, sda, pB+1);
+					return;
+					}
+				}
+			if(mna==1)
+				{
+				kernel_sgecp_1_0_lib4(n, pA+1, pB+3);
+				// pA += 4*sda;
+				pB += 4*sdb;
+				ii += 1;
+				}
+			else if(mna==2)
+				{
+				kernel_sgecp_2_0_lib4(n, pA, pB+2);
+				// pA += 4*sda;
+				pB += 4*sdb;
+				ii += 2;
+				}
+			else // if(mna==3)
+				{
+				kernel_sgecp_3_3_lib4(n, pA, sda, pB+1);
+				pA += 4*sda;
+				pB += 4*sdb;
+				ii += 3;
+				}
+			}
+		// main loop
+		for(; ii<m-3; ii+=4)
+			{
+			kernel_sgecp_4_2_lib4(n, pA, sda, pB);
+			pA += 4*sda;
+			pB += 4*sdb;
+			}
+		// clean up at the end
+		if(ii<m)
+			{
+			if(m-ii==1)
+				kernel_sgecp_1_0_lib4(n, pA+2, pB);
+			else if(m-ii==2)
+				kernel_sgecp_2_0_lib4(n, pA+2, pB);
+			else // if(m-ii==3)
+				kernel_sgecp_3_2_lib4(n, pA, sda, pB);
+			}
+		}
+	// skip 3 elements of pA
+	else // if(offA==(offB+3)%bs)
+		{
+		ii = 0;
+		// clean up at the beginning
+		mna = (4-offB)%bs;
+		if(mna>0)
+			{
+			if(m<mna)
+				{
+				if(m==1)
+					{
+					kernel_sgecp_1_0_lib4(n, pA+offA, pB+offB);
+					return;
+					}
+				else // if(m==2 && mna==3)
+					{
+					kernel_sgecp_2_0_lib4(n, pA+offA, pB+offB);
+					return;
+					}
+				}
+			if(mna==1)
+				{
+				kernel_sgecp_1_0_lib4(n, pA+offA, pB+offB);
+				// pA += 4*sda;
+				pB += 4*sdb;
+				ii += 1;
+				}
+			else if(mna==2)
+				{
+				kernel_sgecp_2_0_lib4(n, pA+offA, pB+offB);
+				// pA += 4*sda;
+				pB += 4*sdb;
+				ii += 2;
+				}
+			else // if(mna==3)
+				{
+				kernel_sgecp_3_0_lib4(n, pA+offA, pB+offB);
+				// pA += 4*sda;
+				pB += 4*sdb;
+				ii += 3;
+				}
+			}
+		// main loop
+		for(; ii<m-3; ii+=4)
+			{
+			kernel_sgecp_4_3_lib4(n, pA, sda, pB);
+			pA += 4*sda;
+			pB += 4*sdb;
+			}
+		// clean up at the end
+		if(ii<m)
+			{
+			if(m-ii==1)
+				kernel_sgecp_1_0_lib4(n, pA+3, pB);
+			else if(m-ii==2)
+				kernel_sgecp_2_3_lib4(n, pA, sda, pB);
+			else // if(m-ii==3)
+				kernel_sgecp_3_3_lib4(n, pA, sda, pB);
+			}
+		}
+
+	return;
+
+	}
+
+
+
+// scale a strvec
+void svecsc_libstr(int m, float alpha, struct s_strvec *sa, int ai)
+	{
+	float *pa = sa->pa + ai;
+	int ii;
+	ii = 0;
+	for(; ii<m-3; ii+=4)
+		{
+		pa[ii+0] *= alpha;
+		pa[ii+1] *= alpha;
+		pa[ii+2] *= alpha;
+		pa[ii+3] *= alpha;
+		}
+	for(; ii<m; ii++)
+		{
+		pa[ii+0] *= alpha;
+		}
+	return;
+	}
+
+
+
+// copy a strvec into a strvec
+void sveccp_libstr(int m, struct s_strvec *sa, int ai, struct s_strvec *sc, int ci)
+	{
+	float *pa = sa->pa + ai;
+	float *pc = sc->pa + ci;
+	int ii;
+	ii = 0;
+	for(; ii<m-3; ii+=4)
+		{
+		pc[ii+0] = pa[ii+0];
+		pc[ii+1] = pa[ii+1];
+		pc[ii+2] = pa[ii+2];
+		pc[ii+3] = pa[ii+3];
+		}
+	for(; ii<m; ii++)
+		{
+		pc[ii+0] = pa[ii+0];
+		}
+	return;
+	}
+
+
+
+// copy a lower triangular strmat into a lower triangular strmat
+void strcp_l_libstr(int m, struct s_strmat *sA, int ai, int aj, struct s_strmat *sB, int bi, int bj)
+	{
+
+	if(m<=0)
+		return;
+
+	const int bs = 4;
+
+	int sda = sA->cn;
+	int sdb = sB->cn;
+	float *pA = sA->pA + ai/bs*bs*sda + aj*bs;
+	float *pB = sB->pA + bi/bs*bs*sdb + bj*bs;
+	int offA = ai%bs;
+	int offB = bi%bs;
+
+	int ii, mna;
+
+	// same alignment
+	if(offA==offB)
+		{
+		ii = 0;
+		// clean up at the beginning
+		mna = (4-offB)%bs;
+		if(mna>0)
+			{
+			if(m<mna) // mna<=3  ==>  m = { 1, 2 }
+				{
+				if(m==1)
+					{
+					kernel_strcp_l_1_0_lib4(ii, pA+offA, pB+offB);
+					return;
+					}
+				else //if(m==2 && mna==3)
+					{
+					kernel_strcp_l_2_0_lib4(ii, pA+offA, pB+offB);
+					return;
+					}
+				}
+			if(mna==1)
+				{
+				kernel_strcp_l_1_0_lib4(ii, pA+offA, pB+offB);
+				pA += 4*sda;
+				pB += 4*sdb;
+				ii += 1;
+				}
+			else if(mna==2)
+				{
+				kernel_strcp_l_2_0_lib4(ii, pA+offA, pB+offB);
+				pA += 4*sda;
+				pB += 4*sdb;
+				ii += 2;
+				}
+			else // if(mna==3)
+				{
+				kernel_strcp_l_3_0_lib4(ii, pA+offA, pB+offB);
+				pA += 4*sda;
+				pB += 4*sdb;
+				ii += 3;
+				}
+			}
+		// main loop
+		for(; ii<m-3; ii+=4)
+			{
+			kernel_strcp_l_4_0_lib4(ii, pA, pB);
+			pA += 4*sda;
+			pB += 4*sdb;
+			}
+		// clean up at the end
+		if(ii<m)
+			{
+			if(m-ii==1)
+				kernel_strcp_l_1_0_lib4(ii, pA, pB);
+			else if(m-ii==2)
+				kernel_strcp_l_2_0_lib4(ii, pA, pB);
+			else // if(m-ii==3)
+				kernel_strcp_l_3_0_lib4(ii, pA, pB);
+			}
+		}
+	// skip one element of pA
+	else if(offA==(offB+1)%bs)
+		{
+		ii = 0;
+		// clean up at the beginning
+		mna = (4-offB)%bs;
+		if(mna>0)
+			{
+			if(m<mna) // mna<=3  ==>  m = { 1, 2 }
+				{
+				if(m==1)
+					{
+					kernel_strcp_l_1_0_lib4(ii, pA+offA, pB+offB);
+					return;
+					}
+				else //if(m==2 && mna==3)
+					{
+					kernel_strcp_l_2_0_lib4(ii, pA+offA, pB+offB);
+					return;
+					}
+				}
+			if(mna==1)
+				{
+				kernel_strcp_l_1_0_lib4(ii, pA+offA, pB+offB);
+				//pA += 4*sda;
+				pB += 4*sdb;
+				ii += 1;
+				}
+			else if(mna==2)
+				{
+				kernel_strcp_l_2_3_lib4(ii, pA, sda, pB+2);
+				pA += 4*sda;
+				pB += 4*sdb;
+				ii += 2;
+				}
+			else // if(mna==3)
+				{
+				kernel_strcp_l_3_2_lib4(ii, pA, sda, pB+1);
+				pA += 4*sda;
+				pB += 4*sdb;
+				ii += 3;
+				}
+			}
+		// main loop
+		for( ; ii<m-3; ii+=4)
+			{
+			kernel_strcp_l_4_1_lib4(ii, pA, sda, pB);
+			pA += 4*sda;
+			pB += 4*sdb;
+			}
+		// clean up at the end
+		if(ii<m)
+			{
+			if(m-ii==1)
+				kernel_strcp_l_1_0_lib4(ii, pA+1, pB);
+			else if(m-ii==2)
+				kernel_strcp_l_2_0_lib4(ii, pA+1, pB);
+			else // if(m-ii==3)
+				kernel_strcp_l_3_0_lib4(ii, pA+1, pB);
+			}
+		}
+	// skip 2 elements of pA
+	else if(offA==(offB+2)%bs)
+		{
+		ii = 0;
+		// clean up at the beginning
+		mna = (4-offB)%bs;
+		if(mna>0)
+			{
+			if(m<mna)
+				{
+				if(m==1)
+					{
+					kernel_strcp_l_1_0_lib4(ii, pA+offA, pB+offB);
+					return;
+					}
+				else // if(m==2 && mna==3)
+					{
+					kernel_strcp_l_2_3_lib4(ii, pA, sda, pB+1);
+					return;
+					}
+				}
+			if(mna==1)
+				{
+				kernel_strcp_l_1_0_lib4(ii, pA+1, pB+3);
+				// pA += 4*sda;
+				pB += 4*sdb;
+				ii += 1;
+				}
+			else if(mna==2)
+				{
+				kernel_strcp_l_2_0_lib4(ii, pA, pB+2);
+				// pA += 4*sda;
+				pB += 4*sdb;
+				ii += 2;
+				}
+			else // if(mna==3)
+				{
+				kernel_strcp_l_3_3_lib4(ii, pA, sda, pB+1);
+				pA += 4*sda;
+				pB += 4*sdb;
+				ii += 3;
+				}
+			}
+		// main loop
+		for(; ii<m-3; ii+=4)
+			{
+			kernel_strcp_l_4_2_lib4(ii, pA, sda, pB);
+			pA += 4*sda;
+			pB += 4*sdb;
+			}
+		// clean up at the end
+		if(ii<m)
+			{
+			if(m-ii==1)
+				kernel_strcp_l_1_0_lib4(ii, pA+2, pB);
+			else if(m-ii==2)
+				kernel_strcp_l_2_0_lib4(ii, pA+2, pB);
+			else // if(m-ii==3)
+				kernel_strcp_l_3_2_lib4(ii, pA, sda, pB);
+			}
+		}
+	// skip 3 elements of pA
+	else // if(offA==(offB+3)%bs)
+		{
+		ii = 0;
+		// clean up at the beginning
+		mna = (4-offB)%bs;
+		if(mna>0)
+			{
+			if(m<mna)
+				{
+				if(m==1)
+					{
+					kernel_strcp_l_1_0_lib4(ii, pA+offA, pB+offB);
+					return;
+					}
+				else // if(m==2 && mna==3)
+					{
+					kernel_strcp_l_2_0_lib4(ii, pA+offA, pB+offB);
+					return;
+					}
+				}
+			if(mna==1)
+				{
+				kernel_strcp_l_1_0_lib4(ii, pA+offA, pB+offB);
+				// pA += 4*sda;
+				pB += 4*sdb;
+				ii += 1;
+				}
+			else if(mna==2)
+				{
+				kernel_strcp_l_2_0_lib4(ii, pA+offA, pB+offB);
+				// pA += 4*sda;
+				pB += 4*sdb;
+				ii += 2;
+				}
+			else // if(mna==3)
+				{
+				kernel_strcp_l_3_0_lib4(ii, pA+offA, pB+offB);
+				// pA += 4*sda;
+				pB += 4*sdb;
+				ii += 3;
+				}
+			}
+		// main loop
+		for(; ii<m-3; ii+=4)
+			{
+			kernel_strcp_l_4_3_lib4(ii, pA, sda, pB);
+			pA += 4*sda;
+			pB += 4*sdb;
+			}
+		// clean up at the end
+		if(ii<m)
+			{
+			if(m-ii==1)
+				kernel_strcp_l_1_0_lib4(ii, pA+3, pB);
+			else if(m-ii==2)
+				kernel_strcp_l_2_3_lib4(ii, pA, sda, pB);
+			else // if(m-ii==3)
+				kernel_strcp_l_3_3_lib4(ii, pA, sda, pB);
+			}
+		}
+
+	return;
+
+	}
+
+
+
+// scale and add a generic strmat into a generic strmat
+void sgead_libstr(int m, int n, float alpha, struct s_strmat *sA, int ai, int aj, struct s_strmat *sB, int bi, int bj)
+	{
+
+	if(m<=0 || n<=0)
+		return;
+	const int bs = 4;
+
+	int sda = sA->cn;
+	int sdb = sB->cn;
+	float *pA = sA->pA + ai/bs*bs*sda + aj*bs;
+	float *pB = sB->pA + bi/bs*bs*sdb + bj*bs;
+	int offA = ai%bs;
+	int offB = bi%bs;
+
+	int ii, mna;
+
+	// same alignment
+	if(offA==offB)
+		{
+		ii = 0;
+		// clean up at the beginning
+		mna = (4-offB)%bs;
+		if(mna>0)
+			{
+			if(m<mna) // mna<=3  ==>  m = { 1, 2 }
+				{
+				if(m==1)
+					{
+					kernel_sgead_1_0_lib4(n, &alpha, pA+offA, pB+offB);
+					return;
+					}
+				else //if(m==2 && mna==3)
+					{
+					kernel_sgead_2_0_lib4(n, &alpha, pA+offA, pB+offB);
+					return;
+					}
+				}
+			if(mna==1)
+				{
+				kernel_sgead_1_0_lib4(n, &alpha, pA+offA, pB+offB);
+				pA += 4*sda;
+				pB += 4*sdb;
+				ii += 1;
+				}
+			else if(mna==2)
+				{
+				kernel_sgead_2_0_lib4(n, &alpha, pA+offA, pB+offB);
+				pA += 4*sda;
+				pB += 4*sdb;
+				ii += 2;
+				}
+			else // if(mna==3)
+				{
+				kernel_sgead_3_0_lib4(n, &alpha, pA+offA, pB+offB);
+				pA += 4*sda;
+				pB += 4*sdb;
+				ii += 3;
+				}
+			}
+		// main loop
+		for(; ii<m-3; ii+=4)
+			{
+			kernel_sgead_4_0_lib4(n, &alpha, pA, pB);
+			pA += 4*sda;
+			pB += 4*sdb;
+			}
+		// clean up at the end
+		if(ii<m)
+			{
+			if(m-ii==1)
+				kernel_sgead_1_0_lib4(n, &alpha, pA, pB);
+			else if(m-ii==2)
+				kernel_sgead_2_0_lib4(n, &alpha, pA, pB);
+			else // if(m-ii==3)
+				kernel_sgead_3_0_lib4(n, &alpha, pA, pB);
+			}
+		}
+	// skip one element of pA
+	else if(offA==(offB+1)%bs)
+		{
+		ii = 0;
+		// clean up at the beginning
+		mna = (4-offB)%bs;
+		if(mna>0)
+			{
+			if(m<mna) // mna<=3  ==>  m = { 1, 2 }
+				{
+				if(m==1)
+					{
+					kernel_sgead_1_0_lib4(n, &alpha, pA+offA, pB+offB);
+					return;
+					}
+				else //if(m==2 && mna==3)
+					{
+					kernel_sgead_2_0_lib4(n, &alpha, pA+offA, pB+offB);
+					return;
+					}
+				}
+			if(mna==1)
+				{
+				kernel_sgead_1_0_lib4(n, &alpha, pA+offA, pB+offB);
+				//pA += 4*sda;
+				pB += 4*sdb;
+				ii += 1;
+				}
+			else if(mna==2)
+				{
+				kernel_sgead_2_3_lib4(n, &alpha, pA, sda, pB+2);
+				pA += 4*sda;
+				pB += 4*sdb;
+				ii += 2;
+				}
+			else // if(mna==3)
+				{
+				kernel_sgead_3_2_lib4(n, &alpha, pA, sda, pB+1);
+				pA += 4*sda;
+				pB += 4*sdb;
+				ii += 3;
+				}
+			}
+		// main loop
+		for( ; ii<m-3; ii+=4)
+			{
+			kernel_sgead_4_1_lib4(n, &alpha, pA, sda, pB);
+			pA += 4*sda;
+			pB += 4*sdb;
+			}
+		// clean up at the end
+		if(ii<m)
+			{
+			if(m-ii==1)
+				kernel_sgead_1_0_lib4(n, &alpha, pA+1, pB);
+			else if(m-ii==2)
+				kernel_sgead_2_0_lib4(n, &alpha, pA+1, pB);
+			else // if(m-ii==3)
+				kernel_sgead_3_0_lib4(n, &alpha, pA+1, pB);
+			}
+		}
+	// skip 2 elements of pA
+	else if(offA==(offB+2)%bs)
+		{
+		ii = 0;
+		// clean up at the beginning
+		mna = (4-offB)%bs;
+		if(mna>0)
+			{
+			if(m<mna)
+				{
+				if(m==1)
+					{
+					kernel_sgead_1_0_lib4(n, &alpha, pA+offA, pB+offB);
+					return;
+					}
+				else // if(m==2 && mna==3)
+					{
+					kernel_sgead_2_3_lib4(n, &alpha, pA, sda, pB+1);
+					return;
+					}
+				}
+			if(mna==1)
+				{
+				kernel_sgead_1_0_lib4(n, &alpha, pA+1, pB+3);
+				// pA += 4*sda;
+				pB += 4*sdb;
+				ii += 1;
+				}
+			else if(mna==2)
+				{
+				kernel_sgead_2_0_lib4(n, &alpha, pA, pB+2);
+				// pA += 4*sda;
+				pB += 4*sdb;
+				ii += 2;
+				}
+			else // if(mna==3)
+				{
+				kernel_sgead_3_3_lib4(n, &alpha, pA, sda, pB+1);
+				pA += 4*sda;
+				pB += 4*sdb;
+				ii += 3;
+				}
+			}
+		// main loop
+		for(; ii<m-3; ii+=4)
+			{
+			kernel_sgead_4_2_lib4(n, &alpha, pA, sda, pB);
+			pA += 4*sda;
+			pB += 4*sdb;
+			}
+		// clean up at the end
+		if(ii<m)
+			{
+			if(m-ii==1)
+				kernel_sgead_1_0_lib4(n, &alpha, pA+2, pB);
+			else if(m-ii==2)
+				kernel_sgead_2_0_lib4(n, &alpha, pA+2, pB);
+			else // if(m-ii==3)
+				kernel_sgead_3_2_lib4(n, &alpha, pA, sda, pB);
+			}
+		}
+	// skip 3 elements of pA
+	else // if(offA==(offB+3)%bs)
+		{
+		ii = 0;
+		// clean up at the beginning
+		mna = (4-offB)%bs;
+		if(mna>0)
+			{
+			if(m<mna)
+				{
+				if(m==1)
+					{
+					kernel_sgead_1_0_lib4(n, &alpha, pA+offA, pB+offB);
+					return;
+					}
+				else // if(m==2 && mna==3)
+					{
+					kernel_sgead_2_0_lib4(n, &alpha, pA+offA, pB+offB);
+					return;
+					}
+				}
+			if(mna==1)
+				{
+				kernel_sgead_1_0_lib4(n, &alpha, pA+offA, pB+offB);
+				// pA += 4*sda;
+				pB += 4*sdb;
+				ii += 1;
+				}
+			else if(mna==2)
+				{
+				kernel_sgead_2_0_lib4(n, &alpha, pA+offA, pB+offB);
+				// pA += 4*sda;
+				pB += 4*sdb;
+				ii += 2;
+				}
+			else // if(mna==3)
+				{
+				kernel_sgead_3_0_lib4(n, &alpha, pA+offA, pB+offB);
+				// pA += 4*sda;
+				pB += 4*sdb;
+				ii += 3;
+				}
+			}
+		// main loop
+		for(; ii<m-3; ii+=4)
+			{
+			kernel_sgead_4_3_lib4(n, &alpha, pA, sda, pB);
+			pA += 4*sda;
+			pB += 4*sdb;
+			}
+		// clean up at the end
+		if(ii<m)
+			{
+			if(m-ii==1)
+				kernel_sgead_1_0_lib4(n, &alpha, pA+3, pB);
+			else if(m-ii==2)
+				kernel_sgead_2_3_lib4(n, &alpha, pA, sda, pB);
+			else // if(m-ii==3)
+				kernel_sgead_3_3_lib4(n, &alpha, pA, sda, pB);
+			}
+		}
+
+	return;
+
+	}
+
+
+
+// copy and transpose a generic strmat into a generic strmat
+void sgetr_libstr(int m, int n, struct s_strmat *sA, int ai, int aj, struct s_strmat *sC, int ci, int cj)
+	{
+	const int bs = 4;
+	int sda = sA->cn;
+	float *pA = sA->pA + ai/bs*bs*sda + ai%bs + aj*bs;
+	int sdc = sC->cn;
+	float *pC = sC->pA + ci/bs*bs*sdc + ci%bs + cj*bs;
+	sgetr_lib(m, n, 1.0, ai%bs, pA, sda, ci%bs, pC, sdc); // TODO remove alpha !!!
+	return;
+	}
+
+
+
+// copy and transpose a lower triangular strmat into an upper triangular strmat
+void strtr_l_libstr(int m, struct s_strmat *sA, int ai, int aj, struct s_strmat *sC, int ci, int cj)
+	{
+	const int bs = 4;
+	int sda = sA->cn;
+	float *pA = sA->pA + ai/bs*bs*sda + ai%bs + aj*bs;
+	int sdc = sC->cn;
+	float *pC = sC->pA + ci/bs*bs*sdc + ci%bs + cj*bs;
+	strtr_l_lib(m, 1.0, ai%bs, pA, sda, ci%bs, pC, sdc); // TODO remove alpha !!!
+	return;
+	}
+
+
+
+// copy and transpose an upper triangular strmat into a lower triangular strmat
+void strtr_u_libstr(int m, struct s_strmat *sA, int ai, int aj, struct s_strmat *sC, int ci, int cj)
+	{
+	const int bs = 4;
+	int sda = sA->cn;
+	float *pA = sA->pA + ai/bs*bs*sda + ai%bs + aj*bs;
+	int sdc = sC->cn;
+	float *pC = sC->pA + ci/bs*bs*sdc + ci%bs + cj*bs;
+	strtr_u_lib(m, 1.0, ai%bs, pA, sda, ci%bs, pC, sdc); // TODO remove alpha !!!
+	return;
+	}
+
+
+
+// insert a strvec to diagonal of strmat, sparse formulation 
+void sdiain_sp_libstr(int kmax, float alpha, struct s_strvec *sx, int xi, int *idx, struct s_strmat *sD, int di, int dj)
+	{
+	const int bs = 4;
+	float *x = sx->pa + xi;
+	int sdd = sD->cn;
+	float *pD = sD->pA;
+	int ii, jj;
+	for(jj=0; jj<kmax; jj++)
+		{
+		ii = idx[jj];
+		pD[(ii+di)/bs*bs*sdd+(ii+di)%bs+(ii+dj)*bs] = alpha * x[jj];
+		}
+	return;
+	}
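+
+
+
+// Note on the indexing used by the sparse routines in this section: in the
+// panel-major layout, groups of bs rows are packed into row panels of
+// leading dimension sdd, so element (i,j) lives at linear offset
+// i/bs*bs*sdd + i%bs + j*bs (panel start + row within panel + column).
+// A minimal sketch of that computation (hypothetical helper, not part of
+// the library API; bs=4 in this file):
+static int s_panel_offset_sketch(int i, int j, int bs, int sdd)
+	{
+	return i/bs*bs*sdd + i%bs + j*bs;
+	}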
+
+
+
+// extract the diagonal of a strmat to a strvec, sparse formulation 
+void sdiaex_sp_libstr(int kmax, float alpha, int *idx, struct s_strmat *sD, int di, int dj, struct s_strvec *sx, int xi)
+	{
+	const int bs = 4;
+	float *x = sx->pa + xi;
+	int sdd = sD->cn;
+	float *pD = sD->pA;
+	int ii, jj;
+	for(jj=0; jj<kmax; jj++)
+		{
+		ii = idx[jj];
+		x[jj] = alpha * pD[(ii+di)/bs*bs*sdd+(ii+di)%bs+(ii+dj)*bs];
+		}
+	return;
+	}
+
+
+
+// add scaled strvec to diagonal of strmat, sparse formulation 
+void sdiaad_sp_libstr(int kmax, float alpha, struct s_strvec *sx, int xi, int *idx, struct s_strmat *sD, int di, int dj)
+	{
+	const int bs = 4;
+	float *x = sx->pa + xi;
+	int sdd = sD->cn;
+	float *pD = sD->pA;
+	int ii, jj;
+	for(jj=0; jj<kmax; jj++)
+		{
+		ii = idx[jj];
+		pD[(ii+di)/bs*bs*sdd+(ii+di)%bs+(ii+dj)*bs] += alpha * x[jj];
+		}
+	return;
+	}
+
+
+
+// add scaled strvec to another strvec and insert to diagonal of strmat, sparse formulation 
+void sdiaadin_sp_libstr(int kmax, float alpha, struct s_strvec *sx, int xi, struct s_strvec *sy, int yi, int *idx, struct s_strmat *sD, int di, int dj)
+	{
+	const int bs = 4;
+	float *x = sx->pa + xi;
+	float *y = sy->pa + yi;
+	int sdd = sD->cn;
+	float *pD = sD->pA;
+	int ii, jj;
+	for(jj=0; jj<kmax; jj++)
+		{
+		ii = idx[jj];
+		pD[(ii+di)/bs*bs*sdd+(ii+di)%bs+(ii+dj)*bs] = y[jj] + alpha * x[jj];
+		}
+	return;
+	}
+
+
+
+// add scaled strvec to row of strmat, sparse formulation 
+void srowad_sp_libstr(int kmax, float alpha, struct s_strvec *sx, int xi, int *idx, struct s_strmat *sD, int di, int dj)
+	{
+	const int bs = 4;
+	float *x = sx->pa + xi;
+	int sdd = sD->cn;
+	float *pD = sD->pA + di/bs*bs*sdd + di%bs + dj*bs;
+	srowad_libsp(kmax, idx, alpha, x, pD);
+	return;
+	}
+
+
+
+// adds strvec to strvec, sparse formulation
+void svecad_sp_libstr(int kmax, float alpha, struct s_strvec *sx, int xi, int *idx, struct s_strvec *sy, int yi)
+	{
+	float *x = sx->pa + xi;
+	float *y = sy->pa + yi;
+	svecad_libsp(kmax, idx, alpha, x, y);
+	return;
+	}
+
+
+
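+// insert scaled strvec to strvec, sparse formulation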
+void svecin_sp_libstr(int m, float alpha, struct s_strvec *sx, int xi, int *idx, struct s_strvec *sz, int zi)
+	{
+	float *x = sx->pa + xi;
+	float *z = sz->pa + zi;
+	int ii;
+	for(ii=0; ii<m; ii++)
+		z[idx[ii]] = alpha * x[ii];
+	return;
+	}
+
+
+
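+// extract scaled strvec from strvec, sparse formulation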
+void svecex_sp_libstr(int m, float alpha, int *idx, struct s_strvec *sx, int xi, struct s_strvec *sz, int zi)
+	{
+	float *x = sx->pa + xi;
+	float *z = sz->pa + zi;
+	int ii;
+	for(ii=0; ii<m; ii++)
+		z[ii] = alpha * x[idx[ii]];
+	return;
+	}
+
+
+
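+// compute the infinity norm of a strvec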
+void svecnrm_inf_libstr(int m, struct s_strvec *sx, int xi, float *ptr_norm)
+	{
+	int ii;
+	float *x = sx->pa + xi;
+	float norm = 0.0;
+	for(ii=0; ii<m; ii++)
+		norm = fmaxf(norm, fabsf(x[ii]));
+	*ptr_norm = norm;
+	return;
+	}
+
+
+#else
+
+#error : wrong LA choice
+
+#endif
+
+
+
diff --git a/auxiliary/s_aux_lib8.c b/auxiliary/s_aux_lib8.c
new file mode 100644
index 0000000..94ba22d
--- /dev/null
+++ b/auxiliary/s_aux_lib8.c
@@ -0,0 +1,2647 @@
+/**************************************************************************************************
+*                                                                                                 *
+* This file is part of BLASFEO.                                                                   *
+*                                                                                                 *
+* BLASFEO -- BLAS For Embedded Optimization.                                                      *
+* Copyright (C) 2016-2017 by Gianluca Frison.                                                     *
+* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl.              *
+* All rights reserved.                                                                            *
+*                                                                                                 *
+* HPMPC is free software; you can redistribute it and/or                                          *
+* modify it under the terms of the GNU Lesser General Public                                      *
+* License as published by the Free Software Foundation; either                                    *
+* version 2.1 of the License, or (at your option) any later version.                              *
+*                                                                                                 *
+* HPMPC is distributed in the hope that it will be useful,                                        *
+* but WITHOUT ANY WARRANTY; without even the implied warranty of                                  *
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.                                            *
+* See the GNU Lesser General Public License for more details.                                     *
+*                                                                                                 *
+* You should have received a copy of the GNU Lesser General Public                                *
+* License along with HPMPC; if not, write to the Free Software                                    *
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA                  *
+*                                                                                                 *
+* Author: Gianluca Frison, giaf (at) dtu.dk                                                       *
+*                          gianluca.frison (at) imtek.uni-freiburg.de                             *
+*                                                                                                 *
+**************************************************************************************************/
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <math.h>
+
+#include "../include/blasfeo_common.h"
+#include "../include/blasfeo_block_size.h"
+#include "../include/blasfeo_s_kernel.h"
+
+
+
+// copies a lower triangular packed matrix into a lower triangular packed matrix
+void strcp_l_lib(int m, int offsetA, float *A, int sda, int offsetB, float *B, int sdb)
+	{
+	printf("\nstrcp_;l_lib: feature not implemented yet\n");
+	exit(1);
+	}
+
+
+
+// scales and adds a strvec into a strvec
+void svecad_libstr(int m, float alpha, struct s_strvec *sa, int ai, struct s_strvec *sc, int ci)
+	{
+	float *pa = sa->pa + ai;
+	float *pc = sc->pa + ci;
+	int ii;
+	ii = 0;
+	for(; ii<m-3; ii+=4)
+		{
+		pc[ii+0] += alpha*pa[ii+0];
+		pc[ii+1] += alpha*pa[ii+1];
+		pc[ii+2] += alpha*pa[ii+2];
+		pc[ii+3] += alpha*pa[ii+3];
+		}
+	for(; ii<m; ii++)
+		{
+		pc[ii+0] += alpha*pa[ii+0];
+		}
+	return;
+	}
+
+
+
+// transpose lower triangular matrix
+void strtr_l_lib(int m, float alpha, int offsetA, float *pA, int sda, int offsetC, float *pC, int sdc)
+	{
+	printf("\nstrtr_l_lib: feature not implemented yet\n");
+	exit(1);
+	}
+
+
+
+// transpose an aligned upper triangular matrix into an aligned lower triangular matrix
+void strtr_u_lib(int m, float alpha, int offsetA, float *pA, int sda, int offsetC, float *pC, int sdc)
+	{
+	printf("\nstrtr_u_lib: feature not implemented yet\n");
+	exit(1);
+	}
+
+
+
+// regularize diagonal 
+void sdiareg_lib(int kmax, float reg, int offset, float *pD, int sdd)
+	{
+
+	const int bs = 8;
+
+	int kna = (bs-offset%bs)%bs;
+	kna = kmax<kna ? kmax : kna;
+
+	float *pD2;
+
+	int jj, ll;
+
+	if(kna>0)
+		{
+		for(ll=0; ll<kna; ll++)
+			{
+			pD[ll+bs*ll] += reg;
+			}
+		pD += kna + bs*(sdd-1) + kna*bs;
+		kmax -= kna;
+		}
+	pD2 = pD;
+	for(jj=0; jj<kmax-7; jj+=8)
+		{
+		pD2[0+0*bs] += reg;
+		pD2[1+1*bs] += reg;
+		pD2[2+2*bs] += reg;
+		pD2[3+3*bs] += reg;
+		pD2[4+4*bs] += reg;
+		pD2[5+5*bs] += reg;
+		pD2[6+6*bs] += reg;
+		pD2[7+7*bs] += reg;
+		pD2 += bs*sdd+bs*bs;
+		}
+	for(ll=0; ll<kmax-jj; ll++)
+		{
+		pD[jj*sdd+(jj+ll)*bs+ll] += reg;
+		}
+	
+	}
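+
+
+
+// Note: in sdiareg_lib above and the diagonal routines below, kna counts the
+// diagonal elements left before the next panel boundary: with bs=8 and
+// offset=3, kna = (8-3%8)%8 = 5, so five elements go through the clean-up
+// loop before the 8-wide unrolled loop runs on aligned panels.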
+
+
+
+// insert vector to diagonal 
+void sdiain_lib(int kmax, float alpha, float *x, int offset, float *pD, int sdd)
+	{
+
+	const int bs = 8;
+
+	int kna = (bs-offset%bs)%bs;
+	kna = kmax<kna ? kmax : kna;
+
+	float *pD2, *x2;
+
+	int jj, ll;
+
+	if(kna>0)
+		{
+		for(ll=0; ll<kna; ll++)
+			{
+			pD[ll+bs*ll] = alpha*x[ll];
+			}
+		pD += kna + bs*(sdd-1) + kna*bs;
+		x  += kna;
+		kmax -= kna;
+		}
+	pD2 = pD;
+	x2 = x;
+	for(jj=0; jj<kmax-7; jj+=8)
+		{
+		pD2[0+bs*0] = alpha*x2[0];
+		pD2[1+bs*1] = alpha*x2[1];
+		pD2[2+bs*2] = alpha*x2[2];
+		pD2[3+bs*3] = alpha*x2[3];
+		pD2[4+bs*4] = alpha*x2[4];
+		pD2[5+bs*5] = alpha*x2[5];
+		pD2[6+bs*6] = alpha*x2[6];
+		pD2[7+bs*7] = alpha*x2[7];
+		pD2 += bs*sdd+bs*bs;
+		x2 += bs;
+		}
+	for(ll=0; ll<kmax-jj; ll++)
+		{
+		pD[jj*sdd+(jj+ll)*bs+ll] = alpha*x[jj+ll];
+		}
+	
+	}
+
+
+
+// insert sqrt of vector to diagonal 
+void sdiain_sqrt_lib(int kmax, float *x, int offset, float *pD, int sdd)
+	{
+
+	const int bs = 8;
+
+	int kna = (bs-offset%bs)%bs;
+	kna = kmax<kna ? kmax : kna;
+
+	float *pD2, *x2;
+
+	int jj, ll;
+
+	if(kna>0)
+		{
+		for(ll=0; ll<kna; ll++)
+			{
+			pD[ll+bs*ll] = sqrt(x[ll]);
+			}
+		pD += kna + bs*(sdd-1) + kna*bs;
+		x  += kna;
+		kmax -= kna;
+		}
+	pD2 = pD;
+	x2 = x;
+	for(jj=0; jj<kmax-7; jj+=8)
+		{
+		pD2[0+bs*0] = sqrt(x2[0]);
+		pD2[1+bs*1] = sqrt(x2[1]);
+		pD2[2+bs*2] = sqrt(x2[2]);
+		pD2[3+bs*3] = sqrt(x2[3]);
+		pD2[4+bs*4] = sqrt(x2[4]);
+		pD2[5+bs*5] = sqrt(x2[5]);
+		pD2[6+bs*6] = sqrt(x2[6]);
+		pD2[7+bs*7] = sqrt(x2[7]);
+		pD2 += bs*sdd+bs*bs;
+		x2 += bs;
+		}
+	for(ll=0; ll<kmax-jj; ll++)
+		{
+		pD[jj*sdd+(jj+ll)*bs+ll] = sqrt(x[jj+ll]);
+		}
+	
+	}
+
+
+
+// extract diagonal to vector 
+void sdiaex_lib(int kmax, float alpha, int offset, float *pD, int sdd, float *x)
+	{
+
+	const int bs = 8;
+
+	int kna = (bs-offset%bs)%bs;
+	kna = kmax<kna ? kmax : kna;
+
+	float *pD2, *x2;
+
+	int jj, ll;
+
+	if(kna>0)
+		{
+		for(ll=0; ll<kna; ll++)
+			{
+			x[ll] = alpha * pD[ll+bs*ll];
+			}
+		pD += kna + bs*(sdd-1) + kna*bs;
+		x  += kna;
+		kmax -= kna;
+		}
+	pD2 = pD;
+	x2 = x;
+	for(jj=0; jj<kmax-7; jj+=8)
+		{
+		x2[0] = alpha * pD2[0+bs*0];
+		x2[1] = alpha * pD2[1+bs*1];
+		x2[2] = alpha * pD2[2+bs*2];
+		x2[3] = alpha * pD2[3+bs*3];
+		x2[4] = alpha * pD2[4+bs*4];
+		x2[5] = alpha * pD2[5+bs*5];
+		x2[6] = alpha * pD2[6+bs*6];
+		x2[7] = alpha * pD2[7+bs*7];
+		pD2 += bs*sdd+bs*bs;
+		x2 += bs;
+		}
+	for(ll=0; ll<kmax-jj; ll++)
+		{
+		x[jj+ll] = alpha * pD[jj*sdd+(jj+ll)*bs+ll];
+		}
+	
+	}
+
+
+
+// add scaled vector to diagonal 
+void sdiaad_lib(int kmax, float alpha, float *x, int offset, float *pD, int sdd)
+	{
+
+	const int bs = 8;
+
+	int kna = (bs-offset%bs)%bs;
+	kna = kmax<kna ? kmax : kna;
+
+	float *pD2, *x2;
+
+	int jj, ll;
+
+	if(kna>0)
+		{
+		for(ll=0; ll<kna; ll++)
+			{
+			pD[ll+bs*ll] += alpha * x[ll];
+			}
+		pD += kna + bs*(sdd-1) + kna*bs;
+		x  += kna;
+		kmax -= kna;
+		}
+	pD2 = pD;
+	x2 = x;
+	for(jj=0; jj<kmax-7; jj+=8)
+		{
+		pD2[0+bs*0] += alpha * x2[0];
+		pD2[1+bs*1] += alpha * x2[1];
+		pD2[2+bs*2] += alpha * x2[2];
+		pD2[3+bs*3] += alpha * x2[3];
+		pD2[4+bs*4] += alpha * x2[4];
+		pD2[5+bs*5] += alpha * x2[5];
+		pD2[6+bs*6] += alpha * x2[6];
+		pD2[7+bs*7] += alpha * x2[7];
+		pD2 += bs*sdd+bs*bs;
+		x2 += bs;
+		}
+	for(ll=0; ll<kmax-jj; ll++)
+		{
+		pD[jj*sdd+(jj+ll)*bs+ll] += alpha * x[jj+ll];
+		}
+	return;
+	}
+
+
+
+// insert vector to diagonal, sparse formulation 
+void sdiain_libsp(int kmax, int *idx, float alpha, float *x, float *pD, int sdd)
+	{
+
+	const int bs = 8;
+
+	int ii, jj;
+
+	for(jj=0; jj<kmax; jj++)
+		{
+		ii = idx[jj];
+		pD[ii/bs*bs*sdd+ii%bs+ii*bs] = alpha * x[jj];
+		}
+	return;
+	}
+
+
+
+// extract diagonal to vector, sparse formulation 
+void sdiaex_libsp(int kmax, int *idx, float alpha, float *pD, int sdd, float *x)
+	{
+
+	const int bs = 8;
+
+	int ii, jj;
+
+	for(jj=0; jj<kmax; jj++)
+		{
+		ii = idx[jj];
+		x[jj] = alpha * pD[ii/bs*bs*sdd+ii%bs+ii*bs];
+		}
+	return;
+	}
+
+
+
+// add scaled vector to diagonal, sparse formulation 
+void sdiaad_libsp(int kmax, int *idx, float alpha, float *x, float *pD, int sdd)
+	{
+
+	const int bs = 8;
+
+	int ii, jj;
+
+	for(jj=0; jj<kmax; jj++)
+		{
+		ii = idx[jj];
+		pD[ii/bs*bs*sdd+ii%bs+ii*bs] += alpha * x[jj];
+		}
+	return;
+	}
+
+
+
+// add scaled vector to another vector and insert to diagonal, sparse formulation 
+void sdiaadin_libsp(int kmax, int *idx, float alpha, float *x, float *y, float *pD, int sdd)
+	{
+
+	const int bs = 8;
+
+	int ii, jj;
+
+	for(jj=0; jj<kmax; jj++)
+		{
+		ii = idx[jj];
+		pD[ii/bs*bs*sdd+ii%bs+ii*bs] = y[jj] + alpha * x[jj];
+		}
+	return;
+	}
+
+
+
+// insert vector to row 
+void srowin_lib(int kmax, float alpha, float *x, float *pD)
+	{
+	
+	const int bs = 8;
+
+	int jj, ll;
+
+	for(jj=0; jj<kmax-3; jj+=4)
+		{
+		pD[0*bs] = alpha * x[0];
+		pD[1*bs] = alpha * x[1];
+		pD[2*bs] = alpha * x[2];
+		pD[3*bs] = alpha * x[3];
+		pD += 4*bs;
+		x += 4;
+		}
+	for(ll=0; ll<kmax-jj; ll++)
+		{
+		pD[ll*bs] = alpha*x[ll];
+		}
+	return;
+	}
+
+
+
+// extract row to vector
+void srowex_lib(int kmax, float alpha, float *pD, float *x)
+	{
+	
+	const int bs = 8;
+
+	int jj, ll;
+
+	for(jj=0; jj<kmax-3; jj+=4)
+		{
+		x[0] = alpha * pD[0*bs];
+		x[1] = alpha * pD[1*bs];
+		x[2] = alpha * pD[2*bs];
+		x[3] = alpha * pD[3*bs];
+		pD += 4*bs;
+		x += 4;
+		}
+	for(ll=0; ll<kmax-jj; ll++)
+		{
+		x[ll] = alpha*pD[ll*bs];
+		}
+	return;
+	}
+
+
+
+// add scaled vector to row 
+void srowad_lib(int kmax, float alpha, float *x, float *pD)
+	{
+
+	const int bs = 8;
+
+	int jj, ll;
+
+	for(jj=0; jj<kmax-3; jj+=4)
+		{
+		pD[0*bs] += alpha * x[0];
+		pD[1*bs] += alpha * x[1];
+		pD[2*bs] += alpha * x[2];
+		pD[3*bs] += alpha * x[3];
+		pD += 4*bs;
+		x += 4;
+		}
+	for(ll=0; ll<kmax-jj; ll++)
+		{
+		pD[ll*bs] += alpha * x[ll];
+		}
+	return;
+	}
+
+
+
+// insert vector to row, sparse formulation 
+void srowin_libsp(int kmax, float alpha, int *idx, float *x, float *pD)
+	{
+
+	const int bs = 8;
+
+	int ii, jj;
+
+	for(jj=0; jj<kmax; jj++)
+		{
+		ii = idx[jj];
+		pD[ii*bs] = alpha*x[jj];
+		}
+	return;
+	}
+
+
+
+// add scaled vector to row, sparse formulation 
+void srowad_libsp(int kmax, int *idx, float alpha, float *x, float *pD)
+	{
+
+	const int bs = 8;
+
+	int ii, jj;
+
+	for(jj=0; jj<kmax; jj++)
+		{
+		ii = idx[jj];
+		pD[ii*bs] += alpha * x[jj];
+		}
+	return;
+	}
+
+
+
+// add scaled vector to another vector and insert to row, sparse formulation 
+void srowadin_libsp(int kmax, int *idx, float alpha, float *x, float *y, float *pD)
+	{
+
+	const int bs = 8;
+
+	int ii, jj;
+
+	for(jj=0; jj<kmax; jj++)
+		{
+		ii = idx[jj];
+		pD[ii*bs] = y[jj] + alpha * x[jj];
+		}
+	return;
+	}
+
+
+
+// swap two rows
+void srowsw_lib(int kmax, float *pA, float *pC)
+	{
+
+	const int bs = 8;
+
+	int ii;
+	float tmp;
+
+	for(ii=0; ii<kmax-3; ii+=4)
+		{
+		tmp = pA[0+bs*0];
+		pA[0+bs*0] = pC[0+bs*0];
+		pC[0+bs*0] = tmp;
+		tmp = pA[0+bs*1];
+		pA[0+bs*1] = pC[0+bs*1];
+		pC[0+bs*1] = tmp;
+		tmp = pA[0+bs*2];
+		pA[0+bs*2] = pC[0+bs*2];
+		pC[0+bs*2] = tmp;
+		tmp = pA[0+bs*3];
+		pA[0+bs*3] = pC[0+bs*3];
+		pC[0+bs*3] = tmp;
+		pA += 4*bs;
+		pC += 4*bs;
+		}
+	for( ; ii<kmax; ii++)
+		{
+		tmp = pA[0+bs*0];
+		pA[0+bs*0] = pC[0+bs*0];
+		pC[0+bs*0] = tmp;
+		pA += 1*bs;
+		pC += 1*bs;
+		}
+	return;
+	}
+
+
+
+// insert vector to column 
+void scolin_lib(int kmax, float *x, int offset, float *pD, int sdd)
+	{
+
+	const int bs = 8;
+
+	int kna = (bs-offset%bs)%bs;
+	kna = kmax<kna ? kmax : kna;
+
+	int jj, ll;
+
+	if(kna>0)
+		{
+		for(ll=0; ll<kna; ll++)
+			{
+			pD[ll] = x[ll];
+			}
+		pD += kna + bs*(sdd-1);
+		x  += kna;
+		kmax -= kna;
+		}
+	for(jj=0; jj<kmax-7; jj+=8)
+		{
+		pD[0] = x[0];
+		pD[1] = x[1];
+		pD[2] = x[2];
+		pD[3] = x[3];
+		pD[4] = x[4];
+		pD[5] = x[5];
+		pD[6] = x[6];
+		pD[7] = x[7];
+		pD += bs*sdd;
+		x += bs;
+		}
+	for(ll=0; ll<kmax-jj; ll++)
+		{
+		pD[ll] = x[ll];
+		}
+	
+	}
+
+
+
+// add scaled vector to column 
+void scolad_lib(int kmax, float alpha, float *x, int offset, float *pD, int sdd)
+	{
+
+	const int bs = 8;
+
+	int kna = (bs-offset%bs)%bs;
+	kna = kmax<kna ? kmax : kna;
+
+	int jj, ll;
+
+	if(kna>0)
+		{
+		for(ll=0; ll<kna; ll++)
+			{
+			pD[ll] += alpha * x[ll];
+			}
+		pD += kna + bs*(sdd-1);
+		x  += kna;
+		kmax -= kna;
+		}
+	for(jj=0; jj<kmax-7; jj+=8)
+		{
+		pD[0] += alpha * x[0];
+		pD[1] += alpha * x[1];
+		pD[2] += alpha * x[2];
+		pD[3] += alpha * x[3];
+		pD[4] += alpha * x[4];
+		pD[5] += alpha * x[5];
+		pD[6] += alpha * x[6];
+		pD[7] += alpha * x[7];
+		pD += bs*sdd;
+		x += bs;
+		}
+	for(ll=0; ll<kmax-jj; ll++)
+		{
+		pD[ll] += alpha * x[ll];
+		}
+	
+	}
+
+
+
+// insert vector to diagonal, sparse formulation 
+void scolin_libsp(int kmax, int *idx, float *x, float *pD, int sdd)
+	{
+
+	const int bs = 8;
+
+	int ii, jj;
+
+	for(jj=0; jj<kmax; jj++)
+		{
+		ii = idx[jj];
+		pD[ii/bs*bs*sdd+ii%bs] = x[jj];
+		}
+	
+	}
+
+
+
+// add scaled vector to diagonal, sparse formulation 
+void scolad_libsp(int kmax, float alpha, int *idx, float *x, float *pD, int sdd)
+	{
+
+	const int bs = 8;
+
+	int ii, jj;
+
+	for(jj=0; jj<kmax; jj++)
+		{
+		ii = idx[jj];
+		pD[ii/bs*bs*sdd+ii%bs] += alpha * x[jj];
+		}
+	
+	}
+
+
+
+// swaps two cols
+void scolsw_lib(int kmax, int offsetA, float *pA, int sda, int offsetC, float *pC, int sdc)
+	{
+
+	const int bs = 8;
+
+	int ii;
+
+	float tmp;
+
+	if(offsetA==offsetC)
+		{
+		if(offsetA>0)
+			{
+			ii = 0;
+			for(; ii<bs-offsetA; ii++)
+				{
+				tmp = pA[0+bs*0];
+				pA[0+bs*0] = pC[0+bs*0];
+				pC[0+bs*0] = tmp;
+				pA += 1;
+				pC += 1;
+				}
+			pA += bs*(sda-1);
+			pC += bs*(sdc-1);
+			kmax -= bs-offsetA;
+			}
+		ii = 0;
+		for(; ii<kmax-7; ii+=8)
+			{
+			tmp = pA[0+bs*0];
+			pA[0+bs*0] = pC[0+bs*0];
+			pC[0+bs*0] = tmp;
+			tmp = pA[1+bs*0];
+			pA[1+bs*0] = pC[1+bs*0];
+			pC[1+bs*0] = tmp;
+			tmp = pA[2+bs*0];
+			pA[2+bs*0] = pC[2+bs*0];
+			pC[2+bs*0] = tmp;
+			tmp = pA[3+bs*0];
+			pA[3+bs*0] = pC[3+bs*0];
+			pC[3+bs*0] = tmp;
+			tmp = pA[4+bs*0];
+			pA[4+bs*0] = pC[4+bs*0];
+			pC[4+bs*0] = tmp;
+			tmp = pA[5+bs*0];
+			pA[5+bs*0] = pC[5+bs*0];
+			pC[5+bs*0] = tmp;
+			tmp = pA[6+bs*0];
+			pA[6+bs*0] = pC[6+bs*0];
+			pC[6+bs*0] = tmp;
+			tmp = pA[7+bs*0];
+			pA[7+bs*0] = pC[7+bs*0];
+			pC[7+bs*0] = tmp;
+			pA += bs*sda;
+			pC += bs*sdc;
+			}
+		for(; ii<kmax; ii++)
+			{
+			tmp = pA[0+bs*0];
+			pA[0+bs*0] = pC[0+bs*0];
+			pC[0+bs*0] = tmp;
+			pA += 1;
+			pC += 1;
+			}
+		}
+	else
+		{
+		printf("\nscolsw: feature not implemented yet: offsetA!=offsetC\n\n");
+		exit(1);
+		}
+
+	return;
+
+	}
+
+
+
+// insert vector to vector, sparse formulation
+void svecin_libsp(int kmax, int *idx, float *x, float *y)
+	{
+
+	int jj;
+
+	for(jj=0; jj<kmax; jj++)
+		{
+		y[idx[jj]] = x[jj];
+		}
+	
+	}
+
+
+
+// adds vector to vector, sparse formulation
+void svecad_libsp(int kmax, int *idx, float alpha, float *x, float *y)
+	{
+
+	int jj;
+
+	for(jj=0; jj<kmax; jj++)
+		{
+		y[idx[jj]] += alpha * x[jj];
+		}
+	
+	}
+
+
+
+/****************************
+* new interface
+****************************/
+
+
+
+#if defined(LA_HIGH_PERFORMANCE)
+
+
+
+// return the memory size (in bytes) needed for a strmat
+int s_size_strmat(int m, int n)
+	{
+	const int bs = 8;
+	int nc = S_NC;
+	int al = bs*nc;
+	int pm = (m+bs-1)/bs*bs;
+	int cn = (n+nc-1)/nc*nc;
+	int tmp = m<n ? (m+al-1)/al*al : (n+al-1)/al*al; // al(min(m,n)) // XXX max ???
+	int memory_size = (pm*cn+tmp)*sizeof(float);
+	return memory_size;
+	}
+
+
+
+// return the memory size (in bytes) needed for the diagonal of a strmat
+int s_size_diag_strmat(int m, int n)
+	{
+	const int bs = 8;
+	int nc = S_NC;
+	int al = bs*nc;
+	int tmp = m<n ? (m+al-1)/al*al : (n+al-1)/al*al; // al(min(m,n)) // XXX max ???
+	int memory_size = tmp*sizeof(float);
+	return memory_size;
+	}
+
+
+
+// create a matrix structure for a matrix of size m*n by using memory passed by a pointer
+void s_create_strmat(int m, int n, struct s_strmat *sA, void *memory)
+	{
+	const int bs = 8;
+	int nc = S_NC;
+	int al = bs*nc;
+	sA->m = m;
+	sA->n = n;
+	int pm = (m+bs-1)/bs*bs;
+	int cn = (n+nc-1)/nc*nc;
+	sA->pm = pm;
+	sA->cn = cn;
+	float *ptr = (float *) memory;
+	sA->pA = ptr;
+	ptr += pm*cn;
+	int tmp = m<n ? (m+al-1)/al*al : (n+al-1)/al*al; // al(min(m,n)) // XXX max ???
+	sA->dA = ptr;
+	ptr += tmp;
+	sA->use_dA = 0;
+	sA->memory_size = (pm*cn+tmp)*sizeof(float);
+	return;
+	}
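+
+
+
+// A minimal usage sketch of the two routines above (hypothetical helper,
+// not part of the library API): query the required size, allocate it with
+// malloc, and wrap it in an s_strmat; the caller owns the memory.
+static void s_allocate_strmat_sketch(int m, int n, struct s_strmat *sA, void **memory)
+	{
+	*memory = malloc(s_size_strmat(m, n)); // size in bytes, including space for the extra diagonal array dA
+	s_create_strmat(m, n, sA, *memory); // sA->pA and sA->dA now point into *memory
+	}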
+
+
+
+// return memory size (in bytes) needed for a strvec
+int s_size_strvec(int m)
+	{
+	const int bs = 8;
+//	int nc = S_NC;
+//	int al = bs*nc;
+	int pm = (m+bs-1)/bs*bs;
+	int memory_size = pm*sizeof(float);
+	return memory_size;
+	}
+
+
+
+// create a vector structure for a vector of size m by using memory passed by a pointer
+void s_create_strvec(int m, struct s_strvec *sa, void *memory)
+	{
+	const int bs = 8;
+//	int nc = S_NC;
+//	int al = bs*nc;
+	sa->m = m;
+	int pm = (m+bs-1)/bs*bs;
+	sa->pm = pm;
+	float *ptr = (float *) memory;
+	sa->pa = ptr;
+//	ptr += pm;
+	sa->memory_size = pm*sizeof(float);
+	return;
+	}
+
+
+
+// convert a matrix into a matrix structure
+void s_cvt_mat2strmat(int m, int n, float *A, int lda, struct s_strmat *sA, int ai, int aj)
+	{
+	const int bs = 8;
+	int sda = sA->cn;
+	float *pA = sA->pA + aj*bs + ai/bs*bs*sda + ai%bs;
+	int i, ii, j, jj, m0, m1, m2;
+	float *B, *pB;
+	m0 = (bs-ai%bs)%bs;
+	if(m0>m)
+		m0 = m;
+	m1 = m - m0;
+	jj = 0;
+	for( ; jj<n-3; jj+=4)
+		{
+		B  =  A + jj*lda;
+		pB = pA + jj*bs;
+		ii = 0;
+		if(m0>0)
+			{
+			for( ; ii<m0; ii++)
+				{
+				pB[ii+bs*0] = B[ii+lda*0];
+				pB[ii+bs*1] = B[ii+lda*1];
+				pB[ii+bs*2] = B[ii+lda*2];
+				pB[ii+bs*3] = B[ii+lda*3];
+				}
+			B  += m0;
+			pB += m0 + bs*(sda-1);
+			}
+		for( ; ii<m-7; ii+=8)
+			{
+			// unroll 0
+			pB[0+bs*0] = B[0+lda*0];
+			pB[1+bs*0] = B[1+lda*0];
+			pB[2+bs*0] = B[2+lda*0];
+			pB[3+bs*0] = B[3+lda*0];
+			pB[4+bs*0] = B[4+lda*0];
+			pB[5+bs*0] = B[5+lda*0];
+			pB[6+bs*0] = B[6+lda*0];
+			pB[7+bs*0] = B[7+lda*0];
+			// unroll 1
+			pB[0+bs*1] = B[0+lda*1];
+			pB[1+bs*1] = B[1+lda*1];
+			pB[2+bs*1] = B[2+lda*1];
+			pB[3+bs*1] = B[3+lda*1];
+			pB[4+bs*1] = B[4+lda*1];
+			pB[5+bs*1] = B[5+lda*1];
+			pB[6+bs*1] = B[6+lda*1];
+			pB[7+bs*1] = B[7+lda*1];
+			// unroll 2
+			pB[0+bs*2] = B[0+lda*2];
+			pB[1+bs*2] = B[1+lda*2];
+			pB[2+bs*2] = B[2+lda*2];
+			pB[3+bs*2] = B[3+lda*2];
+			pB[4+bs*2] = B[4+lda*2];
+			pB[5+bs*2] = B[5+lda*2];
+			pB[6+bs*2] = B[6+lda*2];
+			pB[7+bs*2] = B[7+lda*2];
+			// unroll 3
+			pB[0+bs*3] = B[0+lda*3];
+			pB[1+bs*3] = B[1+lda*3];
+			pB[2+bs*3] = B[2+lda*3];
+			pB[3+bs*3] = B[3+lda*3];
+			pB[4+bs*3] = B[4+lda*3];
+			pB[5+bs*3] = B[5+lda*3];
+			pB[6+bs*3] = B[6+lda*3];
+			pB[7+bs*3] = B[7+lda*3];
+			// update
+			B  += 8;
+			pB += bs*sda;
+			}
+		for( ; ii<m; ii++)
+			{
+			// col 0
+			pB[0+bs*0] = B[0+lda*0];
+			// col 1
+			pB[0+bs*1] = B[0+lda*1];
+			// col 2
+			pB[0+bs*2] = B[0+lda*2];
+			// col 3
+			pB[0+bs*3] = B[0+lda*3];
+			// update
+			B  += 1;
+			pB += 1;
+			}
+		}
+	for( ; jj<n; jj++)
+		{
+
+		B  =  A + jj*lda;
+		pB = pA + jj*bs;
+
+		ii = 0;
+		if(m0>0)
+			{
+			for( ; ii<m0; ii++)
+				{
+				pB[ii+bs*0] = B[ii+lda*0];
+				}
+			B  += m0;
+			pB += m0 + bs*(sda-1);
+			}
+		for( ; ii<m-7; ii+=8)
+			{
+			// col 0
+			pB[0+bs*0] = B[0+lda*0];
+			pB[1+bs*0] = B[1+lda*0];
+			pB[2+bs*0] = B[2+lda*0];
+			pB[3+bs*0] = B[3+lda*0];
+			pB[4+bs*0] = B[4+lda*0];
+			pB[5+bs*0] = B[5+lda*0];
+			pB[6+bs*0] = B[6+lda*0];
+			pB[7+bs*0] = B[7+lda*0];
+			// update
+			B  += 8;
+			pB += bs*sda;
+			}
+		for( ; ii<m; ii++)
+			{
+			// col 0
+			pB[0+bs*0] = B[0+lda*0];
+			// update
+			B  += 1;
+			pB += 1;
+			}
+		}
+	return;
+	}
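+
+
+
+// A minimal usage sketch of the conversion above (hypothetical helper, not
+// part of the library API): pack a column-major m-by-n matrix A with
+// leading dimension lda into sA at element (0,0); sA must have been created
+// with at least m rows and n columns.
+static void s_pack_mat_sketch(int m, int n, float *A, int lda, struct s_strmat *sA)
+	{
+	s_cvt_mat2strmat(m, n, A, lda, sA, 0, 0); // ai=aj=0: no panel offset
+	}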
+
+
+
+// convert and transpose a matrix into a matrix structure
+void s_cvt_tran_mat2strmat(int m, int n, float *A, int lda, struct s_strmat *sA, int ai, int aj)
+	{
+	const int bs = 8;
+	int sda = sA->cn;
+	float *pA = sA->pA + aj*bs + ai/bs*bs*sda + ai%bs;
+	int i, ii, j, m0, m1, m2;
+	float *B, *pB;
+	m0 = (bs-ai%bs)%bs;
+	if(m0>n)
+		m0 = n;
+	m1 = n - m0;
+	ii = 0;
+	if(m0>0)
+		{
+		for(j=0; j<m; j++)
+			{
+			for(i=0; i<m0; i++)
+				{
+				pA[i+j*bs+ii*sda] = A[j+(i+ii)*lda];
+				}
+			}
+		A  += m0*lda;
+		pA += m0 + bs*(sda-1);
+		}
+	ii = 0;
+	for(; ii<m1-7; ii+=bs)
+		{
+		j=0;
+		B  = A + ii*lda;
+		pB = pA + ii*sda;
+		for(; j<m-3; j+=4)
+			{
+			// unroll 0
+			pB[0+0*bs] = B[0+0*lda];
+			pB[1+0*bs] = B[0+1*lda];
+			pB[2+0*bs] = B[0+2*lda];
+			pB[3+0*bs] = B[0+3*lda];
+			pB[4+0*bs] = B[0+4*lda];
+			pB[5+0*bs] = B[0+5*lda];
+			pB[6+0*bs] = B[0+6*lda];
+			pB[7+0*bs] = B[0+7*lda];
+			// unroll 1
+			pB[0+1*bs] = B[1+0*lda];
+			pB[1+1*bs] = B[1+1*lda];
+			pB[2+1*bs] = B[1+2*lda];
+			pB[3+1*bs] = B[1+3*lda];
+			pB[4+1*bs] = B[1+4*lda];
+			pB[5+1*bs] = B[1+5*lda];
+			pB[6+1*bs] = B[1+6*lda];
+			pB[7+1*bs] = B[1+7*lda];
+			// unroll 2
+			pB[0+2*bs] = B[2+0*lda];
+			pB[1+2*bs] = B[2+1*lda];
+			pB[2+2*bs] = B[2+2*lda];
+			pB[3+2*bs] = B[2+3*lda];
+			pB[4+2*bs] = B[2+4*lda];
+			pB[5+2*bs] = B[2+5*lda];
+			pB[6+2*bs] = B[2+6*lda];
+			pB[7+2*bs] = B[2+7*lda];
+			// unroll 3
+			pB[0+3*bs] = B[3+0*lda];
+			pB[1+3*bs] = B[3+1*lda];
+			pB[2+3*bs] = B[3+2*lda];
+			pB[3+3*bs] = B[3+3*lda];
+			pB[4+3*bs] = B[3+4*lda];
+			pB[5+3*bs] = B[3+5*lda];
+			pB[6+3*bs] = B[3+6*lda];
+			pB[7+3*bs] = B[3+7*lda];
+			B  += 4;
+			pB += 4*bs;
+			}
+		for(; j<m; j++)
+			{
+			// unroll 0
+			pB[0+0*bs] = B[0+0*lda];
+			pB[1+0*bs] = B[0+1*lda];
+			pB[2+0*bs] = B[0+2*lda];
+			pB[3+0*bs] = B[0+3*lda];
+			pB[4+0*bs] = B[0+4*lda];
+			pB[5+0*bs] = B[0+5*lda];
+			pB[6+0*bs] = B[0+6*lda];
+			pB[7+0*bs] = B[0+7*lda];
+			B  += 1;
+			pB += 1*bs;
+			}
+		}
+	if(ii<m1)
+		{
+		m2 = m1-ii;
+		if(bs<m2) m2 = bs;
+		for(j=0; j<m; j++)
+			{
+			for(i=0; i<m2; i++)
+				{
+				pA[i+j*bs+ii*sda] = A[j+(i+ii)*lda];
+				}
+			}
+		}
+	return;
+	}
+
+
+
+// convert a vector into a vector structure
+void s_cvt_vec2strvec(int m, float *a, struct s_strvec *sa, int ai)
+	{
+	float *pa = sa->pa + ai;
+	int ii;
+	for(ii=0; ii<m; ii++)
+		pa[ii] = a[ii];
+	return;
+	}
+
+
+
+// convert a matrix structure into a matrix
+void s_cvt_strmat2mat(int m, int n, struct s_strmat *sA, int ai, int aj, float *A, int lda)
+	{
+	const int bs = 8;
+	int sda = sA->cn;
+	float *pA = sA->pA + aj*bs + ai/bs*bs*sda + ai%bs;
+	int i, ii, jj;
+	int m0 = (bs-ai%bs)%bs;
+	float *ptr_pA;
+	jj=0;
+	for(; jj<n-3; jj+=4)
+		{
+		ptr_pA = pA + jj*bs;
+		ii = 0;
+		if(m0>0)
+			{
+			for(; ii<m0; ii++)
+				{
+				// unroll 0
+				A[ii+lda*(jj+0)] = ptr_pA[0+bs*0];
+				// unroll 1
+				A[ii+lda*(jj+1)] = ptr_pA[0+bs*1];
+				// unroll 2
+				A[ii+lda*(jj+2)] = ptr_pA[0+bs*2];
+				// unroll 3
+				A[ii+lda*(jj+3)] = ptr_pA[0+bs*3];
+				ptr_pA++;
+				}
+			ptr_pA += (sda-1)*bs;
+			}
+		// TODO update A !!!!!
+		for(; ii<m-bs+1; ii+=bs)
+			{
+			// unroll 0
+			A[0+ii+lda*(jj+0)] = ptr_pA[0+bs*0];
+			A[1+ii+lda*(jj+0)] = ptr_pA[1+bs*0];
+			A[2+ii+lda*(jj+0)] = ptr_pA[2+bs*0];
+			A[3+ii+lda*(jj+0)] = ptr_pA[3+bs*0];
+			A[4+ii+lda*(jj+0)] = ptr_pA[4+bs*0];
+			A[5+ii+lda*(jj+0)] = ptr_pA[5+bs*0];
+			A[6+ii+lda*(jj+0)] = ptr_pA[6+bs*0];
+			A[7+ii+lda*(jj+0)] = ptr_pA[7+bs*0];
+			// unroll 1
+			A[0+ii+lda*(jj+1)] = ptr_pA[0+bs*1];
+			A[1+ii+lda*(jj+1)] = ptr_pA[1+bs*1];
+			A[2+ii+lda*(jj+1)] = ptr_pA[2+bs*1];
+			A[3+ii+lda*(jj+1)] = ptr_pA[3+bs*1];
+			A[4+ii+lda*(jj+1)] = ptr_pA[4+bs*1];
+			A[5+ii+lda*(jj+1)] = ptr_pA[5+bs*1];
+			A[6+ii+lda*(jj+1)] = ptr_pA[6+bs*1];
+			A[7+ii+lda*(jj+1)] = ptr_pA[7+bs*1];
+			// unroll 2
+			A[0+ii+lda*(jj+2)] = ptr_pA[0+bs*2];
+			A[1+ii+lda*(jj+2)] = ptr_pA[1+bs*2];
+			A[2+ii+lda*(jj+2)] = ptr_pA[2+bs*2];
+			A[3+ii+lda*(jj+2)] = ptr_pA[3+bs*2];
+			A[4+ii+lda*(jj+2)] = ptr_pA[4+bs*2];
+			A[5+ii+lda*(jj+2)] = ptr_pA[5+bs*2];
+			A[6+ii+lda*(jj+2)] = ptr_pA[6+bs*2];
+			A[7+ii+lda*(jj+2)] = ptr_pA[7+bs*2];
+			// unroll 3
+			A[0+ii+lda*(jj+3)] = ptr_pA[0+bs*3];
+			A[1+ii+lda*(jj+3)] = ptr_pA[1+bs*3];
+			A[2+ii+lda*(jj+3)] = ptr_pA[2+bs*3];
+			A[3+ii+lda*(jj+3)] = ptr_pA[3+bs*3];
+			A[4+ii+lda*(jj+3)] = ptr_pA[4+bs*3];
+			A[5+ii+lda*(jj+3)] = ptr_pA[5+bs*3];
+			A[6+ii+lda*(jj+3)] = ptr_pA[6+bs*3];
+			A[7+ii+lda*(jj+3)] = ptr_pA[7+bs*3];
+			ptr_pA += sda*bs;
+			}
+		for(; ii<m; ii++)
+			{
+			// unroll 0
+			A[ii+lda*(jj+0)] = ptr_pA[0+bs*0];
+			// unroll 1
+			A[ii+lda*(jj+1)] = ptr_pA[0+bs*1];
+			// unroll 2
+			A[ii+lda*(jj+2)] = ptr_pA[0+bs*2];
+			// unroll 3
+			A[ii+lda*(jj+3)] = ptr_pA[0+bs*3];
+			ptr_pA++;
+			}
+		}
+	for(; jj<n; jj++)
+		{
+		ptr_pA = pA + jj*bs;
+		ii = 0;
+		if(m0>0)
+			{
+			for(; ii<m0; ii++)
+				{
+				A[ii+lda*jj] = ptr_pA[0];
+				ptr_pA++;
+				}
+			ptr_pA += (sda-1)*bs;
+			}
+		for(; ii<m-bs+1; ii+=bs)
+			{
+			A[0+ii+lda*(jj+0)] = ptr_pA[0+bs*0];
+			A[1+ii+lda*(jj+0)] = ptr_pA[1+bs*0];
+			A[2+ii+lda*(jj+0)] = ptr_pA[2+bs*0];
+			A[3+ii+lda*(jj+0)] = ptr_pA[3+bs*0];
+			A[4+ii+lda*(jj+0)] = ptr_pA[4+bs*0];
+			A[5+ii+lda*(jj+0)] = ptr_pA[5+bs*0];
+			A[6+ii+lda*(jj+0)] = ptr_pA[6+bs*0];
+			A[7+ii+lda*(jj+0)] = ptr_pA[7+bs*0];
+			ptr_pA += sda*bs;
+			}
+		for(; ii<m; ii++)
+			{
+			A[ii+lda*jj] = ptr_pA[0];
+			ptr_pA++;
+			}
+		}
+	return;
+	}
+
+
+
+// convert and transpose a matrix structure into a matrix
+void s_cvt_tran_strmat2mat(int m, int n, struct s_strmat *sA, int ai, int aj, float *A, int lda)
+	{
+	const int bs = 8;
+	int sda = sA->cn;
+	float *pA = sA->pA + aj*bs + ai/bs*bs*sda + ai%bs;
+	int i, ii, jj;
+	int m0 = (bs-ai%bs)%bs;
+	float *ptr_pA;
+	jj=0;
+	for(; jj<n-3; jj+=4)
+		{
+		ptr_pA = pA + jj*bs;
+		ii = 0;
+		if(m0>0)
+			{
+			for(; ii<m0; ii++)
+				{
+				// unroll 0
+				A[jj+0+lda*ii] = ptr_pA[0+bs*0];
+				// unroll 1
+				A[jj+1+lda*ii] = ptr_pA[0+bs*1];
+				// unroll 2
+				A[jj+2+lda*ii] = ptr_pA[0+bs*2];
+				// unroll 3
+				A[jj+3+lda*ii] = ptr_pA[0+bs*3];
+				ptr_pA++;
+				}
+			ptr_pA += (sda-1)*bs;
+			}
+		// TODO update A !!!!!
+		for(; ii<m-bs+1; ii+=bs)
+			{
+			// unroll 0
+			A[jj+0+lda*(ii+0)] = ptr_pA[0+bs*0];
+			A[jj+0+lda*(ii+1)] = ptr_pA[1+bs*0];
+			A[jj+0+lda*(ii+2)] = ptr_pA[2+bs*0];
+			A[jj+0+lda*(ii+3)] = ptr_pA[3+bs*0];
+			A[jj+0+lda*(ii+4)] = ptr_pA[4+bs*0];
+			A[jj+0+lda*(ii+5)] = ptr_pA[5+bs*0];
+			A[jj+0+lda*(ii+6)] = ptr_pA[6+bs*0];
+			A[jj+0+lda*(ii+7)] = ptr_pA[7+bs*0];
+			// unroll 1
+			A[jj+1+lda*(ii+0)] = ptr_pA[0+bs*1];
+			A[jj+1+lda*(ii+1)] = ptr_pA[1+bs*1];
+			A[jj+1+lda*(ii+2)] = ptr_pA[2+bs*1];
+			A[jj+1+lda*(ii+3)] = ptr_pA[3+bs*1];
+			A[jj+1+lda*(ii+4)] = ptr_pA[4+bs*1];
+			A[jj+1+lda*(ii+5)] = ptr_pA[5+bs*1];
+			A[jj+1+lda*(ii+6)] = ptr_pA[6+bs*1];
+			A[jj+1+lda*(ii+7)] = ptr_pA[7+bs*1];
+			// unroll 2
+			A[jj+2+lda*(ii+0)] = ptr_pA[0+bs*2];
+			A[jj+2+lda*(ii+1)] = ptr_pA[1+bs*2];
+			A[jj+2+lda*(ii+2)] = ptr_pA[2+bs*2];
+			A[jj+2+lda*(ii+3)] = ptr_pA[3+bs*2];
+			A[jj+2+lda*(ii+4)] = ptr_pA[4+bs*2];
+			A[jj+2+lda*(ii+5)] = ptr_pA[5+bs*2];
+			A[jj+2+lda*(ii+6)] = ptr_pA[6+bs*2];
+			A[jj+2+lda*(ii+7)] = ptr_pA[7+bs*2];
+			// unroll 3
+			A[jj+3+lda*(ii+0)] = ptr_pA[0+bs*3];
+			A[jj+3+lda*(ii+1)] = ptr_pA[1+bs*3];
+			A[jj+3+lda*(ii+2)] = ptr_pA[2+bs*3];
+			A[jj+3+lda*(ii+3)] = ptr_pA[3+bs*3];
+			A[jj+3+lda*(ii+4)] = ptr_pA[4+bs*3];
+			A[jj+3+lda*(ii+5)] = ptr_pA[5+bs*3];
+			A[jj+3+lda*(ii+6)] = ptr_pA[6+bs*3];
+			A[jj+3+lda*(ii+7)] = ptr_pA[7+bs*3];
+			ptr_pA += sda*bs;
+			}
+		for(; ii<m; ii++)
+			{
+			// unroll 0
+			A[jj+0+lda*ii] = ptr_pA[0+bs*0];
+			// unroll 1
+			A[jj+1+lda*ii] = ptr_pA[0+bs*1];
+			// unroll 2
+			A[jj+2+lda*ii] = ptr_pA[0+bs*2];
+			// unroll 3
+			A[jj+3+lda*ii] = ptr_pA[0+bs*3];
+			ptr_pA++;
+			}
+		}
+	for(; jj<n; jj++)
+		{
+		ptr_pA = pA + jj*bs;
+		ii = 0;
+		if(m0>0)
+			{
+			for(; ii<m0; ii++)
+				{
+				A[jj+lda*ii] = ptr_pA[0];
+				ptr_pA++;
+				}
+			ptr_pA += (sda-1)*bs;
+			}
+		for(; ii<m-bs+1; ii+=bs)
+			{
+			i=0;
+			// TODO update A !!!!!
+			// TODO unroll !!!!!!
+			for(; i<bs; i++)
+				{
+				A[jj+lda*(i+ii)] = ptr_pA[0];
+				ptr_pA++;
+				}
+			ptr_pA += (sda-1)*bs;
+			}
+		for(; ii<m; ii++)
+			{
+			A[jj+lda*ii] = ptr_pA[0];
+			ptr_pA++;
+			}
+		}
+	return;
+	}
+
+
+
+// convert a vector structure into a vector 
+void s_cvt_strvec2vec(int m, struct s_strvec *sa, int ai, float *a)
+	{
+	float *pa = sa->pa + ai;
+	int ii;
+	for(ii=0; ii<m; ii++)
+		a[ii] = pa[ii];
+	return;
+	}
+
+
+
+// cast a matrix into a matrix structure
+void s_cast_mat2strmat(float *A, struct s_strmat *sA)
+	{
+	sA->pA = A;
+	return;
+	}
+
+
+
+// cast a matrix into the diagonal of a matrix structure
+void s_cast_diag_mat2strmat(float *dA, struct s_strmat *sA)
+	{
+	sA->dA = dA;
+	return;
+	}
+
+
+
+// cast a vector into a vector structure
+void s_cast_vec2vecmat(float *a, struct s_strvec *sa)
+	{
+	sa->pa = a;
+	return;
+	}
+
+
+
+// insert element into strmat
+void sgein1_libstr(float a, struct s_strmat *sA, int ai, int aj)
+	{
+	const int bs = 8;
+	int sda = sA->cn;
+	float *pA = sA->pA + ai/bs*bs*sda + ai%bs + aj*bs;
+	pA[0] = a;
+	return;
+	}
+
+
+
+// extract element from strmat
+float sgeex1_libstr(struct s_strmat *sA, int ai, int aj)
+	{
+	const int bs = 8;
+	int sda = sA->cn;
+	float *pA = sA->pA + ai/bs*bs*sda + ai%bs + aj*bs;
+	return pA[0];
+	}
+
+
+
+// insert element into strvec
+void svecin1_libstr(float a, struct s_strvec *sx, int xi)
+	{
+	const int bs = 8;
+	float *x = sx->pa + xi;
+	x[0] = a;
+	return;
+	}
+
+
+
+// extract element from strvec
+float svecex1_libstr(struct s_strvec *sx, int xi)
+	{
+	const int bs = 8;
+	float *x = sx->pa + xi;
+	return x[0];
+	}
+
+
+
+// set all elements of a strmat to a value
+void sgese_libstr(int m, int n, float alpha, struct s_strmat *sA, int ai, int aj)
+	{
+	const int bs = 8;
+	int sda = sA->cn;
+	float *pA = sA->pA + ai%bs + ai/bs*bs*sda + aj*bs;
+	int m0 = m<(bs-ai%bs)%bs ? m : (bs-ai%bs)%bs;
+	int ii, jj;
+	if(m0>0)
+		{
+		for(ii=0; ii<m0; ii++)
+			{
+			for(jj=0; jj<n; jj++)
+				{
+				pA[jj*bs] = alpha;
+				}
+			pA += 1;
+			}
+		pA += bs*(sda-1);
+		m -= m0;
+		}
+	for(ii=0; ii<m-7; ii+=8)
+		{
+		for(jj=0; jj<n; jj++)
+			{
+			pA[0+jj*bs] = alpha;
+			pA[1+jj*bs] = alpha;
+			pA[2+jj*bs] = alpha;
+			pA[3+jj*bs] = alpha;
+			pA[4+jj*bs] = alpha;
+			pA[5+jj*bs] = alpha;
+			pA[6+jj*bs] = alpha;
+			pA[7+jj*bs] = alpha;
+			}
+		pA += bs*sda;
+		}
+	for( ; ii<m; ii++)
+		{
+		for(jj=0; jj<n; jj++)
+			{
+			pA[jj*bs] = alpha;
+			}
+		pA += 1;
+		}
+	return;
+	}
+
+
+
+// set all elements of a strvec to a value
+void svecse_libstr(int m, float alpha, struct s_strvec *sx, int xi)
+	{
+	float *x = sx->pa + xi;
+	int ii;
+	for(ii=0; ii<m; ii++)
+		x[ii] = alpha;
+	return;
+	}
+
+
+
+// extract diagonal to vector
+void sdiaex_libstr(int kmax, float alpha, struct s_strmat *sA, int ai, int aj, struct s_strvec *sx, int xi)
+	{
+	const int bs = 8;
+	int sda = sA->cn;
+	float *pA = sA->pA + ai/bs*bs*sda + ai%bs + aj*bs;
+	float *x = sx->pa + xi;
+	sdiaex_lib(kmax, alpha, ai%bs, pA, sda, x);
+	return;
+	}
+
+
+
+// insert a vector into diagonal
+void sdiain_libstr(int kmax, float alpha, struct s_strvec *sx, int xi, struct s_strmat *sA, int ai, int aj)
+	{
+	const int bs = 8;
+	int sda = sA->cn;
+	float *pA = sA->pA + ai/bs*bs*sda + ai%bs + aj*bs;
+	float *x = sx->pa + xi;
+	sdiain_lib(kmax, alpha, x, ai%bs, pA, sda);
+	return;
+	}
+
+
+
+// swap two rows of a matrix struct
+void srowsw_libstr(int kmax, struct s_strmat *sA, int ai, int aj, struct s_strmat *sC, int ci, int cj)
+	{
+	const int bs = 8;
+	int sda = sA->cn;
+	float *pA = sA->pA + ai/bs*bs*sda + ai%bs + aj*bs;
+	int sdc = sC->cn;
+	float *pC = sC->pA + ci/bs*bs*sdc + ci%bs + cj*bs;
+	srowsw_lib(kmax, pA, pC);
+	return;
+	}
+
+
+
+// permute the rows of a matrix struct
+void srowpe_libstr(int kmax, int *ipiv, struct s_strmat *sA)
+	{
+	int ii;
+	for(ii=0; ii<kmax; ii++)
+		{
+		if(ipiv[ii]!=ii)
+			srowsw_libstr(sA->n, sA, ii, 0, sA, ipiv[ii], 0);
+		}
+	return;
+	}
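+
+
+
+// A sketch of the inverse operation (hypothetical helper, not part of the
+// library API): each row swap is its own inverse, so applying the swaps in
+// reverse order undoes srowpe_libstr.
+static void srowpe_inv_sketch(int kmax, int *ipiv, struct s_strmat *sA)
+	{
+	int ii;
+	for(ii=kmax-1; ii>=0; ii--)
+		{
+		if(ipiv[ii]!=ii)
+			srowsw_libstr(sA->n, sA, ii, 0, sA, ipiv[ii], 0);
+		}
+	return;
+	}
+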
+
+
+// extract a row into a vector
+void srowex_libstr(int kmax, float alpha, struct s_strmat *sA, int ai, int aj, struct s_strvec *sx, int xi)
+	{
+	const int bs = 8;
+	int sda = sA->cn;
+	float *pA = sA->pA + ai/bs*bs*sda + ai%bs + aj*bs;
+	float *x = sx->pa + xi;
+	srowex_lib(kmax, alpha, pA, x);
+	return;
+	}
+
+
+
+// insert a vector into a row
+void srowin_libstr(int kmax, float alpha, struct s_strvec *sx, int xi, struct s_strmat *sA, int ai, int aj)
+	{
+	const int bs = 8;
+	int sda = sA->cn;
+	float *pA = sA->pA + ai/bs*bs*sda + ai%bs + aj*bs;
+	float *x = sx->pa + xi;
+	srowin_lib(kmax, alpha, x, pA);
+	return;
+	}
+
+
+
+// add a vector to a row
+void srowad_libstr(int kmax, float alpha, struct s_strvec *sx, int xi, struct s_strmat *sA, int ai, int aj)
+	{
+	const int bs = 8;
+	int sda = sA->cn;
+	float *pA = sA->pA + ai/bs*bs*sda + ai%bs + aj*bs;
+	float *x = sx->pa + xi;
+	srowad_lib(kmax, alpha, x, pA);
+	return;
+	}
+
+
+
+// swap two cols of a matrix struct
+void scolsw_libstr(int kmax, struct s_strmat *sA, int ai, int aj, struct s_strmat *sC, int ci, int cj)
+	{
+	const int bs = 8;
+	int sda = sA->cn;
+	float *pA = sA->pA + ai/bs*bs*sda + ai%bs + aj*bs;
+	int sdc = sC->cn;
+	float *pC = sC->pA + ci/bs*bs*sdc + ci%bs + cj*bs;
+	scolsw_lib(kmax, ai%bs, pA, sda, ci%bs, pC, sdc);
+	return;
+	}
+
+
+
+// permute the cols of a matrix struct
+void scolpe_libstr(int kmax, int *ipiv, struct s_strmat *sA)
+	{
+	int ii;
+	for(ii=0; ii<kmax; ii++)
+		{
+		if(ipiv[ii]!=ii)
+			scolsw_libstr(sA->m, sA, 0, ii, sA, 0, ipiv[ii]);
+		}
+	return;
+	}
+
+
+
+// scale a generic strmat
+void sgesc_libstr(int m, int n, float alpha, struct s_strmat *sA, int ai, int aj)
+	{
+
+	// early return
+	if(m==0 | n==0)
+		return;
+	
+#if defined(DIM_CHECK)
+	// non-negative size
+	if(m<0) printf("\n****** sgesc_libstr : m<0 : %d<0 *****\n", m);
+	if(n<0) printf("\n****** sgesc_libstr : n<0 : %d<0 *****\n", n);
+	// non-negative offset
+	if(ai<0) printf("\n****** sgesc_libstr : ai<0 : %d<0 *****\n", ai);
+	if(aj<0) printf("\n****** sgesc_libstr : aj<0 : %d<0 *****\n", aj);
+	// inside matrix
+	// A: m x n
+	if(ai+m > sA->m) printf("\n***** sgesc_libstr : ai+m > row(A) : %d+%d > %d *****\n", ai, m, sA->m);
+	if(aj+n > sA->n) printf("\n***** sgesc_libstr : aj+n > col(A) : %d+%d > %d *****\n", aj, n, sA->n);
+#endif
+
+	const int bs = 8;
+
+	int sda = sA->cn;
+	float *pA = sA->pA + ai/bs*bs*sda + aj*bs;
+	int offsetA = ai%bs;
+
+	int ii, mna;
+
+	if(offsetA>0)
+		{
+		mna = bs-offsetA;
+		mna = m<mna ? m : mna;
+		kernel_sgesc_8_gen_lib8(n, &alpha, &pA[offsetA], mna);
+		m -= mna;
+		pA += 8*sda;
+		}
+	ii = 0;
+	for( ; ii<m-7; ii+=8)
+		{
+		kernel_sgesc_8_lib8(n, &alpha, &pA[0]);
+		pA += 8*sda;
+		}
+	if(ii<m)
+		{
+		kernel_sgesc_8_gen_lib8(n, &alpha, &pA[0], m-ii);
+		}
+
+	return;
+
+	}
+
+
+
+// copy a generic strmat into a generic strmat
+void sgecp_libstr(int m, int n, struct s_strmat *sA, int ai, int aj, struct s_strmat *sB, int bi, int bj)
+	{
+
+	// early return
+	if(m==0 | n==0)
+		return;
+	
+#if defined(DIM_CHECK)
+	// non-negative size
+	if(m<0) printf("\n****** sgecp_libstr : m<0 : %d<0 *****\n", m);
+	if(n<0) printf("\n****** sgecp_libstr : n<0 : %d<0 *****\n", n);
+	// non-negative offset
+	if(ai<0) printf("\n****** sgecp_libstr : ai<0 : %d<0 *****\n", ai);
+	if(aj<0) printf("\n****** sgecp_libstr : aj<0 : %d<0 *****\n", aj);
+	if(bi<0) printf("\n****** sgecp_libstr : bi<0 : %d<0 *****\n", bi);
+	if(bj<0) printf("\n****** sgecp_libstr : bj<0 : %d<0 *****\n", bj);
+	// inside matrix
+	// A: m x n
+	if(ai+m > sA->m) printf("\n***** sgecp_libstr : ai+m > row(A) : %d+%d > %d *****\n", ai, m, sA->m);
+	if(aj+n > sA->n) printf("\n***** sgecp_libstr : aj+n > col(A) : %d+%d > %d *****\n", aj, n, sA->n);
+	// B: m x n
+	if(bi+m > sB->m) printf("\n***** sgecp_libstr : bi+m > row(B) : %d+%d > %d *****\n", bi, m, sB->m);
+	if(bj+n > sB->n) printf("\n***** sgecp_libstr : bj+n > col(B) : %d+%d > %d *****\n", bj, n, sB->n);
+#endif
+
+	const int bs = 8;
+
+	int sda = sA->cn;
+	int sdb = sB->cn;
+	float *pA = sA->pA + ai/bs*bs*sda + aj*bs;
+	float *pB = sB->pA + bi/bs*bs*sdb + bj*bs;
+	int offsetA = ai%bs;
+	int offsetB = bi%bs;
+
+	int ii, mna;
+
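+	// clean-up at the top: copy at most bs-offsetB rows so that pB becomes
+	// panel-aligned, then dispatch below on the relative misalignment of A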
+#if 1
+	if(offsetB>0)
+		{
+		if(offsetB>offsetA)
+			{
+			mna = bs-offsetB;
+			mna = m<mna ? m : mna;
+			kernel_sgecp_8_0_gen_lib8(n, &pA[offsetA], &pB[offsetB], mna);
+			m -= mna;
+			//pA += 8*sda;
+			pB += 8*sdb;
+			}
+		else
+			{
+			if(offsetA==0)
+				{
+				mna = bs-offsetB;
+				mna = m<mna ? m : mna;
+				kernel_sgecp_8_0_gen_lib8(n, &pA[0], &pB[offsetB], mna);
+				m -= mna;
+				pA += 8*sda;
+				pB += 8*sdb;
+				}
+			else if(offsetA==1)
+				{
+				mna = bs-offsetB;
+				mna = m<mna ? m : mna;
+				kernel_sgecp_8_1_gen_lib8(n, &pA[0], sda, &pB[offsetB], mna);
+				m -= mna;
+				pA += 8*sda;
+				pB += 8*sdb;
+				}
+			else if(offsetA==2)
+				{
+				mna = bs-offsetB;
+				mna = m<mna ? m : mna;
+				kernel_sgecp_8_2_gen_lib8(n, &pA[0], sda, &pB[offsetB], mna);
+				m -= mna;
+				pA += 8*sda;
+				pB += 8*sdb;
+				}
+			else if(offsetA==3)
+				{
+				mna = bs-offsetB;
+				mna = m<mna ? m : mna;
+				kernel_sgecp_8_3_gen_lib8(n, &pA[0], sda, &pB[offsetB], mna);
+				m -= mna;
+				pA += 8*sda;
+				pB += 8*sdb;
+				}
+			else if(offsetA==4)
+				{
+				mna = bs-offsetB;
+				mna = m<mna ? m : mna;
+				kernel_sgecp_8_4_gen_lib8(n, &pA[0], sda, &pB[offsetB], mna);
+				m -= mna;
+				pA += 8*sda;
+				pB += 8*sdb;
+				}
+			else if(offsetA==5)
+				{
+				mna = bs-offsetB;
+				mna = m<mna ? m : mna;
+				kernel_sgecp_8_5_gen_lib8(n, &pA[0], sda, &pB[offsetB], mna);
+				m -= mna;
+				pA += 8*sda;
+				pB += 8*sdb;
+				}
+			else if(offsetA==6)
+				{
+				mna = bs-offsetB;
+				mna = m<mna ? m : mna;
+				kernel_sgecp_8_6_gen_lib8(n, &pA[0], sda, &pB[offsetB], mna);
+				m -= mna;
+				pA += 8*sda;
+				pB += 8*sdb;
+				}
+			else if(offsetA==7)
+				{
+				mna = bs-offsetB;
+				mna = m<mna ? m : mna;
+				kernel_sgecp_8_7_gen_lib8(n, &pA[0], sda, &pB[offsetB], mna);
+				m -= mna;
+				pA += 8*sda;
+				pB += 8*sdb;
+				}
+			}
+		}
+#endif
+
+	// same alignment
+	if(offsetA==offsetB)
+		{
+		ii = 0;
+		for( ; ii<m-7; ii+=8)
+			{
+			kernel_sgecp_8_0_lib8(n, pA, pB);
+			pA += 8*sda;
+			pB += 8*sdb;
+			}
+		if(ii<m)
+			{
+			kernel_sgecp_8_0_gen_lib8(n, pA, pB, m-ii);
+			}
+		return;
+		}
+	// XXX different alignment: search tree ???
+	// skip one element of A
+	else if(offsetA==(offsetB+1)%bs)
+		{
+		ii = 0;
+		for( ; ii<m-7; ii+=8)
+			{
+			kernel_sgecp_8_1_lib8(n, pA, sda, pB);
+			pA += 8*sda;
+			pB += 8*sdb;
+			}
+		if(ii<m)
+			{
+			kernel_sgecp_8_1_gen_lib8(n, pA, sda, pB, m-ii);
+			}
+		}
+	// skip two elements of A
+	else if(offsetA==(offsetB+2)%bs)
+		{
+		ii = 0;
+		for( ; ii<m-7; ii+=8)
+			{
+			kernel_sgecp_8_2_lib8(n, pA, sda, pB);
+			pA += 8*sda;
+			pB += 8*sdb;
+			}
+		if(ii<m)
+			{
+			kernel_sgecp_8_2_gen_lib8(n, pA, sda, pB, m-ii);
+			}
+		return;
+		}
+	// skip three elements of A
+	else if(offsetA==(offsetB+3)%bs)
+		{
+		ii = 0;
+		for( ; ii<m-7; ii+=8)
+			{
+			kernel_sgecp_8_3_lib8(n, pA, sda, pB);
+			pA += 8*sda;
+			pB += 8*sdb;
+			}
+		if(ii<m)
+			{
+			kernel_sgecp_8_3_gen_lib8(n, pA, sda, pB, m-ii);
+			}
+		return;
+		}
+	// skip four elements of A
+	else if(offsetA==(offsetB+4)%bs)
+		{
+		ii = 0;
+		for( ; ii<m-7; ii+=8)
+			{
+			kernel_sgecp_8_4_lib8(n, pA, sda, pB);
+			pA += 8*sda;
+			pB += 8*sdb;
+			}
+		if(ii<m)
+			{
+			kernel_sgecp_8_4_gen_lib8(n, pA, sda, pB, m-ii);
+			}
+		return;
+		}
+	// skip five elements of A
+	else if(offsetA==(offsetB+5)%bs)
+		{
+		ii = 0;
+		for( ; ii<m-7; ii+=8)
+			{
+			kernel_sgecp_8_5_lib8(n, pA, sda, pB);
+			pA += 8*sda;
+			pB += 8*sdb;
+			}
+		if(ii<m)
+			{
+			kernel_sgecp_8_5_gen_lib8(n, pA, sda, pB, m-ii);
+			}
+		return;
+		}
+	// skip six elements of A
+	else if(offsetA==(offsetB+6)%bs)
+		{
+		ii = 0;
+		for( ; ii<m-7; ii+=8)
+			{
+			kernel_sgecp_8_6_lib8(n, pA, sda, pB);
+			pA += 8*sda;
+			pB += 8*sdb;
+			}
+		if(ii<m)
+			{
+			kernel_sgecp_8_6_gen_lib8(n, pA, sda, pB, m-ii);
+			}
+		return;
+		}
+	// skip seven elements of A
+	else //if(offsetA==(offsetB+7)%bs)
+		{
+		ii = 0;
+		for( ; ii<m-7; ii+=8)
+			{
+			kernel_sgecp_8_7_lib8(n, pA, sda, pB);
+			pA += 8*sda;
+			pB += 8*sdb;
+			}
+		if(ii<m)
+			{
+			kernel_sgecp_8_7_gen_lib8(n, pA, sda, pB, m-ii);
+			}
+		return;
+		}
+	
+	return;
+
+	}
+
+
+
+// scale a strvec
+void svecsc_libstr(int m, float alpha, struct s_strvec *sa, int ai)
+	{
+	float *pa = sa->pa + ai;
+	int ii;
+	ii = 0;
+	for(; ii<m-3; ii+=4)
+		{
+		pa[ii+0] *= alpha;
+		pa[ii+1] *= alpha;
+		pa[ii+2] *= alpha;
+		pa[ii+3] *= alpha;
+		}
+	for(; ii<m; ii++)
+		{
+		pa[ii+0] *= alpha;
+		}
+	return;
+	}
+
+
+
+// copy a strvec into a strvec
+void sveccp_libstr(int m, struct s_strvec *sa, int ai, struct s_strvec *sc, int ci)
+	{
+	float *pa = sa->pa + ai;
+	float *pc = sc->pa + ci;
+	int ii;
+	ii = 0;
+	for(; ii<m-3; ii+=4)
+		{
+		pc[ii+0] = pa[ii+0];
+		pc[ii+1] = pa[ii+1];
+		pc[ii+2] = pa[ii+2];
+		pc[ii+3] = pa[ii+3];
+		}
+	for(; ii<m; ii++)
+		{
+		pc[ii+0] = pa[ii+0];
+		}
+	return;
+	}
+
+
+
+// copy a lower triangular strmat into a lower triangular strmat
+void strcp_l_libstr(int m, struct s_strmat *sA, int ai, int aj, struct s_strmat *sC, int ci, int cj)
+	{
+	const int bs = 8;
+	int sda = sA->cn;
+	float *pA = sA->pA + ai/bs*bs*sda + ai%bs + aj*bs;
+	int sdc = sC->cn;
+	float *pC = sC->pA + ci/bs*bs*sdc + ci%bs + cj*bs;
+	strcp_l_lib(m, ai%bs, pA, sda, ci%bs, pC, sdc);
+	// XXX uses full matrix copy !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
+//	sgecp_libstr(m, m, sA, ai, aj, sC, ci, cj);
+	return;
+	}
+
+
+
+// scale and add a generic strmat into a generic strmat
+void sgead_libstr(int m, int n, float alpha, struct s_strmat *sA, int ai, int aj, struct s_strmat *sB, int bi, int bj)
+	{
+
+	// early return
+	if(m==0 | n==0)
+		return;
+	
+#if defined(DIM_CHECK)
+	// non-negative size
+	if(m<0) printf("\n****** sgead_libstr : m<0 : %d<0 *****\n", m);
+	if(n<0) printf("\n****** sgead_libstr : n<0 : %d<0 *****\n", n);
+	// non-negative offset
+	if(ai<0) printf("\n****** sgead_libstr : ai<0 : %d<0 *****\n", ai);
+	if(aj<0) printf("\n****** sgead_libstr : aj<0 : %d<0 *****\n", aj);
+	if(bi<0) printf("\n****** sgead_libstr : bi<0 : %d<0 *****\n", bi);
+	if(bj<0) printf("\n****** sgead_libstr : bj<0 : %d<0 *****\n", bj);
+	// inside matrix
+	// A: m x n
+	if(ai+m > sA->m) printf("\n***** sgead_libstr : ai+m > row(A) : %d+%d > %d *****\n", ai, m, sA->m);
+	if(aj+n > sA->n) printf("\n***** sgead_libstr : aj+n > col(A) : %d+%d > %d *****\n", aj, n, sA->n);
+	// B: m x n
+	if(bi+m > sB->m) printf("\n***** sgead_libstr : bi+m > row(B) : %d+%d > %d *****\n", bi, m, sB->m);
+	if(bj+n > sB->n) printf("\n***** sgead_libstr : bj+n > col(B) : %d+%d > %d *****\n", bj, n, sB->n);
+#endif
+
+	const int bs = 8;
+
+	int sda = sA->cn;
+	int sdb = sB->cn;
+	float *pA = sA->pA + ai/bs*bs*sda + aj*bs;
+	float *pB = sB->pA + bi/bs*bs*sdb + bj*bs;
+	int offsetA = ai%bs;
+	int offsetB = bi%bs;
+
+	int ii, mna;
+
+#if 1
+	if(offsetB>0)
+		{
+		if(offsetB>offsetA)
+			{
+			mna = bs-offsetB;
+			mna = m<mna ? m : mna;
+			kernel_sgead_8_0_gen_lib8(n, &alpha, &pA[offsetA], &pB[offsetB], mna);
+			m -= mna;
+			//pA += 8*sda;
+			pB += 8*sdb;
+			}
+		else
+			{
+			if(offsetA==0)
+				{
+				mna = bs-offsetB;
+				mna = m<mna ? m : mna;
+				kernel_sgead_8_0_gen_lib8(n, &alpha, &pA[0], &pB[offsetB], mna);
+				m -= mna;
+				pA += 8*sda;
+				pB += 8*sdb;
+				}
+			else if(offsetA==1)
+				{
+				mna = bs-offsetB;
+				mna = m<mna ? m : mna;
+				kernel_sgead_8_1_gen_lib8(n, &alpha, &pA[0], sda, &pB[offsetB], mna);
+				m -= mna;
+				pA += 8*sda;
+				pB += 8*sdb;
+				}
+			else if(offsetA==2)
+				{
+				mna = bs-offsetB;
+				mna = m<mna ? m : mna;
+				kernel_sgead_8_2_gen_lib8(n, &alpha, &pA[0], sda, &pB[offsetB], mna);
+				m -= mna;
+				pA += 8*sda;
+				pB += 8*sdb;
+				}
+			else if(offsetA==3)
+				{
+				mna = bs-offsetB;
+				mna = m<mna ? m : mna;
+				kernel_sgead_8_3_gen_lib8(n, &alpha, &pA[0], sda, &pB[offsetB], mna);
+				m -= mna;
+				pA += 8*sda;
+				pB += 8*sdb;
+				}
+			else if(offsetA==4)
+				{
+				mna = bs-offsetB;
+				mna = m<mna ? m : mna;
+				kernel_sgead_8_4_gen_lib8(n, &alpha, &pA[0], sda, &pB[offsetB], mna);
+				m -= mna;
+				pA += 8*sda;
+				pB += 8*sdb;
+				}
+			else if(offsetA==5)
+				{
+				mna = bs-offsetB;
+				mna = m<mna ? m : mna;
+				kernel_sgead_8_5_gen_lib8(n, &alpha, &pA[0], sda, &pB[offsetB], mna);
+				m -= mna;
+				pA += 8*sda;
+				pB += 8*sdb;
+				}
+			else if(offsetA==6)
+				{
+				mna = bs-offsetB;
+				mna = m<mna ? m : mna;
+				kernel_sgead_8_6_gen_lib8(n, &alpha, &pA[0], sda, &pB[offsetB], mna);
+				m -= mna;
+				pA += 8*sda;
+				pB += 8*sdb;
+				}
+			else if(offsetA==7)
+				{
+				mna = bs-offsetB;
+				mna = m<mna ? m : mna;
+				kernel_sgead_8_7_gen_lib8(n, &alpha, &pA[0], sda, &pB[offsetB], mna);
+				m -= mna;
+				pA += 8*sda;
+				pB += 8*sdb;
+				}
+			}
+		}
+#endif
+
+	// same alignment
+	if(offsetA==offsetB)
+		{
+		ii = 0;
+		for( ; ii<m-7; ii+=8)
+			{
+			kernel_sgead_8_0_lib8(n, &alpha, pA, pB);
+			pA += 8*sda;
+			pB += 8*sdb;
+			}
+		if(ii<m)
+			{
+			kernel_sgead_8_0_gen_lib8(n, &alpha, pA, pB, m-ii);
+			}
+		return;
+		}
+	// XXX different alignment: search tree ???
+	// skip one element of A
+	else if(offsetA==(offsetB+1)%bs)
+		{
+		ii = 0;
+		for( ; ii<m-7; ii+=8)
+			{
+			kernel_sgead_8_1_lib8(n, &alpha, pA, sda, pB);
+			pA += 8*sda;
+			pB += 8*sdb;
+			}
+		if(ii<m)
+			{
+			kernel_sgead_8_1_gen_lib8(n, &alpha, pA, sda, pB, m-ii);
+			}
+		}
+	// skip two elements of A
+	else if(offsetA==(offsetB+2)%bs)
+		{
+		ii = 0;
+		for( ; ii<m-7; ii+=8)
+			{
+			kernel_sgead_8_2_lib8(n, &alpha, pA, sda, pB);
+			pA += 8*sda;
+			pB += 8*sdb;
+			}
+		if(ii<m)
+			{
+			kernel_sgead_8_2_gen_lib8(n, &alpha, pA, sda, pB, m-ii);
+			}
+		return;
+		}
+	// skip three elements of A
+	else if(offsetA==(offsetB+3)%bs)
+		{
+		ii = 0;
+		for( ; ii<m-7; ii+=8)
+			{
+			kernel_sgead_8_3_lib8(n, &alpha, pA, sda, pB);
+			pA += 8*sda;
+			pB += 8*sdb;
+			}
+		if(ii<m)
+			{
+			kernel_sgead_8_3_gen_lib8(n, &alpha, pA, sda, pB, m-ii);
+			}
+		return;
+		}
+	// skip four elements of A
+	else if(offsetA==(offsetB+4)%bs)
+		{
+		ii = 0;
+		for( ; ii<m-7; ii+=8)
+			{
+			kernel_sgead_8_4_lib8(n, &alpha, pA, sda, pB);
+			pA += 8*sda;
+			pB += 8*sdb;
+			}
+		if(ii<m)
+			{
+			kernel_sgead_8_4_gen_lib8(n, &alpha, pA, sda, pB, m-ii);
+			}
+		return;
+		}
+	// skip five elements of A
+	else if(offsetA==(offsetB+5)%bs)
+		{
+		ii = 0;
+		for( ; ii<m-7; ii+=8)
+			{
+			kernel_sgead_8_5_lib8(n, &alpha, pA, sda, pB);
+			pA += 8*sda;
+			pB += 8*sdb;
+			}
+		if(ii<m)
+			{
+			kernel_sgead_8_5_gen_lib8(n, &alpha, pA, sda, pB, m-ii);
+			}
+		return;
+		}
+	// skip six elements of A
+	else if(offsetA==(offsetB+6)%bs)
+		{
+		ii = 0;
+		for( ; ii<m-7; ii+=8)
+			{
+			kernel_sgead_8_6_lib8(n, &alpha, pA, sda, pB);
+			pA += 8*sda;
+			pB += 8*sdb;
+			}
+		if(ii<m)
+			{
+			kernel_sgead_8_6_gen_lib8(n, &alpha, pA, sda, pB, m-ii);
+			}
+		return;
+		}
+	// skip seven elements of A
+	else //if(offsetA==(offsetB+7)%bs)
+		{
+		ii = 0;
+		for( ; ii<m-7; ii+=8)
+			{
+			kernel_sgead_8_7_lib8(n, &alpha, pA, sda, pB);
+			pA += 8*sda;
+			pB += 8*sdb;
+			}
+		if(ii<m)
+			{
+			kernel_sgead_8_7_gen_lib8(n, &alpha, pA, sda, pB, m-ii);
+			}
+		return;
+		}
+	
+	return;
+
+	}
+
+
+
+// copy and transpose a generic strmat into a generic strmat
+void sgetr_libstr(int m, int n, struct s_strmat *sA, int ai, int aj, struct s_strmat *sB, int bi, int bj)
+	{
+
+	// early return
+	if(m==0 | n==0)
+		return;
+	
+#if defined(DIM_CHECK)
+	// non-negative size
+	if(m<0) printf("\n****** sgetr_libstr : m<0 : %d<0 *****\n", m);
+	if(n<0) printf("\n****** sgetr_libstr : n<0 : %d<0 *****\n", n);
+	// non-negative offset
+	if(ai<0) printf("\n****** sgetr_libstr : ai<0 : %d<0 *****\n", ai);
+	if(aj<0) printf("\n****** sgetr_libstr : aj<0 : %d<0 *****\n", aj);
+	if(bi<0) printf("\n****** sgetr_libstr : bi<0 : %d<0 *****\n", bi);
+	if(bj<0) printf("\n****** sgetr_libstr : bj<0 : %d<0 *****\n", bj);
+	// inside matrix
+	// A: m x n
+	if(ai+m > sA->m) printf("\n***** sgetr_libstr : ai+m > row(A) : %d+%d > %d *****\n", ai, m, sA->m);
+	if(aj+n > sA->n) printf("\n***** sgetr_libstr : aj+n > col(A) : %d+%d > %d *****\n", aj, n, sA->n);
+	// B: n x m
+	if(bi+n > sB->m) printf("\n***** sgetr_libstr : bi+n > row(B) : %d+%d > %d *****\n", bi, n, sB->m);
+	if(bj+m > sB->n) printf("\n***** sgetr_libstr : bj+m > col(B) : %d+%d > %d *****\n", bj, m, sB->n);
+#endif
+
+	const int bs = 8;
+
+	int sda = sA->cn;
+	int sdb = sB->cn;
+	float *pA = sA->pA + ai/bs*bs*sda + aj*bs;
+	float *pB = sB->pA + bi/bs*bs*sdb + bj*bs;
+	int offsetA = ai%bs;
+	int offsetB = bi%bs;
+
+	int ii, nna;
+
+	if(offsetA==0)
+		{
+		if(offsetB>0)
+			{
+			nna = bs-offsetB;
+			nna = n<nna ? n : nna;
+			kernel_sgetr_8_0_gen_lib8(m, &pA[0], sda, &pB[offsetB], nna);
+			n -= nna;
+			pA += nna*bs;
+			pB += 8*sdb;
+			}
+		for(ii=0; ii<n-7; ii+=8)
+			{
+			kernel_sgetr_8_0_lib8(m, &pA[ii*bs], sda, &pB[ii*sdb]);
+			}
+		if(ii<n)
+			{
+			kernel_sgetr_8_0_gen_lib8(m, &pA[ii*bs], sda, &pB[ii*sdb], n-ii);
+			}
+		}
+	// TODO logarithmic search for offsetA>0 ???
+	else if(offsetA==1)
+		{
+		if(offsetB>0)
+			{
+			nna = bs-offsetB;
+			nna = n<nna ? n : nna;
+			kernel_sgetr_8_1_gen_lib8(m, &pA[0], sda, &pB[offsetB], nna);
+			n -= nna;
+			pA += nna*bs;
+			pB += 8*sdb;
+			}
+		for(ii=0; ii<n-7; ii+=8)
+			{
+			kernel_sgetr_8_1_lib8(m, &pA[ii*bs], sda, &pB[ii*sdb]);
+			}
+		if(ii<n)
+			{
+			kernel_sgetr_8_1_gen_lib8(m, &pA[ii*bs], sda, &pB[ii*sdb], n-ii);
+			}
+		}
+	else if(offsetA==2)
+		{
+		ii = 0;
+		if(offsetB>0)
+			{
+			nna = bs-offsetB;
+			nna = n<nna ? n : nna;
+			kernel_sgetr_8_2_gen_lib8(m, &pA[0], sda, &pB[offsetB], nna);
+			n -= nna;
+			pA += nna*bs;
+			pB += 8*sdb;
+			}
+		for( ; ii<n-7; ii+=8)
+			{
+			kernel_sgetr_8_2_lib8(m, &pA[ii*bs], sda, &pB[ii*sdb]);
+			}
+		if(ii<n)
+			{
+			kernel_sgetr_8_2_gen_lib8(m, &pA[ii*bs], sda, &pB[ii*sdb], n-ii);
+			}
+		}
+	else if(offsetA==3)
+		{
+		ii = 0;
+		if(offsetB>0)
+			{
+			nna = bs-offsetB;
+			nna = n<nna ? n : nna;
+			kernel_sgetr_8_3_gen_lib8(m, &pA[0], sda, &pB[offsetB], nna);
+			n -= nna;
+			pA += nna*bs;
+			pB += 8*sdb;
+			}
+		for( ; ii<n-7; ii+=8)
+			{
+			kernel_sgetr_8_3_lib8(m, &pA[ii*bs], sda, &pB[ii*sdb]);
+			}
+		if(ii<n)
+			{
+			kernel_sgetr_8_3_gen_lib8(m, &pA[ii*bs], sda, &pB[ii*sdb], n-ii);
+			}
+		}
+	else if(offsetA==4)
+		{
+		ii = 0;
+		if(offsetB>0)
+			{
+			nna = bs-offsetB;
+			nna = n<nna ? n : nna;
+			kernel_sgetr_8_4_gen_lib8(m, &pA[0], sda, &pB[offsetB], nna);
+			n -= nna;
+			pA += nna*bs;
+			pB += 8*sdb;
+			}
+		for( ; ii<n-7; ii+=8)
+			{
+			kernel_sgetr_8_4_lib8(m, &pA[ii*bs], sda, &pB[ii*sdb]);
+			}
+		if(ii<n)
+			{
+			kernel_sgetr_8_4_gen_lib8(m, &pA[ii*bs], sda, &pB[ii*sdb], n-ii);
+			}
+		}
+	else if(offsetA==5)
+		{
+		ii = 0;
+		if(offsetB>0)
+			{
+			nna = bs-offsetB;
+			nna = n<nna ? n : nna;
+			kernel_sgetr_8_5_gen_lib8(m, &pA[0], sda, &pB[offsetB], nna);
+			n -= nna;
+			pA += nna*bs;
+			pB += 8*sdb;
+			}
+		for( ; ii<n-7; ii+=8)
+			{
+			kernel_sgetr_8_5_lib8(m, &pA[ii*bs], sda, &pB[ii*sdb]);
+			}
+		if(ii<n)
+			{
+			kernel_sgetr_8_5_gen_lib8(m, &pA[ii*bs], sda, &pB[ii*sdb], n-ii);
+			}
+		}
+	else if(offsetA==6)
+		{
+		ii = 0;
+		if(offsetB>0)
+			{
+			nna = bs-offsetB;
+			nna = n<nna ? n : nna;
+			kernel_sgetr_8_6_gen_lib8(m, &pA[0], sda, &pB[offsetB], nna);
+			n -= nna;
+			pA += nna*bs;
+			pB += 8*sdb;
+			}
+		for( ; ii<n-7; ii+=8)
+			{
+			kernel_sgetr_8_6_lib8(m, &pA[ii*bs], sda, &pB[ii*sdb]);
+			}
+		if(ii<n)
+			{
+			kernel_sgetr_8_6_gen_lib8(m, &pA[ii*bs], sda, &pB[ii*sdb], n-ii);
+			}
+		}
+	else if(offsetA==7)
+		{
+		ii = 0;
+		if(offsetB>0)
+			{
+			nna = bs-offsetB;
+			nna = n<nna ? n : nna;
+			kernel_sgetr_8_7_gen_lib8(m, &pA[0], sda, &pB[offsetB], nna);
+			n -= nna;
+			pA += nna*bs;
+			pB += 8*sdb;
+			}
+		for( ; ii<n-7; ii+=8)
+			{
+			kernel_sgetr_8_7_lib8(m, &pA[ii*bs], sda, &pB[ii*sdb]);
+			}
+		if(ii<n)
+			{
+			kernel_sgetr_8_7_gen_lib8(m, &pA[ii*bs], sda, &pB[ii*sdb], n-ii);
+			}
+		}
+
+	return;
+
+	}
+
+
+
+// copy and transpose a lower triangular strmat into an upper triangular strmat
+void strtr_l_libstr(int m, struct s_strmat *sA, int ai, int aj, struct s_strmat *sC, int ci, int cj)
+	{
+	const int bs = 8;
+	int sda = sA->cn;
+	float *pA = sA->pA + ai/bs*bs*sda + ai%bs + aj*bs;
+	int sdc = sC->cn;
+	float *pC = sC->pA + ci/bs*bs*sdc + ci%bs + cj*bs;
+	strtr_l_lib(m, 1.0, ai%bs, pA, sda, ci%bs, pC, sdc); // TODO remove alpha !!!
+	return;
+	}
+
+
+
+// copy and transpose an upper triangular strmat into a lower triangular strmat
+void strtr_u_libstr(int m, struct s_strmat *sA, int ai, int aj, struct s_strmat *sC, int ci, int cj)
+	{
+	const int bs = 8;
+	int sda = sA->cn;
+	float *pA = sA->pA + ai/bs*bs*sda + ai%bs + aj*bs;
+	int sdc = sC->cn;
+	float *pC = sC->pA + ci/bs*bs*sdc + ci%bs + cj*bs;
+	strtr_u_lib(m, 1.0, ai%bs, pA, sda, ci%bs, pC, sdc); // TODO remove alpha !!!
+	return;
+	}
+
+
+
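+// A brief usage sketch for the triangular transpose routines below, assuming
+// the s_allocate_strmat/s_free_strmat helpers provided elsewhere in this
+// library: copy the lower triangle of an n-by-n sA into the upper triangle
+// of sC.
+#if 0
+void example_strtr_l(int n)
+	{
+	struct s_strmat sA, sC;
+	s_allocate_strmat(n, n, &sA);
+	s_allocate_strmat(n, n, &sC);
+	// ... fill sA ...
+	strtr_l_libstr(n, &sA, 0, 0, &sC, 0, 0);
+	s_free_strmat(&sA);
+	s_free_strmat(&sC);
+	}
+#endif
+
+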
+// insert a strvec into the diagonal of a strmat, sparse formulation
+void sdiain_sp_libstr(int kmax, float alpha, struct s_strvec *sx, int xi, int *idx, struct s_strmat *sD, int di, int dj)
+	{
+	const int bs = 8;
+	float *x = sx->pa + xi;
+	int sdd = sD->cn;
+	float *pD = sD->pA;
+	int ii, jj;
+	for(jj=0; jj<kmax; jj++)
+		{
+		ii = idx[jj];
+		pD[(ii+di)/bs*bs*sdd+(ii+di)%bs+(ii+dj)*bs] = alpha * x[jj];
+		}
+	return;
+	}
+
+
+
+// extract the diagonal of a strmat to a strvec, sparse formulation 
+void sdiaex_sp_libstr(int kmax, float alpha, int *idx, struct s_strmat *sD, int di, int dj, struct s_strvec *sx, int xi)
+	{
+	const int bs = 8;
+	float *x = sx->pa + xi;
+	int sdd = sD->cn;
+	float *pD = sD->pA;
+	int ii, jj;
+	for(jj=0; jj<kmax; jj++)
+		{
+		ii = idx[jj];
+		x[jj] = alpha * pD[(ii+di)/bs*bs*sdd+(ii+di)%bs+(ii+dj)*bs];
+		}
+	return;
+	}
+
+
+
+// add a scaled strvec to the diagonal of a strmat, sparse formulation
+void sdiaad_sp_libstr(int kmax, float alpha, struct s_strvec *sx, int xi, int *idx, struct s_strmat *sD, int di, int dj)
+	{
+	const int bs = 8;
+	float *x = sx->pa + xi;
+	int sdd = sD->cn;
+	float *pD = sD->pA;
+	int ii, jj;
+	for(jj=0; jj<kmax; jj++)
+		{
+		ii = idx[jj];
+		pD[(ii+di)/bs*bs*sdd+(ii+di)%bs+(ii+dj)*bs] += alpha * x[jj];
+		}
+	return;
+	}
+
+
+
+// add a scaled strvec to another strvec and insert the result into the diagonal of a strmat, sparse formulation
+void sdiaadin_sp_libstr(int kmax, float alpha, struct s_strvec *sx, int xi, struct s_strvec *sy, int yi, int *idx, struct s_strmat *sD, int di, int dj)
+	{
+	const int bs = 8;
+	float *x = sx->pa + xi;
+	float *y = sy->pa + yi;
+	int sdd = sD->cn;
+	float *pD = sD->pA;
+	int ii, jj;
+	for(jj=0; jj<kmax; jj++)
+		{
+		ii = idx[jj];
+		pD[(ii+di)/bs*bs*sdd+(ii+di)%bs+(ii+dj)*bs] = y[jj] + alpha * x[jj];
+		}
+	return;
+	}
+
+
+
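+// Reference semantics of the four sparse diagonal routines above, sketched on
+// a hypothetical column-major matrix D with leading dimension ldd (for
+// exposition only; the code above addresses the panel-major layout instead):
+// each touches D[idx[jj]+di, idx[jj]+dj], differing only in the update.
+#if 0
+void sdiaad_sp_dense_ref(int kmax, float alpha, float *x, int *idx, float *D, int ldd, int di, int dj)
+	{
+	int jj;
+	for(jj=0; jj<kmax; jj++)
+		D[(idx[jj]+di) + (idx[jj]+dj)*ldd] += alpha * x[jj]; // sdiain_sp uses =, sdiaadin_sp uses y[jj] + alpha*x[jj]
+	}
+#endif
+
+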
+// add a scaled strvec to a row of a strmat, sparse formulation
+void srowad_sp_libstr(int kmax, float alpha, struct s_strvec *sx, int xi, int *idx, struct s_strmat *sD, int di, int dj)
+	{
+	const int bs = 8;
+	float *x = sx->pa + xi;
+	int sdd = sD->cn;
+	float *pD = sD->pA + di/bs*bs*sdd + di%bs + dj*bs;
+	srowad_libsp(kmax, idx, alpha, x, pD);
+	return;
+	}
+
+
+
+// add a scaled strvec to a strvec, sparse formulation
+void svecad_sp_libstr(int kmax, float alpha, struct s_strvec *sx, int xi, int *idx, struct s_strvec *sy, int yi)
+	{
+	float *x = sx->pa + xi;
+	float *y = sy->pa + yi;
+	svecad_libsp(kmax, idx, alpha, x, y);
+	return;
+	}
+
+
+
+// insert a scaled strvec into a strvec, sparse formulation
+void svecin_sp_libstr(int m, float alpha, struct s_strvec *sx, int xi, int *idx, struct s_strvec *sz, int zi)
+	{
+	float *x = sx->pa + xi;
+	float *z = sz->pa + zi;
+	int ii;
+	for(ii=0; ii<m; ii++)
+		z[idx[ii]] = alpha * x[ii];
+	return;
+	}
+
+
+
+// extract a scaled strvec from a strvec, sparse formulation
+void svecex_sp_libstr(int m, float alpha, int *idx, struct s_strvec *sx, int xi, struct s_strvec *sz, int zi)
+	{
+	float *x = sx->pa + xi;
+	float *z = sz->pa + zi;
+	int ii;
+	for(ii=0; ii<m; ii++)
+		z[ii] = alpha * x[idx[ii]];
+	return;
+	}
+
+
+
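+// The two routines above form a scatter/gather pair: svecin_sp scatters
+// alpha*x into z at the positions in idx, while svecex_sp gathers from those
+// same positions. A hypothetical round-trip sketch (names illustrative only):
+#if 0
+void example_scatter_gather(int m, struct s_strvec *sx, struct s_strvec *sz, int *idx)
+	{
+	svecin_sp_libstr(m, 1.0, sx, 0, idx, sz, 0); // z[idx[ii]] = x[ii]
+	svecex_sp_libstr(m, 1.0, idx, sz, 0, sx, 0); // x[ii] = z[idx[ii]]
+	}
+#endif
+
+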
+#else
+
+#error : wrong LA choice
+
+#endif
+
+
+
+
diff --git a/auxiliary/v_aux_ext_dep_lib.c b/auxiliary/v_aux_ext_dep_lib.c
new file mode 100644
index 0000000..3bf5f90
--- /dev/null
+++ b/auxiliary/v_aux_ext_dep_lib.c
@@ -0,0 +1,138 @@
+/**************************************************************************************************
+*                                                                                                 *
+* This file is part of BLASFEO.                                                                   *
+*                                                                                                 *
+* BLASFEO -- BLAS For Embedded Optimization.                                                      *
+* Copyright (C) 2016-2017 by Gianluca Frison.                                                     *
+* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl.              *
+* All rights reserved.                                                                            *
+*                                                                                                 *
+* HPMPC is free software; you can redistribute it and/or                                          *
+* modify it under the terms of the GNU Lesser General Public                                      *
+* License as published by the Free Software Foundation; either                                    *
+* version 2.1 of the License, or (at your option) any later version.                              *
+*                                                                                                 *
+* HPMPC is distributed in the hope that it will be useful,                                        *
+* but WITHOUT ANY WARRANTY; without even the implied warranty of                                  *
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.                                            *
+* See the GNU Lesser General Public License for more details.                                     *
+*                                                                                                 *
+* You should have received a copy of the GNU Lesser General Public                                *
+* License along with HPMPC; if not, write to the Free Software                                    *
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA                  *
+*                                                                                                 *
+* Author: Gianluca Frison, giaf (at) dtu.dk                                                       *
+*                          gianluca.frison (at) imtek.uni-freiburg.de                             *
+*                                                                                                 *
+**************************************************************************************************/
+
+#include <stdlib.h>
+#include <stdio.h>
+#if 0
+#include <malloc.h>
+#endif
+
+
+
+/* creates a zero matrix given the size in bytes */
+void v_zeros(void **ptrA, int size)
+	{
+	*ptrA = (void *) malloc(size);
+	char *A = *ptrA;
+	int i;
+	for(i=0; i<size; i++) A[i] = 0;
+	}
+
+
+
+/* creates a zero matrix aligned to a cache line given the size in bytes */
+void v_zeros_align(void **ptrA, int size)
+	{
+#if defined(OS_WINDOWS)
+	*ptrA = _aligned_malloc( size, 64 );
+#else
+	int err = posix_memalign(ptrA, 64, size);
+	if(err!=0)
+		{
+		printf("Memory allocation error");
+		exit(1);
+		}
+#endif
+	char *A = *ptrA;
+	int i;
+	for(i=0; i<size; i++) A[i] = 0;
+	}
+
+
+
+/* frees matrix */
+void v_free(void *pA)
+	{
+	free( pA );
+	}
+
+
+
+/* frees aligned matrix */
+void v_free_align(void *pA)
+	{
+#if defined(OS_WINDOWS)
+	_aligned_free( pA );
+#else
+	free( pA );
+#endif
+	}
+
+
+
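+// A brief usage sketch for the aligned allocators above: obtain a
+// 64-byte-aligned, zero-initialized buffer and release it with the matching
+// free routine (pairing v_zeros_align with plain v_free would be wrong on
+// Windows, where _aligned_malloc requires _aligned_free).
+#if 0
+void example_aligned_alloc()
+	{
+	void *buf;
+	v_zeros_align(&buf, 1024); // 1024 bytes, zeroed, 64-byte aligned
+	// ... use buf ...
+	v_free_align(buf);
+	}
+#endif
+
+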
+/* creates a zero matrix given the size in bytes */
+void c_zeros(char **ptrA, int size)
+	{
+	*ptrA = malloc(size);
+	char *A = *ptrA;
+	int i;
+	for(i=0; i<size; i++) A[i] = 0;
+	}
+
+
+
+/* creates a zero matrix aligned to a cache line given the size in bytes */
+void c_zeros_align(char **ptrA, int size)
+	{
+#if defined(OS_WINDOWS)
+	*ptrA = _aligned_malloc( size, 64 );
+#else
+	void *temp;
+	int err = posix_memalign(&temp, 64, size);
+	if(err!=0)
+		{
+		printf("Memory allocation error");
+		exit(1);
+		}
+	*ptrA = temp;
+#endif
+	char *A = *ptrA;
+	int i;
+	for(i=0; i<size; i++) A[i] = 0;
+	}
+
+
+
+/* frees matrix */
+void c_free(char *pA)
+	{
+	free( pA );
+	}
+
+
+
+/* frees aligned matrix */
+void c_free_align(char *pA)
+	{
+#if defined(OS_WINDOWS)
+	_aligned_free( pA );
+#else
+	free( pA );
+#endif
+	}
+