Squashed 'third_party/blasfeo/' content from commit 2a828ca

Change-Id: If1c3caa4799b2d4eb287ef83fa17043587ef07a3
git-subtree-dir: third_party/blasfeo
git-subtree-split: 2a828ca5442108c4c58e4b42b061a0469043f6ea
diff --git a/blas/x_blas3_lib.c b/blas/x_blas3_lib.c
new file mode 100644
index 0000000..29a33c7
--- /dev/null
+++ b/blas/x_blas3_lib.c
@@ -0,0 +1,1531 @@
+/**************************************************************************************************
+*                                                                                                 *
+* This file is part of BLASFEO.                                                                   *
+*                                                                                                 *
+* BLASFEO -- BLAS For Embedded Optimization.                                                      *
+* Copyright (C) 2016-2017 by Gianluca Frison.                                                     *
+* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl.              *
+* All rights reserved.                                                                            *
+*                                                                                                 *
+* HPMPC is free software; you can redistribute it and/or                                          *
+* modify it under the terms of the GNU Lesser General Public                                      *
+* License as published by the Free Software Foundation; either                                    *
+* version 2.1 of the License, or (at your option) any later version.                              *
+*                                                                                                 *
+* HPMPC is distributed in the hope that it will be useful,                                        *
+* but WITHOUT ANY WARRANTY; without even the implied warranty of                                  *
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.                                            *
+* See the GNU Lesser General Public License for more details.                                     *
+*                                                                                                 *
+* You should have received a copy of the GNU Lesser General Public                                *
+* License along with HPMPC; if not, write to the Free Software                                    *
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA                  *
+*                                                                                                 *
+* Author: Gianluca Frison, giaf (at) dtu.dk                                                       *
+*                          gianluca.frison (at) imtek.uni-freiburg.de                             *
+*                                                                                                 *
+**************************************************************************************************/
+
+
+
+#if defined(LA_REFERENCE)
+
+
+
+// dgemm nt
+void GEMM_NT_LIBSTR(int m, int n, int k, REAL alpha, struct STRMAT *sA, int ai, int aj, struct STRMAT *sB, int bi, int bj, REAL beta, struct STRMAT *sC, int ci, int cj, struct STRMAT *sD, int di, int dj)
+	{
+	if(m<=0 | n<=0)
+		return;
+	int ii, jj, kk;
+	REAL 
+		c_00, c_01,
+		c_10, c_11;
+	int lda = sA->m;
+	int ldb = sB->m;
+	int ldc = sC->m;
+	int ldd = sD->m;
+	REAL *pA = sA->pA + ai + aj*lda;
+	REAL *pB = sB->pA + bi + bj*ldb;
+	REAL *pC = sC->pA + ci + cj*ldc;
+	REAL *pD = sD->pA + di + dj*ldd;
+	jj = 0;
+	for(; jj<n-1; jj+=2)
+		{
+		ii = 0;
+		for(; ii<m-1; ii+=2)
+			{
+			c_00 = 0.0;
+			c_10 = 0.0;
+			c_01 = 0.0;
+			c_11 = 0.0;
+			for(kk=0; kk<k; kk++)
+				{
+				c_00 += pA[(ii+0)+lda*kk] * pB[(jj+0)+ldb*kk];
+				c_10 += pA[(ii+1)+lda*kk] * pB[(jj+0)+ldb*kk];
+				c_01 += pA[(ii+0)+lda*kk] * pB[(jj+1)+ldb*kk];
+				c_11 += pA[(ii+1)+lda*kk] * pB[(jj+1)+ldb*kk];
+				}
+			pD[(ii+0)+ldd*(jj+0)] = alpha * c_00 + beta * pC[(ii+0)+ldc*(jj+0)];
+			pD[(ii+1)+ldd*(jj+0)] = alpha * c_10 + beta * pC[(ii+1)+ldc*(jj+0)];
+			pD[(ii+0)+ldd*(jj+1)] = alpha * c_01 + beta * pC[(ii+0)+ldc*(jj+1)];
+			pD[(ii+1)+ldd*(jj+1)] = alpha * c_11 + beta * pC[(ii+1)+ldc*(jj+1)];
+			}
+		for(; ii<m; ii++)
+			{
+			c_00 = 0.0;
+			c_01 = 0.0;
+			for(kk=0; kk<k; kk++)
+				{
+				c_00 += pA[(ii+0)+lda*kk] * pB[(jj+0)+ldb*kk];
+				c_01 += pA[(ii+0)+lda*kk] * pB[(jj+1)+ldb*kk];
+				}
+			pD[(ii+0)+ldd*(jj+0)] = alpha * c_00 + beta * pC[(ii+0)+ldc*(jj+0)];
+			pD[(ii+0)+ldd*(jj+1)] = alpha * c_01 + beta * pC[(ii+0)+ldc*(jj+1)];
+			}
+		}
+	for(; jj<n; jj++)
+		{
+		ii = 0;
+		for(; ii<m-1; ii+=2)
+			{
+			c_00 = 0.0;
+			c_10 = 0.0;
+			for(kk=0; kk<k; kk++)
+				{
+				c_00 += pA[(ii+0)+lda*kk] * pB[(jj+0)+ldb*kk];
+				c_10 += pA[(ii+1)+lda*kk] * pB[(jj+0)+ldb*kk];
+				}
+			pD[(ii+0)+ldd*(jj+0)] = alpha * c_00 + beta * pC[(ii+0)+ldc*(jj+0)];
+			pD[(ii+1)+ldd*(jj+0)] = alpha * c_10 + beta * pC[(ii+1)+ldc*(jj+0)];
+			}
+		for(; ii<m; ii++)
+			{
+			c_00 = 0.0;
+			for(kk=0; kk<k; kk++)
+				{
+				c_00 += pA[(ii+0)+lda*kk] * pB[(jj+0)+ldb*kk];
+				}
+			pD[(ii+0)+ldd*(jj+0)] = alpha * c_00 + beta * pC[(ii+0)+ldc*(jj+0)];
+			}
+		}
+	return;
+	}
+
+
+
+// dgemm nn
+void GEMM_NN_LIBSTR(int m, int n, int k, REAL alpha, struct STRMAT *sA, int ai, int aj, struct STRMAT *sB, int bi, int bj, REAL beta, struct STRMAT *sC, int ci, int cj, struct STRMAT *sD, int di, int dj)
+	{
+	if(m<=0 | n<=0)
+		return;
+	int ii, jj, kk;
+	REAL 
+		c_00, c_01,
+		c_10, c_11;
+	int lda = sA->m;
+	int ldb = sB->m;
+	int ldc = sC->m;
+	int ldd = sD->m;
+	REAL *pA = sA->pA + ai + aj*lda;
+	REAL *pB = sB->pA + bi + bj*ldb;
+	REAL *pC = sC->pA + ci + cj*ldc;
+	REAL *pD = sD->pA + di + dj*ldd;
+	jj = 0;
+	for(; jj<n-1; jj+=2)
+		{
+		ii = 0;
+		for(; ii<m-1; ii+=2)
+			{
+			c_00 = 0.0; ;
+			c_10 = 0.0; ;
+			c_01 = 0.0; ;
+			c_11 = 0.0; ;
+			for(kk=0; kk<k; kk++)
+				{
+				c_00 += pA[(ii+0)+lda*kk] * pB[kk+ldb*(jj+0)];
+				c_10 += pA[(ii+1)+lda*kk] * pB[kk+ldb*(jj+0)];
+				c_01 += pA[(ii+0)+lda*kk] * pB[kk+ldb*(jj+1)];
+				c_11 += pA[(ii+1)+lda*kk] * pB[kk+ldb*(jj+1)];
+				}
+			pD[(ii+0)+ldd*(jj+0)] = alpha * c_00 + beta * pC[(ii+0)+ldc*(jj+0)];
+			pD[(ii+1)+ldd*(jj+0)] = alpha * c_10 + beta * pC[(ii+1)+ldc*(jj+0)];
+			pD[(ii+0)+ldd*(jj+1)] = alpha * c_01 + beta * pC[(ii+0)+ldc*(jj+1)];
+			pD[(ii+1)+ldd*(jj+1)] = alpha * c_11 + beta * pC[(ii+1)+ldc*(jj+1)];
+			}
+		for(; ii<m; ii++)
+			{
+			c_00 = 0.0; ;
+			c_01 = 0.0; ;
+			for(kk=0; kk<k; kk++)
+				{
+				c_00 += pA[(ii+0)+lda*kk] * pB[kk+ldb*(jj+0)];
+				c_01 += pA[(ii+0)+lda*kk] * pB[kk+ldb*(jj+1)];
+				}
+			pD[(ii+0)+ldd*(jj+0)] = alpha * c_00 + beta * pC[(ii+0)+ldc*(jj+0)];
+			pD[(ii+0)+ldd*(jj+1)] = alpha * c_01 + beta * pC[(ii+0)+ldc*(jj+1)];
+			}
+		}
+	for(; jj<n; jj++)
+		{
+		ii = 0;
+		for(; ii<m-1; ii+=2)
+			{
+			c_00 = 0.0; ;
+			c_10 = 0.0; ;
+			for(kk=0; kk<k; kk++)
+				{
+				c_00 += pA[(ii+0)+lda*kk] * pB[kk+ldb*(jj+0)];
+				c_10 += pA[(ii+1)+lda*kk] * pB[kk+ldb*(jj+0)];
+				}
+			pD[(ii+0)+ldd*(jj+0)] = alpha * c_00 + beta * pC[(ii+0)+ldc*(jj+0)];
+			pD[(ii+1)+ldd*(jj+0)] = alpha * c_10 + beta * pC[(ii+1)+ldc*(jj+0)];
+			}
+		for(; ii<m; ii++)
+			{
+			c_00 = 0.0; ;
+			for(kk=0; kk<k; kk++)
+				{
+				c_00 += pA[(ii+0)+lda*kk] * pB[kk+ldb*(jj+0)];
+				}
+			pD[(ii+0)+ldd*(jj+0)] = alpha * c_00 + beta * pC[(ii+0)+ldc*(jj+0)];
+			}
+		}
+	return;
+	}
+
+
+
+// dtrsm_left_lower_nottransposed_unit
+void TRSM_LLNU_LIBSTR(int m, int n, REAL alpha, struct STRMAT *sA, int ai, int aj, struct STRMAT *sB, int bi, int bj, struct STRMAT *sD, int di, int dj)
+	{
+	if(m<=0 | n<=0)
+		return;
+	int ii, jj, kk;
+	REAL
+		d_00, d_01,
+		d_10, d_11;
+	int lda = sA->m;
+	int ldb = sB->m;
+	int ldd = sD->m;
+	REAL *pA = sA->pA + ai + aj*lda; // triangular
+	REAL *pB = sB->pA + bi + bj*ldb;
+	REAL *pD = sD->pA + di + dj*ldd;
+#if 1
+	// solve
+	jj = 0;
+	for(; jj<n-1; jj+=2)
+		{
+		ii = 0;
+		for(; ii<m-1; ii+=2)
+			{
+			d_00 = alpha * pB[ii+0+ldb*(jj+0)];
+			d_10 = alpha * pB[ii+1+ldb*(jj+0)];
+			d_01 = alpha * pB[ii+0+ldb*(jj+1)];
+			d_11 = alpha * pB[ii+1+ldb*(jj+1)];
+			kk = 0;
+#if 0
+			for(; kk<ii-1; kk+=2)
+				{
+				d_00 -= pA[ii+0+lda*(kk+0)] * pD[kk+ldd*(jj+0)];
+				d_10 -= pA[ii+1+lda*(kk+0)] * pD[kk+ldd*(jj+0)];
+				d_01 -= pA[ii+0+lda*(kk+0)] * pD[kk+ldd*(jj+1)];
+				d_11 -= pA[ii+1+lda*(kk+0)] * pD[kk+ldd*(jj+1)];
+				d_00 -= pA[ii+0+lda*(kk+1)] * pD[kk+ldd*(jj+0)];
+				d_10 -= pA[ii+1+lda*(kk+1)] * pD[kk+ldd*(jj+0)];
+				d_01 -= pA[ii+0+lda*(kk+1)] * pD[kk+ldd*(jj+1)];
+				d_11 -= pA[ii+1+lda*(kk+1)] * pD[kk+ldd*(jj+1)];
+				}
+			if(kk<ii)
+#else
+			for(; kk<ii; kk++)
+#endif
+				{
+				d_00 -= pA[ii+0+lda*kk] * pD[kk+ldd*(jj+0)];
+				d_10 -= pA[ii+1+lda*kk] * pD[kk+ldd*(jj+0)];
+				d_01 -= pA[ii+0+lda*kk] * pD[kk+ldd*(jj+1)];
+				d_11 -= pA[ii+1+lda*kk] * pD[kk+ldd*(jj+1)];
+				}
+			d_10 -= pA[ii+1+lda*kk] * d_00;
+			d_11 -= pA[ii+1+lda*kk] * d_01;
+			pD[ii+0+ldd*(jj+0)] = d_00;
+			pD[ii+1+ldd*(jj+0)] = d_10;
+			pD[ii+0+ldd*(jj+1)] = d_01;
+			pD[ii+1+ldd*(jj+1)] = d_11;
+			}
+		for(; ii<m; ii++)
+			{
+			d_00 = alpha * pB[ii+ldb*(jj+0)];
+			d_01 = alpha * pB[ii+ldb*(jj+1)];
+			for(kk=0; kk<ii; kk++)
+				{
+				d_00 -= pA[ii+lda*kk] * pD[kk+ldd*(jj+0)];
+				d_01 -= pA[ii+lda*kk] * pD[kk+ldd*(jj+1)];
+				}
+			pD[ii+ldd*(jj+0)] = d_00;
+			pD[ii+ldd*(jj+1)] = d_01;
+			}
+		}
+	for(; jj<n; jj++)
+		{
+		ii = 0;
+		for(; ii<m-1; ii+=2)
+			{
+			d_00 = alpha * pB[ii+0+ldb*jj];
+			d_10 = alpha * pB[ii+1+ldb*jj];
+			for(kk=0; kk<ii; kk++)
+				{
+				d_00 -= pA[ii+0+lda*kk] * pD[kk+ldd*jj];
+				d_10 -= pA[ii+1+lda*kk] * pD[kk+ldd*jj];
+				}
+			d_10 -= pA[ii+1+lda*kk] * d_00;
+			pD[ii+0+ldd*jj] = d_00;
+			pD[ii+1+ldd*jj] = d_10;
+			}
+		for(; ii<m; ii++)
+			{
+			d_00 = alpha * pB[ii+ldb*jj];
+			for(kk=0; kk<ii; kk++)
+				{
+				d_00 -= pA[ii+lda*kk] * pD[kk+ldd*jj];
+				}
+			pD[ii+ldd*jj] = d_00;
+			}
+		}
+#else
+	// copy
+	if(!(pB==pD))
+		{
+		for(jj=0; jj<n; jj++)
+			for(ii=0; ii<m; ii++)
+				pD[ii+ldd*jj] = alpha * pB[ii+ldb*jj];
+		}
+	for(jj=0; jj<n; jj++)
+		{
+		ii = 0;
+		for(; ii<m; ii++)
+			{
+			d_00 = pD[ii+ldd*jj];
+			for(kk=ii+1; kk<m; kk++)
+				{
+				pD[kk+ldd*jj] -= pA[kk+lda*ii] * d_00;
+				}
+			}
+		}
+#endif
+	return;
+	}
+
+
+
+// dtrsm_left_upper_nottransposed_notunit
+void TRSM_LUNN_LIBSTR(int m, int n, REAL alpha, struct STRMAT *sA, int ai, int aj, struct STRMAT *sB, int bi, int bj, struct STRMAT *sD, int di, int dj)
+	{
+	if(m<=0 | n<=0)
+		return;
+	int ii, jj, kk, id;
+	REAL
+		d_00, d_01,
+		d_10, d_11;
+	int lda = sA->m;
+	int ldb = sB->m;
+	int ldd = sD->m;
+	REAL *pA = sA->pA + ai + aj*lda; // triangular
+	REAL *pB = sB->pA + bi + bj*ldb;
+	REAL *pD = sD->pA + di + dj*ldd;
+	REAL *dA = sA->dA;
+	if(!(sA->use_dA==1 & ai==0 & aj==0))
+		{
+		// inverte diagonal of pA
+		for(ii=0; ii<m; ii++)
+			dA[ii] = 1.0/pA[ii+lda*ii];
+		// use only now
+		sA->use_dA = 0;
+		}
+#if 1
+	jj = 0;
+	for(; jj<n-1; jj+=2)
+		{
+		ii = 0;
+		for(; ii<m-1; ii+=2)
+			{
+			id = m-ii-2;
+			d_00 = alpha * pB[id+0+ldb*(jj+0)];
+			d_10 = alpha * pB[id+1+ldb*(jj+0)];
+			d_01 = alpha * pB[id+0+ldb*(jj+1)];
+			d_11 = alpha * pB[id+1+ldb*(jj+1)];
+			kk = id+2;
+#if 0
+			for(; kk<m-1; kk+=2)
+				{
+				d_00 -= pA[id+0+lda*(kk+0)] * pD[kk+0+ldd*(jj+0)];
+				d_10 -= pA[id+1+lda*(kk+0)] * pD[kk+0+ldd*(jj+0)];
+				d_01 -= pA[id+0+lda*(kk+0)] * pD[kk+0+ldd*(jj+1)];
+				d_11 -= pA[id+1+lda*(kk+0)] * pD[kk+0+ldd*(jj+1)];
+				d_00 -= pA[id+0+lda*(kk+1)] * pD[kk+1+ldd*(jj+0)];
+				d_10 -= pA[id+1+lda*(kk+1)] * pD[kk+1+ldd*(jj+0)];
+				d_01 -= pA[id+0+lda*(kk+1)] * pD[kk+1+ldd*(jj+1)];
+				d_11 -= pA[id+1+lda*(kk+1)] * pD[kk+1+ldd*(jj+1)];
+				}
+			if(kk<m)
+#else
+			for(; kk<m; kk++)
+#endif
+				{
+				d_00 -= pA[id+0+lda*(kk+0)] * pD[kk+0+ldd*(jj+0)];
+				d_10 -= pA[id+1+lda*(kk+0)] * pD[kk+0+ldd*(jj+0)];
+				d_01 -= pA[id+0+lda*(kk+0)] * pD[kk+0+ldd*(jj+1)];
+				d_11 -= pA[id+1+lda*(kk+0)] * pD[kk+0+ldd*(jj+1)];
+				}
+			d_10 *= dA[id+1];
+			d_11 *= dA[id+1];
+			d_00 -= pA[id+0+lda*(id+1)] * d_10;
+			d_01 -= pA[id+0+lda*(id+1)] * d_11;
+			d_00 *= dA[id+0];
+			d_01 *= dA[id+0];
+			pD[id+0+ldd*(jj+0)] = d_00;
+			pD[id+1+ldd*(jj+0)] = d_10;
+			pD[id+0+ldd*(jj+1)] = d_01;
+			pD[id+1+ldd*(jj+1)] = d_11;
+			}
+		for(; ii<m; ii++)
+			{
+			id = m-ii-1;
+			d_00 = alpha * pB[id+0+ldb*(jj+0)];
+			d_01 = alpha * pB[id+0+ldb*(jj+1)];
+			kk = id+1;
+			for(; kk<m; kk++)
+				{
+				d_00 -= pA[id+0+lda*(kk+0)] * pD[kk+0+ldd*(jj+0)];
+				d_01 -= pA[id+0+lda*(kk+0)] * pD[kk+0+ldd*(jj+1)];
+				}
+			d_00 *= dA[id+0];
+			d_01 *= dA[id+0];
+			pD[id+0+ldd*(jj+0)] = d_00;
+			pD[id+0+ldd*(jj+1)] = d_01;
+			}
+		}
+	for(; jj<n; jj++)
+		{
+		ii = 0;
+		for(; ii<m-1; ii+=2)
+			{
+			id = m-ii-2;
+			d_00 = alpha * pB[id+0+ldb*(jj+0)];
+			d_10 = alpha * pB[id+1+ldb*(jj+0)];
+			kk = id+2;
+			for(; kk<m; kk++)
+				{
+				d_00 -= pA[id+0+lda*(kk+0)] * pD[kk+0+ldd*(jj+0)];
+				d_10 -= pA[id+1+lda*(kk+0)] * pD[kk+0+ldd*(jj+0)];
+				}
+			d_10 *= dA[id+1];
+			d_00 -= pA[id+0+lda*(id+1)] * d_10;
+			d_00 *= dA[id+0];
+			pD[id+0+ldd*(jj+0)] = d_00;
+			pD[id+1+ldd*(jj+0)] = d_10;
+			}
+		for(; ii<m; ii++)
+			{
+			id = m-ii-1;
+			d_00 = alpha * pB[id+0+ldb*(jj+0)];
+			kk = id+1;
+			for(; kk<m; kk++)
+				{
+				d_00 -= pA[id+0+lda*(kk+0)] * pD[kk+0+ldd*(jj+0)];
+				}
+			d_00 *= dA[id+0];
+			pD[id+0+ldd*(jj+0)] = d_00;
+			}
+		}
+#else
+	// copy
+	if(!(pB==pD))
+		{
+		for(jj=0; jj<n; jj++)
+			for(ii=0; ii<m; ii++)
+				pD[ii+ldd*jj] = alpha * pB[ii+ldb*jj];
+		}
+	// solve
+	for(jj=0; jj<n; jj++)
+		{
+		for(ii=m-1; ii>=0; ii--)
+			{
+			d_00 = pD[ii+ldd*jj] * dA[ii];
+			pD[ii+ldd*jj] = d_00;
+			for(kk=0; kk<ii; kk++)
+				{
+				pD[kk+ldd*jj] -= pA[kk+lda*ii] * d_00;
+				}
+			}
+		}
+#endif
+	return;
+	}
+
+
+
+// dtrsm_right_lower_transposed_unit
+void TRSM_RLTU_LIBSTR(int m, int n, REAL alpha, struct STRMAT *sA, int ai, int aj, struct STRMAT *sB, int bi, int bj, struct STRMAT *sD, int di, int dj)
+	{
+	if(m<=0 | n<=0)
+		return;
+	int ii, jj, kk;
+	int lda = sA->m;
+	int ldb = sB->m;
+	int ldd = sD->m;
+	REAL *pA = sA->pA + ai + aj*lda;
+	REAL *pB = sB->pA + bi + bj*ldb;
+	REAL *pD = sD->pA + di + dj*ldd;
+	REAL
+		f_10,
+		c_00, c_01,
+		c_10, c_11;
+	jj = 0;
+	for(; jj<n-1; jj+=2)
+		{
+		f_10 = pA[jj+1+lda*(jj+0)];
+		ii = 0;
+		for(; ii<m-1; ii+=2)
+			{
+			c_00 = alpha * pB[ii+0+ldb*(jj+0)];
+			c_10 = alpha * pB[ii+1+ldb*(jj+0)];
+			c_01 = alpha * pB[ii+0+ldb*(jj+1)];
+			c_11 = alpha * pB[ii+1+ldb*(jj+1)];
+			for(kk=0; kk<jj; kk++)
+				{
+				c_00 -= pD[ii+0+ldd*kk] * pA[jj+0+lda*kk];
+				c_10 -= pD[ii+1+ldd*kk] * pA[jj+0+lda*kk];
+				c_01 -= pD[ii+0+ldd*kk] * pA[jj+1+lda*kk];
+				c_11 -= pD[ii+1+ldd*kk] * pA[jj+1+lda*kk];
+				}
+			pD[ii+0+ldd*(jj+0)] = c_00;
+			pD[ii+1+ldd*(jj+0)] = c_10;
+			c_01 -= c_00 * f_10;
+			c_11 -= c_10 * f_10;
+			pD[ii+0+ldd*(jj+1)] = c_01;
+			pD[ii+1+ldd*(jj+1)] = c_11;
+			}
+		for(; ii<m; ii++)
+			{
+			c_00 = alpha * pB[ii+0+ldb*(jj+0)];
+			c_01 = alpha * pB[ii+0+ldb*(jj+1)];
+			for(kk=0; kk<jj; kk++)
+				{
+				c_00 -= pD[ii+0+ldd*kk] * pD[jj+0+ldd*kk];
+				c_01 -= pD[ii+0+ldd*kk] * pD[jj+1+ldd*kk];
+				}
+			pD[ii+0+ldd*(jj+0)] = c_00;
+			c_01 -= c_00 * f_10;
+			pD[ii+0+ldd*(jj+1)] = c_01;
+			}
+		}
+	for(; jj<n; jj++)
+		{
+		// factorize diagonal
+		for(ii=0; ii<m; ii++)
+			{
+			c_00 = alpha * pB[ii+ldb*jj];
+			for(kk=0; kk<jj; kk++)
+				{
+				c_00 -= pD[ii+ldd*kk] * pA[jj+lda*kk];
+				}
+			pD[ii+ldd*jj] = c_00;
+			}
+		}
+	return;
+	}
+
+
+
+// dtrsm_right_lower_transposed_unit
+void TRSM_RLTN_LIBSTR(int m, int n, REAL alpha, struct STRMAT *sA, int ai, int aj, struct STRMAT *sB, int bi, int bj, struct STRMAT *sD, int di, int dj)
+	{
+	if(m<=0 | n<=0)
+		return;
+	int ii, jj, kk;
+	int lda = sA->m;
+	int ldb = sB->m;
+	int ldd = sD->m;
+	REAL *pA = sA->pA + ai + aj*lda;
+	REAL *pB = sB->pA + bi + bj*ldb;
+	REAL *pD = sD->pA + di + dj*ldd;
+	REAL *dA = sA->dA;
+	if(ai==0 & aj==0)
+		{
+		if(sA->use_dA!=1)
+			{
+			for(ii=0; ii<n; ii++)
+				dA[ii] = 1.0 / pA[ii+lda*ii];
+			sA->use_dA = 1;
+			}
+		}
+	else
+		{
+		for(ii=0; ii<n; ii++)
+			dA[ii] = 1.0 / pA[ii+lda*ii];
+		sA->use_dA = 0;
+		}
+	REAL
+		f_00_inv, 
+		f_10, f_11_inv,
+		c_00, c_01,
+		c_10, c_11;
+	jj = 0;
+	for(; jj<n-1; jj+=2)
+		{
+		f_00_inv = dA[jj+0];
+		f_10 = pA[jj+1+lda*(jj+0)];
+		f_11_inv = dA[jj+1];
+		ii = 0;
+		for(; ii<m-1; ii+=2)
+			{
+			c_00 = alpha * pB[ii+0+ldb*(jj+0)];
+			c_10 = alpha * pB[ii+1+ldb*(jj+0)];
+			c_01 = alpha * pB[ii+0+ldb*(jj+1)];
+			c_11 = alpha * pB[ii+1+ldb*(jj+1)];
+			for(kk=0; kk<jj; kk++)
+				{
+				c_00 -= pD[ii+0+ldd*kk] * pA[jj+0+lda*kk];
+				c_10 -= pD[ii+1+ldd*kk] * pA[jj+0+lda*kk];
+				c_01 -= pD[ii+0+ldd*kk] * pA[jj+1+lda*kk];
+				c_11 -= pD[ii+1+ldd*kk] * pA[jj+1+lda*kk];
+				}
+			c_00 *= f_00_inv;
+			c_10 *= f_00_inv;
+			pD[ii+0+ldd*(jj+0)] = c_00;
+			pD[ii+1+ldd*(jj+0)] = c_10;
+			c_01 -= c_00 * f_10;
+			c_11 -= c_10 * f_10;
+			c_01 *= f_11_inv;
+			c_11 *= f_11_inv;
+			pD[ii+0+ldd*(jj+1)] = c_01;
+			pD[ii+1+ldd*(jj+1)] = c_11;
+			}
+		for(; ii<m; ii++)
+			{
+			c_00 = alpha * pB[ii+0+ldb*(jj+0)];
+			c_01 = alpha * pB[ii+0+ldb*(jj+1)];
+			for(kk=0; kk<jj; kk++)
+				{
+				c_00 -= pD[ii+0+ldd*kk] * pA[jj+0+lda*kk];
+				c_01 -= pD[ii+0+ldd*kk] * pA[jj+1+lda*kk];
+				}
+			c_00 *= f_00_inv;
+			pD[ii+0+ldd*(jj+0)] = c_00;
+			c_01 -= c_00 * f_10;
+			c_01 *= f_11_inv;
+			pD[ii+0+ldd*(jj+1)] = c_01;
+			}
+		}
+	for(; jj<n; jj++)
+		{
+		// factorize diagonal
+		f_00_inv = dA[jj];
+		for(ii=0; ii<m; ii++)
+			{
+			c_00 = alpha * pB[ii+ldb*jj];
+			for(kk=0; kk<jj; kk++)
+				{
+				c_00 -= pD[ii+ldd*kk] * pA[jj+lda*kk];
+				}
+			c_00 *= f_00_inv;
+			pD[ii+ldd*jj] = c_00;
+			}
+		}
+	return;
+	}
+
+
+
+// dtrsm_right_upper_transposed_notunit
+void TRSM_RUTN_LIBSTR(int m, int n, REAL alpha, struct STRMAT *sA, int ai, int aj, struct STRMAT *sB, int bi, int bj, struct STRMAT *sD, int di, int dj)
+	{
+	int jj;
+	char cl = 'l';
+	char cn = 'n';
+	char cr = 'r';
+	char ct = 't';
+	char cu = 'u';
+	int i1 = 1;
+	REAL *pA = sA->pA+ai+aj*sA->m;
+	REAL *pB = sB->pA+bi+bj*sB->m;
+	REAL *pD = sD->pA+di+dj*sD->m;
+	printf("\ndtrsm_rutn_libstr: feature not implemented yet\n");
+	exit(1);
+//	if(!(pB==pD))
+//		{
+//		for(jj=0; jj<n; jj++)
+//			COPY(&m, pB+jj*sB->m, &i1, pD+jj*sD->m, &i1);
+//		}
+//	TRSM(&cr, &cu, &ct, &cn, &m, &n, &alpha, pA, &(sA->m), pD, &(sD->m));
+	return;
+	}
+
+
+
+// dtrmm_right_upper_transposed_notunit (A triangular !!!)
+void TRMM_RUTN_LIBSTR(int m, int n, REAL alpha, struct STRMAT *sA, int ai, int aj, struct STRMAT *sB, int bi, int bj, struct STRMAT *sD, int di, int dj)
+	{
+	if(m<=0 | n<=0)
+		return;
+	int ii, jj, kk;
+	REAL 
+		c_00, c_01,
+		c_10, c_11;
+	int lda = sA->m;
+	int ldb = sB->m;
+	int ldd = sD->m;
+	REAL *pA = sA->pA + ai + aj*lda;
+	REAL *pB = sB->pA + bi + bj*ldb;
+	REAL *pD = sD->pA + di + dj*ldd;
+	jj = 0;
+	for(; jj<n-1; jj+=2)
+		{
+		ii = 0;
+		for(; ii<m-1; ii+=2)
+			{
+			c_00 = 0.0;
+			c_10 = 0.0;
+			c_01 = 0.0;
+			c_11 = 0.0;
+			kk = jj;
+			c_00 += pB[(ii+0)+ldb*kk] * pA[(jj+0)+lda*kk];
+			c_10 += pB[(ii+1)+ldb*kk] * pA[(jj+0)+lda*kk];
+			kk++;
+			for(; kk<n; kk++)
+				{
+				c_00 += pB[(ii+0)+ldb*kk] * pA[(jj+0)+lda*kk];
+				c_10 += pB[(ii+1)+ldb*kk] * pA[(jj+0)+lda*kk];
+				c_01 += pB[(ii+0)+ldb*kk] * pA[(jj+1)+lda*kk];
+				c_11 += pB[(ii+1)+ldb*kk] * pA[(jj+1)+lda*kk];
+				}
+			pD[(ii+0)+ldd*(jj+0)] = alpha * c_00;
+			pD[(ii+1)+ldd*(jj+0)] = alpha * c_10;
+			pD[(ii+0)+ldd*(jj+1)] = alpha * c_01;
+			pD[(ii+1)+ldd*(jj+1)] = alpha * c_11;
+			}
+		for(; ii<m; ii++)
+			{
+			c_00 = 0.0;
+			c_01 = 0.0;
+			kk = jj;
+			c_00 += pB[(ii+0)+ldb*kk] * pA[(jj+0)+lda*kk];
+			kk++;
+			for(; kk<n; kk++)
+				{
+				c_00 += pB[(ii+0)+ldb*kk] * pA[(jj+0)+lda*kk];
+				c_01 += pB[(ii+0)+ldb*kk] * pA[(jj+1)+lda*kk];
+				}
+			pD[(ii+0)+ldd*(jj+0)] = alpha * c_00;
+			pD[(ii+0)+ldd*(jj+1)] = alpha * c_01;
+			}
+		}
+	for(; jj<n; jj++)
+		{
+		ii = 0;
+		for(; ii<m-1; ii+=2)
+			{
+			c_00 = 0.0;
+			c_10 = 0.0;
+			for(kk=jj; kk<n; kk++)
+				{
+				c_00 += pB[(ii+0)+ldb*kk] * pA[(jj+0)+lda*kk];
+				c_10 += pB[(ii+1)+ldb*kk] * pA[(jj+0)+lda*kk];
+				}
+			pD[(ii+0)+ldd*(jj+0)] = alpha * c_00;
+			pD[(ii+1)+ldd*(jj+0)] = alpha * c_10;
+			}
+		for(; ii<m; ii++)
+			{
+			c_00 = 0.0;
+			for(kk=jj; kk<n; kk++)
+				{
+				c_00 += pB[(ii+0)+ldb*kk] * pA[(jj+0)+lda*kk];
+				}
+			pD[(ii+0)+ldd*(jj+0)] = alpha * c_00;
+			}
+		}	
+	return;
+	}
+
+
+
+// dtrmm_right_lower_nottransposed_notunit (A triangular !!!)
+void TRMM_RLNN_LIBSTR(int m, int n, REAL alpha, struct STRMAT *sA, int ai, int aj, struct STRMAT *sB, int bi, int bj, struct STRMAT *sD, int di, int dj)
+	{
+	if(m<=0 | n<=0)
+		return;
+	int ii, jj, kk;
+	REAL 
+		c_00, c_01,
+		c_10, c_11;
+	int lda = sA->m;
+	int ldb = sB->m;
+	int ldd = sD->m;
+	REAL *pA = sA->pA + ai + aj*lda;
+	REAL *pB = sB->pA + bi + bj*ldb;
+	REAL *pD = sD->pA + di + dj*ldd;
+	jj = 0;
+	for(; jj<n-1; jj+=2)
+		{
+		ii = 0;
+		for(; ii<m-1; ii+=2)
+			{
+			c_00 = 0.0; ;
+			c_10 = 0.0; ;
+			c_01 = 0.0; ;
+			c_11 = 0.0; ;
+			kk = jj;
+			c_00 += pB[(ii+0)+ldb*kk] * pA[kk+lda*(jj+0)];
+			c_10 += pB[(ii+1)+ldb*kk] * pA[kk+lda*(jj+0)];
+			kk++;
+			for(; kk<n; kk++)
+				{
+				c_00 += pB[(ii+0)+ldb*kk] * pA[kk+lda*(jj+0)];
+				c_10 += pB[(ii+1)+ldb*kk] * pA[kk+lda*(jj+0)];
+				c_01 += pB[(ii+0)+ldb*kk] * pA[kk+lda*(jj+1)];
+				c_11 += pB[(ii+1)+ldb*kk] * pA[kk+lda*(jj+1)];
+				}
+			pD[(ii+0)+ldd*(jj+0)] = alpha * c_00;
+			pD[(ii+1)+ldd*(jj+0)] = alpha * c_10;
+			pD[(ii+0)+ldd*(jj+1)] = alpha * c_01;
+			pD[(ii+1)+ldd*(jj+1)] = alpha * c_11;
+			}
+		for(; ii<m; ii++)
+			{
+			c_00 = 0.0; ;
+			c_01 = 0.0; ;
+			kk = jj;
+			c_00 += pB[(ii+0)+ldb*kk] * pA[kk+lda*(jj+0)];
+			kk++;
+			for(; kk<n; kk++)
+				{
+				c_00 += pB[(ii+0)+ldb*kk] * pA[kk+lda*(jj+0)];
+				c_01 += pB[(ii+0)+ldb*kk] * pA[kk+lda*(jj+1)];
+				}
+			pD[(ii+0)+ldd*(jj+0)] = alpha * c_00;
+			pD[(ii+0)+ldd*(jj+1)] = alpha * c_01;
+			}
+		}
+	for(; jj<n; jj++)
+		{
+		ii = 0;
+		for(; ii<m-1; ii+=2)
+			{
+			c_00 = 0.0; ;
+			c_10 = 0.0; ;
+			for(kk=jj; kk<n; kk++)
+				{
+				c_00 += pB[(ii+0)+ldb*kk] * pA[kk+lda*(jj+0)];
+				c_10 += pB[(ii+1)+ldb*kk] * pA[kk+lda*(jj+0)];
+				}
+			pD[(ii+0)+ldd*(jj+0)] = alpha * c_00;
+			pD[(ii+1)+ldd*(jj+0)] = alpha * c_10;
+			}
+		for(; ii<m; ii++)
+			{
+			c_00 = 0.0; ;
+			for(kk=jj; kk<n; kk++)
+				{
+				c_00 += pB[(ii+0)+ldb*kk] * pA[kk+lda*(jj+0)];
+				}
+			pD[(ii+0)+ldd*(jj+0)] = alpha * c_00;
+			}
+		}
+	return;
+	}
+
+
+
+// dsyrk_lower_nortransposed (allowing for different factors => use dgemm !!!)
+void SYRK_LN_LIBSTR(int m, int k, REAL alpha, struct STRMAT *sA, int ai, int aj, struct STRMAT *sB, int bi, int bj, REAL beta, struct STRMAT *sC, int ci, int cj, struct STRMAT *sD, int di, int dj)
+	{
+	if(m<=0)
+		return;
+	int ii, jj, kk;
+	int n = m; // TODO optimize for this case !!!!!!!!!
+	REAL
+		c_00, c_01,
+		c_10, c_11;
+	int lda = sA->m;
+	int ldb = sB->m;
+	int ldc = sC->m;
+	int ldd = sD->m;
+	REAL *pA = sA->pA + ai + aj*lda;
+	REAL *pB = sB->pA + bi + bj*ldb;
+	REAL *pC = sC->pA + ci + cj*ldc;
+	REAL *pD = sD->pA + di + dj*ldd;
+	jj = 0;
+	for(; jj<n-1; jj+=2)
+		{
+		// diagonal
+		c_00 = 0.0;
+		c_10 = 0.0;
+		c_11 = 0.0;
+		for(kk=0; kk<k; kk++)
+			{
+			c_00 += pA[jj+0+lda*kk] * pB[jj+0+ldb*kk];
+			c_10 += pA[jj+1+lda*kk] * pB[jj+0+ldb*kk];
+			c_11 += pA[jj+1+lda*kk] * pB[jj+1+ldb*kk];
+			}
+		pD[jj+0+ldd*(jj+0)] = beta * pC[jj+0+ldc*(jj+0)] + alpha * c_00;
+		pD[jj+1+ldd*(jj+0)] = beta * pC[jj+1+ldc*(jj+0)] + alpha * c_10;
+		pD[jj+1+ldd*(jj+1)] = beta * pC[jj+1+ldc*(jj+1)] + alpha * c_11;
+		// lower
+		ii = jj+2;
+		for(; ii<m-1; ii+=2)
+			{
+			c_00 = 0.0;
+			c_10 = 0.0;
+			c_01 = 0.0;
+			c_11 = 0.0;
+			for(kk=0; kk<k; kk++)
+				{
+				c_00 += pA[ii+0+lda*kk] * pB[jj+0+ldb*kk];
+				c_10 += pA[ii+1+lda*kk] * pB[jj+0+ldb*kk];
+				c_01 += pA[ii+0+lda*kk] * pB[jj+1+ldb*kk];
+				c_11 += pA[ii+1+lda*kk] * pB[jj+1+ldb*kk];
+				}
+			pD[ii+0+ldd*(jj+0)] = beta * pC[ii+0+ldc*(jj+0)] + alpha * c_00;
+			pD[ii+1+ldd*(jj+0)] = beta * pC[ii+1+ldc*(jj+0)] + alpha * c_10;
+			pD[ii+0+ldd*(jj+1)] = beta * pC[ii+0+ldc*(jj+1)] + alpha * c_01;
+			pD[ii+1+ldd*(jj+1)] = beta * pC[ii+1+ldc*(jj+1)] + alpha * c_11;
+			}
+		for(; ii<m; ii++)
+			{
+			c_00 = 0.0;
+			c_01 = 0.0;
+			for(kk=0; kk<k; kk++)
+				{
+				c_00 += pA[ii+0+lda*kk] * pB[jj+0+ldb*kk];
+				c_01 += pA[ii+0+lda*kk] * pB[jj+1+ldb*kk];
+				}
+			pD[ii+0+ldd*(jj+0)] = beta * pC[ii+0+ldc*(jj+0)] + alpha * c_00;
+			pD[ii+0+ldd*(jj+1)] = beta * pC[ii+0+ldc*(jj+1)] + alpha * c_01;
+			}
+		}
+	for(; jj<n; jj++)
+		{
+		// diagonal
+		c_00 = 0.0;
+		for(kk=0; kk<k; kk++)
+			{
+			c_00 += pA[jj+lda*kk] * pB[jj+ldb*kk];
+			}
+		pD[jj+ldd*jj] = beta * pC[jj+ldc*jj] + alpha * c_00;
+		// lower
+		for(ii=jj+1; ii<m; ii++)
+			{
+			c_00 = 0.0;
+			for(kk=0; kk<k; kk++)
+				{
+				c_00 += pA[ii+lda*kk] * pB[jj+ldb*kk];
+				}
+			pD[ii+ldd*jj] = beta * pC[ii+ldc*jj] + alpha * c_00;
+			}
+		}
+	return;
+	}
+
+
+
+// dsyrk_lower_nortransposed (allowing for different factors => use dgemm !!!)
+void SYRK_LN_MN_LIBSTR(int m, int n, int k, REAL alpha, struct STRMAT *sA, int ai, int aj, struct STRMAT *sB, int bi, int bj, REAL beta, struct STRMAT *sC, int ci, int cj, struct STRMAT *sD, int di, int dj)
+	{
+	if(m<=0 | n<=0)
+		return;
+	int ii, jj, kk;
+	REAL
+		c_00, c_01,
+		c_10, c_11;
+	int lda = sA->m;
+	int ldb = sB->m;
+	int ldc = sC->m;
+	int ldd = sD->m;
+	REAL *pA = sA->pA + ai + aj*lda;
+	REAL *pB = sB->pA + bi + bj*ldb;
+	REAL *pC = sC->pA + ci + cj*ldc;
+	REAL *pD = sD->pA + di + dj*ldd;
+	jj = 0;
+	for(; jj<n-1; jj+=2)
+		{
+		// diagonal
+		c_00 = 0.0;
+		c_10 = 0.0;
+		c_11 = 0.0;
+		for(kk=0; kk<k; kk++)
+			{
+			c_00 += pA[jj+0+lda*kk] * pB[jj+0+ldb*kk];
+			c_10 += pA[jj+1+lda*kk] * pB[jj+0+ldb*kk];
+			c_11 += pA[jj+1+lda*kk] * pB[jj+1+ldb*kk];
+			}
+		pD[jj+0+ldd*(jj+0)] = beta * pC[jj+0+ldc*(jj+0)] + alpha * c_00;
+		pD[jj+1+ldd*(jj+0)] = beta * pC[jj+1+ldc*(jj+0)] + alpha * c_10;
+		pD[jj+1+ldd*(jj+1)] = beta * pC[jj+1+ldc*(jj+1)] + alpha * c_11;
+		// lower
+		ii = jj+2;
+		for(; ii<m-1; ii+=2)
+			{
+			c_00 = 0.0;
+			c_10 = 0.0;
+			c_01 = 0.0;
+			c_11 = 0.0;
+			for(kk=0; kk<k; kk++)
+				{
+				c_00 += pA[ii+0+lda*kk] * pB[jj+0+ldb*kk];
+				c_10 += pA[ii+1+lda*kk] * pB[jj+0+ldb*kk];
+				c_01 += pA[ii+0+lda*kk] * pB[jj+1+ldb*kk];
+				c_11 += pA[ii+1+lda*kk] * pB[jj+1+ldb*kk];
+				}
+			pD[ii+0+ldd*(jj+0)] = beta * pC[ii+0+ldc*(jj+0)] + alpha * c_00;
+			pD[ii+1+ldd*(jj+0)] = beta * pC[ii+1+ldc*(jj+0)] + alpha * c_10;
+			pD[ii+0+ldd*(jj+1)] = beta * pC[ii+0+ldc*(jj+1)] + alpha * c_01;
+			pD[ii+1+ldd*(jj+1)] = beta * pC[ii+1+ldc*(jj+1)] + alpha * c_11;
+			}
+		for(; ii<m; ii++)
+			{
+			c_00 = 0.0;
+			c_01 = 0.0;
+			for(kk=0; kk<k; kk++)
+				{
+				c_00 += pA[ii+0+lda*kk] * pB[jj+0+ldb*kk];
+				c_01 += pA[ii+0+lda*kk] * pB[jj+1+ldb*kk];
+				}
+			pD[ii+0+ldd*(jj+0)] = beta * pC[ii+0+ldc*(jj+0)] + alpha * c_00;
+			pD[ii+0+ldd*(jj+1)] = beta * pC[ii+0+ldc*(jj+1)] + alpha * c_01;
+			}
+		}
+	for(; jj<n; jj++)
+		{
+		// diagonal
+		c_00 = 0.0;
+		for(kk=0; kk<k; kk++)
+			{
+			c_00 += pA[jj+lda*kk] * pB[jj+ldb*kk];
+			}
+		pD[jj+ldd*jj] = beta * pC[jj+ldc*jj] + alpha * c_00;
+		// lower
+		for(ii=jj+1; ii<m; ii++)
+			{
+			c_00 = 0.0;
+			for(kk=0; kk<k; kk++)
+				{
+				c_00 += pA[ii+lda*kk] * pB[jj+ldb*kk];
+				}
+			pD[ii+ldd*jj] = beta * pC[ii+ldc*jj] + alpha * c_00;
+			}
+		}
+	return;
+	}
+
+
+
+#elif defined(LA_BLAS)
+
+
+
+// dgemm nt
+void GEMM_NT_LIBSTR(int m, int n, int k, REAL alpha, struct STRMAT *sA, int ai, int aj, struct STRMAT *sB, int bi, int bj, REAL beta, struct STRMAT *sC, int ci, int cj, struct STRMAT *sD, int di, int dj)
+	{
+	int jj;
+	char cn = 'n';
+	char ct = 't';
+	REAL *pA = sA->pA+ai+aj*sA->m;
+	REAL *pB = sB->pA+bi+bj*sB->m;
+	REAL *pC = sC->pA+ci+cj*sC->m;
+	REAL *pD = sD->pA+di+dj*sD->m;
+#if defined(REF_BLAS_BLIS)
+	long long i1 = 1;
+	long long mm = m;
+	long long nn = n;
+	long long kk = k;
+	long long lda = sA->m;
+	long long ldb = sB->m;
+	long long ldc = sC->m;
+	long long ldd = sD->m;
+	if(!(beta==0.0 || pC==pD))
+		{
+		for(jj=0; jj<n; jj++)
+			COPY(&mm, pC+jj*ldc, &i1, pD+jj*ldd, &i1);
+		}
+	GEMM(&cn, &ct, &mm, &nn, &kk, &alpha, pA, &lda, pB, &ldb, &beta, pD, &ldd);
+#else
+	int i1 = 1;
+	int lda = sA->m;
+	int ldb = sB->m;
+	int ldc = sC->m;
+	int ldd = sD->m;
+	if(!(beta==0.0 || pC==pD))
+		{
+		for(jj=0; jj<n; jj++)
+			COPY(&m, pC+jj*ldc, &i1, pD+jj*ldd, &i1);
+		}
+	GEMM(&cn, &ct, &m, &n, &k, &alpha, pA, &lda, pB, &ldb, &beta, pD, &ldd);
+#endif
+	return;
+	}
+
+
+
+// dgemm nn
+void GEMM_NN_LIBSTR(int m, int n, int k, REAL alpha, struct STRMAT *sA, int ai, int aj, struct STRMAT *sB, int bi, int bj, REAL beta, struct STRMAT *sC, int ci, int cj, struct STRMAT *sD, int di, int dj)
+	{
+	int jj;
+	char cn = 'n';
+	REAL *pA = sA->pA+ai+aj*sA->m;
+	REAL *pB = sB->pA+bi+bj*sB->m;
+	REAL *pC = sC->pA+ci+cj*sC->m;
+	REAL *pD = sD->pA+di+dj*sD->m;
+#if defined(REF_BLAS_BLIS)
+	long long i1 = 1;
+	long long mm = m;
+	long long nn = n;
+	long long kk = k;
+	long long lda = sA->m;
+	long long ldb = sB->m;
+	long long ldc = sC->m;
+	long long ldd = sD->m;
+	if(!(beta==0.0 || pC==pD))
+		{
+		for(jj=0; jj<n; jj++)
+			COPY(&mm, pC+jj*ldc, &i1, pD+jj*ldd, &i1);
+		}
+	GEMM(&cn, &cn, &mm, &nn, &kk, &alpha, pA, &lda, pB, &ldb, &beta, pD, &ldd);
+#else
+	int i1 = 1;
+	int lda = sA->m;
+	int ldb = sB->m;
+	int ldc = sC->m;
+	int ldd = sD->m;
+	if(!(beta==0.0 || pC==pD))
+		{
+		for(jj=0; jj<n; jj++)
+			COPY(&m, pC+jj*ldc, &i1, pD+jj*ldd, &i1);
+		}
+	GEMM(&cn, &cn, &m, &n, &k, &alpha, pA, &lda, pB, &ldb, &beta, pD, &ldd);
+#endif
+	return;
+	}
+
+
+
+// dtrsm_left_lower_nottransposed_unit
+void TRSM_LLNU_LIBSTR(int m, int n, REAL alpha, struct STRMAT *sA, int ai, int aj, struct STRMAT *sB, int bi, int bj, struct STRMAT *sD, int di, int dj)
+	{
+	int jj;
+	char cl = 'l';
+	char cn = 'n';
+	char cu = 'u';
+	REAL *pA = sA->pA+ai+aj*sA->m;
+	REAL *pB = sB->pA+bi+bj*sB->m;
+	REAL *pD = sD->pA+di+dj*sD->m;
+#if defined(REF_BLAS_BLIS)
+	long long i1 = 1;
+	long long mm = m;
+	long long nn = n;
+	long long lda = sA->m;
+	long long ldb = sB->m;
+	long long ldd = sD->m;
+	if(!(pB==pD))
+		{
+		for(jj=0; jj<n; jj++)
+			COPY(&mm, pB+jj*ldb, &i1, pD+jj*ldd, &i1);
+		}
+	TRSM(&cl, &cl, &cn, &cu, &mm, &nn, &alpha, pA, &lda, pD, &ldd);
+#else
+	int i1 = 1;
+	int lda = sA->m;
+	int ldb = sB->m;
+	int ldd = sD->m;
+	if(!(pB==pD))
+		{
+		for(jj=0; jj<n; jj++)
+			COPY(&m, pB+jj*ldb, &i1, pD+jj*sD->m, &i1);
+		}
+	TRSM(&cl, &cl, &cn, &cu, &m, &n, &alpha, pA, &lda, pD, &ldd);
+#endif
+	return;
+	}
+
+
+
+// dtrsm_left_upper_nottransposed_notunit
+void TRSM_LUNN_LIBSTR(int m, int n, REAL alpha, struct STRMAT *sA, int ai, int aj, struct STRMAT *sB, int bi, int bj, struct STRMAT *sD, int di, int dj)
+	{
+	int jj;
+	char cl = 'l';
+	char cn = 'n';
+	char cu = 'u';
+	REAL *pA = sA->pA+ai+aj*sA->m;
+	REAL *pB = sB->pA+bi+bj*sB->m;
+	REAL *pD = sD->pA+di+dj*sD->m;
+#if defined(REF_BLAS_BLIS)
+	long long i1 = 1;
+	long long mm = m;
+	long long nn = n;
+	long long lda = sA->m;
+	long long ldb = sB->m;
+	long long ldd = sD->m;
+	if(!(pB==pD))
+		{
+		for(jj=0; jj<n; jj++)
+			COPY(&mm, pB+jj*ldb, &i1, pD+jj*ldd, &i1);
+		}
+	TRSM(&cl, &cu, &cn, &cn, &mm, &nn, &alpha, pA, &lda, pD, &ldd);
+#else
+	int i1 = 1;
+	int lda = sA->m;
+	int ldb = sB->m;
+	int ldd = sD->m;
+	if(!(pB==pD))
+		{
+		for(jj=0; jj<n; jj++)
+			COPY(&m, pB+jj*ldb, &i1, pD+jj*ldd, &i1);
+		}
+	TRSM(&cl, &cu, &cn, &cn, &m, &n, &alpha, pA, &lda, pD, &ldd);
+#endif
+	return;
+	}
+
+
+
+// dtrsm_right_lower_transposed_unit
+void TRSM_RLTU_LIBSTR(int m, int n, REAL alpha, struct STRMAT *sA, int ai, int aj, struct STRMAT *sB, int bi, int bj, struct STRMAT *sD, int di, int dj)
+	{
+	int jj;
+	char cl = 'l';
+	char cn = 'n';
+	char cr = 'r';
+	char ct = 't';
+	char cu = 'u';
+	REAL *pA = sA->pA+ai+aj*sA->m;
+	REAL *pB = sB->pA+bi+bj*sB->m;
+	REAL *pD = sD->pA+di+dj*sD->m;
+#if defined(REF_BLAS_BLIS)
+	long long i1 = 1;
+	long long mm = m;
+	long long nn = n;
+	long long lda = sA->m;
+	long long ldb = sB->m;
+	long long ldd = sD->m;
+	if(!(pB==pD))
+		{
+		for(jj=0; jj<n; jj++)
+			COPY(&mm, pB+jj*ldb, &i1, pD+jj*ldd, &i1);
+		}
+	TRSM(&cr, &cl, &ct, &cu, &mm, &nn, &alpha, pA, &lda, pD, &ldd);
+#else
+	int i1 = 1;
+	int lda = sA->m;
+	int ldb = sB->m;
+	int ldd = sD->m;
+	if(!(pB==pD))
+		{
+		for(jj=0; jj<n; jj++)
+			COPY(&m, pB+jj*ldb, &i1, pD+jj*ldd, &i1);
+		}
+	TRSM(&cr, &cl, &ct, &cu, &m, &n, &alpha, pA, &lda, pD, &ldd);
+#endif
+	return;
+	}
+
+
+
+// dtrsm_right_lower_transposed_notunit
+void TRSM_RLTN_LIBSTR(int m, int n, REAL alpha, struct STRMAT *sA, int ai, int aj, struct STRMAT *sB, int bi, int bj, struct STRMAT *sD, int di, int dj)
+	{
+	int jj;
+	char cl = 'l';
+	char cn = 'n';
+	char cr = 'r';
+	char ct = 't';
+	char cu = 'u';
+	REAL *pA = sA->pA+ai+aj*sA->m;
+	REAL *pB = sB->pA+bi+bj*sB->m;
+	REAL *pD = sD->pA+di+dj*sD->m;
+#if defined(REF_BLAS_BLIS)
+	long long i1 = 1;
+	long long mm = m;
+	long long nn = n;
+	long long lda = sA->m;
+	long long ldb = sB->m;
+	long long ldd = sD->m;
+	if(!(pB==pD))
+		{
+		for(jj=0; jj<n; jj++)
+			COPY(&mm, pB+jj*ldb, &i1, pD+jj*ldd, &i1);
+		}
+	TRSM(&cr, &cl, &ct, &cn, &mm, &nn, &alpha, pA, &lda, pD, &ldd);
+#else
+	int i1 = 1;
+	int lda = sA->m;
+	int ldb = sB->m;
+	int ldd = sD->m;
+	if(!(pB==pD))
+		{
+		for(jj=0; jj<n; jj++)
+			COPY(&m, pB+jj*ldb, &i1, pD+jj*ldd, &i1);
+		}
+	TRSM(&cr, &cl, &ct, &cn, &m, &n, &alpha, pA, &lda, pD, &ldd);
+#endif
+	return;
+	}
+
+
+
+// dtrsm_right_upper_transposed_notunit
+void TRSM_RUTN_LIBSTR(int m, int n, REAL alpha, struct STRMAT *sA, int ai, int aj, struct STRMAT *sB, int bi, int bj, struct STRMAT *sD, int di, int dj)
+	{
+	int jj;
+	char cl = 'l';
+	char cn = 'n';
+	char cr = 'r';
+	char ct = 't';
+	char cu = 'u';
+	REAL *pA = sA->pA+ai+aj*sA->m;
+	REAL *pB = sB->pA+bi+bj*sB->m;
+	REAL *pD = sD->pA+di+dj*sD->m;
+#if defined(REF_BLAS_BLIS)
+	long long i1 = 1;
+	long long mm = m;
+	long long nn = n;
+	long long lda = sA->m;
+	long long ldb = sB->m;
+	long long ldd = sD->m;
+	if(!(pB==pD))
+		{
+		for(jj=0; jj<n; jj++)
+			COPY(&mm, pB+jj*ldb, &i1, pD+jj*ldd, &i1);
+		}
+	TRSM(&cr, &cu, &ct, &cn, &mm, &nn, &alpha, pA, &lda, pD, &ldd);
+#else
+	int i1 = 1;
+	int lda = sA->m;
+	int ldb = sB->m;
+	int ldd = sD->m;
+	if(!(pB==pD))
+		{
+		for(jj=0; jj<n; jj++)
+			COPY(&m, pB+jj*ldb, &i1, pD+jj*ldd, &i1);
+		}
+	TRSM(&cr, &cu, &ct, &cn, &m, &n, &alpha, pA, &lda, pD, &ldd);
+#endif
+	return;
+	}
+
+
+
+// dtrmm_right_upper_transposed_notunit (A triangular !!!)
+void TRMM_RUTN_LIBSTR(int m, int n, REAL alpha, struct STRMAT *sA, int ai, int aj, struct STRMAT *sB, int bi, int bj, struct STRMAT *sD, int di, int dj)
+	{
+	int jj;
+	char cl = 'l';
+	char cn = 'n';
+	char cr = 'r';
+	char ct = 't';
+	char cu = 'u';
+	REAL *pA = sA->pA+ai+aj*sA->m;
+	REAL *pB = sB->pA+bi+bj*sB->m;
+	REAL *pD = sD->pA+di+dj*sD->m;
+#if defined(REF_BLAS_BLIS)
+	long long i1 = 1;
+	long long mm = m;
+	long long nn = n;
+	long long lda = sA->m;
+	long long ldb = sB->m;
+	long long ldd = sD->m;
+	if(!(pB==pD))
+		{
+		for(jj=0; jj<n; jj++)
+			COPY(&mm, pB+jj*ldb, &i1, pD+jj*ldd, &i1);
+		}
+	TRMM(&cr, &cu, &ct, &cn, &mm, &nn, &alpha, pA, &lda, pD, &ldd);
+#else
+	int i1 = 1;
+	int lda = sA->m;
+	int ldb = sB->m;
+	int ldd = sD->m;
+	if(!(pB==pD))
+		{
+		for(jj=0; jj<n; jj++)
+			COPY(&m, pB+jj*ldb, &i1, pD+jj*ldd, &i1);
+		}
+	TRMM(&cr, &cu, &ct, &cn, &m, &n, &alpha, pA, &lda, pD, &ldd);
+#endif
+	return;
+	}
+
+
+
+// dtrmm_right_lower_nottransposed_notunit (A triangular !!!)
+void TRMM_RLNN_LIBSTR(int m, int n, REAL alpha, struct STRMAT *sA, int ai, int aj, struct STRMAT *sB, int bi, int bj, struct STRMAT *sD, int di, int dj)
+	{
+	int jj;
+	char cl = 'l';
+	char cn = 'n';
+	char cr = 'r';
+	char ct = 't';
+	char cu = 'u';
+	REAL *pA = sA->pA+ai+aj*sA->m;
+	REAL *pB = sB->pA+bi+bj*sB->m;
+	REAL *pD = sD->pA+di+dj*sD->m;
+#if defined(REF_BLAS_BLIS)
+	long long i1 = 1;
+	long long mm = m;
+	long long nn = n;
+	long long lda = sA->m;
+	long long ldb = sB->m;
+	long long ldd = sD->m;
+	if(!(pB==pD))
+		{
+		for(jj=0; jj<n; jj++)
+			COPY(&mm, pB+jj*ldb, &i1, pD+jj*ldd, &i1);
+		}
+	TRMM(&cr, &cl, &cn, &cn, &mm, &nn, &alpha, pA, &lda, pD, &ldd);
+#else
+	int i1 = 1;
+	int lda = sA->m;
+	int ldb = sB->m;
+	int ldd = sD->m;
+	if(!(pB==pD))
+		{
+		for(jj=0; jj<n; jj++)
+			COPY(&m, pB+jj*ldb, &i1, pD+jj*ldd, &i1);
+		}
+	TRMM(&cr, &cl, &cn, &cn, &m, &n, &alpha, pA, &lda, pD, &ldd);
+#endif
+	return;
+	}
+
+
+
+// dsyrk_lower_nortransposed (allowing for different factors => use dgemm !!!)
+void SYRK_LN_LIBSTR(int m, int k, REAL alpha, struct STRMAT *sA, int ai, int aj, struct STRMAT *sB, int bi, int bj, REAL beta, struct STRMAT *sC, int ci, int cj, struct STRMAT *sD, int di, int dj)
+	{
+	int jj;
+	char cl = 'l';
+	char cn = 'n';
+	char cr = 'r';
+	char ct = 't';
+	char cu = 'u';
+	REAL *pA = sA->pA + ai + aj*sA->m;
+	REAL *pB = sB->pA + bi + bj*sB->m;
+	REAL *pC = sC->pA + ci + cj*sC->m;
+	REAL *pD = sD->pA + di + dj*sD->m;
+#if defined(REF_BLAS_BLIS)
+	long long i1 = 1;
+	long long mm = m;
+	long long kk = k;
+	long long lda = sA->m;
+	long long ldb = sB->m;
+	long long ldc = sC->m;
+	long long ldd = sD->m;
+	if(!(beta==0.0 || pC==pD))
+		{
+		for(jj=0; jj<m; jj++)
+			COPY(&mm, pC+jj*ldc, &i1, pD+jj*ldd, &i1);
+		}
+	if(pA==pB)
+		{
+		SYRK(&cl, &cn, &mm, &kk, &alpha, pA, &lda, &beta, pD, &ldd);
+		}
+	else
+		{
+		GEMM(&cn, &ct, &mm, &mm, &kk, &alpha, pA, &lda, pB, &ldb, &beta, pD, &ldd);
+		}
+#else
+	int i1 = 1;
+	int lda = sA->m;
+	int ldb = sB->m;
+	int ldc = sC->m;
+	int ldd = sD->m;
+	if(!(beta==0.0 || pC==pD))
+		{
+		for(jj=0; jj<m; jj++)
+			COPY(&m, pC+jj*sC->m, &i1, pD+jj*sD->m, &i1);
+		}
+	if(pA==pB)
+		{
+		SYRK(&cl, &cn, &m, &k, &alpha, pA, &lda, &beta, pD, &ldd);
+		}
+	else
+		{
+		GEMM(&cn, &ct, &m, &m, &k, &alpha, pA, &lda, pB, &ldb, &beta, pD, &ldd);
+		}
+#endif
+	return;
+	}
+
+// dsyrk_lower_nortransposed (allowing for different factors => use dgemm !!!)
+void SYRK_LN_MN_LIBSTR(int m, int n, int k, REAL alpha, struct STRMAT *sA, int ai, int aj, struct STRMAT *sB, int bi, int bj, REAL beta, struct STRMAT *sC, int ci, int cj, struct STRMAT *sD, int di, int dj)
+	{
+	int jj;
+	char cl = 'l';
+	char cn = 'n';
+	char cr = 'r';
+	char ct = 't';
+	char cu = 'u';
+	REAL *pA = sA->pA + ai + aj*sA->m;
+	REAL *pB = sB->pA + bi + bj*sB->m;
+	REAL *pC = sC->pA + ci + cj*sC->m;
+	REAL *pD = sD->pA + di + dj*sD->m;
+#if defined(REF_BLAS_BLIS)
+	long long i1 = 1;
+	long long mm = m;
+	long long nn = n;
+	long long kk = k;
+	long long mmn = mm-nn;
+	long long lda = sA->m;
+	long long ldb = sB->m;
+	long long ldc = sC->m;
+	long long ldd = sD->m;
+	if(!(beta==0.0 || pC==pD))
+		{
+		for(jj=0; jj<n; jj++)
+			COPY(&mm, pC+jj*ldc, &i1, pD+jj*ldd, &i1);
+		}
+	if(pA==pB)
+		{
+		SYRK(&cl, &cn, &nn, &kk, &alpha, pA, &lda, &beta, pD, &ldd);
+		GEMM(&cn, &ct, &mmn, &nn, &kk, &alpha, pA+n, &lda, pB, &ldb, &beta, pD+n, &ldd);
+		}
+	else
+		{
+		GEMM(&cn, &ct, &mm, &nn, &kk, &alpha, pA, &lda, pB, &ldb, &beta, pD, &ldd);
+		}
+#else
+	int i1 = 1;
+	int mmn = m-n;
+	int lda = sA->m;
+	int ldb = sB->m;
+	int ldc = sC->m;
+	int ldd = sD->m;
+	if(!(beta==0.0 || pC==pD))
+		{
+		for(jj=0; jj<n; jj++)
+			COPY(&m, pC+jj*sC->m, &i1, pD+jj*sD->m, &i1);
+		}
+	if(pA==pB)
+		{
+		SYRK(&cl, &cn, &n, &k, &alpha, pA, &lda, &beta, pD, &ldd);
+		GEMM(&cn, &ct, &mmn, &n, &k, &alpha, pA+n, &lda, pB, &ldb, &beta, pD+n, &ldd);
+		}
+	else
+		{
+		GEMM(&cn, &ct, &m, &n, &k, &alpha, pA, &lda, pB, &ldb, &beta, pD, &ldd);
+		}
+#endif
+	return;
+	}
+
+#else
+
+#error : wrong LA choice
+
+#endif
+
+
+
+
+