Squashed 'third_party/blasfeo/' content from commit 2a828ca

Change-Id: If1c3caa4799b2d4eb287ef83fa17043587ef07a3
git-subtree-dir: third_party/blasfeo
git-subtree-split: 2a828ca5442108c4c58e4b42b061a0469043f6ea
diff --git a/blas/x_lapack_lib.c b/blas/x_lapack_lib.c
new file mode 100644
index 0000000..762a8a0
--- /dev/null
+++ b/blas/x_lapack_lib.c
@@ -0,0 +1,2112 @@
+/**************************************************************************************************
+*                                                                                                 *
+* This file is part of BLASFEO.                                                                   *
+*                                                                                                 *
+* BLASFEO -- BLAS For Embedded Optimization.                                                      *
+* Copyright (C) 2016-2017 by Gianluca Frison.                                                     *
+* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl.              *
+* All rights reserved.                                                                            *
+*                                                                                                 *
+* HPMPC is free software; you can redistribute it and/or                                          *
+* modify it under the terms of the GNU Lesser General Public                                      *
+* License as published by the Free Software Foundation; either                                    *
+* version 2.1 of the License, or (at your option) any later version.                              *
+*                                                                                                 *
+* HPMPC is distributed in the hope that it will be useful,                                        *
+* but WITHOUT ANY WARRANTY; without even the implied warranty of                                  *
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.                                            *
+* See the GNU Lesser General Public License for more details.                                     *
+*                                                                                                 *
+* You should have received a copy of the GNU Lesser General Public                                *
+* License along with HPMPC; if not, write to the Free Software                                    *
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA                  *
+*                                                                                                 *
+* Author: Gianluca Frison, giaf (at) dtu.dk                                                       *
+*                          gianluca.frison (at) imtek.uni-freiburg.de                             *
+*                                                                                                 *
+**************************************************************************************************/
+
+
+
+#if defined(LA_REFERENCE)
+
+
+
+// dpotrf
+void POTRF_L_LIBSTR(int m, struct STRMAT *sC, int ci, int cj, struct STRMAT *sD, int di, int dj)
+	{
+	if(m<=0)
+		return;
+	int ii, jj, kk;
+	REAL
+		f_00_inv, 
+		f_10, f_11_inv,
+		c_00, c_01,
+		c_10, c_11;
+	int ldc = sC->m;
+	int ldd = sD->m;
+	REAL *pC = sC->pA + ci + cj*ldc;
+	REAL *pD = sD->pA + di + dj*ldd;
+	REAL *dD = sD->dA;
+	if(di==0 & dj==0)
+		sD->use_dA = 1;
+	else
+		sD->use_dA = 0;
+	jj = 0;
+	for(; jj<m-1; jj+=2)
+		{
+		// factorize diagonal
+		c_00 = pC[jj+0+ldc*(jj+0)];;
+		c_10 = pC[jj+1+ldc*(jj+0)];;
+		c_11 = pC[jj+1+ldc*(jj+1)];;
+		for(kk=0; kk<jj; kk++)
+			{
+			c_00 -= pD[jj+0+ldd*kk] * pD[jj+0+ldd*kk];
+			c_10 -= pD[jj+1+ldd*kk] * pD[jj+0+ldd*kk];
+			c_11 -= pD[jj+1+ldd*kk] * pD[jj+1+ldd*kk];
+			}
+		if(c_00>0)
+			{
+			f_00_inv = 1.0/sqrt(c_00);
+			}
+		else
+			{
+			f_00_inv = 0.0;
+			}
+		dD[jj+0] = f_00_inv;
+		pD[jj+0+ldd*(jj+0)] = c_00 * f_00_inv;
+		f_10 = c_10 * f_00_inv;
+		pD[jj+1+ldd*(jj+0)] = f_10;
+		c_11 -= f_10 * f_10;
+		if(c_11>0)
+			{
+			f_11_inv = 1.0/sqrt(c_11);
+			}
+		else
+			{
+			f_11_inv = 0.0;
+			}
+		dD[jj+1] = f_11_inv;
+		pD[jj+1+ldd*(jj+1)] = c_11 * f_11_inv;
+		// solve lower
+		ii = jj+2;
+		for(; ii<m-1; ii+=2)
+			{
+			c_00 = pC[ii+0+ldc*(jj+0)];
+			c_10 = pC[ii+1+ldc*(jj+0)];
+			c_01 = pC[ii+0+ldc*(jj+1)];
+			c_11 = pC[ii+1+ldc*(jj+1)];
+			for(kk=0; kk<jj; kk++)
+				{
+				c_00 -= pD[ii+0+ldd*kk] * pD[jj+0+ldd*kk];
+				c_10 -= pD[ii+1+ldd*kk] * pD[jj+0+ldd*kk];
+				c_01 -= pD[ii+0+ldd*kk] * pD[jj+1+ldd*kk];
+				c_11 -= pD[ii+1+ldd*kk] * pD[jj+1+ldd*kk];
+				}
+			c_00 *= f_00_inv;
+			c_10 *= f_00_inv;
+			pD[ii+0+ldd*(jj+0)] = c_00;
+			pD[ii+1+ldd*(jj+0)] = c_10;
+			c_01 -= c_00 * f_10;
+			c_11 -= c_10 * f_10;
+			pD[ii+0+ldd*(jj+1)] = c_01 * f_11_inv;
+			pD[ii+1+ldd*(jj+1)] = c_11 * f_11_inv;
+			}
+		for(; ii<m; ii++)
+			{
+			c_00 = pC[ii+0+ldc*(jj+0)];
+			c_01 = pC[ii+0+ldc*(jj+1)];
+			for(kk=0; kk<jj; kk++)
+				{
+				c_00 -= pD[ii+0+ldd*kk] * pD[jj+0+ldd*kk];
+				c_01 -= pD[ii+0+ldd*kk] * pD[jj+1+ldd*kk];
+				}
+			c_00 *= f_00_inv;
+			pD[ii+0+ldd*(jj+0)] = c_00;
+			c_01 -= c_00 * f_10;
+			pD[ii+0+ldd*(jj+1)] = c_01 * f_11_inv;
+			}
+		}
+	for(; jj<m; jj++)
+		{
+		// factorize diagonal
+		c_00 = pC[jj+ldc*jj];;
+		for(kk=0; kk<jj; kk++)
+			{
+			c_00 -= pD[jj+ldd*kk] * pD[jj+ldd*kk];
+			}
+		if(c_00>0)
+			{
+			f_00_inv = 1.0/sqrt(c_00);
+			}
+		else
+			{
+			f_00_inv = 0.0;
+			}
+		dD[jj] = f_00_inv;
+		pD[jj+ldd*jj] = c_00 * f_00_inv;
+		// solve lower
+//		for(ii=jj+1; ii<m; ii++)
+//			{
+//			c_00 = pC[ii+ldc*jj];
+//			for(kk=0; kk<jj; kk++)
+//				{
+//				c_00 -= pD[ii+ldd*kk] * pD[jj+ldd*kk];
+//				}
+//			pD[ii+ldd*jj] = c_00 * f_00_inv;
+//			}
+		}
+	return;
+	}
+
+
+
+// dpotrf
+void POTRF_L_MN_LIBSTR(int m, int n, struct STRMAT *sC, int ci, int cj, struct STRMAT *sD, int di, int dj)
+	{
+	if(m<=0 | n<=0)
+		return;
+	int ii, jj, kk;
+	REAL
+		f_00_inv, 
+		f_10, f_11_inv,
+		c_00, c_01,
+		c_10, c_11;
+	int ldc = sC->m;
+	int ldd = sD->m;
+	REAL *pC = sC->pA + ci + cj*ldc;
+	REAL *pD = sD->pA + di + dj*ldd;
+	REAL *dD = sD->dA;
+	if(di==0 & dj==0)
+		sD->use_dA = 1;
+	else
+		sD->use_dA = 0;
+	jj = 0;
+	for(; jj<n-1; jj+=2)
+		{
+		// factorize diagonal
+		c_00 = pC[jj+0+ldc*(jj+0)];;
+		c_10 = pC[jj+1+ldc*(jj+0)];;
+		c_11 = pC[jj+1+ldc*(jj+1)];;
+		for(kk=0; kk<jj; kk++)
+			{
+			c_00 -= pD[jj+0+ldd*kk] * pD[jj+0+ldd*kk];
+			c_10 -= pD[jj+1+ldd*kk] * pD[jj+0+ldd*kk];
+			c_11 -= pD[jj+1+ldd*kk] * pD[jj+1+ldd*kk];
+			}
+		if(c_00>0)
+			{
+			f_00_inv = 1.0/sqrt(c_00);
+			}
+		else
+			{
+			f_00_inv = 0.0;
+			}
+		dD[jj+0] = f_00_inv;
+		pD[jj+0+ldd*(jj+0)] = c_00 * f_00_inv;
+		f_10 = c_10 * f_00_inv;
+		pD[jj+1+ldd*(jj+0)] = f_10;
+		c_11 -= f_10 * f_10;
+		if(c_11>0)
+			{
+			f_11_inv = 1.0/sqrt(c_11);
+			}
+		else
+			{
+			f_11_inv = 0.0;
+			}
+		dD[jj+1] = f_11_inv;
+		pD[jj+1+ldd*(jj+1)] = c_11 * f_11_inv;
+		// solve lower
+		ii = jj+2;
+		for(; ii<m-1; ii+=2)
+			{
+			c_00 = pC[ii+0+ldc*(jj+0)];
+			c_10 = pC[ii+1+ldc*(jj+0)];
+			c_01 = pC[ii+0+ldc*(jj+1)];
+			c_11 = pC[ii+1+ldc*(jj+1)];
+			for(kk=0; kk<jj; kk++)
+				{
+				c_00 -= pD[ii+0+ldd*kk] * pD[jj+0+ldd*kk];
+				c_10 -= pD[ii+1+ldd*kk] * pD[jj+0+ldd*kk];
+				c_01 -= pD[ii+0+ldd*kk] * pD[jj+1+ldd*kk];
+				c_11 -= pD[ii+1+ldd*kk] * pD[jj+1+ldd*kk];
+				}
+			c_00 *= f_00_inv;
+			c_10 *= f_00_inv;
+			pD[ii+0+ldd*(jj+0)] = c_00;
+			pD[ii+1+ldd*(jj+0)] = c_10;
+			c_01 -= c_00 * f_10;
+			c_11 -= c_10 * f_10;
+			pD[ii+0+ldd*(jj+1)] = c_01 * f_11_inv;
+			pD[ii+1+ldd*(jj+1)] = c_11 * f_11_inv;
+			}
+		for(; ii<m; ii++)
+			{
+			c_00 = pC[ii+0+ldc*(jj+0)];
+			c_01 = pC[ii+0+ldc*(jj+1)];
+			for(kk=0; kk<jj; kk++)
+				{
+				c_00 -= pD[ii+0+ldd*kk] * pD[jj+0+ldd*kk];
+				c_01 -= pD[ii+0+ldd*kk] * pD[jj+1+ldd*kk];
+				}
+			c_00 *= f_00_inv;
+			pD[ii+0+ldd*(jj+0)] = c_00;
+			c_01 -= c_00 * f_10;
+			pD[ii+0+ldd*(jj+1)] = c_01 * f_11_inv;
+			}
+		}
+	for(; jj<n; jj++)
+		{
+		// factorize diagonal
+		c_00 = pC[jj+ldc*jj];;
+		for(kk=0; kk<jj; kk++)
+			{
+			c_00 -= pD[jj+ldd*kk] * pD[jj+ldd*kk];
+			}
+		if(c_00>0)
+			{
+			f_00_inv = 1.0/sqrt(c_00);
+			}
+		else
+			{
+			f_00_inv = 0.0;
+			}
+		dD[jj] = f_00_inv;
+		pD[jj+ldd*jj] = c_00 * f_00_inv;
+		// solve lower
+		for(ii=jj+1; ii<m; ii++)
+			{
+			c_00 = pC[ii+ldc*jj];
+			for(kk=0; kk<jj; kk++)
+				{
+				c_00 -= pD[ii+ldd*kk] * pD[jj+ldd*kk];
+				}
+			pD[ii+ldd*jj] = c_00 * f_00_inv;
+			}
+		}
+	return;
+	}
+
+
+
+// dsyrk dpotrf
+void SYRK_POTRF_LN_LIBSTR(int m, int n, int k, struct STRMAT *sA, int ai, int aj, struct STRMAT *sB, int bi, int bj, struct STRMAT *sC, int ci, int cj, struct STRMAT *sD, int di, int dj)
+	{
+	int ii, jj, kk;
+	REAL
+		f_00_inv, 
+		f_10, f_11_inv,
+		c_00, c_01,
+		c_10, c_11;
+	int lda = sA->m;
+	int ldb = sB->m;
+	int ldc = sC->m;
+	int ldd = sD->m;
+	REAL *pA = sA->pA + ai + aj*lda;
+	REAL *pB = sB->pA + bi + bj*ldb;
+	REAL *pC = sC->pA + ci + cj*ldc;
+	REAL *pD = sD->pA + di + dj*ldd;
+	REAL *dD = sD->dA;
+	if(di==0 & dj==0)
+		sD->use_dA = 1;
+	else
+		sD->use_dA = 0;
+	jj = 0;
+	for(; jj<n-1; jj+=2)
+		{
+		// factorize diagonal
+		c_00 = pC[jj+0+ldc*(jj+0)];;
+		c_10 = pC[jj+1+ldc*(jj+0)];;
+		c_11 = pC[jj+1+ldc*(jj+1)];;
+		for(kk=0; kk<k; kk++)
+			{
+			c_00 += pA[jj+0+lda*kk] * pB[jj+0+ldb*kk];
+			c_10 += pA[jj+1+lda*kk] * pB[jj+0+ldb*kk];
+			c_11 += pA[jj+1+lda*kk] * pB[jj+1+ldb*kk];
+			}
+		for(kk=0; kk<jj; kk++)
+			{
+			c_00 -= pD[jj+0+ldd*kk] * pD[jj+0+ldd*kk];
+			c_10 -= pD[jj+1+ldd*kk] * pD[jj+0+ldd*kk];
+			c_11 -= pD[jj+1+ldd*kk] * pD[jj+1+ldd*kk];
+			}
+		if(c_00>0)
+			{
+			f_00_inv = 1.0/sqrt(c_00);
+			}
+		else
+			{
+			f_00_inv = 0.0;
+			}
+		dD[jj+0] = f_00_inv;
+		pD[jj+0+ldd*(jj+0)] = c_00 * f_00_inv;
+		f_10 = c_10 * f_00_inv;
+		pD[jj+1+ldd*(jj+0)] = f_10;
+		c_11 -= f_10 * f_10;
+		if(c_11>0)
+			{
+			f_11_inv = 1.0/sqrt(c_11);
+			}
+		else
+			{
+			f_11_inv = 0.0;
+			}
+		dD[jj+1] = f_11_inv;
+		pD[jj+1+ldd*(jj+1)] = c_11 * f_11_inv;
+		// solve lower
+		ii = jj+2;
+		for(; ii<m-1; ii+=2)
+			{
+			c_00 = pC[ii+0+ldc*(jj+0)];
+			c_10 = pC[ii+1+ldc*(jj+0)];
+			c_01 = pC[ii+0+ldc*(jj+1)];
+			c_11 = pC[ii+1+ldc*(jj+1)];
+			for(kk=0; kk<k; kk++)
+				{
+				c_00 += pA[ii+0+lda*kk] * pB[jj+0+ldb*kk];
+				c_10 += pA[ii+1+lda*kk] * pB[jj+0+ldb*kk];
+				c_01 += pA[ii+0+lda*kk] * pB[jj+1+ldb*kk];
+				c_11 += pA[ii+1+lda*kk] * pB[jj+1+ldb*kk];
+				}
+			for(kk=0; kk<jj; kk++)
+				{
+				c_00 -= pD[ii+0+ldd*kk] * pD[jj+0+ldd*kk];
+				c_10 -= pD[ii+1+ldd*kk] * pD[jj+0+ldd*kk];
+				c_01 -= pD[ii+0+ldd*kk] * pD[jj+1+ldd*kk];
+				c_11 -= pD[ii+1+ldd*kk] * pD[jj+1+ldd*kk];
+				}
+			c_00 *= f_00_inv;
+			c_10 *= f_00_inv;
+			pD[ii+0+ldd*(jj+0)] = c_00;
+			pD[ii+1+ldd*(jj+0)] = c_10;
+			c_01 -= c_00 * f_10;
+			c_11 -= c_10 * f_10;
+			pD[ii+0+ldd*(jj+1)] = c_01 * f_11_inv;
+			pD[ii+1+ldd*(jj+1)] = c_11 * f_11_inv;
+			}
+		for(; ii<m; ii++)
+			{
+			c_00 = pC[ii+0+ldc*(jj+0)];
+			c_01 = pC[ii+0+ldc*(jj+1)];
+			for(kk=0; kk<k; kk++)
+				{
+				c_00 += pA[ii+0+lda*kk] * pB[jj+0+ldb*kk];
+				c_01 += pA[ii+0+lda*kk] * pB[jj+1+ldb*kk];
+				}
+			for(kk=0; kk<jj; kk++)
+				{
+				c_00 -= pD[ii+0+ldd*kk] * pD[jj+0+ldd*kk];
+				c_01 -= pD[ii+0+ldd*kk] * pD[jj+1+ldd*kk];
+				}
+			c_00 *= f_00_inv;
+			pD[ii+0+ldd*(jj+0)] = c_00;
+			c_01 -= c_00 * f_10;
+			pD[ii+0+ldd*(jj+1)] = c_01 * f_11_inv;
+			}
+		}
+	for(; jj<n; jj++)
+		{
+		// factorize diagonal
+		c_00 = pC[jj+ldc*jj];;
+		for(kk=0; kk<k; kk++)
+			{
+			c_00 += pA[jj+lda*kk] * pB[jj+ldb*kk];
+			}
+		for(kk=0; kk<jj; kk++)
+			{
+			c_00 -= pD[jj+ldd*kk] * pD[jj+ldd*kk];
+			}
+		if(c_00>0)
+			{
+			f_00_inv = 1.0/sqrt(c_00);
+			}
+		else
+			{
+			f_00_inv = 0.0;
+			}
+		dD[jj] = f_00_inv;
+		pD[jj+ldd*jj] = c_00 * f_00_inv;
+		// solve lower
+		for(ii=jj+1; ii<m; ii++)
+			{
+			c_00 = pC[ii+ldc*jj];
+			for(kk=0; kk<k; kk++)
+				{
+				c_00 += pA[ii+lda*kk] * pB[jj+ldb*kk];
+				}
+			for(kk=0; kk<jj; kk++)
+				{
+				c_00 -= pD[ii+ldd*kk] * pD[jj+ldd*kk];
+				}
+			pD[ii+ldd*jj] = c_00 * f_00_inv;
+			}
+		}
+	return;
+	}
+
+
+
+// dgetrf without pivoting
+void GETF2_NOPIVOT(int m, int n, REAL *A, int lda, REAL *dA)
+	{
+	int ii, jj, kk, itmp0, itmp1;
+	int iimax = m<n ? m : n;
+	int i1 = 1;
+	REAL dtmp;
+	REAL dm1 = -1.0;
+
+	for(ii=0; ii<iimax; ii++)
+		{
+		itmp0 = m-ii-1;
+		dtmp = 1.0/A[ii+lda*ii];
+		dA[ii] = dtmp;
+		for(jj=0; jj<itmp0; jj++)
+			{
+			A[ii+1+jj+lda*ii] *= dtmp;
+			}
+		itmp1 = n-ii-1;
+		for(jj=0; jj<itmp1; jj++)
+			{
+			for(kk=0; kk<itmp0; kk++)
+				{
+				A[(ii+1+kk)+lda*(ii+1+jj)] -= A[(ii+1+kk)+lda*ii] * A[ii+lda*(ii+1+jj)];
+				}
+			}
+		}
+	return;
+	}
+
+
+
+// dgetrf without pivoting
+void GETRF_NOPIVOT_LIBSTR(int m, int n, struct STRMAT *sC, int ci, int cj, struct STRMAT *sD, int di, int dj)
+	{
+	int ii, jj, kk;
+//	int i1 = 1;
+//	REAL d1 = 1.0;
+	REAL
+		d_00_inv, d_11_inv,
+		d_00, d_01,
+		d_10, d_11;
+	int ldc = sC->m;
+	int ldd = sD->m;
+	REAL *pC = sC->pA + ci + cj*ldc;
+	REAL *pD = sD->pA + di + dj*ldd;
+	REAL *dD = sD->dA;
+	if(di==0 & dj==0)
+		sD->use_dA = 1;
+	else
+		sD->use_dA = 0;
+#if 1
+	jj = 0;
+	for(; jj<n-1; jj+=2)
+		{
+		// upper
+		ii = 0;
+		for(; ii<jj-1; ii+=2)
+			{
+			// correct upper
+			d_00 = pC[(ii+0)+ldc*(jj+0)];
+			d_10 = pC[(ii+1)+ldc*(jj+0)];
+			d_01 = pC[(ii+0)+ldc*(jj+1)];
+			d_11 = pC[(ii+1)+ldc*(jj+1)];
+			for(kk=0; kk<ii; kk++)
+				{
+				d_00 -= pD[(ii+0)+ldd*kk] * pD[kk+ldd*(jj+0)];
+				d_10 -= pD[(ii+1)+ldd*kk] * pD[kk+ldd*(jj+0)];
+				d_01 -= pD[(ii+0)+ldd*kk] * pD[kk+ldd*(jj+1)];
+				d_11 -= pD[(ii+1)+ldd*kk] * pD[kk+ldd*(jj+1)];
+				}
+			// solve upper
+			d_10 -= pD[(ii+1)+ldd*kk] * d_00;
+			d_11 -= pD[(ii+1)+ldd*kk] * d_01;
+			pD[(ii+0)+ldd*(jj+0)] = d_00;
+			pD[(ii+1)+ldd*(jj+0)] = d_10;
+			pD[(ii+0)+ldd*(jj+1)] = d_01;
+			pD[(ii+1)+ldd*(jj+1)] = d_11;
+			}
+		for(; ii<jj; ii++)
+			{
+			// correct upper
+			d_00 = pC[(ii+0)+ldc*(jj+0)];
+			d_01 = pC[(ii+0)+ldc*(jj+1)];
+			for(kk=0; kk<ii; kk++)
+				{
+				d_00 -= pD[(ii+0)+ldd*kk] * pD[kk+ldd*(jj+0)];
+				d_01 -= pD[(ii+0)+ldd*kk] * pD[kk+ldd*(jj+1)];
+				}
+			// solve upper
+			pD[(ii+0)+ldd*(jj+0)] = d_00;
+			pD[(ii+0)+ldd*(jj+1)] = d_01;
+			}
+		// diagonal
+		ii = jj;
+		if(ii<m-1)
+			{
+			// correct diagonal
+			d_00 = pC[(ii+0)+ldc*(jj+0)];
+			d_10 = pC[(ii+1)+ldc*(jj+0)];
+			d_01 = pC[(ii+0)+ldc*(jj+1)];
+			d_11 = pC[(ii+1)+ldc*(jj+1)];
+			for(kk=0; kk<ii; kk++)
+				{
+				d_00 -= pD[(ii+0)+ldd*kk] * pD[kk+ldd*(jj+0)];
+				d_10 -= pD[(ii+1)+ldd*kk] * pD[kk+ldd*(jj+0)];
+				d_01 -= pD[(ii+0)+ldd*kk] * pD[kk+ldd*(jj+1)];
+				d_11 -= pD[(ii+1)+ldd*kk] * pD[kk+ldd*(jj+1)];
+				}
+			// factorize diagonal
+			d_00_inv = 1.0/d_00;
+			d_10 *= d_00_inv;
+			d_11 -= d_10 * d_01;
+			d_11_inv = 1.0/d_11;
+			pD[(ii+0)+ldd*(jj+0)] = d_00;
+			pD[(ii+1)+ldd*(jj+0)] = d_10;
+			pD[(ii+0)+ldd*(jj+1)] = d_01;
+			pD[(ii+1)+ldd*(jj+1)] = d_11;
+			dD[ii+0] = d_00_inv;
+			dD[ii+1] = d_11_inv;
+			ii += 2;
+			}
+		else if(ii<m)
+			{
+			// correct diagonal
+			d_00 = pC[(ii+0)+ldc*(jj+0)];
+			d_01 = pC[(ii+0)+ldc*(jj+1)];
+			for(kk=0; kk<ii; kk++)
+				{
+				d_00 -= pD[(ii+0)+ldd*kk] * pD[kk+ldd*(jj+0)];
+				d_01 -= pD[(ii+0)+ldd*kk] * pD[kk+ldd*(jj+1)];
+				}
+			// factorize diagonal
+			d_00_inv = 1.0/d_00;
+			pD[(ii+0)+ldd*(jj+0)] = d_00;
+			pD[(ii+0)+ldd*(jj+1)] = d_01;
+			dD[ii+0] = d_00_inv;
+			ii += 1;
+			}
+		// lower
+		for(; ii<m-1; ii+=2)
+			{
+			// correct lower
+			d_00 = pC[(ii+0)+ldc*(jj+0)];
+			d_10 = pC[(ii+1)+ldc*(jj+0)];
+			d_01 = pC[(ii+0)+ldc*(jj+1)];
+			d_11 = pC[(ii+1)+ldc*(jj+1)];
+			for(kk=0; kk<jj; kk++)
+				{
+				d_00 -= pD[(ii+0)+ldd*kk] * pD[kk+ldd*(jj+0)];
+				d_10 -= pD[(ii+1)+ldd*kk] * pD[kk+ldd*(jj+0)];
+				d_01 -= pD[(ii+0)+ldd*kk] * pD[kk+ldd*(jj+1)];
+				d_11 -= pD[(ii+1)+ldd*kk] * pD[kk+ldd*(jj+1)];
+				}
+			// solve lower
+			d_00 *= d_00_inv;
+			d_10 *= d_00_inv;
+			d_01 -= d_00 * pD[kk+ldd*(jj+1)];
+			d_11 -= d_10 * pD[kk+ldd*(jj+1)];
+			d_01 *= d_11_inv;
+			d_11 *= d_11_inv;
+			pD[(ii+0)+ldd*(jj+0)] = d_00;
+			pD[(ii+1)+ldd*(jj+0)] = d_10;
+			pD[(ii+0)+ldd*(jj+1)] = d_01;
+			pD[(ii+1)+ldd*(jj+1)] = d_11;
+			}
+		for(; ii<m; ii++)
+			{
+			// correct lower
+			d_00 = pC[(ii+0)+ldc*(jj+0)];
+			d_01 = pC[(ii+0)+ldc*(jj+1)];
+			for(kk=0; kk<jj; kk++)
+				{
+				d_00 -= pD[(ii+0)+ldd*kk] * pD[kk+ldd*(jj+0)];
+				d_01 -= pD[(ii+0)+ldd*kk] * pD[kk+ldd*(jj+1)];
+				}
+			// solve lower
+			d_00 *= d_00_inv;
+			d_01 -= d_00 * pD[kk+ldd*(jj+1)];
+			d_01 *= d_11_inv;
+			pD[(ii+0)+ldd*(jj+0)] = d_00;
+			pD[(ii+0)+ldd*(jj+1)] = d_01;
+			}
+		}
+	for(; jj<n; jj++)
+		{
+		// upper
+		ii = 0;
+		for(; ii<jj-1; ii+=2)
+			{
+			// correct upper
+			d_00 = pC[(ii+0)+ldc*jj];
+			d_10 = pC[(ii+1)+ldc*jj];
+			for(kk=0; kk<ii; kk++)
+				{
+				d_00 -= pD[(ii+0)+ldd*kk] * pD[kk+ldd*jj];
+				d_10 -= pD[(ii+1)+ldd*kk] * pD[kk+ldd*jj];
+				}
+			// solve upper
+			d_10 -= pD[(ii+1)+ldd*kk] * d_00;
+			pD[(ii+0)+ldd*jj] = d_00;
+			pD[(ii+1)+ldd*jj] = d_10;
+			}
+		for(; ii<jj; ii++)
+			{
+			// correct upper
+			d_00 = pC[(ii+0)+ldc*jj];
+			for(kk=0; kk<ii; kk++)
+				{
+				d_00 -= pD[(ii+0)+ldd*kk] * pD[kk+ldd*jj];
+				}
+			// solve upper
+			pD[(ii+0)+ldd*jj] = d_00;
+			}
+		// diagonal
+		ii = jj;
+		if(ii<m-1)
+			{
+			// correct diagonal
+			d_00 = pC[(ii+0)+ldc*jj];
+			d_10 = pC[(ii+1)+ldc*jj];
+			for(kk=0; kk<ii; kk++)
+				{
+				d_00 -= pD[(ii+0)+ldd*kk] * pD[kk+ldd*jj];
+				d_10 -= pD[(ii+1)+ldd*kk] * pD[kk+ldd*jj];
+				}
+			// factorize diagonal
+			d_00_inv = 1.0/d_00;
+			d_10 *= d_00_inv;
+			pD[(ii+0)+ldd*jj] = d_00;
+			pD[(ii+1)+ldd*jj] = d_10;
+			dD[ii+0] = d_00_inv;
+			ii += 2;
+			}
+		else if(ii<m)
+			{
+			// correct diagonal
+			d_00 = pC[(ii+0)+ldc*jj];
+			for(kk=0; kk<ii; kk++)
+				{
+				d_00 -= pD[(ii+0)+ldd*kk] * pD[kk+ldd*jj];
+				}
+			// factorize diagonal
+			d_00_inv = 1.0/d_00;
+			pD[(ii+0)+ldd*jj] = d_00;
+			dD[ii+0] = d_00_inv;
+			ii += 1;
+			}
+		// lower
+		for(; ii<m-1; ii+=2)
+			{
+			// correct lower
+			d_00 = pC[(ii+0)+ldc*jj];
+			d_10 = pC[(ii+1)+ldc*jj];
+			for(kk=0; kk<jj; kk++)
+				{
+				d_00 -= pD[(ii+0)+ldd*kk] * pD[kk+ldd*jj];
+				d_10 -= pD[(ii+1)+ldd*kk] * pD[kk+ldd*jj];
+				}
+			// solve lower
+			d_00 *= d_00_inv;
+			d_10 *= d_00_inv;
+			pD[(ii+0)+ldd*jj] = d_00;
+			pD[(ii+1)+ldd*jj] = d_10;
+			}
+		for(; ii<m; ii++)
+			{
+			// correct lower
+			d_00 = pC[(ii+0)+ldc*jj];
+			for(kk=0; kk<jj; kk++)
+				{
+				d_00 -= pD[(ii+0)+ldd*kk] * pD[kk+ldd*jj];
+				}
+			// solve lower
+			d_00 *= d_00_inv;
+			pD[(ii+0)+ldd*jj] = d_00;
+			}
+		}
+#else
+	if(pC!=pD)
+		{
+		for(jj=0; jj<n; jj++)
+			{
+			for(ii=0; ii<m; ii++)
+				{
+				pD[ii+ldd*jj] = pC[ii+ldc*jj];
+				}
+			}
+		}
+	GETF2_NOPIVOT(m, n, pD, ldd, dD);
+#endif
+	return;
+	}
+
+
+
+// dgetrf pivoting
+void GETRF_LIBSTR(int m, int n, struct STRMAT *sC, int ci, int cj, struct STRMAT *sD, int di, int dj, int *ipiv)
+	{
+	int ii, i0, jj, kk, ip, itmp0, itmp1;
+	REAL dtmp, dmax;
+	REAL
+		d_00_inv, d_11_inv,
+		d_00, d_01,
+		d_10, d_11;
+	int i1 = 1;
+	REAL d1 = 1.0;
+	int ldc = sC->m;
+	int ldd = sD->m;
+	REAL *pC = sC->pA+ci+cj*ldc;
+	REAL *pD = sD->pA+di+dj*ldd;
+	REAL *dD = sD->dA;
+	if(di==0 & dj==0)
+		sD->use_dA = 1;
+	else
+		sD->use_dA = 0;
+	// copy if needed
+	if(pC!=pD)
+		{
+		for(jj=0; jj<n; jj++)
+			{
+			for(ii=0; ii<m; ii++)
+				{
+				pD[ii+ldd*jj] = pC[ii+ldc*jj];
+				}
+			}
+		}
+	// factorize
+#if 1
+	jj = 0;
+	for(; jj<n-1; jj+=2)
+		{
+		ii = 0;
+		for(; ii<jj-1; ii+=2)
+			{
+			// correct upper
+			d_00 = pD[(ii+0)+ldd*(jj+0)];
+			d_10 = pD[(ii+1)+ldd*(jj+0)];
+			d_01 = pD[(ii+0)+ldd*(jj+1)];
+			d_11 = pD[(ii+1)+ldd*(jj+1)];
+			for(kk=0; kk<ii; kk++)
+				{
+				d_00 -= pD[(ii+0)+ldd*kk] * pD[kk+ldd*(jj+0)];
+				d_10 -= pD[(ii+1)+ldd*kk] * pD[kk+ldd*(jj+0)];
+				d_01 -= pD[(ii+0)+ldd*kk] * pD[kk+ldd*(jj+1)];
+				d_11 -= pD[(ii+1)+ldd*kk] * pD[kk+ldd*(jj+1)];
+				}
+			// solve upper
+			d_10 -= pD[(ii+1)+ldd*kk] * d_00;
+			d_11 -= pD[(ii+1)+ldd*kk] * d_01;
+			pD[(ii+0)+ldd*(jj+0)] = d_00;
+			pD[(ii+1)+ldd*(jj+0)] = d_10;
+			pD[(ii+0)+ldd*(jj+1)] = d_01;
+			pD[(ii+1)+ldd*(jj+1)] = d_11;
+			}
+		for(; ii<jj; ii++)
+			{
+			// correct upper
+			d_00 = pD[(ii+0)+ldd*(jj+0)];
+			d_01 = pD[(ii+0)+ldd*(jj+1)];
+			for(kk=0; kk<ii; kk++)
+				{
+				d_00 -= pD[(ii+0)+ldd*kk] * pD[kk+ldd*(jj+0)];
+				d_01 -= pD[(ii+0)+ldd*kk] * pD[kk+ldd*(jj+1)];
+				}
+			// solve upper
+			pD[(ii+0)+ldd*(jj+0)] = d_00;
+			pD[(ii+0)+ldd*(jj+1)] = d_01;
+			}
+		// correct diagonal and lower and look for pivot
+		// correct
+		ii = jj;
+		i0 = ii;
+		for(; ii<m-1; ii+=2)
+			{
+			d_00 = pD[(ii+0)+ldd*(jj+0)];
+			d_10 = pD[(ii+1)+ldd*(jj+0)];
+			d_01 = pD[(ii+0)+ldd*(jj+1)];
+			d_11 = pD[(ii+1)+ldd*(jj+1)];
+			for(kk=0; kk<jj; kk++)
+				{
+				d_00 -= pD[(ii+0)+ldd*kk] * pD[kk+ldd*(jj+0)];
+				d_10 -= pD[(ii+1)+ldd*kk] * pD[kk+ldd*(jj+0)];
+				d_01 -= pD[(ii+0)+ldd*kk] * pD[kk+ldd*(jj+1)];
+				d_11 -= pD[(ii+1)+ldd*kk] * pD[kk+ldd*(jj+1)];
+				}
+			pD[(ii+0)+ldd*(jj+0)] = d_00;
+			pD[(ii+1)+ldd*(jj+0)] = d_10;
+			pD[(ii+0)+ldd*(jj+1)] = d_01;
+			pD[(ii+1)+ldd*(jj+1)] = d_11;
+			}
+		for(; ii<m; ii++)
+			{
+			d_00 = pD[(ii+0)+ldd*(jj+0)];
+			d_01 = pD[(ii+0)+ldd*(jj+1)];
+			for(kk=0; kk<jj; kk++)
+				{
+				d_00 -= pD[(ii+0)+ldd*kk] * pD[kk+ldd*(jj+0)];
+				d_01 -= pD[(ii+0)+ldd*kk] * pD[kk+ldd*(jj+1)];
+				}
+			pD[(ii+0)+ldd*(jj+0)] = d_00;
+			pD[(ii+0)+ldd*(jj+1)] = d_01;
+			}
+		// look for pivot & solve
+		// left column
+		ii = i0;
+		dmax = 0;
+		ip = ii;
+		for(; ii<m-1; ii+=2)
+			{
+			d_00 = pD[(ii+0)+ldd*jj];
+			d_10 = pD[(ii+1)+ldd*jj];
+			dtmp = d_00>0.0 ? d_00 : -d_00;
+			if(dtmp>dmax)
+				{
+				dmax = dtmp;
+				ip = ii+0;
+				}
+			dtmp = d_10>0.0 ? d_10 : -d_10;
+			if(dtmp>dmax)
+				{
+				dmax = dtmp;
+				ip = ii+1;
+				}
+			}
+		for(; ii<m; ii++)
+			{
+			d_00 = pD[(ii+0)+ldd*jj];
+			dtmp = d_00>0.0 ? d_00 : -d_00;
+			if(dtmp>dmax)
+				{
+				dmax = dtmp;
+				ip = ii+0;
+				}
+			}
+		// row swap
+		ii = i0;
+		ipiv[ii] = ip;
+		if(ip!=ii)
+			{
+			for(kk=0; kk<n; kk++)
+				{
+				dtmp = pD[ii+ldd*kk];
+				pD[ii+ldd*kk] = pD[ip+ldd*kk];
+				pD[ip+ldd*kk] = dtmp;
+				}
+			}
+		// factorize diagonal
+		d_00 = pD[(ii+0)+ldd*(jj+0)];
+		d_00_inv = 1.0/d_00;
+		pD[(ii+0)+ldd*(jj+0)] = d_00;
+		dD[ii] = d_00_inv;
+		ii += 1;
+		// solve & compute next pivot
+		dmax = 0;
+		ip = ii;
+		for(; ii<m-1; ii+=2)
+			{
+			d_00 = pD[(ii+0)+ldd*(jj+0)];
+			d_10 = pD[(ii+1)+ldd*(jj+0)];
+			d_00 *= d_00_inv;
+			d_10 *= d_00_inv;
+			d_01 = pD[(ii+0)+ldd*(jj+1)];
+			d_11 = pD[(ii+1)+ldd*(jj+1)];
+			d_01 -= d_00 * pD[jj+ldd*(jj+1)];
+			d_11 -= d_10 * pD[jj+ldd*(jj+1)];
+			dtmp = d_01>0.0 ? d_01 : -d_01;
+			if(dtmp>dmax)
+				{
+				dmax = dtmp;
+				ip = ii+0;
+				}
+			dtmp = d_11>0.0 ? d_11 : -d_11;
+			if(dtmp>dmax)
+				{
+				dmax = dtmp;
+				ip = ii+1;
+				}
+			pD[(ii+0)+ldd*(jj+0)] = d_00;
+			pD[(ii+1)+ldd*(jj+0)] = d_10;
+			pD[(ii+0)+ldd*(jj+1)] = d_01;
+			pD[(ii+1)+ldd*(jj+1)] = d_11;
+			}
+		for(; ii<m; ii++)
+			{
+			d_00 = pD[(ii+0)+ldd*(jj+0)];
+			d_00 *= d_00_inv;
+			d_01 = pD[(ii+0)+ldd*(jj+1)];
+			d_01 -= d_00 * pD[jj+ldd*(jj+1)];
+			dtmp = d_01>0.0 ? d_01 : -d_01;
+			if(dtmp>dmax)
+				{
+				dmax = dtmp;
+				ip = ii+0;
+				}
+			pD[(ii+0)+ldd*(jj+0)] = d_00;
+			pD[(ii+0)+ldd*(jj+1)] = d_01;
+			}
+		// row swap
+		ii = i0+1;
+		ipiv[ii] = ip;
+		if(ip!=ii)
+			{
+			for(kk=0; kk<n; kk++)
+				{
+				dtmp = pD[ii+ldd*kk];
+				pD[ii+ldd*kk] = pD[ip+ldd*kk];
+				pD[ip+ldd*kk] = dtmp;
+				}
+			}
+		// factorize diagonal
+		d_00 = pD[ii+ldd*(jj+1)];
+		d_00_inv = 1.0/d_00;
+		pD[ii+ldd*(jj+1)] = d_00;
+		dD[ii] = d_00_inv;
+		ii += 1;
+		// solve lower
+		for(; ii<m; ii++)
+			{
+			d_00 = pD[ii+ldd*(jj+1)];
+			d_00 *= d_00_inv;
+			pD[ii+ldd*(jj+1)] = d_00;
+			}
+		}
+	for(; jj<n; jj++)
+		{
+		ii = 0;
+		for(; ii<jj-1; ii+=2)
+			{
+			// correct upper
+			d_00 = pD[(ii+0)+ldd*jj];
+			d_10 = pD[(ii+1)+ldd*jj];
+			for(kk=0; kk<ii; kk++)
+				{
+				d_00 -= pD[(ii+0)+ldd*kk] * pD[kk+ldd*jj];
+				d_10 -= pD[(ii+1)+ldd*kk] * pD[kk+ldd*jj];
+				}
+			// solve upper
+			d_10 -= pD[(ii+1)+ldd*kk] * d_00;
+			pD[(ii+0)+ldd*jj] = d_00;
+			pD[(ii+1)+ldd*jj] = d_10;
+			}
+		for(; ii<jj; ii++)
+			{
+			// correct upper
+			d_00 = pD[ii+ldd*jj];
+			for(kk=0; kk<ii; kk++)
+				{
+				d_00 -= pD[ii+ldd*kk] * pD[kk+ldd*jj];
+				}
+			// solve upper
+			pD[ii+ldd*jj] = d_00;
+			}
+		i0 = ii;
+		ii = jj;
+		// correct diagonal and lower and look for pivot
+		dmax = 0;
+		ip = ii;
+		for(; ii<m-1; ii+=2)
+			{
+			d_00 = pD[(ii+0)+ldd*jj];
+			d_10 = pD[(ii+1)+ldd*jj];
+			for(kk=0; kk<jj; kk++)
+				{
+				d_00 -= pD[(ii+0)+ldd*kk] * pD[kk+ldd*jj];
+				d_10 -= pD[(ii+1)+ldd*kk] * pD[kk+ldd*jj];
+				}
+			dtmp = d_00>0.0 ? d_00 : -d_00;
+			if(dtmp>dmax)
+				{
+				dmax = dtmp;
+				ip = ii+0;
+				}
+			dtmp = d_10>0.0 ? d_10 : -d_10;
+			if(dtmp>dmax)
+				{
+				dmax = dtmp;
+				ip = ii+1;
+				}
+			pD[(ii+0)+ldd*jj] = d_00;
+			pD[(ii+1)+ldd*jj] = d_10;
+			}
+		for(; ii<m; ii++)
+			{
+			d_00 = pD[(ii+0)+ldd*jj];
+			for(kk=0; kk<jj; kk++)
+				{
+				d_00 -= pD[(ii+0)+ldd*kk] * pD[kk+ldd*jj];
+				}
+			dtmp = d_00>0.0 ? d_00 : -d_00;
+			if(dtmp>dmax)
+				{
+				dmax = dtmp;
+				ip = ii+0;
+				}
+			pD[(ii+0)+ldd*jj] = d_00;
+			}
+		// row swap
+		ii = i0;
+		ipiv[ii] = ip;
+		if(ip!=ii)
+			{
+			for(kk=0; kk<n; kk++)
+				{
+				dtmp = pD[ii+ldd*kk];
+				pD[ii+ldd*kk] = pD[ip+ldd*kk];
+				pD[ip+ldd*kk] = dtmp;
+				}
+			}
+		// factorize diagonal
+		d_00 = pD[ii+ldd*jj];
+		d_00_inv = 1.0/d_00;
+		pD[ii+ldd*jj] = d_00;
+		dD[ii] = d_00_inv;
+		ii += 1;
+		for(; ii<m; ii++)
+			{
+			// correct lower
+			d_00 = pD[ii+ldd*jj];
+			// solve lower
+			d_00 *= d_00_inv;
+			pD[ii+ldd*jj] = d_00;
+			}
+		}
+#else
+	int iimax = m<n ? m : n;
+	for(ii=0; ii<iimax; ii++)
+		{
+		dmax = (pD[ii+ldd*ii]>0 ? pD[ii+ldd*ii] : -pD[ii+ldd*ii]);
+		ip = ii;
+		for(jj=1; jj<m-ii; jj++)
+			{
+			dtmp = pD[ii+jj+ldd*ii]>0 ? pD[ii+jj+ldd*ii] : -pD[ii+jj+ldd*ii];
+			if(dtmp>dmax)
+				{
+				dmax = dtmp;
+				ip = ii+jj;
+				}
+			}
+		ipiv[ii] = ip;
+		if(ip!=ii)
+			{
+			for(jj=0; jj<n; jj++)
+				{
+				dtmp = pD[ii+jj*ldd];
+				pD[ii+jj*ldd] = pD[ip+jj*ldd];
+				pD[ip+jj*ldd] = dtmp;
+				}
+			}
+		itmp0 = m-ii-1;
+		dtmp = 1.0/pD[ii+ldd*ii];
+		dD[ii] = dtmp;
+		for(jj=0; jj<itmp0; jj++)
+			{
+			pD[ii+1+jj+ldd*ii] *= dtmp;
+			}
+		itmp1 = n-ii-1;
+		for(jj=0; jj<itmp1; jj++)
+			{
+			for(kk=0; kk<itmp0; kk++)
+				{
+				pD[(ii+1+kk)+ldd*(ii+1+jj)] -= pD[(ii+1+kk)+ldd*ii] * pD[ii+ldd*(ii+1+jj)];
+				}
+			}
+		}
+#endif
+	return;	
+	}
+
+
+
+int GEQRF_WORK_SIZE_LIBSTR(int m, int n)
+	{
+	return 0;
+	}
+
+
+
+void GEQRF_LIBSTR(int m, int n, struct STRMAT *sA, int ai, int aj, struct STRMAT *sD, int di, int dj, void *work)
+	{
+	if(m<=0 | n<=0)
+		return;
+	int ii, jj, kk;
+	int lda = sA->m;
+	int ldd = sD->m;
+	REAL *pA = sA->pA+ai+aj*lda;
+	REAL *pD = sD->pA+di+dj*ldd; // matrix of QR
+	REAL *dD = sD->dA+di; // vectors of tau
+	REAL alpha, beta, tmp, w0, w1;
+	REAL *pC00, *pC01, *pC11, *pv0, *pv1;
+	REAL pW[4] = {0.0, 0.0, 0.0, 0.0};
+	int ldw = 2;
+	REAL pT[4] = {0.0, 0.0, 0.0, 0.0};
+	int ldb = 2;
+	int imax, jmax, kmax;
+	// copy if needed
+	if(pA!=pD)
+		{
+		for(jj=0; jj<n; jj++)
+			{
+			for(ii=0; ii<m; ii++)
+				{
+				pD[ii+ldd*jj] = pA[ii+lda*jj];
+				}
+			}
+		}
+	imax = m<n ? m : n;
+	ii = 0;
+#if 1
+	for(; ii<imax-1; ii+=2)
+		{
+		// first column
+		pC00 = &pD[ii+ldd*ii];
+		beta = 0.0;
+		for(jj=1; jj<m-ii; jj++)
+			{
+			tmp = pC00[jj+ldd*0];
+			beta += tmp*tmp;
+			}
+		if(beta==0.0)
+			{
+			// tau0
+			dD[ii] = 0.0;
+			}
+		else
+			{
+			alpha = pC00[0+ldd*0];
+			beta += alpha*alpha;
+			beta = sqrt(beta);
+			if(alpha>0)
+				beta = -beta;
+			// tau0
+			dD[ii] = (beta-alpha) / beta;
+			tmp = 1.0 / (alpha-beta);
+			// compute v0
+			pC00[0+ldd*0] = beta;
+			for(jj=1; jj<m-ii; jj++)
+				{
+				pC00[jj+ldd*0] *= tmp;
+				}
+			}
+		// gemv_t & ger
+		pC01 = &pC00[0+ldd*1];
+		pv0 = &pC00[0+ldd*0];
+		kmax = m-ii;
+		w0 = pC01[0+ldd*0]; // pv0[0] = 1.0
+		for(kk=1; kk<kmax; kk++)
+			{
+			w0 += pC01[kk+ldd*0] * pv0[kk];
+			}
+		w0 = - dD[ii] * w0;
+		pC01[0+ldd*0] += w0; // pv0[0] = 1.0
+		for(kk=1; kk<kmax; kk++)
+			{
+			pC01[kk+ldd*0] += w0 * pv0[kk];
+			}
+		// second column
+		pC11 = &pD[(ii+1)+ldd*(ii+1)];
+		beta = 0.0;
+		for(jj=1; jj<m-(ii+1); jj++)
+			{
+			tmp = pC11[jj+ldd*0];
+			beta += tmp*tmp;
+			}
+		if(beta==0.0)
+			{
+			// tau1
+			dD[(ii+1)] = 0.0;
+			}
+		else
+			{
+			alpha = pC11[0+ldd*0];
+			beta += alpha*alpha;
+			beta = sqrt(beta);
+			if(alpha>0)
+				beta = -beta;
+			// tau1
+			dD[(ii+1)] = (beta-alpha) / beta;
+			tmp = 1.0 / (alpha-beta);
+			// compute v1
+			pC11[0+ldd*0] = beta;
+			for(jj=1; jj<m-(ii+1); jj++)
+				pC11[jj+ldd*0] *= tmp;
+			}
+		// compute lower triangular T containing tau for matrix update
+		pv0 = &pC00[0+ldd*0];
+		pv1 = &pC00[0+ldd*1];
+		kmax = m-ii;
+		tmp = pv0[1];
+		for(kk=2; kk<kmax; kk++)
+			tmp += pv0[kk]*pv1[kk];
+		pT[0+ldb*0] = dD[ii+0];
+		pT[1+ldb*0] = - dD[ii+1] * tmp * dD[ii+0];
+		pT[1+ldb*1] = dD[ii+1];
+		jmax = n-ii-2;
+		jj = 0;
+		for(; jj<jmax-1; jj+=2)
+			{
+			// compute W^T = C^T * V
+			pW[0+ldw*0] = pC00[0+ldd*(jj+0+2)] + pC00[1+ldd*(jj+0+2)] * pv0[1];
+			pW[1+ldw*0] = pC00[0+ldd*(jj+1+2)] + pC00[1+ldd*(jj+1+2)] * pv0[1];
+			pW[0+ldw*1] =                        pC00[1+ldd*(jj+0+2)];
+			pW[1+ldw*1] =                        pC00[1+ldd*(jj+1+2)];
+			kk = 2;
+			for(; kk<kmax; kk++)
+				{
+				tmp = pC00[kk+ldd*(jj+0+2)];
+				pW[0+ldw*0] += tmp * pv0[kk];
+				pW[0+ldw*1] += tmp * pv1[kk];
+				tmp = pC00[kk+ldd*(jj+1+2)];
+				pW[1+ldw*0] += tmp * pv0[kk];
+				pW[1+ldw*1] += tmp * pv1[kk];
+				}
+			// compute W^T *= T
+			pW[0+ldw*1] = pT[1+ldb*0]*pW[0+ldw*0] + pT[1+ldb*1]*pW[0+ldw*1];
+			pW[1+ldw*1] = pT[1+ldb*0]*pW[1+ldw*0] + pT[1+ldb*1]*pW[1+ldw*1];
+			pW[0+ldw*0] = pT[0+ldb*0]*pW[0+ldw*0];
+			pW[1+ldw*0] = pT[0+ldb*0]*pW[1+ldw*0];
+			// compute C -= V * W^T
+			pC00[0+ldd*(jj+0+2)] -= pW[0+ldw*0];
+			pC00[0+ldd*(jj+1+2)] -= pW[1+ldw*0];
+			pC00[1+ldd*(jj+0+2)] -= pv0[1]*pW[0+ldw*0] + pW[0+ldw*1];
+			pC00[1+ldd*(jj+1+2)] -= pv0[1]*pW[1+ldw*0] + pW[1+ldw*1];
+			kk = 2;
+			for(; kk<kmax-1; kk+=2)
+				{
+				pC00[kk+0+ldd*(jj+0+2)] -= pv0[kk+0]*pW[0+ldw*0] + pv1[kk+0]*pW[0+ldw*1];
+				pC00[kk+1+ldd*(jj+0+2)] -= pv0[kk+1]*pW[0+ldw*0] + pv1[kk+1]*pW[0+ldw*1];
+				pC00[kk+0+ldd*(jj+1+2)] -= pv0[kk+0]*pW[1+ldw*0] + pv1[kk+0]*pW[1+ldw*1];
+				pC00[kk+1+ldd*(jj+1+2)] -= pv0[kk+1]*pW[1+ldw*0] + pv1[kk+1]*pW[1+ldw*1];
+				}
+			for(; kk<kmax; kk++)
+				{
+				pC00[kk+ldd*(jj+0+2)] -= pv0[kk]*pW[0+ldw*0] + pv1[kk]*pW[0+ldw*1];
+				pC00[kk+ldd*(jj+1+2)] -= pv0[kk]*pW[1+ldw*0] + pv1[kk]*pW[1+ldw*1];
+				}
+			}
+		for(; jj<jmax; jj++)
+			{
+			// compute W = T * V^T * C
+			pW[0+ldw*0] = pC00[0+ldd*(jj+0+2)] + pC00[1+ldd*(jj+0+2)] * pv0[1];
+			pW[0+ldw*1] =                        pC00[1+ldd*(jj+0+2)];
+			for(kk=2; kk<kmax; kk++)
+				{
+				tmp = pC00[kk+ldd*(jj+0+2)];
+				pW[0+ldw*0] += tmp * pv0[kk];
+				pW[0+ldw*1] += tmp * pv1[kk];
+				}
+			pW[0+ldw*1] = pT[1+ldb*0]*pW[0+ldw*0] + pT[1+ldb*1]*pW[0+ldw*1];
+			pW[0+ldw*0] = pT[0+ldb*0]*pW[0+ldw*0];
+			// compute C -= V * W^T
+			pC00[0+ldd*(jj+0+2)] -= pW[0+ldw*0];
+			pC00[1+ldd*(jj+0+2)] -= pv0[1]*pW[0+ldw*0] + pW[0+ldw*1];
+			for(kk=2; kk<kmax; kk++)
+				{
+				pC00[kk+ldd*(jj+0+2)] -= pv0[kk]*pW[0+ldw*0] + pv1[kk]*pW[0+ldw*1];
+				}
+			}
+		}
+#endif
+	for(; ii<imax; ii++)
+		{
+		pC00 = &pD[ii+ldd*ii];
+		beta = 0.0;
+		for(jj=1; jj<m-ii; jj++)
+			{
+			tmp = pC00[jj+ldd*0];
+			beta += tmp*tmp;
+			}
+		if(beta==0.0)
+			{
+			dD[ii] = 0.0;
+			}
+		else
+			{
+			alpha = pC00[0+ldd*0];
+			beta += alpha*alpha;
+			beta = sqrt(beta);
+			if(alpha>0)
+				beta = -beta;
+			dD[ii] = (beta-alpha) / beta;
+			tmp = 1.0 / (alpha-beta);
+			for(jj=1; jj<m-ii; jj++)
+				pC00[jj+ldd*0] *= tmp;
+			pC00[0+ldd*0] = beta;
+			}
+		if(ii<n)
+			{
+			// gemv_t & ger
+			pC01 = &pC00[0+ldd*1];
+			pv0 = &pC00[0+ldd*0];
+			jmax = n-ii-1;
+			kmax = m-ii;
+			jj = 0;
+			for(; jj<jmax-1; jj+=2)
+				{
+				w0 = pC01[0+ldd*(jj+0)]; // pv0[0] = 1.0
+				w1 = pC01[0+ldd*(jj+1)]; // pv0[0] = 1.0
+				for(kk=1; kk<kmax; kk++)
+					{
+					w0 += pC01[kk+ldd*(jj+0)] * pv0[kk];
+					w1 += pC01[kk+ldd*(jj+1)] * pv0[kk];
+					}
+				w0 = - dD[ii] * w0;
+				w1 = - dD[ii] * w1;
+				pC01[0+ldd*(jj+0)] += w0; // pv0[0] = 1.0
+				pC01[0+ldd*(jj+1)] += w1; // pv0[0] = 1.0
+				for(kk=1; kk<kmax; kk++)
+					{
+					pC01[kk+ldd*(jj+0)] += w0 * pv0[kk];
+					pC01[kk+ldd*(jj+1)] += w1 * pv0[kk];
+					}
+				}
+			for(; jj<jmax; jj++)
+				{
+				w0 = pC01[0+ldd*jj]; // pv0[0] = 1.0
+				for(kk=1; kk<kmax; kk++)
+					{
+					w0 += pC01[kk+ldd*jj] * pv0[kk];
+					}
+				w0 = - dD[ii] * w0;
+				pC01[0+ldd*jj] += w0; // pv0[0] = 1.0
+				for(kk=1; kk<kmax; kk++)
+					{
+					pC01[kk+ldd*jj] += w0 * pv0[kk];
+					}
+				}
+			}
+		}
+	return;
+	}
+
+
+
+int GELQF_WORK_SIZE_LIBSTR(int m, int n)
+	{
+	return 0;
+	}
+
+
+
+void GELQF_LIBSTR(int m, int n, struct STRMAT *sA, int ai, int aj, struct STRMAT *sD, int di, int dj, void *work)
+	{
+	if(m<=0 | n<=0)
+		return;
+	int ii, jj, kk;
+	int lda = sA->m;
+	int ldd = sD->m;
+	REAL *pA = sA->pA+ai+aj*lda;
+	REAL *pD = sD->pA+di+dj*ldd; // matrix of QR
+	REAL *dD = sD->dA+di; // vectors of tau
+	REAL alpha, beta, tmp, w0, w1;
+	REAL *pC00, *pC10, *pC11, *pv0, *pv1;
+	REAL pW[4] = {0.0, 0.0, 0.0, 0.0};
+	int ldw = 2;
+	REAL pT[4] = {0.0, 0.0, 0.0, 0.0};
+	int ldb = 2;
+	int imax, jmax, kmax;
+	// copy if needed
+	if(pA!=pD)
+		{
+		for(jj=0; jj<n; jj++)
+			{
+			for(ii=0; ii<m; ii++)
+				{
+				pD[ii+ldd*jj] = pA[ii+lda*jj];
+				}
+			}
+		}
+	imax = m<n ? m : n;
+	ii = 0;
+#if 1
+	for(; ii<imax-1; ii+=2)
+		{
+		// first column
+		pC00 = &pD[ii+ldd*ii];
+		beta = 0.0;
+		for(jj=1; jj<n-ii; jj++)
+			{
+			tmp = pC00[0+ldd*jj];
+			beta += tmp*tmp;
+			}
+		if(beta==0.0)
+			{
+			// tau0
+			dD[ii] = 0.0;
+			}
+		else
+			{
+			alpha = pC00[0+ldd*0];
+			beta += alpha*alpha;
+			beta = sqrt(beta);
+			if(alpha>0)
+				beta = -beta;
+			// tau0
+			dD[ii] = (beta-alpha) / beta;
+			tmp = 1.0 / (alpha-beta);
+			// compute v0
+			pC00[0+ldd*0] = beta;
+			for(jj=1; jj<n-ii; jj++)
+				{
+				pC00[0+ldd*jj] *= tmp;
+				}
+			}
+		// gemv_t & ger
+		pC10 = &pC00[1+ldd*0];
+		pv0 = &pC00[0+ldd*0];
+		kmax = n-ii;
+		w0 = pC10[0+ldd*0]; // pv0[0] = 1.0
+		for(kk=1; kk<kmax; kk++)
+			{
+			w0 += pC10[0+ldd*kk] * pv0[0+ldd*kk];
+			}
+		w0 = - dD[ii] * w0;
+		pC10[0+ldd*0] += w0; // pv0[0] = 1.0
+		for(kk=1; kk<kmax; kk++)
+			{
+			pC10[0+ldd*kk] += w0 * pv0[0+ldd*kk];
+			}
+		// second row
+		pC11 = &pD[(ii+1)+ldd*(ii+1)];
+		beta = 0.0;
+		for(jj=1; jj<n-(ii+1); jj++)
+			{
+			tmp = pC11[0+ldd*jj];
+			beta += tmp*tmp;
+			}
+		if(beta==0.0)
+			{
+			// tau1
+			dD[(ii+1)] = 0.0;
+			}
+		else
+			{
+			alpha = pC11[0+ldd*0];
+			beta += alpha*alpha;
+			beta = sqrt(beta);
+			if(alpha>0)
+				beta = -beta;
+			// tau1
+			dD[(ii+1)] = (beta-alpha) / beta;
+			tmp = 1.0 / (alpha-beta);
+			// compute v1
+			pC11[0+ldd*0] = beta;
+			for(jj=1; jj<n-(ii+1); jj++)
+				pC11[0+ldd*jj] *= tmp;
+			}
+		// compute lower triangular T containing tau for matrix update
+		pv0 = &pC00[0+ldd*0];
+		pv1 = &pC00[1+ldd*0];
+		kmax = n-ii;
+		tmp = pv0[0+ldd*1];
+		for(kk=2; kk<kmax; kk++)
+			tmp += pv0[0+ldd*kk]*pv1[0+ldd*kk];
+		pT[0+ldb*0] = dD[ii+0];
+		pT[1+ldb*0] = - dD[ii+1] * tmp * dD[ii+0];
+		pT[1+ldb*1] = dD[ii+1];
+		// downgrade
+		jmax = m-ii-2;
+		jj = 0;
+		for(; jj<jmax-1; jj+=2)
+			{
+			// compute W^T = C^T * V
+			pW[0+ldw*0] = pC00[jj+0+2+ldd*0] + pC00[jj+0+2+ldd*1] * pv0[0+ldd*1];
+			pW[1+ldw*0] = pC00[jj+1+2+ldd*0] + pC00[jj+1+2+ldd*1] * pv0[0+ldd*1];
+			pW[0+ldw*1] =                      pC00[jj+0+2+ldd*1];
+			pW[1+ldw*1] =                      pC00[jj+1+2+ldd*1];
+			kk = 2;
+			for(; kk<kmax; kk++)
+				{
+				tmp = pC00[jj+0+2+ldd*kk];
+				pW[0+ldw*0] += tmp * pv0[0+ldd*kk];
+				pW[0+ldw*1] += tmp * pv1[0+ldd*kk];
+				tmp = pC00[jj+1+2+ldd*kk];
+				pW[1+ldw*0] += tmp * pv0[0+ldd*kk];
+				pW[1+ldw*1] += tmp * pv1[0+ldd*kk];
+				}
+			// compute W^T *= T
+			pW[0+ldw*1] = pT[1+ldb*0]*pW[0+ldw*0] + pT[1+ldb*1]*pW[0+ldw*1];
+			pW[1+ldw*1] = pT[1+ldb*0]*pW[1+ldw*0] + pT[1+ldb*1]*pW[1+ldw*1];
+			pW[0+ldw*0] = pT[0+ldb*0]*pW[0+ldw*0];
+			pW[1+ldw*0] = pT[0+ldb*0]*pW[1+ldw*0];
+			// compute C -= V * W^T
+			pC00[jj+0+2+ldd*0] -= pW[0+ldw*0];
+			pC00[jj+1+2+ldd*0] -= pW[1+ldw*0];
+			pC00[jj+0+2+ldd*1] -= pv0[0+ldd*1]*pW[0+ldw*0] + pW[0+ldw*1];
+			pC00[jj+1+2+ldd*1] -= pv0[0+ldd*1]*pW[1+ldw*0] + pW[1+ldw*1];
+			kk = 2;
+			for(; kk<kmax-1; kk+=2)
+				{
+				pC00[jj+0+2+ldd*(kk+0)] -= pv0[0+ldd*(kk+0)]*pW[0+ldw*0] + pv1[0+ldd*(kk+0)]*pW[0+ldw*1];
+				pC00[jj+0+2+ldd*(kk+1)] -= pv0[0+ldd*(kk+1)]*pW[0+ldw*0] + pv1[0+ldd*(kk+1)]*pW[0+ldw*1];
+				pC00[jj+1+2+ldd*(kk+0)] -= pv0[0+ldd*(kk+0)]*pW[1+ldw*0] + pv1[0+ldd*(kk+0)]*pW[1+ldw*1];
+				pC00[jj+1+2+ldd*(kk+1)] -= pv0[0+ldd*(kk+1)]*pW[1+ldw*0] + pv1[0+ldd*(kk+1)]*pW[1+ldw*1];
+				}
+			for(; kk<kmax; kk++)
+				{
+				pC00[jj+0+2+ldd*kk] -= pv0[0+ldd*kk]*pW[0+ldw*0] + pv1[0+ldd*kk]*pW[0+ldw*1];
+				pC00[jj+1+2+ldd*kk] -= pv0[0+ldd*kk]*pW[1+ldw*0] + pv1[0+ldd*kk]*pW[1+ldw*1];
+				}
+			}
+		for(; jj<jmax; jj++)
+			{
+			// compute W = T * V^T * C
+			pW[0+ldw*0] = pC00[jj+0+2+ldd*0] + pC00[jj+0+2+ldd*1] * pv0[0+ldd*1];
+			pW[0+ldw*1] =                      pC00[jj+0+2+ldd*1];
+			for(kk=2; kk<kmax; kk++)
+				{
+				tmp = pC00[jj+0+2+ldd*kk];
+				pW[0+ldw*0] += tmp * pv0[0+ldd*kk];
+				pW[0+ldw*1] += tmp * pv1[0+ldd*kk];
+				}
+			pW[0+ldw*1] = pT[1+ldb*0]*pW[0+ldw*0] + pT[1+ldb*1]*pW[0+ldw*1];
+			pW[0+ldw*0] = pT[0+ldb*0]*pW[0+ldw*0];
+			// compute C -= V * W^T
+			pC00[jj+0+2+ldd*0] -= pW[0+ldw*0];
+			pC00[jj+0+2+ldd*1] -= pv0[0+ldd*1]*pW[0+ldw*0] + pW[0+ldw*1];
+			for(kk=2; kk<kmax; kk++)
+				{
+				pC00[jj+0+2+ldd*kk] -= pv0[0+ldd*kk]*pW[0+ldw*0] + pv1[0+ldd*kk]*pW[0+ldw*1];
+				}
+			}
+		}
+#endif
+	for(; ii<imax; ii++)
+		{
+		pC00 = &pD[ii+ldd*ii];
+		beta = 0.0;
+		for(jj=1; jj<n-ii; jj++)
+			{
+			tmp = pC00[0+ldd*jj];
+			beta += tmp*tmp;
+			}
+		if(beta==0.0)
+			{
+			dD[ii] = 0.0;
+			}
+		else
+			{
+			alpha = pC00[0+ldd*0];
+			beta += alpha*alpha;
+			beta = sqrt(beta);
+			if(alpha>0)
+				beta = -beta;
+			dD[ii] = (beta-alpha) / beta;
+			tmp = 1.0 / (alpha-beta);
+			for(jj=1; jj<n-ii; jj++)
+				pC00[0+ldd*jj] *= tmp;
+			pC00[0+ldd*0] = beta;
+			}
+		if(ii<n)
+			{
+			// gemv_t & ger
+			pC10 = &pC00[1+ldd*0];
+			pv0 = &pC00[0+ldd*0];
+			jmax = m-ii-1;
+			kmax = n-ii;
+			jj = 0;
+			for(; jj<jmax-1; jj+=2)
+				{
+				w0 = pC10[jj+0+ldd*0]; // pv0[0] = 1.0
+				w1 = pC10[jj+1+ldd*0]; // pv0[0] = 1.0
+				for(kk=1; kk<kmax; kk++)
+					{
+					w0 += pC10[jj+0+ldd*kk] * pv0[0+ldd*kk];
+					w1 += pC10[jj+1+ldd*kk] * pv0[0+ldd*kk];
+					}
+				w0 = - dD[ii] * w0;
+				w1 = - dD[ii] * w1;
+				pC10[jj+0+ldd*0] += w0; // pv0[0] = 1.0
+				pC10[jj+1+ldd*0] += w1; // pv0[0] = 1.0
+				for(kk=1; kk<kmax; kk++)
+					{
+					pC10[jj+0+ldd*kk] += w0 * pv0[0+ldd*kk];
+					pC10[jj+1+ldd*kk] += w1 * pv0[0+ldd*kk];
+					}
+				}
+			for(; jj<jmax; jj++)
+				{
+				w0 = pC10[jj+ldd*0]; // pv0[0] = 1.0
+				for(kk=1; kk<kmax; kk++)
+					{
+					w0 += pC10[jj+ldd*kk] * pv0[0+ldd*kk];
+					}
+				w0 = - dD[ii] * w0;
+				pC10[jj+ldd*0] += w0; // pv0[0] = 1.0
+				for(kk=1; kk<kmax; kk++)
+					{
+					pC10[jj+ldd*kk] += w0 * pv0[0+ldd*kk];
+					}
+				}
+			}
+		}
+	return;
+	}
+
+
+
+#elif defined(LA_BLAS)
+
+
+
+// dpotrf
+void POTRF_L_LIBSTR(int m, struct STRMAT *sC, int ci, int cj, struct STRMAT *sD, int di, int dj)
+	{
+	if(m<=0)
+		return;
+	int jj;
+	char cl = 'l';
+	char cn = 'n';
+	char cr = 'r';
+	char ct = 't';
+	REAL d1 = 1.0;
+	REAL *pC = sC->pA+ci+cj*sC->m;
+	REAL *pD = sD->pA+di+dj*sD->m;
+#if defined(REF_BLAS_BLIS)
+	long long i1 = 1;
+	long long mm = m;
+	long long info;
+	long long tmp;
+	long long ldc = sC->m;
+	long long ldd = sD->m;
+	if(!(pC==pD))
+		{
+		for(jj=0; jj<m; jj++)
+			{
+			tmp = m-jj;
+			COPY(&tmp, pC+jj*ldc+jj, &i1, pD+jj*ldd+jj, &i1);
+			}
+		}
+	POTRF(&cl, &mm, pD, &ldd, &info);
+#else
+	int i1 = 1;
+	int info;
+	int tmp;
+	int ldc = sC->m;
+	int ldd = sD->m;
+	if(!(pC==pD))
+		{
+		for(jj=0; jj<m; jj++)
+			{
+			tmp = m-jj;
+			COPY(&tmp, pC+jj*ldc+jj, &i1, pD+jj*ldd+jj, &i1);
+			}
+		}
+	POTRF(&cl, &m, pD, &ldd, &info);
+#endif
+	return;
+	}
+
+
+
+// dpotrf
+void POTRF_L_MN_LIBSTR(int m, int n, struct STRMAT *sC, int ci, int cj, struct STRMAT *sD, int di, int dj)
+	{
+	if(m<=0 | n<=0)
+		return;
+	int jj;
+	char cl = 'l';
+	char cn = 'n';
+	char cr = 'r';
+	char ct = 't';
+	REAL d1 = 1.0;
+	REAL *pC = sC->pA+ci+cj*sC->m;
+	REAL *pD = sD->pA+di+dj*sD->m;
+#if defined(REF_BLAS_BLIS)
+	long long i1 = 1;
+	long long mm = m;
+	long long nn = n;
+	long long mmn = mm-nn;
+	long long info;
+	long long tmp;
+	long long ldc = sC->m;
+	long long ldd = sD->m;
+	if(!(pC==pD))
+		{
+		for(jj=0; jj<n; jj++)
+			{
+			tmp = m-jj;
+			COPY(&tmp, pC+jj*ldc+jj, &i1, pD+jj*ldd+jj, &i1);
+			}
+		}
+	POTRF(&cl, &nn, pD, &ldd, &info);
+	TRSM(&cr, &cl, &ct, &cn, &mmn, &nn, &d1, pD, &ldd, pD+n, &ldd);
+#else
+	int i1 = 1;
+	int mmn = m-n;
+	int info;
+	int tmp;
+	int ldc = sC->m;
+	int ldd = sD->m;
+	if(!(pC==pD))
+		{
+		for(jj=0; jj<n; jj++)
+			{
+			tmp = m-jj;
+			COPY(&tmp, pC+jj*ldc+jj, &i1, pD+jj*ldd+jj, &i1);
+			}
+		}
+	POTRF(&cl, &n, pD, &ldd, &info);
+	TRSM(&cr, &cl, &ct, &cn, &mmn, &n, &d1, pD, &ldd, pD+n, &ldd);
+#endif
+	return;
+	}
+
+
+
+// dsyrk dpotrf
+void SYRK_POTRF_LN_LIBSTR(int m, int n, int k, struct STRMAT *sA, int ai, int aj, struct STRMAT *sB, int bi, int bj, struct STRMAT *sC, int ci, int cj, struct STRMAT *sD, int di, int dj)
+	{
+	if(m<=0 | n<=0)
+		return;
+	int jj;
+	char cl = 'l';
+	char cn = 'n';
+	char cr = 'r';
+	char ct = 't';
+	char cu = 'u';
+	REAL d1 = 1.0;
+	REAL *pA = sA->pA + ai + aj*sA->m;
+	REAL *pB = sB->pA + bi + bj*sB->m;
+	REAL *pC = sC->pA + ci + cj*sC->m;
+	REAL *pD = sD->pA + di + dj*sD->m;
+#if defined(REF_BLAS_BLIS)
+	long long i1 = 1;
+	long long mm = m;
+	long long nn = n;
+	long long kk = k;
+	long long mmn = mm-nn;
+	long long info;
+	long long lda = sA->m;
+	long long ldb = sB->m;
+	long long ldc = sC->m;
+	long long ldd = sD->m;
+	if(!(pC==pD))
+		{
+		for(jj=0; jj<n; jj++)
+			COPY(&mm, pC+jj*ldc, &i1, pD+jj*ldd, &i1);
+		}
+	if(pA==pB)
+		{
+		SYRK(&cl, &cn, &nn, &kk, &d1, pA, &lda, &d1, pD, &ldd);
+		GEMM(&cn, &ct, &mmn, &nn, &kk, &d1, pA+n, &lda, pB, &ldb, &d1, pD+n, &ldd);
+		POTRF(&cl, &nn, pD, &ldd, &info);
+		TRSM(&cr, &cl, &ct, &cn, &mmn, &nn, &d1, pD, &ldd, pD+n, &ldd);
+		}
+	else
+		{
+		GEMM(&cn, &ct, &mm, &nn, &kk, &d1, pA, &lda, pB, &ldb, &d1, pD, &ldd);
+		POTRF(&cl, &nn, pD, &ldd, &info);
+		TRSM(&cr, &cl, &ct, &cn, &mmn, &nn, &d1, pD, &ldd, pD+n, &ldd);
+		}
+#else
+	int i1 = 1;
+	int mmn = m-n;
+	int info;
+	int lda = sA->m;
+	int ldb = sB->m;
+	int ldc = sC->m;
+	int ldd = sD->m;
+	if(!(pC==pD))
+		{
+		for(jj=0; jj<n; jj++)
+			COPY(&m, pC+jj*ldc, &i1, pD+jj*ldd, &i1);
+		}
+	if(pA==pB)
+		{
+		SYRK(&cl, &cn, &n, &k, &d1, pA, &lda, &d1, pD, &ldd);
+		GEMM(&cn, &ct, &mmn, &n, &k, &d1, pA+n, &lda, pB, &ldb, &d1, pD+n, &ldd);
+		POTRF(&cl, &n, pD, &ldd, &info);
+		TRSM(&cr, &cl, &ct, &cn, &mmn, &n, &d1, pD, &ldd, pD+n, &ldd);
+		}
+	else
+		{
+		GEMM(&cn, &ct, &m, &n, &k, &d1, pA, &lda, pB, &ldb, &d1, pD, &ldd);
+		POTRF(&cl, &n, pD, &ldd, &info);
+		TRSM(&cr, &cl, &ct, &cn, &mmn, &n, &d1, pD, &ldd, pD+n, &ldd);
+		}
+#endif
+	return;
+	}
+
+
+
+// dgetrf without pivoting
+#if defined(REF_BLAS_BLIS)
+static void GETF2_NOPIVOT(long long m, long long n, REAL *A, long long lda)
+	{
+	if(m<=0 | n<=0)
+		return;
+	long long i, j;
+	long long jmax = m<n ? m : n;
+	REAL dtmp;
+	REAL dm1 = -1.0;
+	long long itmp0, itmp1;
+	long long i1 = 1;
+	for(j=0; j<jmax; j++)
+		{
+		itmp0 = m-j-1;
+		dtmp = 1.0/A[j+lda*j];
+		SCAL(&itmp0, &dtmp, &A[(j+1)+lda*j], &i1);
+		itmp1 = n-j-1;
+		GER(&itmp0, &itmp1, &dm1, &A[(j+1)+lda*j], &i1, &A[j+lda*(j+1)], &lda, &A[(j+1)+lda*(j+1)], &lda);
+		}
+	return;
+	}
+#else
+static void GETF2_NOPIVOT(int m, int n, REAL *A, int lda)
+	{
+	if(m<=0 | n<=0)
+		return;
+	int i, j;
+	int jmax = m<n ? m : n;
+	REAL dtmp;
+	REAL dm1 = -1.0;
+	int itmp0, itmp1;
+	int i1 = 1;
+	for(j=0; j<jmax; j++)
+		{
+		itmp0 = m-j-1;
+		dtmp = 1.0/A[j+lda*j];
+		SCAL(&itmp0, &dtmp, &A[(j+1)+lda*j], &i1);
+		itmp1 = n-j-1;
+		GER(&itmp0, &itmp1, &dm1, &A[(j+1)+lda*j], &i1, &A[j+lda*(j+1)], &lda, &A[(j+1)+lda*(j+1)], &lda);
+		}
+	return;
+	}
+#endif
+
+
+
+// dgetrf without pivoting
+void GETRF_NOPIVOT_LIBSTR(int m, int n, struct STRMAT *sC, int ci, int cj, struct STRMAT *sD, int di, int dj)
+	{
+	// TODO with custom level 2 LAPACK + level 3 BLAS
+	if(m<=0 | n<=0)
+		return;
+	int jj;
+	REAL d1 = 1.0;
+	REAL *pC = sC->pA+ci+cj*sC->m;
+	REAL *pD = sD->pA+di+dj*sD->m;
+#if defined(REF_BLAS_BLIS)
+	long long i1 = 1;
+	long long mm = m;
+	long long nn = n;
+	long long ldc = sC->m;
+	long long ldd = sD->m;
+	if(!(pC==pD))
+		{
+		for(jj=0; jj<n; jj++)
+			COPY(&mm, pC+jj*ldc, &i1, pD+jj*ldd, &i1);
+		}
+	GETF2_NOPIVOT(mm, nn, pD, ldd);
+#else
+	int i1 = 1;
+	int ldc = sC->m;
+	int ldd = sD->m;
+	if(!(pC==pD))
+		{
+		for(jj=0; jj<n; jj++)
+			COPY(&m, pC+jj*ldc, &i1, pD+jj*ldd, &i1);
+		}
+	GETF2_NOPIVOT(m, n, pD, ldd);
+#endif
+	return;
+	}
+
+
+
+// dgetrf pivoting
+void GETRF_LIBSTR(int m, int n, struct STRMAT *sC, int ci, int cj, struct STRMAT *sD, int di, int dj, int *ipiv)
+	{
+	// TODO with custom level 2 LAPACK + level 3 BLAS
+	if(m<=0 | n<=0)
+		return;
+	int jj;
+	int tmp = m<n ? m : n;
+	REAL d1 = 1.0;
+	REAL *pC = sC->pA+ci+cj*sC->m;
+	REAL *pD = sD->pA+di+dj*sD->m;
+#if defined(REF_BLAS_BLIS)
+	long long i1 = 1;
+	long long info;
+	long long mm = m;
+	long long nn = n;
+	long long ldc = sC->m;
+	long long ldd = sD->m;
+	if(!(pC==pD))
+		{
+		for(jj=0; jj<n; jj++)
+			COPY(&mm, pC+jj*ldc, &i1, pD+jj*ldd, &i1);
+		}
+	GETRF(&mm, &nn, pD, &ldd, (long long *) ipiv, &info);
+	for(jj=0; jj<tmp; jj++)
+		ipiv[jj] -= 1;
+#else
+	int i1 = 1;
+	int info;
+	int ldc = sC->m;
+	int ldd = sD->m;
+	if(!(pC==pD))
+		{
+		for(jj=0; jj<n; jj++)
+			COPY(&m, pC+jj*ldc, &i1, pD+jj*ldd, &i1);
+		}
+	GETRF(&m, &n, pD, &ldd, ipiv, &info);
+	for(jj=0; jj<tmp; jj++)
+		ipiv[jj] -= 1;
+#endif
+	return;
+	}
+
+
+
+int GEQRF_WORK_SIZE_LIBSTR(int m, int n)
+	{
+	REAL dwork;
+	REAL *pD, *dD;
+#if defined(REF_BLAS_BLIS)
+	long long mm = m;
+	long long nn = n;
+	long long lwork = -1;
+	long long info;
+	long long ldd = mm;
+	GEQRF(&mm, &nn, pD, &ldd, dD, &dwork, &lwork, &info);
+#else
+	int lwork = -1;
+	int info;
+	int ldd = m;
+	GEQRF(&m, &n, pD, &ldd, dD, &dwork, &lwork, &info);
+#endif
+	int size = dwork;
+	return size*sizeof(REAL);
+	}
+
+
+
+void GEQRF_LIBSTR(int m, int n, struct STRMAT *sC, int ci, int cj, struct STRMAT *sD, int di, int dj, void *work)
+	{
+	if(m<=0 | n<=0)
+		return;
+	int jj;
+	REAL *pC = sC->pA+ci+cj*sC->m;
+	REAL *pD = sD->pA+di+dj*sD->m;
+	REAL *dD = sD->dA+di;
+	REAL *dwork = (REAL *) work;
+#if defined(REF_BLAS_BLIS)
+	long long i1 = 1;
+	long long info = -1;
+	long long mm = m;
+	long long nn = n;
+	long long ldc = sC->m;
+	long long ldd = sD->m;
+	if(!(pC==pD))
+		{
+		for(jj=0; jj<n; jj++)
+			COPY(&mm, pC+jj*ldc, &i1, pD+jj*ldd, &i1);
+		}
+//	GEQR2(&mm, &nn, pD, &ldd, dD, dwork, &info);
+	long long lwork = -1;
+	GEQRF(&mm, &nn, pD, &ldd, dD, dwork, &lwork, &info);
+	lwork = dwork[0];
+	GEQRF(&mm, &nn, pD, &ldd, dD, dwork, &lwork, &info);
+#else
+	int i1 = 1;
+	int info = -1;
+	int ldc = sC->m;
+	int ldd = sD->m;
+	if(!(pC==pD))
+		{
+		for(jj=0; jj<n; jj++)
+			COPY(&m, pC+jj*ldc, &i1, pD+jj*ldd, &i1);
+		}
+//	GEQR2(&m, &n, pD, &ldd, dD, dwork, &info);
+	int lwork = -1;
+	GEQRF(&m, &n, pD, &ldd, dD, dwork, &lwork, &info);
+	lwork = dwork[0];
+	GEQRF(&m, &n, pD, &ldd, dD, dwork, &lwork, &info);
+#endif
+	return;
+	}
+
+
+
+int GELQF_WORK_SIZE_LIBSTR(int m, int n)
+	{
+	REAL dwork;
+	REAL *pD, *dD;
+#if defined(REF_BLAS_BLIS)
+	long long mm = m;
+	long long nn = n;
+	long long lwork = -1;
+	long long info;
+	long long ldd = mm;
+	GELQF(&mm, &nn, pD, &ldd, dD, &dwork, &lwork, &info);
+#else
+	int lwork = -1;
+	int info;
+	int ldd = m;
+	GELQF(&m, &n, pD, &ldd, dD, &dwork, &lwork, &info);
+#endif
+	int size = dwork;
+	return size*sizeof(REAL);
+	}
+
+
+
+void GELQF_LIBSTR(int m, int n, struct STRMAT *sC, int ci, int cj, struct STRMAT *sD, int di, int dj, void *work)
+	{
+	if(m<=0 | n<=0)
+		return;
+	int jj;
+	REAL *pC = sC->pA+ci+cj*sC->m;
+	REAL *pD = sD->pA+di+dj*sD->m;
+	REAL *dD = sD->dA+di;
+	REAL *dwork = (REAL *) work;
+#if defined(REF_BLAS_BLIS)
+	long long i1 = 1;
+	long long info = -1;
+	long long mm = m;
+	long long nn = n;
+	long long ldc = sC->m;
+	long long ldd = sD->m;
+	if(!(pC==pD))
+		{
+		for(jj=0; jj<n; jj++)
+			COPY(&mm, pC+jj*ldc, &i1, pD+jj*ldd, &i1);
+		}
+//	GEQR2(&mm, &nn, pD, &ldd, dD, dwork, &info);
+	long long lwork = -1;
+	GELQF(&mm, &nn, pD, &ldd, dD, dwork, &lwork, &info);
+	lwork = dwork[0];
+	GELQF(&mm, &nn, pD, &ldd, dD, dwork, &lwork, &info);
+#else
+	int i1 = 1;
+	int info = -1;
+	int ldc = sC->m;
+	int ldd = sD->m;
+	if(!(pC==pD))
+		{
+		for(jj=0; jj<n; jj++)
+			COPY(&m, pC+jj*ldc, &i1, pD+jj*ldd, &i1);
+		}
+//	GEQR2(&m, &n, pD, &ldd, dD, dwork, &info);
+	int lwork = -1;
+	GELQF(&m, &n, pD, &ldd, dD, dwork, &lwork, &info);
+	lwork = dwork[0];
+	GELQF(&m, &n, pD, &ldd, dD, dwork, &lwork, &info);
+#endif
+	return;
+	}
+
+
+
+#else
+
+#error : wrong LA choice
+
+#endif
+
+
+
+