Squashed 'third_party/blasfeo/' content from commit 2a828ca

Change-Id: If1c3caa4799b2d4eb287ef83fa17043587ef07a3
git-subtree-dir: third_party/blasfeo
git-subtree-split: 2a828ca5442108c4c58e4b42b061a0469043f6ea
diff --git a/kernel/avx/kernel_dgeqrf_4_lib4.c b/kernel/avx/kernel_dgeqrf_4_lib4.c
new file mode 100644
index 0000000..a5faf20
--- /dev/null
+++ b/kernel/avx/kernel_dgeqrf_4_lib4.c
@@ -0,0 +1,2751 @@
+/**************************************************************************************************
+*                                                                                                 *
+* This file is part of BLASFEO.                                                                   *
+*                                                                                                 *
+* BLASFEO -- BLAS For Embedded Optimization.                                                      *
+* Copyright (C) 2016-2017 by Gianluca Frison.                                                     *
+* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl.              *
+* All rights reserved.                                                                            *
+*                                                                                                 *
+* BLASFEO is free software; you can redistribute it and/or                                        *
+* modify it under the terms of the GNU Lesser General Public                                      *
+* License as published by the Free Software Foundation; either                                    *
+* version 2.1 of the License, or (at your option) any later version.                              *
+*                                                                                                 *
+* BLASFEO is distributed in the hope that it will be useful,                                      *
+* but WITHOUT ANY WARRANTY; without even the implied warranty of                                  *
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.                                            *
+* See the GNU Lesser General Public License for more details.                                     *
+*                                                                                                 *
+* You should have received a copy of the GNU Lesser General Public                                *
+* License along with BLASFEO; if not, write to the Free Software                                  *
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA                  *
+*                                                                                                 *
+* Author: Gianluca Frison, giaf (at) dtu.dk                                                       *
+*                          gianluca.frison (at) imtek.uni-freiburg.de                             *
+*                                                                                                 *
+**************************************************************************************************/
+
+#include <math.h>
+#include <stdio.h>
+
+#include <mmintrin.h>
+#include <xmmintrin.h>  // SSE
+#include <emmintrin.h>  // SSE2
+#include <pmmintrin.h>  // SSE3
+#include <smmintrin.h>  // SSE4
+#include <immintrin.h>  // AVX
+
+#include "../../include/blasfeo_common.h"
+#include "../../include/blasfeo_d_aux.h"
+#include "../../include/blasfeo_d_kernel.h"
+
+
+
+// Unblocked Householder QR factorization of the leading 4 columns of an
+// m x 4 sub-matrix stored in panel-major format: ps=4 is the panel height
+// and moving down one 4-row panel advances the pointer by ps*sdd doubles
+// (hence the ii*sdd terms below, with ii a multiple of 4).
+// On exit the upper triangle holds R, the sub-diagonal part of column j
+// holds the Householder vector v_j (its unit head is implicit), and dD[j]
+// holds the scalar factor tau_j.  The early returns for m==1,2,3 leave the
+// later dD entries untouched -- callers must not read them in that case.
+// NOTE(review): jj is declared but unused in this kernel.
+void kernel_dgeqrf_4_lib4(int m, double *pD, int sdd, double *dD)
+	{
+	int ii, jj, ll;
+	double alpha, beta, tmp, w1, w2, w3;
+	const int ps = 4;
+	// first column
+	// beta accumulates the squared norm of the sub-diagonal part of col 0:
+	// rows 1..3 of the head panel, then full panels of 4, then a tail loop.
+	beta = 0.0;
+	ii = 1;
+	if(m>1)
+		{
+		tmp = pD[1+ps*0];
+		beta += tmp*tmp;
+		if(m>2)
+			{
+			tmp = pD[2+ps*0];
+			beta += tmp*tmp;
+			if(m>3)
+				{
+				tmp = pD[3+ps*0];
+				beta += tmp*tmp;
+				}
+			}
+		}
+	for(ii=4; ii<m-3; ii+=4)
+		{
+		tmp = pD[0+ii*sdd+ps*0];
+		beta += tmp*tmp;
+		tmp = pD[1+ii*sdd+ps*0];
+		beta += tmp*tmp;
+		tmp = pD[2+ii*sdd+ps*0];
+		beta += tmp*tmp;
+		tmp = pD[3+ii*sdd+ps*0];
+		beta += tmp*tmp;
+		}
+	for(ll=0; ll<m-ii; ll++)
+		{
+		tmp = pD[ll+ii*sdd+ps*0];
+		beta += tmp*tmp;
+		}
+	if(beta==0.0)
+		{
+		// column already zero below the diagonal: identity reflector
+		// tau
+		dD[0] = 0.0;
+		}
+	else
+		{
+		alpha = pD[0+ps*0];
+		beta += alpha*alpha;
+		beta = sqrt(beta);
+		// pick sign(beta) = -sign(alpha) to avoid cancellation in alpha-beta
+		if(alpha>0)
+			beta = -beta;
+		// tau0
+		dD[0] = (beta-alpha) / beta;
+		tmp = 1.0 / (alpha-beta);
+		// compute v0 (scale the sub-diagonal entries; diagonal gets beta = R[0][0])
+		pD[0+ps*0] = beta;
+		ii = 1;
+		if(m>1)
+			{
+			pD[1+ps*0] *= tmp;
+			if(m>2)
+				{
+				pD[2+ps*0] *= tmp;
+				if(m>3)
+					{
+					pD[3+ps*0] *= tmp;
+					}
+				}
+			}
+		for(ii=4; ii<m-3; ii+=4)
+			{
+			pD[0+ii*sdd+ps*0] *= tmp;
+			pD[1+ii*sdd+ps*0] *= tmp;
+			pD[2+ii*sdd+ps*0] *= tmp;
+			pD[3+ii*sdd+ps*0] *= tmp;
+			}
+		for(ll=0; ll<m-ii; ll++)
+			{
+			pD[ll+ii*sdd+ps*0] *= tmp;
+			}
+		}
+	// gemv_t & ger
+	// apply H0 = I - tau0*v0*v0^T to columns 1..3:
+	// first w_j = v0^T * col_j (gemv_t), then col_j += (-tau0*w_j) * v0 (ger)
+	w1 = pD[0+ps*1];
+	w2 = pD[0+ps*2];
+	w3 = pD[0+ps*3];
+	if(m>1)
+		{
+		w1 += pD[1+ps*1] * pD[1+ps*0];
+		w2 += pD[1+ps*2] * pD[1+ps*0];
+		w3 += pD[1+ps*3] * pD[1+ps*0];
+		if(m>2)
+			{
+			w1 += pD[2+ps*1] * pD[2+ps*0];
+			w2 += pD[2+ps*2] * pD[2+ps*0];
+			w3 += pD[2+ps*3] * pD[2+ps*0];
+			if(m>3)
+				{
+				w1 += pD[3+ps*1] * pD[3+ps*0];
+				w2 += pD[3+ps*2] * pD[3+ps*0];
+				w3 += pD[3+ps*3] * pD[3+ps*0];
+				}
+			}
+		}
+	for(ii=4; ii<m-3; ii+=4)
+		{
+		w1 += pD[0+ii*sdd+ps*1] * pD[0+ii*sdd+ps*0];
+		w2 += pD[0+ii*sdd+ps*2] * pD[0+ii*sdd+ps*0];
+		w3 += pD[0+ii*sdd+ps*3] * pD[0+ii*sdd+ps*0];
+		w1 += pD[1+ii*sdd+ps*1] * pD[1+ii*sdd+ps*0];
+		w2 += pD[1+ii*sdd+ps*2] * pD[1+ii*sdd+ps*0];
+		w3 += pD[1+ii*sdd+ps*3] * pD[1+ii*sdd+ps*0];
+		w1 += pD[2+ii*sdd+ps*1] * pD[2+ii*sdd+ps*0];
+		w2 += pD[2+ii*sdd+ps*2] * pD[2+ii*sdd+ps*0];
+		w3 += pD[2+ii*sdd+ps*3] * pD[2+ii*sdd+ps*0];
+		w1 += pD[3+ii*sdd+ps*1] * pD[3+ii*sdd+ps*0];
+		w2 += pD[3+ii*sdd+ps*2] * pD[3+ii*sdd+ps*0];
+		w3 += pD[3+ii*sdd+ps*3] * pD[3+ii*sdd+ps*0];
+		}
+	for(ll=0; ll<m-ii; ll++)
+		{
+		w1 += pD[ll+ii*sdd+ps*1] * pD[ll+ii*sdd+ps*0];
+		w2 += pD[ll+ii*sdd+ps*2] * pD[ll+ii*sdd+ps*0];
+		w3 += pD[ll+ii*sdd+ps*3] * pD[ll+ii*sdd+ps*0];
+		}
+	w1 = - dD[0] * w1;
+	w2 = - dD[0] * w2;
+	w3 = - dD[0] * w3;
+	pD[0+ps*1] += w1;
+	pD[0+ps*2] += w2;
+	pD[0+ps*3] += w3;
+	if(m>1)
+		{
+		pD[1+ps*1] += w1 * pD[1+ps*0];
+		pD[1+ps*2] += w2 * pD[1+ps*0];
+		pD[1+ps*3] += w3 * pD[1+ps*0];
+		if(m>2)
+			{
+			pD[2+ps*1] += w1 * pD[2+ps*0];
+			pD[2+ps*2] += w2 * pD[2+ps*0];
+			pD[2+ps*3] += w3 * pD[2+ps*0];
+			if(m>3)
+				{
+				pD[3+ps*1] += w1 * pD[3+ps*0];
+				pD[3+ps*2] += w2 * pD[3+ps*0];
+				pD[3+ps*3] += w3 * pD[3+ps*0];
+				}
+			}
+		}
+	for(ii=4; ii<m-3; ii+=4)
+		{
+		pD[0+ii*sdd+ps*1] += w1 * pD[0+ii*sdd+ps*0];
+		pD[0+ii*sdd+ps*2] += w2 * pD[0+ii*sdd+ps*0];
+		pD[0+ii*sdd+ps*3] += w3 * pD[0+ii*sdd+ps*0];
+		pD[1+ii*sdd+ps*1] += w1 * pD[1+ii*sdd+ps*0];
+		pD[1+ii*sdd+ps*2] += w2 * pD[1+ii*sdd+ps*0];
+		pD[1+ii*sdd+ps*3] += w3 * pD[1+ii*sdd+ps*0];
+		pD[2+ii*sdd+ps*1] += w1 * pD[2+ii*sdd+ps*0];
+		pD[2+ii*sdd+ps*2] += w2 * pD[2+ii*sdd+ps*0];
+		pD[2+ii*sdd+ps*3] += w3 * pD[2+ii*sdd+ps*0];
+		pD[3+ii*sdd+ps*1] += w1 * pD[3+ii*sdd+ps*0];
+		pD[3+ii*sdd+ps*2] += w2 * pD[3+ii*sdd+ps*0];
+		pD[3+ii*sdd+ps*3] += w3 * pD[3+ii*sdd+ps*0];
+		}
+	for(ll=0; ll<m-ii; ll++)
+		{
+		pD[ll+ii*sdd+ps*1] += w1 * pD[ll+ii*sdd+ps*0];
+		pD[ll+ii*sdd+ps*2] += w2 * pD[ll+ii*sdd+ps*0];
+		pD[ll+ii*sdd+ps*3] += w3 * pD[ll+ii*sdd+ps*0];
+		}
+	if(m==1)
+		return;
+	// second column
+	// same procedure on column 1, acting on rows 2..m-1
+	beta = 0.0;
+	if(m>2)
+		{
+		tmp = pD[2+ps*1];
+		beta += tmp*tmp;
+		if(m>3)
+			{
+			tmp = pD[3+ps*1];
+			beta += tmp*tmp;
+			}
+		}
+	for(ii=4; ii<m-3; ii+=4)
+		{
+		tmp = pD[0+ii*sdd+ps*1];
+		beta += tmp*tmp;
+		tmp = pD[1+ii*sdd+ps*1];
+		beta += tmp*tmp;
+		tmp = pD[2+ii*sdd+ps*1];
+		beta += tmp*tmp;
+		tmp = pD[3+ii*sdd+ps*1];
+		beta += tmp*tmp;
+		}
+	for(ll=0; ll<m-ii; ll++)
+		{
+		tmp = pD[ll+ii*sdd+ps*1];
+		beta += tmp*tmp;
+		}
+	if(beta==0.0)
+		{
+		// tau
+		dD[1] = 0.0;
+		}
+	else
+		{
+		alpha = pD[1+ps*1];
+		beta += alpha*alpha;
+		beta = sqrt(beta);
+		if(alpha>0)
+			beta = -beta;
+		// tau0
+		dD[1] = (beta-alpha) / beta;
+		tmp = 1.0 / (alpha-beta);
+		// compute v0
+		pD[1+ps*1] = beta;
+		if(m>2)
+			{
+			pD[2+ps*1] *= tmp;
+			if(m>3)
+				{
+				pD[3+ps*1] *= tmp;
+				}
+			}
+		for(ii=4; ii<m-3; ii+=4)
+			{
+			pD[0+ii*sdd+ps*1] *= tmp;
+			pD[1+ii*sdd+ps*1] *= tmp;
+			pD[2+ii*sdd+ps*1] *= tmp;
+			pD[3+ii*sdd+ps*1] *= tmp;
+			}
+		for(ll=0; ll<m-ii; ll++)
+			{
+			pD[ll+ii*sdd+ps*1] *= tmp;
+			}
+		}
+	// gemv_t & ger
+	// apply H1 to the remaining columns 2..3
+	w2 = pD[1+ps*2];
+	w3 = pD[1+ps*3];
+	if(m>2)
+		{
+		w2 += pD[2+ps*2] * pD[2+ps*1];
+		w3 += pD[2+ps*3] * pD[2+ps*1];
+		if(m>3)
+			{
+			w2 += pD[3+ps*2] * pD[3+ps*1];
+			w3 += pD[3+ps*3] * pD[3+ps*1];
+			}
+		}
+	for(ii=4; ii<m-3; ii+=4)
+		{
+		w2 += pD[0+ii*sdd+ps*2] * pD[0+ii*sdd+ps*1];
+		w3 += pD[0+ii*sdd+ps*3] * pD[0+ii*sdd+ps*1];
+		w2 += pD[1+ii*sdd+ps*2] * pD[1+ii*sdd+ps*1];
+		w3 += pD[1+ii*sdd+ps*3] * pD[1+ii*sdd+ps*1];
+		w2 += pD[2+ii*sdd+ps*2] * pD[2+ii*sdd+ps*1];
+		w3 += pD[2+ii*sdd+ps*3] * pD[2+ii*sdd+ps*1];
+		w2 += pD[3+ii*sdd+ps*2] * pD[3+ii*sdd+ps*1];
+		w3 += pD[3+ii*sdd+ps*3] * pD[3+ii*sdd+ps*1];
+		}
+	for(ll=0; ll<m-ii; ll++)
+		{
+		w2 += pD[ll+ii*sdd+ps*2] * pD[ll+ii*sdd+ps*1];
+		w3 += pD[ll+ii*sdd+ps*3] * pD[ll+ii*sdd+ps*1];
+		}
+	w2 = - dD[1] * w2;
+	w3 = - dD[1] * w3;
+	pD[1+ps*2] += w2;
+	pD[1+ps*3] += w3;
+	if(m>2)
+		{
+		pD[2+ps*2] += w2 * pD[2+ps*1];
+		pD[2+ps*3] += w3 * pD[2+ps*1];
+		if(m>3)
+			{
+			pD[3+ps*2] += w2 * pD[3+ps*1];
+			pD[3+ps*3] += w3 * pD[3+ps*1];
+			}
+		}
+	for(ii=4; ii<m-3; ii+=4)
+		{
+		pD[0+ii*sdd+ps*2] += w2 * pD[0+ii*sdd+ps*1];
+		pD[0+ii*sdd+ps*3] += w3 * pD[0+ii*sdd+ps*1];
+		pD[1+ii*sdd+ps*2] += w2 * pD[1+ii*sdd+ps*1];
+		pD[1+ii*sdd+ps*3] += w3 * pD[1+ii*sdd+ps*1];
+		pD[2+ii*sdd+ps*2] += w2 * pD[2+ii*sdd+ps*1];
+		pD[2+ii*sdd+ps*3] += w3 * pD[2+ii*sdd+ps*1];
+		pD[3+ii*sdd+ps*2] += w2 * pD[3+ii*sdd+ps*1];
+		pD[3+ii*sdd+ps*3] += w3 * pD[3+ii*sdd+ps*1];
+		}
+	for(ll=0; ll<m-ii; ll++)
+		{
+		pD[ll+ii*sdd+ps*2] += w2 * pD[ll+ii*sdd+ps*1];
+		pD[ll+ii*sdd+ps*3] += w3 * pD[ll+ii*sdd+ps*1];
+		}
+	if(m==2)
+		return;
+	// third column
+	// same procedure on column 2, acting on rows 3..m-1
+	beta = 0.0;
+	if(m>3)
+		{
+		tmp = pD[3+ps*2];
+		beta += tmp*tmp;
+		}
+	for(ii=4; ii<m-3; ii+=4)
+		{
+		tmp = pD[0+ii*sdd+ps*2];
+		beta += tmp*tmp;
+		tmp = pD[1+ii*sdd+ps*2];
+		beta += tmp*tmp;
+		tmp = pD[2+ii*sdd+ps*2];
+		beta += tmp*tmp;
+		tmp = pD[3+ii*sdd+ps*2];
+		beta += tmp*tmp;
+		}
+	for(ll=0; ll<m-ii; ll++)
+		{
+		tmp = pD[ll+ii*sdd+ps*2];
+		beta += tmp*tmp;
+		}
+	if(beta==0.0)
+		{
+		// tau
+		dD[2] = 0.0;
+		}
+	else
+		{
+		alpha = pD[2+ps*2];
+		beta += alpha*alpha;
+		beta = sqrt(beta);
+		if(alpha>0)
+			beta = -beta;
+		// tau0
+		dD[2] = (beta-alpha) / beta;
+		tmp = 1.0 / (alpha-beta);
+		// compute v0
+		pD[2+ps*2] = beta;
+		if(m>3)
+			{
+			pD[3+ps*2] *= tmp;
+			}
+		for(ii=4; ii<m-3; ii+=4)
+			{
+			pD[0+ii*sdd+ps*2] *= tmp;
+			pD[1+ii*sdd+ps*2] *= tmp;
+			pD[2+ii*sdd+ps*2] *= tmp;
+			pD[3+ii*sdd+ps*2] *= tmp;
+			}
+		for(ll=0; ll<m-ii; ll++)
+			{
+			pD[ll+ii*sdd+ps*2] *= tmp;
+			}
+		}
+	// gemv_t & ger
+	// apply H2 to the last column
+	w3 = pD[2+ps*3];
+	if(m>3)
+		{
+		w3 += pD[3+ps*3] * pD[3+ps*2];
+		}
+	for(ii=4; ii<m-3; ii+=4)
+		{
+		w3 += pD[0+ii*sdd+ps*3] * pD[0+ii*sdd+ps*2];
+		w3 += pD[1+ii*sdd+ps*3] * pD[1+ii*sdd+ps*2];
+		w3 += pD[2+ii*sdd+ps*3] * pD[2+ii*sdd+ps*2];
+		w3 += pD[3+ii*sdd+ps*3] * pD[3+ii*sdd+ps*2];
+		}
+	for(ll=0; ll<m-ii; ll++)
+		{
+		w3 += pD[ll+ii*sdd+ps*3] * pD[ll+ii*sdd+ps*2];
+		}
+	w3 = - dD[2] * w3;
+	pD[2+ps*3] += w3;
+	if(m>3)
+		{
+		pD[3+ps*3] += w3 * pD[3+ps*2];
+		}
+	for(ii=4; ii<m-3; ii+=4)
+		{
+		pD[0+ii*sdd+ps*3] += w3 * pD[0+ii*sdd+ps*2];
+		pD[1+ii*sdd+ps*3] += w3 * pD[1+ii*sdd+ps*2];
+		pD[2+ii*sdd+ps*3] += w3 * pD[2+ii*sdd+ps*2];
+		pD[3+ii*sdd+ps*3] += w3 * pD[3+ii*sdd+ps*2];
+		}
+	for(ll=0; ll<m-ii; ll++)
+		{
+		pD[ll+ii*sdd+ps*3] += w3 * pD[ll+ii*sdd+ps*2];
+		}
+	if(m==3)
+		return;
+	// fourth column
+	// last reflector: only the norm/scale step, no trailing columns remain
+	beta = 0.0;
+	for(ii=4; ii<m-3; ii+=4)
+		{
+		tmp = pD[0+ii*sdd+ps*3];
+		beta += tmp*tmp;
+		tmp = pD[1+ii*sdd+ps*3];
+		beta += tmp*tmp;
+		tmp = pD[2+ii*sdd+ps*3];
+		beta += tmp*tmp;
+		tmp = pD[3+ii*sdd+ps*3];
+		beta += tmp*tmp;
+		}
+	for(ll=0; ll<m-ii; ll++)
+		{
+		tmp = pD[ll+ii*sdd+ps*3];
+		beta += tmp*tmp;
+		}
+	if(beta==0.0)
+		{
+		// tau
+		dD[3] = 0.0;
+		}
+	else
+		{
+		alpha = pD[3+ps*3];
+		beta += alpha*alpha;
+		beta = sqrt(beta);
+		if(alpha>0)
+			beta = -beta;
+		// tau0
+		dD[3] = (beta-alpha) / beta;
+		tmp = 1.0 / (alpha-beta);
+		// compute v0
+		pD[3+ps*3] = beta;
+		for(ii=4; ii<m-3; ii+=4)
+			{
+			pD[0+ii*sdd+ps*3] *= tmp;
+			pD[1+ii*sdd+ps*3] *= tmp;
+			pD[2+ii*sdd+ps*3] *= tmp;
+			pD[3+ii*sdd+ps*3] *= tmp;
+			}
+		for(ll=0; ll<m-ii; ll++)
+			{
+			pD[ll+ii*sdd+ps*3] *= tmp;
+			}
+		}
+	return;
+	}
+
+
+// unblocked algorithm
+// Variable-size unblocked Householder QR: computes k reflectors of an
+// m x n sub-matrix stored in panel-major format.  offD is the row offset
+// of the sub-matrix inside its 4-row panel (pD points at the sub-matrix,
+// pD0 below backs up to the panel top), sdd is the panel stride, and
+// dD[0..k-1] receive the taus.  Reflectors overwrite the sub-diagonal
+// entries of each processed column, as in kernel_dgeqrf_4_lib4.
+void kernel_dgeqrf_vs_lib4(int m, int n, int k, int offD, double *pD, int sdd, double *dD)
+	{
+	if(m<=0 | n<=0)	// bitwise | of 0/1 comparison results; same effect as || here
+		return;
+	int ii, jj, kk, ll, imax, jmax, jmax0, kmax, kmax0;
+	const int ps = 4;
+	imax = k;//m<n ? m : n;
+	double alpha, beta, tmp, w0;
+	double *pC00, *pC10, *pC01, *pC11;
+	int offset;
+	double *pD0 = pD-offD;	// rewind to the top of the head panel
+	for(ii=0; ii<imax; ii++)
+		{
+		// pC00 -> diagonal element (ii,ii) of the sub-matrix, pC10 -> the
+		// element below it; the &(ps-1) masks split the global row index
+		// into in-panel row and panel base (panel base * sdd doubles).
+		pC00 = &pD0[((offD+ii)&(ps-1))+((offD+ii)-((offD+ii)&(ps-1)))*sdd+ii*ps];
+		pC10 = &pD0[((offD+ii+1)&(ps-1))+((offD+ii+1)-((offD+ii+1)&(ps-1)))*sdd+ii*ps];
+		beta = 0.0;
+		jmax = m-ii-1;	// rows below the diagonal
+		jmax0 = (ps-((ii+1+offD)&(ps-1)))&(ps-1);	// rows left in the (partial) head panel
+		jmax0 = jmax<jmax0 ? jmax : jmax0;
+		offset = 0;
+		jj = 0;
+		// squared norm of the sub-diagonal part of column ii:
+		// head-panel remainder, full panels of 4, then a scalar tail
+		if(jmax0>0)
+			{
+			for( ; jj<jmax0; jj++)
+				{
+				tmp = pC10[0+offset];
+				beta += tmp*tmp;
+				offset += 1;
+				}
+			offset += -ps+ps*sdd;	// jump from panel bottom to next panel top
+			}
+		for( ; jj<jmax-3; jj+=4)
+			{
+			tmp = pC10[0+offset];
+			beta += tmp*tmp;
+			tmp = pC10[1+offset];
+			beta += tmp*tmp;
+			tmp = pC10[2+offset];
+			beta += tmp*tmp;
+			tmp = pC10[3+offset];
+			beta += tmp*tmp;
+			offset += ps*sdd;
+			}
+		for(ll=0; ll<jmax-jj; ll++)
+			{
+			tmp = pC10[0+offset];
+			beta += tmp*tmp;
+			offset += 1;
+			}
+		if(beta==0.0)
+			{
+			// column already zero below the diagonal: identity reflector
+			dD[ii] = 0.0;
+			}
+		else
+			{
+			alpha = pC00[0];
+			beta += alpha*alpha;
+			beta = sqrt(beta);
+			// pick sign(beta) = -sign(alpha) to avoid cancellation
+			if(alpha>0)
+				beta = -beta;
+			dD[ii] = (beta-alpha) / beta;
+			tmp = 1.0 / (alpha-beta);
+			offset = 0;
+			jj = 0;
+			// scale the sub-diagonal entries to form the reflector vector
+			if(jmax0>0)
+				{
+				for( ; jj<jmax0; jj++)
+					{
+					pC10[0+offset] *= tmp;
+					offset += 1;
+					}
+				offset += -ps+ps*sdd;
+				}
+			for( ; jj<jmax-3; jj+=4)
+				{
+				pC10[0+offset] *= tmp;
+				pC10[1+offset] *= tmp;
+				pC10[2+offset] *= tmp;
+				pC10[3+offset] *= tmp;
+				offset += ps*sdd;
+				}
+			for(ll=0; ll<jmax-jj; ll++)
+				{
+				pC10[0+offset] *= tmp;
+				offset += 1;
+				}
+			pC00[0] = beta;	// diagonal entry of R
+			}
+		if(ii<n)
+			{
+			// apply H_ii = I - tau*v*v^T to the trailing columns ii+1..n-1
+			pC01 = pC00 + ps;
+			pC11 = pC10 + ps;
+			kmax = jmax;
+			kmax0 = jmax0;
+			jmax = n-ii-1;
+			jj = 0;
+			for( ; jj<jmax; jj++)
+				{
+				// w0 = v^T * column (implicit unit head contributes pC01)
+				w0 = pC01[0+ps*jj] * 1.0;
+				offset = 0;
+				kk = 0;
+				if(kmax0>0)
+					{
+					for( ; kk<kmax0; kk++)
+						{
+						w0 += pC11[0+offset+ps*jj] * pC10[0+offset];
+						offset += 1;
+						}
+					offset += -ps+ps*sdd;
+					}
+				for( ; kk<kmax-3; kk+=4)
+					{
+					w0 += pC11[0+offset+ps*jj] * pC10[0+offset];
+					w0 += pC11[1+offset+ps*jj] * pC10[1+offset];
+					w0 += pC11[2+offset+ps*jj] * pC10[2+offset];
+					w0 += pC11[3+offset+ps*jj] * pC10[3+offset];
+					offset += ps*sdd;
+					}
+				for(ll=0; ll<kmax-kk; ll++)
+					{
+					w0 += pC11[0+offset+ps*jj] * pC10[0+offset];
+					offset += 1;
+					}
+				// column += (-tau*w0) * v
+				w0 = - dD[ii] * w0;
+				pC01[0+ps*jj] += w0;
+				offset = 0;
+				kk = 0;
+				if(kmax0>0)
+					{
+					for( ; kk<kmax0; kk++)
+						{
+						pC11[0+offset+ps*jj] += w0 * pC10[0+offset];
+						offset += 1;
+						}
+					offset = offset-ps+ps*sdd;
+					}
+				for( ; kk<kmax-3; kk+=4)
+					{
+					pC11[0+offset+ps*jj] += w0 * pC10[0+offset];
+					pC11[1+offset+ps*jj] += w0 * pC10[1+offset];
+					pC11[2+offset+ps*jj] += w0 * pC10[2+offset];
+					pC11[3+offset+ps*jj] += w0 * pC10[3+offset];
+					offset += ps*sdd;
+					}
+				for(ll=0; ll<kmax-kk; ll++)
+					{
+					pC11[0+offset+ps*jj] += w0 * pC10[0+offset];
+					offset += 1;
+					}
+				}
+			}
+		}
+	return;
+	}
+
+
+
+// Applies the 4 Householder reflectors stored below the diagonal of the
+// m x 4 panel-major matrix pD (taus in dD[0..3], unit heads implicit) to
+// the m x n panel-major matrix pC0, compact-WY style: a 4x4 lower
+// triangular factor T (taus on the diagonal) is built from the pairwise
+// dot products of the reflector columns, then for each column of C
+// W^T = C^T*V, W^T is multiplied by T row-wise, and C -= V*W^T.
+// Columns of C are processed two at a time, with a scalar tail loop.
+// NOTE(review): dD is read at indices 0..3 unconditionally -- assumes the
+// caller always provides 4 taus, even for m<4; confirm against callers.
+void kernel_dlarf_4_lib4(int m, int n, double *pD, int sdd, double *dD, double *pC0, int sdc)
+	{
+	if(m<=0 | n<=0)	// bitwise | of 0/1 comparison results; same effect as || here
+		return;
+	int ii, jj, ll;
+	const int ps = 4;
+	double v10,
+	       v20, v21,
+		   v30, v31, v32;
+	double tmp, d0, d1, d2, d3;
+	double *pC;
+	double pT[16];// = {};
+	int ldt = 4;
+	double pW[8];// = {};	// W^T for 2 columns of C: 2x4, leading dimension ldw
+	int ldw = 2;
+	// dot product of v
+	// vIJ = v_J^T * v_I over rows where both vectors are defined;
+	// the 1.0 factors account for the implicit unit head of each v
+	v10 = 0.0;
+	v20 = 0.0;
+	v30 = 0.0;
+	v21 = 0.0;
+	v31 = 0.0;
+	v32 = 0.0;
+	if(m>1)
+		{
+		v10 = 1.0 * pD[1+ps*0];
+		if(m>2)
+			{
+			v10 += pD[2+ps*1] * pD[2+ps*0];
+			v20 = 1.0 * pD[2+ps*0];
+			v21 = 1.0 * pD[2+ps*1];
+			if(m>3)
+				{
+				v10 += pD[3+ps*1] * pD[3+ps*0];
+				v20 += pD[3+ps*2] * pD[3+ps*0];
+				v21 += pD[3+ps*2] * pD[3+ps*1];
+				v30 = 1.0 * pD[3+ps*0];
+				v31 = 1.0 * pD[3+ps*1];
+				v32 = 1.0 * pD[3+ps*2];
+				}
+			}
+		}
+	for(ii=4; ii<m-3; ii+=4)
+		{
+		v10 += pD[0+ii*sdd+ps*1] * pD[0+ii*sdd+ps*0];
+		v20 += pD[0+ii*sdd+ps*2] * pD[0+ii*sdd+ps*0];
+		v21 += pD[0+ii*sdd+ps*2] * pD[0+ii*sdd+ps*1];
+		v30 += pD[0+ii*sdd+ps*3] * pD[0+ii*sdd+ps*0];
+		v31 += pD[0+ii*sdd+ps*3] * pD[0+ii*sdd+ps*1];
+		v32 += pD[0+ii*sdd+ps*3] * pD[0+ii*sdd+ps*2];
+		v10 += pD[1+ii*sdd+ps*1] * pD[1+ii*sdd+ps*0];
+		v20 += pD[1+ii*sdd+ps*2] * pD[1+ii*sdd+ps*0];
+		v21 += pD[1+ii*sdd+ps*2] * pD[1+ii*sdd+ps*1];
+		v30 += pD[1+ii*sdd+ps*3] * pD[1+ii*sdd+ps*0];
+		v31 += pD[1+ii*sdd+ps*3] * pD[1+ii*sdd+ps*1];
+		v32 += pD[1+ii*sdd+ps*3] * pD[1+ii*sdd+ps*2];
+		v10 += pD[2+ii*sdd+ps*1] * pD[2+ii*sdd+ps*0];
+		v20 += pD[2+ii*sdd+ps*2] * pD[2+ii*sdd+ps*0];
+		v21 += pD[2+ii*sdd+ps*2] * pD[2+ii*sdd+ps*1];
+		v30 += pD[2+ii*sdd+ps*3] * pD[2+ii*sdd+ps*0];
+		v31 += pD[2+ii*sdd+ps*3] * pD[2+ii*sdd+ps*1];
+		v32 += pD[2+ii*sdd+ps*3] * pD[2+ii*sdd+ps*2];
+		v10 += pD[3+ii*sdd+ps*1] * pD[3+ii*sdd+ps*0];
+		v20 += pD[3+ii*sdd+ps*2] * pD[3+ii*sdd+ps*0];
+		v21 += pD[3+ii*sdd+ps*2] * pD[3+ii*sdd+ps*1];
+		v30 += pD[3+ii*sdd+ps*3] * pD[3+ii*sdd+ps*0];
+		v31 += pD[3+ii*sdd+ps*3] * pD[3+ii*sdd+ps*1];
+		v32 += pD[3+ii*sdd+ps*3] * pD[3+ii*sdd+ps*2];
+		}
+	for(ll=0; ll<m-ii; ll++)
+		{
+		v10 += pD[ll+ii*sdd+ps*1] * pD[ll+ii*sdd+ps*0];
+		v20 += pD[ll+ii*sdd+ps*2] * pD[ll+ii*sdd+ps*0];
+		v21 += pD[ll+ii*sdd+ps*2] * pD[ll+ii*sdd+ps*1];
+		v30 += pD[ll+ii*sdd+ps*3] * pD[ll+ii*sdd+ps*0];
+		v31 += pD[ll+ii*sdd+ps*3] * pD[ll+ii*sdd+ps*1];
+		v32 += pD[ll+ii*sdd+ps*3] * pD[ll+ii*sdd+ps*2];
+		}
+	// compute lower triangular T containing tau for matrix update
+	// (built column by column from the vIJ dot products, as in dlarft)
+	pT[0+ldt*0] = dD[0];
+	pT[1+ldt*1] = dD[1];
+	pT[2+ldt*2] = dD[2];
+	pT[3+ldt*3] = dD[3];
+	pT[1+ldt*0] = - dD[1] * (v10*pT[0+ldt*0]);
+	pT[2+ldt*1] = - dD[2] * (v21*pT[1+ldt*1]);
+	pT[3+ldt*2] = - dD[3] * (v32*pT[2+ldt*2]);
+	pT[2+ldt*0] = - dD[2] * (v20*pT[0+ldt*0] + v21*pT[1+ldt*0]);
+	pT[3+ldt*1] = - dD[3] * (v31*pT[1+ldt*1] + v32*pT[2+ldt*1]);
+	pT[3+ldt*0] = - dD[3] * (v30*pT[0+ldt*0] + v31*pT[1+ldt*0] + v32*pT[2+ldt*0]);
+	// downgrade matrix
+	// zero pW so the entries not written for m<4 are harmless below
+	pW[0] = 0.0;
+	pW[1] = 0.0;
+	pW[2] = 0.0;
+	pW[3] = 0.0;
+	pW[4] = 0.0;
+	pW[5] = 0.0;
+	pW[6] = 0.0;
+	pW[7] = 0.0;
+	ii = 0;
+	// main loop: two columns of C per iteration
+	for( ; ii<n-1; ii+=2)
+		{
+		pC = pC0+ii*ps;
+		// compute W^T = C^T * V
+		tmp = pC[0+ps*0];
+		pW[0+ldw*0] = tmp;
+		tmp = pC[0+ps*1];
+		pW[1+ldw*0] = tmp;
+		if(m>1)
+			{
+			d0 = pD[1+ps*0];
+			tmp = pC[1+ps*0];
+			pW[0+ldw*0] += tmp * d0;
+			pW[0+ldw*1] = tmp;
+			tmp = pC[1+ps*1];
+			pW[1+ldw*0] += tmp * d0;
+			pW[1+ldw*1] = tmp;
+			if(m>2)
+				{
+				d0 = pD[2+ps*0];
+				d1 = pD[2+ps*1];
+				tmp = pC[2+ps*0];
+				pW[0+ldw*0] += tmp * d0;
+				pW[0+ldw*1] += tmp * d1;
+				pW[0+ldw*2] = tmp;
+				tmp = pC[2+ps*1];
+				pW[1+ldw*0] += tmp * d0;
+				pW[1+ldw*1] += tmp * d1;
+				pW[1+ldw*2] = tmp;
+				if(m>3)
+					{
+					d0 = pD[3+ps*0];
+					d1 = pD[3+ps*1];
+					d2 = pD[3+ps*2];
+					tmp = pC[3+ps*0];
+					pW[0+ldw*0] += tmp * d0;
+					pW[0+ldw*1] += tmp * d1;
+					pW[0+ldw*2] += tmp * d2;
+					pW[0+ldw*3] = tmp;
+					tmp = pC[3+ps*1];
+					pW[1+ldw*0] += tmp * d0;
+					pW[1+ldw*1] += tmp * d1;
+					pW[1+ldw*2] += tmp * d2;
+					pW[1+ldw*3] = tmp;
+					}
+				}
+			}
+		for(jj=4; jj<m-3; jj+=4)
+			{
+			//
+			d0 = pD[0+jj*sdd+ps*0];
+			d1 = pD[0+jj*sdd+ps*1];
+			d2 = pD[0+jj*sdd+ps*2];
+			d3 = pD[0+jj*sdd+ps*3];
+			tmp = pC[0+jj*sdc+ps*0];
+			pW[0+ldw*0] += tmp * d0;
+			pW[0+ldw*1] += tmp * d1;
+			pW[0+ldw*2] += tmp * d2;
+			pW[0+ldw*3] += tmp * d3;
+			tmp = pC[0+jj*sdc+ps*1];
+			pW[1+ldw*0] += tmp * d0;
+			pW[1+ldw*1] += tmp * d1;
+			pW[1+ldw*2] += tmp * d2;
+			pW[1+ldw*3] += tmp * d3;
+			//
+			d0 = pD[1+jj*sdd+ps*0];
+			d1 = pD[1+jj*sdd+ps*1];
+			d2 = pD[1+jj*sdd+ps*2];
+			d3 = pD[1+jj*sdd+ps*3];
+			tmp = pC[1+jj*sdc+ps*0];
+			pW[0+ldw*0] += tmp * d0;
+			pW[0+ldw*1] += tmp * d1;
+			pW[0+ldw*2] += tmp * d2;
+			pW[0+ldw*3] += tmp * d3;
+			tmp = pC[1+jj*sdc+ps*1];
+			pW[1+ldw*0] += tmp * d0;
+			pW[1+ldw*1] += tmp * d1;
+			pW[1+ldw*2] += tmp * d2;
+			pW[1+ldw*3] += tmp * d3;
+			//
+			d0 = pD[2+jj*sdd+ps*0];
+			d1 = pD[2+jj*sdd+ps*1];
+			d2 = pD[2+jj*sdd+ps*2];
+			d3 = pD[2+jj*sdd+ps*3];
+			tmp = pC[2+jj*sdc+ps*0];
+			pW[0+ldw*0] += tmp * d0;
+			pW[0+ldw*1] += tmp * d1;
+			pW[0+ldw*2] += tmp * d2;
+			pW[0+ldw*3] += tmp * d3;
+			tmp = pC[2+jj*sdc+ps*1];
+			pW[1+ldw*0] += tmp * d0;
+			pW[1+ldw*1] += tmp * d1;
+			pW[1+ldw*2] += tmp * d2;
+			pW[1+ldw*3] += tmp * d3;
+			//
+			d0 = pD[3+jj*sdd+ps*0];
+			d1 = pD[3+jj*sdd+ps*1];
+			d2 = pD[3+jj*sdd+ps*2];
+			d3 = pD[3+jj*sdd+ps*3];
+			tmp = pC[3+jj*sdc+ps*0];
+			pW[0+ldw*0] += tmp * d0;
+			pW[0+ldw*1] += tmp * d1;
+			pW[0+ldw*2] += tmp * d2;
+			pW[0+ldw*3] += tmp * d3;
+			tmp = pC[3+jj*sdc+ps*1];
+			pW[1+ldw*0] += tmp * d0;
+			pW[1+ldw*1] += tmp * d1;
+			pW[1+ldw*2] += tmp * d2;
+			pW[1+ldw*3] += tmp * d3;
+			}
+		for(ll=0; ll<m-jj; ll++)
+			{
+			d0 = pD[ll+jj*sdd+ps*0];
+			d1 = pD[ll+jj*sdd+ps*1];
+			d2 = pD[ll+jj*sdd+ps*2];
+			d3 = pD[ll+jj*sdd+ps*3];
+			tmp = pC[ll+jj*sdc+ps*0];
+			pW[0+ldw*0] += tmp * d0;
+			pW[0+ldw*1] += tmp * d1;
+			pW[0+ldw*2] += tmp * d2;
+			pW[0+ldw*3] += tmp * d3;
+			tmp = pC[ll+jj*sdc+ps*1];
+			pW[1+ldw*0] += tmp * d0;
+			pW[1+ldw*1] += tmp * d1;
+			pW[1+ldw*2] += tmp * d2;
+			pW[1+ldw*3] += tmp * d3;
+			}
+		// compute W^T *= T
+		// (in decreasing row order so each row reads the not-yet-updated ones)
+		pW[0+ldw*3] = pT[3+ldt*0]*pW[0+ldw*0] + pT[3+ldt*1]*pW[0+ldw*1] + pT[3+ldt*2]*pW[0+ldw*2] + pT[3+ldt*3]*pW[0+ldw*3];
+		pW[1+ldw*3] = pT[3+ldt*0]*pW[1+ldw*0] + pT[3+ldt*1]*pW[1+ldw*1] + pT[3+ldt*2]*pW[1+ldw*2] + pT[3+ldt*3]*pW[1+ldw*3];
+		pW[0+ldw*2] = pT[2+ldt*0]*pW[0+ldw*0] + pT[2+ldt*1]*pW[0+ldw*1] + pT[2+ldt*2]*pW[0+ldw*2];
+		pW[1+ldw*2] = pT[2+ldt*0]*pW[1+ldw*0] + pT[2+ldt*1]*pW[1+ldw*1] + pT[2+ldt*2]*pW[1+ldw*2];
+		pW[0+ldw*1] = pT[1+ldt*0]*pW[0+ldw*0] + pT[1+ldt*1]*pW[0+ldw*1];
+		pW[1+ldw*1] = pT[1+ldt*0]*pW[1+ldw*0] + pT[1+ldt*1]*pW[1+ldw*1];
+		pW[0+ldw*0] = pT[0+ldt*0]*pW[0+ldw*0];
+		pW[1+ldw*0] = pT[0+ldt*0]*pW[1+ldw*0];
+		// compute C -= V * W^T
+		pC[0+ps*0] -= pW[0+ldw*0];
+		pC[0+ps*1] -= pW[1+ldw*0];
+		if(m>1)
+			{
+			pC[1+ps*0] -= pD[1+ps*0]*pW[0+ldw*0] + pW[0+ldw*1];
+			pC[1+ps*1] -= pD[1+ps*0]*pW[1+ldw*0] + pW[1+ldw*1];
+			if(m>2)
+				{
+				pC[2+ps*0] -= pD[2+ps*0]*pW[0+ldw*0] + pD[2+ps*1]*pW[0+ldw*1] + pW[0+ldw*2];
+				pC[2+ps*1] -= pD[2+ps*0]*pW[1+ldw*0] + pD[2+ps*1]*pW[1+ldw*1] + pW[1+ldw*2];
+				if(m>3)
+					{
+					pC[3+ps*0] -= pD[3+ps*0]*pW[0+ldw*0] + pD[3+ps*1]*pW[0+ldw*1] + pD[3+ps*2]*pW[0+ldw*2] + pW[0+ldw*3];
+					pC[3+ps*1] -= pD[3+ps*0]*pW[1+ldw*0] + pD[3+ps*1]*pW[1+ldw*1] + pD[3+ps*2]*pW[1+ldw*2] + pW[1+ldw*3];
+					}
+				}
+			}
+		for(jj=4; jj<m-3; jj+=4)
+			{
+			//
+			d0 = pD[0+jj*sdd+ps*0];
+			d1 = pD[0+jj*sdd+ps*1];
+			d2 = pD[0+jj*sdd+ps*2];
+			d3 = pD[0+jj*sdd+ps*3];
+			pC[0+jj*sdc+ps*0] -= d0*pW[0+ldw*0] + d1*pW[0+ldw*1] + d2*pW[0+ldw*2] + d3*pW[0+ldw*3];
+			pC[0+jj*sdc+ps*1] -= d0*pW[1+ldw*0] + d1*pW[1+ldw*1] + d2*pW[1+ldw*2] + d3*pW[1+ldw*3];
+			//
+			d0 = pD[1+jj*sdd+ps*0];
+			d1 = pD[1+jj*sdd+ps*1];
+			d2 = pD[1+jj*sdd+ps*2];
+			d3 = pD[1+jj*sdd+ps*3];
+			pC[1+jj*sdc+ps*0] -= d0*pW[0+ldw*0] + d1*pW[0+ldw*1] + d2*pW[0+ldw*2] + d3*pW[0+ldw*3];
+			pC[1+jj*sdc+ps*1] -= d0*pW[1+ldw*0] + d1*pW[1+ldw*1] + d2*pW[1+ldw*2] + d3*pW[1+ldw*3];
+			//
+			d0 = pD[2+jj*sdd+ps*0];
+			d1 = pD[2+jj*sdd+ps*1];
+			d2 = pD[2+jj*sdd+ps*2];
+			d3 = pD[2+jj*sdd+ps*3];
+			pC[2+jj*sdc+ps*0] -= d0*pW[0+ldw*0] + d1*pW[0+ldw*1] + d2*pW[0+ldw*2] + d3*pW[0+ldw*3];
+			pC[2+jj*sdc+ps*1] -= d0*pW[1+ldw*0] + d1*pW[1+ldw*1] + d2*pW[1+ldw*2] + d3*pW[1+ldw*3];
+			//
+			d0 = pD[3+jj*sdd+ps*0];
+			d1 = pD[3+jj*sdd+ps*1];
+			d2 = pD[3+jj*sdd+ps*2];
+			d3 = pD[3+jj*sdd+ps*3];
+			pC[3+jj*sdc+ps*0] -= d0*pW[0+ldw*0] + d1*pW[0+ldw*1] + d2*pW[0+ldw*2] + d3*pW[0+ldw*3];
+			pC[3+jj*sdc+ps*1] -= d0*pW[1+ldw*0] + d1*pW[1+ldw*1] + d2*pW[1+ldw*2] + d3*pW[1+ldw*3];
+			}
+		for(ll=0; ll<m-jj; ll++)
+			{
+			d0 = pD[ll+jj*sdd+ps*0];
+			d1 = pD[ll+jj*sdd+ps*1];
+			d2 = pD[ll+jj*sdd+ps*2];
+			d3 = pD[ll+jj*sdd+ps*3];
+			pC[ll+jj*sdc+ps*0] -= d0*pW[0+ldw*0] + d1*pW[0+ldw*1] + d2*pW[0+ldw*2] + d3*pW[0+ldw*3];
+			pC[ll+jj*sdc+ps*1] -= d0*pW[1+ldw*0] + d1*pW[1+ldw*1] + d2*pW[1+ldw*2] + d3*pW[1+ldw*3];
+			}
+		}
+	// tail: remaining single column of C (n odd)
+	for( ; ii<n; ii++)
+		{
+		pC = pC0+ii*ps;
+		// compute W^T = C^T * V
+		tmp = pC[0+ps*0];
+		pW[0+ldw*0] = tmp;
+		if(m>1)
+			{
+			tmp = pC[1+ps*0];
+			pW[0+ldw*0] += tmp * pD[1+ps*0];
+			pW[0+ldw*1] = tmp;
+			if(m>2)
+				{
+				tmp = pC[2+ps*0];
+				pW[0+ldw*0] += tmp * pD[2+ps*0];
+				pW[0+ldw*1] += tmp * pD[2+ps*1];
+				pW[0+ldw*2] = tmp;
+				if(m>3)
+					{
+					tmp = pC[3+ps*0];
+					pW[0+ldw*0] += tmp * pD[3+ps*0];
+					pW[0+ldw*1] += tmp * pD[3+ps*1];
+					pW[0+ldw*2] += tmp * pD[3+ps*2];
+					pW[0+ldw*3] = tmp;
+					}
+				}
+			}
+		for(jj=4; jj<m-3; jj+=4)
+			{
+			tmp = pC[0+jj*sdc+ps*0];
+			pW[0+ldw*0] += tmp * pD[0+jj*sdd+ps*0];
+			pW[0+ldw*1] += tmp * pD[0+jj*sdd+ps*1];
+			pW[0+ldw*2] += tmp * pD[0+jj*sdd+ps*2];
+			pW[0+ldw*3] += tmp * pD[0+jj*sdd+ps*3];
+			tmp = pC[1+jj*sdc+ps*0];
+			pW[0+ldw*0] += tmp * pD[1+jj*sdd+ps*0];
+			pW[0+ldw*1] += tmp * pD[1+jj*sdd+ps*1];
+			pW[0+ldw*2] += tmp * pD[1+jj*sdd+ps*2];
+			pW[0+ldw*3] += tmp * pD[1+jj*sdd+ps*3];
+			tmp = pC[2+jj*sdc+ps*0];
+			pW[0+ldw*0] += tmp * pD[2+jj*sdd+ps*0];
+			pW[0+ldw*1] += tmp * pD[2+jj*sdd+ps*1];
+			pW[0+ldw*2] += tmp * pD[2+jj*sdd+ps*2];
+			pW[0+ldw*3] += tmp * pD[2+jj*sdd+ps*3];
+			tmp = pC[3+jj*sdc+ps*0];
+			pW[0+ldw*0] += tmp * pD[3+jj*sdd+ps*0];
+			pW[0+ldw*1] += tmp * pD[3+jj*sdd+ps*1];
+			pW[0+ldw*2] += tmp * pD[3+jj*sdd+ps*2];
+			pW[0+ldw*3] += tmp * pD[3+jj*sdd+ps*3];
+			}
+		for(ll=0; ll<m-jj; ll++)
+			{
+			tmp = pC[ll+jj*sdc+ps*0];
+			pW[0+ldw*0] += tmp * pD[ll+jj*sdd+ps*0];
+			pW[0+ldw*1] += tmp * pD[ll+jj*sdd+ps*1];
+			pW[0+ldw*2] += tmp * pD[ll+jj*sdd+ps*2];
+			pW[0+ldw*3] += tmp * pD[ll+jj*sdd+ps*3];
+			}
+		// compute W^T *= T
+		pW[0+ldw*3] = pT[3+ldt*0]*pW[0+ldw*0] + pT[3+ldt*1]*pW[0+ldw*1] + pT[3+ldt*2]*pW[0+ldw*2] + pT[3+ldt*3]*pW[0+ldw*3];
+		pW[0+ldw*2] = pT[2+ldt*0]*pW[0+ldw*0] + pT[2+ldt*1]*pW[0+ldw*1] + pT[2+ldt*2]*pW[0+ldw*2];
+		pW[0+ldw*1] = pT[1+ldt*0]*pW[0+ldw*0] + pT[1+ldt*1]*pW[0+ldw*1];
+		pW[0+ldw*0] = pT[0+ldt*0]*pW[0+ldw*0];
+		// compute C -= V * W^T
+		pC[0+ps*0] -= pW[0+ldw*0];
+		if(m>1)
+			{
+			pC[1+ps*0] -= pD[1+ps*0]*pW[0+ldw*0] + pW[0+ldw*1];
+			if(m>2)
+				{
+				pC[2+ps*0] -= pD[2+ps*0]*pW[0+ldw*0] + pD[2+ps*1]*pW[0+ldw*1] + pW[0+ldw*2];
+				if(m>3)
+					{
+					pC[3+ps*0] -= pD[3+ps*0]*pW[0+ldw*0] + pD[3+ps*1]*pW[0+ldw*1] + pD[3+ps*2]*pW[0+ldw*2] + pW[0+ldw*3];
+					}
+				}
+			}
+		for(jj=4; jj<m-3; jj+=4)
+			{
+			pC[0+jj*sdc+ps*0] -= pD[0+jj*sdd+ps*0]*pW[0+ldw*0] + pD[0+jj*sdd+ps*1]*pW[0+ldw*1] + pD[0+jj*sdd+ps*2]*pW[0+ldw*2] + pD[0+jj*sdd+ps*3]*pW[0+ldw*3];
+			pC[1+jj*sdc+ps*0] -= pD[1+jj*sdd+ps*0]*pW[0+ldw*0] + pD[1+jj*sdd+ps*1]*pW[0+ldw*1] + pD[1+jj*sdd+ps*2]*pW[0+ldw*2] + pD[1+jj*sdd+ps*3]*pW[0+ldw*3];
+			pC[2+jj*sdc+ps*0] -= pD[2+jj*sdd+ps*0]*pW[0+ldw*0] + pD[2+jj*sdd+ps*1]*pW[0+ldw*1] + pD[2+jj*sdd+ps*2]*pW[0+ldw*2] + pD[2+jj*sdd+ps*3]*pW[0+ldw*3];
+			pC[3+jj*sdc+ps*0] -= pD[3+jj*sdd+ps*0]*pW[0+ldw*0] + pD[3+jj*sdd+ps*1]*pW[0+ldw*1] + pD[3+jj*sdd+ps*2]*pW[0+ldw*2] + pD[3+jj*sdd+ps*3]*pW[0+ldw*3];
+			}
+		for(ll=0; ll<m-jj; ll++)
+			{
+			pC[ll+jj*sdc+ps*0] -= pD[ll+jj*sdd+ps*0]*pW[0+ldw*0] + pD[ll+jj*sdd+ps*1]*pW[0+ldw*1] + pD[ll+jj*sdd+ps*2]*pW[0+ldw*2] + pD[ll+jj*sdd+ps*3]*pW[0+ldw*3];
+			}
+		}
+
+	return;
+	}
+
+
+
+void kernel_dlarf_t_4_lib4(int m, int n, double *pD, int sdd, double *pVt, double *dD, double *pC0, int sdc, double *pW0)
+	{
+	if(m<=0 | n<=0)
+		return;
+	int ii, jj, ll;
+	const int ps = 4;
+	double v10,
+	       v20, v21,
+		   v30, v31, v32;
+	double c00, c01,
+	       c10, c11,
+	       c20, c21,
+	       c30, c31;
+	double a0, a1, a2, a3, b0, b1;
+	double tmp, d0, d1, d2, d3;
+	double *pC, *pW;
+	double pT[16];// = {};
+	int ldt = 4;
+	// dot product of v
+	v10 = 0.0;
+	v20 = 0.0;
+	v30 = 0.0;
+	v21 = 0.0;
+	v31 = 0.0;
+	v32 = 0.0;
+	if(m>1)
+		{
+		v10 = 1.0 * pD[1+ps*0];
+		if(m>2)
+			{
+			v10 += pD[2+ps*1] * pD[2+ps*0];
+			v20 = 1.0 * pD[2+ps*0];
+			v21 = 1.0 * pD[2+ps*1];
+			if(m>3)
+				{
+				v10 += pD[3+ps*1] * pD[3+ps*0];
+				v20 += pD[3+ps*2] * pD[3+ps*0];
+				v21 += pD[3+ps*2] * pD[3+ps*1];
+				v30 = 1.0 * pD[3+ps*0];
+				v31 = 1.0 * pD[3+ps*1];
+				v32 = 1.0 * pD[3+ps*2];
+				}
+			}
+		}
+	for(ii=4; ii<m-3; ii+=4)
+		{
+		v10 += pD[0+ii*sdd+ps*1] * pD[0+ii*sdd+ps*0];
+		v20 += pD[0+ii*sdd+ps*2] * pD[0+ii*sdd+ps*0];
+		v21 += pD[0+ii*sdd+ps*2] * pD[0+ii*sdd+ps*1];
+		v30 += pD[0+ii*sdd+ps*3] * pD[0+ii*sdd+ps*0];
+		v31 += pD[0+ii*sdd+ps*3] * pD[0+ii*sdd+ps*1];
+		v32 += pD[0+ii*sdd+ps*3] * pD[0+ii*sdd+ps*2];
+		v10 += pD[1+ii*sdd+ps*1] * pD[1+ii*sdd+ps*0];
+		v20 += pD[1+ii*sdd+ps*2] * pD[1+ii*sdd+ps*0];
+		v21 += pD[1+ii*sdd+ps*2] * pD[1+ii*sdd+ps*1];
+		v30 += pD[1+ii*sdd+ps*3] * pD[1+ii*sdd+ps*0];
+		v31 += pD[1+ii*sdd+ps*3] * pD[1+ii*sdd+ps*1];
+		v32 += pD[1+ii*sdd+ps*3] * pD[1+ii*sdd+ps*2];
+		v10 += pD[2+ii*sdd+ps*1] * pD[2+ii*sdd+ps*0];
+		v20 += pD[2+ii*sdd+ps*2] * pD[2+ii*sdd+ps*0];
+		v21 += pD[2+ii*sdd+ps*2] * pD[2+ii*sdd+ps*1];
+		v30 += pD[2+ii*sdd+ps*3] * pD[2+ii*sdd+ps*0];
+		v31 += pD[2+ii*sdd+ps*3] * pD[2+ii*sdd+ps*1];
+		v32 += pD[2+ii*sdd+ps*3] * pD[2+ii*sdd+ps*2];
+		v10 += pD[3+ii*sdd+ps*1] * pD[3+ii*sdd+ps*0];
+		v20 += pD[3+ii*sdd+ps*2] * pD[3+ii*sdd+ps*0];
+		v21 += pD[3+ii*sdd+ps*2] * pD[3+ii*sdd+ps*1];
+		v30 += pD[3+ii*sdd+ps*3] * pD[3+ii*sdd+ps*0];
+		v31 += pD[3+ii*sdd+ps*3] * pD[3+ii*sdd+ps*1];
+		v32 += pD[3+ii*sdd+ps*3] * pD[3+ii*sdd+ps*2];
+		}
+	for(ll=0; ll<m-ii; ll++)
+		{
+		v10 += pD[ll+ii*sdd+ps*1] * pD[ll+ii*sdd+ps*0];
+		v20 += pD[ll+ii*sdd+ps*2] * pD[ll+ii*sdd+ps*0];
+		v21 += pD[ll+ii*sdd+ps*2] * pD[ll+ii*sdd+ps*1];
+		v30 += pD[ll+ii*sdd+ps*3] * pD[ll+ii*sdd+ps*0];
+		v31 += pD[ll+ii*sdd+ps*3] * pD[ll+ii*sdd+ps*1];
+		v32 += pD[ll+ii*sdd+ps*3] * pD[ll+ii*sdd+ps*2];
+		}
+	// compute lower triangular T containing tau for matrix update
+	pT[0+ldt*0] = dD[0];
+	pT[1+ldt*1] = dD[1];
+	pT[2+ldt*2] = dD[2];
+	pT[3+ldt*3] = dD[3];
+	pT[1+ldt*0] = - dD[1] * (v10*pT[0+ldt*0]);
+	pT[2+ldt*1] = - dD[2] * (v21*pT[1+ldt*1]);
+	pT[3+ldt*2] = - dD[3] * (v32*pT[2+ldt*2]);
+	pT[2+ldt*0] = - dD[2] * (v20*pT[0+ldt*0] + v21*pT[1+ldt*0]);
+	pT[3+ldt*1] = - dD[3] * (v31*pT[1+ldt*1] + v32*pT[2+ldt*1]);
+	pT[3+ldt*0] = - dD[3] * (v30*pT[0+ldt*0] + v31*pT[1+ldt*0] + v32*pT[2+ldt*0]);
+	// downgrade matrix
+	__m256d
+		_w0, _w1, _w2, _w3, _d0, _t0, _tp, _c0, _c1, _c2, _c3, _a0, _b0, _tz;
+
+	ii = 0;
+#if 1
+	double alpha = 1.0;
+	double beta = 0.0;
+#if defined(TARGET_X64_INTEL_HASWELL)
+	for( ; ii<n-11; ii+=12)
+		{
+		kernel_dgemm_nn_4x12_lib4(m, &alpha, &pVt[0+ps*0], 0, &pC0[0+ps*ii], sdc, &beta, &pW0[0+ps*ii], &pW0[0+ps*ii]);
+		}
+#endif
+	for( ; ii<n-7; ii+=8)
+		{
+		kernel_dgemm_nn_4x8_lib4(m, &alpha, &pVt[0+ps*0], 0, &pC0[0+ps*ii], sdc, &beta, &pW0[0+ps*ii], &pW0[0+ps*ii]);
+		}
+	for( ; ii<n-3; ii+=4)
+		{
+		kernel_dgemm_nn_4x4_lib4(m, &alpha, &pVt[0+ps*0], 0, &pC0[0+ps*ii], sdc, &beta, &pW0[0+ps*ii], &pW0[0+ps*ii]);
+		}
+	if(ii<n)
+		{
+//		kernel_dgemm_nn_4x4_vs_lib4(m, &alpha, &pVt[0+ps*0], 0, &pC0[0+ps*ii], sdc, &beta, &pW0[0+ps*ii], &pW0[0+ps*ii], 4, n-ii);
+		kernel_dgemm_nn_4x4_gen_lib4(m, &alpha, &pVt[0+ps*0], 0, &pC0[0+ps*ii], sdc, &beta, 0, &pW0[0+ps*ii], 0, 0, &pW0[0+ps*ii], 0, 0, 4, 0, n-ii);
+		}
+#else
+	for( ; ii<n-3; ii+=4)
+		{
+		pW = pW0+ii*ps;
+		pC = pC0+ii*ps;
+		// compute W^T = C^T * V
+		_w0 = _mm256_setzero_pd();
+		_w1 = _mm256_setzero_pd();
+		_w2 = _mm256_setzero_pd();
+		_w3 = _mm256_setzero_pd();
+		for(jj=0; jj<m-3; jj+=4)
+			{
+			//
+			_d0 = _mm256_load_pd( &pVt[0+ps*(0+jj)] );
+			_t0 = _mm256_broadcast_sd( &pC[0+jj*sdc+ps*0] );
+			_tp = _mm256_mul_pd( _d0, _t0 );
+			_w0 = _mm256_add_pd( _w0, _tp );
+			_t0 = _mm256_broadcast_sd( &pC[0+jj*sdc+ps*1] );
+			_tp = _mm256_mul_pd( _d0, _t0 );
+			_w1 = _mm256_add_pd( _w1, _tp );
+			_t0 = _mm256_broadcast_sd( &pC[0+jj*sdc+ps*2] );
+			_tp = _mm256_mul_pd( _d0, _t0 );
+			_w2 = _mm256_add_pd( _w2, _tp );
+			_t0 = _mm256_broadcast_sd( &pC[0+jj*sdc+ps*3] );
+			_tp = _mm256_mul_pd( _d0, _t0 );
+			_w3 = _mm256_add_pd( _w3, _tp );
+			//
+			_d0 = _mm256_load_pd( &pVt[0+ps*(1+jj)] );
+			_t0 = _mm256_broadcast_sd( &pC[1+jj*sdc+ps*0] );
+			_tp = _mm256_mul_pd( _d0, _t0 );
+			_w0 = _mm256_add_pd( _w0, _tp );
+			_t0 = _mm256_broadcast_sd( &pC[1+jj*sdc+ps*1] );
+			_tp = _mm256_mul_pd( _d0, _t0 );
+			_w1 = _mm256_add_pd( _w1, _tp );
+			_t0 = _mm256_broadcast_sd( &pC[1+jj*sdc+ps*2] );
+			_tp = _mm256_mul_pd( _d0, _t0 );
+			_w2 = _mm256_add_pd( _w2, _tp );
+			_t0 = _mm256_broadcast_sd( &pC[1+jj*sdc+ps*3] );
+			_tp = _mm256_mul_pd( _d0, _t0 );
+			_w3 = _mm256_add_pd( _w3, _tp );
+			//
+			_d0 = _mm256_load_pd( &pVt[0+ps*(2+jj)] );
+			_t0 = _mm256_broadcast_sd( &pC[2+jj*sdc+ps*0] );
+			_tp = _mm256_mul_pd( _d0, _t0 );
+			_w0 = _mm256_add_pd( _w0, _tp );
+			_t0 = _mm256_broadcast_sd( &pC[2+jj*sdc+ps*1] );
+			_tp = _mm256_mul_pd( _d0, _t0 );
+			_w1 = _mm256_add_pd( _w1, _tp );
+			_t0 = _mm256_broadcast_sd( &pC[2+jj*sdc+ps*2] );
+			_tp = _mm256_mul_pd( _d0, _t0 );
+			_w2 = _mm256_add_pd( _w2, _tp );
+			_t0 = _mm256_broadcast_sd( &pC[2+jj*sdc+ps*3] );
+			_tp = _mm256_mul_pd( _d0, _t0 );
+			_w3 = _mm256_add_pd( _w3, _tp );
+			//
+			_d0 = _mm256_load_pd( &pVt[0+ps*(3+jj)] );
+			_t0 = _mm256_broadcast_sd( &pC[3+jj*sdc+ps*0] );
+			_tp = _mm256_mul_pd( _d0, _t0 );
+			_w0 = _mm256_add_pd( _w0, _tp );
+			_t0 = _mm256_broadcast_sd( &pC[3+jj*sdc+ps*1] );
+			_tp = _mm256_mul_pd( _d0, _t0 );
+			_w1 = _mm256_add_pd( _w1, _tp );
+			_t0 = _mm256_broadcast_sd( &pC[3+jj*sdc+ps*2] );
+			_tp = _mm256_mul_pd( _d0, _t0 );
+			_w2 = _mm256_add_pd( _w2, _tp );
+			_t0 = _mm256_broadcast_sd( &pC[3+jj*sdc+ps*3] );
+			_tp = _mm256_mul_pd( _d0, _t0 );
+			_w3 = _mm256_add_pd( _w3, _tp );
+			}
+		for(ll=0; ll<m-jj; ll++)
+			{
+			_d0 = _mm256_load_pd( &pVt[0+ps*(ll+jj)] );
+			_t0 = _mm256_broadcast_sd( &pC[ll+jj*sdc+ps*0] );
+			_tp = _mm256_mul_pd( _d0, _t0 );
+			_w0 = _mm256_add_pd( _w0, _tp );
+			_t0 = _mm256_broadcast_sd( &pC[ll+jj*sdc+ps*1] );
+			_tp = _mm256_mul_pd( _d0, _t0 );
+			_w1 = _mm256_add_pd( _w1, _tp );
+			_t0 = _mm256_broadcast_sd( &pC[ll+jj*sdc+ps*2] );
+			_tp = _mm256_mul_pd( _d0, _t0 );
+			_w2 = _mm256_add_pd( _w2, _tp );
+			_t0 = _mm256_broadcast_sd( &pC[ll+jj*sdc+ps*3] );
+			_tp = _mm256_mul_pd( _d0, _t0 );
+			_w3 = _mm256_add_pd( _w3, _tp );
+			}
+		// TODO mask store
+		_mm256_storeu_pd( &pW[0+ps*0], _w0 );
+		_mm256_storeu_pd( &pW[0+ps*1], _w1 );
+		_mm256_storeu_pd( &pW[0+ps*2], _w2 );
+		_mm256_storeu_pd( &pW[0+ps*3], _w3 );
+		}
+	for( ; ii<n; ii++)
+		{
+		pW = pW0+ii*ps;
+		pC = pC0+ii*ps;
+		// compute W^T = C^T * V
+		tmp = pC[0+ps*0];
+		pW[0+ps*0] = tmp;
+		if(m>1)
+			{
+			d0 = pVt[0+ps*1];
+			tmp = pC[1+ps*0];
+			pW[0+ps*0] += d0 * tmp;
+			pW[1+ps*0] = tmp;
+			if(m>2)
+				{
+				d0 = pVt[0+ps*2];
+				d1 = pVt[1+ps*2];
+				tmp = pC[2+ps*0];
+				pW[0+ps*0] += d0 * tmp;
+				pW[1+ps*0] += d1 * tmp;
+				pW[2+ps*0] = tmp;
+				if(m>3)
+					{
+					d0 = pVt[0+ps*3];
+					d1 = pVt[1+ps*3];
+					d2 = pVt[2+ps*3];
+					tmp = pC[3+ps*0];
+					pW[0+ps*0] += d0 * tmp;
+					pW[1+ps*0] += d1 * tmp;
+					pW[2+ps*0] += d2 * tmp;
+					pW[3+ps*0] = tmp;
+					}
+				}
+			}
+		for(jj=4; jj<m-3; jj+=4)
+			{
+			//
+			d0 = pVt[0+ps*(0+jj)];
+			d1 = pVt[1+ps*(0+jj)];
+			d2 = pVt[2+ps*(0+jj)];
+			d3 = pVt[3+ps*(0+jj)];
+			tmp = pC[0+jj*sdc+ps*0];
+			pW[0+ps*0] += d0 * tmp;
+			pW[1+ps*0] += d1 * tmp;
+			pW[2+ps*0] += d2 * tmp;
+			pW[3+ps*0] += d3 * tmp;
+			//
+			d0 = pVt[0+ps*(1+jj)];
+			d1 = pVt[1+ps*(1+jj)];
+			d2 = pVt[2+ps*(1+jj)];
+			d3 = pVt[3+ps*(1+jj)];
+			tmp = pC[1+jj*sdc+ps*0];
+			pW[0+ps*0] += d0 * tmp;
+			pW[1+ps*0] += d1 * tmp;
+			pW[2+ps*0] += d2 * tmp;
+			pW[3+ps*0] += d3 * tmp;
+			//
+			d0 = pVt[0+ps*(2+jj)];
+			d1 = pVt[1+ps*(2+jj)];
+			d2 = pVt[2+ps*(2+jj)];
+			d3 = pVt[3+ps*(2+jj)];
+			tmp = pC[2+jj*sdc+ps*0];
+			pW[0+ps*0] += d0 * tmp;
+			pW[1+ps*0] += d1 * tmp;
+			pW[2+ps*0] += d2 * tmp;
+			pW[3+ps*0] += d3 * tmp;
+			//
+			d0 = pVt[0+ps*(3+jj)];
+			d1 = pVt[1+ps*(3+jj)];
+			d2 = pVt[2+ps*(3+jj)];
+			d3 = pVt[3+ps*(3+jj)];
+			tmp = pC[3+jj*sdc+ps*0];
+			pW[0+ps*0] += d0 * tmp;
+			pW[1+ps*0] += d1 * tmp;
+			pW[2+ps*0] += d2 * tmp;
+			pW[3+ps*0] += d3 * tmp;
+			}
+		for(ll=0; ll<m-jj; ll++)
+			{
+			d0 = pVt[0+ps*(ll+jj)];
+			d1 = pVt[1+ps*(ll+jj)];
+			d2 = pVt[2+ps*(ll+jj)];
+			d3 = pVt[3+ps*(ll+jj)];
+			tmp = pC[ll+jj*sdc+ps*0];
+			pW[0+ps*0] += d0 * tmp;
+			pW[1+ps*0] += d1 * tmp;
+			pW[2+ps*0] += d2 * tmp;
+			pW[3+ps*0] += d3 * tmp;
+			}
+		}
+#endif
+
+	ii = 0;
+	for( ; ii<n-3; ii+=4)
+		{
+		pW = pW0+ii*ps;
+		pC = pC0+ii*ps;
+
+		// compute W^T *= T
+		_tz = _mm256_setzero_pd();
+
+		_t0 = _mm256_load_pd( &pT[0+ldt*0] );
+		_tp = _mm256_broadcast_sd( &pW[0+ps*0] );
+		_w0 = _mm256_mul_pd( _t0, _tp );
+		_tp = _mm256_broadcast_sd( &pW[0+ps*1] );
+		_w1 = _mm256_mul_pd( _t0, _tp );
+		_tp = _mm256_broadcast_sd( &pW[0+ps*2] );
+		_w2 = _mm256_mul_pd( _t0, _tp );
+		_tp = _mm256_broadcast_sd( &pW[0+ps*3] );
+		_w3 = _mm256_mul_pd( _t0, _tp );
+
+#if defined(TARGET_X64_INTEL_GASWELL)
+		_t0 = _mm256_load_pd( &pT[0+ldt*1] );
+		_t0 = _mm256_blend_pd( _t0, _tz, 0x1 );
+		_tp = _mm256_broadcast_sd( &pW[1+ps*0] );
+		_w0 = _mm256_fmadd_pd( _t0, _tp, _w0 );
+		_tp = _mm256_broadcast_sd( &pW[1+ps*1] );
+		_w1 = _mm256_fmadd_pd( _t0, _tp, _w1 );
+		_tp = _mm256_broadcast_sd( &pW[1+ps*2] );
+		_w2 = _mm256_fmadd_pd( _t0, _tp, _w2 );
+		_tp = _mm256_broadcast_sd( &pW[1+ps*3] );
+		_w3 = _mm256_fmadd_pd( _t0, _tp, _w3 );
+
+		_t0 = _mm256_load_pd( &pT[0+ldt*2] );
+		_t0 = _mm256_blend_pd( _t0, _tz, 0x3 );
+		_tp = _mm256_broadcast_sd( &pW[2+ps*0] );
+		_w0 = _mm256_fmadd_pd( _t0, _tp, _w0 );
+		_tp = _mm256_broadcast_sd( &pW[2+ps*1] );
+		_w1 = _mm256_fmadd_pd( _t0, _tp, _w1 );
+		_tp = _mm256_broadcast_sd( &pW[2+ps*2] );
+		_w2 = _mm256_fmadd_pd( _t0, _tp, _w2 );
+		_tp = _mm256_broadcast_sd( &pW[2+ps*3] );
+		_w3 = _mm256_fmadd_pd( _t0, _tp, _w3 );
+
+		_t0 = _mm256_load_pd( &pT[0+ldt*3] );
+		_t0 = _mm256_blend_pd( _t0, _tz, 0x7 );
+		_tp = _mm256_broadcast_sd( &pW[3+ps*0] );
+		_w0 = _mm256_fmadd_pd( _t0, _tp, _w0 );
+		_tp = _mm256_broadcast_sd( &pW[3+ps*1] );
+		_w1 = _mm256_fmadd_pd( _t0, _tp, _w1 );
+		_tp = _mm256_broadcast_sd( &pW[3+ps*2] );
+		_w2 = _mm256_fmadd_pd( _t0, _tp, _w2 );
+		_tp = _mm256_broadcast_sd( &pW[3+ps*3] );
+		_w3 = _mm256_fmadd_pd( _t0, _tp, _w3 );
+#else
+		_t0 = _mm256_load_pd( &pT[0+ldt*1] );
+		_t0 = _mm256_blend_pd( _t0, _tz, 0x1 );
+		_tp = _mm256_broadcast_sd( &pW[1+ps*0] );
+		_tp = _mm256_mul_pd( _t0, _tp );
+		_w0 = _mm256_add_pd( _w0, _tp );
+		_tp = _mm256_broadcast_sd( &pW[1+ps*1] );
+		_tp = _mm256_mul_pd( _t0, _tp );
+		_w1 = _mm256_add_pd( _w1, _tp );
+		_tp = _mm256_broadcast_sd( &pW[1+ps*2] );
+		_tp = _mm256_mul_pd( _t0, _tp );
+		_w2 = _mm256_add_pd( _w2, _tp );
+		_tp = _mm256_broadcast_sd( &pW[1+ps*3] );
+		_tp = _mm256_mul_pd( _t0, _tp );
+		_w3 = _mm256_add_pd( _w3, _tp );
+
+		_t0 = _mm256_load_pd( &pT[0+ldt*2] );
+		_t0 = _mm256_blend_pd( _t0, _tz, 0x3 );
+		_tp = _mm256_broadcast_sd( &pW[2+ps*0] );
+		_tp = _mm256_mul_pd( _t0, _tp );
+		_w0 = _mm256_add_pd( _w0, _tp );
+		_tp = _mm256_broadcast_sd( &pW[2+ps*1] );
+		_tp = _mm256_mul_pd( _t0, _tp );
+		_w1 = _mm256_add_pd( _w1, _tp );
+		_tp = _mm256_broadcast_sd( &pW[2+ps*2] );
+		_tp = _mm256_mul_pd( _t0, _tp );
+		_w2 = _mm256_add_pd( _w2, _tp );
+		_tp = _mm256_broadcast_sd( &pW[2+ps*3] );
+		_tp = _mm256_mul_pd( _t0, _tp );
+		_w3 = _mm256_add_pd( _w3, _tp );
+
+		_t0 = _mm256_load_pd( &pT[0+ldt*3] );
+		_t0 = _mm256_blend_pd( _t0, _tz, 0x7 );
+		_tp = _mm256_broadcast_sd( &pW[3+ps*0] );
+		_tp = _mm256_mul_pd( _t0, _tp );
+		_w0 = _mm256_add_pd( _w0, _tp );
+		_tp = _mm256_broadcast_sd( &pW[3+ps*1] );
+		_tp = _mm256_mul_pd( _t0, _tp );
+		_w1 = _mm256_add_pd( _w1, _tp );
+		_tp = _mm256_broadcast_sd( &pW[3+ps*2] );
+		_tp = _mm256_mul_pd( _t0, _tp );
+		_w2 = _mm256_add_pd( _w2, _tp );
+		_tp = _mm256_broadcast_sd( &pW[3+ps*3] );
+		_tp = _mm256_mul_pd( _t0, _tp );
+		_w3 = _mm256_add_pd( _w3, _tp );
+#endif
+
+		_mm256_store_pd( &pW[0+ps*0], _w0 );
+		_mm256_store_pd( &pW[0+ps*1], _w1 );
+		_mm256_store_pd( &pW[0+ps*2], _w2 );
+		_mm256_store_pd( &pW[0+ps*3], _w3 );
+		}
+	for( ; ii<n; ii++)
+		{
+		pW = pW0+ii*ps;
+		pC = pC0+ii*ps;
+
+		// compute W^T *= T
+		_tz = _mm256_setzero_pd();
+
+		_t0 = _mm256_load_pd( &pT[0+ldt*0] );
+		_tp = _mm256_broadcast_sd( &pW[0+ps*0] );
+		_w0 = _mm256_mul_pd( _t0, _tp );
+
+		_t0 = _mm256_load_pd( &pT[0+ldt*1] );
+		_t0 = _mm256_blend_pd( _t0, _tz, 0x1 );
+		_tp = _mm256_broadcast_sd( &pW[1+ps*0] );
+		_tp = _mm256_mul_pd( _t0, _tp );
+		_w0 = _mm256_add_pd( _w0, _tp );
+
+		_t0 = _mm256_load_pd( &pT[0+ldt*2] );
+		_t0 = _mm256_blend_pd( _t0, _tz, 0x3 );
+		_tp = _mm256_broadcast_sd( &pW[2+ps*0] );
+		_tp = _mm256_mul_pd( _t0, _tp );
+		_w0 = _mm256_add_pd( _w0, _tp );
+
+		_t0 = _mm256_load_pd( &pT[0+ldt*3] );
+		_t0 = _mm256_blend_pd( _t0, _tz, 0x7 );
+		_tp = _mm256_broadcast_sd( &pW[3+ps*0] );
+		_tp = _mm256_mul_pd( _t0, _tp );
+		_w0 = _mm256_add_pd( _w0, _tp );
+
+		_mm256_store_pd( &pW[0+ps*0], _w0 );
+		}
+
+	ii = 0;
+	for( ; ii<n-3; ii+=4)
+		{
+		pW = pW0+ii*ps;
+		pC = pC0+ii*ps;
+		// compute C -= V * W^T
+		jj = 0;
+		// load
+		c00 = pC[0+jj*sdc+ps*0];
+		c10 = pC[1+jj*sdc+ps*0];
+		c20 = pC[2+jj*sdc+ps*0];
+		c30 = pC[3+jj*sdc+ps*0];
+		c01 = pC[0+jj*sdc+ps*1];
+		c11 = pC[1+jj*sdc+ps*1];
+		c21 = pC[2+jj*sdc+ps*1];
+		c31 = pC[3+jj*sdc+ps*1];
+		// rank1
+		a1 = pD[1+jj*sdd+ps*0];
+		a2 = pD[2+jj*sdd+ps*0];
+		a3 = pD[3+jj*sdd+ps*0];
+		b0 = pW[0+ps*0];
+		c00 -= b0;
+		c10 -= a1*b0;
+		c20 -= a2*b0;
+		c30 -= a3*b0;
+		b1 = pW[0+ps*1];
+		c01 -= b1;
+		c11 -= a1*b1;
+		c21 -= a2*b1;
+		c31 -= a3*b1;
+		// rank2
+		a2 = pD[2+jj*sdd+ps*1];
+		a3 = pD[3+jj*sdd+ps*1];
+		b0 = pW[1+ps*0];
+		c10 -= b0;
+		c20 -= a2*b0;
+		c30 -= a3*b0;
+		b1 = pW[1+ps*1];
+		c11 -= b1;
+		c21 -= a2*b1;
+		c31 -= a3*b1;
+		// rank3
+		a3 = pD[3+jj*sdd+ps*2];
+		b0 = pW[2+ps*0];
+		c20 -= b0;
+		c30 -= a3*b0;
+		b1 = pW[2+ps*1];
+		c21 -= b1;
+		c31 -= a3*b1;
+		// rank4
+		a3 = pD[3+jj*sdd+ps*3];
+		b0 = pW[3+ps*0];
+		c30 -= b0;
+		b1 = pW[3+ps*1];
+		c31 -= b1;
+		// store
+		pC[0+jj*sdc+ps*0] = c00;
+		pC[0+jj*sdc+ps*1] = c01;
+		if(m>1)
+			{
+			pC[1+jj*sdc+ps*0] = c10;
+			pC[1+jj*sdc+ps*1] = c11;
+			if(m>2)
+				{
+				pC[2+jj*sdc+ps*0] = c20;
+				pC[2+jj*sdc+ps*1] = c21;
+				if(m>3)
+					{
+					pC[3+jj*sdc+ps*0] = c30;
+					pC[3+jj*sdc+ps*1] = c31;
+					}
+				}
+			}
+		// load
+		c00 = pC[0+jj*sdc+ps*2];
+		c10 = pC[1+jj*sdc+ps*2];
+		c20 = pC[2+jj*sdc+ps*2];
+		c30 = pC[3+jj*sdc+ps*2];
+		c01 = pC[0+jj*sdc+ps*3];
+		c11 = pC[1+jj*sdc+ps*3];
+		c21 = pC[2+jj*sdc+ps*3];
+		c31 = pC[3+jj*sdc+ps*3];
+		// rank1
+		a1 = pD[1+jj*sdd+ps*0];
+		a2 = pD[2+jj*sdd+ps*0];
+		a3 = pD[3+jj*sdd+ps*0];
+		b0 = pW[0+ps*2];
+		c00 -= b0;
+		c10 -= a1*b0;
+		c20 -= a2*b0;
+		c30 -= a3*b0;
+		b1 = pW[0+ps*3];
+		c01 -= b1;
+		c11 -= a1*b1;
+		c21 -= a2*b1;
+		c31 -= a3*b1;
+		// rank2
+		a2 = pD[2+jj*sdd+ps*1];
+		a3 = pD[3+jj*sdd+ps*1];
+		b0 = pW[1+ps*2];
+		c10 -= b0;
+		c20 -= a2*b0;
+		c30 -= a3*b0;
+		b1 = pW[1+ps*3];
+		c11 -= b1;
+		c21 -= a2*b1;
+		c31 -= a3*b1;
+		// rank3
+		a3 = pD[3+jj*sdd+ps*2];
+		b0 = pW[2+ps*2];
+		c20 -= b0;
+		c30 -= a3*b0;
+		b1 = pW[2+ps*3];
+		c21 -= b1;
+		c31 -= a3*b1;
+		// rank4
+		a3 = pD[3+jj*sdd+ps*3];
+		b0 = pW[3+ps*2];
+		c30 -= b0;
+		b1 = pW[3+ps*3];
+		c31 -= b1;
+		// store
+		pC[0+jj*sdc+ps*2] = c00;
+		pC[0+jj*sdc+ps*3] = c01;
+		if(m>1)
+			{
+			pC[1+jj*sdc+ps*2] = c10;
+			pC[1+jj*sdc+ps*3] = c11;
+			if(m>2)
+				{
+				pC[2+jj*sdc+ps*2] = c20;
+				pC[2+jj*sdc+ps*3] = c21;
+				if(m>3)
+					{
+					pC[3+jj*sdc+ps*2] = c30;
+					pC[3+jj*sdc+ps*3] = c31;
+					}
+				}
+			}
+		}
+	for( ; ii<n; ii++)
+		{
+		pW = pW0+ii*ps;
+		pC = pC0+ii*ps;
+		// compute C -= V * W^T
+		jj = 0;
+		// load
+		c00 = pC[0+jj*sdc+ps*0];
+		c10 = pC[1+jj*sdc+ps*0];
+		c20 = pC[2+jj*sdc+ps*0];
+		c30 = pC[3+jj*sdc+ps*0];
+		// rank1
+		a1 = pD[1+jj*sdd+ps*0];
+		a2 = pD[2+jj*sdd+ps*0];
+		a3 = pD[3+jj*sdd+ps*0];
+		b0 = pW[0+ps*0];
+		c00 -= b0;
+		c10 -= a1*b0;
+		c20 -= a2*b0;
+		c30 -= a3*b0;
+		// rank2
+		a2 = pD[2+jj*sdd+ps*1];
+		a3 = pD[3+jj*sdd+ps*1];
+		b0 = pW[1+ps*0];
+		c10 -= b0;
+		c20 -= a2*b0;
+		c30 -= a3*b0;
+		// rank3
+		a3 = pD[3+jj*sdd+ps*2];
+		b0 = pW[2+ps*0];
+		c20 -= b0;
+		c30 -= a3*b0;
+		// rank4
+		a3 = pD[3+jj*sdd+ps*3];
+		b0 = pW[3+ps*0];
+		c30 -= b0;
+		// store
+		pC[0+jj*sdc+ps*0] = c00;
+		if(m>1)
+			{
+			pC[1+jj*sdc+ps*0] = c10;
+			if(m>2)
+				{
+				pC[2+jj*sdc+ps*0] = c20;
+				if(m>3)
+					{
+					pC[3+jj*sdc+ps*0] = c30;
+					}
+				}
+			}
+		}
+
+#if 1
+	jj = 4;
+#if defined(TARGET_X64_INTEL_HASWELL)
+	for(; jj<m-11; jj+=12)
+		{
+		kernel_dger4_sub_12r_lib4(n, &pD[jj*sdd], sdd, &pW0[0], &pC0[jj*sdc], sdc);
+		}
+#endif
+	for(; jj<m-7; jj+=8)
+		{
+		kernel_dger4_sub_8r_lib4(n, &pD[jj*sdd], sdd, &pW0[0], &pC0[jj*sdc], sdc);
+		}
+	for(; jj<m-3; jj+=4)
+		{
+		kernel_dger4_sub_4r_lib4(n, &pD[jj*sdd], &pW0[0], &pC0[jj*sdc]);
+		}
+	if(jj<m)
+		{
+		kernel_dger4_sub_4r_vs_lib4(n, &pD[jj*sdd], &pW0[0], &pC0[jj*sdc], m-jj);
+		}
+#else
+	ii = 0;
+	for( ; ii<n-3; ii+=4)
+		{
+		pW = pW0+ii*ps;
+		pC = pC0+ii*ps;
+		for(jj=4; jj<m-3; jj+=4)
+			{
+			// load
+			_c0 = _mm256_load_pd( &pC[0+jj*sdc+ps*0] );
+			_c1 = _mm256_load_pd( &pC[0+jj*sdc+ps*1] );
+			_c2 = _mm256_load_pd( &pC[0+jj*sdc+ps*2] );
+			_c3 = _mm256_load_pd( &pC[0+jj*sdc+ps*3] );
+			//
+			_a0 = _mm256_load_pd( &pD[0+jj*sdd+ps*0] );
+			_b0 = _mm256_broadcast_sd( &pW[0+ps*0] );
+			_tp = _mm256_mul_pd( _a0, _b0 );
+			_c0 = _mm256_sub_pd( _c0, _tp );
+			_b0 = _mm256_broadcast_sd( &pW[0+ps*1] );
+			_tp = _mm256_mul_pd( _a0, _b0 );
+			_c1 = _mm256_sub_pd( _c1, _tp );
+			_b0 = _mm256_broadcast_sd( &pW[0+ps*2] );
+			_tp = _mm256_mul_pd( _a0, _b0 );
+			_c2 = _mm256_sub_pd( _c2, _tp );
+			_b0 = _mm256_broadcast_sd( &pW[0+ps*3] );
+			_tp = _mm256_mul_pd( _a0, _b0 );
+			_c3 = _mm256_sub_pd( _c3, _tp );
+			//
+			_a0 = _mm256_load_pd( &pD[0+jj*sdd+ps*1] );
+			_b0 = _mm256_broadcast_sd( &pW[1+ps*0] );
+			_tp = _mm256_mul_pd( _a0, _b0 );
+			_c0 = _mm256_sub_pd( _c0, _tp );
+			_b0 = _mm256_broadcast_sd( &pW[1+ps*1] );
+			_tp = _mm256_mul_pd( _a0, _b0 );
+			_c1 = _mm256_sub_pd( _c1, _tp );
+			_b0 = _mm256_broadcast_sd( &pW[1+ps*2] );
+			_tp = _mm256_mul_pd( _a0, _b0 );
+			_c2 = _mm256_sub_pd( _c2, _tp );
+			_b0 = _mm256_broadcast_sd( &pW[1+ps*3] );
+			_tp = _mm256_mul_pd( _a0, _b0 );
+			_c3 = _mm256_sub_pd( _c3, _tp );
+			//
+			_a0 = _mm256_load_pd( &pD[0+jj*sdd+ps*2] );
+			_b0 = _mm256_broadcast_sd( &pW[2+ps*0] );
+			_tp = _mm256_mul_pd( _a0, _b0 );
+			_c0 = _mm256_sub_pd( _c0, _tp );
+			_b0 = _mm256_broadcast_sd( &pW[2+ps*1] );
+			_tp = _mm256_mul_pd( _a0, _b0 );
+			_c1 = _mm256_sub_pd( _c1, _tp );
+			_b0 = _mm256_broadcast_sd( &pW[2+ps*2] );
+			_tp = _mm256_mul_pd( _a0, _b0 );
+			_c2 = _mm256_sub_pd( _c2, _tp );
+			_b0 = _mm256_broadcast_sd( &pW[2+ps*3] );
+			_tp = _mm256_mul_pd( _a0, _b0 );
+			_c3 = _mm256_sub_pd( _c3, _tp );
+			//
+			_a0 = _mm256_load_pd( &pD[0+jj*sdd+ps*3] );
+			_b0 = _mm256_broadcast_sd( &pW[3+ps*0] );
+			_tp = _mm256_mul_pd( _a0, _b0 );
+			_c0 = _mm256_sub_pd( _c0, _tp );
+			_b0 = _mm256_broadcast_sd( &pW[3+ps*1] );
+			_tp = _mm256_mul_pd( _a0, _b0 );
+			_c1 = _mm256_sub_pd( _c1, _tp );
+			_b0 = _mm256_broadcast_sd( &pW[3+ps*2] );
+			_tp = _mm256_mul_pd( _a0, _b0 );
+			_c2 = _mm256_sub_pd( _c2, _tp );
+			_b0 = _mm256_broadcast_sd( &pW[3+ps*3] );
+			_tp = _mm256_mul_pd( _a0, _b0 );
+			_c3 = _mm256_sub_pd( _c3, _tp );
+			// store
+			_mm256_store_pd( &pC[0+jj*sdc+ps*0], _c0 );
+			_mm256_store_pd( &pC[0+jj*sdc+ps*1], _c1 );
+			_mm256_store_pd( &pC[0+jj*sdc+ps*2], _c2 );
+			_mm256_store_pd( &pC[0+jj*sdc+ps*3], _c3 );
+			}
+		for(ll=0; ll<m-jj; ll++)
+			{
+			// load
+			c00 = pC[ll+jj*sdc+ps*0];
+			c01 = pC[ll+jj*sdc+ps*1];
+			//
+			a0 = pD[ll+jj*sdd+ps*0];
+			b0 = pW[0+ps*0];
+			c00 -= a0*b0;
+			b1 = pW[0+ps*1];
+			c01 -= a0*b1;
+			//
+			a0 = pD[ll+jj*sdd+ps*1];
+			b0 = pW[1+ps*0];
+			c00 -= a0*b0;
+			b1 = pW[1+ps*1];
+			c01 -= a0*b1;
+			//
+			a0 = pD[ll+jj*sdd+ps*2];
+			b0 = pW[2+ps*0];
+			c00 -= a0*b0;
+			b1 = pW[2+ps*1];
+			c01 -= a0*b1;
+			//
+			a0 = pD[ll+jj*sdd+ps*3];
+			b0 = pW[3+ps*0];
+			c00 -= a0*b0;
+			b1 = pW[3+ps*1];
+			c01 -= a0*b1;
+			// store
+			pC[ll+jj*sdc+ps*0] = c00;
+			pC[ll+jj*sdc+ps*1] = c01;
+			// load
+			c00 = pC[ll+jj*sdc+ps*2];
+			c01 = pC[ll+jj*sdc+ps*3];
+			//
+			a0 = pD[ll+jj*sdd+ps*0];
+			b0 = pW[0+ps*2];
+			c00 -= a0*b0;
+			b1 = pW[0+ps*3];
+			c01 -= a0*b1;
+			//
+			a0 = pD[ll+jj*sdd+ps*1];
+			b0 = pW[1+ps*2];
+			c00 -= a0*b0;
+			b1 = pW[1+ps*3];
+			c01 -= a0*b1;
+			//
+			a0 = pD[ll+jj*sdd+ps*2];
+			b0 = pW[2+ps*2];
+			c00 -= a0*b0;
+			b1 = pW[2+ps*3];
+			c01 -= a0*b1;
+			//
+			a0 = pD[ll+jj*sdd+ps*3];
+			b0 = pW[3+ps*2];
+			c00 -= a0*b0;
+			b1 = pW[3+ps*3];
+			c01 -= a0*b1;
+			// store
+			pC[ll+jj*sdc+ps*2] = c00;
+			pC[ll+jj*sdc+ps*3] = c01;
+			}
+		}
+	for( ; ii<n; ii++)
+		{
+		pW = pW0+ii*ps;
+		pC = pC0+ii*ps;
+		for(jj=4; jj<m-3; jj+=4)
+			{
+			// load
+			c00 = pC[0+jj*sdc+ps*0];
+			c10 = pC[1+jj*sdc+ps*0];
+			c20 = pC[2+jj*sdc+ps*0];
+			c30 = pC[3+jj*sdc+ps*0];
+			//
+			a0 = pD[0+jj*sdd+ps*0];
+			a1 = pD[1+jj*sdd+ps*0];
+			a2 = pD[2+jj*sdd+ps*0];
+			a3 = pD[3+jj*sdd+ps*0];
+			b0 = pW[0+ps*0];
+			c00 -= a0*b0;
+			c10 -= a1*b0;
+			c20 -= a2*b0;
+			c30 -= a3*b0;
+			//
+			a0 = pD[0+jj*sdd+ps*1];
+			a1 = pD[1+jj*sdd+ps*1];
+			a2 = pD[2+jj*sdd+ps*1];
+			a3 = pD[3+jj*sdd+ps*1];
+			b0 = pW[1+ps*0];
+			c00 -= a0*b0;
+			c10 -= a1*b0;
+			c20 -= a2*b0;
+			c30 -= a3*b0;
+			//
+			a0 = pD[0+jj*sdd+ps*2];
+			a1 = pD[1+jj*sdd+ps*2];
+			a2 = pD[2+jj*sdd+ps*2];
+			a3 = pD[3+jj*sdd+ps*2];
+			b0 = pW[2+ps*0];
+			c00 -= a0*b0;
+			c10 -= a1*b0;
+			c20 -= a2*b0;
+			c30 -= a3*b0;
+			//
+			a0 = pD[0+jj*sdd+ps*3];
+			a1 = pD[1+jj*sdd+ps*3];
+			a2 = pD[2+jj*sdd+ps*3];
+			a3 = pD[3+jj*sdd+ps*3];
+			b0 = pW[3+ps*0];
+			c00 -= a0*b0;
+			c10 -= a1*b0;
+			c20 -= a2*b0;
+			c30 -= a3*b0;
+			// store
+			pC[0+jj*sdc+ps*0] = c00;
+			pC[1+jj*sdc+ps*0] = c10;
+			pC[2+jj*sdc+ps*0] = c20;
+			pC[3+jj*sdc+ps*0] = c30;
+			}
+		for(ll=0; ll<m-jj; ll++)
+			{
+			// load
+			c00 = pC[ll+jj*sdc+ps*0];
+			//
+			a0 = pD[ll+jj*sdd+ps*0];
+			b0 = pW[0+ps*0];
+			c00 -= a0*b0;
+			//
+			a0 = pD[ll+jj*sdd+ps*1];
+			b0 = pW[1+ps*0];
+			c00 -= a0*b0;
+			//
+			a0 = pD[ll+jj*sdd+ps*2];
+			b0 = pW[2+ps*0];
+			c00 -= a0*b0;
+			//
+			a0 = pD[ll+jj*sdd+ps*3];
+			b0 = pW[3+ps*0];
+			c00 -= a0*b0;
+			// store
+			pC[ll+jj*sdc+ps*0] = c00;
+			}
+		}
+#endif
+
+	return;
+	}
+
+
+
+// assume n>=4
+void kernel_dgelqf_4_lib4(int n, double *pD, double *dD)
+	{
+	int ii, jj, ll;
+	double alpha, beta, tmp, w1, w2, w3;
+	const int ps = 4;
+	// first column
+	beta = 0.0;
+	for(ii=1; ii<n; ii++)
+		{
+		tmp = pD[0+ps*ii];
+		beta += tmp*tmp;
+		}
+	if(beta==0.0)
+		{
+		dD[0] = 0.0;
+		tmp = 0.0;
+		goto col2;
+		}
+	alpha = pD[0+ps*0];
+	beta += alpha*alpha;
+	beta = sqrt(beta);
+	if(alpha>0)
+		beta = -beta;
+	dD[0] = (beta-alpha) / beta;
+	tmp = 1.0 / (alpha-beta);
+	//
+	pD[0+ps*0] = beta;
+	w1 = pD[1+ps*0];
+	w2 = pD[2+ps*0];
+	w3 = pD[3+ps*0];
+	//
+	pD[0+ps*1] *= tmp;
+	w1 += pD[1+ps*1] * pD[0+ps*1];
+	w2 += pD[2+ps*1] * pD[0+ps*1];
+	w3 += pD[3+ps*1] * pD[0+ps*1];
+	//
+	pD[0+ps*2] *= tmp;
+	w1 += pD[1+ps*2] * pD[0+ps*2];
+	w2 += pD[2+ps*2] * pD[0+ps*2];
+	w3 += pD[3+ps*2] * pD[0+ps*2];
+	//
+	pD[0+ps*3] *= tmp;
+	w1 += pD[1+ps*3] * pD[0+ps*3];
+	w2 += pD[2+ps*3] * pD[0+ps*3];
+	w3 += pD[3+ps*3] * pD[0+ps*3];
+	//
+	for(ii=4; ii<n; ii++)
+		{
+		pD[0+ps*ii] *= tmp;
+		w1 += pD[1+ps*ii] * pD[0+ps*ii];
+		w2 += pD[2+ps*ii] * pD[0+ps*ii];
+		w3 += pD[3+ps*ii] * pD[0+ps*ii];
+		}
+	//
+	w1 = - dD[0] * w1;
+	w2 = - dD[0] * w2;
+	w3 = - dD[0] * w3;
+	//
+	pD[1+ps*0] += w1;
+	pD[2+ps*0] += w2;
+	pD[3+ps*0] += w3;
+	//
+	pD[1+ps*1] += w1 * pD[0+ps*1];
+	pD[2+ps*1] += w2 * pD[0+ps*1];
+	pD[3+ps*1] += w3 * pD[0+ps*1];
+	//
+	pD[1+ps*2] += w1 * pD[0+ps*2];
+	pD[2+ps*2] += w2 * pD[0+ps*2];
+	pD[3+ps*2] += w3 * pD[0+ps*2];
+	beta = pD[1+ps*2] * pD[1+ps*2];
+	//
+	pD[1+ps*3] += w1 * pD[0+ps*3];
+	pD[2+ps*3] += w2 * pD[0+ps*3];
+	pD[3+ps*3] += w3 * pD[0+ps*3];
+	beta += pD[1+ps*3] * pD[1+ps*3];
+	//
+	for(ii=4; ii<n; ii++)
+		{
+		pD[1+ps*ii] += w1 * pD[0+ps*ii];
+		pD[2+ps*ii] += w2 * pD[0+ps*ii];
+		pD[3+ps*ii] += w3 * pD[0+ps*ii];
+		beta += pD[1+ps*ii] * pD[1+ps*ii];
+		}
+	// second column
+col2:
+	if(beta==0.0)
+		{
+		dD[1] = 0.0;
+		tmp = 0.0;
+		goto col3;
+		}
+	alpha = pD[1+ps*1];
+	beta += alpha*alpha;
+	beta = sqrt(beta);
+	if(alpha>0)
+		beta = -beta;
+	dD[1] = (beta-alpha) / beta;
+	tmp = 1.0 / (alpha-beta);
+	//
+	pD[1+ps*1] = beta;
+	w2 = pD[2+ps*1];
+	w3 = pD[3+ps*1];
+	//
+	pD[1+ps*2] *= tmp;
+	w2 += pD[2+ps*2] * pD[1+ps*2];
+	w3 += pD[3+ps*2] * pD[1+ps*2];
+	//
+	pD[1+ps*3] *= tmp;
+	w2 += pD[2+ps*3] * pD[1+ps*3];
+	w3 += pD[3+ps*3] * pD[1+ps*3];
+	//
+	for(ii=4; ii<n; ii++)
+		{
+		pD[1+ps*ii] *= tmp;
+		w2 += pD[2+ps*ii] * pD[1+ps*ii];
+		w3 += pD[3+ps*ii] * pD[1+ps*ii];
+		}
+	//
+	w2 = - dD[1] * w2;
+	w3 = - dD[1] * w3;
+	//
+	pD[2+ps*1] += w2;
+	pD[3+ps*1] += w3;
+	//
+	pD[2+ps*2] += w2 * pD[1+ps*2];
+	pD[3+ps*2] += w3 * pD[1+ps*2];
+	//
+	pD[2+ps*3] += w2 * pD[1+ps*3];
+	pD[3+ps*3] += w3 * pD[1+ps*3];
+	beta = pD[2+ps*3] * pD[2+ps*3];
+	//
+	for(ii=4; ii<n; ii++)
+		{
+		pD[2+ps*ii] += w2 * pD[1+ps*ii];
+		pD[3+ps*ii] += w3 * pD[1+ps*ii];
+		beta += pD[2+ps*ii] * pD[2+ps*ii];
+		}
+	// third column
+col3:
+	if(beta==0.0)
+		{
+		dD[2] = 0.0;
+		tmp = 0.0;
+		goto col4;
+		}
+	alpha = pD[2+ps*2];
+	beta += alpha*alpha;
+	beta = sqrt(beta);
+	if(alpha>0)
+		beta = -beta;
+	dD[2] = (beta-alpha) / beta;
+	tmp = 1.0 / (alpha-beta);
+	//
+	pD[2+ps*2] = beta;
+	w3 = pD[3+ps*2];
+	//
+	pD[2+ps*3] *= tmp;
+	w3 += pD[3+ps*3] * pD[2+ps*3];
+	//
+	for(ii=4; ii<n; ii++)
+		{
+		pD[2+ps*ii] *= tmp;
+		w3 += pD[3+ps*ii] * pD[2+ps*ii];
+		}
+	//
+	w3 = - dD[2] * w3;
+	//
+	pD[3+ps*2] += w3;
+	//
+	pD[3+ps*3] += w3 * pD[2+ps*3];
+	//
+	beta = 0.0;
+	for(ii=4; ii<n; ii++)
+		{
+		pD[3+ps*ii] += w3 * pD[2+ps*ii];
+		beta += pD[3+ps*ii] * pD[3+ps*ii];
+		}
+	// fourth column
+col4:
+	if(beta==0.0)
+		{
+		dD[3] = 0.0;
+		tmp = 0.0;
+		return;
+		}
+	alpha = pD[3+ps*3];
+	beta += alpha*alpha;
+	beta = sqrt(beta);
+	if(alpha>0)
+		beta = -beta;
+	dD[3] = (beta-alpha) / beta;
+	tmp = 1.0 / (alpha-beta);
+	//
+	pD[3+ps*3] = beta;
+	for(ii=4; ii<n; ii++)
+		{
+		pD[3+ps*ii] *= tmp;
+		}
+	return;
+	}
+
+
+
// unblocked algorithm
// LQ factorization of rows 0..k-1 of an m x n submatrix stored in panel-major
// format: panels are ps=4 rows tall, consecutive panels are sdd*ps doubles
// apart, and offD is the row offset of the first row inside its panel.
// dD receives the k tau scaling factors. Rows are processed two at a time
// (rank-2 trailing update via a 2x2 T factor) with a rank-1 fallback for the
// last odd row. NOTE(review): assumes the AVX intrinsics header is included
// at file scope; pD must be 32-byte aligned at panel boundaries.
void kernel_dgelqf_vs_lib4(int m, int n, int k, int offD, double *pD, int sdd, double *dD)
	{
	// bitwise | on the comparison results is intentional branchless style
	if(m<=0 | n<=0)
		return;
	int ii, jj, kk, ll, imax, jmax, jmax0, kmax, kmax0;
	const int ps = 4;
	imax = k;//m<n ? m : n;
	double alpha, beta, tmp;
	// scalar accumulators; only w00/w01 are used (the rest are spare scratch)
	double w00, w01,
		   w10, w11,
		   w20, w21,
		   w30, w31;
	__m256d
		_a0, _b0, _t0, _w0, _w1;
	double *pC00, *pC10, *pC10a, *pC20, *pC20a, *pC01, *pC11;
	double pT[4]; // 2x2 T factor for the rank-2 update
	int ldt = 2;
	double *pD0 = pD-offD; // panel-aligned base pointer
	ii = 0;
#if 1 // rank 2
	for(; ii<imax-1; ii+=2)
		{
		// first row: build the Householder reflector of row ii
		// address of diagonal element (ii,ii), folding the in-panel row offset
		pC00 = &pD0[((offD+ii)&(ps-1))+((offD+ii)-((offD+ii)&(ps-1)))*sdd+ii*ps];
		beta = 0.0;
		for(jj=1; jj<n-ii; jj++)
			{
			tmp = pC00[0+ps*jj];
			beta += tmp*tmp;
			}
		if(beta==0.0)
			{
			// degenerate row: identity reflector
			dD[ii] = 0.0;
			}
		else
			{
			alpha = pC00[0];
			beta += alpha*alpha;
			beta = sqrt(beta);
			if(alpha>0)
				beta = -beta; // sign opposite to alpha for stability
			dD[ii] = (beta-alpha) / beta;
			tmp = 1.0 / (alpha-beta);
			pC00[0] = beta;
			for(jj=1; jj<n-ii; jj++)
				pC00[0+ps*jj] *= tmp; // scale row into Householder vector
			}
		// apply the first reflector to row ii+1 before factorizing it
		pC10 = &pD0[((offD+ii+1)&(ps-1))+((offD+ii+1)-((offD+ii+1)&(ps-1)))*sdd+ii*ps];
		kmax = n-ii;
		w00 = pC10[0+ps*0]; // pC00[0+ps*0] = 1.0
		for(kk=1; kk<kmax; kk++)
			{
			w00 += pC10[0+ps*kk] * pC00[0+ps*kk];
			}
		w00 = - w00*dD[ii];
		pC10[0+ps*0] += w00; // pC00[0+ps*0] = 1.0
		for(kk=1; kk<kmax; kk++)
			{
			pC10[0+ps*kk] += w00 * pC00[0+ps*kk];
			}
		// second row: build the Householder reflector of row ii+1
		pC11 = pC10+ps*1;
		beta = 0.0;
		for(jj=1; jj<n-(ii+1); jj++)
			{
			tmp = pC11[0+ps*jj];
			beta += tmp*tmp;
			}
		if(beta==0.0)
			{
			dD[(ii+1)] = 0.0;
			}
		else
			{
			alpha = pC11[0+ps*0];
			beta += alpha*alpha;
			beta = sqrt(beta);
			if(alpha>0)
				beta = -beta;
			dD[(ii+1)] = (beta-alpha) / beta;
			tmp = 1.0 / (alpha-beta);
			pC11[0+ps*0] = beta;
			for(jj=1; jj<n-(ii+1); jj++)
				pC11[0+ps*jj] *= tmp;
			}
		// compute T (2x2 upper triangular factor of the two reflectors)
		kmax = n-ii;
		tmp = 1.0*0.0 + pC00[0+ps*1]*1.0; // v0.v1 seed (unit diagonals implicit)
		for(kk=2; kk<kmax; kk++)
			tmp += pC00[0+ps*kk]*pC10[0+ps*kk];
		pT[0+ldt*0] = - dD[ii+0];
		pT[0+ldt*1] = + dD[ii+1] * tmp * dD[ii+0];
		pT[1+ldt*1] = - dD[ii+1];
		// downgrade: rank-2 update of rows ii+2..m-1
		kmax = n-ii;
		jmax = m-ii-2;
		// jmax0 = rows needed to reach the next panel boundary (head loop)
		jmax0 = (ps-((ii+2+offD)&(ps-1)))&(ps-1);
		jmax0 = jmax<jmax0 ? jmax : jmax0;
		jj = 0;
		pC20a = &pD0[((offD+ii+2)&(ps-1))+((offD+ii+2)-((offD+ii+2)&(ps-1)))*sdd+ii*ps];
		pC20 = pC20a;
		if(jmax0>0)
			{
			// scalar head loop up to the panel boundary
			for( ; jj<jmax0; jj++)
				{
				w00 = pC20[0+ps*0]*1.0 + pC20[0+ps*1]*pC00[0+ps*1];
				w01 = pC20[0+ps*0]*0.0 + pC20[0+ps*1]*1.0;
				for(kk=2; kk<kmax; kk++)
					{
					w00 += pC20[0+ps*kk]*pC00[0+ps*kk];
					w01 += pC20[0+ps*kk]*pC10[0+ps*kk];
					}
				w01 = w00*pT[0+ldt*1] + w01*pT[1+ldt*1];
				w00 = w00*pT[0+ldt*0];
				pC20[0+ps*0] += w00*1.0          + w01*0.0;
				pC20[0+ps*1] += w00*pC00[0+ps*1] + w01*1.0;
				for(kk=2; kk<kmax; kk++)
					{
					pC20[0+ps*kk] += w00*pC00[0+ps*kk] + w01*pC10[0+ps*kk];
					}
				pC20 += 1;
				}
			pC20 += -ps+ps*sdd; // jump to the start of the next panel
			}
		// vectorized body: 4 rows (one full panel) per iteration
		for( ; jj<jmax-3; jj+=4)
			{
			// w0/w1 = (rows) * v0 / v1, with implicit unit diagonals
			_w0 = _mm256_load_pd( &pC20[0+ps*0] );
			_a0 = _mm256_load_pd( &pC20[0+ps*1] );
			_b0 = _mm256_broadcast_sd( &pC00[0+ps*1] );
			_t0 = _mm256_mul_pd( _a0, _b0 );
			_w0 = _mm256_add_pd( _w0, _t0 );
			_w1 = _mm256_load_pd( &pC20[0+ps*1] );
			for(kk=2; kk<kmax; kk++)
				{
				_a0 = _mm256_load_pd( &pC20[0+ps*kk] );
				_b0 = _mm256_broadcast_sd( &pC00[0+ps*kk] );
				_t0 = _mm256_mul_pd( _a0, _b0 );
				_w0 = _mm256_add_pd( _w0, _t0 );
				_b0 = _mm256_broadcast_sd( &pC10[0+ps*kk] );
				_t0 = _mm256_mul_pd( _a0, _b0 );
				_w1 = _mm256_add_pd( _w1, _t0 );
				}
			// (w0,w1) *= T
			_b0 = _mm256_broadcast_sd( &pT[1+ldt*1] );
			_w1 = _mm256_mul_pd( _w1, _b0 );
			_b0 = _mm256_broadcast_sd( &pT[0+ldt*1] );
			_t0 = _mm256_mul_pd( _w0, _b0 );
			_w1 = _mm256_add_pd( _w1, _t0 );
			_b0 = _mm256_broadcast_sd( &pT[0+ldt*0] );
			_w0 = _mm256_mul_pd( _w0, _b0 );
			// rows += w0*v0' + w1*v1'
			_a0 = _mm256_load_pd( &pC20[0+ps*0] );
			_a0 = _mm256_add_pd( _a0, _w0 );
			_mm256_store_pd( &pC20[0+ps*0], _a0 );
			_a0 = _mm256_load_pd( &pC20[0+ps*1] );
			_b0 = _mm256_broadcast_sd( &pC00[0+ps*1] );
			_t0 = _mm256_mul_pd( _w0, _b0 );
			_a0 = _mm256_add_pd( _a0, _t0 );
			_a0 = _mm256_add_pd( _a0, _w1 );
			_mm256_store_pd( &pC20[0+ps*1], _a0 );
			for(kk=2; kk<kmax; kk++)
				{
				_a0 = _mm256_load_pd( &pC20[0+ps*kk] );
				_b0 = _mm256_broadcast_sd( &pC00[0+ps*kk] );
				_t0 = _mm256_mul_pd( _w0, _b0 );
				_a0 = _mm256_add_pd( _a0, _t0 );
				_b0 = _mm256_broadcast_sd( &pC10[0+ps*kk] );
				_t0 = _mm256_mul_pd( _w1, _b0 );
				_a0 = _mm256_add_pd( _a0, _t0 );
				_mm256_store_pd( &pC20[0+ps*kk], _a0 );
				}
			pC20 += ps*sdd;
			}
		// scalar tail loop for the remaining <4 rows
		for(ll=0; ll<jmax-jj; ll++)
			{
			w00 = pC20[0+ps*0]*1.0 + pC20[0+ps*1]*pC00[0+ps*1];
			w01 = pC20[0+ps*0]*0.0 + pC20[0+ps*1]*1.0;
			for(kk=2; kk<kmax; kk++)
				{
				w00 += pC20[0+ps*kk]*pC00[0+ps*kk];
				w01 += pC20[0+ps*kk]*pC10[0+ps*kk];
				}
			w01 = w00*pT[0+ldt*1] + w01*pT[1+ldt*1];
			w00 = w00*pT[0+ldt*0];
			pC20[0+ps*0] += w00*1.0          + w01*0.0;
			pC20[0+ps*1] += w00*pC00[0+ps*1] + w01*1.0;
			for(kk=2; kk<kmax; kk++)
				{
				pC20[0+ps*kk] += w00*pC00[0+ps*kk] + w01*pC10[0+ps*kk];
				}
			pC20 += 1;
			}
		}
#endif
	// rank-1 fallback: remaining rows one at a time
	for(; ii<imax; ii++)
		{
		pC00 = &pD0[((offD+ii)&(ps-1))+((offD+ii)-((offD+ii)&(ps-1)))*sdd+ii*ps];
		beta = 0.0;
		for(jj=1; jj<n-ii; jj++)
			{
			tmp = pC00[0+ps*jj];
			beta += tmp*tmp;
			}
		if(beta==0.0)
			{
			dD[ii] = 0.0;
			}
		else
			{
			alpha = pC00[0];
			beta += alpha*alpha;
			beta = sqrt(beta);
			if(alpha>0)
				beta = -beta;
			dD[ii] = (beta-alpha) / beta;
			tmp = 1.0 / (alpha-beta);
			pC00[0] = beta;
			for(jj=1; jj<n-ii; jj++)
				pC00[0+ps*jj] *= tmp;
			}
		if(ii<n)
			{
			// compute T (1x1: just -tau)
			pT[0+ldt*0] = - dD[ii+0];
			// downgrade: rank-1 update of rows ii+1..m-1
			kmax = n-ii;
			jmax = m-ii-1;
			jmax0 = (ps-((ii+1+offD)&(ps-1)))&(ps-1);
			jmax0 = jmax<jmax0 ? jmax : jmax0;
			jj = 0;
			pC10a = &pD0[((offD+ii+1)&(ps-1))+((offD+ii+1)-((offD+ii+1)&(ps-1)))*sdd+ii*ps];
			pC10 = pC10a;
			if(jmax0>0)
				{
				// scalar head loop up to the panel boundary
				for( ; jj<jmax0; jj++)
					{
					w00 = pC10[0+ps*0];
					for(kk=1; kk<kmax; kk++)
						{
						w00 += pC10[0+ps*kk] * pC00[0+ps*kk];
						}
					w00 = w00*pT[0+ldt*0];
					pC10[0+ps*0] += w00;
					for(kk=1; kk<kmax; kk++)
						{
						pC10[0+ps*kk] += w00 * pC00[0+ps*kk];
						}
					pC10 += 1;
					}
				pC10 += -ps+ps*sdd;
				}
			// vectorized body: 4 rows per iteration
			for( ; jj<jmax-3; jj+=4)
				{
				//
				_w0 = _mm256_load_pd( &pC10[0+ps*0] );
				for(kk=1; kk<kmax; kk++)
					{
					_a0 = _mm256_load_pd( &pC10[0+ps*kk] );
					_b0 = _mm256_broadcast_sd( &pC00[0+ps*kk] );
					_t0 = _mm256_mul_pd( _a0, _b0 );
					_w0 = _mm256_add_pd( _w0, _t0 );
					}
				//
				_b0 = _mm256_broadcast_sd( &pT[0+ldt*0] );
				_w0 = _mm256_mul_pd( _w0, _b0 );
				//
				_a0 = _mm256_load_pd( &pC10[0+ps*0] );
				_a0 = _mm256_add_pd( _a0, _w0 );
				_mm256_store_pd( &pC10[0+ps*0], _a0 );
				for(kk=1; kk<kmax; kk++)
					{
					_a0 = _mm256_load_pd( &pC10[0+ps*kk] );
					_b0 = _mm256_broadcast_sd( &pC00[0+ps*kk] );
					_t0 = _mm256_mul_pd( _w0, _b0 );
					_a0 = _mm256_add_pd( _a0, _t0 );
					_mm256_store_pd( &pC10[0+ps*kk], _a0 );
					}
				pC10 += ps*sdd;
				}
			// scalar tail loop
			for(ll=0; ll<jmax-jj; ll++)
				{
				w00 = pC10[0+ps*0];
				for(kk=1; kk<kmax; kk++)
					{
					w00 += pC10[0+ps*kk] * pC00[0+ps*kk];
					}
				w00 = w00*pT[0+ldt*0];
				pC10[0+ps*0] += w00;
				for(kk=1; kk<kmax; kk++)
					{
					pC10[0+ps*kk] += w00 * pC00[0+ps*kk];
					}
				pC10 += 1;
				}
			}
		}
	return;
	}
+
+
+
// Build the 4x4 upper triangular factor T of the compact WY representation
// for the 4 Householder reflectors stored row-wise in pD (assume kmax>=4).
// Element (i,j) of pD lives at pD[i+ps*j]; dD holds the four tau factors;
// the unit diagonal of each reflector is implicit.
void kernel_dlarft_4_lib4(int kmax, double *pD, double *dD, double *pT)
	{
	const int ps = 4;
	// pairwise dot products v_i . v_j, seeded from the leading 4x4 triangle
	// (terms with the implicit 1.0 diagonals folded in directly)
	double v10 = pD[0+ps*1] + pD[1+ps*2]*pD[0+ps*2] + pD[1+ps*3]*pD[0+ps*3];
	double v20 = pD[0+ps*2] + pD[2+ps*3]*pD[0+ps*3];
	double v21 = pD[1+ps*2] + pD[2+ps*3]*pD[1+ps*3];
	double v30 = pD[0+ps*3];
	double v31 = pD[1+ps*3];
	double v32 = pD[2+ps*3];
	// accumulate the trailing columns
	int kk;
	for(kk=4; kk<kmax; kk++)
		{
		double d0 = pD[0+ps*kk];
		double d1 = pD[1+ps*kk];
		double d2 = pD[2+ps*kk];
		double d3 = pD[3+ps*kk];
		v10 += d1*d0;
		v20 += d2*d0;
		v30 += d3*d0;
		v21 += d2*d1;
		v31 += d3*d1;
		v32 += d3*d2;
		}
	// fill T column by column; each entry only depends on entries of
	// previously completed columns
	pT[0+ps*0] = - dD[0];
	pT[1+ps*1] = - dD[1];
	pT[0+ps*1] = - dD[1] * (v10*pT[0+ps*0]);
	pT[2+ps*2] = - dD[2];
	pT[1+ps*2] = - dD[2] * (v21*pT[1+ps*1]);
	pT[0+ps*2] = - dD[2] * (v20*pT[0+ps*0] + v21*pT[0+ps*1]);
	pT[3+ps*3] = - dD[3];
	pT[2+ps*3] = - dD[3] * (v32*pT[2+ps*2]);
	pT[1+ps*3] = - dD[3] * (v31*pT[1+ps*1] + v32*pT[1+ps*2]);
	pT[0+ps*3] = - dD[3] * (v30*pT[0+ps*0] + v31*pT[0+ps*1] + v32*pT[0+ps*2]);
	return;
	}
+
+
+
+// assume n>=4
#if ! defined(TARGET_X64_INTEL_HASWELL)
// Fused unblocked LQ factorization (dgelqf) of a 4 x n panel with on-the-fly
// computation of the 4x4 triangular factor T (dlarft) of the block reflector.
//
// pD : 4 x n panel, element (i,j) at pD[i+ps*j]; on exit row i holds the
//      scaled Householder vector v_i in columns i+1..n-1 (unit pivot
//      implicit) and the L factor in the lower-left triangle.
// dD : on exit, the 4 Householder scalars tau.
// pT : on exit, 4x4 upper triangular T with T(i,j) at pT[i+ps*j]; the
//      strictly lower part is zeroed at entry.
//
// NOTE(review): jj and ll are declared but never used in this function -
// presumably kept for uniformity with the other kernels in this file.
void kernel_dgelqf_dlarft4_4_lib4(int n, double *pD, double *dD, double *pT)
	{
	int ii, jj, ll;
	double alpha, beta, tmp, w0, w1, w2, w3;
	const int ps = 4;
	// zero tau matrix
	for(ii=0; ii<16; ii++)
		pT[ii] = 0.0;
	// first column
	// beta = squared norm of the tail of row 0 (columns 1..n-1)
	beta = 0.0;
	for(ii=1; ii<n; ii++)
		{
		tmp = pD[0+ps*ii];
		beta += tmp*tmp;
		}
	if(beta==0.0)
		{
		// tail already zero: tau = 0, no reflector for this row.
		// NOTE(review): beta stays 0.0 here, so the checks at col2/col3/col4
		// also take the early-exit path and the remaining rows are left
		// unfactorized - presumably only reachable for fully zero tails;
		// confirm against LAPACK dgelqf semantics.
		dD[0] = 0.0;
		tmp = 0.0;
		goto col2;
		}
	alpha = pD[0+ps*0];
	beta += alpha*alpha;
	beta = sqrt(beta);
	if(alpha>0)
		beta = -beta; // choose sign opposite to alpha to avoid cancellation
	dD[0] = (beta-alpha) / beta;
	pT[0+ps*0] = - dD[0];
	tmp = 1.0 / (alpha-beta); // scale making the pivot entry of v_0 an implicit 1
	//
	pD[0+ps*0] = beta;
	// w1..w3 accumulate (row k) . v_0 for the trailing rows k=1..3
	w1 = pD[1+ps*0];
	w2 = pD[2+ps*0];
	w3 = pD[3+ps*0];
	//
	pD[0+ps*1] *= tmp;
	w1 += pD[1+ps*1] * pD[0+ps*1];
	w2 += pD[2+ps*1] * pD[0+ps*1];
	w3 += pD[3+ps*1] * pD[0+ps*1];
	//
	pD[0+ps*2] *= tmp;
	w1 += pD[1+ps*2] * pD[0+ps*2];
	w2 += pD[2+ps*2] * pD[0+ps*2];
	w3 += pD[3+ps*2] * pD[0+ps*2];
	//
	pD[0+ps*3] *= tmp;
	w1 += pD[1+ps*3] * pD[0+ps*3];
	w2 += pD[2+ps*3] * pD[0+ps*3];
	w3 += pD[3+ps*3] * pD[0+ps*3];
	//
	for(ii=4; ii<n; ii++)
		{
		pD[0+ps*ii] *= tmp;
		w1 += pD[1+ps*ii] * pD[0+ps*ii];
		w2 += pD[2+ps*ii] * pD[0+ps*ii];
		w3 += pD[3+ps*ii] * pD[0+ps*ii];
		}
	// apply reflector 0 to rows 1..3: row_k += w_k * v_0 with w_k = -tau0*(row_k . v_0)
	w1 = - dD[0] * w1;
	w2 = - dD[0] * w2;
	w3 = - dD[0] * w3;
	//
	pD[1+ps*0] += w1;
	pD[2+ps*0] += w2;
	pD[3+ps*0] += w3;
	//
	pD[1+ps*1] += w1 * pD[0+ps*1];
	pD[2+ps*1] += w2 * pD[0+ps*1];
	pD[3+ps*1] += w3 * pD[0+ps*1];
	//
	pD[1+ps*2] += w1 * pD[0+ps*2];
	pD[2+ps*2] += w2 * pD[0+ps*2];
	pD[3+ps*2] += w3 * pD[0+ps*2];
	beta = pD[1+ps*2] * pD[1+ps*2]; // start accumulating the tail norm of row 1 for col2
	//
	pD[1+ps*3] += w1 * pD[0+ps*3];
	pD[2+ps*3] += w2 * pD[0+ps*3];
	pD[3+ps*3] += w3 * pD[0+ps*3];
	beta += pD[1+ps*3] * pD[1+ps*3];
	//
	for(ii=4; ii<n; ii++)
		{
		pD[1+ps*ii] += w1 * pD[0+ps*ii];
		pD[2+ps*ii] += w2 * pD[0+ps*ii];
		pD[3+ps*ii] += w3 * pD[0+ps*ii];
		beta += pD[1+ps*ii] * pD[1+ps*ii];
		}
	// second column
col2:
	if(beta==0.0)
		{
		dD[1] = 0.0;
		tmp = 0.0;
		goto col3;
		}
	alpha = pD[1+ps*1];
	beta += alpha*alpha;
	beta = sqrt(beta);
	if(alpha>0)
		beta = -beta;
	dD[1] = (beta-alpha) / beta;
	pT[1+ps*1] = - dD[1];
	tmp = 1.0 / (alpha-beta);
	//
	pD[1+ps*1] = beta;
	w0 = pD[0+ps*1]; // w0 = v_0 . v_1, feeds pT only (row 0 is already final)
	w2 = pD[2+ps*1];
	w3 = pD[3+ps*1];
	//
	pD[1+ps*2] *= tmp;
	w0 += pD[0+ps*2] * pD[1+ps*2]; //
	w2 += pD[2+ps*2] * pD[1+ps*2];
	w3 += pD[3+ps*2] * pD[1+ps*2];
	//
	pD[1+ps*3] *= tmp;
	w0 += pD[0+ps*3] * pD[1+ps*3]; //
	w2 += pD[2+ps*3] * pD[1+ps*3];
	w3 += pD[3+ps*3] * pD[1+ps*3];
	//
	for(ii=4; ii<n; ii++)
		{
		pD[1+ps*ii] *= tmp;
		w0 += pD[0+ps*ii] * pD[1+ps*ii]; //
		w2 += pD[2+ps*ii] * pD[1+ps*ii];
		w3 += pD[3+ps*ii] * pD[1+ps*ii];
		}
	// T(0,1) = -tau1 * (v_0 . v_1) * T(0,0)
	pT[0+ps*1] = - dD[1] * (w0*pT[0+ps*0]);
	w2 = - dD[1] * w2;
	w3 = - dD[1] * w3;
	//
	pD[2+ps*1] += w2;
	pD[3+ps*1] += w3;
	//
	pD[2+ps*2] += w2 * pD[1+ps*2];
	pD[3+ps*2] += w3 * pD[1+ps*2];
	//
	pD[2+ps*3] += w2 * pD[1+ps*3];
	pD[3+ps*3] += w3 * pD[1+ps*3];
	beta = pD[2+ps*3] * pD[2+ps*3]; // start tail norm of row 2 for col3
	//
	for(ii=4; ii<n; ii++)
		{
		pD[2+ps*ii] += w2 * pD[1+ps*ii];
		pD[3+ps*ii] += w3 * pD[1+ps*ii];
		beta += pD[2+ps*ii] * pD[2+ps*ii];
		}
	// third column
col3:
	if(beta==0.0)
		{
		dD[2] = 0.0;
		tmp = 0.0;
		goto col4;
		}
	alpha = pD[2+ps*2];
	beta += alpha*alpha;
	beta = sqrt(beta);
	if(alpha>0)
		beta = -beta;
	dD[2] = (beta-alpha) / beta;
	pT[2+ps*2] = - dD[2];
	tmp = 1.0 / (alpha-beta);
	//
	pD[2+ps*2] = beta;
	w0 = pD[0+ps*2];
	w1 = pD[1+ps*2];
	w3 = pD[3+ps*2];
	//
	pD[2+ps*3] *= tmp;
	w0 += pD[0+ps*3] * pD[2+ps*3];
	w1 += pD[1+ps*3] * pD[2+ps*3];
	w3 += pD[3+ps*3] * pD[2+ps*3];
	//
	for(ii=4; ii<n; ii++)
		{
		pD[2+ps*ii] *= tmp;
		w0 += pD[0+ps*ii] * pD[2+ps*ii];
		w1 += pD[1+ps*ii] * pD[2+ps*ii];
		w3 += pD[3+ps*ii] * pD[2+ps*ii];
		}
	// column 2 of T from the dot products with the previous reflectors
	pT[1+ps*2] = - dD[2] * (w1*pT[1+ps*1]);
	pT[0+ps*2] = - dD[2] * (w0*pT[0+ps*0] + w1*pT[0+ps*1]);
	w3 = - dD[2] * w3;
	//
	pD[3+ps*2] += w3;
	//
	pD[3+ps*3] += w3 * pD[2+ps*3];
	//
	beta = 0.0;
	for(ii=4; ii<n; ii++)
		{
		pD[3+ps*ii] += w3 * pD[2+ps*ii];
		beta += pD[3+ps*ii] * pD[3+ps*ii];
		}
	// fourth column
col4:
	if(beta==0.0)
		{
		dD[3] = 0.0;
		tmp = 0.0;
		return;
		}
	alpha = pD[3+ps*3];
	beta += alpha*alpha;
	beta = sqrt(beta);
	if(alpha>0)
		beta = -beta;
	dD[3] = (beta-alpha) / beta;
	pT[3+ps*3] = - dD[3];
	tmp = 1.0 / (alpha-beta);
	//
	pD[3+ps*3] = beta;
	w0 =  pD[0+ps*3];
	w1 =  pD[1+ps*3];
	w2 =  pD[2+ps*3];
	//
	for(ii=4; ii<n; ii++)
		{
		pD[3+ps*ii] *= tmp;
		w0 += pD[0+ps*ii] * pD[3+ps*ii];
		w1 += pD[1+ps*ii] * pD[3+ps*ii];
		w2 += pD[2+ps*ii] * pD[3+ps*ii];
		}
	// last column of T; no trailing rows left to update
	pT[2+ps*3] = - dD[3] * (w2*pT[2+ps*2]);
	pT[1+ps*3] = - dD[3] * (w1*pT[1+ps*1] + w2*pT[1+ps*2]);
	pT[0+ps*3] = - dD[3] * (w0*pT[0+ps*0] + w1*pT[0+ps*1] + w2*pT[0+ps*2]);
	return;
	}
#endif
+
+
+
// Apply a block of 4 Householder reflectors from the right to one row:
// D <- D * (I + V^T * T * V), where V is the 4 x kmax panel of reflectors
// (row i holds v_i in columns i+1..kmax-1, unit pivot implicit), pT is the
// 4x4 upper triangular T factor, and pD is the 1 x kmax row being updated
// (element k at pD[0+ps*k]). Assumes kmax >= 4 (columns 0..3 are touched
// unconditionally, as in the companion kernels).
void kernel_dlarfb4_r_1_lib4(int kmax, double *pV, double *pT, double *pD)
	{
	const int ps = 4;
	double pW[16];
	double acc;
	int ii, kk, nv;
	// phase 1: W = D * V^T (the implicit unit diagonal of V gives the
	// pD[0+ps*ii] seed of each accumulator)
	pW[0+ps*0] = pD[0+ps*0];
	pW[0+ps*1] = pD[0+ps*1];
	pW[0+ps*2] = pD[0+ps*2];
	pW[0+ps*3] = pD[0+ps*3];
	for(kk=1; kk<kmax; kk++)
		{
		nv = kk<4 ? kk : 4; // reflector ii only has entries in columns > ii
		for(ii=0; ii<nv; ii++)
			{
			pW[0+ps*ii] += pD[0+ps*kk] * pV[ii+ps*kk];
			}
		}
	// phase 2: W = W * T, highest column first so the old W values are used
	for(kk=3; kk>=0; kk--)
		{
		acc = pW[0+ps*0] * pT[0+ps*kk];
		for(ii=1; ii<=kk; ii++)
			{
			acc += pW[0+ps*ii] * pT[ii+ps*kk];
			}
		pW[0+ps*kk] = acc;
		}
	// phase 3: D += W * V (again the implicit unit contributes pW[0+ps*kk]
	// to columns kk < 4)
	for(kk=0; kk<kmax; kk++)
		{
		nv = kk<4 ? kk : 4;
		if(nv==0)
			{
			acc = pW[0+ps*0];
			}
		else
			{
			acc = pW[0+ps*0] * pV[0+ps*kk];
			for(ii=1; ii<nv; ii++)
				{
				acc += pW[0+ps*ii] * pV[ii+ps*kk];
				}
			if(kk<4)
				acc += pW[0+ps*kk];
			}
		pD[0+ps*kk] += acc;
		}
	return;
	}
+
+
+
+