blob: a5faf2073f770725fb8098e209334c1ac29f4620 [file] [log] [blame]
Austin Schuh9a24b372018-01-28 16:12:29 -08001/**************************************************************************************************
2* *
3* This file is part of BLASFEO. *
4* *
5* BLASFEO -- BLAS For Embedded Optimization. *
6* Copyright (C) 2016-2017 by Gianluca Frison. *
7* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl. *
8* All rights reserved. *
9* *
10* HPMPC is free software; you can redistribute it and/or *
11* modify it under the terms of the GNU Lesser General Public *
12* License as published by the Free Software Foundation; either *
13* version 2.1 of the License, or (at your option) any later version. *
14* *
15* HPMPC is distributed in the hope that it will be useful, *
16* but WITHOUT ANY WARRANTY; without even the implied warranty of *
17* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. *
18* See the GNU Lesser General Public License for more details. *
19* *
20* You should have received a copy of the GNU Lesser General Public *
21* License along with HPMPC; if not, write to the Free Software *
22* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA *
23* *
24* Author: Gianluca Frison, giaf (at) dtu.dk *
25* gianluca.frison (at) imtek.uni-freiburg.de *
26* *
27**************************************************************************************************/
28
29#include <math.h>
30#include <stdio.h>
31
32#include <mmintrin.h>
33#include <xmmintrin.h> // SSE
34#include <emmintrin.h> // SSE2
35#include <pmmintrin.h> // SSE3
36#include <smmintrin.h> // SSE4
37#include <immintrin.h> // AVX
38
39#include "../../include/blasfeo_common.h"
40#include "../../include/blasfeo_d_aux.h"
41#include "../../include/blasfeo_d_kernel.h"
42
43
44
45void kernel_dgeqrf_4_lib4(int m, double *pD, int sdd, double *dD)
46 {
47 int ii, jj, ll;
48 double alpha, beta, tmp, w1, w2, w3;
49 const int ps = 4;
50 // first column
51 beta = 0.0;
52 ii = 1;
53 if(m>1)
54 {
55 tmp = pD[1+ps*0];
56 beta += tmp*tmp;
57 if(m>2)
58 {
59 tmp = pD[2+ps*0];
60 beta += tmp*tmp;
61 if(m>3)
62 {
63 tmp = pD[3+ps*0];
64 beta += tmp*tmp;
65 }
66 }
67 }
68 for(ii=4; ii<m-3; ii+=4)
69 {
70 tmp = pD[0+ii*sdd+ps*0];
71 beta += tmp*tmp;
72 tmp = pD[1+ii*sdd+ps*0];
73 beta += tmp*tmp;
74 tmp = pD[2+ii*sdd+ps*0];
75 beta += tmp*tmp;
76 tmp = pD[3+ii*sdd+ps*0];
77 beta += tmp*tmp;
78 }
79 for(ll=0; ll<m-ii; ll++)
80 {
81 tmp = pD[ll+ii*sdd+ps*0];
82 beta += tmp*tmp;
83 }
84 if(beta==0.0)
85 {
86 // tau
87 dD[0] = 0.0;
88 }
89 else
90 {
91 alpha = pD[0+ps*0];
92 beta += alpha*alpha;
93 beta = sqrt(beta);
94 if(alpha>0)
95 beta = -beta;
96 // tau0
97 dD[0] = (beta-alpha) / beta;
98 tmp = 1.0 / (alpha-beta);
99 // compute v0
100 pD[0+ps*0] = beta;
101 ii = 1;
102 if(m>1)
103 {
104 pD[1+ps*0] *= tmp;
105 if(m>2)
106 {
107 pD[2+ps*0] *= tmp;
108 if(m>3)
109 {
110 pD[3+ps*0] *= tmp;
111 }
112 }
113 }
114 for(ii=4; ii<m-3; ii+=4)
115 {
116 pD[0+ii*sdd+ps*0] *= tmp;
117 pD[1+ii*sdd+ps*0] *= tmp;
118 pD[2+ii*sdd+ps*0] *= tmp;
119 pD[3+ii*sdd+ps*0] *= tmp;
120 }
121 for(ll=0; ll<m-ii; ll++)
122 {
123 pD[ll+ii*sdd+ps*0] *= tmp;
124 }
125 }
126 // gemv_t & ger
127 w1 = pD[0+ps*1];
128 w2 = pD[0+ps*2];
129 w3 = pD[0+ps*3];
130 if(m>1)
131 {
132 w1 += pD[1+ps*1] * pD[1+ps*0];
133 w2 += pD[1+ps*2] * pD[1+ps*0];
134 w3 += pD[1+ps*3] * pD[1+ps*0];
135 if(m>2)
136 {
137 w1 += pD[2+ps*1] * pD[2+ps*0];
138 w2 += pD[2+ps*2] * pD[2+ps*0];
139 w3 += pD[2+ps*3] * pD[2+ps*0];
140 if(m>3)
141 {
142 w1 += pD[3+ps*1] * pD[3+ps*0];
143 w2 += pD[3+ps*2] * pD[3+ps*0];
144 w3 += pD[3+ps*3] * pD[3+ps*0];
145 }
146 }
147 }
148 for(ii=4; ii<m-3; ii+=4)
149 {
150 w1 += pD[0+ii*sdd+ps*1] * pD[0+ii*sdd+ps*0];
151 w2 += pD[0+ii*sdd+ps*2] * pD[0+ii*sdd+ps*0];
152 w3 += pD[0+ii*sdd+ps*3] * pD[0+ii*sdd+ps*0];
153 w1 += pD[1+ii*sdd+ps*1] * pD[1+ii*sdd+ps*0];
154 w2 += pD[1+ii*sdd+ps*2] * pD[1+ii*sdd+ps*0];
155 w3 += pD[1+ii*sdd+ps*3] * pD[1+ii*sdd+ps*0];
156 w1 += pD[2+ii*sdd+ps*1] * pD[2+ii*sdd+ps*0];
157 w2 += pD[2+ii*sdd+ps*2] * pD[2+ii*sdd+ps*0];
158 w3 += pD[2+ii*sdd+ps*3] * pD[2+ii*sdd+ps*0];
159 w1 += pD[3+ii*sdd+ps*1] * pD[3+ii*sdd+ps*0];
160 w2 += pD[3+ii*sdd+ps*2] * pD[3+ii*sdd+ps*0];
161 w3 += pD[3+ii*sdd+ps*3] * pD[3+ii*sdd+ps*0];
162 }
163 for(ll=0; ll<m-ii; ll++)
164 {
165 w1 += pD[ll+ii*sdd+ps*1] * pD[ll+ii*sdd+ps*0];
166 w2 += pD[ll+ii*sdd+ps*2] * pD[ll+ii*sdd+ps*0];
167 w3 += pD[ll+ii*sdd+ps*3] * pD[ll+ii*sdd+ps*0];
168 }
169 w1 = - dD[0] * w1;
170 w2 = - dD[0] * w2;
171 w3 = - dD[0] * w3;
172 pD[0+ps*1] += w1;
173 pD[0+ps*2] += w2;
174 pD[0+ps*3] += w3;
175 if(m>1)
176 {
177 pD[1+ps*1] += w1 * pD[1+ps*0];
178 pD[1+ps*2] += w2 * pD[1+ps*0];
179 pD[1+ps*3] += w3 * pD[1+ps*0];
180 if(m>2)
181 {
182 pD[2+ps*1] += w1 * pD[2+ps*0];
183 pD[2+ps*2] += w2 * pD[2+ps*0];
184 pD[2+ps*3] += w3 * pD[2+ps*0];
185 if(m>3)
186 {
187 pD[3+ps*1] += w1 * pD[3+ps*0];
188 pD[3+ps*2] += w2 * pD[3+ps*0];
189 pD[3+ps*3] += w3 * pD[3+ps*0];
190 }
191 }
192 }
193 for(ii=4; ii<m-3; ii+=4)
194 {
195 pD[0+ii*sdd+ps*1] += w1 * pD[0+ii*sdd+ps*0];
196 pD[0+ii*sdd+ps*2] += w2 * pD[0+ii*sdd+ps*0];
197 pD[0+ii*sdd+ps*3] += w3 * pD[0+ii*sdd+ps*0];
198 pD[1+ii*sdd+ps*1] += w1 * pD[1+ii*sdd+ps*0];
199 pD[1+ii*sdd+ps*2] += w2 * pD[1+ii*sdd+ps*0];
200 pD[1+ii*sdd+ps*3] += w3 * pD[1+ii*sdd+ps*0];
201 pD[2+ii*sdd+ps*1] += w1 * pD[2+ii*sdd+ps*0];
202 pD[2+ii*sdd+ps*2] += w2 * pD[2+ii*sdd+ps*0];
203 pD[2+ii*sdd+ps*3] += w3 * pD[2+ii*sdd+ps*0];
204 pD[3+ii*sdd+ps*1] += w1 * pD[3+ii*sdd+ps*0];
205 pD[3+ii*sdd+ps*2] += w2 * pD[3+ii*sdd+ps*0];
206 pD[3+ii*sdd+ps*3] += w3 * pD[3+ii*sdd+ps*0];
207 }
208 for(ll=0; ll<m-ii; ll++)
209 {
210 pD[ll+ii*sdd+ps*1] += w1 * pD[ll+ii*sdd+ps*0];
211 pD[ll+ii*sdd+ps*2] += w2 * pD[ll+ii*sdd+ps*0];
212 pD[ll+ii*sdd+ps*3] += w3 * pD[ll+ii*sdd+ps*0];
213 }
214 if(m==1)
215 return;
216 // second column
217 beta = 0.0;
218 if(m>2)
219 {
220 tmp = pD[2+ps*1];
221 beta += tmp*tmp;
222 if(m>3)
223 {
224 tmp = pD[3+ps*1];
225 beta += tmp*tmp;
226 }
227 }
228 for(ii=4; ii<m-3; ii+=4)
229 {
230 tmp = pD[0+ii*sdd+ps*1];
231 beta += tmp*tmp;
232 tmp = pD[1+ii*sdd+ps*1];
233 beta += tmp*tmp;
234 tmp = pD[2+ii*sdd+ps*1];
235 beta += tmp*tmp;
236 tmp = pD[3+ii*sdd+ps*1];
237 beta += tmp*tmp;
238 }
239 for(ll=0; ll<m-ii; ll++)
240 {
241 tmp = pD[ll+ii*sdd+ps*1];
242 beta += tmp*tmp;
243 }
244 if(beta==0.0)
245 {
246 // tau
247 dD[1] = 0.0;
248 }
249 else
250 {
251 alpha = pD[1+ps*1];
252 beta += alpha*alpha;
253 beta = sqrt(beta);
254 if(alpha>0)
255 beta = -beta;
256 // tau0
257 dD[1] = (beta-alpha) / beta;
258 tmp = 1.0 / (alpha-beta);
259 // compute v0
260 pD[1+ps*1] = beta;
261 if(m>2)
262 {
263 pD[2+ps*1] *= tmp;
264 if(m>3)
265 {
266 pD[3+ps*1] *= tmp;
267 }
268 }
269 for(ii=4; ii<m-3; ii+=4)
270 {
271 pD[0+ii*sdd+ps*1] *= tmp;
272 pD[1+ii*sdd+ps*1] *= tmp;
273 pD[2+ii*sdd+ps*1] *= tmp;
274 pD[3+ii*sdd+ps*1] *= tmp;
275 }
276 for(ll=0; ll<m-ii; ll++)
277 {
278 pD[ll+ii*sdd+ps*1] *= tmp;
279 }
280 }
281 // gemv_t & ger
282 w2 = pD[1+ps*2];
283 w3 = pD[1+ps*3];
284 if(m>2)
285 {
286 w2 += pD[2+ps*2] * pD[2+ps*1];
287 w3 += pD[2+ps*3] * pD[2+ps*1];
288 if(m>3)
289 {
290 w2 += pD[3+ps*2] * pD[3+ps*1];
291 w3 += pD[3+ps*3] * pD[3+ps*1];
292 }
293 }
294 for(ii=4; ii<m-3; ii+=4)
295 {
296 w2 += pD[0+ii*sdd+ps*2] * pD[0+ii*sdd+ps*1];
297 w3 += pD[0+ii*sdd+ps*3] * pD[0+ii*sdd+ps*1];
298 w2 += pD[1+ii*sdd+ps*2] * pD[1+ii*sdd+ps*1];
299 w3 += pD[1+ii*sdd+ps*3] * pD[1+ii*sdd+ps*1];
300 w2 += pD[2+ii*sdd+ps*2] * pD[2+ii*sdd+ps*1];
301 w3 += pD[2+ii*sdd+ps*3] * pD[2+ii*sdd+ps*1];
302 w2 += pD[3+ii*sdd+ps*2] * pD[3+ii*sdd+ps*1];
303 w3 += pD[3+ii*sdd+ps*3] * pD[3+ii*sdd+ps*1];
304 }
305 for(ll=0; ll<m-ii; ll++)
306 {
307 w2 += pD[ll+ii*sdd+ps*2] * pD[ll+ii*sdd+ps*1];
308 w3 += pD[ll+ii*sdd+ps*3] * pD[ll+ii*sdd+ps*1];
309 }
310 w2 = - dD[1] * w2;
311 w3 = - dD[1] * w3;
312 pD[1+ps*2] += w2;
313 pD[1+ps*3] += w3;
314 if(m>2)
315 {
316 pD[2+ps*2] += w2 * pD[2+ps*1];
317 pD[2+ps*3] += w3 * pD[2+ps*1];
318 if(m>3)
319 {
320 pD[3+ps*2] += w2 * pD[3+ps*1];
321 pD[3+ps*3] += w3 * pD[3+ps*1];
322 }
323 }
324 for(ii=4; ii<m-3; ii+=4)
325 {
326 pD[0+ii*sdd+ps*2] += w2 * pD[0+ii*sdd+ps*1];
327 pD[0+ii*sdd+ps*3] += w3 * pD[0+ii*sdd+ps*1];
328 pD[1+ii*sdd+ps*2] += w2 * pD[1+ii*sdd+ps*1];
329 pD[1+ii*sdd+ps*3] += w3 * pD[1+ii*sdd+ps*1];
330 pD[2+ii*sdd+ps*2] += w2 * pD[2+ii*sdd+ps*1];
331 pD[2+ii*sdd+ps*3] += w3 * pD[2+ii*sdd+ps*1];
332 pD[3+ii*sdd+ps*2] += w2 * pD[3+ii*sdd+ps*1];
333 pD[3+ii*sdd+ps*3] += w3 * pD[3+ii*sdd+ps*1];
334 }
335 for(ll=0; ll<m-ii; ll++)
336 {
337 pD[ll+ii*sdd+ps*2] += w2 * pD[ll+ii*sdd+ps*1];
338 pD[ll+ii*sdd+ps*3] += w3 * pD[ll+ii*sdd+ps*1];
339 }
340 if(m==2)
341 return;
342 // third column
343 beta = 0.0;
344 if(m>3)
345 {
346 tmp = pD[3+ps*2];
347 beta += tmp*tmp;
348 }
349 for(ii=4; ii<m-3; ii+=4)
350 {
351 tmp = pD[0+ii*sdd+ps*2];
352 beta += tmp*tmp;
353 tmp = pD[1+ii*sdd+ps*2];
354 beta += tmp*tmp;
355 tmp = pD[2+ii*sdd+ps*2];
356 beta += tmp*tmp;
357 tmp = pD[3+ii*sdd+ps*2];
358 beta += tmp*tmp;
359 }
360 for(ll=0; ll<m-ii; ll++)
361 {
362 tmp = pD[ll+ii*sdd+ps*2];
363 beta += tmp*tmp;
364 }
365 if(beta==0.0)
366 {
367 // tau
368 dD[2] = 0.0;
369 }
370 else
371 {
372 alpha = pD[2+ps*2];
373 beta += alpha*alpha;
374 beta = sqrt(beta);
375 if(alpha>0)
376 beta = -beta;
377 // tau0
378 dD[2] = (beta-alpha) / beta;
379 tmp = 1.0 / (alpha-beta);
380 // compute v0
381 pD[2+ps*2] = beta;
382 if(m>3)
383 {
384 pD[3+ps*2] *= tmp;
385 }
386 for(ii=4; ii<m-3; ii+=4)
387 {
388 pD[0+ii*sdd+ps*2] *= tmp;
389 pD[1+ii*sdd+ps*2] *= tmp;
390 pD[2+ii*sdd+ps*2] *= tmp;
391 pD[3+ii*sdd+ps*2] *= tmp;
392 }
393 for(ll=0; ll<m-ii; ll++)
394 {
395 pD[ll+ii*sdd+ps*2] *= tmp;
396 }
397 }
398 // gemv_t & ger
399 w3 = pD[2+ps*3];
400 if(m>3)
401 {
402 w3 += pD[3+ps*3] * pD[3+ps*2];
403 }
404 for(ii=4; ii<m-3; ii+=4)
405 {
406 w3 += pD[0+ii*sdd+ps*3] * pD[0+ii*sdd+ps*2];
407 w3 += pD[1+ii*sdd+ps*3] * pD[1+ii*sdd+ps*2];
408 w3 += pD[2+ii*sdd+ps*3] * pD[2+ii*sdd+ps*2];
409 w3 += pD[3+ii*sdd+ps*3] * pD[3+ii*sdd+ps*2];
410 }
411 for(ll=0; ll<m-ii; ll++)
412 {
413 w3 += pD[ll+ii*sdd+ps*3] * pD[ll+ii*sdd+ps*2];
414 }
415 w3 = - dD[2] * w3;
416 pD[2+ps*3] += w3;
417 if(m>3)
418 {
419 pD[3+ps*3] += w3 * pD[3+ps*2];
420 }
421 for(ii=4; ii<m-3; ii+=4)
422 {
423 pD[0+ii*sdd+ps*3] += w3 * pD[0+ii*sdd+ps*2];
424 pD[1+ii*sdd+ps*3] += w3 * pD[1+ii*sdd+ps*2];
425 pD[2+ii*sdd+ps*3] += w3 * pD[2+ii*sdd+ps*2];
426 pD[3+ii*sdd+ps*3] += w3 * pD[3+ii*sdd+ps*2];
427 }
428 for(ll=0; ll<m-ii; ll++)
429 {
430 pD[ll+ii*sdd+ps*3] += w3 * pD[ll+ii*sdd+ps*2];
431 }
432 if(m==3)
433 return;
434 // fourth column
435 beta = 0.0;
436 for(ii=4; ii<m-3; ii+=4)
437 {
438 tmp = pD[0+ii*sdd+ps*3];
439 beta += tmp*tmp;
440 tmp = pD[1+ii*sdd+ps*3];
441 beta += tmp*tmp;
442 tmp = pD[2+ii*sdd+ps*3];
443 beta += tmp*tmp;
444 tmp = pD[3+ii*sdd+ps*3];
445 beta += tmp*tmp;
446 }
447 for(ll=0; ll<m-ii; ll++)
448 {
449 tmp = pD[ll+ii*sdd+ps*3];
450 beta += tmp*tmp;
451 }
452 if(beta==0.0)
453 {
454 // tau
455 dD[3] = 0.0;
456 }
457 else
458 {
459 alpha = pD[3+ps*3];
460 beta += alpha*alpha;
461 beta = sqrt(beta);
462 if(alpha>0)
463 beta = -beta;
464 // tau0
465 dD[3] = (beta-alpha) / beta;
466 tmp = 1.0 / (alpha-beta);
467 // compute v0
468 pD[3+ps*3] = beta;
469 for(ii=4; ii<m-3; ii+=4)
470 {
471 pD[0+ii*sdd+ps*3] *= tmp;
472 pD[1+ii*sdd+ps*3] *= tmp;
473 pD[2+ii*sdd+ps*3] *= tmp;
474 pD[3+ii*sdd+ps*3] *= tmp;
475 }
476 for(ll=0; ll<m-ii; ll++)
477 {
478 pD[ll+ii*sdd+ps*3] *= tmp;
479 }
480 }
481 return;
482 }
483
484
485// unblocked algorithm
486void kernel_dgeqrf_vs_lib4(int m, int n, int k, int offD, double *pD, int sdd, double *dD)
487 {
488 if(m<=0 | n<=0)
489 return;
490 int ii, jj, kk, ll, imax, jmax, jmax0, kmax, kmax0;
491 const int ps = 4;
492 imax = k;//m<n ? m : n;
493 double alpha, beta, tmp, w0;
494 double *pC00, *pC10, *pC01, *pC11;
495 int offset;
496 double *pD0 = pD-offD;
497 for(ii=0; ii<imax; ii++)
498 {
499 pC00 = &pD0[((offD+ii)&(ps-1))+((offD+ii)-((offD+ii)&(ps-1)))*sdd+ii*ps];
500 pC10 = &pD0[((offD+ii+1)&(ps-1))+((offD+ii+1)-((offD+ii+1)&(ps-1)))*sdd+ii*ps];
501 beta = 0.0;
502 jmax = m-ii-1;
503 jmax0 = (ps-((ii+1+offD)&(ps-1)))&(ps-1);
504 jmax0 = jmax<jmax0 ? jmax : jmax0;
505 offset = 0;
506 jj = 0;
507 if(jmax0>0)
508 {
509 for( ; jj<jmax0; jj++)
510 {
511 tmp = pC10[0+offset];
512 beta += tmp*tmp;
513 offset += 1;
514 }
515 offset += -ps+ps*sdd;
516 }
517 for( ; jj<jmax-3; jj+=4)
518 {
519 tmp = pC10[0+offset];
520 beta += tmp*tmp;
521 tmp = pC10[1+offset];
522 beta += tmp*tmp;
523 tmp = pC10[2+offset];
524 beta += tmp*tmp;
525 tmp = pC10[3+offset];
526 beta += tmp*tmp;
527 offset += ps*sdd;
528 }
529 for(ll=0; ll<jmax-jj; ll++)
530 {
531 tmp = pC10[0+offset];
532 beta += tmp*tmp;
533 offset += 1;
534 }
535 if(beta==0.0)
536 {
537 dD[ii] = 0.0;
538 }
539 else
540 {
541 alpha = pC00[0];
542 beta += alpha*alpha;
543 beta = sqrt(beta);
544 if(alpha>0)
545 beta = -beta;
546 dD[ii] = (beta-alpha) / beta;
547 tmp = 1.0 / (alpha-beta);
548 offset = 0;
549 jj = 0;
550 if(jmax0>0)
551 {
552 for( ; jj<jmax0; jj++)
553 {
554 pC10[0+offset] *= tmp;
555 offset += 1;
556 }
557 offset += -ps+ps*sdd;
558 }
559 for( ; jj<jmax-3; jj+=4)
560 {
561 pC10[0+offset] *= tmp;
562 pC10[1+offset] *= tmp;
563 pC10[2+offset] *= tmp;
564 pC10[3+offset] *= tmp;
565 offset += ps*sdd;
566 }
567 for(ll=0; ll<jmax-jj; ll++)
568 {
569 pC10[0+offset] *= tmp;
570 offset += 1;
571 }
572 pC00[0] = beta;
573 }
574 if(ii<n)
575 {
576 pC01 = pC00 + ps;
577 pC11 = pC10 + ps;
578 kmax = jmax;
579 kmax0 = jmax0;
580 jmax = n-ii-1;
581 jj = 0;
582 for( ; jj<jmax; jj++)
583 {
584 w0 = pC01[0+ps*jj] * 1.0;
585 offset = 0;
586 kk = 0;
587 if(kmax0>0)
588 {
589 for( ; kk<kmax0; kk++)
590 {
591 w0 += pC11[0+offset+ps*jj] * pC10[0+offset];
592 offset += 1;
593 }
594 offset += -ps+ps*sdd;
595 }
596 for( ; kk<kmax-3; kk+=4)
597 {
598 w0 += pC11[0+offset+ps*jj] * pC10[0+offset];
599 w0 += pC11[1+offset+ps*jj] * pC10[1+offset];
600 w0 += pC11[2+offset+ps*jj] * pC10[2+offset];
601 w0 += pC11[3+offset+ps*jj] * pC10[3+offset];
602 offset += ps*sdd;
603 }
604 for(ll=0; ll<kmax-kk; ll++)
605 {
606 w0 += pC11[0+offset+ps*jj] * pC10[0+offset];
607 offset += 1;
608 }
609 w0 = - dD[ii] * w0;
610 pC01[0+ps*jj] += w0;
611 offset = 0;
612 kk = 0;
613 if(kmax0>0)
614 {
615 for( ; kk<kmax0; kk++)
616 {
617 pC11[0+offset+ps*jj] += w0 * pC10[0+offset];
618 offset += 1;
619 }
620 offset = offset-ps+ps*sdd;
621 }
622 for( ; kk<kmax-3; kk+=4)
623 {
624 pC11[0+offset+ps*jj] += w0 * pC10[0+offset];
625 pC11[1+offset+ps*jj] += w0 * pC10[1+offset];
626 pC11[2+offset+ps*jj] += w0 * pC10[2+offset];
627 pC11[3+offset+ps*jj] += w0 * pC10[3+offset];
628 offset += ps*sdd;
629 }
630 for(ll=0; ll<kmax-kk; ll++)
631 {
632 pC11[0+offset+ps*jj] += w0 * pC10[0+offset];
633 offset += 1;
634 }
635 }
636 }
637 }
638 return;
639 }
640
641
642
643void kernel_dlarf_4_lib4(int m, int n, double *pD, int sdd, double *dD, double *pC0, int sdc)
644 {
645 if(m<=0 | n<=0)
646 return;
647 int ii, jj, ll;
648 const int ps = 4;
649 double v10,
650 v20, v21,
651 v30, v31, v32;
652 double tmp, d0, d1, d2, d3;
653 double *pC;
654 double pT[16];// = {};
655 int ldt = 4;
656 double pW[8];// = {};
657 int ldw = 2;
658 // dot product of v
659 v10 = 0.0;
660 v20 = 0.0;
661 v30 = 0.0;
662 v21 = 0.0;
663 v31 = 0.0;
664 v32 = 0.0;
665 if(m>1)
666 {
667 v10 = 1.0 * pD[1+ps*0];
668 if(m>2)
669 {
670 v10 += pD[2+ps*1] * pD[2+ps*0];
671 v20 = 1.0 * pD[2+ps*0];
672 v21 = 1.0 * pD[2+ps*1];
673 if(m>3)
674 {
675 v10 += pD[3+ps*1] * pD[3+ps*0];
676 v20 += pD[3+ps*2] * pD[3+ps*0];
677 v21 += pD[3+ps*2] * pD[3+ps*1];
678 v30 = 1.0 * pD[3+ps*0];
679 v31 = 1.0 * pD[3+ps*1];
680 v32 = 1.0 * pD[3+ps*2];
681 }
682 }
683 }
684 for(ii=4; ii<m-3; ii+=4)
685 {
686 v10 += pD[0+ii*sdd+ps*1] * pD[0+ii*sdd+ps*0];
687 v20 += pD[0+ii*sdd+ps*2] * pD[0+ii*sdd+ps*0];
688 v21 += pD[0+ii*sdd+ps*2] * pD[0+ii*sdd+ps*1];
689 v30 += pD[0+ii*sdd+ps*3] * pD[0+ii*sdd+ps*0];
690 v31 += pD[0+ii*sdd+ps*3] * pD[0+ii*sdd+ps*1];
691 v32 += pD[0+ii*sdd+ps*3] * pD[0+ii*sdd+ps*2];
692 v10 += pD[1+ii*sdd+ps*1] * pD[1+ii*sdd+ps*0];
693 v20 += pD[1+ii*sdd+ps*2] * pD[1+ii*sdd+ps*0];
694 v21 += pD[1+ii*sdd+ps*2] * pD[1+ii*sdd+ps*1];
695 v30 += pD[1+ii*sdd+ps*3] * pD[1+ii*sdd+ps*0];
696 v31 += pD[1+ii*sdd+ps*3] * pD[1+ii*sdd+ps*1];
697 v32 += pD[1+ii*sdd+ps*3] * pD[1+ii*sdd+ps*2];
698 v10 += pD[2+ii*sdd+ps*1] * pD[2+ii*sdd+ps*0];
699 v20 += pD[2+ii*sdd+ps*2] * pD[2+ii*sdd+ps*0];
700 v21 += pD[2+ii*sdd+ps*2] * pD[2+ii*sdd+ps*1];
701 v30 += pD[2+ii*sdd+ps*3] * pD[2+ii*sdd+ps*0];
702 v31 += pD[2+ii*sdd+ps*3] * pD[2+ii*sdd+ps*1];
703 v32 += pD[2+ii*sdd+ps*3] * pD[2+ii*sdd+ps*2];
704 v10 += pD[3+ii*sdd+ps*1] * pD[3+ii*sdd+ps*0];
705 v20 += pD[3+ii*sdd+ps*2] * pD[3+ii*sdd+ps*0];
706 v21 += pD[3+ii*sdd+ps*2] * pD[3+ii*sdd+ps*1];
707 v30 += pD[3+ii*sdd+ps*3] * pD[3+ii*sdd+ps*0];
708 v31 += pD[3+ii*sdd+ps*3] * pD[3+ii*sdd+ps*1];
709 v32 += pD[3+ii*sdd+ps*3] * pD[3+ii*sdd+ps*2];
710 }
711 for(ll=0; ll<m-ii; ll++)
712 {
713 v10 += pD[ll+ii*sdd+ps*1] * pD[ll+ii*sdd+ps*0];
714 v20 += pD[ll+ii*sdd+ps*2] * pD[ll+ii*sdd+ps*0];
715 v21 += pD[ll+ii*sdd+ps*2] * pD[ll+ii*sdd+ps*1];
716 v30 += pD[ll+ii*sdd+ps*3] * pD[ll+ii*sdd+ps*0];
717 v31 += pD[ll+ii*sdd+ps*3] * pD[ll+ii*sdd+ps*1];
718 v32 += pD[ll+ii*sdd+ps*3] * pD[ll+ii*sdd+ps*2];
719 }
720 // compute lower triangular T containing tau for matrix update
721 pT[0+ldt*0] = dD[0];
722 pT[1+ldt*1] = dD[1];
723 pT[2+ldt*2] = dD[2];
724 pT[3+ldt*3] = dD[3];
725 pT[1+ldt*0] = - dD[1] * (v10*pT[0+ldt*0]);
726 pT[2+ldt*1] = - dD[2] * (v21*pT[1+ldt*1]);
727 pT[3+ldt*2] = - dD[3] * (v32*pT[2+ldt*2]);
728 pT[2+ldt*0] = - dD[2] * (v20*pT[0+ldt*0] + v21*pT[1+ldt*0]);
729 pT[3+ldt*1] = - dD[3] * (v31*pT[1+ldt*1] + v32*pT[2+ldt*1]);
730 pT[3+ldt*0] = - dD[3] * (v30*pT[0+ldt*0] + v31*pT[1+ldt*0] + v32*pT[2+ldt*0]);
731 // downgrade matrix
732 pW[0] = 0.0;
733 pW[1] = 0.0;
734 pW[2] = 0.0;
735 pW[3] = 0.0;
736 pW[4] = 0.0;
737 pW[5] = 0.0;
738 pW[6] = 0.0;
739 pW[7] = 0.0;
740 ii = 0;
741 for( ; ii<n-1; ii+=2)
742 {
743 pC = pC0+ii*ps;
744 // compute W^T = C^T * V
745 tmp = pC[0+ps*0];
746 pW[0+ldw*0] = tmp;
747 tmp = pC[0+ps*1];
748 pW[1+ldw*0] = tmp;
749 if(m>1)
750 {
751 d0 = pD[1+ps*0];
752 tmp = pC[1+ps*0];
753 pW[0+ldw*0] += tmp * d0;
754 pW[0+ldw*1] = tmp;
755 tmp = pC[1+ps*1];
756 pW[1+ldw*0] += tmp * d0;
757 pW[1+ldw*1] = tmp;
758 if(m>2)
759 {
760 d0 = pD[2+ps*0];
761 d1 = pD[2+ps*1];
762 tmp = pC[2+ps*0];
763 pW[0+ldw*0] += tmp * d0;
764 pW[0+ldw*1] += tmp * d1;
765 pW[0+ldw*2] = tmp;
766 tmp = pC[2+ps*1];
767 pW[1+ldw*0] += tmp * d0;
768 pW[1+ldw*1] += tmp * d1;
769 pW[1+ldw*2] = tmp;
770 if(m>3)
771 {
772 d0 = pD[3+ps*0];
773 d1 = pD[3+ps*1];
774 d2 = pD[3+ps*2];
775 tmp = pC[3+ps*0];
776 pW[0+ldw*0] += tmp * d0;
777 pW[0+ldw*1] += tmp * d1;
778 pW[0+ldw*2] += tmp * d2;
779 pW[0+ldw*3] = tmp;
780 tmp = pC[3+ps*1];
781 pW[1+ldw*0] += tmp * d0;
782 pW[1+ldw*1] += tmp * d1;
783 pW[1+ldw*2] += tmp * d2;
784 pW[1+ldw*3] = tmp;
785 }
786 }
787 }
788 for(jj=4; jj<m-3; jj+=4)
789 {
790 //
791 d0 = pD[0+jj*sdd+ps*0];
792 d1 = pD[0+jj*sdd+ps*1];
793 d2 = pD[0+jj*sdd+ps*2];
794 d3 = pD[0+jj*sdd+ps*3];
795 tmp = pC[0+jj*sdc+ps*0];
796 pW[0+ldw*0] += tmp * d0;
797 pW[0+ldw*1] += tmp * d1;
798 pW[0+ldw*2] += tmp * d2;
799 pW[0+ldw*3] += tmp * d3;
800 tmp = pC[0+jj*sdc+ps*1];
801 pW[1+ldw*0] += tmp * d0;
802 pW[1+ldw*1] += tmp * d1;
803 pW[1+ldw*2] += tmp * d2;
804 pW[1+ldw*3] += tmp * d3;
805 //
806 d0 = pD[1+jj*sdd+ps*0];
807 d1 = pD[1+jj*sdd+ps*1];
808 d2 = pD[1+jj*sdd+ps*2];
809 d3 = pD[1+jj*sdd+ps*3];
810 tmp = pC[1+jj*sdc+ps*0];
811 pW[0+ldw*0] += tmp * d0;
812 pW[0+ldw*1] += tmp * d1;
813 pW[0+ldw*2] += tmp * d2;
814 pW[0+ldw*3] += tmp * d3;
815 tmp = pC[1+jj*sdc+ps*1];
816 pW[1+ldw*0] += tmp * d0;
817 pW[1+ldw*1] += tmp * d1;
818 pW[1+ldw*2] += tmp * d2;
819 pW[1+ldw*3] += tmp * d3;
820 //
821 d0 = pD[2+jj*sdd+ps*0];
822 d1 = pD[2+jj*sdd+ps*1];
823 d2 = pD[2+jj*sdd+ps*2];
824 d3 = pD[2+jj*sdd+ps*3];
825 tmp = pC[2+jj*sdc+ps*0];
826 pW[0+ldw*0] += tmp * d0;
827 pW[0+ldw*1] += tmp * d1;
828 pW[0+ldw*2] += tmp * d2;
829 pW[0+ldw*3] += tmp * d3;
830 tmp = pC[2+jj*sdc+ps*1];
831 pW[1+ldw*0] += tmp * d0;
832 pW[1+ldw*1] += tmp * d1;
833 pW[1+ldw*2] += tmp * d2;
834 pW[1+ldw*3] += tmp * d3;
835 //
836 d0 = pD[3+jj*sdd+ps*0];
837 d1 = pD[3+jj*sdd+ps*1];
838 d2 = pD[3+jj*sdd+ps*2];
839 d3 = pD[3+jj*sdd+ps*3];
840 tmp = pC[3+jj*sdc+ps*0];
841 pW[0+ldw*0] += tmp * d0;
842 pW[0+ldw*1] += tmp * d1;
843 pW[0+ldw*2] += tmp * d2;
844 pW[0+ldw*3] += tmp * d3;
845 tmp = pC[3+jj*sdc+ps*1];
846 pW[1+ldw*0] += tmp * d0;
847 pW[1+ldw*1] += tmp * d1;
848 pW[1+ldw*2] += tmp * d2;
849 pW[1+ldw*3] += tmp * d3;
850 }
851 for(ll=0; ll<m-jj; ll++)
852 {
853 d0 = pD[ll+jj*sdd+ps*0];
854 d1 = pD[ll+jj*sdd+ps*1];
855 d2 = pD[ll+jj*sdd+ps*2];
856 d3 = pD[ll+jj*sdd+ps*3];
857 tmp = pC[ll+jj*sdc+ps*0];
858 pW[0+ldw*0] += tmp * d0;
859 pW[0+ldw*1] += tmp * d1;
860 pW[0+ldw*2] += tmp * d2;
861 pW[0+ldw*3] += tmp * d3;
862 tmp = pC[ll+jj*sdc+ps*1];
863 pW[1+ldw*0] += tmp * d0;
864 pW[1+ldw*1] += tmp * d1;
865 pW[1+ldw*2] += tmp * d2;
866 pW[1+ldw*3] += tmp * d3;
867 }
868 // compute W^T *= T
869 pW[0+ldw*3] = pT[3+ldt*0]*pW[0+ldw*0] + pT[3+ldt*1]*pW[0+ldw*1] + pT[3+ldt*2]*pW[0+ldw*2] + pT[3+ldt*3]*pW[0+ldw*3];
870 pW[1+ldw*3] = pT[3+ldt*0]*pW[1+ldw*0] + pT[3+ldt*1]*pW[1+ldw*1] + pT[3+ldt*2]*pW[1+ldw*2] + pT[3+ldt*3]*pW[1+ldw*3];
871 pW[0+ldw*2] = pT[2+ldt*0]*pW[0+ldw*0] + pT[2+ldt*1]*pW[0+ldw*1] + pT[2+ldt*2]*pW[0+ldw*2];
872 pW[1+ldw*2] = pT[2+ldt*0]*pW[1+ldw*0] + pT[2+ldt*1]*pW[1+ldw*1] + pT[2+ldt*2]*pW[1+ldw*2];
873 pW[0+ldw*1] = pT[1+ldt*0]*pW[0+ldw*0] + pT[1+ldt*1]*pW[0+ldw*1];
874 pW[1+ldw*1] = pT[1+ldt*0]*pW[1+ldw*0] + pT[1+ldt*1]*pW[1+ldw*1];
875 pW[0+ldw*0] = pT[0+ldt*0]*pW[0+ldw*0];
876 pW[1+ldw*0] = pT[0+ldt*0]*pW[1+ldw*0];
877 // compute C -= V * W^T
878 pC[0+ps*0] -= pW[0+ldw*0];
879 pC[0+ps*1] -= pW[1+ldw*0];
880 if(m>1)
881 {
882 pC[1+ps*0] -= pD[1+ps*0]*pW[0+ldw*0] + pW[0+ldw*1];
883 pC[1+ps*1] -= pD[1+ps*0]*pW[1+ldw*0] + pW[1+ldw*1];
884 if(m>2)
885 {
886 pC[2+ps*0] -= pD[2+ps*0]*pW[0+ldw*0] + pD[2+ps*1]*pW[0+ldw*1] + pW[0+ldw*2];
887 pC[2+ps*1] -= pD[2+ps*0]*pW[1+ldw*0] + pD[2+ps*1]*pW[1+ldw*1] + pW[1+ldw*2];
888 if(m>3)
889 {
890 pC[3+ps*0] -= pD[3+ps*0]*pW[0+ldw*0] + pD[3+ps*1]*pW[0+ldw*1] + pD[3+ps*2]*pW[0+ldw*2] + pW[0+ldw*3];
891 pC[3+ps*1] -= pD[3+ps*0]*pW[1+ldw*0] + pD[3+ps*1]*pW[1+ldw*1] + pD[3+ps*2]*pW[1+ldw*2] + pW[1+ldw*3];
892 }
893 }
894 }
895 for(jj=4; jj<m-3; jj+=4)
896 {
897 //
898 d0 = pD[0+jj*sdd+ps*0];
899 d1 = pD[0+jj*sdd+ps*1];
900 d2 = pD[0+jj*sdd+ps*2];
901 d3 = pD[0+jj*sdd+ps*3];
902 pC[0+jj*sdc+ps*0] -= d0*pW[0+ldw*0] + d1*pW[0+ldw*1] + d2*pW[0+ldw*2] + d3*pW[0+ldw*3];
903 pC[0+jj*sdc+ps*1] -= d0*pW[1+ldw*0] + d1*pW[1+ldw*1] + d2*pW[1+ldw*2] + d3*pW[1+ldw*3];
904 //
905 d0 = pD[1+jj*sdd+ps*0];
906 d1 = pD[1+jj*sdd+ps*1];
907 d2 = pD[1+jj*sdd+ps*2];
908 d3 = pD[1+jj*sdd+ps*3];
909 pC[1+jj*sdc+ps*0] -= d0*pW[0+ldw*0] + d1*pW[0+ldw*1] + d2*pW[0+ldw*2] + d3*pW[0+ldw*3];
910 pC[1+jj*sdc+ps*1] -= d0*pW[1+ldw*0] + d1*pW[1+ldw*1] + d2*pW[1+ldw*2] + d3*pW[1+ldw*3];
911 //
912 d0 = pD[2+jj*sdd+ps*0];
913 d1 = pD[2+jj*sdd+ps*1];
914 d2 = pD[2+jj*sdd+ps*2];
915 d3 = pD[2+jj*sdd+ps*3];
916 pC[2+jj*sdc+ps*0] -= d0*pW[0+ldw*0] + d1*pW[0+ldw*1] + d2*pW[0+ldw*2] + d3*pW[0+ldw*3];
917 pC[2+jj*sdc+ps*1] -= d0*pW[1+ldw*0] + d1*pW[1+ldw*1] + d2*pW[1+ldw*2] + d3*pW[1+ldw*3];
918 //
919 d0 = pD[3+jj*sdd+ps*0];
920 d1 = pD[3+jj*sdd+ps*1];
921 d2 = pD[3+jj*sdd+ps*2];
922 d3 = pD[3+jj*sdd+ps*3];
923 pC[3+jj*sdc+ps*0] -= d0*pW[0+ldw*0] + d1*pW[0+ldw*1] + d2*pW[0+ldw*2] + d3*pW[0+ldw*3];
924 pC[3+jj*sdc+ps*1] -= d0*pW[1+ldw*0] + d1*pW[1+ldw*1] + d2*pW[1+ldw*2] + d3*pW[1+ldw*3];
925 }
926 for(ll=0; ll<m-jj; ll++)
927 {
928 d0 = pD[ll+jj*sdd+ps*0];
929 d1 = pD[ll+jj*sdd+ps*1];
930 d2 = pD[ll+jj*sdd+ps*2];
931 d3 = pD[ll+jj*sdd+ps*3];
932 pC[ll+jj*sdc+ps*0] -= d0*pW[0+ldw*0] + d1*pW[0+ldw*1] + d2*pW[0+ldw*2] + d3*pW[0+ldw*3];
933 pC[ll+jj*sdc+ps*1] -= d0*pW[1+ldw*0] + d1*pW[1+ldw*1] + d2*pW[1+ldw*2] + d3*pW[1+ldw*3];
934 }
935 }
936 for( ; ii<n; ii++)
937 {
938 pC = pC0+ii*ps;
939 // compute W^T = C^T * V
940 tmp = pC[0+ps*0];
941 pW[0+ldw*0] = tmp;
942 if(m>1)
943 {
944 tmp = pC[1+ps*0];
945 pW[0+ldw*0] += tmp * pD[1+ps*0];
946 pW[0+ldw*1] = tmp;
947 if(m>2)
948 {
949 tmp = pC[2+ps*0];
950 pW[0+ldw*0] += tmp * pD[2+ps*0];
951 pW[0+ldw*1] += tmp * pD[2+ps*1];
952 pW[0+ldw*2] = tmp;
953 if(m>3)
954 {
955 tmp = pC[3+ps*0];
956 pW[0+ldw*0] += tmp * pD[3+ps*0];
957 pW[0+ldw*1] += tmp * pD[3+ps*1];
958 pW[0+ldw*2] += tmp * pD[3+ps*2];
959 pW[0+ldw*3] = tmp;
960 }
961 }
962 }
963 for(jj=4; jj<m-3; jj+=4)
964 {
965 tmp = pC[0+jj*sdc+ps*0];
966 pW[0+ldw*0] += tmp * pD[0+jj*sdd+ps*0];
967 pW[0+ldw*1] += tmp * pD[0+jj*sdd+ps*1];
968 pW[0+ldw*2] += tmp * pD[0+jj*sdd+ps*2];
969 pW[0+ldw*3] += tmp * pD[0+jj*sdd+ps*3];
970 tmp = pC[1+jj*sdc+ps*0];
971 pW[0+ldw*0] += tmp * pD[1+jj*sdd+ps*0];
972 pW[0+ldw*1] += tmp * pD[1+jj*sdd+ps*1];
973 pW[0+ldw*2] += tmp * pD[1+jj*sdd+ps*2];
974 pW[0+ldw*3] += tmp * pD[1+jj*sdd+ps*3];
975 tmp = pC[2+jj*sdc+ps*0];
976 pW[0+ldw*0] += tmp * pD[2+jj*sdd+ps*0];
977 pW[0+ldw*1] += tmp * pD[2+jj*sdd+ps*1];
978 pW[0+ldw*2] += tmp * pD[2+jj*sdd+ps*2];
979 pW[0+ldw*3] += tmp * pD[2+jj*sdd+ps*3];
980 tmp = pC[3+jj*sdc+ps*0];
981 pW[0+ldw*0] += tmp * pD[3+jj*sdd+ps*0];
982 pW[0+ldw*1] += tmp * pD[3+jj*sdd+ps*1];
983 pW[0+ldw*2] += tmp * pD[3+jj*sdd+ps*2];
984 pW[0+ldw*3] += tmp * pD[3+jj*sdd+ps*3];
985 }
986 for(ll=0; ll<m-jj; ll++)
987 {
988 tmp = pC[ll+jj*sdc+ps*0];
989 pW[0+ldw*0] += tmp * pD[ll+jj*sdd+ps*0];
990 pW[0+ldw*1] += tmp * pD[ll+jj*sdd+ps*1];
991 pW[0+ldw*2] += tmp * pD[ll+jj*sdd+ps*2];
992 pW[0+ldw*3] += tmp * pD[ll+jj*sdd+ps*3];
993 }
994 // compute W^T *= T
995 pW[0+ldw*3] = pT[3+ldt*0]*pW[0+ldw*0] + pT[3+ldt*1]*pW[0+ldw*1] + pT[3+ldt*2]*pW[0+ldw*2] + pT[3+ldt*3]*pW[0+ldw*3];
996 pW[0+ldw*2] = pT[2+ldt*0]*pW[0+ldw*0] + pT[2+ldt*1]*pW[0+ldw*1] + pT[2+ldt*2]*pW[0+ldw*2];
997 pW[0+ldw*1] = pT[1+ldt*0]*pW[0+ldw*0] + pT[1+ldt*1]*pW[0+ldw*1];
998 pW[0+ldw*0] = pT[0+ldt*0]*pW[0+ldw*0];
999 // compute C -= V * W^T
1000 pC[0+ps*0] -= pW[0+ldw*0];
1001 if(m>1)
1002 {
1003 pC[1+ps*0] -= pD[1+ps*0]*pW[0+ldw*0] + pW[0+ldw*1];
1004 if(m>2)
1005 {
1006 pC[2+ps*0] -= pD[2+ps*0]*pW[0+ldw*0] + pD[2+ps*1]*pW[0+ldw*1] + pW[0+ldw*2];
1007 if(m>3)
1008 {
1009 pC[3+ps*0] -= pD[3+ps*0]*pW[0+ldw*0] + pD[3+ps*1]*pW[0+ldw*1] + pD[3+ps*2]*pW[0+ldw*2] + pW[0+ldw*3];
1010 }
1011 }
1012 }
1013 for(jj=4; jj<m-3; jj+=4)
1014 {
1015 pC[0+jj*sdc+ps*0] -= pD[0+jj*sdd+ps*0]*pW[0+ldw*0] + pD[0+jj*sdd+ps*1]*pW[0+ldw*1] + pD[0+jj*sdd+ps*2]*pW[0+ldw*2] + pD[0+jj*sdd+ps*3]*pW[0+ldw*3];
1016 pC[1+jj*sdc+ps*0] -= pD[1+jj*sdd+ps*0]*pW[0+ldw*0] + pD[1+jj*sdd+ps*1]*pW[0+ldw*1] + pD[1+jj*sdd+ps*2]*pW[0+ldw*2] + pD[1+jj*sdd+ps*3]*pW[0+ldw*3];
1017 pC[2+jj*sdc+ps*0] -= pD[2+jj*sdd+ps*0]*pW[0+ldw*0] + pD[2+jj*sdd+ps*1]*pW[0+ldw*1] + pD[2+jj*sdd+ps*2]*pW[0+ldw*2] + pD[2+jj*sdd+ps*3]*pW[0+ldw*3];
1018 pC[3+jj*sdc+ps*0] -= pD[3+jj*sdd+ps*0]*pW[0+ldw*0] + pD[3+jj*sdd+ps*1]*pW[0+ldw*1] + pD[3+jj*sdd+ps*2]*pW[0+ldw*2] + pD[3+jj*sdd+ps*3]*pW[0+ldw*3];
1019 }
1020 for(ll=0; ll<m-jj; ll++)
1021 {
1022 pC[ll+jj*sdc+ps*0] -= pD[ll+jj*sdd+ps*0]*pW[0+ldw*0] + pD[ll+jj*sdd+ps*1]*pW[0+ldw*1] + pD[ll+jj*sdd+ps*2]*pW[0+ldw*2] + pD[ll+jj*sdd+ps*3]*pW[0+ldw*3];
1023 }
1024 }
1025
1026 return;
1027 }
1028
1029
1030
1031void kernel_dlarf_t_4_lib4(int m, int n, double *pD, int sdd, double *pVt, double *dD, double *pC0, int sdc, double *pW0)
1032 {
1033 if(m<=0 | n<=0)
1034 return;
1035 int ii, jj, ll;
1036 const int ps = 4;
1037 double v10,
1038 v20, v21,
1039 v30, v31, v32;
1040 double c00, c01,
1041 c10, c11,
1042 c20, c21,
1043 c30, c31;
1044 double a0, a1, a2, a3, b0, b1;
1045 double tmp, d0, d1, d2, d3;
1046 double *pC, *pW;
1047 double pT[16];// = {};
1048 int ldt = 4;
1049 // dot product of v
1050 v10 = 0.0;
1051 v20 = 0.0;
1052 v30 = 0.0;
1053 v21 = 0.0;
1054 v31 = 0.0;
1055 v32 = 0.0;
1056 if(m>1)
1057 {
1058 v10 = 1.0 * pD[1+ps*0];
1059 if(m>2)
1060 {
1061 v10 += pD[2+ps*1] * pD[2+ps*0];
1062 v20 = 1.0 * pD[2+ps*0];
1063 v21 = 1.0 * pD[2+ps*1];
1064 if(m>3)
1065 {
1066 v10 += pD[3+ps*1] * pD[3+ps*0];
1067 v20 += pD[3+ps*2] * pD[3+ps*0];
1068 v21 += pD[3+ps*2] * pD[3+ps*1];
1069 v30 = 1.0 * pD[3+ps*0];
1070 v31 = 1.0 * pD[3+ps*1];
1071 v32 = 1.0 * pD[3+ps*2];
1072 }
1073 }
1074 }
1075 for(ii=4; ii<m-3; ii+=4)
1076 {
1077 v10 += pD[0+ii*sdd+ps*1] * pD[0+ii*sdd+ps*0];
1078 v20 += pD[0+ii*sdd+ps*2] * pD[0+ii*sdd+ps*0];
1079 v21 += pD[0+ii*sdd+ps*2] * pD[0+ii*sdd+ps*1];
1080 v30 += pD[0+ii*sdd+ps*3] * pD[0+ii*sdd+ps*0];
1081 v31 += pD[0+ii*sdd+ps*3] * pD[0+ii*sdd+ps*1];
1082 v32 += pD[0+ii*sdd+ps*3] * pD[0+ii*sdd+ps*2];
1083 v10 += pD[1+ii*sdd+ps*1] * pD[1+ii*sdd+ps*0];
1084 v20 += pD[1+ii*sdd+ps*2] * pD[1+ii*sdd+ps*0];
1085 v21 += pD[1+ii*sdd+ps*2] * pD[1+ii*sdd+ps*1];
1086 v30 += pD[1+ii*sdd+ps*3] * pD[1+ii*sdd+ps*0];
1087 v31 += pD[1+ii*sdd+ps*3] * pD[1+ii*sdd+ps*1];
1088 v32 += pD[1+ii*sdd+ps*3] * pD[1+ii*sdd+ps*2];
1089 v10 += pD[2+ii*sdd+ps*1] * pD[2+ii*sdd+ps*0];
1090 v20 += pD[2+ii*sdd+ps*2] * pD[2+ii*sdd+ps*0];
1091 v21 += pD[2+ii*sdd+ps*2] * pD[2+ii*sdd+ps*1];
1092 v30 += pD[2+ii*sdd+ps*3] * pD[2+ii*sdd+ps*0];
1093 v31 += pD[2+ii*sdd+ps*3] * pD[2+ii*sdd+ps*1];
1094 v32 += pD[2+ii*sdd+ps*3] * pD[2+ii*sdd+ps*2];
1095 v10 += pD[3+ii*sdd+ps*1] * pD[3+ii*sdd+ps*0];
1096 v20 += pD[3+ii*sdd+ps*2] * pD[3+ii*sdd+ps*0];
1097 v21 += pD[3+ii*sdd+ps*2] * pD[3+ii*sdd+ps*1];
1098 v30 += pD[3+ii*sdd+ps*3] * pD[3+ii*sdd+ps*0];
1099 v31 += pD[3+ii*sdd+ps*3] * pD[3+ii*sdd+ps*1];
1100 v32 += pD[3+ii*sdd+ps*3] * pD[3+ii*sdd+ps*2];
1101 }
1102 for(ll=0; ll<m-ii; ll++)
1103 {
1104 v10 += pD[ll+ii*sdd+ps*1] * pD[ll+ii*sdd+ps*0];
1105 v20 += pD[ll+ii*sdd+ps*2] * pD[ll+ii*sdd+ps*0];
1106 v21 += pD[ll+ii*sdd+ps*2] * pD[ll+ii*sdd+ps*1];
1107 v30 += pD[ll+ii*sdd+ps*3] * pD[ll+ii*sdd+ps*0];
1108 v31 += pD[ll+ii*sdd+ps*3] * pD[ll+ii*sdd+ps*1];
1109 v32 += pD[ll+ii*sdd+ps*3] * pD[ll+ii*sdd+ps*2];
1110 }
1111 // compute lower triangular T containing tau for matrix update
1112 pT[0+ldt*0] = dD[0];
1113 pT[1+ldt*1] = dD[1];
1114 pT[2+ldt*2] = dD[2];
1115 pT[3+ldt*3] = dD[3];
1116 pT[1+ldt*0] = - dD[1] * (v10*pT[0+ldt*0]);
1117 pT[2+ldt*1] = - dD[2] * (v21*pT[1+ldt*1]);
1118 pT[3+ldt*2] = - dD[3] * (v32*pT[2+ldt*2]);
1119 pT[2+ldt*0] = - dD[2] * (v20*pT[0+ldt*0] + v21*pT[1+ldt*0]);
1120 pT[3+ldt*1] = - dD[3] * (v31*pT[1+ldt*1] + v32*pT[2+ldt*1]);
1121 pT[3+ldt*0] = - dD[3] * (v30*pT[0+ldt*0] + v31*pT[1+ldt*0] + v32*pT[2+ldt*0]);
1122 // downgrade matrix
1123 __m256d
1124 _w0, _w1, _w2, _w3, _d0, _t0, _tp, _c0, _c1, _c2, _c3, _a0, _b0, _tz;
1125
1126 ii = 0;
1127#if 1
1128 double alpha = 1.0;
1129 double beta = 0.0;
1130#if defined(TARGET_X64_INTEL_HASWELL)
1131 for( ; ii<n-11; ii+=12)
1132 {
1133 kernel_dgemm_nn_4x12_lib4(m, &alpha, &pVt[0+ps*0], 0, &pC0[0+ps*ii], sdc, &beta, &pW0[0+ps*ii], &pW0[0+ps*ii]);
1134 }
1135#endif
1136 for( ; ii<n-7; ii+=8)
1137 {
1138 kernel_dgemm_nn_4x8_lib4(m, &alpha, &pVt[0+ps*0], 0, &pC0[0+ps*ii], sdc, &beta, &pW0[0+ps*ii], &pW0[0+ps*ii]);
1139 }
1140 for( ; ii<n-3; ii+=4)
1141 {
1142 kernel_dgemm_nn_4x4_lib4(m, &alpha, &pVt[0+ps*0], 0, &pC0[0+ps*ii], sdc, &beta, &pW0[0+ps*ii], &pW0[0+ps*ii]);
1143 }
1144 if(ii<n)
1145 {
1146// kernel_dgemm_nn_4x4_vs_lib4(m, &alpha, &pVt[0+ps*0], 0, &pC0[0+ps*ii], sdc, &beta, &pW0[0+ps*ii], &pW0[0+ps*ii], 4, n-ii);
1147 kernel_dgemm_nn_4x4_gen_lib4(m, &alpha, &pVt[0+ps*0], 0, &pC0[0+ps*ii], sdc, &beta, 0, &pW0[0+ps*ii], 0, 0, &pW0[0+ps*ii], 0, 0, 4, 0, n-ii);
1148 }
1149#else
1150 for( ; ii<n-3; ii+=4)
1151 {
1152 pW = pW0+ii*ps;
1153 pC = pC0+ii*ps;
1154 // compute W^T = C^T * V
1155 _w0 = _mm256_setzero_pd();
1156 _w1 = _mm256_setzero_pd();
1157 _w2 = _mm256_setzero_pd();
1158 _w3 = _mm256_setzero_pd();
1159 for(jj=0; jj<m-3; jj+=4)
1160 {
1161 //
1162 _d0 = _mm256_load_pd( &pVt[0+ps*(0+jj)] );
1163 _t0 = _mm256_broadcast_sd( &pC[0+jj*sdc+ps*0] );
1164 _tp = _mm256_mul_pd( _d0, _t0 );
1165 _w0 = _mm256_add_pd( _w0, _tp );
1166 _t0 = _mm256_broadcast_sd( &pC[0+jj*sdc+ps*1] );
1167 _tp = _mm256_mul_pd( _d0, _t0 );
1168 _w1 = _mm256_add_pd( _w1, _tp );
1169 _t0 = _mm256_broadcast_sd( &pC[0+jj*sdc+ps*2] );
1170 _tp = _mm256_mul_pd( _d0, _t0 );
1171 _w2 = _mm256_add_pd( _w2, _tp );
1172 _t0 = _mm256_broadcast_sd( &pC[0+jj*sdc+ps*3] );
1173 _tp = _mm256_mul_pd( _d0, _t0 );
1174 _w3 = _mm256_add_pd( _w3, _tp );
1175 //
1176 _d0 = _mm256_load_pd( &pVt[0+ps*(1+jj)] );
1177 _t0 = _mm256_broadcast_sd( &pC[1+jj*sdc+ps*0] );
1178 _tp = _mm256_mul_pd( _d0, _t0 );
1179 _w0 = _mm256_add_pd( _w0, _tp );
1180 _t0 = _mm256_broadcast_sd( &pC[1+jj*sdc+ps*1] );
1181 _tp = _mm256_mul_pd( _d0, _t0 );
1182 _w1 = _mm256_add_pd( _w1, _tp );
1183 _t0 = _mm256_broadcast_sd( &pC[1+jj*sdc+ps*2] );
1184 _tp = _mm256_mul_pd( _d0, _t0 );
1185 _w2 = _mm256_add_pd( _w2, _tp );
1186 _t0 = _mm256_broadcast_sd( &pC[1+jj*sdc+ps*3] );
1187 _tp = _mm256_mul_pd( _d0, _t0 );
1188 _w3 = _mm256_add_pd( _w3, _tp );
1189 //
1190 _d0 = _mm256_load_pd( &pVt[0+ps*(2+jj)] );
1191 _t0 = _mm256_broadcast_sd( &pC[2+jj*sdc+ps*0] );
1192 _tp = _mm256_mul_pd( _d0, _t0 );
1193 _w0 = _mm256_add_pd( _w0, _tp );
1194 _t0 = _mm256_broadcast_sd( &pC[2+jj*sdc+ps*1] );
1195 _tp = _mm256_mul_pd( _d0, _t0 );
1196 _w1 = _mm256_add_pd( _w1, _tp );
1197 _t0 = _mm256_broadcast_sd( &pC[2+jj*sdc+ps*2] );
1198 _tp = _mm256_mul_pd( _d0, _t0 );
1199 _w2 = _mm256_add_pd( _w2, _tp );
1200 _t0 = _mm256_broadcast_sd( &pC[2+jj*sdc+ps*3] );
1201 _tp = _mm256_mul_pd( _d0, _t0 );
1202 _w3 = _mm256_add_pd( _w3, _tp );
1203 //
1204 _d0 = _mm256_load_pd( &pVt[0+ps*(3+jj)] );
1205 _t0 = _mm256_broadcast_sd( &pC[3+jj*sdc+ps*0] );
1206 _tp = _mm256_mul_pd( _d0, _t0 );
1207 _w0 = _mm256_add_pd( _w0, _tp );
1208 _t0 = _mm256_broadcast_sd( &pC[3+jj*sdc+ps*1] );
1209 _tp = _mm256_mul_pd( _d0, _t0 );
1210 _w1 = _mm256_add_pd( _w1, _tp );
1211 _t0 = _mm256_broadcast_sd( &pC[3+jj*sdc+ps*2] );
1212 _tp = _mm256_mul_pd( _d0, _t0 );
1213 _w2 = _mm256_add_pd( _w2, _tp );
1214 _t0 = _mm256_broadcast_sd( &pC[3+jj*sdc+ps*3] );
1215 _tp = _mm256_mul_pd( _d0, _t0 );
1216 _w3 = _mm256_add_pd( _w3, _tp );
1217 }
1218 for(ll=0; ll<m-jj; ll++)
1219 {
1220 _d0 = _mm256_load_pd( &pVt[0+ps*(ll+jj)] );
1221 _t0 = _mm256_broadcast_sd( &pC[ll+jj*sdc+ps*0] );
1222 _tp = _mm256_mul_pd( _d0, _t0 );
1223 _w0 = _mm256_add_pd( _w0, _tp );
1224 _t0 = _mm256_broadcast_sd( &pC[ll+jj*sdc+ps*1] );
1225 _tp = _mm256_mul_pd( _d0, _t0 );
1226 _w1 = _mm256_add_pd( _w1, _tp );
1227 _t0 = _mm256_broadcast_sd( &pC[ll+jj*sdc+ps*2] );
1228 _tp = _mm256_mul_pd( _d0, _t0 );
1229 _w2 = _mm256_add_pd( _w2, _tp );
1230 _t0 = _mm256_broadcast_sd( &pC[ll+jj*sdc+ps*3] );
1231 _tp = _mm256_mul_pd( _d0, _t0 );
1232 _w3 = _mm256_add_pd( _w3, _tp );
1233 }
1234 // TODO mask store
1235 _mm256_storeu_pd( &pW[0+ps*0], _w0 );
1236 _mm256_storeu_pd( &pW[0+ps*1], _w1 );
1237 _mm256_storeu_pd( &pW[0+ps*2], _w2 );
1238 _mm256_storeu_pd( &pW[0+ps*3], _w3 );
1239 }
1240 for( ; ii<n; ii++)
1241 {
1242 pW = pW0+ii*ps;
1243 pC = pC0+ii*ps;
1244 // compute W^T = C^T * V
1245 tmp = pC[0+ps*0];
1246 pW[0+ps*0] = tmp;
1247 if(m>1)
1248 {
1249 d0 = pVt[0+ps*1];
1250 tmp = pC[1+ps*0];
1251 pW[0+ps*0] += d0 * tmp;
1252 pW[1+ps*0] = tmp;
1253 if(m>2)
1254 {
1255 d0 = pVt[0+ps*2];
1256 d1 = pVt[1+ps*2];
1257 tmp = pC[2+ps*0];
1258 pW[0+ps*0] += d0 * tmp;
1259 pW[1+ps*0] += d1 * tmp;
1260 pW[2+ps*0] = tmp;
1261 if(m>3)
1262 {
1263 d0 = pVt[0+ps*3];
1264 d1 = pVt[1+ps*3];
1265 d2 = pVt[2+ps*3];
1266 tmp = pC[3+ps*0];
1267 pW[0+ps*0] += d0 * tmp;
1268 pW[1+ps*0] += d1 * tmp;
1269 pW[2+ps*0] += d2 * tmp;
1270 pW[3+ps*0] = tmp;
1271 }
1272 }
1273 }
1274 for(jj=4; jj<m-3; jj+=4)
1275 {
1276 //
1277 d0 = pVt[0+ps*(0+jj)];
1278 d1 = pVt[1+ps*(0+jj)];
1279 d2 = pVt[2+ps*(0+jj)];
1280 d3 = pVt[3+ps*(0+jj)];
1281 tmp = pC[0+jj*sdc+ps*0];
1282 pW[0+ps*0] += d0 * tmp;
1283 pW[1+ps*0] += d1 * tmp;
1284 pW[2+ps*0] += d2 * tmp;
1285 pW[3+ps*0] += d3 * tmp;
1286 //
1287 d0 = pVt[0+ps*(1+jj)];
1288 d1 = pVt[1+ps*(1+jj)];
1289 d2 = pVt[2+ps*(1+jj)];
1290 d3 = pVt[3+ps*(1+jj)];
1291 tmp = pC[1+jj*sdc+ps*0];
1292 pW[0+ps*0] += d0 * tmp;
1293 pW[1+ps*0] += d1 * tmp;
1294 pW[2+ps*0] += d2 * tmp;
1295 pW[3+ps*0] += d3 * tmp;
1296 //
1297 d0 = pVt[0+ps*(2+jj)];
1298 d1 = pVt[1+ps*(2+jj)];
1299 d2 = pVt[2+ps*(2+jj)];
1300 d3 = pVt[3+ps*(2+jj)];
1301 tmp = pC[2+jj*sdc+ps*0];
1302 pW[0+ps*0] += d0 * tmp;
1303 pW[1+ps*0] += d1 * tmp;
1304 pW[2+ps*0] += d2 * tmp;
1305 pW[3+ps*0] += d3 * tmp;
1306 //
1307 d0 = pVt[0+ps*(3+jj)];
1308 d1 = pVt[1+ps*(3+jj)];
1309 d2 = pVt[2+ps*(3+jj)];
1310 d3 = pVt[3+ps*(3+jj)];
1311 tmp = pC[3+jj*sdc+ps*0];
1312 pW[0+ps*0] += d0 * tmp;
1313 pW[1+ps*0] += d1 * tmp;
1314 pW[2+ps*0] += d2 * tmp;
1315 pW[3+ps*0] += d3 * tmp;
1316 }
1317 for(ll=0; ll<m-jj; ll++)
1318 {
1319 d0 = pVt[0+ps*(ll+jj)];
1320 d1 = pVt[1+ps*(ll+jj)];
1321 d2 = pVt[2+ps*(ll+jj)];
1322 d3 = pVt[3+ps*(ll+jj)];
1323 tmp = pC[ll+jj*sdc+ps*0];
1324 pW[0+ps*0] += d0 * tmp;
1325 pW[1+ps*0] += d1 * tmp;
1326 pW[2+ps*0] += d2 * tmp;
1327 pW[3+ps*0] += d3 * tmp;
1328 }
1329 }
1330#endif
1331
1332 ii = 0;
1333 for( ; ii<n-3; ii+=4)
1334 {
1335 pW = pW0+ii*ps;
1336 pC = pC0+ii*ps;
1337
1338 // compute W^T *= T
1339 _tz = _mm256_setzero_pd();
1340
1341 _t0 = _mm256_load_pd( &pT[0+ldt*0] );
1342 _tp = _mm256_broadcast_sd( &pW[0+ps*0] );
1343 _w0 = _mm256_mul_pd( _t0, _tp );
1344 _tp = _mm256_broadcast_sd( &pW[0+ps*1] );
1345 _w1 = _mm256_mul_pd( _t0, _tp );
1346 _tp = _mm256_broadcast_sd( &pW[0+ps*2] );
1347 _w2 = _mm256_mul_pd( _t0, _tp );
1348 _tp = _mm256_broadcast_sd( &pW[0+ps*3] );
1349 _w3 = _mm256_mul_pd( _t0, _tp );
1350
1351#if defined(TARGET_X64_INTEL_GASWELL)
1352 _t0 = _mm256_load_pd( &pT[0+ldt*1] );
1353 _t0 = _mm256_blend_pd( _t0, _tz, 0x1 );
1354 _tp = _mm256_broadcast_sd( &pW[1+ps*0] );
1355 _w0 = _mm256_fmadd_pd( _t0, _tp, _w0 );
1356 _tp = _mm256_broadcast_sd( &pW[1+ps*1] );
1357 _w1 = _mm256_fmadd_pd( _t0, _tp, _w1 );
1358 _tp = _mm256_broadcast_sd( &pW[1+ps*2] );
1359 _w2 = _mm256_fmadd_pd( _t0, _tp, _w2 );
1360 _tp = _mm256_broadcast_sd( &pW[1+ps*3] );
1361 _w3 = _mm256_fmadd_pd( _t0, _tp, _w3 );
1362
1363 _t0 = _mm256_load_pd( &pT[0+ldt*2] );
1364 _t0 = _mm256_blend_pd( _t0, _tz, 0x3 );
1365 _tp = _mm256_broadcast_sd( &pW[2+ps*0] );
1366 _w0 = _mm256_fmadd_pd( _t0, _tp, _w0 );
1367 _tp = _mm256_broadcast_sd( &pW[2+ps*1] );
1368 _w1 = _mm256_fmadd_pd( _t0, _tp, _w1 );
1369 _tp = _mm256_broadcast_sd( &pW[2+ps*2] );
1370 _w2 = _mm256_fmadd_pd( _t0, _tp, _w2 );
1371 _tp = _mm256_broadcast_sd( &pW[2+ps*3] );
1372 _w3 = _mm256_fmadd_pd( _t0, _tp, _w3 );
1373
1374 _t0 = _mm256_load_pd( &pT[0+ldt*3] );
1375 _t0 = _mm256_blend_pd( _t0, _tz, 0x7 );
1376 _tp = _mm256_broadcast_sd( &pW[3+ps*0] );
1377 _w0 = _mm256_fmadd_pd( _t0, _tp, _w0 );
1378 _tp = _mm256_broadcast_sd( &pW[3+ps*1] );
1379 _w1 = _mm256_fmadd_pd( _t0, _tp, _w1 );
1380 _tp = _mm256_broadcast_sd( &pW[3+ps*2] );
1381 _w2 = _mm256_fmadd_pd( _t0, _tp, _w2 );
1382 _tp = _mm256_broadcast_sd( &pW[3+ps*3] );
1383 _w3 = _mm256_fmadd_pd( _t0, _tp, _w3 );
1384#else
1385 _t0 = _mm256_load_pd( &pT[0+ldt*1] );
1386 _t0 = _mm256_blend_pd( _t0, _tz, 0x1 );
1387 _tp = _mm256_broadcast_sd( &pW[1+ps*0] );
1388 _tp = _mm256_mul_pd( _t0, _tp );
1389 _w0 = _mm256_add_pd( _w0, _tp );
1390 _tp = _mm256_broadcast_sd( &pW[1+ps*1] );
1391 _tp = _mm256_mul_pd( _t0, _tp );
1392 _w1 = _mm256_add_pd( _w1, _tp );
1393 _tp = _mm256_broadcast_sd( &pW[1+ps*2] );
1394 _tp = _mm256_mul_pd( _t0, _tp );
1395 _w2 = _mm256_add_pd( _w2, _tp );
1396 _tp = _mm256_broadcast_sd( &pW[1+ps*3] );
1397 _tp = _mm256_mul_pd( _t0, _tp );
1398 _w3 = _mm256_add_pd( _w3, _tp );
1399
1400 _t0 = _mm256_load_pd( &pT[0+ldt*2] );
1401 _t0 = _mm256_blend_pd( _t0, _tz, 0x3 );
1402 _tp = _mm256_broadcast_sd( &pW[2+ps*0] );
1403 _tp = _mm256_mul_pd( _t0, _tp );
1404 _w0 = _mm256_add_pd( _w0, _tp );
1405 _tp = _mm256_broadcast_sd( &pW[2+ps*1] );
1406 _tp = _mm256_mul_pd( _t0, _tp );
1407 _w1 = _mm256_add_pd( _w1, _tp );
1408 _tp = _mm256_broadcast_sd( &pW[2+ps*2] );
1409 _tp = _mm256_mul_pd( _t0, _tp );
1410 _w2 = _mm256_add_pd( _w2, _tp );
1411 _tp = _mm256_broadcast_sd( &pW[2+ps*3] );
1412 _tp = _mm256_mul_pd( _t0, _tp );
1413 _w3 = _mm256_add_pd( _w3, _tp );
1414
1415 _t0 = _mm256_load_pd( &pT[0+ldt*3] );
1416 _t0 = _mm256_blend_pd( _t0, _tz, 0x7 );
1417 _tp = _mm256_broadcast_sd( &pW[3+ps*0] );
1418 _tp = _mm256_mul_pd( _t0, _tp );
1419 _w0 = _mm256_add_pd( _w0, _tp );
1420 _tp = _mm256_broadcast_sd( &pW[3+ps*1] );
1421 _tp = _mm256_mul_pd( _t0, _tp );
1422 _w1 = _mm256_add_pd( _w1, _tp );
1423 _tp = _mm256_broadcast_sd( &pW[3+ps*2] );
1424 _tp = _mm256_mul_pd( _t0, _tp );
1425 _w2 = _mm256_add_pd( _w2, _tp );
1426 _tp = _mm256_broadcast_sd( &pW[3+ps*3] );
1427 _tp = _mm256_mul_pd( _t0, _tp );
1428 _w3 = _mm256_add_pd( _w3, _tp );
1429#endif
1430
1431 _mm256_store_pd( &pW[0+ps*0], _w0 );
1432 _mm256_store_pd( &pW[0+ps*1], _w1 );
1433 _mm256_store_pd( &pW[0+ps*2], _w2 );
1434 _mm256_store_pd( &pW[0+ps*3], _w3 );
1435 }
1436 for( ; ii<n; ii++)
1437 {
1438 pW = pW0+ii*ps;
1439 pC = pC0+ii*ps;
1440
1441 // compute W^T *= T
1442 _tz = _mm256_setzero_pd();
1443
1444 _t0 = _mm256_load_pd( &pT[0+ldt*0] );
1445 _tp = _mm256_broadcast_sd( &pW[0+ps*0] );
1446 _w0 = _mm256_mul_pd( _t0, _tp );
1447
1448 _t0 = _mm256_load_pd( &pT[0+ldt*1] );
1449 _t0 = _mm256_blend_pd( _t0, _tz, 0x1 );
1450 _tp = _mm256_broadcast_sd( &pW[1+ps*0] );
1451 _tp = _mm256_mul_pd( _t0, _tp );
1452 _w0 = _mm256_add_pd( _w0, _tp );
1453
1454 _t0 = _mm256_load_pd( &pT[0+ldt*2] );
1455 _t0 = _mm256_blend_pd( _t0, _tz, 0x3 );
1456 _tp = _mm256_broadcast_sd( &pW[2+ps*0] );
1457 _tp = _mm256_mul_pd( _t0, _tp );
1458 _w0 = _mm256_add_pd( _w0, _tp );
1459
1460 _t0 = _mm256_load_pd( &pT[0+ldt*3] );
1461 _t0 = _mm256_blend_pd( _t0, _tz, 0x7 );
1462 _tp = _mm256_broadcast_sd( &pW[3+ps*0] );
1463 _tp = _mm256_mul_pd( _t0, _tp );
1464 _w0 = _mm256_add_pd( _w0, _tp );
1465
1466 _mm256_store_pd( &pW[0+ps*0], _w0 );
1467 }
1468
1469 ii = 0;
1470 for( ; ii<n-3; ii+=4)
1471 {
1472 pW = pW0+ii*ps;
1473 pC = pC0+ii*ps;
1474 // compute C -= V * W^T
1475 jj = 0;
1476 // load
1477 c00 = pC[0+jj*sdc+ps*0];
1478 c10 = pC[1+jj*sdc+ps*0];
1479 c20 = pC[2+jj*sdc+ps*0];
1480 c30 = pC[3+jj*sdc+ps*0];
1481 c01 = pC[0+jj*sdc+ps*1];
1482 c11 = pC[1+jj*sdc+ps*1];
1483 c21 = pC[2+jj*sdc+ps*1];
1484 c31 = pC[3+jj*sdc+ps*1];
1485 // rank1
1486 a1 = pD[1+jj*sdd+ps*0];
1487 a2 = pD[2+jj*sdd+ps*0];
1488 a3 = pD[3+jj*sdd+ps*0];
1489 b0 = pW[0+ps*0];
1490 c00 -= b0;
1491 c10 -= a1*b0;
1492 c20 -= a2*b0;
1493 c30 -= a3*b0;
1494 b1 = pW[0+ps*1];
1495 c01 -= b1;
1496 c11 -= a1*b1;
1497 c21 -= a2*b1;
1498 c31 -= a3*b1;
1499 // rank2
1500 a2 = pD[2+jj*sdd+ps*1];
1501 a3 = pD[3+jj*sdd+ps*1];
1502 b0 = pW[1+ps*0];
1503 c10 -= b0;
1504 c20 -= a2*b0;
1505 c30 -= a3*b0;
1506 b1 = pW[1+ps*1];
1507 c11 -= b1;
1508 c21 -= a2*b1;
1509 c31 -= a3*b1;
1510 // rank3
1511 a3 = pD[3+jj*sdd+ps*2];
1512 b0 = pW[2+ps*0];
1513 c20 -= b0;
1514 c30 -= a3*b0;
1515 b1 = pW[2+ps*1];
1516 c21 -= b1;
1517 c31 -= a3*b1;
1518 // rank4
1519 a3 = pD[3+jj*sdd+ps*3];
1520 b0 = pW[3+ps*0];
1521 c30 -= b0;
1522 b1 = pW[3+ps*1];
1523 c31 -= b1;
1524 // store
1525 pC[0+jj*sdc+ps*0] = c00;
1526 pC[0+jj*sdc+ps*1] = c01;
1527 if(m>1)
1528 {
1529 pC[1+jj*sdc+ps*0] = c10;
1530 pC[1+jj*sdc+ps*1] = c11;
1531 if(m>2)
1532 {
1533 pC[2+jj*sdc+ps*0] = c20;
1534 pC[2+jj*sdc+ps*1] = c21;
1535 if(m>3)
1536 {
1537 pC[3+jj*sdc+ps*0] = c30;
1538 pC[3+jj*sdc+ps*1] = c31;
1539 }
1540 }
1541 }
1542 // load
1543 c00 = pC[0+jj*sdc+ps*2];
1544 c10 = pC[1+jj*sdc+ps*2];
1545 c20 = pC[2+jj*sdc+ps*2];
1546 c30 = pC[3+jj*sdc+ps*2];
1547 c01 = pC[0+jj*sdc+ps*3];
1548 c11 = pC[1+jj*sdc+ps*3];
1549 c21 = pC[2+jj*sdc+ps*3];
1550 c31 = pC[3+jj*sdc+ps*3];
1551 // rank1
1552 a1 = pD[1+jj*sdd+ps*0];
1553 a2 = pD[2+jj*sdd+ps*0];
1554 a3 = pD[3+jj*sdd+ps*0];
1555 b0 = pW[0+ps*2];
1556 c00 -= b0;
1557 c10 -= a1*b0;
1558 c20 -= a2*b0;
1559 c30 -= a3*b0;
1560 b1 = pW[0+ps*3];
1561 c01 -= b1;
1562 c11 -= a1*b1;
1563 c21 -= a2*b1;
1564 c31 -= a3*b1;
1565 // rank2
1566 a2 = pD[2+jj*sdd+ps*1];
1567 a3 = pD[3+jj*sdd+ps*1];
1568 b0 = pW[1+ps*2];
1569 c10 -= b0;
1570 c20 -= a2*b0;
1571 c30 -= a3*b0;
1572 b1 = pW[1+ps*3];
1573 c11 -= b1;
1574 c21 -= a2*b1;
1575 c31 -= a3*b1;
1576 // rank3
1577 a3 = pD[3+jj*sdd+ps*2];
1578 b0 = pW[2+ps*2];
1579 c20 -= b0;
1580 c30 -= a3*b0;
1581 b1 = pW[2+ps*3];
1582 c21 -= b1;
1583 c31 -= a3*b1;
1584 // rank4
1585 a3 = pD[3+jj*sdd+ps*3];
1586 b0 = pW[3+ps*2];
1587 c30 -= b0;
1588 b1 = pW[3+ps*3];
1589 c31 -= b1;
1590 // store
1591 pC[0+jj*sdc+ps*2] = c00;
1592 pC[0+jj*sdc+ps*3] = c01;
1593 if(m>1)
1594 {
1595 pC[1+jj*sdc+ps*2] = c10;
1596 pC[1+jj*sdc+ps*3] = c11;
1597 if(m>2)
1598 {
1599 pC[2+jj*sdc+ps*2] = c20;
1600 pC[2+jj*sdc+ps*3] = c21;
1601 if(m>3)
1602 {
1603 pC[3+jj*sdc+ps*2] = c30;
1604 pC[3+jj*sdc+ps*3] = c31;
1605 }
1606 }
1607 }
1608 }
1609 for( ; ii<n; ii++)
1610 {
1611 pW = pW0+ii*ps;
1612 pC = pC0+ii*ps;
1613 // compute C -= V * W^T
1614 jj = 0;
1615 // load
1616 c00 = pC[0+jj*sdc+ps*0];
1617 c10 = pC[1+jj*sdc+ps*0];
1618 c20 = pC[2+jj*sdc+ps*0];
1619 c30 = pC[3+jj*sdc+ps*0];
1620 // rank1
1621 a1 = pD[1+jj*sdd+ps*0];
1622 a2 = pD[2+jj*sdd+ps*0];
1623 a3 = pD[3+jj*sdd+ps*0];
1624 b0 = pW[0+ps*0];
1625 c00 -= b0;
1626 c10 -= a1*b0;
1627 c20 -= a2*b0;
1628 c30 -= a3*b0;
1629 // rank2
1630 a2 = pD[2+jj*sdd+ps*1];
1631 a3 = pD[3+jj*sdd+ps*1];
1632 b0 = pW[1+ps*0];
1633 c10 -= b0;
1634 c20 -= a2*b0;
1635 c30 -= a3*b0;
1636 // rank3
1637 a3 = pD[3+jj*sdd+ps*2];
1638 b0 = pW[2+ps*0];
1639 c20 -= b0;
1640 c30 -= a3*b0;
1641 // rank4
1642 a3 = pD[3+jj*sdd+ps*3];
1643 b0 = pW[3+ps*0];
1644 c30 -= b0;
1645 // store
1646 pC[0+jj*sdc+ps*0] = c00;
1647 if(m>1)
1648 {
1649 pC[1+jj*sdc+ps*0] = c10;
1650 if(m>2)
1651 {
1652 pC[2+jj*sdc+ps*0] = c20;
1653 if(m>3)
1654 {
1655 pC[3+jj*sdc+ps*0] = c30;
1656 }
1657 }
1658 }
1659 }
1660
1661#if 1
1662 jj = 4;
1663#if defined(TARGET_X64_INTEL_HASWELL)
1664 for(; jj<m-11; jj+=12)
1665 {
1666 kernel_dger4_sub_12r_lib4(n, &pD[jj*sdd], sdd, &pW0[0], &pC0[jj*sdc], sdc);
1667 }
1668#endif
1669 for(; jj<m-7; jj+=8)
1670 {
1671 kernel_dger4_sub_8r_lib4(n, &pD[jj*sdd], sdd, &pW0[0], &pC0[jj*sdc], sdc);
1672 }
1673 for(; jj<m-3; jj+=4)
1674 {
1675 kernel_dger4_sub_4r_lib4(n, &pD[jj*sdd], &pW0[0], &pC0[jj*sdc]);
1676 }
1677 if(jj<m)
1678 {
1679 kernel_dger4_sub_4r_vs_lib4(n, &pD[jj*sdd], &pW0[0], &pC0[jj*sdc], m-jj);
1680 }
1681#else
1682 ii = 0;
1683 for( ; ii<n-3; ii+=4)
1684 {
1685 pW = pW0+ii*ps;
1686 pC = pC0+ii*ps;
1687 for(jj=4; jj<m-3; jj+=4)
1688 {
1689 // load
1690 _c0 = _mm256_load_pd( &pC[0+jj*sdc+ps*0] );
1691 _c1 = _mm256_load_pd( &pC[0+jj*sdc+ps*1] );
1692 _c2 = _mm256_load_pd( &pC[0+jj*sdc+ps*2] );
1693 _c3 = _mm256_load_pd( &pC[0+jj*sdc+ps*3] );
1694 //
1695 _a0 = _mm256_load_pd( &pD[0+jj*sdd+ps*0] );
1696 _b0 = _mm256_broadcast_sd( &pW[0+ps*0] );
1697 _tp = _mm256_mul_pd( _a0, _b0 );
1698 _c0 = _mm256_sub_pd( _c0, _tp );
1699 _b0 = _mm256_broadcast_sd( &pW[0+ps*1] );
1700 _tp = _mm256_mul_pd( _a0, _b0 );
1701 _c1 = _mm256_sub_pd( _c1, _tp );
1702 _b0 = _mm256_broadcast_sd( &pW[0+ps*2] );
1703 _tp = _mm256_mul_pd( _a0, _b0 );
1704 _c2 = _mm256_sub_pd( _c2, _tp );
1705 _b0 = _mm256_broadcast_sd( &pW[0+ps*3] );
1706 _tp = _mm256_mul_pd( _a0, _b0 );
1707 _c3 = _mm256_sub_pd( _c3, _tp );
1708 //
1709 _a0 = _mm256_load_pd( &pD[0+jj*sdd+ps*1] );
1710 _b0 = _mm256_broadcast_sd( &pW[1+ps*0] );
1711 _tp = _mm256_mul_pd( _a0, _b0 );
1712 _c0 = _mm256_sub_pd( _c0, _tp );
1713 _b0 = _mm256_broadcast_sd( &pW[1+ps*1] );
1714 _tp = _mm256_mul_pd( _a0, _b0 );
1715 _c1 = _mm256_sub_pd( _c1, _tp );
1716 _b0 = _mm256_broadcast_sd( &pW[1+ps*2] );
1717 _tp = _mm256_mul_pd( _a0, _b0 );
1718 _c2 = _mm256_sub_pd( _c2, _tp );
1719 _b0 = _mm256_broadcast_sd( &pW[1+ps*3] );
1720 _tp = _mm256_mul_pd( _a0, _b0 );
1721 _c3 = _mm256_sub_pd( _c3, _tp );
1722 //
1723 _a0 = _mm256_load_pd( &pD[0+jj*sdd+ps*2] );
1724 _b0 = _mm256_broadcast_sd( &pW[2+ps*0] );
1725 _tp = _mm256_mul_pd( _a0, _b0 );
1726 _c0 = _mm256_sub_pd( _c0, _tp );
1727 _b0 = _mm256_broadcast_sd( &pW[2+ps*1] );
1728 _tp = _mm256_mul_pd( _a0, _b0 );
1729 _c1 = _mm256_sub_pd( _c1, _tp );
1730 _b0 = _mm256_broadcast_sd( &pW[2+ps*2] );
1731 _tp = _mm256_mul_pd( _a0, _b0 );
1732 _c2 = _mm256_sub_pd( _c2, _tp );
1733 _b0 = _mm256_broadcast_sd( &pW[2+ps*3] );
1734 _tp = _mm256_mul_pd( _a0, _b0 );
1735 _c3 = _mm256_sub_pd( _c3, _tp );
1736 //
1737 _a0 = _mm256_load_pd( &pD[0+jj*sdd+ps*3] );
1738 _b0 = _mm256_broadcast_sd( &pW[3+ps*0] );
1739 _tp = _mm256_mul_pd( _a0, _b0 );
1740 _c0 = _mm256_sub_pd( _c0, _tp );
1741 _b0 = _mm256_broadcast_sd( &pW[3+ps*1] );
1742 _tp = _mm256_mul_pd( _a0, _b0 );
1743 _c1 = _mm256_sub_pd( _c1, _tp );
1744 _b0 = _mm256_broadcast_sd( &pW[3+ps*2] );
1745 _tp = _mm256_mul_pd( _a0, _b0 );
1746 _c2 = _mm256_sub_pd( _c2, _tp );
1747 _b0 = _mm256_broadcast_sd( &pW[3+ps*3] );
1748 _tp = _mm256_mul_pd( _a0, _b0 );
1749 _c3 = _mm256_sub_pd( _c3, _tp );
1750 // store
1751 _mm256_store_pd( &pC[0+jj*sdc+ps*0], _c0 );
1752 _mm256_store_pd( &pC[0+jj*sdc+ps*1], _c1 );
1753 _mm256_store_pd( &pC[0+jj*sdc+ps*2], _c2 );
1754 _mm256_store_pd( &pC[0+jj*sdc+ps*3], _c3 );
1755 }
1756 for(ll=0; ll<m-jj; ll++)
1757 {
1758 // load
1759 c00 = pC[ll+jj*sdc+ps*0];
1760 c01 = pC[ll+jj*sdc+ps*1];
1761 //
1762 a0 = pD[ll+jj*sdd+ps*0];
1763 b0 = pW[0+ps*0];
1764 c00 -= a0*b0;
1765 b1 = pW[0+ps*1];
1766 c01 -= a0*b1;
1767 //
1768 a0 = pD[ll+jj*sdd+ps*1];
1769 b0 = pW[1+ps*0];
1770 c00 -= a0*b0;
1771 b1 = pW[1+ps*1];
1772 c01 -= a0*b1;
1773 //
1774 a0 = pD[ll+jj*sdd+ps*2];
1775 b0 = pW[2+ps*0];
1776 c00 -= a0*b0;
1777 b1 = pW[2+ps*1];
1778 c01 -= a0*b1;
1779 //
1780 a0 = pD[ll+jj*sdd+ps*3];
1781 b0 = pW[3+ps*0];
1782 c00 -= a0*b0;
1783 b1 = pW[3+ps*1];
1784 c01 -= a0*b1;
1785 // store
1786 pC[ll+jj*sdc+ps*0] = c00;
1787 pC[ll+jj*sdc+ps*1] = c01;
1788 // load
1789 c00 = pC[ll+jj*sdc+ps*2];
1790 c01 = pC[ll+jj*sdc+ps*3];
1791 //
1792 a0 = pD[ll+jj*sdd+ps*0];
1793 b0 = pW[0+ps*2];
1794 c00 -= a0*b0;
1795 b1 = pW[0+ps*3];
1796 c01 -= a0*b1;
1797 //
1798 a0 = pD[ll+jj*sdd+ps*1];
1799 b0 = pW[1+ps*2];
1800 c00 -= a0*b0;
1801 b1 = pW[1+ps*3];
1802 c01 -= a0*b1;
1803 //
1804 a0 = pD[ll+jj*sdd+ps*2];
1805 b0 = pW[2+ps*2];
1806 c00 -= a0*b0;
1807 b1 = pW[2+ps*3];
1808 c01 -= a0*b1;
1809 //
1810 a0 = pD[ll+jj*sdd+ps*3];
1811 b0 = pW[3+ps*2];
1812 c00 -= a0*b0;
1813 b1 = pW[3+ps*3];
1814 c01 -= a0*b1;
1815 // store
1816 pC[ll+jj*sdc+ps*2] = c00;
1817 pC[ll+jj*sdc+ps*3] = c01;
1818 }
1819 }
1820 for( ; ii<n; ii++)
1821 {
1822 pW = pW0+ii*ps;
1823 pC = pC0+ii*ps;
1824 for(jj=4; jj<m-3; jj+=4)
1825 {
1826 // load
1827 c00 = pC[0+jj*sdc+ps*0];
1828 c10 = pC[1+jj*sdc+ps*0];
1829 c20 = pC[2+jj*sdc+ps*0];
1830 c30 = pC[3+jj*sdc+ps*0];
1831 //
1832 a0 = pD[0+jj*sdd+ps*0];
1833 a1 = pD[1+jj*sdd+ps*0];
1834 a2 = pD[2+jj*sdd+ps*0];
1835 a3 = pD[3+jj*sdd+ps*0];
1836 b0 = pW[0+ps*0];
1837 c00 -= a0*b0;
1838 c10 -= a1*b0;
1839 c20 -= a2*b0;
1840 c30 -= a3*b0;
1841 //
1842 a0 = pD[0+jj*sdd+ps*1];
1843 a1 = pD[1+jj*sdd+ps*1];
1844 a2 = pD[2+jj*sdd+ps*1];
1845 a3 = pD[3+jj*sdd+ps*1];
1846 b0 = pW[1+ps*0];
1847 c00 -= a0*b0;
1848 c10 -= a1*b0;
1849 c20 -= a2*b0;
1850 c30 -= a3*b0;
1851 //
1852 a0 = pD[0+jj*sdd+ps*2];
1853 a1 = pD[1+jj*sdd+ps*2];
1854 a2 = pD[2+jj*sdd+ps*2];
1855 a3 = pD[3+jj*sdd+ps*2];
1856 b0 = pW[2+ps*0];
1857 c00 -= a0*b0;
1858 c10 -= a1*b0;
1859 c20 -= a2*b0;
1860 c30 -= a3*b0;
1861 //
1862 a0 = pD[0+jj*sdd+ps*3];
1863 a1 = pD[1+jj*sdd+ps*3];
1864 a2 = pD[2+jj*sdd+ps*3];
1865 a3 = pD[3+jj*sdd+ps*3];
1866 b0 = pW[3+ps*0];
1867 c00 -= a0*b0;
1868 c10 -= a1*b0;
1869 c20 -= a2*b0;
1870 c30 -= a3*b0;
1871 // store
1872 pC[0+jj*sdc+ps*0] = c00;
1873 pC[1+jj*sdc+ps*0] = c10;
1874 pC[2+jj*sdc+ps*0] = c20;
1875 pC[3+jj*sdc+ps*0] = c30;
1876 }
1877 for(ll=0; ll<m-jj; ll++)
1878 {
1879 // load
1880 c00 = pC[ll+jj*sdc+ps*0];
1881 //
1882 a0 = pD[ll+jj*sdd+ps*0];
1883 b0 = pW[0+ps*0];
1884 c00 -= a0*b0;
1885 //
1886 a0 = pD[ll+jj*sdd+ps*1];
1887 b0 = pW[1+ps*0];
1888 c00 -= a0*b0;
1889 //
1890 a0 = pD[ll+jj*sdd+ps*2];
1891 b0 = pW[2+ps*0];
1892 c00 -= a0*b0;
1893 //
1894 a0 = pD[ll+jj*sdd+ps*3];
1895 b0 = pW[3+ps*0];
1896 c00 -= a0*b0;
1897 // store
1898 pC[ll+jj*sdc+ps*0] = c00;
1899 }
1900 }
1901#endif
1902
1903 return;
1904 }
1905
1906
1907
1908// assume n>=4
1909void kernel_dgelqf_4_lib4(int n, double *pD, double *dD)
1910 {
1911 int ii, jj, ll;
1912 double alpha, beta, tmp, w1, w2, w3;
1913 const int ps = 4;
1914 // first column
1915 beta = 0.0;
1916 for(ii=1; ii<n; ii++)
1917 {
1918 tmp = pD[0+ps*ii];
1919 beta += tmp*tmp;
1920 }
1921 if(beta==0.0)
1922 {
1923 dD[0] = 0.0;
1924 tmp = 0.0;
1925 goto col2;
1926 }
1927 alpha = pD[0+ps*0];
1928 beta += alpha*alpha;
1929 beta = sqrt(beta);
1930 if(alpha>0)
1931 beta = -beta;
1932 dD[0] = (beta-alpha) / beta;
1933 tmp = 1.0 / (alpha-beta);
1934 //
1935 pD[0+ps*0] = beta;
1936 w1 = pD[1+ps*0];
1937 w2 = pD[2+ps*0];
1938 w3 = pD[3+ps*0];
1939 //
1940 pD[0+ps*1] *= tmp;
1941 w1 += pD[1+ps*1] * pD[0+ps*1];
1942 w2 += pD[2+ps*1] * pD[0+ps*1];
1943 w3 += pD[3+ps*1] * pD[0+ps*1];
1944 //
1945 pD[0+ps*2] *= tmp;
1946 w1 += pD[1+ps*2] * pD[0+ps*2];
1947 w2 += pD[2+ps*2] * pD[0+ps*2];
1948 w3 += pD[3+ps*2] * pD[0+ps*2];
1949 //
1950 pD[0+ps*3] *= tmp;
1951 w1 += pD[1+ps*3] * pD[0+ps*3];
1952 w2 += pD[2+ps*3] * pD[0+ps*3];
1953 w3 += pD[3+ps*3] * pD[0+ps*3];
1954 //
1955 for(ii=4; ii<n; ii++)
1956 {
1957 pD[0+ps*ii] *= tmp;
1958 w1 += pD[1+ps*ii] * pD[0+ps*ii];
1959 w2 += pD[2+ps*ii] * pD[0+ps*ii];
1960 w3 += pD[3+ps*ii] * pD[0+ps*ii];
1961 }
1962 //
1963 w1 = - dD[0] * w1;
1964 w2 = - dD[0] * w2;
1965 w3 = - dD[0] * w3;
1966 //
1967 pD[1+ps*0] += w1;
1968 pD[2+ps*0] += w2;
1969 pD[3+ps*0] += w3;
1970 //
1971 pD[1+ps*1] += w1 * pD[0+ps*1];
1972 pD[2+ps*1] += w2 * pD[0+ps*1];
1973 pD[3+ps*1] += w3 * pD[0+ps*1];
1974 //
1975 pD[1+ps*2] += w1 * pD[0+ps*2];
1976 pD[2+ps*2] += w2 * pD[0+ps*2];
1977 pD[3+ps*2] += w3 * pD[0+ps*2];
1978 beta = pD[1+ps*2] * pD[1+ps*2];
1979 //
1980 pD[1+ps*3] += w1 * pD[0+ps*3];
1981 pD[2+ps*3] += w2 * pD[0+ps*3];
1982 pD[3+ps*3] += w3 * pD[0+ps*3];
1983 beta += pD[1+ps*3] * pD[1+ps*3];
1984 //
1985 for(ii=4; ii<n; ii++)
1986 {
1987 pD[1+ps*ii] += w1 * pD[0+ps*ii];
1988 pD[2+ps*ii] += w2 * pD[0+ps*ii];
1989 pD[3+ps*ii] += w3 * pD[0+ps*ii];
1990 beta += pD[1+ps*ii] * pD[1+ps*ii];
1991 }
1992 // second column
1993col2:
1994 if(beta==0.0)
1995 {
1996 dD[1] = 0.0;
1997 tmp = 0.0;
1998 goto col3;
1999 }
2000 alpha = pD[1+ps*1];
2001 beta += alpha*alpha;
2002 beta = sqrt(beta);
2003 if(alpha>0)
2004 beta = -beta;
2005 dD[1] = (beta-alpha) / beta;
2006 tmp = 1.0 / (alpha-beta);
2007 //
2008 pD[1+ps*1] = beta;
2009 w2 = pD[2+ps*1];
2010 w3 = pD[3+ps*1];
2011 //
2012 pD[1+ps*2] *= tmp;
2013 w2 += pD[2+ps*2] * pD[1+ps*2];
2014 w3 += pD[3+ps*2] * pD[1+ps*2];
2015 //
2016 pD[1+ps*3] *= tmp;
2017 w2 += pD[2+ps*3] * pD[1+ps*3];
2018 w3 += pD[3+ps*3] * pD[1+ps*3];
2019 //
2020 for(ii=4; ii<n; ii++)
2021 {
2022 pD[1+ps*ii] *= tmp;
2023 w2 += pD[2+ps*ii] * pD[1+ps*ii];
2024 w3 += pD[3+ps*ii] * pD[1+ps*ii];
2025 }
2026 //
2027 w2 = - dD[1] * w2;
2028 w3 = - dD[1] * w3;
2029 //
2030 pD[2+ps*1] += w2;
2031 pD[3+ps*1] += w3;
2032 //
2033 pD[2+ps*2] += w2 * pD[1+ps*2];
2034 pD[3+ps*2] += w3 * pD[1+ps*2];
2035 //
2036 pD[2+ps*3] += w2 * pD[1+ps*3];
2037 pD[3+ps*3] += w3 * pD[1+ps*3];
2038 beta = pD[2+ps*3] * pD[2+ps*3];
2039 //
2040 for(ii=4; ii<n; ii++)
2041 {
2042 pD[2+ps*ii] += w2 * pD[1+ps*ii];
2043 pD[3+ps*ii] += w3 * pD[1+ps*ii];
2044 beta += pD[2+ps*ii] * pD[2+ps*ii];
2045 }
2046 // third column
2047col3:
2048 if(beta==0.0)
2049 {
2050 dD[2] = 0.0;
2051 tmp = 0.0;
2052 goto col4;
2053 }
2054 alpha = pD[2+ps*2];
2055 beta += alpha*alpha;
2056 beta = sqrt(beta);
2057 if(alpha>0)
2058 beta = -beta;
2059 dD[2] = (beta-alpha) / beta;
2060 tmp = 1.0 / (alpha-beta);
2061 //
2062 pD[2+ps*2] = beta;
2063 w3 = pD[3+ps*2];
2064 //
2065 pD[2+ps*3] *= tmp;
2066 w3 += pD[3+ps*3] * pD[2+ps*3];
2067 //
2068 for(ii=4; ii<n; ii++)
2069 {
2070 pD[2+ps*ii] *= tmp;
2071 w3 += pD[3+ps*ii] * pD[2+ps*ii];
2072 }
2073 //
2074 w3 = - dD[2] * w3;
2075 //
2076 pD[3+ps*2] += w3;
2077 //
2078 pD[3+ps*3] += w3 * pD[2+ps*3];
2079 //
2080 beta = 0.0;
2081 for(ii=4; ii<n; ii++)
2082 {
2083 pD[3+ps*ii] += w3 * pD[2+ps*ii];
2084 beta += pD[3+ps*ii] * pD[3+ps*ii];
2085 }
2086 // fourth column
2087col4:
2088 if(beta==0.0)
2089 {
2090 dD[3] = 0.0;
2091 tmp = 0.0;
2092 return;
2093 }
2094 alpha = pD[3+ps*3];
2095 beta += alpha*alpha;
2096 beta = sqrt(beta);
2097 if(alpha>0)
2098 beta = -beta;
2099 dD[3] = (beta-alpha) / beta;
2100 tmp = 1.0 / (alpha-beta);
2101 //
2102 pD[3+ps*3] = beta;
2103 for(ii=4; ii<n; ii++)
2104 {
2105 pD[3+ps*ii] *= tmp;
2106 }
2107 return;
2108 }
2109
2110
2111
2112// unblocked algorithm
2113void kernel_dgelqf_vs_lib4(int m, int n, int k, int offD, double *pD, int sdd, double *dD)
2114 {
2115 if(m<=0 | n<=0)
2116 return;
2117 int ii, jj, kk, ll, imax, jmax, jmax0, kmax, kmax0;
2118 const int ps = 4;
2119 imax = k;//m<n ? m : n;
2120 double alpha, beta, tmp;
2121 double w00, w01,
2122 w10, w11,
2123 w20, w21,
2124 w30, w31;
2125 __m256d
2126 _a0, _b0, _t0, _w0, _w1;
2127 double *pC00, *pC10, *pC10a, *pC20, *pC20a, *pC01, *pC11;
2128 double pT[4];
2129 int ldt = 2;
2130 double *pD0 = pD-offD;
2131 ii = 0;
2132#if 1 // rank 2
2133 for(; ii<imax-1; ii+=2)
2134 {
2135 // first row
2136 pC00 = &pD0[((offD+ii)&(ps-1))+((offD+ii)-((offD+ii)&(ps-1)))*sdd+ii*ps];
2137 beta = 0.0;
2138 for(jj=1; jj<n-ii; jj++)
2139 {
2140 tmp = pC00[0+ps*jj];
2141 beta += tmp*tmp;
2142 }
2143 if(beta==0.0)
2144 {
2145 dD[ii] = 0.0;
2146 }
2147 else
2148 {
2149 alpha = pC00[0];
2150 beta += alpha*alpha;
2151 beta = sqrt(beta);
2152 if(alpha>0)
2153 beta = -beta;
2154 dD[ii] = (beta-alpha) / beta;
2155 tmp = 1.0 / (alpha-beta);
2156 pC00[0] = beta;
2157 for(jj=1; jj<n-ii; jj++)
2158 pC00[0+ps*jj] *= tmp;
2159 }
2160 pC10 = &pD0[((offD+ii+1)&(ps-1))+((offD+ii+1)-((offD+ii+1)&(ps-1)))*sdd+ii*ps];
2161 kmax = n-ii;
2162 w00 = pC10[0+ps*0]; // pC00[0+ps*0] = 1.0
2163 for(kk=1; kk<kmax; kk++)
2164 {
2165 w00 += pC10[0+ps*kk] * pC00[0+ps*kk];
2166 }
2167 w00 = - w00*dD[ii];
2168 pC10[0+ps*0] += w00; // pC00[0+ps*0] = 1.0
2169 for(kk=1; kk<kmax; kk++)
2170 {
2171 pC10[0+ps*kk] += w00 * pC00[0+ps*kk];
2172 }
2173 // second row
2174 pC11 = pC10+ps*1;
2175 beta = 0.0;
2176 for(jj=1; jj<n-(ii+1); jj++)
2177 {
2178 tmp = pC11[0+ps*jj];
2179 beta += tmp*tmp;
2180 }
2181 if(beta==0.0)
2182 {
2183 dD[(ii+1)] = 0.0;
2184 }
2185 else
2186 {
2187 alpha = pC11[0+ps*0];
2188 beta += alpha*alpha;
2189 beta = sqrt(beta);
2190 if(alpha>0)
2191 beta = -beta;
2192 dD[(ii+1)] = (beta-alpha) / beta;
2193 tmp = 1.0 / (alpha-beta);
2194 pC11[0+ps*0] = beta;
2195 for(jj=1; jj<n-(ii+1); jj++)
2196 pC11[0+ps*jj] *= tmp;
2197 }
2198 // compute T
2199 kmax = n-ii;
2200 tmp = 1.0*0.0 + pC00[0+ps*1]*1.0;
2201 for(kk=2; kk<kmax; kk++)
2202 tmp += pC00[0+ps*kk]*pC10[0+ps*kk];
2203 pT[0+ldt*0] = - dD[ii+0];
2204 pT[0+ldt*1] = + dD[ii+1] * tmp * dD[ii+0];
2205 pT[1+ldt*1] = - dD[ii+1];
2206 // downgrade
2207 kmax = n-ii;
2208 jmax = m-ii-2;
2209 jmax0 = (ps-((ii+2+offD)&(ps-1)))&(ps-1);
2210 jmax0 = jmax<jmax0 ? jmax : jmax0;
2211 jj = 0;
2212 pC20a = &pD0[((offD+ii+2)&(ps-1))+((offD+ii+2)-((offD+ii+2)&(ps-1)))*sdd+ii*ps];
2213 pC20 = pC20a;
2214 if(jmax0>0)
2215 {
2216 for( ; jj<jmax0; jj++)
2217 {
2218 w00 = pC20[0+ps*0]*1.0 + pC20[0+ps*1]*pC00[0+ps*1];
2219 w01 = pC20[0+ps*0]*0.0 + pC20[0+ps*1]*1.0;
2220 for(kk=2; kk<kmax; kk++)
2221 {
2222 w00 += pC20[0+ps*kk]*pC00[0+ps*kk];
2223 w01 += pC20[0+ps*kk]*pC10[0+ps*kk];
2224 }
2225 w01 = w00*pT[0+ldt*1] + w01*pT[1+ldt*1];
2226 w00 = w00*pT[0+ldt*0];
2227 pC20[0+ps*0] += w00*1.0 + w01*0.0;
2228 pC20[0+ps*1] += w00*pC00[0+ps*1] + w01*1.0;
2229 for(kk=2; kk<kmax; kk++)
2230 {
2231 pC20[0+ps*kk] += w00*pC00[0+ps*kk] + w01*pC10[0+ps*kk];
2232 }
2233 pC20 += 1;
2234 }
2235 pC20 += -ps+ps*sdd;
2236 }
2237 for( ; jj<jmax-3; jj+=4)
2238 {
2239 //
2240 _w0 = _mm256_load_pd( &pC20[0+ps*0] );
2241 _a0 = _mm256_load_pd( &pC20[0+ps*1] );
2242 _b0 = _mm256_broadcast_sd( &pC00[0+ps*1] );
2243 _t0 = _mm256_mul_pd( _a0, _b0 );
2244 _w0 = _mm256_add_pd( _w0, _t0 );
2245 _w1 = _mm256_load_pd( &pC20[0+ps*1] );
2246 for(kk=2; kk<kmax; kk++)
2247 {
2248 _a0 = _mm256_load_pd( &pC20[0+ps*kk] );
2249 _b0 = _mm256_broadcast_sd( &pC00[0+ps*kk] );
2250 _t0 = _mm256_mul_pd( _a0, _b0 );
2251 _w0 = _mm256_add_pd( _w0, _t0 );
2252 _b0 = _mm256_broadcast_sd( &pC10[0+ps*kk] );
2253 _t0 = _mm256_mul_pd( _a0, _b0 );
2254 _w1 = _mm256_add_pd( _w1, _t0 );
2255 }
2256 //
2257 _b0 = _mm256_broadcast_sd( &pT[1+ldt*1] );
2258 _w1 = _mm256_mul_pd( _w1, _b0 );
2259 _b0 = _mm256_broadcast_sd( &pT[0+ldt*1] );
2260 _t0 = _mm256_mul_pd( _w0, _b0 );
2261 _w1 = _mm256_add_pd( _w1, _t0 );
2262 _b0 = _mm256_broadcast_sd( &pT[0+ldt*0] );
2263 _w0 = _mm256_mul_pd( _w0, _b0 );
2264 //
2265 _a0 = _mm256_load_pd( &pC20[0+ps*0] );
2266 _a0 = _mm256_add_pd( _a0, _w0 );
2267 _mm256_store_pd( &pC20[0+ps*0], _a0 );
2268 _a0 = _mm256_load_pd( &pC20[0+ps*1] );
2269 _b0 = _mm256_broadcast_sd( &pC00[0+ps*1] );
2270 _t0 = _mm256_mul_pd( _w0, _b0 );
2271 _a0 = _mm256_add_pd( _a0, _t0 );
2272 _a0 = _mm256_add_pd( _a0, _w1 );
2273 _mm256_store_pd( &pC20[0+ps*1], _a0 );
2274 for(kk=2; kk<kmax; kk++)
2275 {
2276 _a0 = _mm256_load_pd( &pC20[0+ps*kk] );
2277 _b0 = _mm256_broadcast_sd( &pC00[0+ps*kk] );
2278 _t0 = _mm256_mul_pd( _w0, _b0 );
2279 _a0 = _mm256_add_pd( _a0, _t0 );
2280 _b0 = _mm256_broadcast_sd( &pC10[0+ps*kk] );
2281 _t0 = _mm256_mul_pd( _w1, _b0 );
2282 _a0 = _mm256_add_pd( _a0, _t0 );
2283 _mm256_store_pd( &pC20[0+ps*kk], _a0 );
2284 }
2285 pC20 += ps*sdd;
2286 }
2287 for(ll=0; ll<jmax-jj; ll++)
2288 {
2289 w00 = pC20[0+ps*0]*1.0 + pC20[0+ps*1]*pC00[0+ps*1];
2290 w01 = pC20[0+ps*0]*0.0 + pC20[0+ps*1]*1.0;
2291 for(kk=2; kk<kmax; kk++)
2292 {
2293 w00 += pC20[0+ps*kk]*pC00[0+ps*kk];
2294 w01 += pC20[0+ps*kk]*pC10[0+ps*kk];
2295 }
2296 w01 = w00*pT[0+ldt*1] + w01*pT[1+ldt*1];
2297 w00 = w00*pT[0+ldt*0];
2298 pC20[0+ps*0] += w00*1.0 + w01*0.0;
2299 pC20[0+ps*1] += w00*pC00[0+ps*1] + w01*1.0;
2300 for(kk=2; kk<kmax; kk++)
2301 {
2302 pC20[0+ps*kk] += w00*pC00[0+ps*kk] + w01*pC10[0+ps*kk];
2303 }
2304 pC20 += 1;
2305 }
2306 }
2307#endif
2308 for(; ii<imax; ii++)
2309 {
2310 pC00 = &pD0[((offD+ii)&(ps-1))+((offD+ii)-((offD+ii)&(ps-1)))*sdd+ii*ps];
2311 beta = 0.0;
2312 for(jj=1; jj<n-ii; jj++)
2313 {
2314 tmp = pC00[0+ps*jj];
2315 beta += tmp*tmp;
2316 }
2317 if(beta==0.0)
2318 {
2319 dD[ii] = 0.0;
2320 }
2321 else
2322 {
2323 alpha = pC00[0];
2324 beta += alpha*alpha;
2325 beta = sqrt(beta);
2326 if(alpha>0)
2327 beta = -beta;
2328 dD[ii] = (beta-alpha) / beta;
2329 tmp = 1.0 / (alpha-beta);
2330 pC00[0] = beta;
2331 for(jj=1; jj<n-ii; jj++)
2332 pC00[0+ps*jj] *= tmp;
2333 }
2334 if(ii<n)
2335 {
2336 // compute T
2337 pT[0+ldt*0] = - dD[ii+0];
2338 // downgrade
2339 kmax = n-ii;
2340 jmax = m-ii-1;
2341 jmax0 = (ps-((ii+1+offD)&(ps-1)))&(ps-1);
2342 jmax0 = jmax<jmax0 ? jmax : jmax0;
2343 jj = 0;
2344 pC10a = &pD0[((offD+ii+1)&(ps-1))+((offD+ii+1)-((offD+ii+1)&(ps-1)))*sdd+ii*ps];
2345 pC10 = pC10a;
2346 if(jmax0>0)
2347 {
2348 for( ; jj<jmax0; jj++)
2349 {
2350 w00 = pC10[0+ps*0];
2351 for(kk=1; kk<kmax; kk++)
2352 {
2353 w00 += pC10[0+ps*kk] * pC00[0+ps*kk];
2354 }
2355 w00 = w00*pT[0+ldt*0];
2356 pC10[0+ps*0] += w00;
2357 for(kk=1; kk<kmax; kk++)
2358 {
2359 pC10[0+ps*kk] += w00 * pC00[0+ps*kk];
2360 }
2361 pC10 += 1;
2362 }
2363 pC10 += -ps+ps*sdd;
2364 }
2365 for( ; jj<jmax-3; jj+=4)
2366 {
2367 //
2368 _w0 = _mm256_load_pd( &pC10[0+ps*0] );
2369 for(kk=1; kk<kmax; kk++)
2370 {
2371 _a0 = _mm256_load_pd( &pC10[0+ps*kk] );
2372 _b0 = _mm256_broadcast_sd( &pC00[0+ps*kk] );
2373 _t0 = _mm256_mul_pd( _a0, _b0 );
2374 _w0 = _mm256_add_pd( _w0, _t0 );
2375 }
2376 //
2377 _b0 = _mm256_broadcast_sd( &pT[0+ldt*0] );
2378 _w0 = _mm256_mul_pd( _w0, _b0 );
2379 //
2380 _a0 = _mm256_load_pd( &pC10[0+ps*0] );
2381 _a0 = _mm256_add_pd( _a0, _w0 );
2382 _mm256_store_pd( &pC10[0+ps*0], _a0 );
2383 for(kk=1; kk<kmax; kk++)
2384 {
2385 _a0 = _mm256_load_pd( &pC10[0+ps*kk] );
2386 _b0 = _mm256_broadcast_sd( &pC00[0+ps*kk] );
2387 _t0 = _mm256_mul_pd( _w0, _b0 );
2388 _a0 = _mm256_add_pd( _a0, _t0 );
2389 _mm256_store_pd( &pC10[0+ps*kk], _a0 );
2390 }
2391 pC10 += ps*sdd;
2392 }
2393 for(ll=0; ll<jmax-jj; ll++)
2394 {
2395 w00 = pC10[0+ps*0];
2396 for(kk=1; kk<kmax; kk++)
2397 {
2398 w00 += pC10[0+ps*kk] * pC00[0+ps*kk];
2399 }
2400 w00 = w00*pT[0+ldt*0];
2401 pC10[0+ps*0] += w00;
2402 for(kk=1; kk<kmax; kk++)
2403 {
2404 pC10[0+ps*kk] += w00 * pC00[0+ps*kk];
2405 }
2406 pC10 += 1;
2407 }
2408 }
2409 }
2410 return;
2411 }
2412
2413
2414
2415// assume kmax>=4
2416void kernel_dlarft_4_lib4(int kmax, double *pD, double *dD, double *pT)
2417 {
2418 const int ps = 4;
2419 int kk;
2420 double v10,
2421 v20, v21,
2422 v30, v31, v32;
2423 // 0
2424 // 1
2425 v10 = pD[0+ps*1];
2426 // 2
2427 v10 += pD[1+ps*2]*pD[0+ps*2];
2428 v20 = pD[0+ps*2];
2429 v21 = pD[1+ps*2];
2430 // 3
2431 v10 += pD[1+ps*3]*pD[0+ps*3];
2432 v20 += pD[2+ps*3]*pD[0+ps*3];
2433 v21 += pD[2+ps*3]*pD[1+ps*3];
2434 v30 = pD[0+ps*3];
2435 v31 = pD[1+ps*3];
2436 v32 = pD[2+ps*3];
2437 //
2438 for(kk=4; kk<kmax; kk++)
2439 {
2440 v10 += pD[1+ps*kk]*pD[0+ps*kk];
2441 v20 += pD[2+ps*kk]*pD[0+ps*kk];
2442 v30 += pD[3+ps*kk]*pD[0+ps*kk];
2443 v21 += pD[2+ps*kk]*pD[1+ps*kk];
2444 v31 += pD[3+ps*kk]*pD[1+ps*kk];
2445 v32 += pD[3+ps*kk]*pD[2+ps*kk];
2446 }
2447 pT[0+ps*0] = - dD[0];
2448 pT[1+ps*1] = - dD[1];
2449 pT[2+ps*2] = - dD[2];
2450 pT[3+ps*3] = - dD[3];
2451 pT[0+ps*1] = - dD[1] * (v10*pT[0+ps*0]);
2452 pT[1+ps*2] = - dD[2] * (v21*pT[1+ps*1]);
2453 pT[2+ps*3] = - dD[3] * (v32*pT[2+ps*2]);
2454 pT[0+ps*2] = - dD[2] * (v20*pT[0+ps*0] + v21*pT[0+ps*1]);
2455 pT[1+ps*3] = - dD[3] * (v31*pT[1+ps*1] + v32*pT[1+ps*2]);
2456 pT[0+ps*3] = - dD[3] * (v30*pT[0+ps*0] + v31*pT[0+ps*1] + v32*pT[0+ps*2]);
2457 return;
2458 }
2459
2460
2461
2462// assume n>=4
2463#if ! defined(TARGET_X64_INTEL_HASWELL)
2464void kernel_dgelqf_dlarft4_4_lib4(int n, double *pD, double *dD, double *pT)
2465 {
2466 int ii, jj, ll;
2467 double alpha, beta, tmp, w0, w1, w2, w3;
2468 const int ps = 4;
2469 // zero tau matrix
2470 for(ii=0; ii<16; ii++)
2471 pT[ii] = 0.0;
2472 // first column
2473 beta = 0.0;
2474 for(ii=1; ii<n; ii++)
2475 {
2476 tmp = pD[0+ps*ii];
2477 beta += tmp*tmp;
2478 }
2479 if(beta==0.0)
2480 {
2481 dD[0] = 0.0;
2482 tmp = 0.0;
2483 goto col2;
2484 }
2485 alpha = pD[0+ps*0];
2486 beta += alpha*alpha;
2487 beta = sqrt(beta);
2488 if(alpha>0)
2489 beta = -beta;
2490 dD[0] = (beta-alpha) / beta;
2491 pT[0+ps*0] = - dD[0];
2492 tmp = 1.0 / (alpha-beta);
2493 //
2494 pD[0+ps*0] = beta;
2495 w1 = pD[1+ps*0];
2496 w2 = pD[2+ps*0];
2497 w3 = pD[3+ps*0];
2498 //
2499 pD[0+ps*1] *= tmp;
2500 w1 += pD[1+ps*1] * pD[0+ps*1];
2501 w2 += pD[2+ps*1] * pD[0+ps*1];
2502 w3 += pD[3+ps*1] * pD[0+ps*1];
2503 //
2504 pD[0+ps*2] *= tmp;
2505 w1 += pD[1+ps*2] * pD[0+ps*2];
2506 w2 += pD[2+ps*2] * pD[0+ps*2];
2507 w3 += pD[3+ps*2] * pD[0+ps*2];
2508 //
2509 pD[0+ps*3] *= tmp;
2510 w1 += pD[1+ps*3] * pD[0+ps*3];
2511 w2 += pD[2+ps*3] * pD[0+ps*3];
2512 w3 += pD[3+ps*3] * pD[0+ps*3];
2513 //
2514 for(ii=4; ii<n; ii++)
2515 {
2516 pD[0+ps*ii] *= tmp;
2517 w1 += pD[1+ps*ii] * pD[0+ps*ii];
2518 w2 += pD[2+ps*ii] * pD[0+ps*ii];
2519 w3 += pD[3+ps*ii] * pD[0+ps*ii];
2520 }
2521 //
2522 w1 = - dD[0] * w1;
2523 w2 = - dD[0] * w2;
2524 w3 = - dD[0] * w3;
2525 //
2526 pD[1+ps*0] += w1;
2527 pD[2+ps*0] += w2;
2528 pD[3+ps*0] += w3;
2529 //
2530 pD[1+ps*1] += w1 * pD[0+ps*1];
2531 pD[2+ps*1] += w2 * pD[0+ps*1];
2532 pD[3+ps*1] += w3 * pD[0+ps*1];
2533 //
2534 pD[1+ps*2] += w1 * pD[0+ps*2];
2535 pD[2+ps*2] += w2 * pD[0+ps*2];
2536 pD[3+ps*2] += w3 * pD[0+ps*2];
2537 beta = pD[1+ps*2] * pD[1+ps*2];
2538 //
2539 pD[1+ps*3] += w1 * pD[0+ps*3];
2540 pD[2+ps*3] += w2 * pD[0+ps*3];
2541 pD[3+ps*3] += w3 * pD[0+ps*3];
2542 beta += pD[1+ps*3] * pD[1+ps*3];
2543 //
2544 for(ii=4; ii<n; ii++)
2545 {
2546 pD[1+ps*ii] += w1 * pD[0+ps*ii];
2547 pD[2+ps*ii] += w2 * pD[0+ps*ii];
2548 pD[3+ps*ii] += w3 * pD[0+ps*ii];
2549 beta += pD[1+ps*ii] * pD[1+ps*ii];
2550 }
2551 // second column
2552col2:
2553 if(beta==0.0)
2554 {
2555 dD[1] = 0.0;
2556 tmp = 0.0;
2557 goto col3;
2558 }
2559 alpha = pD[1+ps*1];
2560 beta += alpha*alpha;
2561 beta = sqrt(beta);
2562 if(alpha>0)
2563 beta = -beta;
2564 dD[1] = (beta-alpha) / beta;
2565 pT[1+ps*1] = - dD[1];
2566 tmp = 1.0 / (alpha-beta);
2567 //
2568 pD[1+ps*1] = beta;
2569 w0 = pD[0+ps*1]; //
2570 w2 = pD[2+ps*1];
2571 w3 = pD[3+ps*1];
2572 //
2573 pD[1+ps*2] *= tmp;
2574 w0 += pD[0+ps*2] * pD[1+ps*2]; //
2575 w2 += pD[2+ps*2] * pD[1+ps*2];
2576 w3 += pD[3+ps*2] * pD[1+ps*2];
2577 //
2578 pD[1+ps*3] *= tmp;
2579 w0 += pD[0+ps*3] * pD[1+ps*3]; //
2580 w2 += pD[2+ps*3] * pD[1+ps*3];
2581 w3 += pD[3+ps*3] * pD[1+ps*3];
2582 //
2583 for(ii=4; ii<n; ii++)
2584 {
2585 pD[1+ps*ii] *= tmp;
2586 w0 += pD[0+ps*ii] * pD[1+ps*ii]; //
2587 w2 += pD[2+ps*ii] * pD[1+ps*ii];
2588 w3 += pD[3+ps*ii] * pD[1+ps*ii];
2589 }
2590 //
2591 pT[0+ps*1] = - dD[1] * (w0*pT[0+ps*0]);
2592 w2 = - dD[1] * w2;
2593 w3 = - dD[1] * w3;
2594 //
2595 pD[2+ps*1] += w2;
2596 pD[3+ps*1] += w3;
2597 //
2598 pD[2+ps*2] += w2 * pD[1+ps*2];
2599 pD[3+ps*2] += w3 * pD[1+ps*2];
2600 //
2601 pD[2+ps*3] += w2 * pD[1+ps*3];
2602 pD[3+ps*3] += w3 * pD[1+ps*3];
2603 beta = pD[2+ps*3] * pD[2+ps*3];
2604 //
2605 for(ii=4; ii<n; ii++)
2606 {
2607 pD[2+ps*ii] += w2 * pD[1+ps*ii];
2608 pD[3+ps*ii] += w3 * pD[1+ps*ii];
2609 beta += pD[2+ps*ii] * pD[2+ps*ii];
2610 }
2611 // third column
2612col3:
2613 if(beta==0.0)
2614 {
2615 dD[2] = 0.0;
2616 tmp = 0.0;
2617 goto col4;
2618 }
2619 alpha = pD[2+ps*2];
2620 beta += alpha*alpha;
2621 beta = sqrt(beta);
2622 if(alpha>0)
2623 beta = -beta;
2624 dD[2] = (beta-alpha) / beta;
2625 pT[2+ps*2] = - dD[2];
2626 tmp = 1.0 / (alpha-beta);
2627 //
2628 pD[2+ps*2] = beta;
2629 w0 = pD[0+ps*2];
2630 w1 = pD[1+ps*2];
2631 w3 = pD[3+ps*2];
2632 //
2633 pD[2+ps*3] *= tmp;
2634 w0 += pD[0+ps*3] * pD[2+ps*3];
2635 w1 += pD[1+ps*3] * pD[2+ps*3];
2636 w3 += pD[3+ps*3] * pD[2+ps*3];
2637 //
2638 for(ii=4; ii<n; ii++)
2639 {
2640 pD[2+ps*ii] *= tmp;
2641 w0 += pD[0+ps*ii] * pD[2+ps*ii];
2642 w1 += pD[1+ps*ii] * pD[2+ps*ii];
2643 w3 += pD[3+ps*ii] * pD[2+ps*ii];
2644 }
2645 //
2646 pT[1+ps*2] = - dD[2] * (w1*pT[1+ps*1]);
2647 pT[0+ps*2] = - dD[2] * (w0*pT[0+ps*0] + w1*pT[0+ps*1]);
2648 w3 = - dD[2] * w3;
2649 //
2650 pD[3+ps*2] += w3;
2651 //
2652 pD[3+ps*3] += w3 * pD[2+ps*3];
2653 //
2654 beta = 0.0;
2655 for(ii=4; ii<n; ii++)
2656 {
2657 pD[3+ps*ii] += w3 * pD[2+ps*ii];
2658 beta += pD[3+ps*ii] * pD[3+ps*ii];
2659 }
2660 // fourth column
2661col4:
2662 if(beta==0.0)
2663 {
2664 dD[3] = 0.0;
2665 tmp = 0.0;
2666 return;
2667 }
2668 alpha = pD[3+ps*3];
2669 beta += alpha*alpha;
2670 beta = sqrt(beta);
2671 if(alpha>0)
2672 beta = -beta;
2673 dD[3] = (beta-alpha) / beta;
2674 pT[3+ps*3] = - dD[3];
2675 tmp = 1.0 / (alpha-beta);
2676 //
2677 pD[3+ps*3] = beta;
2678 w0 = pD[0+ps*3];
2679 w1 = pD[1+ps*3];
2680 w2 = pD[2+ps*3];
2681 //
2682 for(ii=4; ii<n; ii++)
2683 {
2684 pD[3+ps*ii] *= tmp;
2685 w0 += pD[0+ps*ii] * pD[3+ps*ii];
2686 w1 += pD[1+ps*ii] * pD[3+ps*ii];
2687 w2 += pD[2+ps*ii] * pD[3+ps*ii];
2688 }
2689 //
2690 pT[2+ps*3] = - dD[3] * (w2*pT[2+ps*2]);
2691 pT[1+ps*3] = - dD[3] * (w1*pT[1+ps*1] + w2*pT[1+ps*2]);
2692 pT[0+ps*3] = - dD[3] * (w0*pT[0+ps*0] + w1*pT[0+ps*1] + w2*pT[0+ps*2]);
2693 return;
2694 }
2695#endif
2696
2697
2698
2699void kernel_dlarfb4_r_1_lib4(int kmax, double *pV, double *pT, double *pD)
2700 {
2701 const int ps = 4;
2702 double pW[16];
2703 int kk;
2704 // 0
2705 pW[0+ps*0] = pD[0+ps*0];
2706 // 1
2707 pW[0+ps*0] += pD[0+ps*1]*pV[0+ps*1];
2708 pW[0+ps*1] = pD[0+ps*1];
2709 // 2
2710 pW[0+ps*0] += pD[0+ps*2]*pV[0+ps*2];
2711 pW[0+ps*1] += pD[0+ps*2]*pV[1+ps*2];
2712 pW[0+ps*2] = pD[0+ps*2];
2713 // 3
2714 pW[0+ps*0] += pD[0+ps*3]*pV[0+ps*3];
2715 pW[0+ps*1] += pD[0+ps*3]*pV[1+ps*3];
2716 pW[0+ps*2] += pD[0+ps*3]*pV[2+ps*3];
2717 pW[0+ps*3] = pD[0+ps*3];
2718 //
2719 for(kk=4; kk<kmax; kk++)
2720 {
2721 pW[0+ps*0] += pD[0+ps*kk]*pV[0+ps*kk];
2722 pW[0+ps*1] += pD[0+ps*kk]*pV[1+ps*kk];
2723 pW[0+ps*2] += pD[0+ps*kk]*pV[2+ps*kk];
2724 pW[0+ps*3] += pD[0+ps*kk]*pV[3+ps*kk];
2725 }
2726 //
2727 pW[0+ps*3] = pW[0+ps*0]*pT[0+ps*3] + pW[0+ps*1]*pT[1+ps*3] + pW[0+ps*2]*pT[2+ps*3] + pW[0+ps*3]*pT[3+ps*3];
2728 //
2729 pW[0+ps*2] = pW[0+ps*0]*pT[0+ps*2] + pW[0+ps*1]*pT[1+ps*2] + pW[0+ps*2]*pT[2+ps*2];
2730 //
2731 pW[0+ps*1] = pW[0+ps*0]*pT[0+ps*1] + pW[0+ps*1]*pT[1+ps*1];
2732 //
2733 pW[0+ps*0] = pW[0+ps*0]*pT[0+ps*0];
2734 //
2735 pD[0+ps*0] += pW[0+ps*0];
2736 //
2737 pD[0+ps*1] += pW[0+ps*0]*pV[0+ps*1] + pW[0+ps*1];
2738 //
2739 pD[0+ps*2] += pW[0+ps*0]*pV[0+ps*2] + pW[0+ps*1]*pV[1+ps*2] + pW[0+ps*2];
2740 //
2741 pD[0+ps*3] += pW[0+ps*0]*pV[0+ps*3] + pW[0+ps*1]*pV[1+ps*3] + pW[0+ps*2]*pV[2+ps*3] + pW[0+ps*3];
2742 for(kk=4; kk<kmax; kk++)
2743 {
2744 pD[0+ps*kk] += pW[0+ps*0]*pV[0+ps*kk] + pW[0+ps*1]*pV[1+ps*kk] + pW[0+ps*2]*pV[2+ps*kk] + pW[0+ps*3]*pV[3+ps*kk];
2745 }
2746 return;
2747 }
2748
2749
2750
2751