blob: 071ec8674122249a7e6b630ce2f89679e5027a77 [file] [log] [blame]
Austin Schuh9a24b372018-01-28 16:12:29 -08001/**************************************************************************************************
2* *
3* This file is part of BLASFEO. *
4* *
5* BLASFEO -- BLAS For Embedded Optimization. *
6* Copyright (C) 2016-2017 by Gianluca Frison. *
7* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl. *
8* All rights reserved. *
9* *
10* HPMPC is free software; you can redistribute it and/or *
11* modify it under the terms of the GNU Lesser General Public *
12* License as published by the Free Software Foundation; either *
13* version 2.1 of the License, or (at your option) any later version. *
14* *
15* HPMPC is distributed in the hope that it will be useful, *
16* but WITHOUT ANY WARRANTY; without even the implied warranty of *
17* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. *
18* See the GNU Lesser General Public License for more details. *
19* *
20* You should have received a copy of the GNU Lesser General Public *
21* License along with HPMPC; if not, write to the Free Software *
22* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA *
23* *
24* Author: Gianluca Frison, giaf (at) dtu.dk *
25* gianluca.frison (at) imtek.uni-freiburg.de *
26* *
27**************************************************************************************************/
28
29#include <math.h>
30#include <stdio.h>
31
32#include "../../include/blasfeo_common.h"
33#include "../../include/blasfeo_d_aux.h"
34
35
36
37void kernel_dgeqrf_4_lib4(int m, double *pD, int sdd, double *dD)
38 {
39 int ii, jj, ll;
40 double alpha, beta, tmp, w1, w2, w3;
41 const int ps = 4;
42 // first column
43 beta = 0.0;
44 ii = 1;
45 if(m>1)
46 {
47 tmp = pD[1+ps*0];
48 beta += tmp*tmp;
49 if(m>2)
50 {
51 tmp = pD[2+ps*0];
52 beta += tmp*tmp;
53 if(m>3)
54 {
55 tmp = pD[3+ps*0];
56 beta += tmp*tmp;
57 }
58 }
59 }
60 for(ii=4; ii<m-3; ii+=4)
61 {
62 tmp = pD[0+ii*sdd+ps*0];
63 beta += tmp*tmp;
64 tmp = pD[1+ii*sdd+ps*0];
65 beta += tmp*tmp;
66 tmp = pD[2+ii*sdd+ps*0];
67 beta += tmp*tmp;
68 tmp = pD[3+ii*sdd+ps*0];
69 beta += tmp*tmp;
70 }
71 for(ll=0; ll<m-ii; ll++)
72 {
73 tmp = pD[ll+ii*sdd+ps*0];
74 beta += tmp*tmp;
75 }
76 if(beta==0.0)
77 {
78 // tau
79 dD[0] = 0.0;
80 }
81 else
82 {
83 alpha = pD[0+ps*0];
84 beta += alpha*alpha;
85 beta = sqrt(beta);
86 if(alpha>0)
87 beta = -beta;
88 // tau0
89 dD[0] = (beta-alpha) / beta;
90 tmp = 1.0 / (alpha-beta);
91 // compute v0
92 pD[0+ps*0] = beta;
93 ii = 1;
94 if(m>1)
95 {
96 pD[1+ps*0] *= tmp;
97 if(m>2)
98 {
99 pD[2+ps*0] *= tmp;
100 if(m>3)
101 {
102 pD[3+ps*0] *= tmp;
103 }
104 }
105 }
106 for(ii=4; ii<m-3; ii+=4)
107 {
108 pD[0+ii*sdd+ps*0] *= tmp;
109 pD[1+ii*sdd+ps*0] *= tmp;
110 pD[2+ii*sdd+ps*0] *= tmp;
111 pD[3+ii*sdd+ps*0] *= tmp;
112 }
113 for(ll=0; ll<m-ii; ll++)
114 {
115 pD[ll+ii*sdd+ps*0] *= tmp;
116 }
117 }
118 // gemv_t & ger
119 w1 = pD[0+ps*1];
120 w2 = pD[0+ps*2];
121 w3 = pD[0+ps*3];
122 if(m>1)
123 {
124 w1 += pD[1+ps*1] * pD[1+ps*0];
125 w2 += pD[1+ps*2] * pD[1+ps*0];
126 w3 += pD[1+ps*3] * pD[1+ps*0];
127 if(m>2)
128 {
129 w1 += pD[2+ps*1] * pD[2+ps*0];
130 w2 += pD[2+ps*2] * pD[2+ps*0];
131 w3 += pD[2+ps*3] * pD[2+ps*0];
132 if(m>3)
133 {
134 w1 += pD[3+ps*1] * pD[3+ps*0];
135 w2 += pD[3+ps*2] * pD[3+ps*0];
136 w3 += pD[3+ps*3] * pD[3+ps*0];
137 }
138 }
139 }
140 for(ii=4; ii<m-3; ii+=4)
141 {
142 w1 += pD[0+ii*sdd+ps*1] * pD[0+ii*sdd+ps*0];
143 w2 += pD[0+ii*sdd+ps*2] * pD[0+ii*sdd+ps*0];
144 w3 += pD[0+ii*sdd+ps*3] * pD[0+ii*sdd+ps*0];
145 w1 += pD[1+ii*sdd+ps*1] * pD[1+ii*sdd+ps*0];
146 w2 += pD[1+ii*sdd+ps*2] * pD[1+ii*sdd+ps*0];
147 w3 += pD[1+ii*sdd+ps*3] * pD[1+ii*sdd+ps*0];
148 w1 += pD[2+ii*sdd+ps*1] * pD[2+ii*sdd+ps*0];
149 w2 += pD[2+ii*sdd+ps*2] * pD[2+ii*sdd+ps*0];
150 w3 += pD[2+ii*sdd+ps*3] * pD[2+ii*sdd+ps*0];
151 w1 += pD[3+ii*sdd+ps*1] * pD[3+ii*sdd+ps*0];
152 w2 += pD[3+ii*sdd+ps*2] * pD[3+ii*sdd+ps*0];
153 w3 += pD[3+ii*sdd+ps*3] * pD[3+ii*sdd+ps*0];
154 }
155 for(ll=0; ll<m-ii; ll++)
156 {
157 w1 += pD[ll+ii*sdd+ps*1] * pD[ll+ii*sdd+ps*0];
158 w2 += pD[ll+ii*sdd+ps*2] * pD[ll+ii*sdd+ps*0];
159 w3 += pD[ll+ii*sdd+ps*3] * pD[ll+ii*sdd+ps*0];
160 }
161 w1 = - dD[0] * w1;
162 w2 = - dD[0] * w2;
163 w3 = - dD[0] * w3;
164 pD[0+ps*1] += w1;
165 pD[0+ps*2] += w2;
166 pD[0+ps*3] += w3;
167 if(m>1)
168 {
169 pD[1+ps*1] += w1 * pD[1+ps*0];
170 pD[1+ps*2] += w2 * pD[1+ps*0];
171 pD[1+ps*3] += w3 * pD[1+ps*0];
172 if(m>2)
173 {
174 pD[2+ps*1] += w1 * pD[2+ps*0];
175 pD[2+ps*2] += w2 * pD[2+ps*0];
176 pD[2+ps*3] += w3 * pD[2+ps*0];
177 if(m>3)
178 {
179 pD[3+ps*1] += w1 * pD[3+ps*0];
180 pD[3+ps*2] += w2 * pD[3+ps*0];
181 pD[3+ps*3] += w3 * pD[3+ps*0];
182 }
183 }
184 }
185 for(ii=4; ii<m-3; ii+=4)
186 {
187 pD[0+ii*sdd+ps*1] += w1 * pD[0+ii*sdd+ps*0];
188 pD[0+ii*sdd+ps*2] += w2 * pD[0+ii*sdd+ps*0];
189 pD[0+ii*sdd+ps*3] += w3 * pD[0+ii*sdd+ps*0];
190 pD[1+ii*sdd+ps*1] += w1 * pD[1+ii*sdd+ps*0];
191 pD[1+ii*sdd+ps*2] += w2 * pD[1+ii*sdd+ps*0];
192 pD[1+ii*sdd+ps*3] += w3 * pD[1+ii*sdd+ps*0];
193 pD[2+ii*sdd+ps*1] += w1 * pD[2+ii*sdd+ps*0];
194 pD[2+ii*sdd+ps*2] += w2 * pD[2+ii*sdd+ps*0];
195 pD[2+ii*sdd+ps*3] += w3 * pD[2+ii*sdd+ps*0];
196 pD[3+ii*sdd+ps*1] += w1 * pD[3+ii*sdd+ps*0];
197 pD[3+ii*sdd+ps*2] += w2 * pD[3+ii*sdd+ps*0];
198 pD[3+ii*sdd+ps*3] += w3 * pD[3+ii*sdd+ps*0];
199 }
200 for(ll=0; ll<m-ii; ll++)
201 {
202 pD[ll+ii*sdd+ps*1] += w1 * pD[ll+ii*sdd+ps*0];
203 pD[ll+ii*sdd+ps*2] += w2 * pD[ll+ii*sdd+ps*0];
204 pD[ll+ii*sdd+ps*3] += w3 * pD[ll+ii*sdd+ps*0];
205 }
206 if(m==1)
207 return;
208 // second column
209 beta = 0.0;
210 if(m>2)
211 {
212 tmp = pD[2+ps*1];
213 beta += tmp*tmp;
214 if(m>3)
215 {
216 tmp = pD[3+ps*1];
217 beta += tmp*tmp;
218 }
219 }
220 for(ii=4; ii<m-3; ii+=4)
221 {
222 tmp = pD[0+ii*sdd+ps*1];
223 beta += tmp*tmp;
224 tmp = pD[1+ii*sdd+ps*1];
225 beta += tmp*tmp;
226 tmp = pD[2+ii*sdd+ps*1];
227 beta += tmp*tmp;
228 tmp = pD[3+ii*sdd+ps*1];
229 beta += tmp*tmp;
230 }
231 for(ll=0; ll<m-ii; ll++)
232 {
233 tmp = pD[ll+ii*sdd+ps*1];
234 beta += tmp*tmp;
235 }
236 if(beta==0.0)
237 {
238 // tau
239 dD[1] = 0.0;
240 }
241 else
242 {
243 alpha = pD[1+ps*1];
244 beta += alpha*alpha;
245 beta = sqrt(beta);
246 if(alpha>0)
247 beta = -beta;
248 // tau0
249 dD[1] = (beta-alpha) / beta;
250 tmp = 1.0 / (alpha-beta);
251 // compute v0
252 pD[1+ps*1] = beta;
253 if(m>2)
254 {
255 pD[2+ps*1] *= tmp;
256 if(m>3)
257 {
258 pD[3+ps*1] *= tmp;
259 }
260 }
261 for(ii=4; ii<m-3; ii+=4)
262 {
263 pD[0+ii*sdd+ps*1] *= tmp;
264 pD[1+ii*sdd+ps*1] *= tmp;
265 pD[2+ii*sdd+ps*1] *= tmp;
266 pD[3+ii*sdd+ps*1] *= tmp;
267 }
268 for(ll=0; ll<m-ii; ll++)
269 {
270 pD[ll+ii*sdd+ps*1] *= tmp;
271 }
272 }
273 // gemv_t & ger
274 w2 = pD[1+ps*2];
275 w3 = pD[1+ps*3];
276 if(m>2)
277 {
278 w2 += pD[2+ps*2] * pD[2+ps*1];
279 w3 += pD[2+ps*3] * pD[2+ps*1];
280 if(m>3)
281 {
282 w2 += pD[3+ps*2] * pD[3+ps*1];
283 w3 += pD[3+ps*3] * pD[3+ps*1];
284 }
285 }
286 for(ii=4; ii<m-3; ii+=4)
287 {
288 w2 += pD[0+ii*sdd+ps*2] * pD[0+ii*sdd+ps*1];
289 w3 += pD[0+ii*sdd+ps*3] * pD[0+ii*sdd+ps*1];
290 w2 += pD[1+ii*sdd+ps*2] * pD[1+ii*sdd+ps*1];
291 w3 += pD[1+ii*sdd+ps*3] * pD[1+ii*sdd+ps*1];
292 w2 += pD[2+ii*sdd+ps*2] * pD[2+ii*sdd+ps*1];
293 w3 += pD[2+ii*sdd+ps*3] * pD[2+ii*sdd+ps*1];
294 w2 += pD[3+ii*sdd+ps*2] * pD[3+ii*sdd+ps*1];
295 w3 += pD[3+ii*sdd+ps*3] * pD[3+ii*sdd+ps*1];
296 }
297 for(ll=0; ll<m-ii; ll++)
298 {
299 w2 += pD[ll+ii*sdd+ps*2] * pD[ll+ii*sdd+ps*1];
300 w3 += pD[ll+ii*sdd+ps*3] * pD[ll+ii*sdd+ps*1];
301 }
302 w2 = - dD[1] * w2;
303 w3 = - dD[1] * w3;
304 pD[1+ps*2] += w2;
305 pD[1+ps*3] += w3;
306 if(m>2)
307 {
308 pD[2+ps*2] += w2 * pD[2+ps*1];
309 pD[2+ps*3] += w3 * pD[2+ps*1];
310 if(m>3)
311 {
312 pD[3+ps*2] += w2 * pD[3+ps*1];
313 pD[3+ps*3] += w3 * pD[3+ps*1];
314 }
315 }
316 for(ii=4; ii<m-3; ii+=4)
317 {
318 pD[0+ii*sdd+ps*2] += w2 * pD[0+ii*sdd+ps*1];
319 pD[0+ii*sdd+ps*3] += w3 * pD[0+ii*sdd+ps*1];
320 pD[1+ii*sdd+ps*2] += w2 * pD[1+ii*sdd+ps*1];
321 pD[1+ii*sdd+ps*3] += w3 * pD[1+ii*sdd+ps*1];
322 pD[2+ii*sdd+ps*2] += w2 * pD[2+ii*sdd+ps*1];
323 pD[2+ii*sdd+ps*3] += w3 * pD[2+ii*sdd+ps*1];
324 pD[3+ii*sdd+ps*2] += w2 * pD[3+ii*sdd+ps*1];
325 pD[3+ii*sdd+ps*3] += w3 * pD[3+ii*sdd+ps*1];
326 }
327 for(ll=0; ll<m-ii; ll++)
328 {
329 pD[ll+ii*sdd+ps*2] += w2 * pD[ll+ii*sdd+ps*1];
330 pD[ll+ii*sdd+ps*3] += w3 * pD[ll+ii*sdd+ps*1];
331 }
332 if(m==2)
333 return;
334 // third column
335 beta = 0.0;
336 if(m>3)
337 {
338 tmp = pD[3+ps*2];
339 beta += tmp*tmp;
340 }
341 for(ii=4; ii<m-3; ii+=4)
342 {
343 tmp = pD[0+ii*sdd+ps*2];
344 beta += tmp*tmp;
345 tmp = pD[1+ii*sdd+ps*2];
346 beta += tmp*tmp;
347 tmp = pD[2+ii*sdd+ps*2];
348 beta += tmp*tmp;
349 tmp = pD[3+ii*sdd+ps*2];
350 beta += tmp*tmp;
351 }
352 for(ll=0; ll<m-ii; ll++)
353 {
354 tmp = pD[ll+ii*sdd+ps*2];
355 beta += tmp*tmp;
356 }
357 if(beta==0.0)
358 {
359 // tau
360 dD[2] = 0.0;
361 }
362 else
363 {
364 alpha = pD[2+ps*2];
365 beta += alpha*alpha;
366 beta = sqrt(beta);
367 if(alpha>0)
368 beta = -beta;
369 // tau0
370 dD[2] = (beta-alpha) / beta;
371 tmp = 1.0 / (alpha-beta);
372 // compute v0
373 pD[2+ps*2] = beta;
374 if(m>3)
375 {
376 pD[3+ps*2] *= tmp;
377 }
378 for(ii=4; ii<m-3; ii+=4)
379 {
380 pD[0+ii*sdd+ps*2] *= tmp;
381 pD[1+ii*sdd+ps*2] *= tmp;
382 pD[2+ii*sdd+ps*2] *= tmp;
383 pD[3+ii*sdd+ps*2] *= tmp;
384 }
385 for(ll=0; ll<m-ii; ll++)
386 {
387 pD[ll+ii*sdd+ps*2] *= tmp;
388 }
389 }
390 // gemv_t & ger
391 w3 = pD[2+ps*3];
392 if(m>3)
393 {
394 w3 += pD[3+ps*3] * pD[3+ps*2];
395 }
396 for(ii=4; ii<m-3; ii+=4)
397 {
398 w3 += pD[0+ii*sdd+ps*3] * pD[0+ii*sdd+ps*2];
399 w3 += pD[1+ii*sdd+ps*3] * pD[1+ii*sdd+ps*2];
400 w3 += pD[2+ii*sdd+ps*3] * pD[2+ii*sdd+ps*2];
401 w3 += pD[3+ii*sdd+ps*3] * pD[3+ii*sdd+ps*2];
402 }
403 for(ll=0; ll<m-ii; ll++)
404 {
405 w3 += pD[ll+ii*sdd+ps*3] * pD[ll+ii*sdd+ps*2];
406 }
407 w3 = - dD[2] * w3;
408 pD[2+ps*3] += w3;
409 if(m>3)
410 {
411 pD[3+ps*3] += w3 * pD[3+ps*2];
412 }
413 for(ii=4; ii<m-3; ii+=4)
414 {
415 pD[0+ii*sdd+ps*3] += w3 * pD[0+ii*sdd+ps*2];
416 pD[1+ii*sdd+ps*3] += w3 * pD[1+ii*sdd+ps*2];
417 pD[2+ii*sdd+ps*3] += w3 * pD[2+ii*sdd+ps*2];
418 pD[3+ii*sdd+ps*3] += w3 * pD[3+ii*sdd+ps*2];
419 }
420 for(ll=0; ll<m-ii; ll++)
421 {
422 pD[ll+ii*sdd+ps*3] += w3 * pD[ll+ii*sdd+ps*2];
423 }
424 if(m==3)
425 return;
426 // fourth column
427 beta = 0.0;
428 for(ii=4; ii<m-3; ii+=4)
429 {
430 tmp = pD[0+ii*sdd+ps*3];
431 beta += tmp*tmp;
432 tmp = pD[1+ii*sdd+ps*3];
433 beta += tmp*tmp;
434 tmp = pD[2+ii*sdd+ps*3];
435 beta += tmp*tmp;
436 tmp = pD[3+ii*sdd+ps*3];
437 beta += tmp*tmp;
438 }
439 for(ll=0; ll<m-ii; ll++)
440 {
441 tmp = pD[ll+ii*sdd+ps*3];
442 beta += tmp*tmp;
443 }
444 if(beta==0.0)
445 {
446 // tau
447 dD[3] = 0.0;
448 }
449 else
450 {
451 alpha = pD[3+ps*3];
452 beta += alpha*alpha;
453 beta = sqrt(beta);
454 if(alpha>0)
455 beta = -beta;
456 // tau0
457 dD[3] = (beta-alpha) / beta;
458 tmp = 1.0 / (alpha-beta);
459 // compute v0
460 pD[3+ps*3] = beta;
461 for(ii=4; ii<m-3; ii+=4)
462 {
463 pD[0+ii*sdd+ps*3] *= tmp;
464 pD[1+ii*sdd+ps*3] *= tmp;
465 pD[2+ii*sdd+ps*3] *= tmp;
466 pD[3+ii*sdd+ps*3] *= tmp;
467 }
468 for(ll=0; ll<m-ii; ll++)
469 {
470 pD[ll+ii*sdd+ps*3] *= tmp;
471 }
472 }
473 return;
474 }
475
476
477// unblocked algorithm
478void kernel_dgeqrf_vs_lib4(int m, int n, int k, int offD, double *pD, int sdd, double *dD)
479 {
480 if(m<=0 | n<=0)
481 return;
482 int ii, jj, kk, ll, imax, jmax, jmax0, kmax, kmax0;
483 const int ps = 4;
484 imax = k; //m<n ? m : n;
485 double alpha, beta, tmp, w0;
486 double *pC00, *pC10, *pC01, *pC11;
487 int offset;
488 double *pD0 = pD-offD;
489 for(ii=0; ii<imax; ii++)
490 {
491 pC00 = &pD0[((offD+ii)&(ps-1))+((offD+ii)-((offD+ii)&(ps-1)))*sdd+ii*ps];
492 pC10 = &pD0[((offD+ii+1)&(ps-1))+((offD+ii+1)-((offD+ii+1)&(ps-1)))*sdd+ii*ps];
493 beta = 0.0;
494 jmax = m-ii-1;
495 jmax0 = (ps-((ii+1+offD)&(ps-1)))&(ps-1);
496 jmax0 = jmax<jmax0 ? jmax : jmax0;
497 offset = 0;
498 jj = 0;
499 if(jmax0>0)
500 {
501 for( ; jj<jmax0; jj++)
502 {
503 tmp = pC10[0+offset];
504 beta += tmp*tmp;
505 offset += 1;
506 }
507 offset += -ps+ps*sdd;
508 }
509 for( ; jj<jmax-3; jj+=4)
510 {
511 tmp = pC10[0+offset];
512 beta += tmp*tmp;
513 tmp = pC10[1+offset];
514 beta += tmp*tmp;
515 tmp = pC10[2+offset];
516 beta += tmp*tmp;
517 tmp = pC10[3+offset];
518 beta += tmp*tmp;
519 offset += ps*sdd;
520 }
521 for(ll=0; ll<jmax-jj; ll++)
522 {
523 tmp = pC10[0+offset];
524 beta += tmp*tmp;
525 offset += 1;
526 }
527 if(beta==0.0)
528 {
529 dD[ii] = 0.0;
530 }
531 else
532 {
533 alpha = pC00[0];
534 beta += alpha*alpha;
535 beta = sqrt(beta);
536 if(alpha>0)
537 beta = -beta;
538 dD[ii] = (beta-alpha) / beta;
539 tmp = 1.0 / (alpha-beta);
540 offset = 0;
541 jj = 0;
542 if(jmax0>0)
543 {
544 for( ; jj<jmax0; jj++)
545 {
546 pC10[0+offset] *= tmp;
547 offset += 1;
548 }
549 offset += -ps+ps*sdd;
550 }
551 for( ; jj<jmax-3; jj+=4)
552 {
553 pC10[0+offset] *= tmp;
554 pC10[1+offset] *= tmp;
555 pC10[2+offset] *= tmp;
556 pC10[3+offset] *= tmp;
557 offset += ps*sdd;
558 }
559 for(ll=0; ll<jmax-jj; ll++)
560 {
561 pC10[0+offset] *= tmp;
562 offset += 1;
563 }
564 pC00[0] = beta;
565 }
566 if(ii<n)
567 {
568 pC01 = pC00 + ps;
569 pC11 = pC10 + ps;
570 kmax = jmax;
571 kmax0 = jmax0;
572 jmax = n-ii-1;
573 jj = 0;
574 for( ; jj<jmax; jj++)
575 {
576 w0 = pC01[0+ps*jj] * 1.0;
577 offset = 0;
578 kk = 0;
579 if(kmax0>0)
580 {
581 for( ; kk<kmax0; kk++)
582 {
583 w0 += pC11[0+offset+ps*jj] * pC10[0+offset];
584 offset += 1;
585 }
586 offset += -ps+ps*sdd;
587 }
588 for( ; kk<kmax-3; kk+=4)
589 {
590 w0 += pC11[0+offset+ps*jj] * pC10[0+offset];
591 w0 += pC11[1+offset+ps*jj] * pC10[1+offset];
592 w0 += pC11[2+offset+ps*jj] * pC10[2+offset];
593 w0 += pC11[3+offset+ps*jj] * pC10[3+offset];
594 offset += ps*sdd;
595 }
596 for(ll=0; ll<kmax-kk; ll++)
597 {
598 w0 += pC11[0+offset+ps*jj] * pC10[0+offset];
599 offset += 1;
600 }
601 w0 = - dD[ii] * w0;
602 pC01[0+ps*jj] += w0;
603 offset = 0;
604 kk = 0;
605 if(kmax0>0)
606 {
607 for( ; kk<kmax0; kk++)
608 {
609 pC11[0+offset+ps*jj] += w0 * pC10[0+offset];
610 offset += 1;
611 }
612 offset = offset-ps+ps*sdd;
613 }
614 for( ; kk<kmax-3; kk+=4)
615 {
616 pC11[0+offset+ps*jj] += w0 * pC10[0+offset];
617 pC11[1+offset+ps*jj] += w0 * pC10[1+offset];
618 pC11[2+offset+ps*jj] += w0 * pC10[2+offset];
619 pC11[3+offset+ps*jj] += w0 * pC10[3+offset];
620 offset += ps*sdd;
621 }
622 for(ll=0; ll<kmax-kk; ll++)
623 {
624 pC11[0+offset+ps*jj] += w0 * pC10[0+offset];
625 offset += 1;
626 }
627 }
628 }
629 }
630 return;
631 }
632
633
634
635void kernel_dlarf_4_lib4(int m, int n, double *pD, int sdd, double *dD, double *pC0, int sdc)
636 {
637 if(m<=0 | n<=0)
638 return;
639 int ii, jj, ll;
640 const int ps = 4;
641 double v10,
642 v20, v21,
643 v30, v31, v32;
644 double tmp, d0, d1, d2, d3;
645 double *pC;
646 double pT[16];// = {};
647 int ldt = 4;
648 double pW[8];// = {};
649 int ldw = 2;
650 // dot product of v
651 v10 = 0.0;
652 v20 = 0.0;
653 v30 = 0.0;
654 v21 = 0.0;
655 v31 = 0.0;
656 v32 = 0.0;
657 if(m>1)
658 {
659 v10 = 1.0 * pD[1+ps*0];
660 if(m>2)
661 {
662 v10 += pD[2+ps*1] * pD[2+ps*0];
663 v20 = 1.0 * pD[2+ps*0];
664 v21 = 1.0 * pD[2+ps*1];
665 if(m>3)
666 {
667 v10 += pD[3+ps*1] * pD[3+ps*0];
668 v20 += pD[3+ps*2] * pD[3+ps*0];
669 v21 += pD[3+ps*2] * pD[3+ps*1];
670 v30 = 1.0 * pD[3+ps*0];
671 v31 = 1.0 * pD[3+ps*1];
672 v32 = 1.0 * pD[3+ps*2];
673 }
674 }
675 }
676 for(ii=4; ii<m-3; ii+=4)
677 {
678 v10 += pD[0+ii*sdd+ps*1] * pD[0+ii*sdd+ps*0];
679 v20 += pD[0+ii*sdd+ps*2] * pD[0+ii*sdd+ps*0];
680 v21 += pD[0+ii*sdd+ps*2] * pD[0+ii*sdd+ps*1];
681 v30 += pD[0+ii*sdd+ps*3] * pD[0+ii*sdd+ps*0];
682 v31 += pD[0+ii*sdd+ps*3] * pD[0+ii*sdd+ps*1];
683 v32 += pD[0+ii*sdd+ps*3] * pD[0+ii*sdd+ps*2];
684 v10 += pD[1+ii*sdd+ps*1] * pD[1+ii*sdd+ps*0];
685 v20 += pD[1+ii*sdd+ps*2] * pD[1+ii*sdd+ps*0];
686 v21 += pD[1+ii*sdd+ps*2] * pD[1+ii*sdd+ps*1];
687 v30 += pD[1+ii*sdd+ps*3] * pD[1+ii*sdd+ps*0];
688 v31 += pD[1+ii*sdd+ps*3] * pD[1+ii*sdd+ps*1];
689 v32 += pD[1+ii*sdd+ps*3] * pD[1+ii*sdd+ps*2];
690 v10 += pD[2+ii*sdd+ps*1] * pD[2+ii*sdd+ps*0];
691 v20 += pD[2+ii*sdd+ps*2] * pD[2+ii*sdd+ps*0];
692 v21 += pD[2+ii*sdd+ps*2] * pD[2+ii*sdd+ps*1];
693 v30 += pD[2+ii*sdd+ps*3] * pD[2+ii*sdd+ps*0];
694 v31 += pD[2+ii*sdd+ps*3] * pD[2+ii*sdd+ps*1];
695 v32 += pD[2+ii*sdd+ps*3] * pD[2+ii*sdd+ps*2];
696 v10 += pD[3+ii*sdd+ps*1] * pD[3+ii*sdd+ps*0];
697 v20 += pD[3+ii*sdd+ps*2] * pD[3+ii*sdd+ps*0];
698 v21 += pD[3+ii*sdd+ps*2] * pD[3+ii*sdd+ps*1];
699 v30 += pD[3+ii*sdd+ps*3] * pD[3+ii*sdd+ps*0];
700 v31 += pD[3+ii*sdd+ps*3] * pD[3+ii*sdd+ps*1];
701 v32 += pD[3+ii*sdd+ps*3] * pD[3+ii*sdd+ps*2];
702 }
703 for(ll=0; ll<m-ii; ll++)
704 {
705 v10 += pD[ll+ii*sdd+ps*1] * pD[ll+ii*sdd+ps*0];
706 v20 += pD[ll+ii*sdd+ps*2] * pD[ll+ii*sdd+ps*0];
707 v21 += pD[ll+ii*sdd+ps*2] * pD[ll+ii*sdd+ps*1];
708 v30 += pD[ll+ii*sdd+ps*3] * pD[ll+ii*sdd+ps*0];
709 v31 += pD[ll+ii*sdd+ps*3] * pD[ll+ii*sdd+ps*1];
710 v32 += pD[ll+ii*sdd+ps*3] * pD[ll+ii*sdd+ps*2];
711 }
712 // compute lower triangular T containing tau for matrix update
713 pT[0+ldt*0] = dD[0];
714 pT[1+ldt*1] = dD[1];
715 pT[2+ldt*2] = dD[2];
716 pT[3+ldt*3] = dD[3];
717 pT[1+ldt*0] = - dD[1] * (v10*pT[0+ldt*0]);
718 pT[2+ldt*1] = - dD[2] * (v21*pT[1+ldt*1]);
719 pT[3+ldt*2] = - dD[3] * (v32*pT[2+ldt*2]);
720 pT[2+ldt*0] = - dD[2] * (v20*pT[0+ldt*0] + v21*pT[1+ldt*0]);
721 pT[3+ldt*1] = - dD[3] * (v31*pT[1+ldt*1] + v32*pT[2+ldt*1]);
722 pT[3+ldt*0] = - dD[3] * (v30*pT[0+ldt*0] + v31*pT[1+ldt*0] + v32*pT[2+ldt*0]);
723 // downgrade matrix
724 pW[0] = 0.0;
725 pW[1] = 0.0;
726 pW[2] = 0.0;
727 pW[3] = 0.0;
728 pW[4] = 0.0;
729 pW[5] = 0.0;
730 pW[6] = 0.0;
731 pW[7] = 0.0;
732 ii = 0;
733 for( ; ii<n-1; ii+=2)
734 {
735 pC = pC0+ii*ps;
736 // compute W^T = C^T * V
737 tmp = pC[0+ps*0];
738 pW[0+ldw*0] = tmp;
739 tmp = pC[0+ps*1];
740 pW[1+ldw*0] = tmp;
741 if(m>1)
742 {
743 d0 = pD[1+ps*0];
744 tmp = pC[1+ps*0];
745 pW[0+ldw*0] += tmp * d0;
746 pW[0+ldw*1] = tmp;
747 tmp = pC[1+ps*1];
748 pW[1+ldw*0] += tmp * d0;
749 pW[1+ldw*1] = tmp;
750 if(m>2)
751 {
752 d0 = pD[2+ps*0];
753 d1 = pD[2+ps*1];
754 tmp = pC[2+ps*0];
755 pW[0+ldw*0] += tmp * d0;
756 pW[0+ldw*1] += tmp * d1;
757 pW[0+ldw*2] = tmp;
758 tmp = pC[2+ps*1];
759 pW[1+ldw*0] += tmp * d0;
760 pW[1+ldw*1] += tmp * d1;
761 pW[1+ldw*2] = tmp;
762 if(m>3)
763 {
764 d0 = pD[3+ps*0];
765 d1 = pD[3+ps*1];
766 d2 = pD[3+ps*2];
767 tmp = pC[3+ps*0];
768 pW[0+ldw*0] += tmp * d0;
769 pW[0+ldw*1] += tmp * d1;
770 pW[0+ldw*2] += tmp * d2;
771 pW[0+ldw*3] = tmp;
772 tmp = pC[3+ps*1];
773 pW[1+ldw*0] += tmp * d0;
774 pW[1+ldw*1] += tmp * d1;
775 pW[1+ldw*2] += tmp * d2;
776 pW[1+ldw*3] = tmp;
777 }
778 }
779 }
780 for(jj=4; jj<m-3; jj+=4)
781 {
782 //
783 d0 = pD[0+jj*sdd+ps*0];
784 d1 = pD[0+jj*sdd+ps*1];
785 d2 = pD[0+jj*sdd+ps*2];
786 d3 = pD[0+jj*sdd+ps*3];
787 tmp = pC[0+jj*sdc+ps*0];
788 pW[0+ldw*0] += tmp * d0;
789 pW[0+ldw*1] += tmp * d1;
790 pW[0+ldw*2] += tmp * d2;
791 pW[0+ldw*3] += tmp * d3;
792 tmp = pC[0+jj*sdc+ps*1];
793 pW[1+ldw*0] += tmp * d0;
794 pW[1+ldw*1] += tmp * d1;
795 pW[1+ldw*2] += tmp * d2;
796 pW[1+ldw*3] += tmp * d3;
797 //
798 d0 = pD[1+jj*sdd+ps*0];
799 d1 = pD[1+jj*sdd+ps*1];
800 d2 = pD[1+jj*sdd+ps*2];
801 d3 = pD[1+jj*sdd+ps*3];
802 tmp = pC[1+jj*sdc+ps*0];
803 pW[0+ldw*0] += tmp * d0;
804 pW[0+ldw*1] += tmp * d1;
805 pW[0+ldw*2] += tmp * d2;
806 pW[0+ldw*3] += tmp * d3;
807 tmp = pC[1+jj*sdc+ps*1];
808 pW[1+ldw*0] += tmp * d0;
809 pW[1+ldw*1] += tmp * d1;
810 pW[1+ldw*2] += tmp * d2;
811 pW[1+ldw*3] += tmp * d3;
812 //
813 d0 = pD[2+jj*sdd+ps*0];
814 d1 = pD[2+jj*sdd+ps*1];
815 d2 = pD[2+jj*sdd+ps*2];
816 d3 = pD[2+jj*sdd+ps*3];
817 tmp = pC[2+jj*sdc+ps*0];
818 pW[0+ldw*0] += tmp * d0;
819 pW[0+ldw*1] += tmp * d1;
820 pW[0+ldw*2] += tmp * d2;
821 pW[0+ldw*3] += tmp * d3;
822 tmp = pC[2+jj*sdc+ps*1];
823 pW[1+ldw*0] += tmp * d0;
824 pW[1+ldw*1] += tmp * d1;
825 pW[1+ldw*2] += tmp * d2;
826 pW[1+ldw*3] += tmp * d3;
827 //
828 d0 = pD[3+jj*sdd+ps*0];
829 d1 = pD[3+jj*sdd+ps*1];
830 d2 = pD[3+jj*sdd+ps*2];
831 d3 = pD[3+jj*sdd+ps*3];
832 tmp = pC[3+jj*sdc+ps*0];
833 pW[0+ldw*0] += tmp * d0;
834 pW[0+ldw*1] += tmp * d1;
835 pW[0+ldw*2] += tmp * d2;
836 pW[0+ldw*3] += tmp * d3;
837 tmp = pC[3+jj*sdc+ps*1];
838 pW[1+ldw*0] += tmp * d0;
839 pW[1+ldw*1] += tmp * d1;
840 pW[1+ldw*2] += tmp * d2;
841 pW[1+ldw*3] += tmp * d3;
842 }
843 for(ll=0; ll<m-jj; ll++)
844 {
845 d0 = pD[ll+jj*sdd+ps*0];
846 d1 = pD[ll+jj*sdd+ps*1];
847 d2 = pD[ll+jj*sdd+ps*2];
848 d3 = pD[ll+jj*sdd+ps*3];
849 tmp = pC[ll+jj*sdc+ps*0];
850 pW[0+ldw*0] += tmp * d0;
851 pW[0+ldw*1] += tmp * d1;
852 pW[0+ldw*2] += tmp * d2;
853 pW[0+ldw*3] += tmp * d3;
854 tmp = pC[ll+jj*sdc+ps*1];
855 pW[1+ldw*0] += tmp * d0;
856 pW[1+ldw*1] += tmp * d1;
857 pW[1+ldw*2] += tmp * d2;
858 pW[1+ldw*3] += tmp * d3;
859 }
860 // compute W^T *= T
861 pW[0+ldw*3] = pT[3+ldt*0]*pW[0+ldw*0] + pT[3+ldt*1]*pW[0+ldw*1] + pT[3+ldt*2]*pW[0+ldw*2] + pT[3+ldt*3]*pW[0+ldw*3];
862 pW[1+ldw*3] = pT[3+ldt*0]*pW[1+ldw*0] + pT[3+ldt*1]*pW[1+ldw*1] + pT[3+ldt*2]*pW[1+ldw*2] + pT[3+ldt*3]*pW[1+ldw*3];
863 pW[0+ldw*2] = pT[2+ldt*0]*pW[0+ldw*0] + pT[2+ldt*1]*pW[0+ldw*1] + pT[2+ldt*2]*pW[0+ldw*2];
864 pW[1+ldw*2] = pT[2+ldt*0]*pW[1+ldw*0] + pT[2+ldt*1]*pW[1+ldw*1] + pT[2+ldt*2]*pW[1+ldw*2];
865 pW[0+ldw*1] = pT[1+ldt*0]*pW[0+ldw*0] + pT[1+ldt*1]*pW[0+ldw*1];
866 pW[1+ldw*1] = pT[1+ldt*0]*pW[1+ldw*0] + pT[1+ldt*1]*pW[1+ldw*1];
867 pW[0+ldw*0] = pT[0+ldt*0]*pW[0+ldw*0];
868 pW[1+ldw*0] = pT[0+ldt*0]*pW[1+ldw*0];
869 // compute C -= V * W^T
870 pC[0+ps*0] -= pW[0+ldw*0];
871 pC[0+ps*1] -= pW[1+ldw*0];
872 if(m>1)
873 {
874 pC[1+ps*0] -= pD[1+ps*0]*pW[0+ldw*0] + pW[0+ldw*1];
875 pC[1+ps*1] -= pD[1+ps*0]*pW[1+ldw*0] + pW[1+ldw*1];
876 if(m>2)
877 {
878 pC[2+ps*0] -= pD[2+ps*0]*pW[0+ldw*0] + pD[2+ps*1]*pW[0+ldw*1] + pW[0+ldw*2];
879 pC[2+ps*1] -= pD[2+ps*0]*pW[1+ldw*0] + pD[2+ps*1]*pW[1+ldw*1] + pW[1+ldw*2];
880 if(m>3)
881 {
882 pC[3+ps*0] -= pD[3+ps*0]*pW[0+ldw*0] + pD[3+ps*1]*pW[0+ldw*1] + pD[3+ps*2]*pW[0+ldw*2] + pW[0+ldw*3];
883 pC[3+ps*1] -= pD[3+ps*0]*pW[1+ldw*0] + pD[3+ps*1]*pW[1+ldw*1] + pD[3+ps*2]*pW[1+ldw*2] + pW[1+ldw*3];
884 }
885 }
886 }
887 for(jj=4; jj<m-3; jj+=4)
888 {
889 //
890 d0 = pD[0+jj*sdd+ps*0];
891 d1 = pD[0+jj*sdd+ps*1];
892 d2 = pD[0+jj*sdd+ps*2];
893 d3 = pD[0+jj*sdd+ps*3];
894 pC[0+jj*sdc+ps*0] -= d0*pW[0+ldw*0] + d1*pW[0+ldw*1] + d2*pW[0+ldw*2] + d3*pW[0+ldw*3];
895 pC[0+jj*sdc+ps*1] -= d0*pW[1+ldw*0] + d1*pW[1+ldw*1] + d2*pW[1+ldw*2] + d3*pW[1+ldw*3];
896 //
897 d0 = pD[1+jj*sdd+ps*0];
898 d1 = pD[1+jj*sdd+ps*1];
899 d2 = pD[1+jj*sdd+ps*2];
900 d3 = pD[1+jj*sdd+ps*3];
901 pC[1+jj*sdc+ps*0] -= d0*pW[0+ldw*0] + d1*pW[0+ldw*1] + d2*pW[0+ldw*2] + d3*pW[0+ldw*3];
902 pC[1+jj*sdc+ps*1] -= d0*pW[1+ldw*0] + d1*pW[1+ldw*1] + d2*pW[1+ldw*2] + d3*pW[1+ldw*3];
903 //
904 d0 = pD[2+jj*sdd+ps*0];
905 d1 = pD[2+jj*sdd+ps*1];
906 d2 = pD[2+jj*sdd+ps*2];
907 d3 = pD[2+jj*sdd+ps*3];
908 pC[2+jj*sdc+ps*0] -= d0*pW[0+ldw*0] + d1*pW[0+ldw*1] + d2*pW[0+ldw*2] + d3*pW[0+ldw*3];
909 pC[2+jj*sdc+ps*1] -= d0*pW[1+ldw*0] + d1*pW[1+ldw*1] + d2*pW[1+ldw*2] + d3*pW[1+ldw*3];
910 //
911 d0 = pD[3+jj*sdd+ps*0];
912 d1 = pD[3+jj*sdd+ps*1];
913 d2 = pD[3+jj*sdd+ps*2];
914 d3 = pD[3+jj*sdd+ps*3];
915 pC[3+jj*sdc+ps*0] -= d0*pW[0+ldw*0] + d1*pW[0+ldw*1] + d2*pW[0+ldw*2] + d3*pW[0+ldw*3];
916 pC[3+jj*sdc+ps*1] -= d0*pW[1+ldw*0] + d1*pW[1+ldw*1] + d2*pW[1+ldw*2] + d3*pW[1+ldw*3];
917 }
918 for(ll=0; ll<m-jj; ll++)
919 {
920 d0 = pD[ll+jj*sdd+ps*0];
921 d1 = pD[ll+jj*sdd+ps*1];
922 d2 = pD[ll+jj*sdd+ps*2];
923 d3 = pD[ll+jj*sdd+ps*3];
924 pC[ll+jj*sdc+ps*0] -= d0*pW[0+ldw*0] + d1*pW[0+ldw*1] + d2*pW[0+ldw*2] + d3*pW[0+ldw*3];
925 pC[ll+jj*sdc+ps*1] -= d0*pW[1+ldw*0] + d1*pW[1+ldw*1] + d2*pW[1+ldw*2] + d3*pW[1+ldw*3];
926 }
927 }
928 for( ; ii<n; ii++)
929 {
930 pC = pC0+ii*ps;
931 // compute W^T = C^T * V
932 tmp = pC[0+ps*0];
933 pW[0+ldw*0] = tmp;
934 if(m>1)
935 {
936 tmp = pC[1+ps*0];
937 pW[0+ldw*0] += tmp * pD[1+ps*0];
938 pW[0+ldw*1] = tmp;
939 if(m>2)
940 {
941 tmp = pC[2+ps*0];
942 pW[0+ldw*0] += tmp * pD[2+ps*0];
943 pW[0+ldw*1] += tmp * pD[2+ps*1];
944 pW[0+ldw*2] = tmp;
945 if(m>3)
946 {
947 tmp = pC[3+ps*0];
948 pW[0+ldw*0] += tmp * pD[3+ps*0];
949 pW[0+ldw*1] += tmp * pD[3+ps*1];
950 pW[0+ldw*2] += tmp * pD[3+ps*2];
951 pW[0+ldw*3] = tmp;
952 }
953 }
954 }
955 for(jj=4; jj<m-3; jj+=4)
956 {
957 tmp = pC[0+jj*sdc+ps*0];
958 pW[0+ldw*0] += tmp * pD[0+jj*sdd+ps*0];
959 pW[0+ldw*1] += tmp * pD[0+jj*sdd+ps*1];
960 pW[0+ldw*2] += tmp * pD[0+jj*sdd+ps*2];
961 pW[0+ldw*3] += tmp * pD[0+jj*sdd+ps*3];
962 tmp = pC[1+jj*sdc+ps*0];
963 pW[0+ldw*0] += tmp * pD[1+jj*sdd+ps*0];
964 pW[0+ldw*1] += tmp * pD[1+jj*sdd+ps*1];
965 pW[0+ldw*2] += tmp * pD[1+jj*sdd+ps*2];
966 pW[0+ldw*3] += tmp * pD[1+jj*sdd+ps*3];
967 tmp = pC[2+jj*sdc+ps*0];
968 pW[0+ldw*0] += tmp * pD[2+jj*sdd+ps*0];
969 pW[0+ldw*1] += tmp * pD[2+jj*sdd+ps*1];
970 pW[0+ldw*2] += tmp * pD[2+jj*sdd+ps*2];
971 pW[0+ldw*3] += tmp * pD[2+jj*sdd+ps*3];
972 tmp = pC[3+jj*sdc+ps*0];
973 pW[0+ldw*0] += tmp * pD[3+jj*sdd+ps*0];
974 pW[0+ldw*1] += tmp * pD[3+jj*sdd+ps*1];
975 pW[0+ldw*2] += tmp * pD[3+jj*sdd+ps*2];
976 pW[0+ldw*3] += tmp * pD[3+jj*sdd+ps*3];
977 }
978 for(ll=0; ll<m-jj; ll++)
979 {
980 tmp = pC[ll+jj*sdc+ps*0];
981 pW[0+ldw*0] += tmp * pD[ll+jj*sdd+ps*0];
982 pW[0+ldw*1] += tmp * pD[ll+jj*sdd+ps*1];
983 pW[0+ldw*2] += tmp * pD[ll+jj*sdd+ps*2];
984 pW[0+ldw*3] += tmp * pD[ll+jj*sdd+ps*3];
985 }
986 // compute W^T *= T
987 pW[0+ldw*3] = pT[3+ldt*0]*pW[0+ldw*0] + pT[3+ldt*1]*pW[0+ldw*1] + pT[3+ldt*2]*pW[0+ldw*2] + pT[3+ldt*3]*pW[0+ldw*3];
988 pW[0+ldw*2] = pT[2+ldt*0]*pW[0+ldw*0] + pT[2+ldt*1]*pW[0+ldw*1] + pT[2+ldt*2]*pW[0+ldw*2];
989 pW[0+ldw*1] = pT[1+ldt*0]*pW[0+ldw*0] + pT[1+ldt*1]*pW[0+ldw*1];
990 pW[0+ldw*0] = pT[0+ldt*0]*pW[0+ldw*0];
991 // compute C -= V * W^T
992 pC[0+ps*0] -= pW[0+ldw*0];
993 if(m>1)
994 {
995 pC[1+ps*0] -= pD[1+ps*0]*pW[0+ldw*0] + pW[0+ldw*1];
996 if(m>2)
997 {
998 pC[2+ps*0] -= pD[2+ps*0]*pW[0+ldw*0] + pD[2+ps*1]*pW[0+ldw*1] + pW[0+ldw*2];
999 if(m>3)
1000 {
1001 pC[3+ps*0] -= pD[3+ps*0]*pW[0+ldw*0] + pD[3+ps*1]*pW[0+ldw*1] + pD[3+ps*2]*pW[0+ldw*2] + pW[0+ldw*3];
1002 }
1003 }
1004 }
1005 for(jj=4; jj<m-3; jj+=4)
1006 {
1007 pC[0+jj*sdc+ps*0] -= pD[0+jj*sdd+ps*0]*pW[0+ldw*0] + pD[0+jj*sdd+ps*1]*pW[0+ldw*1] + pD[0+jj*sdd+ps*2]*pW[0+ldw*2] + pD[0+jj*sdd+ps*3]*pW[0+ldw*3];
1008 pC[1+jj*sdc+ps*0] -= pD[1+jj*sdd+ps*0]*pW[0+ldw*0] + pD[1+jj*sdd+ps*1]*pW[0+ldw*1] + pD[1+jj*sdd+ps*2]*pW[0+ldw*2] + pD[1+jj*sdd+ps*3]*pW[0+ldw*3];
1009 pC[2+jj*sdc+ps*0] -= pD[2+jj*sdd+ps*0]*pW[0+ldw*0] + pD[2+jj*sdd+ps*1]*pW[0+ldw*1] + pD[2+jj*sdd+ps*2]*pW[0+ldw*2] + pD[2+jj*sdd+ps*3]*pW[0+ldw*3];
1010 pC[3+jj*sdc+ps*0] -= pD[3+jj*sdd+ps*0]*pW[0+ldw*0] + pD[3+jj*sdd+ps*1]*pW[0+ldw*1] + pD[3+jj*sdd+ps*2]*pW[0+ldw*2] + pD[3+jj*sdd+ps*3]*pW[0+ldw*3];
1011 }
1012 for(ll=0; ll<m-jj; ll++)
1013 {
1014 pC[ll+jj*sdc+ps*0] -= pD[ll+jj*sdd+ps*0]*pW[0+ldw*0] + pD[ll+jj*sdd+ps*1]*pW[0+ldw*1] + pD[ll+jj*sdd+ps*2]*pW[0+ldw*2] + pD[ll+jj*sdd+ps*3]*pW[0+ldw*3];
1015 }
1016 }
1017
1018 return;
1019 }
1020
1021
1022
1023void kernel_dlarf_t_4_lib4(int m, int n, double *pD, int sdd, double *pVt, double *dD, double *pC0, int sdc)
1024 {
1025 if(m<=0 | n<=0)
1026 return;
1027 int ii, jj, ll;
1028 const int ps = 4;
1029 double v10,
1030 v20, v21,
1031 v30, v31, v32;
1032 double c00, c01,
1033 c10, c11,
1034 c20, c21,
1035 c30, c31;
1036 double a0, a1, a2, a3, b0, b1;
1037 double tmp, d0, d1, d2, d3;
1038 double *pC;
1039 double pT[16];// = {};
1040 int ldt = 4;
1041 double pW[8];// = {};
1042 int ldw = 4;
1043 // dot product of v
1044 v10 = 0.0;
1045 v20 = 0.0;
1046 v30 = 0.0;
1047 v21 = 0.0;
1048 v31 = 0.0;
1049 v32 = 0.0;
1050 if(m>1)
1051 {
1052 v10 = 1.0 * pD[1+ps*0];
1053 if(m>2)
1054 {
1055 v10 += pD[2+ps*1] * pD[2+ps*0];
1056 v20 = 1.0 * pD[2+ps*0];
1057 v21 = 1.0 * pD[2+ps*1];
1058 if(m>3)
1059 {
1060 v10 += pD[3+ps*1] * pD[3+ps*0];
1061 v20 += pD[3+ps*2] * pD[3+ps*0];
1062 v21 += pD[3+ps*2] * pD[3+ps*1];
1063 v30 = 1.0 * pD[3+ps*0];
1064 v31 = 1.0 * pD[3+ps*1];
1065 v32 = 1.0 * pD[3+ps*2];
1066 }
1067 }
1068 }
1069 for(ii=4; ii<m-3; ii+=4)
1070 {
1071 v10 += pD[0+ii*sdd+ps*1] * pD[0+ii*sdd+ps*0];
1072 v20 += pD[0+ii*sdd+ps*2] * pD[0+ii*sdd+ps*0];
1073 v21 += pD[0+ii*sdd+ps*2] * pD[0+ii*sdd+ps*1];
1074 v30 += pD[0+ii*sdd+ps*3] * pD[0+ii*sdd+ps*0];
1075 v31 += pD[0+ii*sdd+ps*3] * pD[0+ii*sdd+ps*1];
1076 v32 += pD[0+ii*sdd+ps*3] * pD[0+ii*sdd+ps*2];
1077 v10 += pD[1+ii*sdd+ps*1] * pD[1+ii*sdd+ps*0];
1078 v20 += pD[1+ii*sdd+ps*2] * pD[1+ii*sdd+ps*0];
1079 v21 += pD[1+ii*sdd+ps*2] * pD[1+ii*sdd+ps*1];
1080 v30 += pD[1+ii*sdd+ps*3] * pD[1+ii*sdd+ps*0];
1081 v31 += pD[1+ii*sdd+ps*3] * pD[1+ii*sdd+ps*1];
1082 v32 += pD[1+ii*sdd+ps*3] * pD[1+ii*sdd+ps*2];
1083 v10 += pD[2+ii*sdd+ps*1] * pD[2+ii*sdd+ps*0];
1084 v20 += pD[2+ii*sdd+ps*2] * pD[2+ii*sdd+ps*0];
1085 v21 += pD[2+ii*sdd+ps*2] * pD[2+ii*sdd+ps*1];
1086 v30 += pD[2+ii*sdd+ps*3] * pD[2+ii*sdd+ps*0];
1087 v31 += pD[2+ii*sdd+ps*3] * pD[2+ii*sdd+ps*1];
1088 v32 += pD[2+ii*sdd+ps*3] * pD[2+ii*sdd+ps*2];
1089 v10 += pD[3+ii*sdd+ps*1] * pD[3+ii*sdd+ps*0];
1090 v20 += pD[3+ii*sdd+ps*2] * pD[3+ii*sdd+ps*0];
1091 v21 += pD[3+ii*sdd+ps*2] * pD[3+ii*sdd+ps*1];
1092 v30 += pD[3+ii*sdd+ps*3] * pD[3+ii*sdd+ps*0];
1093 v31 += pD[3+ii*sdd+ps*3] * pD[3+ii*sdd+ps*1];
1094 v32 += pD[3+ii*sdd+ps*3] * pD[3+ii*sdd+ps*2];
1095 }
1096 for(ll=0; ll<m-ii; ll++)
1097 {
1098 v10 += pD[ll+ii*sdd+ps*1] * pD[ll+ii*sdd+ps*0];
1099 v20 += pD[ll+ii*sdd+ps*2] * pD[ll+ii*sdd+ps*0];
1100 v21 += pD[ll+ii*sdd+ps*2] * pD[ll+ii*sdd+ps*1];
1101 v30 += pD[ll+ii*sdd+ps*3] * pD[ll+ii*sdd+ps*0];
1102 v31 += pD[ll+ii*sdd+ps*3] * pD[ll+ii*sdd+ps*1];
1103 v32 += pD[ll+ii*sdd+ps*3] * pD[ll+ii*sdd+ps*2];
1104 }
1105 // compute lower triangular T containing tau for matrix update
1106 pT[0+ldt*0] = dD[0];
1107 pT[1+ldt*1] = dD[1];
1108 pT[2+ldt*2] = dD[2];
1109 pT[3+ldt*3] = dD[3];
1110 pT[1+ldt*0] = - dD[1] * (v10*pT[0+ldt*0]);
1111 pT[2+ldt*1] = - dD[2] * (v21*pT[1+ldt*1]);
1112 pT[3+ldt*2] = - dD[3] * (v32*pT[2+ldt*2]);
1113 pT[2+ldt*0] = - dD[2] * (v20*pT[0+ldt*0] + v21*pT[1+ldt*0]);
1114 pT[3+ldt*1] = - dD[3] * (v31*pT[1+ldt*1] + v32*pT[2+ldt*1]);
1115 pT[3+ldt*0] = - dD[3] * (v30*pT[0+ldt*0] + v31*pT[1+ldt*0] + v32*pT[2+ldt*0]);
1116 // downgrade matrix
1117 pW[0] = 0.0;
1118 pW[1] = 0.0;
1119 pW[2] = 0.0;
1120 pW[3] = 0.0;
1121 pW[4] = 0.0;
1122 pW[5] = 0.0;
1123 pW[6] = 0.0;
1124 pW[7] = 0.0;
1125 ii = 0;
1126 for( ; ii<n-1; ii+=2)
1127 {
1128 pC = pC0+ii*ps;
1129 // compute W^T = C^T * V
1130 tmp = pC[0+ps*0];
1131 pW[0+ldw*0] = tmp;
1132 tmp = pC[0+ps*1];
1133 pW[0+ldw*1] = tmp;
1134 if(m>1)
1135 {
1136 d0 = pVt[0+ps*1];
1137 tmp = pC[1+ps*0];
1138 pW[0+ldw*0] += d0 * tmp;
1139 pW[1+ldw*0] = tmp;
1140 tmp = pC[1+ps*1];
1141 pW[0+ldw*1] += d0 * tmp;
1142 pW[1+ldw*1] = tmp;
1143 if(m>2)
1144 {
1145 d0 = pVt[0+ps*2];
1146 d1 = pVt[1+ps*2];
1147 tmp = pC[2+ps*0];
1148 pW[0+ldw*0] += d0 * tmp;
1149 pW[1+ldw*0] += d1 * tmp;
1150 pW[2+ldw*0] = tmp;
1151 tmp = pC[2+ps*1];
1152 pW[0+ldw*1] += d0 * tmp;
1153 pW[1+ldw*1] += d1 * tmp;
1154 pW[2+ldw*1] = tmp;
1155 if(m>3)
1156 {
1157 d0 = pVt[0+ps*3];
1158 d1 = pVt[1+ps*3];
1159 d2 = pVt[2+ps*3];
1160 tmp = pC[3+ps*0];
1161 pW[0+ldw*0] += d0 * tmp;
1162 pW[1+ldw*0] += d1 * tmp;
1163 pW[2+ldw*0] += d2 * tmp;
1164 pW[3+ldw*0] = tmp;
1165 tmp = pC[3+ps*1];
1166 pW[0+ldw*1] += d0 * tmp;
1167 pW[1+ldw*1] += d1 * tmp;
1168 pW[2+ldw*1] += d2 * tmp;
1169 pW[3+ldw*1] = tmp;
1170 }
1171 }
1172 }
1173 for(jj=4; jj<m-3; jj+=4)
1174 {
1175 //
1176 d0 = pVt[0+ps*(0+jj)];
1177 d1 = pVt[1+ps*(0+jj)];
1178 d2 = pVt[2+ps*(0+jj)];
1179 d3 = pVt[3+ps*(0+jj)];
1180 tmp = pC[0+jj*sdc+ps*0];
1181 pW[0+ldw*0] += d0 * tmp;
1182 pW[1+ldw*0] += d1 * tmp;
1183 pW[2+ldw*0] += d2 * tmp;
1184 pW[3+ldw*0] += d3 * tmp;
1185 tmp = pC[0+jj*sdc+ps*1];
1186 pW[0+ldw*1] += d0 * tmp;
1187 pW[1+ldw*1] += d1 * tmp;
1188 pW[2+ldw*1] += d2 * tmp;
1189 pW[3+ldw*1] += d3 * tmp;
1190 //
1191 d0 = pVt[0+ps*(1+jj)];
1192 d1 = pVt[1+ps*(1+jj)];
1193 d2 = pVt[2+ps*(1+jj)];
1194 d3 = pVt[3+ps*(1+jj)];
1195 tmp = pC[1+jj*sdc+ps*0];
1196 pW[0+ldw*0] += d0 * tmp;
1197 pW[1+ldw*0] += d1 * tmp;
1198 pW[2+ldw*0] += d2 * tmp;
1199 pW[3+ldw*0] += d3 * tmp;
1200 tmp = pC[1+jj*sdc+ps*1];
1201 pW[0+ldw*1] += d0 * tmp;
1202 pW[1+ldw*1] += d1 * tmp;
1203 pW[2+ldw*1] += d2 * tmp;
1204 pW[3+ldw*1] += d3 * tmp;
1205 //
1206 d0 = pVt[0+ps*(2+jj)];
1207 d1 = pVt[1+ps*(2+jj)];
1208 d2 = pVt[2+ps*(2+jj)];
1209 d3 = pVt[3+ps*(2+jj)];
1210 tmp = pC[2+jj*sdc+ps*0];
1211 pW[0+ldw*0] += d0 * tmp;
1212 pW[1+ldw*0] += d1 * tmp;
1213 pW[2+ldw*0] += d2 * tmp;
1214 pW[3+ldw*0] += d3 * tmp;
1215 tmp = pC[2+jj*sdc+ps*1];
1216 pW[0+ldw*1] += d0 * tmp;
1217 pW[1+ldw*1] += d1 * tmp;
1218 pW[2+ldw*1] += d2 * tmp;
1219 pW[3+ldw*1] += d3 * tmp;
1220 //
1221 d0 = pVt[0+ps*(3+jj)];
1222 d1 = pVt[1+ps*(3+jj)];
1223 d2 = pVt[2+ps*(3+jj)];
1224 d3 = pVt[3+ps*(3+jj)];
1225 tmp = pC[3+jj*sdc+ps*0];
1226 pW[0+ldw*0] += d0 * tmp;
1227 pW[1+ldw*0] += d1 * tmp;
1228 pW[2+ldw*0] += d2 * tmp;
1229 pW[3+ldw*0] += d3 * tmp;
1230 tmp = pC[3+jj*sdc+ps*1];
1231 pW[0+ldw*1] += d0 * tmp;
1232 pW[1+ldw*1] += d1 * tmp;
1233 pW[2+ldw*1] += d2 * tmp;
1234 pW[3+ldw*1] += d3 * tmp;
1235 }
1236 for(ll=0; ll<m-jj; ll++)
1237 {
1238 d0 = pVt[0+ps*(ll+jj)];
1239 d1 = pVt[1+ps*(ll+jj)];
1240 d2 = pVt[2+ps*(ll+jj)];
1241 d3 = pVt[3+ps*(ll+jj)];
1242 tmp = pC[ll+jj*sdc+ps*0];
1243 pW[0+ldw*0] += d0 * tmp;
1244 pW[1+ldw*0] += d1 * tmp;
1245 pW[2+ldw*0] += d2 * tmp;
1246 pW[3+ldw*0] += d3 * tmp;
1247 tmp = pC[ll+jj*sdc+ps*1];
1248 pW[0+ldw*1] += d0 * tmp;
1249 pW[1+ldw*1] += d1 * tmp;
1250 pW[2+ldw*1] += d2 * tmp;
1251 pW[3+ldw*1] += d3 * tmp;
1252 }
1253 // compute W^T *= T
1254 pW[3+ldw*0] = pT[3+ldt*0]*pW[0+ldw*0] + pT[3+ldt*1]*pW[1+ldw*0] + pT[3+ldt*2]*pW[2+ldw*0] + pT[3+ldt*3]*pW[3+ldw*0];
1255 pW[3+ldw*1] = pT[3+ldt*0]*pW[0+ldw*1] + pT[3+ldt*1]*pW[1+ldw*1] + pT[3+ldt*2]*pW[2+ldw*1] + pT[3+ldt*3]*pW[3+ldw*1];
1256 pW[2+ldw*0] = pT[2+ldt*0]*pW[0+ldw*0] + pT[2+ldt*1]*pW[1+ldw*0] + pT[2+ldt*2]*pW[2+ldw*0];
1257 pW[2+ldw*1] = pT[2+ldt*0]*pW[0+ldw*1] + pT[2+ldt*1]*pW[1+ldw*1] + pT[2+ldt*2]*pW[2+ldw*1];
1258 pW[1+ldw*0] = pT[1+ldt*0]*pW[0+ldw*0] + pT[1+ldt*1]*pW[1+ldw*0];
1259 pW[1+ldw*1] = pT[1+ldt*0]*pW[0+ldw*1] + pT[1+ldt*1]*pW[1+ldw*1];
1260 pW[0+ldw*0] = pT[0+ldt*0]*pW[0+ldw*0];
1261 pW[0+ldw*1] = pT[0+ldt*0]*pW[0+ldw*1];
1262 // compute C -= V * W^T
1263 jj = 0;
1264 // load
1265 c00 = pC[0+jj*sdc+ps*0];
1266 c10 = pC[1+jj*sdc+ps*0];
1267 c20 = pC[2+jj*sdc+ps*0];
1268 c30 = pC[3+jj*sdc+ps*0];
1269 c01 = pC[0+jj*sdc+ps*1];
1270 c11 = pC[1+jj*sdc+ps*1];
1271 c21 = pC[2+jj*sdc+ps*1];
1272 c31 = pC[3+jj*sdc+ps*1];
1273 // rank1
1274 a1 = pD[1+jj*sdd+ps*0];
1275 a2 = pD[2+jj*sdd+ps*0];
1276 a3 = pD[3+jj*sdd+ps*0];
1277 b0 = pW[0+ldw*0];
1278 c00 -= b0;
1279 c10 -= a1*b0;
1280 c20 -= a2*b0;
1281 c30 -= a3*b0;
1282 b1 = pW[0+ldw*1];
1283 c01 -= b1;
1284 c11 -= a1*b1;
1285 c21 -= a2*b1;
1286 c31 -= a3*b1;
1287 // rank2
1288 a2 = pD[2+jj*sdd+ps*1];
1289 a3 = pD[3+jj*sdd+ps*1];
1290 b0 = pW[1+ldw*0];
1291 c10 -= b0;
1292 c20 -= a2*b0;
1293 c30 -= a3*b0;
1294 b1 = pW[1+ldw*1];
1295 c11 -= b1;
1296 c21 -= a2*b1;
1297 c31 -= a3*b1;
1298 // rank3
1299 a3 = pD[3+jj*sdd+ps*2];
1300 b0 = pW[2+ldw*0];
1301 c20 -= b0;
1302 c30 -= a3*b0;
1303 b1 = pW[2+ldw*1];
1304 c21 -= b1;
1305 c31 -= a3*b1;
1306 // rank4
1307 a3 = pD[3+jj*sdd+ps*3];
1308 b0 = pW[3+ldw*0];
1309 c30 -= b0;
1310 b1 = pW[3+ldw*1];
1311 c31 -= b1;
1312 // store
1313 pC[0+jj*sdc+ps*0] = c00;
1314 pC[0+jj*sdc+ps*1] = c01;
1315 if(m>1)
1316 {
1317 pC[1+jj*sdc+ps*0] = c10;
1318 pC[1+jj*sdc+ps*1] = c11;
1319 if(m>2)
1320 {
1321 pC[2+jj*sdc+ps*0] = c20;
1322 pC[2+jj*sdc+ps*1] = c21;
1323 if(m>3)
1324 {
1325 pC[3+jj*sdc+ps*0] = c30;
1326 pC[3+jj*sdc+ps*1] = c31;
1327 }
1328 }
1329 }
1330 for(jj=4; jj<m-3; jj+=4)
1331 {
1332 // load
1333 c00 = pC[0+jj*sdc+ps*0];
1334 c10 = pC[1+jj*sdc+ps*0];
1335 c20 = pC[2+jj*sdc+ps*0];
1336 c30 = pC[3+jj*sdc+ps*0];
1337 c01 = pC[0+jj*sdc+ps*1];
1338 c11 = pC[1+jj*sdc+ps*1];
1339 c21 = pC[2+jj*sdc+ps*1];
1340 c31 = pC[3+jj*sdc+ps*1];
1341 //
1342 a0 = pD[0+jj*sdd+ps*0];
1343 a1 = pD[1+jj*sdd+ps*0];
1344 a2 = pD[2+jj*sdd+ps*0];
1345 a3 = pD[3+jj*sdd+ps*0];
1346 b0 = pW[0+ldw*0];
1347 c00 -= a0*b0;
1348 c10 -= a1*b0;
1349 c20 -= a2*b0;
1350 c30 -= a3*b0;
1351 b1 = pW[0+ldw*1];
1352 c01 -= a0*b1;
1353 c11 -= a1*b1;
1354 c21 -= a2*b1;
1355 c31 -= a3*b1;
1356 //
1357 a0 = pD[0+jj*sdd+ps*1];
1358 a1 = pD[1+jj*sdd+ps*1];
1359 a2 = pD[2+jj*sdd+ps*1];
1360 a3 = pD[3+jj*sdd+ps*1];
1361 b0 = pW[1+ldw*0];
1362 c00 -= a0*b0;
1363 c10 -= a1*b0;
1364 c20 -= a2*b0;
1365 c30 -= a3*b0;
1366 b1 = pW[1+ldw*1];
1367 c01 -= a0*b1;
1368 c11 -= a1*b1;
1369 c21 -= a2*b1;
1370 c31 -= a3*b1;
1371 //
1372 a0 = pD[0+jj*sdd+ps*2];
1373 a1 = pD[1+jj*sdd+ps*2];
1374 a2 = pD[2+jj*sdd+ps*2];
1375 a3 = pD[3+jj*sdd+ps*2];
1376 b0 = pW[2+ldw*0];
1377 c00 -= a0*b0;
1378 c10 -= a1*b0;
1379 c20 -= a2*b0;
1380 c30 -= a3*b0;
1381 b1 = pW[2+ldw*1];
1382 c01 -= a0*b1;
1383 c11 -= a1*b1;
1384 c21 -= a2*b1;
1385 c31 -= a3*b1;
1386 //
1387 a0 = pD[0+jj*sdd+ps*3];
1388 a1 = pD[1+jj*sdd+ps*3];
1389 a2 = pD[2+jj*sdd+ps*3];
1390 a3 = pD[3+jj*sdd+ps*3];
1391 b0 = pW[3+ldw*0];
1392 c00 -= a0*b0;
1393 c10 -= a1*b0;
1394 c20 -= a2*b0;
1395 c30 -= a3*b0;
1396 b1 = pW[3+ldw*1];
1397 c01 -= a0*b1;
1398 c11 -= a1*b1;
1399 c21 -= a2*b1;
1400 c31 -= a3*b1;
1401 // store
1402 pC[0+jj*sdc+ps*0] = c00;
1403 pC[1+jj*sdc+ps*0] = c10;
1404 pC[2+jj*sdc+ps*0] = c20;
1405 pC[3+jj*sdc+ps*0] = c30;
1406 pC[0+jj*sdc+ps*1] = c01;
1407 pC[1+jj*sdc+ps*1] = c11;
1408 pC[2+jj*sdc+ps*1] = c21;
1409 pC[3+jj*sdc+ps*1] = c31;
1410 }
1411 for(ll=0; ll<m-jj; ll++)
1412 {
1413 // load
1414 c00 = pC[ll+jj*sdc+ps*0];
1415 c01 = pC[ll+jj*sdc+ps*1];
1416 //
1417 a0 = pD[ll+jj*sdd+ps*0];
1418 b0 = pW[0+ldw*0];
1419 c00 -= a0*b0;
1420 b1 = pW[0+ldw*1];
1421 c01 -= a0*b1;
1422 //
1423 a0 = pD[ll+jj*sdd+ps*1];
1424 b0 = pW[1+ldw*0];
1425 c00 -= a0*b0;
1426 b1 = pW[1+ldw*1];
1427 c01 -= a0*b1;
1428 //
1429 a0 = pD[ll+jj*sdd+ps*2];
1430 b0 = pW[2+ldw*0];
1431 c00 -= a0*b0;
1432 b1 = pW[2+ldw*1];
1433 c01 -= a0*b1;
1434 //
1435 a0 = pD[ll+jj*sdd+ps*3];
1436 b0 = pW[3+ldw*0];
1437 c00 -= a0*b0;
1438 b1 = pW[3+ldw*1];
1439 c01 -= a0*b1;
1440 // store
1441 pC[ll+jj*sdc+ps*0] = c00;
1442 pC[ll+jj*sdc+ps*1] = c01;
1443 }
1444 }
1445 for( ; ii<n; ii++)
1446 {
1447 pC = pC0+ii*ps;
1448 // compute W^T = C^T * V
1449 tmp = pC[0+ps*0];
1450 pW[0+ldw*0] = tmp;
1451 if(m>1)
1452 {
1453 d0 = pVt[0+ps*1];
1454 tmp = pC[1+ps*0];
1455 pW[0+ldw*0] += d0 * tmp;
1456 pW[1+ldw*0] = tmp;
1457 if(m>2)
1458 {
1459 d0 = pVt[0+ps*2];
1460 d1 = pVt[1+ps*2];
1461 tmp = pC[2+ps*0];
1462 pW[0+ldw*0] += d0 * tmp;
1463 pW[1+ldw*0] += d1 * tmp;
1464 pW[2+ldw*0] = tmp;
1465 if(m>3)
1466 {
1467 d0 = pVt[0+ps*3];
1468 d1 = pVt[1+ps*3];
1469 d2 = pVt[2+ps*3];
1470 tmp = pC[3+ps*0];
1471 pW[0+ldw*0] += d0 * tmp;
1472 pW[1+ldw*0] += d1 * tmp;
1473 pW[2+ldw*0] += d2 * tmp;
1474 pW[3+ldw*0] = tmp;
1475 }
1476 }
1477 }
1478 for(jj=4; jj<m-3; jj+=4)
1479 {
1480 //
1481 d0 = pVt[0+ps*(0+jj)];
1482 d1 = pVt[1+ps*(0+jj)];
1483 d2 = pVt[2+ps*(0+jj)];
1484 d3 = pVt[3+ps*(0+jj)];
1485 tmp = pC[0+jj*sdc+ps*0];
1486 pW[0+ldw*0] += d0 * tmp;
1487 pW[1+ldw*0] += d1 * tmp;
1488 pW[2+ldw*0] += d2 * tmp;
1489 pW[3+ldw*0] += d3 * tmp;
1490 //
1491 d0 = pVt[0+ps*(1+jj)];
1492 d1 = pVt[1+ps*(1+jj)];
1493 d2 = pVt[2+ps*(1+jj)];
1494 d3 = pVt[3+ps*(1+jj)];
1495 tmp = pC[1+jj*sdc+ps*0];
1496 pW[0+ldw*0] += d0 * tmp;
1497 pW[1+ldw*0] += d1 * tmp;
1498 pW[2+ldw*0] += d2 * tmp;
1499 pW[3+ldw*0] += d3 * tmp;
1500 //
1501 d0 = pVt[0+ps*(2+jj)];
1502 d1 = pVt[1+ps*(2+jj)];
1503 d2 = pVt[2+ps*(2+jj)];
1504 d3 = pVt[3+ps*(2+jj)];
1505 tmp = pC[2+jj*sdc+ps*0];
1506 pW[0+ldw*0] += d0 * tmp;
1507 pW[1+ldw*0] += d1 * tmp;
1508 pW[2+ldw*0] += d2 * tmp;
1509 pW[3+ldw*0] += d3 * tmp;
1510 //
1511 d0 = pVt[0+ps*(3+jj)];
1512 d1 = pVt[1+ps*(3+jj)];
1513 d2 = pVt[2+ps*(3+jj)];
1514 d3 = pVt[3+ps*(3+jj)];
1515 tmp = pC[3+jj*sdc+ps*0];
1516 pW[0+ldw*0] += d0 * tmp;
1517 pW[1+ldw*0] += d1 * tmp;
1518 pW[2+ldw*0] += d2 * tmp;
1519 pW[3+ldw*0] += d3 * tmp;
1520 }
1521 for(ll=0; ll<m-jj; ll++)
1522 {
1523 d0 = pVt[0+ps*(ll+jj)];
1524 d1 = pVt[1+ps*(ll+jj)];
1525 d2 = pVt[2+ps*(ll+jj)];
1526 d3 = pVt[3+ps*(ll+jj)];
1527 tmp = pC[ll+jj*sdc+ps*0];
1528 pW[0+ldw*0] += d0 * tmp;
1529 pW[1+ldw*0] += d1 * tmp;
1530 pW[2+ldw*0] += d2 * tmp;
1531 pW[3+ldw*0] += d3 * tmp;
1532 }
1533 // compute W^T *= T
1534 pW[3+ldw*0] = pT[3+ldt*0]*pW[0+ldw*0] + pT[3+ldt*1]*pW[1+ldw*0] + pT[3+ldt*2]*pW[2+ldw*0] + pT[3+ldt*3]*pW[3+ldw*0];
1535 pW[2+ldw*0] = pT[2+ldt*0]*pW[0+ldw*0] + pT[2+ldt*1]*pW[1+ldw*0] + pT[2+ldt*2]*pW[2+ldw*0];
1536 pW[1+ldw*0] = pT[1+ldt*0]*pW[0+ldw*0] + pT[1+ldt*1]*pW[1+ldw*0];
1537 pW[0+ldw*0] = pT[0+ldt*0]*pW[0+ldw*0];
1538 // compute C -= V * W^T
1539 jj = 0;
1540 // load
1541 c00 = pC[0+jj*sdc+ps*0];
1542 c10 = pC[1+jj*sdc+ps*0];
1543 c20 = pC[2+jj*sdc+ps*0];
1544 c30 = pC[3+jj*sdc+ps*0];
1545 // rank1
1546 a1 = pD[1+jj*sdd+ps*0];
1547 a2 = pD[2+jj*sdd+ps*0];
1548 a3 = pD[3+jj*sdd+ps*0];
1549 b0 = pW[0+ldw*0];
1550 c00 -= b0;
1551 c10 -= a1*b0;
1552 c20 -= a2*b0;
1553 c30 -= a3*b0;
1554 // rank2
1555 a2 = pD[2+jj*sdd+ps*1];
1556 a3 = pD[3+jj*sdd+ps*1];
1557 b0 = pW[1+ldw*0];
1558 c10 -= b0;
1559 c20 -= a2*b0;
1560 c30 -= a3*b0;
1561 // rank3
1562 a3 = pD[3+jj*sdd+ps*2];
1563 b0 = pW[2+ldw*0];
1564 c20 -= b0;
1565 c30 -= a3*b0;
1566 // rank4
1567 a3 = pD[3+jj*sdd+ps*3];
1568 b0 = pW[3+ldw*0];
1569 c30 -= b0;
1570 // store
1571 pC[0+jj*sdc+ps*0] = c00;
1572 if(m>1)
1573 {
1574 pC[1+jj*sdc+ps*0] = c10;
1575 if(m>2)
1576 {
1577 pC[2+jj*sdc+ps*0] = c20;
1578 if(m>3)
1579 {
1580 pC[3+jj*sdc+ps*0] = c30;
1581 }
1582 }
1583 }
1584 for(jj=4; jj<m-3; jj+=4)
1585 {
1586 // load
1587 c00 = pC[0+jj*sdc+ps*0];
1588 c10 = pC[1+jj*sdc+ps*0];
1589 c20 = pC[2+jj*sdc+ps*0];
1590 c30 = pC[3+jj*sdc+ps*0];
1591 //
1592 a0 = pD[0+jj*sdd+ps*0];
1593 a1 = pD[1+jj*sdd+ps*0];
1594 a2 = pD[2+jj*sdd+ps*0];
1595 a3 = pD[3+jj*sdd+ps*0];
1596 b0 = pW[0+ldw*0];
1597 c00 -= a0*b0;
1598 c10 -= a1*b0;
1599 c20 -= a2*b0;
1600 c30 -= a3*b0;
1601 //
1602 a0 = pD[0+jj*sdd+ps*1];
1603 a1 = pD[1+jj*sdd+ps*1];
1604 a2 = pD[2+jj*sdd+ps*1];
1605 a3 = pD[3+jj*sdd+ps*1];
1606 b0 = pW[1+ldw*0];
1607 c00 -= a0*b0;
1608 c10 -= a1*b0;
1609 c20 -= a2*b0;
1610 c30 -= a3*b0;
1611 //
1612 a0 = pD[0+jj*sdd+ps*2];
1613 a1 = pD[1+jj*sdd+ps*2];
1614 a2 = pD[2+jj*sdd+ps*2];
1615 a3 = pD[3+jj*sdd+ps*2];
1616 b0 = pW[2+ldw*0];
1617 c00 -= a0*b0;
1618 c10 -= a1*b0;
1619 c20 -= a2*b0;
1620 c30 -= a3*b0;
1621 //
1622 a0 = pD[0+jj*sdd+ps*3];
1623 a1 = pD[1+jj*sdd+ps*3];
1624 a2 = pD[2+jj*sdd+ps*3];
1625 a3 = pD[3+jj*sdd+ps*3];
1626 b0 = pW[3+ldw*0];
1627 c00 -= a0*b0;
1628 c10 -= a1*b0;
1629 c20 -= a2*b0;
1630 c30 -= a3*b0;
1631 // store
1632 pC[0+jj*sdc+ps*0] = c00;
1633 pC[1+jj*sdc+ps*0] = c10;
1634 pC[2+jj*sdc+ps*0] = c20;
1635 pC[3+jj*sdc+ps*0] = c30;
1636 }
1637 for(ll=0; ll<m-jj; ll++)
1638 {
1639 // load
1640 c00 = pC[ll+jj*sdc+ps*0];
1641 //
1642 a0 = pD[ll+jj*sdd+ps*0];
1643 b0 = pW[0+ldw*0];
1644 c00 -= a0*b0;
1645 //
1646 a0 = pD[ll+jj*sdd+ps*1];
1647 b0 = pW[1+ldw*0];
1648 c00 -= a0*b0;
1649 //
1650 a0 = pD[ll+jj*sdd+ps*2];
1651 b0 = pW[2+ldw*0];
1652 c00 -= a0*b0;
1653 //
1654 a0 = pD[ll+jj*sdd+ps*3];
1655 b0 = pW[3+ldw*0];
1656 c00 -= a0*b0;
1657 // store
1658 pC[ll+jj*sdc+ps*0] = c00;
1659 }
1660 }
1661
1662 return;
1663 }
1664
1665
1666
1667// assume n>=4
1668void kernel_dgelqf_4_lib4(int n, double *pD, double *dD)
1669 {
1670 int ii, jj, ll;
1671 double alpha, beta, tmp, w1, w2, w3;
1672 const int ps = 4;
1673 // first column
1674 beta = 0.0;
1675 for(ii=1; ii<n; ii++)
1676 {
1677 tmp = pD[0+ps*ii];
1678 beta += tmp*tmp;
1679 }
1680 if(beta==0.0)
1681 {
1682 // tau
1683 dD[0] = 0.0;
1684 }
1685 else
1686 {
1687 alpha = pD[0+ps*0];
1688 beta += alpha*alpha;
1689 beta = sqrt(beta);
1690 if(alpha>0)
1691 beta = -beta;
1692 // tau0
1693 dD[0] = (beta-alpha) / beta;
1694 tmp = 1.0 / (alpha-beta);
1695 // compute v0
1696 pD[0+ps*0] = beta;
1697 for(ii=1; ii<n; ii++)
1698 {
1699 pD[0+ps*ii] *= tmp;
1700 }
1701 }
1702 // gemv_t & ger
1703 w1 = pD[1+ps*0];
1704 w2 = pD[2+ps*0];
1705 w3 = pD[3+ps*0];
1706 w1 += pD[1+ps*1] * pD[0+ps*1];
1707 w2 += pD[2+ps*1] * pD[0+ps*1];
1708 w3 += pD[3+ps*1] * pD[0+ps*1];
1709 w1 += pD[1+ps*2] * pD[0+ps*2];
1710 w2 += pD[2+ps*2] * pD[0+ps*2];
1711 w3 += pD[3+ps*2] * pD[0+ps*2];
1712 w1 += pD[1+ps*3] * pD[0+ps*3];
1713 w2 += pD[2+ps*3] * pD[0+ps*3];
1714 w3 += pD[3+ps*3] * pD[0+ps*3];
1715 for(ii=4; ii<n; ii++)
1716 {
1717 w1 += pD[1+ps*ii] * pD[0+ps*ii];
1718 w2 += pD[2+ps*ii] * pD[0+ps*ii];
1719 w3 += pD[3+ps*ii] * pD[0+ps*ii];
1720 }
1721 w1 = - dD[0] * w1;
1722 w2 = - dD[0] * w2;
1723 w3 = - dD[0] * w3;
1724 pD[1+ps*0] += w1;
1725 pD[2+ps*0] += w2;
1726 pD[3+ps*0] += w3;
1727 pD[1+ps*1] += w1 * pD[0+ps*1];
1728 pD[2+ps*1] += w2 * pD[0+ps*1];
1729 pD[3+ps*1] += w3 * pD[0+ps*1];
1730 pD[1+ps*2] += w1 * pD[0+ps*2];
1731 pD[2+ps*2] += w2 * pD[0+ps*2];
1732 pD[3+ps*2] += w3 * pD[0+ps*2];
1733 pD[1+ps*3] += w1 * pD[0+ps*3];
1734 pD[2+ps*3] += w2 * pD[0+ps*3];
1735 pD[3+ps*3] += w3 * pD[0+ps*3];
1736 for(ii=4; ii<n; ii++)
1737 {
1738 pD[1+ps*ii] += w1 * pD[0+ps*ii];
1739 pD[2+ps*ii] += w2 * pD[0+ps*ii];
1740 pD[3+ps*ii] += w3 * pD[0+ps*ii];
1741 }
1742 // second column
1743 beta = 0.0;
1744 for(ii=2; ii<n; ii++)
1745 {
1746 tmp = pD[1+ps*ii];
1747 beta += tmp*tmp;
1748 }
1749 if(beta==0.0)
1750 {
1751 // tau
1752 dD[1] = 0.0;
1753 }
1754 else
1755 {
1756 alpha = pD[1+ps*1];
1757 beta += alpha*alpha;
1758 beta = sqrt(beta);
1759 if(alpha>0)
1760 beta = -beta;
1761 // tau0
1762 dD[1] = (beta-alpha) / beta;
1763 tmp = 1.0 / (alpha-beta);
1764 // compute v0
1765 pD[1+ps*1] = beta;
1766 for(ii=2; ii<n; ii++)
1767 {
1768 pD[1+ps*ii] *= tmp;
1769 }
1770 }
1771 // gemv_t & ger
1772 w2 = pD[2+ps*1];
1773 w3 = pD[3+ps*1];
1774 w2 += pD[2+ps*2] * pD[1+ps*2];
1775 w3 += pD[3+ps*2] * pD[1+ps*2];
1776 w2 += pD[2+ps*3] * pD[1+ps*3];
1777 w3 += pD[3+ps*3] * pD[1+ps*3];
1778 for(ii=4; ii<n; ii++)
1779 {
1780 w2 += pD[2+ps*ii] * pD[1+ps*ii];
1781 w3 += pD[3+ps*ii] * pD[1+ps*ii];
1782 }
1783 w2 = - dD[1] * w2;
1784 w3 = - dD[1] * w3;
1785 pD[2+ps*1] += w2;
1786 pD[3+ps*1] += w3;
1787 pD[2+ps*2] += w2 * pD[1+ps*2];
1788 pD[3+ps*2] += w3 * pD[1+ps*2];
1789 pD[2+ps*3] += w2 * pD[1+ps*3];
1790 pD[3+ps*3] += w3 * pD[1+ps*3];
1791 for(ii=4; ii<n; ii++)
1792 {
1793 pD[2+ps*ii] += w2 * pD[1+ps*ii];
1794 pD[3+ps*ii] += w3 * pD[1+ps*ii];
1795 }
1796 // third column
1797 beta = 0.0;
1798 for(ii=3; ii<n; ii++)
1799 {
1800 tmp = pD[2+ps*ii];
1801 beta += tmp*tmp;
1802 }
1803 if(beta==0.0)
1804 {
1805 // tau
1806 dD[2] = 0.0;
1807 }
1808 else
1809 {
1810 alpha = pD[2+ps*2];
1811 beta += alpha*alpha;
1812 beta = sqrt(beta);
1813 if(alpha>0)
1814 beta = -beta;
1815 // tau0
1816 dD[2] = (beta-alpha) / beta;
1817 tmp = 1.0 / (alpha-beta);
1818 // compute v0
1819 pD[2+ps*2] = beta;
1820 for(ii=3; ii<n; ii++)
1821 {
1822 pD[2+ps*ii] *= tmp;
1823 }
1824 }
1825 // gemv_t & ger
1826 w3 = pD[3+ps*2];
1827 w3 += pD[3+ps*3] * pD[2+ps*3];
1828 for(ii=4; ii<n; ii++)
1829 {
1830 w3 += pD[3+ps*ii] * pD[2+ps*ii];
1831 }
1832 w3 = - dD[2] * w3;
1833 pD[3+ps*2] += w3;
1834 pD[3+ps*3] += w3 * pD[2+ps*3];
1835 for(ii=4; ii<n; ii++)
1836 {
1837 pD[3+ps*ii] += w3 * pD[2+ps*ii];
1838 }
1839 // fourth column
1840 beta = 0.0;
1841 for(ii=4; ii<n; ii++)
1842 {
1843 tmp = pD[3+ps*ii];
1844 beta += tmp*tmp;
1845 }
1846 if(beta==0.0)
1847 {
1848 // tau
1849 dD[3] = 0.0;
1850 }
1851 else
1852 {
1853 alpha = pD[3+ps*3];
1854 beta += alpha*alpha;
1855 beta = sqrt(beta);
1856 if(alpha>0)
1857 beta = -beta;
1858 // tau0
1859 dD[3] = (beta-alpha) / beta;
1860 tmp = 1.0 / (alpha-beta);
1861 // compute v0
1862 pD[3+ps*3] = beta;
1863 for(ii=4; ii<n; ii++)
1864 {
1865 pD[3+ps*ii] *= tmp;
1866 }
1867 }
1868 return;
1869 }
1870
1871
1872
1873// unblocked algorithm
1874void kernel_dgelqf_vs_lib4(int m, int n, int k, int offD, double *pD, int sdd, double *dD)
1875 {
1876 if(m<=0 | n<=0)
1877 return;
1878 int ii, jj, kk, ll, imax, jmax, jmax0, kmax, kmax0;
1879 const int ps = 4;
1880 imax = k;//m<n ? m : n;
1881 double alpha, beta, tmp;
1882 double w00, w01,
1883 w10, w11,
1884 w20, w21,
1885 w30, w31;
1886 double *pC00, *pC10, *pC10a, *pC20, *pC20a, *pC01, *pC11;
1887 double pT[4];
1888 int ldt = 2;
1889 double *pD0 = pD-offD;
1890 ii = 0;
1891#if 1
1892 for(; ii<imax-1; ii+=2)
1893 {
1894 // first row
1895 pC00 = &pD0[((offD+ii)&(ps-1))+((offD+ii)-((offD+ii)&(ps-1)))*sdd+ii*ps];
1896 beta = 0.0;
1897 for(jj=1; jj<n-ii; jj++)
1898 {
1899 tmp = pC00[0+ps*jj];
1900 beta += tmp*tmp;
1901 }
1902 if(beta==0.0)
1903 {
1904 dD[ii] = 0.0;
1905 }
1906 else
1907 {
1908 alpha = pC00[0];
1909 beta += alpha*alpha;
1910 beta = sqrt(beta);
1911 if(alpha>0)
1912 beta = -beta;
1913 dD[ii] = (beta-alpha) / beta;
1914 tmp = 1.0 / (alpha-beta);
1915 pC00[0] = beta;
1916 for(jj=1; jj<n-ii; jj++)
1917 pC00[0+ps*jj] *= tmp;
1918 }
1919 pC10 = &pD0[((offD+ii+1)&(ps-1))+((offD+ii+1)-((offD+ii+1)&(ps-1)))*sdd+ii*ps];
1920 kmax = n-ii;
1921 w00 = pC10[0+ps*0]; // pC00[0+ps*0] = 1.0
1922 for(kk=1; kk<kmax; kk++)
1923 {
1924 w00 += pC10[0+ps*kk] * pC00[0+ps*kk];
1925 }
1926 w00 = - w00*dD[ii];
1927 pC10[0+ps*0] += w00; // pC00[0+ps*0] = 1.0
1928 for(kk=1; kk<kmax; kk++)
1929 {
1930 pC10[0+ps*kk] += w00 * pC00[0+ps*kk];
1931 }
1932 // second row
1933 pC11 = pC10+ps*1;
1934 beta = 0.0;
1935 for(jj=1; jj<n-(ii+1); jj++)
1936 {
1937 tmp = pC11[0+ps*jj];
1938 beta += tmp*tmp;
1939 }
1940 if(beta==0.0)
1941 {
1942 dD[(ii+1)] = 0.0;
1943 }
1944 else
1945 {
1946 alpha = pC11[0+ps*0];
1947 beta += alpha*alpha;
1948 beta = sqrt(beta);
1949 if(alpha>0)
1950 beta = -beta;
1951 dD[(ii+1)] = (beta-alpha) / beta;
1952 tmp = 1.0 / (alpha-beta);
1953 pC11[0+ps*0] = beta;
1954 for(jj=1; jj<n-(ii+1); jj++)
1955 pC11[0+ps*jj] *= tmp;
1956 }
1957 // compute T
1958 kmax = n-ii;
1959 tmp = 1.0*0.0 + pC00[0+ps*1]*1.0;
1960 for(kk=2; kk<kmax; kk++)
1961 tmp += pC00[0+ps*kk]*pC10[0+ps*kk];
1962 pT[0+ldt*0] = dD[ii+0];
1963 pT[0+ldt*1] = - dD[ii+1] * tmp * dD[ii+0];
1964 pT[1+ldt*1] = dD[ii+1];
1965 // downgrade
1966 kmax = n-ii;
1967 jmax = m-ii-2;
1968 jmax0 = (ps-((ii+2+offD)&(ps-1)))&(ps-1);
1969 jmax0 = jmax<jmax0 ? jmax : jmax0;
1970 jj = 0;
1971 pC20a = &pD0[((offD+ii+2)&(ps-1))+((offD+ii+2)-((offD+ii+2)&(ps-1)))*sdd+ii*ps];
1972 pC20 = pC20a;
1973 if(jmax0>0)
1974 {
1975 for( ; jj<jmax0; jj++)
1976 {
1977 w00 = pC20[0+ps*0]*1.0 + pC20[0+ps*1]*pC00[0+ps*1];
1978 w01 = pC20[0+ps*0]*0.0 + pC20[0+ps*1]*1.0;
1979 for(kk=2; kk<kmax; kk++)
1980 {
1981 w00 += pC20[0+ps*kk]*pC00[0+ps*kk];
1982 w01 += pC20[0+ps*kk]*pC10[0+ps*kk];
1983 }
1984 w01 = - w00*pT[0+ldt*1] - w01*pT[1+ldt*1];
1985 w00 = - w00*pT[0+ldt*0];
1986 pC20[0+ps*0] += w00*1.0 + w01*0.0;
1987 pC20[0+ps*1] += w00*pC00[0+ps*1] + w01*1.0;
1988 for(kk=2; kk<kmax; kk++)
1989 {
1990 pC20[0+ps*kk] += w00*pC00[0+ps*kk] + w01*pC10[0+ps*kk];
1991 }
1992 pC20 += 1;
1993 }
1994 pC20 += -ps+ps*sdd;
1995 }
1996 for( ; jj<jmax-3; jj+=4)
1997 {
1998 w00 = pC20[0+ps*0]*1.0 + pC20[0+ps*1]*pC00[0+ps*1];
1999 w10 = pC20[1+ps*0]*1.0 + pC20[1+ps*1]*pC00[0+ps*1];
2000 w20 = pC20[2+ps*0]*1.0 + pC20[2+ps*1]*pC00[0+ps*1];
2001 w30 = pC20[3+ps*0]*1.0 + pC20[3+ps*1]*pC00[0+ps*1];
2002 w01 = pC20[0+ps*0]*0.0 + pC20[0+ps*1]*1.0;
2003 w11 = pC20[1+ps*0]*0.0 + pC20[1+ps*1]*1.0;
2004 w21 = pC20[2+ps*0]*0.0 + pC20[2+ps*1]*1.0;
2005 w31 = pC20[3+ps*0]*0.0 + pC20[3+ps*1]*1.0;
2006 for(kk=2; kk<kmax; kk++)
2007 {
2008 w00 += pC20[0+ps*kk]*pC00[0+ps*kk];
2009 w10 += pC20[1+ps*kk]*pC00[0+ps*kk];
2010 w20 += pC20[2+ps*kk]*pC00[0+ps*kk];
2011 w30 += pC20[3+ps*kk]*pC00[0+ps*kk];
2012 w01 += pC20[0+ps*kk]*pC10[0+ps*kk];
2013 w11 += pC20[1+ps*kk]*pC10[0+ps*kk];
2014 w21 += pC20[2+ps*kk]*pC10[0+ps*kk];
2015 w31 += pC20[3+ps*kk]*pC10[0+ps*kk];
2016 }
2017 w01 = - w00*pT[0+ldt*1] - w01*pT[1+ldt*1];
2018 w11 = - w10*pT[0+ldt*1] - w11*pT[1+ldt*1];
2019 w21 = - w20*pT[0+ldt*1] - w21*pT[1+ldt*1];
2020 w31 = - w30*pT[0+ldt*1] - w31*pT[1+ldt*1];
2021 w00 = - w00*pT[0+ldt*0];
2022 w10 = - w10*pT[0+ldt*0];
2023 w20 = - w20*pT[0+ldt*0];
2024 w30 = - w30*pT[0+ldt*0];
2025 pC20[0+ps*0] += w00*1.0 + w01*0.0;
2026 pC20[1+ps*0] += w10*1.0 + w11*0.0;
2027 pC20[2+ps*0] += w20*1.0 + w21*0.0;
2028 pC20[3+ps*0] += w30*1.0 + w31*0.0;
2029 pC20[0+ps*1] += w00*pC00[0+ps*1] + w01*1.0;
2030 pC20[1+ps*1] += w10*pC00[0+ps*1] + w11*1.0;
2031 pC20[2+ps*1] += w20*pC00[0+ps*1] + w21*1.0;
2032 pC20[3+ps*1] += w30*pC00[0+ps*1] + w31*1.0;
2033 for(kk=2; kk<kmax; kk++)
2034 {
2035 pC20[0+ps*kk] += w00*pC00[0+ps*kk] + w01*pC10[0+ps*kk];
2036 pC20[1+ps*kk] += w10*pC00[0+ps*kk] + w11*pC10[0+ps*kk];
2037 pC20[2+ps*kk] += w20*pC00[0+ps*kk] + w21*pC10[0+ps*kk];
2038 pC20[3+ps*kk] += w30*pC00[0+ps*kk] + w31*pC10[0+ps*kk];
2039 }
2040 pC20 += ps*sdd;
2041 }
2042 for(ll=0; ll<jmax-jj; ll++)
2043 {
2044 w00 = pC20[0+ps*0]*1.0 + pC20[0+ps*1]*pC00[0+ps*1];
2045 w01 = pC20[0+ps*0]*0.0 + pC20[0+ps*1]*1.0;
2046 for(kk=2; kk<kmax; kk++)
2047 {
2048 w00 += pC20[0+ps*kk]*pC00[0+ps*kk];
2049 w01 += pC20[0+ps*kk]*pC10[0+ps*kk];
2050 }
2051 w01 = - w00*pT[0+ldt*1] - w01*pT[1+ldt*1];
2052 w00 = - w00*pT[0+ldt*0];
2053 pC20[0+ps*0] += w00*1.0 + w01*0.0;
2054 pC20[0+ps*1] += w00*pC00[0+ps*1] + w01*1.0;
2055 for(kk=2; kk<kmax; kk++)
2056 {
2057 pC20[0+ps*kk] += w00*pC00[0+ps*kk] + w01*pC10[0+ps*kk];
2058 }
2059 pC20 += 1;
2060 }
2061 }
2062#endif
2063 for(; ii<imax; ii++)
2064 {
2065 pC00 = &pD0[((offD+ii)&(ps-1))+((offD+ii)-((offD+ii)&(ps-1)))*sdd+ii*ps];
2066 beta = 0.0;
2067 for(jj=1; jj<n-ii; jj++)
2068 {
2069 tmp = pC00[0+ps*jj];
2070 beta += tmp*tmp;
2071 }
2072 if(beta==0.0)
2073 {
2074 dD[ii] = 0.0;
2075 }
2076 else
2077 {
2078 alpha = pC00[0];
2079 beta += alpha*alpha;
2080 beta = sqrt(beta);
2081 if(alpha>0)
2082 beta = -beta;
2083 dD[ii] = (beta-alpha) / beta;
2084 tmp = 1.0 / (alpha-beta);
2085 pC00[0] = beta;
2086 for(jj=1; jj<n-ii; jj++)
2087 pC00[0+ps*jj] *= tmp;
2088 }
2089 if(ii<n)
2090 {
2091 kmax = n-ii;
2092 jmax = m-ii-1;
2093 jmax0 = (ps-((ii+1+offD)&(ps-1)))&(ps-1);
2094 jmax0 = jmax<jmax0 ? jmax : jmax0;
2095 jj = 0;
2096 pC10a = &pD0[((offD+ii+1)&(ps-1))+((offD+ii+1)-((offD+ii+1)&(ps-1)))*sdd+ii*ps];
2097 pC10 = pC10a;
2098 if(jmax0>0)
2099 {
2100 for( ; jj<jmax0; jj++)
2101 {
2102 w00 = pC10[0+ps*0];
2103 for(kk=1; kk<kmax; kk++)
2104 {
2105 w00 += pC10[0+ps*kk] * pC00[0+ps*kk];
2106 }
2107 w00 = - w00*dD[ii];
2108 pC10[0+ps*0] += w00;
2109 for(kk=1; kk<kmax; kk++)
2110 {
2111 pC10[0+ps*kk] += w00 * pC00[0+ps*kk];
2112 }
2113 pC10 += 1;
2114 }
2115 pC10 += -ps+ps*sdd;
2116 }
2117 for( ; jj<jmax-3; jj+=4)
2118 {
2119 w00 = pC10[0+ps*0];
2120 w10 = pC10[1+ps*0];
2121 w20 = pC10[2+ps*0];
2122 w30 = pC10[3+ps*0];
2123 for(kk=1; kk<kmax; kk++)
2124 {
2125 w00 += pC10[0+ps*kk]*pC00[0+ps*kk];
2126 w10 += pC10[1+ps*kk]*pC00[0+ps*kk];
2127 w20 += pC10[2+ps*kk]*pC00[0+ps*kk];
2128 w30 += pC10[3+ps*kk]*pC00[0+ps*kk];
2129 }
2130 w00 = - w00*dD[ii];
2131 w10 = - w10*dD[ii];
2132 w20 = - w20*dD[ii];
2133 w30 = - w30*dD[ii];
2134 pC10[0+ps*0] += w00;
2135 pC10[1+ps*0] += w10;
2136 pC10[2+ps*0] += w20;
2137 pC10[3+ps*0] += w30;
2138 for(kk=1; kk<kmax; kk++)
2139 {
2140 pC10[0+ps*kk] += w00*pC00[0+ps*kk];
2141 pC10[1+ps*kk] += w10*pC00[0+ps*kk];
2142 pC10[2+ps*kk] += w20*pC00[0+ps*kk];
2143 pC10[3+ps*kk] += w30*pC00[0+ps*kk];
2144 }
2145 pC10 += ps*sdd;
2146 }
2147 for(ll=0; ll<jmax-jj; ll++)
2148 {
2149 w00 = pC10[0+ps*0];
2150 for(kk=1; kk<kmax; kk++)
2151 {
2152 w00 += pC10[0+ps*kk] * pC00[0+ps*kk];
2153 }
2154 w00 = - w00*dD[ii];
2155 pC10[0+ps*0] += w00;
2156 for(kk=1; kk<kmax; kk++)
2157 {
2158 pC10[0+ps*kk] += w00 * pC00[0+ps*kk];
2159 }
2160 pC10 += 1;
2161 }
2162 }
2163 }
2164 return;
2165 }
2166
2167
2168
2169// assume kmax>=4
2170void kernel_dlarft_4_lib4(int kmax, double *pD, double *dD, double *pT)
2171 {
2172 const int ps = 4;
2173 int kk;
2174 double v10,
2175 v20, v21,
2176 v30, v31, v32;
2177 // 0
2178 // 1
2179 v10 = pD[0+ps*1];
2180 // 2
2181 v10 += pD[1+ps*2]*pD[0+ps*2];
2182 v20 = pD[0+ps*2];
2183 v21 = pD[1+ps*2];
2184 // 3
2185 v10 += pD[1+ps*3]*pD[0+ps*3];
2186 v20 += pD[2+ps*3]*pD[0+ps*3];
2187 v21 += pD[2+ps*3]*pD[1+ps*3];
2188 v30 = pD[0+ps*3];
2189 v31 = pD[1+ps*3];
2190 v32 = pD[2+ps*3];
2191 //
2192 for(kk=4; kk<kmax; kk++)
2193 {
2194 v10 += pD[1+ps*kk]*pD[0+ps*kk];
2195 v20 += pD[2+ps*kk]*pD[0+ps*kk];
2196 v30 += pD[3+ps*kk]*pD[0+ps*kk];
2197 v21 += pD[2+ps*kk]*pD[1+ps*kk];
2198 v31 += pD[3+ps*kk]*pD[1+ps*kk];
2199 v32 += pD[3+ps*kk]*pD[2+ps*kk];
2200 }
2201 pT[0+ps*0] = - dD[0];
2202 pT[1+ps*1] = - dD[1];
2203 pT[2+ps*2] = - dD[2];
2204 pT[3+ps*3] = - dD[3];
2205 pT[0+ps*1] = - dD[1] * (v10*pT[0+ps*0]);
2206 pT[1+ps*2] = - dD[2] * (v21*pT[1+ps*1]);
2207 pT[2+ps*3] = - dD[3] * (v32*pT[2+ps*2]);
2208 pT[0+ps*2] = - dD[2] * (v20*pT[0+ps*0] + v21*pT[0+ps*1]);
2209 pT[1+ps*3] = - dD[3] * (v31*pT[1+ps*1] + v32*pT[1+ps*2]);
2210 pT[0+ps*3] = - dD[3] * (v30*pT[0+ps*0] + v31*pT[0+ps*1] + v32*pT[0+ps*2]);
2211 return;
2212 }
2213
2214
2215
2216// assume n>=4
2217void kernel_dgelqf_dlarft4_4_lib4(int n, double *pD, double *dD, double *pT)
2218 {
2219 int ii, jj, ll;
2220 double alpha, beta, tmp, w0, w1, w2, w3;
2221 const int ps = 4;
2222 // zero tau matrix
2223 for(ii=0; ii<16; ii++)
2224 pT[ii] = 0.0;
2225 // first column
2226 beta = 0.0;
2227 for(ii=1; ii<n; ii++)
2228 {
2229 tmp = pD[0+ps*ii];
2230 beta += tmp*tmp;
2231 }
2232 if(beta==0.0)
2233 {
2234 dD[0] = 0.0;
2235 tmp = 0.0;
2236 goto col2;
2237 }
2238 alpha = pD[0+ps*0];
2239 beta += alpha*alpha;
2240 beta = sqrt(beta);
2241 if(alpha>0)
2242 beta = -beta;
2243 dD[0] = (beta-alpha) / beta;
2244 pT[0+ps*0] = - dD[0];
2245 tmp = 1.0 / (alpha-beta);
2246 //
2247 pD[0+ps*0] = beta;
2248 w1 = pD[1+ps*0];
2249 w2 = pD[2+ps*0];
2250 w3 = pD[3+ps*0];
2251 //
2252 pD[0+ps*1] *= tmp;
2253 w1 += pD[1+ps*1] * pD[0+ps*1];
2254 w2 += pD[2+ps*1] * pD[0+ps*1];
2255 w3 += pD[3+ps*1] * pD[0+ps*1];
2256 //
2257 pD[0+ps*2] *= tmp;
2258 w1 += pD[1+ps*2] * pD[0+ps*2];
2259 w2 += pD[2+ps*2] * pD[0+ps*2];
2260 w3 += pD[3+ps*2] * pD[0+ps*2];
2261 //
2262 pD[0+ps*3] *= tmp;
2263 w1 += pD[1+ps*3] * pD[0+ps*3];
2264 w2 += pD[2+ps*3] * pD[0+ps*3];
2265 w3 += pD[3+ps*3] * pD[0+ps*3];
2266 //
2267 for(ii=4; ii<n; ii++)
2268 {
2269 pD[0+ps*ii] *= tmp;
2270 w1 += pD[1+ps*ii] * pD[0+ps*ii];
2271 w2 += pD[2+ps*ii] * pD[0+ps*ii];
2272 w3 += pD[3+ps*ii] * pD[0+ps*ii];
2273 }
2274 //
2275 w1 = - dD[0] * w1;
2276 w2 = - dD[0] * w2;
2277 w3 = - dD[0] * w3;
2278 //
2279 pD[1+ps*0] += w1;
2280 pD[2+ps*0] += w2;
2281 pD[3+ps*0] += w3;
2282 //
2283 pD[1+ps*1] += w1 * pD[0+ps*1];
2284 pD[2+ps*1] += w2 * pD[0+ps*1];
2285 pD[3+ps*1] += w3 * pD[0+ps*1];
2286 //
2287 pD[1+ps*2] += w1 * pD[0+ps*2];
2288 pD[2+ps*2] += w2 * pD[0+ps*2];
2289 pD[3+ps*2] += w3 * pD[0+ps*2];
2290 beta = pD[1+ps*2] * pD[1+ps*2];
2291 //
2292 pD[1+ps*3] += w1 * pD[0+ps*3];
2293 pD[2+ps*3] += w2 * pD[0+ps*3];
2294 pD[3+ps*3] += w3 * pD[0+ps*3];
2295 beta += pD[1+ps*3] * pD[1+ps*3];
2296 //
2297 for(ii=4; ii<n; ii++)
2298 {
2299 pD[1+ps*ii] += w1 * pD[0+ps*ii];
2300 pD[2+ps*ii] += w2 * pD[0+ps*ii];
2301 pD[3+ps*ii] += w3 * pD[0+ps*ii];
2302 beta += pD[1+ps*ii] * pD[1+ps*ii];
2303 }
2304 // second column
2305col2:
2306 if(beta==0.0)
2307 {
2308 dD[1] = 0.0;
2309 tmp = 0.0;
2310 goto col3;
2311 }
2312 alpha = pD[1+ps*1];
2313 beta += alpha*alpha;
2314 beta = sqrt(beta);
2315 if(alpha>0)
2316 beta = -beta;
2317 dD[1] = (beta-alpha) / beta;
2318 pT[1+ps*1] = - dD[1];
2319 tmp = 1.0 / (alpha-beta);
2320 //
2321 pD[1+ps*1] = beta;
2322 w0 = pD[0+ps*1]; //
2323 w2 = pD[2+ps*1];
2324 w3 = pD[3+ps*1];
2325 //
2326 pD[1+ps*2] *= tmp;
2327 w0 += pD[0+ps*2] * pD[1+ps*2]; //
2328 w2 += pD[2+ps*2] * pD[1+ps*2];
2329 w3 += pD[3+ps*2] * pD[1+ps*2];
2330 //
2331 pD[1+ps*3] *= tmp;
2332 w0 += pD[0+ps*3] * pD[1+ps*3]; //
2333 w2 += pD[2+ps*3] * pD[1+ps*3];
2334 w3 += pD[3+ps*3] * pD[1+ps*3];
2335 //
2336 for(ii=4; ii<n; ii++)
2337 {
2338 pD[1+ps*ii] *= tmp;
2339 w0 += pD[0+ps*ii] * pD[1+ps*ii]; //
2340 w2 += pD[2+ps*ii] * pD[1+ps*ii];
2341 w3 += pD[3+ps*ii] * pD[1+ps*ii];
2342 }
2343 //
2344 pT[0+ps*1] = - dD[1] * (w0*pT[0+ps*0]);
2345 w2 = - dD[1] * w2;
2346 w3 = - dD[1] * w3;
2347 //
2348 pD[2+ps*1] += w2;
2349 pD[3+ps*1] += w3;
2350 //
2351 pD[2+ps*2] += w2 * pD[1+ps*2];
2352 pD[3+ps*2] += w3 * pD[1+ps*2];
2353 //
2354 pD[2+ps*3] += w2 * pD[1+ps*3];
2355 pD[3+ps*3] += w3 * pD[1+ps*3];
2356 beta = pD[2+ps*3] * pD[2+ps*3];
2357 //
2358 for(ii=4; ii<n; ii++)
2359 {
2360 pD[2+ps*ii] += w2 * pD[1+ps*ii];
2361 pD[3+ps*ii] += w3 * pD[1+ps*ii];
2362 beta += pD[2+ps*ii] * pD[2+ps*ii];
2363 }
2364 // third column
2365col3:
2366 if(beta==0.0)
2367 {
2368 dD[2] = 0.0;
2369 tmp = 0.0;
2370 goto col4;
2371 }
2372 alpha = pD[2+ps*2];
2373 beta += alpha*alpha;
2374 beta = sqrt(beta);
2375 if(alpha>0)
2376 beta = -beta;
2377 dD[2] = (beta-alpha) / beta;
2378 pT[2+ps*2] = - dD[2];
2379 tmp = 1.0 / (alpha-beta);
2380 //
2381 pD[2+ps*2] = beta;
2382 w0 = pD[0+ps*2];
2383 w1 = pD[1+ps*2];
2384 w3 = pD[3+ps*2];
2385 //
2386 pD[2+ps*3] *= tmp;
2387 w0 += pD[0+ps*3] * pD[2+ps*3];
2388 w1 += pD[1+ps*3] * pD[2+ps*3];
2389 w3 += pD[3+ps*3] * pD[2+ps*3];
2390 //
2391 for(ii=4; ii<n; ii++)
2392 {
2393 pD[2+ps*ii] *= tmp;
2394 w0 += pD[0+ps*ii] * pD[2+ps*ii];
2395 w1 += pD[1+ps*ii] * pD[2+ps*ii];
2396 w3 += pD[3+ps*ii] * pD[2+ps*ii];
2397 }
2398 //
2399 pT[1+ps*2] = - dD[2] * (w1*pT[1+ps*1]);
2400 pT[0+ps*2] = - dD[2] * (w0*pT[0+ps*0] + w1*pT[0+ps*1]);
2401 w3 = - dD[2] * w3;
2402 //
2403 pD[3+ps*2] += w3;
2404 //
2405 pD[3+ps*3] += w3 * pD[2+ps*3];
2406 //
2407 beta = 0.0;
2408 for(ii=4; ii<n; ii++)
2409 {
2410 pD[3+ps*ii] += w3 * pD[2+ps*ii];
2411 beta += pD[3+ps*ii] * pD[3+ps*ii];
2412 }
2413 // fourth column
2414col4:
2415 if(beta==0.0)
2416 {
2417 dD[3] = 0.0;
2418 tmp = 0.0;
2419 return;
2420 }
2421 alpha = pD[3+ps*3];
2422 beta += alpha*alpha;
2423 beta = sqrt(beta);
2424 if(alpha>0)
2425 beta = -beta;
2426 dD[3] = (beta-alpha) / beta;
2427 pT[3+ps*3] = - dD[3];
2428 tmp = 1.0 / (alpha-beta);
2429 //
2430 pD[3+ps*3] = beta;
2431 w0 = pD[0+ps*3];
2432 w1 = pD[1+ps*3];
2433 w2 = pD[2+ps*3];
2434 //
2435 for(ii=4; ii<n; ii++)
2436 {
2437 pD[3+ps*ii] *= tmp;
2438 w0 += pD[0+ps*ii] * pD[3+ps*ii];
2439 w1 += pD[1+ps*ii] * pD[3+ps*ii];
2440 w2 += pD[2+ps*ii] * pD[3+ps*ii];
2441 }
2442 //
2443 pT[2+ps*3] = - dD[3] * (w2*pT[2+ps*2]);
2444 pT[1+ps*3] = - dD[3] * (w1*pT[1+ps*1] + w2*pT[1+ps*2]);
2445 pT[0+ps*3] = - dD[3] * (w0*pT[0+ps*0] + w1*pT[0+ps*1] + w2*pT[0+ps*2]);
2446 return;
2447 }
2448
2449
2450
2451void kernel_dlarfb4_r_4_lib4(int kmax, double *pV, double *pT, double *pD)
2452 {
2453 const int ps = 4;
2454 double pW[16];
2455 int kk;
2456 // 0
2457 pW[0+ps*0] = pD[0+ps*0];
2458 pW[1+ps*0] = pD[1+ps*0];
2459 pW[2+ps*0] = pD[2+ps*0];
2460 pW[3+ps*0] = pD[3+ps*0];
2461 // 1
2462 pW[0+ps*0] += pD[0+ps*1]*pV[0+ps*1];
2463 pW[1+ps*0] += pD[1+ps*1]*pV[0+ps*1];
2464 pW[2+ps*0] += pD[2+ps*1]*pV[0+ps*1];
2465 pW[3+ps*0] += pD[3+ps*1]*pV[0+ps*1];
2466 pW[0+ps*1] = pD[0+ps*1];
2467 pW[1+ps*1] = pD[1+ps*1];
2468 pW[2+ps*1] = pD[2+ps*1];
2469 pW[3+ps*1] = pD[3+ps*1];
2470 // 2
2471 pW[0+ps*0] += pD[0+ps*2]*pV[0+ps*2];
2472 pW[1+ps*0] += pD[1+ps*2]*pV[0+ps*2];
2473 pW[2+ps*0] += pD[2+ps*2]*pV[0+ps*2];
2474 pW[3+ps*0] += pD[3+ps*2]*pV[0+ps*2];
2475 pW[0+ps*1] += pD[0+ps*2]*pV[1+ps*2];
2476 pW[1+ps*1] += pD[1+ps*2]*pV[1+ps*2];
2477 pW[2+ps*1] += pD[2+ps*2]*pV[1+ps*2];
2478 pW[3+ps*1] += pD[3+ps*2]*pV[1+ps*2];
2479 pW[0+ps*2] = pD[0+ps*2];
2480 pW[1+ps*2] = pD[1+ps*2];
2481 pW[2+ps*2] = pD[2+ps*2];
2482 pW[3+ps*2] = pD[3+ps*2];
2483 // 3
2484 pW[0+ps*0] += pD[0+ps*3]*pV[0+ps*3];
2485 pW[1+ps*0] += pD[1+ps*3]*pV[0+ps*3];
2486 pW[2+ps*0] += pD[2+ps*3]*pV[0+ps*3];
2487 pW[3+ps*0] += pD[3+ps*3]*pV[0+ps*3];
2488 pW[0+ps*1] += pD[0+ps*3]*pV[1+ps*3];
2489 pW[1+ps*1] += pD[1+ps*3]*pV[1+ps*3];
2490 pW[2+ps*1] += pD[2+ps*3]*pV[1+ps*3];
2491 pW[3+ps*1] += pD[3+ps*3]*pV[1+ps*3];
2492 pW[0+ps*2] += pD[0+ps*3]*pV[2+ps*3];
2493 pW[1+ps*2] += pD[1+ps*3]*pV[2+ps*3];
2494 pW[2+ps*2] += pD[2+ps*3]*pV[2+ps*3];
2495 pW[3+ps*2] += pD[3+ps*3]*pV[2+ps*3];
2496 pW[0+ps*3] = pD[0+ps*3];
2497 pW[1+ps*3] = pD[1+ps*3];
2498 pW[2+ps*3] = pD[2+ps*3];
2499 pW[3+ps*3] = pD[3+ps*3];
2500 //
2501 for(kk=4; kk<kmax; kk++)
2502 {
2503 pW[0+ps*0] += pD[0+ps*kk]*pV[0+ps*kk];
2504 pW[1+ps*0] += pD[1+ps*kk]*pV[0+ps*kk];
2505 pW[2+ps*0] += pD[2+ps*kk]*pV[0+ps*kk];
2506 pW[3+ps*0] += pD[3+ps*kk]*pV[0+ps*kk];
2507 pW[0+ps*1] += pD[0+ps*kk]*pV[1+ps*kk];
2508 pW[1+ps*1] += pD[1+ps*kk]*pV[1+ps*kk];
2509 pW[2+ps*1] += pD[2+ps*kk]*pV[1+ps*kk];
2510 pW[3+ps*1] += pD[3+ps*kk]*pV[1+ps*kk];
2511 pW[0+ps*2] += pD[0+ps*kk]*pV[2+ps*kk];
2512 pW[1+ps*2] += pD[1+ps*kk]*pV[2+ps*kk];
2513 pW[2+ps*2] += pD[2+ps*kk]*pV[2+ps*kk];
2514 pW[3+ps*2] += pD[3+ps*kk]*pV[2+ps*kk];
2515 pW[0+ps*3] += pD[0+ps*kk]*pV[3+ps*kk];
2516 pW[1+ps*3] += pD[1+ps*kk]*pV[3+ps*kk];
2517 pW[2+ps*3] += pD[2+ps*kk]*pV[3+ps*kk];
2518 pW[3+ps*3] += pD[3+ps*kk]*pV[3+ps*kk];
2519 }
2520 //
2521 pW[0+ps*3] = pW[0+ps*0]*pT[0+ps*3] + pW[0+ps*1]*pT[1+ps*3] + pW[0+ps*2]*pT[2+ps*3] + pW[0+ps*3]*pT[3+ps*3];
2522 pW[1+ps*3] = pW[1+ps*0]*pT[0+ps*3] + pW[1+ps*1]*pT[1+ps*3] + pW[1+ps*2]*pT[2+ps*3] + pW[1+ps*3]*pT[3+ps*3];
2523 pW[2+ps*3] = pW[2+ps*0]*pT[0+ps*3] + pW[2+ps*1]*pT[1+ps*3] + pW[2+ps*2]*pT[2+ps*3] + pW[2+ps*3]*pT[3+ps*3];
2524 pW[3+ps*3] = pW[3+ps*0]*pT[0+ps*3] + pW[3+ps*1]*pT[1+ps*3] + pW[3+ps*2]*pT[2+ps*3] + pW[3+ps*3]*pT[3+ps*3];
2525 //
2526 pW[0+ps*2] = pW[0+ps*0]*pT[0+ps*2] + pW[0+ps*1]*pT[1+ps*2] + pW[0+ps*2]*pT[2+ps*2];
2527 pW[1+ps*2] = pW[1+ps*0]*pT[0+ps*2] + pW[1+ps*1]*pT[1+ps*2] + pW[1+ps*2]*pT[2+ps*2];
2528 pW[2+ps*2] = pW[2+ps*0]*pT[0+ps*2] + pW[2+ps*1]*pT[1+ps*2] + pW[2+ps*2]*pT[2+ps*2];
2529 pW[3+ps*2] = pW[3+ps*0]*pT[0+ps*2] + pW[3+ps*1]*pT[1+ps*2] + pW[3+ps*2]*pT[2+ps*2];
2530 //
2531 pW[0+ps*1] = pW[0+ps*0]*pT[0+ps*1] + pW[0+ps*1]*pT[1+ps*1];
2532 pW[1+ps*1] = pW[1+ps*0]*pT[0+ps*1] + pW[1+ps*1]*pT[1+ps*1];
2533 pW[2+ps*1] = pW[2+ps*0]*pT[0+ps*1] + pW[2+ps*1]*pT[1+ps*1];
2534 pW[3+ps*1] = pW[3+ps*0]*pT[0+ps*1] + pW[3+ps*1]*pT[1+ps*1];
2535 //
2536 pW[0+ps*0] = pW[0+ps*0]*pT[0+ps*0];
2537 pW[1+ps*0] = pW[1+ps*0]*pT[0+ps*0];
2538 pW[2+ps*0] = pW[2+ps*0]*pT[0+ps*0];
2539 pW[3+ps*0] = pW[3+ps*0]*pT[0+ps*0];
2540 //
2541 pD[0+ps*0] += pW[0+ps*0];
2542 pD[1+ps*0] += pW[1+ps*0];
2543 pD[2+ps*0] += pW[2+ps*0];
2544 pD[3+ps*0] += pW[3+ps*0];
2545 //
2546 pD[0+ps*1] += pW[0+ps*0]*pV[0+ps*1] + pW[0+ps*1];
2547 pD[1+ps*1] += pW[1+ps*0]*pV[0+ps*1] + pW[1+ps*1];
2548 pD[2+ps*1] += pW[2+ps*0]*pV[0+ps*1] + pW[2+ps*1];
2549 pD[3+ps*1] += pW[3+ps*0]*pV[0+ps*1] + pW[3+ps*1];
2550 //
2551 pD[0+ps*2] += pW[0+ps*0]*pV[0+ps*2] + pW[0+ps*1]*pV[1+ps*2] + pW[0+ps*2];
2552 pD[1+ps*2] += pW[1+ps*0]*pV[0+ps*2] + pW[1+ps*1]*pV[1+ps*2] + pW[1+ps*2];
2553 pD[2+ps*2] += pW[2+ps*0]*pV[0+ps*2] + pW[2+ps*1]*pV[1+ps*2] + pW[2+ps*2];
2554 pD[3+ps*2] += pW[3+ps*0]*pV[0+ps*2] + pW[3+ps*1]*pV[1+ps*2] + pW[3+ps*2];
2555 //
2556 pD[0+ps*3] += pW[0+ps*0]*pV[0+ps*3] + pW[0+ps*1]*pV[1+ps*3] + pW[0+ps*2]*pV[2+ps*3] + pW[0+ps*3];
2557 pD[1+ps*3] += pW[1+ps*0]*pV[0+ps*3] + pW[1+ps*1]*pV[1+ps*3] + pW[1+ps*2]*pV[2+ps*3] + pW[1+ps*3];
2558 pD[2+ps*3] += pW[2+ps*0]*pV[0+ps*3] + pW[2+ps*1]*pV[1+ps*3] + pW[2+ps*2]*pV[2+ps*3] + pW[2+ps*3];
2559 pD[3+ps*3] += pW[3+ps*0]*pV[0+ps*3] + pW[3+ps*1]*pV[1+ps*3] + pW[3+ps*2]*pV[2+ps*3] + pW[3+ps*3];
2560 for(kk=4; kk<kmax; kk++)
2561 {
2562 pD[0+ps*kk] += pW[0+ps*0]*pV[0+ps*kk] + pW[0+ps*1]*pV[1+ps*kk] + pW[0+ps*2]*pV[2+ps*kk] + pW[0+ps*3]*pV[3+ps*kk];
2563 pD[1+ps*kk] += pW[1+ps*0]*pV[0+ps*kk] + pW[1+ps*1]*pV[1+ps*kk] + pW[1+ps*2]*pV[2+ps*kk] + pW[1+ps*3]*pV[3+ps*kk];
2564 pD[2+ps*kk] += pW[2+ps*0]*pV[0+ps*kk] + pW[2+ps*1]*pV[1+ps*kk] + pW[2+ps*2]*pV[2+ps*kk] + pW[2+ps*3]*pV[3+ps*kk];
2565 pD[3+ps*kk] += pW[3+ps*0]*pV[0+ps*kk] + pW[3+ps*1]*pV[1+ps*kk] + pW[3+ps*2]*pV[2+ps*kk] + pW[3+ps*3]*pV[3+ps*kk];
2566 }
2567 return;
2568 }
2569
2570
2571
2572void kernel_dlarfb4_r_1_lib4(int kmax, double *pV, double *pT, double *pD)
2573 {
2574 const int ps = 4;
2575 double pW[16];
2576 int kk;
2577 // 0
2578 pW[0+ps*0] = pD[0+ps*0];
2579 // 1
2580 pW[0+ps*0] += pD[0+ps*1]*pV[0+ps*1];
2581 pW[0+ps*1] = pD[0+ps*1];
2582 // 2
2583 pW[0+ps*0] += pD[0+ps*2]*pV[0+ps*2];
2584 pW[0+ps*1] += pD[0+ps*2]*pV[1+ps*2];
2585 pW[0+ps*2] = pD[0+ps*2];
2586 // 3
2587 pW[0+ps*0] += pD[0+ps*3]*pV[0+ps*3];
2588 pW[0+ps*1] += pD[0+ps*3]*pV[1+ps*3];
2589 pW[0+ps*2] += pD[0+ps*3]*pV[2+ps*3];
2590 pW[0+ps*3] = pD[0+ps*3];
2591 //
2592 for(kk=4; kk<kmax; kk++)
2593 {
2594 pW[0+ps*0] += pD[0+ps*kk]*pV[0+ps*kk];
2595 pW[0+ps*1] += pD[0+ps*kk]*pV[1+ps*kk];
2596 pW[0+ps*2] += pD[0+ps*kk]*pV[2+ps*kk];
2597 pW[0+ps*3] += pD[0+ps*kk]*pV[3+ps*kk];
2598 }
2599 //
2600 pW[0+ps*3] = pW[0+ps*0]*pT[0+ps*3] + pW[0+ps*1]*pT[1+ps*3] + pW[0+ps*2]*pT[2+ps*3] + pW[0+ps*3]*pT[3+ps*3];
2601 //
2602 pW[0+ps*2] = pW[0+ps*0]*pT[0+ps*2] + pW[0+ps*1]*pT[1+ps*2] + pW[0+ps*2]*pT[2+ps*2];
2603 //
2604 pW[0+ps*1] = pW[0+ps*0]*pT[0+ps*1] + pW[0+ps*1]*pT[1+ps*1];
2605 //
2606 pW[0+ps*0] = pW[0+ps*0]*pT[0+ps*0];
2607 //
2608 pD[0+ps*0] += pW[0+ps*0];
2609 //
2610 pD[0+ps*1] += pW[0+ps*0]*pV[0+ps*1] + pW[0+ps*1];
2611 //
2612 pD[0+ps*2] += pW[0+ps*0]*pV[0+ps*2] + pW[0+ps*1]*pV[1+ps*2] + pW[0+ps*2];
2613 //
2614 pD[0+ps*3] += pW[0+ps*0]*pV[0+ps*3] + pW[0+ps*1]*pV[1+ps*3] + pW[0+ps*2]*pV[2+ps*3] + pW[0+ps*3];
2615 for(kk=4; kk<kmax; kk++)
2616 {
2617 pD[0+ps*kk] += pW[0+ps*0]*pV[0+ps*kk] + pW[0+ps*1]*pV[1+ps*kk] + pW[0+ps*2]*pV[2+ps*kk] + pW[0+ps*3]*pV[3+ps*kk];
2618 }
2619 return;
2620 }