blob: 762a8a03bad06686127ac3fb04b503dbc4090230 [file] [log] [blame]
Austin Schuh9a24b372018-01-28 16:12:29 -08001/**************************************************************************************************
2* *
3* This file is part of BLASFEO. *
4* *
5* BLASFEO -- BLAS For Embedded Optimization. *
6* Copyright (C) 2016-2017 by Gianluca Frison. *
7* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl. *
8* All rights reserved. *
9* *
10* HPMPC is free software; you can redistribute it and/or *
11* modify it under the terms of the GNU Lesser General Public *
12* License as published by the Free Software Foundation; either *
13* version 2.1 of the License, or (at your option) any later version. *
14* *
15* HPMPC is distributed in the hope that it will be useful, *
16* but WITHOUT ANY WARRANTY; without even the implied warranty of *
17* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. *
18* See the GNU Lesser General Public License for more details. *
19* *
20* You should have received a copy of the GNU Lesser General Public *
21* License along with HPMPC; if not, write to the Free Software *
22* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA *
23* *
24* Author: Gianluca Frison, giaf (at) dtu.dk *
25* gianluca.frison (at) imtek.uni-freiburg.de *
26* *
27**************************************************************************************************/
28
29
30
31#if defined(LA_REFERENCE)
32
33
34
35// dpotrf
36void POTRF_L_LIBSTR(int m, struct STRMAT *sC, int ci, int cj, struct STRMAT *sD, int di, int dj)
37 {
38 if(m<=0)
39 return;
40 int ii, jj, kk;
41 REAL
42 f_00_inv,
43 f_10, f_11_inv,
44 c_00, c_01,
45 c_10, c_11;
46 int ldc = sC->m;
47 int ldd = sD->m;
48 REAL *pC = sC->pA + ci + cj*ldc;
49 REAL *pD = sD->pA + di + dj*ldd;
50 REAL *dD = sD->dA;
51 if(di==0 & dj==0)
52 sD->use_dA = 1;
53 else
54 sD->use_dA = 0;
55 jj = 0;
56 for(; jj<m-1; jj+=2)
57 {
58 // factorize diagonal
59 c_00 = pC[jj+0+ldc*(jj+0)];;
60 c_10 = pC[jj+1+ldc*(jj+0)];;
61 c_11 = pC[jj+1+ldc*(jj+1)];;
62 for(kk=0; kk<jj; kk++)
63 {
64 c_00 -= pD[jj+0+ldd*kk] * pD[jj+0+ldd*kk];
65 c_10 -= pD[jj+1+ldd*kk] * pD[jj+0+ldd*kk];
66 c_11 -= pD[jj+1+ldd*kk] * pD[jj+1+ldd*kk];
67 }
68 if(c_00>0)
69 {
70 f_00_inv = 1.0/sqrt(c_00);
71 }
72 else
73 {
74 f_00_inv = 0.0;
75 }
76 dD[jj+0] = f_00_inv;
77 pD[jj+0+ldd*(jj+0)] = c_00 * f_00_inv;
78 f_10 = c_10 * f_00_inv;
79 pD[jj+1+ldd*(jj+0)] = f_10;
80 c_11 -= f_10 * f_10;
81 if(c_11>0)
82 {
83 f_11_inv = 1.0/sqrt(c_11);
84 }
85 else
86 {
87 f_11_inv = 0.0;
88 }
89 dD[jj+1] = f_11_inv;
90 pD[jj+1+ldd*(jj+1)] = c_11 * f_11_inv;
91 // solve lower
92 ii = jj+2;
93 for(; ii<m-1; ii+=2)
94 {
95 c_00 = pC[ii+0+ldc*(jj+0)];
96 c_10 = pC[ii+1+ldc*(jj+0)];
97 c_01 = pC[ii+0+ldc*(jj+1)];
98 c_11 = pC[ii+1+ldc*(jj+1)];
99 for(kk=0; kk<jj; kk++)
100 {
101 c_00 -= pD[ii+0+ldd*kk] * pD[jj+0+ldd*kk];
102 c_10 -= pD[ii+1+ldd*kk] * pD[jj+0+ldd*kk];
103 c_01 -= pD[ii+0+ldd*kk] * pD[jj+1+ldd*kk];
104 c_11 -= pD[ii+1+ldd*kk] * pD[jj+1+ldd*kk];
105 }
106 c_00 *= f_00_inv;
107 c_10 *= f_00_inv;
108 pD[ii+0+ldd*(jj+0)] = c_00;
109 pD[ii+1+ldd*(jj+0)] = c_10;
110 c_01 -= c_00 * f_10;
111 c_11 -= c_10 * f_10;
112 pD[ii+0+ldd*(jj+1)] = c_01 * f_11_inv;
113 pD[ii+1+ldd*(jj+1)] = c_11 * f_11_inv;
114 }
115 for(; ii<m; ii++)
116 {
117 c_00 = pC[ii+0+ldc*(jj+0)];
118 c_01 = pC[ii+0+ldc*(jj+1)];
119 for(kk=0; kk<jj; kk++)
120 {
121 c_00 -= pD[ii+0+ldd*kk] * pD[jj+0+ldd*kk];
122 c_01 -= pD[ii+0+ldd*kk] * pD[jj+1+ldd*kk];
123 }
124 c_00 *= f_00_inv;
125 pD[ii+0+ldd*(jj+0)] = c_00;
126 c_01 -= c_00 * f_10;
127 pD[ii+0+ldd*(jj+1)] = c_01 * f_11_inv;
128 }
129 }
130 for(; jj<m; jj++)
131 {
132 // factorize diagonal
133 c_00 = pC[jj+ldc*jj];;
134 for(kk=0; kk<jj; kk++)
135 {
136 c_00 -= pD[jj+ldd*kk] * pD[jj+ldd*kk];
137 }
138 if(c_00>0)
139 {
140 f_00_inv = 1.0/sqrt(c_00);
141 }
142 else
143 {
144 f_00_inv = 0.0;
145 }
146 dD[jj] = f_00_inv;
147 pD[jj+ldd*jj] = c_00 * f_00_inv;
148 // solve lower
149// for(ii=jj+1; ii<m; ii++)
150// {
151// c_00 = pC[ii+ldc*jj];
152// for(kk=0; kk<jj; kk++)
153// {
154// c_00 -= pD[ii+ldd*kk] * pD[jj+ldd*kk];
155// }
156// pD[ii+ldd*jj] = c_00 * f_00_inv;
157// }
158 }
159 return;
160 }
161
162
163
164// dpotrf
165void POTRF_L_MN_LIBSTR(int m, int n, struct STRMAT *sC, int ci, int cj, struct STRMAT *sD, int di, int dj)
166 {
167 if(m<=0 | n<=0)
168 return;
169 int ii, jj, kk;
170 REAL
171 f_00_inv,
172 f_10, f_11_inv,
173 c_00, c_01,
174 c_10, c_11;
175 int ldc = sC->m;
176 int ldd = sD->m;
177 REAL *pC = sC->pA + ci + cj*ldc;
178 REAL *pD = sD->pA + di + dj*ldd;
179 REAL *dD = sD->dA;
180 if(di==0 & dj==0)
181 sD->use_dA = 1;
182 else
183 sD->use_dA = 0;
184 jj = 0;
185 for(; jj<n-1; jj+=2)
186 {
187 // factorize diagonal
188 c_00 = pC[jj+0+ldc*(jj+0)];;
189 c_10 = pC[jj+1+ldc*(jj+0)];;
190 c_11 = pC[jj+1+ldc*(jj+1)];;
191 for(kk=0; kk<jj; kk++)
192 {
193 c_00 -= pD[jj+0+ldd*kk] * pD[jj+0+ldd*kk];
194 c_10 -= pD[jj+1+ldd*kk] * pD[jj+0+ldd*kk];
195 c_11 -= pD[jj+1+ldd*kk] * pD[jj+1+ldd*kk];
196 }
197 if(c_00>0)
198 {
199 f_00_inv = 1.0/sqrt(c_00);
200 }
201 else
202 {
203 f_00_inv = 0.0;
204 }
205 dD[jj+0] = f_00_inv;
206 pD[jj+0+ldd*(jj+0)] = c_00 * f_00_inv;
207 f_10 = c_10 * f_00_inv;
208 pD[jj+1+ldd*(jj+0)] = f_10;
209 c_11 -= f_10 * f_10;
210 if(c_11>0)
211 {
212 f_11_inv = 1.0/sqrt(c_11);
213 }
214 else
215 {
216 f_11_inv = 0.0;
217 }
218 dD[jj+1] = f_11_inv;
219 pD[jj+1+ldd*(jj+1)] = c_11 * f_11_inv;
220 // solve lower
221 ii = jj+2;
222 for(; ii<m-1; ii+=2)
223 {
224 c_00 = pC[ii+0+ldc*(jj+0)];
225 c_10 = pC[ii+1+ldc*(jj+0)];
226 c_01 = pC[ii+0+ldc*(jj+1)];
227 c_11 = pC[ii+1+ldc*(jj+1)];
228 for(kk=0; kk<jj; kk++)
229 {
230 c_00 -= pD[ii+0+ldd*kk] * pD[jj+0+ldd*kk];
231 c_10 -= pD[ii+1+ldd*kk] * pD[jj+0+ldd*kk];
232 c_01 -= pD[ii+0+ldd*kk] * pD[jj+1+ldd*kk];
233 c_11 -= pD[ii+1+ldd*kk] * pD[jj+1+ldd*kk];
234 }
235 c_00 *= f_00_inv;
236 c_10 *= f_00_inv;
237 pD[ii+0+ldd*(jj+0)] = c_00;
238 pD[ii+1+ldd*(jj+0)] = c_10;
239 c_01 -= c_00 * f_10;
240 c_11 -= c_10 * f_10;
241 pD[ii+0+ldd*(jj+1)] = c_01 * f_11_inv;
242 pD[ii+1+ldd*(jj+1)] = c_11 * f_11_inv;
243 }
244 for(; ii<m; ii++)
245 {
246 c_00 = pC[ii+0+ldc*(jj+0)];
247 c_01 = pC[ii+0+ldc*(jj+1)];
248 for(kk=0; kk<jj; kk++)
249 {
250 c_00 -= pD[ii+0+ldd*kk] * pD[jj+0+ldd*kk];
251 c_01 -= pD[ii+0+ldd*kk] * pD[jj+1+ldd*kk];
252 }
253 c_00 *= f_00_inv;
254 pD[ii+0+ldd*(jj+0)] = c_00;
255 c_01 -= c_00 * f_10;
256 pD[ii+0+ldd*(jj+1)] = c_01 * f_11_inv;
257 }
258 }
259 for(; jj<n; jj++)
260 {
261 // factorize diagonal
262 c_00 = pC[jj+ldc*jj];;
263 for(kk=0; kk<jj; kk++)
264 {
265 c_00 -= pD[jj+ldd*kk] * pD[jj+ldd*kk];
266 }
267 if(c_00>0)
268 {
269 f_00_inv = 1.0/sqrt(c_00);
270 }
271 else
272 {
273 f_00_inv = 0.0;
274 }
275 dD[jj] = f_00_inv;
276 pD[jj+ldd*jj] = c_00 * f_00_inv;
277 // solve lower
278 for(ii=jj+1; ii<m; ii++)
279 {
280 c_00 = pC[ii+ldc*jj];
281 for(kk=0; kk<jj; kk++)
282 {
283 c_00 -= pD[ii+ldd*kk] * pD[jj+ldd*kk];
284 }
285 pD[ii+ldd*jj] = c_00 * f_00_inv;
286 }
287 }
288 return;
289 }
290
291
292
293// dsyrk dpotrf
294void SYRK_POTRF_LN_LIBSTR(int m, int n, int k, struct STRMAT *sA, int ai, int aj, struct STRMAT *sB, int bi, int bj, struct STRMAT *sC, int ci, int cj, struct STRMAT *sD, int di, int dj)
295 {
296 int ii, jj, kk;
297 REAL
298 f_00_inv,
299 f_10, f_11_inv,
300 c_00, c_01,
301 c_10, c_11;
302 int lda = sA->m;
303 int ldb = sB->m;
304 int ldc = sC->m;
305 int ldd = sD->m;
306 REAL *pA = sA->pA + ai + aj*lda;
307 REAL *pB = sB->pA + bi + bj*ldb;
308 REAL *pC = sC->pA + ci + cj*ldc;
309 REAL *pD = sD->pA + di + dj*ldd;
310 REAL *dD = sD->dA;
311 if(di==0 & dj==0)
312 sD->use_dA = 1;
313 else
314 sD->use_dA = 0;
315 jj = 0;
316 for(; jj<n-1; jj+=2)
317 {
318 // factorize diagonal
319 c_00 = pC[jj+0+ldc*(jj+0)];;
320 c_10 = pC[jj+1+ldc*(jj+0)];;
321 c_11 = pC[jj+1+ldc*(jj+1)];;
322 for(kk=0; kk<k; kk++)
323 {
324 c_00 += pA[jj+0+lda*kk] * pB[jj+0+ldb*kk];
325 c_10 += pA[jj+1+lda*kk] * pB[jj+0+ldb*kk];
326 c_11 += pA[jj+1+lda*kk] * pB[jj+1+ldb*kk];
327 }
328 for(kk=0; kk<jj; kk++)
329 {
330 c_00 -= pD[jj+0+ldd*kk] * pD[jj+0+ldd*kk];
331 c_10 -= pD[jj+1+ldd*kk] * pD[jj+0+ldd*kk];
332 c_11 -= pD[jj+1+ldd*kk] * pD[jj+1+ldd*kk];
333 }
334 if(c_00>0)
335 {
336 f_00_inv = 1.0/sqrt(c_00);
337 }
338 else
339 {
340 f_00_inv = 0.0;
341 }
342 dD[jj+0] = f_00_inv;
343 pD[jj+0+ldd*(jj+0)] = c_00 * f_00_inv;
344 f_10 = c_10 * f_00_inv;
345 pD[jj+1+ldd*(jj+0)] = f_10;
346 c_11 -= f_10 * f_10;
347 if(c_11>0)
348 {
349 f_11_inv = 1.0/sqrt(c_11);
350 }
351 else
352 {
353 f_11_inv = 0.0;
354 }
355 dD[jj+1] = f_11_inv;
356 pD[jj+1+ldd*(jj+1)] = c_11 * f_11_inv;
357 // solve lower
358 ii = jj+2;
359 for(; ii<m-1; ii+=2)
360 {
361 c_00 = pC[ii+0+ldc*(jj+0)];
362 c_10 = pC[ii+1+ldc*(jj+0)];
363 c_01 = pC[ii+0+ldc*(jj+1)];
364 c_11 = pC[ii+1+ldc*(jj+1)];
365 for(kk=0; kk<k; kk++)
366 {
367 c_00 += pA[ii+0+lda*kk] * pB[jj+0+ldb*kk];
368 c_10 += pA[ii+1+lda*kk] * pB[jj+0+ldb*kk];
369 c_01 += pA[ii+0+lda*kk] * pB[jj+1+ldb*kk];
370 c_11 += pA[ii+1+lda*kk] * pB[jj+1+ldb*kk];
371 }
372 for(kk=0; kk<jj; kk++)
373 {
374 c_00 -= pD[ii+0+ldd*kk] * pD[jj+0+ldd*kk];
375 c_10 -= pD[ii+1+ldd*kk] * pD[jj+0+ldd*kk];
376 c_01 -= pD[ii+0+ldd*kk] * pD[jj+1+ldd*kk];
377 c_11 -= pD[ii+1+ldd*kk] * pD[jj+1+ldd*kk];
378 }
379 c_00 *= f_00_inv;
380 c_10 *= f_00_inv;
381 pD[ii+0+ldd*(jj+0)] = c_00;
382 pD[ii+1+ldd*(jj+0)] = c_10;
383 c_01 -= c_00 * f_10;
384 c_11 -= c_10 * f_10;
385 pD[ii+0+ldd*(jj+1)] = c_01 * f_11_inv;
386 pD[ii+1+ldd*(jj+1)] = c_11 * f_11_inv;
387 }
388 for(; ii<m; ii++)
389 {
390 c_00 = pC[ii+0+ldc*(jj+0)];
391 c_01 = pC[ii+0+ldc*(jj+1)];
392 for(kk=0; kk<k; kk++)
393 {
394 c_00 += pA[ii+0+lda*kk] * pB[jj+0+ldb*kk];
395 c_01 += pA[ii+0+lda*kk] * pB[jj+1+ldb*kk];
396 }
397 for(kk=0; kk<jj; kk++)
398 {
399 c_00 -= pD[ii+0+ldd*kk] * pD[jj+0+ldd*kk];
400 c_01 -= pD[ii+0+ldd*kk] * pD[jj+1+ldd*kk];
401 }
402 c_00 *= f_00_inv;
403 pD[ii+0+ldd*(jj+0)] = c_00;
404 c_01 -= c_00 * f_10;
405 pD[ii+0+ldd*(jj+1)] = c_01 * f_11_inv;
406 }
407 }
408 for(; jj<n; jj++)
409 {
410 // factorize diagonal
411 c_00 = pC[jj+ldc*jj];;
412 for(kk=0; kk<k; kk++)
413 {
414 c_00 += pA[jj+lda*kk] * pB[jj+ldb*kk];
415 }
416 for(kk=0; kk<jj; kk++)
417 {
418 c_00 -= pD[jj+ldd*kk] * pD[jj+ldd*kk];
419 }
420 if(c_00>0)
421 {
422 f_00_inv = 1.0/sqrt(c_00);
423 }
424 else
425 {
426 f_00_inv = 0.0;
427 }
428 dD[jj] = f_00_inv;
429 pD[jj+ldd*jj] = c_00 * f_00_inv;
430 // solve lower
431 for(ii=jj+1; ii<m; ii++)
432 {
433 c_00 = pC[ii+ldc*jj];
434 for(kk=0; kk<k; kk++)
435 {
436 c_00 += pA[ii+lda*kk] * pB[jj+ldb*kk];
437 }
438 for(kk=0; kk<jj; kk++)
439 {
440 c_00 -= pD[ii+ldd*kk] * pD[jj+ldd*kk];
441 }
442 pD[ii+ldd*jj] = c_00 * f_00_inv;
443 }
444 }
445 return;
446 }
447
448
449
450// dgetrf without pivoting
451void GETF2_NOPIVOT(int m, int n, REAL *A, int lda, REAL *dA)
452 {
453 int ii, jj, kk, itmp0, itmp1;
454 int iimax = m<n ? m : n;
455 int i1 = 1;
456 REAL dtmp;
457 REAL dm1 = -1.0;
458
459 for(ii=0; ii<iimax; ii++)
460 {
461 itmp0 = m-ii-1;
462 dtmp = 1.0/A[ii+lda*ii];
463 dA[ii] = dtmp;
464 for(jj=0; jj<itmp0; jj++)
465 {
466 A[ii+1+jj+lda*ii] *= dtmp;
467 }
468 itmp1 = n-ii-1;
469 for(jj=0; jj<itmp1; jj++)
470 {
471 for(kk=0; kk<itmp0; kk++)
472 {
473 A[(ii+1+kk)+lda*(ii+1+jj)] -= A[(ii+1+kk)+lda*ii] * A[ii+lda*(ii+1+jj)];
474 }
475 }
476 }
477 return;
478 }
479
480
481
482// dgetrf without pivoting
483void GETRF_NOPIVOT_LIBSTR(int m, int n, struct STRMAT *sC, int ci, int cj, struct STRMAT *sD, int di, int dj)
484 {
485 int ii, jj, kk;
486// int i1 = 1;
487// REAL d1 = 1.0;
488 REAL
489 d_00_inv, d_11_inv,
490 d_00, d_01,
491 d_10, d_11;
492 int ldc = sC->m;
493 int ldd = sD->m;
494 REAL *pC = sC->pA + ci + cj*ldc;
495 REAL *pD = sD->pA + di + dj*ldd;
496 REAL *dD = sD->dA;
497 if(di==0 & dj==0)
498 sD->use_dA = 1;
499 else
500 sD->use_dA = 0;
501#if 1
502 jj = 0;
503 for(; jj<n-1; jj+=2)
504 {
505 // upper
506 ii = 0;
507 for(; ii<jj-1; ii+=2)
508 {
509 // correct upper
510 d_00 = pC[(ii+0)+ldc*(jj+0)];
511 d_10 = pC[(ii+1)+ldc*(jj+0)];
512 d_01 = pC[(ii+0)+ldc*(jj+1)];
513 d_11 = pC[(ii+1)+ldc*(jj+1)];
514 for(kk=0; kk<ii; kk++)
515 {
516 d_00 -= pD[(ii+0)+ldd*kk] * pD[kk+ldd*(jj+0)];
517 d_10 -= pD[(ii+1)+ldd*kk] * pD[kk+ldd*(jj+0)];
518 d_01 -= pD[(ii+0)+ldd*kk] * pD[kk+ldd*(jj+1)];
519 d_11 -= pD[(ii+1)+ldd*kk] * pD[kk+ldd*(jj+1)];
520 }
521 // solve upper
522 d_10 -= pD[(ii+1)+ldd*kk] * d_00;
523 d_11 -= pD[(ii+1)+ldd*kk] * d_01;
524 pD[(ii+0)+ldd*(jj+0)] = d_00;
525 pD[(ii+1)+ldd*(jj+0)] = d_10;
526 pD[(ii+0)+ldd*(jj+1)] = d_01;
527 pD[(ii+1)+ldd*(jj+1)] = d_11;
528 }
529 for(; ii<jj; ii++)
530 {
531 // correct upper
532 d_00 = pC[(ii+0)+ldc*(jj+0)];
533 d_01 = pC[(ii+0)+ldc*(jj+1)];
534 for(kk=0; kk<ii; kk++)
535 {
536 d_00 -= pD[(ii+0)+ldd*kk] * pD[kk+ldd*(jj+0)];
537 d_01 -= pD[(ii+0)+ldd*kk] * pD[kk+ldd*(jj+1)];
538 }
539 // solve upper
540 pD[(ii+0)+ldd*(jj+0)] = d_00;
541 pD[(ii+0)+ldd*(jj+1)] = d_01;
542 }
543 // diagonal
544 ii = jj;
545 if(ii<m-1)
546 {
547 // correct diagonal
548 d_00 = pC[(ii+0)+ldc*(jj+0)];
549 d_10 = pC[(ii+1)+ldc*(jj+0)];
550 d_01 = pC[(ii+0)+ldc*(jj+1)];
551 d_11 = pC[(ii+1)+ldc*(jj+1)];
552 for(kk=0; kk<ii; kk++)
553 {
554 d_00 -= pD[(ii+0)+ldd*kk] * pD[kk+ldd*(jj+0)];
555 d_10 -= pD[(ii+1)+ldd*kk] * pD[kk+ldd*(jj+0)];
556 d_01 -= pD[(ii+0)+ldd*kk] * pD[kk+ldd*(jj+1)];
557 d_11 -= pD[(ii+1)+ldd*kk] * pD[kk+ldd*(jj+1)];
558 }
559 // factorize diagonal
560 d_00_inv = 1.0/d_00;
561 d_10 *= d_00_inv;
562 d_11 -= d_10 * d_01;
563 d_11_inv = 1.0/d_11;
564 pD[(ii+0)+ldd*(jj+0)] = d_00;
565 pD[(ii+1)+ldd*(jj+0)] = d_10;
566 pD[(ii+0)+ldd*(jj+1)] = d_01;
567 pD[(ii+1)+ldd*(jj+1)] = d_11;
568 dD[ii+0] = d_00_inv;
569 dD[ii+1] = d_11_inv;
570 ii += 2;
571 }
572 else if(ii<m)
573 {
574 // correct diagonal
575 d_00 = pC[(ii+0)+ldc*(jj+0)];
576 d_01 = pC[(ii+0)+ldc*(jj+1)];
577 for(kk=0; kk<ii; kk++)
578 {
579 d_00 -= pD[(ii+0)+ldd*kk] * pD[kk+ldd*(jj+0)];
580 d_01 -= pD[(ii+0)+ldd*kk] * pD[kk+ldd*(jj+1)];
581 }
582 // factorize diagonal
583 d_00_inv = 1.0/d_00;
584 pD[(ii+0)+ldd*(jj+0)] = d_00;
585 pD[(ii+0)+ldd*(jj+1)] = d_01;
586 dD[ii+0] = d_00_inv;
587 ii += 1;
588 }
589 // lower
590 for(; ii<m-1; ii+=2)
591 {
592 // correct lower
593 d_00 = pC[(ii+0)+ldc*(jj+0)];
594 d_10 = pC[(ii+1)+ldc*(jj+0)];
595 d_01 = pC[(ii+0)+ldc*(jj+1)];
596 d_11 = pC[(ii+1)+ldc*(jj+1)];
597 for(kk=0; kk<jj; kk++)
598 {
599 d_00 -= pD[(ii+0)+ldd*kk] * pD[kk+ldd*(jj+0)];
600 d_10 -= pD[(ii+1)+ldd*kk] * pD[kk+ldd*(jj+0)];
601 d_01 -= pD[(ii+0)+ldd*kk] * pD[kk+ldd*(jj+1)];
602 d_11 -= pD[(ii+1)+ldd*kk] * pD[kk+ldd*(jj+1)];
603 }
604 // solve lower
605 d_00 *= d_00_inv;
606 d_10 *= d_00_inv;
607 d_01 -= d_00 * pD[kk+ldd*(jj+1)];
608 d_11 -= d_10 * pD[kk+ldd*(jj+1)];
609 d_01 *= d_11_inv;
610 d_11 *= d_11_inv;
611 pD[(ii+0)+ldd*(jj+0)] = d_00;
612 pD[(ii+1)+ldd*(jj+0)] = d_10;
613 pD[(ii+0)+ldd*(jj+1)] = d_01;
614 pD[(ii+1)+ldd*(jj+1)] = d_11;
615 }
616 for(; ii<m; ii++)
617 {
618 // correct lower
619 d_00 = pC[(ii+0)+ldc*(jj+0)];
620 d_01 = pC[(ii+0)+ldc*(jj+1)];
621 for(kk=0; kk<jj; kk++)
622 {
623 d_00 -= pD[(ii+0)+ldd*kk] * pD[kk+ldd*(jj+0)];
624 d_01 -= pD[(ii+0)+ldd*kk] * pD[kk+ldd*(jj+1)];
625 }
626 // solve lower
627 d_00 *= d_00_inv;
628 d_01 -= d_00 * pD[kk+ldd*(jj+1)];
629 d_01 *= d_11_inv;
630 pD[(ii+0)+ldd*(jj+0)] = d_00;
631 pD[(ii+0)+ldd*(jj+1)] = d_01;
632 }
633 }
634 for(; jj<n; jj++)
635 {
636 // upper
637 ii = 0;
638 for(; ii<jj-1; ii+=2)
639 {
640 // correct upper
641 d_00 = pC[(ii+0)+ldc*jj];
642 d_10 = pC[(ii+1)+ldc*jj];
643 for(kk=0; kk<ii; kk++)
644 {
645 d_00 -= pD[(ii+0)+ldd*kk] * pD[kk+ldd*jj];
646 d_10 -= pD[(ii+1)+ldd*kk] * pD[kk+ldd*jj];
647 }
648 // solve upper
649 d_10 -= pD[(ii+1)+ldd*kk] * d_00;
650 pD[(ii+0)+ldd*jj] = d_00;
651 pD[(ii+1)+ldd*jj] = d_10;
652 }
653 for(; ii<jj; ii++)
654 {
655 // correct upper
656 d_00 = pC[(ii+0)+ldc*jj];
657 for(kk=0; kk<ii; kk++)
658 {
659 d_00 -= pD[(ii+0)+ldd*kk] * pD[kk+ldd*jj];
660 }
661 // solve upper
662 pD[(ii+0)+ldd*jj] = d_00;
663 }
664 // diagonal
665 ii = jj;
666 if(ii<m-1)
667 {
668 // correct diagonal
669 d_00 = pC[(ii+0)+ldc*jj];
670 d_10 = pC[(ii+1)+ldc*jj];
671 for(kk=0; kk<ii; kk++)
672 {
673 d_00 -= pD[(ii+0)+ldd*kk] * pD[kk+ldd*jj];
674 d_10 -= pD[(ii+1)+ldd*kk] * pD[kk+ldd*jj];
675 }
676 // factorize diagonal
677 d_00_inv = 1.0/d_00;
678 d_10 *= d_00_inv;
679 pD[(ii+0)+ldd*jj] = d_00;
680 pD[(ii+1)+ldd*jj] = d_10;
681 dD[ii+0] = d_00_inv;
682 ii += 2;
683 }
684 else if(ii<m)
685 {
686 // correct diagonal
687 d_00 = pC[(ii+0)+ldc*jj];
688 for(kk=0; kk<ii; kk++)
689 {
690 d_00 -= pD[(ii+0)+ldd*kk] * pD[kk+ldd*jj];
691 }
692 // factorize diagonal
693 d_00_inv = 1.0/d_00;
694 pD[(ii+0)+ldd*jj] = d_00;
695 dD[ii+0] = d_00_inv;
696 ii += 1;
697 }
698 // lower
699 for(; ii<m-1; ii+=2)
700 {
701 // correct lower
702 d_00 = pC[(ii+0)+ldc*jj];
703 d_10 = pC[(ii+1)+ldc*jj];
704 for(kk=0; kk<jj; kk++)
705 {
706 d_00 -= pD[(ii+0)+ldd*kk] * pD[kk+ldd*jj];
707 d_10 -= pD[(ii+1)+ldd*kk] * pD[kk+ldd*jj];
708 }
709 // solve lower
710 d_00 *= d_00_inv;
711 d_10 *= d_00_inv;
712 pD[(ii+0)+ldd*jj] = d_00;
713 pD[(ii+1)+ldd*jj] = d_10;
714 }
715 for(; ii<m; ii++)
716 {
717 // correct lower
718 d_00 = pC[(ii+0)+ldc*jj];
719 for(kk=0; kk<jj; kk++)
720 {
721 d_00 -= pD[(ii+0)+ldd*kk] * pD[kk+ldd*jj];
722 }
723 // solve lower
724 d_00 *= d_00_inv;
725 pD[(ii+0)+ldd*jj] = d_00;
726 }
727 }
728#else
729 if(pC!=pD)
730 {
731 for(jj=0; jj<n; jj++)
732 {
733 for(ii=0; ii<m; ii++)
734 {
735 pD[ii+ldd*jj] = pC[ii+ldc*jj];
736 }
737 }
738 }
739 GETF2_NOPIVOT(m, n, pD, ldd, dD);
740#endif
741 return;
742 }
743
744
745
746// dgetrf pivoting
747void GETRF_LIBSTR(int m, int n, struct STRMAT *sC, int ci, int cj, struct STRMAT *sD, int di, int dj, int *ipiv)
748 {
749 int ii, i0, jj, kk, ip, itmp0, itmp1;
750 REAL dtmp, dmax;
751 REAL
752 d_00_inv, d_11_inv,
753 d_00, d_01,
754 d_10, d_11;
755 int i1 = 1;
756 REAL d1 = 1.0;
757 int ldc = sC->m;
758 int ldd = sD->m;
759 REAL *pC = sC->pA+ci+cj*ldc;
760 REAL *pD = sD->pA+di+dj*ldd;
761 REAL *dD = sD->dA;
762 if(di==0 & dj==0)
763 sD->use_dA = 1;
764 else
765 sD->use_dA = 0;
766 // copy if needed
767 if(pC!=pD)
768 {
769 for(jj=0; jj<n; jj++)
770 {
771 for(ii=0; ii<m; ii++)
772 {
773 pD[ii+ldd*jj] = pC[ii+ldc*jj];
774 }
775 }
776 }
777 // factorize
778#if 1
779 jj = 0;
780 for(; jj<n-1; jj+=2)
781 {
782 ii = 0;
783 for(; ii<jj-1; ii+=2)
784 {
785 // correct upper
786 d_00 = pD[(ii+0)+ldd*(jj+0)];
787 d_10 = pD[(ii+1)+ldd*(jj+0)];
788 d_01 = pD[(ii+0)+ldd*(jj+1)];
789 d_11 = pD[(ii+1)+ldd*(jj+1)];
790 for(kk=0; kk<ii; kk++)
791 {
792 d_00 -= pD[(ii+0)+ldd*kk] * pD[kk+ldd*(jj+0)];
793 d_10 -= pD[(ii+1)+ldd*kk] * pD[kk+ldd*(jj+0)];
794 d_01 -= pD[(ii+0)+ldd*kk] * pD[kk+ldd*(jj+1)];
795 d_11 -= pD[(ii+1)+ldd*kk] * pD[kk+ldd*(jj+1)];
796 }
797 // solve upper
798 d_10 -= pD[(ii+1)+ldd*kk] * d_00;
799 d_11 -= pD[(ii+1)+ldd*kk] * d_01;
800 pD[(ii+0)+ldd*(jj+0)] = d_00;
801 pD[(ii+1)+ldd*(jj+0)] = d_10;
802 pD[(ii+0)+ldd*(jj+1)] = d_01;
803 pD[(ii+1)+ldd*(jj+1)] = d_11;
804 }
805 for(; ii<jj; ii++)
806 {
807 // correct upper
808 d_00 = pD[(ii+0)+ldd*(jj+0)];
809 d_01 = pD[(ii+0)+ldd*(jj+1)];
810 for(kk=0; kk<ii; kk++)
811 {
812 d_00 -= pD[(ii+0)+ldd*kk] * pD[kk+ldd*(jj+0)];
813 d_01 -= pD[(ii+0)+ldd*kk] * pD[kk+ldd*(jj+1)];
814 }
815 // solve upper
816 pD[(ii+0)+ldd*(jj+0)] = d_00;
817 pD[(ii+0)+ldd*(jj+1)] = d_01;
818 }
819 // correct diagonal and lower and look for pivot
820 // correct
821 ii = jj;
822 i0 = ii;
823 for(; ii<m-1; ii+=2)
824 {
825 d_00 = pD[(ii+0)+ldd*(jj+0)];
826 d_10 = pD[(ii+1)+ldd*(jj+0)];
827 d_01 = pD[(ii+0)+ldd*(jj+1)];
828 d_11 = pD[(ii+1)+ldd*(jj+1)];
829 for(kk=0; kk<jj; kk++)
830 {
831 d_00 -= pD[(ii+0)+ldd*kk] * pD[kk+ldd*(jj+0)];
832 d_10 -= pD[(ii+1)+ldd*kk] * pD[kk+ldd*(jj+0)];
833 d_01 -= pD[(ii+0)+ldd*kk] * pD[kk+ldd*(jj+1)];
834 d_11 -= pD[(ii+1)+ldd*kk] * pD[kk+ldd*(jj+1)];
835 }
836 pD[(ii+0)+ldd*(jj+0)] = d_00;
837 pD[(ii+1)+ldd*(jj+0)] = d_10;
838 pD[(ii+0)+ldd*(jj+1)] = d_01;
839 pD[(ii+1)+ldd*(jj+1)] = d_11;
840 }
841 for(; ii<m; ii++)
842 {
843 d_00 = pD[(ii+0)+ldd*(jj+0)];
844 d_01 = pD[(ii+0)+ldd*(jj+1)];
845 for(kk=0; kk<jj; kk++)
846 {
847 d_00 -= pD[(ii+0)+ldd*kk] * pD[kk+ldd*(jj+0)];
848 d_01 -= pD[(ii+0)+ldd*kk] * pD[kk+ldd*(jj+1)];
849 }
850 pD[(ii+0)+ldd*(jj+0)] = d_00;
851 pD[(ii+0)+ldd*(jj+1)] = d_01;
852 }
853 // look for pivot & solve
854 // left column
855 ii = i0;
856 dmax = 0;
857 ip = ii;
858 for(; ii<m-1; ii+=2)
859 {
860 d_00 = pD[(ii+0)+ldd*jj];
861 d_10 = pD[(ii+1)+ldd*jj];
862 dtmp = d_00>0.0 ? d_00 : -d_00;
863 if(dtmp>dmax)
864 {
865 dmax = dtmp;
866 ip = ii+0;
867 }
868 dtmp = d_10>0.0 ? d_10 : -d_10;
869 if(dtmp>dmax)
870 {
871 dmax = dtmp;
872 ip = ii+1;
873 }
874 }
875 for(; ii<m; ii++)
876 {
877 d_00 = pD[(ii+0)+ldd*jj];
878 dtmp = d_00>0.0 ? d_00 : -d_00;
879 if(dtmp>dmax)
880 {
881 dmax = dtmp;
882 ip = ii+0;
883 }
884 }
885 // row swap
886 ii = i0;
887 ipiv[ii] = ip;
888 if(ip!=ii)
889 {
890 for(kk=0; kk<n; kk++)
891 {
892 dtmp = pD[ii+ldd*kk];
893 pD[ii+ldd*kk] = pD[ip+ldd*kk];
894 pD[ip+ldd*kk] = dtmp;
895 }
896 }
897 // factorize diagonal
898 d_00 = pD[(ii+0)+ldd*(jj+0)];
899 d_00_inv = 1.0/d_00;
900 pD[(ii+0)+ldd*(jj+0)] = d_00;
901 dD[ii] = d_00_inv;
902 ii += 1;
903 // solve & compute next pivot
904 dmax = 0;
905 ip = ii;
906 for(; ii<m-1; ii+=2)
907 {
908 d_00 = pD[(ii+0)+ldd*(jj+0)];
909 d_10 = pD[(ii+1)+ldd*(jj+0)];
910 d_00 *= d_00_inv;
911 d_10 *= d_00_inv;
912 d_01 = pD[(ii+0)+ldd*(jj+1)];
913 d_11 = pD[(ii+1)+ldd*(jj+1)];
914 d_01 -= d_00 * pD[jj+ldd*(jj+1)];
915 d_11 -= d_10 * pD[jj+ldd*(jj+1)];
916 dtmp = d_01>0.0 ? d_01 : -d_01;
917 if(dtmp>dmax)
918 {
919 dmax = dtmp;
920 ip = ii+0;
921 }
922 dtmp = d_11>0.0 ? d_11 : -d_11;
923 if(dtmp>dmax)
924 {
925 dmax = dtmp;
926 ip = ii+1;
927 }
928 pD[(ii+0)+ldd*(jj+0)] = d_00;
929 pD[(ii+1)+ldd*(jj+0)] = d_10;
930 pD[(ii+0)+ldd*(jj+1)] = d_01;
931 pD[(ii+1)+ldd*(jj+1)] = d_11;
932 }
933 for(; ii<m; ii++)
934 {
935 d_00 = pD[(ii+0)+ldd*(jj+0)];
936 d_00 *= d_00_inv;
937 d_01 = pD[(ii+0)+ldd*(jj+1)];
938 d_01 -= d_00 * pD[jj+ldd*(jj+1)];
939 dtmp = d_01>0.0 ? d_01 : -d_01;
940 if(dtmp>dmax)
941 {
942 dmax = dtmp;
943 ip = ii+0;
944 }
945 pD[(ii+0)+ldd*(jj+0)] = d_00;
946 pD[(ii+0)+ldd*(jj+1)] = d_01;
947 }
948 // row swap
949 ii = i0+1;
950 ipiv[ii] = ip;
951 if(ip!=ii)
952 {
953 for(kk=0; kk<n; kk++)
954 {
955 dtmp = pD[ii+ldd*kk];
956 pD[ii+ldd*kk] = pD[ip+ldd*kk];
957 pD[ip+ldd*kk] = dtmp;
958 }
959 }
960 // factorize diagonal
961 d_00 = pD[ii+ldd*(jj+1)];
962 d_00_inv = 1.0/d_00;
963 pD[ii+ldd*(jj+1)] = d_00;
964 dD[ii] = d_00_inv;
965 ii += 1;
966 // solve lower
967 for(; ii<m; ii++)
968 {
969 d_00 = pD[ii+ldd*(jj+1)];
970 d_00 *= d_00_inv;
971 pD[ii+ldd*(jj+1)] = d_00;
972 }
973 }
974 for(; jj<n; jj++)
975 {
976 ii = 0;
977 for(; ii<jj-1; ii+=2)
978 {
979 // correct upper
980 d_00 = pD[(ii+0)+ldd*jj];
981 d_10 = pD[(ii+1)+ldd*jj];
982 for(kk=0; kk<ii; kk++)
983 {
984 d_00 -= pD[(ii+0)+ldd*kk] * pD[kk+ldd*jj];
985 d_10 -= pD[(ii+1)+ldd*kk] * pD[kk+ldd*jj];
986 }
987 // solve upper
988 d_10 -= pD[(ii+1)+ldd*kk] * d_00;
989 pD[(ii+0)+ldd*jj] = d_00;
990 pD[(ii+1)+ldd*jj] = d_10;
991 }
992 for(; ii<jj; ii++)
993 {
994 // correct upper
995 d_00 = pD[ii+ldd*jj];
996 for(kk=0; kk<ii; kk++)
997 {
998 d_00 -= pD[ii+ldd*kk] * pD[kk+ldd*jj];
999 }
1000 // solve upper
1001 pD[ii+ldd*jj] = d_00;
1002 }
1003 i0 = ii;
1004 ii = jj;
1005 // correct diagonal and lower and look for pivot
1006 dmax = 0;
1007 ip = ii;
1008 for(; ii<m-1; ii+=2)
1009 {
1010 d_00 = pD[(ii+0)+ldd*jj];
1011 d_10 = pD[(ii+1)+ldd*jj];
1012 for(kk=0; kk<jj; kk++)
1013 {
1014 d_00 -= pD[(ii+0)+ldd*kk] * pD[kk+ldd*jj];
1015 d_10 -= pD[(ii+1)+ldd*kk] * pD[kk+ldd*jj];
1016 }
1017 dtmp = d_00>0.0 ? d_00 : -d_00;
1018 if(dtmp>dmax)
1019 {
1020 dmax = dtmp;
1021 ip = ii+0;
1022 }
1023 dtmp = d_10>0.0 ? d_10 : -d_10;
1024 if(dtmp>dmax)
1025 {
1026 dmax = dtmp;
1027 ip = ii+1;
1028 }
1029 pD[(ii+0)+ldd*jj] = d_00;
1030 pD[(ii+1)+ldd*jj] = d_10;
1031 }
1032 for(; ii<m; ii++)
1033 {
1034 d_00 = pD[(ii+0)+ldd*jj];
1035 for(kk=0; kk<jj; kk++)
1036 {
1037 d_00 -= pD[(ii+0)+ldd*kk] * pD[kk+ldd*jj];
1038 }
1039 dtmp = d_00>0.0 ? d_00 : -d_00;
1040 if(dtmp>dmax)
1041 {
1042 dmax = dtmp;
1043 ip = ii+0;
1044 }
1045 pD[(ii+0)+ldd*jj] = d_00;
1046 }
1047 // row swap
1048 ii = i0;
1049 ipiv[ii] = ip;
1050 if(ip!=ii)
1051 {
1052 for(kk=0; kk<n; kk++)
1053 {
1054 dtmp = pD[ii+ldd*kk];
1055 pD[ii+ldd*kk] = pD[ip+ldd*kk];
1056 pD[ip+ldd*kk] = dtmp;
1057 }
1058 }
1059 // factorize diagonal
1060 d_00 = pD[ii+ldd*jj];
1061 d_00_inv = 1.0/d_00;
1062 pD[ii+ldd*jj] = d_00;
1063 dD[ii] = d_00_inv;
1064 ii += 1;
1065 for(; ii<m; ii++)
1066 {
1067 // correct lower
1068 d_00 = pD[ii+ldd*jj];
1069 // solve lower
1070 d_00 *= d_00_inv;
1071 pD[ii+ldd*jj] = d_00;
1072 }
1073 }
1074#else
1075 int iimax = m<n ? m : n;
1076 for(ii=0; ii<iimax; ii++)
1077 {
1078 dmax = (pD[ii+ldd*ii]>0 ? pD[ii+ldd*ii] : -pD[ii+ldd*ii]);
1079 ip = ii;
1080 for(jj=1; jj<m-ii; jj++)
1081 {
1082 dtmp = pD[ii+jj+ldd*ii]>0 ? pD[ii+jj+ldd*ii] : -pD[ii+jj+ldd*ii];
1083 if(dtmp>dmax)
1084 {
1085 dmax = dtmp;
1086 ip = ii+jj;
1087 }
1088 }
1089 ipiv[ii] = ip;
1090 if(ip!=ii)
1091 {
1092 for(jj=0; jj<n; jj++)
1093 {
1094 dtmp = pD[ii+jj*ldd];
1095 pD[ii+jj*ldd] = pD[ip+jj*ldd];
1096 pD[ip+jj*ldd] = dtmp;
1097 }
1098 }
1099 itmp0 = m-ii-1;
1100 dtmp = 1.0/pD[ii+ldd*ii];
1101 dD[ii] = dtmp;
1102 for(jj=0; jj<itmp0; jj++)
1103 {
1104 pD[ii+1+jj+ldd*ii] *= dtmp;
1105 }
1106 itmp1 = n-ii-1;
1107 for(jj=0; jj<itmp1; jj++)
1108 {
1109 for(kk=0; kk<itmp0; kk++)
1110 {
1111 pD[(ii+1+kk)+ldd*(ii+1+jj)] -= pD[(ii+1+kk)+ldd*ii] * pD[ii+ldd*(ii+1+jj)];
1112 }
1113 }
1114 }
1115#endif
1116 return;
1117 }
1118
1119
1120
1121int GEQRF_WORK_SIZE_LIBSTR(int m, int n)
1122 {
1123 return 0;
1124 }
1125
1126
1127
1128void GEQRF_LIBSTR(int m, int n, struct STRMAT *sA, int ai, int aj, struct STRMAT *sD, int di, int dj, void *work)
1129 {
1130 if(m<=0 | n<=0)
1131 return;
1132 int ii, jj, kk;
1133 int lda = sA->m;
1134 int ldd = sD->m;
1135 REAL *pA = sA->pA+ai+aj*lda;
1136 REAL *pD = sD->pA+di+dj*ldd; // matrix of QR
1137 REAL *dD = sD->dA+di; // vectors of tau
1138 REAL alpha, beta, tmp, w0, w1;
1139 REAL *pC00, *pC01, *pC11, *pv0, *pv1;
1140 REAL pW[4] = {0.0, 0.0, 0.0, 0.0};
1141 int ldw = 2;
1142 REAL pT[4] = {0.0, 0.0, 0.0, 0.0};
1143 int ldb = 2;
1144 int imax, jmax, kmax;
1145 // copy if needed
1146 if(pA!=pD)
1147 {
1148 for(jj=0; jj<n; jj++)
1149 {
1150 for(ii=0; ii<m; ii++)
1151 {
1152 pD[ii+ldd*jj] = pA[ii+lda*jj];
1153 }
1154 }
1155 }
1156 imax = m<n ? m : n;
1157 ii = 0;
1158#if 1
1159 for(; ii<imax-1; ii+=2)
1160 {
1161 // first column
1162 pC00 = &pD[ii+ldd*ii];
1163 beta = 0.0;
1164 for(jj=1; jj<m-ii; jj++)
1165 {
1166 tmp = pC00[jj+ldd*0];
1167 beta += tmp*tmp;
1168 }
1169 if(beta==0.0)
1170 {
1171 // tau0
1172 dD[ii] = 0.0;
1173 }
1174 else
1175 {
1176 alpha = pC00[0+ldd*0];
1177 beta += alpha*alpha;
1178 beta = sqrt(beta);
1179 if(alpha>0)
1180 beta = -beta;
1181 // tau0
1182 dD[ii] = (beta-alpha) / beta;
1183 tmp = 1.0 / (alpha-beta);
1184 // compute v0
1185 pC00[0+ldd*0] = beta;
1186 for(jj=1; jj<m-ii; jj++)
1187 {
1188 pC00[jj+ldd*0] *= tmp;
1189 }
1190 }
1191 // gemv_t & ger
1192 pC01 = &pC00[0+ldd*1];
1193 pv0 = &pC00[0+ldd*0];
1194 kmax = m-ii;
1195 w0 = pC01[0+ldd*0]; // pv0[0] = 1.0
1196 for(kk=1; kk<kmax; kk++)
1197 {
1198 w0 += pC01[kk+ldd*0] * pv0[kk];
1199 }
1200 w0 = - dD[ii] * w0;
1201 pC01[0+ldd*0] += w0; // pv0[0] = 1.0
1202 for(kk=1; kk<kmax; kk++)
1203 {
1204 pC01[kk+ldd*0] += w0 * pv0[kk];
1205 }
1206 // second column
1207 pC11 = &pD[(ii+1)+ldd*(ii+1)];
1208 beta = 0.0;
1209 for(jj=1; jj<m-(ii+1); jj++)
1210 {
1211 tmp = pC11[jj+ldd*0];
1212 beta += tmp*tmp;
1213 }
1214 if(beta==0.0)
1215 {
1216 // tau1
1217 dD[(ii+1)] = 0.0;
1218 }
1219 else
1220 {
1221 alpha = pC11[0+ldd*0];
1222 beta += alpha*alpha;
1223 beta = sqrt(beta);
1224 if(alpha>0)
1225 beta = -beta;
1226 // tau1
1227 dD[(ii+1)] = (beta-alpha) / beta;
1228 tmp = 1.0 / (alpha-beta);
1229 // compute v1
1230 pC11[0+ldd*0] = beta;
1231 for(jj=1; jj<m-(ii+1); jj++)
1232 pC11[jj+ldd*0] *= tmp;
1233 }
1234 // compute lower triangular T containing tau for matrix update
1235 pv0 = &pC00[0+ldd*0];
1236 pv1 = &pC00[0+ldd*1];
1237 kmax = m-ii;
1238 tmp = pv0[1];
1239 for(kk=2; kk<kmax; kk++)
1240 tmp += pv0[kk]*pv1[kk];
1241 pT[0+ldb*0] = dD[ii+0];
1242 pT[1+ldb*0] = - dD[ii+1] * tmp * dD[ii+0];
1243 pT[1+ldb*1] = dD[ii+1];
1244 jmax = n-ii-2;
1245 jj = 0;
1246 for(; jj<jmax-1; jj+=2)
1247 {
1248 // compute W^T = C^T * V
1249 pW[0+ldw*0] = pC00[0+ldd*(jj+0+2)] + pC00[1+ldd*(jj+0+2)] * pv0[1];
1250 pW[1+ldw*0] = pC00[0+ldd*(jj+1+2)] + pC00[1+ldd*(jj+1+2)] * pv0[1];
1251 pW[0+ldw*1] = pC00[1+ldd*(jj+0+2)];
1252 pW[1+ldw*1] = pC00[1+ldd*(jj+1+2)];
1253 kk = 2;
1254 for(; kk<kmax; kk++)
1255 {
1256 tmp = pC00[kk+ldd*(jj+0+2)];
1257 pW[0+ldw*0] += tmp * pv0[kk];
1258 pW[0+ldw*1] += tmp * pv1[kk];
1259 tmp = pC00[kk+ldd*(jj+1+2)];
1260 pW[1+ldw*0] += tmp * pv0[kk];
1261 pW[1+ldw*1] += tmp * pv1[kk];
1262 }
1263 // compute W^T *= T
1264 pW[0+ldw*1] = pT[1+ldb*0]*pW[0+ldw*0] + pT[1+ldb*1]*pW[0+ldw*1];
1265 pW[1+ldw*1] = pT[1+ldb*0]*pW[1+ldw*0] + pT[1+ldb*1]*pW[1+ldw*1];
1266 pW[0+ldw*0] = pT[0+ldb*0]*pW[0+ldw*0];
1267 pW[1+ldw*0] = pT[0+ldb*0]*pW[1+ldw*0];
1268 // compute C -= V * W^T
1269 pC00[0+ldd*(jj+0+2)] -= pW[0+ldw*0];
1270 pC00[0+ldd*(jj+1+2)] -= pW[1+ldw*0];
1271 pC00[1+ldd*(jj+0+2)] -= pv0[1]*pW[0+ldw*0] + pW[0+ldw*1];
1272 pC00[1+ldd*(jj+1+2)] -= pv0[1]*pW[1+ldw*0] + pW[1+ldw*1];
1273 kk = 2;
1274 for(; kk<kmax-1; kk+=2)
1275 {
1276 pC00[kk+0+ldd*(jj+0+2)] -= pv0[kk+0]*pW[0+ldw*0] + pv1[kk+0]*pW[0+ldw*1];
1277 pC00[kk+1+ldd*(jj+0+2)] -= pv0[kk+1]*pW[0+ldw*0] + pv1[kk+1]*pW[0+ldw*1];
1278 pC00[kk+0+ldd*(jj+1+2)] -= pv0[kk+0]*pW[1+ldw*0] + pv1[kk+0]*pW[1+ldw*1];
1279 pC00[kk+1+ldd*(jj+1+2)] -= pv0[kk+1]*pW[1+ldw*0] + pv1[kk+1]*pW[1+ldw*1];
1280 }
1281 for(; kk<kmax; kk++)
1282 {
1283 pC00[kk+ldd*(jj+0+2)] -= pv0[kk]*pW[0+ldw*0] + pv1[kk]*pW[0+ldw*1];
1284 pC00[kk+ldd*(jj+1+2)] -= pv0[kk]*pW[1+ldw*0] + pv1[kk]*pW[1+ldw*1];
1285 }
1286 }
1287 for(; jj<jmax; jj++)
1288 {
1289 // compute W = T * V^T * C
1290 pW[0+ldw*0] = pC00[0+ldd*(jj+0+2)] + pC00[1+ldd*(jj+0+2)] * pv0[1];
1291 pW[0+ldw*1] = pC00[1+ldd*(jj+0+2)];
1292 for(kk=2; kk<kmax; kk++)
1293 {
1294 tmp = pC00[kk+ldd*(jj+0+2)];
1295 pW[0+ldw*0] += tmp * pv0[kk];
1296 pW[0+ldw*1] += tmp * pv1[kk];
1297 }
1298 pW[0+ldw*1] = pT[1+ldb*0]*pW[0+ldw*0] + pT[1+ldb*1]*pW[0+ldw*1];
1299 pW[0+ldw*0] = pT[0+ldb*0]*pW[0+ldw*0];
1300 // compute C -= V * W^T
1301 pC00[0+ldd*(jj+0+2)] -= pW[0+ldw*0];
1302 pC00[1+ldd*(jj+0+2)] -= pv0[1]*pW[0+ldw*0] + pW[0+ldw*1];
1303 for(kk=2; kk<kmax; kk++)
1304 {
1305 pC00[kk+ldd*(jj+0+2)] -= pv0[kk]*pW[0+ldw*0] + pv1[kk]*pW[0+ldw*1];
1306 }
1307 }
1308 }
1309#endif
1310 for(; ii<imax; ii++)
1311 {
1312 pC00 = &pD[ii+ldd*ii];
1313 beta = 0.0;
1314 for(jj=1; jj<m-ii; jj++)
1315 {
1316 tmp = pC00[jj+ldd*0];
1317 beta += tmp*tmp;
1318 }
1319 if(beta==0.0)
1320 {
1321 dD[ii] = 0.0;
1322 }
1323 else
1324 {
1325 alpha = pC00[0+ldd*0];
1326 beta += alpha*alpha;
1327 beta = sqrt(beta);
1328 if(alpha>0)
1329 beta = -beta;
1330 dD[ii] = (beta-alpha) / beta;
1331 tmp = 1.0 / (alpha-beta);
1332 for(jj=1; jj<m-ii; jj++)
1333 pC00[jj+ldd*0] *= tmp;
1334 pC00[0+ldd*0] = beta;
1335 }
1336 if(ii<n)
1337 {
1338 // gemv_t & ger
1339 pC01 = &pC00[0+ldd*1];
1340 pv0 = &pC00[0+ldd*0];
1341 jmax = n-ii-1;
1342 kmax = m-ii;
1343 jj = 0;
1344 for(; jj<jmax-1; jj+=2)
1345 {
1346 w0 = pC01[0+ldd*(jj+0)]; // pv0[0] = 1.0
1347 w1 = pC01[0+ldd*(jj+1)]; // pv0[0] = 1.0
1348 for(kk=1; kk<kmax; kk++)
1349 {
1350 w0 += pC01[kk+ldd*(jj+0)] * pv0[kk];
1351 w1 += pC01[kk+ldd*(jj+1)] * pv0[kk];
1352 }
1353 w0 = - dD[ii] * w0;
1354 w1 = - dD[ii] * w1;
1355 pC01[0+ldd*(jj+0)] += w0; // pv0[0] = 1.0
1356 pC01[0+ldd*(jj+1)] += w1; // pv0[0] = 1.0
1357 for(kk=1; kk<kmax; kk++)
1358 {
1359 pC01[kk+ldd*(jj+0)] += w0 * pv0[kk];
1360 pC01[kk+ldd*(jj+1)] += w1 * pv0[kk];
1361 }
1362 }
1363 for(; jj<jmax; jj++)
1364 {
1365 w0 = pC01[0+ldd*jj]; // pv0[0] = 1.0
1366 for(kk=1; kk<kmax; kk++)
1367 {
1368 w0 += pC01[kk+ldd*jj] * pv0[kk];
1369 }
1370 w0 = - dD[ii] * w0;
1371 pC01[0+ldd*jj] += w0; // pv0[0] = 1.0
1372 for(kk=1; kk<kmax; kk++)
1373 {
1374 pC01[kk+ldd*jj] += w0 * pv0[kk];
1375 }
1376 }
1377 }
1378 }
1379 return;
1380 }
1381
1382
1383
1384int GELQF_WORK_SIZE_LIBSTR(int m, int n)
1385 {
1386 return 0;
1387 }
1388
1389
1390
1391void GELQF_LIBSTR(int m, int n, struct STRMAT *sA, int ai, int aj, struct STRMAT *sD, int di, int dj, void *work)
1392 {
1393 if(m<=0 | n<=0)
1394 return;
1395 int ii, jj, kk;
1396 int lda = sA->m;
1397 int ldd = sD->m;
1398 REAL *pA = sA->pA+ai+aj*lda;
1399 REAL *pD = sD->pA+di+dj*ldd; // matrix of QR
1400 REAL *dD = sD->dA+di; // vectors of tau
1401 REAL alpha, beta, tmp, w0, w1;
1402 REAL *pC00, *pC10, *pC11, *pv0, *pv1;
1403 REAL pW[4] = {0.0, 0.0, 0.0, 0.0};
1404 int ldw = 2;
1405 REAL pT[4] = {0.0, 0.0, 0.0, 0.0};
1406 int ldb = 2;
1407 int imax, jmax, kmax;
1408 // copy if needed
1409 if(pA!=pD)
1410 {
1411 for(jj=0; jj<n; jj++)
1412 {
1413 for(ii=0; ii<m; ii++)
1414 {
1415 pD[ii+ldd*jj] = pA[ii+lda*jj];
1416 }
1417 }
1418 }
1419 imax = m<n ? m : n;
1420 ii = 0;
1421#if 1
1422 for(; ii<imax-1; ii+=2)
1423 {
1424 // first column
1425 pC00 = &pD[ii+ldd*ii];
1426 beta = 0.0;
1427 for(jj=1; jj<n-ii; jj++)
1428 {
1429 tmp = pC00[0+ldd*jj];
1430 beta += tmp*tmp;
1431 }
1432 if(beta==0.0)
1433 {
1434 // tau0
1435 dD[ii] = 0.0;
1436 }
1437 else
1438 {
1439 alpha = pC00[0+ldd*0];
1440 beta += alpha*alpha;
1441 beta = sqrt(beta);
1442 if(alpha>0)
1443 beta = -beta;
1444 // tau0
1445 dD[ii] = (beta-alpha) / beta;
1446 tmp = 1.0 / (alpha-beta);
1447 // compute v0
1448 pC00[0+ldd*0] = beta;
1449 for(jj=1; jj<n-ii; jj++)
1450 {
1451 pC00[0+ldd*jj] *= tmp;
1452 }
1453 }
1454 // gemv_t & ger
1455 pC10 = &pC00[1+ldd*0];
1456 pv0 = &pC00[0+ldd*0];
1457 kmax = n-ii;
1458 w0 = pC10[0+ldd*0]; // pv0[0] = 1.0
1459 for(kk=1; kk<kmax; kk++)
1460 {
1461 w0 += pC10[0+ldd*kk] * pv0[0+ldd*kk];
1462 }
1463 w0 = - dD[ii] * w0;
1464 pC10[0+ldd*0] += w0; // pv0[0] = 1.0
1465 for(kk=1; kk<kmax; kk++)
1466 {
1467 pC10[0+ldd*kk] += w0 * pv0[0+ldd*kk];
1468 }
1469 // second row
1470 pC11 = &pD[(ii+1)+ldd*(ii+1)];
1471 beta = 0.0;
1472 for(jj=1; jj<n-(ii+1); jj++)
1473 {
1474 tmp = pC11[0+ldd*jj];
1475 beta += tmp*tmp;
1476 }
1477 if(beta==0.0)
1478 {
1479 // tau1
1480 dD[(ii+1)] = 0.0;
1481 }
1482 else
1483 {
1484 alpha = pC11[0+ldd*0];
1485 beta += alpha*alpha;
1486 beta = sqrt(beta);
1487 if(alpha>0)
1488 beta = -beta;
1489 // tau1
1490 dD[(ii+1)] = (beta-alpha) / beta;
1491 tmp = 1.0 / (alpha-beta);
1492 // compute v1
1493 pC11[0+ldd*0] = beta;
1494 for(jj=1; jj<n-(ii+1); jj++)
1495 pC11[0+ldd*jj] *= tmp;
1496 }
1497 // compute lower triangular T containing tau for matrix update
1498 pv0 = &pC00[0+ldd*0];
1499 pv1 = &pC00[1+ldd*0];
1500 kmax = n-ii;
1501 tmp = pv0[0+ldd*1];
1502 for(kk=2; kk<kmax; kk++)
1503 tmp += pv0[0+ldd*kk]*pv1[0+ldd*kk];
1504 pT[0+ldb*0] = dD[ii+0];
1505 pT[1+ldb*0] = - dD[ii+1] * tmp * dD[ii+0];
1506 pT[1+ldb*1] = dD[ii+1];
1507 // downgrade
1508 jmax = m-ii-2;
1509 jj = 0;
1510 for(; jj<jmax-1; jj+=2)
1511 {
1512 // compute W^T = C^T * V
1513 pW[0+ldw*0] = pC00[jj+0+2+ldd*0] + pC00[jj+0+2+ldd*1] * pv0[0+ldd*1];
1514 pW[1+ldw*0] = pC00[jj+1+2+ldd*0] + pC00[jj+1+2+ldd*1] * pv0[0+ldd*1];
1515 pW[0+ldw*1] = pC00[jj+0+2+ldd*1];
1516 pW[1+ldw*1] = pC00[jj+1+2+ldd*1];
1517 kk = 2;
1518 for(; kk<kmax; kk++)
1519 {
1520 tmp = pC00[jj+0+2+ldd*kk];
1521 pW[0+ldw*0] += tmp * pv0[0+ldd*kk];
1522 pW[0+ldw*1] += tmp * pv1[0+ldd*kk];
1523 tmp = pC00[jj+1+2+ldd*kk];
1524 pW[1+ldw*0] += tmp * pv0[0+ldd*kk];
1525 pW[1+ldw*1] += tmp * pv1[0+ldd*kk];
1526 }
1527 // compute W^T *= T
1528 pW[0+ldw*1] = pT[1+ldb*0]*pW[0+ldw*0] + pT[1+ldb*1]*pW[0+ldw*1];
1529 pW[1+ldw*1] = pT[1+ldb*0]*pW[1+ldw*0] + pT[1+ldb*1]*pW[1+ldw*1];
1530 pW[0+ldw*0] = pT[0+ldb*0]*pW[0+ldw*0];
1531 pW[1+ldw*0] = pT[0+ldb*0]*pW[1+ldw*0];
1532 // compute C -= V * W^T
1533 pC00[jj+0+2+ldd*0] -= pW[0+ldw*0];
1534 pC00[jj+1+2+ldd*0] -= pW[1+ldw*0];
1535 pC00[jj+0+2+ldd*1] -= pv0[0+ldd*1]*pW[0+ldw*0] + pW[0+ldw*1];
1536 pC00[jj+1+2+ldd*1] -= pv0[0+ldd*1]*pW[1+ldw*0] + pW[1+ldw*1];
1537 kk = 2;
1538 for(; kk<kmax-1; kk+=2)
1539 {
1540 pC00[jj+0+2+ldd*(kk+0)] -= pv0[0+ldd*(kk+0)]*pW[0+ldw*0] + pv1[0+ldd*(kk+0)]*pW[0+ldw*1];
1541 pC00[jj+0+2+ldd*(kk+1)] -= pv0[0+ldd*(kk+1)]*pW[0+ldw*0] + pv1[0+ldd*(kk+1)]*pW[0+ldw*1];
1542 pC00[jj+1+2+ldd*(kk+0)] -= pv0[0+ldd*(kk+0)]*pW[1+ldw*0] + pv1[0+ldd*(kk+0)]*pW[1+ldw*1];
1543 pC00[jj+1+2+ldd*(kk+1)] -= pv0[0+ldd*(kk+1)]*pW[1+ldw*0] + pv1[0+ldd*(kk+1)]*pW[1+ldw*1];
1544 }
1545 for(; kk<kmax; kk++)
1546 {
1547 pC00[jj+0+2+ldd*kk] -= pv0[0+ldd*kk]*pW[0+ldw*0] + pv1[0+ldd*kk]*pW[0+ldw*1];
1548 pC00[jj+1+2+ldd*kk] -= pv0[0+ldd*kk]*pW[1+ldw*0] + pv1[0+ldd*kk]*pW[1+ldw*1];
1549 }
1550 }
1551 for(; jj<jmax; jj++)
1552 {
1553 // compute W = T * V^T * C
1554 pW[0+ldw*0] = pC00[jj+0+2+ldd*0] + pC00[jj+0+2+ldd*1] * pv0[0+ldd*1];
1555 pW[0+ldw*1] = pC00[jj+0+2+ldd*1];
1556 for(kk=2; kk<kmax; kk++)
1557 {
1558 tmp = pC00[jj+0+2+ldd*kk];
1559 pW[0+ldw*0] += tmp * pv0[0+ldd*kk];
1560 pW[0+ldw*1] += tmp * pv1[0+ldd*kk];
1561 }
1562 pW[0+ldw*1] = pT[1+ldb*0]*pW[0+ldw*0] + pT[1+ldb*1]*pW[0+ldw*1];
1563 pW[0+ldw*0] = pT[0+ldb*0]*pW[0+ldw*0];
1564 // compute C -= V * W^T
1565 pC00[jj+0+2+ldd*0] -= pW[0+ldw*0];
1566 pC00[jj+0+2+ldd*1] -= pv0[0+ldd*1]*pW[0+ldw*0] + pW[0+ldw*1];
1567 for(kk=2; kk<kmax; kk++)
1568 {
1569 pC00[jj+0+2+ldd*kk] -= pv0[0+ldd*kk]*pW[0+ldw*0] + pv1[0+ldd*kk]*pW[0+ldw*1];
1570 }
1571 }
1572 }
1573#endif
1574 for(; ii<imax; ii++)
1575 {
1576 pC00 = &pD[ii+ldd*ii];
1577 beta = 0.0;
1578 for(jj=1; jj<n-ii; jj++)
1579 {
1580 tmp = pC00[0+ldd*jj];
1581 beta += tmp*tmp;
1582 }
1583 if(beta==0.0)
1584 {
1585 dD[ii] = 0.0;
1586 }
1587 else
1588 {
1589 alpha = pC00[0+ldd*0];
1590 beta += alpha*alpha;
1591 beta = sqrt(beta);
1592 if(alpha>0)
1593 beta = -beta;
1594 dD[ii] = (beta-alpha) / beta;
1595 tmp = 1.0 / (alpha-beta);
1596 for(jj=1; jj<n-ii; jj++)
1597 pC00[0+ldd*jj] *= tmp;
1598 pC00[0+ldd*0] = beta;
1599 }
1600 if(ii<n)
1601 {
1602 // gemv_t & ger
1603 pC10 = &pC00[1+ldd*0];
1604 pv0 = &pC00[0+ldd*0];
1605 jmax = m-ii-1;
1606 kmax = n-ii;
1607 jj = 0;
1608 for(; jj<jmax-1; jj+=2)
1609 {
1610 w0 = pC10[jj+0+ldd*0]; // pv0[0] = 1.0
1611 w1 = pC10[jj+1+ldd*0]; // pv0[0] = 1.0
1612 for(kk=1; kk<kmax; kk++)
1613 {
1614 w0 += pC10[jj+0+ldd*kk] * pv0[0+ldd*kk];
1615 w1 += pC10[jj+1+ldd*kk] * pv0[0+ldd*kk];
1616 }
1617 w0 = - dD[ii] * w0;
1618 w1 = - dD[ii] * w1;
1619 pC10[jj+0+ldd*0] += w0; // pv0[0] = 1.0
1620 pC10[jj+1+ldd*0] += w1; // pv0[0] = 1.0
1621 for(kk=1; kk<kmax; kk++)
1622 {
1623 pC10[jj+0+ldd*kk] += w0 * pv0[0+ldd*kk];
1624 pC10[jj+1+ldd*kk] += w1 * pv0[0+ldd*kk];
1625 }
1626 }
1627 for(; jj<jmax; jj++)
1628 {
1629 w0 = pC10[jj+ldd*0]; // pv0[0] = 1.0
1630 for(kk=1; kk<kmax; kk++)
1631 {
1632 w0 += pC10[jj+ldd*kk] * pv0[0+ldd*kk];
1633 }
1634 w0 = - dD[ii] * w0;
1635 pC10[jj+ldd*0] += w0; // pv0[0] = 1.0
1636 for(kk=1; kk<kmax; kk++)
1637 {
1638 pC10[jj+ldd*kk] += w0 * pv0[0+ldd*kk];
1639 }
1640 }
1641 }
1642 }
1643 return;
1644 }
1645
1646
1647
1648#elif defined(LA_BLAS)
1649
1650
1651
1652// dpotrf
1653void POTRF_L_LIBSTR(int m, struct STRMAT *sC, int ci, int cj, struct STRMAT *sD, int di, int dj)
1654 {
1655 if(m<=0)
1656 return;
1657 int jj;
1658 char cl = 'l';
1659 char cn = 'n';
1660 char cr = 'r';
1661 char ct = 't';
1662 REAL d1 = 1.0;
1663 REAL *pC = sC->pA+ci+cj*sC->m;
1664 REAL *pD = sD->pA+di+dj*sD->m;
1665#if defined(REF_BLAS_BLIS)
1666 long long i1 = 1;
1667 long long mm = m;
1668 long long info;
1669 long long tmp;
1670 long long ldc = sC->m;
1671 long long ldd = sD->m;
1672 if(!(pC==pD))
1673 {
1674 for(jj=0; jj<m; jj++)
1675 {
1676 tmp = m-jj;
1677 COPY(&tmp, pC+jj*ldc+jj, &i1, pD+jj*ldd+jj, &i1);
1678 }
1679 }
1680 POTRF(&cl, &mm, pD, &ldd, &info);
1681#else
1682 int i1 = 1;
1683 int info;
1684 int tmp;
1685 int ldc = sC->m;
1686 int ldd = sD->m;
1687 if(!(pC==pD))
1688 {
1689 for(jj=0; jj<m; jj++)
1690 {
1691 tmp = m-jj;
1692 COPY(&tmp, pC+jj*ldc+jj, &i1, pD+jj*ldd+jj, &i1);
1693 }
1694 }
1695 POTRF(&cl, &m, pD, &ldd, &info);
1696#endif
1697 return;
1698 }
1699
1700
1701
1702// dpotrf
1703void POTRF_L_MN_LIBSTR(int m, int n, struct STRMAT *sC, int ci, int cj, struct STRMAT *sD, int di, int dj)
1704 {
1705 if(m<=0 | n<=0)
1706 return;
1707 int jj;
1708 char cl = 'l';
1709 char cn = 'n';
1710 char cr = 'r';
1711 char ct = 't';
1712 REAL d1 = 1.0;
1713 REAL *pC = sC->pA+ci+cj*sC->m;
1714 REAL *pD = sD->pA+di+dj*sD->m;
1715#if defined(REF_BLAS_BLIS)
1716 long long i1 = 1;
1717 long long mm = m;
1718 long long nn = n;
1719 long long mmn = mm-nn;
1720 long long info;
1721 long long tmp;
1722 long long ldc = sC->m;
1723 long long ldd = sD->m;
1724 if(!(pC==pD))
1725 {
1726 for(jj=0; jj<n; jj++)
1727 {
1728 tmp = m-jj;
1729 COPY(&tmp, pC+jj*ldc+jj, &i1, pD+jj*ldd+jj, &i1);
1730 }
1731 }
1732 POTRF(&cl, &nn, pD, &ldd, &info);
1733 TRSM(&cr, &cl, &ct, &cn, &mmn, &nn, &d1, pD, &ldd, pD+n, &ldd);
1734#else
1735 int i1 = 1;
1736 int mmn = m-n;
1737 int info;
1738 int tmp;
1739 int ldc = sC->m;
1740 int ldd = sD->m;
1741 if(!(pC==pD))
1742 {
1743 for(jj=0; jj<n; jj++)
1744 {
1745 tmp = m-jj;
1746 COPY(&tmp, pC+jj*ldc+jj, &i1, pD+jj*ldd+jj, &i1);
1747 }
1748 }
1749 POTRF(&cl, &n, pD, &ldd, &info);
1750 TRSM(&cr, &cl, &ct, &cn, &mmn, &n, &d1, pD, &ldd, pD+n, &ldd);
1751#endif
1752 return;
1753 }
1754
1755
1756
1757// dsyrk dpotrf
1758void SYRK_POTRF_LN_LIBSTR(int m, int n, int k, struct STRMAT *sA, int ai, int aj, struct STRMAT *sB, int bi, int bj, struct STRMAT *sC, int ci, int cj, struct STRMAT *sD, int di, int dj)
1759 {
1760 if(m<=0 | n<=0)
1761 return;
1762 int jj;
1763 char cl = 'l';
1764 char cn = 'n';
1765 char cr = 'r';
1766 char ct = 't';
1767 char cu = 'u';
1768 REAL d1 = 1.0;
1769 REAL *pA = sA->pA + ai + aj*sA->m;
1770 REAL *pB = sB->pA + bi + bj*sB->m;
1771 REAL *pC = sC->pA + ci + cj*sC->m;
1772 REAL *pD = sD->pA + di + dj*sD->m;
1773#if defined(REF_BLAS_BLIS)
1774 long long i1 = 1;
1775 long long mm = m;
1776 long long nn = n;
1777 long long kk = k;
1778 long long mmn = mm-nn;
1779 long long info;
1780 long long lda = sA->m;
1781 long long ldb = sB->m;
1782 long long ldc = sC->m;
1783 long long ldd = sD->m;
1784 if(!(pC==pD))
1785 {
1786 for(jj=0; jj<n; jj++)
1787 COPY(&mm, pC+jj*ldc, &i1, pD+jj*ldd, &i1);
1788 }
1789 if(pA==pB)
1790 {
1791 SYRK(&cl, &cn, &nn, &kk, &d1, pA, &lda, &d1, pD, &ldd);
1792 GEMM(&cn, &ct, &mmn, &nn, &kk, &d1, pA+n, &lda, pB, &ldb, &d1, pD+n, &ldd);
1793 POTRF(&cl, &nn, pD, &ldd, &info);
1794 TRSM(&cr, &cl, &ct, &cn, &mmn, &nn, &d1, pD, &ldd, pD+n, &ldd);
1795 }
1796 else
1797 {
1798 GEMM(&cn, &ct, &mm, &nn, &kk, &d1, pA, &lda, pB, &ldb, &d1, pD, &ldd);
1799 POTRF(&cl, &nn, pD, &ldd, &info);
1800 TRSM(&cr, &cl, &ct, &cn, &mmn, &nn, &d1, pD, &ldd, pD+n, &ldd);
1801 }
1802#else
1803 int i1 = 1;
1804 int mmn = m-n;
1805 int info;
1806 int lda = sA->m;
1807 int ldb = sB->m;
1808 int ldc = sC->m;
1809 int ldd = sD->m;
1810 if(!(pC==pD))
1811 {
1812 for(jj=0; jj<n; jj++)
1813 COPY(&m, pC+jj*ldc, &i1, pD+jj*ldd, &i1);
1814 }
1815 if(pA==pB)
1816 {
1817 SYRK(&cl, &cn, &n, &k, &d1, pA, &lda, &d1, pD, &ldd);
1818 GEMM(&cn, &ct, &mmn, &n, &k, &d1, pA+n, &lda, pB, &ldb, &d1, pD+n, &ldd);
1819 POTRF(&cl, &n, pD, &ldd, &info);
1820 TRSM(&cr, &cl, &ct, &cn, &mmn, &n, &d1, pD, &ldd, pD+n, &ldd);
1821 }
1822 else
1823 {
1824 GEMM(&cn, &ct, &m, &n, &k, &d1, pA, &lda, pB, &ldb, &d1, pD, &ldd);
1825 POTRF(&cl, &n, pD, &ldd, &info);
1826 TRSM(&cr, &cl, &ct, &cn, &mmn, &n, &d1, pD, &ldd, pD+n, &ldd);
1827 }
1828#endif
1829 return;
1830 }
1831
1832
1833
1834// dgetrf without pivoting
1835#if defined(REF_BLAS_BLIS)
1836static void GETF2_NOPIVOT(long long m, long long n, REAL *A, long long lda)
1837 {
1838 if(m<=0 | n<=0)
1839 return;
1840 long long i, j;
1841 long long jmax = m<n ? m : n;
1842 REAL dtmp;
1843 REAL dm1 = -1.0;
1844 long long itmp0, itmp1;
1845 long long i1 = 1;
1846 for(j=0; j<jmax; j++)
1847 {
1848 itmp0 = m-j-1;
1849 dtmp = 1.0/A[j+lda*j];
1850 SCAL(&itmp0, &dtmp, &A[(j+1)+lda*j], &i1);
1851 itmp1 = n-j-1;
1852 GER(&itmp0, &itmp1, &dm1, &A[(j+1)+lda*j], &i1, &A[j+lda*(j+1)], &lda, &A[(j+1)+lda*(j+1)], &lda);
1853 }
1854 return;
1855 }
1856#else
1857static void GETF2_NOPIVOT(int m, int n, REAL *A, int lda)
1858 {
1859 if(m<=0 | n<=0)
1860 return;
1861 int i, j;
1862 int jmax = m<n ? m : n;
1863 REAL dtmp;
1864 REAL dm1 = -1.0;
1865 int itmp0, itmp1;
1866 int i1 = 1;
1867 for(j=0; j<jmax; j++)
1868 {
1869 itmp0 = m-j-1;
1870 dtmp = 1.0/A[j+lda*j];
1871 SCAL(&itmp0, &dtmp, &A[(j+1)+lda*j], &i1);
1872 itmp1 = n-j-1;
1873 GER(&itmp0, &itmp1, &dm1, &A[(j+1)+lda*j], &i1, &A[j+lda*(j+1)], &lda, &A[(j+1)+lda*(j+1)], &lda);
1874 }
1875 return;
1876 }
1877#endif
1878
1879
1880
1881// dgetrf without pivoting
1882void GETRF_NOPIVOT_LIBSTR(int m, int n, struct STRMAT *sC, int ci, int cj, struct STRMAT *sD, int di, int dj)
1883 {
1884 // TODO with custom level 2 LAPACK + level 3 BLAS
1885 if(m<=0 | n<=0)
1886 return;
1887 int jj;
1888 REAL d1 = 1.0;
1889 REAL *pC = sC->pA+ci+cj*sC->m;
1890 REAL *pD = sD->pA+di+dj*sD->m;
1891#if defined(REF_BLAS_BLIS)
1892 long long i1 = 1;
1893 long long mm = m;
1894 long long nn = n;
1895 long long ldc = sC->m;
1896 long long ldd = sD->m;
1897 if(!(pC==pD))
1898 {
1899 for(jj=0; jj<n; jj++)
1900 COPY(&mm, pC+jj*ldc, &i1, pD+jj*ldd, &i1);
1901 }
1902 GETF2_NOPIVOT(mm, nn, pD, ldd);
1903#else
1904 int i1 = 1;
1905 int ldc = sC->m;
1906 int ldd = sD->m;
1907 if(!(pC==pD))
1908 {
1909 for(jj=0; jj<n; jj++)
1910 COPY(&m, pC+jj*ldc, &i1, pD+jj*ldd, &i1);
1911 }
1912 GETF2_NOPIVOT(m, n, pD, ldd);
1913#endif
1914 return;
1915 }
1916
1917
1918
1919// dgetrf pivoting
1920void GETRF_LIBSTR(int m, int n, struct STRMAT *sC, int ci, int cj, struct STRMAT *sD, int di, int dj, int *ipiv)
1921 {
1922 // TODO with custom level 2 LAPACK + level 3 BLAS
1923 if(m<=0 | n<=0)
1924 return;
1925 int jj;
1926 int tmp = m<n ? m : n;
1927 REAL d1 = 1.0;
1928 REAL *pC = sC->pA+ci+cj*sC->m;
1929 REAL *pD = sD->pA+di+dj*sD->m;
1930#if defined(REF_BLAS_BLIS)
1931 long long i1 = 1;
1932 long long info;
1933 long long mm = m;
1934 long long nn = n;
1935 long long ldc = sC->m;
1936 long long ldd = sD->m;
1937 if(!(pC==pD))
1938 {
1939 for(jj=0; jj<n; jj++)
1940 COPY(&mm, pC+jj*ldc, &i1, pD+jj*ldd, &i1);
1941 }
1942 GETRF(&mm, &nn, pD, &ldd, (long long *) ipiv, &info);
1943 for(jj=0; jj<tmp; jj++)
1944 ipiv[jj] -= 1;
1945#else
1946 int i1 = 1;
1947 int info;
1948 int ldc = sC->m;
1949 int ldd = sD->m;
1950 if(!(pC==pD))
1951 {
1952 for(jj=0; jj<n; jj++)
1953 COPY(&m, pC+jj*ldc, &i1, pD+jj*ldd, &i1);
1954 }
1955 GETRF(&m, &n, pD, &ldd, ipiv, &info);
1956 for(jj=0; jj<tmp; jj++)
1957 ipiv[jj] -= 1;
1958#endif
1959 return;
1960 }
1961
1962
1963
1964int GEQRF_WORK_SIZE_LIBSTR(int m, int n)
1965 {
1966 REAL dwork;
1967 REAL *pD, *dD;
1968#if defined(REF_BLAS_BLIS)
1969 long long mm = m;
1970 long long nn = n;
1971 long long lwork = -1;
1972 long long info;
1973 long long ldd = mm;
1974 GEQRF(&mm, &nn, pD, &ldd, dD, &dwork, &lwork, &info);
1975#else
1976 int lwork = -1;
1977 int info;
1978 int ldd = m;
1979 GEQRF(&m, &n, pD, &ldd, dD, &dwork, &lwork, &info);
1980#endif
1981 int size = dwork;
1982 return size*sizeof(REAL);
1983 }
1984
1985
1986
1987void GEQRF_LIBSTR(int m, int n, struct STRMAT *sC, int ci, int cj, struct STRMAT *sD, int di, int dj, void *work)
1988 {
1989 if(m<=0 | n<=0)
1990 return;
1991 int jj;
1992 REAL *pC = sC->pA+ci+cj*sC->m;
1993 REAL *pD = sD->pA+di+dj*sD->m;
1994 REAL *dD = sD->dA+di;
1995 REAL *dwork = (REAL *) work;
1996#if defined(REF_BLAS_BLIS)
1997 long long i1 = 1;
1998 long long info = -1;
1999 long long mm = m;
2000 long long nn = n;
2001 long long ldc = sC->m;
2002 long long ldd = sD->m;
2003 if(!(pC==pD))
2004 {
2005 for(jj=0; jj<n; jj++)
2006 COPY(&mm, pC+jj*ldc, &i1, pD+jj*ldd, &i1);
2007 }
2008// GEQR2(&mm, &nn, pD, &ldd, dD, dwork, &info);
2009 long long lwork = -1;
2010 GEQRF(&mm, &nn, pD, &ldd, dD, dwork, &lwork, &info);
2011 lwork = dwork[0];
2012 GEQRF(&mm, &nn, pD, &ldd, dD, dwork, &lwork, &info);
2013#else
2014 int i1 = 1;
2015 int info = -1;
2016 int ldc = sC->m;
2017 int ldd = sD->m;
2018 if(!(pC==pD))
2019 {
2020 for(jj=0; jj<n; jj++)
2021 COPY(&m, pC+jj*ldc, &i1, pD+jj*ldd, &i1);
2022 }
2023// GEQR2(&m, &n, pD, &ldd, dD, dwork, &info);
2024 int lwork = -1;
2025 GEQRF(&m, &n, pD, &ldd, dD, dwork, &lwork, &info);
2026 lwork = dwork[0];
2027 GEQRF(&m, &n, pD, &ldd, dD, dwork, &lwork, &info);
2028#endif
2029 return;
2030 }
2031
2032
2033
2034int GELQF_WORK_SIZE_LIBSTR(int m, int n)
2035 {
2036 REAL dwork;
2037 REAL *pD, *dD;
2038#if defined(REF_BLAS_BLIS)
2039 long long mm = m;
2040 long long nn = n;
2041 long long lwork = -1;
2042 long long info;
2043 long long ldd = mm;
2044 GELQF(&mm, &nn, pD, &ldd, dD, &dwork, &lwork, &info);
2045#else
2046 int lwork = -1;
2047 int info;
2048 int ldd = m;
2049 GELQF(&m, &n, pD, &ldd, dD, &dwork, &lwork, &info);
2050#endif
2051 int size = dwork;
2052 return size*sizeof(REAL);
2053 }
2054
2055
2056
2057void GELQF_LIBSTR(int m, int n, struct STRMAT *sC, int ci, int cj, struct STRMAT *sD, int di, int dj, void *work)
2058 {
2059 if(m<=0 | n<=0)
2060 return;
2061 int jj;
2062 REAL *pC = sC->pA+ci+cj*sC->m;
2063 REAL *pD = sD->pA+di+dj*sD->m;
2064 REAL *dD = sD->dA+di;
2065 REAL *dwork = (REAL *) work;
2066#if defined(REF_BLAS_BLIS)
2067 long long i1 = 1;
2068 long long info = -1;
2069 long long mm = m;
2070 long long nn = n;
2071 long long ldc = sC->m;
2072 long long ldd = sD->m;
2073 if(!(pC==pD))
2074 {
2075 for(jj=0; jj<n; jj++)
2076 COPY(&mm, pC+jj*ldc, &i1, pD+jj*ldd, &i1);
2077 }
2078// GEQR2(&mm, &nn, pD, &ldd, dD, dwork, &info);
2079 long long lwork = -1;
2080 GELQF(&mm, &nn, pD, &ldd, dD, dwork, &lwork, &info);
2081 lwork = dwork[0];
2082 GELQF(&mm, &nn, pD, &ldd, dD, dwork, &lwork, &info);
2083#else
2084 int i1 = 1;
2085 int info = -1;
2086 int ldc = sC->m;
2087 int ldd = sD->m;
2088 if(!(pC==pD))
2089 {
2090 for(jj=0; jj<n; jj++)
2091 COPY(&m, pC+jj*ldc, &i1, pD+jj*ldd, &i1);
2092 }
2093// GEQR2(&m, &n, pD, &ldd, dD, dwork, &info);
2094 int lwork = -1;
2095 GELQF(&m, &n, pD, &ldd, dD, dwork, &lwork, &info);
2096 lwork = dwork[0];
2097 GELQF(&m, &n, pD, &ldd, dD, dwork, &lwork, &info);
2098#endif
2099 return;
2100 }
2101
2102
2103
2104#else
2105
2106#error : wrong LA choice
2107
2108#endif
2109
2110
2111
2112