blob: 29a33c79295fe0698b6cd77a4b808189a8d75709 [file] [log] [blame]
Austin Schuh9a24b372018-01-28 16:12:29 -08001/**************************************************************************************************
2* *
3* This file is part of BLASFEO. *
4* *
5* BLASFEO -- BLAS For Embedded Optimization. *
6* Copyright (C) 2016-2017 by Gianluca Frison. *
7* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl. *
8* All rights reserved. *
9* *
10* HPMPC is free software; you can redistribute it and/or *
11* modify it under the terms of the GNU Lesser General Public *
12* License as published by the Free Software Foundation; either *
13* version 2.1 of the License, or (at your option) any later version. *
14* *
15* HPMPC is distributed in the hope that it will be useful, *
16* but WITHOUT ANY WARRANTY; without even the implied warranty of *
17* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. *
18* See the GNU Lesser General Public License for more details. *
19* *
20* You should have received a copy of the GNU Lesser General Public *
21* License along with HPMPC; if not, write to the Free Software *
22* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA *
23* *
24* Author: Gianluca Frison, giaf (at) dtu.dk *
25* gianluca.frison (at) imtek.uni-freiburg.de *
26* *
27**************************************************************************************************/
28
29
30
31#if defined(LA_REFERENCE)
32
33
34
35// dgemm nt
36void GEMM_NT_LIBSTR(int m, int n, int k, REAL alpha, struct STRMAT *sA, int ai, int aj, struct STRMAT *sB, int bi, int bj, REAL beta, struct STRMAT *sC, int ci, int cj, struct STRMAT *sD, int di, int dj)
37 {
38 if(m<=0 | n<=0)
39 return;
40 int ii, jj, kk;
41 REAL
42 c_00, c_01,
43 c_10, c_11;
44 int lda = sA->m;
45 int ldb = sB->m;
46 int ldc = sC->m;
47 int ldd = sD->m;
48 REAL *pA = sA->pA + ai + aj*lda;
49 REAL *pB = sB->pA + bi + bj*ldb;
50 REAL *pC = sC->pA + ci + cj*ldc;
51 REAL *pD = sD->pA + di + dj*ldd;
52 jj = 0;
53 for(; jj<n-1; jj+=2)
54 {
55 ii = 0;
56 for(; ii<m-1; ii+=2)
57 {
58 c_00 = 0.0;
59 c_10 = 0.0;
60 c_01 = 0.0;
61 c_11 = 0.0;
62 for(kk=0; kk<k; kk++)
63 {
64 c_00 += pA[(ii+0)+lda*kk] * pB[(jj+0)+ldb*kk];
65 c_10 += pA[(ii+1)+lda*kk] * pB[(jj+0)+ldb*kk];
66 c_01 += pA[(ii+0)+lda*kk] * pB[(jj+1)+ldb*kk];
67 c_11 += pA[(ii+1)+lda*kk] * pB[(jj+1)+ldb*kk];
68 }
69 pD[(ii+0)+ldd*(jj+0)] = alpha * c_00 + beta * pC[(ii+0)+ldc*(jj+0)];
70 pD[(ii+1)+ldd*(jj+0)] = alpha * c_10 + beta * pC[(ii+1)+ldc*(jj+0)];
71 pD[(ii+0)+ldd*(jj+1)] = alpha * c_01 + beta * pC[(ii+0)+ldc*(jj+1)];
72 pD[(ii+1)+ldd*(jj+1)] = alpha * c_11 + beta * pC[(ii+1)+ldc*(jj+1)];
73 }
74 for(; ii<m; ii++)
75 {
76 c_00 = 0.0;
77 c_01 = 0.0;
78 for(kk=0; kk<k; kk++)
79 {
80 c_00 += pA[(ii+0)+lda*kk] * pB[(jj+0)+ldb*kk];
81 c_01 += pA[(ii+0)+lda*kk] * pB[(jj+1)+ldb*kk];
82 }
83 pD[(ii+0)+ldd*(jj+0)] = alpha * c_00 + beta * pC[(ii+0)+ldc*(jj+0)];
84 pD[(ii+0)+ldd*(jj+1)] = alpha * c_01 + beta * pC[(ii+0)+ldc*(jj+1)];
85 }
86 }
87 for(; jj<n; jj++)
88 {
89 ii = 0;
90 for(; ii<m-1; ii+=2)
91 {
92 c_00 = 0.0;
93 c_10 = 0.0;
94 for(kk=0; kk<k; kk++)
95 {
96 c_00 += pA[(ii+0)+lda*kk] * pB[(jj+0)+ldb*kk];
97 c_10 += pA[(ii+1)+lda*kk] * pB[(jj+0)+ldb*kk];
98 }
99 pD[(ii+0)+ldd*(jj+0)] = alpha * c_00 + beta * pC[(ii+0)+ldc*(jj+0)];
100 pD[(ii+1)+ldd*(jj+0)] = alpha * c_10 + beta * pC[(ii+1)+ldc*(jj+0)];
101 }
102 for(; ii<m; ii++)
103 {
104 c_00 = 0.0;
105 for(kk=0; kk<k; kk++)
106 {
107 c_00 += pA[(ii+0)+lda*kk] * pB[(jj+0)+ldb*kk];
108 }
109 pD[(ii+0)+ldd*(jj+0)] = alpha * c_00 + beta * pC[(ii+0)+ldc*(jj+0)];
110 }
111 }
112 return;
113 }
114
115
116
117// dgemm nn
118void GEMM_NN_LIBSTR(int m, int n, int k, REAL alpha, struct STRMAT *sA, int ai, int aj, struct STRMAT *sB, int bi, int bj, REAL beta, struct STRMAT *sC, int ci, int cj, struct STRMAT *sD, int di, int dj)
119 {
120 if(m<=0 | n<=0)
121 return;
122 int ii, jj, kk;
123 REAL
124 c_00, c_01,
125 c_10, c_11;
126 int lda = sA->m;
127 int ldb = sB->m;
128 int ldc = sC->m;
129 int ldd = sD->m;
130 REAL *pA = sA->pA + ai + aj*lda;
131 REAL *pB = sB->pA + bi + bj*ldb;
132 REAL *pC = sC->pA + ci + cj*ldc;
133 REAL *pD = sD->pA + di + dj*ldd;
134 jj = 0;
135 for(; jj<n-1; jj+=2)
136 {
137 ii = 0;
138 for(; ii<m-1; ii+=2)
139 {
140 c_00 = 0.0; ;
141 c_10 = 0.0; ;
142 c_01 = 0.0; ;
143 c_11 = 0.0; ;
144 for(kk=0; kk<k; kk++)
145 {
146 c_00 += pA[(ii+0)+lda*kk] * pB[kk+ldb*(jj+0)];
147 c_10 += pA[(ii+1)+lda*kk] * pB[kk+ldb*(jj+0)];
148 c_01 += pA[(ii+0)+lda*kk] * pB[kk+ldb*(jj+1)];
149 c_11 += pA[(ii+1)+lda*kk] * pB[kk+ldb*(jj+1)];
150 }
151 pD[(ii+0)+ldd*(jj+0)] = alpha * c_00 + beta * pC[(ii+0)+ldc*(jj+0)];
152 pD[(ii+1)+ldd*(jj+0)] = alpha * c_10 + beta * pC[(ii+1)+ldc*(jj+0)];
153 pD[(ii+0)+ldd*(jj+1)] = alpha * c_01 + beta * pC[(ii+0)+ldc*(jj+1)];
154 pD[(ii+1)+ldd*(jj+1)] = alpha * c_11 + beta * pC[(ii+1)+ldc*(jj+1)];
155 }
156 for(; ii<m; ii++)
157 {
158 c_00 = 0.0; ;
159 c_01 = 0.0; ;
160 for(kk=0; kk<k; kk++)
161 {
162 c_00 += pA[(ii+0)+lda*kk] * pB[kk+ldb*(jj+0)];
163 c_01 += pA[(ii+0)+lda*kk] * pB[kk+ldb*(jj+1)];
164 }
165 pD[(ii+0)+ldd*(jj+0)] = alpha * c_00 + beta * pC[(ii+0)+ldc*(jj+0)];
166 pD[(ii+0)+ldd*(jj+1)] = alpha * c_01 + beta * pC[(ii+0)+ldc*(jj+1)];
167 }
168 }
169 for(; jj<n; jj++)
170 {
171 ii = 0;
172 for(; ii<m-1; ii+=2)
173 {
174 c_00 = 0.0; ;
175 c_10 = 0.0; ;
176 for(kk=0; kk<k; kk++)
177 {
178 c_00 += pA[(ii+0)+lda*kk] * pB[kk+ldb*(jj+0)];
179 c_10 += pA[(ii+1)+lda*kk] * pB[kk+ldb*(jj+0)];
180 }
181 pD[(ii+0)+ldd*(jj+0)] = alpha * c_00 + beta * pC[(ii+0)+ldc*(jj+0)];
182 pD[(ii+1)+ldd*(jj+0)] = alpha * c_10 + beta * pC[(ii+1)+ldc*(jj+0)];
183 }
184 for(; ii<m; ii++)
185 {
186 c_00 = 0.0; ;
187 for(kk=0; kk<k; kk++)
188 {
189 c_00 += pA[(ii+0)+lda*kk] * pB[kk+ldb*(jj+0)];
190 }
191 pD[(ii+0)+ldd*(jj+0)] = alpha * c_00 + beta * pC[(ii+0)+ldc*(jj+0)];
192 }
193 }
194 return;
195 }
196
197
198
199// dtrsm_left_lower_nottransposed_unit
200void TRSM_LLNU_LIBSTR(int m, int n, REAL alpha, struct STRMAT *sA, int ai, int aj, struct STRMAT *sB, int bi, int bj, struct STRMAT *sD, int di, int dj)
201 {
202 if(m<=0 | n<=0)
203 return;
204 int ii, jj, kk;
205 REAL
206 d_00, d_01,
207 d_10, d_11;
208 int lda = sA->m;
209 int ldb = sB->m;
210 int ldd = sD->m;
211 REAL *pA = sA->pA + ai + aj*lda; // triangular
212 REAL *pB = sB->pA + bi + bj*ldb;
213 REAL *pD = sD->pA + di + dj*ldd;
214#if 1
215 // solve
216 jj = 0;
217 for(; jj<n-1; jj+=2)
218 {
219 ii = 0;
220 for(; ii<m-1; ii+=2)
221 {
222 d_00 = alpha * pB[ii+0+ldb*(jj+0)];
223 d_10 = alpha * pB[ii+1+ldb*(jj+0)];
224 d_01 = alpha * pB[ii+0+ldb*(jj+1)];
225 d_11 = alpha * pB[ii+1+ldb*(jj+1)];
226 kk = 0;
227#if 0
228 for(; kk<ii-1; kk+=2)
229 {
230 d_00 -= pA[ii+0+lda*(kk+0)] * pD[kk+ldd*(jj+0)];
231 d_10 -= pA[ii+1+lda*(kk+0)] * pD[kk+ldd*(jj+0)];
232 d_01 -= pA[ii+0+lda*(kk+0)] * pD[kk+ldd*(jj+1)];
233 d_11 -= pA[ii+1+lda*(kk+0)] * pD[kk+ldd*(jj+1)];
234 d_00 -= pA[ii+0+lda*(kk+1)] * pD[kk+ldd*(jj+0)];
235 d_10 -= pA[ii+1+lda*(kk+1)] * pD[kk+ldd*(jj+0)];
236 d_01 -= pA[ii+0+lda*(kk+1)] * pD[kk+ldd*(jj+1)];
237 d_11 -= pA[ii+1+lda*(kk+1)] * pD[kk+ldd*(jj+1)];
238 }
239 if(kk<ii)
240#else
241 for(; kk<ii; kk++)
242#endif
243 {
244 d_00 -= pA[ii+0+lda*kk] * pD[kk+ldd*(jj+0)];
245 d_10 -= pA[ii+1+lda*kk] * pD[kk+ldd*(jj+0)];
246 d_01 -= pA[ii+0+lda*kk] * pD[kk+ldd*(jj+1)];
247 d_11 -= pA[ii+1+lda*kk] * pD[kk+ldd*(jj+1)];
248 }
249 d_10 -= pA[ii+1+lda*kk] * d_00;
250 d_11 -= pA[ii+1+lda*kk] * d_01;
251 pD[ii+0+ldd*(jj+0)] = d_00;
252 pD[ii+1+ldd*(jj+0)] = d_10;
253 pD[ii+0+ldd*(jj+1)] = d_01;
254 pD[ii+1+ldd*(jj+1)] = d_11;
255 }
256 for(; ii<m; ii++)
257 {
258 d_00 = alpha * pB[ii+ldb*(jj+0)];
259 d_01 = alpha * pB[ii+ldb*(jj+1)];
260 for(kk=0; kk<ii; kk++)
261 {
262 d_00 -= pA[ii+lda*kk] * pD[kk+ldd*(jj+0)];
263 d_01 -= pA[ii+lda*kk] * pD[kk+ldd*(jj+1)];
264 }
265 pD[ii+ldd*(jj+0)] = d_00;
266 pD[ii+ldd*(jj+1)] = d_01;
267 }
268 }
269 for(; jj<n; jj++)
270 {
271 ii = 0;
272 for(; ii<m-1; ii+=2)
273 {
274 d_00 = alpha * pB[ii+0+ldb*jj];
275 d_10 = alpha * pB[ii+1+ldb*jj];
276 for(kk=0; kk<ii; kk++)
277 {
278 d_00 -= pA[ii+0+lda*kk] * pD[kk+ldd*jj];
279 d_10 -= pA[ii+1+lda*kk] * pD[kk+ldd*jj];
280 }
281 d_10 -= pA[ii+1+lda*kk] * d_00;
282 pD[ii+0+ldd*jj] = d_00;
283 pD[ii+1+ldd*jj] = d_10;
284 }
285 for(; ii<m; ii++)
286 {
287 d_00 = alpha * pB[ii+ldb*jj];
288 for(kk=0; kk<ii; kk++)
289 {
290 d_00 -= pA[ii+lda*kk] * pD[kk+ldd*jj];
291 }
292 pD[ii+ldd*jj] = d_00;
293 }
294 }
295#else
296 // copy
297 if(!(pB==pD))
298 {
299 for(jj=0; jj<n; jj++)
300 for(ii=0; ii<m; ii++)
301 pD[ii+ldd*jj] = alpha * pB[ii+ldb*jj];
302 }
303 for(jj=0; jj<n; jj++)
304 {
305 ii = 0;
306 for(; ii<m; ii++)
307 {
308 d_00 = pD[ii+ldd*jj];
309 for(kk=ii+1; kk<m; kk++)
310 {
311 pD[kk+ldd*jj] -= pA[kk+lda*ii] * d_00;
312 }
313 }
314 }
315#endif
316 return;
317 }
318
319
320
321// dtrsm_left_upper_nottransposed_notunit
322void TRSM_LUNN_LIBSTR(int m, int n, REAL alpha, struct STRMAT *sA, int ai, int aj, struct STRMAT *sB, int bi, int bj, struct STRMAT *sD, int di, int dj)
323 {
324 if(m<=0 | n<=0)
325 return;
326 int ii, jj, kk, id;
327 REAL
328 d_00, d_01,
329 d_10, d_11;
330 int lda = sA->m;
331 int ldb = sB->m;
332 int ldd = sD->m;
333 REAL *pA = sA->pA + ai + aj*lda; // triangular
334 REAL *pB = sB->pA + bi + bj*ldb;
335 REAL *pD = sD->pA + di + dj*ldd;
336 REAL *dA = sA->dA;
337 if(!(sA->use_dA==1 & ai==0 & aj==0))
338 {
339 // inverte diagonal of pA
340 for(ii=0; ii<m; ii++)
341 dA[ii] = 1.0/pA[ii+lda*ii];
342 // use only now
343 sA->use_dA = 0;
344 }
345#if 1
346 jj = 0;
347 for(; jj<n-1; jj+=2)
348 {
349 ii = 0;
350 for(; ii<m-1; ii+=2)
351 {
352 id = m-ii-2;
353 d_00 = alpha * pB[id+0+ldb*(jj+0)];
354 d_10 = alpha * pB[id+1+ldb*(jj+0)];
355 d_01 = alpha * pB[id+0+ldb*(jj+1)];
356 d_11 = alpha * pB[id+1+ldb*(jj+1)];
357 kk = id+2;
358#if 0
359 for(; kk<m-1; kk+=2)
360 {
361 d_00 -= pA[id+0+lda*(kk+0)] * pD[kk+0+ldd*(jj+0)];
362 d_10 -= pA[id+1+lda*(kk+0)] * pD[kk+0+ldd*(jj+0)];
363 d_01 -= pA[id+0+lda*(kk+0)] * pD[kk+0+ldd*(jj+1)];
364 d_11 -= pA[id+1+lda*(kk+0)] * pD[kk+0+ldd*(jj+1)];
365 d_00 -= pA[id+0+lda*(kk+1)] * pD[kk+1+ldd*(jj+0)];
366 d_10 -= pA[id+1+lda*(kk+1)] * pD[kk+1+ldd*(jj+0)];
367 d_01 -= pA[id+0+lda*(kk+1)] * pD[kk+1+ldd*(jj+1)];
368 d_11 -= pA[id+1+lda*(kk+1)] * pD[kk+1+ldd*(jj+1)];
369 }
370 if(kk<m)
371#else
372 for(; kk<m; kk++)
373#endif
374 {
375 d_00 -= pA[id+0+lda*(kk+0)] * pD[kk+0+ldd*(jj+0)];
376 d_10 -= pA[id+1+lda*(kk+0)] * pD[kk+0+ldd*(jj+0)];
377 d_01 -= pA[id+0+lda*(kk+0)] * pD[kk+0+ldd*(jj+1)];
378 d_11 -= pA[id+1+lda*(kk+0)] * pD[kk+0+ldd*(jj+1)];
379 }
380 d_10 *= dA[id+1];
381 d_11 *= dA[id+1];
382 d_00 -= pA[id+0+lda*(id+1)] * d_10;
383 d_01 -= pA[id+0+lda*(id+1)] * d_11;
384 d_00 *= dA[id+0];
385 d_01 *= dA[id+0];
386 pD[id+0+ldd*(jj+0)] = d_00;
387 pD[id+1+ldd*(jj+0)] = d_10;
388 pD[id+0+ldd*(jj+1)] = d_01;
389 pD[id+1+ldd*(jj+1)] = d_11;
390 }
391 for(; ii<m; ii++)
392 {
393 id = m-ii-1;
394 d_00 = alpha * pB[id+0+ldb*(jj+0)];
395 d_01 = alpha * pB[id+0+ldb*(jj+1)];
396 kk = id+1;
397 for(; kk<m; kk++)
398 {
399 d_00 -= pA[id+0+lda*(kk+0)] * pD[kk+0+ldd*(jj+0)];
400 d_01 -= pA[id+0+lda*(kk+0)] * pD[kk+0+ldd*(jj+1)];
401 }
402 d_00 *= dA[id+0];
403 d_01 *= dA[id+0];
404 pD[id+0+ldd*(jj+0)] = d_00;
405 pD[id+0+ldd*(jj+1)] = d_01;
406 }
407 }
408 for(; jj<n; jj++)
409 {
410 ii = 0;
411 for(; ii<m-1; ii+=2)
412 {
413 id = m-ii-2;
414 d_00 = alpha * pB[id+0+ldb*(jj+0)];
415 d_10 = alpha * pB[id+1+ldb*(jj+0)];
416 kk = id+2;
417 for(; kk<m; kk++)
418 {
419 d_00 -= pA[id+0+lda*(kk+0)] * pD[kk+0+ldd*(jj+0)];
420 d_10 -= pA[id+1+lda*(kk+0)] * pD[kk+0+ldd*(jj+0)];
421 }
422 d_10 *= dA[id+1];
423 d_00 -= pA[id+0+lda*(id+1)] * d_10;
424 d_00 *= dA[id+0];
425 pD[id+0+ldd*(jj+0)] = d_00;
426 pD[id+1+ldd*(jj+0)] = d_10;
427 }
428 for(; ii<m; ii++)
429 {
430 id = m-ii-1;
431 d_00 = alpha * pB[id+0+ldb*(jj+0)];
432 kk = id+1;
433 for(; kk<m; kk++)
434 {
435 d_00 -= pA[id+0+lda*(kk+0)] * pD[kk+0+ldd*(jj+0)];
436 }
437 d_00 *= dA[id+0];
438 pD[id+0+ldd*(jj+0)] = d_00;
439 }
440 }
441#else
442 // copy
443 if(!(pB==pD))
444 {
445 for(jj=0; jj<n; jj++)
446 for(ii=0; ii<m; ii++)
447 pD[ii+ldd*jj] = alpha * pB[ii+ldb*jj];
448 }
449 // solve
450 for(jj=0; jj<n; jj++)
451 {
452 for(ii=m-1; ii>=0; ii--)
453 {
454 d_00 = pD[ii+ldd*jj] * dA[ii];
455 pD[ii+ldd*jj] = d_00;
456 for(kk=0; kk<ii; kk++)
457 {
458 pD[kk+ldd*jj] -= pA[kk+lda*ii] * d_00;
459 }
460 }
461 }
462#endif
463 return;
464 }
465
466
467
468// dtrsm_right_lower_transposed_unit
469void TRSM_RLTU_LIBSTR(int m, int n, REAL alpha, struct STRMAT *sA, int ai, int aj, struct STRMAT *sB, int bi, int bj, struct STRMAT *sD, int di, int dj)
470 {
471 if(m<=0 | n<=0)
472 return;
473 int ii, jj, kk;
474 int lda = sA->m;
475 int ldb = sB->m;
476 int ldd = sD->m;
477 REAL *pA = sA->pA + ai + aj*lda;
478 REAL *pB = sB->pA + bi + bj*ldb;
479 REAL *pD = sD->pA + di + dj*ldd;
480 REAL
481 f_10,
482 c_00, c_01,
483 c_10, c_11;
484 jj = 0;
485 for(; jj<n-1; jj+=2)
486 {
487 f_10 = pA[jj+1+lda*(jj+0)];
488 ii = 0;
489 for(; ii<m-1; ii+=2)
490 {
491 c_00 = alpha * pB[ii+0+ldb*(jj+0)];
492 c_10 = alpha * pB[ii+1+ldb*(jj+0)];
493 c_01 = alpha * pB[ii+0+ldb*(jj+1)];
494 c_11 = alpha * pB[ii+1+ldb*(jj+1)];
495 for(kk=0; kk<jj; kk++)
496 {
497 c_00 -= pD[ii+0+ldd*kk] * pA[jj+0+lda*kk];
498 c_10 -= pD[ii+1+ldd*kk] * pA[jj+0+lda*kk];
499 c_01 -= pD[ii+0+ldd*kk] * pA[jj+1+lda*kk];
500 c_11 -= pD[ii+1+ldd*kk] * pA[jj+1+lda*kk];
501 }
502 pD[ii+0+ldd*(jj+0)] = c_00;
503 pD[ii+1+ldd*(jj+0)] = c_10;
504 c_01 -= c_00 * f_10;
505 c_11 -= c_10 * f_10;
506 pD[ii+0+ldd*(jj+1)] = c_01;
507 pD[ii+1+ldd*(jj+1)] = c_11;
508 }
509 for(; ii<m; ii++)
510 {
511 c_00 = alpha * pB[ii+0+ldb*(jj+0)];
512 c_01 = alpha * pB[ii+0+ldb*(jj+1)];
513 for(kk=0; kk<jj; kk++)
514 {
515 c_00 -= pD[ii+0+ldd*kk] * pD[jj+0+ldd*kk];
516 c_01 -= pD[ii+0+ldd*kk] * pD[jj+1+ldd*kk];
517 }
518 pD[ii+0+ldd*(jj+0)] = c_00;
519 c_01 -= c_00 * f_10;
520 pD[ii+0+ldd*(jj+1)] = c_01;
521 }
522 }
523 for(; jj<n; jj++)
524 {
525 // factorize diagonal
526 for(ii=0; ii<m; ii++)
527 {
528 c_00 = alpha * pB[ii+ldb*jj];
529 for(kk=0; kk<jj; kk++)
530 {
531 c_00 -= pD[ii+ldd*kk] * pA[jj+lda*kk];
532 }
533 pD[ii+ldd*jj] = c_00;
534 }
535 }
536 return;
537 }
538
539
540
541// dtrsm_right_lower_transposed_unit
542void TRSM_RLTN_LIBSTR(int m, int n, REAL alpha, struct STRMAT *sA, int ai, int aj, struct STRMAT *sB, int bi, int bj, struct STRMAT *sD, int di, int dj)
543 {
544 if(m<=0 | n<=0)
545 return;
546 int ii, jj, kk;
547 int lda = sA->m;
548 int ldb = sB->m;
549 int ldd = sD->m;
550 REAL *pA = sA->pA + ai + aj*lda;
551 REAL *pB = sB->pA + bi + bj*ldb;
552 REAL *pD = sD->pA + di + dj*ldd;
553 REAL *dA = sA->dA;
554 if(ai==0 & aj==0)
555 {
556 if(sA->use_dA!=1)
557 {
558 for(ii=0; ii<n; ii++)
559 dA[ii] = 1.0 / pA[ii+lda*ii];
560 sA->use_dA = 1;
561 }
562 }
563 else
564 {
565 for(ii=0; ii<n; ii++)
566 dA[ii] = 1.0 / pA[ii+lda*ii];
567 sA->use_dA = 0;
568 }
569 REAL
570 f_00_inv,
571 f_10, f_11_inv,
572 c_00, c_01,
573 c_10, c_11;
574 jj = 0;
575 for(; jj<n-1; jj+=2)
576 {
577 f_00_inv = dA[jj+0];
578 f_10 = pA[jj+1+lda*(jj+0)];
579 f_11_inv = dA[jj+1];
580 ii = 0;
581 for(; ii<m-1; ii+=2)
582 {
583 c_00 = alpha * pB[ii+0+ldb*(jj+0)];
584 c_10 = alpha * pB[ii+1+ldb*(jj+0)];
585 c_01 = alpha * pB[ii+0+ldb*(jj+1)];
586 c_11 = alpha * pB[ii+1+ldb*(jj+1)];
587 for(kk=0; kk<jj; kk++)
588 {
589 c_00 -= pD[ii+0+ldd*kk] * pA[jj+0+lda*kk];
590 c_10 -= pD[ii+1+ldd*kk] * pA[jj+0+lda*kk];
591 c_01 -= pD[ii+0+ldd*kk] * pA[jj+1+lda*kk];
592 c_11 -= pD[ii+1+ldd*kk] * pA[jj+1+lda*kk];
593 }
594 c_00 *= f_00_inv;
595 c_10 *= f_00_inv;
596 pD[ii+0+ldd*(jj+0)] = c_00;
597 pD[ii+1+ldd*(jj+0)] = c_10;
598 c_01 -= c_00 * f_10;
599 c_11 -= c_10 * f_10;
600 c_01 *= f_11_inv;
601 c_11 *= f_11_inv;
602 pD[ii+0+ldd*(jj+1)] = c_01;
603 pD[ii+1+ldd*(jj+1)] = c_11;
604 }
605 for(; ii<m; ii++)
606 {
607 c_00 = alpha * pB[ii+0+ldb*(jj+0)];
608 c_01 = alpha * pB[ii+0+ldb*(jj+1)];
609 for(kk=0; kk<jj; kk++)
610 {
611 c_00 -= pD[ii+0+ldd*kk] * pA[jj+0+lda*kk];
612 c_01 -= pD[ii+0+ldd*kk] * pA[jj+1+lda*kk];
613 }
614 c_00 *= f_00_inv;
615 pD[ii+0+ldd*(jj+0)] = c_00;
616 c_01 -= c_00 * f_10;
617 c_01 *= f_11_inv;
618 pD[ii+0+ldd*(jj+1)] = c_01;
619 }
620 }
621 for(; jj<n; jj++)
622 {
623 // factorize diagonal
624 f_00_inv = dA[jj];
625 for(ii=0; ii<m; ii++)
626 {
627 c_00 = alpha * pB[ii+ldb*jj];
628 for(kk=0; kk<jj; kk++)
629 {
630 c_00 -= pD[ii+ldd*kk] * pA[jj+lda*kk];
631 }
632 c_00 *= f_00_inv;
633 pD[ii+ldd*jj] = c_00;
634 }
635 }
636 return;
637 }
638
639
640
641// dtrsm_right_upper_transposed_notunit
642void TRSM_RUTN_LIBSTR(int m, int n, REAL alpha, struct STRMAT *sA, int ai, int aj, struct STRMAT *sB, int bi, int bj, struct STRMAT *sD, int di, int dj)
643 {
644 int jj;
645 char cl = 'l';
646 char cn = 'n';
647 char cr = 'r';
648 char ct = 't';
649 char cu = 'u';
650 int i1 = 1;
651 REAL *pA = sA->pA+ai+aj*sA->m;
652 REAL *pB = sB->pA+bi+bj*sB->m;
653 REAL *pD = sD->pA+di+dj*sD->m;
654 printf("\ndtrsm_rutn_libstr: feature not implemented yet\n");
655 exit(1);
656// if(!(pB==pD))
657// {
658// for(jj=0; jj<n; jj++)
659// COPY(&m, pB+jj*sB->m, &i1, pD+jj*sD->m, &i1);
660// }
661// TRSM(&cr, &cu, &ct, &cn, &m, &n, &alpha, pA, &(sA->m), pD, &(sD->m));
662 return;
663 }
664
665
666
667// dtrmm_right_upper_transposed_notunit (A triangular !!!)
668void TRMM_RUTN_LIBSTR(int m, int n, REAL alpha, struct STRMAT *sA, int ai, int aj, struct STRMAT *sB, int bi, int bj, struct STRMAT *sD, int di, int dj)
669 {
670 if(m<=0 | n<=0)
671 return;
672 int ii, jj, kk;
673 REAL
674 c_00, c_01,
675 c_10, c_11;
676 int lda = sA->m;
677 int ldb = sB->m;
678 int ldd = sD->m;
679 REAL *pA = sA->pA + ai + aj*lda;
680 REAL *pB = sB->pA + bi + bj*ldb;
681 REAL *pD = sD->pA + di + dj*ldd;
682 jj = 0;
683 for(; jj<n-1; jj+=2)
684 {
685 ii = 0;
686 for(; ii<m-1; ii+=2)
687 {
688 c_00 = 0.0;
689 c_10 = 0.0;
690 c_01 = 0.0;
691 c_11 = 0.0;
692 kk = jj;
693 c_00 += pB[(ii+0)+ldb*kk] * pA[(jj+0)+lda*kk];
694 c_10 += pB[(ii+1)+ldb*kk] * pA[(jj+0)+lda*kk];
695 kk++;
696 for(; kk<n; kk++)
697 {
698 c_00 += pB[(ii+0)+ldb*kk] * pA[(jj+0)+lda*kk];
699 c_10 += pB[(ii+1)+ldb*kk] * pA[(jj+0)+lda*kk];
700 c_01 += pB[(ii+0)+ldb*kk] * pA[(jj+1)+lda*kk];
701 c_11 += pB[(ii+1)+ldb*kk] * pA[(jj+1)+lda*kk];
702 }
703 pD[(ii+0)+ldd*(jj+0)] = alpha * c_00;
704 pD[(ii+1)+ldd*(jj+0)] = alpha * c_10;
705 pD[(ii+0)+ldd*(jj+1)] = alpha * c_01;
706 pD[(ii+1)+ldd*(jj+1)] = alpha * c_11;
707 }
708 for(; ii<m; ii++)
709 {
710 c_00 = 0.0;
711 c_01 = 0.0;
712 kk = jj;
713 c_00 += pB[(ii+0)+ldb*kk] * pA[(jj+0)+lda*kk];
714 kk++;
715 for(; kk<n; kk++)
716 {
717 c_00 += pB[(ii+0)+ldb*kk] * pA[(jj+0)+lda*kk];
718 c_01 += pB[(ii+0)+ldb*kk] * pA[(jj+1)+lda*kk];
719 }
720 pD[(ii+0)+ldd*(jj+0)] = alpha * c_00;
721 pD[(ii+0)+ldd*(jj+1)] = alpha * c_01;
722 }
723 }
724 for(; jj<n; jj++)
725 {
726 ii = 0;
727 for(; ii<m-1; ii+=2)
728 {
729 c_00 = 0.0;
730 c_10 = 0.0;
731 for(kk=jj; kk<n; kk++)
732 {
733 c_00 += pB[(ii+0)+ldb*kk] * pA[(jj+0)+lda*kk];
734 c_10 += pB[(ii+1)+ldb*kk] * pA[(jj+0)+lda*kk];
735 }
736 pD[(ii+0)+ldd*(jj+0)] = alpha * c_00;
737 pD[(ii+1)+ldd*(jj+0)] = alpha * c_10;
738 }
739 for(; ii<m; ii++)
740 {
741 c_00 = 0.0;
742 for(kk=jj; kk<n; kk++)
743 {
744 c_00 += pB[(ii+0)+ldb*kk] * pA[(jj+0)+lda*kk];
745 }
746 pD[(ii+0)+ldd*(jj+0)] = alpha * c_00;
747 }
748 }
749 return;
750 }
751
752
753
754// dtrmm_right_lower_nottransposed_notunit (A triangular !!!)
755void TRMM_RLNN_LIBSTR(int m, int n, REAL alpha, struct STRMAT *sA, int ai, int aj, struct STRMAT *sB, int bi, int bj, struct STRMAT *sD, int di, int dj)
756 {
757 if(m<=0 | n<=0)
758 return;
759 int ii, jj, kk;
760 REAL
761 c_00, c_01,
762 c_10, c_11;
763 int lda = sA->m;
764 int ldb = sB->m;
765 int ldd = sD->m;
766 REAL *pA = sA->pA + ai + aj*lda;
767 REAL *pB = sB->pA + bi + bj*ldb;
768 REAL *pD = sD->pA + di + dj*ldd;
769 jj = 0;
770 for(; jj<n-1; jj+=2)
771 {
772 ii = 0;
773 for(; ii<m-1; ii+=2)
774 {
775 c_00 = 0.0; ;
776 c_10 = 0.0; ;
777 c_01 = 0.0; ;
778 c_11 = 0.0; ;
779 kk = jj;
780 c_00 += pB[(ii+0)+ldb*kk] * pA[kk+lda*(jj+0)];
781 c_10 += pB[(ii+1)+ldb*kk] * pA[kk+lda*(jj+0)];
782 kk++;
783 for(; kk<n; kk++)
784 {
785 c_00 += pB[(ii+0)+ldb*kk] * pA[kk+lda*(jj+0)];
786 c_10 += pB[(ii+1)+ldb*kk] * pA[kk+lda*(jj+0)];
787 c_01 += pB[(ii+0)+ldb*kk] * pA[kk+lda*(jj+1)];
788 c_11 += pB[(ii+1)+ldb*kk] * pA[kk+lda*(jj+1)];
789 }
790 pD[(ii+0)+ldd*(jj+0)] = alpha * c_00;
791 pD[(ii+1)+ldd*(jj+0)] = alpha * c_10;
792 pD[(ii+0)+ldd*(jj+1)] = alpha * c_01;
793 pD[(ii+1)+ldd*(jj+1)] = alpha * c_11;
794 }
795 for(; ii<m; ii++)
796 {
797 c_00 = 0.0; ;
798 c_01 = 0.0; ;
799 kk = jj;
800 c_00 += pB[(ii+0)+ldb*kk] * pA[kk+lda*(jj+0)];
801 kk++;
802 for(; kk<n; kk++)
803 {
804 c_00 += pB[(ii+0)+ldb*kk] * pA[kk+lda*(jj+0)];
805 c_01 += pB[(ii+0)+ldb*kk] * pA[kk+lda*(jj+1)];
806 }
807 pD[(ii+0)+ldd*(jj+0)] = alpha * c_00;
808 pD[(ii+0)+ldd*(jj+1)] = alpha * c_01;
809 }
810 }
811 for(; jj<n; jj++)
812 {
813 ii = 0;
814 for(; ii<m-1; ii+=2)
815 {
816 c_00 = 0.0; ;
817 c_10 = 0.0; ;
818 for(kk=jj; kk<n; kk++)
819 {
820 c_00 += pB[(ii+0)+ldb*kk] * pA[kk+lda*(jj+0)];
821 c_10 += pB[(ii+1)+ldb*kk] * pA[kk+lda*(jj+0)];
822 }
823 pD[(ii+0)+ldd*(jj+0)] = alpha * c_00;
824 pD[(ii+1)+ldd*(jj+0)] = alpha * c_10;
825 }
826 for(; ii<m; ii++)
827 {
828 c_00 = 0.0; ;
829 for(kk=jj; kk<n; kk++)
830 {
831 c_00 += pB[(ii+0)+ldb*kk] * pA[kk+lda*(jj+0)];
832 }
833 pD[(ii+0)+ldd*(jj+0)] = alpha * c_00;
834 }
835 }
836 return;
837 }
838
839
840
841// dsyrk_lower_nortransposed (allowing for different factors => use dgemm !!!)
842void SYRK_LN_LIBSTR(int m, int k, REAL alpha, struct STRMAT *sA, int ai, int aj, struct STRMAT *sB, int bi, int bj, REAL beta, struct STRMAT *sC, int ci, int cj, struct STRMAT *sD, int di, int dj)
843 {
844 if(m<=0)
845 return;
846 int ii, jj, kk;
847 int n = m; // TODO optimize for this case !!!!!!!!!
848 REAL
849 c_00, c_01,
850 c_10, c_11;
851 int lda = sA->m;
852 int ldb = sB->m;
853 int ldc = sC->m;
854 int ldd = sD->m;
855 REAL *pA = sA->pA + ai + aj*lda;
856 REAL *pB = sB->pA + bi + bj*ldb;
857 REAL *pC = sC->pA + ci + cj*ldc;
858 REAL *pD = sD->pA + di + dj*ldd;
859 jj = 0;
860 for(; jj<n-1; jj+=2)
861 {
862 // diagonal
863 c_00 = 0.0;
864 c_10 = 0.0;
865 c_11 = 0.0;
866 for(kk=0; kk<k; kk++)
867 {
868 c_00 += pA[jj+0+lda*kk] * pB[jj+0+ldb*kk];
869 c_10 += pA[jj+1+lda*kk] * pB[jj+0+ldb*kk];
870 c_11 += pA[jj+1+lda*kk] * pB[jj+1+ldb*kk];
871 }
872 pD[jj+0+ldd*(jj+0)] = beta * pC[jj+0+ldc*(jj+0)] + alpha * c_00;
873 pD[jj+1+ldd*(jj+0)] = beta * pC[jj+1+ldc*(jj+0)] + alpha * c_10;
874 pD[jj+1+ldd*(jj+1)] = beta * pC[jj+1+ldc*(jj+1)] + alpha * c_11;
875 // lower
876 ii = jj+2;
877 for(; ii<m-1; ii+=2)
878 {
879 c_00 = 0.0;
880 c_10 = 0.0;
881 c_01 = 0.0;
882 c_11 = 0.0;
883 for(kk=0; kk<k; kk++)
884 {
885 c_00 += pA[ii+0+lda*kk] * pB[jj+0+ldb*kk];
886 c_10 += pA[ii+1+lda*kk] * pB[jj+0+ldb*kk];
887 c_01 += pA[ii+0+lda*kk] * pB[jj+1+ldb*kk];
888 c_11 += pA[ii+1+lda*kk] * pB[jj+1+ldb*kk];
889 }
890 pD[ii+0+ldd*(jj+0)] = beta * pC[ii+0+ldc*(jj+0)] + alpha * c_00;
891 pD[ii+1+ldd*(jj+0)] = beta * pC[ii+1+ldc*(jj+0)] + alpha * c_10;
892 pD[ii+0+ldd*(jj+1)] = beta * pC[ii+0+ldc*(jj+1)] + alpha * c_01;
893 pD[ii+1+ldd*(jj+1)] = beta * pC[ii+1+ldc*(jj+1)] + alpha * c_11;
894 }
895 for(; ii<m; ii++)
896 {
897 c_00 = 0.0;
898 c_01 = 0.0;
899 for(kk=0; kk<k; kk++)
900 {
901 c_00 += pA[ii+0+lda*kk] * pB[jj+0+ldb*kk];
902 c_01 += pA[ii+0+lda*kk] * pB[jj+1+ldb*kk];
903 }
904 pD[ii+0+ldd*(jj+0)] = beta * pC[ii+0+ldc*(jj+0)] + alpha * c_00;
905 pD[ii+0+ldd*(jj+1)] = beta * pC[ii+0+ldc*(jj+1)] + alpha * c_01;
906 }
907 }
908 for(; jj<n; jj++)
909 {
910 // diagonal
911 c_00 = 0.0;
912 for(kk=0; kk<k; kk++)
913 {
914 c_00 += pA[jj+lda*kk] * pB[jj+ldb*kk];
915 }
916 pD[jj+ldd*jj] = beta * pC[jj+ldc*jj] + alpha * c_00;
917 // lower
918 for(ii=jj+1; ii<m; ii++)
919 {
920 c_00 = 0.0;
921 for(kk=0; kk<k; kk++)
922 {
923 c_00 += pA[ii+lda*kk] * pB[jj+ldb*kk];
924 }
925 pD[ii+ldd*jj] = beta * pC[ii+ldc*jj] + alpha * c_00;
926 }
927 }
928 return;
929 }
930
931
932
933// dsyrk_lower_nortransposed (allowing for different factors => use dgemm !!!)
934void SYRK_LN_MN_LIBSTR(int m, int n, int k, REAL alpha, struct STRMAT *sA, int ai, int aj, struct STRMAT *sB, int bi, int bj, REAL beta, struct STRMAT *sC, int ci, int cj, struct STRMAT *sD, int di, int dj)
935 {
936 if(m<=0 | n<=0)
937 return;
938 int ii, jj, kk;
939 REAL
940 c_00, c_01,
941 c_10, c_11;
942 int lda = sA->m;
943 int ldb = sB->m;
944 int ldc = sC->m;
945 int ldd = sD->m;
946 REAL *pA = sA->pA + ai + aj*lda;
947 REAL *pB = sB->pA + bi + bj*ldb;
948 REAL *pC = sC->pA + ci + cj*ldc;
949 REAL *pD = sD->pA + di + dj*ldd;
950 jj = 0;
951 for(; jj<n-1; jj+=2)
952 {
953 // diagonal
954 c_00 = 0.0;
955 c_10 = 0.0;
956 c_11 = 0.0;
957 for(kk=0; kk<k; kk++)
958 {
959 c_00 += pA[jj+0+lda*kk] * pB[jj+0+ldb*kk];
960 c_10 += pA[jj+1+lda*kk] * pB[jj+0+ldb*kk];
961 c_11 += pA[jj+1+lda*kk] * pB[jj+1+ldb*kk];
962 }
963 pD[jj+0+ldd*(jj+0)] = beta * pC[jj+0+ldc*(jj+0)] + alpha * c_00;
964 pD[jj+1+ldd*(jj+0)] = beta * pC[jj+1+ldc*(jj+0)] + alpha * c_10;
965 pD[jj+1+ldd*(jj+1)] = beta * pC[jj+1+ldc*(jj+1)] + alpha * c_11;
966 // lower
967 ii = jj+2;
968 for(; ii<m-1; ii+=2)
969 {
970 c_00 = 0.0;
971 c_10 = 0.0;
972 c_01 = 0.0;
973 c_11 = 0.0;
974 for(kk=0; kk<k; kk++)
975 {
976 c_00 += pA[ii+0+lda*kk] * pB[jj+0+ldb*kk];
977 c_10 += pA[ii+1+lda*kk] * pB[jj+0+ldb*kk];
978 c_01 += pA[ii+0+lda*kk] * pB[jj+1+ldb*kk];
979 c_11 += pA[ii+1+lda*kk] * pB[jj+1+ldb*kk];
980 }
981 pD[ii+0+ldd*(jj+0)] = beta * pC[ii+0+ldc*(jj+0)] + alpha * c_00;
982 pD[ii+1+ldd*(jj+0)] = beta * pC[ii+1+ldc*(jj+0)] + alpha * c_10;
983 pD[ii+0+ldd*(jj+1)] = beta * pC[ii+0+ldc*(jj+1)] + alpha * c_01;
984 pD[ii+1+ldd*(jj+1)] = beta * pC[ii+1+ldc*(jj+1)] + alpha * c_11;
985 }
986 for(; ii<m; ii++)
987 {
988 c_00 = 0.0;
989 c_01 = 0.0;
990 for(kk=0; kk<k; kk++)
991 {
992 c_00 += pA[ii+0+lda*kk] * pB[jj+0+ldb*kk];
993 c_01 += pA[ii+0+lda*kk] * pB[jj+1+ldb*kk];
994 }
995 pD[ii+0+ldd*(jj+0)] = beta * pC[ii+0+ldc*(jj+0)] + alpha * c_00;
996 pD[ii+0+ldd*(jj+1)] = beta * pC[ii+0+ldc*(jj+1)] + alpha * c_01;
997 }
998 }
999 for(; jj<n; jj++)
1000 {
1001 // diagonal
1002 c_00 = 0.0;
1003 for(kk=0; kk<k; kk++)
1004 {
1005 c_00 += pA[jj+lda*kk] * pB[jj+ldb*kk];
1006 }
1007 pD[jj+ldd*jj] = beta * pC[jj+ldc*jj] + alpha * c_00;
1008 // lower
1009 for(ii=jj+1; ii<m; ii++)
1010 {
1011 c_00 = 0.0;
1012 for(kk=0; kk<k; kk++)
1013 {
1014 c_00 += pA[ii+lda*kk] * pB[jj+ldb*kk];
1015 }
1016 pD[ii+ldd*jj] = beta * pC[ii+ldc*jj] + alpha * c_00;
1017 }
1018 }
1019 return;
1020 }
1021
1022
1023
1024#elif defined(LA_BLAS)
1025
1026
1027
1028// dgemm nt
1029void GEMM_NT_LIBSTR(int m, int n, int k, REAL alpha, struct STRMAT *sA, int ai, int aj, struct STRMAT *sB, int bi, int bj, REAL beta, struct STRMAT *sC, int ci, int cj, struct STRMAT *sD, int di, int dj)
1030 {
1031 int jj;
1032 char cn = 'n';
1033 char ct = 't';
1034 REAL *pA = sA->pA+ai+aj*sA->m;
1035 REAL *pB = sB->pA+bi+bj*sB->m;
1036 REAL *pC = sC->pA+ci+cj*sC->m;
1037 REAL *pD = sD->pA+di+dj*sD->m;
1038#if defined(REF_BLAS_BLIS)
1039 long long i1 = 1;
1040 long long mm = m;
1041 long long nn = n;
1042 long long kk = k;
1043 long long lda = sA->m;
1044 long long ldb = sB->m;
1045 long long ldc = sC->m;
1046 long long ldd = sD->m;
1047 if(!(beta==0.0 || pC==pD))
1048 {
1049 for(jj=0; jj<n; jj++)
1050 COPY(&mm, pC+jj*ldc, &i1, pD+jj*ldd, &i1);
1051 }
1052 GEMM(&cn, &ct, &mm, &nn, &kk, &alpha, pA, &lda, pB, &ldb, &beta, pD, &ldd);
1053#else
1054 int i1 = 1;
1055 int lda = sA->m;
1056 int ldb = sB->m;
1057 int ldc = sC->m;
1058 int ldd = sD->m;
1059 if(!(beta==0.0 || pC==pD))
1060 {
1061 for(jj=0; jj<n; jj++)
1062 COPY(&m, pC+jj*ldc, &i1, pD+jj*ldd, &i1);
1063 }
1064 GEMM(&cn, &ct, &m, &n, &k, &alpha, pA, &lda, pB, &ldb, &beta, pD, &ldd);
1065#endif
1066 return;
1067 }
1068
1069
1070
1071// dgemm nn
1072void GEMM_NN_LIBSTR(int m, int n, int k, REAL alpha, struct STRMAT *sA, int ai, int aj, struct STRMAT *sB, int bi, int bj, REAL beta, struct STRMAT *sC, int ci, int cj, struct STRMAT *sD, int di, int dj)
1073 {
1074 int jj;
1075 char cn = 'n';
1076 REAL *pA = sA->pA+ai+aj*sA->m;
1077 REAL *pB = sB->pA+bi+bj*sB->m;
1078 REAL *pC = sC->pA+ci+cj*sC->m;
1079 REAL *pD = sD->pA+di+dj*sD->m;
1080#if defined(REF_BLAS_BLIS)
1081 long long i1 = 1;
1082 long long mm = m;
1083 long long nn = n;
1084 long long kk = k;
1085 long long lda = sA->m;
1086 long long ldb = sB->m;
1087 long long ldc = sC->m;
1088 long long ldd = sD->m;
1089 if(!(beta==0.0 || pC==pD))
1090 {
1091 for(jj=0; jj<n; jj++)
1092 COPY(&mm, pC+jj*ldc, &i1, pD+jj*ldd, &i1);
1093 }
1094 GEMM(&cn, &cn, &mm, &nn, &kk, &alpha, pA, &lda, pB, &ldb, &beta, pD, &ldd);
1095#else
1096 int i1 = 1;
1097 int lda = sA->m;
1098 int ldb = sB->m;
1099 int ldc = sC->m;
1100 int ldd = sD->m;
1101 if(!(beta==0.0 || pC==pD))
1102 {
1103 for(jj=0; jj<n; jj++)
1104 COPY(&m, pC+jj*ldc, &i1, pD+jj*ldd, &i1);
1105 }
1106 GEMM(&cn, &cn, &m, &n, &k, &alpha, pA, &lda, pB, &ldb, &beta, pD, &ldd);
1107#endif
1108 return;
1109 }
1110
1111
1112
1113// dtrsm_left_lower_nottransposed_unit
1114void TRSM_LLNU_LIBSTR(int m, int n, REAL alpha, struct STRMAT *sA, int ai, int aj, struct STRMAT *sB, int bi, int bj, struct STRMAT *sD, int di, int dj)
1115 {
1116 int jj;
1117 char cl = 'l';
1118 char cn = 'n';
1119 char cu = 'u';
1120 REAL *pA = sA->pA+ai+aj*sA->m;
1121 REAL *pB = sB->pA+bi+bj*sB->m;
1122 REAL *pD = sD->pA+di+dj*sD->m;
1123#if defined(REF_BLAS_BLIS)
1124 long long i1 = 1;
1125 long long mm = m;
1126 long long nn = n;
1127 long long lda = sA->m;
1128 long long ldb = sB->m;
1129 long long ldd = sD->m;
1130 if(!(pB==pD))
1131 {
1132 for(jj=0; jj<n; jj++)
1133 COPY(&mm, pB+jj*ldb, &i1, pD+jj*ldd, &i1);
1134 }
1135 TRSM(&cl, &cl, &cn, &cu, &mm, &nn, &alpha, pA, &lda, pD, &ldd);
1136#else
1137 int i1 = 1;
1138 int lda = sA->m;
1139 int ldb = sB->m;
1140 int ldd = sD->m;
1141 if(!(pB==pD))
1142 {
1143 for(jj=0; jj<n; jj++)
1144 COPY(&m, pB+jj*ldb, &i1, pD+jj*sD->m, &i1);
1145 }
1146 TRSM(&cl, &cl, &cn, &cu, &m, &n, &alpha, pA, &lda, pD, &ldd);
1147#endif
1148 return;
1149 }
1150
1151
1152
1153// dtrsm_left_upper_nottransposed_notunit
1154void TRSM_LUNN_LIBSTR(int m, int n, REAL alpha, struct STRMAT *sA, int ai, int aj, struct STRMAT *sB, int bi, int bj, struct STRMAT *sD, int di, int dj)
1155 {
1156 int jj;
1157 char cl = 'l';
1158 char cn = 'n';
1159 char cu = 'u';
1160 REAL *pA = sA->pA+ai+aj*sA->m;
1161 REAL *pB = sB->pA+bi+bj*sB->m;
1162 REAL *pD = sD->pA+di+dj*sD->m;
1163#if defined(REF_BLAS_BLIS)
1164 long long i1 = 1;
1165 long long mm = m;
1166 long long nn = n;
1167 long long lda = sA->m;
1168 long long ldb = sB->m;
1169 long long ldd = sD->m;
1170 if(!(pB==pD))
1171 {
1172 for(jj=0; jj<n; jj++)
1173 COPY(&mm, pB+jj*ldb, &i1, pD+jj*ldd, &i1);
1174 }
1175 TRSM(&cl, &cu, &cn, &cn, &mm, &nn, &alpha, pA, &lda, pD, &ldd);
1176#else
1177 int i1 = 1;
1178 int lda = sA->m;
1179 int ldb = sB->m;
1180 int ldd = sD->m;
1181 if(!(pB==pD))
1182 {
1183 for(jj=0; jj<n; jj++)
1184 COPY(&m, pB+jj*ldb, &i1, pD+jj*ldd, &i1);
1185 }
1186 TRSM(&cl, &cu, &cn, &cn, &m, &n, &alpha, pA, &lda, pD, &ldd);
1187#endif
1188 return;
1189 }
1190
1191
1192
1193// dtrsm_right_lower_transposed_unit
1194void TRSM_RLTU_LIBSTR(int m, int n, REAL alpha, struct STRMAT *sA, int ai, int aj, struct STRMAT *sB, int bi, int bj, struct STRMAT *sD, int di, int dj)
1195 {
1196 int jj;
1197 char cl = 'l';
1198 char cn = 'n';
1199 char cr = 'r';
1200 char ct = 't';
1201 char cu = 'u';
1202 REAL *pA = sA->pA+ai+aj*sA->m;
1203 REAL *pB = sB->pA+bi+bj*sB->m;
1204 REAL *pD = sD->pA+di+dj*sD->m;
1205#if defined(REF_BLAS_BLIS)
1206 long long i1 = 1;
1207 long long mm = m;
1208 long long nn = n;
1209 long long lda = sA->m;
1210 long long ldb = sB->m;
1211 long long ldd = sD->m;
1212 if(!(pB==pD))
1213 {
1214 for(jj=0; jj<n; jj++)
1215 COPY(&mm, pB+jj*ldb, &i1, pD+jj*ldd, &i1);
1216 }
1217 TRSM(&cr, &cl, &ct, &cu, &mm, &nn, &alpha, pA, &lda, pD, &ldd);
1218#else
1219 int i1 = 1;
1220 int lda = sA->m;
1221 int ldb = sB->m;
1222 int ldd = sD->m;
1223 if(!(pB==pD))
1224 {
1225 for(jj=0; jj<n; jj++)
1226 COPY(&m, pB+jj*ldb, &i1, pD+jj*ldd, &i1);
1227 }
1228 TRSM(&cr, &cl, &ct, &cu, &m, &n, &alpha, pA, &lda, pD, &ldd);
1229#endif
1230 return;
1231 }
1232
1233
1234
1235// dtrsm_right_lower_transposed_notunit
1236void TRSM_RLTN_LIBSTR(int m, int n, REAL alpha, struct STRMAT *sA, int ai, int aj, struct STRMAT *sB, int bi, int bj, struct STRMAT *sD, int di, int dj)
1237 {
1238 int jj;
1239 char cl = 'l';
1240 char cn = 'n';
1241 char cr = 'r';
1242 char ct = 't';
1243 char cu = 'u';
1244 REAL *pA = sA->pA+ai+aj*sA->m;
1245 REAL *pB = sB->pA+bi+bj*sB->m;
1246 REAL *pD = sD->pA+di+dj*sD->m;
1247#if defined(REF_BLAS_BLIS)
1248 long long i1 = 1;
1249 long long mm = m;
1250 long long nn = n;
1251 long long lda = sA->m;
1252 long long ldb = sB->m;
1253 long long ldd = sD->m;
1254 if(!(pB==pD))
1255 {
1256 for(jj=0; jj<n; jj++)
1257 COPY(&mm, pB+jj*ldb, &i1, pD+jj*ldd, &i1);
1258 }
1259 TRSM(&cr, &cl, &ct, &cn, &mm, &nn, &alpha, pA, &lda, pD, &ldd);
1260#else
1261 int i1 = 1;
1262 int lda = sA->m;
1263 int ldb = sB->m;
1264 int ldd = sD->m;
1265 if(!(pB==pD))
1266 {
1267 for(jj=0; jj<n; jj++)
1268 COPY(&m, pB+jj*ldb, &i1, pD+jj*ldd, &i1);
1269 }
1270 TRSM(&cr, &cl, &ct, &cn, &m, &n, &alpha, pA, &lda, pD, &ldd);
1271#endif
1272 return;
1273 }
1274
1275
1276
1277// dtrsm_right_upper_transposed_notunit
1278void TRSM_RUTN_LIBSTR(int m, int n, REAL alpha, struct STRMAT *sA, int ai, int aj, struct STRMAT *sB, int bi, int bj, struct STRMAT *sD, int di, int dj)
1279 {
1280 int jj;
1281 char cl = 'l';
1282 char cn = 'n';
1283 char cr = 'r';
1284 char ct = 't';
1285 char cu = 'u';
1286 REAL *pA = sA->pA+ai+aj*sA->m;
1287 REAL *pB = sB->pA+bi+bj*sB->m;
1288 REAL *pD = sD->pA+di+dj*sD->m;
1289#if defined(REF_BLAS_BLIS)
1290 long long i1 = 1;
1291 long long mm = m;
1292 long long nn = n;
1293 long long lda = sA->m;
1294 long long ldb = sB->m;
1295 long long ldd = sD->m;
1296 if(!(pB==pD))
1297 {
1298 for(jj=0; jj<n; jj++)
1299 COPY(&mm, pB+jj*ldb, &i1, pD+jj*ldd, &i1);
1300 }
1301 TRSM(&cr, &cu, &ct, &cn, &mm, &nn, &alpha, pA, &lda, pD, &ldd);
1302#else
1303 int i1 = 1;
1304 int lda = sA->m;
1305 int ldb = sB->m;
1306 int ldd = sD->m;
1307 if(!(pB==pD))
1308 {
1309 for(jj=0; jj<n; jj++)
1310 COPY(&m, pB+jj*ldb, &i1, pD+jj*ldd, &i1);
1311 }
1312 TRSM(&cr, &cu, &ct, &cn, &m, &n, &alpha, pA, &lda, pD, &ldd);
1313#endif
1314 return;
1315 }
1316
1317
1318
1319// dtrmm_right_upper_transposed_notunit (A triangular !!!)
1320void TRMM_RUTN_LIBSTR(int m, int n, REAL alpha, struct STRMAT *sA, int ai, int aj, struct STRMAT *sB, int bi, int bj, struct STRMAT *sD, int di, int dj)
1321 {
1322 int jj;
1323 char cl = 'l';
1324 char cn = 'n';
1325 char cr = 'r';
1326 char ct = 't';
1327 char cu = 'u';
1328 REAL *pA = sA->pA+ai+aj*sA->m;
1329 REAL *pB = sB->pA+bi+bj*sB->m;
1330 REAL *pD = sD->pA+di+dj*sD->m;
1331#if defined(REF_BLAS_BLIS)
1332 long long i1 = 1;
1333 long long mm = m;
1334 long long nn = n;
1335 long long lda = sA->m;
1336 long long ldb = sB->m;
1337 long long ldd = sD->m;
1338 if(!(pB==pD))
1339 {
1340 for(jj=0; jj<n; jj++)
1341 COPY(&mm, pB+jj*ldb, &i1, pD+jj*ldd, &i1);
1342 }
1343 TRMM(&cr, &cu, &ct, &cn, &mm, &nn, &alpha, pA, &lda, pD, &ldd);
1344#else
1345 int i1 = 1;
1346 int lda = sA->m;
1347 int ldb = sB->m;
1348 int ldd = sD->m;
1349 if(!(pB==pD))
1350 {
1351 for(jj=0; jj<n; jj++)
1352 COPY(&m, pB+jj*ldb, &i1, pD+jj*ldd, &i1);
1353 }
1354 TRMM(&cr, &cu, &ct, &cn, &m, &n, &alpha, pA, &lda, pD, &ldd);
1355#endif
1356 return;
1357 }
1358
1359
1360
1361// dtrmm_right_lower_nottransposed_notunit (A triangular !!!)
1362void TRMM_RLNN_LIBSTR(int m, int n, REAL alpha, struct STRMAT *sA, int ai, int aj, struct STRMAT *sB, int bi, int bj, struct STRMAT *sD, int di, int dj)
1363 {
1364 int jj;
1365 char cl = 'l';
1366 char cn = 'n';
1367 char cr = 'r';
1368 char ct = 't';
1369 char cu = 'u';
1370 REAL *pA = sA->pA+ai+aj*sA->m;
1371 REAL *pB = sB->pA+bi+bj*sB->m;
1372 REAL *pD = sD->pA+di+dj*sD->m;
1373#if defined(REF_BLAS_BLIS)
1374 long long i1 = 1;
1375 long long mm = m;
1376 long long nn = n;
1377 long long lda = sA->m;
1378 long long ldb = sB->m;
1379 long long ldd = sD->m;
1380 if(!(pB==pD))
1381 {
1382 for(jj=0; jj<n; jj++)
1383 COPY(&mm, pB+jj*ldb, &i1, pD+jj*ldd, &i1);
1384 }
1385 TRMM(&cr, &cl, &cn, &cn, &mm, &nn, &alpha, pA, &lda, pD, &ldd);
1386#else
1387 int i1 = 1;
1388 int lda = sA->m;
1389 int ldb = sB->m;
1390 int ldd = sD->m;
1391 if(!(pB==pD))
1392 {
1393 for(jj=0; jj<n; jj++)
1394 COPY(&m, pB+jj*ldb, &i1, pD+jj*ldd, &i1);
1395 }
1396 TRMM(&cr, &cl, &cn, &cn, &m, &n, &alpha, pA, &lda, pD, &ldd);
1397#endif
1398 return;
1399 }
1400
1401
1402
1403// dsyrk_lower_nortransposed (allowing for different factors => use dgemm !!!)
1404void SYRK_LN_LIBSTR(int m, int k, REAL alpha, struct STRMAT *sA, int ai, int aj, struct STRMAT *sB, int bi, int bj, REAL beta, struct STRMAT *sC, int ci, int cj, struct STRMAT *sD, int di, int dj)
1405 {
1406 int jj;
1407 char cl = 'l';
1408 char cn = 'n';
1409 char cr = 'r';
1410 char ct = 't';
1411 char cu = 'u';
1412 REAL *pA = sA->pA + ai + aj*sA->m;
1413 REAL *pB = sB->pA + bi + bj*sB->m;
1414 REAL *pC = sC->pA + ci + cj*sC->m;
1415 REAL *pD = sD->pA + di + dj*sD->m;
1416#if defined(REF_BLAS_BLIS)
1417 long long i1 = 1;
1418 long long mm = m;
1419 long long kk = k;
1420 long long lda = sA->m;
1421 long long ldb = sB->m;
1422 long long ldc = sC->m;
1423 long long ldd = sD->m;
1424 if(!(beta==0.0 || pC==pD))
1425 {
1426 for(jj=0; jj<m; jj++)
1427 COPY(&mm, pC+jj*ldc, &i1, pD+jj*ldd, &i1);
1428 }
1429 if(pA==pB)
1430 {
1431 SYRK(&cl, &cn, &mm, &kk, &alpha, pA, &lda, &beta, pD, &ldd);
1432 }
1433 else
1434 {
1435 GEMM(&cn, &ct, &mm, &mm, &kk, &alpha, pA, &lda, pB, &ldb, &beta, pD, &ldd);
1436 }
1437#else
1438 int i1 = 1;
1439 int lda = sA->m;
1440 int ldb = sB->m;
1441 int ldc = sC->m;
1442 int ldd = sD->m;
1443 if(!(beta==0.0 || pC==pD))
1444 {
1445 for(jj=0; jj<m; jj++)
1446 COPY(&m, pC+jj*sC->m, &i1, pD+jj*sD->m, &i1);
1447 }
1448 if(pA==pB)
1449 {
1450 SYRK(&cl, &cn, &m, &k, &alpha, pA, &lda, &beta, pD, &ldd);
1451 }
1452 else
1453 {
1454 GEMM(&cn, &ct, &m, &m, &k, &alpha, pA, &lda, pB, &ldb, &beta, pD, &ldd);
1455 }
1456#endif
1457 return;
1458 }
1459
1460// dsyrk_lower_nortransposed (allowing for different factors => use dgemm !!!)
1461void SYRK_LN_MN_LIBSTR(int m, int n, int k, REAL alpha, struct STRMAT *sA, int ai, int aj, struct STRMAT *sB, int bi, int bj, REAL beta, struct STRMAT *sC, int ci, int cj, struct STRMAT *sD, int di, int dj)
1462 {
1463 int jj;
1464 char cl = 'l';
1465 char cn = 'n';
1466 char cr = 'r';
1467 char ct = 't';
1468 char cu = 'u';
1469 REAL *pA = sA->pA + ai + aj*sA->m;
1470 REAL *pB = sB->pA + bi + bj*sB->m;
1471 REAL *pC = sC->pA + ci + cj*sC->m;
1472 REAL *pD = sD->pA + di + dj*sD->m;
1473#if defined(REF_BLAS_BLIS)
1474 long long i1 = 1;
1475 long long mm = m;
1476 long long nn = n;
1477 long long kk = k;
1478 long long mmn = mm-nn;
1479 long long lda = sA->m;
1480 long long ldb = sB->m;
1481 long long ldc = sC->m;
1482 long long ldd = sD->m;
1483 if(!(beta==0.0 || pC==pD))
1484 {
1485 for(jj=0; jj<n; jj++)
1486 COPY(&mm, pC+jj*ldc, &i1, pD+jj*ldd, &i1);
1487 }
1488 if(pA==pB)
1489 {
1490 SYRK(&cl, &cn, &nn, &kk, &alpha, pA, &lda, &beta, pD, &ldd);
1491 GEMM(&cn, &ct, &mmn, &nn, &kk, &alpha, pA+n, &lda, pB, &ldb, &beta, pD+n, &ldd);
1492 }
1493 else
1494 {
1495 GEMM(&cn, &ct, &mm, &nn, &kk, &alpha, pA, &lda, pB, &ldb, &beta, pD, &ldd);
1496 }
1497#else
1498 int i1 = 1;
1499 int mmn = m-n;
1500 int lda = sA->m;
1501 int ldb = sB->m;
1502 int ldc = sC->m;
1503 int ldd = sD->m;
1504 if(!(beta==0.0 || pC==pD))
1505 {
1506 for(jj=0; jj<n; jj++)
1507 COPY(&m, pC+jj*sC->m, &i1, pD+jj*sD->m, &i1);
1508 }
1509 if(pA==pB)
1510 {
1511 SYRK(&cl, &cn, &n, &k, &alpha, pA, &lda, &beta, pD, &ldd);
1512 GEMM(&cn, &ct, &mmn, &n, &k, &alpha, pA+n, &lda, pB, &ldb, &beta, pD+n, &ldd);
1513 }
1514 else
1515 {
1516 GEMM(&cn, &ct, &m, &n, &k, &alpha, pA, &lda, pB, &ldb, &beta, pD, &ldd);
1517 }
1518#endif
1519 return;
1520 }
1521
1522#else
1523
1524#error : wrong LA choice
1525
1526#endif
1527
1528
1529
1530
1531