blob: cab8e3c29d79af9bec9046439ccccaf5c703a515 [file] [log] [blame]
Austin Schuh9a24b372018-01-28 16:12:29 -08001/**************************************************************************************************
2* *
3* This file is part of BLASFEO. *
4* *
5* BLASFEO -- BLAS For Embedded Optimization. *
6* Copyright (C) 2016-2017 by Gianluca Frison. *
7* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl. *
8* All rights reserved. *
9* *
10* HPMPC is free software; you can redistribute it and/or *
11* modify it under the terms of the GNU Lesser General Public *
12* License as published by the Free Software Foundation; either *
13* version 2.1 of the License, or (at your option) any later version. *
14* *
15* HPMPC is distributed in the hope that it will be useful, *
16* but WITHOUT ANY WARRANTY; without even the implied warranty of *
17* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. *
18* See the GNU Lesser General Public License for more details. *
19* *
20* You should have received a copy of the GNU Lesser General Public *
21* License along with HPMPC; if not, write to the Free Software *
22* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA *
23* *
24* Author: Gianluca Frison, giaf (at) dtu.dk *
25* gianluca.frison (at) imtek.uni-freiburg.de *
26* *
27**************************************************************************************************/
28
29#include <stdlib.h>
30#include <stdio.h>
31
32#include "../include/blasfeo_common.h"
33#include "../include/blasfeo_d_kernel.h"
34#include "../include/blasfeo_d_aux.h"
35
36
37
38void dtrsv_ln_inv_lib(int m, int n, double *pA, int sda, double *inv_diag_A, double *x, double *y)
39 {
40
41 if(m<=0 || n<=0)
42 return;
43
44 // suppose m>=n
45 if(m<n)
46 m = n;
47
48 const int bs = 4;
49
50 double alpha = -1.0;
51 double beta = 1.0;
52
53 int i;
54
55 if(x!=y)
56 {
57 for(i=0; i<m; i++)
58 y[i] = x[i];
59 }
60
61 i = 0;
62 for( ; i<n-3; i+=4)
63 {
64 kernel_dtrsv_ln_inv_4_lib4(i, &pA[i*sda], &inv_diag_A[i], y, &y[i], &y[i]);
65 }
66 if(i<n)
67 {
68 kernel_dtrsv_ln_inv_4_vs_lib4(i, &pA[i*sda], &inv_diag_A[i], y, &y[i], &y[i], m-i, n-i);
69 i+=4;
70 }
71#if defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_HASWELL)
72 for( ; i<m-7; i+=8)
73 {
74 kernel_dgemv_n_8_lib4(n, &alpha, &pA[i*sda], sda, y, &beta, &y[i], &y[i]);
75 }
76 if(i<m-3)
77 {
78 kernel_dgemv_n_4_lib4(n, &alpha, &pA[i*sda], y, &beta, &y[i], &y[i]);
79 i+=4;
80 }
81#else
82 for( ; i<m-3; i+=4)
83 {
84 kernel_dgemv_n_4_lib4(n, &alpha, &pA[i*sda], y, &beta, &y[i], &y[i]);
85 }
86#endif
87 if(i<m)
88 {
89 kernel_dgemv_n_4_gen_lib4(n, &alpha, &pA[i*sda], y, &beta, &y[i], &y[i], 0, m-i);
90 i+=4;
91 }
92
93 }
94
95
96
97void dtrsv_lt_inv_lib(int m, int n, double *pA, int sda, double *inv_diag_A, double *x, double *y)
98 {
99
100 if(m<=0 || n<=0)
101 return;
102
103 if(n>m)
104 n = m;
105
106 const int bs = 4;
107
108 int i;
109
110 if(x!=y)
111 for(i=0; i<m; i++)
112 y[i] = x[i];
113
114 i=0;
115 if(n%4==1)
116 {
117 kernel_dtrsv_lt_inv_1_lib4(m-n+i+1, &pA[n/bs*bs*sda+(n-i-1)*bs], sda, &inv_diag_A[n-i-1], &y[n-i-1], &y[n-i-1], &y[n-i-1]);
118 i++;
119 }
120 else if(n%4==2)
121 {
122 kernel_dtrsv_lt_inv_2_lib4(m-n+i+2, &pA[n/bs*bs*sda+(n-i-2)*bs], sda, &inv_diag_A[n-i-2], &y[n-i-2], &y[n-i-2], &y[n-i-2]);
123 i+=2;
124 }
125 else if(n%4==3)
126 {
127 kernel_dtrsv_lt_inv_3_lib4(m-n+i+3, &pA[n/bs*bs*sda+(n-i-3)*bs], sda, &inv_diag_A[n-i-3], &y[n-i-3], &y[n-i-3], &y[n-i-3]);
128 i+=3;
129 }
130 for(; i<n-3; i+=4)
131 {
132 kernel_dtrsv_lt_inv_4_lib4(m-n+i+4, &pA[(n-i-4)/bs*bs*sda+(n-i-4)*bs], sda, &inv_diag_A[n-i-4], &y[n-i-4], &y[n-i-4], &y[n-i-4]);
133 }
134
135 }
136
137
138
139void dgemv_nt_lib(int m, int n, double alpha_n, double alpha_t, double *pA, int sda, double *x_n, double *x_t, double beta_n, double beta_t, double *y_n, double *y_t, double *z_n, double *z_t)
140 {
141
142 if(m<=0 | n<=0)
143 return;
144
145 const int bs = 4;
146
147 int ii;
148
149 // copy and scale y_n int z_n
150 ii = 0;
151 for(; ii<m-3; ii+=4)
152 {
153 z_n[ii+0] = beta_n*y_n[ii+0];
154 z_n[ii+1] = beta_n*y_n[ii+1];
155 z_n[ii+2] = beta_n*y_n[ii+2];
156 z_n[ii+3] = beta_n*y_n[ii+3];
157 }
158 for(; ii<m; ii++)
159 {
160 z_n[ii+0] = beta_n*y_n[ii+0];
161 }
162
163 ii = 0;
164#if defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE)
165 for(; ii<n-5; ii+=6)
166 {
167 kernel_dgemv_nt_6_lib4(m, &alpha_n, &alpha_t, pA+ii*bs, sda, x_n+ii, x_t, &beta_t, y_t+ii, z_n, z_t+ii);
168 }
169#endif
170 for(; ii<n-3; ii+=4)
171 {
172 kernel_dgemv_nt_4_lib4(m, &alpha_n, &alpha_t, pA+ii*bs, sda, x_n+ii, x_t, &beta_t, y_t+ii, z_n, z_t+ii);
173 }
174 if(ii<n)
175 {
176 kernel_dgemv_nt_4_vs_lib4(m, &alpha_n, &alpha_t, pA+ii*bs, sda, x_n+ii, x_t, &beta_t, y_t+ii, z_n, z_t+ii, n-ii);
177 }
178
179 return;
180
181 }
182
183
184
185#if defined(LA_HIGH_PERFORMANCE)
186
187
188
189void dgemv_n_libstr(int m, int n, double alpha, struct d_strmat *sA, int ai, int aj, struct d_strvec *sx, int xi, double beta, struct d_strvec *sy, int yi, struct d_strvec *sz, int zi)
190 {
191
192 if(m<0)
193 return;
194
195 const int bs = 4;
196
197 int i;
198
199 int sda = sA->cn;
200 double *pA = sA->pA + aj*bs + ai/bs*bs*sda;
201 double *x = sx->pa + xi;
202 double *y = sy->pa + yi;
203 double *z = sz->pa + zi;
204
205 i = 0;
206 // clean up at the beginning
207 if(ai%bs!=0)
208 {
209 kernel_dgemv_n_4_gen_lib4(n, &alpha, pA, x, &beta, y-ai%bs, z-ai%bs, ai%bs, m+ai%bs);
210 pA += bs*sda;
211 y += 4 - ai%bs;
212 z += 4 - ai%bs;
213 m -= 4 - ai%bs;
214 }
215 // main loop
216#if defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_HASWELL)
217#if defined(TARGET_X64_INTEL_SANDY_BRIDGE)
218 for( ; i<m-11; i+=12)
219 {
220 kernel_dgemv_n_12_lib4(n, &alpha, &pA[i*sda], sda, x, &beta, &y[i], &z[i]);
221 }
222#endif
223 for( ; i<m-7; i+=8)
224 {
225 kernel_dgemv_n_8_lib4(n, &alpha, &pA[i*sda], sda, x, &beta, &y[i], &z[i]);
226 }
227 if(i<m-3)
228 {
229 kernel_dgemv_n_4_lib4(n, &alpha, &pA[i*sda], x, &beta, &y[i], &z[i]);
230 i+=4;
231 }
232#else
233 for( ; i<m-3; i+=4)
234 {
235 kernel_dgemv_n_4_lib4(n, &alpha, &pA[i*sda], x, &beta, &y[i], &z[i]);
236 }
237#endif
238 if(i<m)
239 {
240 kernel_dgemv_n_4_vs_lib4(n, &alpha, &pA[i*sda], x, &beta, &y[i], &z[i], m-i);
241 }
242
243 return;
244
245 }
246
247
248
249void dgemv_t_libstr(int m, int n, double alpha, struct d_strmat *sA, int ai, int aj, struct d_strvec *sx, int xi, double beta, struct d_strvec *sy, int yi, struct d_strvec *sz, int zi)
250 {
251
252 if(n<=0)
253 return;
254
255 const int bs = 4;
256
257 int i;
258
259 int sda = sA->cn;
260 double *pA = sA->pA + aj*bs + ai/bs*bs*sda + ai%bs;
261 double *x = sx->pa + xi;
262 double *y = sy->pa + yi;
263 double *z = sz->pa + zi;
264
265 if(ai%bs==0)
266 {
267 i = 0;
268#if defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_HASWELL)
269#if defined(TARGET_X64_INTEL_SANDY_BRIDGE)
270 for( ; i<n-11; i+=12)
271 {
272 kernel_dgemv_t_12_lib4(m, &alpha, &pA[i*bs], sda, x, &beta, &y[i], &z[i]);
273 }
274#endif
275 for( ; i<n-7; i+=8)
276 {
277 kernel_dgemv_t_8_lib4(m, &alpha, &pA[i*bs], sda, x, &beta, &y[i], &z[i]);
278 }
279 if(i<n-3)
280 {
281 kernel_dgemv_t_4_lib4(m, &alpha, &pA[i*bs], sda, x, &beta, &y[i], &z[i]);
282 i+=4;
283 }
284#else
285 for( ; i<n-3; i+=4)
286 {
287 kernel_dgemv_t_4_lib4(m, &alpha, &pA[i*bs], sda, x, &beta, &y[i], &z[i]);
288 }
289#endif
290 if(i<n)
291 {
292 kernel_dgemv_t_4_vs_lib4(m, &alpha, &pA[i*bs], sda, x, &beta, &y[i], &z[i], n-i);
293 }
294 }
295 else // TODO kernel 8
296 {
297 i = 0;
298 for( ; i<n; i+=4)
299 {
300 kernel_dgemv_t_4_gen_lib4(m, &alpha, ai%bs, &pA[i*bs], sda, x, &beta, &y[i], &z[i], n-i);
301 }
302 }
303
304 return;
305
306 }
307
308
309
310void dgemv_nt_libstr(int m, int n, double alpha_n, double alpha_t, struct d_strmat *sA, int ai, int aj, struct d_strvec *sx_n, int xi_n, struct d_strvec *sx_t, int xi_t, double beta_n, double beta_t, struct d_strvec *sy_n, int yi_n, struct d_strvec *sy_t, int yi_t, struct d_strvec *sz_n, int zi_n, struct d_strvec *sz_t, int zi_t)
311 {
312 if(ai!=0)
313 {
314 printf("\ndgemv_nt_libstr: feature not implemented yet: ai=%d\n", ai);
315 exit(1);
316 }
317 const int bs = 4;
318 int sda = sA->cn;
319 double *pA = sA->pA + aj*bs; // TODO ai
320 double *x_n = sx_n->pa + xi_n;
321 double *x_t = sx_t->pa + xi_t;
322 double *y_n = sy_n->pa + yi_n;
323 double *y_t = sy_t->pa + yi_t;
324 double *z_n = sz_n->pa + zi_n;
325 double *z_t = sz_t->pa + zi_t;
326 dgemv_nt_lib(m, n, alpha_n, alpha_t, pA, sda, x_n, x_t, beta_n, beta_t, y_n, y_t, z_n, z_t);
327 return;
328 }
329
330
331
332void dsymv_l_libstr(int m, int n, double alpha, struct d_strmat *sA, int ai, int aj, struct d_strvec *sx, int xi, double beta, struct d_strvec *sy, int yi, struct d_strvec *sz, int zi)
333 {
334
335 if(m<=0 | n<=0)
336 return;
337
338 const int bs = 4;
339
340 int ii, n1;
341
342 int sda = sA->cn;
343 double *pA = sA->pA + aj*bs + ai/bs*bs*sda + ai%bs;
344 double *x = sx->pa + xi;
345 double *y = sy->pa + yi;
346 double *z = sz->pa + zi;
347
348 // copy and scale y int z
349 ii = 0;
350 for(; ii<m-3; ii+=4)
351 {
352 z[ii+0] = beta*y[ii+0];
353 z[ii+1] = beta*y[ii+1];
354 z[ii+2] = beta*y[ii+2];
355 z[ii+3] = beta*y[ii+3];
356 }
357 for(; ii<m; ii++)
358 {
359 z[ii+0] = beta*y[ii+0];
360 }
361
362 // clean up at the beginning
363 if(ai%bs!=0) // 1, 2, 3
364 {
365 n1 = 4-ai%bs;
366 kernel_dsymv_l_4_gen_lib4(m, &alpha, ai%bs, &pA[0], sda, &x[0], &z[0], n<n1 ? n : n1);
367 pA += n1 + n1*bs + (sda-1)*bs;
368 x += n1;
369 z += n1;
370 m -= n1;
371 n -= n1;
372 }
373 // main loop
374 ii = 0;
375 for(; ii<n-3; ii+=4)
376 {
377 kernel_dsymv_l_4_lib4(m-ii, &alpha, &pA[ii*bs+ii*sda], sda, &x[ii], &z[ii]);
378 }
379 // clean up at the end
380 if(ii<n)
381 {
382 kernel_dsymv_l_4_gen_lib4(m-ii, &alpha, 0, &pA[ii*bs+ii*sda], sda, &x[ii], &z[ii], n-ii);
383 }
384
385 return;
386 }
387
388
389
390// m >= n
391void dtrmv_lnn_libstr(int m, int n, struct d_strmat *sA, int ai, int aj, struct d_strvec *sx, int xi, struct d_strvec *sz, int zi)
392 {
393
394 if(m<=0)
395 return;
396
397 const int bs = 4;
398
399 int sda = sA->cn;
400 double *pA = sA->pA + aj*bs + ai/bs*bs*sda + ai%bs;
401 double *x = sx->pa + xi;
402 double *z = sz->pa + zi;
403
404 if(m-n>0)
405 dgemv_n_libstr(m-n, n, 1.0, sA, ai+n, aj, sx, xi, 0.0, sz, zi+n, sz, zi+n);
406
407 double *pA2 = pA;
408 double *z2 = z;
409 int m2 = n;
410 int n2 = 0;
411 double *pA3, *x3;
412
413 double alpha = 1.0;
414 double beta = 1.0;
415
416 double zt[4];
417
418 int ii, jj, jj_end;
419
420 ii = 0;
421
422 if(ai%4!=0)
423 {
424 pA2 += sda*bs - ai%bs;
425 z2 += bs-ai%bs;
426 m2 -= bs-ai%bs;
427 n2 += bs-ai%bs;
428 }
429
430 pA2 += m2/bs*bs*sda;
431 z2 += m2/bs*bs;
432 n2 += m2/bs*bs;
433
434 if(m2%bs!=0)
435 {
436 //
437 pA3 = pA2 + bs*n2;
438 x3 = x + n2;
439 zt[3] = pA3[3+bs*0]*x3[0] + pA3[3+bs*1]*x3[1] + pA3[3+bs*2]*x3[2] + pA3[3+bs*3]*x3[3];
440 zt[2] = pA3[2+bs*0]*x3[0] + pA3[2+bs*1]*x3[1] + pA3[2+bs*2]*x3[2];
441 zt[1] = pA3[1+bs*0]*x3[0] + pA3[1+bs*1]*x3[1];
442 zt[0] = pA3[0+bs*0]*x3[0];
443 kernel_dgemv_n_4_lib4(n2, &alpha, pA2, x, &beta, zt, zt);
444 for(jj=0; jj<m2%bs; jj++)
445 z2[jj] = zt[jj];
446 }
447 for(; ii<m2-3; ii+=4)
448 {
449 pA2 -= bs*sda;
450 z2 -= 4;
451 n2 -= 4;
452 pA3 = pA2 + bs*n2;
453 x3 = x + n2;
454 z2[3] = pA3[3+bs*0]*x3[0] + pA3[3+bs*1]*x3[1] + pA3[3+bs*2]*x3[2] + pA3[3+bs*3]*x3[3];
455 z2[2] = pA3[2+bs*0]*x3[0] + pA3[2+bs*1]*x3[1] + pA3[2+bs*2]*x3[2];
456 z2[1] = pA3[1+bs*0]*x3[0] + pA3[1+bs*1]*x3[1];
457 z2[0] = pA3[0+bs*0]*x3[0];
458 kernel_dgemv_n_4_lib4(n2, &alpha, pA2, x, &beta, z2, z2);
459 }
460 if(ai%4!=0)
461 {
462 if(ai%bs==1)
463 {
464 zt[2] = pA[2+bs*0]*x[0] + pA[2+bs*1]*x[1] + pA[2+bs*2]*x[2];
465 zt[1] = pA[1+bs*0]*x[0] + pA[1+bs*1]*x[1];
466 zt[0] = pA[0+bs*0]*x[0];
467 jj_end = 4-ai%bs<n ? 4-ai%bs : n;
468 for(jj=0; jj<jj_end; jj++)
469 z[jj] = zt[jj];
470 }
471 else if(ai%bs==2)
472 {
473 zt[1] = pA[1+bs*0]*x[0] + pA[1+bs*1]*x[1];
474 zt[0] = pA[0+bs*0]*x[0];
475 jj_end = 4-ai%bs<n ? 4-ai%bs : n;
476 for(jj=0; jj<jj_end; jj++)
477 z[jj] = zt[jj];
478 }
479 else // if (ai%bs==3)
480 {
481 z[0] = pA[0+bs*0]*x[0];
482 }
483 }
484
485 return;
486
487 }
488
489
490
491// m >= n
492void dtrmv_ltn_libstr(int m, int n, struct d_strmat *sA, int ai, int aj, struct d_strvec *sx, int xi, struct d_strvec *sz, int zi)
493 {
494
495 if(m<=0)
496 return;
497
498 const int bs = 4;
499
500 int sda = sA->cn;
501 double *pA = sA->pA + aj*bs + ai/bs*bs*sda + ai%bs;
502 double *x = sx->pa + xi;
503 double *z = sz->pa + zi;
504
505 double xt[4];
506 double zt[4];
507
508 double alpha = 1.0;
509 double beta = 1.0;
510
511 int ii, jj, ll, ll_max;
512
513 jj = 0;
514
515 if(ai%bs!=0)
516 {
517
518 if(ai%bs==1)
519 {
520 ll_max = m-jj<3 ? m-jj : 3;
521 for(ll=0; ll<ll_max; ll++)
522 xt[ll] = x[ll];
523 for(; ll<3; ll++)
524 xt[ll] = 0.0;
525 zt[0] = pA[0+bs*0]*xt[0] + pA[1+bs*0]*xt[1] + pA[2+bs*0]*xt[2];
526 zt[1] = pA[1+bs*1]*xt[1] + pA[2+bs*1]*xt[2];
527 zt[2] = pA[2+bs*2]*xt[2];
528 pA += bs*sda - 1;
529 x += 3;
530 kernel_dgemv_t_4_lib4(m-3-jj, &alpha, pA, sda, x, &beta, zt, zt);
531 ll_max = n-jj<3 ? n-jj : 3;
532 for(ll=0; ll<ll_max; ll++)
533 z[ll] = zt[ll];
534 pA += bs*3;
535 z += 3;
536 jj += 3;
537 }
538 else if(ai%bs==2)
539 {
540 ll_max = m-jj<2 ? m-jj : 2;
541 for(ll=0; ll<ll_max; ll++)
542 xt[ll] = x[ll];
543 for(; ll<2; ll++)
544 xt[ll] = 0.0;
545 zt[0] = pA[0+bs*0]*xt[0] + pA[1+bs*0]*xt[1];
546 zt[1] = pA[1+bs*1]*xt[1];
547 pA += bs*sda - 2;
548 x += 2;
549 kernel_dgemv_t_4_lib4(m-2-jj, &alpha, pA, sda, x, &beta, zt, zt);
550 ll_max = n-jj<2 ? n-jj : 2;
551 for(ll=0; ll<ll_max; ll++)
552 z[ll] = zt[ll];
553 pA += bs*2;
554 z += 2;
555 jj += 2;
556 }
557 else // if(ai%bs==3)
558 {
559 ll_max = m-jj<1 ? m-jj : 1;
560 for(ll=0; ll<ll_max; ll++)
561 xt[ll] = x[ll];
562 for(; ll<1; ll++)
563 xt[ll] = 0.0;
564 zt[0] = pA[0+bs*0]*xt[0];
565 pA += bs*sda - 3;
566 x += 1;
567 kernel_dgemv_t_4_lib4(m-1-jj, &alpha, pA, sda, x, &beta, zt, zt);
568 ll_max = n-jj<1 ? n-jj : 1;
569 for(ll=0; ll<ll_max; ll++)
570 z[ll] = zt[ll];
571 pA += bs*1;
572 z += 1;
573 jj += 1;
574 }
575
576 }
577
578 for(; jj<n-3; jj+=4)
579 {
580 zt[0] = pA[0+bs*0]*x[0] + pA[1+bs*0]*x[1] + pA[2+bs*0]*x[2] + pA[3+bs*0]*x[3];
581 zt[1] = pA[1+bs*1]*x[1] + pA[2+bs*1]*x[2] + pA[3+bs*1]*x[3];
582 zt[2] = pA[2+bs*2]*x[2] + pA[3+bs*2]*x[3];
583 zt[3] = pA[3+bs*3]*x[3];
584 pA += bs*sda;
585 x += 4;
586 kernel_dgemv_t_4_lib4(m-4-jj, &alpha, pA, sda, x, &beta, zt, z);
587 pA += bs*4;
588 z += 4;
589 }
590 if(jj<n)
591 {
592 ll_max = m-jj<4 ? m-jj : 4;
593 for(ll=0; ll<ll_max; ll++)
594 xt[ll] = x[ll];
595 for(; ll<4; ll++)
596 xt[ll] = 0.0;
597 zt[0] = pA[0+bs*0]*xt[0] + pA[1+bs*0]*xt[1] + pA[2+bs*0]*xt[2] + pA[3+bs*0]*xt[3];
598 zt[1] = pA[1+bs*1]*xt[1] + pA[2+bs*1]*xt[2] + pA[3+bs*1]*xt[3];
599 zt[2] = pA[2+bs*2]*xt[2] + pA[3+bs*2]*xt[3];
600 zt[3] = pA[3+bs*3]*xt[3];
601 pA += bs*sda;
602 x += 4;
603 kernel_dgemv_t_4_lib4(m-4-jj, &alpha, pA, sda, x, &beta, zt, zt);
604 for(ll=0; ll<n-jj; ll++)
605 z[ll] = zt[ll];
606// pA += bs*4;
607// z += 4;
608 }
609
610 return;
611
612 }
613
614
615
616void dtrmv_unn_libstr(int m, struct d_strmat *sA, int ai, int aj, struct d_strvec *sx, int xi, struct d_strvec *sz, int zi)
617 {
618
619 if(m<=0)
620 return;
621
622 if(ai!=0)
623 {
624 printf("\ndtrmv_unn_libstr: feature not implemented yet: ai=%d\n", ai);
625 exit(1);
626 }
627
628 const int bs = 4;
629
630 int sda = sA->cn;
631 double *pA = sA->pA + aj*bs; // TODO ai
632 double *x = sx->pa + xi;
633 double *z = sz->pa + zi;
634
635 int i;
636
637 i=0;
638#if defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE)
639 for(; i<m-7; i+=8)
640 {
641 kernel_dtrmv_un_8_lib4(m-i, pA, sda, x, z);
642 pA += 8*sda+8*bs;
643 x += 8;
644 z += 8;
645 }
646#endif
647 for(; i<m-3; i+=4)
648 {
649 kernel_dtrmv_un_4_lib4(m-i, pA, x, z);
650 pA += 4*sda+4*bs;
651 x += 4;
652 z += 4;
653 }
654 if(m>i)
655 {
656 if(m-i==1)
657 {
658 z[0] = pA[0+bs*0]*x[0];
659 }
660 else if(m-i==2)
661 {
662 z[0] = pA[0+bs*0]*x[0] + pA[0+bs*1]*x[1];
663 z[1] = pA[1+bs*1]*x[1];
664 }
665 else // if(m-i==3)
666 {
667 z[0] = pA[0+bs*0]*x[0] + pA[0+bs*1]*x[1] + pA[0+bs*2]*x[2];
668 z[1] = pA[1+bs*1]*x[1] + pA[1+bs*2]*x[2];
669 z[2] = pA[2+bs*2]*x[2];
670 }
671 }
672
673 return;
674
675 }
676
677
678
679void dtrmv_utn_libstr(int m, struct d_strmat *sA, int ai, int aj, struct d_strvec *sx, int xi, struct d_strvec *sz, int zi)
680 {
681
682 if(m<=0)
683 return;
684
685 if(ai!=0)
686 {
687 printf("\ndtrmv_utn_libstr: feature not implemented yet: ai=%d\n", ai);
688 exit(1);
689 }
690
691 const int bs = 4;
692
693 int sda = sA->cn;
694 double *pA = sA->pA + aj*bs; // TODO ai
695 double *x = sx->pa + xi;
696 double *z = sz->pa + zi;
697
698 int ii, idx;
699
700 double *ptrA;
701
702 ii=0;
703 idx = m/bs*bs;
704 if(m%bs!=0)
705 {
706 kernel_dtrmv_ut_4_vs_lib4(m, pA+idx*bs, sda, x, z+idx, m%bs);
707 ii += m%bs;
708 }
709 idx -= 4;
710 for(; ii<m; ii+=4)
711 {
712 kernel_dtrmv_ut_4_lib4(idx+4, pA+idx*bs, sda, x, z+idx);
713 idx -= 4;
714 }
715
716 return;
717
718 }
719
720
721
722void dtrsv_lnn_mn_libstr(int m, int n, struct d_strmat *sA, int ai, int aj, struct d_strvec *sx, int xi, struct d_strvec *sz, int zi)
723 {
724 if(m==0 | n==0)
725 return;
726#if defined(DIM_CHECK)
727 // non-negative size
728 if(m<0) printf("\n****** dtrsv_lnn_mn_libstr : m<0 : %d<0 *****\n", m);
729 if(n<0) printf("\n****** dtrsv_lnn_mn_libstr : n<0 : %d<0 *****\n", n);
730 // non-negative offset
731 if(ai<0) printf("\n****** dtrsv_lnn_mn_libstr : ai<0 : %d<0 *****\n", ai);
732 if(aj<0) printf("\n****** dtrsv_lnn_mn_libstr : aj<0 : %d<0 *****\n", aj);
733 if(xi<0) printf("\n****** dtrsv_lnn_mn_libstr : xi<0 : %d<0 *****\n", xi);
734 if(zi<0) printf("\n****** dtrsv_lnn_mn_libstr : zi<0 : %d<0 *****\n", zi);
735 // inside matrix
736 // A: m x k
737 if(ai+m > sA->m) printf("\n***** dtrsv_lnn_mn_libstr : ai+m > row(A) : %d+%d > %d *****\n", ai, m, sA->m);
738 if(aj+n > sA->n) printf("\n***** dtrsv_lnn_mn_libstr : aj+n > col(A) : %d+%d > %d *****\n", aj, n, sA->n);
739 // x: m
740 if(xi+m > sx->m) printf("\n***** dtrsv_lnn_mn_libstr : xi+m > size(x) : %d+%d > %d *****\n", xi, m, sx->m);
741 // z: m
742 if(zi+m > sz->m) printf("\n***** dtrsv_lnn_mn_libstr : zi+m > size(z) : %d+%d > %d *****\n", zi, m, sz->m);
743#endif
744 if(ai!=0 | xi%4!=0)
745 {
746 printf("\ndtrsv_lnn_mn_libstr: feature not implemented yet: ai=%d\n", ai);
747 exit(1);
748 }
749 const int bs = 4;
750 int sda = sA->cn;
751 double *pA = sA->pA + aj*bs; // TODO ai
752 double *dA = sA->dA;
753 double *x = sx->pa + xi;
754 double *z = sz->pa + zi;
755 int ii;
756 if(ai==0 & aj==0)
757 {
758 if(sA->use_dA!=1)
759 {
760 ddiaex_lib(n, 1.0, ai, pA, sda, dA);
761 for(ii=0; ii<n; ii++)
762 dA[ii] = 1.0 / dA[ii];
763 sA->use_dA = 1;
764 }
765 }
766 else
767 {
768 ddiaex_lib(n, 1.0, ai, pA, sda, dA);
769 for(ii=0; ii<n; ii++)
770 dA[ii] = 1.0 / dA[ii];
771 sA->use_dA = 0;
772 }
773 dtrsv_ln_inv_lib(m, n, pA, sda, dA, x, z);
774 return;
775 }
776
777
778
779void dtrsv_ltn_mn_libstr(int m, int n, struct d_strmat *sA, int ai, int aj, struct d_strvec *sx, int xi, struct d_strvec *sz, int zi)
780 {
781 if(m==0)
782 return;
783#if defined(DIM_CHECK)
784 // non-negative size
785 if(m<0) printf("\n****** dtrsv_ltn_mn_libstr : m<0 : %d<0 *****\n", m);
786 if(n<0) printf("\n****** dtrsv_ltn_mn_libstr : n<0 : %d<0 *****\n", n);
787 // non-negative offset
788 if(ai<0) printf("\n****** dtrsv_ltn_mn_libstr : ai<0 : %d<0 *****\n", ai);
789 if(aj<0) printf("\n****** dtrsv_ltn_mn_libstr : aj<0 : %d<0 *****\n", aj);
790 if(xi<0) printf("\n****** dtrsv_ltn_mn_libstr : xi<0 : %d<0 *****\n", xi);
791 if(zi<0) printf("\n****** dtrsv_ltn_mn_libstr : zi<0 : %d<0 *****\n", zi);
792 // inside matrix
793 // A: m x k
794 if(ai+m > sA->m) printf("\n***** dtrsv_ltn_mn_libstr : ai+m > row(A) : %d+%d > %d *****\n", ai, m, sA->m);
795 if(aj+n > sA->n) printf("\n***** dtrsv_ltn_mn_libstr : aj+n > col(A) : %d+%d > %d *****\n", aj, n, sA->n);
796 // x: m
797 if(xi+m > sx->m) printf("\n***** dtrsv_ltn_mn_libstr : xi+m > size(x) : %d+%d > %d *****\n", xi, m, sx->m);
798 // z: m
799 if(zi+m > sz->m) printf("\n***** dtrsv_ltn_mn_libstr : zi+m > size(z) : %d+%d > %d *****\n", zi, m, sz->m);
800#endif
801 if(ai!=0 | xi%4!=0)
802 {
803 printf("\ndtrsv_ltn_mn_libstr: feature not implemented yet: ai=%d\n", ai);
804 exit(1);
805 }
806 const int bs = 4;
807 int sda = sA->cn;
808 double *pA = sA->pA + aj*bs; // TODO ai
809 double *dA = sA->dA;
810 double *x = sx->pa + xi;
811 double *z = sz->pa + zi;
812 int ii;
813 if(ai==0 & aj==0)
814 {
815 if(sA->use_dA!=1)
816 {
817 ddiaex_lib(n, 1.0, ai, pA, sda, dA);
818 for(ii=0; ii<n; ii++)
819 dA[ii] = 1.0 / dA[ii];
820 sA->use_dA = 1;
821 }
822 }
823 else
824 {
825 ddiaex_lib(n, 1.0, ai, pA, sda, dA);
826 for(ii=0; ii<n; ii++)
827 dA[ii] = 1.0 / dA[ii];
828 sA->use_dA = 0;
829 }
830 dtrsv_lt_inv_lib(m, n, pA, sda, dA, x, z);
831 return;
832 }
833
834
835
836void dtrsv_lnn_libstr(int m, struct d_strmat *sA, int ai, int aj, struct d_strvec *sx, int xi, struct d_strvec *sz, int zi)
837 {
838 if(m==0)
839 return;
840#if defined(DIM_CHECK)
841 // non-negative size
842 if(m<0) printf("\n****** dtrsv_lnn_libstr : m<0 : %d<0 *****\n", m);
843 // non-negative offset
844 if(ai<0) printf("\n****** dtrsv_lnn_libstr : ai<0 : %d<0 *****\n", ai);
845 if(aj<0) printf("\n****** dtrsv_lnn_libstr : aj<0 : %d<0 *****\n", aj);
846 if(xi<0) printf("\n****** dtrsv_lnn_libstr : xi<0 : %d<0 *****\n", xi);
847 if(zi<0) printf("\n****** dtrsv_lnn_libstr : zi<0 : %d<0 *****\n", zi);
848 // inside matrix
849 // A: m x k
850 if(ai+m > sA->m) printf("\n***** dtrsv_lnn_libstr : ai+m > row(A) : %d+%d > %d *****\n", ai, m, sA->m);
851 if(aj+m > sA->n) printf("\n***** dtrsv_lnn_libstr : aj+m > col(A) : %d+%d > %d *****\n", aj, m, sA->n);
852 // x: m
853 if(xi+m > sx->m) printf("\n***** dtrsv_lnn_libstr : xi+m > size(x) : %d+%d > %d *****\n", xi, m, sx->m);
854 // z: m
855 if(zi+m > sz->m) printf("\n***** dtrsv_lnn_libstr : zi+m > size(z) : %d+%d > %d *****\n", zi, m, sz->m);
856#endif
857 if(ai!=0 | xi%4!=0)
858 {
859 printf("\ndtrsv_lnn_libstr: feature not implemented yet: ai=%d\n", ai);
860 exit(1);
861 }
862 const int bs = 4;
863 int sda = sA->cn;
864 double *pA = sA->pA + aj*bs; // TODO ai
865 double *dA = sA->dA;
866 double *x = sx->pa + xi;
867 double *z = sz->pa + zi;
868 int ii;
869 if(ai==0 & aj==0)
870 {
871 if(sA->use_dA!=1)
872 {
873 ddiaex_lib(m, 1.0, ai, pA, sda, dA);
874 for(ii=0; ii<m; ii++)
875 dA[ii] = 1.0 / dA[ii];
876 sA->use_dA = 1;
877 }
878 }
879 else
880 {
881 ddiaex_lib(m, 1.0, ai, pA, sda, dA);
882 for(ii=0; ii<m; ii++)
883 dA[ii] = 1.0 / dA[ii];
884 sA->use_dA = 0;
885 }
886 dtrsv_ln_inv_lib(m, m, pA, sda, dA, x, z);
887 return;
888 }
889
890
891
892void dtrsv_lnu_libstr(int m, struct d_strmat *sA, int ai, int aj, struct d_strvec *sx, int xi, struct d_strvec *sz, int zi)
893 {
894 if(m==0)
895 return;
896#if defined(DIM_CHECK)
897 // non-negative size
898 if(m<0) printf("\n****** dtrsv_lnu_libstr : m<0 : %d<0 *****\n", m);
899 // non-negative offset
900 if(ai<0) printf("\n****** dtrsv_lnu_libstr : ai<0 : %d<0 *****\n", ai);
901 if(aj<0) printf("\n****** dtrsv_lnu_libstr : aj<0 : %d<0 *****\n", aj);
902 if(xi<0) printf("\n****** dtrsv_lnu_libstr : xi<0 : %d<0 *****\n", xi);
903 if(zi<0) printf("\n****** dtrsv_lnu_libstr : zi<0 : %d<0 *****\n", zi);
904 // inside matrix
905 // A: m x k
906 if(ai+m > sA->m) printf("\n***** dtrsv_lnu_libstr : ai+m > row(A) : %d+%d > %d *****\n", ai, m, sA->m);
907 if(aj+m > sA->n) printf("\n***** dtrsv_lnu_libstr : aj+m > col(A) : %d+%d > %d *****\n", aj, m, sA->n);
908 // x: m
909 if(xi+m > sx->m) printf("\n***** dtrsv_lnu_libstr : xi+m > size(x) : %d+%d > %d *****\n", xi, m, sx->m);
910 // z: m
911 if(zi+m > sz->m) printf("\n***** dtrsv_lnu_libstr : zi+m > size(z) : %d+%d > %d *****\n", zi, m, sz->m);
912#endif
913 printf("\n***** dtrsv_lnu_libstr : feature not implemented yet *****\n");
914 exit(1);
915 }
916
917
918
919void dtrsv_ltn_libstr(int m, struct d_strmat *sA, int ai, int aj, struct d_strvec *sx, int xi, struct d_strvec *sz, int zi)
920 {
921 if(m==0)
922 return;
923#if defined(DIM_CHECK)
924 // non-negative size
925 if(m<0) printf("\n****** dtrsv_ltn_libstr : m<0 : %d<0 *****\n", m);
926 // non-negative offset
927 if(ai<0) printf("\n****** dtrsv_ltn_libstr : ai<0 : %d<0 *****\n", ai);
928 if(aj<0) printf("\n****** dtrsv_ltn_libstr : aj<0 : %d<0 *****\n", aj);
929 if(xi<0) printf("\n****** dtrsv_ltn_libstr : xi<0 : %d<0 *****\n", xi);
930 if(zi<0) printf("\n****** dtrsv_ltn_libstr : zi<0 : %d<0 *****\n", zi);
931 // inside matrix
932 // A: m x k
933 if(ai+m > sA->m) printf("\n***** dtrsv_ltn_libstr : ai+m > row(A) : %d+%d > %d *****\n", ai, m, sA->m);
934 if(aj+m > sA->n) printf("\n***** dtrsv_ltn_libstr : aj+m > col(A) : %d+%d > %d *****\n", aj, m, sA->n);
935 // x: m
936 if(xi+m > sx->m) printf("\n***** dtrsv_ltn_libstr : xi+m > size(x) : %d+%d > %d *****\n", xi, m, sx->m);
937 // z: m
938 if(zi+m > sz->m) printf("\n***** dtrsv_ltn_libstr : zi+m > size(z) : %d+%d > %d *****\n", zi, m, sz->m);
939#endif
940 if(ai!=0 | xi%4!=0)
941 {
942 printf("\ndtrsv_ltn_libstr: feature not implemented yet: ai=%d\n", ai);
943 exit(1);
944 }
945 const int bs = 4;
946 int sda = sA->cn;
947 double *pA = sA->pA + aj*bs; // TODO ai
948 double *dA = sA->dA;
949 double *x = sx->pa + xi;
950 double *z = sz->pa + zi;
951 int ii;
952 if(ai==0 & aj==0)
953 {
954 if(sA->use_dA!=1)
955 {
956 ddiaex_lib(m, 1.0, ai, pA, sda, dA);
957 for(ii=0; ii<m; ii++)
958 dA[ii] = 1.0 / dA[ii];
959 sA->use_dA = 1;
960 }
961 }
962 else
963 {
964 ddiaex_lib(m, 1.0, ai, pA, sda, dA);
965 for(ii=0; ii<m; ii++)
966 dA[ii] = 1.0 / dA[ii];
967 sA->use_dA = 0;
968 }
969 dtrsv_lt_inv_lib(m, m, pA, sda, dA, x, z);
970 return;
971 }
972
973
974
975void dtrsv_ltu_libstr(int m, struct d_strmat *sA, int ai, int aj, struct d_strvec *sx, int xi, struct d_strvec *sz, int zi)
976 {
977 if(m==0)
978 return;
979#if defined(DIM_CHECK)
980 // non-negative size
981 if(m<0) printf("\n****** dtrsv_ltu_libstr : m<0 : %d<0 *****\n", m);
982 // non-negative offset
983 if(ai<0) printf("\n****** dtrsv_ltu_libstr : ai<0 : %d<0 *****\n", ai);
984 if(aj<0) printf("\n****** dtrsv_ltu_libstr : aj<0 : %d<0 *****\n", aj);
985 if(xi<0) printf("\n****** dtrsv_ltu_libstr : xi<0 : %d<0 *****\n", xi);
986 if(zi<0) printf("\n****** dtrsv_ltu_libstr : zi<0 : %d<0 *****\n", zi);
987 // inside matrix
988 // A: m x k
989 if(ai+m > sA->m) printf("\n***** dtrsv_ltu_libstr : ai+m > row(A) : %d+%d > %d *****\n", ai, m, sA->m);
990 if(aj+m > sA->n) printf("\n***** dtrsv_ltu_libstr : aj+m > col(A) : %d+%d > %d *****\n", aj, m, sA->n);
991 // x: m
992 if(xi+m > sx->m) printf("\n***** dtrsv_ltu_libstr : xi+m > size(x) : %d+%d > %d *****\n", xi, m, sx->m);
993 // z: m
994 if(zi+m > sz->m) printf("\n***** dtrsv_ltu_libstr : zi+m > size(z) : %d+%d > %d *****\n", zi, m, sz->m);
995#endif
996 printf("\n***** dtrsv_ltu_libstr : feature not implemented yet *****\n");
997 exit(1);
998 }
999
1000
1001
1002void dtrsv_unn_libstr(int m, struct d_strmat *sA, int ai, int aj, struct d_strvec *sx, int xi, struct d_strvec *sz, int zi)
1003 {
1004 if(m==0)
1005 return;
1006#if defined(DIM_CHECK)
1007 // non-negative size
1008 if(m<0) printf("\n****** dtrsv_unn_libstr : m<0 : %d<0 *****\n", m);
1009 // non-negative offset
1010 if(ai<0) printf("\n****** dtrsv_unn_libstr : ai<0 : %d<0 *****\n", ai);
1011 if(aj<0) printf("\n****** dtrsv_unn_libstr : aj<0 : %d<0 *****\n", aj);
1012 if(xi<0) printf("\n****** dtrsv_unn_libstr : xi<0 : %d<0 *****\n", xi);
1013 if(zi<0) printf("\n****** dtrsv_unn_libstr : zi<0 : %d<0 *****\n", zi);
1014 // inside matrix
1015 // A: m x k
1016 if(ai+m > sA->m) printf("\n***** dtrsv_unn_libstr : ai+m > row(A) : %d+%d > %d *****\n", ai, m, sA->m);
1017 if(aj+m > sA->n) printf("\n***** dtrsv_unn_libstr : aj+m > col(A) : %d+%d > %d *****\n", aj, m, sA->n);
1018 // x: m
1019 if(xi+m > sx->m) printf("\n***** dtrsv_unn_libstr : xi+m > size(x) : %d+%d > %d *****\n", xi, m, sx->m);
1020 // z: m
1021 if(zi+m > sz->m) printf("\n***** dtrsv_unn_libstr : zi+m > size(z) : %d+%d > %d *****\n", zi, m, sz->m);
1022#endif
1023 printf("\n***** dtrsv_unn_libstr : feature not implemented yet *****\n");
1024 exit(1);
1025 }
1026
1027
1028
1029void dtrsv_utn_libstr(int m, struct d_strmat *sA, int ai, int aj, struct d_strvec *sx, int xi, struct d_strvec *sz, int zi)
1030 {
1031 if(m==0)
1032 return;
1033#if defined(DIM_CHECK)
1034 // non-negative size
1035 if(m<0) printf("\n****** dtrsv_utn_libstr : m<0 : %d<0 *****\n", m);
1036 // non-negative offset
1037 if(ai<0) printf("\n****** dtrsv_utn_libstr : ai<0 : %d<0 *****\n", ai);
1038 if(aj<0) printf("\n****** dtrsv_utn_libstr : aj<0 : %d<0 *****\n", aj);
1039 if(xi<0) printf("\n****** dtrsv_utn_libstr : xi<0 : %d<0 *****\n", xi);
1040 if(zi<0) printf("\n****** dtrsv_utn_libstr : zi<0 : %d<0 *****\n", zi);
1041 // inside matrix
1042 // A: m x k
1043 if(ai+m > sA->m) printf("\n***** dtrsv_utn_libstr : ai+m > row(A) : %d+%d > %d *****\n", ai, m, sA->m);
1044 if(aj+m > sA->n) printf("\n***** dtrsv_utn_libstr : aj+m > col(A) : %d+%d > %d *****\n", aj, m, sA->n);
1045 // x: m
1046 if(xi+m > sx->m) printf("\n***** dtrsv_utn_libstr : xi+m > size(x) : %d+%d > %d *****\n", xi, m, sx->m);
1047 // z: m
1048 if(zi+m > sz->m) printf("\n***** dtrsv_utn_libstr : zi+m > size(z) : %d+%d > %d *****\n", zi, m, sz->m);
1049#endif
1050 printf("\n***** dtrsv_utn_libstr : feature not implemented yet *****\n");
1051 exit(1);
1052 }
1053
1054
1055
1056#else
1057
1058#error : wrong LA choice
1059
1060#endif