/**************************************************************************************************
*                                                                                                 *
* This file is part of BLASFEO.                                                                   *
*                                                                                                 *
* BLASFEO -- BLAS For Embedded Optimization.                                                      *
* Copyright (C) 2016-2017 by Gianluca Frison.                                                     *
* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl.              *
* All rights reserved.                                                                            *
*                                                                                                 *
* BLASFEO is free software; you can redistribute it and/or                                        *
* modify it under the terms of the GNU Lesser General Public                                      *
* License as published by the Free Software Foundation; either                                    *
* version 2.1 of the License, or (at your option) any later version.                              *
*                                                                                                 *
* BLASFEO is distributed in the hope that it will be useful,                                      *
* but WITHOUT ANY WARRANTY; without even the implied warranty of                                  *
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.                                            *
* See the GNU Lesser General Public License for more details.                                     *
*                                                                                                 *
* You should have received a copy of the GNU Lesser General Public                                *
* License along with BLASFEO; if not, write to the Free Software                                  *
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA                    *
*                                                                                                 *
* Author: Gianluca Frison, giaf (at) dtu.dk                                                       *
*         gianluca.frison (at) imtek.uni-freiburg.de                                              *
*                                                                                                 *
**************************************************************************************************/

#include <stdlib.h>
#include <stdio.h>

#include "../include/blasfeo_common.h"
#include "../include/blasfeo_d_kernel.h"
#include "../include/blasfeo_d_aux.h"



/****************************
* old interface
****************************/

void dgemm_nt_lib(int m, int n, int k, double alpha, double *pA, int sda, double *pB, int sdb, double beta, double *pC, int sdc, double *pD, int sdd)
	{

	if(m<=0 || n<=0)
		return;

	const int ps = 4;

	int i, j, l;

	i = 0;

#if defined(TARGET_X64_INTEL_HASWELL)
	for(; i<m-11; i+=12)
		{
		j = 0;
		for(; j<n-3; j+=4)
			{
			kernel_dgemm_nt_12x4_lib4(k, &alpha, &pA[i*sda], sda, &pB[j*sdb], &beta, &pC[j*ps+i*sdc], sdc, &pD[j*ps+i*sdd], sdd);
			}
		if(j<n)
			{
			kernel_dgemm_nt_12x4_vs_lib4(k, &alpha, &pA[i*sda], sda, &pB[j*sdb], &beta, &pC[j*ps+i*sdc], sdc, &pD[j*ps+i*sdd], sdd, m-i, n-j);
			}
		}
	if(m>i)
		{
		if(m-i<=4)
			{
			goto left_4;
			}
		else if(m-i<=8)
			{
			goto left_8;
			}
		else
			{
			goto left_12;
			}
		}
#elif defined(TARGET_X64_INTEL_SANDY_BRIDGE)
	for(; i<m-7; i+=8)
		{
		j = 0;
		for(; j<n-3; j+=4)
			{
			kernel_dgemm_nt_8x4_lib4(k, &alpha, &pA[i*sda], sda, &pB[j*sdb], &beta, &pC[j*ps+i*sdc], sdc, &pD[j*ps+i*sdd], sdd);
			}
		if(j<n)
			{
			kernel_dgemm_nt_8x4_vs_lib4(k, &alpha, &pA[i*sda], sda, &pB[j*sdb], &beta, &pC[j*ps+i*sdc], sdc, &pD[j*ps+i*sdd], sdd, m-i, n-j);
			}
		}
	if(m>i)
		{
		if(m-i<=4)
			{
			goto left_4;
			}
		else
			{
			goto left_8;
			}
		}
#elif defined(TARGET_ARMV8A_ARM_CORTEX_A57)
	for(; i<m-7; i+=8)
		{
		j = 0;
		for(; j<n-3; j+=4)
			{
			kernel_dgemm_nt_8x4_lib4(k, &alpha, &pA[i*sda], sda, &pB[j*sdb], &beta, &pC[j*ps+i*sdc], sdc, &pD[j*ps+i*sdd], sdd);
			}
		if(j<n)
			{
			kernel_dgemm_nt_4x4_vs_lib4(k, &alpha, &pA[i*sda], &pB[j*sdb], &beta, &pC[j*ps+i*sdc], &pD[j*ps+i*sdd], m-i, n-j);
			kernel_dgemm_nt_4x4_vs_lib4(k, &alpha, &pA[(i+4)*sda], &pB[j*sdb], &beta, &pC[j*ps+(i+4)*sdc], &pD[j*ps+(i+4)*sdd], m-(i+4), n-j);
			}
		}
	if(m>i)
		{
		if(m-i<=4)
			{
			goto left_4;
			}
		else
			{
			goto left_8;
			}
		}
#else
	for(; i<m-3; i+=4)
		{
		j = 0;
		for(; j<n-3; j+=4)
			{
			kernel_dgemm_nt_4x4_lib4(k, &alpha, &pA[i*sda], &pB[j*sdb], &beta, &pC[j*ps+i*sdc], &pD[j*ps+i*sdd]);
			}
		if(j<n)
			{
			kernel_dgemm_nt_4x4_vs_lib4(k, &alpha, &pA[i*sda], &pB[j*sdb], &beta, &pC[j*ps+i*sdc], &pD[j*ps+i*sdd], m-i, n-j);
			}
		}
	if(m>i)
		{
		goto left_4;
		}
#endif

	// common return if i==m
	return;

	// clean-up loop definitions

#if defined(TARGET_X64_INTEL_HASWELL)
	left_12:
	j = 0;
	for(; j<n; j+=4)
		{
		kernel_dgemm_nt_12x4_vs_lib4(k, &alpha, &pA[i*sda], sda, &pB[j*sdb], &beta, &pC[j*ps+i*sdc], sdc, &pD[j*ps+i*sdd], sdd, m-i, n-j);
		}
	return;
#endif

#if defined(TARGET_X64_INTEL_HASWELL)
	left_8:
	j = 0;
	for(; j<n-8; j+=12)
		{
		kernel_dgemm_nt_8x8l_vs_lib4(k, &alpha, &pA[i*sda], sda, &pB[j*sdb], sdb, &beta, &pC[j*ps+i*sdc], sdc, &pD[j*ps+i*sdd], sdd, m-i, n-j);
		kernel_dgemm_nt_8x8u_vs_lib4(k, &alpha, &pA[i*sda], sda, &pB[(j+4)*sdb], sdb, &beta, &pC[(j+4)*ps+i*sdc], sdc, &pD[(j+4)*ps+i*sdd], sdd, m-i, n-(j+4));
		}

	if(j<n-4)
		{
		kernel_dgemm_nt_8x8l_vs_lib4(k, &alpha, &pA[i*sda], sda, &pB[j*sdb], sdb, &beta, &pC[j*ps+i*sdc], sdc, &pD[j*ps+i*sdd], sdd, m-i, n-j);
		kernel_dgemm_nt_4x4_vs_lib4(k, &alpha, &pA[i*sda], &pB[(j+4)*sdb], &beta, &pC[(j+4)*ps+i*sdc], &pD[(j+4)*ps+i*sdd], m-i, n-(j+4));
		}
	else if(j<n)
		{
		kernel_dgemm_nt_8x4_vs_lib4(k, &alpha, &pA[i*sda], sda, &pB[j*sdb], &beta, &pC[j*ps+i*sdc], sdc, &pD[j*ps+i*sdd], sdd, m-i, n-j);
		}
	return;
#endif
#if defined(TARGET_X64_INTEL_SANDY_BRIDGE)
	left_8:
	j = 0;
	for(; j<n; j+=4)
		{
		kernel_dgemm_nt_8x4_vs_lib4(k, &alpha, &pA[i*sda], sda, &pB[j*sdb], &beta, &pC[j*ps+i*sdc], sdc, &pD[j*ps+i*sdd], sdd, m-i, n-j);
		}
	return;
#endif
#if defined(TARGET_ARMV8A_ARM_CORTEX_A57)
	left_8:
	j = 0;
	for(; j<n; j+=4)
		{
		kernel_dgemm_nt_4x4_vs_lib4(k, &alpha, &pA[i*sda], &pB[j*sdb], &beta, &pC[j*ps+i*sdc], &pD[j*ps+i*sdd], m-i, n-j);
		kernel_dgemm_nt_4x4_vs_lib4(k, &alpha, &pA[(i+4)*sda], &pB[j*sdb], &beta, &pC[j*ps+(i+4)*sdc], &pD[j*ps+(i+4)*sdd], m-(i+4), n-j);
		}
	return;
#endif

#if defined(TARGET_X64_INTEL_HASWELL)
	left_4:
	j = 0;
	for(; j<n-8; j+=12)
		{
		kernel_dgemm_nt_4x12_vs_lib4(k, &alpha, &pA[i*sda], &pB[j*sdb], sdb, &beta, &pC[j*ps+i*sdc], &pD[j*ps+i*sdd], m-i, n-j);
		}
	if(j<n-4)
		{
		kernel_dgemm_nt_4x8_vs_lib4(k, &alpha, &pA[i*sda], &pB[j*sdb], sdb, &beta, &pC[j*ps+i*sdc], &pD[j*ps+i*sdd], m-i, n-j);
		}
	else if(j<n)
		{
		kernel_dgemm_nt_4x4_vs_lib4(k, &alpha, &pA[i*sda], &pB[j*sdb], &beta, &pC[j*ps+i*sdc], &pD[j*ps+i*sdd], m-i, n-j);
		}
	return;
#elif defined(TARGET_X64_INTEL_SANDY_BRIDGE)
	left_4:
	j = 0;
	for(; j<n-4; j+=8)
		{
		kernel_dgemm_nt_4x8_vs_lib4(k, &alpha, &pA[i*sda], &pB[j*sdb], sdb, &beta, &pC[j*ps+i*sdc], &pD[j*ps+i*sdd], m-i, n-j);
		}
	if(j<n)
		{
		kernel_dgemm_nt_4x4_vs_lib4(k, &alpha, &pA[i*sda], &pB[j*sdb], &beta, &pC[j*ps+i*sdc], &pD[j*ps+i*sdd], m-i, n-j);
		}
	return;
#else
	left_4:
	j = 0;
	for(; j<n; j+=4)
		{
		kernel_dgemm_nt_4x4_vs_lib4(k, &alpha, &pA[i*sda], &pB[j*sdb], &beta, &pC[j*ps+i*sdc], &pD[j*ps+i*sdd], m-i, n-j);
		}
	return;
#endif

	}
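

// The routines in this file consume matrices in the panel-major "lib4"
// format: the matrix is stored in horizontal panels of ps=4 rows, so element
// (i,j) of a matrix with panel stride sd (in doubles) sits at
// p[(i/ps)*ps*sd + j*ps + i%ps]. The sketch below (disabled with the same
// "#if 0" convention used elsewhere in this file) documents the layout and
// one dgemm_nt_lib call; pack_lib4() and example_dgemm_nt_lib() are
// hypothetical helpers written only for illustration, not BLASFEO API.
#if 0
// pack a column-major m-by-n matrix A (leading dimension lda) into
// panel-major pA with panel stride sd
static void pack_lib4(int m, int n, double *A, int lda, double *pA, int sd)
	{
	const int ps = 4;
	int i, j;
	for(j=0; j<n; j++)
		for(i=0; i<m; i++)
			pA[i/ps*ps*sd + j*ps + i%ps] = A[i+j*lda];
	}

// D = 1.0 * A * B^T + 0.0 * C with m=n=k=8: two 4-row panels per matrix
static void example_dgemm_nt_lib()
	{
	double A[8*8], B[8*8], C[8*8]; // column-major inputs, filled elsewhere
	double pA[8*8], pB[8*8], pC[8*8], pD[8*8];
	pack_lib4(8, 8, A, 8, pA, 8); // sda = 8 = number of columns of A
	pack_lib4(8, 8, B, 8, pB, 8); // B is accessed transposed by the nt kernels
	pack_lib4(8, 8, C, 8, pC, 8);
	dgemm_nt_lib(8, 8, 8, 1.0, pA, 8, pB, 8, 0.0, pC, 8, pD, 8);
	}
#endif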



#if 0
void dgemm_nn_lib(int m, int n, int k, double alpha, double *pA, int sda, double *pB, int sdb, double beta, double *pC, int sdc, double *pD, int sdd)
	{

	if(m<=0 || n<=0)
		return;

	const int ps = 4;

	int i, j, l;

	i = 0;

#if defined(TARGET_X64_INTEL_HASWELL)
	for(; i<m-11; i+=12)
		{
		j = 0;
		for(; j<n-3; j+=4)
			{
			kernel_dgemm_nn_12x4_lib4(k, &alpha, &pA[i*sda], sda, &pB[j*ps], sdb, &beta, &pC[j*ps+i*sdc], sdc, &pD[j*ps+i*sdd], sdd);
			}
		if(j<n)
			{
			kernel_dgemm_nn_12x4_vs_lib4(k, &alpha, &pA[i*sda], sda, &pB[j*ps], sdb, &beta, &pC[j*ps+i*sdc], sdc, &pD[j*ps+i*sdd], sdd, m-i, n-j);
			}
		}
	if(m>i)
		{
		if(m-i<=4)
			{
			goto left_4;
			}
		else if(m-i<=8)
			{
			goto left_8;
			}
		else
			{
			goto left_12;
			}
		}
#elif defined(TARGET_X64_INTEL_SANDY_BRIDGE)
	for(; i<m-7; i+=8)
		{
		j = 0;
		for(; j<n-3; j+=4)
			{
			kernel_dgemm_nn_8x4_lib4(k, &alpha, &pA[i*sda], sda, &pB[j*ps], sdb, &beta, &pC[j*ps+i*sdc], sdc, &pD[j*ps+i*sdd], sdd);
			}
		if(j<n)
			{
			kernel_dgemm_nn_8x4_vs_lib4(k, &alpha, &pA[i*sda], sda, &pB[j*ps], sdb, &beta, &pC[j*ps+i*sdc], sdc, &pD[j*ps+i*sdd], sdd, m-i, n-j);
			}
		}
	if(m>i)
		{
		if(m-i<=4)
			{
			goto left_4;
			}
		else
			{
			goto left_8;
			}
		}
#else
	for(; i<m-3; i+=4)
		{
		j = 0;
		for(; j<n-3; j+=4)
			{
			kernel_dgemm_nn_4x4_lib4(k, &alpha, &pA[i*sda], 0, &pB[j*ps], sdb, &beta, &pC[j*ps+i*sdc], &pD[j*ps+i*sdd]);
			}
		if(j<n)
			{
			kernel_dgemm_nn_4x4_vs_lib4(k, &alpha, &pA[i*sda], &pB[j*ps], sdb, &beta, &pC[j*ps+i*sdc], &pD[j*ps+i*sdd], m-i, n-j);
			}
		}
	if(m>i)
		{
		goto left_4;
		}
#endif

	// common return if i==m
	return;

	// clean-up loop definitions

#if defined(TARGET_X64_INTEL_HASWELL)
	left_12:
	j = 0;
	for(; j<n; j+=4)
		{
		kernel_dgemm_nn_12x4_vs_lib4(k, &alpha, &pA[i*sda], sda, &pB[j*ps], sdb, &beta, &pC[j*ps+i*sdc], sdc, &pD[j*ps+i*sdd], sdd, m-i, n-j);
		}
	return;
#endif

#if defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_HASWELL)
	left_8:
	j = 0;
	for(; j<n; j+=4)
		{
		kernel_dgemm_nn_8x4_vs_lib4(k, &alpha, &pA[i*sda], sda, &pB[j*ps], sdb, &beta, &pC[j*ps+i*sdc], sdc, &pD[j*ps+i*sdd], sdd, m-i, n-j);
		}
	return;
#endif

	left_4:
	j = 0;
	for(; j<n; j+=4)
		{
		kernel_dgemm_nn_4x4_vs_lib4(k, &alpha, &pA[i*sda], &pB[j*ps], sdb, &beta, &pC[j*ps+i*sdc], &pD[j*ps+i*sdd], m-i, n-j);
		}
	return;

	}
#endif



void dtrmm_nt_ru_lib(int m, int n, double alpha, double *pA, int sda, double *pB, int sdb, double beta, double *pC, int sdc, double *pD, int sdd)
	{

	if(m<=0 || n<=0)
		return;

	const int ps = 4;

	int i, j;

	i = 0;
// XXX there is a bug here !!!!!!
#if 0//defined(TARGET_X64_INTEL_HASWELL)
	for(; i<m-11; i+=12)
		{
		j = 0;
		for(; j<n-3; j+=4)
			{
			kernel_dtrmm_nt_ru_12x4_lib4(n-j, &alpha, &pA[j*ps+i*sda], sda, &pB[j*ps+j*sdb], &beta, &pC[j*ps+i*sdc], sdc, &pD[j*ps+i*sdd], sdd);
			}
		if(j<n) // TODO specialized edge routine
			{
			kernel_dtrmm_nt_ru_12x4_vs_lib4(n-j, &alpha, &pA[j*ps+i*sda], sda, &pB[j*ps+j*sdb], &beta, &pC[j*ps+i*sdc], sdc, &pD[j*ps+i*sdd], sdd, m-i, n-j);
			}
		}
	if(i<m)
		{
		if(m-i<5)
			{
			goto left_4;
			}
		if(m-i<9)
			{
			goto left_8;
			}
		else
			{
			goto left_12;
			}
		}

#elif defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_HASWELL)
	for(; i<m-7; i+=8)
		{
		j = 0;
		for(; j<n-3; j+=4)
			{
			kernel_dtrmm_nt_ru_8x4_lib4(n-j, &alpha, &pA[j*ps+i*sda], sda, &pB[j*ps+j*sdb], &beta, &pC[j*ps+i*sdc], sdc, &pD[j*ps+i*sdd], sdd);
			}
		if(j<n) // TODO specialized edge routine
			{
			kernel_dtrmm_nt_ru_8x4_vs_lib4(n-j, &alpha, &pA[j*ps+i*sda], sda, &pB[j*ps+j*sdb], &beta, &pC[j*ps+i*sdc], sdc, &pD[j*ps+i*sdd], sdd, m-i, n-j);
			}
		}
	if(i<m)
		{
		if(m-i<5)
			{
			goto left_4;
			}
		else
			{
			goto left_8;
			}
		}

#else
	for(; i<m-3; i+=4)
		{
		j = 0;
		for(; j<n-3; j+=4)
			{
			kernel_dtrmm_nt_ru_4x4_lib4(n-j, &alpha, &pA[j*ps+i*sda], &pB[j*ps+j*sdb], &beta, &pC[j*ps+i*sdc], &pD[j*ps+i*sdd]);
			}
		if(j<n) // TODO specialized edge routine
			{
			kernel_dtrmm_nt_ru_4x4_vs_lib4(n-j, &alpha, &pA[j*ps+i*sda], &pB[j*ps+j*sdb], &beta, &pC[j*ps+i*sdc], &pD[j*ps+i*sdd], m-i, n-j);
			}
		}
	if(i<m)
		{
		goto left_4;
		}
#endif

	// common return
	return;

#if defined(TARGET_X64_INTEL_HASWELL)
	// clean up
	left_12:
	j = 0;
//	for(; j<n-3; j+=4)
	for(; j<n; j+=4)
		{
		kernel_dtrmm_nt_ru_12x4_vs_lib4(n-j, &alpha, &pA[j*ps+i*sda], sda, &pB[j*ps+j*sdb], &beta, &pC[j*ps+i*sdc], sdc, &pD[j*ps+i*sdd], sdd, m-i, n-j);
		}
//	if(j<n) // TODO specialized edge routine
//		{
//		kernel_dtrmm_nt_ru_8x4_vs_lib4(n-j, &pA[j*ps+i*sda], sda, &pB[j*ps+j*sdb], alg, &pC[j*ps+i*sdc], sdc, &pD[j*ps+i*sdd], sdd, m-i, n-j);
//		}
	return;
#endif

#if defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_HASWELL)
	// clean up
	left_8:
	j = 0;
//	for(; j<n-3; j+=4)
	for(; j<n; j+=4)
		{
		kernel_dtrmm_nt_ru_8x4_vs_lib4(n-j, &alpha, &pA[j*ps+i*sda], sda, &pB[j*ps+j*sdb], &beta, &pC[j*ps+i*sdc], sdc, &pD[j*ps+i*sdd], sdd, m-i, n-j);
		}
//	if(j<n) // TODO specialized edge routine
//		{
//		kernel_dtrmm_nt_ru_8x4_vs_lib4(n-j, &pA[j*ps+i*sda], sda, &pB[j*ps+j*sdb], alg, &pC[j*ps+i*sdc], sdc, &pD[j*ps+i*sdd], sdd, m-i, n-j);
//		}
	return;
#endif

	left_4:
	j = 0;
//	for(; j<n-3; j+=4)
	for(; j<n; j+=4)
		{
		kernel_dtrmm_nt_ru_4x4_vs_lib4(n-j, &alpha, &pA[j*ps+i*sda], &pB[j*ps+j*sdb], &beta, &pC[j*ps+i*sdc], &pD[j*ps+i*sdd], m-i, n-j);
		}
//	if(j<n) // TODO specialized edge routine
//		{
//		kernel_dtrmm_nt_ru_4x4_vs_lib4(n-j, &pA[j*ps+i*sda], &pB[j*ps+j*sdb], alg, &pC[j*ps+i*sdc], &pD[j*ps+i*sdd], m-i, n-j);
//		}
	return;

	}



// D <= B * A^{-T} , with A lower triangular with unit diagonal
void dtrsm_nt_rl_one_lib(int m, int n, double *pA, int sda, double *pB, int sdb, double *pD, int sdd)
	{

	if(m<=0 || n<=0)
		return;

	const int ps = 4;

	int i, j;

	i = 0;

#if defined(TARGET_X64_INTEL_HASWELL)
	for(; i<m-11; i+=12)
		{
		j = 0;
		for(; j<n-3; j+=4)
			{
			kernel_dtrsm_nt_rl_one_12x4_lib4(j, &pD[i*sdd], sdd, &pA[j*sda], &pB[j*ps+i*sdb], sdb, &pD[j*ps+i*sdd], sdd, &pA[j*ps+j*sda]);
			}
		if(j<n)
			{
			kernel_dtrsm_nt_rl_one_12x4_vs_lib4(j, &pD[i*sdd], sdd, &pA[j*sda], &pB[j*ps+i*sdb], sdb, &pD[j*ps+i*sdd], sdd, &pA[j*ps+j*sda], m-i, n-j);
			}
		}
	if(m>i)
		{
		if(m-i<=4)
			{
			goto left_4;
			}
		else if(m-i<=8)
			{
			goto left_8;
			}
		else
			{
			goto left_12;
			}
		}
#elif defined(TARGET_X64_INTEL_SANDY_BRIDGE)
	for(; i<m-7; i+=8)
		{
		j = 0;
		for(; j<n-3; j+=4)
			{
			kernel_dtrsm_nt_rl_one_8x4_lib4(j, &pD[i*sdd], sdd, &pA[j*sda], &pB[j*ps+i*sdb], sdb, &pD[j*ps+i*sdd], sdd, &pA[j*ps+j*sda]);
			}
		if(j<n)
			{
			kernel_dtrsm_nt_rl_one_8x4_vs_lib4(j, &pD[i*sdd], sdd, &pA[j*sda], &pB[j*ps+i*sdb], sdb, &pD[j*ps+i*sdd], sdd, &pA[j*ps+j*sda], m-i, n-j);
			}
		}
	if(m>i)
		{
		if(m-i<=4)
			{
			goto left_4;
			}
		else
			{
			goto left_8;
			}
		}
#else
	for(; i<m-3; i+=4)
		{
		j = 0;
		for(; j<n-3; j+=4)
			{
			kernel_dtrsm_nt_rl_one_4x4_lib4(j, &pD[i*sdd], &pA[j*sda], &pB[j*ps+i*sdb], &pD[j*ps+i*sdd], &pA[j*ps+j*sda]);
			}
		if(j<n)
			{
			kernel_dtrsm_nt_rl_one_4x4_vs_lib4(j, &pD[i*sdd], &pA[j*sda], &pB[j*ps+i*sdb], &pD[j*ps+i*sdd], &pA[j*ps+j*sda], m-i, n-j);
			}
		}
	if(m>i)
		{
		goto left_4;
		}
#endif

	// common return if i==m
	return;

#if defined(TARGET_X64_INTEL_HASWELL)
	left_12:
	j = 0;
	for(; j<n; j+=4)
		{
		kernel_dtrsm_nt_rl_one_12x4_vs_lib4(j, &pD[i*sdd], sdd, &pA[j*sda], &pB[j*ps+i*sdb], sdb, &pD[j*ps+i*sdd], sdd, &pA[j*ps+j*sda], m-i, n-j);
		}
	return;
#endif

#if defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_HASWELL)
	left_8:
	j = 0;
	for(; j<n; j+=4)
		{
		kernel_dtrsm_nt_rl_one_8x4_vs_lib4(j, &pD[i*sdd], sdd, &pA[j*sda], &pB[j*ps+i*sdb], sdb, &pD[j*ps+i*sdd], sdd, &pA[j*ps+j*sda], m-i, n-j);
		}
	return;
#endif

	left_4:
	j = 0;
	for(; j<n; j+=4)
		{
		kernel_dtrsm_nt_rl_one_4x4_vs_lib4(j, &pD[i*sdd], &pA[j*sda], &pB[j*ps+i*sdb], &pD[j*ps+i*sdd], &pA[j*ps+j*sda], m-i, n-j);
		}
	return;

	}



// D <= B * A^{-T} , with A upper triangular employing explicit inverse of diagonal
void dtrsm_nt_ru_inv_lib(int m, int n, double *pA, int sda, double *inv_diag_A, double *pB, int sdb, double *pD, int sdd)
	{

	if(m<=0 || n<=0)
		return;

	const int ps = 4;

	int i, j, idx;

	int rn = n%4;

	double *dummy;

	i = 0;

#if defined(TARGET_X64_INTEL_HASWELL)
	for(; i<m-11; i+=12)
		{
		j = 0;
		// clean at the end
		if(rn>0)
			{
			idx = n-rn;
			kernel_dtrsm_nt_ru_inv_12x4_vs_lib4(0, dummy, 0, dummy, &pB[i*sdb+idx*ps], sdb, &pD[i*sdd+idx*ps], sdd, &pA[idx*sda+idx*ps], &inv_diag_A[idx], m-i, rn);
			j += rn;
			}
		for(; j<n; j+=4)
			{
			idx = n-j-4;
			kernel_dtrsm_nt_ru_inv_12x4_lib4(j, &pD[i*sdd+(idx+4)*ps], sdd, &pA[idx*sda+(idx+4)*ps], &pB[i*sdb+idx*ps], sdb, &pD[i*sdd+idx*ps], sdd, &pA[idx*sda+idx*ps], &inv_diag_A[idx]);
			}
		}
	if(m>i)
		{
		if(m-i<=4)
			{
			goto left_4;
			}
		else if(m-i<=8)
			{
			goto left_8;
			}
		else
			{
			goto left_12;
			}
		}
#elif defined(TARGET_X64_INTEL_SANDY_BRIDGE)
	for(; i<m-7; i+=8)
		{
		j = 0;
		// clean at the end
		if(rn>0)
			{
			idx = n-rn;
			kernel_dtrsm_nt_ru_inv_8x4_vs_lib4(0, dummy, 0, dummy, &pB[i*sdb+idx*ps], sdb, &pD[i*sdd+idx*ps], sdd, &pA[idx*sda+idx*ps], &inv_diag_A[idx], m-i, rn);
			j += rn;
			}
		for(; j<n; j+=4)
			{
			idx = n-j-4;
			kernel_dtrsm_nt_ru_inv_8x4_lib4(j, &pD[i*sdd+(idx+4)*ps], sdd, &pA[idx*sda+(idx+4)*ps], &pB[i*sdb+idx*ps], sdb, &pD[i*sdd+idx*ps], sdd, &pA[idx*sda+idx*ps], &inv_diag_A[idx]);
			}
		}
	if(m>i)
		{
		if(m-i<=4)
			{
			goto left_4;
			}
		else
			{
			goto left_8;
			}
		}
#else
	for(; i<m-3; i+=4)
		{
		j = 0;
		// clean at the end
		if(rn>0)
			{
			idx = n-rn;
			kernel_dtrsm_nt_ru_inv_4x4_vs_lib4(0, dummy, dummy, &pB[i*sdb+idx*ps], &pD[i*sdd+idx*ps], &pA[idx*sda+idx*ps], &inv_diag_A[idx], m-i, rn);
			j += rn;
			}
		for(; j<n; j+=4)
			{
			idx = n-j-4;
			kernel_dtrsm_nt_ru_inv_4x4_lib4(j, &pD[i*sdd+(idx+4)*ps], &pA[idx*sda+(idx+4)*ps], &pB[i*sdb+idx*ps], &pD[i*sdd+idx*ps], &pA[idx*sda+idx*ps], &inv_diag_A[idx]);
			}
		}
	if(m>i)
		{
		goto left_4;
		}
#endif

	// common return if i==m
	return;

#if defined(TARGET_X64_INTEL_HASWELL)
	left_12:
	j = 0;
	// TODO
	// clean at the end
	if(rn>0)
		{
		idx = n-rn;
		kernel_dtrsm_nt_ru_inv_12x4_vs_lib4(0, dummy, 0, dummy, &pB[i*sdb+idx*ps], sdb, &pD[i*sdd+idx*ps], sdd, &pA[idx*sda+idx*ps], &inv_diag_A[idx], m-i, rn);
		j += rn;
		}
	for(; j<n; j+=4)
		{
		idx = n-j-4;
		kernel_dtrsm_nt_ru_inv_12x4_vs_lib4(j, &pD[i*sdd+(idx+4)*ps], sdd, &pA[idx*sda+(idx+4)*ps], &pB[i*sdb+idx*ps], sdb, &pD[i*sdd+idx*ps], sdd, &pA[idx*sda+idx*ps], &inv_diag_A[idx], m-i, 4);
		}
	return;

#endif

#if defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_HASWELL)
	left_8:
	j = 0;
	// TODO
	// clean at the end
	if(rn>0)
		{
		idx = n-rn;
		kernel_dtrsm_nt_ru_inv_8x4_vs_lib4(0, dummy, 0, dummy, &pB[i*sdb+idx*ps], sdb, &pD[i*sdd+idx*ps], sdd, &pA[idx*sda+idx*ps], &inv_diag_A[idx], m-i, rn);
		j += rn;
		}
	for(; j<n; j+=4)
		{
		idx = n-j-4;
		kernel_dtrsm_nt_ru_inv_8x4_vs_lib4(j, &pD[i*sdd+(idx+4)*ps], sdd, &pA[idx*sda+(idx+4)*ps], &pB[i*sdb+idx*ps], sdb, &pD[i*sdd+idx*ps], sdd, &pA[idx*sda+idx*ps], &inv_diag_A[idx], m-i, 4);
		}
	return;

#endif

	left_4:
	j = 0;
	// TODO
	// clean at the end
	if(rn>0)
		{
		idx = n-rn;
		kernel_dtrsm_nt_ru_inv_4x4_vs_lib4(0, dummy, dummy, &pB[i*sdb+idx*ps], &pD[i*sdd+idx*ps], &pA[idx*sda+idx*ps], &inv_diag_A[idx], m-i, rn);
		j += rn;
		}
	for(; j<n; j+=4)
		{
		idx = n-j-4;
		kernel_dtrsm_nt_ru_inv_4x4_vs_lib4(j, &pD[i*sdd+(idx+4)*ps], &pA[idx*sda+(idx+4)*ps], &pB[i*sdb+idx*ps], &pD[i*sdd+idx*ps], &pA[idx*sda+idx*ps], &inv_diag_A[idx], m-i, 4);
		}
	return;

	}
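

// A small disabled sketch ("#if 0", as elsewhere in this file) of how the
// inv_diag_A argument of the routine above can be prepared: it must contain
// the reciprocals of the diagonal entries of the panel-major factor pA,
// exactly what the *_libstr wrappers below build via ddiaex_lib. The helper
// name make_inv_diag is hypothetical, for illustration only.
#if 0
static void make_inv_diag(int n, double *pA, int sda, double *inv_diag_A)
	{
	const int ps = 4;
	int i;
	// diagonal element (i,i) of a panel-major matrix: pA[i/ps*ps*sda + i*ps + i%ps]
	for(i=0; i<n; i++)
		inv_diag_A[i] = 1.0 / pA[i/ps*ps*sda + i*ps + i%ps];
	}
#endif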



// D <= A^{-1} * B , with A lower triangular with unit diagonal
void dtrsm_nn_ll_one_lib(int m, int n, double *pA, int sda, double *pB, int sdb, double *pD, int sdd)
	{

	if(m<=0 || n<=0)
		return;

	const int ps = 4;

	int i, j;

	i = 0;

#if defined(TARGET_X64_INTEL_HASWELL)
	for( ; i<m-11; i+=12)
		{
		j = 0;
		for( ; j<n-3; j+=4)
			{
			kernel_dtrsm_nn_ll_one_12x4_lib4(i, pA+i*sda, sda, pD+j*ps, sdd, pB+i*sdb+j*ps, sdb, pD+i*sdd+j*ps, sdd, pA+i*sda+i*ps, sda);
			}
		if(j<n)
			{
			kernel_dtrsm_nn_ll_one_12x4_vs_lib4(i, pA+i*sda, sda, pD+j*ps, sdd, pB+i*sdb+j*ps, sdb, pD+i*sdd+j*ps, sdd, pA+i*sda+i*ps, sda, m-i, n-j);
			}
		}
	if(i<m)
		{
		if(m-i<=4)
			goto left_4;
		if(m-i<=8)
			goto left_8;
		else
			goto left_12;
		}
#elif defined(TARGET_X64_INTEL_SANDY_BRIDGE)
	for( ; i<m-7; i+=8)
		{
		j = 0;
		for( ; j<n-3; j+=4)
			{
			kernel_dtrsm_nn_ll_one_8x4_lib4(i, pA+i*sda, sda, pD+j*ps, sdd, pB+i*sdb+j*ps, sdb, pD+i*sdd+j*ps, sdd, pA+i*sda+i*ps, sda);
			}
		if(j<n)
			{
			kernel_dtrsm_nn_ll_one_8x4_vs_lib4(i, pA+i*sda, sda, pD+j*ps, sdd, pB+i*sdb+j*ps, sdb, pD+i*sdd+j*ps, sdd, pA+i*sda+i*ps, sda, m-i, n-j);
			}
		}
	if(i<m)
		{
		if(m-i<=4)
			goto left_4;
		else
			goto left_8;
		}
#else
	for( ; i<m-3; i+=4)
		{
		j = 0;
		for( ; j<n-3; j+=4)
			{
			kernel_dtrsm_nn_ll_one_4x4_lib4(i, pA+i*sda, pD+j*ps, sdd, pB+i*sdb+j*ps, pD+i*sdd+j*ps, pA+i*sda+i*ps);
			}
		if(j<n)
			{
			kernel_dtrsm_nn_ll_one_4x4_vs_lib4(i, pA+i*sda, pD+j*ps, sdd, pB+i*sdb+j*ps, pD+i*sdd+j*ps, pA+i*sda+i*ps, m-i, n-j);
			}
		}
	if(i<m)
		{
		goto left_4;
		}
#endif

	// common return
	return;

#if defined(TARGET_X64_INTEL_HASWELL)
	left_12:
	j = 0;
	for( ; j<n; j+=4)
		{
		kernel_dtrsm_nn_ll_one_12x4_vs_lib4(i, pA+i*sda, sda, pD+j*ps, sdd, pB+i*sdb+j*ps, sdb, pD+i*sdd+j*ps, sdd, pA+i*sda+i*ps, sda, m-i, n-j);
		}
	return;
#endif

#if defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_HASWELL)
	left_8:
	j = 0;
	for( ; j<n; j+=4)
		{
		kernel_dtrsm_nn_ll_one_8x4_vs_lib4(i, pA+i*sda, sda, pD+j*ps, sdd, pB+i*sdb+j*ps, sdb, pD+i*sdd+j*ps, sdd, pA+i*sda+i*ps, sda, m-i, n-j);
		}
	return;
#endif

	left_4:
	j = 0;
	for( ; j<n; j+=4)
		{
		kernel_dtrsm_nn_ll_one_4x4_vs_lib4(i, pA+i*sda, pD+j*ps, sdd, pB+i*sdb+j*ps, pD+i*sdd+j*ps, pA+i*sda+i*ps, m-i, n-j);
		}
	return;

	}



// D <= A^{-1} * B , with A upper triangular employing explicit inverse of diagonal
void dtrsm_nn_lu_inv_lib(int m, int n, double *pA, int sda, double *inv_diag_A, double *pB, int sdb, double *pD, int sdd)
	{

	if(m<=0 || n<=0)
		return;

	const int ps = 4;

	int i, j, idx;
	double *dummy;

	i = 0;
	int rm = m%4;
	if(rm>0)
		{
		// TODO code explicitly the final case
		idx = m-rm; // position of the part to do
		j = 0;
		for( ; j<n; j+=4)
			{
			kernel_dtrsm_nn_lu_inv_4x4_vs_lib4(0, dummy, dummy, 0, pB+idx*sdb+j*ps, pD+idx*sdd+j*ps, pA+idx*sda+idx*ps, inv_diag_A+idx, rm, n-j);
			}
		// TODO
		i += rm;
		}
//	int em = m-rm;
#if defined(TARGET_X64_INTEL_HASWELL)
	for( ; i<m-8; i+=12)
		{
		idx = m-i; // position of already done part
		j = 0;
		for( ; j<n-3; j+=4)
			{
			kernel_dtrsm_nn_lu_inv_12x4_lib4(i, pA+(idx-12)*sda+idx*ps, sda, pD+idx*sdd+j*ps, sdd, pB+(idx-12)*sdb+j*ps, sdb, pD+(idx-12)*sdd+j*ps, sdd, pA+(idx-12)*sda+(idx-12)*ps, sda, inv_diag_A+(idx-12));
			}
		if(j<n)
			{
			kernel_dtrsm_nn_lu_inv_12x4_vs_lib4(i, pA+(idx-12)*sda+idx*ps, sda, pD+idx*sdd+j*ps, sdd, pB+(idx-12)*sdb+j*ps, sdb, pD+(idx-12)*sdd+j*ps, sdd, pA+(idx-12)*sda+(idx-12)*ps, sda, inv_diag_A+(idx-12), 12, n-j);
//			kernel_dtrsm_nn_lu_inv_4x4_vs_lib4(i, pA+(idx-4)*sda+idx*ps, pD+idx*sdd+j*ps, sdd, pB+(idx-4)*sdb+j*ps, pD+(idx-4)*sdd+j*ps, pA+(idx-4)*sda+(idx-4)*ps, inv_diag_A+(idx-4), 4, n-j);
//			kernel_dtrsm_nn_lu_inv_4x4_vs_lib4(i+4, pA+(idx-8)*sda+(idx-4)*ps, pD+(idx-4)*sdd+j*ps, sdd, pB+(idx-8)*sdb+j*ps, pD+(idx-8)*sdd+j*ps, pA+(idx-8)*sda+(idx-8)*ps, inv_diag_A+(idx-8), 4, n-j);
//			kernel_dtrsm_nn_lu_inv_4x4_vs_lib4(i+8, pA+(idx-12)*sda+(idx-8)*ps, pD+(idx-8)*sdd+j*ps, sdd, pB+(idx-12)*sdb+j*ps, pD+(idx-12)*sdd+j*ps, pA+(idx-12)*sda+(idx-12)*ps, inv_diag_A+(idx-12), 4, n-j);
			}
		}
#endif
#if defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_HASWELL)
	for( ; i<m-4; i+=8)
		{
		idx = m-i; // position of already done part
		j = 0;
		for( ; j<n-3; j+=4)
			{
			kernel_dtrsm_nn_lu_inv_8x4_lib4(i, pA+(idx-8)*sda+idx*ps, sda, pD+idx*sdd+j*ps, sdd, pB+(idx-8)*sdb+j*ps, sdb, pD+(idx-8)*sdd+j*ps, sdd, pA+(idx-8)*sda+(idx-8)*ps, sda, inv_diag_A+(idx-8));
			}
		if(j<n)
			{
			kernel_dtrsm_nn_lu_inv_8x4_vs_lib4(i, pA+(idx-8)*sda+idx*ps, sda, pD+idx*sdd+j*ps, sdd, pB+(idx-8)*sdb+j*ps, sdb, pD+(idx-8)*sdd+j*ps, sdd, pA+(idx-8)*sda+(idx-8)*ps, sda, inv_diag_A+(idx-8), 8, n-j);
//			kernel_dtrsm_nn_lu_inv_4x4_vs_lib4(i, pA+(idx-4)*sda+idx*ps, pD+idx*sdd+j*ps, sdd, pB+(idx-4)*sdb+j*ps, pD+(idx-4)*sdd+j*ps, pA+(idx-4)*sda+(idx-4)*ps, inv_diag_A+(idx-4), 4, n-j);
//			kernel_dtrsm_nn_lu_inv_4x4_vs_lib4(i+4, pA+(idx-8)*sda+(idx-4)*ps, pD+(idx-4)*sdd+j*ps, sdd, pB+(idx-8)*sdb+j*ps, pD+(idx-8)*sdd+j*ps, pA+(idx-8)*sda+(idx-8)*ps, inv_diag_A+(idx-8), 4, n-j);
			}
		}
#endif
	for( ; i<m; i+=4)
		{
		idx = m-i; // position of already done part
		j = 0;
		for( ; j<n-3; j+=4)
			{
			kernel_dtrsm_nn_lu_inv_4x4_lib4(i, pA+(idx-4)*sda+idx*ps, pD+idx*sdd+j*ps, sdd, pB+(idx-4)*sdb+j*ps, pD+(idx-4)*sdd+j*ps, pA+(idx-4)*sda+(idx-4)*ps, inv_diag_A+(idx-4));
			}
		if(j<n)
			{
			kernel_dtrsm_nn_lu_inv_4x4_vs_lib4(i, pA+(idx-4)*sda+idx*ps, pD+idx*sdd+j*ps, sdd, pB+(idx-4)*sdb+j*ps, pD+(idx-4)*sdd+j*ps, pA+(idx-4)*sda+(idx-4)*ps, inv_diag_A+(idx-4), 4, n-j);
			}
		}

	// common return
	return;

	}
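

// For reference, a disabled scalar sketch of what the blocked routine above
// implements: column-wise backward substitution D = A^{-1} * B with A upper
// triangular, using the precomputed reciprocals inv_diag_A (plain 2D arrays
// here for clarity, not the panel-major layout used by the kernels).
#if 0
static void trsm_lu_inv_ref(int m, int n, double A[m][m], double *inv_diag_A, double B[m][n], double D[m][n])
	{
	int i, j, l;
	for(j=0; j<n; j++)
		for(i=m-1; i>=0; i--)
			{
			double tmp = B[i][j];
			for(l=i+1; l<m; l++)
				tmp -= A[i][l] * D[l][j];
			D[i][j] = tmp * inv_diag_A[i];
			}
	}
#endif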



#if 0
void dlauum_blk_nt_l_lib(int m, int n, int nv, int *rv, int *cv, double *pA, int sda, double *pB, int sdb, int alg, double *pC, int sdc, double *pD, int sdd)
	{

	if(m<=0 || n<=0)
		return;

	// TODO remove
	double alpha, beta;
	if(alg==0)
		{
		alpha = 1.0;
		beta = 0.0;
		}
	else if(alg==1)
		{
		alpha = 1.0;
		beta = 1.0;
		}
	else
		{
		alpha = -1.0;
		beta = 1.0;
		}

	// TODO remove
	int k = cv[nv-1];

	const int ps = 4;

	int i, j, l;
	int ii, iii, jj, kii, kiii, kjj, k0, k1;

	i = 0;
	ii = 0;
	iii = 0;

#if defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_HASWELL)
	for(; i<m-7; i+=8)
		{

		while(ii<nv && rv[ii]<i+8)
			ii++;
		if(ii<nv)
			kii = cv[ii];
		else
			kii = cv[ii-1];

		j = 0;
		jj = 0;
		for(; j<i && j<n-3; j+=4)
			{

			while(jj<nv && rv[jj]<j+4)
				jj++;
			if(jj<nv)
				kjj = cv[jj];
			else
				kjj = cv[jj-1];
			k0 = kii<kjj ? kii : kjj;

			kernel_dgemm_nt_8x4_lib4(k0, &alpha, &pA[i*sda], sda, &pB[j*sdb], &beta, &pC[j*ps+i*sdc], sdc, &pD[j*ps+i*sdd], sdd);
			}
		if(j<n)
			{

			while(jj<nv && rv[jj]<j+4)
				jj++;
			if(jj<nv)
				kjj = cv[jj];
			else
				kjj = cv[jj-1];
			k0 = kii<kjj ? kii : kjj;

			if(j<i) // dgemm
				{
				kernel_dgemm_nt_8x4_vs_lib4(k0, &alpha, &pA[i*sda], sda, &pB[j*sdb], &beta, &pC[j*ps+i*sdc], sdc, &pD[j*ps+i*sdd], sdd, 8, n-j);
				}
			else // dsyrk
				{
				kernel_dsyrk_nt_l_8x4_vs_lib4(k0, &alpha, &pA[i*sda], sda, &pB[j*sdb], &beta, &pC[j*ps+i*sdc], sdc, &pD[j*ps+i*sdd], sdd, 8, n-j);
				if(j<n-4)
					{
					kernel_dsyrk_nt_l_4x4_vs_lib4(k, &alpha, &pA[(i+4)*sda], &pB[(j+4)*sdb], &beta, &pC[(j+4)*ps+(i+4)*sdc], &pD[(j+4)*ps+(i+4)*sdd], 4, n-j-4); // TODO
					}
				}
			}
		}
	if(m>i)
		{
		if(m-i<=4)
			{
			goto left_4;
			}
		else
			{
			goto left_8;
			}
		}
#else
	for(; i<m-3; i+=4)
		{

		while(ii<nv && rv[ii]<i+4)
			ii++;
		if(ii<nv)
			kii = cv[ii];
		else
			kii = cv[ii-1];
//		k0 = kii;
//		printf("\nii %d %d %d %d %d\n", i, ii, rv[ii], cv[ii], kii);

		j = 0;
		jj = 0;
		for(; j<i && j<n-3; j+=4)
			{

			while(jj<nv && rv[jj]<j+4)
				jj++;
			if(jj<nv)
				kjj = cv[jj];
			else
				kjj = cv[jj-1];
			k0 = kii<kjj ? kii : kjj;
//			printf("\njj %d %d %d %d %d\n", j, jj, rv[jj], cv[jj], kjj);

			kernel_dgemm_nt_4x4_lib4(k0, &alpha, &pA[i*sda], &pB[j*sdb], &beta, &pC[j*ps+i*sdc], &pD[j*ps+i*sdd]);
			}
		if(j<n)
			{

			while(jj<nv && rv[jj]<j+4)
				jj++;
			if(jj<nv)
				kjj = cv[jj];
			else
				kjj = cv[jj-1];
			k0 = kii<kjj ? kii : kjj;
//			printf("\njj %d %d %d %d %d\n", j, jj, rv[jj], cv[jj], kjj);

			if(i<j) // dgemm
				{
				kernel_dgemm_nt_4x4_vs_lib4(k0, &alpha, &pA[i*sda], &pB[j*sdb], &beta, &pC[j*ps+i*sdc], &pD[j*ps+i*sdd], 4, n-j);
				}
			else // dsyrk
				{
				kernel_dsyrk_nt_l_4x4_vs_lib4(k0, &alpha, &pA[i*sda], &pB[j*sdb], &beta, &pC[j*ps+i*sdc], &pD[j*ps+i*sdd], 4, n-j);
				}
			}
		}
	if(m>i)
		{
		goto left_4;
		}
#endif

	// common return if i==m
	return;

	// clean-up loop definitions

#if defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_HASWELL)
	left_8:

	kii = cv[nv-1];

	j = 0;
	jj = 0;
	for(; j<i && j<n-3; j+=4)
		{

		while(jj<nv && rv[jj]<j+4)
			jj++;
		if(jj<nv)
			kjj = cv[jj];
		else
			kjj = cv[jj-1];
		k0 = kii<kjj ? kii : kjj;

		kernel_dgemm_nt_8x4_vs_lib4(k0, &alpha, &pA[i*sda], sda, &pB[j*sdb], &beta, &pC[j*ps+i*sdc], sdc, &pD[j*ps+i*sdd], sdd, m-i, n-j);
		}
	if(j<n)
		{

		while(jj<nv && rv[jj]<j+4)
			jj++;
		if(jj<nv)
			kjj = cv[jj];
		else
			kjj = cv[jj-1];
		k0 = kii<kjj ? kii : kjj;

		if(j<i) // dgemm
			{
			kernel_dgemm_nt_8x4_vs_lib4(k0, &alpha, &pA[i*sda], sda, &pB[j*sdb], &beta, &pC[j*ps+i*sdc], sdc, &pD[j*ps+i*sdd], sdd, m-i, n-j);
			}
		else // dsyrk
			{
			kernel_dsyrk_nt_l_8x4_vs_lib4(k0, &alpha, &pA[i*sda], sda, &pB[j*sdb], &beta, &pC[j*ps+i*sdc], sdc, &pD[j*ps+i*sdd], sdd, m-i, n-j);
			if(j<n-4)
				{
				kernel_dsyrk_nt_l_4x4_vs_lib4(k, &alpha, &pA[(i+4)*sda], &pB[(j+4)*sdb], &beta, &pC[(j+4)*ps+(i+4)*sdc], &pD[(j+4)*ps+(i+4)*sdd], m-i-4, n-j-4); // TODO
				}
			}
		}
	return;
#endif

	left_4:

	kii = cv[nv-1];

	j = 0;
	jj = 0;
	for(; j<i && j<n-3; j+=4)
		{

		while(jj<nv && rv[jj]<j+4)
			jj++;
		if(jj<nv)
			kjj = cv[jj];
		else
			kjj = cv[jj-1];
		k0 = kii<kjj ? kii : kjj;

		kernel_dgemm_nt_4x4_vs_lib4(k0, &alpha, &pA[i*sda], &pB[j*sdb], &beta, &pC[j*ps+i*sdc], &pD[j*ps+i*sdd], m-i, n-j);
		}
	if(j<n)
		{

		while(jj<nv && rv[jj]<j+4)
			jj++;
		if(jj<nv)
			kjj = cv[jj];
		else
			kjj = cv[jj-1];
		k0 = kii<kjj ? kii : kjj;

		if(j<i) // dgemm
			{
			kernel_dgemm_nt_4x4_vs_lib4(k0, &alpha, &pA[i*sda], &pB[j*sdb], &beta, &pC[j*ps+i*sdc], &pD[j*ps+i*sdd], m-i, n-j);
			}
		else // dsyrk
			{
			kernel_dsyrk_nt_l_4x4_vs_lib4(k0, &alpha, &pA[i*sda], &pB[j*sdb], &beta, &pC[j*ps+i*sdc], &pD[j*ps+i*sdd], m-i, n-j);
			}
		}
	return;

	}
#endif



/****************************
* new interface
****************************/



#if defined(LA_HIGH_PERFORMANCE)



// dgemm nt
void dgemm_nt_libstr(int m, int n, int k, double alpha, struct d_strmat *sA, int ai, int aj, struct d_strmat *sB, int bi, int bj, double beta, struct d_strmat *sC, int ci, int cj, struct d_strmat *sD, int di, int dj)
	{

	if(m<=0 | n<=0)
		return;

	const int ps = 4;

	int sda = sA->cn;
	int sdb = sB->cn;
	int sdc = sC->cn;
	int sdd = sD->cn;
	int air = ai & (ps-1);
	int bir = bi & (ps-1);
	double *pA = sA->pA + aj*ps + (ai-air)*sda;
	double *pB = sB->pA + bj*ps + (bi-bir)*sdb;
	double *pC = sC->pA + cj*ps;
	double *pD = sD->pA + dj*ps;

	if(ai==0 & bi==0 & ci==0 & di==0)
		{
		dgemm_nt_lib(m, n, k, alpha, pA, sda, pB, sdb, beta, pC, sdc, pD, sdd);
		return;
		}

	int ci0 = ci-air;
	int di0 = di-air;
	int offsetC;
	int offsetD;
	if(ci0>=0)
		{
		pC += ci0/ps*ps*sdc;
		offsetC = ci0%ps;
		}
	else
		{
		pC += -4*sdc;
		offsetC = ps+ci0;
		}
	if(di0>=0)
		{
		pD += di0/ps*ps*sdd;
		offsetD = di0%ps;
		}
	else
		{
		pD += -4*sdd;
		offsetD = ps+di0;
		}

	int i, j, l;

	int idxB;

	// clean up at the beginning
	if(air!=0)
		{
#if defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE)
		if(m>5)
			{
			j = 0;
			idxB = 0;
			// clean up at the beginning
			if(bir!=0)
				{
				kernel_dgemm_nt_8x4_gen_lib4(k, &alpha, &pA[0], sda, &pB[idxB*sdb], &beta, offsetC, &pC[j*ps]-bir*ps, sdc, offsetD, &pD[j*ps]-bir*ps, sdd, air, air+m, bir, n-j);
				j += ps-bir;
				idxB += 4;
				}
			// main loop
			for(; j<n; j+=4)
				{
				kernel_dgemm_nt_8x4_gen_lib4(k, &alpha, &pA[0], sda, &pB[idxB*sdb], &beta, offsetC, &pC[j*ps], sdc, offsetD, &pD[j*ps], sdd, air, air+m, 0, n-j);
				idxB += 4;
				}
			m -= 2*ps-air;
			pA += 2*ps*sda;
			pC += 2*ps*sdc;
			pD += 2*ps*sdd;
			}
		else // m<=4
			{
#endif
			j = 0;
			idxB = 0;
			// clean up at the beginning
			if(bir!=0)
				{
				kernel_dgemm_nt_4x4_gen_lib4(k, &alpha, &pA[0], &pB[idxB*sdb], &beta, offsetC, &pC[j*ps]-bir*ps, sdc, offsetD, &pD[j*ps]-bir*ps, sdd, air, air+m, bir, n-j);
				j += ps-bir;
				idxB += 4;
				}
			// main loop
			for(; j<n; j+=4)
				{
				kernel_dgemm_nt_4x4_gen_lib4(k, &alpha, &pA[0], &pB[idxB*sdb], &beta, offsetC, &pC[j*ps], sdc, offsetD, &pD[j*ps], sdd, air, air+m, 0, n-j);
				idxB += 4;
				}
			m -= ps-air;
			pA += ps*sda;
			pC += ps*sdc;
			pD += ps*sdd;
#if defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE)
			// nothing more to do
			return;
			}
#endif
		}
	i = 0;
	// main loop
#if defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE)
	for(; i<m-4; i+=8)
		{
		j = 0;
		idxB = 0;
		// clean up at the beginning
		if(bir!=0)
			{
			kernel_dgemm_nt_8x4_gen_lib4(k, &alpha, &pA[i*sda], sda, &pB[idxB*sdb], &beta, offsetC, &pC[j*ps+i*sdc]-bir*ps, sdc, offsetD, &pD[j*ps+i*sdd]-bir*ps, sdd, 0, m-i, bir, n-j);
			j += ps-bir;
			idxB += 4;
			}
		// main loop
		for(; j<n; j+=4)
			{
			kernel_dgemm_nt_8x4_gen_lib4(k, &alpha, &pA[i*sda], sda, &pB[idxB*sdb], &beta, offsetC, &pC[j*ps+i*sdc], sdc, offsetD, &pD[j*ps+i*sdd], sdd, 0, m-i, 0, n-j);
			idxB += 4;
			}
		}
	if(i<m)
		{
		j = 0;
		idxB = 0;
		// clean up at the beginning
		if(bir!=0)
			{
			kernel_dgemm_nt_4x4_gen_lib4(k, &alpha, &pA[i*sda], &pB[idxB*sdb], &beta, offsetC, &pC[j*ps+i*sdc]-bir*ps, sdc, offsetD, &pD[j*ps+i*sdd]-bir*ps, sdd, 0, m-i, bir, n-j);
			j += ps-bir;
			idxB += 4;
			}
		// main loop
		for(; j<n; j+=4)
			{
			kernel_dgemm_nt_4x4_gen_lib4(k, &alpha, &pA[i*sda], &pB[idxB*sdb], &beta, offsetC, &pC[j*ps+i*sdc], sdc, offsetD, &pD[j*ps+i*sdd], sdd, 0, m-i, 0, n-j);
			idxB += 4;
			}
		}
#else
	for(; i<m; i+=4)
		{
		j = 0;
		idxB = 0;
		// clean up at the beginning
		if(bir!=0)
			{
			kernel_dgemm_nt_4x4_gen_lib4(k, &alpha, &pA[i*sda], &pB[idxB*sdb], &beta, offsetC, &pC[j*ps+i*sdc]-bir*ps, sdc, offsetD, &pD[j*ps+i*sdd]-bir*ps, sdd, 0, m-i, bir, n-j);
			j += ps-bir;
			idxB += 4;
			}
		// main loop
		for(; j<n; j+=4)
			{
			kernel_dgemm_nt_4x4_gen_lib4(k, &alpha, &pA[i*sda], &pB[idxB*sdb], &beta, offsetC, &pC[j*ps+i*sdc], sdc, offsetD, &pD[j*ps+i*sdd], sdd, 0, m-i, 0, n-j);
			idxB += 4;
			}
		}
#endif

	return;

	}
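

// A minimal usage sketch for the strmat interface above, disabled with the
// file's usual "#if 0". It assumes the d_strmat helpers d_allocate_strmat,
// d_cvt_mat2strmat and d_free_strmat from blasfeo_d_aux; if the aux API
// differs in your version, adapt the names accordingly.
#if 0
static void example_dgemm_nt_libstr()
	{
	int m = 6, n = 5, k = 4;
	double A[6*4], B[5*4], C[6*5]; // column-major inputs, filled elsewhere
	struct d_strmat sA, sB, sC, sD;
	d_allocate_strmat(m, k, &sA);
	d_allocate_strmat(n, k, &sB);
	d_allocate_strmat(m, n, &sC);
	d_allocate_strmat(m, n, &sD);
	d_cvt_mat2strmat(m, k, A, m, &sA, 0, 0);
	d_cvt_mat2strmat(n, k, B, n, &sB, 0, 0);
	d_cvt_mat2strmat(m, n, C, m, &sC, 0, 0);
	// D = 1.0 * A * B^T + 0.5 * C, all submatrices anchored at (0,0)
	dgemm_nt_libstr(m, n, k, 1.0, &sA, 0, 0, &sB, 0, 0, 0.5, &sC, 0, 0, &sD, 0, 0);
	d_free_strmat(&sA);
	d_free_strmat(&sB);
	d_free_strmat(&sC);
	d_free_strmat(&sD);
	}
#endif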



// dgemm nn
void dgemm_nn_libstr(int m, int n, int k, double alpha, struct d_strmat *sA, int ai, int aj, struct d_strmat *sB, int bi, int bj, double beta, struct d_strmat *sC, int ci, int cj, struct d_strmat *sD, int di, int dj)
	{

	if(m<=0 || n<=0)
		return;

	const int ps = 4;

	int sda = sA->cn;
	int sdb = sB->cn;
	int sdc = sC->cn;
	int sdd = sD->cn;
	int air = ai & (ps-1);
	int bir = bi & (ps-1);
	double *pA = sA->pA + aj*ps + (ai-air)*sda;
	double *pB = sB->pA + bj*ps + (bi-bir)*sdb;
	double *pC = sC->pA + cj*ps;
	double *pD = sD->pA + dj*ps;

	int offsetB = bir;

	int ci0 = ci-air;
	int di0 = di-air;
	int offsetC;
	int offsetD;
	if(ci0>=0)
		{
		pC += ci0/ps*ps*sdc;
		offsetC = ci0%ps;
		}
	else
		{
		pC += -4*sdc;
		offsetC = ps+ci0;
		}
	if(di0>=0)
		{
		pD += di0/ps*ps*sdd;
		offsetD = di0%ps;
		}
	else
		{
		pD += -4*sdd;
		offsetD = ps+di0;
		}

	int i, j, l;

	// clean up at the beginning
	if(air!=0)
		{
#if defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_HASWELL)
		if(m>5)
			{
			j = 0;
			for(; j<n; j+=4)
				{
				kernel_dgemm_nn_8x4_gen_lib4(k, &alpha, &pA[0], sda, offsetB, &pB[j*ps], sdb, &beta, offsetC, &pC[j*ps], sdc, offsetD, &pD[j*ps], sdd, air, air+m, 0, n-j);
				}
			m -= 2*ps-air;
			pA += 2*ps*sda;
			pC += 2*ps*sdc;
			pD += 2*ps*sdd;
			}
		else // m-i<=4
			{
#endif
			j = 0;
			for(; j<n; j+=4)
				{
				kernel_dgemm_nn_4x4_gen_lib4(k, &alpha, &pA[0], offsetB, &pB[j*ps], sdb, &beta, offsetC, &pC[j*ps], sdc, offsetD, &pD[j*ps], sdd, air, air+m, 0, n-j);
				}
			m -= ps-air;
			pA += ps*sda;
			pC += ps*sdc;
			pD += ps*sdd;
#if defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_HASWELL)
			// nothing more to do
			return;
			}
#endif
		}
	// main loop
	i = 0;
	if(offsetC==0 & offsetD==0)
		{
#if defined(TARGET_X64_INTEL_HASWELL)
		for(; i<m-11; i+=12)
			{
			j = 0;
			for(; j<n-3; j+=4)
				{
				kernel_dgemm_nn_12x4_lib4(k, &alpha, &pA[i*sda], sda, offsetB, &pB[j*ps], sdb, &beta, &pC[j*ps+i*sdc], sdc, &pD[j*ps+i*sdd], sdd);
				}
			if(j<n)
				{
//				kernel_dgemm_nn_12x4_gen_lib4(k, &alpha, &pA[i*sda], sda, offsetB, &pB[j*ps], sdb, &beta, 0, &pC[j*ps+i*sdc], sdc, 0, &pD[j*ps+i*sdd], sdd, m-i, n-j);
				kernel_dgemm_nn_8x4_gen_lib4(k, &alpha, &pA[i*sda], sda, offsetB, &pB[j*ps], sdb, &beta, 0, &pC[j*ps+i*sdc], sdc, 0, &pD[j*ps+i*sdd], sdd, 0, m-i, 0, n-j);
				kernel_dgemm_nn_4x4_gen_lib4(k, &alpha, &pA[(i+8)*sda], offsetB, &pB[j*ps], sdb, &beta, 0, &pC[j*ps+(i+8)*sdc], sdc, 0, &pD[j*ps+(i+8)*sdd], sdd, 0, m-(i+8), 0, n-j);
				}
			}
		if(m>i)
			{
			if(m-i<=4)
				{
				goto left_4;
				}
			else if(m-i<=8)
				{
				goto left_8;
				}
			else
				{
				goto left_12;
				}
			}
#elif defined(TARGET_X64_INTEL_SANDY_BRIDGE)
		for(; i<m-7; i+=8)
			{
			j = 0;
			for(; j<n-3; j+=4)
				{
				kernel_dgemm_nn_8x4_lib4(k, &alpha, &pA[i*sda], sda, offsetB, &pB[j*ps], sdb, &beta, &pC[j*ps+i*sdc], sdc, &pD[j*ps+i*sdd], sdd);
				}
			if(j<n)
				{
				kernel_dgemm_nn_8x4_gen_lib4(k, &alpha, &pA[i*sda], sda, offsetB, &pB[j*ps], sdb, &beta, 0, &pC[j*ps+i*sdc], sdc, 0, &pD[j*ps+i*sdd], sdd, 0, m-i, 0, n-j);
				}
			}
		if(m>i)
			{
			if(m-i<=4)
				{
				goto left_4;
				}
			else
				{
				goto left_8;
				}
			}
#else
		for(; i<m-3; i+=4)
			{
			j = 0;
			for(; j<n-3; j+=4)
				{
				kernel_dgemm_nn_4x4_lib4(k, &alpha, &pA[i*sda], offsetB, &pB[j*ps], sdb, &beta, &pC[j*ps+i*sdc], &pD[j*ps+i*sdd]);
				}
			if(j<n)
				{
				kernel_dgemm_nn_4x4_gen_lib4(k, &alpha, &pA[i*sda], offsetB, &pB[j*ps], sdb, &beta, 0, &pC[j*ps+i*sdc], sdc, 0, &pD[j*ps+i*sdd], sdd, 0, m-i, 0, n-j);
				}
			}
		if(m>i)
			{
			goto left_4;
			}
#endif
		}
	else
		{
// TODO 12x4
#if defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE)
		for(; i<m-4; i+=8)
			{
			j = 0;
			for(; j<n; j+=4)
				{
				kernel_dgemm_nn_8x4_gen_lib4(k, &alpha, &pA[i*sda], sda, offsetB, &pB[j*ps], sdb, &beta, offsetC, &pC[j*ps+i*sdc], sdc, offsetD, &pD[j*ps+i*sdd], sdd, 0, m-i, 0, n-j);
				}
			}
		if(m>i)
			{
			goto left_4;
			}
#else
		for(; i<m; i+=4)
			{
			j = 0;
			for(; j<n; j+=4)
				{
				kernel_dgemm_nn_4x4_gen_lib4(k, &alpha, &pA[i*sda], offsetB, &pB[j*ps], sdb, &beta, offsetC, &pC[j*ps+i*sdc], sdc, offsetD, &pD[j*ps+i*sdd], sdd, 0, m-i, 0, n-j);
				}
			}
#endif
		}

	// common return if i==m
	return;

	// clean-up loop definitions

#if defined(TARGET_X64_INTEL_HASWELL)
	left_12:
	j = 0;
	for(; j<n; j+=4)
		{
//		kernel_dgemm_nn_12x4_gen_lib4(k, &alpha, &pA[i*sda], sda, offsetB, &pB[j*ps], sdb, &beta, offsetC, &pC[j*ps+i*sdc], sdc, offsetD, &pD[j*ps+i*sdd], sdd, 0, m-i, 0, n-j);
		kernel_dgemm_nn_8x4_gen_lib4(k, &alpha, &pA[i*sda], sda, offsetB, &pB[j*ps], sdb, &beta, offsetC, &pC[j*ps+i*sdc], sdc, offsetD, &pD[j*ps+i*sdd], sdd, 0, m-i, 0, n-j);
		kernel_dgemm_nn_4x4_gen_lib4(k, &alpha, &pA[(i+8)*sda], offsetB, &pB[j*ps], sdb, &beta, offsetC, &pC[j*ps+(i+8)*sdc], sdc, offsetD, &pD[j*ps+(i+8)*sdd], sdd, 0, m-(i+8), 0, n-j);
		}
	return;
#endif

#if defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_HASWELL)
	left_8:
	j = 0;
	for(; j<n; j+=4)
		{
		kernel_dgemm_nn_8x4_gen_lib4(k, &alpha, &pA[i*sda], sda, offsetB, &pB[j*ps], sdb, &beta, offsetC, &pC[j*ps+i*sdc], sdc, offsetD, &pD[j*ps+i*sdd], sdd, 0, m-i, 0, n-j);
		}
	return;
#endif

	left_4:
	j = 0;
	for(; j<n; j+=4)
		{
		kernel_dgemm_nn_4x4_gen_lib4(k, &alpha, &pA[i*sda], offsetB, &pB[j*ps], sdb, &beta, offsetC, &pC[j*ps+i*sdc], sdc, offsetD, &pD[j*ps+i*sdd], sdd, 0, m-i, 0, n-j);
		}
	return;

	}



// dtrsm_left_lower_nottransposed_unit
void dtrsm_llnu_libstr(int m, int n, double alpha, struct d_strmat *sA, int ai, int aj, struct d_strmat *sB, int bi, int bj, struct d_strmat *sD, int di, int dj)
	{
	if(ai!=0 | bi!=0 | di!=0 | alpha!=1.0)
		{
		printf("\ndtrsm_llnu_libstr: feature not implemented yet: ai=%d, bi=%d, di=%d, alpha=%f\n", ai, bi, di, alpha);
		exit(1);
		}
	const int ps = 4;
	// TODO alpha
	int sda = sA->cn;
	int sdb = sB->cn;
	int sdd = sD->cn;
	double *pA = sA->pA + aj*ps;
	double *pB = sB->pA + bj*ps;
	double *pD = sD->pA + dj*ps;
	dtrsm_nn_ll_one_lib(m, n, pA, sda, pB, sdb, pD, sdd);
	return;
	}



// dtrsm_left_upper_nottransposed_notunit
void dtrsm_lunn_libstr(int m, int n, double alpha, struct d_strmat *sA, int ai, int aj, struct d_strmat *sB, int bi, int bj, struct d_strmat *sD, int di, int dj)
	{
	if(ai!=0 | bi!=0 | di!=0 | alpha!=1.0)
		{
		printf("\ndtrsm_lunn_libstr: feature not implemented yet: ai=%d, bi=%d, di=%d, alpha=%f\n", ai, bi, di, alpha);
		exit(1);
		}
	const int ps = 4;
	// TODO alpha
	int sda = sA->cn;
	int sdb = sB->cn;
	int sdd = sD->cn;
	double *pA = sA->pA + aj*ps;
	double *pB = sB->pA + bj*ps;
	double *pD = sD->pA + dj*ps;
	double *dA = sA->dA;
	int ii;
	if(ai==0 & aj==0)
		{
		if(sA->use_dA!=1)
			{
			ddiaex_lib(n, 1.0, ai, pA, sda, dA);
			for(ii=0; ii<n; ii++)
				dA[ii] = 1.0 / dA[ii];
			sA->use_dA = 1;
			}
		}
	else
		{
		ddiaex_lib(n, 1.0, ai, pA, sda, dA);
		for(ii=0; ii<n; ii++)
			dA[ii] = 1.0 / dA[ii];
		sA->use_dA = 0;
		}
	dtrsm_nn_lu_inv_lib(m, n, pA, sda, dA, pB, sdb, pD, sdd);
	return;
	}



// dtrsm_right_lower_transposed_notunit
void dtrsm_rltn_libstr(int m, int n, double alpha, struct d_strmat *sA, int ai, int aj, struct d_strmat *sB, int bi, int bj, struct d_strmat *sD, int di, int dj)
	{

	if(m<=0 || n<=0)
		return;

	if(ai!=0 | bi!=0 | di!=0 | alpha!=1.0)
		{
		printf("\ndtrsm_rltn_libstr: feature not implemented yet: ai=%d, bi=%d, di=%d, alpha=%f\n", ai, bi, di, alpha);
		exit(1);
		}

	const int ps = 4;

	// TODO alpha !!!!!

	int sda = sA->cn;
	int sdb = sB->cn;
	int sdd = sD->cn;
	double *pA = sA->pA + aj*ps;
	double *pB = sB->pA + bj*ps;
	double *pD = sD->pA + dj*ps;
	double *dA = sA->dA;

	int i, j;

	if(ai==0 & aj==0)
		{
		if(sA->use_dA!=1)
			{
			ddiaex_lib(n, 1.0, ai, pA, sda, dA);
			for(i=0; i<n; i++)
				dA[i] = 1.0 / dA[i];
			sA->use_dA = 1;
			}
		}
	else
		{
		ddiaex_lib(n, 1.0, ai, pA, sda, dA);
		for(i=0; i<n; i++)
			dA[i] = 1.0 / dA[i];
		sA->use_dA = 0;
		}

//	dtrsm_nt_rl_inv_lib(m, n, pA, sda, dA, pB, sdb, pD, sdd);
	i = 0;
#if defined(TARGET_X64_INTEL_HASWELL)
	for(; i<m-11; i+=12)
		{
		j = 0;
		for(; j<n-3; j+=4)
			{
			kernel_dtrsm_nt_rl_inv_12x4_lib4(j, &pD[i*sdd], sdd, &pA[j*sda], &pB[j*ps+i*sdb], sdb, &pD[j*ps+i*sdd], sdd, &pA[j*ps+j*sda], &dA[j]);
			}
		if(j<n)
			{
			kernel_dtrsm_nt_rl_inv_12x4_vs_lib4(j, &pD[i*sdd], sdd, &pA[j*sda], &pB[j*ps+i*sdb], sdb, &pD[j*ps+i*sdd], sdd, &pA[j*ps+j*sda], &dA[j], m-i, n-j);
			}
		}
	if(m>i)
		{
		if(m-i<=4)
			{
			goto left_4;
			}
		else if(m-i<=8)
			{
			goto left_8;
			}
		else
			{
			goto left_12;
			}
		}
#elif defined(TARGET_X64_INTEL_SANDY_BRIDGE)
	for(; i<m-7; i+=8)
		{
		j = 0;
		for(; j<n-3; j+=4)
			{
			kernel_dtrsm_nt_rl_inv_8x4_lib4(j, &pD[i*sdd], sdd, &pA[j*sda], &pB[j*ps+i*sdb], sdb, &pD[j*ps+i*sdd], sdd, &pA[j*ps+j*sda], &dA[j]);
			}
		if(j<n)
			{
			kernel_dtrsm_nt_rl_inv_8x4_vs_lib4(j, &pD[i*sdd], sdd, &pA[j*sda], &pB[j*ps+i*sdb], sdb, &pD[j*ps+i*sdd], sdd, &pA[j*ps+j*sda], &dA[j], m-i, n-j);
			}
		}
	if(m>i)
		{
		if(m-i<=4)
			{
			goto left_4;
			}
		else
			{
			goto left_8;
			}
		}
#else
	for(; i<m-3; i+=4)
		{
		j = 0;
		for(; j<n-3; j+=4)
			{
			kernel_dtrsm_nt_rl_inv_4x4_lib4(j, &pD[i*sdd], &pA[j*sda], &pB[j*ps+i*sdb], &pD[j*ps+i*sdd], &pA[j*ps+j*sda], &dA[j]);
			}
		if(j<n)
			{
			kernel_dtrsm_nt_rl_inv_4x4_vs_lib4(j, &pD[i*sdd], &pA[j*sda], &pB[j*ps+i*sdb], &pD[j*ps+i*sdd], &pA[j*ps+j*sda], &dA[j], m-i, n-j);
			}
		}
	if(m>i)
		{
		goto left_4;
		}
#endif

	// common return if i==m
	return;

#if defined(TARGET_X64_INTEL_HASWELL)
	left_12:
	j = 0;
	for(; j<n; j+=4)
		{
		kernel_dtrsm_nt_rl_inv_12x4_vs_lib4(j, &pD[i*sdd], sdd, &pA[j*sda], &pB[j*ps+i*sdb], sdb, &pD[j*ps+i*sdd], sdd, &pA[j*ps+j*sda], &dA[j], m-i, n-j);
		}
	return;
#endif

#if defined(TARGET_X64_INTEL_HASWELL)
	left_8:
	j = 0;
	for(; j<n-8; j+=12)
		{
		kernel_dtrsm_nt_rl_inv_8x8l_vs_lib4(j, &pD[i*sdd], sdd, &pA[j*sda], sda, &pB[j*ps+i*sdb], sdb, &pD[j*ps+i*sdd], sdd, &pA[j*ps+j*sda], sda, &dA[j], m-i, n-j);
		kernel_dtrsm_nt_rl_inv_8x8u_vs_lib4((j+4), &pD[i*sdd], sdd, &pA[(j+4)*sda], sda, &pB[(j+4)*ps+i*sdb], sdb, &pD[(j+4)*ps+i*sdd], sdd, &pA[(j+4)*ps+(j+4)*sda], sda, &dA[(j+4)], m-i, n-(j+4));
		}
	if(j<n-4)
		{
		kernel_dtrsm_nt_rl_inv_8x8l_vs_lib4(j, &pD[i*sdd], sdd, &pA[j*sda], sda, &pB[j*ps+i*sdb], sdb, &pD[j*ps+i*sdd], sdd, &pA[j*ps+j*sda], sda, &dA[j], m-i, n-j);
		kernel_dtrsm_nt_rl_inv_4x4_vs_lib4((j+4), &pD[i*sdd], &pA[(j+4)*sda], &pB[(j+4)*ps+i*sdb], &pD[(j+4)*ps+i*sdd], &pA[(j+4)*ps+(j+4)*sda], &dA[(j+4)], m-i, n-(j+4));
		j += 8;
		}
	else if(j<n)
		{
		kernel_dtrsm_nt_rl_inv_8x4_vs_lib4(j, &pD[i*sdd], sdd, &pA[j*sda], &pB[j*ps+i*sdb], sdb, &pD[j*ps+i*sdd], sdd, &pA[j*ps+j*sda], &dA[j], m-i, n-j);
		j += 4;
		}
	return;
#elif defined(TARGET_X64_INTEL_SANDY_BRIDGE)
	left_8:
	j = 0;
	for(; j<n; j+=4)
		{
		kernel_dtrsm_nt_rl_inv_8x4_vs_lib4(j, &pD[i*sdd], sdd, &pA[j*sda], &pB[j*ps+i*sdb], sdb, &pD[j*ps+i*sdd], sdd, &pA[j*ps+j*sda], &dA[j], m-i, n-j);
		}
	return;
#endif

#if defined(TARGET_X64_INTEL_HASWELL)
	left_4:
	j = 0;
	for(; j<n-8; j+=12)
		{
		kernel_dtrsm_nt_rl_inv_4x12_vs_lib4(j, &pD[i*sdd], &pA[j*sda], sda, &pB[j*ps+i*sdb], &pD[j*ps+i*sdd], &pA[j*ps+j*sda], sda, &dA[j], m-i, n-j);
		}
	if(j<n-4)
		{
		kernel_dtrsm_nt_rl_inv_4x8_vs_lib4(j, &pD[i*sdd], &pA[j*sda], sda, &pB[j*ps+i*sdb], &pD[j*ps+i*sdd], &pA[j*ps+j*sda], sda, &dA[j], m-i, n-j);
		j += 8;
		}
	else if(j<n)
		{
		kernel_dtrsm_nt_rl_inv_4x4_vs_lib4(j, &pD[i*sdd], &pA[j*sda], &pB[j*ps+i*sdb], &pD[j*ps+i*sdd], &pA[j*ps+j*sda], &dA[j], m-i, n-j);
		j += 4;
		}
	return;
#else
	left_4:
	j = 0;
	for(; j<n; j+=4)
		{
		kernel_dtrsm_nt_rl_inv_4x4_vs_lib4(j, &pD[i*sdd], &pA[j*sda], &pB[j*ps+i*sdb], &pD[j*ps+i*sdd], &pA[j*ps+j*sda], &dA[j], m-i, n-j);
		}
	return;
#endif

	}
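

// Typical use of the routine above (disabled sketch): after a lower Cholesky
// factorization L of an SPD matrix (e.g. via dpotrf_l_libstr, assumed
// available elsewhere in BLASFEO), it computes X such that X * L^T = B,
// i.e. one of the two triangular solves of a Cholesky-based linear solver.
#if 0
static void example_cholesky_solve(int m, int n, struct d_strmat *sL, struct d_strmat *sB, struct d_strmat *sX)
	{
	// sL: n-by-n lower Cholesky factor; sB: m-by-n right-hand side; sX: m-by-n result
	dtrsm_rltn_libstr(m, n, 1.0, sL, 0, 0, sB, 0, 0, sX, 0, 0);
	// note: as checked above, only alpha==1.0 and ai==bi==di==0 are supported
	}
#endif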



// dtrsm_right_lower_transposed_unit
void dtrsm_rltu_libstr(int m, int n, double alpha, struct d_strmat *sA, int ai, int aj, struct d_strmat *sB, int bi, int bj, struct d_strmat *sD, int di, int dj)
	{
	if(ai!=0 | bi!=0 | di!=0 | alpha!=1.0)
		{
		printf("\ndtrsm_rltu_libstr: feature not implemented yet: ai=%d, bi=%d, di=%d, alpha=%f\n", ai, bi, di, alpha);
		exit(1);
		}
	const int ps = 4;
	// TODO alpha
	int sda = sA->cn;
	int sdb = sB->cn;
	int sdd = sD->cn;
	double *pA = sA->pA + aj*ps;
	double *pB = sB->pA + bj*ps;
	double *pD = sD->pA + dj*ps;
	dtrsm_nt_rl_one_lib(m, n, pA, sda, pB, sdb, pD, sdd);
	return;
	}



// dtrsm_right_upper_transposed_notunit
void dtrsm_rutn_libstr(int m, int n, double alpha, struct d_strmat *sA, int ai, int aj, struct d_strmat *sB, int bi, int bj, struct d_strmat *sD, int di, int dj)
	{
	if(ai!=0 | bi!=0 | di!=0 | alpha!=1.0)
		{
		printf("\ndtrsm_rutn_libstr: feature not implemented yet: ai=%d, bi=%d, di=%d, alpha=%f\n", ai, bi, di, alpha);
		exit(1);
		}
	const int ps = 4;
	// TODO alpha
	int sda = sA->cn;
	int sdb = sB->cn;
	int sdd = sD->cn;
	double *pA = sA->pA + aj*ps;
	double *pB = sB->pA + bj*ps;
	double *pD = sD->pA + dj*ps;
	double *dA = sA->dA;
	int ii;
	if(ai==0 & aj==0)
		{
		if(sA->use_dA!=1)
			{
			ddiaex_lib(n, 1.0, ai, pA, sda, dA);
			for(ii=0; ii<n; ii++)
				dA[ii] = 1.0 / dA[ii];
			sA->use_dA = 1;
			}
		}
	else
		{
		ddiaex_lib(n, 1.0, ai, pA, sda, dA);
		for(ii=0; ii<n; ii++)
			dA[ii] = 1.0 / dA[ii];
		sA->use_dA = 0;
		}
	dtrsm_nt_ru_inv_lib(m, n, pA, sda, dA, pB, sdb, pD, sdd);
	return;
	}



// dtrmm_right_upper_transposed_notunit (B, i.e. the first matrix, is triangular !!!)
void dtrmm_rutn_libstr(int m, int n, double alpha, struct d_strmat *sB, int bi, int bj, struct d_strmat *sA, int ai, int aj, struct d_strmat *sD, int di, int dj)
	{
	if(ai!=0 | bi!=0 | di!=0)
		{
		printf("\ndtrmm_rutn_libstr: feature not implemented yet: ai=%d, bi=%d, di=%d\n", ai, bi, di);
		exit(1);
		}
	const int ps = 4;
	int sda = sA->cn;
	int sdb = sB->cn;
	int sdd = sD->cn;
	double *pA = sA->pA + aj*ps;
	double *pB = sB->pA + bj*ps;
	double *pD = sD->pA + dj*ps;
	dtrmm_nt_ru_lib(m, n, alpha, pA, sda, pB, sdb, 0.0, pD, sdd, pD, sdd);
	return;
	}
1985
1986
1987
1988// dtrmm_right_lower_nottransposed_notunit (B, i.e. the first matrix, is triangular !!!)
1989void dtrmm_rlnn_libstr(int m, int n, double alpha, struct d_strmat *sB, int bi, int bj, struct d_strmat *sA, int ai, int aj, struct d_strmat *sD, int di, int dj)
1990 {
1991
1992 const int ps = 4;
1993
1994 int sda = sA->cn;
1995 int sdb = sB->cn;
1996 int sdd = sD->cn;
1997 int air = ai & (ps-1);
1998 int bir = bi & (ps-1);
1999 double *pA = sA->pA + aj*ps + (ai-air)*sda;
2000 double *pB = sB->pA + bj*ps + (bi-bir)*sdb;
2001 double *pD = sD->pA + dj*ps;
2002
2003 int offsetB = bir;
2004
2005 int di0 = di-air;
2006 int offsetD;
2007 if(di0>=0)
2008 {
2009 pD += di0/ps*ps*sdd;
2010 offsetD = di0%ps;
2011 }
2012 else
2013 {
2014 pD += -4*sdd;
2015 offsetD = ps+di0;
2016 }
2017
2018 int ii, jj;
2019
2020 if(air!=0)
2021 {
2022 jj = 0;
2023 for(; jj<n; jj+=4)
2024 {
2025 kernel_dtrmm_nn_rl_4x4_gen_lib4(n-jj, &alpha, &pA[jj*ps], offsetB, &pB[jj*sdb+jj*ps], sdb, offsetD, &pD[jj*ps], sdd, air, air+m, 0, n-jj);
2026 }
2027 m -= ps-air;
2028 pA += ps*sda;
2029 pD += ps*sdd;
2030 }
2031 ii = 0;
2032 if(offsetD==0)
2033 {
2034#if defined(TARGET_X64_INTEL_HASWELL)
2035 for(; ii<m-11; ii+=12)
2036 {
2037 jj = 0;
2038 for(; jj<n-5; jj+=4)
2039 {
2040 kernel_dtrmm_nn_rl_12x4_lib4(n-jj, &alpha, &pA[ii*sda+jj*ps], sda, offsetB, &pB[jj*sdb+jj*ps], sdb, &pD[ii*sdd+jj*ps], sdd); // n-j>=6 !!!!!
2041 }
2042 for(; jj<n; jj+=4)
2043 {
2044 kernel_dtrmm_nn_rl_12x4_vs_lib4(n-jj, &alpha, &pA[ii*sda+jj*ps], sda, offsetB, &pB[jj*sdb+jj*ps], sdb, &pD[ii*sdd+jj*ps], sdd, 12, n-jj);
2045// kernel_dtrmm_nn_rl_8x4_vs_lib4(n-jj, &alpha, &pA[ii*sda+jj*ps], sda, offsetB, &pB[jj*sdb+jj*ps], sdb, &pD[ii*sdd+jj*ps], sdd, 8, n-jj);
2046// kernel_dtrmm_nn_rl_4x4_gen_lib4(n-jj, &alpha, &pA[(ii+8)*sda+jj*ps], offsetB, &pB[jj*sdb+jj*ps], sdb, 0, &pD[(ii+8)*sdd+jj*ps], sdd, 0, 4, 0, n-jj);
2047 }
2048 }
2049 if(ii<m)
2050 {
2051 if(ii<m-8)
2052 goto left_12;
2053 else if(ii<m-4)
2054 goto left_8;
2055 else
2056 goto left_4_gen;
2057 }
2058#elif defined(TARGET_X64_INTEL_SANDY_BRIDGE)
2059 for(; ii<m-7; ii+=8)
2060 {
2061 jj = 0;
2062 for(; jj<n-5; jj+=4)
2063 {
2064 kernel_dtrmm_nn_rl_8x4_lib4(n-jj, &alpha, &pA[ii*sda+jj*ps], sda, offsetB, &pB[jj*sdb+jj*ps], sdb, &pD[ii*sdd+jj*ps], sdd);
2065 }
2066 for(; jj<n; jj+=4)
2067 {
2068 kernel_dtrmm_nn_rl_8x4_gen_lib4(n-jj, &alpha, &pA[ii*sda+jj*ps], sda, offsetB, &pB[jj*sdb+jj*ps], sdb, 0, &pD[ii*sdd+jj*ps], sdd, 0, 8, 0, n-jj);
2069 }
2070 }
2071 if(ii<m)
2072 {
2073 if(ii<m-4)
2074 goto left_8_gen;
2075 else
2076 goto left_4_gen;
2077 }
2078#else
2079 for(; ii<m-3; ii+=4)
2080 {
2081 jj = 0;
2082 for(; jj<n-5; jj+=4)
2083 {
2084 kernel_dtrmm_nn_rl_4x4_lib4(n-jj, &alpha, &pA[ii*sda+jj*ps], offsetB, &pB[jj*sdb+jj*ps], sdb, &pD[ii*sdd+jj*ps]);
2085 }
2086 for(; jj<n; jj+=4)
2087 {
2088 kernel_dtrmm_nn_rl_4x4_gen_lib4(n-jj, &alpha, &pA[ii*sda+jj*ps], offsetB, &pB[jj*sdb+jj*ps], sdb, 0, &pD[ii*sdd+jj*ps], sdd, 0, 4, 0, n-jj);
2089 }
2090 }
2091 if(ii<m)
2092 {
2093 goto left_4_gen;
2094 }
2095#endif
2096 }
2097 else
2098 {
2099#if defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE)
2100 for(; ii<m-4; ii+=8)
2101 {
2102 jj = 0;
2103 for(; jj<n; jj+=4)
2104 {
2105 kernel_dtrmm_nn_rl_8x4_gen_lib4(n-jj, &alpha, &pA[ii*sda+jj*ps], sda, offsetB, &pB[jj*sdb+jj*ps], sdb, offsetD, &pD[ii*sdd+jj*ps], sdd, 0, m-ii, 0, n-jj);
2106 }
2107 }
2108 if(ii<m)
2109 {
2110 goto left_4_gen;
2111 }
2112#else
2113 for(; ii<m; ii+=4)
2114 {
2115 jj = 0;
2116 for(; jj<n; jj+=4)
2117 {
2118 kernel_dtrmm_nn_rl_4x4_gen_lib4(n-jj, &alpha, &pA[ii*sda+jj*ps], offsetB, &pB[jj*sdb+jj*ps], sdb, offsetD, &pD[ii*sdd+jj*ps], sdd, 0, m-ii, 0, n-jj);
2119 }
2120 }
2121#endif
2122 }

	// common return if ii==m
	return;

	// clean-up loops

#if defined(TARGET_X64_INTEL_HASWELL)
	left_12:
	jj = 0;
	for(; jj<n; jj+=4)
		{
		kernel_dtrmm_nn_rl_12x4_vs_lib4(n-jj, &alpha, &pA[ii*sda+jj*ps], sda, offsetB, &pB[jj*sdb+jj*ps], sdb, &pD[ii*sdd+jj*ps], sdd, m-ii, n-jj);
		}
	return;
#endif

#if defined(TARGET_X64_INTEL_HASWELL)
	left_8:
	jj = 0;
	for(; jj<n; jj+=4)
		{
		kernel_dtrmm_nn_rl_8x4_vs_lib4(n-jj, &alpha, &pA[ii*sda+jj*ps], sda, offsetB, &pB[jj*sdb+jj*ps], sdb, &pD[ii*sdd+jj*ps], sdd, m-ii, n-jj);
		}
	return;
#endif

#if defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE)
	left_8_gen:
	jj = 0;
	for(; jj<n; jj+=4)
		{
		kernel_dtrmm_nn_rl_8x4_gen_lib4(n-jj, &alpha, &pA[ii*sda+jj*ps], sda, offsetB, &pB[jj*sdb+jj*ps], sdb, offsetD, &pD[ii*sdd+jj*ps], sdd, 0, m-ii, 0, n-jj);
		}
	return;
#endif

	left_4_gen:
	jj = 0;
	for(; jj<n; jj+=4)
		{
		kernel_dtrmm_nn_rl_4x4_gen_lib4(n-jj, &alpha, &pA[ii*sda+jj*ps], offsetB, &pB[jj*sdb+jj*ps], sdb, offsetD, &pD[ii*sdd+jj*ps], sdd, 0, m-ii, 0, n-jj);
		}
	return;

	}
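
// Note on indexing: all pX matrices use the panel-major "lib4" layout: the matrix
// is stored as horizontal panels of ps=4 rows, each panel of sdx columns stored
// contiguously. Element (i,j) then lives at pX[(i/ps)*ps*sdx + j*ps + i%ps]; for a
// row index ii that is a multiple of 4, &pX[ii*sdx+jj*ps] is the top-left of the
// panel holding rows ii..ii+3 of column jj. E.g. with sda=8, element (5,2) of A
// sits at pA[1*4*8 + 2*4 + 1] = pA[41].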
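
// dsyrk, lower triangular: compute the lower-triangular part of the m x m result
// D <= beta*C + alpha*A*B^T, with A and B m x k; dgemm kernels are used below the
// diagonal, dsyrk kernels on the diagonal blocks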
void dsyrk_ln_libstr(int m, int k, double alpha, struct d_strmat *sA, int ai, int aj, struct d_strmat *sB, int bi, int bj, double beta, struct d_strmat *sC, int ci, int cj, struct d_strmat *sD, int di, int dj)
	{

	if(m<=0)
		return;

	if(ai!=0 | bi!=0)
		{
		printf("\ndsyrk_ln_libstr: feature not implemented yet: ai=%d, bi=%d\n", ai, bi);
		exit(1);
		}

	const int ps = 4;

	int i, j;

	int sda = sA->cn;
	int sdb = sB->cn;
	int sdc = sC->cn;
	int sdd = sD->cn;
	double *pA = sA->pA + aj*ps;
	double *pB = sB->pA + bj*ps;
	// align pC/pD to the top of the 4-row panel containing row ci/di
	double *pC = sC->pA + cj*ps + (ci-(ci&(ps-1)))*sdc;
	double *pD = sD->pA + dj*ps + (di-(di&(ps-1)))*sdd;

	// TODO ai and bi
	// row offsets of C and D inside their 4-row panels (0..3)
	int offsetC;
	int offsetD;
	offsetC = ci&(ps-1);
	offsetD = di&(ps-1);

	// main loop
	i = 0;
	if(offsetC==0 & offsetD==0)
		{
#if defined(TARGET_X64_INTEL_HASWELL)
		for(; i<m-11; i+=12)
			{
			j = 0;
			// dgemm kernels below the diagonal
			for(; j<i; j+=4)
				{
				kernel_dgemm_nt_12x4_lib4(k, &alpha, &pA[i*sda], sda, &pB[j*sdb], &beta, &pC[j*ps+i*sdc], sdc, &pD[j*ps+i*sdd], sdd);
				}
			// dsyrk kernels on the diagonal
			kernel_dsyrk_nt_l_12x4_lib4(k, &alpha, &pA[i*sda], sda, &pB[j*sdb], &beta, &pC[j*ps+i*sdc], sdc, &pD[j*ps+i*sdd], sdd);
			kernel_dsyrk_nt_l_8x8_lib4(k, &alpha, &pA[(i+4)*sda], sda, &pB[(j+4)*sdb], sdb, &beta, &pC[(j+4)*ps+(i+4)*sdc], sdc, &pD[(j+4)*ps+(i+4)*sdd], sdd);
			}
		if(m>i)
			{
			if(m-i<=4)
				{
				goto left_4;
				}
			else if(m-i<=8)
				{
				goto left_8;
				}
			else
				{
				goto left_12;
				}
			}
#elif defined(TARGET_X64_INTEL_SANDY_BRIDGE)
		for(; i<m-7; i+=8)
			{
			j = 0;
			for(; j<i; j+=4)
				{
				kernel_dgemm_nt_8x4_lib4(k, &alpha, &pA[i*sda], sda, &pB[j*sdb], &beta, &pC[j*ps+i*sdc], sdc, &pD[j*ps+i*sdd], sdd);
				}
			kernel_dsyrk_nt_l_8x4_lib4(k, &alpha, &pA[i*sda], sda, &pB[j*sdb], &beta, &pC[j*ps+i*sdc], sdc, &pD[j*ps+i*sdd], sdd);
			kernel_dsyrk_nt_l_4x4_lib4(k, &alpha, &pA[(i+4)*sda], &pB[(j+4)*sdb], &beta, &pC[(j+4)*ps+(i+4)*sdc], &pD[(j+4)*ps+(i+4)*sdd]);
			}
		if(m>i)
			{
			if(m-i<=4)
				{
				goto left_4;
				}
			else
				{
				goto left_8;
				}
			}
#else
		for(; i<m-3; i+=4)
			{
			j = 0;
			for(; j<i; j+=4)
				{
				kernel_dgemm_nt_4x4_lib4(k, &alpha, &pA[i*sda], &pB[j*sdb], &beta, &pC[j*ps+i*sdc], &pD[j*ps+i*sdd]);
				}
			kernel_dsyrk_nt_l_4x4_lib4(k, &alpha, &pA[i*sda], &pB[j*sdb], &beta, &pC[j*ps+i*sdc], &pD[j*ps+i*sdd]);
			}
		if(m>i)
			{
			goto left_4;
			}
#endif
		}
	else // C and/or D not aligned to a panel boundary: use the _gen kernels
		{
#if defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE)
		for(; i<m-4; i+=8)
			{
			j = 0;
			for(; j<i; j+=4)
				{
				kernel_dgemm_nt_8x4_gen_lib4(k, &alpha, &pA[i*sda], sda, &pB[j*sdb], &beta, offsetC, &pC[j*ps+i*sdc], sdc, offsetD, &pD[j*ps+i*sdd], sdd, 0, m-i, 0, m-j);
				}
			kernel_dsyrk_nt_l_8x4_gen_lib4(k, &alpha, &pA[i*sda], sda, &pB[j*sdb], &beta, offsetC, &pC[j*ps+i*sdc], sdc, offsetD, &pD[j*ps+i*sdd], sdd, 0, m-i, 0, m-j);
			kernel_dsyrk_nt_l_4x4_gen_lib4(k, &alpha, &pA[(i+4)*sda], &pB[(j+4)*sdb], &beta, offsetC, &pC[(j+4)*ps+(i+4)*sdc], sdc, offsetD, &pD[(j+4)*ps+(i+4)*sdd], sdd, 0, m-i-4, 0, m-j-4);
			}
		if(m>i)
			{
			goto left_4_gen;
			}
#else
		for(; i<m; i+=4)
			{
			j = 0;
			for(; j<i; j+=4)
				{
				kernel_dgemm_nt_4x4_gen_lib4(k, &alpha, &pA[i*sda], &pB[j*sdb], &beta, offsetC, &pC[j*ps+i*sdc], sdc, offsetD, &pD[j*ps+i*sdd], sdd, 0, m-i, 0, m-j);
				}
			kernel_dsyrk_nt_l_4x4_gen_lib4(k, &alpha, &pA[i*sda], &pB[j*sdb], &beta, offsetC, &pC[j*ps+i*sdc], sdc, offsetD, &pD[j*ps+i*sdd], sdd, 0, m-i, 0, m-j);
			}
#endif
		}

	// common return if i==m
	return;

	// clean-up loops

#if defined(TARGET_X64_INTEL_HASWELL)
	left_12:
	j = 0;
	for(; j<i; j+=4)
		{
		kernel_dgemm_nt_12x4_vs_lib4(k, &alpha, &pA[i*sda], sda, &pB[j*sdb], &beta, &pC[j*ps+i*sdc], sdc, &pD[j*ps+i*sdd], sdd, m-i, m-j);
		}
	kernel_dsyrk_nt_l_12x4_vs_lib4(k, &alpha, &pA[i*sda], sda, &pB[j*sdb], &beta, &pC[j*ps+i*sdc], sdc, &pD[j*ps+i*sdd], sdd, m-i, m-j);
	kernel_dsyrk_nt_l_8x8_vs_lib4(k, &alpha, &pA[(i+4)*sda], sda, &pB[(j+4)*sdb], sdb, &beta, &pC[(j+4)*ps+(i+4)*sdc], sdc, &pD[(j+4)*ps+(i+4)*sdd], sdd, m-i-4, m-j-4);
//	kernel_dsyrk_nt_l_4x4_vs_lib4(k, &alpha, &pA[(i+8)*sda], &pB[(j+8)*sdb], &beta, &pC[(j+8)*ps+(i+8)*sdc], &pD[(j+8)*ps+(i+8)*sdd], m-i-8, n-j-8);
	return;
#endif

#if defined(TARGET_X64_INTEL_HASWELL)
	left_8:
	j = 0;
	for(; j<i-8; j+=12)
		{
		kernel_dgemm_nt_8x8l_vs_lib4(k, &alpha, &pA[i*sda], sda, &pB[j*sdb], sdb, &beta, &pC[j*ps+i*sdc], sdc, &pD[j*ps+i*sdd], sdd, m-i, m-j);
		kernel_dgemm_nt_8x8u_vs_lib4(k, &alpha, &pA[i*sda], sda, &pB[(j+4)*sdb], sdb, &beta, &pC[(j+4)*ps+i*sdc], sdc, &pD[(j+4)*ps+i*sdd], sdd, m-i, m-(j+4));
		}
	if(j<i-4)
		{
		kernel_dgemm_nt_8x8l_vs_lib4(k, &alpha, &pA[i*sda], sda, &pB[j*sdb], sdb, &beta, &pC[j*ps+i*sdc], sdc, &pD[j*ps+i*sdd], sdd, m-i, m-j);
		kernel_dgemm_nt_4x4_vs_lib4(k, &alpha, &pA[i*sda], &pB[(j+4)*sdb], &beta, &pC[(j+4)*ps+i*sdc], &pD[(j+4)*ps+i*sdd], m-i, m-(j+4));
		j += 8;
		}
	else if(j<i)
		{
		kernel_dgemm_nt_8x4_vs_lib4(k, &alpha, &pA[i*sda], sda, &pB[j*sdb], &beta, &pC[j*ps+i*sdc], sdc, &pD[j*ps+i*sdd], sdd, m-i, m-j);
		j += 4;
		}
	kernel_dsyrk_nt_l_8x8_vs_lib4(k, &alpha, &pA[i*sda], sda, &pB[j*sdb], sdb, &beta, &pC[j*ps+i*sdc], sdc, &pD[j*ps+i*sdd], sdd, m-i, m-j);
//	kernel_dsyrk_nt_l_4x4_vs_lib4(k, &alpha, &pA[(i+4)*sda], &pB[(j+4)*sdb], &beta, &pC[(j+4)*ps+(i+4)*sdc], &pD[(j+4)*ps+(i+4)*sdd], m-i-4, n-j-4);
	return;
#elif defined(TARGET_X64_INTEL_SANDY_BRIDGE)
	left_8:
	j = 0;
	for(; j<i; j+=4)
		{
		kernel_dgemm_nt_8x4_vs_lib4(k, &alpha, &pA[i*sda], sda, &pB[j*sdb], &beta, &pC[j*ps+i*sdc], sdc, &pD[j*ps+i*sdd], sdd, m-i, m-j);
		}
	kernel_dsyrk_nt_l_8x4_vs_lib4(k, &alpha, &pA[i*sda], sda, &pB[j*sdb], &beta, &pC[j*ps+i*sdc], sdc, &pD[j*ps+i*sdd], sdd, m-i, m-j);
	kernel_dsyrk_nt_l_4x4_vs_lib4(k, &alpha, &pA[(i+4)*sda], &pB[(j+4)*sdb], &beta, &pC[(j+4)*ps+(i+4)*sdc], &pD[(j+4)*ps+(i+4)*sdd], m-i-4, m-j-4);
	return;
#endif

#if defined(TARGET_X64_INTEL_HASWELL)
	left_4:
	j = 0;
	for(; j<i-8; j+=12)
		{
		kernel_dgemm_nt_4x12_vs_lib4(k, &alpha, &pA[i*sda], &pB[j*sdb], sdb, &beta, &pC[j*ps+i*sdc], &pD[j*ps+i*sdd], m-i, m-j);
		}
	if(j<i-4)
		{
		kernel_dgemm_nt_4x8_vs_lib4(k, &alpha, &pA[i*sda], &pB[j*sdb], sdb, &beta, &pC[j*ps+i*sdc], &pD[j*ps+i*sdd], m-i, m-j);
		j += 8;
		}
	else if(j<i)
		{
		kernel_dgemm_nt_4x4_vs_lib4(k, &alpha, &pA[i*sda], &pB[j*sdb], &beta, &pC[j*ps+i*sdc], &pD[j*ps+i*sdd], m-i, m-j);
		j += 4;
		}
	kernel_dsyrk_nt_l_4x4_vs_lib4(k, &alpha, &pA[i*sda], &pB[j*sdb], &beta, &pC[j*ps+i*sdc], &pD[j*ps+i*sdd], m-i, m-j);
	return;
#else
	left_4:
	j = 0;
	for(; j<i; j+=4)
		{
		kernel_dgemm_nt_4x4_vs_lib4(k, &alpha, &pA[i*sda], &pB[j*sdb], &beta, &pC[j*ps+i*sdc], &pD[j*ps+i*sdd], m-i, m-j);
		}
	kernel_dsyrk_nt_l_4x4_vs_lib4(k, &alpha, &pA[i*sda], &pB[j*sdb], &beta, &pC[j*ps+i*sdc], &pD[j*ps+i*sdd], m-i, m-j);
	return;
#endif

	left_4_gen:
	j = 0;
	for(; j<i; j+=4)
		{
		kernel_dgemm_nt_4x4_gen_lib4(k, &alpha, &pA[i*sda], &pB[j*sdb], &beta, offsetC, &pC[j*ps+i*sdc], sdc, offsetD, &pD[j*ps+i*sdd], sdd, 0, m-i, 0, m-j);
		}
	kernel_dsyrk_nt_l_4x4_gen_lib4(k, &alpha, &pA[i*sda], &pB[j*sdb], &beta, offsetC, &pC[j*ps+i*sdc], sdc, offsetD, &pD[j*ps+i*sdd], sdd, 0, m-i, 0, m-j);
	return;

	}
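
/*
 * Minimal usage sketch (an illustration, not part of the library). It assumes the
 * d_strmat helpers d_allocate_strmat(), d_cvt_mat2strmat() and d_free_strmat()
 * from blasfeo_d_aux, and computes the lower-triangular part of D = A*A^T:
 *
 *	int m = 8, k = 5;
 *	double A[8*5]; // column-major input, filled by the caller
 *	struct d_strmat sA, sD;
 *	d_allocate_strmat(m, k, &sA);
 *	d_allocate_strmat(m, m, &sD);
 *	d_cvt_mat2strmat(m, k, A, m, &sA, 0, 0); // column-major -> panel-major
 *	dsyrk_ln_libstr(m, k, 1.0, &sA, 0, 0, &sA, 0, 0, 0.0, &sD, 0, 0, &sD, 0, 0);
 *	d_free_strmat(&sA);
 *	d_free_strmat(&sD);
 */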
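
// dsyrk, lower triangular, m x n result: as dsyrk_ln_libstr, but only the first n
// columns of the lower-triangular part of D <= beta*C + alpha*A*B^T are computed
// (trapezoidal result when n<m)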
void dsyrk_ln_mn_libstr(int m, int n, int k, double alpha, struct d_strmat *sA, int ai, int aj, struct d_strmat *sB, int bi, int bj, double beta, struct d_strmat *sC, int ci, int cj, struct d_strmat *sD, int di, int dj)
	{

	if(m<=0 | n<=0)
		return;

	if(ai!=0 | bi!=0)
		{
		printf("\ndsyrk_ln_mn_libstr: feature not implemented yet: ai=%d, bi=%d\n", ai, bi);
		exit(1);
		}

	const int ps = 4;

	int i, j;

	int sda = sA->cn;
	int sdb = sB->cn;
	int sdc = sC->cn;
	int sdd = sD->cn;
	double *pA = sA->pA + aj*ps;
	double *pB = sB->pA + bj*ps;
	double *pC = sC->pA + cj*ps + (ci-(ci&(ps-1)))*sdc;
	double *pD = sD->pA + dj*ps + (di-(di&(ps-1)))*sdd;

	// TODO ai and bi
	// row offsets of C and D inside their 4-row panels (0..3)
	int offsetC;
	int offsetD;
	offsetC = ci&(ps-1);
	offsetD = di&(ps-1);

	// main loop
	i = 0;
	if(offsetC==0 & offsetD==0)
		{
#if defined(TARGET_X64_INTEL_HASWELL)
		for(; i<m-11; i+=12)
			{
			j = 0;
			for(; j<i & j<n-3; j+=4)
				{
				kernel_dgemm_nt_12x4_lib4(k, &alpha, &pA[i*sda], sda, &pB[j*sdb], &beta, &pC[j*ps+i*sdc], sdc, &pD[j*ps+i*sdd], sdd);
				}
			if(j<n)
				{
				if(j<i) // dgemm
					{
					kernel_dgemm_nt_12x4_vs_lib4(k, &alpha, &pA[i*sda], sda, &pB[j*sdb], &beta, &pC[j*ps+i*sdc], sdc, &pD[j*ps+i*sdd], sdd, m-i, n-j);
					}
				else // dsyrk
					{
					if(j<n-11)
						{
						kernel_dsyrk_nt_l_12x4_lib4(k, &alpha, &pA[i*sda], sda, &pB[j*sdb], &beta, &pC[j*ps+i*sdc], sdc, &pD[j*ps+i*sdd], sdd);
						kernel_dsyrk_nt_l_8x8_lib4(k, &alpha, &pA[(i+4)*sda], sda, &pB[(j+4)*sdb], sdb, &beta, &pC[(j+4)*ps+(i+4)*sdc], sdc, &pD[(j+4)*ps+(i+4)*sdd], sdd);
						}
					else
						{
						kernel_dsyrk_nt_l_12x4_vs_lib4(k, &alpha, &pA[i*sda], sda, &pB[j*sdb], &beta, &pC[j*ps+i*sdc], sdc, &pD[j*ps+i*sdd], sdd, m-i, n-j);
						if(j<n-4)
							{
							kernel_dsyrk_nt_l_8x4_vs_lib4(k, &alpha, &pA[(i+4)*sda], sda, &pB[(j+4)*sdb], &beta, &pC[(j+4)*ps+(i+4)*sdc], sdc, &pD[(j+4)*ps+(i+4)*sdd], sdd, m-i-4, n-j-4);
							if(j<n-8)
								{
								kernel_dsyrk_nt_l_4x4_vs_lib4(k, &alpha, &pA[(i+8)*sda], &pB[(j+8)*sdb], &beta, &pC[(j+8)*ps+(i+8)*sdc], &pD[(j+8)*ps+(i+8)*sdd], m-i-8, n-j-8);
								}
							}
						}
					}
				}
			}
		if(m>i)
			{
			if(m-i<=4)
				{
				goto left_4;
				}
			else if(m-i<=8)
				{
				goto left_8;
				}
			else
				{
				goto left_12;
				}
			}
#elif defined(TARGET_X64_INTEL_SANDY_BRIDGE)
		for(; i<m-7; i+=8)
			{
			j = 0;
			for(; j<i & j<n-3; j+=4)
				{
				kernel_dgemm_nt_8x4_lib4(k, &alpha, &pA[i*sda], sda, &pB[j*sdb], &beta, &pC[j*ps+i*sdc], sdc, &pD[j*ps+i*sdd], sdd);
				}
			if(j<n)
				{
				if(j<i) // dgemm
					{
					kernel_dgemm_nt_8x4_vs_lib4(k, &alpha, &pA[i*sda], sda, &pB[j*sdb], &beta, &pC[j*ps+i*sdc], sdc, &pD[j*ps+i*sdd], sdd, m-i, n-j);
					}
				else // dsyrk
					{
					if(j<n-7)
						{
						kernel_dsyrk_nt_l_8x4_lib4(k, &alpha, &pA[i*sda], sda, &pB[j*sdb], &beta, &pC[j*ps+i*sdc], sdc, &pD[j*ps+i*sdd], sdd);
						kernel_dsyrk_nt_l_4x4_lib4(k, &alpha, &pA[(i+4)*sda], &pB[(j+4)*sdb], &beta, &pC[(j+4)*ps+(i+4)*sdc], &pD[(j+4)*ps+(i+4)*sdd]);
						}
					else
						{
						kernel_dsyrk_nt_l_8x4_vs_lib4(k, &alpha, &pA[i*sda], sda, &pB[j*sdb], &beta, &pC[j*ps+i*sdc], sdc, &pD[j*ps+i*sdd], sdd, m-i, n-j);
						if(j<n-4)
							{
							kernel_dsyrk_nt_l_4x4_vs_lib4(k, &alpha, &pA[(i+4)*sda], &pB[(j+4)*sdb], &beta, &pC[(j+4)*ps+(i+4)*sdc], &pD[(j+4)*ps+(i+4)*sdd], m-i-4, n-j-4);
							}
						}
					}
				}
			}
		if(m>i)
			{
			if(m-i<=4)
				{
				goto left_4;
				}
			else
				{
				goto left_8;
				}
			}
#else
		for(; i<m-3; i+=4)
			{
			j = 0;
			for(; j<i & j<n-3; j+=4)
				{
				kernel_dgemm_nt_4x4_lib4(k, &alpha, &pA[i*sda], &pB[j*sdb], &beta, &pC[j*ps+i*sdc], &pD[j*ps+i*sdd]);
				}
			if(j<n)
				{
				if(j<i) // dgemm
					{
					kernel_dgemm_nt_4x4_vs_lib4(k, &alpha, &pA[i*sda], &pB[j*sdb], &beta, &pC[j*ps+i*sdc], &pD[j*ps+i*sdd], m-i, n-j);
					}
				else // dsyrk
					{
					if(j<n-3)
						{
						kernel_dsyrk_nt_l_4x4_lib4(k, &alpha, &pA[i*sda], &pB[j*sdb], &beta, &pC[j*ps+i*sdc], &pD[j*ps+i*sdd]);
						}
					else
						{
						kernel_dsyrk_nt_l_4x4_vs_lib4(k, &alpha, &pA[i*sda], &pB[j*sdb], &beta, &pC[j*ps+i*sdc], &pD[j*ps+i*sdd], m-i, n-j);
						}
					}
				}
			}
		if(m>i)
			{
			goto left_4;
			}
#endif
		}
	else // C and/or D not aligned to a panel boundary: use the _gen kernels
		{
#if defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE)
		for(; i<m-4; i+=8)
			{
			j = 0;
			for(; j<i & j<n; j+=4)
				{
				kernel_dgemm_nt_8x4_gen_lib4(k, &alpha, &pA[i*sda], sda, &pB[j*sdb], &beta, offsetC, &pC[j*ps+i*sdc], sdc, offsetD, &pD[j*ps+i*sdd], sdd, 0, m-i, 0, n-j);
				}
			if(j<n)
				{
				kernel_dsyrk_nt_l_8x4_gen_lib4(k, &alpha, &pA[i*sda], sda, &pB[j*sdb], &beta, offsetC, &pC[j*ps+i*sdc], sdc, offsetD, &pD[j*ps+i*sdd], sdd, 0, m-i, 0, n-j);
				if(j<n-4)
					{
					kernel_dsyrk_nt_l_4x4_gen_lib4(k, &alpha, &pA[(i+4)*sda], &pB[(j+4)*sdb], &beta, offsetC, &pC[(j+4)*ps+(i+4)*sdc], sdc, offsetD, &pD[(j+4)*ps+(i+4)*sdd], sdd, 0, m-i-4, 0, n-j-4);
					}
				}
			}
		if(m>i)
			{
			goto left_4_gen;
			}
#else
		for(; i<m; i+=4)
			{
			j = 0;
			for(; j<i & j<n; j+=4)
				{
				kernel_dgemm_nt_4x4_gen_lib4(k, &alpha, &pA[i*sda], &pB[j*sdb], &beta, offsetC, &pC[j*ps+i*sdc], sdc, offsetD, &pD[j*ps+i*sdd], sdd, 0, m-i, 0, n-j);
				}
			if(j<n)
				{
				kernel_dsyrk_nt_l_4x4_gen_lib4(k, &alpha, &pA[i*sda], &pB[j*sdb], &beta, offsetC, &pC[j*ps+i*sdc], sdc, offsetD, &pD[j*ps+i*sdd], sdd, 0, m-i, 0, n-j);
				}
			}
#endif
		}

	// common return if i==m
	return;

	// clean-up loops

#if defined(TARGET_X64_INTEL_HASWELL)
	left_12:
	j = 0;
	for(; j<i & j<n; j+=4)
		{
		kernel_dgemm_nt_12x4_vs_lib4(k, &alpha, &pA[i*sda], sda, &pB[j*sdb], &beta, &pC[j*ps+i*sdc], sdc, &pD[j*ps+i*sdd], sdd, m-i, n-j);
		}
	if(j<n)
		{
		kernel_dsyrk_nt_l_12x4_vs_lib4(k, &alpha, &pA[i*sda], sda, &pB[j*sdb], &beta, &pC[j*ps+i*sdc], sdc, &pD[j*ps+i*sdd], sdd, m-i, n-j);
		if(j<n-4)
			{
			kernel_dsyrk_nt_l_8x4_vs_lib4(k, &alpha, &pA[(i+4)*sda], sda, &pB[(j+4)*sdb], &beta, &pC[(j+4)*ps+(i+4)*sdc], sdc, &pD[(j+4)*ps+(i+4)*sdd], sdd, m-i-4, n-j-4);
			if(j<n-8)
				{
				kernel_dsyrk_nt_l_4x4_vs_lib4(k, &alpha, &pA[(i+8)*sda], &pB[(j+8)*sdb], &beta, &pC[(j+8)*ps+(i+8)*sdc], &pD[(j+8)*ps+(i+8)*sdd], m-i-8, n-j-8);
				}
			}
		}
	return;
#endif

#if defined(TARGET_X64_INTEL_HASWELL)
	left_8:
	j = 0;
	for(; j<i-8 & j<n-8; j+=12)
		{
		kernel_dgemm_nt_8x8l_vs_lib4(k, &alpha, &pA[i*sda], sda, &pB[j*sdb], sdb, &beta, &pC[j*ps+i*sdc], sdc, &pD[j*ps+i*sdd], sdd, m-i, n-j);
		kernel_dgemm_nt_8x8u_vs_lib4(k, &alpha, &pA[i*sda], sda, &pB[(j+4)*sdb], sdb, &beta, &pC[(j+4)*ps+i*sdc], sdc, &pD[(j+4)*ps+i*sdd], sdd, m-i, n-(j+4));
		}
	if(j<i-4 & j<n-4)
		{
		kernel_dgemm_nt_8x8l_vs_lib4(k, &alpha, &pA[i*sda], sda, &pB[j*sdb], sdb, &beta, &pC[j*ps+i*sdc], sdc, &pD[j*ps+i*sdd], sdd, m-i, n-j);
		kernel_dgemm_nt_4x4_vs_lib4(k, &alpha, &pA[i*sda], &pB[(j+4)*sdb], &beta, &pC[(j+4)*ps+i*sdc], &pD[(j+4)*ps+i*sdd], m-i, n-(j+4));
		j += 8;
		}
	if(j<i & j<n)
		{
		kernel_dgemm_nt_8x4_vs_lib4(k, &alpha, &pA[i*sda], sda, &pB[j*sdb], &beta, &pC[j*ps+i*sdc], sdc, &pD[j*ps+i*sdd], sdd, m-i, n-j);
		j += 4;
		}
	if(j<n)
		{
		kernel_dsyrk_nt_l_8x4_vs_lib4(k, &alpha, &pA[i*sda], sda, &pB[j*sdb], &beta, &pC[j*ps+i*sdc], sdc, &pD[j*ps+i*sdd], sdd, m-i, n-j);
		if(j<n-4)
			{
			kernel_dsyrk_nt_l_4x4_vs_lib4(k, &alpha, &pA[(i+4)*sda], &pB[(j+4)*sdb], &beta, &pC[(j+4)*ps+(i+4)*sdc], &pD[(j+4)*ps+(i+4)*sdd], m-i-4, n-j-4);
			}
		}
	return;
#elif defined(TARGET_X64_INTEL_SANDY_BRIDGE)
	left_8:
	j = 0;
	for(; j<i & j<n; j+=4)
		{
		kernel_dgemm_nt_8x4_vs_lib4(k, &alpha, &pA[i*sda], sda, &pB[j*sdb], &beta, &pC[j*ps+i*sdc], sdc, &pD[j*ps+i*sdd], sdd, m-i, n-j);
		}
	if(j<n)
		{
		kernel_dsyrk_nt_l_8x4_vs_lib4(k, &alpha, &pA[i*sda], sda, &pB[j*sdb], &beta, &pC[j*ps+i*sdc], sdc, &pD[j*ps+i*sdd], sdd, m-i, n-j);
		if(j<n-4)
			{
			kernel_dsyrk_nt_l_4x4_vs_lib4(k, &alpha, &pA[(i+4)*sda], &pB[(j+4)*sdb], &beta, &pC[(j+4)*ps+(i+4)*sdc], &pD[(j+4)*ps+(i+4)*sdd], m-i-4, n-j-4);
			}
		}
	return;
#endif

#if defined(TARGET_X64_INTEL_HASWELL)
	left_4:
	j = 0;
	for(; j<i-8 & j<n-8; j+=12)
		{
		kernel_dgemm_nt_4x12_vs_lib4(k, &alpha, &pA[i*sda], &pB[j*sdb], sdb, &beta, &pC[j*ps+i*sdc], &pD[j*ps+i*sdd], m-i, n-j);
		}
	if(j<i-4 & j<n-4)
		{
		kernel_dgemm_nt_4x8_vs_lib4(k, &alpha, &pA[i*sda], &pB[j*sdb], sdb, &beta, &pC[j*ps+i*sdc], &pD[j*ps+i*sdd], m-i, n-j);
		j += 8;
		}
	else if(j<i & j<n)
		{
		kernel_dgemm_nt_4x4_vs_lib4(k, &alpha, &pA[i*sda], &pB[j*sdb], &beta, &pC[j*ps+i*sdc], &pD[j*ps+i*sdd], m-i, n-j);
		j += 4;
		}
	if(j<n)
		{
		kernel_dsyrk_nt_l_4x4_vs_lib4(k, &alpha, &pA[i*sda], &pB[j*sdb], &beta, &pC[j*ps+i*sdc], &pD[j*ps+i*sdd], m-i, n-j);
		}
	return;
#else
	left_4:
	j = 0;
	for(; j<i & j<n; j+=4)
		{
		kernel_dgemm_nt_4x4_vs_lib4(k, &alpha, &pA[i*sda], &pB[j*sdb], &beta, &pC[j*ps+i*sdc], &pD[j*ps+i*sdd], m-i, n-j);
		}
	if(j<n)
		{
		kernel_dsyrk_nt_l_4x4_vs_lib4(k, &alpha, &pA[i*sda], &pB[j*sdb], &beta, &pC[j*ps+i*sdc], &pD[j*ps+i*sdd], m-i, n-j);
		}
	return;
#endif

	left_4_gen:
	j = 0;
	for(; j<i & j<n; j+=4)
		{
		kernel_dgemm_nt_4x4_gen_lib4(k, &alpha, &pA[i*sda], &pB[j*sdb], &beta, offsetC, &pC[j*ps+i*sdc], sdc, offsetD, &pD[j*ps+i*sdd], sdd, 0, m-i, 0, n-j);
		}
	if(j<n)
		{
		kernel_dsyrk_nt_l_4x4_gen_lib4(k, &alpha, &pA[i*sda], &pB[j*sdb], &beta, offsetC, &pC[j*ps+i*sdc], sdc, offsetD, &pD[j*ps+i*sdd], sdd, 0, m-i, 0, n-j);
		}
	return;

	}
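
// Illustrative call (not part of the library): a factor-update pattern computing
// the first n columns of the lower-triangular part of D = C - A*B^T, assuming all
// submatrices start at element (0,0) of their d_strmat containers:
//	dsyrk_ln_mn_libstr(m, n, k, -1.0, &sA, 0, 0, &sB, 0, 0, 1.0, &sC, 0, 0, &sD, 0, 0);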


#else

#error : wrong LA choice

#endif