/**************************************************************************************************
*                                                                                                 *
* This file is part of BLASFEO.                                                                   *
*                                                                                                 *
* BLASFEO -- BLAS For Embedded Optimization.                                                      *
* Copyright (C) 2016-2017 by Gianluca Frison.                                                     *
* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl.              *
* All rights reserved.                                                                            *
*                                                                                                 *
* HPMPC is free software; you can redistribute it and/or                                          *
* modify it under the terms of the GNU Lesser General Public                                      *
* License as published by the Free Software Foundation; either                                    *
* version 2.1 of the License, or (at your option) any later version.                              *
*                                                                                                 *
* HPMPC is distributed in the hope that it will be useful,                                        *
* but WITHOUT ANY WARRANTY; without even the implied warranty of                                  *
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.                                            *
* See the GNU Lesser General Public License for more details.                                     *
*                                                                                                 *
* You should have received a copy of the GNU Lesser General Public                                *
* License along with HPMPC; if not, write to the Free Software                                    *
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA                    *
*                                                                                                 *
* Author: Gianluca Frison, giaf (at) dtu.dk                                                       *
*         gianluca.frison (at) imtek.uni-freiburg.de                                              *
*                                                                                                 *
**************************************************************************************************/

#include <stdlib.h>
// <stdio.h> is needed unconditionally: printf is also used in the
// unimplemented-feature error paths below, not only under DIM_CHECK
#include <stdio.h>

#include "../include/blasfeo_common.h"
#include "../include/blasfeo_s_kernel.h"
#include "../include/blasfeo_s_aux.h"


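/*
 * Minimal usage sketch (illustration only, not part of the library):
 * computes C <- 0.5*C + 1.0 * A * B^T on 8x8 single-precision matrices
 * through the strmat interface. The helpers s_size_strmat, s_create_strmat,
 * s_cvt_mat2strmat and s_cvt_strmat2mat, and the blasfeo_s_blas.h header,
 * are assumed to match the aux API shipped with this version of BLASFEO.
 *
 *	#include <stdlib.h>
 *	#include "blasfeo_common.h"
 *	#include "blasfeo_s_aux.h"
 *	#include "blasfeo_s_blas.h"
 *
 *	void example(float *A, float *B, float *C) // column-major 8x8 inputs
 *		{
 *		int n = 8;
 *		struct s_strmat sA, sB, sC;
 *		void *memA = malloc(s_size_strmat(n, n));
 *		void *memB = malloc(s_size_strmat(n, n));
 *		void *memC = malloc(s_size_strmat(n, n));
 *		s_create_strmat(n, n, &sA, memA);
 *		s_create_strmat(n, n, &sB, memB);
 *		s_create_strmat(n, n, &sC, memC);
 *		s_cvt_mat2strmat(n, n, A, n, &sA, 0, 0); // pack column-major into panel-major
 *		s_cvt_mat2strmat(n, n, B, n, &sB, 0, 0);
 *		s_cvt_mat2strmat(n, n, C, n, &sC, 0, 0);
 *		// D aliases C here: C <- 0.5*C + 1.0*A*B^T
 *		sgemm_nt_libstr(n, n, n, 1.0, &sA, 0, 0, &sB, 0, 0, 0.5, &sC, 0, 0, &sC, 0, 0);
 *		s_cvt_strmat2mat(n, n, &sC, 0, 0, C, n); // unpack the result
 *		free(memA); free(memB); free(memC);
 *		}
 */
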
void sgemm_nt_libstr(int m, int n, int k, float alpha, struct s_strmat *sA, int ai, int aj, struct s_strmat *sB, int bi, int bj, float beta, struct s_strmat *sC, int ci, int cj, struct s_strmat *sD, int di, int dj)
	{

	if(m==0 | n==0)
		return;

#if defined(DIM_CHECK)
	// TODO check that sA!=sD, or that if sA==sD then they do not overlap (same for sB)
	// non-negative size
	if(m<0) printf("\n****** sgemm_nt_libstr : m<0 : %d<0 *****\n", m);
	if(n<0) printf("\n****** sgemm_nt_libstr : n<0 : %d<0 *****\n", n);
	if(k<0) printf("\n****** sgemm_nt_libstr : k<0 : %d<0 *****\n", k);
	// non-negative offset
	if(ai<0) printf("\n****** sgemm_nt_libstr : ai<0 : %d<0 *****\n", ai);
	if(aj<0) printf("\n****** sgemm_nt_libstr : aj<0 : %d<0 *****\n", aj);
	if(bi<0) printf("\n****** sgemm_nt_libstr : bi<0 : %d<0 *****\n", bi);
	if(bj<0) printf("\n****** sgemm_nt_libstr : bj<0 : %d<0 *****\n", bj);
	if(ci<0) printf("\n****** sgemm_nt_libstr : ci<0 : %d<0 *****\n", ci);
	if(cj<0) printf("\n****** sgemm_nt_libstr : cj<0 : %d<0 *****\n", cj);
	if(di<0) printf("\n****** sgemm_nt_libstr : di<0 : %d<0 *****\n", di);
	if(dj<0) printf("\n****** sgemm_nt_libstr : dj<0 : %d<0 *****\n", dj);
	// inside matrix
	// A: m x k
	if(ai+m > sA->m) printf("\n***** sgemm_nt_libstr : ai+m > row(A) : %d+%d > %d *****\n", ai, m, sA->m);
	if(aj+k > sA->n) printf("\n***** sgemm_nt_libstr : aj+k > col(A) : %d+%d > %d *****\n", aj, k, sA->n);
	// B: n x k
	if(bi+n > sB->m) printf("\n***** sgemm_nt_libstr : bi+n > row(B) : %d+%d > %d *****\n", bi, n, sB->m);
	if(bj+k > sB->n) printf("\n***** sgemm_nt_libstr : bj+k > col(B) : %d+%d > %d *****\n", bj, k, sB->n);
	// C: m x n
	if(ci+m > sC->m) printf("\n***** sgemm_nt_libstr : ci+m > row(C) : %d+%d > %d *****\n", ci, m, sC->m);
	if(cj+n > sC->n) printf("\n***** sgemm_nt_libstr : cj+n > col(C) : %d+%d > %d *****\n", cj, n, sC->n);
	// D: m x n
	if(di+m > sD->m) printf("\n***** sgemm_nt_libstr : di+m > row(D) : %d+%d > %d *****\n", di, m, sD->m);
	if(dj+n > sD->n) printf("\n***** sgemm_nt_libstr : dj+n > col(D) : %d+%d > %d *****\n", dj, n, sD->n);
#endif

	const int bs = 8;

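	// Panel-major storage: a strmat is stored in row panels of height bs=8,
	// ->cn is the (padded) column count of each panel, so element (i,j) of a
	// buffer pX lives at pX[(i/bs)*bs*cn + i%bs + j*bs]. The aj*bs offsets
	// below skip aj columns inside every panel; the row offsets ai/bi/ci/di
	// are assumed panel-aligned by this routine.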
	int sda = sA->cn;
	int sdb = sB->cn;
	int sdc = sC->cn;
	int sdd = sD->cn;
	float *pA = sA->pA + aj*bs;
	float *pB = sB->pA + bj*bs;
	float *pC = sC->pA + cj*bs;
	float *pD = sD->pA + dj*bs;

	int i, j, l;

	i = 0;

#if defined(TARGET_X64_INTEL_HASWELL)
	for(; i<m-23; i+=24)
		{
		j = 0;
		for(; j<n-7; j+=8)
			{
			kernel_sgemm_nt_24x4_lib8(k, &alpha, &pA[i*sda], sda, &pB[0+j*sdb], &beta, &pC[(j+0)*bs+i*sdc], sdc, &pD[(j+0)*bs+i*sdd], sdd);
			kernel_sgemm_nt_24x4_lib8(k, &alpha, &pA[i*sda], sda, &pB[4+j*sdb], &beta, &pC[(j+4)*bs+i*sdc], sdc, &pD[(j+4)*bs+i*sdd], sdd);
			}
		if(j<n)
			{
			if(j<n-3)
				{
				kernel_sgemm_nt_24x4_lib8(k, &alpha, &pA[i*sda], sda, &pB[0+j*sdb], &beta, &pC[(j+0)*bs+i*sdc], sdc, &pD[(j+0)*bs+i*sdd], sdd);
				if(j<n-4)
					{
					kernel_sgemm_nt_24x4_vs_lib8(k, &alpha, &pA[i*sda], sda, &pB[4+j*sdb], &beta, &pC[(j+4)*bs+i*sdc], sdc, &pD[(j+4)*bs+i*sdd], sdd, 8, n-(j+4));
					}
				}
			else
				{
				kernel_sgemm_nt_24x4_vs_lib8(k, &alpha, &pA[i*sda], sda, &pB[0+j*sdb], &beta, &pC[(j+0)*bs+i*sdc], sdc, &pD[(j+0)*bs+i*sdd], sdd, 8, n-j);
				}
			}
		}
	if(m-i>0)
		{
		if(m-i<=4)
			{
			goto left_4;
			}
		else if(m-i<=8)
			{
			goto left_8;
			}
		else if(m-i<=12)
			{
			goto left_12;
			}
		else if(m-i<=16)
			{
			goto left_16;
			}
//		else if(m-i<=20)
//			{
//			goto left_20;
//			}
		else
			{
			goto left_24;
			}
		}
#else
	for(; i<m-15; i+=16)
		{
		j = 0;
		for(; j<n-7; j+=8)
			{
			kernel_sgemm_nt_16x4_lib8(k, &alpha, &pA[i*sda], sda, &pB[0+j*sdb], &beta, &pC[(j+0)*bs+i*sdc], sdc, &pD[(j+0)*bs+i*sdd], sdd);
			kernel_sgemm_nt_16x4_lib8(k, &alpha, &pA[i*sda], sda, &pB[4+j*sdb], &beta, &pC[(j+4)*bs+i*sdc], sdc, &pD[(j+4)*bs+i*sdd], sdd);
			}
		if(j<n)
			{
			if(j<n-3)
				{
				kernel_sgemm_nt_16x4_lib8(k, &alpha, &pA[i*sda], sda, &pB[0+j*sdb], &beta, &pC[(j+0)*bs+i*sdc], sdc, &pD[(j+0)*bs+i*sdd], sdd);
				if(j<n-4)
					{
					kernel_sgemm_nt_16x4_vs_lib8(k, &alpha, &pA[i*sda], sda, &pB[4+j*sdb], &beta, &pC[(j+4)*bs+i*sdc], sdc, &pD[(j+4)*bs+i*sdd], sdd, 8, n-(j+4));
					}
				}
			else
				{
				kernel_sgemm_nt_16x4_vs_lib8(k, &alpha, &pA[i*sda], sda, &pB[0+j*sdb], &beta, &pC[(j+0)*bs+i*sdc], sdc, &pD[(j+0)*bs+i*sdd], sdd, 8, n-j);
				}
			}
		}
	if(m-i>0)
		{
		if(m-i<=4)
			{
			goto left_4;
			}
		else if(m-i<=8)
			{
			goto left_8;
			}
		else if(m-i<=12)
			{
			goto left_12;
			}
		else
			{
			goto left_16;
			}
		}
#endif

	// common return if i==m
	return;

	// clean up loops definitions

	left_24:
	j = 0;
	for(; j<n-4; j+=8)
		{
		kernel_sgemm_nt_24x4_vs_lib8(k, &alpha, &pA[i*sda], sda, &pB[0+j*sdb], &beta, &pC[(j+0)*bs+i*sdc], sdc, &pD[(j+0)*bs+i*sdd], sdd, m-i, 4);
		kernel_sgemm_nt_24x4_vs_lib8(k, &alpha, &pA[i*sda], sda, &pB[4+j*sdb], &beta, &pC[(j+4)*bs+i*sdc], sdc, &pD[(j+4)*bs+i*sdd], sdd, m-i, n-(j+4));
		}
	if(j<n)
		{
		kernel_sgemm_nt_24x4_vs_lib8(k, &alpha, &pA[i*sda], sda, &pB[0+j*sdb], &beta, &pC[(j+0)*bs+i*sdc], sdc, &pD[(j+0)*bs+i*sdd], sdd, m-i, n-j);
		}
	return;

#if defined(TARGET_X64_INTEL_HASWELL)
	left_20:
	j = 0;
	for(; j<n-4; j+=8)
		{
		kernel_sgemm_nt_16x4_vs_lib8(k, &alpha, &pA[i*sda], sda, &pB[0+j*sdb], &beta, &pC[(j+0)*bs+i*sdc], sdc, &pD[(j+0)*bs+i*sdd], sdd, m-i, 4);
		kernel_sgemm_nt_16x4_vs_lib8(k, &alpha, &pA[i*sda], sda, &pB[4+j*sdb], &beta, &pC[(j+4)*bs+i*sdc], sdc, &pD[(j+4)*bs+i*sdd], sdd, m-i, n-(j+4));
		kernel_sgemm_nt_4x8_vs_lib8(k, &alpha, &pA[(i+16)*sda], &pB[0+j*sdb], &beta, &pC[(j+0)*bs+(i+16)*sdc], &pD[(j+0)*bs+(i+16)*sdd], m-(i+16), n-j);
		}
	if(j<n)
		{
		kernel_sgemm_nt_16x4_vs_lib8(k, &alpha, &pA[i*sda], sda, &pB[0+j*sdb], &beta, &pC[(j+0)*bs+i*sdc], sdc, &pD[(j+0)*bs+i*sdd], sdd, m-i, n-j);
		kernel_sgemm_nt_4x8_vs_lib8(k, &alpha, &pA[(i+16)*sda], &pB[0+j*sdb], &beta, &pC[(j+0)*bs+(i+16)*sdc], &pD[(j+0)*bs+(i+16)*sdd], m-(i+16), n-j);
		}
	return;
#endif

	left_16:
	j = 0;
	for(; j<n-4; j+=8)
		{
		kernel_sgemm_nt_16x4_vs_lib8(k, &alpha, &pA[i*sda], sda, &pB[0+j*sdb], &beta, &pC[(j+0)*bs+i*sdc], sdc, &pD[(j+0)*bs+i*sdd], sdd, m-i, 4);
		kernel_sgemm_nt_16x4_vs_lib8(k, &alpha, &pA[i*sda], sda, &pB[4+j*sdb], &beta, &pC[(j+4)*bs+i*sdc], sdc, &pD[(j+4)*bs+i*sdd], sdd, m-i, n-(j+4));
		}
	if(j<n)
		{
		kernel_sgemm_nt_16x4_vs_lib8(k, &alpha, &pA[i*sda], sda, &pB[0+j*sdb], &beta, &pC[(j+0)*bs+i*sdc], sdc, &pD[(j+0)*bs+i*sdd], sdd, m-i, n-j);
		}
	return;

#if defined(TARGET_X64_INTEL_HASWELL) | defined(TARGET_X64_INTEL_SANDY_BRIDGE)
	left_12:
	j = 0;
	for(; j<n-4; j+=8)
		{
		kernel_sgemm_nt_8x8_vs_lib8(k, &alpha, &pA[i*sda], &pB[0+j*sdb], &beta, &pC[(j+0)*bs+i*sdc], &pD[(j+0)*bs+i*sdd], m-i, n-j);
		kernel_sgemm_nt_4x8_vs_lib8(k, &alpha, &pA[(i+8)*sda], &pB[0+j*sdb], &beta, &pC[(j+0)*bs+(i+8)*sdc], &pD[(j+0)*bs+(i+8)*sdd], m-(i+8), n-j);
		}
	if(j<n)
		{
		kernel_sgemm_nt_8x4_vs_lib8(k, &alpha, &pA[i*sda], &pB[0+j*sdb], &beta, &pC[(j+0)*bs+i*sdc], &pD[(j+0)*bs+i*sdd], m-i, n-j);
		kernel_sgemm_nt_4x8_vs_lib8(k, &alpha, &pA[(i+8)*sda], &pB[0+j*sdb], &beta, &pC[(j+0)*bs+(i+8)*sdc], &pD[(j+0)*bs+(i+8)*sdd], m-(i+8), n-j);
		}
	return;
#endif

	left_8:
	j = 0;
	for(; j<n-4; j+=8)
		{
		kernel_sgemm_nt_8x8_vs_lib8(k, &alpha, &pA[i*sda], &pB[0+j*sdb], &beta, &pC[(j+0)*bs+i*sdc], &pD[(j+0)*bs+i*sdd], m-i, n-j);
		}
	if(j<n)
		{
		kernel_sgemm_nt_8x4_vs_lib8(k, &alpha, &pA[i*sda], &pB[0+j*sdb], &beta, &pC[(j+0)*bs+i*sdc], &pD[(j+0)*bs+i*sdd], m-i, n-j);
		}
	return;

#if defined(TARGET_X64_INTEL_HASWELL) | defined(TARGET_X64_INTEL_SANDY_BRIDGE)
	left_4:
	j = 0;
	for(; j<n; j+=8)
		{
		kernel_sgemm_nt_4x8_vs_lib8(k, &alpha, &pA[i*sda], &pB[0+j*sdb], &beta, &pC[(j+0)*bs+i*sdc], &pD[(j+0)*bs+i*sdd], m-i, n-j);
		}
	return;
#endif

	}


void sgemm_nn_libstr(int m, int n, int k, float alpha, struct s_strmat *sA, int ai, int aj, struct s_strmat *sB, int bi, int bj, float beta, struct s_strmat *sC, int ci, int cj, struct s_strmat *sD, int di, int dj)
	{

	if(m==0 | n==0)
		return;

#if defined(DIM_CHECK)
	// non-negative size
	if(m<0) printf("\n****** sgemm_nn_libstr : m<0 : %d<0 *****\n", m);
	if(n<0) printf("\n****** sgemm_nn_libstr : n<0 : %d<0 *****\n", n);
	if(k<0) printf("\n****** sgemm_nn_libstr : k<0 : %d<0 *****\n", k);
	// non-negative offset
	if(ai<0) printf("\n****** sgemm_nn_libstr : ai<0 : %d<0 *****\n", ai);
	if(aj<0) printf("\n****** sgemm_nn_libstr : aj<0 : %d<0 *****\n", aj);
	if(bi<0) printf("\n****** sgemm_nn_libstr : bi<0 : %d<0 *****\n", bi);
	if(bj<0) printf("\n****** sgemm_nn_libstr : bj<0 : %d<0 *****\n", bj);
	if(ci<0) printf("\n****** sgemm_nn_libstr : ci<0 : %d<0 *****\n", ci);
	if(cj<0) printf("\n****** sgemm_nn_libstr : cj<0 : %d<0 *****\n", cj);
	if(di<0) printf("\n****** sgemm_nn_libstr : di<0 : %d<0 *****\n", di);
	if(dj<0) printf("\n****** sgemm_nn_libstr : dj<0 : %d<0 *****\n", dj);
	// inside matrix
	// A: m x k
	if(ai+m > sA->m) printf("\n***** sgemm_nn_libstr : ai+m > row(A) : %d+%d > %d *****\n\n", ai, m, sA->m);
	if(aj+k > sA->n) printf("\n***** sgemm_nn_libstr : aj+k > col(A) : %d+%d > %d *****\n\n", aj, k, sA->n);
	// B: k x n
	if(bi+k > sB->m) printf("\n***** sgemm_nn_libstr : bi+k > row(B) : %d+%d > %d *****\n\n", bi, k, sB->m);
	if(bj+n > sB->n) printf("\n***** sgemm_nn_libstr : bj+n > col(B) : %d+%d > %d *****\n\n", bj, n, sB->n);
	// C: m x n
	if(ci+m > sC->m) printf("\n***** sgemm_nn_libstr : ci+m > row(C) : %d+%d > %d *****\n\n", ci, m, sC->m);
	if(cj+n > sC->n) printf("\n***** sgemm_nn_libstr : cj+n > col(C) : %d+%d > %d *****\n\n", cj, n, sC->n);
	// D: m x n
	if(di+m > sD->m) printf("\n***** sgemm_nn_libstr : di+m > row(D) : %d+%d > %d *****\n\n", di, m, sD->m);
	if(dj+n > sD->n) printf("\n***** sgemm_nn_libstr : dj+n > col(D) : %d+%d > %d *****\n\n", dj, n, sD->n);
#endif

	const int bs = 8;

	int sda = sA->cn;
	int sdb = sB->cn;
	int sdc = sC->cn;
	int sdd = sD->cn;
	float *pA = sA->pA + aj*bs;
	float *pB = sB->pA + bj*bs + bi/bs*bs*sdb;
	float *pC = sC->pA + cj*bs;
	float *pD = sD->pA + dj*bs;

	int offsetB = bi%bs;
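	// B may start mid-panel: pB above is rounded down to the start of the
	// panel that contains row bi, and offsetB just computed carries the
	// remaining intra-panel row offset, which the _nn_ kernels handle
	// internally.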

	int i, j, l;

	i = 0;

#if defined(TARGET_X64_INTEL_HASWELL)
	for(; i<m-23; i+=24)
		{
		j = 0;
		for(; j<n-7; j+=8)
			{
			kernel_sgemm_nn_24x4_lib8(k, &alpha, &pA[i*sda], sda, offsetB, &pB[(j+0)*bs], sdb, &beta, &pC[(j+0)*bs+i*sdc], sdc, &pD[(j+0)*bs+i*sdd], sdd);
			kernel_sgemm_nn_24x4_lib8(k, &alpha, &pA[i*sda], sda, offsetB, &pB[(j+4)*bs], sdb, &beta, &pC[(j+4)*bs+i*sdc], sdc, &pD[(j+4)*bs+i*sdd], sdd);
			}
		if(j<n)
			{
			if(j<n-3)
				{
				kernel_sgemm_nn_24x4_lib8(k, &alpha, &pA[i*sda], sda, offsetB, &pB[(j+0)*bs], sdb, &beta, &pC[(j+0)*bs+i*sdc], sdc, &pD[(j+0)*bs+i*sdd], sdd);
				if(j<n-4)
					{
					kernel_sgemm_nn_24x4_vs_lib8(k, &alpha, &pA[i*sda], sda, offsetB, &pB[(j+4)*bs], sdb, &beta, &pC[(j+4)*bs+i*sdc], sdc, &pD[(j+4)*bs+i*sdd], sdd, 16, n-(j+4));
					}
				}
			else
				{
				kernel_sgemm_nn_24x4_vs_lib8(k, &alpha, &pA[i*sda], sda, offsetB, &pB[(j+0)*bs], sdb, &beta, &pC[(j+0)*bs+i*sdc], sdc, &pD[(j+0)*bs+i*sdd], sdd, 16, n-j);
				}
			}
		}
	if(m>i)
		{
		if(m-i<=8)
			{
			goto left_8;
			}
		else if(m-i<=16)
			{
			goto left_16;
			}
		else
			{
			goto left_24;
			}
		}
#else
#if 1
	for(; i<m-15; i+=16)
		{
		j = 0;
		for(; j<n-7; j+=8)
			{
			kernel_sgemm_nn_16x4_lib8(k, &alpha, &pA[i*sda], sda, offsetB, &pB[(j+0)*bs], sdb, &beta, &pC[(j+0)*bs+i*sdc], sdc, &pD[(j+0)*bs+i*sdd], sdd);
			kernel_sgemm_nn_16x4_lib8(k, &alpha, &pA[i*sda], sda, offsetB, &pB[(j+4)*bs], sdb, &beta, &pC[(j+4)*bs+i*sdc], sdc, &pD[(j+4)*bs+i*sdd], sdd);
			}
		if(j<n)
			{
			if(j<n-3)
				{
				kernel_sgemm_nn_16x4_lib8(k, &alpha, &pA[i*sda], sda, offsetB, &pB[(j+0)*bs], sdb, &beta, &pC[(j+0)*bs+i*sdc], sdc, &pD[(j+0)*bs+i*sdd], sdd);
				if(j<n-4)
					{
					kernel_sgemm_nn_16x4_vs_lib8(k, &alpha, &pA[i*sda], sda, offsetB, &pB[(j+4)*bs], sdb, &beta, &pC[(j+4)*bs+i*sdc], sdc, &pD[(j+4)*bs+i*sdd], sdd, 16, n-(j+4));
					}
				}
			else
				{
				kernel_sgemm_nn_16x4_vs_lib8(k, &alpha, &pA[i*sda], sda, offsetB, &pB[(j+0)*bs], sdb, &beta, &pC[(j+0)*bs+i*sdc], sdc, &pD[(j+0)*bs+i*sdd], sdd, 16, n-j);
				}
			}
		}
	if(m>i)
		{
		if(m-i<=8)
			{
			goto left_8;
			}
		else
			{
			goto left_16;
			}
		}
#else
	for(; i<m-7; i+=8)
		{
		j = 0;
		for(; j<n-7; j+=8)
			{
#if 1
			kernel_sgemm_nn_8x8_lib8(k, &alpha, &pA[i*sda], offsetB, &pB[(j+0)*bs], sdb, &beta, &pC[(j+0)*bs+i*sdc], &pD[(j+0)*bs+i*sdd]);
#else
			kernel_sgemm_nn_8x4_lib8(k, &alpha, &pA[i*sda], offsetB, &pB[(j+0)*bs], sdb, &beta, &pC[(j+0)*bs+i*sdc], &pD[(j+0)*bs+i*sdd]);
			kernel_sgemm_nn_8x4_lib8(k, &alpha, &pA[i*sda], offsetB, &pB[(j+4)*bs], sdb, &beta, &pC[(j+4)*bs+i*sdc], &pD[(j+4)*bs+i*sdd]);
#endif
			}
		if(j<n)
			{
			if(j<n-3)
				{
				kernel_sgemm_nn_8x4_lib8(k, &alpha, &pA[i*sda], offsetB, &pB[(j+0)*bs], sdb, &beta, &pC[(j+0)*bs+i*sdc], &pD[(j+0)*bs+i*sdd]);
				if(j<n-4)
					{
					kernel_sgemm_nn_8x4_gen_lib8(k, &alpha, &pA[i*sda], offsetB, &pB[(j+4)*bs], sdb, &beta, 0, &pC[(j+4)*bs+i*sdc], sdc, 0, &pD[(j+4)*bs+i*sdd], sdd, 0, 8, 0, n-(j+4));
					}
				}
			else
				{
				kernel_sgemm_nn_8x4_gen_lib8(k, &alpha, &pA[i*sda], offsetB, &pB[(j+0)*bs], sdb, &beta, 0, &pC[(j+0)*bs+i*sdc], sdc, 0, &pD[(j+0)*bs+i*sdd], sdd, 0, 8, 0, n-j);
				}
			}
		}
	if(m>i)
		{
		goto left_8;
		}
#endif
#endif

	// common return if i==m
	return;

#if defined(TARGET_X64_INTEL_HASWELL)
	left_24:
	j = 0;
	for(; j<n-4; j+=8)
		{
		kernel_sgemm_nn_24x4_vs_lib8(k, &alpha, &pA[i*sda], sda, offsetB, &pB[(j+0)*bs], sdb, &beta, &pC[(j+0)*bs+i*sdc], sdc, &pD[(j+0)*bs+i*sdd], sdd, m-i, n-j);
		kernel_sgemm_nn_24x4_vs_lib8(k, &alpha, &pA[i*sda], sda, offsetB, &pB[(j+4)*bs], sdb, &beta, &pC[(j+4)*bs+i*sdc], sdc, &pD[(j+4)*bs+i*sdd], sdd, m-i, n-(j+4));
		}
	if(j<n)
		{
		kernel_sgemm_nn_24x4_vs_lib8(k, &alpha, &pA[i*sda], sda, offsetB, &pB[(j+0)*bs], sdb, &beta, &pC[(j+0)*bs+i*sdc], sdc, &pD[(j+0)*bs+i*sdd], sdd, m-i, n-j);
		}
	return;
#endif

	left_16:
	j = 0;
	for(; j<n-4; j+=8)
		{
		kernel_sgemm_nn_16x4_vs_lib8(k, &alpha, &pA[i*sda], sda, offsetB, &pB[(j+0)*bs], sdb, &beta, &pC[(j+0)*bs+i*sdc], sdc, &pD[(j+0)*bs+i*sdd], sdd, m-i, n-j);
		kernel_sgemm_nn_16x4_vs_lib8(k, &alpha, &pA[i*sda], sda, offsetB, &pB[(j+4)*bs], sdb, &beta, &pC[(j+4)*bs+i*sdc], sdc, &pD[(j+4)*bs+i*sdd], sdd, m-i, n-(j+4));
		}
	if(j<n)
		{
		kernel_sgemm_nn_16x4_vs_lib8(k, &alpha, &pA[i*sda], sda, offsetB, &pB[(j+0)*bs], sdb, &beta, &pC[(j+0)*bs+i*sdc], sdc, &pD[(j+0)*bs+i*sdd], sdd, m-i, n-j);
		}
	return;

	left_8:
	j = 0;
	for(; j<n-4; j+=8)
		{
		kernel_sgemm_nn_8x8_vs_lib8(k, &alpha, &pA[i*sda], offsetB, &pB[(j+0)*bs], sdb, &beta, &pC[(j+0)*bs+i*sdc], &pD[(j+0)*bs+i*sdd], m-i, n-j);
		}
	if(j<n)
		{
		kernel_sgemm_nn_8x4_vs_lib8(k, &alpha, &pA[i*sda], offsetB, &pB[(j+0)*bs], sdb, &beta, &pC[(j+0)*bs+i*sdc], &pD[(j+0)*bs+i*sdd], m-i, n-j);
		}
	return;

	}


void ssyrk_ln_libstr(int m, int k, float alpha, struct s_strmat *sA, int ai, int aj, struct s_strmat *sB, int bi, int bj, float beta, struct s_strmat *sC, int ci, int cj, struct s_strmat *sD, int di, int dj)
	{

	if(m<=0)
		return;

	if(ci>0 | di>0)
		{
		printf("\nssyrk_ln_libstr: feature not implemented yet: ci>0, di>0\n");
		exit(1);
		}

	const int bs = 8;

	int i, j;

	int sda = sA->cn;
	int sdb = sB->cn;
	int sdc = sC->cn;
	int sdd = sD->cn;
	float *pA = sA->pA + aj*bs;
	float *pB = sB->pA + bj*bs;
	float *pC = sC->pA + cj*bs;
	float *pD = sD->pA + dj*bs;

	i = 0;
#if defined(TARGET_X64_INTEL_HASWELL)
	for(; i<m-23; i+=24)
		{
		j = 0;
		for(; j<i; j+=8)
			{
			kernel_sgemm_nt_24x4_lib8(k, &alpha, &pA[i*sda], sda, &pB[0+j*sdb], &beta, &pC[(j+0)*bs+i*sdc], sdc, &pD[(j+0)*bs+i*sdd], sdd);
			kernel_sgemm_nt_24x4_lib8(k, &alpha, &pA[i*sda], sda, &pB[4+j*sdb], &beta, &pC[(j+4)*bs+i*sdc], sdc, &pD[(j+4)*bs+i*sdd], sdd);
			}

		kernel_ssyrk_nt_l_24x4_lib8(k, &alpha, &pA[(j+0)*sda], sda, &pB[0+j*sdb], &beta, &pC[(j+0)*bs+(j+0)*sdc], sdc, &pD[(j+0)*bs+(j+0)*sdd], sdd);
		kernel_ssyrk_nt_l_20x4_lib8(k, &alpha, &pA[(j+0)*sda], sda, &pB[4+j*sdb], &beta, &pC[(j+4)*bs+(j+0)*sdc], sdc, &pD[(j+4)*bs+(j+0)*sdd], sdd);
		kernel_ssyrk_nt_l_16x4_lib8(k, &alpha, &pA[(j+8)*sda], sda, &pB[0+(j+8)*sdb], &beta, &pC[(j+8)*bs+(j+8)*sdc], sdc, &pD[(j+8)*bs+(j+8)*sdd], sdd);
		kernel_ssyrk_nt_l_12x4_lib8(k, &alpha, &pA[(j+8)*sda], sda, &pB[4+(j+8)*sdb], &beta, &pC[(j+12)*bs+(j+8)*sdc], sdc, &pD[(j+12)*bs+(j+8)*sdd], sdd);
		kernel_ssyrk_nt_l_8x8_lib8(k, &alpha, &pA[(j+16)*sda], &pB[0+(j+16)*sdb], &beta, &pC[(j+16)*bs+(j+16)*sdc], &pD[(j+16)*bs+(j+16)*sdd]);
		}
	if(m>i)
		{
		if(m-i<=4)
			{
			goto left_4;
			}
		else if(m-i<=8)
			{
			goto left_8;
			}
		else if(m-i<=12)
			{
			goto left_12;
			}
		else if(m-i<=16)
			{
			goto left_16;
			}
		else
			{
			goto left_24;
			}
		}
#else
	for(; i<m-15; i+=16)
		{
		j = 0;
		for(; j<i; j+=8)
			{
			kernel_sgemm_nt_16x4_lib8(k, &alpha, &pA[i*sda], sda, &pB[0+j*sdb], &beta, &pC[(j+0)*bs+i*sdc], sdc, &pD[(j+0)*bs+i*sdd], sdd);
			kernel_sgemm_nt_16x4_lib8(k, &alpha, &pA[i*sda], sda, &pB[4+j*sdb], &beta, &pC[(j+4)*bs+i*sdc], sdc, &pD[(j+4)*bs+i*sdd], sdd);
			}
		kernel_ssyrk_nt_l_16x4_lib8(k, &alpha, &pA[(j+0)*sda], sda, &pB[0+(j+0)*sdb], &beta, &pC[(j+0)*bs+(j+0)*sdc], sdc, &pD[(j+0)*bs+(j+0)*sdd], sdd);
		kernel_ssyrk_nt_l_12x4_lib8(k, &alpha, &pA[(j+0)*sda], sda, &pB[4+(j+0)*sdb], &beta, &pC[(j+4)*bs+(j+0)*sdc], sdc, &pD[(j+4)*bs+(j+0)*sdd], sdd);
		kernel_ssyrk_nt_l_8x8_lib8(k, &alpha, &pA[(j+8)*sda], &pB[0+(j+8)*sdb], &beta, &pC[(j+8)*bs+(j+8)*sdc], &pD[(j+8)*bs+(j+8)*sdd]);
		}
	if(m>i)
		{
		if(m-i<=4)
			{
			goto left_4;
			}
		else if(m-i<=8)
			{
			goto left_8;
			}
		else if(m-i<=12)
			{
			goto left_12;
			}
		else
			{
			goto left_16;
			}
		}
#endif

	// common return if i==m
	return;

	// clean up loops definitions

#if defined(TARGET_X64_INTEL_HASWELL)
	left_24: // 17 <= m <= 23
	j = 0;
	for(; j<i & j<m-7; j+=8)
		{
		kernel_sgemm_nt_24x4_vs_lib8(k, &alpha, &pA[i*sda], sda, &pB[0+j*sdb], &beta, &pC[(j+0)*bs+i*sdc], sdc, &pD[(j+0)*bs+i*sdd], sdd, m-i, m-(j+0));
		kernel_sgemm_nt_24x4_vs_lib8(k, &alpha, &pA[i*sda], sda, &pB[4+j*sdb], &beta, &pC[(j+4)*bs+i*sdc], sdc, &pD[(j+4)*bs+i*sdd], sdd, m-i, m-(j+4));
		}
	kernel_ssyrk_nt_l_24x4_vs_lib8(k, &alpha, &pA[(j+0)*sda], sda, &pB[0+j*sdb], &beta, &pC[(j+0)*bs+(j+0)*sdc], sdc, &pD[(j+0)*bs+(j+0)*sdd], sdd, m-(i+0), m-(j+0));
	kernel_ssyrk_nt_l_20x4_vs_lib8(k, &alpha, &pA[(j+0)*sda], sda, &pB[4+j*sdb], &beta, &pC[(j+4)*bs+(j+0)*sdc], sdc, &pD[(j+4)*bs+(j+0)*sdd], sdd, m-(i+0), m-(j+4));
	kernel_ssyrk_nt_l_16x4_vs_lib8(k, &alpha, &pA[(j+8)*sda], sda, &pB[0+(j+8)*sdb], &beta, &pC[(j+8)*bs+(j+8)*sdc], sdc, &pD[(j+8)*bs+(j+8)*sdd], sdd, m-(i+8), m-(j+8));
	kernel_ssyrk_nt_l_12x4_vs_lib8(k, &alpha, &pA[(j+8)*sda], sda, &pB[4+(j+8)*sdb], &beta, &pC[(j+12)*bs+(j+8)*sdc], sdc, &pD[(j+12)*bs+(j+8)*sdd], sdd, m-(i+8), m-(j+12));
	if(j<m-20) // 21 - 23
		{
		kernel_ssyrk_nt_l_8x8_vs_lib8(k, &alpha, &pA[(j+16)*sda], &pB[0+(j+16)*sdb], &beta, &pC[(j+16)*bs+(j+16)*sdc], &pD[(j+16)*bs+(j+16)*sdd], m-(i+16), m-(j+16));
		}
	else // 17 18 19 20
		{
		kernel_ssyrk_nt_l_8x4_vs_lib8(k, &alpha, &pA[(j+16)*sda], &pB[0+(j+16)*sdb], &beta, &pC[(j+16)*bs+(j+16)*sdc], &pD[(j+16)*bs+(j+16)*sdd], m-(i+16), m-(j+16));
		}
	return;
#endif

	left_16: // 13 <= m <= 16
	j = 0;
	for(; j<i; j+=8)
		{
		kernel_sgemm_nt_16x4_vs_lib8(k, &alpha, &pA[i*sda], sda, &pB[0+j*sdb], &beta, &pC[(j+0)*bs+i*sdc], sdc, &pD[(j+0)*bs+i*sdd], sdd, m-i, m-(j+0));
		kernel_sgemm_nt_16x4_vs_lib8(k, &alpha, &pA[i*sda], sda, &pB[4+j*sdb], &beta, &pC[(j+4)*bs+i*sdc], sdc, &pD[(j+4)*bs+i*sdd], sdd, m-i, m-(j+4));
		}
	kernel_ssyrk_nt_l_16x4_vs_lib8(k, &alpha, &pA[(j+0)*sda], sda, &pB[0+(j+0)*sdb], &beta, &pC[(j+0)*bs+(j+0)*sdc], sdc, &pD[(j+0)*bs+(j+0)*sdd], sdd, m-(i+0), m-(j+0));
	kernel_ssyrk_nt_l_12x4_vs_lib8(k, &alpha, &pA[(j+0)*sda], sda, &pB[4+(j+0)*sdb], &beta, &pC[(j+4)*bs+(j+0)*sdc], sdc, &pD[(j+4)*bs+(j+0)*sdd], sdd, m-(i+0), m-(j+4));
	if(j<m-12) // 13 - 16
		{
		kernel_ssyrk_nt_l_8x8_vs_lib8(k, &alpha, &pA[(j+8)*sda], &pB[0+(j+8)*sdb], &beta, &pC[(j+8)*bs+(j+8)*sdc], &pD[(j+8)*bs+(j+8)*sdd], m-(i+8), m-(j+8));
		}
	else // 9 - 12
		{
		kernel_ssyrk_nt_l_8x4_vs_lib8(k, &alpha, &pA[(j+8)*sda], &pB[0+(j+8)*sdb], &beta, &pC[(j+8)*bs+(j+8)*sdc], &pD[(j+8)*bs+(j+8)*sdd], m-(i+8), m-(j+8));
		}
	return;

	left_12: // 9 <= m <= 12
	j = 0;
	for(; j<i; j+=8)
		{
		kernel_sgemm_nt_8x8_vs_lib8(k, &alpha, &pA[(i+0)*sda], &pB[0+(j+0)*sdb], &beta, &pC[(j+0)*bs+(i+0)*sdc], &pD[(j+0)*bs+(i+0)*sdd], m-(i+0), m-(j+0));
		kernel_sgemm_nt_4x8_vs_lib8(k, &alpha, &pA[(i+8)*sda], &pB[0+(j+0)*sdb], &beta, &pC[(j+0)*bs+(i+8)*sdc], &pD[(j+0)*bs+(i+8)*sdd], m-(i+0), m-(j+0));
		}
	kernel_ssyrk_nt_l_8x8_vs_lib8(k, &alpha, &pA[(j+0)*sda], &pB[0+(j+0)*sdb], &beta, &pC[(j+0)*bs+(j+0)*sdc], &pD[(j+0)*bs+(j+0)*sdd], m-(i+0), m-(j+0));
	kernel_sgemm_nt_4x8_vs_lib8(k, &alpha, &pA[(j+8)*sda], &pB[0+(j+0)*sdb], &beta, &pC[(j+0)*bs+(j+8)*sdc], &pD[(j+0)*bs+(j+8)*sdd], m-(i+8), m-(j+0));
	if(j<m-8) // 9 - 12
		{
		kernel_ssyrk_nt_l_8x4_vs_lib8(k, &alpha, &pA[(j+8)*sda], &pB[0+(j+8)*sdb], &beta, &pC[(j+8)*bs+(j+8)*sdc], &pD[(j+8)*bs+(j+8)*sdd], m-(i+8), m-(j+8));
		}
	return;

	left_8: // 5 <= m <= 8
	j = 0;
	for(; j<i; j+=8)
		{
		kernel_sgemm_nt_8x8_vs_lib8(k, &alpha, &pA[(i+0)*sda], &pB[0+(j+0)*sdb], &beta, &pC[(j+0)*bs+(i+0)*sdc], &pD[(j+0)*bs+(i+0)*sdd], m-(i+0), m-(j+0));
		}
	if(j<m-4) // 5 - 8
		{
		kernel_ssyrk_nt_l_8x8_vs_lib8(k, &alpha, &pA[(j+0)*sda], &pB[0+(j+0)*sdb], &beta, &pC[(j+0)*bs+(j+0)*sdc], &pD[(j+0)*bs+(j+0)*sdd], m-(i+0), m-(j+0));
		}
	else // 1 - 4
		{
		kernel_ssyrk_nt_l_8x4_vs_lib8(k, &alpha, &pA[(j+0)*sda], &pB[0+(j+0)*sdb], &beta, &pC[(j+0)*bs+(j+0)*sdc], &pD[(j+0)*bs+(j+0)*sdd], m-(i+0), m-(j+0));
		}
	return;

	left_4: // 1 <= m <= 4
	j = 0;
	for(; j<i; j+=8)
		{
		kernel_sgemm_nt_4x8_vs_lib8(k, &alpha, &pA[(i+0)*sda], &pB[0+(j+0)*sdb], &beta, &pC[(j+0)*bs+(i+0)*sdc], &pD[(j+0)*bs+(i+0)*sdd], m-(i+0), m-(j+0));
		}
	kernel_ssyrk_nt_l_8x4_vs_lib8(k, &alpha, &pA[(j+0)*sda], &pB[0+(j+0)*sdb], &beta, &pC[(j+0)*bs+(j+0)*sdc], &pD[(j+0)*bs+(j+0)*sdd], m-(i+0), m-(j+0));
	return;

	}
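
/*
 * For reference, ssyrk_ln_libstr computes the lower triangle of the m x m
 * result D = beta*C + alpha * A * B^T with inner dimension k; in a true
 * syrk call B is the same matrix as A. A naive column-major equivalent
 * (illustration only, not part of the library):
 *
 *	for(j=0; j<m; j++)
 *		for(i=j; i<m; i++) // lower triangle only
 *			{
 *			float tmp = 0.0;
 *			for(l=0; l<k; l++)
 *				tmp += A[i+l*lda] * B[j+l*ldb]; // B^T: row j of B
 *			D[i+j*ldd] = beta*C[i+j*ldc] + alpha*tmp;
 *			}
 */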


void ssyrk_ln_mn_libstr(int m, int n, int k, float alpha, struct s_strmat *sA, int ai, int aj, struct s_strmat *sB, int bi, int bj, float beta, struct s_strmat *sC, int ci, int cj, struct s_strmat *sD, int di, int dj)
	{

	if(m<=0)
		return;

	if(ci>0 | di>0)
		{
		printf("\nssyrk_ln_mn_libstr: feature not implemented yet: ci>0, di>0\n");
		exit(1);
		}

	const int bs = 8;

	int i, j;

	int sda = sA->cn;
	int sdb = sB->cn;
	int sdc = sC->cn;
	int sdd = sD->cn;
	float *pA = sA->pA + aj*bs;
	float *pB = sB->pA + bj*bs;
	float *pC = sC->pA + cj*bs;
	float *pD = sD->pA + dj*bs;

	i = 0;
#if defined(TARGET_X64_INTEL_HASWELL)
	for(; i<m-23; i+=24)
		{
		j = 0;
		for(; j<i & j<n-7; j+=8)
			{
			kernel_sgemm_nt_24x4_lib8(k, &alpha, &pA[i*sda], sda, &pB[0+j*sdb], &beta, &pC[(j+0)*bs+i*sdc], sdc, &pD[(j+0)*bs+i*sdd], sdd);
			kernel_sgemm_nt_24x4_lib8(k, &alpha, &pA[i*sda], sda, &pB[4+j*sdb], &beta, &pC[(j+4)*bs+i*sdc], sdc, &pD[(j+4)*bs+i*sdd], sdd);
			}
		if(j<n)
			{
			if(i<j) // dtrsm
				{
				kernel_sgemm_nt_24x4_vs_lib8(k, &alpha, &pA[i*sda], sda, &pB[0+j*sdb], &beta, &pC[(j+0)*bs+i*sdc], sdc, &pD[(j+0)*bs+i*sdd], sdd, m-i, n-(j+0));
				if(j<n-4) // 5 6 7
					{
					kernel_sgemm_nt_24x4_vs_lib8(k, &alpha, &pA[i*sda], sda, &pB[4+j*sdb], &beta, &pC[(j+4)*bs+i*sdc], sdc, &pD[(j+4)*bs+i*sdd], sdd, m-i, n-(j+4));
					}
				}
			else // dpotrf
				{
				if(j<n-23)
					{
					kernel_ssyrk_nt_l_24x4_lib8(k, &alpha, &pA[(i+0)*sda], sda, &pB[(j+0)*sdb], &beta, &pC[(j+0)*bs+(j+0)*sdc], sdc, &pD[(j+0)*bs+(j+0)*sdd], sdd);
					kernel_ssyrk_nt_l_20x4_lib8(k, &alpha, &pA[(i+0)*sda], sda, &pB[4+(j+0)*sdb], &beta, &pC[(j+4)*bs+(j+0)*sdc], sdc, &pD[(j+4)*bs+(j+0)*sdd], sdd);
					kernel_ssyrk_nt_l_16x4_lib8(k, &alpha, &pA[(i+8)*sda], sda, &pB[(j+8)*sdb], &beta, &pC[(j+8)*bs+(j+8)*sdc], sdc, &pD[(j+8)*bs+(j+8)*sdd], sdd);
					kernel_ssyrk_nt_l_12x4_lib8(k, &alpha, &pA[(i+8)*sda], sda, &pB[4+(j+8)*sdb], &beta, &pC[(j+12)*bs+(j+8)*sdc], sdc, &pD[(j+12)*bs+(j+8)*sdd], sdd);
					kernel_ssyrk_nt_l_8x8_lib8(k, &alpha, &pA[(i+16)*sda], &pB[(j+16)*sdb], &beta, &pC[(j+16)*bs+(j+16)*sdc], &pD[(j+16)*bs+(j+16)*sdd]);
					}
				else
					{
					if(j<n-4) // 5 - 23
						{
						kernel_ssyrk_nt_l_24x4_vs_lib8(k, &alpha, &pA[(i+0)*sda], sda, &pB[(j+0)*sdb], &beta, &pC[(j+0)*bs+(j+0)*sdc], sdc, &pD[(j+0)*bs+(j+0)*sdd], sdd, m-(i+0), n-(j+0));
						kernel_ssyrk_nt_l_20x4_vs_lib8(k, &alpha, &pA[(i+0)*sda], sda, &pB[4+(j+0)*sdb], &beta, &pC[(j+4)*bs+(j+0)*sdc], sdc, &pD[(j+4)*bs+(j+0)*sdd], sdd, m-(i+0), n-(j+4));
						if(j==n-8)
							return;
						if(j<n-12) // 13 - 23
							{
							kernel_ssyrk_nt_l_16x4_vs_lib8(k, &alpha, &pA[(i+8)*sda], sda, &pB[(j+8)*sdb], &beta, &pC[(j+8)*bs+(j+8)*sdc], sdc, &pD[(j+8)*bs+(j+8)*sdd], sdd, m-(i+8), n-(j+8));
							kernel_ssyrk_nt_l_12x4_vs_lib8(k, &alpha, &pA[(i+8)*sda], sda, &pB[4+(j+8)*sdb], &beta, &pC[(j+12)*bs+(j+8)*sdc], sdc, &pD[(j+12)*bs+(j+8)*sdd], sdd, m-(i+8), n-(j+12));
							if(j==n-16)
								return;
							if(j<n-20) // 21 - 23
								{
								kernel_ssyrk_nt_l_8x8_vs_lib8(k, &alpha, &pA[(i+16)*sda], &pB[(j+16)*sdb], &beta, &pC[(j+16)*bs+(j+16)*sdc], &pD[(j+16)*bs+(j+16)*sdd], m-(i+16), n-(j+16));
								}
							else // 17 18 19 20
								{
								kernel_ssyrk_nt_l_8x4_vs_lib8(k, &alpha, &pA[(i+16)*sda], &pB[(j+16)*sdb], &beta, &pC[(j+16)*bs+(j+16)*sdc], &pD[(j+16)*bs+(j+16)*sdd], m-(i+16), n-(j+16));
								}
							}
						else // 9 10 11 12
							{
							kernel_ssyrk_nt_l_16x4_vs_lib8(k, &alpha, &pA[(i+8)*sda], sda, &pB[(j+8)*sdb], &beta, &pC[(j+8)*bs+(j+8)*sdc], sdc, &pD[(j+8)*bs+(j+8)*sdd], sdd, m-(i+8), n-(j+8));
							}
						}
					else // 1 2 3 4
						{
						kernel_ssyrk_nt_l_24x4_vs_lib8(k, &alpha, &pA[(i+0)*sda], sda, &pB[j*sdb], &beta, &pC[j*bs+j*sdc], sdc, &pD[j*bs+j*sdd], sdd, m-(i+0), n-j);
						}
					}
				}
			}
		}
	if(m>i)
		{
		if(m-i<=8)
			{
			goto left_8;
			}
		else if(m-i<=16)
			{
			goto left_16;
			}
		else
			{
			goto left_24;
			}
		}
#else
	for(; i<m-15; i+=16)
		{
		j = 0;
		for(; j<i & j<n-7; j+=8)
			{
			kernel_sgemm_nt_16x4_lib8(k, &alpha, &pA[i*sda], sda, &pB[0+j*sdb], &beta, &pC[(j+0)*bs+i*sdc], sdc, &pD[(j+0)*bs+i*sdd], sdd);
			kernel_sgemm_nt_16x4_lib8(k, &alpha, &pA[i*sda], sda, &pB[4+j*sdb], &beta, &pC[(j+4)*bs+i*sdc], sdc, &pD[(j+4)*bs+i*sdd], sdd);
			}
		if(j<n)
			{
			if(i<j) // dtrsm
				{
				kernel_sgemm_nt_16x4_vs_lib8(k, &alpha, &pA[i*sda], sda, &pB[0+j*sdb], &beta, &pC[(j+0)*bs+i*sdc], sdc, &pD[(j+0)*bs+i*sdd], sdd, m-i, n-(j+0));
				if(j<n-4) // 5 6 7
					{
					kernel_sgemm_nt_16x4_vs_lib8(k, &alpha, &pA[i*sda], sda, &pB[4+j*sdb], &beta, &pC[(j+4)*bs+i*sdc], sdc, &pD[(j+4)*bs+i*sdd], sdd, m-i, n-(j+4));
					}
				}
			else // dpotrf
				{
				if(j<n-15)
					{
					kernel_ssyrk_nt_l_16x4_lib8(k, &alpha, &pA[i*sda], sda, &pB[0+j*sdb], &beta, &pC[(j+0)*bs+(j+0)*sdc], sdc, &pD[(j+0)*bs+(j+0)*sdd], sdd);
					kernel_ssyrk_nt_l_12x4_lib8(k, &alpha, &pA[i*sda], sda, &pB[4+j*sdb], &beta, &pC[(j+4)*bs+(j+0)*sdc], sdc, &pD[(j+4)*bs+(j+0)*sdd], sdd);
					kernel_ssyrk_nt_l_8x8_lib8(k, &alpha, &pA[(i+8)*sda], &pB[(j+8)*sdb], &beta, &pC[(j+8)*bs+(j+8)*sdc], &pD[(j+8)*bs+(j+8)*sdd]);
					}
				else
					{
					if(j<n-4) // 5 - 15
						{
						kernel_ssyrk_nt_l_16x4_vs_lib8(k, &alpha, &pA[(i+0)*sda], sda, &pB[(j+0)*sdb], &beta, &pC[(j+0)*bs+(j+0)*sdc], sdc, &pD[(j+0)*bs+(j+0)*sdd], sdd, m-(i+0), n-(j+0));
						kernel_ssyrk_nt_l_12x4_vs_lib8(k, &alpha, &pA[(i+0)*sda], sda, &pB[4+(j+0)*sdb], &beta, &pC[(j+4)*bs+(j+0)*sdc], sdc, &pD[(j+4)*bs+(j+0)*sdd], sdd, m-(i+0), n-(j+4));
						if(j==n-8) // 8
							return;
						if(j<n-12) // 13 - 15
							{
							kernel_ssyrk_nt_l_8x8_vs_lib8(k, &alpha, &pA[(i+8)*sda], &pB[(j+8)*sdb], &beta, &pC[(j+8)*bs+(j+8)*sdc], &pD[(j+8)*bs+(j+8)*sdd], m-(i+8), n-(j+8));
							}
						else // 9 10 11 12
							{
							kernel_ssyrk_nt_l_8x4_vs_lib8(k, &alpha, &pA[(i+8)*sda], &pB[(j+8)*sdb], &beta, &pC[(j+8)*bs+(j+8)*sdc], &pD[(j+8)*bs+(j+8)*sdd], m-(i+8), n-(j+8));
							}
						}
					else // 1 2 3 4
						{
						kernel_ssyrk_nt_l_16x4_vs_lib8(k, &alpha, &pA[(i+0)*sda], sda, &pB[j*sdb], &beta, &pC[j*bs+j*sdc], sdc, &pD[j*bs+j*sdd], sdd, m-(i+0), n-j);
						}
					}
				}
			}
		}
	if(m>i)
		{
		if(m-i<=8)
			{
			goto left_8;
			}
		else
			{
			goto left_16;
			}
		}
#endif

	// common return if i==m
	return;

	// clean up loops definitions

#if defined(TARGET_X64_INTEL_HASWELL)
	left_24:
	j = 0;
	for(; j<i & j<n-7; j+=8)
		{
		kernel_sgemm_nt_24x4_vs_lib8(k, &alpha, &pA[i*sda], sda, &pB[0+j*sdb], &beta, &pC[(j+0)*bs+i*sdc], sdc, &pD[(j+0)*bs+i*sdd], sdd, m-i, n-(j+0));
		kernel_sgemm_nt_24x4_vs_lib8(k, &alpha, &pA[i*sda], sda, &pB[4+j*sdb], &beta, &pC[(j+4)*bs+i*sdc], sdc, &pD[(j+4)*bs+i*sdd], sdd, m-i, n-(j+4));
		}
	if(j<n)
		{
		if(j<i) // dtrsm
			{
			kernel_sgemm_nt_24x4_vs_lib8(k, &alpha, &pA[i*sda], sda, &pB[0+j*sdb], &beta, &pC[(j+0)*bs+i*sdc], sdc, &pD[(j+0)*bs+i*sdd], sdd, m-i, n-(j+0));
			if(j<n-4) // 5 6 7
				{
				kernel_sgemm_nt_24x4_vs_lib8(k, &alpha, &pA[i*sda], sda, &pB[4+j*sdb], &beta, &pC[(j+4)*bs+i*sdc], sdc, &pD[(j+4)*bs+i*sdd], sdd, m-i, n-(j+4));
				}
			}
		else // dpotrf
			{
			if(j<n-4) // 5 - 23
				{
				kernel_ssyrk_nt_l_24x4_vs_lib8(k, &alpha, &pA[(i+0)*sda], sda, &pB[(j+0)*sdb], &beta, &pC[(j+0)*bs+(j+0)*sdc], sdc, &pD[(j+0)*bs+(j+0)*sdd], sdd, m-(i+0), n-(j+0));
				kernel_ssyrk_nt_l_20x4_vs_lib8(k, &alpha, &pA[(i+0)*sda], sda, &pB[4+(j+0)*sdb], &beta, &pC[(j+4)*bs+(j+0)*sdc], sdc, &pD[(j+4)*bs+(j+0)*sdd], sdd, m-(i+0), n-(j+4));
				if(j>=n-8)
					return;
				if(j<n-12) // 13 - 23
					{
					kernel_ssyrk_nt_l_16x4_vs_lib8(k, &alpha, &pA[(i+8)*sda], sda, &pB[(j+8)*sdb], &beta, &pC[(j+8)*bs+(j+8)*sdc], sdc, &pD[(j+8)*bs+(j+8)*sdd], sdd, m-(i+8), n-(j+8));
					kernel_ssyrk_nt_l_12x4_vs_lib8(k, &alpha, &pA[(i+8)*sda], sda, &pB[4+(j+8)*sdb], &beta, &pC[(j+12)*bs+(j+8)*sdc], sdc, &pD[(j+12)*bs+(j+8)*sdd], sdd, m-(i+8), n-(j+12));
					if(j>=n-16)
						return;
					if(j<n-20) // 21 - 23
						{
						kernel_ssyrk_nt_l_8x8_vs_lib8(k, &alpha, &pA[(i+16)*sda], &pB[(j+16)*sdb], &beta, &pC[(j+16)*bs+(j+16)*sdc], &pD[(j+16)*bs+(j+16)*sdd], m-(i+16), n-(j+16));
						}
					else // 17 18 19 20
						{
						kernel_ssyrk_nt_l_8x4_vs_lib8(k, &alpha, &pA[(i+16)*sda], &pB[(j+16)*sdb], &beta, &pC[(j+16)*bs+(j+16)*sdc], &pD[(j+16)*bs+(j+16)*sdd], m-(i+16), n-(j+16));
						}
					}
				else // 9 10 11 12
					{
					kernel_ssyrk_nt_l_16x4_vs_lib8(k, &alpha, &pA[(i+8)*sda], sda, &pB[(j+8)*sdb], &beta, &pC[(j+8)*bs+(j+8)*sdc], sdc, &pD[(j+8)*bs+(j+8)*sdd], sdd, m-(i+8), n-(j+8));
					}
				}
			else // 1 2 3 4
				{
				kernel_ssyrk_nt_l_24x4_vs_lib8(k, &alpha, &pA[(i+0)*sda], sda, &pB[j*sdb], &beta, &pC[j*bs+j*sdc], sdc, &pD[j*bs+j*sdd], sdd, m-(i+0), n-j);
				}
			}
		}
	return;
#endif

	left_16:
	j = 0;
	for(; j<i & j<n-7; j+=8)
		{
		kernel_sgemm_nt_16x4_vs_lib8(k, &alpha, &pA[i*sda], sda, &pB[0+j*sdb], &beta, &pC[(j+0)*bs+i*sdc], sdc, &pD[(j+0)*bs+i*sdd], sdd, m-i, n-(j+0));
		kernel_sgemm_nt_16x4_vs_lib8(k, &alpha, &pA[i*sda], sda, &pB[4+j*sdb], &beta, &pC[(j+4)*bs+i*sdc], sdc, &pD[(j+4)*bs+i*sdd], sdd, m-i, n-(j+4));
		}
	if(j<n)
		{
		if(j<i) // dtrsm
			{
			kernel_sgemm_nt_16x4_vs_lib8(k, &alpha, &pA[i*sda], sda, &pB[0+j*sdb], &beta, &pC[(j+0)*bs+i*sdc], sdc, &pD[(j+0)*bs+i*sdd], sdd, m-i, n-(j+0));
			if(j<n-4) // 5 6 7
				{
				kernel_sgemm_nt_16x4_vs_lib8(k, &alpha, &pA[i*sda], sda, &pB[4+j*sdb], &beta, &pC[(j+4)*bs+i*sdc], sdc, &pD[(j+4)*bs+i*sdd], sdd, m-i, n-(j+4));
				}
			}
		else // dpotrf
			{
			if(j<n-4) // 5 - 15
				{
				kernel_ssyrk_nt_l_16x4_vs_lib8(k, &alpha, &pA[(i+0)*sda], sda, &pB[0+j*sdb], &beta, &pC[(j+0)*bs+j*sdc], sdc, &pD[(j+0)*bs+j*sdd], sdd, m-(i+0), n-(j+0));
				kernel_ssyrk_nt_l_12x4_vs_lib8(k, &alpha, &pA[(i+0)*sda], sda, &pB[4+j*sdb], &beta, &pC[(j+4)*bs+j*sdc], sdc, &pD[(j+4)*bs+j*sdd], sdd, m-(i+0), n-(j+4));
				if(j>=n-8)
					return;
				if(j<n-12) // 13 - 15
					{
					kernel_ssyrk_nt_l_8x8_vs_lib8(k, &alpha, &pA[(i+8)*sda], &pB[(j+8)*sdb], &beta, &pC[(j+8)*bs+(j+8)*sdc], &pD[(j+8)*bs+(j+8)*sdd], m-(i+8), n-(j+8));
					}
				else // 9 - 12
					{
					kernel_ssyrk_nt_l_8x4_vs_lib8(k, &alpha, &pA[(i+8)*sda], &pB[(j+8)*sdb], &beta, &pC[(j+8)*bs+(j+8)*sdc], &pD[(j+8)*bs+(j+8)*sdd], m-(i+8), n-(j+8));
					}
				}
			else // 1 2 3 4
				{
				kernel_ssyrk_nt_l_16x4_vs_lib8(k, &alpha, &pA[(i+0)*sda], sda, &pB[j*sdb], &beta, &pC[j*bs+j*sdc], sdc, &pD[j*bs+j*sdd], sdd, m-(i+0), n-j);
				}
			}
		}
	return;

	left_8:
	j = 0;
	for(; j<i & j<n-7; j+=8)
		{
		kernel_sgemm_nt_8x8_vs_lib8(k, &alpha, &pA[i*sda], &pB[j*sdb], &beta, &pC[j*bs+i*sdc], &pD[j*bs+i*sdd], m-i, n-j);
		}
	if(j<n)
		{
		if(j<i) // dtrsm
			{
			if(j<n-4) // 5 6 7
				{
				kernel_sgemm_nt_8x8_vs_lib8(k, &alpha, &pA[i*sda], &pB[j*sdb], &beta, &pC[j*bs+i*sdc], &pD[j*bs+i*sdd], m-i, n-j);
				}
			else // 1 2 3 4
				{
				kernel_sgemm_nt_8x4_vs_lib8(k, &alpha, &pA[i*sda], &pB[j*sdb], &beta, &pC[j*bs+i*sdc], &pD[j*bs+i*sdd], m-i, n-j);
				}
			}
		else // dpotrf
			{
			if(j<n-4) // 5 6 7
				{
				kernel_ssyrk_nt_l_8x8_vs_lib8(k, &alpha, &pA[i*sda], &pB[j*sdb], &beta, &pC[j*bs+j*sdc], &pD[j*bs+j*sdd], m-i, n-j);
				}
			else // 1 2 3 4
				{
				kernel_ssyrk_nt_l_8x4_vs_lib8(k, &alpha, &pA[i*sda], &pB[j*sdb], &beta, &pC[j*bs+j*sdc], &pD[j*bs+j*sdd], m-i, n-j);
				}
			}
		}
	return;

	}


// dtrmm_right_lower_nottransposed_notunit (B, i.e. the first matrix, is triangular !!!)
void strmm_rlnn_libstr(int m, int n, float alpha, struct s_strmat *sB, int bi, int bj, struct s_strmat *sA, int ai, int aj, struct s_strmat *sD, int di, int dj)
	{

	const int bs = 8;

	int sda = sA->cn;
	int sdb = sB->cn;
	int sdd = sD->cn;
	float *pA = sA->pA + aj*bs;
	float *pB = sB->pA + bj*bs;
	float *pD = sD->pA + dj*bs;

	pA += ai/bs*bs*sda;
	pB += bi/bs*bs*sdb;
	int offsetB = bi%bs;
	int di0 = di-ai%bs;
	int offsetD;
	if(di0>=0)
		{
		pD += di0/bs*bs*sdd;
		offsetD = di0%bs;
		}
	else
		{
		pD += -8*sdd;
		offsetD = bs+di0;
		}

	int ii, jj;

	int offsetB4;

	if(offsetB<4)
		{
		offsetB4 = offsetB+4;
		ii = 0;
		if(ai%bs!=0)
			{
			jj = 0;
			for(; jj<n-4; jj+=8)
				{
				kernel_strmm_nn_rl_8x4_gen_lib8(n-jj, &alpha, &pA[ii*sda+jj*bs], offsetB, &pB[jj*sdb+jj*bs], sdb, offsetD, &pD[ii*sdd+jj*bs], sdd, ai%bs, m-ii, 0, n-jj);
				kernel_strmm_nn_rl_8x4_gen_lib8(n-jj-4, &alpha, &pA[ii*sda+(jj+4)*bs], offsetB4, &pB[jj*sdb+(jj+4)*bs], sdb, offsetD, &pD[ii*sdd+(jj+4)*bs], sdd, ai%bs, m-ii, 0, n-jj-4);
				}
			m -= bs-ai%bs;
			pA += bs*sda;
			pD += bs*sdd;
			}
		if(offsetD==0)
			{
#if defined(TARGET_X64_INTEL_HASWELL)
			// XXX create left_24 once the _gen_ kernel exists !!!
			for(; ii<m-23; ii+=24)
				{
				jj = 0;
				for(; jj<n-7; jj+=8)
					{
					kernel_strmm_nn_rl_24x4_lib8(n-jj, &alpha, &pA[ii*sda+jj*bs], sda, offsetB, &pB[jj*sdb+jj*bs], sdb, &pD[ii*sdd+jj*bs], sdd);
					kernel_strmm_nn_rl_24x4_lib8(n-jj-4, &alpha, &pA[ii*sda+(jj+4)*bs], sda, offsetB4, &pB[jj*sdb+(jj+4)*bs], sdb, &pD[ii*sdd+(jj+4)*bs], sdd);
					}
				if(n-jj>0)
					{
					kernel_strmm_nn_rl_24x4_vs_lib8(n-jj, &alpha, &pA[ii*sda+jj*bs], sda, offsetB, &pB[jj*sdb+jj*bs], sdb, &pD[ii*sdd+jj*bs], sdd, 24, n-jj);
					if(n-jj>4)
						{
						kernel_strmm_nn_rl_24x4_vs_lib8(n-jj-4, &alpha, &pA[ii*sda+(jj+4)*bs], sda, offsetB4, &pB[jj*sdb+(jj+4)*bs], sdb, &pD[ii*sdd+(jj+4)*bs], sdd, 24, n-jj-4);
						}
					}
				}
#endif
			for(; ii<m-15; ii+=16)
				{
				jj = 0;
				for(; jj<n-7; jj+=8)
					{
					kernel_strmm_nn_rl_16x4_lib8(n-jj, &alpha, &pA[ii*sda+jj*bs], sda, offsetB, &pB[jj*sdb+jj*bs], sdb, &pD[ii*sdd+jj*bs], sdd);
					kernel_strmm_nn_rl_16x4_lib8(n-jj-4, &alpha, &pA[ii*sda+(jj+4)*bs], sda, offsetB4, &pB[jj*sdb+(jj+4)*bs], sdb, &pD[ii*sdd+(jj+4)*bs], sdd);
					}
				if(n-jj>0)
					{
					kernel_strmm_nn_rl_16x4_vs_lib8(n-jj, &alpha, &pA[ii*sda+jj*bs], sda, offsetB, &pB[jj*sdb+jj*bs], sdb, &pD[ii*sdd+jj*bs], sdd, 16, n-jj);
					if(n-jj>4)
						{
						kernel_strmm_nn_rl_16x4_vs_lib8(n-jj-4, &alpha, &pA[ii*sda+(jj+4)*bs], sda, offsetB4, &pB[jj*sdb+(jj+4)*bs], sdb, &pD[ii*sdd+(jj+4)*bs], sdd, 16, n-jj-4);
						}
					}
				}
			if(m-ii>0)
				{
				if(m-ii<=8)
					goto left_8;
				else
					goto left_16;
				}
			}
		else
			{
			for(; ii<m-8; ii+=16)
				{
				jj = 0;
				for(; jj<n-4; jj+=8)
					{
					kernel_strmm_nn_rl_16x4_gen_lib8(n-jj, &alpha, &pA[ii*sda+jj*bs], sda, offsetB, &pB[jj*sdb+jj*bs], sdb, offsetD, &pD[ii*sdd+jj*bs], sdd, 0, m-ii, 0, n-jj);
					kernel_strmm_nn_rl_16x4_gen_lib8(n-jj-4, &alpha, &pA[ii*sda+(jj+4)*bs], sda, offsetB4, &pB[jj*sdb+(jj+4)*bs], sdb, offsetD, &pD[ii*sdd+(jj+4)*bs], sdd, 0, m-ii, 0, n-jj-4);
					}
				if(n-jj>0)
					{
					kernel_strmm_nn_rl_16x4_gen_lib8(n-jj, &alpha, &pA[ii*sda+jj*bs], sda, offsetB, &pB[jj*sdb+jj*bs], sdb, offsetD, &pD[ii*sdd+jj*bs], sdd, 0, m-ii, 0, n-jj);
					}
				}
			if(m-ii>0)
				goto left_8;
			}
		}
	else
		{
		offsetB4 = offsetB-4;
		ii = 0;
		if(ai%bs!=0)
			{
			jj = 0;
			for(; jj<n-4; jj+=8)
				{
				kernel_strmm_nn_rl_8x4_gen_lib8(n-jj, &alpha, &pA[ii*sda+jj*bs], offsetB, &pB[jj*sdb+jj*bs], sdb, offsetD, &pD[ii*sdd+jj*bs], sdd, ai%bs, m-ii, 0, n-jj);
				kernel_strmm_nn_rl_8x4_gen_lib8(n-jj-4, &alpha, &pA[ii*sda+(jj+4)*bs], offsetB4, &pB[(jj+8)*sdb+(jj+4)*bs], sdb, offsetD, &pD[ii*sdd+(jj+4)*bs], sdd, ai%bs, m-ii, 0, n-jj-4);
				}
			m -= bs-ai%bs;
			pA += bs*sda;
			pD += bs*sdd;
			}
		if(offsetD==0)
			{
			for(; ii<m-15; ii+=16)
				{
				jj = 0;
				for(; jj<n-7; jj+=8)
					{
					kernel_strmm_nn_rl_16x4_lib8(n-jj, &alpha, &pA[ii*sda+jj*bs], sda, offsetB, &pB[jj*sdb+jj*bs], sdb, &pD[ii*sdd+jj*bs], sdd);
					kernel_strmm_nn_rl_16x4_lib8(n-jj-4, &alpha, &pA[ii*sda+(jj+4)*bs], sda, offsetB4, &pB[(jj+8)*sdb+(jj+4)*bs], sdb, &pD[ii*sdd+(jj+4)*bs], sdd);
					}
				if(n-jj>0)
					{
					kernel_strmm_nn_rl_16x4_vs_lib8(n-jj, &alpha, &pA[ii*sda+jj*bs], sda, offsetB, &pB[jj*sdb+jj*bs], sdb, &pD[ii*sdd+jj*bs], sdd, 8, n-jj);
					if(n-jj>4)
						{
						kernel_strmm_nn_rl_16x4_vs_lib8(n-jj-4, &alpha, &pA[ii*sda+(jj+4)*bs], sda, offsetB4, &pB[(jj+8)*sdb+(jj+4)*bs], sdb, &pD[ii*sdd+(jj+4)*bs], sdd, 8, n-jj-4);
						}
					}
				}
			if(m-ii>0)
				{
				if(m-ii<=8)
					goto left_8;
				else
					goto left_16;
				}
			}
		else
			{
			for(; ii<m-8; ii+=16)
				{
				jj = 0;
				for(; jj<n-4; jj+=8)
					{
					kernel_strmm_nn_rl_16x4_gen_lib8(n-jj, &alpha, &pA[ii*sda+jj*bs], sda, offsetB, &pB[jj*sdb+jj*bs], sdb, offsetD, &pD[ii*sdd+jj*bs], sdd, 0, m-ii, 0, n-jj);
					kernel_strmm_nn_rl_16x4_gen_lib8(n-jj-4, &alpha, &pA[ii*sda+(jj+4)*bs], sda, offsetB4, &pB[(jj+8)*sdb+(jj+4)*bs], sdb, offsetD, &pD[ii*sdd+(jj+4)*bs], sdd, 0, m-ii, 0, n-jj-4);
					}
				if(n-jj>0)
					{
					kernel_strmm_nn_rl_16x4_gen_lib8(n-jj, &alpha, &pA[ii*sda+jj*bs], sda, offsetB, &pB[jj*sdb+jj*bs], sdb, offsetD, &pD[ii*sdd+jj*bs], sdd, 0, m-ii, 0, n-jj);
					}
				}
			if(m-ii>0)
				goto left_8;
			}
		}

	// common return if i==m
	return;

	// clean up loops definitions

	left_16:
	if(offsetB<4)
		{
		jj = 0;
		for(; jj<n-4; jj+=8)
			{
			kernel_strmm_nn_rl_16x4_gen_lib8(n-jj, &alpha, &pA[ii*sda+jj*bs], sda, offsetB, &pB[jj*sdb+jj*bs], sdb, offsetD, &pD[ii*sdd+jj*bs], sdd, 0, m-ii, 0, n-jj);
			kernel_strmm_nn_rl_16x4_gen_lib8(n-jj-4, &alpha, &pA[ii*sda+(jj+4)*bs], sda, offsetB4, &pB[jj*sdb+(jj+4)*bs], sdb, offsetD, &pD[ii*sdd+(jj+4)*bs], sdd, 0, m-ii, 0, n-jj-4);
			}
		if(n-jj>0)
			{
			kernel_strmm_nn_rl_16x4_gen_lib8(n-jj, &alpha, &pA[ii*sda+jj*bs], sda, offsetB, &pB[jj*sdb+jj*bs], sdb, offsetD, &pD[ii*sdd+jj*bs], sdd, 0, m-ii, 0, n-jj);
			}
		}
	else
		{
		jj = 0;
		for(; jj<n-4; jj+=8)
			{
			kernel_strmm_nn_rl_16x4_gen_lib8(n-jj, &alpha, &pA[ii*sda+jj*bs], sda, offsetB, &pB[jj*sdb+jj*bs], sdb, offsetD, &pD[ii*sdd+jj*bs], sdd, 0, m-ii, 0, n-jj);
			kernel_strmm_nn_rl_16x4_gen_lib8(n-jj-4, &alpha, &pA[ii*sda+(jj+4)*bs], sda, offsetB4, &pB[(jj+8)*sdb+(jj+4)*bs], sdb, offsetD, &pD[ii*sdd+(jj+4)*bs], sdd, 0, m-ii, 0, n-jj-4);
			}
		if(n-jj>0)
			{
			kernel_strmm_nn_rl_16x4_gen_lib8(n-jj, &alpha, &pA[ii*sda+jj*bs], sda, offsetB, &pB[jj*sdb+jj*bs], sdb, offsetD, &pD[ii*sdd+jj*bs], sdd, 0, m-ii, 0, n-jj);
			}
		}
	return;

	left_8:
	if(offsetB<4)
		{
		jj = 0;
		for(; jj<n-4; jj+=8)
			{
			kernel_strmm_nn_rl_8x4_gen_lib8(n-jj, &alpha, &pA[ii*sda+jj*bs], offsetB, &pB[jj*sdb+jj*bs], sdb, offsetD, &pD[ii*sdd+jj*bs], sdd, 0, m-ii, 0, n-jj);
			kernel_strmm_nn_rl_8x4_gen_lib8(n-jj-4, &alpha, &pA[ii*sda+(jj+4)*bs], offsetB4, &pB[jj*sdb+(jj+4)*bs], sdb, offsetD, &pD[ii*sdd+(jj+4)*bs], sdd, 0, m-ii, 0, n-jj-4);
			}
		if(n-jj>0)
			{
			kernel_strmm_nn_rl_8x4_gen_lib8(n-jj, &alpha, &pA[ii*sda+jj*bs], offsetB, &pB[jj*sdb+jj*bs], sdb, offsetD, &pD[ii*sdd+jj*bs], sdd, 0, m-ii, 0, n-jj);
			}
		}
	else
		{
		jj = 0;
		for(; jj<n-4; jj+=8)
			{
			kernel_strmm_nn_rl_8x4_gen_lib8(n-jj, &alpha, &pA[ii*sda+jj*bs], offsetB, &pB[jj*sdb+jj*bs], sdb, offsetD, &pD[ii*sdd+jj*bs], sdd, 0, m-ii, 0, n-jj);
			kernel_strmm_nn_rl_8x4_gen_lib8(n-jj-4, &alpha, &pA[ii*sda+(jj+4)*bs], offsetB4, &pB[(jj+8)*sdb+(jj+4)*bs], sdb, offsetD, &pD[ii*sdd+(jj+4)*bs], sdd, 0, m-ii, 0, n-jj-4);
			}
		if(n-jj>0)
			{
			kernel_strmm_nn_rl_8x4_gen_lib8(n-jj, &alpha, &pA[ii*sda+jj*bs], offsetB, &pB[jj*sdb+jj*bs], sdb, offsetD, &pD[ii*sdd+jj*bs], sdd, 0, m-ii, 0, n-jj);
			}
		}
	return;

	}
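
/*
 * For reference, strmm_rlnn_libstr computes D = alpha * A * B with the
 * triangular factor B applied from the right ("rlnn": right, lower,
 * not-transposed, not-unit-diagonal). A naive column-major equivalent
 * (illustration only, not part of the library):
 *
 *	for(jj=0; jj<n; jj++)
 *		for(ii=0; ii<m; ii++)
 *			{
 *			float tmp = 0.0;
 *			for(ll=jj; ll<n; ll++) // B lower triangular: B(ll,jj)!=0 only for ll>=jj
 *				tmp += A[ii+ll*lda] * B[ll+jj*ldb];
 *			D[ii+jj*ldd] = alpha*tmp;
 *			}
 */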


// dtrsm_right_lower_transposed_notunit
void strsm_rltn_libstr(int m, int n, float alpha, struct s_strmat *sA, int ai, int aj, struct s_strmat *sB, int bi, int bj, struct s_strmat *sD, int di, int dj)
	{

	if(ai!=0 | bi!=0 | di!=0 | alpha!=1.0)
		{
		printf("\nstrsm_rltn_libstr: feature not implemented yet: ai=%d, bi=%d, di=%d, alpha=%f\n", ai, bi, di, alpha);
		exit(1);
		}

	const int bs = 8;

	// TODO alpha

	int sda = sA->cn;
	int sdb = sB->cn;
	int sdd = sD->cn;
	float *pA = sA->pA + aj*bs;
	float *pB = sB->pA + bj*bs;
	float *pD = sD->pA + dj*bs;
	float *dA = sA->dA;

	int i, j;

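	// cache the inverted diagonal of A in sA->dA: every trsm kernel call
	// below multiplies by these reciprocals instead of dividing; the
	// use_dA flag marks the cache as valid, and it is only trusted when
	// the A sub-matrix starts at the (0,0) corner of sA.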
	if(ai==0 & aj==0)
		{
		if(sA->use_dA!=1)
			{
			sdiaex_lib(n, 1.0, ai, pA, sda, dA);
			for(i=0; i<n; i++)
				dA[i] = 1.0 / dA[i];
			sA->use_dA = 1;
			}
		}
	else
		{
		sdiaex_lib(n, 1.0, ai, pA, sda, dA);
		for(i=0; i<n; i++)
			dA[i] = 1.0 / dA[i];
		sA->use_dA = 0;
		}

	if(m<=0 || n<=0)
		return;

	i = 0;

	for(; i<m-7; i+=8)
		{
		j = 0;
		for(; j<n-7; j+=8)
			{
			kernel_strsm_nt_rl_inv_8x4_lib8(j+0, &pD[i*sdd], &pA[0+j*sda], &pB[(j+0)*bs+i*sdb], &pD[(j+0)*bs+i*sdd], &pA[0+(j+0)*bs+j*sda], &dA[j+0]);
			kernel_strsm_nt_rl_inv_8x4_lib8(j+4, &pD[i*sdd], &pA[4+j*sda], &pB[(j+4)*bs+i*sdb], &pD[(j+4)*bs+i*sdd], &pA[4+(j+4)*bs+j*sda], &dA[j+4]); // diagonal block at column j+4: use &dA[j+4] (was &dA[j+0]), matching the _vs_ calls below
			}
		if(n-j>0)
			{
			kernel_strsm_nt_rl_inv_8x4_vs_lib8(j+0, &pD[i*sdd], &pA[0+j*sda], &pB[(j+0)*bs+i*sdb], &pD[(j+0)*bs+i*sdd], &pA[0+(j+0)*bs+j*sda], &dA[j+0], m-i, n-j-0);
			if(n-j>4)
				{
				kernel_strsm_nt_rl_inv_8x4_vs_lib8(j+4, &pD[i*sdd], &pA[4+j*sda], &pB[(j+4)*bs+i*sdb], &pD[(j+4)*bs+i*sdd], &pA[4+(j+4)*bs+j*sda], &dA[j+4], m-i, n-j-4);
				}
			}
		}
	if(m>i)
		{
		goto left_8;
		}

	// common return if i==m
	return;

	left_8:
	j = 0;
	for(; j<n-4; j+=8)
		{
		kernel_strsm_nt_rl_inv_8x4_vs_lib8(j+0, &pD[i*sdd], &pA[0+j*sda], &pB[(j+0)*bs+i*sdb], &pD[(j+0)*bs+i*sdd], &pA[0+(j+0)*bs+j*sda], &dA[j+0], m-i, n-j-0);
		kernel_strsm_nt_rl_inv_8x4_vs_lib8(j+4, &pD[i*sdd], &pA[4+j*sda], &pB[(j+4)*bs+i*sdb], &pD[(j+4)*bs+i*sdd], &pA[4+(j+4)*bs+j*sda], &dA[j+4], m-i, n-j-4);
		}
	if(n-j>0)
		{
		kernel_strsm_nt_rl_inv_8x4_vs_lib8(j+0, &pD[i*sdd], &pA[0+j*sda], &pB[(j+0)*bs+i*sdb], &pD[(j+0)*bs+i*sdd], &pA[0+(j+0)*bs+j*sda], &dA[j+0], m-i, n-j-0);
		}
	return;

	}