blob: 243d559576ebda6fc0b74a8a2733b92955450b20 [file] [log] [blame]
Austin Schuh9a24b372018-01-28 16:12:29 -08001/**************************************************************************************************
2* *
3* This file is part of BLASFEO. *
4* *
5* BLASFEO -- BLAS For Embedded Optimization. *
6* Copyright (C) 2016-2017 by Gianluca Frison. *
7* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl. *
8* All rights reserved. *
9* *
10* HPMPC is free software; you can redistribute it and/or *
11* modify it under the terms of the GNU Lesser General Public *
12* License as published by the Free Software Foundation; either *
13* version 2.1 of the License, or (at your option) any later version. *
14* *
15* HPMPC is distributed in the hope that it will be useful, *
16* but WITHOUT ANY WARRANTY; without even the implied warranty of *
17* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. *
18* See the GNU Lesser General Public License for more details. *
19* *
20* You should have received a copy of the GNU Lesser General Public *
21* License along with HPMPC; if not, write to the Free Software *
22* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA *
23* *
24* Author: Gianluca Frison, giaf (at) dtu.dk *
25* gianluca.frison (at) imtek.uni-freiburg.de *
26* *
27**************************************************************************************************/
28
29#include <math.h>
30
31
32
33#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
/*
 * Generic C 4x4 single-precision GEMM micro-kernel, "NT" form, with row offsets
 * and row/column masking:
 *
 *     T = beta[0] * C + alpha[0] * A * B^T        (4x4 tile, T[i][j])
 *
 * and the elements of T with m0 <= i < m1 and n0 <= j < min(n1,4) are stored to D.
 *
 * kmax     number of rank-1 updates; no product is accumulated when kmax<=0
 * alpha    pointer to the scalar applied to A * B^T
 * A        left operand, element (i,k) read from A[i+4*k]
 * B        right operand, element (j,k) read from B[j+4*k]
 * beta     pointer to the scalar applied to C (C is read even if beta[0] is 0)
 * offsetC  row offset of the tile inside the 4-row panel at C0; values other than
 *          0, 1, 2 are treated as 3. Rows that fall past the panel are read from
 *          the panel starting at C0 + 4*sdc
 * C0, sdc  first C panel and the stride (in units of 4 floats) to the next one
 * offsetD, D0, sdd   same as offsetC, C0, sdc but for the destination
 * m0, m1   tile rows i with m0 <= i < m1 are stored
 * n0, n1   tile columns j with n0 <= j < n1 are stored; D0 is advanced by n0
 *          columns (at most 3) before storing
 *
 * All of C is read before anything is written, so D may alias C.
 */
void kernel_sgemm_nt_4x4_gen_lib4(int kmax, float *alpha, float *A, float *B, float *beta, int offsetC, float *C0, int sdc, int offsetD, float *D0, int sdd, int m0, int m1, int n0, int n1)
	{

	const int bs = 4;

	// acc[i+bs*j] is element (i,j) of the tile
	float acc[16] = {0};

	float *src, *dst;

	int i, j, k, row, rowoffC, rowoffD, shift, kn;

	// acc += A * B^T, one rank-1 update per k, in increasing k order
	for(k=0; k<kmax; k++)
		{
		for(j=0; j<bs; j++)
			{
			for(i=0; i<bs; i++)
				{
				acc[i+bs*j] += A[i] * B[j];
				}
			}
		A += bs;
		B += bs;
		}

	// acc = beta*C + alpha*acc, with the tile starting rowoffC rows into the C panel
	rowoffC = (offsetC==0 || offsetC==1 || offsetC==2) ? offsetC : 3;
	for(j=0; j<bs; j++)
		{
		for(i=0; i<bs; i++)
			{
			row = rowoffC + i;
			// rows past the end of the first panel continue at the top of the next one
			src = row<bs ? C0 + row : C0 + sdc*bs + (row-bs);
			acc[i+bs*j] = beta[0]*src[bs*j] + alpha[0]*acc[i+bs*j];
			}
		}

	// skip the first n0 columns of the tile (any n0 above 3 is handled as 3)
	shift = 0;
	if(n0>0)
		{
		shift = n0<3 ? n0 : 3;
		D0 += shift*bs;
		}

	// number of columns to store: the tile has only bs columns, so n1 is capped at
	// bs and no column beyond the ones left after the shift is ever written
	kn = (n1<bs ? n1 : bs) - n0;
	if(kn>bs-shift)
		kn = bs-shift;
	if(kn<=0)
		return;

	// masked store, with the tile starting rowoffD rows into the D panel
	rowoffD = (offsetD==0 || offsetD==1 || offsetD==2) ? offsetD : 3;
	for(j=0; j<kn; j++)
		{
		for(i=0; i<bs; i++)
			{
			if(i>=m0 && i<m1)
				{
				row = rowoffD + i;
				dst = row<bs ? D0 + row : D0 + sdd*bs + (row-bs);
				dst[bs*j] = acc[i+bs*(j+shift)];
				}
			}
		}

	return;

	}
521#endif
522
523
524
525#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
/*
 * Generic C 4x4 single-precision GEMM micro-kernel, "NT" form, variable-size store:
 *
 *     T = beta[0] * C + alpha[0] * A * B^T        (4x4 tile)
 *
 * and the leading min(km,4) rows and min(kn,4) columns of T are written to D.
 *
 * kmax   number of rank-1 updates; no product is accumulated when kmax<=0
 * alpha  pointer to the scalar applied to A * B^T
 * A      left operand, element (i,k) read from A[i+4*k]
 * B      right operand, element (j,k) read from B[j+4*k]
 * beta   pointer to the scalar applied to C (C is read even if beta[0] is 0)
 * C      4x4 input tile, element (i,j) at C[i+4*j]
 * D      4x4 output tile, element (i,j) at D[i+4*j]
 * km     number of rows to store; nothing is stored when km<=0
 * kn     number of columns to store; nothing is stored when kn<=0
 *
 * The whole 4x4 tile of C is read before anything is written, so D may alias C.
 */
void kernel_sgemm_nt_4x4_vs_lib4(int kmax, float *alpha, float *A, float *B, float *beta, float *C, float *D, int km, int kn)
	{

	const int bs = 4;

	// acc[i+bs*j] is element (i,j) of the tile
	float acc[16] = {0};

	int i, j, k, rows, cols;

	// an empty sub-tile has nothing to store
	if(km<=0 || kn<=0)
		return;

	// acc += A * B^T, one rank-1 update per k, in increasing k order
	for(k=0; k<kmax; k++)
		{
		for(j=0; j<bs; j++)
			{
			for(i=0; i<bs; i++)
				{
				acc[i+bs*j] += A[i] * B[j];
				}
			}
		A += bs;
		B += bs;
		}

	// acc = beta*C + alpha*acc
	for(j=0; j<bs; j++)
		{
		for(i=0; i<bs; i++)
			{
			acc[i+bs*j] = beta[0]*C[i+bs*j] + alpha[0]*acc[i+bs*j];
			}
		}

	// store the requested top-left corner, column by column
	rows = km<bs ? km : bs;
	cols = kn<bs ? kn : bs;
	for(j=0; j<cols; j++)
		{
		for(i=0; i<rows; i++)
			{
			D[i+bs*j] = acc[i+bs*j];
			}
		}

	}
842#endif
843
844
845
846#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER)
/*
 * Full-tile "NT" GEMM kernel: forwards to kernel_sgemm_nt_4x4_vs_lib4 asking for
 * all 4 rows and all 4 columns of the tile to be stored.
 */
void kernel_sgemm_nt_4x4_lib4(int kmax, float *alpha, float *A, float *B, float *beta, float *C, float *D)
	{
	const int km = 4; // rows to store
	const int kn = 4; // columns to store

	kernel_sgemm_nt_4x4_vs_lib4(kmax, alpha, A, B, beta, C, D, km, kn);
	}
851#endif
852
853
854
855#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
/*
 * Generic C 4x4 single-precision GEMM micro-kernel, "NN" form, variable-size store:
 *
 *     T = beta[0] * C + alpha[0] * A * B          (4x4 tile)
 *
 * and the leading min(km,4) rows and min(kn,4) columns of T are written to D.
 *
 * kmax   number of rank-1 updates; no product is accumulated when kmax<=0
 * alpha  pointer to the scalar applied to A * B
 * A      left operand, element (i,k) read from A[i+4*k]
 * B      right operand, stored in 4-row panels: element (k,j) is read from
 *        B[(k/4)*4*sdb + k%4 + 4*j]
 * sdb    stride between consecutive B panels, in units of 4 floats
 * beta   pointer to the scalar applied to C (C is read even if beta[0] is 0)
 * C      4x4 input tile, element (i,j) at C[i+4*j]
 * D      4x4 output tile, element (i,j) at D[i+4*j]
 * km     number of rows to store; nothing is stored when km<=0
 * kn     number of columns to store; nothing is stored when kn<=0
 *
 * The whole 4x4 tile of C is read before anything is written, so D may alias C.
 */
void kernel_sgemm_nn_4x4_vs_lib4(int kmax, float *alpha, float *A, float *B, int sdb, float *beta, float *C, float *D, int km, int kn)
	{

	const int bs = 4;

	// acc[i+bs*j] is element (i,j) of the tile
	float acc[16] = {0};

	int i, j, k, kk, rows, cols;

	// an empty sub-tile has nothing to store
	if(km<=0 || kn<=0)
		return;

	// complete groups of bs updates: the bs rows of B they use sit in one panel
	for(k=0; k<kmax-3; k+=4)
		{
		for(kk=0; kk<bs; kk++)
			{
			for(j=0; j<bs; j++)
				{
				for(i=0; i<bs; i++)
					{
					acc[i+bs*j] += A[i+bs*kk] * B[kk+bs*j];
					}
				}
			}
		A += bs*bs;
		B += bs*sdb; // jump to the next panel of B
		}

	// left-over updates: step row by row inside the last panel of B
	for(; k<kmax; k++)
		{
		for(j=0; j<bs; j++)
			{
			for(i=0; i<bs; i++)
				{
				acc[i+bs*j] += A[i] * B[bs*j];
				}
			}
		A += bs;
		B += 1;
		}

	// acc = beta*C + alpha*acc
	for(j=0; j<bs; j++)
		{
		for(i=0; i<bs; i++)
			{
			acc[i+bs*j] = beta[0]*C[i+bs*j] + alpha[0]*acc[i+bs*j];
			}
		}

	// store the requested top-left corner, column by column
	rows = km<bs ? km : bs;
	cols = kn<bs ? kn : bs;
	for(j=0; j<cols; j++)
		{
		for(i=0; i<rows; i++)
			{
			D[i+bs*j] = acc[i+bs*j];
			}
		}

	}
1172#endif
1173
1174
1175
1176#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
/*
 * Full-tile "NN" GEMM kernel: forwards to kernel_sgemm_nn_4x4_vs_lib4 asking for
 * all 4 rows and all 4 columns of the tile to be stored.
 */
void kernel_sgemm_nn_4x4_lib4(int kmax, float *alpha, float *A, float *B, int sdb, float *beta, float *C, float *D)
	{
	const int km = 4; // rows to store
	const int kn = 4; // columns to store

	kernel_sgemm_nn_4x4_vs_lib4(kmax, alpha, A, B, sdb, beta, C, D, km, kn);
	}
1181#endif
1182
1183
1184
1185#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
/*
 * Generic C 4x4 single-precision micro-kernel computing only the lower triangle
 * (row index i >= column index j) of
 *
 *     T = beta[0] * C + alpha[0] * A * B^T        (4x4 tile)
 *
 * and writing the part of that triangle lying in the leading min(km,4) rows and
 * min(kn,4) columns to D. Elements above the diagonal are neither read from C nor
 * written to D.
 *
 * kmax   number of rank-1 updates; no product is accumulated when kmax<=0
 * alpha  pointer to the scalar applied to A * B^T
 * A      left operand, element (i,k) read from A[i+4*k]
 * B      right operand, element (j,k) read from B[j+4*k]
 * beta   pointer to the scalar applied to C (C is read even if beta[0] is 0)
 * C      4x4 input tile, element (i,j) at C[i+4*j]
 * D      4x4 output tile, element (i,j) at D[i+4*j]
 * km     number of rows to store; nothing is stored when km<=0
 * kn     number of columns to store; nothing is stored when kn<=0
 *
 * The lower triangle of C is read completely before anything is written, so D
 * may alias C.
 */
void kernel_ssyrk_nt_l_4x4_vs_lib4(int kmax, float *alpha, float *A, float *B, float *beta, float *C, float *D, int km, int kn)
	{

	const int bs = 4;

	// acc[i+bs*j] is element (i,j) of the tile; only entries with i>=j are used
	float acc[16] = {0};

	int i, j, k, rows, cols;

	// an empty sub-tile has nothing to store
	if(km<=0 || kn<=0)
		return;

	// acc += lower triangle of A * B^T, one rank-1 update per k, in increasing k order
	for(k=0; k<kmax; k++)
		{
		for(j=0; j<bs; j++)
			{
			for(i=j; i<bs; i++)
				{
				acc[i+bs*j] += A[i] * B[j];
				}
			}
		A += bs;
		B += bs;
		}

	// acc = beta*C + alpha*acc on the lower triangle
	for(j=0; j<bs; j++)
		{
		for(i=j; i<bs; i++)
			{
			acc[i+bs*j] = beta[0]*C[i+bs*j] + alpha[0]*acc[i+bs*j];
			}
		}

	// store the requested part of the lower triangle, column by column
	rows = km<bs ? km : bs;
	cols = kn<bs ? kn : bs;
	for(j=0; j<cols; j++)
		{
		for(i=j; i<rows; i++)
			{
			D[i+bs*j] = acc[i+bs*j];
			}
		}

	}
1502#endif
1503
1504
1505
1506#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
/*
 * Full-tile lower-triangular kernel: forwards to kernel_ssyrk_nt_l_4x4_vs_lib4
 * asking for all 4 rows and all 4 columns of the tile to be stored.
 */
void kernel_ssyrk_nt_l_4x4_lib4(int kmax, float *alpha, float *A, float *B, float *beta, float *C, float *D)
	{
	const int km = 4; // rows to store
	const int kn = 4; // columns to store

	kernel_ssyrk_nt_l_4x4_vs_lib4(kmax, alpha, A, B, beta, C, D, km, kn);
	}
1511#endif
1512
1513
1514
1515#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
// Computes one 4x4 tile D = beta*C + alpha*(A*B^T), where step k contributes
// A[i]*B[j] to entry (i,j). At steps k = 0, 1, 2 only B[0..k] is read and
// used (the triangular corner of B); from k = 3 on, all four entries of the
// current B column are used.
//
//   kmax         number of k-steps; A and B each supply 4 floats per step
//   alpha, beta  pointers to the scalars (element 0 of each is read)
//   A, B         input panels, consumed 4 floats per k-step
//   C            4x4 input tile with column stride 4; all 16 entries are read
//   D            4x4 output tile with column stride 4
//   km           rows stored: 4 if km>=4, else 3, 2, or 1 (km<=1 stores one row)
//   kn           columns stored: kn when kn is 1, 2 or 3, otherwise all 4
//
// Fix relative to the previous version: the "k = 1" and "k = 2" corner steps
// were guarded by `kmax>0`, so a call with kmax = 1 or 2 still executed them
// and accumulated A/B data lying beyond kmax. Every step now runs only while
// k < kmax. Results for kmax <= 0 and kmax >= 3 are unchanged.
void kernel_strmm_nt_ru_4x4_vs_lib4(int kmax, float *alpha, float *A, float *B, float *beta, float *C, float *D, int km, int kn)
	{

	const int bs = 4;

	// acc[i][j] accumulates row i, column j of the product
	float acc[4][4] = {{0}};

	float alpha0, beta0;

	int i, j, k, nb, rows, cols;

	for(k=0; k<kmax; k++)
		{
		// triangular corner: step k touches columns 0..k only, for k < 3
		nb = k<bs-1 ? k+1 : bs;

		for(j=0; j<nb; j++)
			{
			for(i=0; i<bs; i++)
				{
				acc[i][j] += A[i] * B[j];
				}
			}

		A += bs;
		B += bs;
		}

	// scale and add C; every C entry is read before anything is written to D,
	// so D may be the same tile as C
	alpha0 = alpha[0];
	beta0 = beta[0];
	for(j=0; j<bs; j++)
		{
		for(i=0; i<bs; i++)
			{
			acc[i][j] = beta0*C[i+bs*j] + alpha0*acc[i][j];
			}
		}

	// number of rows / columns actually written back
	rows = km>=4 ? 4 : (km>=3 ? 3 : (km>=2 ? 2 : 1));
	cols = (kn==1 || kn==2 || kn==3) ? kn : 4;

	for(j=0; j<cols; j++)
		{
		for(i=0; i<rows; i++)
			{
			D[i+bs*j] = acc[i][j];
			}
		}

	}
1912#endif
1913
1914
1915
1916
1917#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
// Fixed-size entry point: forwards all arguments unchanged to
// kernel_strmm_nt_ru_4x4_vs_lib4 with both size bounds set to 4.
void kernel_strmm_nt_ru_4x4_lib4(int k, float *alpha, float *A, float *B, float *beta, float *C, float *D)
	{
	const int km = 4; // row bound handed to the variable-size kernel
	const int kn = 4; // column bound handed to the variable-size kernel

	kernel_strmm_nt_ru_4x4_vs_lib4(k, alpha, A, B, beta, C, D, km, kn);
	}
1922#endif
1923
1924
1925
1926#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
// Computes one 4x4 tile alpha*(A*B) and writes a masked part of it to D.
// Step k reads A[0..3] and the entries B[0], B[4], B[8], B[12] of the current
// B row; steps k = 0, 1, 2 read and use only the first k+1 of those entries
// (the triangular corner of B).
//
//   kmax      number of k-steps; the first step is executed unconditionally,
//             even when kmax <= 0
//   alpha     pointer to the scalar (element 0 is read)
//   A         input, consumed 4 floats per k-step
//   offsetB   added to B on entry; 0, 1, 2 select the starting row inside the
//             4-row panel, any other value is handled like 3
//   B, sdb    input; after row 3 of a panel the pointer moves by 4*sdb-3 to
//             row 0 of the next panel, otherwise by 1
//   offsetD   row offset of the tile inside D0 (0, 1, 2; anything else acts as
//             3); rows that fall past row 3 are written at D0 + 4*sdd
//   D0, sdd   output panel and the stride (in units of 4 floats) to the next one
//   m0, m1    row i of the tile is stored only when m0 <= i < m1
//   n0, n1    the first n0 columns (at most 3) are dropped and D0 is advanced
//             by the same number of columns; n1-n0 columns (at most 4) are stored
void kernel_strmm_nn_rl_4x4_gen_lib4(int kmax, float *alpha, float *A, int offsetB, float *B, int sdb, int offsetD, float *D0, int sdd, int m0, int m1, int n0, int n1)
	{

	const int bs = 4;

	// acc[i][j] accumulates row i, column j of the product
	float acc[4][4] = {{0}};

	float scale;
	float *dst;

	int i, j, k;
	int ncols; // number of B entries used at the current step
	int brow;  // row of the current B panel read at the current step
	int shift; // number of leading columns dropped because of n0
	int drow;  // row offset of the tile inside D0
	int pos;   // destination row counted from the start of D0
	int kn;

	B += offsetB;

	brow = (offsetB==0 || offsetB==1 || offsetB==2) ? offsetB : bs-1;

	k = 0;
	do
		{
		// triangular corner: step k touches columns 0..k only, for k < 3
		ncols = k<bs-1 ? k+1 : bs;

		for(j=0; j<ncols; j++)
			{
			for(i=0; i<bs; i++)
				{
				acc[i][j] += A[i] * B[bs*j];
				}
			}

		A += bs;

		if(brow==bs-1)
			{
			// last row of this panel: jump to row 0 of the next panel
			B += bs*sdb-(bs-1);
			brow = 0;
			}
		else
			{
			B += 1;
			brow++;
			}

		k++;
		}
	while(k<kmax);

	scale = alpha[0];
	for(j=0; j<bs; j++)
		{
		for(i=0; i<bs; i++)
			{
			acc[i][j] = scale*acc[i][j];
			}
		}

	// shift the solution left by the dropped columns; the trailing columns
	// keep their previous contents
	if(n0>0)
		{
		shift = n0==1 ? 1 : (n0==2 ? 2 : 3);

		for(j=0; j+shift<bs; j++)
			{
			for(i=0; i<bs; i++)
				{
				acc[i][j] = acc[i][j+shift];
				}
			}

		D0 += shift*bs;
		}

	kn = n1 - n0;

	drow = (offsetD==0 || offsetD==1 || offsetD==2) ? offsetD : bs-1;

	for(j=0; j<bs; j++)
		{
		if(kn<=j)
			return;

		for(i=0; i<bs; i++)
			{
			if(m0<=i && m1>i)
				{
				pos = drow+i;
				// rows pushed past the end of this panel go to the next one
				dst = pos<bs ? D0+pos : D0+sdd*bs+(pos-bs);
				dst[bs*j] = acc[i][j];
				}
			}
		}

	return;

	}
2889#endif
2890
2891
2892
2893#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
// Fixed-size entry point: calls kernel_strmm_nn_rl_4x4_gen_lib4 with no
// output offset (offsetD = 0, sdd = 0) and the full row range [0,4) and
// column range [0,4).
void kernel_strmm_nn_rl_4x4_lib4(int kmax, float *alpha, float *A, int offsetB, float *B, int sdb, float *D)
	{
	const int offsetD = 0;
	const int sdd = 0;
	const int m0 = 0, m1 = 4; // row range handed to the general kernel
	const int n0 = 0, n1 = 4; // column range handed to the general kernel

	kernel_strmm_nn_rl_4x4_gen_lib4(kmax, alpha, A, offsetB, B, sdb, offsetD, D, sdd, m0, m1, n0, n1);
	}
2898#endif
2899
2900
2901
2902#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
2903void kernel_spotrf_nt_l_4x4_vs_lib4(int kmax, float *A, float *B, float *C, float *D, float *inv_diag_D, int km, int kn)
2904 {
2905
2906 const int bs = 4;
2907
2908 float
2909 a_0, a_1, a_2, a_3,
2910 b_0, b_1, b_2, b_3,
2911 tmp,
2912 c_00=0, //c_01=0, c_02=0, c_03=0,
2913 c_10=0, c_11=0, //c_12=0, c_13=0,
2914 c_20=0, c_21=0, c_22=0, //c_23=0,
2915 c_30=0, c_31=0, c_32=0, c_33=0;
2916
2917 int k;
2918
2919 for(k=0; k<kmax-3; k+=4)
2920 {
2921
2922 // k = 0
2923
2924 a_0 = A[0];
2925 a_1 = A[1];
2926 a_2 = A[2];
2927 a_3 = A[3];
2928
2929 b_0 = B[0];
2930 b_1 = B[1];
2931 b_2 = B[2];
2932 b_3 = B[3];
2933
2934 c_00 -= a_0 * b_0;
2935 c_10 -= a_1 * b_0;
2936 c_20 -= a_2 * b_0;
2937 c_30 -= a_3 * b_0;
2938
2939// c_01 -= a_0 * b_1;
2940 c_11 -= a_1 * b_1;
2941 c_21 -= a_2 * b_1;
2942 c_31 -= a_3 * b_1;
2943
2944// c_02 -= a_0 * b_2;
2945// c_12 -= a_1 * b_2;
2946 c_22 -= a_2 * b_2;
2947 c_32 -= a_3 * b_2;
2948
2949// c_03 -= a_0 * b_3;
2950// c_13 -= a_1 * b_3;
2951// c_23 -= a_2 * b_3;
2952 c_33 -= a_3 * b_3;
2953
2954
2955 // k = 1
2956
2957 a_0 = A[4];
2958 a_1 = A[5];
2959 a_2 = A[6];
2960 a_3 = A[7];
2961
2962 b_0 = B[4];
2963 b_1 = B[5];
2964 b_2 = B[6];
2965 b_3 = B[7];
2966
2967 c_00 -= a_0 * b_0;
2968 c_10 -= a_1 * b_0;
2969 c_20 -= a_2 * b_0;
2970 c_30 -= a_3 * b_0;
2971
2972// c_01 -= a_0 * b_1;
2973 c_11 -= a_1 * b_1;
2974 c_21 -= a_2 * b_1;
2975 c_31 -= a_3 * b_1;
2976
2977// c_02 -= a_0 * b_2;
2978// c_12 -= a_1 * b_2;
2979 c_22 -= a_2 * b_2;
2980 c_32 -= a_3 * b_2;
2981
2982// c_03 -= a_0 * b_3;
2983// c_13 -= a_1 * b_3;
2984// c_23 -= a_2 * b_3;
2985 c_33 -= a_3 * b_3;
2986
2987
2988 // k = 2
2989
2990 a_0 = A[8];
2991 a_1 = A[9];
2992 a_2 = A[10];
2993 a_3 = A[11];
2994
2995 b_0 = B[8];
2996 b_1 = B[9];
2997 b_2 = B[10];
2998 b_3 = B[11];
2999
3000 c_00 -= a_0 * b_0;
3001 c_10 -= a_1 * b_0;
3002 c_20 -= a_2 * b_0;
3003 c_30 -= a_3 * b_0;
3004
3005// c_01 -= a_0 * b_1;
3006 c_11 -= a_1 * b_1;
3007 c_21 -= a_2 * b_1;
3008 c_31 -= a_3 * b_1;
3009
3010// c_02 -= a_0 * b_2;
3011// c_12 -= a_1 * b_2;
3012 c_22 -= a_2 * b_2;
3013 c_32 -= a_3 * b_2;
3014
3015// c_03 -= a_0 * b_3;
3016// c_13 -= a_1 * b_3;
3017// c_23 -= a_2 * b_3;
3018 c_33 -= a_3 * b_3;
3019
3020
3021 // k = 3
3022
3023 a_0 = A[12];
3024 a_1 = A[13];
3025 a_2 = A[14];
3026 a_3 = A[15];
3027
3028 b_0 = B[12];
3029 b_1 = B[13];
3030 b_2 = B[14];
3031 b_3 = B[15];
3032
3033 c_00 -= a_0 * b_0;
3034 c_10 -= a_1 * b_0;
3035 c_20 -= a_2 * b_0;
3036 c_30 -= a_3 * b_0;
3037
3038// c_01 -= a_0 * b_1;
3039 c_11 -= a_1 * b_1;
3040 c_21 -= a_2 * b_1;
3041 c_31 -= a_3 * b_1;
3042
3043// c_02 -= a_0 * b_2;
3044// c_12 -= a_1 * b_2;
3045 c_22 -= a_2 * b_2;
3046 c_32 -= a_3 * b_2;
3047
3048// c_03 -= a_0 * b_3;
3049// c_13 -= a_1 * b_3;
3050// c_23 -= a_2 * b_3;
3051 c_33 -= a_3 * b_3;
3052
3053 A += 16;
3054 B += 16;
3055
3056 }
3057
3058 for(; k<kmax; k++)
3059 {
3060
3061 // k = 0
3062
3063 a_0 = A[0];
3064 a_1 = A[1];
3065 a_2 = A[2];
3066 a_3 = A[3];
3067
3068 b_0 = B[0];
3069 b_1 = B[1];
3070 b_2 = B[2];
3071 b_3 = B[3];
3072
3073 c_00 -= a_0 * b_0;
3074 c_10 -= a_1 * b_0;
3075 c_20 -= a_2 * b_0;
3076 c_30 -= a_3 * b_0;
3077
3078// c_01 -= a_0 * b_1;
3079 c_11 -= a_1 * b_1;
3080 c_21 -= a_2 * b_1;
3081 c_31 -= a_3 * b_1;
3082
3083// c_02 -= a_0 * b_2;
3084// c_12 -= a_1 * b_2;
3085 c_22 -= a_2 * b_2;
3086 c_32 -= a_3 * b_2;
3087
3088// c_03 -= a_0 * b_3;
3089// c_13 -= a_1 * b_3;
3090// c_23 -= a_2 * b_3;
3091 c_33 -= a_3 * b_3;
3092
3093 A += 4;
3094 B += 4;
3095
3096 }
3097
3098 c_00 = C[0+bs*0] + c_00;
3099 c_10 = C[1+bs*0] + c_10;
3100 c_20 = C[2+bs*0] + c_20;
3101 c_30 = C[3+bs*0] + c_30;
3102
3103// c_01 = C[0+bs*1] + c_01;
3104 c_11 = C[1+bs*1] + c_11;
3105 c_21 = C[2+bs*1] + c_21;
3106 c_31 = C[3+bs*1] + c_31;
3107
3108// c_02 = C[0+bs*2] + c_02;
3109// c_12 = C[1+bs*2] + c_12;
3110 c_22 = C[2+bs*2] + c_22;
3111 c_32 = C[3+bs*2] + c_32;
3112
3113// c_03 = C[0+bs*3] + c_03;
3114// c_13 = C[1+bs*3] + c_13;
3115// c_23 = C[2+bs*3] + c_23;
3116 c_33 = C[3+bs*3] + c_33;
3117
3118 if(c_00>0)
3119 {
3120 c_00 = sqrt(c_00);
3121 tmp = 1.0/c_00;
3122 }
3123 else
3124 {
3125 c_00 = 0.0;
3126 tmp = 0.0;
3127 }
3128 c_10 *= tmp;
3129 c_20 *= tmp;
3130 c_30 *= tmp;
3131 inv_diag_D[0] = tmp;
3132
3133 if(kn==1)
3134 goto store;
3135
3136 c_11 -= c_10 * c_10;
3137 c_21 -= c_20 * c_10;
3138 c_31 -= c_30 * c_10;
3139 if(c_11>0)
3140 {
3141 c_11 = sqrt(c_11);
3142 tmp = 1.0/c_11;
3143 }
3144 else
3145 {
3146 c_11 = 0.0;
3147 tmp = 0.0;
3148 }
3149 c_21 *= tmp;
3150 c_31 *= tmp;
3151 inv_diag_D[1] = tmp;
3152
3153 if(kn==2)
3154 goto store;
3155
3156 c_22 -= c_20 * c_20;
3157 c_32 -= c_30 * c_20;
3158 c_22 -= c_21 * c_21;
3159 c_32 -= c_31 * c_21;
3160 if(c_22>0)
3161 {
3162 c_22 = sqrt(c_22);
3163 tmp = 1.0/c_22;
3164 }
3165 else
3166 {
3167 c_22 = 0.0;
3168 tmp = 0.0;
3169 }
3170 c_32 *= tmp;
3171 inv_diag_D[2] = tmp;
3172
3173 if(kn==3)
3174 goto store;
3175
3176 c_33 -= c_30 * c_30;
3177 c_33 -= c_31 * c_31;
3178 c_33 -= c_32 * c_32;
3179 if(c_33>0)
3180 {
3181 c_33 = sqrt(c_33);
3182 tmp = 1.0/c_33;
3183 }
3184 else
3185 {
3186 c_33 = 0.0;
3187 tmp = 0.0;
3188 }
3189 inv_diag_D[3] = tmp;
3190
3191
3192 store:
3193
3194 if(km>=4)
3195 {
3196 D[0+bs*0] = c_00;
3197 D[1+bs*0] = c_10;
3198 D[2+bs*0] = c_20;
3199 D[3+bs*0] = c_30;
3200
3201 if(kn==1)
3202 return;
3203
3204// D[0+bs*1] = c_01;
3205 D[1+bs*1] = c_11;
3206 D[2+bs*1] = c_21;
3207 D[3+bs*1] = c_31;
3208
3209 if(kn==2)
3210 return;
3211
3212// D[0+bs*2] = c_02;
3213// D[1+bs*2] = c_12;
3214 D[2+bs*2] = c_22;
3215 D[3+bs*2] = c_32;
3216
3217 if(kn==3)
3218 return;
3219
3220// D[0+bs*3] = c_03;
3221// D[1+bs*3] = c_13;
3222// D[2+bs*3] = c_23;
3223 D[3+bs*3] = c_33;
3224 }
3225 else if(km>=3)
3226 {
3227 D[0+bs*0] = c_00;
3228 D[1+bs*0] = c_10;
3229 D[2+bs*0] = c_20;
3230
3231 if(kn==1)
3232 return;
3233
3234// D[0+bs*1] = c_01;
3235 D[1+bs*1] = c_11;
3236 D[2+bs*1] = c_21;
3237
3238 if(kn==2)
3239 return;
3240
3241// D[0+bs*2] = c_02;
3242// D[1+bs*2] = c_12;
3243 D[2+bs*2] = c_22;
3244
3245// if(kn==3)
3246// return;
3247
3248// D[0+bs*3] = c_03;
3249// D[1+bs*3] = c_13;
3250// D[2+bs*3] = c_23;
3251 }
3252 else if(km>=2)
3253 {
3254 D[0+bs*0] = c_00;
3255 D[1+bs*0] = c_10;
3256
3257 if(kn==1)
3258 return;
3259
3260// D[0+bs*1] = c_01;
3261 D[1+bs*1] = c_11;
3262
3263// if(kn==2)
3264// return;
3265
3266// D[0+bs*2] = c_02;
3267// D[1+bs*2] = c_12;
3268
3269// if(kn==3)
3270// return;
3271
3272// D[0+bs*3] = c_03;
3273// D[1+bs*3] = c_13;
3274 }
3275 else //if(km>=1)
3276 {
3277 D[0+bs*0] = c_00;
3278
3279// if(kn==1)
3280// return;
3281
3282// D[0+bs*1] = c_01;
3283
3284// if(kn==2)
3285// return;
3286
3287// D[0+bs*2] = c_02;
3288
3289// if(kn==3)
3290// return;
3291
3292// D[0+bs*3] = c_03;
3293 }
3294
3295 }
3296#endif
3297
3298
3299
3300#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
// Full-size entry point for kernel_spotrf_nt_l_4x4_vs_lib4: every argument is
// forwarded unchanged and both size arguments (km, kn) are fixed to 4.
void kernel_spotrf_nt_l_4x4_lib4(int kmax, float *A, float *B, float *C, float *D, float *inv_diag_D)
	{
	const int full_rows = 4; // km handed to the variable-size kernel
	const int full_cols = 4; // kn handed to the variable-size kernel

	kernel_spotrf_nt_l_4x4_vs_lib4(kmax, A, B, C, D, inv_diag_D, full_rows, full_cols);
	}
3305#endif
3306
3307
3308
3309#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
// Fused two-step kernel (variable-size version).
//   1. kernel_ssyrk_nt_l_4x4_vs_lib4 is called on (kp, Ap, Bp) with
//      alpha = beta = 1.0, receiving C and D.
//   2. kernel_spotrf_nt_l_4x4_vs_lib4 is called on (km_, Am, Bm) with D passed
//      both as its C argument and as its D argument, so it works in place on
//      whatever step 1 left in D.
// inv_diag_D, km and kn are forwarded unchanged.
void kernel_ssyrk_spotrf_nt_l_4x4_vs_lib4(int kp, float *Ap, float *Bp, int km_, float *Am, float *Bm, float *C, float *D, float *inv_diag_D, int km, int kn)
	{
	float unit_alpha = 1.0f;
	float unit_beta = 1.0f;

	kernel_ssyrk_nt_l_4x4_vs_lib4(kp, &unit_alpha, Ap, Bp, &unit_beta, C, D, km, kn);
	kernel_spotrf_nt_l_4x4_vs_lib4(km_, Am, Bm, D, D, inv_diag_D, km, kn);
	}
3317#endif
3318
3319
3320
3321#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
// Fused two-step kernel (full 4x4 version).
//   1. kernel_ssyrk_nt_l_4x4_lib4 is called on (kp, Ap, Bp) with
//      alpha = beta = 1.0, receiving C and D.
//   2. kernel_spotrf_nt_l_4x4_lib4 is called on (km_, Am, Bm) with D passed
//      both as its C argument and as its D argument, so it works in place on
//      whatever step 1 left in D.
// inv_diag_D is forwarded unchanged.
void kernel_ssyrk_spotrf_nt_l_4x4_lib4(int kp, float *Ap, float *Bp, int km_, float *Am, float *Bm, float *C, float *D, float *inv_diag_D)
	{
	float unit_alpha = 1.0f;
	float unit_beta = 1.0f;

	kernel_ssyrk_nt_l_4x4_lib4(kp, &unit_alpha, Ap, Bp, &unit_beta, C, D);
	kernel_spotrf_nt_l_4x4_lib4(km_, Am, Bm, D, D, inv_diag_D);
	}
3329#endif
3330
3331
3332
3333#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
// Triangular solve on one 4x4 block, lower-triangular E applied from the right
// with a pre-inverted diagonal (variable-size store).
//
// Steps, in the order they are performed:
//   1. acc = - A * B^T, accumulated over kmax steps; A and B are both read as
//      4 x kmax blocks with element (i,k) at index i+4*k.
//   2. acc += C, where C is read as a full 4x4 block (element (i,j) at i+4*j).
//   3. Column-by-column substitution: from column j every already solved
//      column l<j is subtracted, scaled by E[j+4*l]; the column is then
//      multiplied by inv_diag_E[j]. The diagonal of E itself is never read;
//      inv_diag_E presumably holds its reciprocals (TODO confirm with caller).
//   4. The leading rows of the solved columns are written to D; the number of
//      rows is km clamped to the range [1, 4].
//
// kn: the substitution and the store both stop after column kn when kn is
// 1, 2 or 3; every other value of kn processes all four columns.
//
// All reads of C, E and inv_diag_E happen before the first write to D.
void kernel_strsm_nt_rl_inv_4x4_vs_lib4(int kmax, float *A, float *B, float *C, float *D, float *E, float *inv_diag_E, int km, int kn)
	{

	const int bs = 4;

	// acc[j][i] is the working value for row i, column j
	float acc[4][4] = {{0}};
	float scal;

	int ii, jj, ll, kk, rows;

	// acc -= A * B^T, one rank-1 update per step
	for(kk=0; kk<kmax; kk++)
		{
		for(jj=0; jj<bs; jj++)
			{
			for(ii=0; ii<bs; ii++)
				{
				acc[jj][ii] -= A[ii] * B[jj];
				}
			}
		A += bs;
		B += bs;
		}

	// acc += C
	for(jj=0; jj<bs; jj++)
		{
		for(ii=0; ii<bs; ii++)
			{
			acc[jj][ii] = C[ii+bs*jj] + acc[jj][ii];
			}
		}

	// substitution, left to right over the columns
	for(jj=0; jj<bs; jj++)
		{
		for(ll=0; ll<jj; ll++)
			{
			scal = E[jj+bs*ll];
			for(ii=0; ii<bs; ii++)
				{
				acc[jj][ii] -= acc[ll][ii] * scal;
				}
			}
		scal = inv_diag_E[jj];
		for(ii=0; ii<bs; ii++)
			{
			acc[jj][ii] *= scal;
			}
		// only an exact match on kn cuts the solve short
		if(kn==jj+1)
			break;
		}

	// store: km below 1 still writes one row, km above 4 writes four
	rows = km<1 ? 1 : (km>bs ? bs : km);

	for(jj=0; jj<bs; jj++)
		{
		for(ii=0; ii<rows; ii++)
			{
			D[ii+bs*jj] = acc[jj][ii];
			}
		if(kn==jj+1)
			break;
		}

	}
3717#endif
3718
3719
3720
3721#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
// Full-size entry point for kernel_strsm_nt_rl_inv_4x4_vs_lib4: every argument
// is forwarded unchanged and both size arguments (km, kn) are fixed to 4, so
// all four rows and all four columns are solved and stored.
void kernel_strsm_nt_rl_inv_4x4_lib4(int k, float *A, float *B, float *C, float *D, float *E, float *inv_diag_E)
	{
	const int full_rows = 4; // km handed to the variable-size kernel
	const int full_cols = 4; // kn handed to the variable-size kernel

	kernel_strsm_nt_rl_inv_4x4_vs_lib4(k, A, B, C, D, E, inv_diag_E, full_rows, full_cols);
	}
3726#endif
3727
3728
3729
3730#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
// Fused two-step kernel (variable-size version).
//   1. kernel_sgemm_nt_4x4_vs_lib4 is called on (kp, Ap, Bp) with
//      alpha = beta = 1.0, receiving C and D.
//   2. kernel_strsm_nt_rl_inv_4x4_vs_lib4 is called on (km_, Am, Bm) with D
//      passed both as its C argument and as its D argument, so the solve works
//      in place on whatever step 1 left in D.
// E, inv_diag_E, km and kn are forwarded unchanged.
void kernel_sgemm_strsm_nt_rl_inv_4x4_vs_lib4(int kp, float *Ap, float *Bp, int km_, float *Am, float *Bm, float *C, float *D, float *E, float *inv_diag_E, int km, int kn)
	{
	float unit_alpha = 1.0f;
	float unit_beta = 1.0f;

	kernel_sgemm_nt_4x4_vs_lib4(kp, &unit_alpha, Ap, Bp, &unit_beta, C, D, km, kn);
	kernel_strsm_nt_rl_inv_4x4_vs_lib4(km_, Am, Bm, D, D, E, inv_diag_E, km, kn);
	}
3738#endif
3739
3740
3741
3742#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
// Fused two-step kernel (full 4x4 version).
//   1. kernel_sgemm_nt_4x4_lib4 is called on (kp, Ap, Bp) with
//      alpha = beta = 1.0, receiving C and D.
//   2. kernel_strsm_nt_rl_inv_4x4_lib4 is called on (km_, Am, Bm) with D passed
//      both as its C argument and as its D argument, so the solve works in
//      place on whatever step 1 left in D.
// E and inv_diag_E are forwarded unchanged.
void kernel_sgemm_strsm_nt_rl_inv_4x4_lib4(int kp, float *Ap, float *Bp, int km_, float *Am, float *Bm, float *C, float *D, float *E, float *inv_diag_E)
	{
	float unit_alpha = 1.0f;
	float unit_beta = 1.0f;

	kernel_sgemm_nt_4x4_lib4(kp, &unit_alpha, Ap, Bp, &unit_beta, C, D);
	kernel_strsm_nt_rl_inv_4x4_lib4(km_, Am, Bm, D, D, E, inv_diag_E);
	}
3750#endif
3751
3752
3753
3754#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
// Triangular solve on one 4x4 block, lower-triangular E applied from the right
// with an implicit unit diagonal (variable-size store).
//
// Steps, in the order they are performed:
//   1. acc = - A * B^T, accumulated over kmax steps; A and B are both read as
//      4 x kmax blocks with element (i,k) at index i+4*k.
//   2. acc += C, where C is read as a full 4x4 block (element (i,j) at i+4*j).
//   3. Column-by-column substitution: from column j every already solved
//      column l<j is subtracted, scaled by E[j+4*l]. No diagonal scaling is
//      applied and the diagonal of E is never read.
//   4. The leading rows of the solved columns are written to D; the number of
//      rows is km clamped to the range [1, 4].
//
// kn: the substitution and the store both stop after column kn when kn is
// 1, 2 or 3; every other value of kn processes all four columns.
//
// All reads of C and E happen before the first write to D.
void kernel_strsm_nt_rl_one_4x4_vs_lib4(int kmax, float *A, float *B, float *C, float *D, float *E, int km, int kn)
	{

	const int bs = 4;

	// acc[j][i] is the working value for row i, column j
	float acc[4][4] = {{0}};
	float scal;

	int ii, jj, ll, kk, rows;

	// acc -= A * B^T, one rank-1 update per step
	for(kk=0; kk<kmax; kk++)
		{
		for(jj=0; jj<bs; jj++)
			{
			for(ii=0; ii<bs; ii++)
				{
				acc[jj][ii] -= A[ii] * B[jj];
				}
			}
		A += bs;
		B += bs;
		}

	// acc += C
	for(jj=0; jj<bs; jj++)
		{
		for(ii=0; ii<bs; ii++)
			{
			acc[jj][ii] = C[ii+bs*jj] + acc[jj][ii];
			}
		}

	// substitution, left to right over the columns (column 0 needs no work)
	for(jj=0; jj<bs; jj++)
		{
		for(ll=0; ll<jj; ll++)
			{
			scal = E[jj+bs*ll];
			for(ii=0; ii<bs; ii++)
				{
				acc[jj][ii] -= acc[ll][ii] * scal;
				}
			}
		// only an exact match on kn cuts the solve short
		if(kn==jj+1)
			break;
		}

	// store: km below 1 still writes one row, km above 4 writes four
	rows = km<1 ? 1 : (km>bs ? bs : km);

	for(jj=0; jj<bs; jj++)
		{
		for(ii=0; ii<rows; ii++)
			{
			D[ii+bs*jj] = acc[jj][ii];
			}
		if(kn==jj+1)
			break;
		}

	}
4117#endif
4118
4119
4120
4121#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
// Full-size entry point for kernel_strsm_nt_rl_one_4x4_vs_lib4: every argument
// is forwarded unchanged and both size arguments (km, kn) are fixed to 4, so
// all four rows and all four columns are solved and stored.
void kernel_strsm_nt_rl_one_4x4_lib4(int k, float *A, float *B, float *C, float *D, float *E)
	{
	const int full_rows = 4; // km handed to the variable-size kernel
	const int full_cols = 4; // kn handed to the variable-size kernel

	kernel_strsm_nt_rl_one_4x4_vs_lib4(k, A, B, C, D, E, full_rows, full_cols);
	}
4126#endif
4127
4128
4129
4130#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
// Triangular solve on one 4x4 block, upper-triangular E applied from the right
// with a pre-inverted diagonal (variable-size store).
//
// Steps, in the order they are performed:
//   1. acc = - A * B^T, accumulated over kmax steps; A and B are both read as
//      4 x kmax blocks with element (i,k) at index i+4*k.
//   2. acc += C, where C is read as a full 4x4 block (element (i,j) at i+4*j).
//   3. Backward substitution from column 3 down to column 0: column j is
//      multiplied by inv_diag_E[j], then subtracted from every column l<j
//      scaled by E[l+4*j]. Column j (for j = 3, 2, 1) is processed only when
//      kn > j; column 0 is always scaled. The diagonal of E itself is never
//      read; inv_diag_E presumably holds its reciprocals (TODO confirm with
//      caller).
//   4. The leading rows of the block are written to D; the number of rows is
//      km clamped to the range [1, 4]. The store stops after column kn when
//      kn is 1, 2 or 3; every other value of kn stores all four columns.
//
// All reads of C, E and inv_diag_E happen before the first write to D.
void kernel_strsm_nt_ru_inv_4x4_vs_lib4(int kmax, float *A, float *B, float *C, float *D, float *E, float *inv_diag_E, int km, int kn)
	{

	const int bs = 4;

	// acc[j][i] is the working value for row i, column j
	float acc[4][4] = {{0}};
	float scal;

	int ii, jj, ll, kk, rows;

	// acc -= A * B^T, one rank-1 update per step
	for(kk=0; kk<kmax; kk++)
		{
		for(jj=0; jj<bs; jj++)
			{
			for(ii=0; ii<bs; ii++)
				{
				acc[jj][ii] -= A[ii] * B[jj];
				}
			}
		A += bs;
		B += bs;
		}

	// acc += C
	for(jj=0; jj<bs; jj++)
		{
		for(ii=0; ii<bs; ii++)
			{
			acc[jj][ii] = C[ii+bs*jj] + acc[jj][ii];
			}
		}

	// substitution, right to left over the columns
	for(jj=bs-1; jj>=0; jj--)
		{
		// columns 1..3 take part only when kn reaches past them
		if(jj>0 && kn<=jj)
			continue;

		scal = inv_diag_E[jj];
		for(ii=0; ii<bs; ii++)
			{
			acc[jj][ii] *= scal;
			}

		// propagate the solved column into the columns to its left
		for(ll=jj-1; ll>=0; ll--)
			{
			scal = E[ll+bs*jj];
			for(ii=0; ii<bs; ii++)
				{
				acc[ll][ii] -= acc[jj][ii] * scal;
				}
			}
		}

	// store: km below 1 still writes one row, km above 4 writes four
	rows = km<1 ? 1 : (km>bs ? bs : km);

	for(jj=0; jj<bs; jj++)
		{
		for(ii=0; ii<rows; ii++)
			{
			D[ii+bs*jj] = acc[jj][ii];
			}
		if(kn==jj+1)
			break;
		}

	}
4515#endif
4516
4517
4518
4519#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
// Full-size entry point for kernel_strsm_nt_ru_inv_4x4_vs_lib4: every argument
// is forwarded unchanged and both size arguments (km, kn) are fixed to 4, so
// all four rows and all four columns are solved and stored.
void kernel_strsm_nt_ru_inv_4x4_lib4(int k, float *A, float *B, float *C, float *D, float *E, float *inv_diag_E)
	{
	const int full_rows = 4; // km handed to the variable-size kernel
	const int full_cols = 4; // kn handed to the variable-size kernel

	kernel_strsm_nt_ru_inv_4x4_vs_lib4(k, A, B, C, D, E, inv_diag_E, full_rows, full_cols);
	}
4524#endif
4525
4526
4527
4528#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
// LU factorization without pivoting of one 4x4 block (variable-size store).
//
// Steps, in the order they are performed:
//   1. lu = - A * B, accumulated over kmax steps. A is read as a 4 x kmax
//      block with element (i,k) at index i+4*k. B is read row by row: row k
//      sits at offset k%4 inside panel k/4, consecutive panels are 4*sdb
//      floats apart, and element (k,j) is at offset 4*j from the row start.
//   2. lu += C, where C is read as a full 4x4 block (element (i,j) at i+4*j).
//   3. Column-by-column elimination: column j is first updated with the
//      already factorized columns l<j, then its sub-diagonal entries are
//      multiplied by 1.0/lu(j,j), and that reciprocal is written to
//      inv_diag_D[j]. The pivot is not tested, so a zero pivot propagates
//      inf/NaN. Entries on and above the diagonal are left unscaled.
//   4. The leading rows of the block are written to D; the number of rows is
//      km clamped to the range [1, 4].
//
// kn: the elimination and the store both stop after column kn when kn is
// 1, 2 or 3; every other value of kn processes all four columns.
// inv_diag_D[j] is written for every eliminated column regardless of km.
//
// All reads of C happen before the first write to inv_diag_D or D.
void kernel_sgetrf_nn_4x4_vs_lib4(int kmax, float *A, float *B, int sdb, float *C, float *D, float *inv_diag_D, int km, int kn)
	{

	const int bs = 4;

	// lu[j][i] is the working value for row i, column j
	float lu[4][4] = {{0}};
	float pivot_inv;

	const float *a_col;
	const float *b_row;

	int ii, jj, ll, kk, rows;

	// lu -= A * B, one rank-1 update per step
	for(kk=0; kk<kmax; kk++)
		{
		a_col = A + bs*kk;
		b_row = B + (kk/bs)*bs*sdb + kk%bs;
		for(jj=0; jj<bs; jj++)
			{
			for(ii=0; ii<bs; ii++)
				{
				lu[jj][ii] -= a_col[ii] * b_row[bs*jj];
				}
			}
		}

	// lu += C
	for(jj=0; jj<bs; jj++)
		{
		for(ii=0; ii<bs; ii++)
			{
			lu[jj][ii] += C[ii+bs*jj];
			}
		}

	// factorization, left to right over the columns
	for(jj=0; jj<bs; jj++)
		{
		// apply the previously factorized columns to column jj
		for(ll=0; ll<jj; ll++)
			{
			for(ii=ll+1; ii<bs; ii++)
				{
				lu[jj][ii] -= lu[ll][ii] * lu[jj][ll];
				}
			}

		// scale the sub-diagonal part by the reciprocal pivot
		pivot_inv = 1.0 / lu[jj][jj];
		for(ii=jj+1; ii<bs; ii++)
			{
			lu[jj][ii] *= pivot_inv;
			}

		inv_diag_D[jj] = pivot_inv;

		// only an exact match on kn cuts the factorization short
		if(kn==jj+1)
			break;
		}

	// store: km below 1 still writes one row, km above 4 writes four
	rows = km<1 ? 1 : (km>bs ? bs : km);

	for(jj=0; jj<bs; jj++)
		{
		for(ii=0; ii<rows; ii++)
			{
			D[ii+bs*jj] = lu[jj][ii];
			}
		if(kn==jj+1)
			break;
		}

	}
4903#endif
4904
4905
4906
4907#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
// Fixed-size entry point: forwards every argument unchanged to
// kernel_sgetrf_nn_4x4_vs_lib4 and passes 4 for both of its trailing size
// arguments, i.e. the full 4x4 block is processed.
void kernel_sgetrf_nn_4x4_lib4(int kmax, float *A, float *B, int sdb, float *C, float *D, float *inv_diag_D)
	{
	kernel_sgetrf_nn_4x4_vs_lib4(kmax, A, B, sdb, C, D, inv_diag_D, 4, 4);
	}
4912#endif
4913
4914
4915
4916#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
/*
 * Variable-size 4x4 triangular solve, triangular factor on the left, lower,
 * unit diagonal:
 *
 *     D = inv(L) * (C - A * B)
 *
 * L is the unit lower triangular matrix whose strictly-lower entries are read
 * from E; the diagonal and the upper part of E are never read.
 *
 * Layout as indexed by this routine (bs = 4):
 *   A       : 4 x kmax, element (i,k) at A[i + bs*k]
 *   B       : kmax x 4, rows packed four per panel, element (k,j) at
 *             B[(k/bs)*bs*sdb + k%bs + bs*j]
 *   C, D, E : 4 x 4, element (i,j) at [i + bs*j]
 *
 * kmax : inner dimension of the product A * B; kmax <= 0 skips the product
 * sdb  : distance between consecutive panels of B, in units of bs floats
 * km   : number of rows solved and stored; values below 1 act as 1 and
 *        values above 4 act as 4
 * kn   : number of columns stored; 1, 2 or 3 store that many columns, any
 *        other value stores all four
 *
 * All 16 entries of C are read whatever km and kn are; only the leading
 * km x kn part of D is written.
 */
void kernel_strsm_nn_ll_one_4x4_vs_lib4(int kmax, float *A, float *B, int sdb, float *C, float *D, float *E, int km, int kn)
	{

	const int bs = 4;

	int k;

	float
		a_0, a_1, a_2, a_3,
		b_0, b_1, b_2, b_3,
		e_1, e_2, e_3,
		c_00=0, c_01=0, c_02=0, c_03=0,
		c_10=0, c_11=0, c_12=0, c_13=0,
		c_20=0, c_21=0, c_22=0, c_23=0,
		c_30=0, c_31=0, c_32=0, c_33=0;

	// c = - A * B : one 4x4 rank-1 update per index k, accumulated in scalars
	for(k=0; k<kmax; k++)
		{

		a_0 = A[0];
		a_1 = A[1];
		a_2 = A[2];
		a_3 = A[3];

		b_0 = B[0+bs*0];
		b_1 = B[0+bs*1];
		b_2 = B[0+bs*2];
		b_3 = B[0+bs*3];

		c_00 -= a_0 * b_0;
		c_10 -= a_1 * b_0;
		c_20 -= a_2 * b_0;
		c_30 -= a_3 * b_0;

		c_01 -= a_0 * b_1;
		c_11 -= a_1 * b_1;
		c_21 -= a_2 * b_1;
		c_31 -= a_3 * b_1;

		c_02 -= a_0 * b_2;
		c_12 -= a_1 * b_2;
		c_22 -= a_2 * b_2;
		c_32 -= a_3 * b_2;

		c_03 -= a_0 * b_3;
		c_13 -= a_1 * b_3;
		c_23 -= a_2 * b_3;
		c_33 -= a_3 * b_3;

		// next column of A
		A += bs;
		// next row of B: one float further inside the current panel, or the
		// first row of the following panel once four rows have been consumed
		B += (k%bs==bs-1) ? bs*sdb-(bs-1) : 1;

		}

	// c = C - A * B
	c_00 += C[0+bs*0];
	c_10 += C[1+bs*0];
	c_20 += C[2+bs*0];
	c_30 += C[3+bs*0];

	c_01 += C[0+bs*1];
	c_11 += C[1+bs*1];
	c_21 += C[2+bs*1];
	c_31 += C[3+bs*1];

	c_02 += C[0+bs*2];
	c_12 += C[1+bs*2];
	c_22 += C[2+bs*2];
	c_32 += C[3+bs*2];

	c_03 += C[0+bs*3];
	c_13 += C[1+bs*3];
	c_23 += C[2+bs*3];
	c_33 += C[3+bs*3];

	// forward substitution; row 0 needs no work because the diagonal is 1,
	// and elimination steps that only feed rows at or beyond km are skipped

	if(km>1)
		{
		// eliminate column 0 of E from rows 1..3
		e_1 = E[1+bs*0];
		e_2 = E[2+bs*0];
		e_3 = E[3+bs*0];
		c_10 -= e_1 * c_00;
		c_20 -= e_2 * c_00;
		c_30 -= e_3 * c_00;
		c_11 -= e_1 * c_01;
		c_21 -= e_2 * c_01;
		c_31 -= e_3 * c_01;
		c_12 -= e_1 * c_02;
		c_22 -= e_2 * c_02;
		c_32 -= e_3 * c_02;
		c_13 -= e_1 * c_03;
		c_23 -= e_2 * c_03;
		c_33 -= e_3 * c_03;
		}

	if(km>2)
		{
		// eliminate column 1 of E from rows 2..3
		e_2 = E[2+bs*1];
		e_3 = E[3+bs*1];
		c_20 -= e_2 * c_10;
		c_30 -= e_3 * c_10;
		c_21 -= e_2 * c_11;
		c_31 -= e_3 * c_11;
		c_22 -= e_2 * c_12;
		c_32 -= e_3 * c_12;
		c_23 -= e_2 * c_13;
		c_33 -= e_3 * c_13;
		}

	if(km>3)
		{
		// eliminate column 2 of E from row 3
		e_3 = E[3+bs*2];
		c_30 -= e_3 * c_20;
		c_31 -= e_3 * c_21;
		c_32 -= e_3 * c_22;
		c_33 -= e_3 * c_23;
		}

	// store column by column; row 0 is always written, further rows only
	// when km reaches them, and the walk over columns stops after column kn

	D[0+bs*0] = c_00;
	if(km>=2)
		D[1+bs*0] = c_10;
	if(km>=3)
		D[2+bs*0] = c_20;
	if(km>=4)
		D[3+bs*0] = c_30;

	if(kn==1)
		return;

	D[0+bs*1] = c_01;
	if(km>=2)
		D[1+bs*1] = c_11;
	if(km>=3)
		D[2+bs*1] = c_21;
	if(km>=4)
		D[3+bs*1] = c_31;

	if(kn==2)
		return;

	D[0+bs*2] = c_02;
	if(km>=2)
		D[1+bs*2] = c_12;
	if(km>=3)
		D[2+bs*2] = c_22;
	if(km>=4)
		D[3+bs*2] = c_32;

	if(kn==3)
		return;

	D[0+bs*3] = c_03;
	if(km>=2)
		D[1+bs*3] = c_13;
	if(km>=3)
		D[2+bs*3] = c_23;
	if(km>=4)
		D[3+bs*3] = c_33;

	return;

	}
5279#endif
5280
5281
5282
5283#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
// Fixed-size entry point: forwards to kernel_strsm_nn_ll_one_4x4_vs_lib4
// with km = kn = 4, so the whole 4x4 block of D is solved and written.
void kernel_strsm_nn_ll_one_4x4_lib4(int kmax, float *A, float *B, int sdb, float *C, float *D, float *E)
	{
	kernel_strsm_nn_ll_one_4x4_vs_lib4(kmax, A, B, sdb, C, D, E, 4, 4);
	}
5288#endif
5289
5290
5291
5292#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
/*
 * Variable-size 4x4 triangular solve, triangular factor on the right, upper,
 * with pre-inverted diagonal:
 *
 *     D = (C - A * B) * inv(U)
 *
 * The strictly-upper entries of U are read from E and the reciprocals of its
 * diagonal from inv_diag_E; the diagonal and the lower part of E are never
 * read, and no division is performed.
 *
 * Layout as indexed by this routine (bs = 4):
 *   A          : 4 x kmax, element (i,k) at A[i + bs*k]
 *   B          : kmax x 4, rows packed four per panel, element (k,j) at
 *                B[(k/bs)*bs*sdb + k%bs + bs*j]
 *   C, D, E    : 4 x 4, element (i,j) at [i + bs*j]
 *   inv_diag_E : inv_diag_E[j] multiplies column j
 *
 * kmax : inner dimension of the product A * B; kmax <= 0 skips the product
 * sdb  : distance between consecutive panels of B, in units of bs floats
 * km   : number of rows stored; values below 1 act as 1 and values above 4
 *        act as 4 (all four rows are always computed)
 * kn   : number of columns solved and stored; 1, 2 or 3 process that many
 *        columns, any other value processes all four
 *
 * All 16 entries of C are read whatever km and kn are; only the leading
 * km x kn part of D is written.
 */
void kernel_strsm_nn_ru_inv_4x4_vs_lib4(int kmax, float *A, float *B, int sdb, float *C, float *D, float *E, float *inv_diag_E, int km, int kn)
	{

	const int bs = 4;

	int k;

	float
		a_0, a_1, a_2, a_3,
		b_0, b_1, b_2, b_3,
		e_00, e_01, e_02, e_03,
		      e_11, e_12, e_13,
		            e_22, e_23,
		                  e_33,
		c_00=0, c_01=0, c_02=0, c_03=0,
		c_10=0, c_11=0, c_12=0, c_13=0,
		c_20=0, c_21=0, c_22=0, c_23=0,
		c_30=0, c_31=0, c_32=0, c_33=0;

	// c = - A * B : one 4x4 rank-1 update per index k, accumulated in scalars
	for(k=0; k<kmax; k++)
		{

		a_0 = A[0];
		a_1 = A[1];
		a_2 = A[2];
		a_3 = A[3];

		b_0 = B[0+bs*0];
		b_1 = B[0+bs*1];
		b_2 = B[0+bs*2];
		b_3 = B[0+bs*3];

		c_00 -= a_0 * b_0;
		c_10 -= a_1 * b_0;
		c_20 -= a_2 * b_0;
		c_30 -= a_3 * b_0;

		c_01 -= a_0 * b_1;
		c_11 -= a_1 * b_1;
		c_21 -= a_2 * b_1;
		c_31 -= a_3 * b_1;

		c_02 -= a_0 * b_2;
		c_12 -= a_1 * b_2;
		c_22 -= a_2 * b_2;
		c_32 -= a_3 * b_2;

		c_03 -= a_0 * b_3;
		c_13 -= a_1 * b_3;
		c_23 -= a_2 * b_3;
		c_33 -= a_3 * b_3;

		// next column of A
		A += bs;
		// next row of B: one float further inside the current panel, or the
		// first row of the following panel once four rows have been consumed
		B += (k%bs==bs-1) ? bs*sdb-(bs-1) : 1;

		}

	// c = C - A * B
	c_00 += C[0+bs*0];
	c_10 += C[1+bs*0];
	c_20 += C[2+bs*0];
	c_30 += C[3+bs*0];

	c_01 += C[0+bs*1];
	c_11 += C[1+bs*1];
	c_21 += C[2+bs*1];
	c_31 += C[3+bs*1];

	c_02 += C[0+bs*2];
	c_12 += C[1+bs*2];
	c_22 += C[2+bs*2];
	c_32 += C[3+bs*2];

	c_03 += C[0+bs*3];
	c_13 += C[1+bs*3];
	c_23 += C[2+bs*3];
	c_33 += C[3+bs*3];

	// solve column by column, left to right: subtract the contribution of
	// the already finished columns, then scale by the inverted diagonal

	// column 0
	e_00 = inv_diag_E[0];
	c_00 *= e_00;
	c_10 *= e_00;
	c_20 *= e_00;
	c_30 *= e_00;

	if(kn==1)
		goto store;

	// column 1
	e_01 = E[0+bs*1];
	e_11 = inv_diag_E[1];
	c_01 -= c_00 * e_01;
	c_11 -= c_10 * e_01;
	c_21 -= c_20 * e_01;
	c_31 -= c_30 * e_01;
	c_01 *= e_11;
	c_11 *= e_11;
	c_21 *= e_11;
	c_31 *= e_11;

	if(kn==2)
		goto store;

	// column 2
	e_02 = E[0+bs*2];
	e_12 = E[1+bs*2];
	e_22 = inv_diag_E[2];
	c_02 -= c_00 * e_02;
	c_12 -= c_10 * e_02;
	c_22 -= c_20 * e_02;
	c_32 -= c_30 * e_02;
	c_02 -= c_01 * e_12;
	c_12 -= c_11 * e_12;
	c_22 -= c_21 * e_12;
	c_32 -= c_31 * e_12;
	c_02 *= e_22;
	c_12 *= e_22;
	c_22 *= e_22;
	c_32 *= e_22;

	if(kn==3)
		goto store;

	// column 3
	e_03 = E[0+bs*3];
	e_13 = E[1+bs*3];
	e_23 = E[2+bs*3];
	e_33 = inv_diag_E[3];
	c_03 -= c_00 * e_03;
	c_13 -= c_10 * e_03;
	c_23 -= c_20 * e_03;
	c_33 -= c_30 * e_03;
	c_03 -= c_01 * e_13;
	c_13 -= c_11 * e_13;
	c_23 -= c_21 * e_13;
	c_33 -= c_31 * e_13;
	c_03 -= c_02 * e_23;
	c_13 -= c_12 * e_23;
	c_23 -= c_22 * e_23;
	c_33 -= c_32 * e_23;
	c_03 *= e_33;
	c_13 *= e_33;
	c_23 *= e_33;
	c_33 *= e_33;

	store:

	// store column by column; row 0 is always written, further rows only
	// when km reaches them, and the walk over columns stops after column kn

	D[0+bs*0] = c_00;
	if(km>=2)
		D[1+bs*0] = c_10;
	if(km>=3)
		D[2+bs*0] = c_20;
	if(km>=4)
		D[3+bs*0] = c_30;

	if(kn==1)
		return;

	D[0+bs*1] = c_01;
	if(km>=2)
		D[1+bs*1] = c_11;
	if(km>=3)
		D[2+bs*1] = c_21;
	if(km>=4)
		D[3+bs*1] = c_31;

	if(kn==2)
		return;

	D[0+bs*2] = c_02;
	if(km>=2)
		D[1+bs*2] = c_12;
	if(km>=3)
		D[2+bs*2] = c_22;
	if(km>=4)
		D[3+bs*2] = c_32;

	if(kn==3)
		return;

	D[0+bs*3] = c_03;
	if(km>=2)
		D[1+bs*3] = c_13;
	if(km>=3)
		D[2+bs*3] = c_23;
	if(km>=4)
		D[3+bs*3] = c_33;

	return;

	}
5679#endif
5680
5681
5682
5683#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
// Fixed-size entry point: forwards to kernel_strsm_nn_ru_inv_4x4_vs_lib4
// with km = kn = 4, so the whole 4x4 block of D is solved and written.
void kernel_strsm_nn_ru_inv_4x4_lib4(int kmax, float *A, float *B, int sdb, float *C, float *D, float *E, float *inv_diag_E)
	{
	kernel_strsm_nn_ru_inv_4x4_vs_lib4(kmax, A, B, sdb, C, D, E, inv_diag_E, 4, 4);
	}
5688#endif
5689
5690
5691
5692#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
/*
 * Variable-size 4x4 triangular solve, triangular factor on the left, upper,
 * with pre-inverted diagonal:
 *
 *     D = inv(U) * (C - A * B)
 *
 * U is the leading km x km upper triangular matrix whose strictly-upper
 * entries are read from E and whose diagonal reciprocals are read from
 * inv_diag_E; the diagonal and the lower part of E are never read, and no
 * division is performed.
 *
 * Layout as indexed by this routine (bs = 4):
 *   A          : 4 x kmax, element (i,k) at A[i + bs*k]
 *   B          : kmax x 4, rows packed four per panel, element (k,j) at
 *                B[(k/bs)*bs*sdb + k%bs + bs*j]
 *   C, D, E    : 4 x 4, element (i,j) at [i + bs*j]
 *   inv_diag_E : inv_diag_E[i] multiplies row i
 *
 * kmax : inner dimension of the product A * B; kmax <= 0 skips the product
 * sdb  : distance between consecutive panels of B, in units of bs floats
 * km   : order of the triangular system and number of rows stored; values
 *        below 1 act as 1 and values above 4 act as 4
 * kn   : number of columns stored; 1, 2 or 3 store that many columns, any
 *        other value stores all four (all four columns are always computed)
 *
 * All 16 entries of C are read whatever km and kn are; only the leading
 * km x kn part of D is written.
 */
void kernel_strsm_nn_lu_inv_4x4_vs_lib4(int kmax, float *A, float *B, int sdb, float *C, float *D, float *E, float *inv_diag_E, int km, int kn)
	{

	const int bs = 4;

	int k;

	float
		a_0, a_1, a_2, a_3,
		b_0, b_1, b_2, b_3,
		e_00, e_01, e_02, e_03,
		      e_11, e_12, e_13,
		            e_22, e_23,
		                  e_33,
		c_00=0, c_01=0, c_02=0, c_03=0,
		c_10=0, c_11=0, c_12=0, c_13=0,
		c_20=0, c_21=0, c_22=0, c_23=0,
		c_30=0, c_31=0, c_32=0, c_33=0;

	// c = - A * B : one 4x4 rank-1 update per index k, accumulated in scalars
	for(k=0; k<kmax; k++)
		{

		a_0 = A[0];
		a_1 = A[1];
		a_2 = A[2];
		a_3 = A[3];

		b_0 = B[0+bs*0];
		b_1 = B[0+bs*1];
		b_2 = B[0+bs*2];
		b_3 = B[0+bs*3];

		c_00 -= a_0 * b_0;
		c_10 -= a_1 * b_0;
		c_20 -= a_2 * b_0;
		c_30 -= a_3 * b_0;

		c_01 -= a_0 * b_1;
		c_11 -= a_1 * b_1;
		c_21 -= a_2 * b_1;
		c_31 -= a_3 * b_1;

		c_02 -= a_0 * b_2;
		c_12 -= a_1 * b_2;
		c_22 -= a_2 * b_2;
		c_32 -= a_3 * b_2;

		c_03 -= a_0 * b_3;
		c_13 -= a_1 * b_3;
		c_23 -= a_2 * b_3;
		c_33 -= a_3 * b_3;

		// next column of A
		A += bs;
		// next row of B: one float further inside the current panel, or the
		// first row of the following panel once four rows have been consumed
		B += (k%bs==bs-1) ? bs*sdb-(bs-1) : 1;

		}

	// c = C - A * B
	c_00 += C[0+bs*0];
	c_10 += C[1+bs*0];
	c_20 += C[2+bs*0];
	c_30 += C[3+bs*0];

	c_01 += C[0+bs*1];
	c_11 += C[1+bs*1];
	c_21 += C[2+bs*1];
	c_31 += C[3+bs*1];

	c_02 += C[0+bs*2];
	c_12 += C[1+bs*2];
	c_22 += C[2+bs*2];
	c_32 += C[3+bs*2];

	c_03 += C[0+bs*3];
	c_13 += C[1+bs*3];
	c_23 += C[2+bs*3];
	c_33 += C[3+bs*3];

	// back substitution, bottom row first: scale the row by its inverted
	// diagonal entry, then eliminate it from every row above; rows at or
	// beyond km do not take part in the system

	if(km>3)
		{
		e_03 = E[0+bs*3];
		e_13 = E[1+bs*3];
		e_23 = E[2+bs*3];
		e_33 = inv_diag_E[3];
		c_30 *= e_33;
		c_31 *= e_33;
		c_32 *= e_33;
		c_33 *= e_33;
		c_00 -= e_03 * c_30;
		c_01 -= e_03 * c_31;
		c_02 -= e_03 * c_32;
		c_03 -= e_03 * c_33;
		c_10 -= e_13 * c_30;
		c_11 -= e_13 * c_31;
		c_12 -= e_13 * c_32;
		c_13 -= e_13 * c_33;
		c_20 -= e_23 * c_30;
		c_21 -= e_23 * c_31;
		c_22 -= e_23 * c_32;
		c_23 -= e_23 * c_33;
		}

	if(km>2)
		{
		e_02 = E[0+bs*2];
		e_12 = E[1+bs*2];
		e_22 = inv_diag_E[2];
		c_20 *= e_22;
		c_21 *= e_22;
		c_22 *= e_22;
		c_23 *= e_22;
		c_00 -= e_02 * c_20;
		c_01 -= e_02 * c_21;
		c_02 -= e_02 * c_22;
		c_03 -= e_02 * c_23;
		c_10 -= e_12 * c_20;
		c_11 -= e_12 * c_21;
		c_12 -= e_12 * c_22;
		c_13 -= e_12 * c_23;
		}

	if(km>1)
		{
		e_01 = E[0+bs*1];
		e_11 = inv_diag_E[1];
		c_10 *= e_11;
		c_11 *= e_11;
		c_12 *= e_11;
		c_13 *= e_11;
		c_00 -= e_01 * c_10;
		c_01 -= e_01 * c_11;
		c_02 -= e_01 * c_12;
		c_03 -= e_01 * c_13;
		}

	// row 0 is always part of the system
	e_00 = inv_diag_E[0];
	c_00 *= e_00;
	c_01 *= e_00;
	c_02 *= e_00;
	c_03 *= e_00;

	// store column by column; row 0 is always written, further rows only
	// when km reaches them, and the walk over columns stops after column kn

	D[0+bs*0] = c_00;
	if(km>=2)
		D[1+bs*0] = c_10;
	if(km>=3)
		D[2+bs*0] = c_20;
	if(km>=4)
		D[3+bs*0] = c_30;

	if(kn==1)
		return;

	D[0+bs*1] = c_01;
	if(km>=2)
		D[1+bs*1] = c_11;
	if(km>=3)
		D[2+bs*1] = c_21;
	if(km>=4)
		D[3+bs*1] = c_31;

	if(kn==2)
		return;

	D[0+bs*2] = c_02;
	if(km>=2)
		D[1+bs*2] = c_12;
	if(km>=3)
		D[2+bs*2] = c_22;
	if(km>=4)
		D[3+bs*2] = c_32;

	if(kn==3)
		return;

	D[0+bs*3] = c_03;
	if(km>=2)
		D[1+bs*3] = c_13;
	if(km>=3)
		D[2+bs*3] = c_23;
	if(km>=4)
		D[3+bs*3] = c_33;

	return;

	}
6084#endif
6085
6086
6087
6088#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
// Fixed-size entry point: forwards to kernel_strsm_nn_lu_inv_4x4_vs_lib4
// with km = kn = 4, so the whole 4x4 block of D is solved and written.
void kernel_strsm_nn_lu_inv_4x4_lib4(int kmax, float *A, float *B, int sdb, float *C, float *D, float *E, float *inv_diag_E)
	{
	kernel_strsm_nn_lu_inv_4x4_vs_lib4(kmax, A, B, sdb, C, D, E, inv_diag_E, 4, 4);
	}
6093#endif
6094