/**************************************************************************************************
* *
* This file is part of BLASFEO. *
* *
* BLASFEO -- BLAS For Embedded Optimization. *
* Copyright (C) 2016-2017 by Gianluca Frison. *
* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl. *
* All rights reserved. *
* *
* HPMPC is free software; you can redistribute it and/or *
* modify it under the terms of the GNU Lesser General Public *
* License as published by the Free Software Foundation; either *
* version 2.1 of the License, or (at your option) any later version. *
* *
* HPMPC is distributed in the hope that it will be useful, *
* but WITHOUT ANY WARRANTY; without even the implied warranty of *
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. *
* See the GNU Lesser General Public License for more details. *
* *
* You should have received a copy of the GNU Lesser General Public *
* License along with HPMPC; if not, write to the Free Software *
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA *
* *
* Author: Gianluca Frison, giaf (at) dtu.dk *
* gianluca.frison (at) imtek.uni-freiburg.de *
* *
**************************************************************************************************/
28
29#include <math.h>
30
31
32
33#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
34//#if defined(TARGET_GENERIC) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
// 4x4 dgemm kernel, "nt" form, with row offsets and a store window:
// tile <- beta * C + alpha * A * B^T, then part of the tile is written to D.
//
// kmax         number of rank-1 updates; update k uses A[4*k+0..3] and B[4*k+0..3]
// alpha, beta  pointers to the scalars (only element 0 is read)
// offsetC      row offset of the tile inside the 4-row panel at C0; rows that fall past
//              the panel end are read from C0 + sdc*bs; values other than 0, 1, 2 are
//              handled as 3
// offsetD      same for the destination, using D0 and sdd
// m0, m1       tile row i is stored only when m0 <= i < m1
// n0, n1       n1-n0 columns are stored, starting at tile column n0 (n0 above 3 is
//              handled as 3); assumes n1 <= 4 - not checked here
//
// All 16 entries of C are read, whatever the store window is.
void kernel_dgemm_nt_4x4_gen_lib4(int kmax, double *alpha, double *A, double *B, double *beta, int offsetC, double *C0, int sdc, int offsetD, double *D0, int sdd, int m0, int m1, int n0, int n1)
	{

	const int bs = 4;

	// acc[jj][ii] holds tile entry (row ii, column jj)
	double acc[4][4] = {{0}};

	int ii, jj, kk;

	// accumulate A * B^T one rank-1 update at a time; the inner loops have fixed
	// trip counts, so the compiler is free to unroll them
	for(kk=0; kk<kmax; kk++)
		{
		for(jj=0; jj<4; jj++)
			for(ii=0; ii<4; ii++)
				acc[jj][ii] += A[ii] * B[jj];
		A += 4;
		B += 4;
		}

	// scale: tile = beta*C + alpha*tile; row ii of the tile is row ii+offset of the
	// C panel, wrapping into the next panel once the row index reaches bs
	int offc = (offsetC==0 || offsetC==1 || offsetC==2) ? offsetC : 3;
	for(ii=0; ii<4; ii++)
		{
		int rc = ii + offc;
		double *src = rc<bs ? C0 + rc : C0 + sdc*bs + (rc-bs);
		for(jj=0; jj<4; jj++)
			acc[jj][ii] = beta[0]*src[bs*jj] + alpha[0]*acc[jj][ii];
		}

	// shift sol for cols: drop the first n0 columns and advance D0 to match;
	// columns at the tail keep their previous contents
	int shift = 0;
	if(n0>0)
		shift = n0==1 ? 1 : (n0==2 ? 2 : 3);
	if(shift>0)
		{
		for(jj=0; jj+shift<4; jj++)
			for(ii=0; ii<4; ii++)
				acc[jj][ii] = acc[jj+shift][ii];
		D0 += shift*bs;
		}

	// store the first kn columns, rows m0..m1-1 only
	int kn = n1 - n0;
	int offd = (offsetD==0 || offsetD==1 || offsetD==2) ? offsetD : 3;

	for(jj=0; jj<4; jj++)
		{
		if(kn<=jj)
			return;

		for(ii=0; ii<4; ii++)
			{
			if(m0<=ii && m1>ii)
				{
				int rd = ii + offd;
				double *dst = rd<bs ? D0 + rd : D0 + sdd*bs + (rd-bs);
				dst[bs*jj] = acc[jj][ii];
				}
			}
		}

	return;

	}
522#endif
523
524
525
526#if defined(TARGET_GENERIC) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
// 4x4 dgemm kernel, "nt" form, variable-size store:
// tile <- beta * C + alpha * A * B^T, then the top-left min(km,4) x min(kn,4)
// corner of the tile is written to D.
//
// kmax         number of rank-1 updates; update k uses A[4*k+0..3] and B[4*k+0..3]
// alpha, beta  pointers to the scalars (only element 0 is read)
// C, D         4x4 tiles, entry (i, j) at index i + 4*j
// km, kn       number of rows / columns to store; values above 4 store the full
//              extent, values <= 0 store nothing
//
// All 16 entries of C are read whenever something is stored.
void kernel_dgemm_nt_4x4_vs_lib4(int kmax, double *alpha, double *A, double *B, double *beta, double *C, double *D, int km, int kn)
	{

	const int bs = 4;

	// acc[jj][ii] holds tile entry (row ii, column jj)
	double acc[4][4] = {{0}};

	int ii, jj, kk;

	// an empty store window must leave D untouched, as the _gen kernels do;
	// without this guard kn<=0 would store every column and km<=0 would store row 0
	if(km<=0 || kn<=0)
		return;

	// accumulate A * B^T one rank-1 update at a time
	for(kk=0; kk<kmax; kk++)
		{
		for(jj=0; jj<4; jj++)
			for(ii=0; ii<4; ii++)
				acc[jj][ii] += A[ii] * B[jj];
		A += 4;
		B += 4;
		}

	// scale: tile = beta*C + alpha*tile
	for(jj=0; jj<4; jj++)
		for(ii=0; ii<4; ii++)
			acc[jj][ii] = beta[0]*C[ii+bs*jj] + alpha[0]*acc[jj][ii];

	// clip the store window to the tile
	int rows = km<bs ? km : bs;
	int cols = kn<bs ? kn : bs;

	for(jj=0; jj<cols; jj++)
		for(ii=0; ii<rows; ii++)
			D[ii+bs*jj] = acc[jj][ii];

	}
843#endif
844
845
846
847#if defined(TARGET_GENERIC)
// Full-tile "nt" kernel: forwards to the variable-size kernel asking for all
// 4 rows and all 4 columns of D to be stored.
void kernel_dgemm_nt_4x4_lib4(int kmax, double *alpha, double *A, double *B, double *beta, double *C, double *D)
	{
	const int full_rows = 4;
	const int full_cols = 4;

	kernel_dgemm_nt_4x4_vs_lib4(kmax, alpha, A, B, beta, C, D, full_rows, full_cols);

	return;
	}
852#endif
853
854
855
856#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
// 4x4 dgemm kernel, "nn" form, with row offsets and a store window:
// tile <- beta * C + alpha * A * B, then part of the tile is written to D.
//
// kmax         number of rank-1 updates; update k uses A[4*k+0..3] and one row of B;
//              kmax <= 0 performs no update, so the tile is beta * C
// alpha, beta  pointers to the scalars (only element 0 is read)
// offsetB      row of the first 4-row panel of B at which the product starts; values
//              other than 0, 1, 2 are handled as 3
// B, sdb       B is addressed in 4-row panels: row r, column j of a panel is at
//              B[r + 4*j], and the next panel starts bs*sdb doubles later
// offsetC      row offset of the tile inside the 4-row panel at C0; rows that fall past
//              the panel end are read from C0 + sdc*bs; values other than 0, 1, 2 are
//              handled as 3
// offsetD      same for the destination, using D0 and sdd
// m0, m1       tile row i is stored only when m0 <= i < m1
// n0, n1       n1-n0 columns are stored, starting at tile column n0 (n0 above 3 is
//              handled as 3); assumes n1 <= 4 - not checked here
//
// All 16 entries of C are read, whatever the store window is.
void kernel_dgemm_nn_4x4_gen_lib4(int kmax, double *alpha, double *A, int offsetB, double *B, int sdb, double *beta, int offsetC, double *C0, int sdc, int offsetD, double *D0, int sdd, int m0, int m1, int n0, int n1)
	{

	const int bs = 4;

	// acc[jj][ii] holds tile entry (row ii, column jj)
	double acc[4][4] = {{0}};

	int ii, jj, kk, k;

	k = 0;

	// B starts in the middle of a panel: consume the rows left in that panel one by
	// one. The kmax>0 test matters: without it the first row would be multiplied in
	// even when no update was requested.
	if(offsetB!=0 && kmax>0)
		{
		int lead = offsetB==1 ? 1 : (offsetB==2 ? 2 : 3);
		int edge = bs - lead; // rows remaining in the first panel
		int steps = kmax<edge ? kmax : edge;

		B += lead;

		for(; k<steps; k++)
			{
			for(jj=0; jj<4; jj++)
				for(ii=0; ii<4; ii++)
					acc[jj][ii] += A[ii] * B[bs*jj];
			A += 4;
			B += 1;
			}

		// the whole remainder of the panel was used: jump to the start of the next one
		if(steps==edge)
			B += bs*(sdb-1);
		}

	// whole panels of B, four rows at a time
	for(; k<kmax-3; k+=4)
		{
		for(kk=0; kk<4; kk++)
			for(jj=0; jj<4; jj++)
				for(ii=0; ii<4; ii++)
					acc[jj][ii] += A[ii+bs*kk] * B[kk+bs*jj];
		A += 16;
		B += 4*sdb;
		}

	// fewer than four rows left: they all sit in the current panel
	for(; k<kmax; k++)
		{
		for(jj=0; jj<4; jj++)
			for(ii=0; ii<4; ii++)
				acc[jj][ii] += A[ii] * B[bs*jj];
		A += 4;
		B += 1;
		}

	// scale: tile = beta*C + alpha*tile; row ii of the tile is row ii+offset of the
	// C panel, wrapping into the next panel once the row index reaches bs
	int offc = (offsetC==0 || offsetC==1 || offsetC==2) ? offsetC : 3;
	for(ii=0; ii<4; ii++)
		{
		int rc = ii + offc;
		double *src = rc<bs ? C0 + rc : C0 + sdc*bs + (rc-bs);
		for(jj=0; jj<4; jj++)
			acc[jj][ii] = beta[0]*src[bs*jj] + alpha[0]*acc[jj][ii];
		}

	// shift sol for cols: drop the first n0 columns and advance D0 to match;
	// columns at the tail keep their previous contents
	int shift = 0;
	if(n0>0)
		shift = n0==1 ? 1 : (n0==2 ? 2 : 3);
	if(shift>0)
		{
		for(jj=0; jj+shift<4; jj++)
			for(ii=0; ii<4; ii++)
				acc[jj][ii] = acc[jj+shift][ii];
		D0 += shift*bs;
		}

	// store the first kn columns, rows m0..m1-1 only
	int kn = n1 - n0;
	int offd = (offsetD==0 || offsetD==1 || offsetD==2) ? offsetD : 3;

	for(jj=0; jj<4; jj++)
		{
		if(kn<=jj)
			return;

		for(ii=0; ii<4; ii++)
			{
			if(m0<=ii && m1>ii)
				{
				int rd = ii + offd;
				double *dst = rd<bs ? D0 + rd : D0 + sdd*bs + (rd-bs);
				dst[bs*jj] = acc[jj][ii];
				}
			}
		}

	return;

	}
1583#endif
1584
1585
1586
1587#if defined(TARGET_GENERIC) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
// Fixed-size 4x4 entry point: forwards to kernel_dgemm_nn_4x4_gen_lib4 with
// all C/D offset and stride arguments set to 0 and both index ranges set to 0..4.
// NOTE(review): the positional meaning of the trailing arguments
// (offsetC, C, sdc, offsetD, D, sdd, m0, m1, n0, n1) is assumed from the
// sibling *_gen_lib4 kernels - confirm against the gen prototype.
void kernel_dgemm_nn_4x4_lib4(int kmax, double *alpha, double *A, int offsetB, double *B, int sdb, double *beta, double *C, double *D)
	{
	const int none = 0; // value passed for every offset / stride / range start
	const int full = 4; // value passed for both range ends

	kernel_dgemm_nn_4x4_gen_lib4(kmax, alpha, A, offsetB, B, sdb, beta,
		none, C, none,
		none, D, none,
		none, full, none, full);
	}
1592#endif
1593
1594
1595
1596#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
// Lower-triangular 4x4 rank-k update kernel with row offsets and row/column masks.
//
// For row >= col it accumulates sum_k A[row+4*k]*B[col+4*k] over kmax steps,
// then forms beta[0]*C(row,col) + alpha[0]*sum and stores a masked part of the
// result into D.  Entries above the diagonal are never computed or read from C.
//
//   offsetC, sdc : logical row r of C is read at C0[(r+offsetC) + 4*col] while
//                  r+offsetC < 4, otherwise at C0[sdc*4 + (r+offsetC-4) + 4*col].
//                  Any offsetC other than 0, 1, 2 is handled as 3.
//   offsetD, sdd : same addressing rule for the stores into D0.
//   m0, m1       : row r is stored only if m0 <= r < m1.
//   n0, n1       : the result is shifted left by n0 columns (n0 > 3 is handled
//                  as 3), D0 is advanced by the same number of columns, and at
//                  most n1-n0 columns are stored.
void kernel_dsyrk_nt_l_4x4_gen_lib4(int kmax, double *alpha, double *A, double *B, double *beta, int offsetC, double *C0, int sdc, int offsetD, double *D0, int sdd, int m0, int m1, int n0, int n1)
	{

	const int bs = 4;

	// acc[row + bs*col]; only entries with row >= col are used
	double acc[16] = {0};

	double *src, *dst;

	int k, row, col, prow, shift, ncols, off_c, off_d;

	// accumulate the lower triangle of A * B^T, one k step at a time
	for(k=0; k<kmax; k++)
		{
		for(col=0; col<bs; col++)
			{
			for(row=col; row<bs; row++)
				{
				acc[row+bs*col] += A[row] * B[col];
				}
			}
		A += bs;
		B += bs;
		}

	// scale and add C; rows pushed past the 4-row boundary by the offset
	// are fetched from the block starting at C0 + sdc*bs
	off_c = (offsetC==0 || offsetC==1 || offsetC==2) ? offsetC : 3;
	for(col=0; col<bs; col++)
		{
		for(row=col; row<bs; row++)
			{
			prow = row + off_c;
			if(prow<bs)
				src = C0 + prow + bs*col;
			else
				src = C0 + sdc*bs + (prow-bs) + bs*col;
			acc[row+bs*col] = beta[0]*src[0] + alpha[0]*acc[row+bs*col];
			}
		}

	// shift sol for cols: column col takes the values of column col+shift
	// for rows >= col+shift; entries with row < col+shift are not updated
	// and keep their pre-shift values
	if(n0>0)
		{
		shift = (n0==1 || n0==2) ? n0 : 3;
		for(col=0; col+shift<bs; col++)
			{
			for(row=col+shift; row<bs; row++)
				{
				acc[row+bs*col] = acc[row+bs*(col+shift)];
				}
			}
		D0 += shift*bs;
		}

	// number of columns to store, never more than the 4 the kernel holds
	ncols = n1 - n0;
	if(ncols>bs)
		ncols = bs;

	// masked store of the lower triangle, with the same row-offset rule as for C
	off_d = (offsetD==0 || offsetD==1 || offsetD==2) ? offsetD : 3;
	for(col=0; col<ncols; col++)
		{
		for(row=col; row<bs; row++)
			{
			if(m0<=row && m1>row)
				{
				prow = row + off_d;
				if(prow<bs)
					dst = D0 + prow + bs*col;
				else
					dst = D0 + sdd*bs + (prow-bs) + bs*col;
				dst[0] = acc[row+bs*col];
				}
			}
		}

	return;

	}
1992#endif
1993
1994
1995
1996#if defined(TARGET_GENERIC) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
// Lower-triangular 4x4 rank-k update kernel with variable store size.
//
// For row >= col it accumulates sum_k A[row+4*k]*B[col+4*k] over kmax steps,
// forms beta[0]*C[row+4*col] + alpha[0]*sum, and writes the entries with
// row < km and col < kn to D[row+4*col].  Entries above the diagonal are never
// computed, read from C, or written to D.
//
// km below 1 is handled like 1 and km above 4 like 4; kn outside 1..3 is
// handled like 4.
void kernel_dsyrk_nt_l_4x4_vs_lib4(int kmax, double *alpha, double *A, double *B, double *beta, double *C, double *D, int km, int kn)
	{

	const int bs = 4;

	// acc[row + bs*col]; only entries with row >= col are used
	double acc[16] = {0};

	int k, row, col, nrows, ncols;

	// accumulate the lower triangle of A * B^T, one k step at a time
	for(k=0; k<kmax; k++)
		{
		for(col=0; col<bs; col++)
			{
			for(row=col; row<bs; row++)
				{
				acc[row+bs*col] += A[row] * B[col];
				}
			}
		A += bs;
		B += bs;
		}

	// scale and add C; the whole lower triangle of C is read regardless of km/kn
	for(col=0; col<bs; col++)
		{
		for(row=col; row<bs; row++)
			{
			acc[row+bs*col] = beta[0]*C[row+bs*col] + alpha[0]*acc[row+bs*col];
			}
		}

	// number of rows / columns to store
	if(km>=4)
		nrows = 4;
	else if(km>=3)
		nrows = 3;
	else if(km>=2)
		nrows = 2;
	else
		nrows = 1;

	ncols = (kn==1 || kn==2 || kn==3) ? kn : bs;

	// store the part of the lower triangle inside the nrows x ncols window
	for(col=0; col<ncols; col++)
		{
		for(row=col; row<nrows; row++)
			{
			D[row+bs*col] = acc[row+bs*col];
			}
		}

	}
2233#endif
2234
2235
2236
2237#if defined(TARGET_GENERIC) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
// Fixed-size entry point: forwards to the variable-size kernel with km = kn = 4.
void kernel_dsyrk_nt_l_4x4_lib4(int kmax, double *alpha, double *A, double *B, double *beta, double *C, double *D)
	{
	const int tile = 4; // passed as both km and kn

	kernel_dsyrk_nt_l_4x4_vs_lib4(kmax, alpha, A, B, beta, C, D, tile, tile);
	}
2242#endif
2243
2244
2245
2246#if defined(TARGET_GENERIC) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
// 4x4 triangular multiply kernel with variable store size.
//
// Accumulates sum_k A[row+4*k]*B[col+4*k] over kmax steps, where step k
// (for k = 0, 1, 2) uses only B[0..k] of that step, so the entries of B with
// col > k in the first three steps are never read.  It then forms
// beta[0]*C[row+4*col] + alpha[0]*sum and writes the entries with row < km and
// col < kn to D[row+4*col].
//
// km below 1 is handled like 1 and km above 4 like 4; kn outside 1..3 is
// handled like 4.  All 16 entries of C are read regardless of km/kn.
//
// Fix: the three triangular head steps used to be guarded by `kmax>0` each, so
// a call with kmax of 1 or 2 still ran three steps, reading A and B beyond
// kmax columns and adding those products to the result.  The accumulation now
// stops after exactly kmax steps; results for kmax >= 3 are unchanged.
void kernel_dtrmm_nt_ru_4x4_vs_lib4(int kmax, double *alpha, double *A, double *B, double *beta, double *C, double *D, int km, int kn)
	{

	const int bs = 4;

	// acc[row + bs*col]
	double acc[16] = {0};

	int k, row, col, nrows, ncols, nb;

	for(k=0; k<kmax; k++)
		{
		// triangular head: step k touches columns 0..k only; full width from k = 3 on
		nb = k<bs-1 ? k+1 : bs;
		for(col=0; col<nb; col++)
			{
			for(row=0; row<bs; row++)
				{
				acc[row+bs*col] += A[row] * B[col];
				}
			}
		A += bs;
		B += bs;
		}

	// scale and add C
	for(col=0; col<bs; col++)
		{
		for(row=0; row<bs; row++)
			{
			acc[row+bs*col] = beta[0]*C[row+bs*col] + alpha[0]*acc[row+bs*col];
			}
		}

	// number of rows / columns to store
	if(km>=4)
		nrows = 4;
	else if(km>=3)
		nrows = 3;
	else if(km>=2)
		nrows = 2;
	else
		nrows = 1;

	ncols = (kn==1 || kn==2 || kn==3) ? kn : bs;

	for(col=0; col<ncols; col++)
		{
		for(row=0; row<nrows; row++)
			{
			D[row+bs*col] = acc[row+bs*col];
			}
		}

	}
2643#endif
2644
2645
2646
2647
2648#if defined(TARGET_GENERIC) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
// Fixed-size entry point: forwards to the variable-size kernel with km = kn = 4.
void kernel_dtrmm_nt_ru_4x4_lib4(int k, double *alpha, double *A, double *B, double *beta, double *C, double *D)
	{
	const int tile = 4; // passed as both km and kn

	kernel_dtrmm_nt_ru_4x4_vs_lib4(k, alpha, A, B, beta, C, D, tile, tile);
	}
2653#endif
2654
2655
2656
2657#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
2658void kernel_dtrmm_nn_rl_4x4_gen_lib4(int kmax, double *alpha, double *A, int offsetB, double *B, int sdb, int offsetD, double *D0, int sdd, int m0, int m1, int n0, int n1)
2659 {
2660
2661 const int bs = 4;
2662
2663 double
2664 a_0, a_1, a_2, a_3,
2665 b_0, b_1, b_2, b_3,
2666 c_00=0, c_01=0, c_02=0, c_03=0,
2667 c_10=0, c_11=0, c_12=0, c_13=0,
2668 c_20=0, c_21=0, c_22=0, c_23=0,
2669 c_30=0, c_31=0, c_32=0, c_33=0;
2670
2671 double *D1;
2672
2673 int k;
2674
2675 B += offsetB;
2676
2677 k = 0;
2678
2679 if(offsetB==0)
2680 {
2681
2682 // k = 0
2683
2684 a_0 = A[0];
2685 a_1 = A[1];
2686 a_2 = A[2];
2687 a_3 = A[3];
2688
2689 b_0 = B[0];
2690 c_00 += a_0 * b_0;
2691 c_10 += a_1 * b_0;
2692 c_20 += a_2 * b_0;
2693 c_30 += a_3 * b_0;
2694
2695 A += 4;
2696 B += 1;
2697 k += 1;
2698
2699 if(k>=kmax)
2700 goto store;
2701
2702 // k = 1
2703
2704 a_0 = A[0];
2705 a_1 = A[1];
2706 a_2 = A[2];
2707 a_3 = A[3];
2708
2709 b_0 = B[0];
2710 c_00 += a_0 * b_0;
2711 c_10 += a_1 * b_0;
2712 c_20 += a_2 * b_0;
2713 c_30 += a_3 * b_0;
2714
2715 b_1 = B[4];
2716 c_01 += a_0 * b_1;
2717 c_11 += a_1 * b_1;
2718 c_21 += a_2 * b_1;
2719 c_31 += a_3 * b_1;
2720
2721 A += 4;
2722 B += 1;
2723 k += 1;
2724
2725 if(k>=kmax)
2726 goto store;
2727
2728 // k = 2
2729
2730 a_0 = A[0];
2731 a_1 = A[1];
2732 a_2 = A[2];
2733 a_3 = A[3];
2734
2735 b_0 = B[0];
2736 c_00 += a_0 * b_0;
2737 c_10 += a_1 * b_0;
2738 c_20 += a_2 * b_0;
2739 c_30 += a_3 * b_0;
2740
2741 b_1 = B[4];
2742 c_01 += a_0 * b_1;
2743 c_11 += a_1 * b_1;
2744 c_21 += a_2 * b_1;
2745 c_31 += a_3 * b_1;
2746
2747 b_2 = B[8];
2748 c_02 += a_0 * b_2;
2749 c_12 += a_1 * b_2;
2750 c_22 += a_2 * b_2;
2751 c_32 += a_3 * b_2;
2752
2753 A += 4;
2754 B += 1;
2755 k += 1;
2756
2757 if(k>=kmax)
2758 goto store;
2759
2760 // k = 3
2761
2762 a_0 = A[0];
2763 a_1 = A[1];
2764 a_2 = A[2];
2765 a_3 = A[3];
2766
2767 b_0 = B[0];
2768 c_00 += a_0 * b_0;
2769 c_10 += a_1 * b_0;
2770 c_20 += a_2 * b_0;
2771 c_30 += a_3 * b_0;
2772
2773 b_1 = B[4];
2774 c_01 += a_0 * b_1;
2775 c_11 += a_1 * b_1;
2776 c_21 += a_2 * b_1;
2777 c_31 += a_3 * b_1;
2778
2779 b_2 = B[8];
2780 c_02 += a_0 * b_2;
2781 c_12 += a_1 * b_2;
2782 c_22 += a_2 * b_2;
2783 c_32 += a_3 * b_2;
2784
2785 b_3 = B[12];
2786 c_03 += a_0 * b_3;
2787 c_13 += a_1 * b_3;
2788 c_23 += a_2 * b_3;
2789 c_33 += a_3 * b_3;
2790
2791 A += 4;
2792 B += 4*sdb-3;
2793 k += 1;
2794
2795 }
2796 else if(offsetB==1)
2797 {
2798
2799 // k = 0
2800
2801 a_0 = A[0];
2802 a_1 = A[1];
2803 a_2 = A[2];
2804 a_3 = A[3];
2805
2806 b_0 = B[0];
2807 c_00 += a_0 * b_0;
2808 c_10 += a_1 * b_0;
2809 c_20 += a_2 * b_0;
2810 c_30 += a_3 * b_0;
2811
2812 A += 4;
2813 B += 1;
2814 k += 1;
2815
2816 if(k>=kmax)
2817 goto store;
2818
2819 // k = 1
2820
2821 a_0 = A[0];
2822 a_1 = A[1];
2823 a_2 = A[2];
2824 a_3 = A[3];
2825
2826 b_0 = B[0];
2827 c_00 += a_0 * b_0;
2828 c_10 += a_1 * b_0;
2829 c_20 += a_2 * b_0;
2830 c_30 += a_3 * b_0;
2831
2832 b_1 = B[4];
2833 c_01 += a_0 * b_1;
2834 c_11 += a_1 * b_1;
2835 c_21 += a_2 * b_1;
2836 c_31 += a_3 * b_1;
2837
2838 A += 4;
2839 B += 1;
2840 k += 1;
2841
2842 if(k>=kmax)
2843 goto store;
2844
2845 // k = 2
2846
2847 a_0 = A[0];
2848 a_1 = A[1];
2849 a_2 = A[2];
2850 a_3 = A[3];
2851
2852 b_0 = B[0];
2853 c_00 += a_0 * b_0;
2854 c_10 += a_1 * b_0;
2855 c_20 += a_2 * b_0;
2856 c_30 += a_3 * b_0;
2857
2858 b_1 = B[4];
2859 c_01 += a_0 * b_1;
2860 c_11 += a_1 * b_1;
2861 c_21 += a_2 * b_1;
2862 c_31 += a_3 * b_1;
2863
2864 b_2 = B[8];
2865 c_02 += a_0 * b_2;
2866 c_12 += a_1 * b_2;
2867 c_22 += a_2 * b_2;
2868 c_32 += a_3 * b_2;
2869
2870 A += 4;
2871 B += 4*sdb-3;
2872 k += 1;
2873
2874 }
2875 else if(offsetB==2)
2876 {
2877
2878 // k = 0
2879
2880 a_0 = A[0];
2881 a_1 = A[1];
2882 a_2 = A[2];
2883 a_3 = A[3];
2884
2885 b_0 = B[0];
2886 c_00 += a_0 * b_0;
2887 c_10 += a_1 * b_0;
2888 c_20 += a_2 * b_0;
2889 c_30 += a_3 * b_0;
2890
2891 A += 4;
2892 B += 1;
2893 k += 1;
2894
2895 if(k>=kmax)
2896 goto store;
2897
2898 // k = 1
2899
2900 a_0 = A[0];
2901 a_1 = A[1];
2902 a_2 = A[2];
2903 a_3 = A[3];
2904
2905 b_0 = B[0];
2906 c_00 += a_0 * b_0;
2907 c_10 += a_1 * b_0;
2908 c_20 += a_2 * b_0;
2909 c_30 += a_3 * b_0;
2910
2911 b_1 = B[4];
2912 c_01 += a_0 * b_1;
2913 c_11 += a_1 * b_1;
2914 c_21 += a_2 * b_1;
2915 c_31 += a_3 * b_1;
2916
2917 A += 4;
2918 B += 4*sdb-3;
2919 k += 1;
2920
2921 if(k>=kmax)
2922 goto store;
2923
2924 // k = 2
2925
2926 a_0 = A[0];
2927 a_1 = A[1];
2928 a_2 = A[2];
2929 a_3 = A[3];
2930
2931 b_0 = B[0];
2932 c_00 += a_0 * b_0;
2933 c_10 += a_1 * b_0;
2934 c_20 += a_2 * b_0;
2935 c_30 += a_3 * b_0;
2936
2937 b_1 = B[4];
2938 c_01 += a_0 * b_1;
2939 c_11 += a_1 * b_1;
2940 c_21 += a_2 * b_1;
2941 c_31 += a_3 * b_1;
2942
2943 b_2 = B[8];
2944 c_02 += a_0 * b_2;
2945 c_12 += a_1 * b_2;
2946 c_22 += a_2 * b_2;
2947 c_32 += a_3 * b_2;
2948
2949 A += 4;
2950 B += 1;
2951 k += 1;
2952
2953 if(k>=kmax)
2954 goto store;
2955
2956 // k = 3
2957
2958 a_0 = A[0];
2959 a_1 = A[1];
2960 a_2 = A[2];
2961 a_3 = A[3];
2962
2963 b_0 = B[0];
2964 c_00 += a_0 * b_0;
2965 c_10 += a_1 * b_0;
2966 c_20 += a_2 * b_0;
2967 c_30 += a_3 * b_0;
2968
2969 b_1 = B[4];
2970 c_01 += a_0 * b_1;
2971 c_11 += a_1 * b_1;
2972 c_21 += a_2 * b_1;
2973 c_31 += a_3 * b_1;
2974
2975 b_2 = B[8];
2976 c_02 += a_0 * b_2;
2977 c_12 += a_1 * b_2;
2978 c_22 += a_2 * b_2;
2979 c_32 += a_3 * b_2;
2980
2981 b_3 = B[12];
2982 c_03 += a_0 * b_3;
2983 c_13 += a_1 * b_3;
2984 c_23 += a_2 * b_3;
2985 c_33 += a_3 * b_3;
2986
2987 A += 4;
2988 B += 1;
2989 k += 1;
2990
2991 if(k>=kmax)
2992 goto store;
2993
2994 // k = 4
2995
2996 a_0 = A[0];
2997 a_1 = A[1];
2998 a_2 = A[2];
2999 a_3 = A[3];
3000
3001 b_0 = B[0];
3002 c_00 += a_0 * b_0;
3003 c_10 += a_1 * b_0;
3004 c_20 += a_2 * b_0;
3005 c_30 += a_3 * b_0;
3006
3007 b_1 = B[4];
3008 c_01 += a_0 * b_1;
3009 c_11 += a_1 * b_1;
3010 c_21 += a_2 * b_1;
3011 c_31 += a_3 * b_1;
3012
3013 b_2 = B[8];
3014 c_02 += a_0 * b_2;
3015 c_12 += a_1 * b_2;
3016 c_22 += a_2 * b_2;
3017 c_32 += a_3 * b_2;
3018
3019 b_3 = B[12];
3020 c_03 += a_0 * b_3;
3021 c_13 += a_1 * b_3;
3022 c_23 += a_2 * b_3;
3023 c_33 += a_3 * b_3;
3024
3025 A += 4;
3026 B += 1;
3027 k += 1;
3028
3029 if(k>=kmax)
3030 goto store;
3031
3032 // k = 5
3033
3034 a_0 = A[0];
3035 a_1 = A[1];
3036 a_2 = A[2];
3037 a_3 = A[3];
3038
3039 b_0 = B[0];
3040 c_00 += a_0 * b_0;
3041 c_10 += a_1 * b_0;
3042 c_20 += a_2 * b_0;
3043 c_30 += a_3 * b_0;
3044
3045 b_1 = B[4];
3046 c_01 += a_0 * b_1;
3047 c_11 += a_1 * b_1;
3048 c_21 += a_2 * b_1;
3049 c_31 += a_3 * b_1;
3050
3051 b_2 = B[8];
3052 c_02 += a_0 * b_2;
3053 c_12 += a_1 * b_2;
3054 c_22 += a_2 * b_2;
3055 c_32 += a_3 * b_2;
3056
3057 b_3 = B[12];
3058 c_03 += a_0 * b_3;
3059 c_13 += a_1 * b_3;
3060 c_23 += a_2 * b_3;
3061 c_33 += a_3 * b_3;
3062
3063 A += 4;
3064 B += 4*sdb-3;
3065 k += 1;
3066
3067 }
3068 else // if(offetB==3)
3069 {
3070
3071 // k = 0
3072
3073 a_0 = A[0];
3074 a_1 = A[1];
3075 a_2 = A[2];
3076 a_3 = A[3];
3077
3078 b_0 = B[0];
3079 c_00 += a_0 * b_0;
3080 c_10 += a_1 * b_0;
3081 c_20 += a_2 * b_0;
3082 c_30 += a_3 * b_0;
3083
3084 A += 4;
3085 B += 4*sdb-3;
3086 k += 1;
3087
3088 if(k>=kmax)
3089 goto store;
3090
3091 // k = 1
3092
3093 a_0 = A[0];
3094 a_1 = A[1];
3095 a_2 = A[2];
3096 a_3 = A[3];
3097
3098 b_0 = B[0];
3099 c_00 += a_0 * b_0;
3100 c_10 += a_1 * b_0;
3101 c_20 += a_2 * b_0;
3102 c_30 += a_3 * b_0;
3103
3104 b_1 = B[4];
3105 c_01 += a_0 * b_1;
3106 c_11 += a_1 * b_1;
3107 c_21 += a_2 * b_1;
3108 c_31 += a_3 * b_1;
3109
3110 A += 4;
3111 B += 1;
3112 k += 1;
3113
3114 if(k>=kmax)
3115 goto store;
3116
3117 // k = 2
3118
3119 a_0 = A[0];
3120 a_1 = A[1];
3121 a_2 = A[2];
3122 a_3 = A[3];
3123
3124 b_0 = B[0];
3125 c_00 += a_0 * b_0;
3126 c_10 += a_1 * b_0;
3127 c_20 += a_2 * b_0;
3128 c_30 += a_3 * b_0;
3129
3130 b_1 = B[4];
3131 c_01 += a_0 * b_1;
3132 c_11 += a_1 * b_1;
3133 c_21 += a_2 * b_1;
3134 c_31 += a_3 * b_1;
3135
3136 b_2 = B[8];
3137 c_02 += a_0 * b_2;
3138 c_12 += a_1 * b_2;
3139 c_22 += a_2 * b_2;
3140 c_32 += a_3 * b_2;
3141
3142 A += 4;
3143 B += 1;
3144 k += 1;
3145
3146 if(k>=kmax)
3147 goto store;
3148
3149 // k = 3
3150
3151 a_0 = A[0];
3152 a_1 = A[1];
3153 a_2 = A[2];
3154 a_3 = A[3];
3155
3156 b_0 = B[0];
3157 c_00 += a_0 * b_0;
3158 c_10 += a_1 * b_0;
3159 c_20 += a_2 * b_0;
3160 c_30 += a_3 * b_0;
3161
3162 b_1 = B[4];
3163 c_01 += a_0 * b_1;
3164 c_11 += a_1 * b_1;
3165 c_21 += a_2 * b_1;
3166 c_31 += a_3 * b_1;
3167
3168 b_2 = B[8];
3169 c_02 += a_0 * b_2;
3170 c_12 += a_1 * b_2;
3171 c_22 += a_2 * b_2;
3172 c_32 += a_3 * b_2;
3173
3174 b_3 = B[12];
3175 c_03 += a_0 * b_3;
3176 c_13 += a_1 * b_3;
3177 c_23 += a_2 * b_3;
3178 c_33 += a_3 * b_3;
3179
3180 A += 4;
3181 B += 1;
3182 k += 1;
3183
3184 if(k>=kmax)
3185 goto store;
3186
3187 // k = 4
3188
3189 a_0 = A[0];
3190 a_1 = A[1];
3191 a_2 = A[2];
3192 a_3 = A[3];
3193
3194 b_0 = B[0];
3195 c_00 += a_0 * b_0;
3196 c_10 += a_1 * b_0;
3197 c_20 += a_2 * b_0;
3198 c_30 += a_3 * b_0;
3199
3200 b_1 = B[4];
3201 c_01 += a_0 * b_1;
3202 c_11 += a_1 * b_1;
3203 c_21 += a_2 * b_1;
3204 c_31 += a_3 * b_1;
3205
3206 b_2 = B[8];
3207 c_02 += a_0 * b_2;
3208 c_12 += a_1 * b_2;
3209 c_22 += a_2 * b_2;
3210 c_32 += a_3 * b_2;
3211
3212 b_3 = B[12];
3213 c_03 += a_0 * b_3;
3214 c_13 += a_1 * b_3;
3215 c_23 += a_2 * b_3;
3216 c_33 += a_3 * b_3;
3217
3218 A += 4;
3219 B += 4*sdb-3;
3220 k += 1;
3221
3222 }
3223
3224 for(; k<kmax-3; k+=4)
3225 {
3226
3227 // k = 0
3228
3229 a_0 = A[0];
3230 a_1 = A[1];
3231 a_2 = A[2];
3232 a_3 = A[3];
3233
3234 b_0 = B[0];
3235 b_1 = B[4];
3236 b_2 = B[8];
3237 b_3 = B[12];
3238
3239 c_00 += a_0 * b_0;
3240 c_10 += a_1 * b_0;
3241 c_20 += a_2 * b_0;
3242 c_30 += a_3 * b_0;
3243
3244 c_01 += a_0 * b_1;
3245 c_11 += a_1 * b_1;
3246 c_21 += a_2 * b_1;
3247 c_31 += a_3 * b_1;
3248
3249 c_02 += a_0 * b_2;
3250 c_12 += a_1 * b_2;
3251 c_22 += a_2 * b_2;
3252 c_32 += a_3 * b_2;
3253
3254 c_03 += a_0 * b_3;
3255 c_13 += a_1 * b_3;
3256 c_23 += a_2 * b_3;
3257 c_33 += a_3 * b_3;
3258
3259
3260 // k = 1
3261
3262 a_0 = A[4];
3263 a_1 = A[5];
3264 a_2 = A[6];
3265 a_3 = A[7];
3266
3267 b_0 = B[1];
3268 b_1 = B[5];
3269 b_2 = B[9];
3270 b_3 = B[13];
3271
3272 c_00 += a_0 * b_0;
3273 c_10 += a_1 * b_0;
3274 c_20 += a_2 * b_0;
3275 c_30 += a_3 * b_0;
3276
3277 c_01 += a_0 * b_1;
3278 c_11 += a_1 * b_1;
3279 c_21 += a_2 * b_1;
3280 c_31 += a_3 * b_1;
3281
3282 c_02 += a_0 * b_2;
3283 c_12 += a_1 * b_2;
3284 c_22 += a_2 * b_2;
3285 c_32 += a_3 * b_2;
3286
3287 c_03 += a_0 * b_3;
3288 c_13 += a_1 * b_3;
3289 c_23 += a_2 * b_3;
3290 c_33 += a_3 * b_3;
3291
3292
3293 // k = 2
3294
3295 a_0 = A[8];
3296 a_1 = A[9];
3297 a_2 = A[10];
3298 a_3 = A[11];
3299
3300 b_0 = B[2];
3301 b_1 = B[6];
3302 b_2 = B[10];
3303 b_3 = B[14];
3304
3305 c_00 += a_0 * b_0;
3306 c_10 += a_1 * b_0;
3307 c_20 += a_2 * b_0;
3308 c_30 += a_3 * b_0;
3309
3310 c_01 += a_0 * b_1;
3311 c_11 += a_1 * b_1;
3312 c_21 += a_2 * b_1;
3313 c_31 += a_3 * b_1;
3314
3315 c_02 += a_0 * b_2;
3316 c_12 += a_1 * b_2;
3317 c_22 += a_2 * b_2;
3318 c_32 += a_3 * b_2;
3319
3320 c_03 += a_0 * b_3;
3321 c_13 += a_1 * b_3;
3322 c_23 += a_2 * b_3;
3323 c_33 += a_3 * b_3;
3324
3325
3326 // k = 3
3327
3328 a_0 = A[12];
3329 a_1 = A[13];
3330 a_2 = A[14];
3331 a_3 = A[15];
3332
3333 b_0 = B[3];
3334 b_1 = B[7];
3335 b_2 = B[11];
3336 b_3 = B[15];
3337
3338 c_00 += a_0 * b_0;
3339 c_10 += a_1 * b_0;
3340 c_20 += a_2 * b_0;
3341 c_30 += a_3 * b_0;
3342
3343 c_01 += a_0 * b_1;
3344 c_11 += a_1 * b_1;
3345 c_21 += a_2 * b_1;
3346 c_31 += a_3 * b_1;
3347
3348 c_02 += a_0 * b_2;
3349 c_12 += a_1 * b_2;
3350 c_22 += a_2 * b_2;
3351 c_32 += a_3 * b_2;
3352
3353 c_03 += a_0 * b_3;
3354 c_13 += a_1 * b_3;
3355 c_23 += a_2 * b_3;
3356 c_33 += a_3 * b_3;
3357
3358 A += 16;
3359 B += 4*sdb;
3360
3361 }
3362
3363 for(; k<kmax; k++)
3364 {
3365
3366 // k = 0
3367
3368 a_0 = A[0];
3369 a_1 = A[1];
3370 a_2 = A[2];
3371 a_3 = A[3];
3372
3373 b_0 = B[0];
3374 b_1 = B[4];
3375 b_2 = B[8];
3376 b_3 = B[12];
3377
3378 c_00 += a_0 * b_0;
3379 c_10 += a_1 * b_0;
3380 c_20 += a_2 * b_0;
3381 c_30 += a_3 * b_0;
3382
3383 c_01 += a_0 * b_1;
3384 c_11 += a_1 * b_1;
3385 c_21 += a_2 * b_1;
3386 c_31 += a_3 * b_1;
3387
3388 c_02 += a_0 * b_2;
3389 c_12 += a_1 * b_2;
3390 c_22 += a_2 * b_2;
3391 c_32 += a_3 * b_2;
3392
3393 c_03 += a_0 * b_3;
3394 c_13 += a_1 * b_3;
3395 c_23 += a_2 * b_3;
3396 c_33 += a_3 * b_3;
3397
3398 A += 4;
3399 B += 1;
3400
3401 }
3402
3403 store:
3404
3405 c_00 = alpha[0]*c_00;
3406 c_10 = alpha[0]*c_10;
3407 c_20 = alpha[0]*c_20;
3408 c_30 = alpha[0]*c_30;
3409
3410 c_01 = alpha[0]*c_01;
3411 c_11 = alpha[0]*c_11;
3412 c_21 = alpha[0]*c_21;
3413 c_31 = alpha[0]*c_31;
3414
3415 c_02 = alpha[0]*c_02;
3416 c_12 = alpha[0]*c_12;
3417 c_22 = alpha[0]*c_22;
3418 c_32 = alpha[0]*c_32;
3419
3420 c_03 = alpha[0]*c_03;
3421 c_13 = alpha[0]*c_13;
3422 c_23 = alpha[0]*c_23;
3423 c_33 = alpha[0]*c_33;
3424
3425 // shift sol for cols
3426 if(n0>0)
3427 {
3428 if(n0==1)
3429 {
3430 c_00 = c_01;
3431 c_10 = c_11;
3432 c_20 = c_21;
3433 c_30 = c_31;
3434
3435 c_01 = c_02;
3436 c_11 = c_12;
3437 c_21 = c_22;
3438 c_31 = c_32;
3439
3440 c_02 = c_03;
3441 c_12 = c_13;
3442 c_22 = c_23;
3443 c_32 = c_33;
3444
3445 D0 += 1*bs;
3446 }
3447 else if(n0==2)
3448 {
3449 c_00 = c_02;
3450 c_10 = c_12;
3451 c_20 = c_22;
3452 c_30 = c_32;
3453
3454 c_01 = c_03;
3455 c_11 = c_13;
3456 c_21 = c_23;
3457 c_31 = c_33;
3458
3459 D0 += 2*bs;
3460 }
3461 else //if(n0==3)
3462 {
3463 c_00 = c_03;
3464 c_10 = c_13;
3465 c_20 = c_23;
3466 c_30 = c_33;
3467
3468 D0 += 3*bs;
3469 }
3470 }
3471
3472 int kn = n1 - n0;
3473
3474 if(offsetD==0)
3475 {
3476 if(kn<=0)
3477 return;
3478
3479 if(m0<=0 & m1>0) D0[0+bs*0] = c_00;
3480 if(m0<=1 & m1>1) D0[1+bs*0] = c_10;
3481 if(m0<=2 & m1>2) D0[2+bs*0] = c_20;
3482 if(m0<=3 & m1>3) D0[3+bs*0] = c_30;
3483
3484 if(kn<=1)
3485 return;
3486
3487 if(m0<=0 & m1>0) D0[0+bs*1] = c_01;
3488 if(m0<=1 & m1>1) D0[1+bs*1] = c_11;
3489 if(m0<=2 & m1>2) D0[2+bs*1] = c_21;
3490 if(m0<=3 & m1>3) D0[3+bs*1] = c_31;
3491
3492 if(kn<=2)
3493 return;
3494
3495 if(m0<=0 & m1>0) D0[0+bs*2] = c_02;
3496 if(m0<=1 & m1>1) D0[1+bs*2] = c_12;
3497 if(m0<=2 & m1>2) D0[2+bs*2] = c_22;
3498 if(m0<=3 & m1>3) D0[3+bs*2] = c_32;
3499
3500 if(kn<=3)
3501 return;
3502
3503 if(m0<=0 & m1>0) D0[0+bs*3] = c_03;
3504 if(m0<=1 & m1>1) D0[1+bs*3] = c_13;
3505 if(m0<=2 & m1>2) D0[2+bs*3] = c_23;
3506 if(m0<=3 & m1>3) D0[3+bs*3] = c_33;
3507 }
3508 else if(offsetD==1)
3509 {
3510 D1 = D0 + sdd*bs;
3511
3512 if(kn<=0)
3513 return;
3514
3515 if(m0<=0 & m1>0) D0[1+bs*0] = c_00;
3516 if(m0<=1 & m1>1) D0[2+bs*0] = c_10;
3517 if(m0<=2 & m1>2) D0[3+bs*0] = c_20;
3518 if(m0<=3 & m1>3) D1[0+bs*0] = c_30;
3519
3520 if(kn<=1)
3521 return;
3522
3523 if(m0<=0 & m1>0) D0[1+bs*1] = c_01;
3524 if(m0<=1 & m1>1) D0[2+bs*1] = c_11;
3525 if(m0<=2 & m1>2) D0[3+bs*1] = c_21;
3526 if(m0<=3 & m1>3) D1[0+bs*1] = c_31;
3527
3528 if(kn<=2)
3529 return;
3530
3531 if(m0<=0 & m1>0) D0[1+bs*2] = c_02;
3532 if(m0<=1 & m1>1) D0[2+bs*2] = c_12;
3533 if(m0<=2 & m1>2) D0[3+bs*2] = c_22;
3534 if(m0<=3 & m1>3) D1[0+bs*2] = c_32;
3535
3536 if(kn<=3)
3537 return;
3538
3539 if(m0<=0 & m1>0) D0[1+bs*3] = c_03;
3540 if(m0<=1 & m1>1) D0[2+bs*3] = c_13;
3541 if(m0<=2 & m1>2) D0[3+bs*3] = c_23;
3542 if(m0<=3 & m1>3) D1[0+bs*3] = c_33;
3543 }
3544 else if(offsetD==2)
3545 {
3546 D1 = D0 + sdd*bs;
3547
3548 if(kn<=0)
3549 return;
3550
3551 if(m0<=0 & m1>0) D0[2+bs*0] = c_00;
3552 if(m0<=1 & m1>1) D0[3+bs*0] = c_10;
3553 if(m0<=2 & m1>2) D1[0+bs*0] = c_20;
3554 if(m0<=3 & m1>3) D1[1+bs*0] = c_30;
3555
3556 if(kn<=1)
3557 return;
3558
3559 if(m0<=0 & m1>0) D0[2+bs*1] = c_01;
3560 if(m0<=1 & m1>1) D0[3+bs*1] = c_11;
3561 if(m0<=2 & m1>2) D1[0+bs*1] = c_21;
3562 if(m0<=3 & m1>3) D1[1+bs*1] = c_31;
3563
3564 if(kn<=2)
3565 return;
3566
3567 if(m0<=0 & m1>0) D0[2+bs*2] = c_02;
3568 if(m0<=1 & m1>1) D0[3+bs*2] = c_12;
3569 if(m0<=2 & m1>2) D1[0+bs*2] = c_22;
3570 if(m0<=3 & m1>3) D1[1+bs*2] = c_32;
3571
3572 if(kn<=3)
3573 return;
3574
3575 if(m0<=0 & m1>0) D0[2+bs*3] = c_03;
3576 if(m0<=1 & m1>1) D0[3+bs*3] = c_13;
3577 if(m0<=2 & m1>2) D1[0+bs*3] = c_23;
3578 if(m0<=3 & m1>3) D1[1+bs*3] = c_33;
3579 }
3580 else //if(offsetD==3)
3581 {
3582 D1 = D0 + sdd*bs;
3583
3584 if(kn<=0)
3585 return;
3586
3587 if(m0<=0 & m1>0) D0[3+bs*0] = c_00;
3588 if(m0<=1 & m1>1) D1[0+bs*0] = c_10;
3589 if(m0<=2 & m1>2) D1[1+bs*0] = c_20;
3590 if(m0<=3 & m1>3) D1[2+bs*0] = c_30;
3591
3592 if(kn<=1)
3593 return;
3594
3595 if(m0<=0 & m1>0) D0[3+bs*1] = c_01;
3596 if(m0<=1 & m1>1) D1[0+bs*1] = c_11;
3597 if(m0<=2 & m1>2) D1[1+bs*1] = c_21;
3598 if(m0<=3 & m1>3) D1[2+bs*1] = c_31;
3599
3600 if(kn<=2)
3601 return;
3602
3603 if(m0<=0 & m1>0) D0[3+bs*2] = c_02;
3604 if(m0<=1 & m1>1) D1[0+bs*2] = c_12;
3605 if(m0<=2 & m1>2) D1[1+bs*2] = c_22;
3606 if(m0<=3 & m1>3) D1[2+bs*2] = c_32;
3607
3608 if(kn<=3)
3609 return;
3610
3611 if(m0<=0 & m1>0) D0[3+bs*3] = c_03;
3612 if(m0<=1 & m1>1) D1[0+bs*3] = c_13;
3613 if(m0<=2 & m1>2) D1[1+bs*3] = c_23;
3614 if(m0<=3 & m1>3) D1[2+bs*3] = c_33;
3615 }
3616
3617 return;
3618
3619 }
3620#endif
3621
3622
3623
3624#if defined(TARGET_GENERIC) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
// Full-block 4x4 variant: forwards to kernel_dtrmm_nn_rl_4x4_gen_lib4 with fixed
// values for the arguments this wrapper does not expose.
//
// kmax, alpha, A, offsetB, B, sdb and D are passed through unchanged.
void kernel_dtrmm_nn_rl_4x4_lib4(int kmax, double *alpha, double *A, int offsetB, double *B, int sdb, double *D)
	{
	// NOTE(review): the prototype of the generic kernel is not visible from here;
	// judging by the names used in its body these constants presumably map to
	// (offsetD, sdd, m0, m1, n0, n1) -- confirm against the prototype.
	const int d_offset = 0;
	const int d_stride = 0;
	const int row_lo = 0;
	const int row_hi = 4;
	const int col_lo = 0;
	const int col_hi = 4;

	kernel_dtrmm_nn_rl_4x4_gen_lib4(kmax, alpha, A, offsetB, B, sdb, d_offset, D, d_stride, row_lo, row_hi, col_lo, col_hi);
	}
3629#endif
3630
3631
3632
3633#if defined(TARGET_GENERIC) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
3634void kernel_dpotrf_nt_l_4x4_vs_lib4(int kmax, double *A, double *B, double *C, double *D, double *inv_diag_D, int km, int kn)
3635 {
3636
3637 const int bs = 4;
3638
3639 double
3640 a_0, a_1, a_2, a_3,
3641 b_0, b_1, b_2, b_3,
3642 tmp,
3643 c_00=0, //c_01=0, c_02=0, c_03=0,
3644 c_10=0, c_11=0, //c_12=0, c_13=0,
3645 c_20=0, c_21=0, c_22=0, //c_23=0,
3646 c_30=0, c_31=0, c_32=0, c_33=0;
3647
3648 int k;
3649
3650 for(k=0; k<kmax-3; k+=4)
3651 {
3652
3653 // k = 0
3654
3655 a_0 = A[0];
3656 a_1 = A[1];
3657 a_2 = A[2];
3658 a_3 = A[3];
3659
3660 b_0 = B[0];
3661 b_1 = B[1];
3662 b_2 = B[2];
3663 b_3 = B[3];
3664
3665 c_00 -= a_0 * b_0;
3666 c_10 -= a_1 * b_0;
3667 c_20 -= a_2 * b_0;
3668 c_30 -= a_3 * b_0;
3669
3670// c_01 -= a_0 * b_1;
3671 c_11 -= a_1 * b_1;
3672 c_21 -= a_2 * b_1;
3673 c_31 -= a_3 * b_1;
3674
3675// c_02 -= a_0 * b_2;
3676// c_12 -= a_1 * b_2;
3677 c_22 -= a_2 * b_2;
3678 c_32 -= a_3 * b_2;
3679
3680// c_03 -= a_0 * b_3;
3681// c_13 -= a_1 * b_3;
3682// c_23 -= a_2 * b_3;
3683 c_33 -= a_3 * b_3;
3684
3685
3686 // k = 1
3687
3688 a_0 = A[4];
3689 a_1 = A[5];
3690 a_2 = A[6];
3691 a_3 = A[7];
3692
3693 b_0 = B[4];
3694 b_1 = B[5];
3695 b_2 = B[6];
3696 b_3 = B[7];
3697
3698 c_00 -= a_0 * b_0;
3699 c_10 -= a_1 * b_0;
3700 c_20 -= a_2 * b_0;
3701 c_30 -= a_3 * b_0;
3702
3703// c_01 -= a_0 * b_1;
3704 c_11 -= a_1 * b_1;
3705 c_21 -= a_2 * b_1;
3706 c_31 -= a_3 * b_1;
3707
3708// c_02 -= a_0 * b_2;
3709// c_12 -= a_1 * b_2;
3710 c_22 -= a_2 * b_2;
3711 c_32 -= a_3 * b_2;
3712
3713// c_03 -= a_0 * b_3;
3714// c_13 -= a_1 * b_3;
3715// c_23 -= a_2 * b_3;
3716 c_33 -= a_3 * b_3;
3717
3718
3719 // k = 2
3720
3721 a_0 = A[8];
3722 a_1 = A[9];
3723 a_2 = A[10];
3724 a_3 = A[11];
3725
3726 b_0 = B[8];
3727 b_1 = B[9];
3728 b_2 = B[10];
3729 b_3 = B[11];
3730
3731 c_00 -= a_0 * b_0;
3732 c_10 -= a_1 * b_0;
3733 c_20 -= a_2 * b_0;
3734 c_30 -= a_3 * b_0;
3735
3736// c_01 -= a_0 * b_1;
3737 c_11 -= a_1 * b_1;
3738 c_21 -= a_2 * b_1;
3739 c_31 -= a_3 * b_1;
3740
3741// c_02 -= a_0 * b_2;
3742// c_12 -= a_1 * b_2;
3743 c_22 -= a_2 * b_2;
3744 c_32 -= a_3 * b_2;
3745
3746// c_03 -= a_0 * b_3;
3747// c_13 -= a_1 * b_3;
3748// c_23 -= a_2 * b_3;
3749 c_33 -= a_3 * b_3;
3750
3751
3752 // k = 3
3753
3754 a_0 = A[12];
3755 a_1 = A[13];
3756 a_2 = A[14];
3757 a_3 = A[15];
3758
3759 b_0 = B[12];
3760 b_1 = B[13];
3761 b_2 = B[14];
3762 b_3 = B[15];
3763
3764 c_00 -= a_0 * b_0;
3765 c_10 -= a_1 * b_0;
3766 c_20 -= a_2 * b_0;
3767 c_30 -= a_3 * b_0;
3768
3769// c_01 -= a_0 * b_1;
3770 c_11 -= a_1 * b_1;
3771 c_21 -= a_2 * b_1;
3772 c_31 -= a_3 * b_1;
3773
3774// c_02 -= a_0 * b_2;
3775// c_12 -= a_1 * b_2;
3776 c_22 -= a_2 * b_2;
3777 c_32 -= a_3 * b_2;
3778
3779// c_03 -= a_0 * b_3;
3780// c_13 -= a_1 * b_3;
3781// c_23 -= a_2 * b_3;
3782 c_33 -= a_3 * b_3;
3783
3784 A += 16;
3785 B += 16;
3786
3787 }
3788
3789 for(; k<kmax; k++)
3790 {
3791
3792 // k = 0
3793
3794 a_0 = A[0];
3795 a_1 = A[1];
3796 a_2 = A[2];
3797 a_3 = A[3];
3798
3799 b_0 = B[0];
3800 b_1 = B[1];
3801 b_2 = B[2];
3802 b_3 = B[3];
3803
3804 c_00 -= a_0 * b_0;
3805 c_10 -= a_1 * b_0;
3806 c_20 -= a_2 * b_0;
3807 c_30 -= a_3 * b_0;
3808
3809// c_01 -= a_0 * b_1;
3810 c_11 -= a_1 * b_1;
3811 c_21 -= a_2 * b_1;
3812 c_31 -= a_3 * b_1;
3813
3814// c_02 -= a_0 * b_2;
3815// c_12 -= a_1 * b_2;
3816 c_22 -= a_2 * b_2;
3817 c_32 -= a_3 * b_2;
3818
3819// c_03 -= a_0 * b_3;
3820// c_13 -= a_1 * b_3;
3821// c_23 -= a_2 * b_3;
3822 c_33 -= a_3 * b_3;
3823
3824 A += 4;
3825 B += 4;
3826
3827 }
3828
3829 c_00 = C[0+bs*0] + c_00;
3830 c_10 = C[1+bs*0] + c_10;
3831 c_20 = C[2+bs*0] + c_20;
3832 c_30 = C[3+bs*0] + c_30;
3833
3834// c_01 = C[0+bs*1] + c_01;
3835 c_11 = C[1+bs*1] + c_11;
3836 c_21 = C[2+bs*1] + c_21;
3837 c_31 = C[3+bs*1] + c_31;
3838
3839// c_02 = C[0+bs*2] + c_02;
3840// c_12 = C[1+bs*2] + c_12;
3841 c_22 = C[2+bs*2] + c_22;
3842 c_32 = C[3+bs*2] + c_32;
3843
3844// c_03 = C[0+bs*3] + c_03;
3845// c_13 = C[1+bs*3] + c_13;
3846// c_23 = C[2+bs*3] + c_23;
3847 c_33 = C[3+bs*3] + c_33;
3848
3849 if(c_00>0)
3850 {
3851 c_00 = sqrt(c_00);
3852 tmp = 1.0/c_00;
3853 }
3854 else
3855 {
3856 c_00 = 0.0;
3857 tmp = 0.0;
3858 }
3859 c_10 *= tmp;
3860 c_20 *= tmp;
3861 c_30 *= tmp;
3862 inv_diag_D[0] = tmp;
3863
3864 if(kn==1)
3865 goto store;
3866
3867 c_11 -= c_10 * c_10;
3868 c_21 -= c_20 * c_10;
3869 c_31 -= c_30 * c_10;
3870 if(c_11>0)
3871 {
3872 c_11 = sqrt(c_11);
3873 tmp = 1.0/c_11;
3874 }
3875 else
3876 {
3877 c_11 = 0.0;
3878 tmp = 0.0;
3879 }
3880 c_21 *= tmp;
3881 c_31 *= tmp;
3882 inv_diag_D[1] = tmp;
3883
3884 if(kn==2)
3885 goto store;
3886
3887 c_22 -= c_20 * c_20;
3888 c_32 -= c_30 * c_20;
3889 c_22 -= c_21 * c_21;
3890 c_32 -= c_31 * c_21;
3891 if(c_22>0)
3892 {
3893 c_22 = sqrt(c_22);
3894 tmp = 1.0/c_22;
3895 }
3896 else
3897 {
3898 c_22 = 0.0;
3899 tmp = 0.0;
3900 }
3901 c_32 *= tmp;
3902 inv_diag_D[2] = tmp;
3903
3904 if(kn==3)
3905 goto store;
3906
3907 c_33 -= c_30 * c_30;
3908 c_33 -= c_31 * c_31;
3909 c_33 -= c_32 * c_32;
3910 if(c_33>0)
3911 {
3912 c_33 = sqrt(c_33);
3913 tmp = 1.0/c_33;
3914 }
3915 else
3916 {
3917 c_33 = 0.0;
3918 tmp = 0.0;
3919 }
3920 inv_diag_D[3] = tmp;
3921
3922
3923 store:
3924
3925 if(km>=4)
3926 {
3927 D[0+bs*0] = c_00;
3928 D[1+bs*0] = c_10;
3929 D[2+bs*0] = c_20;
3930 D[3+bs*0] = c_30;
3931
3932 if(kn==1)
3933 return;
3934
3935// D[0+bs*1] = c_01;
3936 D[1+bs*1] = c_11;
3937 D[2+bs*1] = c_21;
3938 D[3+bs*1] = c_31;
3939
3940 if(kn==2)
3941 return;
3942
3943// D[0+bs*2] = c_02;
3944// D[1+bs*2] = c_12;
3945 D[2+bs*2] = c_22;
3946 D[3+bs*2] = c_32;
3947
3948 if(kn==3)
3949 return;
3950
3951// D[0+bs*3] = c_03;
3952// D[1+bs*3] = c_13;
3953// D[2+bs*3] = c_23;
3954 D[3+bs*3] = c_33;
3955 }
3956 else if(km>=3)
3957 {
3958 D[0+bs*0] = c_00;
3959 D[1+bs*0] = c_10;
3960 D[2+bs*0] = c_20;
3961
3962 if(kn==1)
3963 return;
3964
3965// D[0+bs*1] = c_01;
3966 D[1+bs*1] = c_11;
3967 D[2+bs*1] = c_21;
3968
3969 if(kn==2)
3970 return;
3971
3972// D[0+bs*2] = c_02;
3973// D[1+bs*2] = c_12;
3974 D[2+bs*2] = c_22;
3975
3976// if(kn==3)
3977// return;
3978
3979// D[0+bs*3] = c_03;
3980// D[1+bs*3] = c_13;
3981// D[2+bs*3] = c_23;
3982 }
3983 else if(km>=2)
3984 {
3985 D[0+bs*0] = c_00;
3986 D[1+bs*0] = c_10;
3987
3988 if(kn==1)
3989 return;
3990
3991// D[0+bs*1] = c_01;
3992 D[1+bs*1] = c_11;
3993
3994// if(kn==2)
3995// return;
3996
3997// D[0+bs*2] = c_02;
3998// D[1+bs*2] = c_12;
3999
4000// if(kn==3)
4001// return;
4002
4003// D[0+bs*3] = c_03;
4004// D[1+bs*3] = c_13;
4005 }
4006 else //if(km>=1)
4007 {
4008 D[0+bs*0] = c_00;
4009
4010// if(kn==1)
4011// return;
4012
4013// D[0+bs*1] = c_01;
4014
4015// if(kn==2)
4016// return;
4017
4018// D[0+bs*2] = c_02;
4019
4020// if(kn==3)
4021// return;
4022
4023// D[0+bs*3] = c_03;
4024 }
4025
4026 }
4027#endif
4028
4029
4030
4031#if defined(TARGET_GENERIC) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
// Full-size variant of kernel_dpotrf_nt_l_4x4_vs_lib4: same operation with the
// row and column counts both fixed to 4.
void kernel_dpotrf_nt_l_4x4_lib4(int kmax, double *A, double *B, double *C, double *D, double *inv_diag_D)
	{
	const int full_rows = 4;
	const int full_cols = 4;

	kernel_dpotrf_nt_l_4x4_vs_lib4(kmax, A, B, C, D, inv_diag_D, full_rows, full_cols);
	}
4036#endif
4037
4038
4039
4040#if defined(TARGET_GENERIC) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
// Fused kernel, variable size: runs kernel_dsyrk_nt_l_4x4_vs_lib4 on (Ap, Bp)
// over kp terms with both scaling factors set to 1.0, writing into D, and then
// runs kernel_dpotrf_nt_l_4x4_vs_lib4 on (Am, Bm) over km_ terms using that D
// as both its input block and its output block.
//
// km and kn are forwarded unchanged to both kernels; inv_diag_D is only used
// by the second call.
void kernel_dsyrk_dpotrf_nt_l_4x4_vs_lib4(int kp, double *Ap, double *Bp, int km_, double *Am, double *Bm, double *C, double *D, double *inv_diag_D, int km, int kn)
	{
	// the two factors are kept in separate objects, as the callee receives
	// them through distinct pointers
	double scale_ab = 1.0;
	double scale_c = 1.0;

	// stage 1: D <- result of the syrk kernel applied to C
	kernel_dsyrk_nt_l_4x4_vs_lib4(kp, &scale_ab, Ap, Bp, &scale_c, C, D, km, kn);

	// stage 2: factorization in place on D
	kernel_dpotrf_nt_l_4x4_vs_lib4(km_, Am, Bm, D, D, inv_diag_D, km, kn);
	}
4048#endif
4049
4050
4051
4052#if defined(TARGET_GENERIC) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
// Fused kernel, full size: runs kernel_dsyrk_nt_l_4x4_lib4 on (Ap, Bp) over kp
// terms with both scaling factors set to 1.0, writing into D, and then runs
// kernel_dpotrf_nt_l_4x4_lib4 on (Am, Bm) over km_ terms using that D as both
// its input block and its output block.
void kernel_dsyrk_dpotrf_nt_l_4x4_lib4(int kp, double *Ap, double *Bp, int km_, double *Am, double *Bm, double *C, double *D, double *inv_diag_D)
	{
	// the two factors are kept in separate objects, as the callee receives
	// them through distinct pointers
	double scale_ab = 1.0;
	double scale_c = 1.0;

	// stage 1: D <- result of the syrk kernel applied to C
	kernel_dsyrk_nt_l_4x4_lib4(kp, &scale_ab, Ap, Bp, &scale_c, C, D);

	// stage 2: factorization in place on D
	kernel_dpotrf_nt_l_4x4_lib4(km_, Am, Bm, D, D, inv_diag_D);
	}
4060#endif
4061
4062
4063
4064#if defined(TARGET_GENERIC) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
// 4x4 triangular-solve kernel with precomputed reciprocal diagonal, variable size.
//
// Steps, on a 4x4 block addressed column-major with leading dimension 4:
//   1. acc = C - sum_{k<kmax} A(:,k) * B(:,k)^T, where column k of A and of B
//      starts at offset 4*k;
//   2. forward substitution over the columns: column jj has E[jj+4*ll] times
//      column ll subtracted for every ll < jj, then is multiplied by
//      inv_diag_E[jj]. Only the strictly lower part of E is read;
//   3. store the result in D.
//
// kn: when kn is 1, 2 or 3 only the first kn columns are solved and stored;
//     any other value processes all four columns.
// km: number of rows stored, clamped to the range 1..4 (row 0 is always
//     stored, even for km < 1).
//
// All of C, E and inv_diag_E is read before anything is written, so D may be
// the same as C.
void kernel_dtrsm_nt_rl_inv_4x4_vs_lib4(int kmax, double *A, double *B, double *C, double *D, double *E, double *inv_diag_E, int km, int kn)
	{

	const int bs = 4;

	// acc[ii][jj] is entry (row ii, column jj) of the block
	double acc[4][4] = {{0.0}};
	double coef;

	int ii, jj, ll, kk;
	int rows;

	// rank-kmax downdate
	for(kk=0; kk<kmax; kk++)
		{
		for(jj=0; jj<4; jj++)
			{
			for(ii=0; ii<4; ii++)
				{
				acc[ii][jj] -= A[ii+bs*kk] * B[jj+bs*kk];
				}
			}
		}

	// add the input block
	for(jj=0; jj<4; jj++)
		{
		for(ii=0; ii<4; ii++)
			{
			acc[ii][jj] = C[ii+bs*jj] + acc[ii][jj];
			}
		}

	// substitution, one column at a time
	for(jj=0; jj<4; jj++)
		{
		// eliminate the already solved columns
		for(ll=0; ll<jj; ll++)
			{
			coef = E[jj+bs*ll];
			for(ii=0; ii<4; ii++)
				{
				acc[ii][jj] -= acc[ii][ll] * coef;
				}
			}

		// multiply by the reciprocal of the diagonal entry
		coef = inv_diag_E[jj];
		for(ii=0; ii<4; ii++)
			{
			acc[ii][jj] *= coef;
			}

		// stop once the requested number of columns is done
		if(kn==jj+1)
			{
			break;
			}
		}

	// number of rows to store: km clamped to 1..4
	rows = km;
	if(rows>4)
		{
		rows = 4;
		}
	if(rows<1)
		{
		rows = 1;
		}

	// store, column by column
	for(jj=0; jj<4; jj++)
		{
		for(ii=0; ii<rows; ii++)
			{
			D[ii+bs*jj] = acc[ii][jj];
			}
		if(kn==jj+1)
			{
			return;
			}
		}

	}
4448#endif
4449
4450
4451
4452#if defined(TARGET_GENERIC) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
// Full-size variant of kernel_dtrsm_nt_rl_inv_4x4_vs_lib4: same operation with
// the row and column counts both fixed to 4.
void kernel_dtrsm_nt_rl_inv_4x4_lib4(int k, double *A, double *B, double *C, double *D, double *E, double *inv_diag_E)
	{
	const int full_rows = 4;
	const int full_cols = 4;

	kernel_dtrsm_nt_rl_inv_4x4_vs_lib4(k, A, B, C, D, E, inv_diag_E, full_rows, full_cols);
	}
4457#endif
4458
4459
4460
4461#if defined(TARGET_GENERIC) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
// Fused kernel, variable size: runs kernel_dgemm_nt_4x4_vs_lib4 on (Ap, Bp)
// over kp terms with both scaling factors set to 1.0, writing into D, and then
// runs kernel_dtrsm_nt_rl_inv_4x4_vs_lib4 on (Am, Bm) over km_ terms using
// that D as both its input block and its output block.
//
// km and kn are forwarded unchanged to both kernels; E and inv_diag_E are only
// used by the second call.
void kernel_dgemm_dtrsm_nt_rl_inv_4x4_vs_lib4(int kp, double *Ap, double *Bp, int km_, double *Am, double *Bm, double *C, double *D, double *E, double *inv_diag_E, int km, int kn)
	{
	// the two factors are kept in separate objects, as the callee receives
	// them through distinct pointers
	double scale_ab = 1.0;
	double scale_c = 1.0;

	// stage 1: D <- result of the gemm kernel applied to C
	kernel_dgemm_nt_4x4_vs_lib4(kp, &scale_ab, Ap, Bp, &scale_c, C, D, km, kn);

	// stage 2: triangular solve in place on D
	kernel_dtrsm_nt_rl_inv_4x4_vs_lib4(km_, Am, Bm, D, D, E, inv_diag_E, km, kn);
	}
4469#endif
4470
4471
4472
4473#if defined(TARGET_GENERIC) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
// Fused kernel, full size: runs kernel_dgemm_nt_4x4_lib4 on (Ap, Bp) over kp
// terms with both scaling factors set to 1.0, writing into D, and then runs
// kernel_dtrsm_nt_rl_inv_4x4_lib4 on (Am, Bm) over km_ terms using that D as
// both its input block and its output block.
void kernel_dgemm_dtrsm_nt_rl_inv_4x4_lib4(int kp, double *Ap, double *Bp, int km_, double *Am, double *Bm, double *C, double *D, double *E, double *inv_diag_E)
	{
	// the two factors are kept in separate objects, as the callee receives
	// them through distinct pointers
	double scale_ab = 1.0;
	double scale_c = 1.0;

	// stage 1: D <- result of the gemm kernel applied to C
	kernel_dgemm_nt_4x4_lib4(kp, &scale_ab, Ap, Bp, &scale_c, C, D);

	// stage 2: triangular solve in place on D
	kernel_dtrsm_nt_rl_inv_4x4_lib4(km_, Am, Bm, D, D, E, inv_diag_E);
	}
4481#endif
4482
4483
4484
4485#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
// 4x4 triangular-solve kernel with implicit unit diagonal, variable size.
//
// Steps, on a 4x4 block addressed column-major with leading dimension 4:
//   1. acc = C - sum_{k<kmax} A(:,k) * B(:,k)^T, where column k of A and of B
//      starts at offset 4*k;
//   2. forward substitution over the columns: column jj (jj >= 1) has
//      E[jj+4*ll] times column ll subtracted for every ll < jj. No scaling is
//      applied and only the strictly lower part of E is read;
//   3. store the result in D.
//
// kn: when kn is 1, 2 or 3 only the first kn columns are solved and stored;
//     any other value processes all four columns.
// km: number of rows stored, clamped to the range 1..4 (row 0 is always
//     stored, even for km < 1).
//
// All of C and E is read before anything is written, so D may be the same as C.
void kernel_dtrsm_nt_rl_one_4x4_vs_lib4(int kmax, double *A, double *B, double *C, double *D, double *E, int km, int kn)
	{

	const int bs = 4;

	// acc[ii][jj] is entry (row ii, column jj) of the block
	double acc[4][4] = {{0.0}};
	double coef;

	int ii, jj, ll, kk;
	int rows;

	// rank-kmax downdate
	for(kk=0; kk<kmax; kk++)
		{
		for(jj=0; jj<4; jj++)
			{
			for(ii=0; ii<4; ii++)
				{
				acc[ii][jj] -= A[ii+bs*kk] * B[jj+bs*kk];
				}
			}
		}

	// add the input block
	for(jj=0; jj<4; jj++)
		{
		for(ii=0; ii<4; ii++)
			{
			acc[ii][jj] = C[ii+bs*jj] + acc[ii][jj];
			}
		}

	// substitution; column 0 needs no work because the diagonal is one
	for(jj=1; jj<4; jj++)
		{
		// the first kn columns are already final
		if(kn==jj)
			{
			break;
			}

		for(ll=0; ll<jj; ll++)
			{
			coef = E[jj+bs*ll];
			for(ii=0; ii<4; ii++)
				{
				acc[ii][jj] -= acc[ii][ll] * coef;
				}
			}
		}

	// number of rows to store: km clamped to 1..4
	rows = km;
	if(rows>4)
		{
		rows = 4;
		}
	if(rows<1)
		{
		rows = 1;
		}

	// store, column by column
	for(jj=0; jj<4; jj++)
		{
		for(ii=0; ii<rows; ii++)
			{
			D[ii+bs*jj] = acc[ii][jj];
			}
		if(kn==jj+1)
			{
			return;
			}
		}

	}
4848#endif
4849
4850
4851
4852#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
// Full-size variant of kernel_dtrsm_nt_rl_one_4x4_vs_lib4: same operation with
// the row and column counts both fixed to 4.
void kernel_dtrsm_nt_rl_one_4x4_lib4(int k, double *A, double *B, double *C, double *D, double *E)
	{
	const int full_rows = 4;
	const int full_cols = 4;

	kernel_dtrsm_nt_rl_one_4x4_vs_lib4(k, A, B, C, D, E, full_rows, full_cols);
	}
4857#endif
4858
4859
4860
4861#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
void kernel_dtrsm_nt_ru_inv_4x4_vs_lib4(int kmax, double *A, double *B, double *C, double *D, double *E, double *inv_diag_E, int km, int kn)
	{

	// Variable-size 4x4 kernel working on one tile:
	//   1. t = C - A * B^T, accumulated over kmax rank-1 steps
	//      (step k reads A[i+bs*k] and B[j+bs*k], i,j = 0..3);
	//   2. substitution over the columns of t, from the last active column
	//      back to column 0: column j is multiplied by inv_diag_E[j]
	//      (presumably the reciprocal of the diagonal of E, as the name
	//      suggests) and then eliminated from every column l<j using
	//      E[l+bs*j];
	//   3. the leading km x kn corner of t is written to D.
	// All 16 entries of C are read regardless of km and kn. kn selects the
	// columns that are solved and stored, km only the rows that are stored.

	const int bs = 4;

	// t[j][i] is the tile entry at row i, column j
	double t[4][4] = {{0}};
	double f;

	double *a = A;
	double *b = B;

	int i, j, l, k;
	int nrow, ncol;

	// t -= A * B^T, one rank-1 update per k
	for(k=0; k<kmax; k++)
		{
		for(j=0; j<bs; j++)
			for(i=0; i<bs; i++)
				t[j][i] -= a[i] * b[j];
		a += bs;
		b += bs;
		}

	// t = C + t (the whole of C is read before anything is written to D)
	for(j=0; j<bs; j++)
		for(i=0; i<bs; i++)
			t[j][i] = C[i+bs*j] + t[j][i];

	// columns 3, 2, 1: scale the column, then remove it from the columns on its left
	for(j=bs-1; j>0; j--)
		{
		if(kn<=j)
			continue; // column j is outside the active part

		f = inv_diag_E[j];
		for(i=0; i<bs; i++)
			t[j][i] *= f;

		for(l=j-1; l>=0; l--)
			{
			f = E[l+bs*j];
			for(i=0; i<bs; i++)
				t[l][i] -= t[j][i] * f;
			}
		}

	// column 0 is always scaled
	f = inv_diag_E[0];
	for(i=0; i<bs; i++)
		t[0][i] *= f;

	// rows written: 4 for km>=4, km for km in {2,3}, a single row otherwise
	nrow = km>=4 ? 4 : (km>=2 ? km : 1);
	// columns written: kn for kn in {1,2,3}, all 4 otherwise
	ncol = (kn>=1 && kn<=3) ? kn : 4;

	for(j=0; j<ncol; j++)
		for(i=0; i<nrow; i++)
			D[i+bs*j] = t[j][i];

	}
5246#endif
5247
5248
5249
5250#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
void kernel_dtrsm_nt_ru_inv_4x4_lib4(int k, double *A, double *B, double *C, double *D, double *E, double *inv_diag_E)
	{
	// Fixed-size entry point: forwards every argument to the variable-size
	// kernel with km = kn = 4, i.e. the complete 4x4 tile.
	const int full_size = 4;

	kernel_dtrsm_nt_ru_inv_4x4_vs_lib4(k, A, B, C, D, E, inv_diag_E, full_size, full_size);
	}
5255#endif
5256
5257
5258
5259#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
void kernel_dgetrf_nn_4x4_vs_lib4(int kmax, double *A, double *B, int sdb, double *C, double *D, double *inv_diag_D, int km, int kn)
	{

	// Variable-size 4x4 kernel working on one tile:
	//   1. t = C - A * B, accumulated over kmax rank-1 steps. Step k reads
	//      A[i+bs*k] and row k of B; B is walked with a row pointer that
	//      moves by 1 inside a group of 4 rows and jumps by 4*sdb from one
	//      group to the next, and entry j of the row sits at offset bs*j;
	//   2. in-place factorization of t without any row exchange: for each
	//      column, the already finished columns on its left are applied,
	//      the reciprocal of the diagonal entry is written to inv_diag_D
	//      and the entries below the diagonal are multiplied by it;
	//   3. the leading km x kn corner of t is written to D.
	// All 16 entries of C are read regardless of km and kn. The
	// factorization stops after column kn when kn is 1, 2 or 3.

	const int bs = 4;

	// t[j][i] is the tile entry at row i, column j
	double t[4][4] = {{0}};
	double piv_inv;

	double *a_col = A;
	double *b_row = B;

	int i, j, l, k;
	int nrow, ncol;

	// t -= A * B, one rank-1 update per k
	for(k=0; k<kmax; k++)
		{
		for(j=0; j<bs; j++)
			for(i=0; i<bs; i++)
				t[j][i] -= a_col[i] * b_row[bs*j];

		a_col += bs;
		if(k%bs==bs-1)
			b_row += bs*sdb - (bs-1); // first row of the next group of 4 rows
		else
			b_row += 1;
		}

	// t += C
	for(j=0; j<bs; j++)
		for(i=0; i<bs; i++)
			t[j][i] += C[i+bs*j];

	// factorization, one column at a time
	for(j=0; j<bs; j++)
		{
		// apply the finished columns l<j to column j
		for(l=0; l<j; l++)
			for(i=l+1; i<bs; i++)
				t[j][i] -= t[l][i] * t[j][l];

		// scale the sub-diagonal part by the reciprocal of the diagonal entry
		piv_inv = 1.0 / t[j][j];
		for(i=j+1; i<bs; i++)
			t[j][i] *= piv_inv;

		inv_diag_D[j] = piv_inv;

		if(kn==j+1)
			break;
		}

	// rows written: 4 for km>=4, km for km in {2,3}, a single row otherwise
	nrow = km>=4 ? 4 : (km>=2 ? km : 1);
	// columns written: kn for kn in {1,2,3}, all 4 otherwise
	ncol = (kn>=1 && kn<=3) ? kn : 4;

	for(j=0; j<ncol; j++)
		for(i=0; i<nrow; i++)
			D[i+bs*j] = t[j][i];

	}
5634#endif
5635
5636
5637
5638#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
void kernel_dgetrf_nn_4x4_lib4(int kmax, double *A, double *B, int sdb, double *C, double *D, double *inv_diag_D)
	{
	// Fixed-size entry point: forwards every argument to the variable-size
	// kernel with km = kn = 4, i.e. the complete 4x4 tile.
	const int full_size = 4;

	kernel_dgetrf_nn_4x4_vs_lib4(kmax, A, B, sdb, C, D, inv_diag_D, full_size, full_size);
	}
5643#endif
5644
5645
5646
5647#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
void kernel_dtrsm_nn_ll_one_4x4_vs_lib4(int kmax, double *A, double *B, int sdb, double *C, double *D, double *E, int km, int kn)
	{

	// Variable-size 4x4 kernel working on one tile:
	//   1. t = C - A * B, accumulated over kmax rank-1 steps. Step k reads
	//      A[i+bs*k] and row k of B; B is walked with a row pointer that
	//      moves by 1 inside a group of 4 rows and jumps by 4*sdb from one
	//      group to the next, and entry j of the row sits at offset bs*j;
	//   2. forward substitution over the rows of t: row l is removed from
	//      every row i>l using E[i+bs*l]. Only entries of E below the
	//      diagonal are read and no scaling is applied, i.e. the diagonal
	//      of E is treated as one. The substitution stops after km-1 steps
	//      when km is 1, 2 or 3;
	//   3. the leading km x kn corner of t is written to D.
	// All 16 entries of C are read regardless of km and kn.

	const int bs = 4;

	// t[j][i] is the tile entry at row i, column j
	double t[4][4] = {{0}};

	double *a_col = A;
	double *b_row = B;

	int i, j, l, k;
	int nrow, ncol;

	// t -= A * B, one rank-1 update per k
	for(k=0; k<kmax; k++)
		{
		for(j=0; j<bs; j++)
			for(i=0; i<bs; i++)
				t[j][i] -= a_col[i] * b_row[bs*j];

		a_col += bs;
		if(k%bs==bs-1)
			b_row += bs*sdb - (bs-1); // first row of the next group of 4 rows
		else
			b_row += 1;
		}

	// t += C
	for(j=0; j<bs; j++)
		for(i=0; i<bs; i++)
			t[j][i] += C[i+bs*j];

	// forward substitution: eliminate row l from the rows below it
	for(l=0; l<bs-1; l++)
		{
		if(km==l+1)
			break; // no active row below row l

		for(j=0; j<bs; j++)
			for(i=l+1; i<bs; i++)
				t[j][i] -= E[i+bs*l] * t[j][l];
		}

	// rows written: 4 for km>=4, km for km in {2,3}, a single row otherwise
	nrow = km>=4 ? 4 : (km>=2 ? km : 1);
	// columns written: kn for kn in {1,2,3}, all 4 otherwise
	ncol = (kn>=1 && kn<=3) ? kn : 4;

	for(j=0; j<ncol; j++)
		for(i=0; i<nrow; i++)
			D[i+bs*j] = t[j][i];

	}
6010#endif
6011
6012
6013
6014#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
void kernel_dtrsm_nn_ll_one_4x4_lib4(int kmax, double *A, double *B, int sdb, double *C, double *D, double *E)
	{
	// Fixed-size entry point: forwards every argument to the variable-size
	// kernel with km = kn = 4, i.e. the complete 4x4 tile.
	const int full_size = 4;

	kernel_dtrsm_nn_ll_one_4x4_vs_lib4(kmax, A, B, sdb, C, D, E, full_size, full_size);
	}
6019#endif
6020
6021
6022
6023#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
void kernel_dtrsm_nn_ru_inv_4x4_vs_lib4(int kmax, double *A, double *B, int sdb, double *C, double *D, double *E, double *inv_diag_E, int km, int kn)
	{

	// Variable-size 4x4 kernel working on one tile:
	//   1. t = C - A * B, accumulated over kmax rank-1 steps. Step k reads
	//      A[i+bs*k] and row k of B; B is walked with a row pointer that
	//      moves by 1 inside a group of 4 rows and jumps by 4*sdb from one
	//      group to the next, and entry j of the row sits at offset bs*j;
	//   2. substitution over the columns of t, from column 0 upwards: the
	//      finished columns l<j are removed from column j using E[l+bs*j],
	//      then column j is multiplied by inv_diag_E[j] (presumably the
	//      reciprocal of the diagonal of E, as the name suggests). The
	//      substitution stops after column kn when kn is 1, 2 or 3;
	//   3. the leading km x kn corner of t is written to D.
	// All 16 entries of C are read regardless of km and kn.

	const int bs = 4;

	// t[j][i] is the tile entry at row i, column j
	double t[4][4] = {{0}};
	double f;

	double *a_col = A;
	double *b_row = B;

	int i, j, l, k;
	int nrow, ncol;

	// t -= A * B, one rank-1 update per k
	for(k=0; k<kmax; k++)
		{
		for(j=0; j<bs; j++)
			for(i=0; i<bs; i++)
				t[j][i] -= a_col[i] * b_row[bs*j];

		a_col += bs;
		if(k%bs==bs-1)
			b_row += bs*sdb - (bs-1); // first row of the next group of 4 rows
		else
			b_row += 1;
		}

	// t += C
	for(j=0; j<bs; j++)
		for(i=0; i<bs; i++)
			t[j][i] += C[i+bs*j];

	// substitution, first column first
	for(j=0; j<bs; j++)
		{
		// remove the contribution of the finished columns l<j
		for(l=0; l<j; l++)
			{
			f = E[l+bs*j];
			for(i=0; i<bs; i++)
				t[j][i] -= t[l][i] * f;
			}

		f = inv_diag_E[j];
		for(i=0; i<bs; i++)
			t[j][i] *= f;

		if(kn==j+1)
			break;
		}

	// rows written: 4 for km>=4, km for km in {2,3}, a single row otherwise
	nrow = km>=4 ? 4 : (km>=2 ? km : 1);
	// columns written: kn for kn in {1,2,3}, all 4 otherwise
	ncol = (kn>=1 && kn<=3) ? kn : 4;

	for(j=0; j<ncol; j++)
		for(i=0; i<nrow; i++)
			D[i+bs*j] = t[j][i];

	}
6410#endif
6411
6412
6413
6414#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
void kernel_dtrsm_nn_ru_inv_4x4_lib4(int kmax, double *A, double *B, int sdb, double *C, double *D, double *E, double *inv_diag_E)
	{
	// Fixed-size entry point: forwards every argument to the variable-size
	// kernel with km = kn = 4, i.e. the complete 4x4 tile.
	const int full_size = 4;

	kernel_dtrsm_nn_ru_inv_4x4_vs_lib4(kmax, A, B, sdb, C, D, E, inv_diag_E, full_size, full_size);
	}
6419#endif
6420
6421
6422
6423#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
6424void kernel_dtrsm_nn_lu_inv_4x4_vs_lib4(int kmax, double *A, double *B, int sdb, double *C, double *D, double *E, double *inv_diag_E, int km, int kn)
6425 {
6426
6427 const int bs = 4;
6428
6429 int k;
6430
6431 double
6432 tmp,
6433 a_0, a_1, a_2, a_3,
6434 b_0, b_1, b_2, b_3,
6435 e_00, e_01, e_02, e_03,
6436 e_11, e_12, e_13,
6437 e_22, e_23,
6438 e_33,
6439 c_00=0, c_01=0, c_02=0, c_03=0,
6440 c_10=0, c_11=0, c_12=0, c_13=0,
6441 c_20=0, c_21=0, c_22=0, c_23=0,
6442 c_30=0, c_31=0, c_32=0, c_33=0;
6443
6444 if(kmax<=0)
6445 goto add;
6446
6447 for(k=0; k<kmax-3; k+=4)
6448 {
6449
6450 a_0 = A[0+bs*0];
6451 a_1 = A[1+bs*0];
6452 a_2 = A[2+bs*0];
6453 a_3 = A[3+bs*0];
6454
6455 b_0 = B[0+bs*0];
6456 b_1 = B[0+bs*1];
6457 b_2 = B[0+bs*2];
6458 b_3 = B[0+bs*3];
6459
6460 c_00 -= a_0 * b_0;
6461 c_10 -= a_1 * b_0;
6462 c_20 -= a_2 * b_0;
6463 c_30 -= a_3 * b_0;
6464
6465 c_01 -= a_0 * b_1;
6466 c_11 -= a_1 * b_1;
6467 c_21 -= a_2 * b_1;
6468 c_31 -= a_3 * b_1;
6469
6470 c_02 -= a_0 * b_2;
6471 c_12 -= a_1 * b_2;
6472 c_22 -= a_2 * b_2;
6473 c_32 -= a_3 * b_2;
6474
6475 c_03 -= a_0 * b_3;
6476 c_13 -= a_1 * b_3;
6477 c_23 -= a_2 * b_3;
6478 c_33 -= a_3 * b_3;
6479
6480
6481 a_0 = A[0+bs*1];
6482 a_1 = A[1+bs*1];
6483 a_2 = A[2+bs*1];
6484 a_3 = A[3+bs*1];
6485
6486 b_0 = B[1+bs*0];
6487 b_1 = B[1+bs*1];
6488 b_2 = B[1+bs*2];
6489 b_3 = B[1+bs*3];
6490
6491 c_00 -= a_0 * b_0;
6492 c_10 -= a_1 * b_0;
6493 c_20 -= a_2 * b_0;
6494 c_30 -= a_3 * b_0;
6495
6496 c_01 -= a_0 * b_1;
6497 c_11 -= a_1 * b_1;
6498 c_21 -= a_2 * b_1;
6499 c_31 -= a_3 * b_1;
6500
6501 c_02 -= a_0 * b_2;
6502 c_12 -= a_1 * b_2;
6503 c_22 -= a_2 * b_2;
6504 c_32 -= a_3 * b_2;
6505
6506 c_03 -= a_0 * b_3;
6507 c_13 -= a_1 * b_3;
6508 c_23 -= a_2 * b_3;
6509 c_33 -= a_3 * b_3;
6510
6511
6512 a_0 = A[0+bs*2];
6513 a_1 = A[1+bs*2];
6514 a_2 = A[2+bs*2];
6515 a_3 = A[3+bs*2];
6516
6517 b_0 = B[2+bs*0];
6518 b_1 = B[2+bs*1];
6519 b_2 = B[2+bs*2];
6520 b_3 = B[2+bs*3];
6521
6522 c_00 -= a_0 * b_0;
6523 c_10 -= a_1 * b_0;
6524 c_20 -= a_2 * b_0;
6525 c_30 -= a_3 * b_0;
6526
6527 c_01 -= a_0 * b_1;
6528 c_11 -= a_1 * b_1;
6529 c_21 -= a_2 * b_1;
6530 c_31 -= a_3 * b_1;
6531
6532 c_02 -= a_0 * b_2;
6533 c_12 -= a_1 * b_2;
6534 c_22 -= a_2 * b_2;
6535 c_32 -= a_3 * b_2;
6536
6537 c_03 -= a_0 * b_3;
6538 c_13 -= a_1 * b_3;
6539 c_23 -= a_2 * b_3;
6540 c_33 -= a_3 * b_3;
6541
6542
6543 a_0 = A[0+bs*3];
6544 a_1 = A[1+bs*3];
6545 a_2 = A[2+bs*3];
6546 a_3 = A[3+bs*3];
6547
6548 b_0 = B[3+bs*0];
6549 b_1 = B[3+bs*1];
6550 b_2 = B[3+bs*2];
6551 b_3 = B[3+bs*3];
6552
6553 c_00 -= a_0 * b_0;
6554 c_10 -= a_1 * b_0;
6555 c_20 -= a_2 * b_0;
6556 c_30 -= a_3 * b_0;
6557
6558 c_01 -= a_0 * b_1;
6559 c_11 -= a_1 * b_1;
6560 c_21 -= a_2 * b_1;
6561 c_31 -= a_3 * b_1;
6562
6563 c_02 -= a_0 * b_2;
6564 c_12 -= a_1 * b_2;
6565 c_22 -= a_2 * b_2;
6566 c_32 -= a_3 * b_2;
6567
6568 c_03 -= a_0 * b_3;
6569 c_13 -= a_1 * b_3;
6570 c_23 -= a_2 * b_3;
6571 c_33 -= a_3 * b_3;
6572
6573
6574 A += 16;
6575 B += 4*sdb;
6576
6577 }
6578 for(; k<kmax; k++)
6579 {
6580
6581 a_0 = A[0+bs*0];
6582 a_1 = A[1+bs*0];
6583 a_2 = A[2+bs*0];
6584 a_3 = A[3+bs*0];
6585
6586 b_0 = B[0+bs*0];
6587 b_1 = B[0+bs*1];
6588 b_2 = B[0+bs*2];
6589 b_3 = B[0+bs*3];
6590
6591 c_00 -= a_0 * b_0;
6592 c_10 -= a_1 * b_0;
6593 c_20 -= a_2 * b_0;
6594 c_30 -= a_3 * b_0;
6595
6596 c_01 -= a_0 * b_1;
6597 c_11 -= a_1 * b_1;
6598 c_21 -= a_2 * b_1;
6599 c_31 -= a_3 * b_1;
6600
6601 c_02 -= a_0 * b_2;
6602 c_12 -= a_1 * b_2;
6603 c_22 -= a_2 * b_2;
6604 c_32 -= a_3 * b_2;
6605
6606 c_03 -= a_0 * b_3;
6607 c_13 -= a_1 * b_3;
6608 c_23 -= a_2 * b_3;
6609 c_33 -= a_3 * b_3;
6610
6611
6612 A += 4;
6613 B += 1;
6614
6615 }
6616
6617 add:
6618
6619 c_00 += C[0+bs*0];
6620 c_10 += C[1+bs*0];
6621 c_20 += C[2+bs*0];
6622 c_30 += C[3+bs*0];
6623
6624 c_01 += C[0+bs*1];
6625 c_11 += C[1+bs*1];
6626 c_21 += C[2+bs*1];
6627 c_31 += C[3+bs*1];
6628
6629 c_02 += C[0+bs*2];
6630 c_12 += C[1+bs*2];
6631 c_22 += C[2+bs*2];
6632 c_32 += C[3+bs*2];
6633
6634 c_03 += C[0+bs*3];
6635 c_13 += C[1+bs*3];
6636 c_23 += C[2+bs*3];
6637 c_33 += C[3+bs*3];
6638
6639// printf("\n%f %f %f %f\n", c_00, c_01, c_02, c_03);
6640// printf("\n%f %f %f %f\n", c_10, c_11, c_12, c_13);
6641// printf("\n%f %f %f %f\n", c_20, c_21, c_22, c_23);
6642// printf("\n%f %f %f %f\n", c_30, c_31, c_32, c_33);
6643
6644 // solve
6645
6646 if(km>3)
6647 {
6648 e_03 = E[0+bs*3];
6649 e_13 = E[1+bs*3];
6650 e_23 = E[2+bs*3];
6651 e_33 = inv_diag_E[3];
6652 c_30 *= e_33;
6653 c_31 *= e_33;
6654 c_32 *= e_33;
6655 c_33 *= e_33;
6656 c_00 -= e_03 * c_30;
6657 c_01 -= e_03 * c_31;
6658 c_02 -= e_03 * c_32;
6659 c_03 -= e_03 * c_33;
6660 c_10 -= e_13 * c_30;
6661 c_11 -= e_13 * c_31;
6662 c_12 -= e_13 * c_32;
6663 c_13 -= e_13 * c_33;
6664 c_20 -= e_23 * c_30;
6665 c_21 -= e_23 * c_31;
6666 c_22 -= e_23 * c_32;
6667 c_23 -= e_23 * c_33;
6668 }
6669
6670 if(km>2)
6671 {
6672 e_02 = E[0+bs*2];
6673 e_12 = E[1+bs*2];
6674 e_22 = inv_diag_E[2];
6675 c_20 *= e_22;
6676 c_21 *= e_22;
6677 c_22 *= e_22;
6678 c_23 *= e_22;
6679 c_00 -= e_02 * c_20;
6680 c_01 -= e_02 * c_21;
6681 c_02 -= e_02 * c_22;
6682 c_03 -= e_02 * c_23;
6683 c_10 -= e_12 * c_20;
6684 c_11 -= e_12 * c_21;
6685 c_12 -= e_12 * c_22;
6686 c_13 -= e_12 * c_23;
6687 }
6688
6689 if(km>1)
6690 {
6691 e_01 = E[0+bs*1];
6692 e_11 = inv_diag_E[1];
6693 c_10 *= e_11;
6694 c_11 *= e_11;
6695 c_12 *= e_11;
6696 c_13 *= e_11;
6697 c_00 -= e_01 * c_10;
6698 c_01 -= e_01 * c_11;
6699 c_02 -= e_01 * c_12;
6700 c_03 -= e_01 * c_13;
6701 }
6702
6703 e_00 = inv_diag_E[0];
6704 c_00 *= e_00;
6705 c_01 *= e_00;
6706 c_02 *= e_00;
6707 c_03 *= e_00;
6708
6709 store:
6710
6711 if(km>=4)
6712 {
6713 D[0+bs*0] = c_00;
6714 D[1+bs*0] = c_10;
6715 D[2+bs*0] = c_20;
6716 D[3+bs*0] = c_30;
6717
6718 if(kn==1)
6719 return;
6720
6721 D[0+bs*1] = c_01;
6722 D[1+bs*1] = c_11;
6723 D[2+bs*1] = c_21;
6724 D[3+bs*1] = c_31;
6725
6726 if(kn==2)
6727 return;
6728
6729 D[0+bs*2] = c_02;
6730 D[1+bs*2] = c_12;
6731 D[2+bs*2] = c_22;
6732 D[3+bs*2] = c_32;
6733
6734 if(kn==3)
6735 return;
6736
6737 D[0+bs*3] = c_03;
6738 D[1+bs*3] = c_13;
6739 D[2+bs*3] = c_23;
6740 D[3+bs*3] = c_33;
6741 }
6742 else if(km>=3)
6743 {
6744 D[0+bs*0] = c_00;
6745 D[1+bs*0] = c_10;
6746 D[2+bs*0] = c_20;
6747
6748 if(kn==1)
6749 return;
6750
6751 D[0+bs*1] = c_01;
6752 D[1+bs*1] = c_11;
6753 D[2+bs*1] = c_21;
6754
6755 if(kn==2)
6756 return;
6757
6758 D[0+bs*2] = c_02;
6759 D[1+bs*2] = c_12;
6760 D[2+bs*2] = c_22;
6761
6762 if(kn==3)
6763 return;
6764
6765 D[0+bs*3] = c_03;
6766 D[1+bs*3] = c_13;
6767 D[2+bs*3] = c_23;
6768 }
6769 else if(km>=2)
6770 {
6771 D[0+bs*0] = c_00;
6772 D[1+bs*0] = c_10;
6773
6774 if(kn==1)
6775 return;
6776
6777 D[0+bs*1] = c_01;
6778 D[1+bs*1] = c_11;
6779
6780 if(kn==2)
6781 return;
6782
6783 D[0+bs*2] = c_02;
6784 D[1+bs*2] = c_12;
6785
6786 if(kn==3)
6787 return;
6788
6789 D[0+bs*3] = c_03;
6790 D[1+bs*3] = c_13;
6791 }
6792 else //if(km>=1)
6793 {
6794 D[0+bs*0] = c_00;
6795
6796 if(kn==1)
6797 return;
6798
6799 D[0+bs*1] = c_01;
6800
6801 if(kn==2)
6802 return;
6803
6804 D[0+bs*2] = c_02;
6805
6806 if(kn==3)
6807 return;
6808
6809 D[0+bs*3] = c_03;
6810 }
6811
6812 return;
6813
6814 }
6815#endif
6816
6817
6818
6819#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
// Fixed-size entry point: forwards kmax, A, B, sdb, C, D, E and inv_diag_E unchanged to
// kernel_dtrsm_nn_lu_inv_4x4_vs_lib4 and supplies 4, 4 as its two trailing arguments.
// This function does no work of its own; all computation happens in the callee.
// NOTE(review): the trailing 4, 4 are presumably the row/column limits (km, kn) of the
// variable-size kernel, meaning the whole 4x4 tile is solved and stored -- confirm against
// the definition of kernel_dtrsm_nn_lu_inv_4x4_vs_lib4.
6820void kernel_dtrsm_nn_lu_inv_4x4_lib4(int kmax, double *A, double *B, int sdb, double *C, double *D, double *E, double *inv_diag_E)
6821 {
6822 kernel_dtrsm_nn_lu_inv_4x4_vs_lib4(kmax, A, B, sdb, C, D, E, inv_diag_E, 4, 4);
6823 }
6824#endif
6825