/**************************************************************************************************
*                                                                                                 *
* This file is part of BLASFEO.                                                                   *
*                                                                                                 *
* BLASFEO -- BLAS For Embedded Optimization.                                                      *
* Copyright (C) 2016-2017 by Gianluca Frison.                                                     *
* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl.              *
* All rights reserved.                                                                            *
*                                                                                                 *
* BLASFEO is free software; you can redistribute it and/or                                        *
* modify it under the terms of the GNU Lesser General Public                                      *
* License as published by the Free Software Foundation; either                                    *
* version 2.1 of the License, or (at your option) any later version.                              *
*                                                                                                 *
* BLASFEO is distributed in the hope that it will be useful,                                      *
* but WITHOUT ANY WARRANTY; without even the implied warranty of                                  *
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.                                            *
* See the GNU Lesser General Public License for more details.                                    *
*                                                                                                 *
* You should have received a copy of the GNU Lesser General Public                                *
* License along with BLASFEO; if not, write to the Free Software                                  *
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA                    *
*                                                                                                 *
* Author: Gianluca Frison, giaf (at) dtu.dk                                                       *
*         gianluca.frison (at) imtek.uni-freiburg.de                                              *
*                                                                                                 *
**************************************************************************************************/



#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
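// Computes z[0:4] = beta*y[0:4] + alpha * A * x for a 4 x kmax block of A
// stored in BLASFEO panel-major format (bs=4 rows per panel); only entries
// with index in [k0,k1) are stored to z, so callers can mask out rows at the
// edges of the matrix.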
void kernel_sgemv_n_4_gen_lib4(int kmax, float *alpha, float *A, float *x, float *beta, float *y, float *z, int k0, int k1)
	{

	const int bs = 4;

	int k;

	float
		x_0,
		y_0=0, y_1=0, y_2=0, y_3=0;

	k=0;
	for(; k<kmax-3; k+=4)
		{

		x_0 = x[0];

		y_0 += A[0+bs*0] * x_0;
		y_1 += A[1+bs*0] * x_0;
		y_2 += A[2+bs*0] * x_0;
		y_3 += A[3+bs*0] * x_0;

		x_0 = x[1];

		y_0 += A[0+bs*1] * x_0;
		y_1 += A[1+bs*1] * x_0;
		y_2 += A[2+bs*1] * x_0;
		y_3 += A[3+bs*1] * x_0;

		x_0 = x[2];

		y_0 += A[0+bs*2] * x_0;
		y_1 += A[1+bs*2] * x_0;
		y_2 += A[2+bs*2] * x_0;
		y_3 += A[3+bs*2] * x_0;

		x_0 = x[3];

		y_0 += A[0+bs*3] * x_0;
		y_1 += A[1+bs*3] * x_0;
		y_2 += A[2+bs*3] * x_0;
		y_3 += A[3+bs*3] * x_0;

		A += 4*bs;
		x += 4;

		}

	for(; k<kmax; k++)
		{

		x_0 = x[0];

		y_0 += A[0+bs*0] * x_0;
		y_1 += A[1+bs*0] * x_0;
		y_2 += A[2+bs*0] * x_0;
		y_3 += A[3+bs*0] * x_0;

		A += 1*bs;
		x += 1;

		}

	y_0 = alpha[0]*y_0 + beta[0]*y[0];
	y_1 = alpha[0]*y_1 + beta[0]*y[1];
	y_2 = alpha[0]*y_2 + beta[0]*y[2];
	y_3 = alpha[0]*y_3 + beta[0]*y[3];

	if(k0<=0 & k1>3)
		{
		z[0] = y_0;
		z[1] = y_1;
		z[2] = y_2;
		z[3] = y_3;
		}
	else
		{
		if(k0<=0 & k1>0) z[0] = y_0;
		if(k0<=1 & k1>1) z[1] = y_1;
		if(k0<=2 & k1>2) z[2] = y_2;
		if(k0<=3 & k1>3) z[3] = y_3;
		}

	}
#endif




#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
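// Full-size wrapper: all four entries of z are stored (k0=0, k1=4).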
void kernel_sgemv_n_4_lib4(int kmax, float *alpha, float *A, float *x, float *beta, float *y, float *z)
	{

	kernel_sgemv_n_4_gen_lib4(kmax, alpha, A, x, beta, y, z, 0, 4);

	}
#endif



#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
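// Variable-size wrapper: only the first k1 entries of z are stored.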
void kernel_sgemv_n_4_vs_lib4(int kmax, float *alpha, float *A, float *x, float *beta, float *y, float *z, int k1)
	{

	kernel_sgemv_n_4_gen_lib4(kmax, alpha, A, x, beta, y, z, 0, k1);

	}
#endif



#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
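// Computes z[0:4] = beta*y[0:4] + alpha * A^T * x for a kmax x 4 panel-major
// block; offA is the row offset inside the first 4-row panel and sda the
// panel stride (consecutive panels are sda*bs floats apart). Only the first
// km entries of z are stored.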
void kernel_sgemv_t_4_gen_lib4(int kmax, float *alpha, int offA, float *A, int sda, float *x, float *beta, float *y, float *z, int km)
	{

	const int bs = 4;

	int k, kend;

	float
		x_0, x_1, x_2, x_3,
		y_0=0, y_1=0, y_2=0, y_3=0;

	k=0;
	if(offA!=0) // 1, 2, 3
		{
		kend = 4-offA<kmax ? 4-offA : kmax;
		for(; k<kend; k++)
			{

			x_0 = x[0];

			y_0 += A[0+bs*0] * x_0;
			y_1 += A[0+bs*1] * x_0;
			y_2 += A[0+bs*2] * x_0;
			y_3 += A[0+bs*3] * x_0;

			A += 1;
			x += 1;

			}
		A += bs*(sda-1);
		}
	for(; k<kmax-bs+1; k+=bs)
		{

		x_0 = x[0];
		x_1 = x[1];
		x_2 = x[2];
		x_3 = x[3];

		y_0 += A[0+bs*0] * x_0;
		y_1 += A[0+bs*1] * x_0;
		y_2 += A[0+bs*2] * x_0;
		y_3 += A[0+bs*3] * x_0;

		y_0 += A[1+bs*0] * x_1;
		y_1 += A[1+bs*1] * x_1;
		y_2 += A[1+bs*2] * x_1;
		y_3 += A[1+bs*3] * x_1;

		y_0 += A[2+bs*0] * x_2;
		y_1 += A[2+bs*1] * x_2;
		y_2 += A[2+bs*2] * x_2;
		y_3 += A[2+bs*3] * x_2;

		y_0 += A[3+bs*0] * x_3;
		y_1 += A[3+bs*1] * x_3;
		y_2 += A[3+bs*2] * x_3;
		y_3 += A[3+bs*3] * x_3;

		A += sda*bs;
		x += 4;

		}
	for(; k<kmax; k++)
		{

		x_0 = x[0];

		y_0 += A[0+bs*0] * x_0;
		y_1 += A[0+bs*1] * x_0;
		y_2 += A[0+bs*2] * x_0;
		y_3 += A[0+bs*3] * x_0;

		A += 1;
		x += 1;

		}

	y_0 = alpha[0]*y_0 + beta[0]*y[0];
	y_1 = alpha[0]*y_1 + beta[0]*y[1];
	y_2 = alpha[0]*y_2 + beta[0]*y[2];
	y_3 = alpha[0]*y_3 + beta[0]*y[3];

	if(km>=4)
		{
		z[0] = y_0;
		z[1] = y_1;
		z[2] = y_2;
		z[3] = y_3;
		}
	else
		{
		z[0] = y_0;
		if(km>=2)
			{
			z[1] = y_1;
			if(km>2)
				{
				z[2] = y_2;
				}
			}
		}

	}
#endif



#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
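// Aligned full-size wrapper: offA=0 and all four entries of z are stored.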
void kernel_sgemv_t_4_lib4(int kmax, float *alpha, float *A, int sda, float *x, float *beta, float *y, float *z)
	{

	kernel_sgemv_t_4_gen_lib4(kmax, alpha, 0, A, sda, x, beta, y, z, 4);

	}
#endif




#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
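// Aligned variable-size wrapper: offA=0, only the first k1 entries of z are
// stored.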
void kernel_sgemv_t_4_vs_lib4(int kmax, float *alpha, float *A, int sda, float *x, float *beta, float *y, float *z, int k1)
	{

	kernel_sgemv_t_4_gen_lib4(kmax, alpha, 0, A, sda, x, beta, y, z, k1);

	}
#endif




#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
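// One block step of the forward (lower-triangular) solve with pre-inverted
// diagonal: y[0:4] minus the first kmax columns of A times x is solved
// against the 4x4 lower-triangular block that follows those columns.
// kn limits how many unknowns are solved and km how many results are
// written; entries that are updated but not solved appear to be written back
// to y for later use.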
void kernel_strsv_ln_inv_4_vs_lib4(int kmax, float *A, float *inv_diag_A, float *x, float *y, float *z, int km, int kn)
	{

	const int bs = 4;

	int k;

	float
		x_0, x_1, x_2, x_3,
		y_0=0, y_1=0, y_2=0, y_3=0;

	k=0;
	for(; k<kmax-3; k+=4)
		{

		x_0 = x[0];
		x_1 = x[1];
		x_2 = x[2];
		x_3 = x[3];

		y_0 -= A[0+bs*0] * x_0;
		y_1 -= A[1+bs*0] * x_0;
		y_2 -= A[2+bs*0] * x_0;
		y_3 -= A[3+bs*0] * x_0;

		y_0 -= A[0+bs*1] * x_1;
		y_1 -= A[1+bs*1] * x_1;
		y_2 -= A[2+bs*1] * x_1;
		y_3 -= A[3+bs*1] * x_1;

		y_0 -= A[0+bs*2] * x_2;
		y_1 -= A[1+bs*2] * x_2;
		y_2 -= A[2+bs*2] * x_2;
		y_3 -= A[3+bs*2] * x_2;

		y_0 -= A[0+bs*3] * x_3;
		y_1 -= A[1+bs*3] * x_3;
		y_2 -= A[2+bs*3] * x_3;
		y_3 -= A[3+bs*3] * x_3;

		A += 4*bs;
		x += 4;

		}

	y_0 = y[0] + y_0;
	y_1 = y[1] + y_1;
	y_2 = y[2] + y_2;
	y_3 = y[3] + y_3;

	float
		a_00, a_10, a_20, a_30,
		a_11, a_21, a_31;

	// a_00
	a_00 = inv_diag_A[0];
	a_10 = A[1+bs*0];
	a_20 = A[2+bs*0];
	a_30 = A[3+bs*0];
	y_0 *= a_00;
	z[0] = y_0;
	y_1 -= a_10 * y_0;
	y_2 -= a_20 * y_0;
	y_3 -= a_30 * y_0;

	if(kn==1)
		{
		if(km==1)
			return;
		y[1] = y_1;
		if(km==2)
			return;
		y[2] = y_2;
		if(km==3)
			return;
		y[3] = y_3;
		return;
		}

	// a_11
	a_11 = inv_diag_A[1];
	a_21 = A[2+bs*1];
	a_31 = A[3+bs*1];
	y_1 *= a_11;
	z[1] = y_1;
	y_2 -= a_21 * y_1;
	y_3 -= a_31 * y_1;

	if(kn==2)
		{
		if(km==2)
			return;
		y[2] = y_2;
		if(km==3)
			return;
		y[3] = y_3;
		return;
		}

	// a_22
	a_00 = inv_diag_A[2];
	a_10 = A[3+bs*2];
	y_2 *= a_00;
	z[2] = y_2;
	y_3 -= a_10 * y_2;

	if(kn==3)
		{
		if(km==3)
			return;
		y[3] = y_3;
		return;
		}

	// a_33
	a_11 = inv_diag_A[3];
	y_3 *= a_11;
	z[3] = y_3;

	}
#endif



#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
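// Full-size wrapper: km=kn=4, so all four unknowns are solved and stored.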
void kernel_strsv_ln_inv_4_lib4(int kmax, float *A, float *inv_diag_A, float *x, float *y, float *z)
	{

	kernel_strsv_ln_inv_4_vs_lib4(kmax, A, inv_diag_A, x, y, z, 4, 4);

	}
#endif



#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
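// One block step of the backward (transposed lower-triangular) solve:
// accumulates y[0:4] - A[4:kmax,0:4]^T * x[4:kmax], then solves the leading
// 4x4 triangular block from the bottom row upwards using the reciprocal
// diagonal entries in inv_diag_A.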
void kernel_strsv_lt_inv_4_lib4(int kmax, float *A, int sda, float *inv_diag_A, float *x, float *y, float *z)
	{

	const int bs = 4;

	int
		k;

	float *tA, *tx;
	tA = A;
	tx = x;

	float
		x_0, x_1, x_2, x_3,
		y_0=0, y_1=0, y_2=0, y_3=0;

	k=4;
	A += 4 + (sda-1)*bs;
	x += 4;
	for(; k<kmax-3; k+=4)
		{

		x_0 = x[0];
		x_1 = x[1];
		x_2 = x[2];
		x_3 = x[3];

		y_0 -= A[0+bs*0] * x_0;
		y_1 -= A[0+bs*1] * x_0;
		y_2 -= A[0+bs*2] * x_0;
		y_3 -= A[0+bs*3] * x_0;

		y_0 -= A[1+bs*0] * x_1;
		y_1 -= A[1+bs*1] * x_1;
		y_2 -= A[1+bs*2] * x_1;
		y_3 -= A[1+bs*3] * x_1;

		y_0 -= A[2+bs*0] * x_2;
		y_1 -= A[2+bs*1] * x_2;
		y_2 -= A[2+bs*2] * x_2;
		y_3 -= A[2+bs*3] * x_2;

		y_0 -= A[3+bs*0] * x_3;
		y_1 -= A[3+bs*1] * x_3;
		y_2 -= A[3+bs*2] * x_3;
		y_3 -= A[3+bs*3] * x_3;

		A += sda*bs;
		x += 4;

		}
	for(; k<kmax; k++)
		{

		x_0 = x[0];

		y_0 -= A[0+bs*0] * x_0;
		y_1 -= A[0+bs*1] * x_0;
		y_2 -= A[0+bs*2] * x_0;
		y_3 -= A[0+bs*3] * x_0;

		A += 1;
		x += 1;

		}

	y_0 = y[0] + y_0;
	y_1 = y[1] + y_1;
	y_2 = y[2] + y_2;
	y_3 = y[3] + y_3;

	A = tA;
	x = tx;

	// bottom triangle
	y_3 *= inv_diag_A[3];
	z[3] = y_3;

	y_2 -= A[3+bs*2] * y_3;
	y_2 *= inv_diag_A[2];
	z[2] = y_2;

	// square
	y_0 -= A[2+bs*0]*y_2 + A[3+bs*0]*y_3;
	y_1 -= A[2+bs*1]*y_2 + A[3+bs*1]*y_3;

	// top triangle
	y_1 *= inv_diag_A[1];
	z[1] = y_1;

	y_0 -= A[1+bs*0] * y_1;
	y_0 *= inv_diag_A[0];
	z[0] = y_0;

	}
#endif



#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
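// 3-unknown variant of the backward solve; row 3 of the leading panel is
// peeled off before the unrolled loop over full panels.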
void kernel_strsv_lt_inv_3_lib4(int kmax, float *A, int sda, float *inv_diag_A, float *x, float *y, float *z)
	{

	const int bs = 4;

	int
		k;

	float *tA, *tx;
	tA = A;
	tx = x;

	float
		x_0, x_1, x_2, x_3,
		y_0=0, y_1=0, y_2=0;

	k = 3;
	if(kmax>4)
		{
		// clean up at the beginning
		x_3 = x[3];

		y_0 -= A[3+bs*0] * x_3;
		y_1 -= A[3+bs*1] * x_3;
		y_2 -= A[3+bs*2] * x_3;

		k=4;
		A += 4 + (sda-1)*bs;
		x += 4;
		for(; k<kmax-3; k+=4)
			{

			x_0 = x[0];
			x_1 = x[1];
			x_2 = x[2];
			x_3 = x[3];

			y_0 -= A[0+bs*0] * x_0;
			y_1 -= A[0+bs*1] * x_0;
			y_2 -= A[0+bs*2] * x_0;

			y_0 -= A[1+bs*0] * x_1;
			y_1 -= A[1+bs*1] * x_1;
			y_2 -= A[1+bs*2] * x_1;

			y_0 -= A[2+bs*0] * x_2;
			y_1 -= A[2+bs*1] * x_2;
			y_2 -= A[2+bs*2] * x_2;

			y_0 -= A[3+bs*0] * x_3;
			y_1 -= A[3+bs*1] * x_3;
			y_2 -= A[3+bs*2] * x_3;

			A += sda*bs;
			x += 4;

			}
		}
	else
		{
		A += 3;
		x += 3; // point at element 3, in step with row 3 of the panel
		}
	for(; k<kmax; k++)
		{

		x_0 = x[0];

		y_0 -= A[0+bs*0] * x_0;
		y_1 -= A[0+bs*1] * x_0;
		y_2 -= A[0+bs*2] * x_0;

		A += 1;
		x += 1;

		}

	y_0 = y[0] + y_0;
	y_1 = y[1] + y_1;
	y_2 = y[2] + y_2;

	A = tA;
	x = tx;

	// bottom triangle
	y_2 *= inv_diag_A[2];
	z[2] = y_2;

	// square
	y_0 -= A[2+bs*0]*y_2;
	y_1 -= A[2+bs*1]*y_2;

	// top triangle
	y_1 *= inv_diag_A[1];
	z[1] = y_1;

	y_0 -= A[1+bs*0] * y_1;
	y_0 *= inv_diag_A[0];
	z[0] = y_0;

	}
#endif



#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
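// 2-unknown variant of the backward solve; rows 2 and 3 of the leading panel
// are peeled off before the unrolled loop.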
void kernel_strsv_lt_inv_2_lib4(int kmax, float *A, int sda, float *inv_diag_A, float *x, float *y, float *z)
	{

	const int bs = 4;

	int
		k;

	float *tA, *tx;
	tA = A;
	tx = x;

	float
		x_0, x_1, x_2, x_3,
		y_0=0, y_1=0;

	k = 2;
	if(kmax>4)
		{
		// clean up at the beginning
		x_2 = x[2];
		x_3 = x[3];

		y_0 -= A[2+bs*0] * x_2;
		y_1 -= A[2+bs*1] * x_2;

		y_0 -= A[3+bs*0] * x_3;
		y_1 -= A[3+bs*1] * x_3;

		k=4;
		A += 4 + (sda-1)*bs;
		x += 4;
		for(; k<kmax-3; k+=4)
			{

			x_0 = x[0];
			x_1 = x[1];
			x_2 = x[2];
			x_3 = x[3];

			y_0 -= A[0+bs*0] * x_0;
			y_1 -= A[0+bs*1] * x_0;

			y_0 -= A[1+bs*0] * x_1;
			y_1 -= A[1+bs*1] * x_1;

			y_0 -= A[2+bs*0] * x_2;
			y_1 -= A[2+bs*1] * x_2;

			y_0 -= A[3+bs*0] * x_3;
			y_1 -= A[3+bs*1] * x_3;

			A += sda*bs;
			x += 4;

			}
		}
	else
		{
		A += 2;
		x += 2;
		}
	for(; k<kmax; k++)
		{

		x_0 = x[0];

		y_0 -= A[0+bs*0] * x_0;
		y_1 -= A[0+bs*1] * x_0;

		A += 1;
		x += 1;

		}

	y_0 = y[0] + y_0;
	y_1 = y[1] + y_1;

	A = tA;
	x = tx;

	// top triangle
	y_1 *= inv_diag_A[1];
	z[1] = y_1;

	y_0 -= A[1+bs*0] * y_1;
	y_0 *= inv_diag_A[0];
	z[0] = y_0;

	}
#endif



#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
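// 1-unknown variant of the backward solve; rows 1 to 3 of the leading panel
// are peeled off before the unrolled loop.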
void kernel_strsv_lt_inv_1_lib4(int kmax, float *A, int sda, float *inv_diag_A, float *x, float *y, float *z)
	{

	const int bs = 4;

	int
		k;

	float *tA, *tx;
	tA = A;
	tx = x;

	float
		x_0, x_1, x_2, x_3,
		y_0=0;

	k = 1;
	if(kmax>4)
		{
		// clean up at the beginning
		x_1 = x[1];
		x_2 = x[2];
		x_3 = x[3];

		y_0 -= A[1+bs*0] * x_1;
		y_0 -= A[2+bs*0] * x_2;
		y_0 -= A[3+bs*0] * x_3;

		k=4;
		A += 4 + (sda-1)*bs;
		x += 4;
		for(; k<kmax-3; k+=4)
			{

			x_0 = x[0];
			x_1 = x[1];
			x_2 = x[2];
			x_3 = x[3];

			y_0 -= A[0+bs*0] * x_0;
			y_0 -= A[1+bs*0] * x_1;
			y_0 -= A[2+bs*0] * x_2;
			y_0 -= A[3+bs*0] * x_3;

			A += sda*bs;
			x += 4;

			}
		}
	else
		{
		A += 1;
		x += 1;
		}
	for(; k<kmax; k++)
		{

		x_0 = x[0];

		y_0 -= A[0+bs*0] * x_0;

		A += 1;
		x += 1;

		}

	y_0 = y[0] + y_0;

	A = tA;
	x = tx;

	// top triangle
	y_0 *= inv_diag_A[0];
	z[0] = y_0;

	}
#endif



#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
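// Computes z[0:4] = A * x for one block row of an upper-triangular matrix:
// the first 4x4 block uses only its upper-triangular entries, the remaining
// kmax-4 columns are a plain matrix-vector product.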
void kernel_strmv_un_4_lib4(int kmax, float *A, float *x, float *z)
	{

	const int bs = 4;

	int k;

	float
		x_0, x_1, x_2, x_3,
		y_0=0, y_1=0, y_2=0, y_3=0;

	x_0 = x[0];
	x_1 = x[1];
	x_2 = x[2];
	x_3 = x[3];

	y_0 += A[0+bs*0] * x_0;
/*	y_1 += A[1+bs*0] * x_0;*/
/*	y_2 += A[2+bs*0] * x_0;*/
/*	y_3 += A[3+bs*0] * x_0;*/

	y_0 += A[0+bs*1] * x_1;
	y_1 += A[1+bs*1] * x_1;
/*	y_2 += A[2+bs*1] * x_1;*/
/*	y_3 += A[3+bs*1] * x_1;*/

	y_0 += A[0+bs*2] * x_2;
	y_1 += A[1+bs*2] * x_2;
	y_2 += A[2+bs*2] * x_2;
/*	y_3 += A[3+bs*2] * x_2;*/

	y_0 += A[0+bs*3] * x_3;
	y_1 += A[1+bs*3] * x_3;
	y_2 += A[2+bs*3] * x_3;
	y_3 += A[3+bs*3] * x_3;

	A += 4*bs;
	x += 4;

	k=4;
	for(; k<kmax-3; k+=4)
		{

		x_0 = x[0];
		x_1 = x[1];
		x_2 = x[2];
		x_3 = x[3];

		y_0 += A[0+bs*0] * x_0;
		y_1 += A[1+bs*0] * x_0;
		y_2 += A[2+bs*0] * x_0;
		y_3 += A[3+bs*0] * x_0;

		y_0 += A[0+bs*1] * x_1;
		y_1 += A[1+bs*1] * x_1;
		y_2 += A[2+bs*1] * x_1;
		y_3 += A[3+bs*1] * x_1;

		y_0 += A[0+bs*2] * x_2;
		y_1 += A[1+bs*2] * x_2;
		y_2 += A[2+bs*2] * x_2;
		y_3 += A[3+bs*2] * x_2;

		y_0 += A[0+bs*3] * x_3;
		y_1 += A[1+bs*3] * x_3;
		y_2 += A[2+bs*3] * x_3;
		y_3 += A[3+bs*3] * x_3;

		A += 4*bs;
		x += 4;

		}

	for(; k<kmax; k++)
		{

		x_0 = x[0];

		y_0 += A[0+bs*0] * x_0;
		y_1 += A[1+bs*0] * x_0;
		y_2 += A[2+bs*0] * x_0;
		y_3 += A[3+bs*0] * x_0;

		A += 1*bs;
		x += 1;

		}

	z[0] = y_0;
	z[1] = y_1;
	z[2] = y_2;
	z[3] = y_3;

	}
#endif



#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
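// Computes z[0:4] = A^T * x for one block column of an upper-triangular
// matrix: full panels first, then the trailing 4x4 block where only the
// upper-triangular entries contribute. Only the first km entries of z are
// stored.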
void kernel_strmv_ut_4_vs_lib4(int kmax, float *A, int sda, float *x, float *z, int km)
	{

	const int bs = 4;

	int
		k;

	float
		x_0, x_1, x_2, x_3,
		y_0=0, y_1=0, y_2=0, y_3=0;

	k=0;
	for(; k<kmax-4; k+=4)
		{

		x_0 = x[0];
		x_1 = x[1];
		x_2 = x[2];
		x_3 = x[3];

		y_0 += A[0+bs*0] * x_0;
		y_1 += A[0+bs*1] * x_0;
		y_2 += A[0+bs*2] * x_0;
		y_3 += A[0+bs*3] * x_0;

		y_0 += A[1+bs*0] * x_1;
		y_1 += A[1+bs*1] * x_1;
		y_2 += A[1+bs*2] * x_1;
		y_3 += A[1+bs*3] * x_1;

		y_0 += A[2+bs*0] * x_2;
		y_1 += A[2+bs*1] * x_2;
		y_2 += A[2+bs*2] * x_2;
		y_3 += A[2+bs*3] * x_2;

		y_0 += A[3+bs*0] * x_3;
		y_1 += A[3+bs*1] * x_3;
		y_2 += A[3+bs*2] * x_3;
		y_3 += A[3+bs*3] * x_3;

		A += sda*bs;
		x += 4;

		}

	x_0 = x[0];
	x_1 = x[1];
	x_2 = x[2];
	x_3 = x[3];

	y_0 += A[0+bs*0] * x_0;
	y_1 += A[0+bs*1] * x_0;
	y_2 += A[0+bs*2] * x_0;
	y_3 += A[0+bs*3] * x_0;

/*	y_0 += A[1+bs*0] * x_1;*/
	y_1 += A[1+bs*1] * x_1;
	y_2 += A[1+bs*2] * x_1;
	y_3 += A[1+bs*3] * x_1;

/*	y_0 += A[2+bs*0] * x_2;*/
/*	y_1 += A[2+bs*1] * x_2;*/
	y_2 += A[2+bs*2] * x_2;
	y_3 += A[2+bs*3] * x_2;

/*	y_0 += A[3+bs*0] * x_3;*/
/*	y_1 += A[3+bs*1] * x_3;*/
/*	y_2 += A[3+bs*2] * x_3;*/
	y_3 += A[3+bs*3] * x_3;

//	A += sda*bs;
//	x += 4;

	// store_vs
	if(km>=4)
		{
		z[0] = y_0;
		z[1] = y_1;
		z[2] = y_2;
		z[3] = y_3;
		}
	else
		{
		z[0] = y_0;
		if(km>=2)
			{
			z[1] = y_1;
			if(km>2)
				{
				z[2] = y_2;
				}
			}
		}

	}
#endif



#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
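// Full-size wrapper: all four entries of z are stored (km=4).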
void kernel_strmv_ut_4_lib4(int kmax, float *A, int sda, float *x, float *z)
	{

	kernel_strmv_ut_4_vs_lib4(kmax, A, sda, x, z, 4);

	}
#endif