/**************************************************************************************************
* *
* This file is part of BLASFEO. *
* *
* BLASFEO -- BLAS For Embedded Optimization. *
* Copyright (C) 2016-2017 by Gianluca Frison. *
* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl. *
* All rights reserved. *
* *
* HPMPC is free software; you can redistribute it and/or *
* modify it under the terms of the GNU Lesser General Public *
* License as published by the Free Software Foundation; either *
* version 2.1 of the License, or (at your option) any later version. *
* *
* HPMPC is distributed in the hope that it will be useful, *
* but WITHOUT ANY WARRANTY; without even the implied warranty of *
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. *
* See the GNU Lesser General Public License for more details. *
* *
* You should have received a copy of the GNU Lesser General Public *
* License along with HPMPC; if not, write to the Free Software *
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA *
* *
* Author: Gianluca Frison, giaf (at) dtu.dk *
* gianluca.frison (at) imtek.uni-freiburg.de *
* *
**************************************************************************************************/
28
29
30
// XXX the caller must copy and scale y_n into z_n before calling the kernel: z_n is only accumulated into in place
32#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15)
// Row update used by kernel_sgemv_nt_4_vs_lib4.
//   a_row : first of the 4 entries of this row; consecutive columns are 4 floats apart
//   xn    : the 4 pre-scaled x_n values (columns that are switched off hold 0)
//   xt    : the x_t entry that belongs to this row
//   zn    : z_n entry of this row, accumulated in place
//   yt    : the 4 running column sums of the transposed product, accumulated in place
static inline void sgemv_nt_4_row(const float *a_row, const float *xn, float xt, float *zn, float *yt)
	{
	enum { PS = 4 };
	float acc_n = zn[0];
	int jj;

	// column by column, so every accumulator sees its terms in a fixed order
	for(jj=0; jj<PS; jj++)
		{
		const float a = a_row[PS*jj];
		acc_n += a * xn[jj];
		yt[jj] += a * xt;
		}

	zn[0] = acc_n;
	}

// Combined non-transposed / transposed matrix-vector kernel on a block of kmax rows and 4 columns.
//
// A is addressed in panels of 4 rows: inside a panel consecutive rows are 1 float apart and
// consecutive columns are 4 floats apart; the next panel starts sda*4 floats further on.
//
//   z_n[i] += sum_j A[i][j] * (alpha_n[0]*x_n[j])                  for i = 0 .. kmax-1
//   z_t[j]  = alpha_t[0] * sum_i A[i][j]*x_t[i] + beta_t[0]*y_t[j]
//
// km selects how many columns are active (values below 1 act as 1, values above 4 as 4):
// only the first km entries of x_n are read and only the first km entries of z_t are written;
// the remaining columns add 0 to z_n.
// z_n is only accumulated into. If kmax<=0 the kernel returns without touching z_n or z_t.
void kernel_sgemv_nt_4_vs_lib4(int kmax, float *alpha_n, float *alpha_t, float *A, int sda, float *x_n, float *x_t, float *beta_t, float *y_t, float *z_n, float *z_t, int km)
	{
	enum { PS = 4 };

	float xn[PS] = {0.0f, 0.0f, 0.0f, 0.0f};
	float yt[PS] = {0.0f, 0.0f, 0.0f, 0.0f};
	int ncol, jj, kk, rr;

	if(kmax<=0)
		return;

	// number of active columns: column 0 is always active
	ncol = km;
	if(ncol<1)
		ncol = 1;
	if(ncol>PS)
		ncol = PS;

	for(jj=0; jj<ncol; jj++)
		xn[jj] = alpha_n[0]*x_n[jj];

	// complete panels of 4 rows
	kk = 0;
	for(; kk<kmax-3; kk+=PS)
		{
		for(rr=0; rr<PS; rr++)
			sgemv_nt_4_row(A+rr, xn, x_t[rr], z_n+rr, yt);

		A += sda*PS;
		z_n += PS;
		x_t += PS;
		}

	// left-over rows, all inside the last (partial) panel
	for(; kk<kmax; kk++)
		{
		sgemv_nt_4_row(A, xn, x_t[0], z_n, yt);

		A += 1;
		z_n += 1;
		x_t += 1;
		}

	// store t
	for(jj=0; jj<ncol; jj++)
		z_t[jj] = alpha_t[0]*yt[jj] + beta_t[0]*y_t[jj];
	}
217#endif
218
219
220
// XXX the caller must copy and scale y_n into z_n before calling the kernel: z_n is only accumulated into in place
222#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15)
// Fixed-size variant of kernel_sgemv_nt_4_vs_lib4: forwards every argument unchanged and
// passes km = 4, i.e. all 4 columns of the block are active.
void kernel_sgemv_nt_4_lib4(int kmax, float *alpha_n, float *alpha_t, float *A, int sda, float *x_n, float *x_t, float *beta_t, float *y_t, float *z_n, float *z_t)
	{
	kernel_sgemv_nt_4_vs_lib4(kmax, alpha_n, alpha_t, A, sda, x_n, x_t, beta_t, y_t, z_n, z_t, 4);
	}
231#endif
232
233
234
// XXX the caller must copy and scale y_n into z_n before calling the kernel: z_n is only accumulated into in place
236#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15)
// Row update used by kernel_ssymv_l_4_gen_lib4.
//   a_row : first of the 4 entries of this row; consecutive columns are 4 floats apart
//   nlow  : number of leading columns that feed both products (0..4); when nlow<4 the
//           entry in column nlow is the diagonal one and feeds the transposed sum only
//   xn    : the 4 pre-scaled x_n values (columns that are switched off hold 0)
//   xt_p  : x entry that belongs to this row
//   zn_p  : z entry of this row, accumulated in place (left untouched when nlow==0)
//   yt    : the 4 running column sums of the transposed product, accumulated in place
static inline void ssymv_l_4_row(const float *a_row, int nlow, const float *xn, const float *xt_p, float *zn_p, float *yt)
	{
	enum { PS = 4 };
	const float xt = xt_p[0];
	int jj;

	if(nlow>0)
		{
		float acc_n = zn_p[0];
		// column by column, so every accumulator sees its terms in a fixed order
		for(jj=0; jj<nlow; jj++)
			{
			const float a = a_row[PS*jj];
			acc_n += a * xn[jj];
			yt[jj] += a * xt;
			}
		zn_p[0] = acc_n;
		}

	// diagonal entry: counted once, through the transposed sum
	if(nlow<PS)
		yt[nlow] += a_row[PS*nlow] * xt;
	}

// Symmetric matrix-vector kernel on a block of kmax rows and 4 columns, reading only the
// lower triangle of the leading 4x4 part (row i uses columns 0..i there) and all 4 columns
// of every later row. x_n doubles as the input of the transposed product and z_n as its output.
//
// A is addressed in panels of 4 rows: inside a panel consecutive rows are 1 float apart and
// consecutive columns are 4 floats apart; the next panel starts sda*4 floats further on.
// A must point at the first row of the block, which sits at row offA of its panel
// (any offA other than 0, 1 or 2 is treated as 3).
//
//   z_n[i] += sum_{j<min(i,4)} A[i][j] * (alpha[0]*x_n[j])       for i = 0 .. kmax-1
//   z_n[j] += alpha[0] * sum_{i>=j} A[i][j]*x_n[i]               for the active columns j
//
// km selects how many columns are active (values below 1 act as 1, values above 4 as 4):
// only the first km entries of x_n are scaled and used in the first sum, and only the first
// km column sums are stored. Note that those km entries of x_n and z_n are accessed even
// when kmax<km.
// z_n is only accumulated into. If kmax<=0 the kernel returns without touching z_n.
void kernel_ssymv_l_4_gen_lib4(int kmax, float *alpha, int offA, float *A, int sda, float *x_n, float *z_n, int km)
	{
	enum { PS = 4 };

	float xn[PS] = {0.0f, 0.0f, 0.0f, 0.0f};
	float yt[PS] = {0.0f, 0.0f, 0.0f, 0.0f};
	float *x_t = x_n;
	float *z_t = z_n;
	int ncol, pos, jj, kk, rr;

	if(kmax<=0)
		return;

	// number of active columns: column 0 is always active
	ncol = km;
	if(ncol<1)
		ncol = 1;
	if(ncol>PS)
		ncol = PS;

	for(jj=0; jj<ncol; jj++)
		xn[jj] = alpha[0]*x_n[jj];

	// row position inside the current panel
	pos = (offA>=0 && offA<=2) ? offA : 3;

	// head: the 4 rows of the diagonal block, then single rows until a panel boundary is reached
	kk = 0;
	while(kk<kmax && (kk<PS || pos!=0))
		{
		ssymv_l_4_row(A, kk<PS ? kk : PS, xn, x_t, z_n, yt);

		A += 1;
		z_n += 1;
		x_t += 1;
		kk += 1;

		pos += 1;
		if(pos==PS)
			{
			A += (sda-1)*PS; // new panel
			pos = 0;
			}
		}

	// complete panels of 4 rows
	for(; kk<kmax-3; kk+=PS)
		{
		for(rr=0; rr<PS; rr++)
			ssymv_l_4_row(A+rr, PS, xn, x_t+rr, z_n+rr, yt);

		A += sda*PS;
		z_n += PS;
		x_t += PS;
		}

	// left-over rows, all inside the last (partial) panel
	for(; kk<kmax; kk++)
		{
		ssymv_l_4_row(A, PS, xn, x_t, z_n, yt);

		A += 1;
		z_n += 1;
		x_t += 1;
		}

	// store t
	for(jj=0; jj<ncol; jj++)
		z_t[jj] += alpha[0]*yt[jj];
	}
1006#endif
1007
1008
1009
// XXX the caller must copy and scale y_n into z_n before calling the kernel: z_n is only accumulated into in place
1011#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15)
// Panel-aligned, fixed-size variant of kernel_ssymv_l_4_gen_lib4: forwards its arguments
// unchanged and passes offA = 0 (block starts on the first row of a panel) and km = 4
// (all 4 columns active).
void kernel_ssymv_l_4_lib4(int kmax, float *alpha, float *A, int sda, float *x_n, float *z_n)
	{
	kernel_ssymv_l_4_gen_lib4(kmax, alpha, 0, A, sda, x_n, z_n, 4);
	}
1020#endif
1021
1022
1023
1024
1025