/**************************************************************************************************
2* *
3* This file is part of BLASFEO. *
4* *
5* BLASFEO -- BLAS For Embedded Optimization. *
6* Copyright (C) 2016-2017 by Gianluca Frison. *
7* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl. *
8* All rights reserved. *
9* *
10* HPMPC is free software; you can redistribute it and/or *
11* modify it under the terms of the GNU Lesser General Public *
12* License as published by the Free Software Foundation; either *
13* version 2.1 of the License, or (at your option) any later version. *
14* *
15* HPMPC is distributed in the hope that it will be useful, *
16* but WITHOUT ANY WARRANTY; without even the implied warranty of *
17* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. *
18* See the GNU Lesser General Public License for more details. *
19* *
20* You should have received a copy of the GNU Lesser General Public *
21* License along with HPMPC; if not, write to the Free Software *
22* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA *
23* *
24* Author: Gianluca Frison, giaf (at) dtu.dk *
25* gianluca.frison (at) imtek.uni-freiburg.de *
26* *
27**************************************************************************************************/
28
29
30
31
// Scale a 4-row strip in place: A[i + 4*k] *= alphap[0] for i = 0..3 and
// k = 0..kmax-1 (consecutive columns are 4 floats apart).
//   kmax   - number of columns to scale; nothing happens when kmax <= 0
//   alphap - pointer to the scale factor, read once and only when kmax > 0
//   A      - data to scale, modified in place
void kernel_sgesc_4_lib4(int kmax, float *alphap, float *A)
{
	if (kmax <= 0) {
		return;
	}

	const float scale = *alphap;

	// one column (4 floats) per pass
	for (int cols_left = kmax; cols_left > 0; cols_left--) {
		for (int row = 0; row < 4; row++) {
			A[row] *= scale;
		}
		A += 4;
	}
}
58
59
60
// Scale the first 3 rows of a 4-row strip in place:
// A[i + 4*k] *= alphap[0] for i = 0..2 and k = 0..kmax-1.
// The fourth float of every column is left untouched.
//   kmax   - number of columns to scale; nothing happens when kmax <= 0
//   alphap - pointer to the scale factor, read once and only when kmax > 0
//   A      - data to scale, modified in place
void kernel_sgesc_3_lib4(int kmax, float *alphap, float *A)
{
	if (kmax <= 0) {
		return;
	}

	const float scale = *alphap;

	// each column occupies 4 floats but only 3 of them are scaled
	for (int cols_left = kmax; cols_left > 0; cols_left--) {
		for (int row = 0; row < 3; row++) {
			A[row] *= scale;
		}
		A += 4;
	}
}
85
86
87
// Scale the first 2 rows of a 4-row strip in place:
// A[i + 4*k] *= alphap[0] for i = 0..1 and k = 0..kmax-1.
// The last two floats of every column are left untouched.
//   kmax   - number of columns to scale; nothing happens when kmax <= 0
//   alphap - pointer to the scale factor, read once and only when kmax > 0
//   A      - data to scale, modified in place
void kernel_sgesc_2_lib4(int kmax, float *alphap, float *A)
{
	if (kmax <= 0) {
		return;
	}

	const float scale = *alphap;

	// each column occupies 4 floats but only 2 of them are scaled
	for (int cols_left = kmax; cols_left > 0; cols_left--) {
		for (int row = 0; row < 2; row++) {
			A[row] *= scale;
		}
		A += 4;
	}
}
111
112
113
// Scale a single row of a 4-row strip in place:
// A[4*k] *= alphap[0] for k = 0..kmax-1.
// The other three floats of every column are left untouched.
//   kmax   - number of columns to scale; nothing happens when kmax <= 0
//   alphap - pointer to the scale factor, read once and only when kmax > 0
//   A      - data to scale, modified in place
void kernel_sgesc_1_lib4(int kmax, float *alphap, float *A)
{
	if (kmax <= 0) {
		return;
	}

	const float scale = *alphap;

	// step from column to column, 4 floats at a time
	for (int cols_left = kmax; cols_left > 0; cols_left--) {
		*A *= scale;
		A += 4;
	}
}
136
137
138
// Copy a 4-row strip: B[i + 4*k] = A[i + 4*k] for i = 0..3, k = 0..kmax-1.
//   kmax - number of columns to copy; no-op when kmax <= 0
//   A    - source, consecutive columns 4 floats apart
//   B    - destination, same layout
void kernel_sgecp_4_0_lib4(int kmax, float *A, float *B)
{
	const float *src = A;
	float *dst = B;

	for (int cols_left = kmax; cols_left > 0; cols_left--) {
		for (int row = 0; row < 4; row++) {
			dst[row] = src[row];
		}
		src += 4;
		dst += 4;
	}
}
164
165
166
// Copy a 4-row strip whose source starts 1 row into the block at A0.
// For every column k = 0..kmax-1, rows 0..2 of B come from rows 1..3 of A0
// and row 3 of B comes from row 0 of the block located 4*sda floats after A0.
//   kmax - number of columns to copy; no-op when kmax <= 0
//   A0   - first source block, consecutive columns 4 floats apart
//   sda  - distance between the two source blocks, in units of 4 floats
//   B    - destination, consecutive columns 4 floats apart
void kernel_sgecp_4_1_lib4(int kmax, float *A0, int sda, float *B)
{
	if (kmax <= 0) {
		return;
	}

	const float *blk0 = A0;
	const float *blk1 = A0 + 4 * sda;
	float *dst = B;

	for (int col = 0; col < kmax; col++) {
		dst[0] = blk0[1];
		dst[1] = blk0[2];
		dst[2] = blk0[3];
		dst[3] = blk1[0];

		blk0 += 4;
		blk1 += 4;
		dst += 4;
	}
}
195
196
197
// Copy a 4-row strip whose source starts 2 rows into the block at A0.
// For every column k = 0..kmax-1, rows 0..1 of B come from rows 2..3 of A0
// and rows 2..3 of B come from rows 0..1 of the block located 4*sda floats
// after A0.
//   kmax - number of columns to copy; no-op when kmax <= 0
//   A0   - first source block, consecutive columns 4 floats apart
//   sda  - distance between the two source blocks, in units of 4 floats
//   B    - destination, consecutive columns 4 floats apart
void kernel_sgecp_4_2_lib4(int kmax, float *A0, int sda, float *B)
{
	if (kmax <= 0) {
		return;
	}

	const float *blk0 = A0;
	const float *blk1 = A0 + 4 * sda;
	float *dst = B;

	for (int col = 0; col < kmax; col++) {
		dst[0] = blk0[2];
		dst[1] = blk0[3];
		dst[2] = blk1[0];
		dst[3] = blk1[1];

		blk0 += 4;
		blk1 += 4;
		dst += 4;
	}
}
226
227
228
// Copy a 4-row strip whose source starts 3 rows into the block at A0.
// For every column k = 0..kmax-1, row 0 of B comes from row 3 of A0 and
// rows 1..3 of B come from rows 0..2 of the block located 4*sda floats
// after A0.
//   kmax - number of columns to copy; no-op when kmax <= 0
//   A0   - first source block, consecutive columns 4 floats apart
//   sda  - distance between the two source blocks, in units of 4 floats
//   B    - destination, consecutive columns 4 floats apart
void kernel_sgecp_4_3_lib4(int kmax, float *A0, int sda, float *B)
{
	if (kmax <= 0) {
		return;
	}

	const float *blk0 = A0;
	const float *blk1 = A0 + 4 * sda;
	float *dst = B;

	for (int col = 0; col < kmax; col++) {
		dst[0] = blk0[3];
		dst[1] = blk1[0];
		dst[2] = blk1[1];
		dst[3] = blk1[2];

		blk0 += 4;
		blk1 += 4;
		dst += 4;
	}
}
257
258
259
// Copy the first 3 rows of a 4-row strip:
// B[i + 4*k] = A[i + 4*k] for i = 0..2, k = 0..kmax-1.
// The fourth float of every destination column is left untouched.
//   kmax - number of columns to copy; no-op when kmax <= 0
//   A    - source, consecutive columns 4 floats apart
//   B    - destination, same layout
void kernel_sgecp_3_0_lib4(int kmax, float *A, float *B)
{
	const float *src = A;
	float *dst = B;

	for (int cols_left = kmax; cols_left > 0; cols_left--) {
		for (int row = 0; row < 3; row++) {
			dst[row] = src[row];
		}
		src += 4;
		dst += 4;
	}
}
284
285
286
// Copy 3 rows whose source starts 2 rows into the block at A0.
// For every column k = 0..kmax-1, rows 0..1 of B come from rows 2..3 of A0
// and row 2 of B comes from row 0 of the block located 4*sda floats after A0.
// The fourth float of every destination column is left untouched.
//   kmax - number of columns to copy; no-op when kmax <= 0
//   A0   - first source block, consecutive columns 4 floats apart
//   sda  - distance between the two source blocks, in units of 4 floats
//   B    - destination, consecutive columns 4 floats apart
void kernel_sgecp_3_2_lib4(int kmax, float *A0, int sda, float *B)
{
	if (kmax <= 0) {
		return;
	}

	const float *blk0 = A0;
	const float *blk1 = A0 + 4 * sda;
	float *dst = B;

	for (int col = 0; col < kmax; col++) {
		dst[0] = blk0[2];
		dst[1] = blk0[3];
		dst[2] = blk1[0];

		blk0 += 4;
		blk1 += 4;
		dst += 4;
	}
}
314
315
316
// Copy 3 rows whose source starts 3 rows into the block at A0.
// For every column k = 0..kmax-1, row 0 of B comes from row 3 of A0 and
// rows 1..2 of B come from rows 0..1 of the block located 4*sda floats
// after A0. The fourth float of every destination column is left untouched.
//   kmax - number of columns to copy; no-op when kmax <= 0
//   A0   - first source block, consecutive columns 4 floats apart
//   sda  - distance between the two source blocks, in units of 4 floats
//   B    - destination, consecutive columns 4 floats apart
void kernel_sgecp_3_3_lib4(int kmax, float *A0, int sda, float *B)
{
	if (kmax <= 0) {
		return;
	}

	const float *blk0 = A0;
	const float *blk1 = A0 + 4 * sda;
	float *dst = B;

	for (int col = 0; col < kmax; col++) {
		dst[0] = blk0[3];
		dst[1] = blk1[0];
		dst[2] = blk1[1];

		blk0 += 4;
		blk1 += 4;
		dst += 4;
	}
}
344
345
346
// Copy the first 2 rows of a 4-row strip:
// B[i + 4*k] = A[i + 4*k] for i = 0..1, k = 0..kmax-1.
// The last two floats of every destination column are left untouched.
//   kmax - number of columns to copy; no-op when kmax <= 0
//   A    - source, consecutive columns 4 floats apart
//   B    - destination, same layout
void kernel_sgecp_2_0_lib4(int kmax, float *A, float *B)
{
	const float *src = A;
	float *dst = B;

	for (int cols_left = kmax; cols_left > 0; cols_left--) {
		for (int row = 0; row < 2; row++) {
			dst[row] = src[row];
		}
		src += 4;
		dst += 4;
	}
}
370
371
372
// Copy 2 rows whose source starts 3 rows into the block at A0.
// For every column k = 0..kmax-1, row 0 of B comes from row 3 of A0 and
// row 1 of B comes from row 0 of the block located 4*sda floats after A0.
// The last two floats of every destination column are left untouched.
//   kmax - number of columns to copy; no-op when kmax <= 0
//   A0   - first source block, consecutive columns 4 floats apart
//   sda  - distance between the two source blocks, in units of 4 floats
//   B    - destination, consecutive columns 4 floats apart
void kernel_sgecp_2_3_lib4(int kmax, float *A0, int sda, float *B)
{
	if (kmax <= 0) {
		return;
	}

	const float *blk0 = A0;
	const float *blk1 = A0 + 4 * sda;
	float *dst = B;

	for (int col = 0; col < kmax; col++) {
		dst[0] = blk0[3];
		dst[1] = blk1[0];

		blk0 += 4;
		blk1 += 4;
		dst += 4;
	}
}
399
400
401
// Copy a single row of a 4-row strip: B[4*k] = A[4*k] for k = 0..kmax-1.
// The other three floats of every destination column are left untouched.
//   kmax - number of columns to copy; no-op when kmax <= 0
//   A    - source, consecutive columns 4 floats apart
//   B    - destination, same layout
void kernel_sgecp_1_0_lib4(int kmax, float *A, float *B)
{
	const float *src = A;
	float *dst = B;

	for (int cols_left = kmax; cols_left > 0; cols_left--) {
		*dst = *src;
		src += 4;
		dst += 4;
	}
}
424
425
426
// Copy the lower-triangular tail of a 4-row strip from A to B.
// kmax+1 full 4-row columns are copied first, followed by a 3x3 triangle:
// rows 1..3 of the next column, rows 2..3 of the one after it and row 3 of
// the last one. Destination entries above that triangle are not written.
//   kmax - one less than the number of full columns; no-op when kmax+1 <= 0
//   A    - source, consecutive columns 4 floats apart
//   B    - destination, same layout
void kernel_strcp_l_4_0_lib4(int kmax, float *A, float *B)
{
	const int full_cols = kmax + 1;
	if (full_cols <= 0) {
		return;
	}

	const float *src = A;
	float *dst = B;

	// rectangular part
	for (int col = 0; col < full_cols; col++) {
		for (int row = 0; row < 4; row++) {
			dst[row] = src[row];
		}
		src += 4;
		dst += 4;
	}

	// 3x3 triangle: column c keeps only rows c+1..3
	for (int c = 0; c < 3; c++) {
		for (int row = c + 1; row < 4; row++) {
			dst[row + 4 * c] = src[row + 4 * c];
		}
	}
}
468
469
470
// Copy the lower-triangular tail of a 4-row strip whose source starts 1 row
// into the block at A0 (rows 1..3 of A0, then row 0 of the block located
// 4*sda floats after A0).
// kmax+1 full 4-row columns are copied first, followed by a 3x3 triangle:
// rows 1..3 of the next column, rows 2..3 of the one after it and row 3 of
// the last one. Destination entries above that triangle are not written.
//   kmax - one less than the number of full columns; no-op when kmax+1 <= 0
//   A0   - first source block, consecutive columns 4 floats apart
//   sda  - distance between the two source blocks, in units of 4 floats
//   B    - destination, consecutive columns 4 floats apart
void kernel_strcp_l_4_1_lib4(int kmax, float *A0, int sda, float *B)
{
	const int full_cols = kmax + 1;
	if (full_cols <= 0) {
		return;
	}

	const float *blk0 = A0;
	const float *blk1 = A0 + 4 * sda;
	float *dst = B;

	// rectangular part
	for (int col = 0; col < full_cols; col++) {
		dst[0] = blk0[1];
		dst[1] = blk0[2];
		dst[2] = blk0[3];
		dst[3] = blk1[0];

		blk0 += 4;
		blk1 += 4;
		dst += 4;
	}

	// 3x3 triangle, first column
	dst[1] = blk0[2];
	dst[2] = blk0[3];
	dst[3] = blk1[0];

	// second column
	dst[4 + 2] = blk0[4 + 3];
	dst[4 + 3] = blk1[4 + 0];

	// third column
	dst[8 + 3] = blk1[8 + 0];
}
515
516
517
// Copy the lower-triangular tail of a 4-row strip whose source starts 2 rows
// into the block at A0 (rows 2..3 of A0, then rows 0..1 of the block located
// 4*sda floats after A0).
// kmax+1 full 4-row columns are copied first, followed by a 3x3 triangle:
// rows 1..3 of the next column, rows 2..3 of the one after it and row 3 of
// the last one. Destination entries above that triangle are not written.
//   kmax - one less than the number of full columns; no-op when kmax+1 <= 0
//   A0   - first source block, consecutive columns 4 floats apart
//   sda  - distance between the two source blocks, in units of 4 floats
//   B    - destination, consecutive columns 4 floats apart
void kernel_strcp_l_4_2_lib4(int kmax, float *A0, int sda, float *B)
{
	const int full_cols = kmax + 1;
	if (full_cols <= 0) {
		return;
	}

	const float *blk0 = A0;
	const float *blk1 = A0 + 4 * sda;
	float *dst = B;

	// rectangular part
	for (int col = 0; col < full_cols; col++) {
		dst[0] = blk0[2];
		dst[1] = blk0[3];
		dst[2] = blk1[0];
		dst[3] = blk1[1];

		blk0 += 4;
		blk1 += 4;
		dst += 4;
	}

	// 3x3 triangle, first column
	dst[1] = blk0[3];
	dst[2] = blk1[0];
	dst[3] = blk1[1];

	// second column
	dst[4 + 2] = blk1[4 + 0];
	dst[4 + 3] = blk1[4 + 1];

	// third column
	dst[8 + 3] = blk1[8 + 1];
}
562
563
564
// Copy the lower-triangular tail of a 4-row strip whose source starts 3 rows
// into the block at A0 (row 3 of A0, then rows 0..2 of the block located
// 4*sda floats after A0).
// kmax+1 full 4-row columns are copied first, followed by a 3x3 triangle:
// rows 1..3 of the next column, rows 2..3 of the one after it and row 3 of
// the last one. Destination entries above that triangle are not written.
//   kmax - one less than the number of full columns; no-op when kmax+1 <= 0
//   A0   - first source block, consecutive columns 4 floats apart
//   sda  - distance between the two source blocks, in units of 4 floats
//   B    - destination, consecutive columns 4 floats apart
void kernel_strcp_l_4_3_lib4(int kmax, float *A0, int sda, float *B)
{
	const int full_cols = kmax + 1;
	if (full_cols <= 0) {
		return;
	}

	const float *blk0 = A0;
	const float *blk1 = A0 + 4 * sda;
	float *dst = B;

	// rectangular part
	for (int col = 0; col < full_cols; col++) {
		dst[0] = blk0[3];
		dst[1] = blk1[0];
		dst[2] = blk1[1];
		dst[3] = blk1[2];

		blk0 += 4;
		blk1 += 4;
		dst += 4;
	}

	// 3x3 triangle: with a 3-row shift every entry lives in the second block
	dst[1] = blk1[0];
	dst[2] = blk1[1];
	dst[3] = blk1[2];

	dst[4 + 2] = blk1[4 + 1];
	dst[4 + 3] = blk1[4 + 2];

	dst[8 + 3] = blk1[8 + 2];
}
609
610
611
// Copy the lower-triangular tail of a 3-row strip from A to B.
// kmax+1 full 3-row columns are copied first, followed by a 2x2 triangle:
// rows 1..2 of the next column and row 2 of the one after it.
// The fourth float of every column and the entries above the triangle are
// not written.
//   kmax - one less than the number of full columns; no-op when kmax+1 <= 0
//   A    - source, consecutive columns 4 floats apart
//   B    - destination, same layout
void kernel_strcp_l_3_0_lib4(int kmax, float *A, float *B)
{
	const int full_cols = kmax + 1;
	if (full_cols <= 0) {
		return;
	}

	const float *src = A;
	float *dst = B;

	// rectangular part
	for (int col = 0; col < full_cols; col++) {
		for (int row = 0; row < 3; row++) {
			dst[row] = src[row];
		}
		src += 4;
		dst += 4;
	}

	// 2x2 triangle: column c keeps only rows c+1..2
	for (int c = 0; c < 2; c++) {
		for (int row = c + 1; row < 3; row++) {
			dst[row + 4 * c] = src[row + 4 * c];
		}
	}
}
648
649
650
// Copy the lower-triangular tail of a 3-row strip whose source starts 2 rows
// into the block at A0 (rows 2..3 of A0, then row 0 of the block located
// 4*sda floats after A0).
// kmax+1 full 3-row columns are copied first, followed by a 2x2 triangle:
// rows 1..2 of the next column and row 2 of the one after it.
// The fourth float of every destination column and the entries above the
// triangle are not written.
//   kmax - one less than the number of full columns; no-op when kmax+1 <= 0
//   A0   - first source block, consecutive columns 4 floats apart
//   sda  - distance between the two source blocks, in units of 4 floats
//   B    - destination, consecutive columns 4 floats apart
void kernel_strcp_l_3_2_lib4(int kmax, float *A0, int sda, float *B)
{
	const int full_cols = kmax + 1;
	if (full_cols <= 0) {
		return;
	}

	const float *blk0 = A0;
	const float *blk1 = A0 + 4 * sda;
	float *dst = B;

	// rectangular part
	for (int col = 0; col < full_cols; col++) {
		dst[0] = blk0[2];
		dst[1] = blk0[3];
		dst[2] = blk1[0];

		blk0 += 4;
		blk1 += 4;
		dst += 4;
	}

	// 2x2 triangle, first column
	dst[1] = blk0[3];
	dst[2] = blk1[0];

	// second column
	dst[4 + 2] = blk1[4 + 0];
}
690
691
692
// Copy the lower-triangular tail of a 3-row strip whose source starts 3 rows
// into the block at A0 (row 3 of A0, then rows 0..1 of the block located
// 4*sda floats after A0).
// kmax+1 full 3-row columns are copied first, followed by a 2x2 triangle:
// rows 1..2 of the next column and row 2 of the one after it.
// The fourth float of every destination column and the entries above the
// triangle are not written.
//   kmax - one less than the number of full columns; no-op when kmax+1 <= 0
//   A0   - first source block, consecutive columns 4 floats apart
//   sda  - distance between the two source blocks, in units of 4 floats
//   B    - destination, consecutive columns 4 floats apart
void kernel_strcp_l_3_3_lib4(int kmax, float *A0, int sda, float *B)
{
	const int full_cols = kmax + 1;
	if (full_cols <= 0) {
		return;
	}

	const float *blk0 = A0;
	const float *blk1 = A0 + 4 * sda;
	float *dst = B;

	// rectangular part
	for (int col = 0; col < full_cols; col++) {
		dst[0] = blk0[3];
		dst[1] = blk1[0];
		dst[2] = blk1[1];

		blk0 += 4;
		blk1 += 4;
		dst += 4;
	}

	// 2x2 triangle: with a 3-row shift every entry lives in the second block
	dst[1] = blk1[0];
	dst[2] = blk1[1];

	dst[4 + 2] = blk1[4 + 1];
}
732
733
734
// Copy the lower-triangular tail of a 2-row strip from A to B.
// kmax+1 full 2-row columns are copied first, followed by a 1x1 triangle:
// row 1 of the next column. The last two floats of every column and row 0 of
// that final column are not written.
//   kmax  - one less than the number of full columns; no-op when kmax+1 <= 0
//   alpha - never read by this function; kept so the signature is unchanged
//   A     - source, consecutive columns 4 floats apart
//   B     - destination, same layout
void kernel_strcp_l_2_0_lib4(int kmax, float alpha, float *A, float *B)
{
	(void)alpha;

	const int full_cols = kmax + 1;
	if (full_cols <= 0) {
		return;
	}

	const float *src = A;
	float *dst = B;

	// rectangular part
	for (int col = 0; col < full_cols; col++) {
		for (int row = 0; row < 2; row++) {
			dst[row] = src[row];
		}
		src += 4;
		dst += 4;
	}

	// 1x1 triangle
	dst[1] = src[1];
}
767
768
769
// Copy the lower-triangular tail of a 2-row strip whose source starts 3 rows
// into the block at A0 (row 3 of A0, then row 0 of the block located
// 4*sda floats after A0).
// kmax+1 full 2-row columns are copied first, followed by a 1x1 triangle:
// row 1 of the next column. The last two floats of every destination column
// and row 0 of that final column are not written.
//   kmax - one less than the number of full columns; no-op when kmax+1 <= 0
//   A0   - first source block, consecutive columns 4 floats apart
//   sda  - distance between the two source blocks, in units of 4 floats
//   B    - destination, consecutive columns 4 floats apart
void kernel_strcp_l_2_3_lib4(int kmax, float *A0, int sda, float *B)
{
	const int full_cols = kmax + 1;
	if (full_cols <= 0) {
		return;
	}

	const float *blk0 = A0;
	const float *blk1 = A0 + 4 * sda;
	float *dst = B;

	// rectangular part
	for (int col = 0; col < full_cols; col++) {
		dst[0] = blk0[3];
		dst[1] = blk1[0];

		blk0 += 4;
		blk1 += 4;
		dst += 4;
	}

	// 1x1 triangle
	dst[1] = blk1[0];
}
805
806
807
// Copy the tail of a 1-row strip from A to B: B[4*k] = A[4*k] for
// k = 0..kmax (kmax+1 columns, no trailing triangle).
// The other three floats of every destination column are left untouched.
//   kmax - one less than the number of columns; no-op when kmax+1 <= 0
//   A    - source, consecutive columns 4 floats apart
//   B    - destination, same layout
void kernel_strcp_l_1_0_lib4(int kmax, float *A, float *B)
{
	const int full_cols = kmax + 1;
	const float *src = A;
	float *dst = B;

	for (int cols_left = full_cols; cols_left > 0; cols_left--) {
		*dst = *src;
		src += 4;
		dst += 4;
	}
}
835
836
837
838
// Scaled accumulate on a 4-row strip:
// B[i + 4*k] += alphap[0] * A[i + 4*k] for i = 0..3, k = 0..kmax-1.
//   kmax   - number of columns to process; no-op when kmax <= 0
//   alphap - pointer to the scale factor, read once and only when kmax > 0
//   A      - source, consecutive columns 4 floats apart
//   B      - accumulator, same layout, updated in place
void kernel_sgead_4_0_lib4(int kmax, float *alphap, float *A, float *B)
{
	if (kmax <= 0) {
		return;
	}

	const float scale = *alphap;
	const float *src = A;
	float *dst = B;

	for (int cols_left = kmax; cols_left > 0; cols_left--) {
		for (int row = 0; row < 4; row++) {
			dst[row] += scale * src[row];
		}
		src += 4;
		dst += 4;
	}
}
866
867
868
// Scaled accumulate of a 4-row strip whose source starts 1 row into the
// block at A0. For every column k = 0..kmax-1, rows 0..2 of B receive
// alphap[0] times rows 1..3 of A0 and row 3 of B receives alphap[0] times
// row 0 of the block located 4*sda floats after A0.
//   kmax   - number of columns to process; no-op when kmax <= 0
//   alphap - pointer to the scale factor, read once and only when kmax > 0
//   A0     - first source block, consecutive columns 4 floats apart
//   sda    - distance between the two source blocks, in units of 4 floats
//   B      - accumulator, consecutive columns 4 floats apart
void kernel_sgead_4_1_lib4(int kmax, float *alphap, float *A0, int sda, float *B)
{
	if (kmax <= 0) {
		return;
	}

	const float scale = *alphap;
	const float *blk0 = A0;
	const float *blk1 = A0 + 4 * sda;
	float *dst = B;

	for (int col = 0; col < kmax; col++) {
		dst[0] += scale * blk0[1];
		dst[1] += scale * blk0[2];
		dst[2] += scale * blk0[3];
		dst[3] += scale * blk1[0];

		blk0 += 4;
		blk1 += 4;
		dst += 4;
	}
}
899
900
901
// Scaled accumulate of a 4-row strip whose source starts 2 rows into the
// block at A0. For every column k = 0..kmax-1, rows 0..1 of B receive
// alphap[0] times rows 2..3 of A0 and rows 2..3 of B receive alphap[0] times
// rows 0..1 of the block located 4*sda floats after A0.
//   kmax   - number of columns to process; no-op when kmax <= 0
//   alphap - pointer to the scale factor, read once and only when kmax > 0
//   A0     - first source block, consecutive columns 4 floats apart
//   sda    - distance between the two source blocks, in units of 4 floats
//   B      - accumulator, consecutive columns 4 floats apart
void kernel_sgead_4_2_lib4(int kmax, float *alphap, float *A0, int sda, float *B)
{
	if (kmax <= 0) {
		return;
	}

	const float scale = *alphap;
	const float *blk0 = A0;
	const float *blk1 = A0 + 4 * sda;
	float *dst = B;

	for (int col = 0; col < kmax; col++) {
		dst[0] += scale * blk0[2];
		dst[1] += scale * blk0[3];
		dst[2] += scale * blk1[0];
		dst[3] += scale * blk1[1];

		blk0 += 4;
		blk1 += 4;
		dst += 4;
	}
}
932
933
934
// Scaled accumulate of a 4-row strip whose source starts 3 rows into the
// block at A0. For every column k = 0..kmax-1, row 0 of B receives
// alphap[0] times row 3 of A0 and rows 1..3 of B receive alphap[0] times
// rows 0..2 of the block located 4*sda floats after A0.
//   kmax   - number of columns to process; no-op when kmax <= 0
//   alphap - pointer to the scale factor, read once and only when kmax > 0
//   A0     - first source block, consecutive columns 4 floats apart
//   sda    - distance between the two source blocks, in units of 4 floats
//   B      - accumulator, consecutive columns 4 floats apart
void kernel_sgead_4_3_lib4(int kmax, float *alphap, float *A0, int sda, float *B)
{
	if (kmax <= 0) {
		return;
	}

	const float scale = *alphap;
	const float *blk0 = A0;
	const float *blk1 = A0 + 4 * sda;
	float *dst = B;

	for (int col = 0; col < kmax; col++) {
		dst[0] += scale * blk0[3];
		dst[1] += scale * blk1[0];
		dst[2] += scale * blk1[1];
		dst[3] += scale * blk1[2];

		blk0 += 4;
		blk1 += 4;
		dst += 4;
	}
}
965
966
967
// Scaled accumulate on the first 3 rows of a 4-row strip:
// B[i + 4*k] += alphap[0] * A[i + 4*k] for i = 0..2, k = 0..kmax-1.
// The fourth float of every column of B is left untouched.
//   kmax   - number of columns to process; no-op when kmax <= 0
//   alphap - pointer to the scale factor, read once and only when kmax > 0
//   A      - source, consecutive columns 4 floats apart
//   B      - accumulator, same layout, updated in place
void kernel_sgead_3_0_lib4(int kmax, float *alphap, float *A, float *B)
{
	if (kmax <= 0) {
		return;
	}

	const float scale = *alphap;
	const float *src = A;
	float *dst = B;

	for (int cols_left = kmax; cols_left > 0; cols_left--) {
		for (int row = 0; row < 3; row++) {
			dst[row] += scale * src[row];
		}
		src += 4;
		dst += 4;
	}
}
994
995
996
// Scaled accumulate of 3 rows whose source starts 2 rows into the block at
// A0. For every column k = 0..kmax-1, rows 0..1 of B receive alphap[0] times
// rows 2..3 of A0 and row 2 of B receives alphap[0] times row 0 of the block
// located 4*sda floats after A0. The fourth float of every column of B is
// left untouched.
//   kmax   - number of columns to process; no-op when kmax <= 0
//   alphap - pointer to the scale factor, read once and only when kmax > 0
//   A0     - first source block, consecutive columns 4 floats apart
//   sda    - distance between the two source blocks, in units of 4 floats
//   B      - accumulator, consecutive columns 4 floats apart
void kernel_sgead_3_2_lib4(int kmax, float *alphap, float *A0, int sda, float *B)
{
	if (kmax <= 0) {
		return;
	}

	const float scale = *alphap;
	const float *blk0 = A0;
	const float *blk1 = A0 + 4 * sda;
	float *dst = B;

	for (int col = 0; col < kmax; col++) {
		dst[0] += scale * blk0[2];
		dst[1] += scale * blk0[3];
		dst[2] += scale * blk1[0];

		blk0 += 4;
		blk1 += 4;
		dst += 4;
	}
}
1026
1027
1028
// Scaled accumulate of 3 rows whose source starts 3 rows into the block at
// A0. For every column k = 0..kmax-1, row 0 of B receives alphap[0] times
// row 3 of A0 and rows 1..2 of B receive alphap[0] times rows 0..1 of the
// block located 4*sda floats after A0. The fourth float of every column of B
// is left untouched.
//   kmax   - number of columns to process; no-op when kmax <= 0
//   alphap - pointer to the scale factor, read once and only when kmax > 0
//   A0     - first source block, consecutive columns 4 floats apart
//   sda    - distance between the two source blocks, in units of 4 floats
//   B      - accumulator, consecutive columns 4 floats apart
void kernel_sgead_3_3_lib4(int kmax, float *alphap, float *A0, int sda, float *B)
{
	if (kmax <= 0) {
		return;
	}

	const float scale = *alphap;
	const float *blk0 = A0;
	const float *blk1 = A0 + 4 * sda;
	float *dst = B;

	for (int col = 0; col < kmax; col++) {
		dst[0] += scale * blk0[3];
		dst[1] += scale * blk1[0];
		dst[2] += scale * blk1[1];

		blk0 += 4;
		blk1 += 4;
		dst += 4;
	}
}
1058
1059
1060
// Scaled accumulate on the first 2 rows of a 4-row strip:
// B[i + 4*k] += alphap[0] * A[i + 4*k] for i = 0..1, k = 0..kmax-1.
// The last two floats of every column of B are left untouched.
//   kmax   - number of columns to process; no-op when kmax <= 0
//   alphap - pointer to the scale factor, read once and only when kmax > 0
//   A      - source, consecutive columns 4 floats apart
//   B      - accumulator, same layout, updated in place
void kernel_sgead_2_0_lib4(int kmax, float *alphap, float *A, float *B)
{
	if (kmax <= 0) {
		return;
	}

	const float scale = *alphap;
	const float *src = A;
	float *dst = B;

	for (int cols_left = kmax; cols_left > 0; cols_left--) {
		for (int row = 0; row < 2; row++) {
			dst[row] += scale * src[row];
		}
		src += 4;
		dst += 4;
	}
}
1086
1087
1088
// Scaled accumulate of 2 rows whose source starts 3 rows into the block at
// A0. For every column k = 0..kmax-1, row 0 of B receives alphap[0] times
// row 3 of A0 and row 1 of B receives alphap[0] times row 0 of the block
// located 4*sda floats after A0. The last two floats of every column of B
// are left untouched.
//   kmax   - number of columns to process; no-op when kmax <= 0
//   alphap - pointer to the scale factor, read once and only when kmax > 0
//   A0     - first source block, consecutive columns 4 floats apart
//   sda    - distance between the two source blocks, in units of 4 floats
//   B      - accumulator, consecutive columns 4 floats apart
void kernel_sgead_2_3_lib4(int kmax, float *alphap, float *A0, int sda, float *B)
{
	if (kmax <= 0) {
		return;
	}

	const float scale = *alphap;
	const float *blk0 = A0;
	const float *blk1 = A0 + 4 * sda;
	float *dst = B;

	for (int col = 0; col < kmax; col++) {
		dst[0] += scale * blk0[3];
		dst[1] += scale * blk1[0];

		blk0 += 4;
		blk1 += 4;
		dst += 4;
	}
}
1117
1118
1119
// Scaled accumulate on a single row of a 4-row strip:
// B[4*k] += alphap[0] * A[4*k] for k = 0..kmax-1.
// The other three floats of every column of B are left untouched.
//   kmax   - number of columns to process; no-op when kmax <= 0
//   alphap - pointer to the scale factor, read once and only when kmax > 0
//   A      - source, consecutive columns 4 floats apart
//   B      - accumulator, same layout, updated in place
void kernel_sgead_1_0_lib4(int kmax, float *alphap, float *A, float *B)
{
	if (kmax <= 0) {
		return;
	}

	const float scale = *alphap;
	const float *src = A;
	float *dst = B;

	for (int cols_left = kmax; cols_left > 0; cols_left--) {
		*dst += scale * *src;
		src += 4;
		dst += 4;
	}
}
1144
1145
1146
1147
1148