Blame - kernel/c99/kernel_sgecp_lib4.c - RealtimeRoboticsGroup/test

blob: de5b7048beb042bd985d32f42830abe4303b1e47 [file] [log] [blame]

Austin Schuh	9a24b37	2018-01-28 16:12:29 -0800	[diff] [blame^]	1	/**************************************************************************************************
				2	* *
				3	* This file is part of BLASFEO. *
				4	* *
				5	* BLASFEO -- BLAS For Embedded Optimization. *
				6	* Copyright (C) 2016-2017 by Gianluca Frison. *
				7	* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl. *
				8	* All rights reserved. *
				9	* *
				10	* HPMPC is free software; you can redistribute it and/or *
				11	* modify it under the terms of the GNU Lesser General Public *
				12	* License as published by the Free Software Foundation; either *
				13	* version 2.1 of the License, or (at your option) any later version. *
				14	* *
				15	* HPMPC is distributed in the hope that it will be useful, *
				16	* but WITHOUT ANY WARRANTY; without even the implied warranty of *
				17	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. *
				18	* See the GNU Lesser General Public License for more details. *
				19	* *
				20	* You should have received a copy of the GNU Lesser General Public *
				21	* License along with HPMPC; if not, write to the Free Software *
				22	* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA *
				23	* *
				24	* Author: Gianluca Frison, giaf (at) dtu.dk *
				25	* gianluca.frison (at) imtek.uni-freiburg.de *
				26	* *
				27	**************************************************************************************************/
				28
				29
				30
				31
				/*
				 * Scale a 4-float-wide strip of A in place by alphap[0].
				 *
				 * A is walked as kmax consecutive groups of bs = 4 floats; elements 0..3
				 * of every group are multiplied by the scalar.
				 *
				 * kmax   - number of groups; the call is a no-op when kmax <= 0
				 * alphap - pointer to the scale factor (only alphap[0] is read)
				 * A      - data updated in place (4*kmax floats are touched)
				 *
				 * Original note: A is expected to be aligned to a 256-bit boundary; the
				 * scalar code below does not itself depend on that.
				 */
				void kernel_sgesc_4_lib4(int kmax, float *alphap, float *A)
				{
					const int bs = 4;
					float alpha;
					int k;

					if (kmax <= 0)
						return;

					/* read the scalar only once we know there is work to do */
					alpha = alphap[0];

					for (k = 0; k < kmax; k++) {
						A[0] *= alpha;
						A[1] *= alpha;
						A[2] *= alpha;
						A[3] *= alpha;

						A += bs;
					}
				}
				58
				59
				60
				/*
				 * Scale 3 out of every 4 floats of A in place by alphap[0].
				 *
				 * A is walked as kmax consecutive groups of bs = 4 floats; elements 0..2
				 * of every group are multiplied by the scalar, element 3 is left alone.
				 *
				 * kmax   - number of groups; the call is a no-op when kmax <= 0
				 * alphap - pointer to the scale factor (only alphap[0] is read)
				 * A      - data updated in place
				 */
				void kernel_sgesc_3_lib4(int kmax, float *alphap, float *A)
				{
					const int bs = 4;
					float alpha;
					int k;

					if (kmax <= 0)
						return;

					alpha = alphap[0];

					for (k = 0; k < kmax; k++) {
						A[0] *= alpha;
						A[1] *= alpha;
						A[2] *= alpha;

						A += bs;
					}
				}
				85
				86
				87
				/*
				 * Scale 2 out of every 4 floats of A in place by alphap[0].
				 *
				 * A is walked as kmax consecutive groups of bs = 4 floats; elements 0..1
				 * of every group are multiplied by the scalar, elements 2..3 are left
				 * alone.
				 *
				 * kmax   - number of groups; the call is a no-op when kmax <= 0
				 * alphap - pointer to the scale factor (only alphap[0] is read)
				 * A      - data updated in place
				 */
				void kernel_sgesc_2_lib4(int kmax, float *alphap, float *A)
				{
					const int bs = 4;
					float alpha;
					int k;

					if (kmax <= 0)
						return;

					alpha = alphap[0];

					for (k = 0; k < kmax; k++) {
						A[0] *= alpha;
						A[1] *= alpha;

						A += bs;
					}
				}
				111
				112
				113
				/*
				 * Scale 1 out of every 4 floats of A in place by alphap[0].
				 *
				 * A is walked as kmax consecutive groups of bs = 4 floats; only element
				 * 0 of every group is multiplied by the scalar.
				 *
				 * kmax   - number of groups; the call is a no-op when kmax <= 0
				 * alphap - pointer to the scale factor (only alphap[0] is read)
				 * A      - data updated in place
				 */
				void kernel_sgesc_1_lib4(int kmax, float *alphap, float *A)
				{
					const int bs = 4;
					float alpha;
					int k;

					if (kmax <= 0)
						return;

					alpha = alphap[0];

					for (k = 0; k < kmax; k++) {
						A[0] *= alpha;

						A += bs;
					}
				}
				136
				137
				138
				/*
				 * Copy a 4-float-wide strip from A to B, no row offset.
				 *
				 * Both arrays are walked as kmax consecutive groups of bs = 4 floats;
				 * elements 0..3 of each group of A are copied to the same positions in B.
				 *
				 * kmax - number of groups; the call is a no-op when kmax <= 0
				 * A    - source
				 * B    - destination
				 *
				 * Original note: both A and B are expected to be aligned to 256-bit
				 * boundaries; the scalar code below does not itself depend on that.
				 */
				void kernel_sgecp_4_0_lib4(int kmax, float *A, float *B)
				{
					const int bs = 4;
					int k;

					if (kmax <= 0)
						return;

					for (k = 0; k < kmax; k++) {
						B[0] = A[0];
						B[1] = A[1];
						B[2] = A[2];
						B[3] = A[3];

						A += bs;
						B += bs;
					}
				}
				164
				165
				166
				/*
				 * Copy a 4-float-wide strip from A to B, skipping 1 leading source
				 * element per group.
				 *
				 * For each of kmax groups of bs = 4 floats, B receives elements 1..3 of
				 * the A0 group followed by element 0 of the A1 group, where
				 * A1 = A0 + bs*sda.
				 *
				 * kmax - number of groups; the call is a no-op when kmax <= 0
				 * A0   - first source block
				 * sda  - offset of the second source block, in units of bs floats
				 *        (presumably the panel stride of the source matrix -- confirm
				 *        against callers)
				 * B    - destination
				 */
				void kernel_sgecp_4_1_lib4(int kmax, float *A0, int sda, float *B)
				{
					const int bs = 4;
					float *A1;
					int k;

					if (kmax <= 0)
						return;

					A1 = A0 + bs * sda;

					for (k = 0; k < kmax; k++) {
						B[0] = A0[1];
						B[1] = A0[2];
						B[2] = A0[3];
						B[3] = A1[0];

						A0 += bs;
						A1 += bs;
						B += bs;
					}
				}
				195
				196
				197
				/*
				 * Copy a 4-float-wide strip from A to B, skipping 2 leading source
				 * elements per group.
				 *
				 * For each of kmax groups of bs = 4 floats, B receives elements 2..3 of
				 * the A0 group followed by elements 0..1 of the A1 group, where
				 * A1 = A0 + bs*sda.
				 *
				 * kmax - number of groups; the call is a no-op when kmax <= 0
				 * A0   - first source block
				 * sda  - offset of the second source block, in units of bs floats
				 *        (presumably the panel stride of the source matrix -- confirm
				 *        against callers)
				 * B    - destination
				 */
				void kernel_sgecp_4_2_lib4(int kmax, float *A0, int sda, float *B)
				{
					const int bs = 4;
					float *A1;
					int k;

					if (kmax <= 0)
						return;

					A1 = A0 + bs * sda;

					for (k = 0; k < kmax; k++) {
						B[0] = A0[2];
						B[1] = A0[3];
						B[2] = A1[0];
						B[3] = A1[1];

						A0 += bs;
						A1 += bs;
						B += bs;
					}
				}
				226
				227
				228
				/*
				 * Copy a 4-float-wide strip from A to B, skipping 3 leading source
				 * elements per group.
				 *
				 * For each of kmax groups of bs = 4 floats, B receives element 3 of the
				 * A0 group followed by elements 0..2 of the A1 group, where
				 * A1 = A0 + bs*sda.
				 *
				 * kmax - number of groups; the call is a no-op when kmax <= 0
				 * A0   - first source block
				 * sda  - offset of the second source block, in units of bs floats
				 *        (presumably the panel stride of the source matrix -- confirm
				 *        against callers)
				 * B    - destination
				 */
				void kernel_sgecp_4_3_lib4(int kmax, float *A0, int sda, float *B)
				{
					const int bs = 4;
					float *A1;
					int k;

					if (kmax <= 0)
						return;

					A1 = A0 + bs * sda;

					for (k = 0; k < kmax; k++) {
						B[0] = A0[3];
						B[1] = A1[0];
						B[2] = A1[1];
						B[3] = A1[2];

						A0 += bs;
						A1 += bs;
						B += bs;
					}
				}
				257
				258
				259
				/*
				 * Copy 3 out of every 4 floats from A to B, no row offset.
				 *
				 * Both arrays are walked as kmax consecutive groups of bs = 4 floats;
				 * elements 0..2 of each group of A are copied to the same positions in
				 * B, element 3 of B is not written.
				 *
				 * kmax - number of groups; the call is a no-op when kmax <= 0
				 * A    - source
				 * B    - destination
				 *
				 * Original note: both A and B are expected to be aligned to 64-bit
				 * boundaries.
				 */
				void kernel_sgecp_3_0_lib4(int kmax, float *A, float *B)
				{
					const int bs = 4;
					int k;

					if (kmax <= 0)
						return;

					for (k = 0; k < kmax; k++) {
						B[0] = A[0];
						B[1] = A[1];
						B[2] = A[2];

						A += bs;
						B += bs;
					}
				}
				284
				285
				286
				/*
				 * Copy 3 floats per group from A to B, skipping 2 leading source
				 * elements per group.
				 *
				 * For each of kmax groups of bs = 4 floats, B[0..2] receive elements
				 * 2..3 of the A0 group followed by element 0 of the A1 group, where
				 * A1 = A0 + bs*sda. Element 3 of each B group is not written.
				 *
				 * kmax - number of groups; the call is a no-op when kmax <= 0
				 * A0   - first source block
				 * sda  - offset of the second source block, in units of bs floats
				 *        (presumably the panel stride of the source matrix -- confirm
				 *        against callers)
				 * B    - destination
				 */
				void kernel_sgecp_3_2_lib4(int kmax, float *A0, int sda, float *B)
				{
					const int bs = 4;
					float *A1;
					int k;

					if (kmax <= 0)
						return;

					A1 = A0 + bs * sda;

					for (k = 0; k < kmax; k++) {
						B[0] = A0[2];
						B[1] = A0[3];
						B[2] = A1[0];

						A0 += bs;
						A1 += bs;
						B += bs;
					}
				}
				314
				315
				316
				/*
				 * Copy 3 floats per group from A to B, skipping 3 leading source
				 * elements per group.
				 *
				 * For each of kmax groups of bs = 4 floats, B[0..2] receive element 3
				 * of the A0 group followed by elements 0..1 of the A1 group, where
				 * A1 = A0 + bs*sda. Element 3 of each B group is not written.
				 *
				 * kmax - number of groups; the call is a no-op when kmax <= 0
				 * A0   - first source block
				 * sda  - offset of the second source block, in units of bs floats
				 *        (presumably the panel stride of the source matrix -- confirm
				 *        against callers)
				 * B    - destination
				 */
				void kernel_sgecp_3_3_lib4(int kmax, float *A0, int sda, float *B)
				{
					const int bs = 4;
					float *A1;
					int k;

					if (kmax <= 0)
						return;

					A1 = A0 + bs * sda;

					for (k = 0; k < kmax; k++) {
						B[0] = A0[3];
						B[1] = A1[0];
						B[2] = A1[1];

						A0 += bs;
						A1 += bs;
						B += bs;
					}
				}
				344
				345
				346
				/*
				 * Copy 2 out of every 4 floats from A to B, no row offset.
				 *
				 * Both arrays are walked as kmax consecutive groups of bs = 4 floats;
				 * elements 0..1 of each group of A are copied to the same positions in
				 * B, elements 2..3 of B are not written.
				 *
				 * kmax - number of groups; the call is a no-op when kmax <= 0
				 * A    - source
				 * B    - destination
				 *
				 * Original note: both A and B are expected to be aligned to 64-bit
				 * boundaries.
				 */
				void kernel_sgecp_2_0_lib4(int kmax, float *A, float *B)
				{
					const int bs = 4;
					int k;

					if (kmax <= 0)
						return;

					for (k = 0; k < kmax; k++) {
						B[0] = A[0];
						B[1] = A[1];

						A += bs;
						B += bs;
					}
				}
				370
				371
				372
				/*
				 * Copy 2 floats per group from A to B, skipping 3 leading source
				 * elements per group.
				 *
				 * For each of kmax groups of bs = 4 floats, B[0] receives element 3 of
				 * the A0 group and B[1] element 0 of the A1 group, where
				 * A1 = A0 + bs*sda. Elements 2..3 of each B group are not written.
				 *
				 * kmax - number of groups; the call is a no-op when kmax <= 0
				 * A0   - first source block
				 * sda  - offset of the second source block, in units of bs floats
				 *        (presumably the panel stride of the source matrix -- confirm
				 *        against callers)
				 * B    - destination
				 */
				void kernel_sgecp_2_3_lib4(int kmax, float *A0, int sda, float *B)
				{
					const int bs = 4;
					float *A1;
					int k;

					if (kmax <= 0)
						return;

					A1 = A0 + bs * sda;

					for (k = 0; k < kmax; k++) {
						B[0] = A0[3];
						B[1] = A1[0];

						A0 += bs;
						A1 += bs;
						B += bs;
					}
				}
				399
				400
				401
				/*
				 * Copy 1 out of every 4 floats from A to B, no row offset.
				 *
				 * Both arrays are walked as kmax consecutive groups of bs = 4 floats;
				 * only element 0 of each group is copied.
				 *
				 * kmax - number of groups; the call is a no-op when kmax <= 0
				 * A    - source
				 * B    - destination
				 *
				 * Original note: both A and B are expected to be aligned to 64-bit
				 * boundaries.
				 */
				void kernel_sgecp_1_0_lib4(int kmax, float *A, float *B)
				{
					const int bs = 4;
					int k;

					if (kmax <= 0)
						return;

					for (k = 0; k < kmax; k++) {
						B[0] = A[0];

						A += bs;
						B += bs;
					}
				}
				424
				425
				426
				/*
				 * Copy a 4-wide strip plus a trailing 3x3 triangle from A to B, no row
				 * offset.
				 *
				 * Per the original note, the operands are lower triangular: the copy
				 * consists of kmax+1 full groups of bs = 4 floats, followed by a
				 * triangle spread over the next three groups (elements 1..3 of the
				 * first, 2..3 of the second, 3 of the third).
				 *
				 * kmax - number of full groups minus one; nothing is copied when
				 *        kmax + 1 <= 0
				 * A    - source
				 * B    - destination
				 *
				 * Original note: both A and B are expected to be aligned to 256-bit
				 * boundaries.
				 */
				void kernel_strcp_l_4_0_lib4(int kmax, float *A, float *B)
				{
					const int bs = 4;
					int k;

					kmax += 1;

					if (kmax <= 0)
						return;

					/* rectangular part: kmax+1 full groups */
					for (k = 0; k < kmax; k++) {
						B[0] = A[0];
						B[1] = A[1];
						B[2] = A[2];
						B[3] = A[3];

						A += bs;
						B += bs;
					}

					/* 3x3 triangle */
					B[1 + bs * 0] = A[1 + bs * 0];
					B[2 + bs * 0] = A[2 + bs * 0];
					B[3 + bs * 0] = A[3 + bs * 0];

					B[2 + bs * 1] = A[2 + bs * 1];
					B[3 + bs * 1] = A[3 + bs * 1];

					B[3 + bs * 2] = A[3 + bs * 2];
				}
				468
				469
				470
				/*
				 * Copy a 4-wide strip plus a trailing 3x3 triangle from A to B, with the
				 * source shifted by 1 element.
				 *
				 * Per the original note, the operands are lower triangular: kmax+1 full
				 * groups of bs = 4 floats are copied (elements 1..3 of A0 then element 0
				 * of A1, with A1 = A0 + bs*sda), followed by a triangle spread over the
				 * next three groups using the same shifted source positions.
				 *
				 * kmax - number of full groups minus one; nothing is copied when
				 *        kmax + 1 <= 0
				 * A0   - first source block
				 * sda  - offset of the second source block, in units of bs floats
				 *        (presumably the panel stride of the source matrix -- confirm
				 *        against callers)
				 * B    - destination
				 */
				void kernel_strcp_l_4_1_lib4(int kmax, float *A0, int sda, float *B)
				{
					const int bs = 4;
					float *A1;
					int k;

					kmax += 1;

					if (kmax <= 0)
						return;

					A1 = A0 + bs * sda;

					/* rectangular part: kmax+1 full groups */
					for (k = 0; k < kmax; k++) {
						B[0] = A0[1];
						B[1] = A0[2];
						B[2] = A0[3];
						B[3] = A1[0];

						A0 += bs;
						A1 += bs;
						B += bs;
					}

					/* 3x3 triangle */
					B[1 + 0 * bs] = A0[2 + 0 * bs];
					B[2 + 0 * bs] = A0[3 + 0 * bs];
					B[3 + 0 * bs] = A1[0 + 0 * bs];

					B[2 + 1 * bs] = A0[3 + 1 * bs];
					B[3 + 1 * bs] = A1[0 + 1 * bs];

					B[3 + 2 * bs] = A1[0 + 2 * bs];
				}
				515
				516
				517
				/*
				 * Copy a 4-wide strip plus a trailing 3x3 triangle from A to B, with the
				 * source shifted by 2 elements.
				 *
				 * Per the original note, the operands are lower triangular: kmax+1 full
				 * groups of bs = 4 floats are copied (elements 2..3 of A0 then elements
				 * 0..1 of A1, with A1 = A0 + bs*sda), followed by a triangle spread over
				 * the next three groups using the same shifted source positions.
				 *
				 * kmax - number of full groups minus one; nothing is copied when
				 *        kmax + 1 <= 0
				 * A0   - first source block
				 * sda  - offset of the second source block, in units of bs floats
				 *        (presumably the panel stride of the source matrix -- confirm
				 *        against callers)
				 * B    - destination
				 */
				void kernel_strcp_l_4_2_lib4(int kmax, float *A0, int sda, float *B)
				{
					const int bs = 4;
					float *A1;
					int k;

					kmax += 1;

					if (kmax <= 0)
						return;

					A1 = A0 + bs * sda;

					/* rectangular part: kmax+1 full groups */
					for (k = 0; k < kmax; k++) {
						B[0] = A0[2];
						B[1] = A0[3];
						B[2] = A1[0];
						B[3] = A1[1];

						A0 += bs;
						A1 += bs;
						B += bs;
					}

					/* 3x3 triangle */
					B[1 + bs * 0] = A0[3 + bs * 0];
					B[2 + bs * 0] = A1[0 + bs * 0];
					B[3 + bs * 0] = A1[1 + bs * 0];

					B[2 + bs * 1] = A1[0 + bs * 1];
					B[3 + bs * 1] = A1[1 + bs * 1];

					B[3 + bs * 2] = A1[1 + bs * 2];
				}
				562
				563
				564
				/*
				 * Copy a 4-wide strip plus a trailing 3x3 triangle from A to B, with the
				 * source shifted by 3 elements.
				 *
				 * Per the original note, the operands are lower triangular: kmax+1 full
				 * groups of bs = 4 floats are copied (element 3 of A0 then elements 0..2
				 * of A1, with A1 = A0 + bs*sda), followed by a triangle spread over the
				 * next three groups, which is read from A1 only.
				 *
				 * kmax - number of full groups minus one; nothing is copied when
				 *        kmax + 1 <= 0
				 * A0   - first source block
				 * sda  - offset of the second source block, in units of bs floats
				 *        (presumably the panel stride of the source matrix -- confirm
				 *        against callers)
				 * B    - destination
				 */
				void kernel_strcp_l_4_3_lib4(int kmax, float *A0, int sda, float *B)
				{
					const int bs = 4;
					float *A1;
					int k;

					kmax += 1;

					if (kmax <= 0)
						return;

					A1 = A0 + bs * sda;

					/* rectangular part: kmax+1 full groups */
					for (k = 0; k < kmax; k++) {
						B[0] = A0[3];
						B[1] = A1[0];
						B[2] = A1[1];
						B[3] = A1[2];

						A0 += bs;
						A1 += bs;
						B += bs;
					}

					/* 3x3 triangle */
					B[1 + bs * 0] = A1[0 + bs * 0];
					B[2 + bs * 0] = A1[1 + bs * 0];
					B[3 + bs * 0] = A1[2 + bs * 0];

					B[2 + bs * 1] = A1[1 + bs * 1];
					B[3 + bs * 1] = A1[2 + bs * 1];

					B[3 + bs * 2] = A1[2 + bs * 2];
				}
				609
				610
				611
				/*
				 * Copy a 3-wide strip plus a trailing 2x2 triangle from A to B, no row
				 * offset.
				 *
				 * Per the original note, the operands are lower triangular: kmax+1
				 * groups of bs = 4 floats have elements 0..2 copied, followed by a
				 * triangle spread over the next two groups (elements 1..2 of the first,
				 * element 2 of the second).
				 *
				 * kmax - number of full groups minus one; nothing is copied when
				 *        kmax + 1 <= 0
				 * A    - source
				 * B    - destination
				 *
				 * Original note: both A and B are expected to be aligned to 64-bit
				 * boundaries.
				 */
				void kernel_strcp_l_3_0_lib4(int kmax, float *A, float *B)
				{
					const int bs = 4;
					int k;

					kmax += 1;

					if (kmax <= 0)
						return;

					/* rectangular part: kmax+1 groups, 3 elements each */
					for (k = 0; k < kmax; k++) {
						B[0] = A[0];
						B[1] = A[1];
						B[2] = A[2];

						A += bs;
						B += bs;
					}

					/* 2x2 triangle */
					B[1 + bs * 0] = A[1 + bs * 0];
					B[2 + bs * 0] = A[2 + bs * 0];

					B[2 + bs * 1] = A[2 + bs * 1];
				}
				648
				649
				650
				/*
				 * Copy a 3-wide strip plus a trailing 2x2 triangle from A to B, with the
				 * source shifted by 2 elements.
				 *
				 * Per the original note, the operands are lower triangular: kmax+1
				 * groups of bs = 4 floats receive elements 2..3 of A0 then element 0 of
				 * A1 (A1 = A0 + bs*sda), followed by a triangle spread over the next two
				 * groups using the same shifted source positions.
				 *
				 * kmax - number of full groups minus one; nothing is copied when
				 *        kmax + 1 <= 0
				 * A0   - first source block
				 * sda  - offset of the second source block, in units of bs floats
				 *        (presumably the panel stride of the source matrix -- confirm
				 *        against callers)
				 * B    - destination
				 */
				void kernel_strcp_l_3_2_lib4(int kmax, float *A0, int sda, float *B)
				{
					const int bs = 4;
					float *A1;
					int k;

					kmax += 1;

					if (kmax <= 0)
						return;

					A1 = A0 + bs * sda;

					/* rectangular part: kmax+1 groups, 3 elements each */
					for (k = 0; k < kmax; k++) {
						B[0] = A0[2];
						B[1] = A0[3];
						B[2] = A1[0];

						A0 += bs;
						A1 += bs;
						B += bs;
					}

					/* 2x2 triangle */
					B[1 + bs * 0] = A0[3 + bs * 0];
					B[2 + bs * 0] = A1[0 + bs * 0];

					B[2 + bs * 1] = A1[0 + bs * 1];
				}
				690
				691
				692
				/*
				 * Copy a 3-wide strip plus a trailing 2x2 triangle from A to B, with the
				 * source shifted by 3 elements.
				 *
				 * Per the original note, the operands are lower triangular: kmax+1
				 * groups of bs = 4 floats receive element 3 of A0 then elements 0..1 of
				 * A1 (A1 = A0 + bs*sda), followed by a triangle spread over the next two
				 * groups, which is read from A1 only.
				 *
				 * kmax - number of full groups minus one; nothing is copied when
				 *        kmax + 1 <= 0
				 * A0   - first source block
				 * sda  - offset of the second source block, in units of bs floats
				 *        (presumably the panel stride of the source matrix -- confirm
				 *        against callers)
				 * B    - destination
				 */
				void kernel_strcp_l_3_3_lib4(int kmax, float *A0, int sda, float *B)
				{
					const int bs = 4;
					float *A1;
					int k;

					kmax += 1;

					if (kmax <= 0)
						return;

					A1 = A0 + bs * sda;

					/* rectangular part: kmax+1 groups, 3 elements each */
					for (k = 0; k < kmax; k++) {
						B[0] = A0[3];
						B[1] = A1[0];
						B[2] = A1[1];

						A0 += bs;
						A1 += bs;
						B += bs;
					}

					/* 2x2 triangle */
					B[1 + bs * 0] = A1[0 + bs * 0];
					B[2 + bs * 0] = A1[1 + bs * 0];

					B[2 + bs * 1] = A1[1 + bs * 1];
				}
				732
				733
				734
				/*
				 * Copy a 2-wide strip plus a trailing 1x1 triangle from A to B, no row
				 * offset.
				 *
				 * Per the original note, the operands are lower triangular: kmax+1
				 * groups of bs = 4 floats have elements 0..1 copied, followed by
				 * element 1 of the next group.
				 *
				 * kmax  - number of full groups minus one; nothing is copied when
				 *         kmax + 1 <= 0
				 * alpha - not used by this function; kept so the signature is unchanged
				 * A     - source
				 * B     - destination
				 *
				 * Original note: both A and B are expected to be aligned to 64-bit
				 * boundaries.
				 */
				void kernel_strcp_l_2_0_lib4(int kmax, float alpha, float *A, float *B)
				{
					const int bs = 4;
					int k;

					(void)alpha; /* unused, see header comment */

					kmax += 1;

					if (kmax <= 0)
						return;

					/* rectangular part: kmax+1 groups, 2 elements each */
					for (k = 0; k < kmax; k++) {
						B[0] = A[0];
						B[1] = A[1];

						A += bs;
						B += bs;
					}

					/* 1x1 triangle */
					B[1 + bs * 0] = A[1 + bs * 0];
				}
				767
				768
				769
				/*
				 * Copy a 2-wide strip plus a trailing 1x1 triangle from A to B, with the
				 * source shifted by 3 elements.
				 *
				 * Per the original note, the operands are lower triangular: kmax+1
				 * groups of bs = 4 floats receive element 3 of A0 then element 0 of A1
				 * (A1 = A0 + bs*sda), followed by one more element taken from A1 and
				 * stored in position 1 of the next group.
				 *
				 * kmax - number of full groups minus one; nothing is copied when
				 *        kmax + 1 <= 0
				 * A0   - first source block
				 * sda  - offset of the second source block, in units of bs floats
				 *        (presumably the panel stride of the source matrix -- confirm
				 *        against callers)
				 * B    - destination
				 */
				void kernel_strcp_l_2_3_lib4(int kmax, float *A0, int sda, float *B)
				{
					const int bs = 4;
					float *A1;
					int k;

					kmax += 1;

					if (kmax <= 0)
						return;

					A1 = A0 + bs * sda;

					/* rectangular part: kmax+1 groups, 2 elements each */
					for (k = 0; k < kmax; k++) {
						B[0] = A0[3];
						B[1] = A1[0];

						A0 += bs;
						A1 += bs;
						B += bs;
					}

					/* 1x1 triangle */
					B[1 + bs * 0] = A1[0 + bs * 0];
				}
				805
				806
				807
				/*
				 * Copy a 1-wide strip from A to B, no row offset.
				 *
				 * Per the original note, the operands are lower triangular: element 0 of
				 * kmax+1 consecutive groups of bs = 4 floats is copied; this width has
				 * no trailing triangle.
				 *
				 * kmax - number of groups minus one; nothing is copied when
				 *        kmax + 1 <= 0
				 * A    - source
				 * B    - destination
				 *
				 * Original note: both A and B are expected to be aligned to 64-bit
				 * boundaries.
				 */
				void kernel_strcp_l_1_0_lib4(int kmax, float *A, float *B)
				{
					const int bs = 4;
					int k;

					kmax += 1;

					if (kmax <= 0)
						return;

					for (k = 0; k < kmax; k++) {
						B[0] = A[0];

						A += bs;
						B += bs;
					}
				}
				835
				836
				837
				838
				/*
				 * Accumulate a scaled 4-float-wide strip of A into B:
				 * B += alphap[0] * A, no row offset.
				 *
				 * Both arrays are walked as kmax consecutive groups of bs = 4 floats;
				 * elements 0..3 of each group are updated.
				 *
				 * kmax   - number of groups; the call is a no-op when kmax <= 0
				 * alphap - pointer to the scale factor (only alphap[0] is read)
				 * A      - source
				 * B      - destination, updated in place
				 *
				 * Original note: both A and B are expected to be aligned to 256-bit
				 * boundaries; the scalar code below does not itself depend on that.
				 */
				void kernel_sgead_4_0_lib4(int kmax, float *alphap, float *A, float *B)
				{
					const int bs = 4;
					float alpha;
					int k;

					if (kmax <= 0)
						return;

					alpha = alphap[0];

					for (k = 0; k < kmax; k++) {
						B[0] += alpha * A[0];
						B[1] += alpha * A[1];
						B[2] += alpha * A[2];
						B[3] += alpha * A[3];

						A += bs;
						B += bs;
					}
				}
				866
				867
				868
				/*
				 * Accumulate a scaled 4-float-wide strip of A into B, with the source
				 * shifted by 1 element: B += alphap[0] * A.
				 *
				 * For each of kmax groups of bs = 4 floats, the source values are
				 * elements 1..3 of the A0 group followed by element 0 of the A1 group,
				 * where A1 = A0 + bs*sda.
				 *
				 * kmax   - number of groups; the call is a no-op when kmax <= 0
				 * alphap - pointer to the scale factor (only alphap[0] is read)
				 * A0     - first source block
				 * sda    - offset of the second source block, in units of bs floats
				 *          (presumably the panel stride of the source matrix -- confirm
				 *          against callers)
				 * B      - destination, updated in place
				 */
				void kernel_sgead_4_1_lib4(int kmax, float *alphap, float *A0, int sda, float *B)
				{
					const int bs = 4;
					float alpha;
					float *A1;
					int k;

					if (kmax <= 0)
						return;

					alpha = alphap[0];
					A1 = A0 + bs * sda;

					for (k = 0; k < kmax; k++) {
						B[0] += alpha * A0[1];
						B[1] += alpha * A0[2];
						B[2] += alpha * A0[3];
						B[3] += alpha * A1[0];

						A0 += bs;
						A1 += bs;
						B += bs;
					}
				}
				899
				900
				901
				/*
				 * Accumulate a scaled 4-float-wide strip of A into B, with the source
				 * shifted by 2 elements: B += alphap[0] * A.
				 *
				 * For each of kmax groups of bs = 4 floats, the source values are
				 * elements 2..3 of the A0 group followed by elements 0..1 of the A1
				 * group, where A1 = A0 + bs*sda.
				 *
				 * kmax   - number of groups; the call is a no-op when kmax <= 0
				 * alphap - pointer to the scale factor (only alphap[0] is read)
				 * A0     - first source block
				 * sda    - offset of the second source block, in units of bs floats
				 *          (presumably the panel stride of the source matrix -- confirm
				 *          against callers)
				 * B      - destination, updated in place
				 */
				void kernel_sgead_4_2_lib4(int kmax, float *alphap, float *A0, int sda, float *B)
				{
					const int bs = 4;
					float alpha;
					float *A1;
					int k;

					if (kmax <= 0)
						return;

					alpha = alphap[0];
					A1 = A0 + bs * sda;

					for (k = 0; k < kmax; k++) {
						B[0] += alpha * A0[2];
						B[1] += alpha * A0[3];
						B[2] += alpha * A1[0];
						B[3] += alpha * A1[1];

						A0 += bs;
						A1 += bs;
						B += bs;
					}
				}
				932
				933
				934
				/*
				 * Accumulate a scaled 4-float-wide strip of A into B, with the source
				 * shifted by 3 elements: B += alphap[0] * A.
				 *
				 * For each of kmax groups of bs = 4 floats, the source values are
				 * element 3 of the A0 group followed by elements 0..2 of the A1 group,
				 * where A1 = A0 + bs*sda.
				 *
				 * kmax   - number of groups; the call is a no-op when kmax <= 0
				 * alphap - pointer to the scale factor (only alphap[0] is read)
				 * A0     - first source block
				 * sda    - offset of the second source block, in units of bs floats
				 *          (presumably the panel stride of the source matrix -- confirm
				 *          against callers)
				 * B      - destination, updated in place
				 */
				void kernel_sgead_4_3_lib4(int kmax, float *alphap, float *A0, int sda, float *B)
				{
					const int bs = 4;
					float alpha;
					float *A1;
					int k;

					if (kmax <= 0)
						return;

					alpha = alphap[0];
					A1 = A0 + bs * sda;

					for (k = 0; k < kmax; k++) {
						B[0] += alpha * A0[3];
						B[1] += alpha * A1[0];
						B[2] += alpha * A1[1];
						B[3] += alpha * A1[2];

						A0 += bs;
						A1 += bs;
						B += bs;
					}
				}
				965
				966
				967
				/*
				 * Accumulate 3 out of every 4 scaled floats of A into B:
				 * B += alphap[0] * A, no row offset.
				 *
				 * Both arrays are walked as kmax consecutive groups of bs = 4 floats;
				 * elements 0..2 of each group are updated, element 3 of B is untouched.
				 *
				 * kmax   - number of groups; the call is a no-op when kmax <= 0
				 * alphap - pointer to the scale factor (only alphap[0] is read)
				 * A      - source
				 * B      - destination, updated in place
				 *
				 * Original note: both A and B are expected to be aligned to 64-bit
				 * boundaries.
				 */
				void kernel_sgead_3_0_lib4(int kmax, float *alphap, float *A, float *B)
				{
					const int bs = 4;
					float alpha;
					int k;

					if (kmax <= 0)
						return;

					alpha = alphap[0];

					for (k = 0; k < kmax; k++) {
						B[0] += alpha * A[0];
						B[1] += alpha * A[1];
						B[2] += alpha * A[2];

						A += bs;
						B += bs;
					}
				}
				994
				995
				996
				/*
				 * Accumulate 3 scaled floats per group of A into B, with the source
				 * shifted by 2 elements: B += alphap[0] * A.
				 *
				 * For each of kmax groups of bs = 4 floats, the source values are
				 * elements 2..3 of the A0 group followed by element 0 of the A1 group,
				 * where A1 = A0 + bs*sda. Element 3 of each B group is untouched.
				 *
				 * kmax   - number of groups; the call is a no-op when kmax <= 0
				 * alphap - pointer to the scale factor (only alphap[0] is read)
				 * A0     - first source block
				 * sda    - offset of the second source block, in units of bs floats
				 *          (presumably the panel stride of the source matrix -- confirm
				 *          against callers)
				 * B      - destination, updated in place
				 */
				void kernel_sgead_3_2_lib4(int kmax, float *alphap, float *A0, int sda, float *B)
				{
					const int bs = 4;
					float alpha;
					float *A1;
					int k;

					if (kmax <= 0)
						return;

					alpha = alphap[0];
					A1 = A0 + bs * sda;

					for (k = 0; k < kmax; k++) {
						B[0] += alpha * A0[2];
						B[1] += alpha * A0[3];
						B[2] += alpha * A1[0];

						A0 += bs;
						A1 += bs;
						B += bs;
					}
				}
				1026
				1027
				1028
				/*
				 * Accumulate 3 scaled floats per group of A into B, with the source
				 * shifted by 3 elements: B += alphap[0] * A.
				 *
				 * For each of kmax groups of bs = 4 floats, the source values are
				 * element 3 of the A0 group followed by elements 0..1 of the A1 group,
				 * where A1 = A0 + bs*sda. Element 3 of each B group is untouched.
				 *
				 * kmax   - number of groups; the call is a no-op when kmax <= 0
				 * alphap - pointer to the scale factor (only alphap[0] is read)
				 * A0     - first source block
				 * sda    - offset of the second source block, in units of bs floats
				 *          (presumably the panel stride of the source matrix -- confirm
				 *          against callers)
				 * B      - destination, updated in place
				 */
				void kernel_sgead_3_3_lib4(int kmax, float *alphap, float *A0, int sda, float *B)
				{
					const int bs = 4;
					float alpha;
					float *A1;
					int k;

					if (kmax <= 0)
						return;

					alpha = alphap[0];
					A1 = A0 + bs * sda;

					for (k = 0; k < kmax; k++) {
						B[0] += alpha * A0[3];
						B[1] += alpha * A1[0];
						B[2] += alpha * A1[1];

						A0 += bs;
						A1 += bs;
						B += bs;
					}
				}
				1058
				1059
				1060
				/*
				 * Accumulate 2 out of every 4 scaled floats of A into B:
				 * B += alphap[0] * A, no row offset.
				 *
				 * Both arrays are walked as kmax consecutive groups of bs = 4 floats;
				 * elements 0..1 of each group are updated, elements 2..3 of B are
				 * untouched.
				 *
				 * kmax   - number of groups; the call is a no-op when kmax <= 0
				 * alphap - pointer to the scale factor (only alphap[0] is read)
				 * A      - source
				 * B      - destination, updated in place
				 *
				 * Original note: both A and B are expected to be aligned to 64-bit
				 * boundaries.
				 */
				void kernel_sgead_2_0_lib4(int kmax, float *alphap, float *A, float *B)
				{
					const int bs = 4;
					float alpha;
					int k;

					if (kmax <= 0)
						return;

					alpha = alphap[0];

					for (k = 0; k < kmax; k++) {
						B[0] += alpha * A[0];
						B[1] += alpha * A[1];

						A += bs;
						B += bs;
					}
				}
				1086
				1087
				1088
				/*
				 * Accumulate 2 scaled floats per group of A into B, with the source
				 * shifted by 3 elements: B += alphap[0] * A.
				 *
				 * For each of kmax groups of bs = 4 floats, the source values are
				 * element 3 of the A0 group and element 0 of the A1 group, where
				 * A1 = A0 + bs*sda. Elements 2..3 of each B group are untouched.
				 *
				 * kmax   - number of groups; the call is a no-op when kmax <= 0
				 * alphap - pointer to the scale factor (only alphap[0] is read)
				 * A0     - first source block
				 * sda    - offset of the second source block, in units of bs floats
				 *          (presumably the panel stride of the source matrix -- confirm
				 *          against callers)
				 * B      - destination, updated in place
				 */
				void kernel_sgead_2_3_lib4(int kmax, float *alphap, float *A0, int sda, float *B)
				{
					const int bs = 4;
					float alpha;
					float *A1;
					int k;

					if (kmax <= 0)
						return;

					alpha = alphap[0];
					A1 = A0 + bs * sda;

					for (k = 0; k < kmax; k++) {
						B[0] += alpha * A0[3];
						B[1] += alpha * A1[0];

						A0 += bs;
						A1 += bs;
						B += bs;
					}
				}
				1117
				1118
				1119
				/*
				 * Accumulate 1 out of every 4 scaled floats of A into B:
				 * B += alphap[0] * A, no row offset.
				 *
				 * Both arrays are walked as kmax consecutive groups of bs = 4 floats;
				 * only element 0 of each group is updated.
				 *
				 * kmax   - number of groups; the call is a no-op when kmax <= 0
				 * alphap - pointer to the scale factor (only alphap[0] is read)
				 * A      - source
				 * B      - destination, updated in place
				 *
				 * Original note: both A and B are expected to be aligned to 64-bit
				 * boundaries.
				 */
				void kernel_sgead_1_0_lib4(int kmax, float *alphap, float *A, float *B)
				{
					const int bs = 4;
					float alpha;
					int k;

					if (kmax <= 0)
						return;

					alpha = alphap[0];

					for (k = 0; k < kmax; k++) {
						B[0] += alpha * A[0];

						A += bs;
						B += bs;
					}
				}
				1144
				1145
				1146
				1147
				1148