/**************************************************************************************************
2* *
3* This file is part of BLASFEO. *
4* *
5* BLASFEO -- BLAS For Embedded Optimization. *
6* Copyright (C) 2016-2017 by Gianluca Frison. *
7* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl. *
8* All rights reserved. *
9* *
10* HPMPC is free software; you can redistribute it and/or *
11* modify it under the terms of the GNU Lesser General Public *
12* License as published by the Free Software Foundation; either *
13* version 2.1 of the License, or (at your option) any later version. *
14* *
15* HPMPC is distributed in the hope that it will be useful, *
16* but WITHOUT ANY WARRANTY; without even the implied warranty of *
17* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. *
18* See the GNU Lesser General Public License for more details. *
19* *
20* You should have received a copy of the GNU Lesser General Public *
21* License along with HPMPC; if not, write to the Free Software *
22* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA *
23* *
24* Author: Gianluca Frison, giaf (at) dtu.dk *
25* gianluca.frison (at) imtek.uni-freiburg.de *
26* *
27**************************************************************************************************/
28
29#include <mmintrin.h>
30#include <xmmintrin.h> // SSE
31#include <emmintrin.h> // SSE2
32#include <pmmintrin.h> // SSE3
33#include <smmintrin.h> // SSE4
34#include <immintrin.h> // AVX
35
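// The kernels in this file operate on BLASFEO's panel-major storage: matrices
// are stored in panels of bs=4 rows, each panel laid out column by column,
// with consecutive panels bs*sda (source) or bs*sdb (destination) doubles
// apart. As a plain-C illustration of what the aligned 8-row copy-and-scale
// kernel computes (an added reference sketch for the tri==0 case; the _ref
// function is not part of BLASFEO and is not used by the library):
void kernel_dgecp_8_0_lib4_ref(int kmax, double alpha, double *A0, int sda, double *B0, int sdb)
	{
	const int bs = 4;
	double *A1 = A0 + bs*sda; // second 4-row panel of the source
	double *B1 = B0 + bs*sdb; // second 4-row panel of the destination
	int ii, jj;
	for(jj=0; jj<kmax; jj++) // kmax columns of the 8-row block
		{
		for(ii=0; ii<4; ii++)
			{
			B0[ii+bs*jj] = alpha * A0[ii+bs*jj]; // rows 0..3
			B1[ii+bs*jj] = alpha * A1[ii+bs*jj]; // rows 4..7
			}
		}
	}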
36
37
38// both A and B are aligned to 256-bit boundaries
39void kernel_dgecp_8_0_lib4(int tri, int kmax, double alpha, double *A0, int sda, double *B0, int sdb)
40 {
41
42 if(tri==1)
43 {
44 // A and C are lower triangular
45 // kmax+1 8-wide + end 7x7 triangle
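		// i.e. when tri==1 the kernel copies kmax+1 full 8-row columns and
		// then, in the next 7 columns, only the elements strictly below the
		// main diagonal (column j of that trailing block gets rows j+1..7)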
46
47 kmax += 1;
48 }
49
50 if(kmax<=0)
51 return;
52
53 const int bs = 4;
54
55 double *A1 = A0 + bs*sda;
56 double *B1 = B0 + bs*sdb;
57
58 __m256d
59 alpha_0,
60 a_0;
61
62 __m128d
63 c_0;
64
65 int k;
66
67 alpha_0 = _mm256_broadcast_sd( &alpha );
68
69 for(k=0; k<kmax-3; k+=4)
70 {
71
72 a_0 = _mm256_load_pd( &A0[0+bs*0] );
73 a_0 = _mm256_mul_pd( alpha_0, a_0 );
74 _mm256_store_pd( &B0[0+bs*0], a_0 );
75
76 a_0 = _mm256_load_pd( &A0[0+bs*1] );
77 a_0 = _mm256_mul_pd( alpha_0, a_0 );
78 _mm256_store_pd( &B0[0+bs*1], a_0 );
79
80 a_0 = _mm256_load_pd( &A0[0+bs*2] );
81 a_0 = _mm256_mul_pd( alpha_0, a_0 );
82 _mm256_store_pd( &B0[0+bs*2], a_0 );
83
84 a_0 = _mm256_load_pd( &A0[0+bs*3] );
85 a_0 = _mm256_mul_pd( alpha_0, a_0 );
86 _mm256_store_pd( &B0[0+bs*3], a_0 );
87
88 A0 += 16;
89 B0 += 16;
90
91 a_0 = _mm256_load_pd( &A1[0+bs*0] );
92 a_0 = _mm256_mul_pd( alpha_0, a_0 );
93 _mm256_store_pd( &B1[0+bs*0], a_0 );
94
95 a_0 = _mm256_load_pd( &A1[0+bs*1] );
96 a_0 = _mm256_mul_pd( alpha_0, a_0 );
97 _mm256_store_pd( &B1[0+bs*1], a_0 );
98
99 a_0 = _mm256_load_pd( &A1[0+bs*2] );
100 a_0 = _mm256_mul_pd( alpha_0, a_0 );
101 _mm256_store_pd( &B1[0+bs*2], a_0 );
102
103 a_0 = _mm256_load_pd( &A1[0+bs*3] );
104 a_0 = _mm256_mul_pd( alpha_0, a_0 );
105 _mm256_store_pd( &B1[0+bs*3], a_0 );
106
107 A1 += 16;
108 B1 += 16;
109
110 }
111 for(; k<kmax; k++)
112 {
113
114 a_0 = _mm256_load_pd( &A0[0+bs*0] );
115 a_0 = _mm256_mul_pd( alpha_0, a_0 );
116 _mm256_store_pd( &B0[0+bs*0], a_0 );
117
118 A0 += 4;
119 B0 += 4;
120
121 a_0 = _mm256_load_pd( &A1[0+bs*0] );
122 a_0 = _mm256_mul_pd( alpha_0, a_0 );
123 _mm256_store_pd( &B1[0+bs*0], a_0 );
124
125 A1 += 4;
126 B1 += 4;
127
128 }
129
130 if(tri==1)
131 {
132 // 7x7 triangle
133
134 c_0 = _mm_load_sd( &A0[1+0*bs] );
135 c_0 = _mm_mul_pd( _mm256_castpd256_pd128( alpha_0 ), c_0 );
136 _mm_store_sd( &B0[1+0*bs], c_0 );
137 c_0 = _mm_load_pd( &A0[2+0*bs] );
138 c_0 = _mm_mul_pd( _mm256_castpd256_pd128( alpha_0 ), c_0 );
139 _mm_store_pd( &B0[2+0*bs], c_0 );
140 a_0 = _mm256_load_pd( &A1[0+0*bs] );
141 a_0 = _mm256_mul_pd( alpha_0, a_0 );
142 _mm256_store_pd( &B1[0+0*bs], a_0 );
143
144 c_0 = _mm_load_pd( &A0[2+1*bs] );
145 c_0 = _mm_mul_pd( _mm256_castpd256_pd128( alpha_0 ), c_0 );
146 _mm_store_pd( &B0[2+1*bs], c_0 );
147 a_0 = _mm256_load_pd( &A1[0+1*bs] );
148 a_0 = _mm256_mul_pd( alpha_0, a_0 );
149 _mm256_store_pd( &B1[0+1*bs], a_0 );
150
151 c_0 = _mm_load_sd( &A0[3+2*bs] );
152 c_0 = _mm_mul_pd( _mm256_castpd256_pd128( alpha_0 ), c_0 );
153 _mm_store_sd( &B0[3+2*bs], c_0 );
154 a_0 = _mm256_load_pd( &A1[0+2*bs] );
155 a_0 = _mm256_mul_pd( alpha_0, a_0 );
156 _mm256_store_pd( &B1[0+2*bs], a_0 );
157
158 a_0 = _mm256_load_pd( &A1[0+3*bs] );
159 a_0 = _mm256_mul_pd( alpha_0, a_0 );
160 _mm256_store_pd( &B1[0+3*bs], a_0 );
161
162 c_0 = _mm_load_sd( &A1[1+4*bs] );
163 c_0 = _mm_mul_pd( _mm256_castpd256_pd128( alpha_0 ), c_0 );
164 _mm_store_sd( &B1[1+4*bs], c_0 );
165 c_0 = _mm_load_pd( &A1[2+4*bs] );
166 c_0 = _mm_mul_pd( _mm256_castpd256_pd128( alpha_0 ), c_0 );
167 _mm_store_pd( &B1[2+4*bs], c_0 );
168
169 c_0 = _mm_load_pd( &A1[2+5*bs] );
170 c_0 = _mm_mul_pd( _mm256_castpd256_pd128( alpha_0 ), c_0 );
171 _mm_store_pd( &B1[2+5*bs], c_0 );
172
173 c_0 = _mm_load_sd( &A1[3+6*bs] );
174 c_0 = _mm_mul_pd( _mm256_castpd256_pd128( alpha_0 ), c_0 );
175 _mm_store_sd( &B1[3+6*bs], c_0 );
176
177 }
178
179 }
180
181
182
183// both A and B are aligned to 256-bit boundaries, 1 element of A must be skipped
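// (how the skip is implemented in the _8_1/_8_2/_8_3 variants below: each
// output column is rebuilt from the registers of two adjacent source panels;
// for the 1-element shift, with a_0 = {a0,a1,a2,a3} and a_1 = {a4,a5,a6,a7}
// holding rows 0..7 of a source column, low element first,
//   _mm256_permute2f128_pd( a_0, a_1, 0x21 ) gives {a2,a3,a4,a5}
//   _mm256_shuffle_pd( a_0, {a2,a3,a4,a5}, 0x5 ) gives {a1,a2,a3,a4}
// i.e. the column shifted up by one row; the same pair of operations on
// (a_1, a_2) produces {a5,a6,a7,a8} for the second destination panel)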
184void kernel_dgecp_8_1_lib4(int tri, int kmax, double alpha, double *A0, int sda, double *B0, int sdb)
185 {
186
187 if(tri==1)
188 {
189 // A and C are lower triangular
190 // kmax+1 8-wide + end 7x7 triangle
191
192 kmax += 1;
193 }
194
195 if(kmax<=0)
196 return;
197
198 const int bs = 4;
199
200 double *A1 = A0 + bs*sda;
201 double *A2 = A1 + bs*sda;
202 double *B1 = B0 + bs*sdb;
203
204 __m256d
205 alpha_0,
206 a_0, a_1, a_2,
207 b_0, b_1;
208
209 __m128d
210 c_0;
211
212 int k;
213
214 alpha_0 = _mm256_broadcast_sd( &alpha );
215
216 for(k=0; k<kmax-3; k+=4)
217 {
218
219 a_2 = _mm256_load_pd( &A2[0+bs*0] );
220 a_1 = _mm256_load_pd( &A1[0+bs*0] );
221 a_0 = _mm256_load_pd( &A0[0+bs*0] );
222 a_2 = _mm256_permute2f128_pd( a_1, a_2, 0x21 );
223 b_0 = _mm256_permute2f128_pd( a_0, a_1, 0x21 );
224 b_1 = _mm256_shuffle_pd( a_1, a_2, 0x5 );
225 b_0 = _mm256_shuffle_pd( a_0, b_0, 0x5 );
226 b_0 = _mm256_mul_pd( alpha_0, b_0 );
227 b_1 = _mm256_mul_pd( alpha_0, b_1 );
228 _mm256_store_pd( &B1[0+bs*0], b_1 );
229 _mm256_store_pd( &B0[0+bs*0], b_0 );
230
231 a_2 = _mm256_load_pd( &A2[0+bs*1] );
232 a_1 = _mm256_load_pd( &A1[0+bs*1] );
233 a_0 = _mm256_load_pd( &A0[0+bs*1] );
234 a_2 = _mm256_permute2f128_pd( a_1, a_2, 0x21 );
235 b_0 = _mm256_permute2f128_pd( a_0, a_1, 0x21 );
236 b_1 = _mm256_shuffle_pd( a_1, a_2, 0x5 );
237 b_0 = _mm256_shuffle_pd( a_0, b_0, 0x5 );
238 b_0 = _mm256_mul_pd( alpha_0, b_0 );
239 b_1 = _mm256_mul_pd( alpha_0, b_1 );
240 _mm256_store_pd( &B1[0+bs*1], b_1 );
241 _mm256_store_pd( &B0[0+bs*1], b_0 );
242
243 a_2 = _mm256_load_pd( &A2[0+bs*2] );
244 a_1 = _mm256_load_pd( &A1[0+bs*2] );
245 a_0 = _mm256_load_pd( &A0[0+bs*2] );
246 a_2 = _mm256_permute2f128_pd( a_1, a_2, 0x21 );
247 b_0 = _mm256_permute2f128_pd( a_0, a_1, 0x21 );
248 b_1 = _mm256_shuffle_pd( a_1, a_2, 0x5 );
249 b_0 = _mm256_shuffle_pd( a_0, b_0, 0x5 );
250 b_0 = _mm256_mul_pd( alpha_0, b_0 );
251 b_1 = _mm256_mul_pd( alpha_0, b_1 );
252 _mm256_store_pd( &B1[0+bs*2], b_1 );
253 _mm256_store_pd( &B0[0+bs*2], b_0 );
254
255 a_2 = _mm256_load_pd( &A2[0+bs*3] );
256 a_1 = _mm256_load_pd( &A1[0+bs*3] );
257 a_0 = _mm256_load_pd( &A0[0+bs*3] );
258 a_2 = _mm256_permute2f128_pd( a_1, a_2, 0x21 );
259 b_0 = _mm256_permute2f128_pd( a_0, a_1, 0x21 );
260 b_1 = _mm256_shuffle_pd( a_1, a_2, 0x5 );
261 b_0 = _mm256_shuffle_pd( a_0, b_0, 0x5 );
262 b_0 = _mm256_mul_pd( alpha_0, b_0 );
263 b_1 = _mm256_mul_pd( alpha_0, b_1 );
264 _mm256_store_pd( &B1[0+bs*3], b_1 );
265 _mm256_store_pd( &B0[0+bs*3], b_0 );
266
267 A0 += 16;
268 A1 += 16;
269 A2 += 16;
270 B0 += 16;
271 B1 += 16;
272
273 }
274 for(; k<kmax; k++)
275 {
276
277 a_2 = _mm256_load_pd( &A2[0+bs*0] );
278 a_1 = _mm256_load_pd( &A1[0+bs*0] );
279 a_0 = _mm256_load_pd( &A0[0+bs*0] );
280 a_2 = _mm256_permute2f128_pd( a_1, a_2, 0x21 );
281 b_0 = _mm256_permute2f128_pd( a_0, a_1, 0x21 );
282 b_1 = _mm256_shuffle_pd( a_1, a_2, 0x5 );
283 b_0 = _mm256_shuffle_pd( a_0, b_0, 0x5 );
284 b_0 = _mm256_mul_pd( alpha_0, b_0 );
285 b_1 = _mm256_mul_pd( alpha_0, b_1 );
286 _mm256_store_pd( &B1[0+bs*0], b_1 );
287 _mm256_store_pd( &B0[0+bs*0], b_0 );
288
289 A0 += 4;
290 A1 += 4;
291 A2 += 4;
292 B0 += 4;
293 B1 += 4;
294
295 }
296
297 if(tri==1)
298 {
299 // 7x7 triangle
300
301 c_0 = _mm_load_pd( &A0[2+bs*0] );
302 c_0 = _mm_mul_pd( _mm256_castpd256_pd128( alpha_0 ), c_0 );
303 _mm_storeu_pd( &B0[1+bs*0], c_0 );
304 c_0 = _mm_load_sd( &A1[0+bs*0] );
305 c_0 = _mm_mul_pd( _mm256_castpd256_pd128( alpha_0 ), c_0 );
306 _mm_store_sd( &B0[3+bs*0], c_0 );
307 c_0 = _mm_load_sd( &A1[1+bs*0] );
308 c_0 = _mm_mul_pd( _mm256_castpd256_pd128( alpha_0 ), c_0 );
309 _mm_store_sd( &B1[0+bs*0], c_0 );
310 c_0 = _mm_load_pd( &A1[2+bs*0] );
311 c_0 = _mm_mul_pd( _mm256_castpd256_pd128( alpha_0 ), c_0 );
312 _mm_storeu_pd( &B1[1+bs*0], c_0 );
313 c_0 = _mm_load_sd( &A2[0+bs*0] );
314 c_0 = _mm_mul_pd( _mm256_castpd256_pd128( alpha_0 ), c_0 );
315 _mm_store_sd( &B1[3+bs*0], c_0 );
316
317 c_0 = _mm_load_sd( &A0[3+bs*1] );
318 c_0 = _mm_mul_pd( _mm256_castpd256_pd128( alpha_0 ), c_0 );
319 _mm_store_sd( &B0[2+bs*1], c_0 );
320 c_0 = _mm_load_sd( &A1[0+bs*1] );
321 c_0 = _mm_mul_pd( _mm256_castpd256_pd128( alpha_0 ), c_0 );
322 _mm_store_sd( &B0[3+bs*1], c_0 );
323 c_0 = _mm_load_sd( &A1[1+bs*1] );
324 c_0 = _mm_mul_pd( _mm256_castpd256_pd128( alpha_0 ), c_0 );
325 _mm_store_sd( &B1[0+bs*1], c_0 );
326 c_0 = _mm_load_pd( &A1[2+bs*1] );
327 c_0 = _mm_mul_pd( _mm256_castpd256_pd128( alpha_0 ), c_0 );
328 _mm_storeu_pd( &B1[1+bs*1], c_0 );
329 c_0 = _mm_load_sd( &A2[0+bs*1] );
330 c_0 = _mm_mul_pd( _mm256_castpd256_pd128( alpha_0 ), c_0 );
331 _mm_store_sd( &B1[3+bs*1], c_0 );
332
333 c_0 = _mm_load_sd( &A1[0+bs*2] );
334 c_0 = _mm_mul_pd( _mm256_castpd256_pd128( alpha_0 ), c_0 );
335 _mm_store_sd( &B0[3+bs*2], c_0 );
336 c_0 = _mm_load_sd( &A1[1+bs*2] );
337 c_0 = _mm_mul_pd( _mm256_castpd256_pd128( alpha_0 ), c_0 );
338 _mm_store_sd( &B1[0+bs*2], c_0 );
339 c_0 = _mm_load_pd( &A1[2+bs*2] );
340 c_0 = _mm_mul_pd( _mm256_castpd256_pd128( alpha_0 ), c_0 );
341 _mm_storeu_pd( &B1[1+bs*2], c_0 );
342 c_0 = _mm_load_sd( &A2[0+bs*2] );
343 c_0 = _mm_mul_pd( _mm256_castpd256_pd128( alpha_0 ), c_0 );
344 _mm_store_sd( &B1[3+bs*2], c_0 );
345
346 c_0 = _mm_load_sd( &A1[1+bs*3] );
347 c_0 = _mm_mul_pd( _mm256_castpd256_pd128( alpha_0 ), c_0 );
348 _mm_store_sd( &B1[0+bs*3], c_0 );
349 c_0 = _mm_load_pd( &A1[2+bs*3] );
350 c_0 = _mm_mul_pd( _mm256_castpd256_pd128( alpha_0 ), c_0 );
351 _mm_storeu_pd( &B1[1+bs*3], c_0 );
352 c_0 = _mm_load_sd( &A2[0+bs*3] );
353 c_0 = _mm_mul_pd( _mm256_castpd256_pd128( alpha_0 ), c_0 );
354 _mm_store_sd( &B1[3+bs*3], c_0 );
355
356 c_0 = _mm_load_pd( &A1[2+bs*4] );
357 c_0 = _mm_mul_pd( _mm256_castpd256_pd128( alpha_0 ), c_0 );
358 _mm_storeu_pd( &B1[1+bs*4], c_0 );
359 c_0 = _mm_load_sd( &A2[0+bs*4] );
360 c_0 = _mm_mul_pd( _mm256_castpd256_pd128( alpha_0 ), c_0 );
361 _mm_store_sd( &B1[3+bs*4], c_0 );
362
363 c_0 = _mm_load_sd( &A1[3+bs*5] );
364 c_0 = _mm_mul_pd( _mm256_castpd256_pd128( alpha_0 ), c_0 );
365 _mm_store_sd( &B1[2+bs*5], c_0 );
366 c_0 = _mm_load_sd( &A2[0+bs*5] );
367 c_0 = _mm_mul_pd( _mm256_castpd256_pd128( alpha_0 ), c_0 );
368 _mm_store_sd( &B1[3+bs*5], c_0 );
369
		c_0 = _mm_load_sd( &A2[0+bs*6] );
		c_0 = _mm_mul_pd( _mm256_castpd256_pd128( alpha_0 ), c_0 );
		_mm_store_sd( &B1[3+bs*6], c_0 );
373
374 }
375 }
376
377
378
379// both A and B are aligned to 256-bit boundaries, 2 elements of A must be skipped
380void kernel_dgecp_8_2_lib4(int tri, int kmax, double alpha, double *A0, int sda, double *B0, int sdb)
381 {
382
383 if(tri==1)
384 {
385 // A and C are lower triangular
386 // kmax+1 8-wide + end 7x7 triangle
387
388 kmax += 1;
389 }
390
391 if(kmax<=0)
392 return;
393
394 const int bs = 4;
395
396 double *A1 = A0 + bs*sda;
397 double *A2 = A1 + bs*sda;
398 double *B1 = B0 + bs*sdb;
399
400 __m256d
401 alpha_0,
402 a_0, a_1, a_2,
403 b_0, b_1;
404
405 __m128d
406 c_0;
407
408 int k;
409
410 alpha_0 = _mm256_broadcast_sd( &alpha );
411
412 for(k=0; k<kmax-3; k+=4)
413 {
414
415 a_0 = _mm256_load_pd( &A0[0+bs*0] );
416 a_1 = _mm256_load_pd( &A1[0+bs*0] );
417 a_2 = _mm256_load_pd( &A2[0+bs*0] );
418 b_0 = _mm256_permute2f128_pd( a_0, a_1, 0x21 );
419 b_1 = _mm256_permute2f128_pd( a_1, a_2, 0x21 );
420 b_0 = _mm256_mul_pd( alpha_0, b_0 );
421 b_1 = _mm256_mul_pd( alpha_0, b_1 );
422 _mm256_store_pd( &B0[0+bs*0], b_0 );
423 _mm256_store_pd( &B1[0+bs*0], b_1 );
424
425 a_0 = _mm256_load_pd( &A0[0+bs*1] );
426 a_1 = _mm256_load_pd( &A1[0+bs*1] );
427 a_2 = _mm256_load_pd( &A2[0+bs*1] );
428 b_0 = _mm256_permute2f128_pd( a_0, a_1, 0x21 );
429 b_1 = _mm256_permute2f128_pd( a_1, a_2, 0x21 );
430 b_0 = _mm256_mul_pd( alpha_0, b_0 );
431 b_1 = _mm256_mul_pd( alpha_0, b_1 );
432 _mm256_store_pd( &B0[0+bs*1], b_0 );
433 _mm256_store_pd( &B1[0+bs*1], b_1 );
434
435 a_0 = _mm256_load_pd( &A0[0+bs*2] );
436 a_1 = _mm256_load_pd( &A1[0+bs*2] );
437 a_2 = _mm256_load_pd( &A2[0+bs*2] );
438 b_0 = _mm256_permute2f128_pd( a_0, a_1, 0x21 );
439 b_1 = _mm256_permute2f128_pd( a_1, a_2, 0x21 );
440 b_0 = _mm256_mul_pd( alpha_0, b_0 );
441 b_1 = _mm256_mul_pd( alpha_0, b_1 );
442 _mm256_store_pd( &B0[0+bs*2], b_0 );
443 _mm256_store_pd( &B1[0+bs*2], b_1 );
444
445 a_0 = _mm256_load_pd( &A0[0+bs*3] );
446 a_1 = _mm256_load_pd( &A1[0+bs*3] );
447 a_2 = _mm256_load_pd( &A2[0+bs*3] );
448 b_0 = _mm256_permute2f128_pd( a_0, a_1, 0x21 );
449 b_1 = _mm256_permute2f128_pd( a_1, a_2, 0x21 );
450 b_0 = _mm256_mul_pd( alpha_0, b_0 );
451 b_1 = _mm256_mul_pd( alpha_0, b_1 );
452 _mm256_store_pd( &B0[0+bs*3], b_0 );
453 _mm256_store_pd( &B1[0+bs*3], b_1 );
454
455 A0 += 16;
456 A1 += 16;
457 A2 += 16;
458 B0 += 16;
459 B1 += 16;
460
461 }
462 for(; k<kmax; k++)
463 {
464
465 a_0 = _mm256_load_pd( &A0[0+bs*0] );
466 a_1 = _mm256_load_pd( &A1[0+bs*0] );
467 a_2 = _mm256_load_pd( &A2[0+bs*0] );
468 b_0 = _mm256_permute2f128_pd( a_0, a_1, 0x21 );
469 b_1 = _mm256_permute2f128_pd( a_1, a_2, 0x21 );
470 b_0 = _mm256_mul_pd( alpha_0, b_0 );
471 b_1 = _mm256_mul_pd( alpha_0, b_1 );
472 _mm256_store_pd( &B0[0+bs*0], b_0 );
473 _mm256_store_pd( &B1[0+bs*0], b_1 );
474
475 A0 += 4;
476 A1 += 4;
477 A2 += 4;
478 B0 += 4;
479 B1 += 4;
480
481 }
482
483 if(tri==1)
484 {
485 // 7x7 triangle
486
487 c_0 = _mm_load_sd( &A0[3+bs*0] );
488 c_0 = _mm_mul_pd( _mm256_castpd256_pd128( alpha_0 ), c_0 );
489 _mm_store_sd( &B0[1+bs*0], c_0 );
490 c_0 = _mm_load_pd( &A1[0+bs*0] );
491 c_0 = _mm_mul_pd( _mm256_castpd256_pd128( alpha_0 ), c_0 );
492 _mm_store_pd( &B0[2+bs*0], c_0 );
493 c_0 = _mm_load_pd( &A1[2+bs*0] );
494 c_0 = _mm_mul_pd( _mm256_castpd256_pd128( alpha_0 ), c_0 );
495 _mm_store_pd( &B1[0+bs*0], c_0 );
496 c_0 = _mm_load_pd( &A2[0+bs*0] );
497 c_0 = _mm_mul_pd( _mm256_castpd256_pd128( alpha_0 ), c_0 );
498 _mm_store_pd( &B1[2+bs*0], c_0 );
499
500 c_0 = _mm_load_pd( &A1[0+bs*1] );
501 c_0 = _mm_mul_pd( _mm256_castpd256_pd128( alpha_0 ), c_0 );
502 _mm_store_pd( &B0[2+bs*1], c_0 );
503 c_0 = _mm_load_pd( &A1[2+bs*1] );
504 c_0 = _mm_mul_pd( _mm256_castpd256_pd128( alpha_0 ), c_0 );
505 _mm_store_pd( &B1[0+bs*1], c_0 );
506 c_0 = _mm_load_pd( &A2[0+bs*1] );
507 c_0 = _mm_mul_pd( _mm256_castpd256_pd128( alpha_0 ), c_0 );
508 _mm_store_pd( &B1[2+bs*1], c_0 );
509
510 c_0 = _mm_load_sd( &A1[1+bs*2] );
511 c_0 = _mm_mul_pd( _mm256_castpd256_pd128( alpha_0 ), c_0 );
512 _mm_store_sd( &B0[3+bs*2], c_0 );
513 c_0 = _mm_load_pd( &A1[2+bs*2] );
514 c_0 = _mm_mul_pd( _mm256_castpd256_pd128( alpha_0 ), c_0 );
515 _mm_store_pd( &B1[0+bs*2], c_0 );
516 c_0 = _mm_load_pd( &A2[0+bs*2] );
517 c_0 = _mm_mul_pd( _mm256_castpd256_pd128( alpha_0 ), c_0 );
518 _mm_store_pd( &B1[2+bs*2], c_0 );
519
520 c_0 = _mm_load_pd( &A1[2+bs*3] );
521 c_0 = _mm_mul_pd( _mm256_castpd256_pd128( alpha_0 ), c_0 );
522 _mm_store_pd( &B1[0+bs*3], c_0 );
523 c_0 = _mm_load_pd( &A2[0+bs*3] );
524 c_0 = _mm_mul_pd( _mm256_castpd256_pd128( alpha_0 ), c_0 );
525 _mm_store_pd( &B1[2+bs*3], c_0 );
526
527 c_0 = _mm_load_sd( &A1[3+bs*4] );
528 c_0 = _mm_mul_pd( _mm256_castpd256_pd128( alpha_0 ), c_0 );
529 _mm_store_sd( &B1[1+bs*4], c_0 );
530 c_0 = _mm_load_pd( &A2[0+bs*4] );
531 c_0 = _mm_mul_pd( _mm256_castpd256_pd128( alpha_0 ), c_0 );
532 _mm_store_pd( &B1[2+bs*4], c_0 );
533
534 c_0 = _mm_load_pd( &A2[0+bs*5] );
535 c_0 = _mm_mul_pd( _mm256_castpd256_pd128( alpha_0 ), c_0 );
536 _mm_store_pd( &B1[2+bs*5], c_0 );
537
538 c_0 = _mm_load_sd( &A2[1+bs*6] );
539 c_0 = _mm_mul_pd( _mm256_castpd256_pd128( alpha_0 ), c_0 );
540 _mm_store_sd( &B1[3+bs*6], c_0 );
541
542 }
543
544 }
545
546
547
548// both A and B are aligned to 256-bit boundaries, 3 elements of A must be skipped
549void kernel_dgecp_8_3_lib4(int tri, int kmax, double alpha, double *A0, int sda, double *B0, int sdb)
550 {
551
552 if(tri==1)
553 {
554 // A and C are lower triangular
555 // kmax+1 8-wide + end 7x7 triangle
556
557 kmax += 1;
558 }
559
560 if(kmax<=0)
561 return;
562
563 const int bs = 4;
564
565 double *A1 = A0 + bs*sda;
566 double *A2 = A1 + bs*sda;
567 double *B1 = B0 + bs*sdb;
568
569 __m256d
570 alpha_0,
571 a_0, a_1, a_2,
572 b_0, b_1;
573
574 __m128d
575 c_0;
576
577 int k;
578
579 alpha_0 = _mm256_broadcast_sd( &alpha );
580
581 for(k=0; k<kmax-3; k+=4)
582 {
583
584 a_0 = _mm256_load_pd( &A0[0+bs*0] );
585 a_1 = _mm256_load_pd( &A1[0+bs*0] );
586 a_2 = _mm256_load_pd( &A2[0+bs*0] );
587 a_0 = _mm256_permute2f128_pd( a_0, a_1, 0x21 );
588 b_1 = _mm256_permute2f128_pd( a_1, a_2, 0x21 );
589 b_0 = _mm256_shuffle_pd( a_0, a_1, 0x5 );
590 b_1 = _mm256_shuffle_pd( b_1, a_2, 0x5 );
591 b_0 = _mm256_mul_pd( alpha_0, b_0 );
592 b_1 = _mm256_mul_pd( alpha_0, b_1 );
593 _mm256_store_pd( &B0[0+bs*0], b_0 );
594 _mm256_store_pd( &B1[0+bs*0], b_1 );
595
596 a_0 = _mm256_load_pd( &A0[0+bs*1] );
597 a_1 = _mm256_load_pd( &A1[0+bs*1] );
598 a_2 = _mm256_load_pd( &A2[0+bs*1] );
599 a_0 = _mm256_permute2f128_pd( a_0, a_1, 0x21 );
600 b_1 = _mm256_permute2f128_pd( a_1, a_2, 0x21 );
601 b_0 = _mm256_shuffle_pd( a_0, a_1, 0x5 );
602 b_1 = _mm256_shuffle_pd( b_1, a_2, 0x5 );
603 b_0 = _mm256_mul_pd( alpha_0, b_0 );
604 b_1 = _mm256_mul_pd( alpha_0, b_1 );
605 _mm256_store_pd( &B0[0+bs*1], b_0 );
606 _mm256_store_pd( &B1[0+bs*1], b_1 );
607
608 a_0 = _mm256_load_pd( &A0[0+bs*2] );
609 a_1 = _mm256_load_pd( &A1[0+bs*2] );
610 a_2 = _mm256_load_pd( &A2[0+bs*2] );
611 a_0 = _mm256_permute2f128_pd( a_0, a_1, 0x21 );
612 b_1 = _mm256_permute2f128_pd( a_1, a_2, 0x21 );
613 b_0 = _mm256_shuffle_pd( a_0, a_1, 0x5 );
614 b_1 = _mm256_shuffle_pd( b_1, a_2, 0x5 );
615 b_0 = _mm256_mul_pd( alpha_0, b_0 );
616 b_1 = _mm256_mul_pd( alpha_0, b_1 );
617 _mm256_store_pd( &B0[0+bs*2], b_0 );
618 _mm256_store_pd( &B1[0+bs*2], b_1 );
619
620 a_0 = _mm256_load_pd( &A0[0+bs*3] );
621 a_1 = _mm256_load_pd( &A1[0+bs*3] );
622 a_2 = _mm256_load_pd( &A2[0+bs*3] );
623 a_0 = _mm256_permute2f128_pd( a_0, a_1, 0x21 );
624 b_1 = _mm256_permute2f128_pd( a_1, a_2, 0x21 );
625 b_0 = _mm256_shuffle_pd( a_0, a_1, 0x5 );
626 b_1 = _mm256_shuffle_pd( b_1, a_2, 0x5 );
627 b_0 = _mm256_mul_pd( alpha_0, b_0 );
628 b_1 = _mm256_mul_pd( alpha_0, b_1 );
629 _mm256_store_pd( &B0[0+bs*3], b_0 );
630 _mm256_store_pd( &B1[0+bs*3], b_1 );
631
632 A0 += 16;
633 A1 += 16;
634 A2 += 16;
635 B0 += 16;
636 B1 += 16;
637
638 }
639 for(; k<kmax; k++)
640 {
641
642 a_0 = _mm256_load_pd( &A0[0+bs*0] );
643 a_1 = _mm256_load_pd( &A1[0+bs*0] );
644 a_2 = _mm256_load_pd( &A2[0+bs*0] );
645 a_0 = _mm256_permute2f128_pd( a_0, a_1, 0x21 );
646 b_1 = _mm256_permute2f128_pd( a_1, a_2, 0x21 );
647 b_0 = _mm256_shuffle_pd( a_0, a_1, 0x5 );
648 b_1 = _mm256_shuffle_pd( b_1, a_2, 0x5 );
649 b_0 = _mm256_mul_pd( alpha_0, b_0 );
650 b_1 = _mm256_mul_pd( alpha_0, b_1 );
651 _mm256_store_pd( &B0[0+bs*0], b_0 );
652 _mm256_store_pd( &B1[0+bs*0], b_1 );
653
654 A0 += 4;
655 A1 += 4;
656 A2 += 4;
657 B0 += 4;
658 B1 += 4;
659
660 }
661
662 if(tri==1)
663 {
664 // 7x7 triangle
665
666 c_0 = _mm_load_pd( &A1[0+bs*0] );
667 c_0 = _mm_mul_pd( _mm256_castpd256_pd128( alpha_0 ), c_0 );
668 _mm_storeu_pd( &B0[1+bs*0], c_0 );
669 c_0 = _mm_load_sd( &A1[2+bs*0] );
670 c_0 = _mm_mul_pd( _mm256_castpd256_pd128( alpha_0 ), c_0 );
671 _mm_store_sd( &B0[3+bs*0], c_0 );
672 c_0 = _mm_load_sd( &A1[3+bs*0] );
673 c_0 = _mm_mul_pd( _mm256_castpd256_pd128( alpha_0 ), c_0 );
674 _mm_store_sd( &B1[0+bs*0], c_0 );
675 c_0 = _mm_load_pd( &A2[0+bs*0] );
676 c_0 = _mm_mul_pd( _mm256_castpd256_pd128( alpha_0 ), c_0 );
677 _mm_storeu_pd( &B1[1+bs*0], c_0 );
678 c_0 = _mm_load_sd( &A2[2+bs*0] );
679 c_0 = _mm_mul_pd( _mm256_castpd256_pd128( alpha_0 ), c_0 );
680 _mm_store_sd( &B1[3+bs*0], c_0 );
681
682 c_0 = _mm_load_sd( &A1[1+bs*1] );
683 c_0 = _mm_mul_pd( _mm256_castpd256_pd128( alpha_0 ), c_0 );
684 _mm_store_sd( &B0[2+bs*1], c_0 );
685 c_0 = _mm_load_sd( &A1[2+bs*1] );
686 c_0 = _mm_mul_pd( _mm256_castpd256_pd128( alpha_0 ), c_0 );
687 _mm_store_sd( &B0[3+bs*1], c_0 );
688 c_0 = _mm_load_sd( &A1[3+bs*1] );
689 c_0 = _mm_mul_pd( _mm256_castpd256_pd128( alpha_0 ), c_0 );
690 _mm_store_sd( &B1[0+bs*1], c_0 );
691 c_0 = _mm_load_pd( &A2[0+bs*1] );
692 c_0 = _mm_mul_pd( _mm256_castpd256_pd128( alpha_0 ), c_0 );
693 _mm_storeu_pd( &B1[1+bs*1], c_0 );
694 c_0 = _mm_load_sd( &A2[2+bs*1] );
695 c_0 = _mm_mul_pd( _mm256_castpd256_pd128( alpha_0 ), c_0 );
696 _mm_store_sd( &B1[3+bs*1], c_0 );
697
698 c_0 = _mm_load_sd( &A1[2+bs*2] );
699 c_0 = _mm_mul_pd( _mm256_castpd256_pd128( alpha_0 ), c_0 );
700 _mm_store_sd( &B0[3+bs*2], c_0 );
701 c_0 = _mm_load_sd( &A1[3+bs*2] );
702 c_0 = _mm_mul_pd( _mm256_castpd256_pd128( alpha_0 ), c_0 );
703 _mm_store_sd( &B1[0+bs*2], c_0 );
704 c_0 = _mm_load_pd( &A2[0+bs*2] );
705 c_0 = _mm_mul_pd( _mm256_castpd256_pd128( alpha_0 ), c_0 );
706 _mm_storeu_pd( &B1[1+bs*2], c_0 );
707 c_0 = _mm_load_sd( &A2[2+bs*2] );
708 c_0 = _mm_mul_pd( _mm256_castpd256_pd128( alpha_0 ), c_0 );
709 _mm_store_sd( &B1[3+bs*2], c_0 );
710
711 c_0 = _mm_load_sd( &A1[3+bs*3] );
712 c_0 = _mm_mul_pd( _mm256_castpd256_pd128( alpha_0 ), c_0 );
713 _mm_store_sd( &B1[0+bs*3], c_0 );
714 c_0 = _mm_load_pd( &A2[0+bs*3] );
715 c_0 = _mm_mul_pd( _mm256_castpd256_pd128( alpha_0 ), c_0 );
716 _mm_storeu_pd( &B1[1+bs*3], c_0 );
717 c_0 = _mm_load_sd( &A2[2+bs*3] );
718 c_0 = _mm_mul_pd( _mm256_castpd256_pd128( alpha_0 ), c_0 );
719 _mm_store_sd( &B1[3+bs*3], c_0 );
720
721 c_0 = _mm_load_pd( &A2[0+bs*4] );
722 c_0 = _mm_mul_pd( _mm256_castpd256_pd128( alpha_0 ), c_0 );
723 _mm_storeu_pd( &B1[1+bs*4], c_0 );
724 c_0 = _mm_load_sd( &A2[2+bs*4] );
725 c_0 = _mm_mul_pd( _mm256_castpd256_pd128( alpha_0 ), c_0 );
726 _mm_store_sd( &B1[3+bs*4], c_0 );
727
728 c_0 = _mm_load_sd( &A2[1+bs*5] );
729 c_0 = _mm_mul_pd( _mm256_castpd256_pd128( alpha_0 ), c_0 );
730 _mm_store_sd( &B1[2+bs*5], c_0 );
731 c_0 = _mm_load_sd( &A2[2+bs*5] );
732 c_0 = _mm_mul_pd( _mm256_castpd256_pd128( alpha_0 ), c_0 );
733 _mm_store_sd( &B1[3+bs*5], c_0 );
734
735 c_0 = _mm_load_sd( &A2[2+bs*6] );
736 c_0 = _mm_mul_pd( _mm256_castpd256_pd128( alpha_0 ), c_0 );
737 _mm_store_sd( &B1[3+bs*6], c_0 );
738
739 }
740
741 }
742
743
744
745// both A and B are aligned to 256-bit boundaries
746void kernel_dgecp_4_0_lib4(int tri, int kmax, double alpha, double *A, double *B)
747 {
748
749 if(tri==1)
750 {
751 // A and C are lower triangular
752 // kmax+1 4-wide + end 3x3 triangle
753
754 kmax += 1;
755 }
756
757 if(kmax<=0)
758 return;
759
760 const int bs = 4;
761
762 __m256d
763 alpha_0,
764 a_0;
765
766 __m128d
767 c_0;
768
769 int k;
770
771 alpha_0 = _mm256_broadcast_sd( &alpha );
772
773 for(k=0; k<kmax-3; k+=4)
774 {
775
776 a_0 = _mm256_load_pd( &A[0+bs*0] );
777 a_0 = _mm256_mul_pd( alpha_0, a_0 );
778 _mm256_store_pd( &B[0+bs*0], a_0 );
779
780 a_0 = _mm256_load_pd( &A[0+bs*1] );
781 a_0 = _mm256_mul_pd( alpha_0, a_0 );
782 _mm256_store_pd( &B[0+bs*1], a_0 );
783
784 a_0 = _mm256_load_pd( &A[0+bs*2] );
785 a_0 = _mm256_mul_pd( alpha_0, a_0 );
786 _mm256_store_pd( &B[0+bs*2], a_0 );
787
788 a_0 = _mm256_load_pd( &A[0+bs*3] );
789 a_0 = _mm256_mul_pd( alpha_0, a_0 );
790 _mm256_store_pd( &B[0+bs*3], a_0 );
791
792 A += 16;
793 B += 16;
794
795 }
796 for(; k<kmax; k++)
797 {
798
799 a_0 = _mm256_load_pd( &A[0+bs*0] );
800 a_0 = _mm256_mul_pd( alpha_0, a_0 );
801 _mm256_store_pd( &B[0+bs*0], a_0 );
802
803 A += 4;
804 B += 4;
805
806 }
807
808 if(tri==1)
809 {
810 // 3x3 triangle
811
812 c_0 = _mm_load_sd( &A[1+bs*0] );
813 c_0 = _mm_mul_pd( _mm256_castpd256_pd128( alpha_0 ), c_0 );
814 _mm_store_sd( &B[1+bs*0], c_0 );
815 c_0 = _mm_load_pd( &A[2+bs*0] );
816 c_0 = _mm_mul_pd( _mm256_castpd256_pd128( alpha_0 ), c_0 );
817 _mm_store_pd( &B[2+bs*0], c_0 );
818
819 c_0 = _mm_load_pd( &A[2+bs*1] );
820 c_0 = _mm_mul_pd( _mm256_castpd256_pd128( alpha_0 ), c_0 );
821 _mm_store_pd( &B[2+bs*1], c_0 );
822
823 c_0 = _mm_load_sd( &A[3+bs*2] );
824 c_0 = _mm_mul_pd( _mm256_castpd256_pd128( alpha_0 ), c_0 );
825 _mm_store_sd( &B[3+bs*2], c_0 );
826
827 }
828
829 }
830
831
832
833// both A and B are aligned to 256-bit boundaries, 1 element of A must be skipped
834void kernel_dgecp_4_1_lib4(int tri, int kmax, double alpha, double *A0, int sda, double *B)
835 {
836
837 if(tri==1)
838 {
839 // A and C are lower triangular
840 // kmax+1 4-wide + end 3x3 triangle
841
842 kmax += 1;
843 }
844
845 if(kmax<=0)
846 return;
847
848 const int bs = 4;
849
850 double *A1 = A0 + bs*sda;
851
852 __m256d
853 alpha_0,
854 a_0, a_1,
855 b_0;
856
857 __m128d
858 c_0;
859
860 int k;
861
862 alpha_0 = _mm256_broadcast_sd( &alpha );
863
864 for(k=0; k<kmax-3; k+=4)
865 {
866
867 a_0 = _mm256_load_pd( &A0[0+bs*0] );
868 a_1 = _mm256_load_pd( &A1[0+bs*0] );
869 a_1 = _mm256_permute2f128_pd( a_0, a_1, 0x21 );
870 b_0 = _mm256_shuffle_pd( a_0, a_1, 0x5 );
871 b_0 = _mm256_mul_pd( alpha_0, b_0 );
872 _mm256_store_pd( &B[0+bs*0], b_0 );
873
874 a_0 = _mm256_load_pd( &A0[0+bs*1] );
875 a_1 = _mm256_load_pd( &A1[0+bs*1] );
876 a_1 = _mm256_permute2f128_pd( a_0, a_1, 0x21 );
877 b_0 = _mm256_shuffle_pd( a_0, a_1, 0x5 );
878 b_0 = _mm256_mul_pd( alpha_0, b_0 );
879 _mm256_store_pd( &B[0+bs*1], b_0 );
880
881 a_0 = _mm256_load_pd( &A0[0+bs*2] );
882 a_1 = _mm256_load_pd( &A1[0+bs*2] );
883 a_1 = _mm256_permute2f128_pd( a_0, a_1, 0x21 );
884 b_0 = _mm256_shuffle_pd( a_0, a_1, 0x5 );
885 b_0 = _mm256_mul_pd( alpha_0, b_0 );
886 _mm256_store_pd( &B[0+bs*2], b_0 );
887
888 a_0 = _mm256_load_pd( &A0[0+bs*3] );
889 a_1 = _mm256_load_pd( &A1[0+bs*3] );
890 a_1 = _mm256_permute2f128_pd( a_0, a_1, 0x21 );
891 b_0 = _mm256_shuffle_pd( a_0, a_1, 0x5 );
892 b_0 = _mm256_mul_pd( alpha_0, b_0 );
893 _mm256_store_pd( &B[0+bs*3], b_0 );
894
895 A0 += 16;
896 A1 += 16;
897 B += 16;
898
899 }
900 for(; k<kmax; k++)
901 {
902
903 a_0 = _mm256_load_pd( &A0[0+bs*0] );
904 a_1 = _mm256_load_pd( &A1[0+bs*0] );
905 a_1 = _mm256_permute2f128_pd( a_0, a_1, 0x21 );
906 b_0 = _mm256_shuffle_pd( a_0, a_1, 0x5 );
907 b_0 = _mm256_mul_pd( alpha_0, b_0 );
908 _mm256_store_pd( &B[0+bs*0], b_0 );
909
910 A0 += 4;
911 A1 += 4;
912 B += 4;
913
914 }
915
916 if(tri==1)
917 {
918 // 3x3 triangle
919
920 c_0 = _mm_load_pd( &A0[2+bs*0] );
921 c_0 = _mm_mul_pd( _mm256_castpd256_pd128( alpha_0 ), c_0 );
922 _mm_storeu_pd( &B[1+bs*0], c_0 );
923 c_0 = _mm_load_sd( &A1[0+bs*0] );
924 c_0 = _mm_mul_pd( _mm256_castpd256_pd128( alpha_0 ), c_0 );
925 _mm_store_sd( &B[3+bs*0], c_0 );
926
927 c_0 = _mm_load_sd( &A0[3+bs*1] );
928 c_0 = _mm_mul_pd( _mm256_castpd256_pd128( alpha_0 ), c_0 );
929 _mm_store_sd( &B[2+bs*1], c_0 );
930 c_0 = _mm_load_sd( &A1[0+bs*1] );
931 c_0 = _mm_mul_pd( _mm256_castpd256_pd128( alpha_0 ), c_0 );
932 _mm_store_sd( &B[3+bs*1], c_0 );
933
934 c_0 = _mm_load_sd( &A1[0+bs*2] );
935 c_0 = _mm_mul_pd( _mm256_castpd256_pd128( alpha_0 ), c_0 );
936 _mm_store_sd( &B[3+bs*2], c_0 );
937
938 }
939
940 }
941
942
943
944// both A and B are aligned to 256-bit boundaries, 2 elements of A must be skipped
945void kernel_dgecp_4_2_lib4(int tri, int kmax, double alpha, double *A0, int sda, double *B)
946 {
947
948 if(tri==1)
949 {
950 // A and C are lower triangular
951 // kmax+1 4-wide + end 3x3 triangle
952
953 kmax += 1;
954 }
955
956 if(kmax<=0)
957 return;
958
959 const int bs = 4;
960
961 double *A1 = A0 + bs*sda;
962
963 __m256d
964 alpha_0,
965 a_0, a_1,
966 b_0;
967
968 __m128d
969 c_0;
970
971 int k;
972
973 alpha_0 = _mm256_broadcast_sd( &alpha );
974
975 for(k=0; k<kmax-3; k+=4)
976 {
977
978 a_0 = _mm256_load_pd( &A0[0+bs*0] );
979 a_1 = _mm256_load_pd( &A1[0+bs*0] );
980 b_0 = _mm256_permute2f128_pd( a_0, a_1, 0x21 );
981 b_0 = _mm256_mul_pd( alpha_0, b_0 );
982 _mm256_store_pd( &B[0+bs*0], b_0 );
983
984 a_0 = _mm256_load_pd( &A0[0+bs*1] );
985 a_1 = _mm256_load_pd( &A1[0+bs*1] );
986 b_0 = _mm256_permute2f128_pd( a_0, a_1, 0x21 );
987 b_0 = _mm256_mul_pd( alpha_0, b_0 );
988 _mm256_store_pd( &B[0+bs*1], b_0 );
989
990 a_0 = _mm256_load_pd( &A0[0+bs*2] );
991 a_1 = _mm256_load_pd( &A1[0+bs*2] );
992 b_0 = _mm256_permute2f128_pd( a_0, a_1, 0x21 );
993 b_0 = _mm256_mul_pd( alpha_0, b_0 );
994 _mm256_store_pd( &B[0+bs*2], b_0 );
995
996 a_0 = _mm256_load_pd( &A0[0+bs*3] );
997 a_1 = _mm256_load_pd( &A1[0+bs*3] );
998 b_0 = _mm256_permute2f128_pd( a_0, a_1, 0x21 );
999 b_0 = _mm256_mul_pd( alpha_0, b_0 );
1000 _mm256_store_pd( &B[0+bs*3], b_0 );
1001
1002 A0 += 16;
1003 A1 += 16;
1004 B += 16;
1005
1006 }
1007 for(; k<kmax; k++)
1008 {
1009
1010 a_0 = _mm256_load_pd( &A0[0+bs*0] );
1011 a_1 = _mm256_load_pd( &A1[0+bs*0] );
1012 b_0 = _mm256_permute2f128_pd( a_0, a_1, 0x21 );
1013 b_0 = _mm256_mul_pd( alpha_0, b_0 );
1014 _mm256_store_pd( &B[0+bs*0], b_0 );
1015
1016 A0 += 4;
1017 A1 += 4;
1018 B += 4;
1019
1020 }
1021
1022 if(tri==1)
1023 {
1024 // 3x3 triangle
1025
1026 c_0 = _mm_load_sd( &A0[3+bs*0] );
1027 c_0 = _mm_mul_pd( _mm256_castpd256_pd128( alpha_0 ), c_0 );
1028 _mm_store_sd( &B[1+bs*0], c_0 );
1029 c_0 = _mm_load_pd( &A1[0+bs*0] );
1030 c_0 = _mm_mul_pd( _mm256_castpd256_pd128( alpha_0 ), c_0 );
1031 _mm_store_pd( &B[2+bs*0], c_0 );
1032
1033 c_0 = _mm_load_pd( &A1[0+bs*1] );
1034 c_0 = _mm_mul_pd( _mm256_castpd256_pd128( alpha_0 ), c_0 );
1035 _mm_store_pd( &B[2+bs*1], c_0 );
1036
1037 c_0 = _mm_load_sd( &A1[1+bs*2] );
1038 c_0 = _mm_mul_pd( _mm256_castpd256_pd128( alpha_0 ), c_0 );
1039 _mm_store_sd( &B[3+bs*2], c_0 );
1040
1041 }
1042
1043 }
1044
1045
1046
1047// both A and B are aligned to 256-bit boundaries, 3 elements of A must be skipped
1048void kernel_dgecp_4_3_lib4(int tri, int kmax, double alpha, double *A0, int sda, double *B)
1049 {
1050
1051 if(tri==1)
1052 {
1053 // A and C are lower triangular
1054 // kmax+1 4-wide + end 3x3 triangle
1055
1056 kmax += 1;
1057 }
1058
1059 if(kmax<=0)
1060 return;
1061
1062 const int bs = 4;
1063
1064 double *A1 = A0 + bs*sda;
1065
1066 __m256d
1067 alpha_0,
1068 a_0, a_1,
1069 b_0;
1070
1071 __m128d
1072 c_0;
1073
1074 int k;
1075
1076 alpha_0 = _mm256_broadcast_sd( &alpha );
1077
1078 for(k=0; k<kmax-3; k+=4)
1079 {
1080
1081 a_0 = _mm256_load_pd( &A0[0+bs*0] );
1082 a_1 = _mm256_load_pd( &A1[0+bs*0] );
1083 a_0 = _mm256_permute2f128_pd( a_0, a_1, 0x21 );
1084 b_0 = _mm256_shuffle_pd( a_0, a_1, 0x5 );
1085 b_0 = _mm256_mul_pd( alpha_0, b_0 );
1086 _mm256_store_pd( &B[0+bs*0], b_0 );
1087
1088 a_0 = _mm256_load_pd( &A0[0+bs*1] );
1089 a_1 = _mm256_load_pd( &A1[0+bs*1] );
1090 a_0 = _mm256_permute2f128_pd( a_0, a_1, 0x21 );
1091 b_0 = _mm256_shuffle_pd( a_0, a_1, 0x5 );
1092 b_0 = _mm256_mul_pd( alpha_0, b_0 );
1093 _mm256_store_pd( &B[0+bs*1], b_0 );
1094
1095 a_0 = _mm256_load_pd( &A0[0+bs*2] );
1096 a_1 = _mm256_load_pd( &A1[0+bs*2] );
1097 a_0 = _mm256_permute2f128_pd( a_0, a_1, 0x21 );
1098 b_0 = _mm256_shuffle_pd( a_0, a_1, 0x5 );
1099 b_0 = _mm256_mul_pd( alpha_0, b_0 );
1100 _mm256_store_pd( &B[0+bs*2], b_0 );
1101
1102 a_0 = _mm256_load_pd( &A0[0+bs*3] );
1103 a_1 = _mm256_load_pd( &A1[0+bs*3] );
1104 a_0 = _mm256_permute2f128_pd( a_0, a_1, 0x21 );
1105 b_0 = _mm256_shuffle_pd( a_0, a_1, 0x5 );
1106 b_0 = _mm256_mul_pd( alpha_0, b_0 );
1107 _mm256_store_pd( &B[0+bs*3], b_0 );
1108
1109 A0 += 16;
1110 A1 += 16;
1111 B += 16;
1112
1113 }
1114 for(; k<kmax; k++)
1115 {
1116
1117 a_0 = _mm256_load_pd( &A0[0+bs*0] );
1118 a_1 = _mm256_load_pd( &A1[0+bs*0] );
1119 a_0 = _mm256_permute2f128_pd( a_0, a_1, 0x21 );
1120 b_0 = _mm256_shuffle_pd( a_0, a_1, 0x5 );
1121 b_0 = _mm256_mul_pd( alpha_0, b_0 );
1122 _mm256_store_pd( &B[0+bs*0], b_0 );
1123
1124 A0 += 4;
1125 A1 += 4;
1126 B += 4;
1127
1128 }
1129
1130 if(tri==1)
1131 {
1132 // 3x3 triangle
1133
1134 c_0 = _mm_load_pd( &A1[0+bs*0] );
1135 c_0 = _mm_mul_pd( _mm256_castpd256_pd128( alpha_0 ), c_0 );
1136 _mm_storeu_pd( &B[1+bs*0], c_0 );
1137 c_0 = _mm_load_sd( &A1[2+bs*0] );
1138 c_0 = _mm_mul_pd( _mm256_castpd256_pd128( alpha_0 ), c_0 );
1139 _mm_store_sd( &B[3+bs*0], c_0 );
1140
1141 c_0 = _mm_load_sd( &A1[1+bs*1] );
1142 c_0 = _mm_mul_pd( _mm256_castpd256_pd128( alpha_0 ), c_0 );
1143 _mm_store_sd( &B[2+bs*1], c_0 );
1144 c_0 = _mm_load_sd( &A1[2+bs*1] );
1145 c_0 = _mm_mul_pd( _mm256_castpd256_pd128( alpha_0 ), c_0 );
1146 _mm_store_sd( &B[3+bs*1], c_0 );
1147
1148 c_0 = _mm_load_sd( &A1[2+bs*2] );
1149 c_0 = _mm_mul_pd( _mm256_castpd256_pd128( alpha_0 ), c_0 );
1150 _mm_store_sd( &B[3+bs*2], c_0 );
1151 }
1152
1153
1154 }
1155
1156
1157
1158// both A and B are aligned to 64-bit boundaries
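// (the 3-, 2- and 1-row kernels below work on 128-bit __m128d registers, since
// a column no longer fills a 256-bit register; alpha is broadcast to both
// lanes with _mm_loaddup_pd instead of _mm256_broadcast_sd)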
1159void kernel_dgecp_3_0_lib4(int tri, int kmax, double alpha, double *A, double *B)
1160 {
1161
1162 if(tri==1)
1163 {
1164 // A and C are lower triangular
1165 // kmax+1 3-wide + end 2x2 triangle
1166
1167 kmax += 1;
1168 }
1169
1170 if(kmax<=0)
1171 return;
1172
1173 const int bs = 4;
1174
1175 __m128d
1176 alpha_0,
1177 a_0, a_1;
1178
1179 int k;
1180
1181 alpha_0 = _mm_loaddup_pd( &alpha );
1182
1183 for(k=0; k<kmax-3; k+=4)
1184 {
1185
1186 a_0 = _mm_loadu_pd( &A[0+bs*0] );
1187 a_1 = _mm_load_sd( &A[2+bs*0] );
1188 a_0 = _mm_mul_pd( alpha_0, a_0 );
1189 a_1 = _mm_mul_pd( alpha_0, a_1 );
1190 _mm_storeu_pd( &B[0+bs*0], a_0 );
1191 _mm_store_sd( &B[2+bs*0], a_1 );
1192
1193 a_0 = _mm_loadu_pd( &A[0+bs*1] );
1194 a_1 = _mm_load_sd( &A[2+bs*1] );
1195 a_0 = _mm_mul_pd( alpha_0, a_0 );
1196 a_1 = _mm_mul_pd( alpha_0, a_1 );
1197 _mm_storeu_pd( &B[0+bs*1], a_0 );
1198 _mm_store_sd( &B[2+bs*1], a_1 );
1199
1200 a_0 = _mm_loadu_pd( &A[0+bs*2] );
1201 a_1 = _mm_load_sd( &A[2+bs*2] );
1202 a_0 = _mm_mul_pd( alpha_0, a_0 );
1203 a_1 = _mm_mul_pd( alpha_0, a_1 );
1204 _mm_storeu_pd( &B[0+bs*2], a_0 );
1205 _mm_store_sd( &B[2+bs*2], a_1 );
1206
1207 a_0 = _mm_loadu_pd( &A[0+bs*3] );
1208 a_1 = _mm_load_sd( &A[2+bs*3] );
1209 a_0 = _mm_mul_pd( alpha_0, a_0 );
1210 a_1 = _mm_mul_pd( alpha_0, a_1 );
1211 _mm_storeu_pd( &B[0+bs*3], a_0 );
1212 _mm_store_sd( &B[2+bs*3], a_1 );
1213
1214 A += 16;
1215 B += 16;
1216
1217 }
1218 for(; k<kmax; k++)
1219 {
1220
1221 a_0 = _mm_loadu_pd( &A[0+bs*0] );
1222 a_1 = _mm_load_sd( &A[2+bs*0] );
1223 a_0 = _mm_mul_pd( alpha_0, a_0 );
1224 a_1 = _mm_mul_pd( alpha_0, a_1 );
1225 _mm_storeu_pd( &B[0+bs*0], a_0 );
1226 _mm_store_sd( &B[2+bs*0], a_1 );
1227
1228 A += 4;
1229 B += 4;
1230
1231 }
1232
1233 if(tri==1)
1234 {
1235 // 2x2 triangle
1236
1237 a_0 = _mm_loadu_pd( &A[1+bs*0] );
1238 a_0 = _mm_mul_pd( alpha_0, a_0 );
1239 _mm_storeu_pd( &B[1+bs*0], a_0 );
1240
1241 a_0 = _mm_load_sd( &A[2+bs*1] );
1242 a_0 = _mm_mul_pd( alpha_0, a_0 );
1243 _mm_store_sd( &B[2+bs*1], a_0 );
1244
1245 }
1246
1247 }
1248
1249
1250
1251// both A and B are aligned to 256-bit boundaries, 2 elements of A must be skipped
1252void kernel_dgecp_3_2_lib4(int tri, int kmax, double alpha, double *A0, int sda, double *B)
1253 {
1254
1255 if(tri==1)
1256 {
1257 // A and C are lower triangular
1258 // kmax+1 3-wide + end 2x2 triangle
1259
1260 kmax += 1;
1261 }
1262
1263 if(kmax<=0)
1264 return;
1265
1266 const int bs = 4;
1267
1268 double *A1 = A0 + bs*sda;
1269
1270 __m128d
1271 alpha_0,
1272 a_0, a_1;
1273
1274 int k;
1275
1276 alpha_0 = _mm_loaddup_pd( &alpha );
1277
1278 for(k=0; k<kmax-3; k+=4)
1279 {
1280
1281 a_0 = _mm_loadu_pd( &A0[2+bs*0] );
1282 a_0 = _mm_mul_pd( alpha_0, a_0 );
1283 _mm_storeu_pd( &B[0+bs*0], a_0 );
1284 a_1 = _mm_load_sd( &A1[0+bs*0] );
1285 a_1 = _mm_mul_pd( alpha_0, a_1 );
1286 _mm_store_sd( &B[2+bs*0], a_1 );
1287
1288 a_0 = _mm_loadu_pd( &A0[2+bs*1] );
1289 a_0 = _mm_mul_pd( alpha_0, a_0 );
1290 _mm_storeu_pd( &B[0+bs*1], a_0 );
1291 a_1 = _mm_load_sd( &A1[0+bs*1] );
1292 a_1 = _mm_mul_pd( alpha_0, a_1 );
1293 _mm_store_sd( &B[2+bs*1], a_1 );
1294
1295 a_0 = _mm_loadu_pd( &A0[2+bs*2] );
1296 a_0 = _mm_mul_pd( alpha_0, a_0 );
1297 _mm_storeu_pd( &B[0+bs*2], a_0 );
1298 a_1 = _mm_load_sd( &A1[0+bs*2] );
1299 a_1 = _mm_mul_pd( alpha_0, a_1 );
1300 _mm_store_sd( &B[2+bs*2], a_1 );
1301
1302 a_0 = _mm_loadu_pd( &A0[2+bs*3] );
1303 a_0 = _mm_mul_pd( alpha_0, a_0 );
1304 _mm_storeu_pd( &B[0+bs*3], a_0 );
1305 a_1 = _mm_load_sd( &A1[0+bs*3] );
1306 a_1 = _mm_mul_pd( alpha_0, a_1 );
1307 _mm_store_sd( &B[2+bs*3], a_1 );
1308
1309 A0 += 16;
1310 A1 += 16;
1311 B += 16;
1312
1313 }
1314 for(; k<kmax; k++)
1315 {
1316
1317 a_0 = _mm_loadu_pd( &A0[2+bs*0] );
1318 a_0 = _mm_mul_pd( alpha_0, a_0 );
1319 _mm_storeu_pd( &B[0+bs*0], a_0 );
1320 a_1 = _mm_load_sd( &A1[0+bs*0] );
1321 a_1 = _mm_mul_pd( alpha_0, a_1 );
1322 _mm_store_sd( &B[2+bs*0], a_1 );
1323
1324 A0 += 4;
1325 A1 += 4;
1326 B += 4;
1327
1328 }
1329
1330 if(tri==1)
1331 {
1332 // 2x2 triangle
1333
1334 a_0 = _mm_load_sd( &A0[3+bs*0] );
1335 a_0 = _mm_mul_pd( alpha_0, a_0 );
1336 _mm_store_sd( &B[1+bs*0], a_0 );
1337 a_0 = _mm_load_sd( &A1[0+bs*0] );
1338 a_0 = _mm_mul_pd( alpha_0, a_0 );
1339 _mm_store_sd( &B[2+bs*0], a_0 );
1340
1341 a_0 = _mm_load_sd( &A1[0+bs*1] );
1342 a_0 = _mm_mul_pd( alpha_0, a_0 );
1343 _mm_store_sd( &B[2+bs*1], a_0 );
1344
1345 }
1346
1347
1348 }
1349
1350
1351
1352// both A and B are aligned to 256-bit boundaries, 3 elements of A must be skipped
1353void kernel_dgecp_3_3_lib4(int tri, int kmax, double alpha, double *A0, int sda, double *B)
1354 {
1355
1356 if(tri==1)
1357 {
1358 // A and C are lower triangular
1359 // kmax+1 3-wide + end 2x2 triangle
1360
1361 kmax += 1;
1362 }
1363
1364 if(kmax<=0)
1365 return;
1366
1367 const int bs = 4;
1368
1369 double *A1 = A0 + bs*sda;
1370
1371 __m128d
1372 alpha_0,
1373 a_0, a_1;
1374
1375 int k;
1376
1377 alpha_0 = _mm_loaddup_pd( &alpha );
1378
1379 for(k=0; k<kmax-3; k+=4)
1380 {
1381
1382 a_0 = _mm_load_sd( &A0[3+bs*0] );
1383 a_0 = _mm_mul_pd( alpha_0, a_0 );
1384 _mm_store_sd( &B[0+bs*0], a_0 );
1385 a_1 = _mm_loadu_pd( &A1[0+bs*0] );
1386 a_1 = _mm_mul_pd( alpha_0, a_1 );
1387 _mm_storeu_pd( &B[1+bs*0], a_1 );
1388
1389 a_0 = _mm_load_sd( &A0[3+bs*1] );
1390 a_0 = _mm_mul_pd( alpha_0, a_0 );
1391 _mm_store_sd( &B[0+bs*1], a_0 );
1392 a_1 = _mm_loadu_pd( &A1[0+bs*1] );
1393 a_1 = _mm_mul_pd( alpha_0, a_1 );
1394 _mm_storeu_pd( &B[1+bs*1], a_1 );
1395
1396 a_0 = _mm_load_sd( &A0[3+bs*2] );
1397 a_0 = _mm_mul_pd( alpha_0, a_0 );
1398 _mm_store_sd( &B[0+bs*2], a_0 );
1399 a_1 = _mm_loadu_pd( &A1[0+bs*2] );
1400 a_1 = _mm_mul_pd( alpha_0, a_1 );
1401 _mm_storeu_pd( &B[1+bs*2], a_1 );
1402
1403 a_0 = _mm_load_sd( &A0[3+bs*3] );
1404 a_0 = _mm_mul_pd( alpha_0, a_0 );
1405 _mm_store_sd( &B[0+bs*3], a_0 );
1406 a_1 = _mm_loadu_pd( &A1[0+bs*3] );
1407 a_1 = _mm_mul_pd( alpha_0, a_1 );
1408 _mm_storeu_pd( &B[1+bs*3], a_1 );
1409
1410 A0 += 16;
1411 A1 += 16;
1412 B += 16;
1413
1414 }
1415 for(; k<kmax; k++)
1416 {
1417
1418 a_0 = _mm_load_sd( &A0[3+bs*0] );
1419 a_0 = _mm_mul_pd( alpha_0, a_0 );
1420 _mm_store_sd( &B[0+bs*0], a_0 );
1421 a_1 = _mm_loadu_pd( &A1[0+bs*0] );
1422 a_1 = _mm_mul_pd( alpha_0, a_1 );
1423 _mm_storeu_pd( &B[1+bs*0], a_1 );
1424
1425 A0 += 4;
1426 A1 += 4;
1427 B += 4;
1428
1429 }
1430
1431 if(tri==1)
1432 {
1433 // 2x2 triangle
1434
1435 a_0 = _mm_loadu_pd( &A1[0+bs*0] );
1436 a_0 = _mm_mul_pd( alpha_0, a_0 );
1437 _mm_storeu_pd( &B[1+bs*0], a_0 );
1438
1439 a_0 = _mm_load_sd( &A1[1+bs*1] );
1440 a_0 = _mm_mul_pd( alpha_0, a_0 );
1441 _mm_store_sd( &B[2+bs*1], a_0 );
1442
1443 }
1444
1445 }
1446
1447
1448
1449// both A and B are aligned to 64-bit boundaries
1450void kernel_dgecp_2_0_lib4(int tri, int kmax, double alpha, double *A, double *B)
1451 {
1452
1453 if(tri==1)
1454 {
1455 // A and C are lower triangular
1456 // kmax+1 2-wide + end 1x1 triangle
1457
1458 kmax += 1;
1459 }
1460
1461 if(kmax<=0)
1462 return;
1463
1464 const int bs = 4;
1465
1466 __m128d
1467 alpha_0,
1468 a_0;
1469
1470 int k;
1471
1472 alpha_0 = _mm_loaddup_pd( &alpha );
1473
1474 for(k=0; k<kmax-3; k+=4)
1475 {
1476
1477 a_0 = _mm_loadu_pd( &A[0+bs*0] );
1478 a_0 = _mm_mul_pd( alpha_0, a_0 );
1479 _mm_storeu_pd( &B[0+bs*0], a_0 );
1480
1481 a_0 = _mm_loadu_pd( &A[0+bs*1] );
1482 a_0 = _mm_mul_pd( alpha_0, a_0 );
1483 _mm_storeu_pd( &B[0+bs*1], a_0 );
1484
1485 a_0 = _mm_loadu_pd( &A[0+bs*2] );
1486 a_0 = _mm_mul_pd( alpha_0, a_0 );
1487 _mm_storeu_pd( &B[0+bs*2], a_0 );
1488
1489 a_0 = _mm_loadu_pd( &A[0+bs*3] );
1490 a_0 = _mm_mul_pd( alpha_0, a_0 );
1491 _mm_storeu_pd( &B[0+bs*3], a_0 );
1492
1493 A += 16;
1494 B += 16;
1495
1496 }
1497 for(; k<kmax; k++)
1498 {
1499
1500 a_0 = _mm_loadu_pd( &A[0+bs*0] );
1501 a_0 = _mm_mul_pd( alpha_0, a_0 );
1502 _mm_storeu_pd( &B[0+bs*0], a_0 );
1503
1504 A += 4;
1505 B += 4;
1506
1507 }
1508
1509 if(tri==1)
1510 {
1511 // 1x1 triangle
1512
1513 a_0 = _mm_load_sd( &A[1+bs*0] );
1514 a_0 = _mm_mul_pd( alpha_0, a_0 );
1515 _mm_store_sd( &B[1+bs*0], a_0 );
1516
1517 }
1518
1519 }
1520
1521
1522
1523// both A and B are aligned to 128-bit boundaries, 3 elements of A must be skipped
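// (here the two rows of each source column straddle a panel boundary: the low
// half of the register is filled with _mm_load_sd from A0 and the high half
// with _mm_loadh_pd from A1 before scaling and storing)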
1524void kernel_dgecp_2_3_lib4(int tri, int kmax, double alpha, double *A0, int sda, double *B)
1525 {
1526
1527 if(tri==1)
1528 {
1529 // A and C are lower triangular
1530 // kmax+1 2-wide + end 1x1 triangle
1531
1532 kmax += 1;
1533 }
1534
1535 if(kmax<=0)
1536 return;
1537
1538 const int bs = 4;
1539
1540 double *A1 = A0 + bs*sda;
1541
1542 __m128d
1543 alpha_0,
1544 a_0;
1545
1546 int k;
1547
1548 alpha_0 = _mm_loaddup_pd( &alpha );
1549
1550 for(k=0; k<kmax-3; k+=4)
1551 {
1552
1553 a_0 = _mm_load_sd( &A0[3+bs*0] );
1554 a_0 = _mm_loadh_pd( a_0, &A1[0+bs*0] );
1555 a_0 = _mm_mul_pd( alpha_0, a_0 );
1556 _mm_storeu_pd( &B[0+bs*0], a_0 );
1557
1558 a_0 = _mm_load_sd( &A0[3+bs*1] );
1559 a_0 = _mm_loadh_pd( a_0, &A1[0+bs*1] );
1560 a_0 = _mm_mul_pd( alpha_0, a_0 );
1561 _mm_storeu_pd( &B[0+bs*1], a_0 );
1562
1563 a_0 = _mm_load_sd( &A0[3+bs*2] );
1564 a_0 = _mm_loadh_pd( a_0, &A1[0+bs*2] );
1565 a_0 = _mm_mul_pd( alpha_0, a_0 );
1566 _mm_storeu_pd( &B[0+bs*2], a_0 );
1567
1568 a_0 = _mm_load_sd( &A0[3+bs*3] );
1569 a_0 = _mm_loadh_pd( a_0, &A1[0+bs*3] );
1570 a_0 = _mm_mul_pd( alpha_0, a_0 );
1571 _mm_storeu_pd( &B[0+bs*3], a_0 );
1572
1573 A0 += 16;
1574 A1 += 16;
1575 B += 16;
1576
1577 }
1578 for(; k<kmax; k++)
1579 {
1580
1581 a_0 = _mm_load_sd( &A0[3+bs*0] );
1582 a_0 = _mm_loadh_pd( a_0, &A1[0+bs*0] );
1583 a_0 = _mm_mul_pd( alpha_0, a_0 );
1584 _mm_storeu_pd( &B[0+bs*0], a_0 );
1585
1586 A0 += 4;
1587 A1 += 4;
1588 B += 4;
1589
1590 }
1591
1592 if(tri==1)
1593 {
1594 // 1x1 triangle
1595
1596 a_0 = _mm_load_sd( &A1[0+bs*0] );
1597 a_0 = _mm_mul_pd( alpha_0, a_0 );
1598 _mm_store_sd( &B[1+bs*0], a_0 );
1599
1600 }
1601
1602 }
1603
1604
1605
// both A and B are aligned to 64-bit boundaries
1607void kernel_dgecp_1_0_lib4(int tri, int kmax, double alpha, double *A, double *B)
1608 {
1609
1610 if(tri==1)
1611 {
1612 // A and C are lower triangular
1613 // kmax+1 1-wide
1614
1615 kmax += 1;
1616 }
1617
1618 if(kmax<=0)
1619 return;
1620
1621 const int bs = 4;
1622
1623 __m128d
1624 alpha_0,
1625 a_0;
1626
1627 int k;
1628
1629 alpha_0 = _mm_loaddup_pd( &alpha );
1630
1631 for(k=0; k<kmax-3; k+=4)
1632 {
1633
1634 a_0 = _mm_load_sd( &A[0+bs*0] );
1635 a_0 = _mm_mul_pd( alpha_0, a_0 );
1636 _mm_store_sd( &B[0+bs*0], a_0 );
1637
1638 a_0 = _mm_load_sd( &A[0+bs*1] );
1639 a_0 = _mm_mul_pd( alpha_0, a_0 );
1640 _mm_store_sd( &B[0+bs*1], a_0 );
1641
1642 a_0 = _mm_load_sd( &A[0+bs*2] );
1643 a_0 = _mm_mul_pd( alpha_0, a_0 );
1644 _mm_store_sd( &B[0+bs*2], a_0 );
1645
1646 a_0 = _mm_load_sd( &A[0+bs*3] );
1647 a_0 = _mm_mul_pd( alpha_0, a_0 );
1648 _mm_store_sd( &B[0+bs*3], a_0 );
1649
1650 A += 16;
1651 B += 16;
1652
1653 }
1654 for(; k<kmax; k++)
1655 {
1656
1657 a_0 = _mm_load_sd( &A[0+bs*0] );
1658 a_0 = _mm_mul_pd( alpha_0, a_0 );
1659 _mm_store_sd( &B[0+bs*0], a_0 );
1660
1661 A += 4;
1662 B += 4;
1663
1664 }
1665
1666 }
1667
1668
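// How the copy kernels above fit together (an added sketch, not BLASFEO's
// actual dgecp driver; the function name below is hypothetical): the _N_O
// suffix encodes N destination rows and a source offset of O rows within its
// 4-row panel, so a caller that knows the offset can dispatch, for example, as
void dgecp_8_rows_sketch(int offset, int kmax, double alpha, double *pA, int sda, double *pB, int sdb)
	{
	if(offset==0)
		kernel_dgecp_8_0_lib4(0, kmax, alpha, pA, sda, pB, sdb);
	else if(offset==1)
		kernel_dgecp_8_1_lib4(0, kmax, alpha, pA, sda, pB, sdb);
	else if(offset==2)
		kernel_dgecp_8_2_lib4(0, kmax, alpha, pA, sda, pB, sdb);
	else
		kernel_dgecp_8_3_lib4(0, kmax, alpha, pA, sda, pB, sdb);
	}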
1669
1670
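// The kernel_dgead_* kernels below differ from kernel_dgecp_* in that the
// scaled source is accumulated into the destination (B += alpha*A) rather
// than overwriting it, and no triangular (tri) mode is provided. As a plain-C
// reference for the aligned 4-row case (an added illustration; the _ref
// function is not part of BLASFEO and is not used by the library):
void kernel_dgead_4_0_lib4_ref(int kmax, double alpha, double *A, double *B)
	{
	const int bs = 4;
	int ii, jj;
	for(jj=0; jj<kmax; jj++)
		{
		for(ii=0; ii<4; ii++)
			{
			B[ii+bs*jj] += alpha * A[ii+bs*jj];
			}
		}
	}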
1671// both A and B are aligned to 256-bit boundaries
1672void kernel_dgead_8_0_lib4(int kmax, double alpha, double *A0, int sda, double *B0, int sdb)
1673 {
1674
1675 if(kmax<=0)
1676 return;
1677
1678 const int bs = 4;
1679
1680 double *A1 = A0 + bs*sda;
1681 double *B1 = B0 + bs*sdb;
1682
1683 __m256d
1684 a_0, c_0, alpha_0;
1685
1686 int k;
1687
1688 alpha_0 = _mm256_broadcast_sd( &alpha );
1689
1690 for(k=0; k<kmax-3; k+=4)
1691 {
1692
1693 a_0 = _mm256_load_pd( &A0[0+bs*0] );
1694 c_0 = _mm256_load_pd( &B0[0+bs*0] );
1695 a_0 = _mm256_mul_pd( alpha_0, a_0 );
1696 a_0 = _mm256_add_pd( a_0, c_0 );
1697 _mm256_store_pd( &B0[0+bs*0], a_0 );
1698
1699 a_0 = _mm256_load_pd( &A0[0+bs*1] );
1700 c_0 = _mm256_load_pd( &B0[0+bs*1] );
1701 a_0 = _mm256_mul_pd( alpha_0, a_0 );
1702 a_0 = _mm256_add_pd( a_0, c_0 );
1703 _mm256_store_pd( &B0[0+bs*1], a_0 );
1704
1705 a_0 = _mm256_load_pd( &A0[0+bs*2] );
1706 c_0 = _mm256_load_pd( &B0[0+bs*2] );
1707 a_0 = _mm256_mul_pd( alpha_0, a_0 );
1708 a_0 = _mm256_add_pd( a_0, c_0 );
1709 _mm256_store_pd( &B0[0+bs*2], a_0 );
1710
1711 a_0 = _mm256_load_pd( &A0[0+bs*3] );
1712 c_0 = _mm256_load_pd( &B0[0+bs*3] );
1713 a_0 = _mm256_mul_pd( alpha_0, a_0 );
1714 a_0 = _mm256_add_pd( a_0, c_0 );
1715 _mm256_store_pd( &B0[0+bs*3], a_0 );
1716
1717 A0 += 16;
1718 B0 += 16;
1719
1720 a_0 = _mm256_load_pd( &A1[0+bs*0] );
1721 c_0 = _mm256_load_pd( &B1[0+bs*0] );
1722 a_0 = _mm256_mul_pd( alpha_0, a_0 );
1723 a_0 = _mm256_add_pd( a_0, c_0 );
1724 _mm256_store_pd( &B1[0+bs*0], a_0 );
1725
1726 a_0 = _mm256_load_pd( &A1[0+bs*1] );
1727 c_0 = _mm256_load_pd( &B1[0+bs*1] );
1728 a_0 = _mm256_mul_pd( alpha_0, a_0 );
1729 a_0 = _mm256_add_pd( a_0, c_0 );
1730 _mm256_store_pd( &B1[0+bs*1], a_0 );
1731
1732 a_0 = _mm256_load_pd( &A1[0+bs*2] );
1733 c_0 = _mm256_load_pd( &B1[0+bs*2] );
1734 a_0 = _mm256_mul_pd( alpha_0, a_0 );
1735 a_0 = _mm256_add_pd( a_0, c_0 );
1736 _mm256_store_pd( &B1[0+bs*2], a_0 );
1737
1738 a_0 = _mm256_load_pd( &A1[0+bs*3] );
1739 c_0 = _mm256_load_pd( &B1[0+bs*3] );
1740 a_0 = _mm256_mul_pd( alpha_0, a_0 );
1741 a_0 = _mm256_add_pd( a_0, c_0 );
1742 _mm256_store_pd( &B1[0+bs*3], a_0 );
1743
1744 A1 += 16;
1745 B1 += 16;
1746
1747 }
1748 for(; k<kmax; k++)
1749 {
1750
1751 a_0 = _mm256_load_pd( &A0[0+bs*0] );
1752 c_0 = _mm256_load_pd( &B0[0+bs*0] );
1753 a_0 = _mm256_mul_pd( alpha_0, a_0 );
1754 a_0 = _mm256_add_pd( a_0, c_0 );
1755 _mm256_store_pd( &B0[0+bs*0], a_0 );
1756
1757 A0 += 4;
1758 B0 += 4;
1759
1760 a_0 = _mm256_load_pd( &A1[0+bs*0] );
1761 c_0 = _mm256_load_pd( &B1[0+bs*0] );
1762 a_0 = _mm256_mul_pd( alpha_0, a_0 );
1763 a_0 = _mm256_add_pd( a_0, c_0 );
1764 _mm256_store_pd( &B1[0+bs*0], a_0 );
1765
1766 A1 += 4;
1767 B1 += 4;
1768
1769 }
1770
1771 }
1772
1773
1774
1775// both A and B are aligned to 256-bit boundaries, 1 element of A must be skipped
1776void kernel_dgead_8_1_lib4(int kmax, double alpha, double *A0, int sda, double *B0, int sdb)
1777 {
1778
1779 if(kmax<=0)
1780 return;
1781
1782 const int bs = 4;
1783
1784 double *A1 = A0 + bs*sda;
1785 double *A2 = A1 + bs*sda;
1786 double *B1 = B0 + bs*sdb;
1787
1788 __m256d
1789 a_0, a_1, a_2,
1790 b_0, b_1,
1791 alpha_0, c_0, c_1;
1792
1793 int k;
1794
1795 alpha_0 = _mm256_broadcast_sd( &alpha );
1796
1797 for(k=0; k<kmax-3; k+=4)
1798 {
1799
1800 a_2 = _mm256_load_pd( &A2[0+bs*0] );
1801 a_1 = _mm256_load_pd( &A1[0+bs*0] );
1802 a_0 = _mm256_load_pd( &A0[0+bs*0] );
1803 a_2 = _mm256_permute2f128_pd( a_1, a_2, 0x21 );
1804 b_0 = _mm256_permute2f128_pd( a_0, a_1, 0x21 );
1805 b_1 = _mm256_shuffle_pd( a_1, a_2, 0x5 );
1806 b_0 = _mm256_shuffle_pd( a_0, b_0, 0x5 );
1807 c_1 = _mm256_load_pd( &B1[0+bs*0] );
1808 c_0 = _mm256_load_pd( &B0[0+bs*0] );
1809 b_1 = _mm256_mul_pd( alpha_0, b_1 );
1810 b_0 = _mm256_mul_pd( alpha_0, b_0 );
1811 b_1 = _mm256_add_pd ( c_1, b_1 );
1812 b_0 = _mm256_add_pd ( c_0, b_0 );
1813 _mm256_store_pd( &B1[0+bs*0], b_1 );
1814 _mm256_store_pd( &B0[0+bs*0], b_0 );
1815
1816 a_2 = _mm256_load_pd( &A2[0+bs*1] );
1817 a_1 = _mm256_load_pd( &A1[0+bs*1] );
1818 a_0 = _mm256_load_pd( &A0[0+bs*1] );
1819 a_2 = _mm256_permute2f128_pd( a_1, a_2, 0x21 );
1820 b_0 = _mm256_permute2f128_pd( a_0, a_1, 0x21 );
1821 b_1 = _mm256_shuffle_pd( a_1, a_2, 0x5 );
1822 b_0 = _mm256_shuffle_pd( a_0, b_0, 0x5 );
1823 c_1 = _mm256_load_pd( &B1[0+bs*1] );
1824 c_0 = _mm256_load_pd( &B0[0+bs*1] );
1825 b_1 = _mm256_mul_pd( alpha_0, b_1 );
1826 b_0 = _mm256_mul_pd( alpha_0, b_0 );
1827 b_1 = _mm256_add_pd ( c_1, b_1 );
1828 b_0 = _mm256_add_pd ( c_0, b_0 );
1829 _mm256_store_pd( &B1[0+bs*1], b_1 );
1830 _mm256_store_pd( &B0[0+bs*1], b_0 );
1831
1832 a_2 = _mm256_load_pd( &A2[0+bs*2] );
1833 a_1 = _mm256_load_pd( &A1[0+bs*2] );
1834 a_0 = _mm256_load_pd( &A0[0+bs*2] );
1835 a_2 = _mm256_permute2f128_pd( a_1, a_2, 0x21 );
1836 b_0 = _mm256_permute2f128_pd( a_0, a_1, 0x21 );
1837 b_1 = _mm256_shuffle_pd( a_1, a_2, 0x5 );
1838 b_0 = _mm256_shuffle_pd( a_0, b_0, 0x5 );
1839 c_1 = _mm256_load_pd( &B1[0+bs*2] );
1840 c_0 = _mm256_load_pd( &B0[0+bs*2] );
1841 b_1 = _mm256_mul_pd( alpha_0, b_1 );
1842 b_0 = _mm256_mul_pd( alpha_0, b_0 );
1843 b_1 = _mm256_add_pd ( c_1, b_1 );
1844 b_0 = _mm256_add_pd ( c_0, b_0 );
1845 _mm256_store_pd( &B1[0+bs*2], b_1 );
1846 _mm256_store_pd( &B0[0+bs*2], b_0 );
1847
1848 a_2 = _mm256_load_pd( &A2[0+bs*3] );
1849 a_1 = _mm256_load_pd( &A1[0+bs*3] );
1850 a_0 = _mm256_load_pd( &A0[0+bs*3] );
1851 a_2 = _mm256_permute2f128_pd( a_1, a_2, 0x21 );
1852 b_0 = _mm256_permute2f128_pd( a_0, a_1, 0x21 );
1853 b_1 = _mm256_shuffle_pd( a_1, a_2, 0x5 );
1854 b_0 = _mm256_shuffle_pd( a_0, b_0, 0x5 );
1855 c_1 = _mm256_load_pd( &B1[0+bs*3] );
1856 c_0 = _mm256_load_pd( &B0[0+bs*3] );
1857 b_1 = _mm256_mul_pd( alpha_0, b_1 );
1858 b_0 = _mm256_mul_pd( alpha_0, b_0 );
1859 b_1 = _mm256_add_pd ( c_1, b_1 );
1860 b_0 = _mm256_add_pd ( c_0, b_0 );
1861 _mm256_store_pd( &B1[0+bs*3], b_1 );
1862 _mm256_store_pd( &B0[0+bs*3], b_0 );
1863
1864 A0 += 16;
1865 A1 += 16;
1866 A2 += 16;
1867 B0 += 16;
1868 B1 += 16;
1869
1870 }
1871 for(; k<kmax; k++)
1872 {
1873
1874 a_2 = _mm256_load_pd( &A2[0+bs*0] );
1875 a_1 = _mm256_load_pd( &A1[0+bs*0] );
1876 a_0 = _mm256_load_pd( &A0[0+bs*0] );
1877 a_2 = _mm256_permute2f128_pd( a_1, a_2, 0x21 );
1878 b_0 = _mm256_permute2f128_pd( a_0, a_1, 0x21 );
1879 b_1 = _mm256_shuffle_pd( a_1, a_2, 0x5 );
1880 b_0 = _mm256_shuffle_pd( a_0, b_0, 0x5 );
1881 c_1 = _mm256_load_pd( &B1[0+bs*0] );
1882 c_0 = _mm256_load_pd( &B0[0+bs*0] );
1883 b_1 = _mm256_mul_pd( alpha_0, b_1 );
1884 b_0 = _mm256_mul_pd( alpha_0, b_0 );
1885 b_1 = _mm256_add_pd ( c_1, b_1 );
1886 b_0 = _mm256_add_pd ( c_0, b_0 );
1887 _mm256_store_pd( &B1[0+bs*0], b_1 );
1888 _mm256_store_pd( &B0[0+bs*0], b_0 );
1889
1890 A0 += 4;
1891 A1 += 4;
1892 A2 += 4;
1893 B0 += 4;
1894 B1 += 4;
1895
1896 }
1897
1898 }
1899
1900
1901
1902// both A and B are aligned to 256-bit boundaries, 2 elements of A must be skipped
1903void kernel_dgead_8_2_lib4(int kmax, double alpha, double *A0, int sda, double *B0, int sdb)
1904 {
1905
1906 if(kmax<=0)
1907 return;
1908
1909 const int bs = 4;
1910
1911 double *A1 = A0 + bs*sda;
1912 double *A2 = A1 + bs*sda;
1913 double *B1 = B0 + bs*sdb;
1914
1915 __m256d
1916 a_0, a_1, a_2,
1917 b_0, b_1,
1918 alpha_0, c_0, c_1;
1919
1920 int k;
1921
1922 alpha_0 = _mm256_broadcast_sd( &alpha );
1923
1924 for(k=0; k<kmax-3; k+=4)
1925 {
1926
1927 a_0 = _mm256_load_pd( &A0[0+bs*0] );
1928 a_1 = _mm256_load_pd( &A1[0+bs*0] );
1929 a_2 = _mm256_load_pd( &A2[0+bs*0] );
1930 b_0 = _mm256_permute2f128_pd( a_0, a_1, 0x21 );
1931 b_1 = _mm256_permute2f128_pd( a_1, a_2, 0x21 );
1932 c_0 = _mm256_load_pd( &B0[0+bs*0] );
1933 c_1 = _mm256_load_pd( &B1[0+bs*0] );
1934 b_0 = _mm256_mul_pd( alpha_0, b_0 );
1935 b_1 = _mm256_mul_pd( alpha_0, b_1 );
1936 b_0 = _mm256_add_pd ( c_0, b_0 );
1937 b_1 = _mm256_add_pd ( c_1, b_1 );
1938 _mm256_store_pd( &B0[0+bs*0], b_0 );
1939 _mm256_store_pd( &B1[0+bs*0], b_1 );
1940
1941 a_0 = _mm256_load_pd( &A0[0+bs*1] );
1942 a_1 = _mm256_load_pd( &A1[0+bs*1] );
1943 a_2 = _mm256_load_pd( &A2[0+bs*1] );
1944 b_0 = _mm256_permute2f128_pd( a_0, a_1, 0x21 );
1945 b_1 = _mm256_permute2f128_pd( a_1, a_2, 0x21 );
1946 c_0 = _mm256_load_pd( &B0[0+bs*1] );
1947 c_1 = _mm256_load_pd( &B1[0+bs*1] );
1948 b_0 = _mm256_mul_pd( alpha_0, b_0 );
1949 b_1 = _mm256_mul_pd( alpha_0, b_1 );
1950 b_0 = _mm256_add_pd ( c_0, b_0 );
1951 b_1 = _mm256_add_pd ( c_1, b_1 );
1952 _mm256_store_pd( &B0[0+bs*1], b_0 );
1953 _mm256_store_pd( &B1[0+bs*1], b_1 );
1954
1955 a_0 = _mm256_load_pd( &A0[0+bs*2] );
1956 a_1 = _mm256_load_pd( &A1[0+bs*2] );
1957 a_2 = _mm256_load_pd( &A2[0+bs*2] );
1958 b_0 = _mm256_permute2f128_pd( a_0, a_1, 0x21 );
1959 b_1 = _mm256_permute2f128_pd( a_1, a_2, 0x21 );
1960 c_0 = _mm256_load_pd( &B0[0+bs*2] );
1961 c_1 = _mm256_load_pd( &B1[0+bs*2] );
1962 b_0 = _mm256_mul_pd( alpha_0, b_0 );
1963 b_1 = _mm256_mul_pd( alpha_0, b_1 );
1964 b_0 = _mm256_add_pd ( c_0, b_0 );
1965 b_1 = _mm256_add_pd ( c_1, b_1 );
1966 _mm256_store_pd( &B0[0+bs*2], b_0 );
1967 _mm256_store_pd( &B1[0+bs*2], b_1 );
1968
1969 a_0 = _mm256_load_pd( &A0[0+bs*3] );
1970 a_1 = _mm256_load_pd( &A1[0+bs*3] );
1971 a_2 = _mm256_load_pd( &A2[0+bs*3] );
1972 b_0 = _mm256_permute2f128_pd( a_0, a_1, 0x21 );
1973 b_1 = _mm256_permute2f128_pd( a_1, a_2, 0x21 );
1974 c_0 = _mm256_load_pd( &B0[0+bs*3] );
1975 c_1 = _mm256_load_pd( &B1[0+bs*3] );
1976 b_0 = _mm256_mul_pd( alpha_0, b_0 );
1977 b_1 = _mm256_mul_pd( alpha_0, b_1 );
1978 b_0 = _mm256_add_pd ( c_0, b_0 );
1979 b_1 = _mm256_add_pd ( c_1, b_1 );
1980 _mm256_store_pd( &B0[0+bs*3], b_0 );
1981 _mm256_store_pd( &B1[0+bs*3], b_1 );
1982
1983 A0 += 16;
1984 A1 += 16;
1985 A2 += 16;
1986 B0 += 16;
1987 B1 += 16;
1988
1989 }
1990 for(; k<kmax; k++)
1991 {
1992
1993 a_0 = _mm256_load_pd( &A0[0+bs*0] );
1994 a_1 = _mm256_load_pd( &A1[0+bs*0] );
1995 a_2 = _mm256_load_pd( &A2[0+bs*0] );
1996 b_0 = _mm256_permute2f128_pd( a_0, a_1, 0x21 );
1997 b_1 = _mm256_permute2f128_pd( a_1, a_2, 0x21 );
1998 c_0 = _mm256_load_pd( &B0[0+bs*0] );
1999 c_1 = _mm256_load_pd( &B1[0+bs*0] );
2000 b_0 = _mm256_mul_pd( alpha_0, b_0 );
2001 b_1 = _mm256_mul_pd( alpha_0, b_1 );
2002 b_0 = _mm256_add_pd ( c_0, b_0 );
2003 b_1 = _mm256_add_pd ( c_1, b_1 );
2004 _mm256_store_pd( &B0[0+bs*0], b_0 );
2005 _mm256_store_pd( &B1[0+bs*0], b_1 );
2006
2007 A0 += 4;
2008 A1 += 4;
2009 A2 += 4;
2010 B0 += 4;
2011 B1 += 4;
2012
2013 }
2014
2015 }
2016
2017
2018
2019// both A and B are aligned to 256-bit boundaries, 3 elements of A must be skipped
2020void kernel_dgead_8_3_lib4(int kmax, double alpha, double *A0, int sda, double *B0, int sdb)
2021 {
2022
2023 if(kmax<=0)
2024 return;
2025
2026 const int bs = 4;
2027
2028 double *A1 = A0 + bs*sda;
2029 double *A2 = A1 + bs*sda;
2030 double *B1 = B0 + bs*sdb;
2031
2032 __m256d
2033 a_0, a_1, a_2,
2034 b_0, b_1,
2035 alpha_0, c_0, c_1;
2036
2037 int k;
2038
2039 alpha_0 = _mm256_broadcast_sd( &alpha );
2040
2041 for(k=0; k<kmax-3; k+=4)
2042 {
2043
2044 a_0 = _mm256_load_pd( &A0[0+bs*0] );
2045 a_1 = _mm256_load_pd( &A1[0+bs*0] );
2046 a_2 = _mm256_load_pd( &A2[0+bs*0] );
2047 a_0 = _mm256_permute2f128_pd( a_0, a_1, 0x21 );
2048 b_1 = _mm256_permute2f128_pd( a_1, a_2, 0x21 );
2049 b_0 = _mm256_shuffle_pd( a_0, a_1, 0x5 );
2050 b_1 = _mm256_shuffle_pd( b_1, a_2, 0x5 );
2051 c_0 = _mm256_load_pd( &B0[0+bs*0] );
2052 c_1 = _mm256_load_pd( &B1[0+bs*0] );
2053 b_0 = _mm256_mul_pd( alpha_0, b_0 );
2054 b_1 = _mm256_mul_pd( alpha_0, b_1 );
2055 b_0 = _mm256_add_pd ( c_0, b_0 );
2056 b_1 = _mm256_add_pd ( c_1, b_1 );
2057 _mm256_store_pd( &B0[0+bs*0], b_0 );
2058 _mm256_store_pd( &B1[0+bs*0], b_1 );
2059
2060 a_0 = _mm256_load_pd( &A0[0+bs*1] );
2061 a_1 = _mm256_load_pd( &A1[0+bs*1] );
2062 a_2 = _mm256_load_pd( &A2[0+bs*1] );
2063 a_0 = _mm256_permute2f128_pd( a_0, a_1, 0x21 );
2064 b_1 = _mm256_permute2f128_pd( a_1, a_2, 0x21 );
2065 b_0 = _mm256_shuffle_pd( a_0, a_1, 0x5 );
2066 b_1 = _mm256_shuffle_pd( b_1, a_2, 0x5 );
2067 c_0 = _mm256_load_pd( &B0[0+bs*1] );
2068 c_1 = _mm256_load_pd( &B1[0+bs*1] );
2069 b_0 = _mm256_mul_pd( alpha_0, b_0 );
2070 b_1 = _mm256_mul_pd( alpha_0, b_1 );
2071 b_0 = _mm256_add_pd ( c_0, b_0 );
2072 b_1 = _mm256_add_pd ( c_1, b_1 );
2073 _mm256_store_pd( &B0[0+bs*1], b_0 );
2074 _mm256_store_pd( &B1[0+bs*1], b_1 );
2075
2076 a_0 = _mm256_load_pd( &A0[0+bs*2] );
2077 a_1 = _mm256_load_pd( &A1[0+bs*2] );
2078 a_2 = _mm256_load_pd( &A2[0+bs*2] );
2079 a_0 = _mm256_permute2f128_pd( a_0, a_1, 0x21 );
2080 b_1 = _mm256_permute2f128_pd( a_1, a_2, 0x21 );
2081 b_0 = _mm256_shuffle_pd( a_0, a_1, 0x5 );
2082 b_1 = _mm256_shuffle_pd( b_1, a_2, 0x5 );
2083 c_0 = _mm256_load_pd( &B0[0+bs*2] );
2084 c_1 = _mm256_load_pd( &B1[0+bs*2] );
2085 b_0 = _mm256_mul_pd( alpha_0, b_0 );
2086 b_1 = _mm256_mul_pd( alpha_0, b_1 );
2087 b_0 = _mm256_add_pd ( c_0, b_0 );
2088 b_1 = _mm256_add_pd ( c_1, b_1 );
2089 _mm256_store_pd( &B0[0+bs*2], b_0 );
2090 _mm256_store_pd( &B1[0+bs*2], b_1 );
2091
2092 a_0 = _mm256_load_pd( &A0[0+bs*3] );
2093 a_1 = _mm256_load_pd( &A1[0+bs*3] );
2094 a_2 = _mm256_load_pd( &A2[0+bs*3] );
2095 a_0 = _mm256_permute2f128_pd( a_0, a_1, 0x21 );
2096 b_1 = _mm256_permute2f128_pd( a_1, a_2, 0x21 );
2097 b_0 = _mm256_shuffle_pd( a_0, a_1, 0x5 );
2098 b_1 = _mm256_shuffle_pd( b_1, a_2, 0x5 );
2099 c_0 = _mm256_load_pd( &B0[0+bs*3] );
2100 c_1 = _mm256_load_pd( &B1[0+bs*3] );
2101 b_0 = _mm256_mul_pd( alpha_0, b_0 );
2102 b_1 = _mm256_mul_pd( alpha_0, b_1 );
2103 b_0 = _mm256_add_pd ( c_0, b_0 );
2104 b_1 = _mm256_add_pd ( c_1, b_1 );
2105 _mm256_store_pd( &B0[0+bs*3], b_0 );
2106 _mm256_store_pd( &B1[0+bs*3], b_1 );
2107
2108 A0 += 16;
2109 A1 += 16;
2110 A2 += 16;
2111 B0 += 16;
2112 B1 += 16;
2113
2114 }
2115 for(; k<kmax; k++)
2116 {
2117
2118 a_0 = _mm256_load_pd( &A0[0+bs*0] );
2119 a_1 = _mm256_load_pd( &A1[0+bs*0] );
2120 a_2 = _mm256_load_pd( &A2[0+bs*0] );
2121 a_0 = _mm256_permute2f128_pd( a_0, a_1, 0x21 );
2122 b_1 = _mm256_permute2f128_pd( a_1, a_2, 0x21 );
2123 b_0 = _mm256_shuffle_pd( a_0, a_1, 0x5 );
2124 b_1 = _mm256_shuffle_pd( b_1, a_2, 0x5 );
2125 c_0 = _mm256_load_pd( &B0[0+bs*0] );
2126 c_1 = _mm256_load_pd( &B1[0+bs*0] );
2127 b_0 = _mm256_mul_pd( alpha_0, b_0 );
2128 b_1 = _mm256_mul_pd( alpha_0, b_1 );
2129 b_0 = _mm256_add_pd ( c_0, b_0 );
2130 b_1 = _mm256_add_pd ( c_1, b_1 );
2131 _mm256_store_pd( &B0[0+bs*0], b_0 );
2132 _mm256_store_pd( &B1[0+bs*0], b_1 );
2133
2134 A0 += 4;
2135 A1 += 4;
2136 A2 += 4;
2137 B0 += 4;
2138 B1 += 4;
2139
2140 }
2141
2142 }
2143
2144
2145
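/*
 * scalar reference for the kernel_dgead_*_lib4 family below (illustrative sketch only,
 * guarded out of the build; the function name is an assumption of this comment and not
 * part of the BLASFEO API): each kernel adds alpha times an m-row slice of a panel-major
 * (bs=4) matrix A to B, i.e. B += alpha*A, over kmax columns. The sketch matches the
 * aligned (_0) variants exactly; the offset variants (_1, _2, _3) read their m rows
 * starting that many rows down in panel A0 and continue in the following panel.
 */
#if 0
static void kernel_dgead_m_0_ref(int m, int kmax, double alpha, const double *A, double *B)
	{
	const int bs = 4;
	int k, i;
	for(k=0; k<kmax; k++)
		for(i=0; i<m; i++)
			B[i+bs*k] += alpha * A[i+bs*k];
	}
#endif
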
2146// both A and B are aligned to 256-bit boundaries
2147void kernel_dgead_4_0_lib4(int kmax, double alpha, double *A, double *B)
2148 {
2149
2150 if(kmax<=0)
2151 return;
2152
2153 const int bs = 4;
2154
2155 __m256d
2156 a_0, c_0, alpha_0;
2157
2158 int k;
2159
2160 alpha_0 = _mm256_broadcast_sd( &alpha );
2161
2162 for(k=0; k<kmax-3; k+=4)
2163 {
2164
2165 a_0 = _mm256_load_pd( &A[0+bs*0] );
2166 c_0 = _mm256_load_pd( &B[0+bs*0] );
2167 a_0 = _mm256_mul_pd( alpha_0, a_0 );
2168 a_0 = _mm256_add_pd( c_0, a_0 );
2169 _mm256_store_pd( &B[0+bs*0], a_0 );
2170
2171 a_0 = _mm256_load_pd( &A[0+bs*1] );
2172 c_0 = _mm256_load_pd( &B[0+bs*1] );
2173 a_0 = _mm256_mul_pd( alpha_0, a_0 );
2174 a_0 = _mm256_add_pd( c_0, a_0 );
2175 _mm256_store_pd( &B[0+bs*1], a_0 );
2176
2177 a_0 = _mm256_load_pd( &A[0+bs*2] );
2178 c_0 = _mm256_load_pd( &B[0+bs*2] );
2179 a_0 = _mm256_mul_pd( alpha_0, a_0 );
2180 a_0 = _mm256_add_pd( c_0, a_0 );
2181 _mm256_store_pd( &B[0+bs*2], a_0 );
2182
2183 a_0 = _mm256_load_pd( &A[0+bs*3] );
2184 c_0 = _mm256_load_pd( &B[0+bs*3] );
2185 a_0 = _mm256_mul_pd( alpha_0, a_0 );
2186 a_0 = _mm256_add_pd( c_0, a_0 );
2187 _mm256_store_pd( &B[0+bs*3], a_0 );
2188
2189 A += 16;
2190 B += 16;
2191
2192 }
2193 for(; k<kmax; k++)
2194 {
2195
2196 a_0 = _mm256_load_pd( &A[0+bs*0] );
2197 c_0 = _mm256_load_pd( &B[0+bs*0] );
2198 a_0 = _mm256_mul_pd( alpha_0, a_0 );
2199 a_0 = _mm256_add_pd( c_0, a_0 );
2200 _mm256_store_pd( &B[0+bs*0], a_0 );
2201
2202 A += 4;
2203 B += 4;
2204
2205 }
2206
2207 }
2208
2209
2210
2211// both A and B are aligned to 256-bit boundaries, 1 element of A must be skipped
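// note: the 1-row offset is built per column in two steps: the lane swap
// _mm256_permute2f128_pd( a_0, a_1, 0x21 ) (stored back into a_1 below) gives
// { A0[2], A0[3], A1[0], A1[1] }, and the following _mm256_shuffle_pd( a_0, a_1, 0x5 )
// yields b_0 = { A0[1], A0[2], A0[3], A1[0] }, i.e. the column of A shifted down by
// one row across the panel boundary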
2212void kernel_dgead_4_1_lib4(int kmax, double alpha, double *A0, int sda, double *B)
2213 {
2214
2215 if(kmax<=0)
2216 return;
2217
2218 const int bs = 4;
2219
2220 double *A1 = A0 + bs*sda;
2221
2222 __m256d
2223 a_0, a_1,
2224 b_0,
2225 alpha_0, c_0;
2226
2227 int k;
2228
	alpha_0 = _mm256_broadcast_sd( &alpha );

2229 for(k=0; k<kmax-3; k+=4)
2230 {
2231
2232 a_0 = _mm256_load_pd( &A0[0+bs*0] );
2233 a_1 = _mm256_load_pd( &A1[0+bs*0] );
2234 a_1 = _mm256_permute2f128_pd( a_0, a_1, 0x21 );
2235 b_0 = _mm256_shuffle_pd( a_0, a_1, 0x5 );
2236 c_0 = _mm256_load_pd( &B[0+bs*0] );
2237 b_0 = _mm256_mul_pd( alpha_0, b_0 );
2238 b_0 = _mm256_add_pd( c_0, b_0 );
2239 _mm256_store_pd( &B[0+bs*0], b_0 );
2240
2241 a_0 = _mm256_load_pd( &A0[0+bs*1] );
2242 a_1 = _mm256_load_pd( &A1[0+bs*1] );
2243 a_1 = _mm256_permute2f128_pd( a_0, a_1, 0x21 );
2244 b_0 = _mm256_shuffle_pd( a_0, a_1, 0x5 );
2245 c_0 = _mm256_load_pd( &B[0+bs*1] );
2246 b_0 = _mm256_mul_pd( alpha_0, b_0 );
2247 b_0 = _mm256_add_pd( c_0, b_0 );
2248 _mm256_store_pd( &B[0+bs*1], b_0 );
2249
2250 a_0 = _mm256_load_pd( &A0[0+bs*2] );
2251 a_1 = _mm256_load_pd( &A1[0+bs*2] );
2252 a_1 = _mm256_permute2f128_pd( a_0, a_1, 0x21 );
2253 b_0 = _mm256_shuffle_pd( a_0, a_1, 0x5 );
2254 c_0 = _mm256_load_pd( &B[0+bs*2] );
2255 b_0 = _mm256_mul_pd( alpha_0, b_0 );
2256 b_0 = _mm256_add_pd( c_0, b_0 );
2257 _mm256_store_pd( &B[0+bs*2], b_0 );
2258
2259 a_0 = _mm256_load_pd( &A0[0+bs*3] );
2260 a_1 = _mm256_load_pd( &A1[0+bs*3] );
2261 a_1 = _mm256_permute2f128_pd( a_0, a_1, 0x21 );
2262 b_0 = _mm256_shuffle_pd( a_0, a_1, 0x5 );
2263 c_0 = _mm256_load_pd( &B[0+bs*3] );
2264 b_0 = _mm256_mul_pd( alpha_0, b_0 );
2265 b_0 = _mm256_add_pd( c_0, b_0 );
2266 _mm256_store_pd( &B[0+bs*3], b_0 );
2267
2268 A0 += 16;
2269 A1 += 16;
2270 B += 16;
2271
2272 }
2273 for(; k<kmax; k++)
2274 {
2275
2276 a_0 = _mm256_load_pd( &A0[0+bs*0] );
2277 a_1 = _mm256_load_pd( &A1[0+bs*0] );
2278 a_1 = _mm256_permute2f128_pd( a_0, a_1, 0x21 );
2279 b_0 = _mm256_shuffle_pd( a_0, a_1, 0x5 );
2280 c_0 = _mm256_load_pd( &B[0+bs*0] );
2281 b_0 = _mm256_mul_pd( alpha_0, b_0 );
2282 b_0 = _mm256_add_pd( c_0, b_0 );
2283 _mm256_store_pd( &B[0+bs*0], b_0 );
2284
2285 A0 += 4;
2286 A1 += 4;
2287 B += 4;
2288
2289 }
2290
2291 }
2292
2293
2294
2295// both A and B are aligned to 256-bit boundaries, 2 elements of A must be skipped
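// note: a 2-row offset is exactly one 128-bit lane, so the single
// _mm256_permute2f128_pd( a_0, a_1, 0x21 ) already yields { A0[2], A0[3], A1[0], A1[1] }
// and no additional shuffle is needed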
2296void kernel_dgead_4_2_lib4(int kmax, double alpha, double *A0, int sda, double *B)
2297 {
2298
2299 if(kmax<=0)
2300 return;
2301
2302 const int bs = 4;
2303
2304 double *A1 = A0 + bs*sda;
2305
2306 __m256d
2307 a_0, a_1,
2308 b_0,
2309 alpha_0, c_0;
2310
2311 int k;
2312
2313 alpha_0 = _mm256_broadcast_sd( &alpha );
2314
2315 for(k=0; k<kmax-3; k+=4)
2316 {
2317
2318 a_0 = _mm256_load_pd( &A0[0+bs*0] );
2319 a_1 = _mm256_load_pd( &A1[0+bs*0] );
2320 b_0 = _mm256_permute2f128_pd( a_0, a_1, 0x21 );
2321 c_0 = _mm256_load_pd( &B[0+bs*0] );
2322 b_0 = _mm256_mul_pd( alpha_0, b_0 );
2323 b_0 = _mm256_add_pd( c_0, b_0 );
2324 _mm256_store_pd( &B[0+bs*0], b_0 );
2325
2326 a_0 = _mm256_load_pd( &A0[0+bs*1] );
2327 a_1 = _mm256_load_pd( &A1[0+bs*1] );
2328 b_0 = _mm256_permute2f128_pd( a_0, a_1, 0x21 );
2329 c_0 = _mm256_load_pd( &B[0+bs*1] );
2330 b_0 = _mm256_mul_pd( alpha_0, b_0 );
2331 b_0 = _mm256_add_pd( c_0, b_0 );
2332 _mm256_store_pd( &B[0+bs*1], b_0 );
2333
2334 a_0 = _mm256_load_pd( &A0[0+bs*2] );
2335 a_1 = _mm256_load_pd( &A1[0+bs*2] );
2336 b_0 = _mm256_permute2f128_pd( a_0, a_1, 0x21 );
2337 c_0 = _mm256_load_pd( &B[0+bs*2] );
2338 b_0 = _mm256_mul_pd( alpha_0, b_0 );
2339 b_0 = _mm256_add_pd( c_0, b_0 );
2340 _mm256_store_pd( &B[0+bs*2], b_0 );
2341
2342 a_0 = _mm256_load_pd( &A0[0+bs*3] );
2343 a_1 = _mm256_load_pd( &A1[0+bs*3] );
2344 b_0 = _mm256_permute2f128_pd( a_0, a_1, 0x21 );
2345 c_0 = _mm256_load_pd( &B[0+bs*3] );
2346 b_0 = _mm256_mul_pd( alpha_0, b_0 );
2347 b_0 = _mm256_add_pd( c_0, b_0 );
2348 _mm256_store_pd( &B[0+bs*3], b_0 );
2349
2350 A0 += 16;
2351 A1 += 16;
2352 B += 16;
2353
2354 }
2355 for(; k<kmax; k++)
2356 {
2357
2358 a_0 = _mm256_load_pd( &A0[0+bs*0] );
2359 a_1 = _mm256_load_pd( &A1[0+bs*0] );
2360 b_0 = _mm256_permute2f128_pd( a_0, a_1, 0x21 );
2361 c_0 = _mm256_load_pd( &B[0+bs*0] );
2362 b_0 = _mm256_mul_pd( alpha_0, b_0 );
2363 b_0 = _mm256_add_pd( c_0, b_0 );
2364 _mm256_store_pd( &B[0+bs*0], b_0 );
2365
2366 A0 += 4;
2367 A1 += 4;
2368 B += 4;
2369
2370 }
2371
2372 }
2373
2374
2375
2376// both A and B are aligned to 256-bit boundaries, 3 elements of A must be skipped
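// note: the 3-row offset combines the lane swap and the odd/even shuffle: the
// _mm256_permute2f128_pd( a_0, a_1, 0x21 ) result (stored back into a_0 below) is
// { A0[2], A0[3], A1[0], A1[1] }, and _mm256_shuffle_pd( a_0, a_1, 0x5 ) then yields
// b_0 = { A0[3], A1[0], A1[1], A1[2] }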
2377void kernel_dgead_4_3_lib4(int kmax, double alpha, double *A0, int sda, double *B)
2378 {
2379
2380 if(kmax<=0)
2381 return;
2382
2383 const int bs = 4;
2384
2385 double *A1 = A0 + bs*sda;
2386
2387 __m256d
2388 a_0, a_1,
2389 b_0,
2390 alpha_0, c_0;
2391
2392 int k;
2393
2394 alpha_0 = _mm256_broadcast_sd( &alpha );
2395
2396 for(k=0; k<kmax-3; k+=4)
2397 {
2398
2399 a_0 = _mm256_load_pd( &A0[0+bs*0] );
2400 a_1 = _mm256_load_pd( &A1[0+bs*0] );
2401 a_0 = _mm256_permute2f128_pd( a_0, a_1, 0x21 );
2402 b_0 = _mm256_shuffle_pd( a_0, a_1, 0x5 );
2403 c_0 = _mm256_load_pd( &B[0+bs*0] );
2404 b_0 = _mm256_mul_pd( alpha_0, b_0 );
2405 b_0 = _mm256_add_pd( c_0, b_0 );
2406 _mm256_store_pd( &B[0+bs*0], b_0 );
2407
2408 a_0 = _mm256_load_pd( &A0[0+bs*1] );
2409 a_1 = _mm256_load_pd( &A1[0+bs*1] );
2410 a_0 = _mm256_permute2f128_pd( a_0, a_1, 0x21 );
2411 b_0 = _mm256_shuffle_pd( a_0, a_1, 0x5 );
2412 c_0 = _mm256_load_pd( &B[0+bs*1] );
2413 b_0 = _mm256_mul_pd( alpha_0, b_0 );
2414 b_0 = _mm256_add_pd( c_0, b_0 );
2415 _mm256_store_pd( &B[0+bs*1], b_0 );
2416
2417 a_0 = _mm256_load_pd( &A0[0+bs*2] );
2418 a_1 = _mm256_load_pd( &A1[0+bs*2] );
2419 a_0 = _mm256_permute2f128_pd( a_0, a_1, 0x21 );
2420 b_0 = _mm256_shuffle_pd( a_0, a_1, 0x5 );
2421 c_0 = _mm256_load_pd( &B[0+bs*2] );
2422 b_0 = _mm256_mul_pd( alpha_0, b_0 );
2423 b_0 = _mm256_add_pd( c_0, b_0 );
2424 _mm256_store_pd( &B[0+bs*2], b_0 );
2425
2426 a_0 = _mm256_load_pd( &A0[0+bs*3] );
2427 a_1 = _mm256_load_pd( &A1[0+bs*3] );
2428 a_0 = _mm256_permute2f128_pd( a_0, a_1, 0x21 );
2429 b_0 = _mm256_shuffle_pd( a_0, a_1, 0x5 );
2430 c_0 = _mm256_load_pd( &B[0+bs*3] );
2431 b_0 = _mm256_mul_pd( alpha_0, b_0 );
2432 b_0 = _mm256_add_pd( c_0, b_0 );
2433 _mm256_store_pd( &B[0+bs*3], b_0 );
2434
2435 A0 += 16;
2436 A1 += 16;
2437 B += 16;
2438
2439 }
2440 for(; k<kmax; k++)
2441 {
2442
2443 a_0 = _mm256_load_pd( &A0[0+bs*0] );
2444 a_1 = _mm256_load_pd( &A1[0+bs*0] );
2445 a_0 = _mm256_permute2f128_pd( a_0, a_1, 0x21 );
2446 b_0 = _mm256_shuffle_pd( a_0, a_1, 0x5 );
2447 c_0 = _mm256_load_pd( &B[0+bs*0] );
2448 b_0 = _mm256_mul_pd( alpha_0, b_0 );
2449 b_0 = _mm256_add_pd( c_0, b_0 );
2450 _mm256_store_pd( &B[0+bs*0], b_0 );
2451
2452 A0 += 4;
2453 A1 += 4;
2454 B += 4;
2455
2456 }
2457
2458 }
2459
2460
2461
2462// both A and B are aligned to 64-bit boundaries
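// note: with 3 active rows each column is handled with 128-bit SSE2 instructions only:
// a 2-wide unaligned load/store for rows 0-1 and a scalar load/store for row 2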
2463void kernel_dgead_3_0_lib4(int kmax, double alpha, double *A, double *B)
2464 {
2465
2466 if(kmax<=0)
2467 return;
2468
2469 const int bs = 4;
2470
2471 __m128d
2472 a_0, a_1,
2473 alpha_0, c_0, c_1;
2474
2475 int k;
2476
2477 alpha_0 = _mm_loaddup_pd( &alpha );
2478
2479 for(k=0; k<kmax-3; k+=4)
2480 {
2481
2482 a_0 = _mm_loadu_pd( &A[0+bs*0] );
2483 a_1 = _mm_load_sd( &A[2+bs*0] );
2484 c_0 = _mm_loadu_pd( &B[0+bs*0] );
2485 c_1 = _mm_load_sd( &B[2+bs*0] );
2486 a_0 = _mm_mul_pd( alpha_0, a_0 );
2487 a_1 = _mm_mul_sd( alpha_0, a_1 );
2488 a_0 = _mm_add_pd( c_0, a_0 );
2489 a_1 = _mm_add_sd( c_1, a_1 );
2490 _mm_storeu_pd( &B[0+bs*0], a_0 );
2491 _mm_store_sd( &B[2+bs*0], a_1 );
2492
2493 a_0 = _mm_loadu_pd( &A[0+bs*1] );
2494 a_1 = _mm_load_sd( &A[2+bs*1] );
2495 c_0 = _mm_loadu_pd( &B[0+bs*1] );
2496 c_1 = _mm_load_sd( &B[2+bs*1] );
2497 a_0 = _mm_mul_pd( alpha_0, a_0 );
2498 a_1 = _mm_mul_sd( alpha_0, a_1 );
2499 a_0 = _mm_add_pd( c_0, a_0 );
2500 a_1 = _mm_add_sd( c_1, a_1 );
2501 _mm_storeu_pd( &B[0+bs*1], a_0 );
2502 _mm_store_sd( &B[2+bs*1], a_1 );
2503
2504 a_0 = _mm_loadu_pd( &A[0+bs*2] );
2505 a_1 = _mm_load_sd( &A[2+bs*2] );
2506 c_0 = _mm_loadu_pd( &B[0+bs*2] );
2507 c_1 = _mm_load_sd( &B[2+bs*2] );
2508 a_0 = _mm_mul_pd( alpha_0, a_0 );
2509 a_1 = _mm_mul_sd( alpha_0, a_1 );
2510 a_0 = _mm_add_pd( c_0, a_0 );
2511 a_1 = _mm_add_sd( c_1, a_1 );
2512 _mm_storeu_pd( &B[0+bs*2], a_0 );
2513 _mm_store_sd( &B[2+bs*2], a_1 );
2514
2515 a_0 = _mm_loadu_pd( &A[0+bs*3] );
2516 a_1 = _mm_load_sd( &A[2+bs*3] );
2517 c_0 = _mm_loadu_pd( &B[0+bs*3] );
2518 c_1 = _mm_load_sd( &B[2+bs*3] );
2519 a_0 = _mm_mul_pd( alpha_0, a_0 );
2520 a_1 = _mm_mul_sd( alpha_0, a_1 );
2521 a_0 = _mm_add_pd( c_0, a_0 );
2522 a_1 = _mm_add_sd( c_1, a_1 );
2523 _mm_storeu_pd( &B[0+bs*3], a_0 );
2524 _mm_store_sd( &B[2+bs*3], a_1 );
2525
2526 A += 16;
2527 B += 16;
2528
2529 }
2530 for(; k<kmax; k++)
2531 {
2532
2533 a_0 = _mm_loadu_pd( &A[0+bs*0] );
2534 a_1 = _mm_load_sd( &A[2+bs*0] );
2535 c_0 = _mm_loadu_pd( &B[0+bs*0] );
2536 c_1 = _mm_load_sd( &B[2+bs*0] );
2537 a_0 = _mm_mul_pd( alpha_0, a_0 );
2538 a_1 = _mm_mul_sd( alpha_0, a_1 );
2539 a_0 = _mm_add_pd( c_0, a_0 );
2540 a_1 = _mm_add_sd( c_1, a_1 );
2541 _mm_storeu_pd( &B[0+bs*0], a_0 );
2542 _mm_store_sd( &B[2+bs*0], a_1 );
2543
2544 A += 4;
2545 B += 4;
2546
2547 }
2548
2549 }
2550
2551
2552
2553// both A and B are aligned to 256-bit boundaries, 2 elements of A must be skipped
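// note: the 3 source rows start 2 rows down in panel A0, so they are gathered as
// A0[2..3] (2-wide load) plus A1[0] (scalar load) and written to rows 0-2 of B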
2554void kernel_dgead_3_2_lib4(int kmax, double alpha, double *A0, int sda, double *B)
2555 {
2556
2557 if(kmax<=0)
2558 return;
2559
2560 const int bs = 4;
2561
2562 double *A1 = A0 + bs*sda;
2563
2564 __m128d
2565 a_0, a_1,
2566 alpha_0, c_0, c_1;
2567
2568 int k;
2569
2570 alpha_0 = _mm_loaddup_pd( &alpha );
2571
2572 for(k=0; k<kmax-3; k+=4)
2573 {
2574
2575 a_0 = _mm_loadu_pd( &A0[2+bs*0] );
2576 a_1 = _mm_load_sd( &A1[0+bs*0] );
2577 c_0 = _mm_loadu_pd( &B[0+bs*0] );
2578 c_1 = _mm_load_sd( &B[2+bs*0] );
2579 a_0 = _mm_mul_pd( alpha_0, a_0 );
2580 a_1 = _mm_mul_sd( alpha_0, a_1 );
2581 a_0 = _mm_add_pd( c_0, a_0 );
2582 a_1 = _mm_add_sd( c_1, a_1 );
2583 _mm_storeu_pd( &B[0+bs*0], a_0 );
2584 _mm_store_sd( &B[2+bs*0], a_1 );
2585
2586 a_0 = _mm_loadu_pd( &A0[2+bs*1] );
2587 a_1 = _mm_load_sd( &A1[0+bs*1] );
2588 c_0 = _mm_loadu_pd( &B[0+bs*1] );
2589 c_1 = _mm_load_sd( &B[2+bs*1] );
2590 a_0 = _mm_mul_pd( alpha_0, a_0 );
2591 a_1 = _mm_mul_sd( alpha_0, a_1 );
2592 a_0 = _mm_add_pd( c_0, a_0 );
2593 a_1 = _mm_add_sd( c_1, a_1 );
2594 _mm_storeu_pd( &B[0+bs*1], a_0 );
2595 _mm_store_sd( &B[2+bs*1], a_1 );
2596
2597 a_0 = _mm_loadu_pd( &A0[2+bs*2] );
2598 a_1 = _mm_load_sd( &A1[0+bs*2] );
2599 c_0 = _mm_loadu_pd( &B[0+bs*2] );
2600 c_1 = _mm_load_sd( &B[2+bs*2] );
2601 a_0 = _mm_mul_pd( alpha_0, a_0 );
2602 a_1 = _mm_mul_sd( alpha_0, a_1 );
2603 a_0 = _mm_add_pd( c_0, a_0 );
2604 a_1 = _mm_add_sd( c_1, a_1 );
2605 _mm_storeu_pd( &B[0+bs*2], a_0 );
2606 _mm_store_sd( &B[2+bs*2], a_1 );
2607
2608 a_0 = _mm_loadu_pd( &A0[2+bs*3] );
2609 a_1 = _mm_load_sd( &A1[0+bs*3] );
2610 c_0 = _mm_loadu_pd( &B[0+bs*3] );
2611 c_1 = _mm_load_sd( &B[2+bs*3] );
2612 a_0 = _mm_mul_pd( alpha_0, a_0 );
2613 a_1 = _mm_mul_sd( alpha_0, a_1 );
2614 a_0 = _mm_add_pd( c_0, a_0 );
2615 a_1 = _mm_add_sd( c_1, a_1 );
2616 _mm_storeu_pd( &B[0+bs*3], a_0 );
2617 _mm_store_sd( &B[2+bs*3], a_1 );
2618
2619 A0 += 16;
2620 A1 += 16;
2621 B += 16;
2622
2623 }
2624 for(; k<kmax; k++)
2625 {
2626
2627 a_0 = _mm_loadu_pd( &A0[2+bs*0] );
2628 a_1 = _mm_load_sd( &A1[0+bs*0] );
2629 c_0 = _mm_loadu_pd( &B[0+bs*0] );
2630 c_1 = _mm_load_sd( &B[2+bs*0] );
2631 a_0 = _mm_mul_pd( alpha_0, a_0 );
2632 a_1 = _mm_mul_sd( alpha_0, a_1 );
2633 a_0 = _mm_add_pd( c_0, a_0 );
2634 a_1 = _mm_add_sd( c_1, a_1 );
2635 _mm_storeu_pd( &B[0+bs*0], a_0 );
2636 _mm_store_sd( &B[2+bs*0], a_1 );
2637
2638 A0 += 4;
2639 A1 += 4;
2640 B += 4;
2641
2642 }
2643
2644 }
2645
2646
2647
2648// both A and B are aligned to 256-bit boundaries, 3 elements of A must be skipped
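// note: the 3 source rows start 3 rows down in panel A0, so they are gathered as
// A0[3] (scalar load) plus A1[0..1] (2-wide load) and written to rows 0-2 of B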
2649void kernel_dgead_3_3_lib4(int kmax, double alpha, double *A0, int sda, double *B)
2650 {
2651
2652 if(kmax<=0)
2653 return;
2654
2655 const int bs = 4;
2656
2657 double *A1 = A0 + bs*sda;
2658
2659 __m128d
2660 a_0, a_1,
2661 alpha_0, c_0, c_1;
2662
2663 int k;
2664
2665 alpha_0 = _mm_loaddup_pd( &alpha );
2666
2667 for(k=0; k<kmax-3; k+=4)
2668 {
2669
2670 a_0 = _mm_load_sd( &A0[3+bs*0] );
2671 a_1 = _mm_loadu_pd( &A1[0+bs*0] );
2672 c_0 = _mm_load_sd( &B[0+bs*0] );
2673 c_1 = _mm_loadu_pd( &B[1+bs*0] );
2674 a_0 = _mm_mul_sd( alpha_0, a_0 );
2675 a_1 = _mm_mul_pd( alpha_0, a_1 );
2676 a_0 = _mm_add_sd( c_0, a_0 );
2677 a_1 = _mm_add_pd( c_1, a_1 );
2678 _mm_store_sd( &B[0+bs*0], a_0 );
2679 _mm_storeu_pd( &B[1+bs*0], a_1 );
2680
2681 a_0 = _mm_load_sd( &A0[3+bs*1] );
2682 a_1 = _mm_loadu_pd( &A1[0+bs*1] );
2683 c_0 = _mm_load_sd( &B[0+bs*1] );
2684 c_1 = _mm_loadu_pd( &B[1+bs*1] );
2685 a_0 = _mm_mul_sd( alpha_0, a_0 );
2686 a_1 = _mm_mul_pd( alpha_0, a_1 );
2687 a_0 = _mm_add_sd( c_0, a_0 );
2688 a_1 = _mm_add_pd( c_1, a_1 );
2689 _mm_store_sd( &B[0+bs*1], a_0 );
2690 _mm_storeu_pd( &B[1+bs*1], a_1 );
2691
2692 a_0 = _mm_load_sd( &A0[3+bs*2] );
2693 a_1 = _mm_loadu_pd( &A1[0+bs*2] );
2694 c_0 = _mm_load_sd( &B[0+bs*2] );
2695 c_1 = _mm_loadu_pd( &B[1+bs*2] );
2696 a_0 = _mm_mul_sd( alpha_0, a_0 );
2697 a_1 = _mm_mul_pd( alpha_0, a_1 );
2698 a_0 = _mm_add_sd( c_0, a_0 );
2699 a_1 = _mm_add_pd( c_1, a_1 );
2700 _mm_store_sd( &B[0+bs*2], a_0 );
2701 _mm_storeu_pd( &B[1+bs*2], a_1 );
2702
2703 a_0 = _mm_load_sd( &A0[3+bs*3] );
2704 a_1 = _mm_loadu_pd( &A1[0+bs*3] );
2705 c_0 = _mm_load_sd( &B[0+bs*3] );
2706 c_1 = _mm_loadu_pd( &B[1+bs*3] );
2707 a_0 = _mm_mul_sd( alpha_0, a_0 );
2708 a_1 = _mm_mul_pd( alpha_0, a_1 );
2709 a_0 = _mm_add_sd( c_0, a_0 );
2710 a_1 = _mm_add_pd( c_1, a_1 );
2711 _mm_store_sd( &B[0+bs*3], a_0 );
2712 _mm_storeu_pd( &B[1+bs*3], a_1 );
2713
2714 A0 += 16;
2715 A1 += 16;
2716 B += 16;
2717
2718 }
2719 for(; k<kmax; k++)
2720 {
2721
2722 a_0 = _mm_load_sd( &A0[3+bs*0] );
2723 a_1 = _mm_loadu_pd( &A1[0+bs*0] );
2724 c_0 = _mm_load_sd( &B[0+bs*0] );
2725 c_1 = _mm_loadu_pd( &B[1+bs*0] );
2726 a_0 = _mm_mul_sd( alpha_0, a_0 );
2727 a_1 = _mm_mul_pd( alpha_0, a_1 );
2728 a_0 = _mm_add_sd( c_0, a_0 );
2729 a_1 = _mm_add_pd( c_1, a_1 );
2730 _mm_store_sd( &B[0+bs*0], a_0 );
2731 _mm_storeu_pd( &B[1+bs*0], a_1 );
2732
2733 A0 += 4;
2734 A1 += 4;
2735 B += 4;
2736
2737 }
2738
2739 }
2740
2741
2742
2743// both A and B are aligned to 64-bit boundaries
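// note: the 2-row kernel updates one 128-bit pair per column, B[0..1] += alpha * A[0..1],
// with the same 4-column unrolling and scalar cleanup loop as the kernels above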
2744void kernel_dgead_2_0_lib4(int kmax, double alpha, double *A, double *B)
2745 {
2746
2747 if(kmax<=0)
2748 return;
2749
2750 const int bs = 4;
2751
2752 __m128d
2753 a_0, c_0, alpha_0;
2754
2755 int k;
2756
2757 alpha_0 = _mm_loaddup_pd( &alpha );
2758
2759 for(k=0; k<kmax-3; k+=4)
2760 {
2761
2762 a_0 = _mm_loadu_pd( &A[0+bs*0] );
2763 c_0 = _mm_loadu_pd( &B[0+bs*0] );
2764 a_0 = _mm_mul_pd( alpha_0, a_0 );
2765 a_0 = _mm_add_pd( c_0, a_0 );
2766 _mm_storeu_pd( &B[0+bs*0], a_0 );
2767
2768 a_0 = _mm_loadu_pd( &A[0+bs*1] );
2769 c_0 = _mm_loadu_pd( &B[0+bs*1] );
2770 a_0 = _mm_mul_pd( alpha_0, a_0 );
2771 a_0 = _mm_add_pd( c_0, a_0 );
2772 _mm_storeu_pd( &B[0+bs*1], a_0 );
2773
2774 a_0 = _mm_loadu_pd( &A[0+bs*2] );
2775 c_0 = _mm_loadu_pd( &B[0+bs*2] );
2776 a_0 = _mm_mul_pd( alpha_0, a_0 );
2777 a_0 = _mm_add_pd( c_0, a_0 );
2778 _mm_storeu_pd( &B[0+bs*2], a_0 );
2779
2780 a_0 = _mm_loadu_pd( &A[0+bs*3] );
2781 c_0 = _mm_loadu_pd( &B[0+bs*3] );
2782 a_0 = _mm_mul_pd( alpha_0, a_0 );
2783 a_0 = _mm_add_pd( c_0, a_0 );
2784 _mm_storeu_pd( &B[0+bs*3], a_0 );
2785
2786 A += 16;
2787 B += 16;
2788
2789 }
2790 for(; k<kmax; k++)
2791 {
2792
2793 a_0 = _mm_loadu_pd( &A[0+bs*0] );
2794 c_0 = _mm_loadu_pd( &B[0+bs*0] );
2795 a_0 = _mm_mul_pd( alpha_0, a_0 );
2796 a_0 = _mm_add_pd( c_0, a_0 );
2797 _mm_storeu_pd( &B[0+bs*0], a_0 );
2798
2799 A += 4;
2800 B += 4;
2801
2802 }
2803
2804 }
2805
2806
2807
2808// both A and B are aligned to 128-bit boundaries, 3 elements of A must be skipped
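// note: the source pair straddles the panel boundary, so it is assembled with
// _mm_load_sd( &A0[3] ) in the low half and _mm_loadh_pd( ..., &A1[0] ) in the high
// half before the scaled add into B[0..1]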
2809void kernel_dgead_2_3_lib4(int kmax, double alpha, double *A0, int sda, double *B)
2810 {
2811
2812 if(kmax<=0)
2813 return;
2814
2815 const int bs = 4;
2816
2817 double *A1 = A0 + bs*sda;
2818
2819 __m128d
2820 a_0, c_0, alpha_0;
2821
2822 int k;
2823
2824 alpha_0 = _mm_loaddup_pd( &alpha );
2825
2826 for(k=0; k<kmax-3; k+=4)
2827 {
2828
2829 a_0 = _mm_load_sd( &A0[3+bs*0] );
2830 a_0 = _mm_loadh_pd( a_0, &A1[0+bs*0] );
2831 c_0 = _mm_loadu_pd( &B[0+bs*0] );
2832 a_0 = _mm_mul_pd( alpha_0, a_0 );
2833 a_0 = _mm_add_pd( c_0, a_0 );
2834 _mm_storeu_pd( &B[0+bs*0], a_0 );
2835
2836 a_0 = _mm_load_sd( &A0[3+bs*1] );
2837 a_0 = _mm_loadh_pd( a_0, &A1[0+bs*1] );
2838 c_0 = _mm_loadu_pd( &B[0+bs*1] );
2839 a_0 = _mm_mul_pd( alpha_0, a_0 );
2840 a_0 = _mm_add_pd( c_0, a_0 );
2841 _mm_storeu_pd( &B[0+bs*1], a_0 );
2842
2843 a_0 = _mm_load_sd( &A0[3+bs*2] );
2844 a_0 = _mm_loadh_pd( a_0, &A1[0+bs*2] );
2845 c_0 = _mm_loadu_pd( &B[0+bs*2] );
2846 a_0 = _mm_mul_pd( alpha_0, a_0 );
2847 a_0 = _mm_add_pd( c_0, a_0 );
2848 _mm_storeu_pd( &B[0+bs*2], a_0 );
2849
2850 a_0 = _mm_load_sd( &A0[3+bs*3] );
2851 a_0 = _mm_loadh_pd( a_0, &A1[0+bs*3] );
2852 c_0 = _mm_loadu_pd( &B[0+bs*3] );
2853 a_0 = _mm_mul_pd( alpha_0, a_0 );
2854 a_0 = _mm_add_pd( c_0, a_0 );
2855 _mm_storeu_pd( &B[0+bs*3], a_0 );
2856
2857 A0 += 16;
2858 A1 += 16;
2859 B += 16;
2860
2861 }
2862 for(; k<kmax; k++)
2863 {
2864
2865 a_0 = _mm_load_sd( &A0[3+bs*0] );
2866 a_0 = _mm_loadh_pd( a_0, &A1[0+bs*0] );
2867 c_0 = _mm_loadu_pd( &B[0+bs*0] );
2868 a_0 = _mm_mul_pd( alpha_0, a_0 );
2869 a_0 = _mm_add_pd( c_0, a_0 );
2870 _mm_storeu_pd( &B[0+bs*0], a_0 );
2871
2872 A0 += 4;
2873 A1 += 4;
2874 B += 4;
2875
2876 }
2877
2878 }
2879
2880
2881
2882// both A and B are aligned to 64-bit boundaries
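// note: the 1-row kernel is fully scalar: _mm_load_sd / _mm_mul_sd / _mm_add_sd /
// _mm_store_sd handle a single double per column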
2883void kernel_dgead_1_0_lib4(int kmax, double alpha, double *A, double *B)
2884 {
2885
2886 if(kmax<=0)
2887 return;
2888
2889 const int bs = 4;
2890
2891 __m128d
2892 a_0, c_0, alpha_0;
2893
2894 int k;
2895
2896 alpha_0 = _mm_load_sd( &alpha );
2897
2898 for(k=0; k<kmax-3; k+=4)
2899 {
2900
2901 a_0 = _mm_load_sd( &A[0+bs*0] );
2902 c_0 = _mm_load_sd( &B[0+bs*0] );
2903 a_0 = _mm_mul_sd( alpha_0, a_0 );
2904 a_0 = _mm_add_sd( c_0, a_0 );
2905 _mm_store_sd( &B[0+bs*0], a_0 );
2906
2907 a_0 = _mm_load_sd( &A[0+bs*1] );
2908 c_0 = _mm_load_sd( &B[0+bs*1] );
2909 a_0 = _mm_mul_sd( alpha_0, a_0 );
2910 a_0 = _mm_add_sd( c_0, a_0 );
2911 _mm_store_sd( &B[0+bs*1], a_0 );
2912
2913 a_0 = _mm_load_sd( &A[0+bs*2] );
2914 c_0 = _mm_load_sd( &B[0+bs*2] );
2915 a_0 = _mm_mul_sd( alpha_0, a_0 );
2916 a_0 = _mm_add_sd( c_0, a_0 );
2917 _mm_store_sd( &B[0+bs*2], a_0 );
2918
2919 a_0 = _mm_load_sd( &A[0+bs*3] );
2920 c_0 = _mm_load_sd( &B[0+bs*3] );
2921 a_0 = _mm_mul_sd( alpha_0, a_0 );
2922 a_0 = _mm_add_sd( c_0, a_0 );
2923 _mm_store_sd( &B[0+bs*3], a_0 );
2924
2925 A += 16;
2926 B += 16;
2927
2928 }
2929 for(; k<kmax; k++)
2930 {
2931
2932 a_0 = _mm_load_sd( &A[0+bs*0] );
2933 c_0 = _mm_load_sd( &B[0+bs*0] );
2934 a_0 = _mm_mul_sd( alpha_0, a_0 );
2935 a_0 = _mm_add_sd( c_0, a_0 );
2936 _mm_store_sd( &B[0+bs*0], a_0 );
2937
2938 A += 4;
2939 B += 4;
2940
2941 }
2942
2943 }
2944
2945
2946
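// set all elements of kmax 4-wide columns of A to the scalar alpha (A aligned to 256-bit boundaries)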
2947void kernel_dgeset_4_lib4(int kmax, double alpha, double *A)
2948 {
2949
2950 int k;
2951
2952 __m256d
2953 a0;
2954
2955 a0 = _mm256_broadcast_sd( &alpha );
2956
2957 for(k=0; k<kmax-3; k+=4)
2958 {
2959
2960 _mm256_store_pd( &A[0], a0 );
2961 _mm256_store_pd( &A[4], a0 );
2962 _mm256_store_pd( &A[8], a0 );
2963 _mm256_store_pd( &A[12], a0 );
2964
2965 A += 16;
2966
2967 }
2968 for(; k<kmax; k++)
2969 {
2970
2971 _mm256_store_pd( &A[0], a0 );
2972
2973 A += 4;
2974
2975 }
2976
2977 }
2978
2979
2980// A is lower triangular
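// the main loop overwrites kmax full 4-wide columns with alpha; the trailing lower
// 4x4 triangle is then written column by column (elements 0-3, 5-7, 10-11 and 15)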
2981void kernel_dtrset_4_lib4(int kmax, double alpha, double *A)
2982 {
2983
2984 int k;
2985
2986 __m256d
2987 a0;
2988
2989 a0 = _mm256_broadcast_sd( &alpha );
2990
2991 for(k=0; k<kmax-3; k+=4)
2992 {
2993
2994 _mm256_store_pd( &A[0], a0 );
2995 _mm256_store_pd( &A[4], a0 );
2996 _mm256_store_pd( &A[8], a0 );
2997 _mm256_store_pd( &A[12], a0 );
2998
2999 A += 16;
3000
3001 }
3002 for(; k<kmax; k++)
3003 {
3004
3005 _mm256_store_pd( &A[0], a0 );
3006
3007 A += 4;
3008
3009 }
3010
3011 // final 4x4 triangle
3012 _mm256_store_pd( &A[0], a0 );
3013
3014 _mm_store_sd( &A[5], _mm256_castpd256_pd128( a0 ) );
3015 _mm_store_pd( &A[6], _mm256_castpd256_pd128( a0 ) );
3016
3017 _mm_store_pd( &A[10], _mm256_castpd256_pd128( a0 ) );
3018
3019 _mm_store_sd( &A[15], _mm256_castpd256_pd128( a0 ) );
3020
3021 }
3022
3023
3024