blob: 29d095b9a64ae84679de4fb3987d88c7f43dd483 [file] [log] [blame]
Austin Schuh9a24b372018-01-28 16:12:29 -08001/**************************************************************************************************
2* *
3* This file is part of BLASFEO. *
4* *
5* BLASFEO -- BLAS For Embedded Optimization. *
6* Copyright (C) 2016-2017 by Gianluca Frison. *
7* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl. *
8* All rights reserved. *
9* *
10* HPMPC is free software; you can redistribute it and/or *
11* modify it under the terms of the GNU Lesser General Public *
12* License as published by the Free Software Foundation; either *
13* version 2.1 of the License, or (at your option) any later version. *
14* *
15* HPMPC is distributed in the hope that it will be useful, *
16* but WITHOUT ANY WARRANTY; without even the implied warranty of *
17* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. *
18* See the GNU Lesser General Public License for more details. *
19* *
20* You should have received a copy of the GNU Lesser General Public *
21* License along with HPMPC; if not, write to the Free Software *
22* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA *
23* *
24* Author: Gianluca Frison, giaf (at) dtu.dk *
25* gianluca.frison (at) imtek.uni-freiburg.de *
26* *
27**************************************************************************************************/
28
29#include <mmintrin.h>
30#include <xmmintrin.h> // SSE
31#include <emmintrin.h> // SSE2
32#include <pmmintrin.h> // SSE3
33#include <smmintrin.h> // SSE4
34#include <immintrin.h> // AVX
35
36
37
// Transposed copy of a 4-wide strip of a general matrix: read along panels
// of A, write across panels of C, scaling by alpha (C[j,i] = alpha * A[i,j]).
//
// tri  : if 1, A is lower triangular and C upper triangular; one extra
//        4-wide column is processed and a trailing 3x3 triangle is written
//        at the end.
// kmax : number of columns of A (rows of C) to transpose.
// kna  : number of leading scalar iterations needed so that subsequent
//        writes to C start at a panel boundary (0..3).
// A    : source, packed with panel height bs=4; NOTE(review): assumed
//        32-byte aligned for the _mm256_load_pd loads below — confirm at
//        call sites.
// C    : destination, panel-packed with panel stride sdc (in units of bs).
void kernel_dgetr_4_lib4(int tri, int kmax, int kna, double alpha, double *A, double *C, int sdc)
	{

	if(tri==1)
		{
		// A is lower triangular, C is upper triangular
		// kmax+1 4-wide + end 3x3 triangle

		kmax += 1;
		}

	const int bs = 4; // panel height of the packed storage

	__m256d
		alph,
		v0, v1, v2, v3,
		v4, v5, v6, v7;

	alph = _mm256_broadcast_sd( &alpha ); // alpha replicated in all 4 lanes

	int k;

	k = 0;

	if(kmax<kna)
		goto cleanup_loop;

	// scalar peel: advance until writes to C reach a panel boundary
	if(kna>0)
		{
		for( ; k<kna; k++)
			{
			C[0+bs*0] = alpha * A[0+bs*0];
			C[0+bs*1] = alpha * A[1+bs*0];
			C[0+bs*2] = alpha * A[2+bs*0];
			C[0+bs*3] = alpha * A[3+bs*0];

			C += 1;
			A += bs;
			}
		C += bs*(sdc-1); // jump to the start of the next panel of C
		}

	// main loop, unrolled x2: each half transposes one 4x4 tile using the
	// unpack / permute2f128 shuffle network, scales by alpha, and stores it
	// into one panel row of C
	for( ; k<kmax-7; k+=8)
		{

		v0 = _mm256_load_pd( &A[0+bs*0] ); // 00 10 20 30
		v1 = _mm256_load_pd( &A[0+bs*1] ); // 01 11 21 31
		v4 = _mm256_unpacklo_pd( v0, v1 ); // 00 01 20 21
		v5 = _mm256_unpackhi_pd( v0, v1 ); // 10 11 30 31
		v2 = _mm256_load_pd( &A[0+bs*2] ); // 02 12 22 32
		v3 = _mm256_load_pd( &A[0+bs*3] ); // 03 13 23 33
		v6 = _mm256_unpacklo_pd( v2, v3 ); // 02 03 22 23
		v7 = _mm256_unpackhi_pd( v2, v3 ); // 12 13 32 33

		A += bs*bs;

		v0 = _mm256_permute2f128_pd( v4, v6, 0x20 ); // 00 01 02 03
		v0 = _mm256_mul_pd( v0, alph );
		_mm256_store_pd( &C[0+bs*0], v0 );
		v2 = _mm256_permute2f128_pd( v4, v6, 0x31 ); // 20 21 22 23
		v2 = _mm256_mul_pd( v2, alph );
		_mm256_store_pd( &C[0+bs*2], v2 );
		v1 = _mm256_permute2f128_pd( v5, v7, 0x20 ); // 10 11 12 13
		v1 = _mm256_mul_pd( v1, alph );
		_mm256_store_pd( &C[0+bs*1], v1 );
		v3 = _mm256_permute2f128_pd( v5, v7, 0x31 ); // 30 31 32 33
		v3 = _mm256_mul_pd( v3, alph );
		_mm256_store_pd( &C[0+bs*3], v3 );

		C += bs*sdc;

		// second tile of the unrolled iteration
		v0 = _mm256_load_pd( &A[0+bs*0] ); // 00 10 20 30
		v1 = _mm256_load_pd( &A[0+bs*1] ); // 01 11 21 31
		v4 = _mm256_unpacklo_pd( v0, v1 ); // 00 01 20 21
		v5 = _mm256_unpackhi_pd( v0, v1 ); // 10 11 30 31
		v2 = _mm256_load_pd( &A[0+bs*2] ); // 02 12 22 32
		v3 = _mm256_load_pd( &A[0+bs*3] ); // 03 13 23 33
		v6 = _mm256_unpacklo_pd( v2, v3 ); // 02 03 22 23
		v7 = _mm256_unpackhi_pd( v2, v3 ); // 12 13 32 33

		A += bs*bs;

		v0 = _mm256_permute2f128_pd( v4, v6, 0x20 ); // 00 01 02 03
		v0 = _mm256_mul_pd( v0, alph );
		_mm256_store_pd( &C[0+bs*0], v0 );
		v2 = _mm256_permute2f128_pd( v4, v6, 0x31 ); // 20 21 22 23
		v2 = _mm256_mul_pd( v2, alph );
		_mm256_store_pd( &C[0+bs*2], v2 );
		v1 = _mm256_permute2f128_pd( v5, v7, 0x20 ); // 10 11 12 13
		v1 = _mm256_mul_pd( v1, alph );
		_mm256_store_pd( &C[0+bs*1], v1 );
		v3 = _mm256_permute2f128_pd( v5, v7, 0x31 ); // 30 31 32 33
		v3 = _mm256_mul_pd( v3, alph );
		_mm256_store_pd( &C[0+bs*3], v3 );

		C += bs*sdc;

		}

	// vector tail: one 4x4 tile per iteration
	for( ; k<kmax-3; k+=4)
		{

		v0 = _mm256_load_pd( &A[0+bs*0] ); // 00 10 20 30
		v1 = _mm256_load_pd( &A[0+bs*1] ); // 01 11 21 31
		v4 = _mm256_unpacklo_pd( v0, v1 ); // 00 01 20 21
		v5 = _mm256_unpackhi_pd( v0, v1 ); // 10 11 30 31
		v2 = _mm256_load_pd( &A[0+bs*2] ); // 02 12 22 32
		v3 = _mm256_load_pd( &A[0+bs*3] ); // 03 13 23 33
		v6 = _mm256_unpacklo_pd( v2, v3 ); // 02 03 22 23
		v7 = _mm256_unpackhi_pd( v2, v3 ); // 12 13 32 33

		A += bs*bs;

		v0 = _mm256_permute2f128_pd( v4, v6, 0x20 ); // 00 01 02 03
		v0 = _mm256_mul_pd( v0, alph );
		_mm256_store_pd( &C[0+bs*0], v0 );
		v2 = _mm256_permute2f128_pd( v4, v6, 0x31 ); // 20 21 22 23
		v2 = _mm256_mul_pd( v2, alph );
		_mm256_store_pd( &C[0+bs*2], v2 );
		v1 = _mm256_permute2f128_pd( v5, v7, 0x20 ); // 10 11 12 13
		v1 = _mm256_mul_pd( v1, alph );
		_mm256_store_pd( &C[0+bs*1], v1 );
		v3 = _mm256_permute2f128_pd( v5, v7, 0x31 ); // 30 31 32 33
		v3 = _mm256_mul_pd( v3, alph );
		_mm256_store_pd( &C[0+bs*3], v3 );

		C += bs*sdc;

		}

	cleanup_loop:

	// scalar tail: leftover columns, one at a time
	for( ; k<kmax; k++)
		{
		C[0+bs*0] = alpha * A[0+bs*0];
		C[0+bs*1] = alpha * A[1+bs*0];
		C[0+bs*2] = alpha * A[2+bs*0];
		C[0+bs*3] = alpha * A[3+bs*0];

		C += 1;
		A += bs;
		}

	if(tri==1)
		{
		// end 3x3 triangle
		// recompute kna = rows of the triangle that still fit before the
		// next panel boundary of C; the rest wrap into the following panel
		// (offsets involving sdc below)
		kna = (bs-(bs-kna+kmax)%bs)%bs;

		if(kna==1)
			{
			// split after one row: rows 2 and 3 land in the next panel
			C[0+bs*1] = alpha * A[1+bs*0];
			C[0+bs*2] = alpha * A[2+bs*0];
			C[0+bs*3] = alpha * A[3+bs*0];
			C[1+bs*(sdc+1)] = alpha * A[2+bs*1];
			C[1+bs*(sdc+2)] = alpha * A[3+bs*1];
			C[2+bs*(sdc+2)] = alpha * A[3+bs*2];
			}
		else if(kna==2)
			{
			// split after two rows: row 3 lands in the next panel
			C[0+bs*1] = alpha * A[1+bs*0];
			C[0+bs*2] = alpha * A[2+bs*0];
			C[0+bs*3] = alpha * A[3+bs*0];
			C[1+bs*2] = alpha * A[2+bs*1];
			C[1+bs*3] = alpha * A[3+bs*1];
			C[2+bs*(sdc+2)] = alpha * A[3+bs*2];
			}
		else
			{
			// whole triangle fits in the current panel
			C[0+bs*1] = alpha * A[1+bs*0];
			C[0+bs*2] = alpha * A[2+bs*0];
			C[0+bs*3] = alpha * A[3+bs*0];
			C[1+bs*2] = alpha * A[2+bs*1];
			C[1+bs*3] = alpha * A[3+bs*1];
			C[2+bs*3] = alpha * A[3+bs*2];
			}
		}

	}
217
218
219
// Transposed copy of a 3-wide strip: read along panels of A, write across
// panels of C, scaling by alpha (C[j,i] = alpha * A[i,j]).
void kernel_dgetr_3_lib4(int tri, int kmax, int kna, double alpha, double *A, double *C, int sdc)
	{

	const int bs = 4; // panel height of the packed storage

	// A lower triangular, C upper triangular: one extra 3-wide column is
	// transposed here, then the trailing 2x2 triangle is written at the end.
	if(tri==1)
		{
		kmax += 1;
		}

	int k = 0;

	if(kmax<kna)
		goto cleanup_loop;

	// peel elements until the write position in C reaches a panel boundary
	if(kna>0)
		{
		while(k<kna)
			{
			C[0+bs*0] = alpha * A[0+bs*0];
			C[0+bs*1] = alpha * A[1+bs*0];
			C[0+bs*2] = alpha * A[2+bs*0];
			A += bs;
			C += 1;
			k += 1;
			}
		C += bs*(sdc-1); // jump to the start of the next panel of C
		}

	// main loop: transpose 4 columns of A into 4 rows of C per iteration
	while(k<kmax-3)
		{
		int ii;
		for(ii=0; ii<4; ii++)
			{
			C[ii+bs*0] = alpha * A[0+bs*ii];
			C[ii+bs*1] = alpha * A[1+bs*ii];
			C[ii+bs*2] = alpha * A[2+bs*ii];
			}
		A += bs*bs;
		C += bs*sdc;
		k += 4;
		}

	cleanup_loop:

	// leftover columns, one at a time
	while(k<kmax)
		{
		C[0+bs*0] = alpha * A[0+bs*0];
		C[0+bs*1] = alpha * A[1+bs*0];
		C[0+bs*2] = alpha * A[2+bs*0];
		A += bs;
		C += 1;
		k += 1;
		}

	// trailing 2x2 triangle of the triangular case
	if(tri==1)
		{
		// rows of the triangle that still fit before the next panel boundary
		kna = (bs-(bs-kna+kmax)%bs)%bs;

		if(kna==1)
			{
			// split after one row: the last element lands in the next panel
			C[0+bs*1] = alpha * A[1+bs*0];
			C[0+bs*2] = alpha * A[2+bs*0];
			C[1+bs*(sdc+1)] = alpha * A[2+bs*1];
			}
		else
			{
			C[0+bs*1] = alpha * A[1+bs*0];
			C[0+bs*2] = alpha * A[2+bs*0];
			C[1+bs*2] = alpha * A[2+bs*1];
			}
		}

	}
309
310
311
// Transposed copy of a 2-wide strip: read along panels of A, write across
// panels of C, scaling by alpha (C[j,i] = alpha * A[i,j]).
void kernel_dgetr_2_lib4(int tri, int kmax, int kna, double alpha, double *A, double *C, int sdc)
	{

	const int bs = 4; // panel height of the packed storage

	// A lower triangular, C upper triangular: one extra 2-wide column is
	// transposed here, then the trailing 1x1 element is written at the end.
	if(tri==1)
		{
		kmax += 1;
		}

	int k = 0;

	if(kmax<kna)
		goto cleanup_loop;

	// peel elements until the write position in C reaches a panel boundary
	if(kna>0)
		{
		while(k<kna)
			{
			C[0+bs*0] = alpha * A[0+bs*0];
			C[0+bs*1] = alpha * A[1+bs*0];
			A += bs;
			C += 1;
			k += 1;
			}
		C += bs*(sdc-1); // jump to the start of the next panel of C
		}

	// main loop: transpose 4 columns of A into 4 rows of C per iteration
	while(k<kmax-3)
		{
		int ii;
		for(ii=0; ii<4; ii++)
			{
			C[ii+bs*0] = alpha * A[0+bs*ii];
			C[ii+bs*1] = alpha * A[1+bs*ii];
			}
		A += bs*bs;
		C += bs*sdc;
		k += 4;
		}

	cleanup_loop:

	// leftover columns, one at a time
	while(k<kmax)
		{
		C[0+bs*0] = alpha * A[0+bs*0];
		C[0+bs*1] = alpha * A[1+bs*0];
		A += bs;
		C += 1;
		k += 1;
		}

	// trailing 1x1 triangle of the triangular case
	if(tri==1)
		{
		C[0+bs*1] = alpha * A[1+bs*0];
		}

	}
382
383
384
// Transposed copy of a 1-wide strip: read along panels of A, write across
// panels of C, scaling by alpha (C[j] = alpha * A[j]).
void kernel_dgetr_1_lib4(int tri, int kmax, int kna, double alpha, double *A, double *C, int sdc)
	{

	const int bs = 4; // panel height of the packed storage

	// A lower triangular, C upper triangular: one extra element is copied;
	// a 1-wide strip has no trailing triangle.
	if(tri==1)
		{
		kmax += 1;
		}

	int k = 0;

	if(kmax<kna)
		goto cleanup_loop;

	// peel elements until the write position in C reaches a panel boundary
	if(kna>0)
		{
		while(k<kna)
			{
			C[0+bs*0] = alpha * A[0+bs*0];
			A += bs;
			C += 1;
			k += 1;
			}
		C += bs*(sdc-1); // jump to the start of the next panel of C
		}

	// main loop: transpose 4 columns of A into 4 rows of C per iteration
	while(k<kmax-3)
		{
		int ii;
		for(ii=0; ii<4; ii++)
			{
			C[ii+bs*0] = alpha * A[0+bs*ii];
			}
		A += bs*bs;
		C += bs*sdc;
		k += 4;
		}

	cleanup_loop:

	// leftover columns, one at a time
	while(k<kmax)
		{
		C[0+bs*0] = alpha * A[0+bs*0];
		A += bs;
		C += 1;
		k += 1;
		}

	}
443
444
445
// Transposed copy of a 4x4-panel strip: read across panels of A (panel
// stride sda), write along one panel of B; no alpha scaling.
// kmax : number of rows of A (columns of B) to transpose.
// NOTE(review): A and B assumed 32-byte aligned for the aligned AVX
// load/store below — confirm at call sites.
void kernel_dgetr_4_0_lib4(int kmax, double *A, int sda, double *B)
	{
	const int ps = 4; // panel height of the packed storage
	__m256d
		v0, v1, v2, v3, v4, v5, v6, v7;
	int k;
	// main loop: transpose one 4x4 tile per iteration using the
	// unpack / permute2f128 shuffle network
	for(k=0; k<kmax-3; k+=4)
		{

		v0 = _mm256_load_pd( &A[0+ps*0] ); // 00 10 20 30
		v1 = _mm256_load_pd( &A[0+ps*1] ); // 01 11 21 31
		v4 = _mm256_unpacklo_pd( v0, v1 ); // 00 01 20 21
		v5 = _mm256_unpackhi_pd( v0, v1 ); // 10 11 30 31
		v2 = _mm256_load_pd( &A[0+ps*2] ); // 02 12 22 32
		v3 = _mm256_load_pd( &A[0+ps*3] ); // 03 13 23 33
		v6 = _mm256_unpacklo_pd( v2, v3 ); // 02 03 22 23
		v7 = _mm256_unpackhi_pd( v2, v3 ); // 12 13 32 33

		v0 = _mm256_permute2f128_pd( v4, v6, 0x20 ); // 00 01 02 03
		_mm256_store_pd( &B[0+ps*0], v0 );
		v2 = _mm256_permute2f128_pd( v4, v6, 0x31 ); // 20 21 22 23
		_mm256_store_pd( &B[0+ps*2], v2 );
		v1 = _mm256_permute2f128_pd( v5, v7, 0x20 ); // 10 11 12 13
		_mm256_store_pd( &B[0+ps*1], v1 );
		v3 = _mm256_permute2f128_pd( v5, v7, 0x31 ); // 30 31 32 33
		_mm256_store_pd( &B[0+ps*3], v3 );

		A += ps*sda;
		B += ps*ps;
		}
	// scalar tail: one column of B per leftover row of A
	for( ; k<kmax; k++)
		{
		//
		B[0+ps*0] = A[0+ps*0];
		B[1+ps*0] = A[0+ps*1];
		B[2+ps*0] = A[0+ps*2];
		B[3+ps*0] = A[0+ps*3];

		A += 1;
		B += ps;
		}
	return;
	}
489 }
490