/***************************************************************************************************
*                                                                                                  *
* This file is part of BLASFEO.                                                                    *
*                                                                                                  *
* BLASFEO -- BLAS For Embedded Optimization.                                                       *
* Copyright (C) 2016-2017 by Gianluca Frison.                                                      *
* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl.               *
* All rights reserved.                                                                             *
*                                                                                                  *
* HPMPC is free software; you can redistribute it and/or                                           *
* modify it under the terms of the GNU Lesser General Public                                       *
* License as published by the Free Software Foundation; either                                     *
* version 2.1 of the License, or (at your option) any later version.                               *
*                                                                                                  *
* HPMPC is distributed in the hope that it will be useful,                                         *
* but WITHOUT ANY WARRANTY; without even the implied warranty of                                   *
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.                                             *
* See the GNU Lesser General Public License for more details.                                      *
*                                                                                                  *
* You should have received a copy of the GNU Lesser General Public                                 *
* License along with HPMPC; if not, write to the Free Software                                     *
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA                     *
*                                                                                                  *
* Author: Gianluca Frison, giaf (at) dtu.dk                                                        *
*         gianluca.frison (at) imtek.uni-freiburg.de                                               *
*                                                                                                  *
**************************************************************************************************/

#include <mmintrin.h>
#include <xmmintrin.h>  // SSE
#include <emmintrin.h>  // SSE2
#include <pmmintrin.h>  // SSE3
#include <smmintrin.h>  // SSE4
#include <immintrin.h>  // AVX




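// transposed of general matrices, read along panels, write across panels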
// TODO tri !!!!!!!!!!!!!!!!!!!!!!!!!!!!!
void kernel_dgetr_8_lib4(int tri, int kmax, int kna, double alpha, double *A0, int sda, double *C, int sdc)
	{

	const int bs = 4;

	double *A1 = A0 + bs*sda;

	int k;

	__m256d
		alph,
		v0, v1, v2, v3, v4, v5, v6, v7,
		v8, v9, va, vb, vc, vd, ve, vf;

	alph = _mm256_broadcast_sd( &alpha );

	k = 0;

	if(kmax<kna)
		goto cleanup_loop;

	if(kna>0)
		{
		for( ; k<kna; k++)
			{
			C[0+bs*0] = alpha * A0[0+bs*0];
			C[0+bs*1] = alpha * A0[1+bs*0];
			C[0+bs*2] = alpha * A0[2+bs*0];
			C[0+bs*3] = alpha * A0[3+bs*0];

			C[0+bs*4] = alpha * A1[0+bs*0];
			C[0+bs*5] = alpha * A1[1+bs*0];
			C[0+bs*6] = alpha * A1[2+bs*0];
			C[0+bs*7] = alpha * A1[3+bs*0];

			C += 1;
			A0 += bs;
			A1 += bs;
			}
		C += bs*(sdc-1);
		}

	for(; k<kmax-7; k+=8)
		{

		v0 = _mm256_insertf128_pd( _mm256_castpd128_pd256( _mm_load_pd( &A0[0+bs*0] ) ), _mm_load_pd( &A0[0+bs*2]) , 0x1 ); // 00 10 02 12
		v1 = _mm256_insertf128_pd( _mm256_castpd128_pd256( _mm_load_pd( &A0[0+bs*1] ) ), _mm_load_pd( &A0[0+bs*3]) , 0x1 ); // 01 11 03 13
		v2 = _mm256_insertf128_pd( _mm256_castpd128_pd256( _mm_load_pd( &A0[2+bs*0] ) ), _mm_load_pd( &A0[2+bs*2]) , 0x1 ); // 20 30 22 32
		v3 = _mm256_insertf128_pd( _mm256_castpd128_pd256( _mm_load_pd( &A0[2+bs*1] ) ), _mm_load_pd( &A0[2+bs*3]) , 0x1 ); // 21 31 23 33

		A0 += 4*bs;

		v4 = _mm256_unpacklo_pd( v0, v1 ); // 00 01 02 03
		v4 = _mm256_mul_pd( v4, alph );
		_mm256_store_pd( &C[0+bs*0], v4 );
		v5 = _mm256_unpackhi_pd( v0, v1 ); // 10 11 12 13
		v5 = _mm256_mul_pd( v5, alph );
		_mm256_store_pd( &C[0+bs*1], v5 );
		v6 = _mm256_unpacklo_pd( v2, v3 ); // 20 21 22 23
		v6 = _mm256_mul_pd( v6, alph );
		_mm256_store_pd( &C[0+bs*2], v6 );
		v7 = _mm256_unpackhi_pd( v2, v3 ); // 30 31 32 33
		v7 = _mm256_mul_pd( v7, alph );
		_mm256_store_pd( &C[0+bs*3], v7 );

		v0 = _mm256_insertf128_pd( _mm256_castpd128_pd256( _mm_load_pd( &A1[0+bs*0] ) ), _mm_load_pd( &A1[0+bs*2]) , 0x1 ); // 00 10 02 12
		v1 = _mm256_insertf128_pd( _mm256_castpd128_pd256( _mm_load_pd( &A1[0+bs*1] ) ), _mm_load_pd( &A1[0+bs*3]) , 0x1 ); // 01 11 03 13
		v2 = _mm256_insertf128_pd( _mm256_castpd128_pd256( _mm_load_pd( &A1[2+bs*0] ) ), _mm_load_pd( &A1[2+bs*2]) , 0x1 ); // 20 30 22 32
		v3 = _mm256_insertf128_pd( _mm256_castpd128_pd256( _mm_load_pd( &A1[2+bs*1] ) ), _mm_load_pd( &A1[2+bs*3]) , 0x1 ); // 21 31 23 33

		A1 += 4*bs;

		v4 = _mm256_unpacklo_pd( v0, v1 ); // 00 01 02 03
		v4 = _mm256_mul_pd( v4, alph );
		_mm256_store_pd( &C[0+bs*4], v4 );
		v5 = _mm256_unpackhi_pd( v0, v1 ); // 10 11 12 13
		v5 = _mm256_mul_pd( v5, alph );
		_mm256_store_pd( &C[0+bs*5], v5 );
		v6 = _mm256_unpacklo_pd( v2, v3 ); // 20 21 22 23
		v6 = _mm256_mul_pd( v6, alph );
		_mm256_store_pd( &C[0+bs*6], v6 );
		v7 = _mm256_unpackhi_pd( v2, v3 ); // 30 31 32 33
		v7 = _mm256_mul_pd( v7, alph );
		_mm256_store_pd( &C[0+bs*7], v7 );

		C += sdc*bs;

		v0 = _mm256_insertf128_pd( _mm256_castpd128_pd256( _mm_load_pd( &A0[0+bs*0] ) ), _mm_load_pd( &A0[0+bs*2]) , 0x1 ); // 00 10 02 12
		v1 = _mm256_insertf128_pd( _mm256_castpd128_pd256( _mm_load_pd( &A0[0+bs*1] ) ), _mm_load_pd( &A0[0+bs*3]) , 0x1 ); // 01 11 03 13
		v2 = _mm256_insertf128_pd( _mm256_castpd128_pd256( _mm_load_pd( &A0[2+bs*0] ) ), _mm_load_pd( &A0[2+bs*2]) , 0x1 ); // 20 30 22 32
		v3 = _mm256_insertf128_pd( _mm256_castpd128_pd256( _mm_load_pd( &A0[2+bs*1] ) ), _mm_load_pd( &A0[2+bs*3]) , 0x1 ); // 21 31 23 33

		A0 += 4*bs;

		v4 = _mm256_unpacklo_pd( v0, v1 ); // 00 01 02 03
		v4 = _mm256_mul_pd( v4, alph );
		_mm256_store_pd( &C[0+bs*0], v4 );
		v5 = _mm256_unpackhi_pd( v0, v1 ); // 10 11 12 13
		v5 = _mm256_mul_pd( v5, alph );
		_mm256_store_pd( &C[0+bs*1], v5 );
		v6 = _mm256_unpacklo_pd( v2, v3 ); // 20 21 22 23
		v6 = _mm256_mul_pd( v6, alph );
		_mm256_store_pd( &C[0+bs*2], v6 );
		v7 = _mm256_unpackhi_pd( v2, v3 ); // 30 31 32 33
		v7 = _mm256_mul_pd( v7, alph );
		_mm256_store_pd( &C[0+bs*3], v7 );

		v0 = _mm256_insertf128_pd( _mm256_castpd128_pd256( _mm_load_pd( &A1[0+bs*0] ) ), _mm_load_pd( &A1[0+bs*2]) , 0x1 ); // 00 10 02 12
		v1 = _mm256_insertf128_pd( _mm256_castpd128_pd256( _mm_load_pd( &A1[0+bs*1] ) ), _mm_load_pd( &A1[0+bs*3]) , 0x1 ); // 01 11 03 13
		v2 = _mm256_insertf128_pd( _mm256_castpd128_pd256( _mm_load_pd( &A1[2+bs*0] ) ), _mm_load_pd( &A1[2+bs*2]) , 0x1 ); // 20 30 22 32
		v3 = _mm256_insertf128_pd( _mm256_castpd128_pd256( _mm_load_pd( &A1[2+bs*1] ) ), _mm_load_pd( &A1[2+bs*3]) , 0x1 ); // 21 31 23 33

		A1 += 4*bs;

		v4 = _mm256_unpacklo_pd( v0, v1 ); // 00 01 02 03
		v4 = _mm256_mul_pd( v4, alph );
		_mm256_store_pd( &C[0+bs*4], v4 );
		v5 = _mm256_unpackhi_pd( v0, v1 ); // 10 11 12 13
		v5 = _mm256_mul_pd( v5, alph );
		_mm256_store_pd( &C[0+bs*5], v5 );
		v6 = _mm256_unpacklo_pd( v2, v3 ); // 20 21 22 23
		v6 = _mm256_mul_pd( v6, alph );
		_mm256_store_pd( &C[0+bs*6], v6 );
		v7 = _mm256_unpackhi_pd( v2, v3 ); // 30 31 32 33
		v7 = _mm256_mul_pd( v7, alph );
		_mm256_store_pd( &C[0+bs*7], v7 );

		C += sdc*bs;

		}

	for(; k<kmax-3; k+=4)
		{

		v0 = _mm256_insertf128_pd( _mm256_castpd128_pd256( _mm_load_pd( &A0[0+bs*0] ) ), _mm_load_pd( &A0[0+bs*2]) , 0x1 ); // 00 10 02 12
		v1 = _mm256_insertf128_pd( _mm256_castpd128_pd256( _mm_load_pd( &A0[0+bs*1] ) ), _mm_load_pd( &A0[0+bs*3]) , 0x1 ); // 01 11 03 13
		v2 = _mm256_insertf128_pd( _mm256_castpd128_pd256( _mm_load_pd( &A0[2+bs*0] ) ), _mm_load_pd( &A0[2+bs*2]) , 0x1 ); // 20 30 22 32
		v3 = _mm256_insertf128_pd( _mm256_castpd128_pd256( _mm_load_pd( &A0[2+bs*1] ) ), _mm_load_pd( &A0[2+bs*3]) , 0x1 ); // 21 31 23 33

		A0 += 4*bs;

		v4 = _mm256_unpacklo_pd( v0, v1 ); // 00 01 02 03
		v4 = _mm256_mul_pd( v4, alph );
		_mm256_store_pd( &C[0+bs*0], v4 );
		v5 = _mm256_unpackhi_pd( v0, v1 ); // 10 11 12 13
		v5 = _mm256_mul_pd( v5, alph );
		_mm256_store_pd( &C[0+bs*1], v5 );
		v6 = _mm256_unpacklo_pd( v2, v3 ); // 20 21 22 23
		v6 = _mm256_mul_pd( v6, alph );
		_mm256_store_pd( &C[0+bs*2], v6 );
		v7 = _mm256_unpackhi_pd( v2, v3 ); // 30 31 32 33
		v7 = _mm256_mul_pd( v7, alph );
		_mm256_store_pd( &C[0+bs*3], v7 );

		v0 = _mm256_insertf128_pd( _mm256_castpd128_pd256( _mm_load_pd( &A1[0+bs*0] ) ), _mm_load_pd( &A1[0+bs*2]) , 0x1 ); // 00 10 02 12
		v1 = _mm256_insertf128_pd( _mm256_castpd128_pd256( _mm_load_pd( &A1[0+bs*1] ) ), _mm_load_pd( &A1[0+bs*3]) , 0x1 ); // 01 11 03 13
		v2 = _mm256_insertf128_pd( _mm256_castpd128_pd256( _mm_load_pd( &A1[2+bs*0] ) ), _mm_load_pd( &A1[2+bs*2]) , 0x1 ); // 20 30 22 32
		v3 = _mm256_insertf128_pd( _mm256_castpd128_pd256( _mm_load_pd( &A1[2+bs*1] ) ), _mm_load_pd( &A1[2+bs*3]) , 0x1 ); // 21 31 23 33

		A1 += 4*bs;

		v4 = _mm256_unpacklo_pd( v0, v1 ); // 00 01 02 03
		v4 = _mm256_mul_pd( v4, alph );
		_mm256_store_pd( &C[0+bs*4], v4 );
		v5 = _mm256_unpackhi_pd( v0, v1 ); // 10 11 12 13
		v5 = _mm256_mul_pd( v5, alph );
		_mm256_store_pd( &C[0+bs*5], v5 );
		v6 = _mm256_unpacklo_pd( v2, v3 ); // 20 21 22 23
		v6 = _mm256_mul_pd( v6, alph );
		_mm256_store_pd( &C[0+bs*6], v6 );
		v7 = _mm256_unpackhi_pd( v2, v3 ); // 30 31 32 33
		v7 = _mm256_mul_pd( v7, alph );
		_mm256_store_pd( &C[0+bs*7], v7 );

		C += sdc*bs;

		}


	cleanup_loop:

	for( ; k<kmax; k++)
		{
		C[0+bs*0] = alpha * A0[0+bs*0];
		C[0+bs*1] = alpha * A0[1+bs*0];
		C[0+bs*2] = alpha * A0[2+bs*0];
		C[0+bs*3] = alpha * A0[3+bs*0];

		C[0+bs*4] = alpha * A1[0+bs*0];
		C[0+bs*5] = alpha * A1[1+bs*0];
		C[0+bs*6] = alpha * A1[2+bs*0];
		C[0+bs*7] = alpha * A1[3+bs*0];

		C += 1;
		A0 += bs;
		A1 += bs;
		}

	}
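
#if 0
// Illustrative scalar sketch (not part of the BLASFEO API): what the vector
// kernel above computes, for the simple case kna==0 (output panel-aligned) and
// tri==0. A0 and A1 are two consecutive 4-row panels of A (the second one sits
// bs*sda doubles after the first), holding an 8-row strip of kmax columns; C
// receives the kmax-by-8 transpose scaled by alpha, stored panel-major with
// panel stride sdc. In the real kernel, kna counts the rows of C left before
// the next panel boundary: those rows are written with scalar stores, after
// which C jumps to the next panel (C += bs*(sdc-1)).
static void ref_dgetr_8_lib4(int kmax, double alpha, double *A0, int sda, double *C, int sdc)
	{
	const int bs = 4;
	double *A1 = A0 + bs*sda;
	int k, j;
	for(k=0; k<kmax; k++)
		{
		for(j=0; j<4; j++)
			{
			C[k%bs + bs*j     + (k/bs)*bs*sdc] = alpha * A0[j + bs*k];
			C[k%bs + bs*(j+4) + (k/bs)*bs*sdc] = alpha * A1[j + bs*k];
			}
		}
	}
#endif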



// transposed of general matrices, read along panels, write across panels
void kernel_dgetr_4_lib4(int tri, int kmax, int kna, double alpha, double *A, double *C, int sdc)
	{

	if(tri==1)
		{
		// A is lower triangular, C is upper triangular
		// kmax+1 4-wide + end 3x3 triangle

		kmax += 1;
		}

	const int bs = 4;

	__m256d
		alph,
		v0, v1, v2, v3,
		v4, v5, v6, v7;

	alph = _mm256_broadcast_sd( &alpha );

	int k;

	k = 0;

	if(kmax<kna)
		goto cleanup_loop;

	if(kna>0)
		{
		for( ; k<kna; k++)
			{
			C[0+bs*0] = alpha * A[0+bs*0];
			C[0+bs*1] = alpha * A[1+bs*0];
			C[0+bs*2] = alpha * A[2+bs*0];
			C[0+bs*3] = alpha * A[3+bs*0];

			C += 1;
			A += bs;
			}
		C += bs*(sdc-1);
		}

	for( ; k<kmax-7; k+=8)
		{

#if 1

		v0 = _mm256_insertf128_pd( _mm256_castpd128_pd256( _mm_load_pd( &A[0+bs*0] ) ), _mm_load_pd( &A[0+bs*2]) , 0x1 ); // 00 10 02 12
		v1 = _mm256_insertf128_pd( _mm256_castpd128_pd256( _mm_load_pd( &A[0+bs*1] ) ), _mm_load_pd( &A[0+bs*3]) , 0x1 ); // 01 11 03 13
		v2 = _mm256_insertf128_pd( _mm256_castpd128_pd256( _mm_load_pd( &A[2+bs*0] ) ), _mm_load_pd( &A[2+bs*2]) , 0x1 ); // 20 30 22 32
		v3 = _mm256_insertf128_pd( _mm256_castpd128_pd256( _mm_load_pd( &A[2+bs*1] ) ), _mm_load_pd( &A[2+bs*3]) , 0x1 ); // 21 31 23 33

		A += 4*bs;

		v4 = _mm256_unpacklo_pd( v0, v1 ); // 00 01 02 03
		v4 = _mm256_mul_pd( v4, alph );
		_mm256_store_pd( &C[0+bs*0], v4 );
		v5 = _mm256_unpackhi_pd( v0, v1 ); // 10 11 12 13
		v5 = _mm256_mul_pd( v5, alph );
		_mm256_store_pd( &C[0+bs*1], v5 );
		v6 = _mm256_unpacklo_pd( v2, v3 ); // 20 21 22 23
		v6 = _mm256_mul_pd( v6, alph );
		_mm256_store_pd( &C[0+bs*2], v6 );
		v7 = _mm256_unpackhi_pd( v2, v3 ); // 30 31 32 33
		v7 = _mm256_mul_pd( v7, alph );
		_mm256_store_pd( &C[0+bs*3], v7 );

		C += sdc*bs;

		v0 = _mm256_insertf128_pd( _mm256_castpd128_pd256( _mm_load_pd( &A[0+bs*0] ) ), _mm_load_pd( &A[0+bs*2]) , 0x1 );
		v1 = _mm256_insertf128_pd( _mm256_castpd128_pd256( _mm_load_pd( &A[0+bs*1] ) ), _mm_load_pd( &A[0+bs*3]) , 0x1 );
		v2 = _mm256_insertf128_pd( _mm256_castpd128_pd256( _mm_load_pd( &A[2+bs*0] ) ), _mm_load_pd( &A[2+bs*2]) , 0x1 );
		v3 = _mm256_insertf128_pd( _mm256_castpd128_pd256( _mm_load_pd( &A[2+bs*1] ) ), _mm_load_pd( &A[2+bs*3]) , 0x1 );

		A += 4*bs;

		v4 = _mm256_unpacklo_pd( v0, v1 ); // 00 01 02 03
		v4 = _mm256_mul_pd( v4, alph );
		_mm256_store_pd( &C[0+bs*0], v4 );
		v5 = _mm256_unpackhi_pd( v0, v1 ); // 10 11 12 13
		v5 = _mm256_mul_pd( v5, alph );
		_mm256_store_pd( &C[0+bs*1], v5 );
		v6 = _mm256_unpacklo_pd( v2, v3 ); // 20 21 22 23
		v6 = _mm256_mul_pd( v6, alph );
		_mm256_store_pd( &C[0+bs*2], v6 );
		v7 = _mm256_unpackhi_pd( v2, v3 ); // 30 31 32 33
		v7 = _mm256_mul_pd( v7, alph );
		_mm256_store_pd( &C[0+bs*3], v7 );

		C += sdc*bs;

#else // TODO alpha

		v0 = _mm256_load_pd( &A[0+bs*0] ); // 00 10 20 30
		v1 = _mm256_load_pd( &A[0+bs*1] ); // 01 11 21 31
		v4 = _mm256_unpacklo_pd( v0, v1 ); // 00 01 20 21
		v5 = _mm256_unpackhi_pd( v0, v1 ); // 10 11 30 31
		v2 = _mm256_load_pd( &A[0+bs*2] ); // 02 12 22 32
		v3 = _mm256_load_pd( &A[0+bs*3] ); // 03 13 23 33
		v6 = _mm256_unpacklo_pd( v2, v3 ); // 02 03 22 23
		v7 = _mm256_unpackhi_pd( v2, v3 ); // 12 13 32 33

		A += bs*bs;

		v0 = _mm256_permute2f128_pd( v4, v6, 0x20 ); // 00 01 02 03
		_mm256_store_pd( &C[0+bs*0], v0 );
		v2 = _mm256_permute2f128_pd( v4, v6, 0x31 ); // 20 21 22 23
		_mm256_store_pd( &C[0+bs*2], v2 );
		v1 = _mm256_permute2f128_pd( v5, v7, 0x20 ); // 10 11 12 13
		_mm256_store_pd( &C[0+bs*1], v1 );
		v3 = _mm256_permute2f128_pd( v5, v7, 0x31 ); // 30 31 32 33
		_mm256_store_pd( &C[0+bs*3], v3 );

		C += bs*sdc;

		v0 = _mm256_load_pd( &A[0+bs*0] ); // 00 10 20 30
		v1 = _mm256_load_pd( &A[0+bs*1] ); // 01 11 21 31
		v4 = _mm256_unpacklo_pd( v0, v1 ); // 00 01 20 21
		v5 = _mm256_unpackhi_pd( v0, v1 ); // 10 11 30 31
		v2 = _mm256_load_pd( &A[0+bs*2] ); // 02 12 22 32
		v3 = _mm256_load_pd( &A[0+bs*3] ); // 03 13 23 33
		v6 = _mm256_unpacklo_pd( v2, v3 ); // 02 03 22 23
		v7 = _mm256_unpackhi_pd( v2, v3 ); // 12 13 32 33

		A += bs*bs;

		v0 = _mm256_permute2f128_pd( v4, v6, 0x20 ); // 00 01 02 03
		_mm256_store_pd( &C[0+bs*0], v0 );
		v2 = _mm256_permute2f128_pd( v4, v6, 0x31 ); // 20 21 22 23
		_mm256_store_pd( &C[0+bs*2], v2 );
		v1 = _mm256_permute2f128_pd( v5, v7, 0x20 ); // 10 11 12 13
		_mm256_store_pd( &C[0+bs*1], v1 );
		v3 = _mm256_permute2f128_pd( v5, v7, 0x31 ); // 30 31 32 33
		_mm256_store_pd( &C[0+bs*3], v3 );

		C += bs*sdc;

#endif

		}

	for( ; k<kmax-3; k+=4)
		{

#if 1

		v0 = _mm256_insertf128_pd( _mm256_castpd128_pd256( _mm_load_pd( &A[0+bs*0] ) ), _mm_load_pd( &A[0+bs*2]) , 0x1 ); // 00 10 02 12
		v1 = _mm256_insertf128_pd( _mm256_castpd128_pd256( _mm_load_pd( &A[0+bs*1] ) ), _mm_load_pd( &A[0+bs*3]) , 0x1 ); // 01 11 03 13
		v2 = _mm256_insertf128_pd( _mm256_castpd128_pd256( _mm_load_pd( &A[2+bs*0] ) ), _mm_load_pd( &A[2+bs*2]) , 0x1 ); // 20 30 22 32
		v3 = _mm256_insertf128_pd( _mm256_castpd128_pd256( _mm_load_pd( &A[2+bs*1] ) ), _mm_load_pd( &A[2+bs*3]) , 0x1 ); // 21 31 23 33

		A += 4*bs;

		v4 = _mm256_unpacklo_pd( v0, v1 ); // 00 01 02 03
		v4 = _mm256_mul_pd( v4, alph );
		_mm256_store_pd( &C[0+bs*0], v4 );
		v5 = _mm256_unpackhi_pd( v0, v1 ); // 10 11 12 13
		v5 = _mm256_mul_pd( v5, alph );
		_mm256_store_pd( &C[0+bs*1], v5 );
		v6 = _mm256_unpacklo_pd( v2, v3 ); // 20 21 22 23
		v6 = _mm256_mul_pd( v6, alph );
		_mm256_store_pd( &C[0+bs*2], v6 );
		v7 = _mm256_unpackhi_pd( v2, v3 ); // 30 31 32 33
		v7 = _mm256_mul_pd( v7, alph );
		_mm256_store_pd( &C[0+bs*3], v7 );

		C += sdc*bs;

#else

		v0 = _mm256_load_pd( &A[0+bs*0] ); // 00 10 20 30
		v1 = _mm256_load_pd( &A[0+bs*1] ); // 01 11 21 31
		v4 = _mm256_unpacklo_pd( v0, v1 ); // 00 01 20 21
		v5 = _mm256_unpackhi_pd( v0, v1 ); // 10 11 30 31
		v2 = _mm256_load_pd( &A[0+bs*2] ); // 02 12 22 32
		v3 = _mm256_load_pd( &A[0+bs*3] ); // 03 13 23 33
		v6 = _mm256_unpacklo_pd( v2, v3 ); // 02 03 22 23
		v7 = _mm256_unpackhi_pd( v2, v3 ); // 12 13 32 33

		A += bs*bs;

		v0 = _mm256_permute2f128_pd( v4, v6, 0x20 ); // 00 01 02 03
		_mm256_store_pd( &C[0+bs*0], v0 );
		v2 = _mm256_permute2f128_pd( v4, v6, 0x31 ); // 20 21 22 23
		_mm256_store_pd( &C[0+bs*2], v2 );
		v1 = _mm256_permute2f128_pd( v5, v7, 0x20 ); // 10 11 12 13
		_mm256_store_pd( &C[0+bs*1], v1 );
		v3 = _mm256_permute2f128_pd( v5, v7, 0x31 ); // 30 31 32 33
		_mm256_store_pd( &C[0+bs*3], v3 );

		C += bs*sdc;

#endif

		}

	cleanup_loop:

	for( ; k<kmax; k++)
		{
		C[0+bs*0] = alpha * A[0+bs*0];
		C[0+bs*1] = alpha * A[1+bs*0];
		C[0+bs*2] = alpha * A[2+bs*0];
		C[0+bs*3] = alpha * A[3+bs*0];

		C += 1;
		A += bs;
		}

	if(tri==1)
		{
		// end 3x3 triangle
		kna = (bs-(bs-kna+kmax)%bs)%bs;

		if(kna==1)
			{
			C[0+bs*1] = alpha * A[1+bs*0];
			C[0+bs*2] = alpha * A[2+bs*0];
			C[0+bs*3] = alpha * A[3+bs*0];
			C[1+bs*(sdc+1)] = alpha * A[2+bs*1];
			C[1+bs*(sdc+2)] = alpha * A[3+bs*1];
			C[2+bs*(sdc+2)] = alpha * A[3+bs*2];
			}
		else if(kna==2)
			{
			C[0+bs*1] = alpha * A[1+bs*0];
			C[0+bs*2] = alpha * A[2+bs*0];
			C[0+bs*3] = alpha * A[3+bs*0];
			C[1+bs*2] = alpha * A[2+bs*1];
			C[1+bs*3] = alpha * A[3+bs*1];
			C[2+bs*(sdc+2)] = alpha * A[3+bs*2];
			}
		else
			{
			C[0+bs*1] = alpha * A[1+bs*0];
			C[0+bs*2] = alpha * A[2+bs*0];
			C[0+bs*3] = alpha * A[3+bs*0];
			C[1+bs*2] = alpha * A[2+bs*1];
			C[1+bs*3] = alpha * A[3+bs*1];
			C[2+bs*3] = alpha * A[3+bs*2];
			}
		}

	}



// transposed of general matrices, read along panels, write across panels
void kernel_dgetr_3_lib4(int tri, int kmax, int kna, double alpha, double *A, double *C, int sdc)
	{

	if(tri==1)
		{
		// A is lower triangular, C is upper triangular
		// kmax+1 3-wide + end 2x2 triangle

		kmax += 1;
		}

	const int bs = 4;

	int k;

	k = 0;

	if(kmax<kna)
		goto cleanup_loop;

	if(kna>0)
		{
		for( ; k<kna; k++)
			{
			C[0+bs*0] = alpha * A[0+bs*0];
			C[0+bs*1] = alpha * A[1+bs*0];
			C[0+bs*2] = alpha * A[2+bs*0];

			C += 1;
			A += bs;
			}
		C += bs*(sdc-1);
		}

	for( ; k<kmax-3; k+=4)
		{
		C[0+bs*0] = alpha * A[0+bs*0];
		C[0+bs*1] = alpha * A[1+bs*0];
		C[0+bs*2] = alpha * A[2+bs*0];

		C[1+bs*0] = alpha * A[0+bs*1];
		C[1+bs*1] = alpha * A[1+bs*1];
		C[1+bs*2] = alpha * A[2+bs*1];

		C[2+bs*0] = alpha * A[0+bs*2];
		C[2+bs*1] = alpha * A[1+bs*2];
		C[2+bs*2] = alpha * A[2+bs*2];

		C[3+bs*0] = alpha * A[0+bs*3];
		C[3+bs*1] = alpha * A[1+bs*3];
		C[3+bs*2] = alpha * A[2+bs*3];

		C += bs*sdc;
		A += bs*bs;
		}

	cleanup_loop:

	for( ; k<kmax; k++)
		{
		C[0+bs*0] = alpha * A[0+bs*0];
		C[0+bs*1] = alpha * A[1+bs*0];
		C[0+bs*2] = alpha * A[2+bs*0];

		C += 1;
		A += bs;
		}

	if(tri==1)
		{
		// end 2x2 triangle
		kna = (bs-(bs-kna+kmax)%bs)%bs;

		if(kna==1)
			{
			C[0+bs*1] = alpha * A[1+bs*0];
			C[0+bs*2] = alpha * A[2+bs*0];
			C[1+bs*(sdc+1)] = alpha * A[2+bs*1];
			}
		else
			{
			C[0+bs*1] = alpha * A[1+bs*0];
			C[0+bs*2] = alpha * A[2+bs*0];
			C[1+bs*2] = alpha * A[2+bs*1];
			}
		}

	}



// transposed of general matrices, read along panels, write across panels
void kernel_dgetr_2_lib4(int tri, int kmax, int kna, double alpha, double *A, double *C, int sdc)
	{

	if(tri==1)
		{
		// A is lower triangular, C is upper triangular
		// kmax+1 2-wide + end 1x1 triangle

		kmax += 1;
		}

	const int bs = 4;

	int k;

	k = 0;

	if(kmax<kna)
		goto cleanup_loop;

	if(kna>0)
		{
		for( ; k<kna; k++)
			{
			C[0+bs*0] = alpha * A[0+bs*0];
			C[0+bs*1] = alpha * A[1+bs*0];

			C += 1;
			A += bs;
			}
		C += bs*(sdc-1);
		}

	for( ; k<kmax-3; k+=4)
		{
		C[0+bs*0] = alpha * A[0+bs*0];
		C[0+bs*1] = alpha * A[1+bs*0];

		C[1+bs*0] = alpha * A[0+bs*1];
		C[1+bs*1] = alpha * A[1+bs*1];

		C[2+bs*0] = alpha * A[0+bs*2];
		C[2+bs*1] = alpha * A[1+bs*2];

		C[3+bs*0] = alpha * A[0+bs*3];
		C[3+bs*1] = alpha * A[1+bs*3];

		C += bs*sdc;
		A += bs*bs;
		}

	cleanup_loop:

	for( ; k<kmax; k++)
		{
		C[0+bs*0] = alpha * A[0+bs*0];
		C[0+bs*1] = alpha * A[1+bs*0];

		C += 1;
		A += bs;
		}

	if(tri==1)
		{
		// end 1x1 triangle
		C[0+bs*1] = alpha * A[1+bs*0];
		}

	}



// transposed of general matrices, read along panels, write across panels
void kernel_dgetr_1_lib4(int tri, int kmax, int kna, double alpha, double *A, double *C, int sdc)
	{

	if(tri==1)
		{
		// A is lower triangular, C is upper triangular
		// kmax+1 1-wide

		kmax += 1;
		}

	const int bs = 4;

	int k;

	k = 0;

	if(kmax<kna)
		goto cleanup_loop;

	if(kna>0)
		{
		for( ; k<kna; k++)
			{
			C[0+bs*0] = alpha * A[0+bs*0];

			C += 1;
			A += bs;
			}
		C += bs*(sdc-1);
		}

	for( ; k<kmax-3; k+=4)
		{
		C[0+bs*0] = alpha * A[0+bs*0];

		C[1+bs*0] = alpha * A[0+bs*1];

		C[2+bs*0] = alpha * A[0+bs*2];

		C[3+bs*0] = alpha * A[0+bs*3];

		C += bs*sdc;
		A += bs*bs;
		}

	cleanup_loop:

	for( ; k<kmax; k++)
		{
		C[0+bs*0] = alpha * A[0+bs*0];

		C += 1;
		A += bs;
		}

	}
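
#if 0
// Illustrative driver sketch, not BLASFEO's actual dgetr routine: it only shows
// how the kernels in this file fit together. pA is an m-by-n panel-major matrix
// (panel stride sda), pC is the n-by-m panel-major result (panel stride sdc),
// and C = alpha * A^T. Both matrices are assumed panel-aligned (so kna==0
// everywhere) and the general case tri==0 is used; the name example_dgetr_lib4
// is made up for this sketch.
static void example_dgetr_lib4(int m, int n, double alpha, double *pA, int sda, double *pC, int sdc)
	{
	const int bs = 4;
	int ii = 0;
	// pairs of A-panels (8 rows of A -> 8 columns of C) with the wide kernel
	for( ; ii<m-7; ii+=8)
		kernel_dgetr_8_lib4(0, n, 0, alpha, pA+ii*sda, sda, pC+ii*bs, sdc);
	// one A-panel (4 rows of A -> 4 columns of C) at a time
	for( ; ii<m-3; ii+=4)
		kernel_dgetr_4_lib4(0, n, 0, alpha, pA+ii*sda, pC+ii*bs, sdc);
	// remainder strip of 3, 2 or 1 rows of A
	if(m-ii==3)
		kernel_dgetr_3_lib4(0, n, 0, alpha, pA+ii*sda, pC+ii*bs, sdc);
	else if(m-ii==2)
		kernel_dgetr_2_lib4(0, n, 0, alpha, pA+ii*sda, pC+ii*bs, sdc);
	else if(m-ii==1)
		kernel_dgetr_1_lib4(0, n, 0, alpha, pA+ii*sda, pC+ii*bs, sdc);
	}
#endif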



// transposed of general matrices, read across panels, write along panels
void kernel_dgetr_4_0_lib4(int kmax, double *A, int sda, double *B)
	{
	const int ps = 4;
	__m256d
		v0, v1, v2, v3, v4, v5, v6, v7;
	int k;
	for(k=0; k<kmax-3; k+=4)
		{

		v0 = _mm256_insertf128_pd( _mm256_castpd128_pd256( _mm_load_pd( &A[0+ps*0] ) ), _mm_load_pd( &A[0+ps*2]) , 0x1 ); // 00 10 02 12
		v1 = _mm256_insertf128_pd( _mm256_castpd128_pd256( _mm_load_pd( &A[0+ps*1] ) ), _mm_load_pd( &A[0+ps*3]) , 0x1 ); // 01 11 03 13
		v2 = _mm256_insertf128_pd( _mm256_castpd128_pd256( _mm_load_pd( &A[2+ps*0] ) ), _mm_load_pd( &A[2+ps*2]) , 0x1 ); // 20 30 22 32
		v3 = _mm256_insertf128_pd( _mm256_castpd128_pd256( _mm_load_pd( &A[2+ps*1] ) ), _mm_load_pd( &A[2+ps*3]) , 0x1 ); // 21 31 23 33

		v4 = _mm256_unpacklo_pd( v0, v1 ); // 00 01 02 03
		_mm256_store_pd( &B[0+ps*0], v4 );
		v5 = _mm256_unpackhi_pd( v0, v1 ); // 10 11 12 13
		_mm256_store_pd( &B[0+ps*1], v5 );
		v6 = _mm256_unpacklo_pd( v2, v3 ); // 20 21 22 23
		_mm256_store_pd( &B[0+ps*2], v6 );
		v7 = _mm256_unpackhi_pd( v2, v3 ); // 30 31 32 33
		_mm256_store_pd( &B[0+ps*3], v7 );

		A += ps*sda;
		B += ps*ps;
		}
	for( ; k<kmax; k++)
		{
		//
		B[0+ps*0] = A[0+ps*0];
		B[1+ps*0] = A[0+ps*1];
		B[2+ps*0] = A[0+ps*2];
		B[3+ps*0] = A[0+ps*3];

		A += 1;
		B += ps;
		}
	return;
	}
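
#if 0
// Illustrative scalar sketch (not part of the BLASFEO API): what
// kernel_dgetr_4_0_lib4 above computes. A is a kmax-by-4 panel-major block read
// across panels (panel stride sda, panel-aligned); B is a single 4-row panel
// written along the panel and receives the 4-by-kmax transpose. This variant
// has no alpha scaling and no kna handling.
static void ref_dgetr_4_0_lib4(int kmax, double *A, int sda, double *B)
	{
	const int ps = 4;
	int k, j;
	for(k=0; k<kmax; k++)
		for(j=0; j<ps; j++)
			B[j + ps*k] = A[k%ps + ps*j + (k/ps)*ps*sda];
	}
#endif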