blob: 7d62277015dc69164a5b0524f94127ce4c988ef2 [file] [log] [blame]
Austin Schuh9a24b372018-01-28 16:12:29 -08001/**************************************************************************************************
2* *
3* This file is part of BLASFEO. *
4* *
5* BLASFEO -- BLAS For Embedded Optimization. *
6* Copyright (C) 2016-2017 by Gianluca Frison. *
7* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl. *
8* All rights reserved. *
9* *
10* HPMPC is free software; you can redistribute it and/or *
11* modify it under the terms of the GNU Lesser General Public *
12* License as published by the Free Software Foundation; either *
13* version 2.1 of the License, or (at your option) any later version. *
14* *
15* HPMPC is distributed in the hope that it will be useful, *
16* but WITHOUT ANY WARRANTY; without even the implied warranty of *
17* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. *
18* See the GNU Lesser General Public License for more details. *
19* *
20* You should have received a copy of the GNU Lesser General Public *
21* License along with HPMPC; if not, write to the Free Software *
22* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA *
23* *
24* Author: Gianluca Frison, giaf (at) dtu.dk *
25* gianluca.frison (at) imtek.uni-freiburg.de *
26* *
27**************************************************************************************************/
28
29
30
31// transposed of general matrices, read along panels, write across panels
32void kernel_dgetr_4_lib4(int tri, int kmax, int kna, double alpha, double *A, double *C, int sdc)
33 {
34
35 if(tri==1)
36 {
37 // A is lower triangular, C is upper triangular
38 // kmax+1 4-wide + end 3x3 triangle
39
40 kmax += 1;
41 }
42
43 const int bs = 4;
44
45 int k;
46
47 k = 0;
48
49 if(kmax<kna)
50 goto cleanup_loop;
51
52 if(kna>0)
53 {
54 for( ; k<kna; k++)
55 {
56 C[0+bs*0] = alpha * A[0+bs*0];
57 C[0+bs*1] = alpha * A[1+bs*0];
58 C[0+bs*2] = alpha * A[2+bs*0];
59 C[0+bs*3] = alpha * A[3+bs*0];
60
61 C += 1;
62 A += bs;
63 }
64 C += bs*(sdc-1);
65 }
66
67 for( ; k<kmax-3; k+=4)
68 {
69 C[0+bs*0] = alpha * A[0+bs*0];
70 C[0+bs*1] = alpha * A[1+bs*0];
71 C[0+bs*2] = alpha * A[2+bs*0];
72 C[0+bs*3] = alpha * A[3+bs*0];
73
74 C[1+bs*0] = alpha * A[0+bs*1];
75 C[1+bs*1] = alpha * A[1+bs*1];
76 C[1+bs*2] = alpha * A[2+bs*1];
77 C[1+bs*3] = alpha * A[3+bs*1];
78
79 C[2+bs*0] = alpha * A[0+bs*2];
80 C[2+bs*1] = alpha * A[1+bs*2];
81 C[2+bs*2] = alpha * A[2+bs*2];
82 C[2+bs*3] = alpha * A[3+bs*2];
83
84 C[3+bs*0] = alpha * A[0+bs*3];
85 C[3+bs*1] = alpha * A[1+bs*3];
86 C[3+bs*2] = alpha * A[2+bs*3];
87 C[3+bs*3] = alpha * A[3+bs*3];
88
89 C += bs*sdc;
90 A += bs*bs;
91 }
92
93 cleanup_loop:
94
95 for( ; k<kmax; k++)
96 {
97 C[0+bs*0] = alpha * A[0+bs*0];
98 C[0+bs*1] = alpha * A[1+bs*0];
99 C[0+bs*2] = alpha * A[2+bs*0];
100 C[0+bs*3] = alpha * A[3+bs*0];
101
102 C += 1;
103 A += bs;
104 }
105
106 if(tri==1)
107 {
108 // end 3x3 triangle
109 kna = (bs-(bs-kna+kmax)%bs)%bs;
110
111 if(kna==1)
112 {
113 C[0+bs*1] = alpha * A[1+bs*0];
114 C[0+bs*2] = alpha * A[2+bs*0];
115 C[0+bs*3] = alpha * A[3+bs*0];
116 C[1+bs*(sdc+1)] = alpha * A[2+bs*1];
117 C[1+bs*(sdc+2)] = alpha * A[3+bs*1];
118 C[2+bs*(sdc+2)] = alpha * A[3+bs*2];
119 }
120 else if(kna==2)
121 {
122 C[0+bs*1] = alpha * A[1+bs*0];
123 C[0+bs*2] = alpha * A[2+bs*0];
124 C[0+bs*3] = alpha * A[3+bs*0];
125 C[1+bs*2] = alpha * A[2+bs*1];
126 C[1+bs*3] = alpha * A[3+bs*1];
127 C[2+bs*(sdc+2)] = alpha * A[3+bs*2];
128 }
129 else
130 {
131 C[0+bs*1] = alpha * A[1+bs*0];
132 C[0+bs*2] = alpha * A[2+bs*0];
133 C[0+bs*3] = alpha * A[3+bs*0];
134 C[1+bs*2] = alpha * A[2+bs*1];
135 C[1+bs*3] = alpha * A[3+bs*1];
136 C[2+bs*3] = alpha * A[3+bs*2];
137 }
138 }
139
140 }
141
142
143
144// transposed of general matrices, read along panels, write across panels
145void kernel_dgetr_3_lib4(int tri, int kmax, int kna, double alpha, double *A, double *C, int sdc)
146 {
147
148 if(tri==1)
149 {
150 // A is lower triangular, C is upper triangular
151 // kmax+1 3-wide + end 2x2 triangle
152
153 kmax += 1;
154 }
155
156 const int bs = 4;
157
158 int k;
159
160 k = 0;
161
162 if(kmax<kna)
163 goto cleanup_loop;
164
165 if(kna>0)
166 {
167 for( ; k<kna; k++)
168 {
169 C[0+bs*0] = alpha * A[0+bs*0];
170 C[0+bs*1] = alpha * A[1+bs*0];
171 C[0+bs*2] = alpha * A[2+bs*0];
172
173 C += 1;
174 A += bs;
175 }
176 C += bs*(sdc-1);
177 }
178
179 for( ; k<kmax-3; k+=4)
180 {
181 C[0+bs*0] = alpha * A[0+bs*0];
182 C[0+bs*1] = alpha * A[1+bs*0];
183 C[0+bs*2] = alpha * A[2+bs*0];
184
185 C[1+bs*0] = alpha * A[0+bs*1];
186 C[1+bs*1] = alpha * A[1+bs*1];
187 C[1+bs*2] = alpha * A[2+bs*1];
188
189 C[2+bs*0] = alpha * A[0+bs*2];
190 C[2+bs*1] = alpha * A[1+bs*2];
191 C[2+bs*2] = alpha * A[2+bs*2];
192
193 C[3+bs*0] = alpha * A[0+bs*3];
194 C[3+bs*1] = alpha * A[1+bs*3];
195 C[3+bs*2] = alpha * A[2+bs*3];
196
197 C += bs*sdc;
198 A += bs*bs;
199 }
200
201 cleanup_loop:
202
203 for( ; k<kmax; k++)
204 {
205 C[0+bs*0] = alpha * A[0+bs*0];
206 C[0+bs*1] = alpha * A[1+bs*0];
207 C[0+bs*2] = alpha * A[2+bs*0];
208
209 C += 1;
210 A += bs;
211 }
212
213 if(tri==1)
214 {
215 // end 2x2 triangle
216 kna = (bs-(bs-kna+kmax)%bs)%bs;
217
218 if(kna==1)
219 {
220 C[0+bs*1] = alpha * A[1+bs*0];
221 C[0+bs*2] = alpha * A[2+bs*0];
222 C[1+bs*(sdc+1)] = alpha * A[2+bs*1];
223 }
224 else
225 {
226 C[0+bs*1] = alpha * A[1+bs*0];
227 C[0+bs*2] = alpha * A[2+bs*0];
228 C[1+bs*2] = alpha * A[2+bs*1];
229 }
230 }
231
232 }
233
234
235
236// transposed of general matrices, read along panels, write across panels
237void kernel_dgetr_2_lib4(int tri, int kmax, int kna, double alpha, double *A, double *C, int sdc)
238 {
239
240 if(tri==1)
241 {
242 // A is lower triangular, C is upper triangular
243 // kmax+1 2-wide + end 1x1 triangle
244
245 kmax += 1;
246 }
247
248 const int bs = 4;
249
250 int k;
251
252 k = 0;
253
254 if(kmax<kna)
255 goto cleanup_loop;
256
257 if(kna>0)
258 {
259 for( ; k<kna; k++)
260 {
261 C[0+bs*0] = alpha * A[0+bs*0];
262 C[0+bs*1] = alpha * A[1+bs*0];
263
264 C += 1;
265 A += bs;
266 }
267 C += bs*(sdc-1);
268 }
269
270 for( ; k<kmax-3; k+=4)
271 {
272 C[0+bs*0] = alpha * A[0+bs*0];
273 C[0+bs*1] = alpha * A[1+bs*0];
274
275 C[1+bs*0] = alpha * A[0+bs*1];
276 C[1+bs*1] = alpha * A[1+bs*1];
277
278 C[2+bs*0] = alpha * A[0+bs*2];
279 C[2+bs*1] = alpha * A[1+bs*2];
280
281 C[3+bs*0] = alpha * A[0+bs*3];
282 C[3+bs*1] = alpha * A[1+bs*3];
283
284 C += bs*sdc;
285 A += bs*bs;
286 }
287
288 cleanup_loop:
289
290 for( ; k<kmax; k++)
291 {
292 C[0+bs*0] = alpha * A[0+bs*0];
293 C[0+bs*1] = alpha * A[1+bs*0];
294
295 C += 1;
296 A += bs;
297 }
298
299 if(tri==1)
300 {
301 // end 1x1 triangle
302 C[0+bs*1] = alpha * A[1+bs*0];
303 }
304
305 }
306
307
308
309// transposed of general matrices, read along panels, write across panels
310void kernel_dgetr_1_lib4(int tri, int kmax, int kna, double alpha, double *A, double *C, int sdc)
311 {
312
313 if(tri==1)
314 {
315 // A is lower triangular, C is upper triangular
316 // kmax+1 1-wide
317
318 kmax += 1;
319 }
320
321 const int bs = 4;
322
323 int k;
324
325 k = 0;
326
327 if(kmax<kna)
328 goto cleanup_loop;
329
330 if(kna>0)
331 {
332 for( ; k<kna; k++)
333 {
334 C[0+bs*0] = alpha * A[0+bs*0];
335
336 C += 1;
337 A += bs;
338 }
339 C += bs*(sdc-1);
340 }
341
342 for( ; k<kmax-3; k+=4)
343 {
344 C[0+bs*0] = alpha * A[0+bs*0];
345
346 C[1+bs*0] = alpha * A[0+bs*1];
347
348 C[2+bs*0] = alpha * A[0+bs*2];
349
350 C[3+bs*0] = alpha * A[0+bs*3];
351
352 C += bs*sdc;
353 A += bs*bs;
354 }
355
356 cleanup_loop:
357
358 for( ; k<kmax; k++)
359 {
360 C[0+bs*0] = alpha * A[0+bs*0];
361
362 C += 1;
363 A += bs;
364 }
365
366 }
367
368
369
370// transposed of general matrices, read across panels, write along panels
371void kernel_dgetr_4_0_lib4(int kmax, double *A, int sda, double *B)
372 {
373 const int ps = 4;
374 int k;
375 for(k=0; k<kmax-3; k+=4)
376 {
377 //
378 B[0+ps*0] = A[0+ps*0];
379 B[0+ps*1] = A[1+ps*0];
380 B[0+ps*2] = A[2+ps*0];
381 B[0+ps*3] = A[3+ps*0];
382 //
383 B[1+ps*0] = A[0+ps*1];
384 B[1+ps*1] = A[1+ps*1];
385 B[1+ps*2] = A[2+ps*1];
386 B[1+ps*3] = A[3+ps*1];
387 //
388 B[2+ps*0] = A[0+ps*2];
389 B[2+ps*1] = A[1+ps*2];
390 B[2+ps*2] = A[2+ps*2];
391 B[2+ps*3] = A[3+ps*2];
392 //
393 B[3+ps*0] = A[0+ps*3];
394 B[3+ps*1] = A[1+ps*3];
395 B[3+ps*2] = A[2+ps*3];
396 B[3+ps*3] = A[3+ps*3];
397
398 A += ps*sda;
399 B += ps*ps;
400 }
401 for( ; k<kmax; k++)
402 {
403 //
404 B[0+ps*0] = A[0+ps*0];
405 B[1+ps*0] = A[0+ps*1];
406 B[2+ps*0] = A[0+ps*2];
407 B[3+ps*0] = A[0+ps*3];
408
409 A += 1;
410 B += ps;
411 }
412 return;
413 }
414