blob: 36a655f8515c008e39bc3e9a4d4dbf192c2521f5 [file] [log] [blame]
Austin Schuh16ce3c72018-01-28 16:17:08 -08001/**************************************************************************************************
2* *
3* This file is part of HPMPC. *
4* *
5* HPMPC -- Library for High-Performance implementation of solvers for MPC. *
6* Copyright (C) 2014-2015 by Technical University of Denmark. All rights reserved. *
7* *
8* HPMPC is free software; you can redistribute it and/or *
9* modify it under the terms of the GNU Lesser General Public *
10* License as published by the Free Software Foundation; either *
11* version 2.1 of the License, or (at your option) any later version. *
12* *
13* HPMPC is distributed in the hope that it will be useful, *
14* but WITHOUT ANY WARRANTY; without even the implied warranty of *
15* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. *
16* See the GNU Lesser General Public License for more details. *
17* *
18* You should have received a copy of the GNU Lesser General Public *
19* License along with HPMPC; if not, write to the Free Software *
20* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA *
21* *
22* Author: Gianluca Frison, giaf (at) dtu.dk *
23* *
24**************************************************************************************************/
25
26#include <stdio.h>
27#include <stdlib.h>
28#include <math.h>
29
30#include <blasfeo_s_aux_ext_dep.h>
31
32
33
34// matrix-vector multiplication
35void sgemv_n_3l(int m, int n, float *A, int lda , float *x, float *z)
36 {
37
38 int ii, jj;
39
40 for(ii=0; ii<m; ii++)
41 {
42 z[ii] = 0.0;
43 for(jj=0; jj<n; jj++)
44 {
45 z[ii] += A[ii+lda*jj] * x[jj];
46 }
47 }
48
49 return;
50
51 }
52
53
54// matrix-matrix multiplication
55void sgemm_nn_3l(int m, int n, int k, float *A, int lda , float *B, int ldb, float *C, int ldc)
56 {
57
58 int ii, jj, kk;
59
60 for(jj=0; jj<n; jj++)
61 {
62 for(ii=0; ii<m; ii++)
63 {
64 C[ii+ldc*jj] = 0;
65 for(kk=0; kk<k; kk++)
66 {
67 C[ii+ldc*jj] += A[ii+lda*kk] * B[kk+ldb*jj];
68 }
69 }
70 }
71
72 return;
73
74 }
75
76
77void saxpy_3l(int n, float da, float *dx, float *dy)
78 {
79 int i;
80 for(i=0; i<n; i++)
81 {
82 dy[i] += da*dx[i];
83 }
84 }
85
86
87
88void sscal_3l(int n, float da, float *dx)
89 {
90 int i;
91 for(i=0; i<n; i++)
92 {
93 dx[i] *= da;
94 }
95 }
96
97
98
99/************************************************
100 Routine that copies a matrix
101************************************************/
102void smcopy(int row, int col, float *A, int lda, float *B, int ldb)
103 {
104 int i, j;
105 for(j=0; j<col; j++)
106 {
107 for(i=0; i<row; i++)
108 {
109 B[i+j*ldb] = A[i+j*lda];
110 }
111 }
112 }
113
114
115
116int idamax_3l(int n, float *x)
117 {
118
119 if(n<=0)
120 return 0;
121 if(n==1)
122 return 0;
123
124 float dabs;
125 float dmax = (x[0]>0 ? x[0] : -x[0]);
126 int idmax = 0;
127 int jj;
128 for(jj=1; jj<n; jj++)
129 {
130 dabs = (x[jj]>0 ? x[jj] : -x[jj]);
131 if(dabs>dmax)
132 {
133 dmax = dabs;
134 idmax = jj;
135 }
136 }
137
138 return idmax;
139
140 }
141
142
143
144void sswap_3l(int n, float *x, int incx, float *y, int incy)
145 {
146
147 if(n<=0)
148 return;
149
150 float temp;
151 int jj;
152 for(jj=0; jj<n; jj++)
153 {
154 temp = x[0];
155 x[0] = y[0];
156 y[0] = temp;
157 x += incx;
158 y += incy;
159 }
160
161 }
162
163
164
165void sger_3l(int m, int n, float alpha, float *x, int incx, float *y, int incy, float *A, int lda)
166 {
167
168 if(m==0 || n==0 || alpha==0.0)
169 return;
170
171 int i, j;
172 float *px, *py, temp;
173
174 py = y;
175 for(j=0; j<n; j++)
176 {
177 temp = alpha * py[0];
178 px = x;
179 for(i=0; i<m; i++)
180 {
181 A[i+lda*j] += px[0] * temp;
182 px += incx;
183 }
184 py += incy;
185 }
186
187 return;
188
189 }
190
191
192
193void sgetf2_3l(int m, int n, float *A, int lda, int *ipiv, int *info)
194 {
195
196 if(m<=0 || n<=0)
197 return;
198
199 int i, j, jp;
200
201 float Ajj;
202
203 int size_min = ( m<n ? m : n );
204
205 for(j=0; j<size_min; j++)
206 // find the pivot and test for singularity
207 {
208 jp = j + idamax_3l(m-j, &A[j+lda*j]);
209 ipiv[j] = jp;
210 if( A[jp+lda*j]!=0)
211 {
212 // apply the interchange to columns 0:n-1
213 if(jp!=j)
214 {
215 sswap_3l(n, &A[j], lda, &A[jp], lda);
216 }
217 // compute elements j+1:m-1 of j-th column
218 if(j<m-1)
219 {
220 Ajj = A[j+lda*j];
221 if( ( Ajj>0 ? Ajj : -Ajj ) >= 2.22e-16 )
222 {
223 sscal_3l(m-j-1, 1.0/Ajj, &A[j+1+lda*j]);
224 }
225 else
226 {
227 for(i=j+1; i<m; i++)
228 {
229 A[i+lda*j] /= Ajj;
230 }
231 }
232 }
233 }
234 else if(*info==0)
235 {
236 *info = j+1;
237 }
238
239 if( j < size_min )
240 {
241 // update trailing submatrix
242 sger_3l(m-j-1, n-j-1, -1.0, &A[j+1+lda*j], 1, &A[j+lda*(j+1)], lda, &A[j+1+lda*(j+1)], lda);
243 }
244
245 }
246
247 return;
248
249 }
250
251
252
253void slaswp_3l(int n, float *A, int lda, int k1, int k2, int *ipiv)
254 {
255
256 int i, j, k, ix, ix0, i1, i2, n32, ip;
257 float temp;
258
259 ix0 = k1;
260 i1 = k1;
261 i2 = k2;
262
263 n32 = (n/32)*32;
264 if(n32!=0)
265 {
266 for(j=0; j<n32; j+=32)
267 {
268 ix = ix0;
269 for(i=i1; i<i2; i++)
270 {
271 ip = ipiv[ix];
272 if(ip!=i)
273 {
274 for(k=j; k<j+32; k++)
275 {
276 temp = A[i+lda*k];
277 A[i+lda*k] = A[ip+lda*k];
278 A[ip+lda*k] = temp;
279 }
280 }
281 ix++;
282 }
283 }
284 }
285 if(n32!=n)
286 {
287 ix = ix0;
288 for(i=i1; i<i2; i++)
289 {
290 ip = ipiv[ix];
291 if(ip!=i)
292 {
293 for(k=n32; k<n; k++)
294 {
295 temp = A[i+lda*k];
296 A[i+lda*k] = A[ip+lda*k];
297 A[ip+lda*k] = temp;
298 }
299 }
300 ix++;
301 }
302 }
303
304 return;
305
306 }
307
308
309
310// left lower no-transp unit
311void strsm_l_l_n_u_3l(int m, int n, float *A, int lda, float *B, int ldb)
312 {
313
314 if(m==0 || n==0)
315 return;
316
317 int i, j, k;
318
319 for(j=0; j<n; j++)
320 {
321 for(k=0; k<m; k++)
322 {
323 for(i=k+1; i<m; i++)
324 {
325 B[i+ldb*j] -= B[k+ldb*j] * A[i+lda*k];
326 }
327 }
328 }
329
330 return;
331
332 }
333
334
335
336// left upper no-transp non-unit
337void strsm_l_u_n_n_3l(int m, int n, float *A, int lda, float *B, int ldb)
338 {
339
340 if(m==0 || n==0)
341 return;
342
343 int i, j, k;
344
345 for(j=0; j<n; j++)
346 {
347 for(k=m-1; k>=0; k--)
348 {
349 B[k+ldb*j] /= A[k+lda*k];
350 for(i=0; i<k; i++)
351 {
352 B[i+ldb*j] -= B[k+ldb*j] * A[i+lda*k];
353 }
354 }
355 }
356
357 return;
358
359 }
360
361
362
363void sgetrs_3l(int n, int nrhs, float *A, int lda, int *ipiv, float *B, int ldb, int *info)
364 {
365
366 if(n==0 || nrhs==0)
367 return;
368
369 // solve A * X = B
370
371 // apply row interchanges to the rhs
372 slaswp_3l(nrhs, B, ldb, 0, n, ipiv);
373
374 // solve L*X = B, overwriting B with X
375 strsm_l_l_n_u_3l(n, nrhs, A, lda, B, ldb);
376
377 // solve U*X = B, overwriting B with X
378 strsm_l_u_n_n_3l(n, nrhs, A, lda, B, ldb);
379
380 return;
381
382 }
383
384
385
386void sgesv_3l(int n, int nrhs, float *A, int lda, int *ipiv, float *B, int ldb, int *info)
387 {
388
389 // compute the LU factorization of A
390 sgetf2_3l(n, n, A, lda, ipiv, info);
391
392 if(*info==0)
393 {
394 // solve the system A*X = B, overwriting B with X
395 sgetrs_3l(n, nrhs, A, lda, ipiv, B, ldb, info);
396 }
397
398 return;
399
400 }
401
402
403
404/* one norm of a matrix */
405float onenorm(int row, int col, float *ptrA)
406 {
407 float max, temp;
408 int i, j;
409 temp = 0;
410 for(j=0; j<col; j++)
411 {
412 temp = fabs(*(ptrA+j*row));
413 for(i=1; i<row; i++)
414 {
415 temp += fabs(*(ptrA+j*row+i));
416 }
417 if(j==0) max = temp;
418 else if(max>temp) temp = max;
419 }
420 return temp;
421 }
422
423
424
425/* computes the Pade approximation of degree m of the matrix A */
426void padeapprox(int m, int row, float *A)
427 {
428 int row2 = row*row;
429/* int i1 = 1;*/
430/* float d0 = 0;*/
431/* float d1 = 1;*/
432/* float dm1 = -1;*/
433
434 int ii;
435
436 float *U = malloc(row*row*sizeof(float));
437 float *V = malloc(row*row*sizeof(float));
438
439 if(m==3)
440 {
441 float c[] = {120, 60, 12, 1};
442 float *A0 = malloc(row*row*sizeof(float)); for(ii=0; ii<row2; ii++) A0[ii] = 0.0; for(ii=0; ii<row; ii++) A0[ii*(row+1)] = 1.0;
443 float *A2 = malloc(row*row*sizeof(float));
444// char ta = 'n'; float alpha = 1; float beta = 0;
445// sgemm_(&ta, &ta, &row, &row, &row, &alpha, A, &row, A, &row, &beta, A2, &row);
446 sgemm_nn_3l(row, row, row, A, row, A, row, A2, row);
447 float *temp = malloc(row*row*sizeof(float));
448// sscal_(&row2, &d0, temp, &i1);
449 sscal_3l(row2, 0, temp);
450// saxpy_(&row2, &c[3], A2, &i1, temp, &i1);
451 saxpy_3l(row2, c[3], A2, temp);
452// saxpy_(&row2, &c[1], A0, &i1, temp, &i1);
453 saxpy_3l(row2, c[1], A0, temp);
454// sgemm_(&ta, &ta, &row, &row, &row, &alpha, A, &row, temp, &row, &beta, U, &row);
455 sgemm_nn_3l(row, row, row, A, row, temp, row, U, row);
456// sscal_(&row2, &d0, V, &i1);
457 sscal_3l(row2, 0, V);
458// saxpy_(&row2, &c[2], A2, &i1, V, &i1);
459 saxpy_3l(row2, c[2], A2, V);
460// saxpy_(&row2, &c[0], A0, &i1, V, &i1);
461 saxpy_3l(row2, c[0], A0, V);
462 free(A0);
463 free(A2);
464 free(temp);
465 }
466 else if(m==5)
467 {
468 float c[] = {30240, 15120, 3360, 420, 30, 1};
469 float *A0 = malloc(row*row*sizeof(float)); for(ii=0; ii<row2; ii++) A0[ii] = 0.0; for(ii=0; ii<row; ii++) A0[ii*(row+1)] = 1.0;
470 float *A2 = malloc(row*row*sizeof(float));
471 float *A4 = malloc(row*row*sizeof(float));
472// char ta = 'n'; float alpha = 1; float beta = 0;
473// sgemm_(&ta, &ta, &row, &row, &row, &alpha, A, &row, A, &row, &beta, A2, &row);
474 sgemm_nn_3l(row, row, row, A, row, A, row, A2, row);
475// sgemm_(&ta, &ta, &row, &row, &row, &alpha, A2, &row, A2, &row, &beta, A4, &row);
476 sgemm_nn_3l(row, row, row, A2, row, A2, row, A4, row);
477 smcopy(row, row, A4, row, V, row);
478 float *temp = malloc(row*row*sizeof(float));
479 smcopy(row, row, A4, row, temp, row);
480// saxpy_(&row2, &c[3], A2, &i1, temp, &i1);
481 saxpy_3l(row2, c[3], A2, temp);
482// saxpy_(&row2, &c[1], A0, &i1, temp, &i1);
483 saxpy_3l(row2, c[1], A0, temp);
484// sgemm_(&ta, &ta, &row, &row, &row, &alpha, A, &row, temp, &row, &beta, U, &row);
485 sgemm_nn_3l(row, row, row, A, row, temp, row, U, row);
486// sscal_(&row2, &c[4], V, &i1);
487 sscal_3l(row2, c[4], V);
488// saxpy_(&row2, &c[2], A2, &i1, V, &i1);
489 saxpy_3l(row2, c[2], A2, V);
490// saxpy_(&row2, &c[0], A0, &i1, V, &i1);
491 saxpy_3l(row2, c[0], A0, V);
492 free(A0);
493 free(A2);
494 free(A4);
495 free(temp);
496 }
497 else if(m==7)
498 {
499 float c[] = {17297280, 8648640, 1995840, 277200, 25200, 1512, 56, 1};
500 float *A0 = malloc(row*row*sizeof(float)); for(ii=0; ii<row2; ii++) A0[ii] = 0.0; for(ii=0; ii<row; ii++) A0[ii*(row+1)] = 1.0;
501 float *A2 = malloc(row*row*sizeof(float));
502 float *A4 = malloc(row*row*sizeof(float));
503 float *A6 = malloc(row*row*sizeof(float));
504// char ta = 'n'; float alpha = 1; float beta = 1;
505// sgemm_(&ta, &ta, &row, &row, &row, &alpha, A, &row, A, &row, &beta, A2, &row);
506 sgemm_nn_3l(row, row, row, A, row, A, row, A2, row);
507// sgemm_(&ta, &ta, &row, &row, &row, &alpha, A2, &row, A2, &row, &beta, A4, &row);
508 sgemm_nn_3l(row, row, row, A2, row, A2, row, A4, row);
509// sgemm_(&ta, &ta, &row, &row, &row, &alpha, A4, &row, A2, &row, &beta, A6, &row);
510 sgemm_nn_3l(row, row, row, A4, row, A2, row, A6, row);
511 float *temp = malloc(row*row*sizeof(float));
512// sscal_(&row2, &d0, temp, &i1);
513 sscal_3l(row2, 0, temp);
514// saxpy_(&row2, &c[3], A2, &i1, temp, &i1);
515 saxpy_3l(row2, c[3], A2, temp);
516// saxpy_(&row2, &c[1], A0, &i1, temp, &i1);
517 saxpy_3l(row2, c[1], A0, temp);
518// saxpy_(&row2, &c[5], A4, &i1, temp, &i1);
519 saxpy_3l(row2, c[5], A4, temp);
520// saxpy_(&row2, &c[7], A6, &i1, temp, &i1);
521 saxpy_3l(row2, c[7], A6, temp);
522// sgemm_(&ta, &ta, &row, &row, &row, &alpha, A, &row, temp, &row, &beta, U, &row);
523 sgemm_nn_3l(row, row, row, A, row, temp, row, U, row);
524// sscal_(&row2, &d0, V, &i1);
525 sscal_3l(row2, 0, V);
526// saxpy_(&row2, &c[2], A2, &i1, V, &i1);
527 saxpy_3l(row2, c[2], A2, V);
528// saxpy_(&row2, &c[0], A0, &i1, V, &i1);
529 saxpy_3l(row2, c[0], A0, V);
530// saxpy_(&row2, &c[4], A4, &i1, V, &i1);
531 saxpy_3l(row2, c[4], A4, V);
532// saxpy_(&row2, &c[6], A6, &i1, V, &i1);
533 saxpy_3l(row2, c[6], A6, V);
534 free(A0);
535 free(A2);
536 free(A4);
537 free(A6);
538 free(temp);
539 }
540 else if(m==9)
541 {
542 float c[] = {17643225600, 8821612800, 2075673600, 302702400, 30270240, 2162160, 110880, 3960, 90, 1};
543 float *A0 = malloc(row*row*sizeof(float)); for(ii=0; ii<row2; ii++) A0[ii] = 0.0; for(ii=0; ii<row; ii++) A0[ii*(row+1)] = 1.0;
544 float *A2 = malloc(row*row*sizeof(float));
545 float *A4 = malloc(row*row*sizeof(float));
546 float *A6 = malloc(row*row*sizeof(float));
547 float *A8 = malloc(row*row*sizeof(float));
548// char ta = 'n'; float alpha = 1; float beta = 0;
549// sgemm_(&ta, &ta, &row, &row, &row, &alpha, A, &row, A, &row, &beta, A2, &row);
550 sgemm_nn_3l(row, row, row, A, row, A, row, A2, row);
551// sgemm_(&ta, &ta, &row, &row, &row, &alpha, A2, &row, A2, &row, &beta, A4, &row);
552 sgemm_nn_3l(row, row, row, A2, row, A2, row, A4, row);
553// sgemm_(&ta, &ta, &row, &row, &row, &alpha, A4, &row, A2, &row, &beta, A6, &row);
554 sgemm_nn_3l(row, row, row, A4, row, A2, row, A6, row);
555// sgemm_(&ta, &ta, &row, &row, &row, &alpha, A6, &row, A2, &row, &beta, A8, &row);
556 sgemm_nn_3l(row, row, row, A6, row, A2, row, A8, row);
557 smcopy(row, row, A8, row, V, row);
558 float *temp = malloc(row*row*sizeof(float));
559 smcopy(row, row, A8, row, temp, row);
560// saxpy_(&row2, &c[3], A2, &i1, temp, &i1);
561 saxpy_3l(row2, c[3], A2, temp);
562// saxpy_(&row2, &c[1], A0, &i1, temp, &i1);
563 saxpy_3l(row2, c[1], A0, temp);
564// saxpy_(&row2, &c[5], A4, &i1, temp, &i1);
565 saxpy_3l(row2, c[5], A4, temp);
566// saxpy_(&row2, &c[7], A6, &i1, temp, &i1);
567 saxpy_3l(row2, c[7], A6, temp);
568// sgemm_(&ta, &ta, &row, &row, &row, &alpha, A, &row, temp, &row, &beta, U, &row);
569 sgemm_nn_3l(row, row, row, A, row, temp, row, U, row);
570// sscal_(&row2, &c[8], V, &i1);
571 sscal_3l(row2, c[8], V);
572// saxpy_(&row2, &c[2], A2, &i1, V, &i1);
573 saxpy_3l(row2, c[2], A2, V);
574// saxpy_(&row2, &c[0], A0, &i1, V, &i1);
575 saxpy_3l(row2, c[0], A0, V);
576// saxpy_(&row2, &c[4], A4, &i1, V, &i1);
577 saxpy_3l(row2, c[4], A4, V);
578// saxpy_(&row2, &c[6], A6, &i1, V, &i1);
579 saxpy_3l(row2, c[6], A6, V);
580 free(A0);
581 free(A2);
582 free(A4);
583 free(A6);
584 free(A8);
585 free(temp);
586 }
587 else if(m==13) // tested
588 {
589 float c[] = {64764752532480000, 32382376266240000, 7771770303897600, 1187353796428800, 129060195264000, 10559470521600, 670442572800, 33522128640, 1323241920, 40840800, 960960, 16380, 182, 1};
590 float *A0 = malloc(row*row*sizeof(float)); for(ii=0; ii<row2; ii++) A0[ii] = 0.0; for(ii=0; ii<row; ii++) A0[ii*(row+1)] = 1.0;
591 float *A2 = malloc(row*row*sizeof(float));
592 float *A4 = malloc(row*row*sizeof(float));
593 float *A6 = malloc(row*row*sizeof(float));
594// char ta = 'n'; float alpha = 1; float beta = 0;
595// sgemm_(&ta, &ta, &row, &row, &row, &alpha, A, &row, A, &row, &beta, A2, &row);
596 sgemm_nn_3l(row, row, row, A, row, A, row, A2, row);
597// sgemm_(&ta, &ta, &row, &row, &row, &alpha, A2, &row, A2, &row, &beta, A4, &row);
598 sgemm_nn_3l(row, row, row, A2, row, A2, row, A4, row);
599// sgemm_(&ta, &ta, &row, &row, &row, &alpha, A4, &row, A2, &row, &beta, A6, &row);
600 sgemm_nn_3l(row, row, row, A4, row, A2, row, A6, row);
601 smcopy(row, row, A2, row, U, row);
602 float *temp = malloc(row*row*sizeof(float));
603// sscal_(&row2, &c[9], U, &i1);
604 sscal_3l(row2, c[9], U);
605// saxpy_(&row2, &c[11], A4, &i1, U, &i1);
606 saxpy_3l(row2, c[11], A4, U);
607// saxpy_(&row2, &c[13], A6, &i1, U, &i1);
608 saxpy_3l(row2, c[13], A6, U);
609// sgemm_(&ta, &ta, &row, &row, &row, &alpha, A6, &row, U, &row, &beta, temp, &row);
610 sgemm_nn_3l(row, row, row, A6, row, U, row, temp, row);
611// saxpy_(&row2, &c[7], A6, &i1, temp, &i1);
612 saxpy_3l(row2, c[7], A6, temp);
613// saxpy_(&row2, &c[5], A4, &i1, temp, &i1);
614 saxpy_3l(row2, c[5], A4, temp);
615// saxpy_(&row2, &c[3], A2, &i1, temp, &i1);
616 saxpy_3l(row2, c[3], A2, temp);
617// saxpy_(&row2, &c[1], A0, &i1, temp, &i1);
618 saxpy_3l(row2, c[1], A0, temp);
619// sgemm_(&ta, &ta, &row, &row, &row, &alpha, A, &row, temp, &row, &beta, U, &row);
620 sgemm_nn_3l(row, row, row, A, row, temp, row, U, row);
621 smcopy(row, row, A2, row, temp, row);
622// sscal_(&row2, &c[8], V, &i1);
623 sscal_3l(row2, c[8], V);
624// saxpy_(&row2, &c[12], A6, &i1, temp, &i1);
625 saxpy_3l(row2, c[12], A6, temp);
626// saxpy_(&row2, &c[10], A4, &i1, temp, &i1);
627 saxpy_3l(row2, c[10], A4, temp);
628// sgemm_(&ta, &ta, &row, &row, &row, &alpha, A6, &row, temp, &row, &beta, V, &row);
629 sgemm_nn_3l(row, row, row, A6, row, temp, row, V, row);
630// saxpy_(&row2, &c[6], A6, &i1, V, &i1);
631 saxpy_3l(row2, c[6], A6, V);
632// saxpy_(&row2, &c[4], A4, &i1, V, &i1);
633 saxpy_3l(row2, c[4], A4, V);
634// saxpy_(&row2, &c[2], A2, &i1, V, &i1);
635 saxpy_3l(row2, c[2], A2, V);
636// saxpy_(&row2, &c[0], A0, &i1, V, &i1);
637 saxpy_3l(row2, c[0], A0, V);
638 free(A0);
639 free(A2);
640 free(A4);
641 free(A6);
642 free(temp);
643 }
644 else
645 {
646 printf("%s\n", "Wrong Pade approximatin degree");
647 exit(1);
648 }
649 float *D = malloc(row*row*sizeof(float));
650// dcopy_(&row2, V, &i1, A, &i1);
651 smcopy(row, row, V, row, A, row);
652// saxpy_(&row2, &d1, U, &i1, A, &i1);
653 saxpy_3l(row2, 1.0, U, A);
654// dcopy_(&row2, V, &i1, D, &i1);
655 smcopy(row, row, V, row, D, row);
656// saxpy_(&row2, &dm1, U, &i1, D, &i1);
657 saxpy_3l(row2, -1.0, U, D);
658 int *ipiv = (int *) malloc(row*sizeof(int));
659 int info = 0;
660// dgesv_(&row, &row, D, &row, ipiv, A, &row, &info);
661 sgesv_3l(row, row, D, row, ipiv, A, row, &info);
662 free(ipiv);
663 free(D);
664 free(U);
665 free(V);
666 }
667
668
669
670void expm(int row, float *A)
671 {
672
673 int i;
674
675 int m_vals[] = {3, 5, 7, 9, 13};
676 float theta[] = {0.01495585217958292, 0.2539398330063230, 0.9504178996162932, 2.097847961257068, 5.371920351148152};
677 int lentheta = 5;
678
679 float normA = onenorm(row, row, A);
680
681 if(normA<=theta[4])
682 {
683 for(i=0; i<lentheta; i++)
684 {
685 if(normA<=theta[i])
686 {
687 padeapprox(m_vals[i], row, A);
688 break;
689 }
690 }
691 }
692 else
693 {
694 int s;
695 float t = frexp(normA/(theta[4]), &s);
696 s = s - (t==0.5);
697 t = pow(2,-s);
698 int row2 = row*row;
699/* int i1 = 1;*/
700// sscal_(&row2, &t, A, &i1);
701 sscal_3l(row2, t, A);
702 padeapprox(m_vals[4], row, A);
703 float *temp = malloc(row*row*sizeof(float));
704// char ta = 'n'; float alpha = 1; float beta = 0;
705 for(i=0; i<s; i++)
706 {
707// sgemm_(&ta, &ta, &row, &row, &row, &alpha, A, &row, A, &row, &beta, temp, &row);
708 sgemm_nn_3l(row, row, row, A, row, A, row, temp, row);
709 smcopy(row, row, temp, row, A, row);
710 }
711 free(temp);
712 }
713 }
714
715