blob: 41a78c4e6d327025142c5ef441ef6d15e4969707 [file] [log] [blame]
Austin Schuh9a24b372018-01-28 16:12:29 -08001/**************************************************************************************************
2* *
3* This file is part of BLASFEO. *
4* *
5* BLASFEO -- BLAS For Embedded Optimization. *
6* Copyright (C) 2016-2017 by Gianluca Frison. *
7* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl. *
8* All rights reserved. *
9* *
10* HPMPC is free software; you can redistribute it and/or *
11* modify it under the terms of the GNU Lesser General Public *
12* License as published by the Free Software Foundation; either *
13* version 2.1 of the License, or (at your option) any later version. *
14* *
15* HPMPC is distributed in the hope that it will be useful, *
16* but WITHOUT ANY WARRANTY; without even the implied warranty of *
17* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. *
18* See the GNU Lesser General Public License for more details. *
19* *
20* You should have received a copy of the GNU Lesser General Public *
21* License along with HPMPC; if not, write to the Free Software *
22* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA *
23* *
24* Author: Gianluca Frison, giaf (at) dtu.dk *
25* gianluca.frison (at) imtek.uni-freiburg.de *
26* *
27**************************************************************************************************/
28
29#include <stdlib.h>
30#include <stdio.h>
31
32#include "../include/blasfeo_common.h"
33#include "../include/blasfeo_s_kernel.h"
34#include "../include/blasfeo_s_aux.h"
35
36
37
38#if defined(LA_HIGH_PERFORMANCE)
39
40
41
42void sgemv_n_libstr(int m, int n, float alpha, struct s_strmat *sA, int ai, int aj, struct s_strvec *sx, int xi, float beta, struct s_strvec *sy, int yi, struct s_strvec *sz, int zi)
43 {
44
45 if(m<0)
46 return;
47
48 const int bs = 8;
49
50 int i;
51
52 int sda = sA->cn;
53 float *pA = sA->pA + aj*bs + ai/bs*bs*sda;
54 float *x = sx->pa + xi;
55 float *y = sy->pa + yi;
56 float *z = sz->pa + zi;
57
58 i = 0;
59 // clean up at the beginning
60 if(ai%bs!=0)
61 {
62 kernel_sgemv_n_8_gen_lib8(n, &alpha, pA, x, &beta, y-ai%bs, z-ai%bs, ai%bs, m+ai%bs);
63 pA += bs*sda;
64 y += 8 - ai%bs;
65 z += 8 - ai%bs;
66 m -= 8 - ai%bs;
67 }
68 // main loop
69 for( ; i<m-7; i+=8)
70 {
71 kernel_sgemv_n_8_lib8(n, &alpha, &pA[i*sda], x, &beta, &y[i], &z[i]);
72 }
73 if(i<m)
74 {
75 kernel_sgemv_n_8_vs_lib8(n, &alpha, &pA[i*sda], x, &beta, &y[i], &z[i], m-i);
76 }
77
78 return;
79
80 }
81
82
83
84void sgemv_t_libstr(int m, int n, float alpha, struct s_strmat *sA, int ai, int aj, struct s_strvec *sx, int xi, float beta, struct s_strvec *sy, int yi, struct s_strvec *sz, int zi)
85 {
86
87 if(n<=0)
88 return;
89
90 const int bs = 8;
91
92 int i;
93
94 int sda = sA->cn;
95 float *pA = sA->pA + aj*bs + ai/bs*bs*sda + ai%bs;
96 float *x = sx->pa + xi;
97 float *y = sy->pa + yi;
98 float *z = sz->pa + zi;
99
100 if(ai%bs==0)
101 {
102 i = 0;
103 for( ; i<n-7; i+=8)
104 {
105 kernel_sgemv_t_8_lib8(m, &alpha, &pA[i*bs], sda, x, &beta, &y[i], &z[i]);
106 }
107 if(i<n)
108 {
109 if(n-i<=4)
110 {
111 kernel_sgemv_t_4_vs_lib8(m, &alpha, &pA[i*bs], sda, x, &beta, &y[i], &z[i], n-i);
112 }
113 else
114 {
115 kernel_sgemv_t_8_vs_lib8(m, &alpha, &pA[i*bs], sda, x, &beta, &y[i], &z[i], n-i);
116 }
117 }
118 }
119 else
120 {
121 i = 0;
122 for( ; i<n-4; i+=8)
123 {
124 kernel_sgemv_t_8_gen_lib8(m, &alpha, ai%bs, &pA[i*bs], sda, x, &beta, &y[i], &z[i], n-i);
125 }
126 if(i<n)
127 {
128 kernel_sgemv_t_4_gen_lib8(m, &alpha, ai%bs, &pA[i*bs], sda, x, &beta, &y[i], &z[i], n-i);
129 }
130 }
131
132 return;
133
134 }
135
136
137
138// m >= n
139void strmv_lnn_libstr(int m, int n, struct s_strmat *sA, int ai, int aj, struct s_strvec *sx, int xi, struct s_strvec *sz, int zi)
140 {
141
142 if(m<=0)
143 return;
144
145 const int bs = 8;
146
147 int sda = sA->cn;
148 float *pA = sA->pA + aj*bs + ai/bs*bs*sda + ai%bs;
149 float *x = sx->pa + xi;
150 float *z = sz->pa + zi;
151
152 if(m-n>0)
153 sgemv_n_libstr(m-n, n, 1.0, sA, ai+n, aj, sx, xi, 0.0, sz, zi+n, sz, zi+n);
154
155 float *pA2 = pA;
156 float *z2 = z;
157 int m2 = n;
158 int n2 = 0;
159 float *pA3, *x3;
160
161 float alpha = 1.0;
162 float beta = 1.0;
163
164 float zt[8];
165
166 int ii, jj, jj_end;
167
168 ii = 0;
169
170 if(ai%bs!=0)
171 {
172 pA2 += sda*bs - ai%bs;
173 z2 += bs-ai%bs;
174 m2 -= bs-ai%bs;
175 n2 += bs-ai%bs;
176 }
177
178 pA2 += m2/bs*bs*sda;
179 z2 += m2/bs*bs;
180 n2 += m2/bs*bs;
181
182 if(m2%bs!=0)
183 {
184 //
185 pA3 = pA2 + bs*n2;
186 x3 = x + n2;
187 zt[7] = pA3[7+bs*0]*x3[0] + pA3[7+bs*1]*x3[1] + pA3[7+bs*2]*x3[2] + pA3[7+bs*3]*x3[3] + pA3[7+bs*4]*x3[4] + pA3[7+bs*5]*x3[5] + pA3[7+bs*6]*x3[6] + pA3[7+bs*7]*x3[7];
188 zt[6] = pA3[6+bs*0]*x3[0] + pA3[6+bs*1]*x3[1] + pA3[6+bs*2]*x3[2] + pA3[6+bs*3]*x3[3] + pA3[6+bs*4]*x3[4] + pA3[6+bs*5]*x3[5] + pA3[6+bs*6]*x3[6];
189 zt[5] = pA3[5+bs*0]*x3[0] + pA3[5+bs*1]*x3[1] + pA3[5+bs*2]*x3[2] + pA3[5+bs*3]*x3[3] + pA3[5+bs*4]*x3[4] + pA3[5+bs*5]*x3[5];
190 zt[4] = pA3[4+bs*0]*x3[0] + pA3[4+bs*1]*x3[1] + pA3[4+bs*2]*x3[2] + pA3[4+bs*3]*x3[3] + pA3[4+bs*4]*x3[4];
191 zt[3] = pA3[3+bs*0]*x3[0] + pA3[3+bs*1]*x3[1] + pA3[3+bs*2]*x3[2] + pA3[3+bs*3]*x3[3];
192 zt[2] = pA3[2+bs*0]*x3[0] + pA3[2+bs*1]*x3[1] + pA3[2+bs*2]*x3[2];
193 zt[1] = pA3[1+bs*0]*x3[0] + pA3[1+bs*1]*x3[1];
194 zt[0] = pA3[0+bs*0]*x3[0];
195 kernel_sgemv_n_8_lib8(n2, &alpha, pA2, x, &beta, zt, zt);
196 for(jj=0; jj<m2%bs; jj++)
197 z2[jj] = zt[jj];
198 }
199 for(; ii<m2-7; ii+=8)
200 {
201 pA2 -= bs*sda;
202 z2 -= 8;
203 n2 -= 8;
204 pA3 = pA2 + bs*n2;
205 x3 = x + n2;
206 z2[7] = pA3[7+bs*0]*x3[0] + pA3[7+bs*1]*x3[1] + pA3[7+bs*2]*x3[2] + pA3[7+bs*3]*x3[3] + pA3[7+bs*4]*x3[4] + pA3[7+bs*5]*x3[5] + pA3[7+bs*6]*x3[6] + pA3[7+bs*7]*x3[7];
207 z2[6] = pA3[6+bs*0]*x3[0] + pA3[6+bs*1]*x3[1] + pA3[6+bs*2]*x3[2] + pA3[6+bs*3]*x3[3] + pA3[6+bs*4]*x3[4] + pA3[6+bs*5]*x3[5] + pA3[6+bs*6]*x3[6];
208 z2[5] = pA3[5+bs*0]*x3[0] + pA3[5+bs*1]*x3[1] + pA3[5+bs*2]*x3[2] + pA3[5+bs*3]*x3[3] + pA3[5+bs*4]*x3[4] + pA3[5+bs*5]*x3[5];
209 z2[4] = pA3[4+bs*0]*x3[0] + pA3[4+bs*1]*x3[1] + pA3[4+bs*2]*x3[2] + pA3[4+bs*3]*x3[3] + pA3[4+bs*4]*x3[4];
210 z2[3] = pA3[3+bs*0]*x3[0] + pA3[3+bs*1]*x3[1] + pA3[3+bs*2]*x3[2] + pA3[3+bs*3]*x3[3];
211 z2[2] = pA3[2+bs*0]*x3[0] + pA3[2+bs*1]*x3[1] + pA3[2+bs*2]*x3[2];
212 z2[1] = pA3[1+bs*0]*x3[0] + pA3[1+bs*1]*x3[1];
213 z2[0] = pA3[0+bs*0]*x3[0];
214 kernel_sgemv_n_8_lib8(n2, &alpha, pA2, x, &beta, z2, z2);
215 }
216 if(ai%bs!=0)
217 {
218 if(ai%bs==1)
219 {
220 zt[6] = pA[6+bs*0]*x[0] + pA[6+bs*1]*x[1] + pA[6+bs*2]*x[2] + pA[6+bs*3]*x[3] + pA[6+bs*4]*x[4] + pA[6+bs*5]*x[5] + pA[6+bs*6]*x[6];
221 zt[5] = pA[5+bs*0]*x[0] + pA[5+bs*1]*x[1] + pA[5+bs*2]*x[2] + pA[5+bs*3]*x[3] + pA[5+bs*4]*x[4] + pA[5+bs*5]*x[5];
222 zt[4] = pA[4+bs*0]*x[0] + pA[4+bs*1]*x[1] + pA[4+bs*2]*x[2] + pA[4+bs*3]*x[3] + pA[4+bs*4]*x[4];
223 zt[3] = pA[3+bs*0]*x[0] + pA[3+bs*1]*x[1] + pA[3+bs*2]*x[2] + pA[3+bs*3]*x[3];
224 zt[2] = pA[2+bs*0]*x[0] + pA[2+bs*1]*x[1] + pA[2+bs*2]*x[2];
225 zt[1] = pA[1+bs*0]*x[0] + pA[1+bs*1]*x[1];
226 zt[0] = pA[0+bs*0]*x[0];
227 jj_end = 8-ai%bs<n ? 8-ai%bs : n;
228 for(jj=0; jj<jj_end; jj++)
229 z[jj] = zt[jj];
230 }
231 else if(ai%bs==2)
232 {
233 zt[5] = pA[5+bs*0]*x[0] + pA[5+bs*1]*x[1] + pA[5+bs*2]*x[2] + pA[5+bs*3]*x[3] + pA[5+bs*4]*x[4] + pA[5+bs*5]*x[5];
234 zt[4] = pA[4+bs*0]*x[0] + pA[4+bs*1]*x[1] + pA[4+bs*2]*x[2] + pA[4+bs*3]*x[3] + pA[4+bs*4]*x[4];
235 zt[3] = pA[3+bs*0]*x[0] + pA[3+bs*1]*x[1] + pA[3+bs*2]*x[2] + pA[3+bs*3]*x[3];
236 zt[2] = pA[2+bs*0]*x[0] + pA[2+bs*1]*x[1] + pA[2+bs*2]*x[2];
237 zt[1] = pA[1+bs*0]*x[0] + pA[1+bs*1]*x[1];
238 zt[0] = pA[0+bs*0]*x[0];
239 jj_end = 8-ai%bs<n ? 8-ai%bs : n;
240 for(jj=0; jj<jj_end; jj++)
241 z[jj] = zt[jj];
242 }
243 else if(ai%bs==3)
244 {
245 zt[4] = pA[4+bs*0]*x[0] + pA[4+bs*1]*x[1] + pA[4+bs*2]*x[2] + pA[4+bs*3]*x[3] + pA[4+bs*4]*x[4];
246 zt[3] = pA[3+bs*0]*x[0] + pA[3+bs*1]*x[1] + pA[3+bs*2]*x[2] + pA[3+bs*3]*x[3];
247 zt[2] = pA[2+bs*0]*x[0] + pA[2+bs*1]*x[1] + pA[2+bs*2]*x[2];
248 zt[1] = pA[1+bs*0]*x[0] + pA[1+bs*1]*x[1];
249 zt[0] = pA[0+bs*0]*x[0];
250 jj_end = 8-ai%bs<n ? 8-ai%bs : n;
251 for(jj=0; jj<jj_end; jj++)
252 z[jj] = zt[jj];
253 }
254 else if(ai%bs==4)
255 {
256 zt[3] = pA[3+bs*0]*x[0] + pA[3+bs*1]*x[1] + pA[3+bs*2]*x[2] + pA[3+bs*3]*x[3];
257 zt[2] = pA[2+bs*0]*x[0] + pA[2+bs*1]*x[1] + pA[2+bs*2]*x[2];
258 zt[1] = pA[1+bs*0]*x[0] + pA[1+bs*1]*x[1];
259 zt[0] = pA[0+bs*0]*x[0];
260 jj_end = 8-ai%bs<n ? 8-ai%bs : n;
261 for(jj=0; jj<jj_end; jj++)
262 z[jj] = zt[jj];
263 }
264 else if(ai%bs==5)
265 {
266 zt[2] = pA[2+bs*0]*x[0] + pA[2+bs*1]*x[1] + pA[2+bs*2]*x[2];
267 zt[1] = pA[1+bs*0]*x[0] + pA[1+bs*1]*x[1];
268 zt[0] = pA[0+bs*0]*x[0];
269 jj_end = 8-ai%bs<n ? 8-ai%bs : n;
270 for(jj=0; jj<jj_end; jj++)
271 z[jj] = zt[jj];
272 }
273 else if(ai%bs==6)
274 {
275 zt[1] = pA[1+bs*0]*x[0] + pA[1+bs*1]*x[1];
276 zt[0] = pA[0+bs*0]*x[0];
277 jj_end = 8-ai%bs<n ? 8-ai%bs : n;
278 for(jj=0; jj<jj_end; jj++)
279 z[jj] = zt[jj];
280 }
281 else // if (ai%bs==7)
282 {
283 z[0] = pA[0+bs*0]*x[0];
284 }
285 }
286
287 return;
288
289 }
290
291
292
293// m >= n
294void strmv_ltn_libstr(int m, int n, struct s_strmat *sA, int ai, int aj, struct s_strvec *sx, int xi, struct s_strvec *sz, int zi)
295 {
296
297 if(m<=0)
298 return;
299
300 const int bs = 8;
301
302 int sda = sA->cn;
303 float *pA = sA->pA + aj*bs + ai/bs*bs*sda + ai%bs;
304 float *x = sx->pa + xi;
305 float *z = sz->pa + zi;
306
307 float xt[8];
308 float zt[8];
309
310 float alpha = 1.0;
311 float beta = 1.0;
312
313 int ii, jj, ll, ll_max;
314
315 jj = 0;
316
317 if(ai%bs!=0)
318 {
319
320 if(ai%bs==1)
321 {
322 ll_max = m-jj<7 ? m-jj : 7;
323 for(ll=0; ll<ll_max; ll++)
324 xt[ll] = x[ll];
325 for(; ll<7; ll++)
326 xt[ll] = 0.0;
327 zt[0] = pA[0+bs*0]*xt[0] + pA[1+bs*0]*xt[1] + pA[2+bs*0]*xt[2] + pA[3+bs*0]*xt[3] + pA[4+bs*0]*xt[4] + pA[5+bs*0]*xt[5] + pA[6+bs*0]*xt[6];
328 zt[1] = pA[1+bs*1]*xt[1] + pA[2+bs*1]*xt[2] + pA[3+bs*1]*xt[3] + pA[4+bs*1]*xt[4] + pA[5+bs*1]*xt[5] + pA[6+bs*1]*xt[6];
329 zt[2] = pA[2+bs*2]*xt[2] + pA[3+bs*2]*xt[3] + pA[4+bs*2]*xt[4] + pA[5+bs*2]*xt[5] + pA[6+bs*2]*xt[6];
330 zt[3] = pA[3+bs*3]*xt[3] + pA[4+bs*3]*xt[4] + pA[5+bs*3]*xt[5] + pA[6+bs*3]*xt[6];
331 zt[4] = pA[4+bs*4]*xt[4] + pA[5+bs*4]*xt[5] + pA[6+bs*4]*xt[6];
332 zt[5] = pA[5+bs*5]*xt[5] + pA[6+bs*5]*xt[6];
333 zt[6] = pA[6+bs*6]*xt[6];
334 pA += bs*sda - 1;
335 x += 7;
336 kernel_sgemv_t_8_lib8(m-7-jj, &alpha, pA, sda, x, &beta, zt, zt);
337 ll_max = n-jj<7 ? n-jj : 7;
338 for(ll=0; ll<ll_max; ll++)
339 z[ll] = zt[ll];
340 pA += bs*7;
341 z += 7;
342 jj += 7;
343 }
344 else if(ai%bs==2)
345 {
346 ll_max = m-jj<6 ? m-jj : 6;
347 for(ll=0; ll<ll_max; ll++)
348 xt[ll] = x[ll];
349 for(; ll<6; ll++)
350 xt[ll] = 0.0;
351 zt[0] = pA[0+bs*0]*xt[0] + pA[1+bs*0]*xt[1] + pA[2+bs*0]*xt[2] + pA[3+bs*0]*xt[3] + pA[4+bs*0]*xt[4] + pA[5+bs*0]*xt[5];
352 zt[1] = pA[1+bs*1]*xt[1] + pA[2+bs*1]*xt[2] + pA[3+bs*1]*xt[3] + pA[4+bs*1]*xt[4] + pA[5+bs*1]*xt[5];
353 zt[2] = pA[2+bs*2]*xt[2] + pA[3+bs*2]*xt[3] + pA[4+bs*2]*xt[4] + pA[5+bs*2]*xt[5];
354 zt[3] = pA[3+bs*3]*xt[3] + pA[4+bs*3]*xt[4] + pA[5+bs*3]*xt[5];
355 zt[4] = pA[4+bs*4]*xt[4] + pA[5+bs*4]*xt[5];
356 zt[5] = pA[5+bs*5]*xt[5];
357 pA += bs*sda - 2;
358 x += 6;
359 kernel_sgemv_t_8_lib8(m-6-jj, &alpha, pA, sda, x, &beta, zt, zt);
360 ll_max = n-jj<6 ? n-jj : 6;
361 for(ll=0; ll<ll_max; ll++)
362 z[ll] = zt[ll];
363 pA += bs*6;
364 z += 6;
365 jj += 6;
366 }
367 else if(ai%bs==3)
368 {
369 ll_max = m-jj<5 ? m-jj : 5;
370 for(ll=0; ll<ll_max; ll++)
371 xt[ll] = x[ll];
372 for(; ll<5; ll++)
373 xt[ll] = 0.0;
374 zt[0] = pA[0+bs*0]*xt[0] + pA[1+bs*0]*xt[1] + pA[2+bs*0]*xt[2] + pA[3+bs*0]*xt[3] + pA[4+bs*0]*xt[4];
375 zt[1] = pA[1+bs*1]*xt[1] + pA[2+bs*1]*xt[2] + pA[3+bs*1]*xt[3] + pA[4+bs*1]*xt[4];
376 zt[2] = pA[2+bs*2]*xt[2] + pA[3+bs*2]*xt[3] + pA[4+bs*2]*xt[4];
377 zt[3] = pA[3+bs*3]*xt[3] + pA[4+bs*3]*xt[4];
378 zt[4] = pA[4+bs*4]*xt[4];
379 pA += bs*sda - 3;
380 x += 5;
381 kernel_sgemv_t_8_lib8(m-5-jj, &alpha, pA, sda, x, &beta, zt, zt);
382 ll_max = n-jj<5 ? n-jj : 5;
383 for(ll=0; ll<ll_max; ll++)
384 z[ll] = zt[ll];
385 pA += bs*5;
386 z += 5;
387 jj += 5;
388 }
389 else if(ai%bs==4)
390 {
391 ll_max = m-jj<4 ? m-jj : 4;
392 for(ll=0; ll<ll_max; ll++)
393 xt[ll] = x[ll];
394 for(; ll<4; ll++)
395 xt[ll] = 0.0;
396 zt[0] = pA[0+bs*0]*xt[0] + pA[1+bs*0]*xt[1] + pA[2+bs*0]*xt[2] + pA[3+bs*0]*xt[3];
397 zt[1] = pA[1+bs*1]*xt[1] + pA[2+bs*1]*xt[2] + pA[3+bs*1]*xt[3];
398 zt[2] = pA[2+bs*2]*xt[2] + pA[3+bs*2]*xt[3];
399 zt[3] = pA[3+bs*3]*xt[3];
400 pA += bs*sda - 4;
401 x += 4;
402 kernel_sgemv_t_8_lib8(m-4-jj, &alpha, pA, sda, x, &beta, zt, zt);
403 ll_max = n-jj<4 ? n-jj : 4;
404 for(ll=0; ll<ll_max; ll++)
405 z[ll] = zt[ll];
406 pA += bs*4;
407 z += 4;
408 jj += 4;
409 }
410 else if(ai%bs==5)
411 {
412 ll_max = m-jj<3 ? m-jj : 3;
413 for(ll=0; ll<ll_max; ll++)
414 xt[ll] = x[ll];
415 for(; ll<3; ll++)
416 xt[ll] = 0.0;
417 zt[0] = pA[0+bs*0]*xt[0] + pA[1+bs*0]*xt[1] + pA[2+bs*0]*xt[2];
418 zt[1] = pA[1+bs*1]*xt[1] + pA[2+bs*1]*xt[2];
419 zt[2] = pA[2+bs*2]*xt[2];
420 pA += bs*sda - 5;
421 x += 3;
422 kernel_sgemv_t_8_lib8(m-3-jj, &alpha, pA, sda, x, &beta, zt, zt);
423 ll_max = n-jj<3 ? n-jj : 3;
424 for(ll=0; ll<ll_max; ll++)
425 z[ll] = zt[ll];
426 pA += bs*3;
427 z += 3;
428 jj += 3;
429 }
430 else if(ai%bs==6)
431 {
432 ll_max = m-jj<2 ? m-jj : 2;
433 for(ll=0; ll<ll_max; ll++)
434 xt[ll] = x[ll];
435 for(; ll<2; ll++)
436 xt[ll] = 0.0;
437 zt[0] = pA[0+bs*0]*xt[0] + pA[1+bs*0]*xt[1];
438 zt[1] = pA[1+bs*1]*xt[1];
439 pA += bs*sda - 6;
440 x += 2;
441 kernel_sgemv_t_8_lib8(m-2-jj, &alpha, pA, sda, x, &beta, zt, zt);
442 ll_max = n-jj<2 ? n-jj : 2;
443 for(ll=0; ll<ll_max; ll++)
444 z[ll] = zt[ll];
445 pA += bs*2;
446 z += 2;
447 jj += 2;
448 }
449 else // if(ai%bs==7)
450 {
451 ll_max = m-jj<1 ? m-jj : 1;
452 for(ll=0; ll<ll_max; ll++)
453 xt[ll] = x[ll];
454 for(; ll<1; ll++)
455 xt[ll] = 0.0;
456 zt[0] = pA[0+bs*0]*xt[0];
457 pA += bs*sda - 7;
458 x += 1;
459 kernel_sgemv_t_8_lib8(m-1-jj, &alpha, pA, sda, x, &beta, zt, zt);
460 ll_max = n-jj<1 ? n-jj : 1;
461 for(ll=0; ll<ll_max; ll++)
462 z[ll] = zt[ll];
463 pA += bs*1;
464 z += 1;
465 jj += 1;
466 }
467
468 }
469
470 for(; jj<n-7; jj+=8)
471 {
472 zt[0] = pA[0+bs*0]*x[0] + pA[1+bs*0]*x[1] + pA[2+bs*0]*x[2] + pA[3+bs*0]*x[3] + pA[4+bs*0]*x[4] + pA[5+bs*0]*x[5] + pA[6+bs*0]*x[6] + pA[7+bs*0]*x[7];
473 zt[1] = pA[1+bs*1]*x[1] + pA[2+bs*1]*x[2] + pA[3+bs*1]*x[3] + pA[4+bs*1]*x[4] + pA[5+bs*1]*x[5] + pA[6+bs*1]*x[6] + pA[7+bs*1]*x[7];
474 zt[2] = pA[2+bs*2]*x[2] + pA[3+bs*2]*x[3] + pA[4+bs*2]*x[4] + pA[5+bs*2]*x[5] + pA[6+bs*2]*x[6] + pA[7+bs*2]*x[7];
475 zt[3] = pA[3+bs*3]*x[3] + pA[4+bs*3]*x[4] + pA[5+bs*3]*x[5] + pA[6+bs*3]*x[6] + pA[7+bs*3]*x[7];
476 zt[4] = pA[4+bs*4]*x[4] + pA[5+bs*4]*x[5] + pA[6+bs*4]*x[6] + pA[7+bs*4]*x[7];
477 zt[5] = pA[5+bs*5]*x[5] + pA[6+bs*5]*x[6] + pA[7+bs*5]*x[7];
478 zt[6] = pA[6+bs*6]*x[6] + pA[7+bs*6]*x[7];
479 zt[7] = pA[7+bs*7]*x[7];
480 pA += bs*sda;
481 x += 8;
482 kernel_sgemv_t_8_lib8(m-8-jj, &alpha, pA, sda, x, &beta, zt, z);
483 pA += bs*8;
484 z += 8;
485 }
486 if(jj<n)
487 {
488 if(n-jj<=4)
489 {
490 ll_max = m-jj<4 ? m-jj : 4;
491 for(ll=0; ll<ll_max; ll++)
492 xt[ll] = x[ll];
493 for(; ll<4; ll++)
494 xt[ll] = 0.0;
495 zt[0] = pA[0+bs*0]*xt[0] + pA[1+bs*0]*xt[1] + pA[2+bs*0]*xt[2] + pA[3+bs*0]*xt[3];
496 zt[1] = pA[1+bs*1]*xt[1] + pA[2+bs*1]*xt[2] + pA[3+bs*1]*xt[3];
497 zt[2] = pA[2+bs*2]*xt[2] + pA[3+bs*2]*xt[3];
498 zt[3] = pA[3+bs*3]*xt[3];
499 pA += bs*sda;
500 x += 4;
501 kernel_sgemv_t_4_lib8(m-4-jj, &alpha, pA, sda, x, &beta, zt, zt);
502 for(ll=0; ll<n-jj; ll++)
503 z[ll] = zt[ll];
504// pA += bs*4;
505// z += 4;
506 }
507 else
508 {
509 ll_max = m-jj<8 ? m-jj : 8;
510 for(ll=0; ll<ll_max; ll++)
511 xt[ll] = x[ll];
512 for(; ll<8; ll++)
513 xt[ll] = 0.0;
514 zt[0] = pA[0+bs*0]*xt[0] + pA[1+bs*0]*xt[1] + pA[2+bs*0]*xt[2] + pA[3+bs*0]*xt[3] + pA[4+bs*0]*xt[4] + pA[5+bs*0]*xt[5] + pA[6+bs*0]*xt[6] + pA[7+bs*0]*xt[7];
515 zt[1] = pA[1+bs*1]*xt[1] + pA[2+bs*1]*xt[2] + pA[3+bs*1]*xt[3] + pA[4+bs*1]*xt[4] + pA[5+bs*1]*xt[5] + pA[6+bs*1]*xt[6] + pA[7+bs*1]*xt[7];
516 zt[2] = pA[2+bs*2]*xt[2] + pA[3+bs*2]*xt[3] + pA[4+bs*2]*xt[4] + pA[5+bs*2]*xt[5] + pA[6+bs*2]*xt[6] + pA[7+bs*2]*xt[7];
517 zt[3] = pA[3+bs*3]*xt[3] + pA[4+bs*3]*xt[4] + pA[5+bs*3]*xt[5] + pA[6+bs*3]*xt[6] + pA[7+bs*3]*xt[7];
518 zt[4] = pA[4+bs*4]*xt[4] + pA[5+bs*4]*xt[5] + pA[6+bs*4]*xt[6] + pA[7+bs*4]*xt[7];
519 zt[5] = pA[5+bs*5]*xt[5] + pA[6+bs*5]*xt[6] + pA[7+bs*5]*xt[7];
520 zt[6] = pA[6+bs*6]*xt[6] + pA[7+bs*6]*xt[7];
521 zt[7] = pA[7+bs*7]*xt[7];
522 pA += bs*sda;
523 x += 8;
524 kernel_sgemv_t_8_lib8(m-8-jj, &alpha, pA, sda, x, &beta, zt, zt);
525 for(ll=0; ll<n-jj; ll++)
526 z[ll] = zt[ll];
527// pA += bs*8;
528// z += 8;
529 }
530 }
531
532 return;
533
534 }
535
536
537
538void strsv_lnn_libstr(int m, struct s_strmat *sA, int ai, int aj, struct s_strvec *sx, int xi, struct s_strvec *sz, int zi)
539 {
540
541 if(m==0)
542 return;
543
544#if defined(DIM_CHECK)
545 // non-negative size
546 if(m<0) printf("\n****** strsv_lnn_libstr : m<0 : %d<0 *****\n", m);
547 // non-negative offset
548 if(ai<0) printf("\n****** strsv_lnn_libstr : ai<0 : %d<0 *****\n", ai);
549 if(aj<0) printf("\n****** strsv_lnn_libstr : aj<0 : %d<0 *****\n", aj);
550 if(xi<0) printf("\n****** strsv_lnn_libstr : xi<0 : %d<0 *****\n", xi);
551 if(zi<0) printf("\n****** strsv_lnn_libstr : zi<0 : %d<0 *****\n", zi);
552 // inside matrix
553 // A: m x k
554 if(ai+m > sA->m) printf("\n***** strsv_lnn_libstr : ai+m > row(A) : %d+%d > %d *****\n", ai, m, sA->m);
555 if(aj+m > sA->n) printf("\n***** strsv_lnn_libstr : aj+m > col(A) : %d+%d > %d *****\n", aj, m, sA->n);
556 // x: m
557 if(xi+m > sx->m) printf("\n***** strsv_lnn_libstr : xi+m > size(x) : %d+%d > %d *****\n", xi, m, sx->m);
558 // z: m
559 if(zi+m > sz->m) printf("\n***** strsv_lnn_libstr : zi+m > size(z) : %d+%d > %d *****\n", zi, m, sz->m);
560#endif
561
562 if(ai!=0)
563 {
564 printf("\nstrsv_lnn_libstr: feature not implemented yet: ai=%d\n", ai);
565 exit(1);
566 }
567
568 const int bs = 8;
569
570 int sda = sA->cn;
571 float *pA = sA->pA + aj*bs; // TODO ai
572 float *dA = sA->dA;
573 float *x = sx->pa + xi;
574 float *z = sz->pa + zi;
575
576 int ii;
577
578 if(ai==0 & aj==0)
579 {
580 if(sA->use_dA!=1)
581 {
582 sdiaex_lib(m, 1.0, ai, pA, sda, dA);
583 for(ii=0; ii<m; ii++)
584 dA[ii] = 1.0 / dA[ii];
585 sA->use_dA = 1;
586 }
587 }
588 else
589 {
590 sdiaex_lib(m, 1.0, ai, pA, sda, dA);
591 for(ii=0; ii<m; ii++)
592 dA[ii] = 1.0 / dA[ii];
593 sA->use_dA = 0;
594 }
595
596 int i;
597
598 if(x!=z)
599 {
600 for(i=0; i<m; i++)
601 z[i] = x[i];
602 }
603
604 i = 0;
605 for( ; i<m-7; i+=8)
606 {
607 kernel_strsv_ln_inv_8_lib8(i, &pA[i*sda], &dA[i], z, &z[i], &z[i]);
608 }
609 if(i<m)
610 {
611 kernel_strsv_ln_inv_8_vs_lib8(i, &pA[i*sda], &dA[i], z, &z[i], &z[i], m-i, m-i);
612 i+=8;
613 }
614
615 return;
616
617 }
618
619
620
621void strsv_lnn_mn_libstr(int m, int n, struct s_strmat *sA, int ai, int aj, struct s_strvec *sx, int xi, struct s_strvec *sz, int zi)
622 {
623
624 if(m==0 | n==0)
625 return;
626
627#if defined(DIM_CHECK)
628 // non-negative size
629 if(m<0) printf("\n****** strsv_lnn_mn_libstr : m<0 : %d<0 *****\n", m);
630 if(n<0) printf("\n****** strsv_lnn_mn_libstr : n<0 : %d<0 *****\n", n);
631 // non-negative offset
632 if(ai<0) printf("\n****** strsv_lnn_mn_libstr : ai<0 : %d<0 *****\n", ai);
633 if(aj<0) printf("\n****** strsv_lnn_mn_libstr : aj<0 : %d<0 *****\n", aj);
634 if(xi<0) printf("\n****** strsv_lnn_mn_libstr : xi<0 : %d<0 *****\n", xi);
635 if(zi<0) printf("\n****** strsv_lnn_mn_libstr : zi<0 : %d<0 *****\n", zi);
636 // inside matrix
637 // A: m x k
638 if(ai+m > sA->m) printf("\n***** strsv_lnn_mn_libstr : ai+m > row(A) : %d+%d > %d *****\n", ai, m, sA->m);
639 if(aj+n > sA->n) printf("\n***** strsv_lnn_mn_libstr : aj+n > col(A) : %d+%d > %d *****\n", aj, n, sA->n);
640 // x: m
641 if(xi+m > sx->m) printf("\n***** strsv_lnn_mn_libstr : xi+m > size(x) : %d+%d > %d *****\n", xi, m, sx->m);
642 // z: m
643 if(zi+m > sz->m) printf("\n***** strsv_lnn_mn_libstr : zi+m > size(z) : %d+%d > %d *****\n", zi, m, sz->m);
644#endif
645
646 if(ai!=0)
647 {
648 printf("\nstrsv_lnn_mn_libstr: feature not implemented yet: ai=%d\n", ai);
649 exit(1);
650 }
651
652 const int bs = 8;
653
654 int sda = sA->cn;
655 float *pA = sA->pA + aj*bs; // TODO ai
656 float *dA = sA->dA;
657 float *x = sx->pa + xi;
658 float *z = sz->pa + zi;
659
660 int ii;
661
662 if(ai==0 & aj==0)
663 {
664 if(sA->use_dA!=1)
665 {
666 sdiaex_lib(n, 1.0, ai, pA, sda, dA);
667 for(ii=0; ii<n; ii++)
668 dA[ii] = 1.0 / dA[ii];
669 sA->use_dA = 1;
670 }
671 }
672 else
673 {
674 sdiaex_lib(n, 1.0, ai, pA, sda, dA);
675 for(ii=0; ii<n; ii++)
676 dA[ii] = 1.0 / dA[ii];
677 sA->use_dA = 0;
678 }
679
680 if(m<n)
681 m = n;
682
683 float alpha = -1.0;
684 float beta = 1.0;
685
686 int i;
687
688 if(x!=z)
689 {
690 for(i=0; i<m; i++)
691 z[i] = x[i];
692 }
693
694 i = 0;
695 for( ; i<n-7; i+=8)
696 {
697 kernel_strsv_ln_inv_8_lib8(i, &pA[i*sda], &dA[i], z, &z[i], &z[i]);
698 }
699 if(i<n)
700 {
701 kernel_strsv_ln_inv_8_vs_lib8(i, &pA[i*sda], &dA[i], z, &z[i], &z[i], m-i, n-i);
702 i+=8;
703 }
704 for( ; i<m-7; i+=8)
705 {
706 kernel_sgemv_n_8_lib8(n, &alpha, &pA[i*sda], z, &beta, &z[i], &z[i]);
707 }
708 if(i<m)
709 {
710 kernel_sgemv_n_8_vs_lib8(n, &alpha, &pA[i*sda], z, &beta, &z[i], &z[i], m-i);
711 i+=8;
712 }
713
714 return;
715
716 }
717
718
719
720void strsv_ltn_libstr(int m, struct s_strmat *sA, int ai, int aj, struct s_strvec *sx, int xi, struct s_strvec *sz, int zi)
721 {
722
723 if(m==0)
724 return;
725
726#if defined(DIM_CHECK)
727 // non-negative size
728 if(m<0) printf("\n****** strsv_ltn_libstr : m<0 : %d<0 *****\n", m);
729 // non-negative offset
730 if(ai<0) printf("\n****** strsv_ltn_libstr : ai<0 : %d<0 *****\n", ai);
731 if(aj<0) printf("\n****** strsv_ltn_libstr : aj<0 : %d<0 *****\n", aj);
732 if(xi<0) printf("\n****** strsv_ltn_libstr : xi<0 : %d<0 *****\n", xi);
733 if(zi<0) printf("\n****** strsv_ltn_libstr : zi<0 : %d<0 *****\n", zi);
734 // inside matrix
735 // A: m x k
736 if(ai+m > sA->m) printf("\n***** strsv_ltn_libstr : ai+m > row(A) : %d+%d > %d *****\n", ai, m, sA->m);
737 if(aj+m > sA->n) printf("\n***** strsv_ltn_libstr : aj+m > col(A) : %d+%d > %d *****\n", aj, m, sA->n);
738 // x: m
739 if(xi+m > sx->m) printf("\n***** strsv_ltn_libstr : xi+m > size(x) : %d+%d > %d *****\n", xi, m, sx->m);
740 // z: m
741 if(zi+m > sz->m) printf("\n***** strsv_ltn_libstr : zi+m > size(z) : %d+%d > %d *****\n", zi, m, sz->m);
742#endif
743
744 if(ai!=0)
745 {
746 printf("\nstrsv_ltn_libstr: feature not implemented yet: ai=%d\n", ai);
747 exit(1);
748 }
749
750 const int bs = 8;
751
752 int sda = sA->cn;
753 float *pA = sA->pA + aj*bs; // TODO ai
754 float *dA = sA->dA;
755 float *x = sx->pa + xi;
756 float *z = sz->pa + zi;
757
758 int ii;
759
760 if(ai==0 & aj==0)
761 {
762 if(sA->use_dA!=1)
763 {
764 sdiaex_lib(m, 1.0, ai, pA, sda, dA);
765 for(ii=0; ii<m; ii++)
766 dA[ii] = 1.0 / dA[ii];
767 sA->use_dA = 1;
768 }
769 }
770 else
771 {
772 sdiaex_lib(m, 1.0, ai, pA, sda, dA);
773 for(ii=0; ii<m; ii++)
774 dA[ii] = 1.0 / dA[ii];
775 sA->use_dA = 0;
776 }
777
778 int i, i1;
779
780 if(x!=z)
781 for(i=0; i<m; i++)
782 z[i] = x[i];
783
784 i=0;
785 i1 = m%8;
786 if(i1!=0)
787 {
788 kernel_strsv_lt_inv_8_vs_lib8(i+i1, &pA[m/bs*bs*sda+(m-i-i1)*bs], sda, &dA[m-i-i1], &z[m-i-i1], &z[m-i-i1], &z[m-i-i1], i1, i1);
789 i += i1;
790 }
791 for(; i<m-7; i+=8)
792 {
793 kernel_strsv_lt_inv_8_lib8(i+8, &pA[(m-i-8)/bs*bs*sda+(m-i-8)*bs], sda, &dA[m-i-8], &z[m-i-8], &z[m-i-8], &z[m-i-8]);
794 }
795
796 return;
797
798 }
799
800
801
802void strsv_ltn_mn_libstr(int m, int n, struct s_strmat *sA, int ai, int aj, struct s_strvec *sx, int xi, struct s_strvec *sz, int zi)
803 {
804
805 if(m==0)
806 return;
807
808#if defined(DIM_CHECK)
809 // non-negative size
810 if(m<0) printf("\n****** strsv_ltn_mn_libstr : m<0 : %d<0 *****\n", m);
811 if(n<0) printf("\n****** strsv_ltn_mn_libstr : n<0 : %d<0 *****\n", n);
812 // non-negative offset
813 if(ai<0) printf("\n****** strsv_ltn_mn_libstr : ai<0 : %d<0 *****\n", ai);
814 if(aj<0) printf("\n****** strsv_ltn_mn_libstr : aj<0 : %d<0 *****\n", aj);
815 if(xi<0) printf("\n****** strsv_ltn_mn_libstr : xi<0 : %d<0 *****\n", xi);
816 if(zi<0) printf("\n****** strsv_ltn_mn_libstr : zi<0 : %d<0 *****\n", zi);
817 // inside matrix
818 // A: m x k
819 if(ai+m > sA->m) printf("\n***** strsv_ltn_mn_libstr : ai+m > row(A) : %d+%d > %d *****\n", ai, m, sA->m);
820 if(aj+n > sA->n) printf("\n***** strsv_ltn_mn_libstr : aj+n > col(A) : %d+%d > %d *****\n", aj, n, sA->n);
821 // x: m
822 if(xi+m > sx->m) printf("\n***** strsv_ltn_mn_libstr : xi+m > size(x) : %d+%d > %d *****\n", xi, m, sx->m);
823 // z: m
824 if(zi+m > sz->m) printf("\n***** strsv_ltn_mn_libstr : zi+m > size(z) : %d+%d > %d *****\n", zi, m, sz->m);
825#endif
826
827 if(ai!=0)
828 {
829 printf("\nstrsv_ltn_mn_libstr: feature not implemented yet: ai=%d\n", ai);
830 exit(1);
831 }
832
833 const int bs = 8;
834
835 int sda = sA->cn;
836 float *pA = sA->pA + aj*bs; // TODO ai
837 float *dA = sA->dA;
838 float *x = sx->pa + xi;
839 float *z = sz->pa + zi;
840
841 int ii;
842
843 if(ai==0 & aj==0)
844 {
845 if(sA->use_dA!=1)
846 {
847 sdiaex_lib(n, 1.0, ai, pA, sda, dA);
848 for(ii=0; ii<n; ii++)
849 dA[ii] = 1.0 / dA[ii];
850 sA->use_dA = 1;
851 }
852 }
853 else
854 {
855 sdiaex_lib(n, 1.0, ai, pA, sda, dA);
856 for(ii=0; ii<n; ii++)
857 dA[ii] = 1.0 / dA[ii];
858 sA->use_dA = 0;
859 }
860
861 if(n>m)
862 n = m;
863
864 int i, i1;
865
866 if(x!=z)
867 for(i=0; i<m; i++)
868 z[i] = x[i];
869
870 i=0;
871 i1 = n%8;
872 if(i1!=0)
873 {
874 kernel_strsv_lt_inv_8_vs_lib8(m-n+i1, &pA[n/bs*bs*sda+(n-i1)*bs], sda, &dA[n-i1], &z[n-i1], &z[n-i1], &z[n-i1], m-n+i1, i1);
875 i += i1;
876 }
877 for(; i<n-7; i+=8)
878 {
879 kernel_strsv_lt_inv_8_lib8(m-n+i+8, &pA[(n-i-8)/bs*bs*sda+(n-i-8)*bs], sda, &dA[n-i-8], &z[n-i-8], &z[n-i-8], &z[n-i-8]);
880 }
881
882 return;
883
884 }
885
886
887
888void sgemv_nt_libstr(int m, int n, float alpha_n, float alpha_t, struct s_strmat *sA, int ai, int aj, struct s_strvec *sx_n, int xi_n, struct s_strvec *sx_t, int xi_t, float beta_n, float beta_t, struct s_strvec *sy_n, int yi_n, struct s_strvec *sy_t, int yi_t, struct s_strvec *sz_n, int zi_n, struct s_strvec *sz_t, int zi_t)
889 {
890
891 if(ai!=0)
892 {
893 printf("\nsgemv_nt_libstr: feature not implemented yet: ai=%d\n", ai);
894 exit(1);
895 }
896
897 const int bs = 8;
898
899 int sda = sA->cn;
900 float *pA = sA->pA + aj*bs; // TODO ai
901 float *x_n = sx_n->pa + xi_n;
902 float *x_t = sx_t->pa + xi_t;
903 float *y_n = sy_n->pa + yi_n;
904 float *y_t = sy_t->pa + yi_t;
905 float *z_n = sz_n->pa + zi_n;
906 float *z_t = sz_t->pa + zi_t;
907
908// if(m<=0 | n<=0)
909// return;
910
911 int ii;
912
913 // copy and scale y_n int z_n
914 ii = 0;
915 for(; ii<m-3; ii+=4)
916 {
917 z_n[ii+0] = beta_n*y_n[ii+0];
918 z_n[ii+1] = beta_n*y_n[ii+1];
919 z_n[ii+2] = beta_n*y_n[ii+2];
920 z_n[ii+3] = beta_n*y_n[ii+3];
921 }
922 for(; ii<m; ii++)
923 {
924 z_n[ii+0] = beta_n*y_n[ii+0];
925 }
926
927 ii = 0;
928 for(; ii<n-3; ii+=4)
929 {
930 kernel_sgemv_nt_4_lib8(m, &alpha_n, &alpha_t, pA+ii*bs, sda, x_n+ii, x_t, &beta_t, y_t+ii, z_n, z_t+ii);
931 }
932 if(ii<n)
933 {
934 kernel_sgemv_nt_4_vs_lib8(m, &alpha_n, &alpha_t, pA+ii*bs, sda, x_n+ii, x_t, &beta_t, y_t+ii, z_n, z_t+ii, n-ii);
935 }
936
937 return;
938 }
939
940
941
942void ssymv_l_libstr(int m, int n, float alpha, struct s_strmat *sA, int ai, int aj, struct s_strvec *sx, int xi, float beta, struct s_strvec *sy, int yi, struct s_strvec *sz, int zi)
943 {
944
945// if(m<=0 | n<=0)
946// return;
947
948 const int bs = 8;
949
950 int ii, n1, n2;
951
952 int sda = sA->cn;
953 float *pA = sA->pA + aj*bs + ai/bs*bs*sda + ai%bs;
954 float *x = sx->pa + xi;
955 float *y = sy->pa + yi;
956 float *z = sz->pa + zi;
957
958 // copy and scale y int z
959 ii = 0;
960 for(; ii<m-3; ii+=4)
961 {
962 z[ii+0] = beta*y[ii+0];
963 z[ii+1] = beta*y[ii+1];
964 z[ii+2] = beta*y[ii+2];
965 z[ii+3] = beta*y[ii+3];
966 }
967 for(; ii<m; ii++)
968 {
969 z[ii+0] = beta*y[ii+0];
970 }
971
972 // clean up at the beginning
973 if(ai%bs!=0) // 1, 2, 3
974 {
975 n1 = 8-ai%bs;
976 n2 = n<n1 ? n : n1;
977 kernel_ssymv_l_4l_gen_lib8(m-0, &alpha, ai%bs, &pA[0+(0)*bs], sda, &x[0], &z[0], n2-0);
978 kernel_ssymv_l_4r_gen_lib8(m-4, &alpha, ai%bs, &pA[4+(4)*bs], sda, &x[4], &z[4], n2-4);
979 pA += n1 + n1*bs + (sda-1)*bs;
980 x += n1;
981 z += n1;
982 m -= n1;
983 n -= n1;
984 }
985 // main loop
986 ii = 0;
987 for(; ii<n-7; ii+=8)
988 {
989 kernel_ssymv_l_4l_lib8(m-ii-0, &alpha, &pA[0+(ii+0)*bs+ii*sda], sda, &x[ii+0], &z[ii+0]);
990 kernel_ssymv_l_4r_lib8(m-ii-4, &alpha, &pA[4+(ii+4)*bs+ii*sda], sda, &x[ii+4], &z[ii+4]);
991 }
992 // clean up at the end
993 if(ii<n)
994 {
995 kernel_ssymv_l_4l_gen_lib8(m-ii-0, &alpha, 0, &pA[0+(ii+0)*bs+ii*sda], sda, &x[ii+0], &z[ii+0], n-ii-0);
996 kernel_ssymv_l_4r_gen_lib8(m-ii-4, &alpha, 0, &pA[4+(ii+4)*bs+ii*sda], sda, &x[ii+4], &z[ii+4], n-ii-4);
997 }
998
999 return;
1000 }
1001
1002
1003
1004#else
1005
1006#error : wrong LA choice
1007
1008#endif