blob: 75a4a4fd82df5d71e406eaa996dff639a1adaf66 [file] [log] [blame]
Austin Schuh9a24b372018-01-28 16:12:29 -08001/**************************************************************************************************
2* *
3* This file is part of BLASFEO. *
4* *
5* BLASFEO -- BLAS For Embedded Optimization. *
6* Copyright (C) 2016-2017 by Gianluca Frison. *
7* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl. *
8* All rights reserved. *
9* *
10* HPMPC is free software; you can redistribute it and/or *
11* modify it under the terms of the GNU Lesser General Public *
12* License as published by the Free Software Foundation; either *
13* version 2.1 of the License, or (at your option) any later version. *
14* *
15* HPMPC is distributed in the hope that it will be useful, *
16* but WITHOUT ANY WARRANTY; without even the implied warranty of *
17* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. *
18* See the GNU Lesser General Public License for more details. *
19* *
20* You should have received a copy of the GNU Lesser General Public *
21* License along with HPMPC; if not, write to the Free Software *
22* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA *
23* *
24* Author: Gianluca Frison, giaf (at) dtu.dk *
25* gianluca.frison (at) imtek.uni-freiburg.de *
26* *
27**************************************************************************************************/
28
29#include <stdlib.h>
30#include <stdio.h>
31#include <math.h>
32
33#include "../include/blasfeo_common.h"
34#include "../include/blasfeo_d_aux.h"
35#include "../include/blasfeo_d_kernel.h"
36
37
38
39/****************************
40* old interface
41****************************/
42
43
44
45void dgetrf_nn_nopivot_lib(int m, int n, double *pC, int sdc, double *pD, int sdd, double *inv_diag_D)
46 {
47
48 if(m<=0 || n<=0)
49 return;
50
51 const int ps = 4;
52
53 int ii, jj, ie;
54
55 // main loop
56 ii = 0;
57#if defined(TARGET_X64_INTEL_HASWELL)
58 for( ; ii<m-11; ii+=12)
59 {
60 jj = 0;
61 // solve lower
62 ie = n<ii ? n : ii; // ie is multiple of 4
63 for( ; jj<ie-3; jj+=4)
64 {
65 kernel_dtrsm_nn_ru_inv_12x4_lib4(jj, &pD[ii*sdd], sdd, &pD[jj*ps], sdd, &pC[jj*ps+ii*sdc], sdc, &pD[jj*ps+ii*sdd], sdd, &pD[jj*ps+jj*sdd], &inv_diag_D[jj]);
66 }
67 if(jj<ie)
68 {
69 kernel_dtrsm_nn_ru_inv_12x4_vs_lib4(jj, &pD[ii*sdd], sdd, &pD[jj*ps], sdd, &pC[jj*ps+ii*sdc], sdc, &pD[jj*ps+ii*sdd], sdd, &pD[jj*ps+jj*sdd], &inv_diag_D[jj], m-ii, ie-jj);
70 jj+=4;
71 }
72 // factorize
73 if(jj<n-3)
74 {
75 kernel_dgetrf_nn_l_12x4_lib4(ii, &pD[ii*sdd], sdd, &pD[jj*ps], sdd, &pC[jj*ps+ii*sdc], sdc, &pD[jj*ps+ii*sdd], sdd, &inv_diag_D[jj]);
76 jj+=4;
77 }
78 else if(jj<n)
79 {
80 kernel_dgetrf_nn_l_12x4_vs_lib4(ii, &pD[ii*sdd], sdd, &pD[jj*ps], sdd, &pC[jj*ps+ii*sdc], sdc, &pD[jj*ps+ii*sdd], sdd, &inv_diag_D[jj], m-ii, n-jj);
81 jj+=4;
82 }
83 if(jj<n-3)
84 {
85 kernel_dgetrf_nn_m_12x4_lib4(ii, &pD[ii*sdd], sdd, &pD[jj*ps], sdd, &pC[jj*ps+ii*sdc], sdc, &pD[jj*ps+ii*sdd], sdd, &inv_diag_D[jj]);
86 jj+=4;
87 }
88 else if(jj<n)
89 {
90 kernel_dgetrf_nn_m_12x4_vs_lib4(jj, &pD[ii*sdd], sdd, &pD[jj*ps], sdd, &pC[jj*ps+ii*sdc], sdc, &pD[jj*ps+ii*sdd], sdd, &inv_diag_D[jj], m-ii, n-jj);
91 jj+=4;
92 }
93 if(jj<n-3)
94 {
95 kernel_dgetrf_nn_r_12x4_lib4(ii, &pD[ii*sdd], sdd, &pD[jj*ps], sdd, &pC[jj*ps+ii*sdc], sdc, &pD[jj*ps+ii*sdd], sdd, &inv_diag_D[jj]);
96 jj+=4;
97 }
98 else if(jj<n)
99 {
100 kernel_dgetrf_nn_r_12x4_vs_lib4(ii, &pD[ii*sdd], sdd, &pD[jj*ps], sdd, &pC[jj*ps+ii*sdc], sdc, &pD[jj*ps+ii*sdd], sdd, &inv_diag_D[jj], m-ii, n-jj);
101 jj+=4;
102 }
103 // solve upper
104 for( ; jj<n-3; jj+=4)
105 {
106 kernel_dtrsm_nn_ll_one_12x4_lib4(ii, &pD[ii*sdd], sdd, &pD[jj*ps], sdd, &pC[jj*ps+ii*sdc], sdc, &pD[jj*ps+ii*sdd], sdd, &pD[ii*ps+ii*sdd], sdd);
107 }
108 if(jj<n)
109 {
110 kernel_dtrsm_nn_ll_one_12x4_vs_lib4(ii, &pD[ii*sdd], sdd, &pD[jj*ps], sdd, &pC[jj*ps+ii*sdc], sdc, &pD[jj*ps+ii*sdd], sdd, &pD[ii*ps+ii*sdd], sdd, m-ii, n-jj);
111 }
112 }
113 if(m>ii)
114 {
115 if(m-ii<=4)
116 {
117 goto left_4;
118 }
119 else if(m-ii<=8)
120 {
121 goto left_8;
122 }
123 else
124 {
125 goto left_12;
126 }
127 }
128#elif defined(TARGET_X64_INTEL_SANDY_BRIDGE)
129 for( ; ii<m-7; ii+=8)
130 {
131 jj = 0;
132 // solve lower
133 ie = n<ii ? n : ii; // ie is multiple of 4
134 for( ; jj<ie-3; jj+=4)
135 {
136 kernel_dtrsm_nn_ru_inv_8x4_lib4(jj, &pD[ii*sdd], sdd, &pD[jj*ps], sdd, &pC[jj*ps+ii*sdc], sdc, &pD[jj*ps+ii*sdd], sdd, &pD[jj*ps+jj*sdd], &inv_diag_D[jj]);
137 }
138 if(jj<ie)
139 {
140 kernel_dtrsm_nn_ru_inv_8x4_vs_lib4(jj, &pD[ii*sdd], sdd, &pD[jj*ps], sdd, &pC[jj*ps+ii*sdc], sdc, &pD[jj*ps+ii*sdd], sdd, &pD[jj*ps+jj*sdd], &inv_diag_D[jj], m-ii, ie-jj);
141 jj+=4;
142 }
143 // factorize
144 if(jj<n-3)
145 {
146 kernel_dgetrf_nn_l_8x4_lib4(jj, &pD[ii*sdd], sdd, &pD[jj*ps], sdd, &pC[jj*ps+ii*sdc], sdc, &pD[jj*ps+ii*sdd], sdd, &inv_diag_D[jj]);
147// kernel_dgetrf_nn_4x4_lib4(jj, &pD[ii*sdd], &pD[jj*ps], sdd, &pC[jj*ps+ii*sdc], &pD[jj*ps+ii*sdd], &inv_diag_D[jj]);
148// kernel_dtrsm_nn_ru_inv_4x4_lib4(jj, &pD[(ii+4)*sdd], &pD[jj*ps], sdd, &pC[jj*ps+(ii+4)*sdc], &pD[jj*ps+(ii+4)*sdd], &pD[jj*ps+jj*sdd], &inv_diag_D[jj]);
149 jj+=4;
150 }
151 else if(jj<n)
152 {
153 kernel_dgetrf_nn_l_8x4_vs_lib4(jj, &pD[ii*sdd], sdd, &pD[jj*ps], sdd, &pC[jj*ps+ii*sdc], sdc, &pD[jj*ps+ii*sdd], sdd, &inv_diag_D[jj], m-ii, n-jj);
154// kernel_dgetrf_nn_4x4_vs_lib4(jj, &pD[ii*sdd], &pD[jj*ps], sdd, &pC[jj*ps+ii*sdc], &pD[jj*ps+ii*sdd], &inv_diag_D[jj], m-ii, n-jj);
155// kernel_dtrsm_nn_ru_inv_4x4_vs_lib4(jj, &pD[(ii+4)*sdd], &pD[jj*ps], sdd, &pC[jj*ps+(ii+4)*sdc], &pD[jj*ps+(ii+4)*sdd], &pD[jj*ps+jj*sdd], &inv_diag_D[jj], m-(ii+4), n-jj);
156 jj+=4;
157 }
158 if(jj<n-3)
159 {
160 kernel_dtrsm_nn_ll_one_4x4_lib4(ii, &pD[ii*sdd], &pD[jj*ps], sdd, &pC[jj*ps+ii*sdc], &pD[jj*ps+ii*sdd], &pD[ii*ps+ii*sdd]);
161 kernel_dgetrf_nn_4x4_lib4(jj, &pD[(ii+4)*sdd], &pD[jj*ps], sdd, &pC[jj*ps+(ii+4)*sdc], &pD[jj*ps+(ii+4)*sdd], &inv_diag_D[jj]);
162 jj+=4;
163 }
164 else if(jj<n)
165 {
166 kernel_dtrsm_nn_ll_one_4x4_vs_lib4(ii, &pD[ii*sdd], &pD[jj*ps], sdd, &pC[jj*ps+ii*sdc], &pD[jj*ps+ii*sdd], &pD[ii*ps+ii*sdd], m-ii, n-jj);
167 kernel_dgetrf_nn_4x4_vs_lib4(jj, &pD[(ii+4)*sdd], &pD[jj*ps], sdd, &pC[jj*ps+(ii+4)*sdc], &pD[jj*ps+(ii+4)*sdd], &inv_diag_D[jj], m-(ii+4), n-jj);
168 jj+=4;
169 }
170 // solve upper
171 for( ; jj<n-3; jj+=4)
172 {
173 kernel_dtrsm_nn_ll_one_8x4_lib4(ii, &pD[ii*sdd], sdd, &pD[jj*ps], sdd, &pC[jj*ps+ii*sdc], sdc, &pD[jj*ps+ii*sdd],sdd, &pD[ii*ps+ii*sdd], sdd);
174 }
175 if(jj<n)
176 {
177 kernel_dtrsm_nn_ll_one_8x4_vs_lib4(ii, &pD[ii*sdd], sdd, &pD[jj*ps], sdd, &pC[jj*ps+ii*sdc], sdc, &pD[jj*ps+ii*sdd], sdd, &pD[ii*ps+ii*sdd], sdd, m-ii, n-jj);
178 }
179 }
180 if(m>ii)
181 {
182 if(m-ii<=4)
183 {
184 goto left_4;
185 }
186 else
187 {
188 goto left_8;
189 }
190 }
191#else
192 for( ; ii<m-3; ii+=4)
193 {
194 jj = 0;
195 // solve lower
196 ie = n<ii ? n : ii; // ie is multiple of 4
197 for( ; jj<ie-3; jj+=4)
198 {
199 kernel_dtrsm_nn_ru_inv_4x4_lib4(jj, &pD[ii*sdd], &pD[jj*ps], sdd, &pC[jj*ps+ii*sdc], &pD[jj*ps+ii*sdd], &pD[jj*ps+jj*sdd], &inv_diag_D[jj]);
200 }
201 if(jj<ie)
202 {
203 kernel_dtrsm_nn_ru_inv_4x4_vs_lib4(jj, &pD[ii*sdd], &pD[jj*ps], sdd, &pC[jj*ps+ii*sdc], &pD[jj*ps+ii*sdd], &pD[jj*ps+jj*sdd], &inv_diag_D[jj], m-ii, ie-jj);
204 jj+=4;
205 }
206 // factorize
207 if(jj<n-3)
208 {
209 kernel_dgetrf_nn_4x4_lib4(jj, &pD[ii*sdd], &pD[jj*ps], sdd, &pC[jj*ps+ii*sdc], &pD[jj*ps+ii*sdd], &inv_diag_D[jj]);
210 jj+=4;
211 }
212 else if(jj<n)
213 {
214 kernel_dgetrf_nn_4x4_vs_lib4(jj, &pD[ii*sdd], &pD[jj*ps], sdd, &pC[jj*ps+ii*sdc], &pD[jj*ps+ii*sdd], &inv_diag_D[jj], m-ii, n-jj);
215 jj+=4;
216 }
217 // solve upper
218 for( ; jj<n-3; jj+=4)
219 {
220 kernel_dtrsm_nn_ll_one_4x4_lib4(ii, &pD[ii*sdd], &pD[jj*ps], sdd, &pC[jj*ps+ii*sdc], &pD[jj*ps+ii*sdd], &pD[ii*ps+ii*sdd]);
221 }
222 if(jj<n)
223 {
224 kernel_dtrsm_nn_ll_one_4x4_vs_lib4(ii, &pD[ii*sdd], &pD[jj*ps], sdd, &pC[jj*ps+ii*sdc], &pD[jj*ps+ii*sdd], &pD[ii*ps+ii*sdd], m-ii, n-jj);
225 }
226 }
227 if(m>ii)
228 {
229 goto left_4;
230 }
231
232#endif
233
234 // common return if i==m
235 return;
236
237#if defined(TARGET_X64_INTEL_HASWELL)
238 left_12:
239 jj = 0;
240 // solve lower
241 ie = n<ii ? n : ii; // ie is multiple of 4
242 for( ; jj<ie; jj+=4)
243 {
244 kernel_dtrsm_nn_ru_inv_12x4_vs_lib4(jj, &pD[ii*sdd], sdd, &pD[jj*ps], sdd, &pC[jj*ps+ii*sdc], sdc, &pD[jj*ps+ii*sdd], sdd, &pD[jj*ps+jj*sdd], &inv_diag_D[jj], m-ii, ie-jj);
245 }
246 // factorize
247 if(jj<n)
248 {
249 kernel_dgetrf_nn_l_12x4_vs_lib4(ii, &pD[ii*sdd], sdd, &pD[jj*ps], sdd, &pC[jj*ps+ii*sdc], sdc, &pD[jj*ps+ii*sdd], sdd, &inv_diag_D[jj], m-ii, n-jj);
250 jj+=4;
251 }
252 if(jj<n)
253 {
254 kernel_dgetrf_nn_l_12x4_vs_lib4(ii, &pD[ii*sdd], sdd, &pD[jj*ps], sdd, &pC[jj*ps+ii*sdc], sdc, &pD[jj*ps+ii*sdd], sdd, &inv_diag_D[jj], m-ii, n-jj);
255 jj+=4;
256 }
257 if(jj<n)
258 {
259 kernel_dgetrf_nn_r_12x4_vs_lib4(ii, &pD[ii*sdd], sdd, &pD[jj*ps], sdd, &pC[jj*ps+ii*sdc], sdc, &pD[jj*ps+ii*sdd], sdd, &inv_diag_D[jj], m-ii, n-jj);
260 jj+=4;
261 }
262 // solve upper
263 for( ; jj<n; jj+=4)
264 {
265 kernel_dtrsm_nn_ll_one_12x4_vs_lib4(ii, &pD[ii*sdd], sdd, &pD[jj*ps], sdd, &pC[jj*ps+ii*sdc], sdc, &pD[jj*ps+ii*sdd], sdd, &pD[ii*ps+ii*sdd], sdd, m-ii, n-jj);
266 }
267 return;
268
269#endif
270
271#if defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE)
272 left_8:
273 jj = 0;
274 // solve lower
275 ie = n<ii ? n : ii; // ie is multiple of 4
276 for( ; jj<ie; jj+=4)
277 {
278 kernel_dtrsm_nn_ru_inv_8x4_vs_lib4(jj, &pD[ii*sdd], sdd, &pD[jj*ps], sdd, &pC[jj*ps+ii*sdc], sdc, &pD[jj*ps+ii*sdd], sdd, &pD[jj*ps+jj*sdd], &inv_diag_D[jj], m-ii, ie-jj);
279 }
280 // factorize
281 if(jj<n)
282 {
283 kernel_dgetrf_nn_l_8x4_vs_lib4(jj, &pD[ii*sdd], sdd, &pD[jj*ps], sdd, &pC[jj*ps+ii*sdc], sdc, &pD[jj*ps+ii*sdd], sdd, &inv_diag_D[jj], m-ii, n-jj);
284// kernel_dgetrf_nn_4x4_vs_lib4(jj, &pD[ii*sdd], &pD[jj*ps], sdd, &pC[jj*ps+ii*sdc], &pD[jj*ps+ii*sdd], &inv_diag_D[jj], m-ii, n-jj);
285// kernel_dtrsm_nn_ru_inv_4x4_vs_lib4(jj, &pD[(ii+4)*sdd], &pD[jj*ps], sdd, &pC[jj*ps+(ii+4)*sdc], &pD[jj*ps+(ii+4)*sdd], &pD[jj*ps+jj*sdd], &inv_diag_D[jj], m-(ii+4), n-jj);
286 jj+=4;
287 }
288 if(jj<n)
289 {
290 kernel_dtrsm_nn_ll_one_4x4_vs_lib4(ii, &pD[ii*sdd], &pD[jj*ps], sdd, &pC[jj*ps+ii*sdc], &pD[jj*ps+ii*sdd], &pD[ii*ps+ii*sdd], m-ii, n-jj);
291 kernel_dgetrf_nn_4x4_vs_lib4(jj, &pD[(ii+4)*sdd], &pD[jj*ps], sdd, &pC[jj*ps+(ii+4)*sdc], &pD[jj*ps+(ii+4)*sdd], &inv_diag_D[jj], m-(ii+4), n-jj);
292 jj+=4;
293 }
294 // solve upper
295 for( ; jj<n; jj+=4)
296 {
297 kernel_dtrsm_nn_ll_one_8x4_vs_lib4(ii, &pD[ii*sdd], sdd, &pD[jj*ps], sdd, &pC[jj*ps+ii*sdc], sdc, &pD[jj*ps+ii*sdd], sdd, &pD[ii*ps+ii*sdd], sdd, m-ii, n-jj);
298 }
299 return;
300
301#endif
302
303 left_4:
304 jj = 0;
305 // solve lower
306 ie = n<ii ? n : ii; // ie is multiple of 4
307 for( ; jj<ie; jj+=4)
308 {
309 kernel_dtrsm_nn_ru_inv_4x4_vs_lib4(jj, &pD[ii*sdd], &pD[jj*ps], sdd, &pC[jj*ps+ii*sdc], &pD[jj*ps+ii*sdd], &pD[jj*ps+jj*sdd], &inv_diag_D[jj], m-ii, ie-jj);
310 }
311 // factorize
312 if(jj<n)
313 {
314 kernel_dgetrf_nn_4x4_vs_lib4(jj, &pD[ii*sdd], &pD[jj*ps], sdd, &pC[jj*ps+ii*sdc], &pD[jj*ps+ii*sdd], &inv_diag_D[jj], m-ii, n-jj);
315 jj+=4;
316 }
317 // solve upper
318 for( ; jj<n; jj+=4)
319 {
320 kernel_dtrsm_nn_ll_one_4x4_vs_lib4(ii, &pD[ii*sdd], &pD[jj*ps], sdd, &pC[jj*ps+ii*sdc], &pD[jj*ps+ii*sdd], &pD[ii*ps+ii*sdd], m-ii, n-jj);
321 }
322 return;
323
324 }
325
326
327
328void dgetrf_nn_lib(int m, int n, double *pC, int sdc, double *pD, int sdd, double *inv_diag_D, int *ipiv)
329 {
330
331 if(m<=0)
332 return;
333
334 const int ps = 4;
335
336 int ii, jj, i0, i1, j0, ll, p;
337
338 double d1 = 1.0;
339 double dm1 = -1.0;
340
341 // needs to perform row-excanges on the yet-to-be-factorized matrix too
342 if(pC!=pD)
343 dgecp_lib(m, n, 1.0, 0, pC, sdc, 0, pD, sdd);
344
345 // minimum matrix size
346 p = n<m ? n : m; // XXX
347
348 // main loop
349#if defined(TARGET_X64_INTEL_HASWELL)
350 // 12 columns at a time
351 jj = 0;
352 for(; jj<p-11; jj+=12)
353 {
354 // pivot & factorize & solve lower
355 // left block-column
356 ii = jj;
357 i0 = ii;
358 for( ; ii<m-11; ii+=12)
359 {
360 kernel_dgemm_nn_12x4_lib4(jj, &dm1, &pD[ii*sdd], sdd, 0, &pD[jj*ps], sdd, &d1, &pD[jj*ps+ii*sdd], sdd, &pD[jj*ps+ii*sdd], sdd);
361 }
362 if(m-ii>0)
363 {
364 if(m-ii>8)
365 {
366 kernel_dgemm_nn_12x4_vs_lib4(jj, &dm1, &pD[ii*sdd], sdd, &pD[jj*ps], sdd, &d1, &pD[jj*ps+ii*sdd], sdd, &pD[jj*ps+ii*sdd], sdd, m-ii, 4);
367 }
368 else if(m-ii>4)
369 {
370 kernel_dgemm_nn_8x4_gen_lib4(jj, &dm1, &pD[ii*sdd], sdd, 0, &pD[jj*ps], sdd, &d1, 0, &pD[jj*ps+ii*sdd], sdd, 0, &pD[jj*ps+ii*sdd], sdd, 0, m-ii, 0, 4);
371 }
372 else
373 {
374 kernel_dgemm_nn_4x4_gen_lib4(jj, &dm1, &pD[ii*sdd], 0, &pD[jj*ps], sdd, &d1, 0, &pD[jj*ps+ii*sdd], sdd, 0, &pD[jj*ps+ii*sdd], sdd, 0, m-ii, 0, 4);
375 }
376 }
377 kernel_dgetrf_pivot_4_lib4(m-i0, &pD[jj*ps+i0*sdd], sdd, &inv_diag_D[jj], &ipiv[i0]);
378 ipiv[i0+0] += i0;
379 if(ipiv[i0+0]!=i0+0)
380 {
381 drowsw_lib(jj, pD+(i0+0)/ps*ps*sdd+(i0+0)%ps, pD+(ipiv[i0+0])/ps*ps*sdd+(ipiv[i0+0])%ps);
382 drowsw_lib(n-jj-4, pD+(i0+0)/ps*ps*sdd+(i0+0)%ps+(jj+4)*ps, pD+(ipiv[i0+0])/ps*ps*sdd+(ipiv[i0+0])%ps+(jj+4)*ps);
383 }
384 ipiv[i0+1] += i0;
385 if(ipiv[i0+1]!=i0+1)
386 {
387 drowsw_lib(jj, pD+(i0+1)/ps*ps*sdd+(i0+1)%ps, pD+(ipiv[i0+1])/ps*ps*sdd+(ipiv[i0+1])%ps);
388 drowsw_lib(n-jj-4, pD+(i0+1)/ps*ps*sdd+(i0+1)%ps+(jj+4)*ps, pD+(ipiv[i0+1])/ps*ps*sdd+(ipiv[i0+1])%ps+(jj+4)*ps);
389 }
390 ipiv[i0+2] += i0;
391 if(ipiv[i0+2]!=i0+2)
392 {
393 drowsw_lib(jj, pD+(i0+2)/ps*ps*sdd+(i0+2)%ps, pD+(ipiv[i0+2])/ps*ps*sdd+(ipiv[i0+2])%ps);
394 drowsw_lib(n-jj-4, pD+(i0+2)/ps*ps*sdd+(i0+2)%ps+(jj+4)*ps, pD+(ipiv[i0+2])/ps*ps*sdd+(ipiv[i0+2])%ps+(jj+4)*ps);
395 }
396 ipiv[i0+3] += i0;
397 if(ipiv[i0+3]!=i0+3)
398 {
399 drowsw_lib(jj, pD+(i0+3)/ps*ps*sdd+(i0+3)%ps, pD+(ipiv[i0+3])/ps*ps*sdd+(ipiv[i0+3])%ps);
400 drowsw_lib(n-jj-4, pD+(i0+3)/ps*ps*sdd+(i0+3)%ps+(jj+4)*ps, pD+(ipiv[i0+3])/ps*ps*sdd+(ipiv[i0+3])%ps+(jj+4)*ps);
401 }
402 // middle block-column
403 ii = i0;
404 kernel_dtrsm_nn_ll_one_4x4_lib4(ii, &pD[ii*sdd], &pD[(jj+4)*ps], sdd, &pD[(jj+4)*ps+ii*sdd], &pD[(jj+4)*ps+ii*sdd], &pD[ii*ps+ii*sdd]);
405 ii += 4;
406 i1 = ii;
407 for( ; ii<m-11; ii+=12)
408 {
409 kernel_dgemm_nn_12x4_lib4((jj+4), &dm1, &pD[ii*sdd], sdd, 0, &pD[(jj+4)*ps], sdd, &d1, &pD[(jj+4)*ps+ii*sdd], sdd, &pD[(jj+4)*ps+ii*sdd], sdd);
410 }
411 if(m-ii>0)
412 {
413 if(m-ii>8)
414 {
415 kernel_dgemm_nn_12x4_vs_lib4((jj+4), &dm1, &pD[ii*sdd], sdd, &pD[(jj+4)*ps], sdd, &d1, &pD[(jj+4)*ps+ii*sdd], sdd, &pD[(jj+4)*ps+ii*sdd], sdd, m-ii, 4);
416 }
417 else if(m-ii>4)
418 {
419 kernel_dgemm_nn_8x4_gen_lib4((jj+4), &dm1, &pD[ii*sdd], sdd, 0, &pD[(jj+4)*ps], sdd, &d1, 0, &pD[(jj+4)*ps+ii*sdd], sdd, 0, &pD[(jj+4)*ps+ii*sdd], sdd, 0, m-ii, 0, 4);
420 }
421 else
422 {
423 kernel_dgemm_nn_4x4_gen_lib4((jj+4), &dm1, &pD[ii*sdd], 0, &pD[(jj+4)*ps], sdd, &d1, 0, &pD[(jj+4)*ps+ii*sdd], sdd, 0, &pD[(jj+4)*ps+ii*sdd], sdd, 0, m-ii, 0, 4);
424 }
425 }
426 kernel_dgetrf_pivot_4_lib4(m-i1, &pD[(jj+4)*ps+i1*sdd], sdd, &inv_diag_D[(jj+4)], &ipiv[i1]);
427 ipiv[i1+0] += i1;
428 if(ipiv[i1+0]!=i1+0)
429 {
430 drowsw_lib(jj+4, pD+(i1+0)/ps*ps*sdd+(i1+0)%ps, pD+(ipiv[i1+0])/ps*ps*sdd+(ipiv[i1+0])%ps);
431 drowsw_lib(n-jj-8, pD+(i1+0)/ps*ps*sdd+(i1+0)%ps+(jj+8)*ps, pD+(ipiv[i1+0])/ps*ps*sdd+(ipiv[i1+0])%ps+(jj+8)*ps);
432 }
433 ipiv[i1+1] += i1;
434 if(ipiv[i1+1]!=i1+1)
435 {
436 drowsw_lib(jj+4, pD+(i1+1)/ps*ps*sdd+(i1+1)%ps, pD+(ipiv[i1+1])/ps*ps*sdd+(ipiv[i1+1])%ps);
437 drowsw_lib(n-jj-8, pD+(i1+1)/ps*ps*sdd+(i1+1)%ps+(jj+8)*ps, pD+(ipiv[i1+1])/ps*ps*sdd+(ipiv[i1+1])%ps+(jj+8)*ps);
438 }
439 ipiv[i1+2] += i1;
440 if(ipiv[i1+2]!=i1+2)
441 {
442 drowsw_lib(jj+4, pD+(i1+2)/ps*ps*sdd+(i1+2)%ps, pD+(ipiv[i1+2])/ps*ps*sdd+(ipiv[i1+2])%ps);
443 drowsw_lib(n-jj-8, pD+(i1+2)/ps*ps*sdd+(i1+2)%ps+(jj+8)*ps, pD+(ipiv[i1+2])/ps*ps*sdd+(ipiv[i1+2])%ps+(jj+8)*ps);
444 }
445 ipiv[i1+3] += i1;
446 if(ipiv[i1+3]!=i1+3)
447 {
448 drowsw_lib(jj+4, pD+(i1+3)/ps*ps*sdd+(i1+3)%ps, pD+(ipiv[i1+3])/ps*ps*sdd+(ipiv[i1+3])%ps);
449 drowsw_lib(n-jj-8, pD+(i1+3)/ps*ps*sdd+(i1+3)%ps+(jj+8)*ps, pD+(ipiv[i1+3])/ps*ps*sdd+(ipiv[i1+3])%ps+(jj+8)*ps);
450 }
451 // right block-column
452 ii = i0;
453 kernel_dtrsm_nn_ll_one_8x4_lib4(ii, &pD[ii*sdd], sdd, &pD[(jj+8)*ps], sdd, &pD[(jj+8)*ps+ii*sdd], sdd, &pD[(jj+8)*ps+ii*sdd], sdd, &pD[ii*ps+ii*sdd], sdd);
454 ii += 8;
455 i1 = ii;
456 for( ; ii<m-11; ii+=12)
457 {
458 kernel_dgemm_nn_12x4_lib4((jj+8), &dm1, &pD[ii*sdd], sdd, 0, &pD[(jj+8)*ps], sdd, &d1, &pD[(jj+8)*ps+ii*sdd], sdd, &pD[(jj+8)*ps+ii*sdd], sdd);
459 }
460 if(m-ii>0)
461 {
462 if(m-ii>8)
463 {
464 kernel_dgemm_nn_12x4_vs_lib4((jj+8), &dm1, &pD[ii*sdd], sdd, &pD[(jj+8)*ps], sdd, &d1, &pD[(jj+8)*ps+ii*sdd], sdd, &pD[(jj+8)*ps+ii*sdd], sdd, m-ii, 4);
465 }
466 else if(m-ii>4)
467 {
468 kernel_dgemm_nn_8x4_gen_lib4((jj+8), &dm1, &pD[ii*sdd], sdd, 0, &pD[(jj+8)*ps], sdd, &d1, 0, &pD[(jj+8)*ps+ii*sdd], sdd, 0, &pD[(jj+8)*ps+ii*sdd], sdd, 0, m-ii, 0, 4);
469 }
470 else
471 {
472 kernel_dgemm_nn_4x4_gen_lib4((jj+8), &dm1, &pD[ii*sdd], 0, &pD[(jj+8)*ps], sdd, &d1, 0, &pD[(jj+8)*ps+ii*sdd], sdd, 0, &pD[(jj+8)*ps+ii*sdd], sdd, 0, m-ii, 0, 4);
473 }
474 }
475 kernel_dgetrf_pivot_4_lib4(m-i1, &pD[(jj+8)*ps+i1*sdd], sdd, &inv_diag_D[(jj+8)], &ipiv[i1]);
476 ipiv[i1+0] += i1;
477 if(ipiv[i1+0]!=i1+0)
478 {
479 drowsw_lib(jj+8, pD+(i1+0)/ps*ps*sdd+(i1+0)%ps, pD+(ipiv[i1+0])/ps*ps*sdd+(ipiv[i1+0])%ps);
480 drowsw_lib(n-jj-12, pD+(i1+0)/ps*ps*sdd+(i1+0)%ps+(jj+12)*ps, pD+(ipiv[i1+0])/ps*ps*sdd+(ipiv[i1+0])%ps+(jj+12)*ps);
481 }
482 ipiv[i1+1] += i1;
483 if(ipiv[i1+1]!=i1+1)
484 {
485 drowsw_lib(jj+8, pD+(i1+1)/ps*ps*sdd+(i1+1)%ps, pD+(ipiv[i1+1])/ps*ps*sdd+(ipiv[i1+1])%ps);
486 drowsw_lib(n-jj-12, pD+(i1+1)/ps*ps*sdd+(i1+1)%ps+(jj+12)*ps, pD+(ipiv[i1+1])/ps*ps*sdd+(ipiv[i1+1])%ps+(jj+12)*ps);
487 }
488 ipiv[i1+2] += i1;
489 if(ipiv[i1+2]!=i1+2)
490 {
491 drowsw_lib(jj+8, pD+(i1+2)/ps*ps*sdd+(i1+2)%ps, pD+(ipiv[i1+2])/ps*ps*sdd+(ipiv[i1+2])%ps);
492 drowsw_lib(n-jj-12, pD+(i1+2)/ps*ps*sdd+(i1+2)%ps+(jj+12)*ps, pD+(ipiv[i1+2])/ps*ps*sdd+(ipiv[i1+2])%ps+(jj+12)*ps);
493 }
494 ipiv[i1+3] += i1;
495 if(ipiv[i1+3]!=i1+3)
496 {
497 drowsw_lib(jj+8, pD+(i1+3)/ps*ps*sdd+(i1+3)%ps, pD+(ipiv[i1+3])/ps*ps*sdd+(ipiv[i1+3])%ps);
498 drowsw_lib(n-jj-12, pD+(i1+3)/ps*ps*sdd+(i1+3)%ps+(jj+12)*ps, pD+(ipiv[i1+3])/ps*ps*sdd+(ipiv[i1+3])%ps+(jj+12)*ps);
499 }
500
501 // solve upper
502// i0 -= 8; // 4 ???
503 ll = jj+12;
504 for( ; ll<n-3; ll+=4)
505 {
506 kernel_dtrsm_nn_ll_one_12x4_lib4(i0, &pD[i0*sdd], sdd, &pD[ll*ps], sdd, &pD[ll*ps+i0*sdd], sdd, &pD[ll*ps+i0*sdd], sdd, &pD[i0*ps+i0*sdd], sdd);
507 }
508 if(ll<n)
509 {
510 kernel_dtrsm_nn_ll_one_12x4_vs_lib4(i0, &pD[i0*sdd], sdd, &pD[ll*ps], sdd, &pD[ll*ps+i0*sdd], sdd, &pD[ll*ps+i0*sdd], sdd, &pD[i0*ps+i0*sdd], sdd, 12, n-ll);
511 }
512 }
513 if(m>=n)
514 {
515 if(n-jj>0)
516 {
517 if(n-jj<=4)
518 goto left_n_4;
519 else if(n-jj<=8)
520 goto left_n_8;
521 else
522 goto left_n_12;
523 }
524 }
525 else // n>m
526 {
527 if(m-jj>0)
528 {
529 if(m-jj<=4)
530 goto left_m_4;
531 else if(m-jj<=8)
532 goto left_m_8;
533 else
534 goto left_m_12;
535 }
536 }
537#elif defined(TARGET_X64_INTEL_SANDY_BRIDGE)
538 // 8 columns at a time
539 jj = 0;
540 for(; jj<p-7; jj+=8)
541 {
542 // pivot & factorize & solve lower
543 // left block-column
544 ii = jj;
545 i0 = ii;
546#if defined(TARGET_X64_INTEL_HASWELL) // XXX
547 for( ; ii<m-11; ii+=12)
548 {
549 kernel_dgemm_nn_12x4_lib4(jj, &dm1, &pD[ii*sdd], sdd, 0, &pD[jj*ps], sdd, &d1, &pD[jj*ps+ii*sdd], sdd, &pD[jj*ps+ii*sdd], sdd);
550 }
551 if(m-ii>0)
552 {
553 if(m-ii>8)
554 {
555 kernel_dgemm_nn_12x4_vs_lib4(jj, &dm1, &pD[ii*sdd], sdd, &pD[jj*ps], sdd, &d1, &pD[jj*ps+ii*sdd], sdd, &pD[jj*ps+ii*sdd], sdd, m-ii, 4);
556 }
557 else if(m-ii>4)
558 {
559 kernel_dgemm_nn_8x4_gen_lib4(jj, &dm1, &pD[ii*sdd], sdd, 0, &pD[jj*ps], sdd, &d1, 0, &pD[jj*ps+ii*sdd], sdd, 0, &pD[jj*ps+ii*sdd], sdd, 0, m-ii, 0, 4);
560 }
561 else
562 {
563 kernel_dgemm_nn_4x4_gen_lib4(jj, &dm1, &pD[ii*sdd], 0, &pD[jj*ps], sdd, &d1, 0, &pD[jj*ps+ii*sdd], sdd, 0, &pD[jj*ps+ii*sdd], sdd, 0, m-ii, 0, 4);
564 }
565 }
566#else // SANDY_BRIDGE
567 for( ; ii<m-7; ii+=8)
568 {
569 kernel_dgemm_nn_8x4_lib4(jj, &dm1, &pD[ii*sdd], sdd, 0, &pD[jj*ps], sdd, &d1, &pD[jj*ps+ii*sdd], sdd, &pD[jj*ps+ii*sdd], sdd);
570 }
571 if(m-ii>0)
572 {
573 if(m-ii>4)
574 {
575 kernel_dgemm_nn_8x4_gen_lib4(jj, &dm1, &pD[ii*sdd], sdd, 0, &pD[jj*ps], sdd, &d1, 0, &pD[jj*ps+ii*sdd], sdd, 0, &pD[jj*ps+ii*sdd], sdd, 0, m-ii, 0, 4);
576 }
577 else
578 {
579 kernel_dgemm_nn_4x4_gen_lib4(jj, &dm1, &pD[ii*sdd], 0, &pD[jj*ps], sdd, &d1, 0, &pD[jj*ps+ii*sdd], sdd, 0, &pD[jj*ps+ii*sdd], sdd, 0, m-ii, 0, 4);
580 }
581 }
582#endif
583 kernel_dgetrf_pivot_4_lib4(m-i0, &pD[jj*ps+i0*sdd], sdd, &inv_diag_D[jj], &ipiv[i0]);
584 ipiv[i0+0] += i0;
585 if(ipiv[i0+0]!=i0+0)
586 {
587 drowsw_lib(jj, pD+(i0+0)/ps*ps*sdd+(i0+0)%ps, pD+(ipiv[i0+0])/ps*ps*sdd+(ipiv[i0+0])%ps);
588 drowsw_lib(n-jj-4, pD+(i0+0)/ps*ps*sdd+(i0+0)%ps+(jj+4)*ps, pD+(ipiv[i0+0])/ps*ps*sdd+(ipiv[i0+0])%ps+(jj+4)*ps);
589 }
590 ipiv[i0+1] += i0;
591 if(ipiv[i0+1]!=i0+1)
592 {
593 drowsw_lib(jj, pD+(i0+1)/ps*ps*sdd+(i0+1)%ps, pD+(ipiv[i0+1])/ps*ps*sdd+(ipiv[i0+1])%ps);
594 drowsw_lib(n-jj-4, pD+(i0+1)/ps*ps*sdd+(i0+1)%ps+(jj+4)*ps, pD+(ipiv[i0+1])/ps*ps*sdd+(ipiv[i0+1])%ps+(jj+4)*ps);
595 }
596 ipiv[i0+2] += i0;
597 if(ipiv[i0+2]!=i0+2)
598 {
599 drowsw_lib(jj, pD+(i0+2)/ps*ps*sdd+(i0+2)%ps, pD+(ipiv[i0+2])/ps*ps*sdd+(ipiv[i0+2])%ps);
600 drowsw_lib(n-jj-4, pD+(i0+2)/ps*ps*sdd+(i0+2)%ps+(jj+4)*ps, pD+(ipiv[i0+2])/ps*ps*sdd+(ipiv[i0+2])%ps+(jj+4)*ps);
601 }
602 ipiv[i0+3] += i0;
603 if(ipiv[i0+3]!=i0+3)
604 {
605 drowsw_lib(jj, pD+(i0+3)/ps*ps*sdd+(i0+3)%ps, pD+(ipiv[i0+3])/ps*ps*sdd+(ipiv[i0+3])%ps);
606 drowsw_lib(n-jj-4, pD+(i0+3)/ps*ps*sdd+(i0+3)%ps+(jj+4)*ps, pD+(ipiv[i0+3])/ps*ps*sdd+(ipiv[i0+3])%ps+(jj+4)*ps);
607 }
608 // right block-column
609 ii = i0;
610 kernel_dtrsm_nn_ll_one_4x4_lib4(ii, &pD[ii*sdd], &pD[(jj+4)*ps], sdd, &pD[(jj+4)*ps+ii*sdd], &pD[(jj+4)*ps+ii*sdd], &pD[ii*ps+ii*sdd]);
611 ii += 4;
612 i0 = ii;
613#if defined(TARGET_X64_INTEL_HASWELL) // XXX
614 for( ; ii<m-11; ii+=12)
615 {
616 kernel_dgemm_nn_12x4_lib4((jj+4), &dm1, &pD[ii*sdd], sdd, 0, &pD[(jj+4)*ps], sdd, &d1, &pD[(jj+4)*ps+ii*sdd], sdd, &pD[(jj+4)*ps+ii*sdd], sdd);
617 }
618 if(m-ii>0)
619 {
620 if(m-ii>8)
621 {
622 kernel_dgemm_nn_12x4_vs_lib4((jj+4), &dm1, &pD[ii*sdd], sdd, &pD[(jj+4)*ps], sdd, &d1, &pD[(jj+4)*ps+ii*sdd], sdd, &pD[(jj+4)*ps+ii*sdd], sdd, m-ii, 4);
623 }
624 else if(m-ii>4)
625 {
626 kernel_dgemm_nn_8x4_gen_lib4((jj+4), &dm1, &pD[ii*sdd], sdd, 0, &pD[(jj+4)*ps], sdd, &d1, 0, &pD[(jj+4)*ps+ii*sdd], sdd, 0, &pD[(jj+4)*ps+ii*sdd], sdd, 0, m-ii, 0, 4);
627 }
628 else
629 {
630 kernel_dgemm_nn_4x4_gen_lib4((jj+4), &dm1, &pD[ii*sdd], 0, &pD[(jj+4)*ps], sdd, &d1, 0, &pD[(jj+4)*ps+ii*sdd], sdd, 0, &pD[(jj+4)*ps+ii*sdd], sdd, 0, m-ii, 0, 4);
631 }
632 }
633#else // SANDY_BRIDGE
634 for( ; ii<m-7; ii+=8)
635 {
636 kernel_dgemm_nn_8x4_lib4((jj+4), &dm1, &pD[ii*sdd], sdd, 0, &pD[(jj+4)*ps], sdd, &d1, &pD[(jj+4)*ps+ii*sdd], sdd, &pD[(jj+4)*ps+ii*sdd], sdd);
637 }
638 if(m-ii>0)
639 {
640 if(m-ii>4)
641 {
642 kernel_dgemm_nn_8x4_gen_lib4((jj+4), &dm1, &pD[ii*sdd], sdd, 0, &pD[(jj+4)*ps], sdd, &d1, 0, &pD[(jj+4)*ps+ii*sdd], sdd, 0, &pD[(jj+4)*ps+ii*sdd], sdd, 0, m-ii, 0, 4);
643 }
644 else
645 {
646 kernel_dgemm_nn_4x4_gen_lib4((jj+4), &dm1, &pD[ii*sdd], 0, &pD[(jj+4)*ps], sdd, &d1, 0, &pD[(jj+4)*ps+ii*sdd], sdd, 0, &pD[(jj+4)*ps+ii*sdd], sdd, 0, m-ii, 0, 4);
647 }
648 }
649#endif
650 kernel_dgetrf_pivot_4_lib4(m-i0, &pD[(jj+4)*ps+i0*sdd], sdd, &inv_diag_D[(jj+4)], &ipiv[i0]);
651 ipiv[i0+0] += i0;
652 if(ipiv[i0+0]!=i0+0)
653 {
654 drowsw_lib(jj+4, pD+(i0+0)/ps*ps*sdd+(i0+0)%ps, pD+(ipiv[i0+0])/ps*ps*sdd+(ipiv[i0+0])%ps);
655 drowsw_lib(n-jj-8, pD+(i0+0)/ps*ps*sdd+(i0+0)%ps+(jj+8)*ps, pD+(ipiv[i0+0])/ps*ps*sdd+(ipiv[i0+0])%ps+(jj+8)*ps);
656 }
657 ipiv[i0+1] += i0;
658 if(ipiv[i0+1]!=i0+1)
659 {
660 drowsw_lib(jj+4, pD+(i0+1)/ps*ps*sdd+(i0+1)%ps, pD+(ipiv[i0+1])/ps*ps*sdd+(ipiv[i0+1])%ps);
661 drowsw_lib(n-jj-8, pD+(i0+1)/ps*ps*sdd+(i0+1)%ps+(jj+8)*ps, pD+(ipiv[i0+1])/ps*ps*sdd+(ipiv[i0+1])%ps+(jj+8)*ps);
662 }
663 ipiv[i0+2] += i0;
664 if(ipiv[i0+2]!=i0+2)
665 {
666 drowsw_lib(jj+4, pD+(i0+2)/ps*ps*sdd+(i0+2)%ps, pD+(ipiv[i0+2])/ps*ps*sdd+(ipiv[i0+2])%ps);
667 drowsw_lib(n-jj-8, pD+(i0+2)/ps*ps*sdd+(i0+2)%ps+(jj+8)*ps, pD+(ipiv[i0+2])/ps*ps*sdd+(ipiv[i0+2])%ps+(jj+8)*ps);
668 }
669 ipiv[i0+3] += i0;
670 if(ipiv[i0+3]!=i0+3)
671 {
672 drowsw_lib(jj+4, pD+(i0+3)/ps*ps*sdd+(i0+3)%ps, pD+(ipiv[i0+3])/ps*ps*sdd+(ipiv[i0+3])%ps);
673 drowsw_lib(n-jj-8, pD+(i0+3)/ps*ps*sdd+(i0+3)%ps+(jj+8)*ps, pD+(ipiv[i0+3])/ps*ps*sdd+(ipiv[i0+3])%ps+(jj+8)*ps);
674 }
675
676 // solve upper
677 i0 -= 4;
678 ll = jj+8;
679 for( ; ll<n-3; ll+=4)
680 {
681 kernel_dtrsm_nn_ll_one_8x4_lib4(i0, &pD[i0*sdd], sdd, &pD[ll*ps], sdd, &pD[ll*ps+i0*sdd], sdd, &pD[ll*ps+i0*sdd], sdd, &pD[i0*ps+i0*sdd], sdd);
682 }
683 if(ll<n)
684 {
685 kernel_dtrsm_nn_ll_one_8x4_vs_lib4(i0, &pD[i0*sdd], sdd, &pD[ll*ps], sdd, &pD[ll*ps+i0*sdd], sdd, &pD[ll*ps+i0*sdd], sdd, &pD[i0*ps+i0*sdd], sdd, 8, n-ll);
686 }
687 }
688 if(m>=n)
689 {
690 if(n-jj>0)
691 {
692 if(n-jj<=4) // (m>=1 && n==1) || (m>=2 && n==2) || m>=3 && n==3
693 {
694 goto left_n_4;
695 }
696 else // (m>=5 && n==5) || (m>=6 && n==6) || (m>=7 && n==7)
697 goto left_n_8;
698 }
699 }
700 else // n>m
701 {
702 if(m-jj>0)
703 {
704 if(m-jj<=4) // (m==1 && n>=2) || (m==2 && n>=3) || (m==3 && n>=4) || (m==4 && n>=5)
705 goto left_m_4;
706 else // (m==5 && n>=6) || (m==6 && n>=7) || (m==7 && n>=8)
707 {
708 goto left_m_8;
709 }
710 }
711 }
712#else
713 // 4 columns at a time
714 jj = 0;
715 for(; jj<p-3; jj+=4) // XXX
716 {
717 // pivot & factorize & solve lower
718 ii = jj;
719 i0 = ii;
720#if defined(TARGET_X64_INTEL_HASWELL) // XXX
721 for( ; ii<m-11; ii+=12)
722 {
723 kernel_dgemm_nn_12x4_lib4(jj, &dm1, &pD[ii*sdd], sdd, 0, &pD[jj*ps], sdd, &d1, &pD[jj*ps+ii*sdd], sdd, &pD[jj*ps+ii*sdd], sdd);
724 }
725 if(m-ii>0)
726 {
727 if(m-ii>8)
728 {
729 kernel_dgemm_nn_12x4_vs_lib4(jj, &dm1, &pD[ii*sdd], sdd, &pD[jj*ps], sdd, &d1, &pD[jj*ps+ii*sdd], sdd, &pD[jj*ps+ii*sdd], sdd, m-ii, 4);
730 }
731 else if(m-ii>4)
732 {
733 kernel_dgemm_nn_8x4_gen_lib4(jj, &dm1, &pD[ii*sdd], sdd, 0, &pD[jj*ps], sdd, &d1, 0, &pD[jj*ps+ii*sdd], sdd, 0, &pD[jj*ps+ii*sdd], sdd, 0, m-ii, 0, 4);
734 }
735 else
736 {
737 kernel_dgemm_nn_4x4_gen_lib4(jj, &dm1, &pD[ii*sdd], 0, &pD[jj*ps], sdd, &d1, 0, &pD[jj*ps+ii*sdd], sdd, 0, &pD[jj*ps+ii*sdd], sdd, 0, m-ii, 0, 4);
738 }
739 }
740#elif defined(TARGET_X64_INTEL_SANDY_BRIDGE) // XXX
741 for( ; ii<m-7; ii+=8)
742 {
743 kernel_dgemm_nn_8x4_lib4(jj, &dm1, &pD[ii*sdd], sdd, 0, &pD[jj*ps], sdd, &d1, &pD[jj*ps+ii*sdd], sdd, &pD[jj*ps+ii*sdd], sdd);
744 }
745 if(m-ii>0)
746 {
747 if(m-ii>4)
748 {
749 kernel_dgemm_nn_8x4_gen_lib4(jj, &dm1, &pD[ii*sdd], sdd, 0, &pD[jj*ps], sdd, &d1, 0, &pD[jj*ps+ii*sdd], sdd, 0, &pD[jj*ps+ii*sdd], sdd, 0, m-ii, 0, 4);
750 }
751 else
752 {
753 kernel_dgemm_nn_4x4_gen_lib4(jj, &dm1, &pD[ii*sdd], 0, &pD[jj*ps], sdd, &d1, 0, &pD[jj*ps+ii*sdd], sdd, 0, &pD[jj*ps+ii*sdd], sdd, 0, m-ii, 0, 4);
754 }
755 }
756#else
757 for( ; ii<m-3; ii+=4)
758 {
759 kernel_dgemm_nn_4x4_lib4(jj, &dm1, &pD[ii*sdd], 0, &pD[jj*ps], sdd, &d1, &pD[jj*ps+ii*sdd], &pD[jj*ps+ii*sdd]);
760 }
761 if(m-ii>0)
762 {
763 kernel_dgemm_nn_4x4_gen_lib4(jj, &dm1, &pD[ii*sdd], 0, &pD[jj*ps], sdd, &d1, 0, &pD[jj*ps+ii*sdd], sdd, 0, &pD[jj*ps+ii*sdd], sdd, 0, m-ii, 0, 4);
764 }
765#endif
766 kernel_dgetrf_pivot_4_lib4(m-i0, &pD[jj*ps+i0*sdd], sdd, &inv_diag_D[jj], &ipiv[i0]);
767 ipiv[i0+0] += i0;
768 if(ipiv[i0+0]!=i0+0)
769 {
770 drowsw_lib(jj, pD+(i0+0)/ps*ps*sdd+(i0+0)%ps, pD+(ipiv[i0+0])/ps*ps*sdd+(ipiv[i0+0])%ps);
771 drowsw_lib(n-jj-4, pD+(i0+0)/ps*ps*sdd+(i0+0)%ps+(jj+4)*ps, pD+(ipiv[i0+0])/ps*ps*sdd+(ipiv[i0+0])%ps+(jj+4)*ps);
772 }
773 ipiv[i0+1] += i0;
774 if(ipiv[i0+1]!=i0+1)
775 {
776 drowsw_lib(jj, pD+(i0+1)/ps*ps*sdd+(i0+1)%ps, pD+(ipiv[i0+1])/ps*ps*sdd+(ipiv[i0+1])%ps);
777 drowsw_lib(n-jj-4, pD+(i0+1)/ps*ps*sdd+(i0+1)%ps+(jj+4)*ps, pD+(ipiv[i0+1])/ps*ps*sdd+(ipiv[i0+1])%ps+(jj+4)*ps);
778 }
779 ipiv[i0+2] += i0;
780 if(ipiv[i0+2]!=i0+2)
781 {
782 drowsw_lib(jj, pD+(i0+2)/ps*ps*sdd+(i0+2)%ps, pD+(ipiv[i0+2])/ps*ps*sdd+(ipiv[i0+2])%ps);
783 drowsw_lib(n-jj-4, pD+(i0+2)/ps*ps*sdd+(i0+2)%ps+(jj+4)*ps, pD+(ipiv[i0+2])/ps*ps*sdd+(ipiv[i0+2])%ps+(jj+4)*ps);
784 }
785 ipiv[i0+3] += i0;
786 if(ipiv[i0+3]!=i0+3)
787 {
788 drowsw_lib(jj, pD+(i0+3)/ps*ps*sdd+(i0+3)%ps, pD+(ipiv[i0+3])/ps*ps*sdd+(ipiv[i0+3])%ps);
789 drowsw_lib(n-jj-4, pD+(i0+3)/ps*ps*sdd+(i0+3)%ps+(jj+4)*ps, pD+(ipiv[i0+3])/ps*ps*sdd+(ipiv[i0+3])%ps+(jj+4)*ps);
790 }
791
792 // solve upper
793 ll = jj+4;
794 for( ; ll<n-3; ll+=4)
795 {
796 kernel_dtrsm_nn_ll_one_4x4_lib4(i0, &pD[i0*sdd], &pD[ll*ps], sdd, &pD[ll*ps+i0*sdd], &pD[ll*ps+i0*sdd], &pD[i0*ps+i0*sdd]);
797 }
798 if(n-ll>0)
799 {
800 kernel_dtrsm_nn_ll_one_4x4_vs_lib4(i0, &pD[i0*sdd], &pD[ll*ps], sdd, &pD[ll*ps+i0*sdd], &pD[ll*ps+i0*sdd], &pD[i0*ps+i0*sdd], 4, n-ll);
801 }
802 }
803 if(m>=n)
804 {
805 if(n-jj>0)
806 {
807 goto left_n_4;
808 }
809 }
810 else
811 {
812 if(m-jj>0)
813 {
814 goto left_m_4;
815 }
816 }
817#endif
818
819 // common return if jj==n
820 return;
821
822
823 // clean up
824#if defined(TARGET_X64_INTEL_HASWELL)
825 left_n_12:
826 // 9-12 columns at a time
827 // pivot & factorize & solve lower
828 // left block-column
829 ii = jj;
830 i0 = ii;
831 for( ; ii<m-8; ii+=12)
832 {
833 kernel_dgemm_nn_12x4_vs_lib4(jj, &dm1, &pD[ii*sdd], sdd, &pD[jj*ps], sdd, &d1, &pD[jj*ps+ii*sdd], sdd, &pD[jj*ps+ii*sdd], sdd, m-ii, 4);
834 }
835 if(m-ii>4)
836 {
837 kernel_dgemm_nn_8x4_gen_lib4(jj, &dm1, &pD[ii*sdd], sdd, 0, &pD[jj*ps], sdd, &d1, 0, &pD[jj*ps+ii*sdd], sdd, 0, &pD[jj*ps+ii*sdd], sdd, 0, m-ii, 0, 4);
838// ii+=8;
839 }
840 else if(m-ii>0)
841 {
842 kernel_dgemm_nn_4x4_gen_lib4(jj, &dm1, &pD[ii*sdd], 0, &pD[jj*ps], sdd, &d1, 0, &pD[jj*ps+ii*sdd], sdd, 0, &pD[jj*ps+ii*sdd], sdd, 0, m-ii, 0, 4);
843// ii+=4;
844 }
845 kernel_dgetrf_pivot_4_lib4(m-i0, &pD[jj*ps+i0*sdd], sdd, &inv_diag_D[jj], &ipiv[i0]);
846 ipiv[i0+0] += i0;
847 if(ipiv[i0+0]!=i0+0)
848 {
849 drowsw_lib(jj, pD+(i0+0)/ps*ps*sdd+(i0+0)%ps, pD+(ipiv[i0+0])/ps*ps*sdd+(ipiv[i0+0])%ps);
850 drowsw_lib(n-jj-4, pD+(i0+0)/ps*ps*sdd+(i0+0)%ps+(jj+4)*ps, pD+(ipiv[i0+0])/ps*ps*sdd+(ipiv[i0+0])%ps+(jj+4)*ps);
851 }
852 ipiv[i0+1] += i0;
853 if(ipiv[i0+1]!=i0+1)
854 {
855 drowsw_lib(jj, pD+(i0+1)/ps*ps*sdd+(i0+1)%ps, pD+(ipiv[i0+1])/ps*ps*sdd+(ipiv[i0+1])%ps);
856 drowsw_lib(n-jj-4, pD+(i0+1)/ps*ps*sdd+(i0+1)%ps+(jj+4)*ps, pD+(ipiv[i0+1])/ps*ps*sdd+(ipiv[i0+1])%ps+(jj+4)*ps);
857 }
858 ipiv[i0+2] += i0;
859 if(ipiv[i0+2]!=i0+2)
860 {
861 drowsw_lib(jj, pD+(i0+2)/ps*ps*sdd+(i0+2)%ps, pD+(ipiv[i0+2])/ps*ps*sdd+(ipiv[i0+2])%ps);
862 drowsw_lib(n-jj-4, pD+(i0+2)/ps*ps*sdd+(i0+2)%ps+(jj+4)*ps, pD+(ipiv[i0+2])/ps*ps*sdd+(ipiv[i0+2])%ps+(jj+4)*ps);
863 }
864 ipiv[i0+3] += i0;
865 if(ipiv[i0+3]!=i0+3)
866 {
867 drowsw_lib(jj, pD+(i0+3)/ps*ps*sdd+(i0+3)%ps, pD+(ipiv[i0+3])/ps*ps*sdd+(ipiv[i0+3])%ps);
868 drowsw_lib(n-jj-4, pD+(i0+3)/ps*ps*sdd+(i0+3)%ps+(jj+4)*ps, pD+(ipiv[i0+3])/ps*ps*sdd+(ipiv[i0+3])%ps+(jj+4)*ps);
869 }
870 // middle block-column
871 ii = i0;
872 kernel_dtrsm_nn_ll_one_4x4_vs_lib4(ii, &pD[ii*sdd], &pD[(jj+4)*ps], sdd, &pD[(jj+4)*ps+ii*sdd], &pD[(jj+4)*ps+ii*sdd], &pD[ii*ps+ii*sdd], 4, n-jj-4);
873 ii += 4;
874 i1 = ii;
875 for( ; ii<m-8; ii+=12)
876 {
877 kernel_dgemm_nn_12x4_vs_lib4((jj+4), &dm1, &pD[ii*sdd], sdd, &pD[(jj+4)*ps], sdd, &d1, &pD[(jj+4)*ps+ii*sdd], sdd, &pD[(jj+4)*ps+ii*sdd], sdd, m-ii, n-jj-4);
878 }
879 if(m-ii>4)
880 {
881 kernel_dgemm_nn_8x4_gen_lib4((jj+4), &dm1, &pD[ii*sdd], sdd, 0, &pD[(jj+4)*ps], sdd, &d1, 0, &pD[(jj+4)*ps+ii*sdd], sdd, 0, &pD[(jj+4)*ps+ii*sdd], sdd, 0, m-ii, 0, n-jj-4);
882 }
883 else if(m-ii>0)
884 {
885 kernel_dgemm_nn_4x4_gen_lib4((jj+4), &dm1, &pD[ii*sdd], 0, &pD[(jj+4)*ps], sdd, &d1, 0, &pD[(jj+4)*ps+ii*sdd], sdd, 0, &pD[(jj+4)*ps+ii*sdd], sdd, 0, m-ii, 0, n-jj-4);
886 }
887 kernel_dgetrf_pivot_4_vs_lib4(m-i1, n-jj-4, &pD[(jj+4)*ps+i1*sdd], sdd, &inv_diag_D[(jj+4)], &ipiv[i1]);
888 ipiv[i1+0] += i1;
889 if(ipiv[i1+0]!=i1+0)
890 {
891 drowsw_lib(jj+4, pD+(i1+0)/ps*ps*sdd+(i1+0)%ps, pD+(ipiv[i1+0])/ps*ps*sdd+(ipiv[i1+0])%ps);
892 drowsw_lib(n-jj-8, pD+(i1+0)/ps*ps*sdd+(i1+0)%ps+(jj+8)*ps, pD+(ipiv[i1+0])/ps*ps*sdd+(ipiv[i1+0])%ps+(jj+8)*ps);
893 }
894 if(n-jj-4>1)
895 {
896 ipiv[i1+1] += i1;
897 if(ipiv[i1+1]!=i1+1)
898 {
899 drowsw_lib(jj+4, pD+(i1+1)/ps*ps*sdd+(i1+1)%ps, pD+(ipiv[i1+1])/ps*ps*sdd+(ipiv[i1+1])%ps);
900 drowsw_lib(n-jj-8, pD+(i1+1)/ps*ps*sdd+(i1+1)%ps+(jj+8)*ps, pD+(ipiv[i1+1])/ps*ps*sdd+(ipiv[i1+1])%ps+(jj+8)*ps);
901 }
902 if(n-jj-4>2)
903 {
904 ipiv[i1+2] += i1;
905 if(ipiv[i1+2]!=i1+2)
906 {
907 drowsw_lib(jj+4, pD+(i1+2)/ps*ps*sdd+(i1+2)%ps, pD+(ipiv[i1+2])/ps*ps*sdd+(ipiv[i1+2])%ps);
908 drowsw_lib(n-jj-8, pD+(i1+2)/ps*ps*sdd+(i1+2)%ps+(jj+8)*ps, pD+(ipiv[i1+2])/ps*ps*sdd+(ipiv[i1+2])%ps+(jj+8)*ps);
909 }
910 if(n-jj-4>3)
911 {
912 ipiv[i1+3] += i1;
913 if(ipiv[i1+3]!=i1+3)
914 {
915 drowsw_lib(jj+4, pD+(i1+3)/ps*ps*sdd+(i1+3)%ps, pD+(ipiv[i1+3])/ps*ps*sdd+(ipiv[i1+3])%ps);
916 drowsw_lib(n-jj-8, pD+(i1+3)/ps*ps*sdd+(i1+3)%ps+(jj+8)*ps, pD+(ipiv[i1+3])/ps*ps*sdd+(ipiv[i1+3])%ps+(jj+8)*ps);
917 }
918 }
919 }
920 }
921 // right block-column
922 ii = i0;
923 kernel_dtrsm_nn_ll_one_8x4_vs_lib4(ii, &pD[ii*sdd], sdd, &pD[(jj+8)*ps], sdd, &pD[(jj+8)*ps+ii*sdd], sdd, &pD[(jj+8)*ps+ii*sdd], sdd, &pD[ii*ps+ii*sdd], sdd, 8, n-jj-8);
924 ii += 8;
925 i1 = ii;
926 for( ; ii<m-8; ii+=12)
927 {
928 kernel_dgemm_nn_12x4_vs_lib4((jj+8), &dm1, &pD[ii*sdd], sdd, &pD[(jj+8)*ps], sdd, &d1, &pD[(jj+8)*ps+ii*sdd], sdd, &pD[(jj+8)*ps+ii*sdd], sdd, m-ii, n-jj-8);
929 }
930 if(m-ii>4)
931 {
932 kernel_dgemm_nn_8x4_gen_lib4((jj+8), &dm1, &pD[ii*sdd], sdd, 0, &pD[(jj+8)*ps], sdd, &d1, 0, &pD[(jj+8)*ps+ii*sdd], sdd, 0, &pD[(jj+8)*ps+ii*sdd], sdd, 0, m-ii, 0, n-jj-8);
933 }
934 else if(m-ii>0)
935 {
936 kernel_dgemm_nn_4x4_gen_lib4((jj+8), &dm1, &pD[ii*sdd], 0, &pD[(jj+8)*ps], sdd, &d1, 0, &pD[(jj+8)*ps+ii*sdd], sdd, 0, &pD[(jj+8)*ps+ii*sdd], sdd, 0, m-ii, 0, n-jj-8);
937 }
938 kernel_dgetrf_pivot_4_vs_lib4(m-i1, n-jj-8, &pD[(jj+8)*ps+i1*sdd], sdd, &inv_diag_D[(jj+8)], &ipiv[i1]);
939 ipiv[i1+0] += i1;
940 if(ipiv[i1+0]!=i1+0)
941 {
942 drowsw_lib(jj+8, pD+(i1+0)/ps*ps*sdd+(i1+0)%ps, pD+(ipiv[i1+0])/ps*ps*sdd+(ipiv[i1+0])%ps);
943 drowsw_lib(n-jj-12, pD+(i1+0)/ps*ps*sdd+(i1+0)%ps+(jj+12)*ps, pD+(ipiv[i1+0])/ps*ps*sdd+(ipiv[i1+0])%ps+(jj+12)*ps);
944 }
945 if(n-jj-8>1)
946 {
947 ipiv[i1+1] += i1;
948 if(ipiv[i1+1]!=i1+1)
949 {
950 drowsw_lib(jj+8, pD+(i1+1)/ps*ps*sdd+(i1+1)%ps, pD+(ipiv[i1+1])/ps*ps*sdd+(ipiv[i1+1])%ps);
951 drowsw_lib(n-jj-12, pD+(i1+1)/ps*ps*sdd+(i1+1)%ps+(jj+12)*ps, pD+(ipiv[i1+1])/ps*ps*sdd+(ipiv[i1+1])%ps+(jj+12)*ps);
952 }
953 if(n-jj-8>2)
954 {
955 ipiv[i1+2] += i1;
956 if(ipiv[i1+2]!=i1+2)
957 {
958 drowsw_lib(jj+8, pD+(i1+2)/ps*ps*sdd+(i1+2)%ps, pD+(ipiv[i1+2])/ps*ps*sdd+(ipiv[i1+2])%ps);
959 drowsw_lib(n-jj-12, pD+(i1+2)/ps*ps*sdd+(i1+2)%ps+(jj+12)*ps, pD+(ipiv[i1+2])/ps*ps*sdd+(ipiv[i1+2])%ps+(jj+12)*ps);
960 }
961 if(n-jj-8>3)
962 {
963 ipiv[i1+3] += i1;
964 if(ipiv[i1+3]!=i1+3)
965 {
966 drowsw_lib(jj+8, pD+(i1+3)/ps*ps*sdd+(i1+3)%ps, pD+(ipiv[i1+3])/ps*ps*sdd+(ipiv[i1+3])%ps);
967 drowsw_lib(n-jj-12, pD+(i1+3)/ps*ps*sdd+(i1+3)%ps+(jj+12)*ps, pD+(ipiv[i1+3])/ps*ps*sdd+(ipiv[i1+3])%ps+(jj+12)*ps);
968 }
969 }
970 }
971 }
972
973 // solve upper
974 // there is no upper
975 return;
976#endif
977
978
979#if defined(TARGET_X64_INTEL_HASWELL)
980 left_m_12:
981 // 9-12 rows at a time
982 // pivot & factorize & solve lower
983 // left block-column
984 ii = jj;
985 i0 = ii;
986 kernel_dgemm_nn_12x4_vs_lib4(jj, &dm1, &pD[ii*sdd], sdd, &pD[jj*ps], sdd, &d1, &pD[jj*ps+ii*sdd], sdd, &pD[jj*ps+ii*sdd], sdd, m-ii, 4);
987 kernel_dgetrf_pivot_4_lib4(m-i0, &pD[jj*ps+i0*sdd], sdd, &inv_diag_D[jj], &ipiv[i0]);
988 ipiv[i0+0] += i0;
989 if(ipiv[i0+0]!=i0+0)
990 {
991 drowsw_lib(jj, pD+(i0+0)/ps*ps*sdd+(i0+0)%ps, pD+(ipiv[i0+0])/ps*ps*sdd+(ipiv[i0+0])%ps);
992 drowsw_lib(n-jj-4, pD+(i0+0)/ps*ps*sdd+(i0+0)%ps+(jj+4)*ps, pD+(ipiv[i0+0])/ps*ps*sdd+(ipiv[i0+0])%ps+(jj+4)*ps);
993 }
994 ipiv[i0+1] += i0;
995 if(ipiv[i0+1]!=i0+1)
996 {
997 drowsw_lib(jj, pD+(i0+1)/ps*ps*sdd+(i0+1)%ps, pD+(ipiv[i0+1])/ps*ps*sdd+(ipiv[i0+1])%ps);
998 drowsw_lib(n-jj-4, pD+(i0+1)/ps*ps*sdd+(i0+1)%ps+(jj+4)*ps, pD+(ipiv[i0+1])/ps*ps*sdd+(ipiv[i0+1])%ps+(jj+4)*ps);
999 }
1000 ipiv[i0+2] += i0;
1001 if(ipiv[i0+2]!=i0+2)
1002 {
1003 drowsw_lib(jj, pD+(i0+2)/ps*ps*sdd+(i0+2)%ps, pD+(ipiv[i0+2])/ps*ps*sdd+(ipiv[i0+2])%ps);
1004 drowsw_lib(n-jj-4, pD+(i0+2)/ps*ps*sdd+(i0+2)%ps+(jj+4)*ps, pD+(ipiv[i0+2])/ps*ps*sdd+(ipiv[i0+2])%ps+(jj+4)*ps);
1005 }
1006 ipiv[i0+3] += i0;
1007 if(ipiv[i0+3]!=i0+3)
1008 {
1009 drowsw_lib(jj, pD+(i0+3)/ps*ps*sdd+(i0+3)%ps, pD+(ipiv[i0+3])/ps*ps*sdd+(ipiv[i0+3])%ps);
1010 drowsw_lib(n-jj-4, pD+(i0+3)/ps*ps*sdd+(i0+3)%ps+(jj+4)*ps, pD+(ipiv[i0+3])/ps*ps*sdd+(ipiv[i0+3])%ps+(jj+4)*ps);
1011 }
1012 // middle block-column
1013 ii = i0;
1014 kernel_dtrsm_nn_ll_one_4x4_vs_lib4(ii, &pD[ii*sdd], &pD[(jj+4)*ps], sdd, &pD[(jj+4)*ps+ii*sdd], &pD[(jj+4)*ps+ii*sdd], &pD[ii*ps+ii*sdd], 4, n-jj-4);
1015 ii += 4;
1016 i1 = ii;
1017 kernel_dgemm_nn_8x4_gen_lib4((jj+4), &dm1, &pD[ii*sdd], sdd, 0, &pD[(jj+4)*ps], sdd, &d1, 0, &pD[(jj+4)*ps+ii*sdd], sdd, 0, &pD[(jj+4)*ps+ii*sdd], sdd, 0, m-ii, 0, n-jj-4);
1018 kernel_dgetrf_pivot_4_vs_lib4(m-i1, n-jj-4, &pD[(jj+4)*ps+i1*sdd], sdd, &inv_diag_D[(jj+4)], &ipiv[i1]);
1019 ipiv[i1+0] += i1;
1020 if(ipiv[i1+0]!=i1+0)
1021 {
1022 drowsw_lib(jj+4, pD+(i1+0)/ps*ps*sdd+(i1+0)%ps, pD+(ipiv[i1+0])/ps*ps*sdd+(ipiv[i1+0])%ps);
1023 drowsw_lib(n-jj-8, pD+(i1+0)/ps*ps*sdd+(i1+0)%ps+(jj+8)*ps, pD+(ipiv[i1+0])/ps*ps*sdd+(ipiv[i1+0])%ps+(jj+8)*ps);
1024 }
1025 if(m-jj-4>1)
1026 {
1027 ipiv[i1+1] += i1;
1028 if(ipiv[i1+1]!=i1+1)
1029 {
1030 drowsw_lib(jj+4, pD+(i1+1)/ps*ps*sdd+(i1+1)%ps, pD+(ipiv[i1+1])/ps*ps*sdd+(ipiv[i1+1])%ps);
1031 drowsw_lib(n-jj-8, pD+(i1+1)/ps*ps*sdd+(i1+1)%ps+(jj+8)*ps, pD+(ipiv[i1+1])/ps*ps*sdd+(ipiv[i1+1])%ps+(jj+8)*ps);
1032 }
1033 if(m-jj-4>2)
1034 {
1035 ipiv[i1+2] += i1;
1036 if(ipiv[i1+2]!=i1+2)
1037 {
1038 drowsw_lib(jj+4, pD+(i1+2)/ps*ps*sdd+(i1+2)%ps, pD+(ipiv[i1+2])/ps*ps*sdd+(ipiv[i1+2])%ps);
1039 drowsw_lib(n-jj-8, pD+(i1+2)/ps*ps*sdd+(i1+2)%ps+(jj+8)*ps, pD+(ipiv[i1+2])/ps*ps*sdd+(ipiv[i1+2])%ps+(jj+8)*ps);
1040 }
1041 if(m-jj-4>3)
1042 {
1043 ipiv[i1+3] += i1;
1044 if(ipiv[i1+3]!=i1+3)
1045 {
1046 drowsw_lib(jj+4, pD+(i1+3)/ps*ps*sdd+(i1+3)%ps, pD+(ipiv[i1+3])/ps*ps*sdd+(ipiv[i1+3])%ps);
1047 drowsw_lib(n-jj-8, pD+(i1+3)/ps*ps*sdd+(i1+3)%ps+(jj+8)*ps, pD+(ipiv[i1+3])/ps*ps*sdd+(ipiv[i1+3])%ps+(jj+8)*ps);
1048 }
1049 }
1050 }
1051 }
1052 // right block-column
1053 ii = i0;
1054 kernel_dtrsm_nn_ll_one_8x4_vs_lib4(ii, &pD[ii*sdd], sdd, &pD[(jj+8)*ps], sdd, &pD[(jj+8)*ps+ii*sdd], sdd, &pD[(jj+8)*ps+ii*sdd], sdd, &pD[ii*ps+ii*sdd], sdd, 8, n-jj-8);
1055 ii += 8;
1056 i1 = ii;
1057 kernel_dgemm_nn_4x4_gen_lib4((jj+8), &dm1, &pD[ii*sdd], 0, &pD[(jj+8)*ps], sdd, &d1, 0, &pD[(jj+8)*ps+ii*sdd], sdd, 0, &pD[(jj+8)*ps+ii*sdd], sdd, 0, m-ii, 0, n-jj-8);
1058 kernel_dgetrf_pivot_4_vs_lib4(m-i1, n-jj-8, &pD[(jj+8)*ps+i1*sdd], sdd, &inv_diag_D[(jj+8)], &ipiv[i1]);
1059 ipiv[i1+0] += i1;
1060 if(ipiv[i1+0]!=i1+0)
1061 {
1062 drowsw_lib(jj+8, pD+(i1+0)/ps*ps*sdd+(i1+0)%ps, pD+(ipiv[i1+0])/ps*ps*sdd+(ipiv[i1+0])%ps);
1063 drowsw_lib(n-jj-12, pD+(i1+0)/ps*ps*sdd+(i1+0)%ps+(jj+12)*ps, pD+(ipiv[i1+0])/ps*ps*sdd+(ipiv[i1+0])%ps+(jj+12)*ps);
1064 }
1065 if(m-jj-8>1)
1066 {
1067 ipiv[i1+1] += i1;
1068 if(ipiv[i1+1]!=i1+1)
1069 {
1070 drowsw_lib(jj+8, pD+(i1+1)/ps*ps*sdd+(i1+1)%ps, pD+(ipiv[i1+1])/ps*ps*sdd+(ipiv[i1+1])%ps);
1071 drowsw_lib(n-jj-12, pD+(i1+1)/ps*ps*sdd+(i1+1)%ps+(jj+12)*ps, pD+(ipiv[i1+1])/ps*ps*sdd+(ipiv[i1+1])%ps+(jj+12)*ps);
1072 }
1073 if(m-jj-8>2)
1074 {
1075 ipiv[i1+2] += i1;
1076 if(ipiv[i1+2]!=i1+2)
1077 {
1078 drowsw_lib(jj+8, pD+(i1+2)/ps*ps*sdd+(i1+2)%ps, pD+(ipiv[i1+2])/ps*ps*sdd+(ipiv[i1+2])%ps);
1079 drowsw_lib(n-jj-12, pD+(i1+2)/ps*ps*sdd+(i1+2)%ps+(jj+12)*ps, pD+(ipiv[i1+2])/ps*ps*sdd+(ipiv[i1+2])%ps+(jj+12)*ps);
1080 }
1081 if(m-jj-8>3)
1082 {
1083 ipiv[i1+3] += i1;
1084 if(ipiv[i1+3]!=i1+3)
1085 {
1086 drowsw_lib(jj+8, pD+(i1+3)/ps*ps*sdd+(i1+3)%ps, pD+(ipiv[i1+3])/ps*ps*sdd+(ipiv[i1+3])%ps);
1087 drowsw_lib(n-jj-12, pD+(i1+3)/ps*ps*sdd+(i1+3)%ps+(jj+12)*ps, pD+(ipiv[i1+3])/ps*ps*sdd+(ipiv[i1+3])%ps+(jj+12)*ps);
1088 }
1089 }
1090 }
1091 }
1092
1093 // solve upper
1094// i0 -= 8;
1095 ll = jj+12;
1096 for( ; ll<n; ll+=4)
1097 {
1098 kernel_dtrsm_nn_ll_one_12x4_vs_lib4(i0, &pD[i0*sdd], sdd, &pD[ll*ps], sdd, &pD[ll*ps+i0*sdd], sdd, &pD[ll*ps+i0*sdd], sdd, &pD[i0*ps+i0*sdd], sdd, m-i0, n-ll);
1099 }
1100 return;
1101#endif
1102
1103
1104#if defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE)
1105 left_n_8:
1106 // 5-8 columns at a time
1107 // pivot & factorize & solve lower
1108 // left block-column
1109 ii = jj;
1110 i0 = ii;
1111 for( ; ii<m-4; ii+=8)
1112 {
1113 kernel_dgemm_nn_8x4_gen_lib4(jj, &dm1, &pD[ii*sdd], sdd, 0, &pD[jj*ps], sdd, &d1, 0, &pD[jj*ps+ii*sdd], sdd, 0, &pD[jj*ps+ii*sdd], sdd, 0, m-ii, 0, 4);
1114 }
1115 if(m-ii>0)
1116 {
1117 kernel_dgemm_nn_4x4_gen_lib4(jj, &dm1, &pD[ii*sdd], 0, &pD[jj*ps], sdd, &d1, 0, &pD[jj*ps+ii*sdd], sdd, 0, &pD[jj*ps+ii*sdd], sdd, 0, m-ii, 0, 4);
1118// ii+=4;
1119 }
1120 kernel_dgetrf_pivot_4_lib4(m-i0, &pD[jj*ps+i0*sdd], sdd, &inv_diag_D[jj], &ipiv[i0]);
1121 ipiv[i0+0] += i0;
1122 if(ipiv[i0+0]!=i0+0)
1123 {
1124 drowsw_lib(jj, pD+(i0+0)/ps*ps*sdd+(i0+0)%ps, pD+(ipiv[i0+0])/ps*ps*sdd+(ipiv[i0+0])%ps);
1125 drowsw_lib(n-jj-4, pD+(i0+0)/ps*ps*sdd+(i0+0)%ps+(jj+4)*ps, pD+(ipiv[i0+0])/ps*ps*sdd+(ipiv[i0+0])%ps+(jj+4)*ps);
1126 }
1127 ipiv[i0+1] += i0;
1128 if(ipiv[i0+1]!=i0+1)
1129 {
1130 drowsw_lib(jj, pD+(i0+1)/ps*ps*sdd+(i0+1)%ps, pD+(ipiv[i0+1])/ps*ps*sdd+(ipiv[i0+1])%ps);
1131 drowsw_lib(n-jj-4, pD+(i0+1)/ps*ps*sdd+(i0+1)%ps+(jj+4)*ps, pD+(ipiv[i0+1])/ps*ps*sdd+(ipiv[i0+1])%ps+(jj+4)*ps);
1132 }
1133 ipiv[i0+2] += i0;
1134 if(ipiv[i0+2]!=i0+2)
1135 {
1136 drowsw_lib(jj, pD+(i0+2)/ps*ps*sdd+(i0+2)%ps, pD+(ipiv[i0+2])/ps*ps*sdd+(ipiv[i0+2])%ps);
1137 drowsw_lib(n-jj-4, pD+(i0+2)/ps*ps*sdd+(i0+2)%ps+(jj+4)*ps, pD+(ipiv[i0+2])/ps*ps*sdd+(ipiv[i0+2])%ps+(jj+4)*ps);
1138 }
1139 ipiv[i0+3] += i0;
1140 if(ipiv[i0+3]!=i0+3)
1141 {
1142 drowsw_lib(jj, pD+(i0+3)/ps*ps*sdd+(i0+3)%ps, pD+(ipiv[i0+3])/ps*ps*sdd+(ipiv[i0+3])%ps);
1143 drowsw_lib(n-jj-4, pD+(i0+3)/ps*ps*sdd+(i0+3)%ps+(jj+4)*ps, pD+(ipiv[i0+3])/ps*ps*sdd+(ipiv[i0+3])%ps+(jj+4)*ps);
1144 }
1145 // right block-column
1146 ii = i0;
1147 kernel_dtrsm_nn_ll_one_4x4_vs_lib4(ii, &pD[ii*sdd], &pD[(jj+4)*ps], sdd, &pD[(jj+4)*ps+ii*sdd], &pD[(jj+4)*ps+ii*sdd], &pD[ii*ps+ii*sdd], 4, n-jj-4);
1148 ii += 4;
1149 i0 = ii;
1150 for( ; ii<m-4; ii+=8)
1151 {
1152 kernel_dgemm_nn_8x4_gen_lib4((jj+4), &dm1, &pD[ii*sdd], sdd, 0, &pD[(jj+4)*ps], sdd, &d1, 0, &pD[(jj+4)*ps+ii*sdd], sdd, 0, &pD[(jj+4)*ps+ii*sdd], 0, sdd, m-ii, 0, n-jj-4);
1153 }
1154 if(m-ii>0)
1155 {
1156 kernel_dgemm_nn_4x4_gen_lib4((jj+4), &dm1, &pD[ii*sdd], 0, &pD[(jj+4)*ps], sdd, &d1, 0, &pD[(jj+4)*ps+ii*sdd], sdd, 0, &pD[(jj+4)*ps+ii*sdd], sdd, 0, m-ii, 0, n-jj-4);
1157 }
1158 kernel_dgetrf_pivot_4_vs_lib4(m-i0, n-jj-4, &pD[(jj+4)*ps+i0*sdd], sdd, &inv_diag_D[(jj+4)], &ipiv[i0]);
1159 ipiv[i0+0] += i0;
1160 if(ipiv[i0+0]!=i0+0)
1161 {
1162 drowsw_lib(jj+4, pD+(i0+0)/ps*ps*sdd+(i0+0)%ps, pD+(ipiv[i0+0])/ps*ps*sdd+(ipiv[i0+0])%ps);
1163 drowsw_lib(n-jj-8, pD+(i0+0)/ps*ps*sdd+(i0+0)%ps+(jj+8)*ps, pD+(ipiv[i0+0])/ps*ps*sdd+(ipiv[i0+0])%ps+(jj+8)*ps);
1164 }
1165 if(n-jj-4>1)
1166 {
1167 ipiv[i0+1] += i0;
1168 if(ipiv[i0+1]!=i0+1)
1169 {
1170 drowsw_lib(jj+4, pD+(i0+1)/ps*ps*sdd+(i0+1)%ps, pD+(ipiv[i0+1])/ps*ps*sdd+(ipiv[i0+1])%ps);
1171 drowsw_lib(n-jj-8, pD+(i0+1)/ps*ps*sdd+(i0+1)%ps+(jj+8)*ps, pD+(ipiv[i0+1])/ps*ps*sdd+(ipiv[i0+1])%ps+(jj+8)*ps);
1172 }
1173 if(n-jj-4>2)
1174 {
1175 ipiv[i0+2] += i0;
1176 if(ipiv[i0+2]!=i0+2)
1177 {
1178 drowsw_lib(jj+4, pD+(i0+2)/ps*ps*sdd+(i0+2)%ps, pD+(ipiv[i0+2])/ps*ps*sdd+(ipiv[i0+2])%ps);
1179 drowsw_lib(n-jj-8, pD+(i0+2)/ps*ps*sdd+(i0+2)%ps+(jj+8)*ps, pD+(ipiv[i0+2])/ps*ps*sdd+(ipiv[i0+2])%ps+(jj+8)*ps);
1180 }
1181 if(n-jj-4>3)
1182 {
1183 ipiv[i0+3] += i0;
1184 if(ipiv[i0+3]!=i0+3)
1185 {
1186 drowsw_lib(jj+4, pD+(i0+3)/ps*ps*sdd+(i0+3)%ps, pD+(ipiv[i0+3])/ps*ps*sdd+(ipiv[i0+3])%ps);
1187 drowsw_lib(n-jj-8, pD+(i0+3)/ps*ps*sdd+(i0+3)%ps+(jj+8)*ps, pD+(ipiv[i0+3])/ps*ps*sdd+(ipiv[i0+3])%ps+(jj+8)*ps);
1188 }
1189 }
1190 }
1191 }
1192
1193 // solve upper
1194 // there is no upper
1195 return;
1196#endif
1197
1198
1199#if defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE)
1200 left_m_8:
1201 // 5-8 rows at a time
1202 // pivot & factorize & solve lower
1203 // left block-column
1204 ii = jj;
1205 i0 = ii;
1206 kernel_dgemm_nn_8x4_gen_lib4(jj, &dm1, &pD[ii*sdd], sdd, 0, &pD[jj*ps], sdd, &d1, 0, &pD[jj*ps+ii*sdd], sdd, 0, &pD[jj*ps+ii*sdd], sdd, 0, m-ii, 0, 4);
1207 kernel_dgetrf_pivot_4_lib4(m-i0, &pD[jj*ps+i0*sdd], sdd, &inv_diag_D[jj], &ipiv[i0]);
1208 ipiv[i0+0] += i0;
1209 if(ipiv[i0+0]!=i0+0)
1210 {
1211 drowsw_lib(jj, pD+(i0+0)/ps*ps*sdd+(i0+0)%ps, pD+(ipiv[i0+0])/ps*ps*sdd+(ipiv[i0+0])%ps);
1212 drowsw_lib(n-jj-4, pD+(i0+0)/ps*ps*sdd+(i0+0)%ps+(jj+4)*ps, pD+(ipiv[i0+0])/ps*ps*sdd+(ipiv[i0+0])%ps+(jj+4)*ps);
1213 }
1214 ipiv[i0+1] += i0;
1215 if(ipiv[i0+1]!=i0+1)
1216 {
1217 drowsw_lib(jj, pD+(i0+1)/ps*ps*sdd+(i0+1)%ps, pD+(ipiv[i0+1])/ps*ps*sdd+(ipiv[i0+1])%ps);
1218 drowsw_lib(n-jj-4, pD+(i0+1)/ps*ps*sdd+(i0+1)%ps+(jj+4)*ps, pD+(ipiv[i0+1])/ps*ps*sdd+(ipiv[i0+1])%ps+(jj+4)*ps);
1219 }
1220 ipiv[i0+2] += i0;
1221 if(ipiv[i0+2]!=i0+2)
1222 {
1223 drowsw_lib(jj, pD+(i0+2)/ps*ps*sdd+(i0+2)%ps, pD+(ipiv[i0+2])/ps*ps*sdd+(ipiv[i0+2])%ps);
1224 drowsw_lib(n-jj-4, pD+(i0+2)/ps*ps*sdd+(i0+2)%ps+(jj+4)*ps, pD+(ipiv[i0+2])/ps*ps*sdd+(ipiv[i0+2])%ps+(jj+4)*ps);
1225 }
1226 ipiv[i0+3] += i0;
1227 if(ipiv[i0+3]!=i0+3)
1228 {
1229 drowsw_lib(jj, pD+(i0+3)/ps*ps*sdd+(i0+3)%ps, pD+(ipiv[i0+3])/ps*ps*sdd+(ipiv[i0+3])%ps);
1230 drowsw_lib(n-jj-4, pD+(i0+3)/ps*ps*sdd+(i0+3)%ps+(jj+4)*ps, pD+(ipiv[i0+3])/ps*ps*sdd+(ipiv[i0+3])%ps+(jj+4)*ps);
1231 }
1232 // right block-column
1233 ii = i0;
1234 kernel_dtrsm_nn_ll_one_4x4_vs_lib4(ii, &pD[ii*sdd], &pD[(jj+4)*ps], sdd, &pD[(jj+4)*ps+ii*sdd], &pD[(jj+4)*ps+ii*sdd], &pD[ii*ps+ii*sdd], 4, n-jj-4);
1235 ii += 4;
1236 i0 = ii;
1237 kernel_dgemm_nn_4x4_gen_lib4((jj+4), &dm1, &pD[ii*sdd], 0, &pD[(jj+4)*ps], sdd, &d1, 0, &pD[(jj+4)*ps+ii*sdd], sdd, 0, &pD[(jj+4)*ps+ii*sdd], sdd, 0, m-ii, 0, n-jj-4);
1238 kernel_dgetrf_pivot_4_vs_lib4(m-i0, n-jj-4, &pD[(jj+4)*ps+i0*sdd], sdd, &inv_diag_D[(jj+4)], &ipiv[i0]);
1239 ipiv[i0+0] += i0;
1240 if(ipiv[i0+0]!=i0+0)
1241 {
1242 drowsw_lib(jj+4, pD+(i0+0)/ps*ps*sdd+(i0+0)%ps, pD+(ipiv[i0+0])/ps*ps*sdd+(ipiv[i0+0])%ps);
1243 drowsw_lib(n-jj-8, pD+(i0+0)/ps*ps*sdd+(i0+0)%ps+(jj+8)*ps, pD+(ipiv[i0+0])/ps*ps*sdd+(ipiv[i0+0])%ps+(jj+8)*ps);
1244 }
1245 if(m-jj-4>1)
1246 {
1247 ipiv[i0+1] += i0;
1248 if(ipiv[i0+1]!=i0+1)
1249 {
1250 drowsw_lib(jj+4, pD+(i0+1)/ps*ps*sdd+(i0+1)%ps, pD+(ipiv[i0+1])/ps*ps*sdd+(ipiv[i0+1])%ps);
1251 drowsw_lib(n-jj-8, pD+(i0+1)/ps*ps*sdd+(i0+1)%ps+(jj+8)*ps, pD+(ipiv[i0+1])/ps*ps*sdd+(ipiv[i0+1])%ps+(jj+8)*ps);
1252 }
1253 if(m-jj-4>2)
1254 {
1255 ipiv[i0+2] += i0;
1256 if(ipiv[i0+2]!=i0+2)
1257 {
1258 drowsw_lib(jj+4, pD+(i0+2)/ps*ps*sdd+(i0+2)%ps, pD+(ipiv[i0+2])/ps*ps*sdd+(ipiv[i0+2])%ps);
1259 drowsw_lib(n-jj-8, pD+(i0+2)/ps*ps*sdd+(i0+2)%ps+(jj+8)*ps, pD+(ipiv[i0+2])/ps*ps*sdd+(ipiv[i0+2])%ps+(jj+8)*ps);
1260 }
1261 if(m-jj-4>3)
1262 {
1263 ipiv[i0+3] += i0;
1264 if(ipiv[i0+3]!=i0+3)
1265 {
1266 drowsw_lib(jj+4, pD+(i0+3)/ps*ps*sdd+(i0+3)%ps, pD+(ipiv[i0+3])/ps*ps*sdd+(ipiv[i0+3])%ps);
1267 drowsw_lib(n-jj-8, pD+(i0+3)/ps*ps*sdd+(i0+3)%ps+(jj+8)*ps, pD+(ipiv[i0+3])/ps*ps*sdd+(ipiv[i0+3])%ps+(jj+8)*ps);
1268 }
1269 }
1270 }
1271 }
1272
1273 // solve upper
1274 i0 -= 4;
1275 ll = jj+8;
1276 for( ; ll<n; ll+=4)
1277 {
1278 kernel_dtrsm_nn_ll_one_8x4_vs_lib4(i0, &pD[i0*sdd], sdd, &pD[ll*ps], sdd, &pD[ll*ps+i0*sdd], sdd, &pD[ll*ps+i0*sdd], sdd, &pD[i0*ps+i0*sdd], sdd, m-i0, n-ll);
1279 }
1280 return;
1281#endif
1282
1283
1284 left_n_4:
1285 // 1-4 columns at a time
1286 // pivot & factorize & solve lower
1287 ii = jj;
1288 i0 = ii;
1289#if 0//defined(TARGET_X64_AVX2) || defined(TARGET_X64_AVX)
1290 for( ; ii<m-4; ii+=8)
1291 {
1292 kernel_dgemm_nn_8x4_vs_lib4(m-ii, n-jj, jj, &pD[ii*sdd], sdd, &pD[jj*ps], sdd, -1, &pD[jj*ps+ii*sdd], sdd, &pD[jj*ps+ii*sdd], sdd, 0, 0);
1293 }
1294 if(m-ii>0)
1295 {
1296 kernel_dgemm_nn_4x4_vs_lib4(m-ii, n-jj, jj, &pD[ii*sdd], &pD[jj*ps], sdd, -1, &pD[jj*ps+ii*sdd], &pD[jj*ps+ii*sdd], 0, 0);
1297// ii+=4;
1298 }
1299#else
1300 for( ; ii<m; ii+=4)
1301 {
1302 kernel_dgemm_nn_4x4_gen_lib4(jj, &dm1, &pD[ii*sdd], 0, &pD[jj*ps], sdd, &d1, 0, &pD[jj*ps+ii*sdd], sdd, 0, &pD[jj*ps+ii*sdd], sdd, 0, m-ii, 0, n-jj);
1303 }
1304#endif
1305 kernel_dgetrf_pivot_4_vs_lib4(m-i0, n-jj, &pD[jj*ps+i0*sdd], sdd, &inv_diag_D[jj], &ipiv[i0]);
1306 ipiv[i0+0] += i0;
1307 if(ipiv[i0+0]!=i0+0)
1308 {
1309 drowsw_lib(jj, pD+(i0+0)/ps*ps*sdd+(i0+0)%ps, pD+(ipiv[i0+0])/ps*ps*sdd+(ipiv[i0+0])%ps);
1310 drowsw_lib(n-jj-4, pD+(i0+0)/ps*ps*sdd+(i0+0)%ps+(jj+4)*ps, pD+(ipiv[i0+0])/ps*ps*sdd+(ipiv[i0+0])%ps+(jj+4)*ps);
1311 }
1312 if(n-jj>1)
1313 {
1314 ipiv[i0+1] += i0;
1315 if(ipiv[i0+1]!=i0+1)
1316 {
1317 drowsw_lib(jj, pD+(i0+1)/ps*ps*sdd+(i0+1)%ps, pD+(ipiv[i0+1])/ps*ps*sdd+(ipiv[i0+1])%ps);
1318 drowsw_lib(n-jj-4, pD+(i0+1)/ps*ps*sdd+(i0+1)%ps+(jj+4)*ps, pD+(ipiv[i0+1])/ps*ps*sdd+(ipiv[i0+1])%ps+(jj+4)*ps);
1319 }
1320 if(n-jj>2)
1321 {
1322 ipiv[i0+2] += i0;
1323 if(ipiv[i0+2]!=i0+2)
1324 {
1325 drowsw_lib(jj, pD+(i0+2)/ps*ps*sdd+(i0+2)%ps, pD+(ipiv[i0+2])/ps*ps*sdd+(ipiv[i0+2])%ps);
1326 drowsw_lib(n-jj-4, pD+(i0+2)/ps*ps*sdd+(i0+2)%ps+(jj+4)*ps, pD+(ipiv[i0+2])/ps*ps*sdd+(ipiv[i0+2])%ps+(jj+4)*ps);
1327 }
1328 if(n-jj>3)
1329 {
1330 ipiv[i0+3] += i0;
1331 if(ipiv[i0+3]!=i0+3)
1332 {
1333 drowsw_lib(jj, pD+(i0+3)/ps*ps*sdd+(i0+3)%ps, pD+(ipiv[i0+3])/ps*ps*sdd+(ipiv[i0+3])%ps);
1334 drowsw_lib(n-jj-4, pD+(i0+3)/ps*ps*sdd+(i0+3)%ps+(jj+4)*ps, pD+(ipiv[i0+3])/ps*ps*sdd+(ipiv[i0+3])%ps+(jj+4)*ps);
1335 }
1336 }
1337 }
1338 }
1339
1340 // solve upper
1341 if(0) // there is no upper
1342 {
1343 ll = jj+4;
1344 for( ; ll<n; ll+=4)
1345 {
1346 kernel_dtrsm_nn_ll_one_4x4_vs_lib4(i0, &pD[i0*sdd], &pD[ll*ps], sdd, &pD[ll*ps+i0*sdd], &pD[ll*ps+i0*sdd], &pD[i0*ps+i0*sdd], m-i0, n-ll);
1347 }
1348 }
1349 return;
1350
1351
1352 left_m_4:
1353 // 1-4 rows at a time
1354 // pivot & factorize & solve lower
1355 ii = jj;
1356 i0 = ii;
1357 kernel_dgemm_nn_4x4_gen_lib4(jj, &dm1, &pD[ii*sdd], 0, &pD[jj*ps], sdd, &d1, 0, &pD[jj*ps+ii*sdd], sdd, 0, &pD[jj*ps+ii*sdd], sdd, 0, m-ii, 0, n-jj);
1358 kernel_dgetrf_pivot_4_vs_lib4(m-i0, n-jj, &pD[jj*ps+i0*sdd], sdd, &inv_diag_D[jj], &ipiv[i0]);
1359 ipiv[i0+0] += i0;
1360 if(ipiv[i0+0]!=i0+0)
1361 {
1362 drowsw_lib(jj, pD+(i0+0)/ps*ps*sdd+(i0+0)%ps, pD+(ipiv[i0+0])/ps*ps*sdd+(ipiv[i0+0])%ps);
1363 drowsw_lib(n-jj-4, pD+(i0+0)/ps*ps*sdd+(i0+0)%ps+(jj+4)*ps, pD+(ipiv[i0+0])/ps*ps*sdd+(ipiv[i0+0])%ps+(jj+4)*ps);
1364 }
1365 if(m-i0>1)
1366 {
1367 ipiv[i0+1] += i0;
1368 if(ipiv[i0+1]!=i0+1)
1369 {
1370 drowsw_lib(jj, pD+(i0+1)/ps*ps*sdd+(i0+1)%ps, pD+(ipiv[i0+1])/ps*ps*sdd+(ipiv[i0+1])%ps);
1371 drowsw_lib(n-jj-4, pD+(i0+1)/ps*ps*sdd+(i0+1)%ps+(jj+4)*ps, pD+(ipiv[i0+1])/ps*ps*sdd+(ipiv[i0+1])%ps+(jj+4)*ps);
1372 }
1373 if(m-i0>2)
1374 {
1375 ipiv[i0+2] += i0;
1376 if(ipiv[i0+2]!=i0+2)
1377 {
1378 drowsw_lib(jj, pD+(i0+2)/ps*ps*sdd+(i0+2)%ps, pD+(ipiv[i0+2])/ps*ps*sdd+(ipiv[i0+2])%ps);
1379 drowsw_lib(n-jj-4, pD+(i0+2)/ps*ps*sdd+(i0+2)%ps+(jj+4)*ps, pD+(ipiv[i0+2])/ps*ps*sdd+(ipiv[i0+2])%ps+(jj+4)*ps);
1380 }
1381 if(m-i0>3)
1382 {
1383 ipiv[i0+3] += i0;
1384 if(ipiv[i0+3]!=i0+3)
1385 {
1386 drowsw_lib(jj, pD+(i0+3)/ps*ps*sdd+(i0+3)%ps, pD+(ipiv[i0+3])/ps*ps*sdd+(ipiv[i0+3])%ps);
1387 drowsw_lib(n-jj-4, pD+(i0+3)/ps*ps*sdd+(i0+3)%ps+(jj+4)*ps, pD+(ipiv[i0+3])/ps*ps*sdd+(ipiv[i0+3])%ps+(jj+4)*ps);
1388 }
1389 }
1390 }
1391 }
1392
1393 // solve upper
1394 ll = jj+4;
1395 for( ; ll<n; ll+=4)
1396 {
1397 kernel_dtrsm_nn_ll_one_4x4_vs_lib4(i0, &pD[i0*sdd], &pD[ll*ps], sdd, &pD[ll*ps+i0*sdd], &pD[ll*ps+i0*sdd], &pD[i0*ps+i0*sdd], m-i0, n-ll);
1398 }
1399 return;
1400
1401 }
1402
1403
1404# if 0
1405void dlauum_dpotrf_blk_nt_l_lib(int m, int n, int nv, int *rv, int *cv, double *pA, int sda, double *pB, int sdb, int alg, double *pC, int sdc, double *pD, int sdd, double *inv_diag_D)
1406 {
1407
1408 if(m<=0 || n<=0)
1409 return;
1410
1411 // TODO remove
1412 int k = cv[nv-1];
1413
1414 const int ps = 4;
1415
1416 int i, j, l;
1417 int ii, iii, jj, kii, kiii, kjj, k0, k1;
1418
1419 i = 0;
1420 ii = 0;
1421 iii = 0;
1422
1423#if defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_HASWELL)
1424 for(; i<m-7; i+=8)
1425 {
1426
1427 while(ii<nv && rv[ii]<i+8)
1428 ii++;
1429 if(ii<nv)
1430 kii = cv[ii];
1431 else
1432 kii = cv[ii-1];
1433
1434 j = 0;
1435 jj = 0;
1436 for(; j<i && j<n-3; j+=4)
1437 {
1438
1439 while(jj<nv && rv[jj]<j+4)
1440 jj++;
1441 if(jj<nv)
1442 kjj = cv[jj];
1443 else
1444 kjj = cv[jj-1];
1445 k0 = kii<kjj ? kii : kjj;
1446
1447 kernel_dgemm_dtrsm_nt_rl_inv_8x4_lib4(k0, &pA[i*sda], sda, &pB[j*sdb], j, &pD[i*sdd], sdd, &pD[j*sdd], alg, &pC[j*ps+i*sdc], sdc, &pD[j*ps+i*sdd], sdd, &pD[j*ps+j*sdd], &inv_diag_D[j]);
1448 }
1449 if(j<n)
1450 {
1451
1452 while(jj<nv && rv[jj]<j+4)
1453 jj++;
1454 if(jj<nv)
1455 kjj = cv[jj];
1456 else
1457 kjj = cv[jj-1];
1458 k0 = kii<kjj ? kii : kjj;
1459
1460 if(j<i) // dgemm
1461 {
1462 kernel_dgemm_dtrsm_nt_rl_inv_8x4_vs_lib4(k0, &pA[i*sda], sda, &pB[j*sdb], j, &pD[i*sdd], sdd, &pD[j*sdd], alg, &pC[j*ps+i*sdc], sdc, &pD[j*ps+i*sdd], sdd, &pD[j*ps+j*sdd], &inv_diag_D[j], 8, n-j);
1463 }
1464 else // dsyrk
1465 {
1466 kernel_dsyrk_dpotrf_nt_l_8x4_vs_lib4(k0, &pA[i*sda], sda, &pB[j*sdb], j, &pD[i*sdd], sdd, &pD[j*sdd], alg, &pC[j*ps+j*sdc], sdc, &pD[j*ps+j*sdd], sdd, &inv_diag_D[j], 8, n-j);
1467 if(j<n-4)
1468 {
1469 kernel_dsyrk_dpotrf_nt_l_4x4_vs_lib4(k, &pA[(i+4)*sda], &pB[(j+4)*sdb], j+4, &pD[(i+4)*sdd], &pD[(j+4)*sdd], alg, &pC[(j+4)*ps+(j+4)*sdc], &pD[(j+4)*ps+(j+4)*sdd], &inv_diag_D[j+4], 4, n-j-4); // TODO
1470 }
1471 }
1472 }
1473 }
1474 if(m>i)
1475 {
1476 if(m-i<=4)
1477 {
1478 goto left_4;
1479 }
1480 else
1481 {
1482 goto left_8;
1483 }
1484 }
1485#else
1486 for(; i<m-3; i+=4)
1487 {
1488
1489 while(ii<nv && rv[ii]<i+4)
1490 ii++;
1491 if(ii<nv)
1492 kii = cv[ii];
1493 else
1494 kii = cv[ii-1];
1495
1496 j = 0;
1497 jj = 0;
1498 for(; j<i && j<n-3; j+=4)
1499 {
1500
1501 while(jj<nv && rv[jj]<j+4)
1502 jj++;
1503 if(jj<nv)
1504 kjj = cv[jj];
1505 else
1506 kjj = cv[jj-1];
1507 k0 = kii<kjj ? kii : kjj;
1508
1509 kernel_dgemm_dtrsm_nt_rl_inv_4x4_lib4(k0, &pA[i*sda], &pB[j*sdb], j, &pD[i*sdd], &pD[j*sdd], alg, &pC[j*ps+i*sdc], &pD[j*ps+i*sdd], &pD[j*ps+j*sdd], &inv_diag_D[j]);
1510 }
1511 if(j<n)
1512 {
1513
1514 while(jj<nv && rv[jj]<j+4)
1515 jj++;
1516 if(jj<nv)
1517 kjj = cv[jj];
1518 else
1519 kjj = cv[jj-1];
1520 k0 = kii<kjj ? kii : kjj;
1521
1522 if(i<j) // dgemm
1523 {
1524 kernel_dgemm_dtrsm_nt_rl_inv_4x4_vs_lib4(k0, &pA[i*sda], &pB[j*sdb], j, &pD[i*sdd], &pD[j*sdd], alg, &pC[j*ps+i*sdc], &pD[j*ps+i*sdd], &pD[j*ps+j*sdd], &inv_diag_D[j], 4, n-j);
1525 }
1526 else // dsyrk
1527 {
1528 kernel_dsyrk_dpotrf_nt_l_4x4_vs_lib4(k0, &pA[i*sda], &pB[j*sdb], j, &pD[i*sdd], &pD[j*sdd], alg, &pC[j*ps+j*sdc], &pD[j*ps+j*sdd], &inv_diag_D[j], 4, n-j);
1529 }
1530 }
1531 }
1532 if(m>i)
1533 {
1534 goto left_4;
1535 }
1536#endif
1537
1538 // common return if i==m
1539 return;
1540
1541 // clean up loops definitions
1542
1543#if defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_HASWELL)
1544 left_8:
1545
1546 kii = cv[nv-1];
1547
1548 j = 0;
1549 jj = 0;
1550 for(; j<i && j<n-3; j+=4)
1551 {
1552
1553 while(jj<nv && rv[jj]<j+4)
1554 jj++;
1555 if(jj<nv)
1556 kjj = cv[jj];
1557 else
1558 kjj = cv[jj-1];
1559 k0 = kii<kjj ? kii : kjj;
1560
1561 kernel_dgemm_dtrsm_nt_rl_inv_8x4_vs_lib4(k0, &pA[i*sda], sda, &pB[j*sdb], j, &pD[i*sdd], sdd, &pD[j*sdd], alg, &pC[j*ps+i*sdc], sdc, &pD[j*ps+i*sdd], sdd, &pD[j*ps+j*sdd], &inv_diag_D[j], m-i, n-j);
1562 }
1563 if(j<n)
1564 {
1565
1566 while(jj<nv && rv[jj]<j+4)
1567 jj++;
1568 if(jj<nv)
1569 kjj = cv[jj];
1570 else
1571 kjj = cv[jj-1];
1572 k0 = kii<kjj ? kii : kjj;
1573
1574 if(j<i) // dgemm
1575 {
1576 kernel_dgemm_dtrsm_nt_rl_inv_8x4_vs_lib4(k0, &pA[i*sda], sda, &pB[j*sdb], j, &pD[i*sdd], sdd, &pD[j*sdd], alg, &pC[j*ps+i*sdc], sdc, &pD[j*ps+i*sdd], sdd, &pD[j*ps+j*sdd], &inv_diag_D[j], m-i, n-j);
1577 }
1578 else // dsyrk
1579 {
1580 kernel_dsyrk_dpotrf_nt_l_8x4_vs_lib4(k0, &pA[i*sda], sda, &pB[j*sdb], j, &pD[i*sdd], sdd, &pD[j*sdd], alg, &pC[j*ps+j*sdc], sdc, &pD[j*ps+j*sdd], sdd, &inv_diag_D[j], m-i, n-j);
1581 if(j<n-4)
1582 {
1583 kernel_dsyrk_dpotrf_nt_l_4x4_vs_lib4(k, &pA[(i+4)*sda], &pB[(j+4)*sdb], j+4, &pD[(i+4)*sdd], &pD[(j+4)*sdd], alg, &pC[(j+4)*ps+(j+4)*sdc], &pD[(j+4)*ps+(j+4)*sdd], &inv_diag_D[j+4], m-i-4, n-j-4); // TODO
1584 }
1585 }
1586 }
1587 return;
1588#endif
1589
1590 left_4:
1591
1592 kii = cv[nv-1];
1593
1594 j = 0;
1595 jj = 0;
1596 for(; j<i && j<n-3; j+=4)
1597 {
1598
1599 while(jj<nv && rv[jj]<j+4)
1600 jj++;
1601 if(jj<nv)
1602 kjj = cv[jj];
1603 else
1604 kjj = cv[jj-1];
1605 k0 = kii<kjj ? kii : kjj;
1606
1607 kernel_dgemm_dtrsm_nt_rl_inv_4x4_vs_lib4(k0, &pA[i*sda], &pB[j*sdb], j, &pD[i*sdd], &pD[j*sdd], alg, &pC[j*ps+i*sdc], &pD[j*ps+i*sdd], &pD[j*ps+j*sdd], &inv_diag_D[j], m-i, n-j);
1608 }
1609 if(j<n)
1610 {
1611
1612 while(jj<nv && rv[jj]<j+4)
1613 jj++;
1614 if(jj<nv)
1615 kjj = cv[jj];
1616 else
1617 kjj = cv[jj-1];
1618 k0 = kii<kjj ? kii : kjj;
1619
1620 if(j<i) // dgemm
1621 {
1622 kernel_dgemm_dtrsm_nt_rl_inv_4x4_vs_lib4(k0, &pA[i*sda], &pB[j*sdb], j, &pD[i*sdd], &pD[j*sdd], alg, &pC[j*ps+i*sdc], &pD[j*ps+i*sdd], &pD[j*ps+j*sdd], &inv_diag_D[j], m-i, n-j);
1623 }
1624 else // dsyrk
1625 {
1626 kernel_dsyrk_dpotrf_nt_l_4x4_vs_lib4(k0, &pA[i*sda], &pB[j*sdb], j, &pD[i*sdd], &pD[j*sdd], alg, &pC[j*ps+j*sdc], &pD[j*ps+j*sdd], &inv_diag_D[j], m-i, n-j);
1627 }
1628 }
1629 return;
1630
1631 }
1632#endif
1633
1634
1635
1636
1637/****************************
1638* new interface
1639****************************/
1640
1641
1642
1643#if defined(LA_HIGH_PERFORMANCE)
1644
1645
1646
1647// dpotrf
1648void dpotrf_l_libstr(int m, struct d_strmat *sC, int ci, int cj, struct d_strmat *sD, int di, int dj)
1649 {
1650
1651 if(m<=0)
1652 return;
1653
1654 if(ci!=0 | di!=0)
1655 {
1656 printf("\ndpotrf_l_libstr: feature not implemented yet: ci=%d, di=%d\n", ci, di);
1657 exit(1);
1658 }
1659
1660 const int ps = 4;
1661
1662 int sdc = sC->cn;
1663 int sdd = sD->cn;
1664 double *pC = sC->pA + cj*ps;
1665 double *pD = sD->pA + dj*ps;
1666 double *dD = sD->dA;
1667
1668 if(di==0 & dj==0) // XXX what to do if di and dj are not zero
1669 sD->use_dA = 1;
1670 else
1671 sD->use_dA = 0;
1672
1673 int i, j, l;
1674
1675 i = 0;
1676#if defined(TARGET_X64_INTEL_HASWELL)
1677 for(; i<m-11; i+=12)
1678 {
1679 j = 0;
1680 for(; j<i; j+=4)
1681 {
1682 kernel_dtrsm_nt_rl_inv_12x4_lib4(j, &pD[i*sdd], sdd, &pD[j*sdd], &pC[j*ps+i*sdc], sdc, &pD[j*ps+i*sdd], sdd, &pD[j*ps+j*sdd], &dD[j]);
1683 }
1684 kernel_dpotrf_nt_l_12x4_lib4(j, &pD[i*sdd], sdd, &pD[j*sdd], &pC[j*ps+j*sdc], sdc, &pD[j*ps+j*sdd], sdd, &dD[j]);
1685 kernel_dpotrf_nt_l_8x8_lib4(j+4, &pD[(i+4)*sdd], sdd, &pD[(j+4)*sdd], sdd, &pC[(j+4)*ps+(i+4)*sdc], sdc, &pD[(j+4)*ps+(i+4)*sdd], sdd, &dD[j+4]);
1686 }
1687 if(m>i)
1688 {
1689 if(m-i<=4)
1690 {
1691 goto left_4;
1692 }
1693 else if(m-i<=8)
1694 {
1695 goto left_8;
1696 }
1697 else
1698 {
1699 goto left_12;
1700 }
1701 }
1702#elif defined(TARGET_X64_INTEL_SANDY_BRIDGE)
1703 for(; i<m-7; i+=8)
1704 {
1705 j = 0;
1706 for(; j<i; j+=4)
1707 {
1708 kernel_dtrsm_nt_rl_inv_8x4_lib4(j, &pD[i*sdd], sdd, &pD[j*sdd], &pC[j*ps+i*sdc], sdc, &pD[j*ps+i*sdd], sdd, &pD[j*ps+j*sdd], &dD[j]);
1709 }
1710 kernel_dpotrf_nt_l_8x4_lib4(j, &pD[i*sdd], sdd, &pD[j*sdd], &pC[j*ps+j*sdc], sdc, &pD[j*ps+j*sdd], sdd, &dD[j]);
1711 kernel_dpotrf_nt_l_4x4_lib4(j+4, &pD[(i+4)*sdd], &pD[(j+4)*sdd], &pC[(j+4)*ps+(i+4)*sdc], &pD[(j+4)*ps+(i+4)*sdd], &dD[j+4]);
1712 }
1713 if(m>i)
1714 {
1715 if(m-i<=4)
1716 {
1717 goto left_4;
1718 }
1719 else
1720 {
1721 goto left_8;
1722 }
1723 }
1724#else
1725 for(; i<m-3; i+=4)
1726 {
1727 j = 0;
1728 for(; j<i; j+=4)
1729 {
1730 kernel_dtrsm_nt_rl_inv_4x4_lib4(j, &pD[i*sdd], &pD[j*sdd], &pC[j*ps+i*sdc], &pD[j*ps+i*sdd], &pD[j*ps+j*sdd], &dD[j]);
1731 }
1732 kernel_dpotrf_nt_l_4x4_lib4(j, &pD[i*sdd], &pD[j*sdd], &pC[j*ps+j*sdc], &pD[j*ps+j*sdd], &dD[j]);
1733 }
1734 if(m>i)
1735 {
1736 goto left_4;
1737 }
1738#endif
1739
1740 // common return if i==m
1741 return;
1742
1743 // clean up loops definitions
1744
1745#if defined(TARGET_X64_INTEL_HASWELL)
1746 left_12: // 9 - 12
1747 j = 0;
1748 for(; j<i; j+=4)
1749 {
1750 kernel_dtrsm_nt_rl_inv_12x4_vs_lib4(j, &pD[i*sdd], sdd, &pD[j*sdd], &pC[j*ps+i*sdc], sdc, &pD[j*ps+i*sdd], sdd, &pD[j*ps+j*sdd], &dD[j], m-i, m-j);
1751 }
1752 kernel_dpotrf_nt_l_12x4_vs_lib4(j, &pD[i*sdd], sdd, &pD[j*sdd], &pC[j*ps+j*sdc], sdc, &pD[j*ps+j*sdd], sdd, &dD[j], m-i, m-j);
1753 kernel_dpotrf_nt_l_8x8_vs_lib4(j+4, &pD[(i+4)*sdd], sdd, &pD[(j+4)*sdd], sdd, &pC[(j+4)*ps+(i+4)*sdc], sdc, &pD[(j+4)*ps+(i+4)*sdd], sdd, &dD[j+4], m-i-4, m-j-4);
1754 return;
1755#endif
1756
1757#if defined(TARGET_X64_INTEL_HASWELL)
1758 left_8:
1759 j = 0;
1760 for(; j<i-8; j+=12)
1761 {
1762 kernel_dtrsm_nt_rl_inv_8x8l_vs_lib4(j, &pD[i*sdd], sdd, &pD[j*sdd], sdd, &pC[j*ps+i*sdc], sdc, &pD[j*ps+i*sdd], sdd, &pD[j*ps+j*sdd], sdd, &dD[j], m-i, m-j);
1763 kernel_dtrsm_nt_rl_inv_8x8u_vs_lib4((j+4), &pD[i*sdd], sdd, &pD[(j+4)*sdd], sdd, &pC[(j+4)*ps+i*sdc], sdc, &pD[(j+4)*ps+i*sdd], sdd, &pD[(j+4)*ps+(j+4)*sdd], sdd, &dD[(j+4)], m-i, m-(j+4));
1764 }
1765 if(j<i-4)
1766 {
1767 kernel_dtrsm_nt_rl_inv_8x8l_vs_lib4(j, &pD[i*sdd], sdd, &pD[j*sdd], sdd, &pC[j*ps+i*sdc], sdc, &pD[j*ps+i*sdd], sdd, &pD[j*ps+j*sdd], sdd, &dD[j], m-i, m-j);
1768 kernel_dtrsm_nt_rl_inv_4x4_vs_lib4((j+4), &pD[i*sdd], &pD[(j+4)*sdd], &pC[(j+4)*ps+i*sdc], &pD[(j+4)*ps+i*sdd], &pD[(j+4)*ps+(j+4)*sdd], &dD[(j+4)], m-i, m-(j+4));
1769 j += 8;
1770 }
1771 else if(j<i)
1772 {
1773 kernel_dtrsm_nt_rl_inv_8x4_vs_lib4(j, &pD[i*sdd], sdd, &pD[j*sdd], &pC[j*ps+i*sdc], sdc, &pD[j*ps+i*sdd], sdd, &pD[j*ps+j*sdd], &dD[j], m-i, m-j);
1774 j += 4;
1775 }
1776 kernel_dpotrf_nt_l_8x8_vs_lib4(j, &pD[i*sdd], sdd, &pD[j*sdd], sdd, &pC[j*ps+j*sdc], sdc, &pD[j*ps+j*sdd], sdd, &dD[j], m-i, m-j);
1777 return;
1778#endif
1779#if defined(TARGET_X64_INTEL_SANDY_BRIDGE)
1780 left_8:
1781 j = 0;
1782 for(; j<i; j+=4)
1783 {
1784 kernel_dtrsm_nt_rl_inv_8x4_vs_lib4(j, &pD[i*sdd], sdd, &pD[j*sdd], &pC[j*ps+i*sdc], sdc, &pD[j*ps+i*sdd], sdd, &pD[j*ps+j*sdd], &dD[j], m-i, m-j);
1785 }
1786 kernel_dpotrf_nt_l_8x4_vs_lib4(j, &pD[i*sdd], sdd, &pD[j*sdd], &pC[j*ps+j*sdc], sdc, &pD[j*ps+j*sdd], sdd, &dD[j], m-i, m-j);
1787 kernel_dpotrf_nt_l_4x4_vs_lib4(j+4, &pD[(i+4)*sdd], &pD[(j+4)*sdd], &pC[(j+4)*ps+(i+4)*sdc], &pD[(j+4)*ps+(i+4)*sdd], &dD[j+4], m-i-4, m-j-4);
1788 return;
1789#endif
1790
1791#if defined(TARGET_X64_INTEL_HASWELL)
1792 left_4:
1793 j = 0;
1794 for(; j<i-8; j+=12)
1795 {
1796 kernel_dtrsm_nt_rl_inv_4x12_vs_lib4(j, &pD[i*sdd], &pD[j*sdd], sdd, &pC[j*ps+i*sdc], &pD[j*ps+i*sdd], &pD[j*ps+j*sdd], sdd, &dD[j], m-i, m-j);
1797 }
1798 if(j<i-4)
1799 {
1800 kernel_dtrsm_nt_rl_inv_4x8_vs_lib4(j, &pD[i*sdd], &pD[j*sdd], sdd, &pC[j*ps+i*sdc], &pD[j*ps+i*sdd], &pD[j*ps+j*sdd], sdd, &dD[j], m-i, m-j);
1801 j += 8;
1802 }
1803 else if(j<i)
1804 {
1805 kernel_dtrsm_nt_rl_inv_4x4_vs_lib4(j, &pD[i*sdd], &pD[j*sdd], &pC[j*ps+i*sdc], &pD[j*ps+i*sdd], &pD[j*ps+j*sdd], &dD[j], m-i, m-j);
1806 j += 4;
1807 }
1808 kernel_dpotrf_nt_l_4x4_vs_lib4(j, &pD[i*sdd], &pD[j*sdd], &pC[j*ps+j*sdc], &pD[j*ps+j*sdd], &dD[j], m-i, m-j);
1809 return;
1810#else
1811 left_4:
1812 j = 0;
1813 for(; j<i; j+=4)
1814 {
1815 kernel_dtrsm_nt_rl_inv_4x4_vs_lib4(j, &pD[i*sdd], &pD[j*sdd], &pC[j*ps+i*sdc], &pD[j*ps+i*sdd], &pD[j*ps+j*sdd], &dD[j], m-i, m-j);
1816 }
1817 kernel_dpotrf_nt_l_4x4_vs_lib4(j, &pD[i*sdd], &pD[j*sdd], &pC[j*ps+j*sdc], &pD[j*ps+j*sdd], &dD[j], m-i, m-j);
1818 return;
1819#endif
1820
1821 }
1822
1823
1824
1825// dpotrf
1826void dpotrf_l_mn_libstr(int m, int n, struct d_strmat *sC, int ci, int cj, struct d_strmat *sD, int di, int dj)
1827 {
1828
1829 if(m<=0 || n<=0)
1830 return;
1831
1832 if(ci!=0 | di!=0)
1833 {
1834 printf("\ndpotrf_l_mn_libstr: feature not implemented yet: ci=%d, di=%d\n", ci, di);
1835 exit(1);
1836 }
1837
1838 const int ps = 4;
1839
1840 int sdc = sC->cn;
1841 int sdd = sD->cn;
1842 double *pC = sC->pA + cj*ps;
1843 double *pD = sD->pA + dj*ps;
1844 double *dD = sD->dA;
1845
1846 if(di==0 & dj==0) // XXX what to do if di and dj are not zero
1847 sD->use_dA = 1;
1848 else
1849 sD->use_dA = 0;
1850
1851 int i, j, l;
1852
1853 i = 0;
1854#if defined(TARGET_X64_INTEL_HASWELL)
1855 for(; i<m-11; i+=12)
1856 {
1857 j = 0;
1858 for(; j<i & j<n-3; j+=4)
1859 {
1860 kernel_dtrsm_nt_rl_inv_12x4_lib4(j, &pD[i*sdd], sdd, &pD[j*sdd], &pC[j*ps+i*sdc], sdc, &pD[j*ps+i*sdd], sdd, &pD[j*ps+j*sdd], &dD[j]);
1861 }
1862 if(j<n)
1863 {
1864 if(j<i) // dtrsm
1865 {
1866 kernel_dtrsm_nt_rl_inv_12x4_vs_lib4(j, &pD[i*sdd], sdd, &pD[j*sdd], &pC[j*ps+i*sdc], sdc, &pD[j*ps+i*sdd], sdd, &pD[j*ps+j*sdd], &dD[j], m-i, n-j);
1867 }
1868 else // dpptrf
1869 {
1870 if(n<j-11)
1871 {
1872 kernel_dpotrf_nt_l_12x4_lib4(j, &pD[i*sdd], sdd, &pD[j*sdd], &pC[j*ps+j*sdc], sdc, &pD[j*ps+j*sdd], sdd, &dD[j]);
1873 kernel_dpotrf_nt_l_8x8_lib4(j+4, &pD[(i+4)*sdd], sdd, &pD[(j+4)*sdd], sdd, &pC[(j+4)*ps+(i+4)*sdc], sdc, &pD[(j+4)*ps+(i+4)*sdd], sdd, &dD[j+4]);
1874 }
1875 else
1876 {
1877 kernel_dpotrf_nt_l_12x4_vs_lib4(j, &pD[i*sdd], sdd, &pD[j*sdd], &pC[j*ps+j*sdc], sdc, &pD[j*ps+j*sdd], sdd, &dD[j], m-i, n-j);
1878 if(j<n-4)
1879 {
1880 kernel_dpotrf_nt_l_8x4_vs_lib4(j+4, &pD[(i+4)*sdd], sdd, &pD[(j+4)*sdd], &pC[(j+4)*ps+(i+4)*sdc], sdc, &pD[(j+4)*ps+(i+4)*sdd], sdd, &dD[j+4], m-i-4, n-j-4);
1881 if(j<n-8)
1882 {
1883 kernel_dpotrf_nt_l_4x4_vs_lib4(j+8, &pD[(i+8)*sdd], &pD[(j+8)*sdd], &pC[(j+8)*ps+(i+8)*sdc], &pD[(j+8)*ps+(i+8)*sdd], &dD[j+8], m-i-8, n-j-8);
1884 }
1885 }
1886 }
1887 }
1888 }
1889 }
1890 if(m>i)
1891 {
1892 if(m-i<=4)
1893 {
1894 goto left_4;
1895 }
1896 else if(m-i<=8)
1897 {
1898 goto left_8;
1899 }
1900 else
1901 {
1902 goto left_12;
1903 }
1904 }
1905#elif defined(TARGET_X64_INTEL_SANDY_BRIDGE)
1906 for(; i<m-7; i+=8)
1907 {
1908 j = 0;
1909 for(; j<i & j<n-3; j+=4)
1910 {
1911 kernel_dtrsm_nt_rl_inv_8x4_lib4(j, &pD[i*sdd], sdd, &pD[j*sdd], &pC[j*ps+i*sdc], sdc, &pD[j*ps+i*sdd], sdd, &pD[j*ps+j*sdd], &dD[j]);
1912 }
1913 if(j<n)
1914 {
1915 if(j<i) // dtrsm
1916 {
1917 kernel_dtrsm_nt_rl_inv_8x4_vs_lib4(j, &pD[i*sdd], sdd, &pD[j*sdd], &pC[j*ps+i*sdc], sdc, &pD[j*ps+i*sdd], sdd, &pD[j*ps+j*sdd], &dD[j], m-i, n-j);
1918 }
1919 else // dpotrf
1920 {
1921 if(j<n-7)
1922// if(0)
1923 {
1924 kernel_dpotrf_nt_l_8x4_lib4(j, &pD[i*sdd], sdd, &pD[j*sdd], &pC[j*ps+j*sdc], sdc, &pD[j*ps+j*sdd], sdd, &dD[j]);
1925 kernel_dpotrf_nt_l_4x4_lib4(j+4, &pD[(i+4)*sdd], &pD[(j+4)*sdd], &pC[(j+4)*ps+(i+4)*sdc], &pD[(j+4)*ps+(i+4)*sdd], &dD[j+4]);
1926 }
1927 else
1928 {
1929 kernel_dpotrf_nt_l_8x4_vs_lib4(j, &pD[i*sdd], sdd, &pD[j*sdd], &pC[j*ps+j*sdc], sdc, &pD[j*ps+j*sdd], sdd, &dD[j], m-i, n-j);
1930 if(j<n-4)
1931 {
1932 kernel_dpotrf_nt_l_4x4_vs_lib4(j+4, &pD[(i+4)*sdd], &pD[(j+4)*sdd], &pC[(j+4)*ps+(i+4)*sdc], &pD[(j+4)*ps+(i+4)*sdd], &dD[j+4], m-i-4, n-j-4);
1933 }
1934 }
1935 }
1936 }
1937 }
1938 if(m>i)
1939 {
1940 if(m-i<=4)
1941 {
1942 goto left_4;
1943 }
1944 else
1945 {
1946 goto left_8;
1947 }
1948 }
1949#else
1950 for(; i<m-3; i+=4)
1951 {
1952 j = 0;
1953 for(; j<i & j<n-3; j+=4)
1954 {
1955 kernel_dtrsm_nt_rl_inv_4x4_lib4(j, &pD[i*sdd], &pD[j*sdd], &pC[j*ps+i*sdc], &pD[j*ps+i*sdd], &pD[j*ps+j*sdd], &dD[j]);
1956 }
1957 if(j<n)
1958 {
1959 if(i<j) // dtrsm
1960 {
1961 kernel_dtrsm_nt_rl_inv_4x4_vs_lib4(j, &pD[i*sdd], &pD[j*sdd], &pC[j*ps+i*sdc], &pD[j*ps+i*sdd], &pD[j*ps+j*sdd], &dD[j], m-i, n-j);
1962 }
1963 else // dpotrf
1964 {
1965 if(j<n-3)
1966 {
1967 kernel_dpotrf_nt_l_4x4_lib4(j, &pD[i*sdd], &pD[j*sdd], &pC[j*ps+j*sdc], &pD[j*ps+j*sdd], &dD[j]);
1968 }
1969 else
1970 {
1971 kernel_dpotrf_nt_l_4x4_vs_lib4(j, &pD[i*sdd], &pD[j*sdd], &pC[j*ps+j*sdc], &pD[j*ps+j*sdd], &dD[j], m-i, n-j);
1972 }
1973 }
1974 }
1975 }
1976 if(m>i)
1977 {
1978 goto left_4;
1979 }
1980#endif
1981
1982 // common return if i==m
1983 return;
1984
1985 // clean up loops definitions
1986
1987#if defined(TARGET_X64_INTEL_HASWELL)
1988 left_12:
1989 j = 0;
1990 for(; j<i & j<n; j+=4)
1991 {
1992 kernel_dtrsm_nt_rl_inv_12x4_vs_lib4(j, &pD[i*sdd], sdd, &pD[j*sdd], &pC[j*ps+i*sdc], sdc, &pD[j*ps+i*sdd], sdd, &pD[j*ps+j*sdd], &dD[j], m-i, n-j);
1993 }
1994 if(j<n)
1995 {
1996 kernel_dpotrf_nt_l_12x4_vs_lib4(j, &pD[i*sdd], sdd, &pD[j*sdd], &pC[j*ps+j*sdc], sdc, &pD[j*ps+j*sdd], sdd, &dD[j], m-i, n-j);
1997 if(j<n-4)
1998 {
1999 kernel_dpotrf_nt_l_8x4_vs_lib4(j+4, &pD[(i+4)*sdd], sdd, &pD[(j+4)*sdd], &pC[(j+4)*ps+(i+4)*sdc], sdc, &pD[(j+4)*ps+(i+4)*sdd], sdd, &dD[j+4], m-i-4, n-j-4);
2000 if(j<n-8)
2001 {
2002 kernel_dpotrf_nt_l_4x4_vs_lib4(j+8, &pD[(i+8)*sdd], &pD[(j+8)*sdd], &pC[(j+8)*ps+(i+8)*sdc], &pD[(j+8)*ps+(i+8)*sdd], &dD[j+8], m-i-8, n-j-8);
2003 }
2004 }
2005 }
2006 return;
2007#endif
2008
2009#if defined(TARGET_X64_INTEL_HASWELL)
2010 left_8:
2011 j = 0;
2012 for(; j<i-8 & j<n-8; j+=12)
2013 {
2014 kernel_dtrsm_nt_rl_inv_8x8l_vs_lib4(j, &pD[i*sdd], sdd, &pD[j*sdd], sdd, &pC[j*ps+i*sdc], sdc, &pD[j*ps+i*sdd], sdd, &pD[j*ps+j*sdd], sdd, &dD[j], m-i, n-j);
2015 kernel_dtrsm_nt_rl_inv_8x8u_vs_lib4((j+4), &pD[i*sdd], sdd, &pD[(j+4)*sdd], sdd, &pC[(j+4)*ps+i*sdc], sdc, &pD[(j+4)*ps+i*sdd], sdd, &pD[(j+4)*ps+(j+4)*sdd], sdd, &dD[(j+4)], m-i, n-(j+4));
2016 }
2017 if(j<i-4 & j<n-4)
2018 {
2019 kernel_dtrsm_nt_rl_inv_8x8l_vs_lib4(j, &pD[i*sdd], sdd, &pD[j*sdd], sdd, &pC[j*ps+i*sdc], sdc, &pD[j*ps+i*sdd], sdd, &pD[j*ps+j*sdd], sdd, &dD[j], m-i, n-j);
2020 kernel_dtrsm_nt_rl_inv_4x4_vs_lib4((j+4), &pD[i*sdd], &pD[(j+4)*sdd], &pC[(j+4)*ps+i*sdc], &pD[(j+4)*ps+i*sdd], &pD[(j+4)*ps+(j+4)*sdd], &dD[(j+4)], m-i, n-(j+4));
2021 j += 8;
2022 }
2023 else if(j<i & j<n)
2024 {
2025 kernel_dtrsm_nt_rl_inv_8x4_vs_lib4(j, &pD[i*sdd], sdd, &pD[j*sdd], &pC[j*ps+i*sdc], sdc, &pD[j*ps+i*sdd], sdd, &pD[j*ps+j*sdd], &dD[j], m-i, n-j);
2026 j += 4;
2027 }
2028 if(j<n)
2029 {
2030 kernel_dpotrf_nt_l_8x4_vs_lib4(j, &pD[i*sdd], sdd, &pD[j*sdd], &pC[j*ps+j*sdc], sdc, &pD[j*ps+j*sdd], sdd, &dD[j], m-i, n-j);
2031 if(j<n-4)
2032 {
2033 kernel_dpotrf_nt_l_4x4_vs_lib4(j+4, &pD[(i+4)*sdd], &pD[(j+4)*sdd], &pC[(j+4)*ps+(i+4)*sdc], &pD[(j+4)*ps+(i+4)*sdd], &dD[j+4], m-i-4, n-j-4);
2034 }
2035 }
2036 return;
2037#elif defined(TARGET_X64_INTEL_SANDY_BRIDGE)
2038 left_8:
2039 j = 0;
2040 for(; j<i & j<n; j+=4)
2041 {
2042 kernel_dtrsm_nt_rl_inv_8x4_vs_lib4(j, &pD[i*sdd], sdd, &pD[j*sdd], &pC[j*ps+i*sdc], sdc, &pD[j*ps+i*sdd], sdd, &pD[j*ps+j*sdd], &dD[j], m-i, n-j);
2043 }
2044 if(j<n)
2045 {
2046 kernel_dpotrf_nt_l_8x4_vs_lib4(j, &pD[i*sdd], sdd, &pD[j*sdd], &pC[j*ps+j*sdc], sdc, &pD[j*ps+j*sdd], sdd, &dD[j], m-i, n-j);
2047 if(j<n-4)
2048 {
2049 kernel_dpotrf_nt_l_4x4_vs_lib4(j+4, &pD[(i+4)*sdd], &pD[(j+4)*sdd], &pC[(j+4)*ps+(i+4)*sdc], &pD[(j+4)*ps+(i+4)*sdd], &dD[j+4], m-i-4, n-j-4);
2050 }
2051 }
2052 return;
2053#endif
2054
2055#if defined(TARGET_X64_INTEL_HASWELL)
2056 left_4:
2057 j = 0;
2058 for(; j<i-8 & j<n-8; j+=12)
2059 {
2060 kernel_dtrsm_nt_rl_inv_4x12_vs_lib4(j, &pD[i*sdd], &pD[j*sdd], sdd, &pC[j*ps+i*sdc], &pD[j*ps+i*sdd], &pD[j*ps+j*sdd], sdd, &dD[j], m-i, n-j);
2061 }
2062 if(j<i-4 & j<n-4)
2063 {
2064 kernel_dtrsm_nt_rl_inv_4x8_vs_lib4(j, &pD[i*sdd], &pD[j*sdd], sdd, &pC[j*ps+i*sdc], &pD[j*ps+i*sdd], &pD[j*ps+j*sdd], sdd, &dD[j], m-i, n-j);
2065 j += 8;
2066 }
2067 else if(j<i & j<n)
2068 {
2069 kernel_dtrsm_nt_rl_inv_4x4_vs_lib4(j, &pD[i*sdd], &pD[j*sdd], &pC[j*ps+i*sdc], &pD[j*ps+i*sdd], &pD[j*ps+j*sdd], &dD[j], m-i, n-j);
2070 j += 4;
2071 }
2072 if(j<n)
2073 {
2074 kernel_dpotrf_nt_l_4x4_vs_lib4(j, &pD[i*sdd], &pD[j*sdd], &pC[j*ps+j*sdc], &pD[j*ps+j*sdd], &dD[j], m-i, n-j);
2075 }
2076 return;
2077#else
2078 left_4:
2079 j = 0;
2080 for(; j<i & j<n; j+=4)
2081 {
2082 kernel_dtrsm_nt_rl_inv_4x4_vs_lib4(j, &pD[i*sdd], &pD[j*sdd], &pC[j*ps+i*sdc], &pD[j*ps+i*sdd], &pD[j*ps+j*sdd], &dD[j], m-i, n-j);
2083 }
2084 if(j<n)
2085 {
2086 kernel_dpotrf_nt_l_4x4_vs_lib4(j, &pD[i*sdd], &pD[j*sdd], &pC[j*ps+j*sdc], &pD[j*ps+j*sdd], &dD[j], m-i, n-j);
2087 }
2088 return;
2089#endif
2090
2091 }
2092
2093
2094
2095// dsyrk dpotrf
2096void dsyrk_dpotrf_ln_libstr(int m, int n, int k, struct d_strmat *sA, int ai, int aj, struct d_strmat *sB, int bi, int bj, struct d_strmat *sC, int ci, int cj, struct d_strmat *sD, int di, int dj)
2097 {
2098
2099 if(m<=0 || n<=0)
2100 return;
2101
2102 if(ai!=0 | bi!=0 | ci!=0 | di!=0)
2103 {
2104 printf("\ndsyrk_dpotrf_ln_libstr: feature not implemented yet: ai=%d, bi=%d, ci=%d, di=%d\n", ai, bi, ci, di);
2105 exit(1);
2106 }
2107
2108 const int ps = 4;
2109
2110 int sda = sA->cn;
2111 int sdb = sB->cn;
2112 int sdc = sC->cn;
2113 int sdd = sD->cn;
2114 double *pA = sA->pA + aj*ps;
2115 double *pB = sB->pA + bj*ps;
2116 double *pC = sC->pA + cj*ps;
2117 double *pD = sD->pA + dj*ps;
2118 double *dD = sD->dA; // XXX what to do if di and dj are not zero
2119
2120 if(di==0 & dj==0)
2121 sD->use_dA = 1;
2122 else
2123 sD->use_dA = 0;
2124
2125 int i, j, l;
2126
2127 i = 0;
2128
2129#if defined(TARGET_X64_INTEL_HASWELL)
2130 for(; i<m-11; i+=12)
2131 {
2132 j = 0;
2133 for(; j<i & j<n-3; j+=4)
2134 {
2135 kernel_dgemm_dtrsm_nt_rl_inv_12x4_lib4(k, &pA[i*sda], sda, &pB[j*sdb], j, &pD[i*sdd], sdd, &pD[j*sdd], &pC[j*ps+i*sdc], sdc, &pD[j*ps+i*sdd], sdd, &pD[j*ps+j*sdd], &dD[j]);
2136 }
2137 if(j<n)
2138 {
2139 if(j<i) // dgemm
2140 {
2141 kernel_dgemm_dtrsm_nt_rl_inv_12x4_vs_lib4(k, &pA[i*sda], sda, &pB[j*sdb], j, &pD[i*sdd], sdd, &pD[j*sdd], &pC[j*ps+i*sdc], sdc, &pD[j*ps+i*sdd], sdd, &pD[j*ps+j*sdd], &dD[j], m-i, n-j);
2142 }
2143 else // dsyrk
2144 {
2145 if(j<n-11)
2146 {
2147 kernel_dsyrk_dpotrf_nt_l_12x4_lib4(k, &pA[i*sda], sda, &pB[j*sdb], j, &pD[i*sdd], sdd, &pD[j*sdd], &pC[j*ps+j*sdc], sdc, &pD[j*ps+j*sdd], sdd, &dD[j]);
2148 kernel_dsyrk_dpotrf_nt_l_8x8_lib4(k, &pA[(i+4)*sda], sda, &pB[(j+4)*sdb], sdb, j+4, &pD[(i+4)*sdd], sdd, &pD[(j+4)*sdd], sdd, &pC[(j+4)*ps+(i+4)*sdc], sdc, &pD[(j+4)*ps+(i+4)*sdd], sdd, &dD[j+4]);
2149 }
2150 else
2151 {
2152 kernel_dsyrk_dpotrf_nt_l_12x4_vs_lib4(k, &pA[i*sda], sda, &pB[j*sdb], j, &pD[i*sdd], sdd, &pD[j*sdd], &pC[j*ps+j*sdc], sdc, &pD[j*ps+j*sdd], sdd, &dD[j], m-i, n-j);
2153 if(j<n-4)
2154 {
2155 if(j<n-8)
2156 {
2157 kernel_dsyrk_dpotrf_nt_l_8x8_vs_lib4(k, &pA[(i+4)*sda], sda, &pB[(j+4)*sdb], sdb, j+4, &pD[(i+4)*sdd], sdd, &pD[(j+4)*sdd], sdd, &pC[(j+4)*ps+(i+4)*sdc], sdc, &pD[(j+4)*ps+(i+4)*sdd], sdd, &dD[j+4], m-i-4, n-j-4);
2158 }
2159 else
2160 {
2161 kernel_dsyrk_dpotrf_nt_l_8x4_vs_lib4(k, &pA[(i+4)*sda], sda, &pB[(j+4)*sdb], j+4, &pD[(i+4)*sdd], sdd, &pD[(j+4)*sdd], &pC[(j+4)*ps+(i+4)*sdc], sdc, &pD[(j+4)*ps+(i+4)*sdd], sdd, &dD[j+4], m-i-4, n-j-4);
2162 }
2163 }
2164 }
2165 }
2166 }
2167 }
2168 if(m>i)
2169 {
2170 if(m-i<=4)
2171 {
2172 goto left_4;
2173 }
2174 else if(m-i<=8)
2175 {
2176 goto left_8;
2177 }
2178 else
2179 {
2180 goto left_12;
2181 }
2182 }
2183#elif defined(TARGET_X64_INTEL_SANDY_BRIDGE)
2184 for(; i<m-7; i+=8)
2185 {
2186 j = 0;
2187 for(; j<i & j<n-3; j+=4)
2188 {
2189 kernel_dgemm_dtrsm_nt_rl_inv_8x4_lib4(k, &pA[i*sda], sda, &pB[j*sdb], j, &pD[i*sdd], sdd, &pD[j*sdd], &pC[j*ps+i*sdc], sdc, &pD[j*ps+i*sdd], sdd, &pD[j*ps+j*sdd], &dD[j]);
2190 }
2191 if(j<n)
2192 {
2193 if(j<i) // dgemm
2194 {
2195 kernel_dgemm_dtrsm_nt_rl_inv_8x4_vs_lib4(k, &pA[i*sda], sda, &pB[j*sdb], j, &pD[i*sdd], sdd, &pD[j*sdd], &pC[j*ps+i*sdc], sdc, &pD[j*ps+i*sdd], sdd, &pD[j*ps+j*sdd], &dD[j], m-i, n-j);
2196 }
2197 else // dsyrk
2198 {
2199 if(j<n-7)
2200// if(0)
2201 {
2202 kernel_dsyrk_dpotrf_nt_l_8x4_lib4(k, &pA[i*sda], sda, &pB[j*sdb], j, &pD[i*sdd], sdd, &pD[j*sdd], &pC[j*ps+j*sdc], sdc, &pD[j*ps+j*sdd], sdd, &dD[j]);
2203 kernel_dsyrk_dpotrf_nt_l_4x4_lib4(k, &pA[(i+4)*sda], &pB[(j+4)*sdb], j+4, &pD[(i+4)*sdd], &pD[(j+4)*sdd], &pC[(j+4)*ps+(i+4)*sdc], &pD[(j+4)*ps+(i+4)*sdd], &dD[j+4]);
2204 }
2205 else
2206 {
2207 kernel_dsyrk_dpotrf_nt_l_8x4_vs_lib4(k, &pA[i*sda], sda, &pB[j*sdb], j, &pD[i*sdd], sdd, &pD[j*sdd], &pC[j*ps+j*sdc], sdc, &pD[j*ps+j*sdd], sdd, &dD[j], m-i, n-j);
2208 if(j<n-4)
2209 {
2210 kernel_dsyrk_dpotrf_nt_l_4x4_vs_lib4(k, &pA[(i+4)*sda], &pB[(j+4)*sdb], j+4, &pD[(i+4)*sdd], &pD[(j+4)*sdd], &pC[(j+4)*ps+(i+4)*sdc], &pD[(j+4)*ps+(i+4)*sdd], &dD[j+4], m-i-4, n-j-4);
2211 }
2212 }
2213 }
2214 }
2215 }
2216 if(m>i)
2217 {
2218 if(m-i<=4)
2219 {
2220 goto left_4;
2221 }
2222 else
2223 {
2224 goto left_8;
2225 }
2226 }
2227#else
2228 for(; i<m-3; i+=4)
2229 {
2230 j = 0;
2231 for(; j<i & j<n-3; j+=4)
2232 {
2233 kernel_dgemm_dtrsm_nt_rl_inv_4x4_lib4(k, &pA[i*sda], &pB[j*sdb], j, &pD[i*sdd], &pD[j*sdd], &pC[j*ps+i*sdc], &pD[j*ps+i*sdd], &pD[j*ps+j*sdd], &dD[j]);
2234 }
2235 if(j<n)
2236 {
2237 if(i<j) // dgemm
2238 {
2239 kernel_dgemm_dtrsm_nt_rl_inv_4x4_vs_lib4(k, &pA[i*sda], &pB[j*sdb], j, &pD[i*sdd], &pD[j*sdd], &pC[j*ps+i*sdc], &pD[j*ps+i*sdd], &pD[j*ps+j*sdd], &dD[j], m-i, n-j);
2240 }
2241 else // dsyrk
2242 {
2243 if(j<n-3)
2244 {
2245 kernel_dsyrk_dpotrf_nt_l_4x4_lib4(k, &pA[i*sda], &pB[j*sdb], j, &pD[i*sdd], &pD[j*sdd], &pC[j*ps+j*sdc], &pD[j*ps+j*sdd], &dD[j]);
2246 }
2247 else
2248 {
2249 kernel_dsyrk_dpotrf_nt_l_4x4_vs_lib4(k, &pA[i*sda], &pB[j*sdb], j, &pD[i*sdd], &pD[j*sdd], &pC[j*ps+j*sdc], &pD[j*ps+j*sdd], &dD[j], m-i, n-j);
2250 }
2251 }
2252 }
2253 }
2254 if(m>i)
2255 {
2256 goto left_4;
2257 }
2258#endif
2259
2260 // common return if i==m
2261 return;
2262
2263 // clean up loops definitions
2264
2265#if defined(TARGET_X64_INTEL_HASWELL)
2266 left_12:
2267 j = 0;
2268 for(; j<i & j<n; j+=4)
2269 {
2270 kernel_dgemm_dtrsm_nt_rl_inv_12x4_vs_lib4(k, &pA[i*sda], sda, &pB[j*sdb], j, &pD[i*sdd], sdd, &pD[j*sdd], &pC[j*ps+i*sdc], sdc, &pD[j*ps+i*sdd], sdd, &pD[j*ps+j*sdd], &dD[j], m-i, n-j);
2271 }
2272 if(j<n)
2273 {
2274 kernel_dsyrk_dpotrf_nt_l_12x4_vs_lib4(k, &pA[i*sda], sda, &pB[j*sdb], j, &pD[i*sdd], sdd, &pD[j*sdd], &pC[j*ps+j*sdc], sdc, &pD[j*ps+j*sdd], sdd, &dD[j], m-i, n-j);
2275 if(j<n-4)
2276 {
2277 kernel_dsyrk_dpotrf_nt_l_8x4_vs_lib4(k, &pA[(i+4)*sda], sda, &pB[(j+4)*sdb], j+4, &pD[(i+4)*sdd], sdd, &pD[(j+4)*sdd], &pC[(j+4)*ps+(i+4)*sdc], sdc, &pD[(j+4)*ps+(i+4)*sdd], sdd, &dD[j+4], m-i-4, n-j-4);
2278 if(j<n-8)
2279 {
2280 kernel_dsyrk_dpotrf_nt_l_4x4_vs_lib4(k, &pA[(i+8)*sda], &pB[(j+8)*sdb], j+8, &pD[(i+8)*sdd], &pD[(j+8)*sdd], &pC[(j+8)*ps+(i+8)*sdc], &pD[(j+8)*ps+(i+8)*sdd], &dD[j+8], m-i-8, n-j-8);
2281 }
2282 }
2283 }
2284 return;
2285#endif
2286
2287#if defined(TARGET_X64_INTEL_HASWELL)
2288 left_8:
2289 j = 0;
2290 for(; j<i-8 & j<n-8; j+=12)
2291 {
2292 kernel_dgemm_dtrsm_nt_rl_inv_8x8l_vs_lib4(k, &pA[i*sda], sda, &pB[j*sdb], sdb, j, &pD[i*sdd], sdd, &pD[j*sdd], sdd, &pC[j*ps+i*sdc], sdc, &pD[j*ps+i*sdd], sdd, &pD[j*ps+j*sdd], sdd, &dD[j], m-i, n-j);
2293 kernel_dgemm_dtrsm_nt_rl_inv_8x8u_vs_lib4(k, &pA[i*sda], sda, &pB[(j+4)*sdb], sdb, (j+4), &pD[i*sdd], sdd, &pD[(j+4)*sdd], sdd, &pC[(j+4)*ps+i*sdc], sdc, &pD[(j+4)*ps+i*sdd], sdd, &pD[(j+4)*ps+(j+4)*sdd], sdd, &dD[(j+4)], m-i, n-(j+4));
2294 }
2295 if(j<i-3 & j<n-3)
2296 {
2297 kernel_dgemm_dtrsm_nt_rl_inv_8x8l_vs_lib4(k, &pA[i*sda], sda, &pB[j*sdb], sdb, j, &pD[i*sdd], sdd, &pD[j*sdd], sdd, &pC[j*ps+i*sdc], sdc, &pD[j*ps+i*sdd], sdd, &pD[j*ps+j*sdd], sdd, &dD[j], m-i, n-j);
2298 kernel_dgemm_dtrsm_nt_rl_inv_4x4_vs_lib4(k, &pA[i*sda], &pB[(j+4)*sdb], (j+4), &pD[i*sdd], &pD[(j+4)*sdd], &pC[(j+4)*ps+i*sdc], &pD[(j+4)*ps+i*sdd], &pD[(j+4)*ps+(j+4)*sdd], &dD[(j+4)], m-i, n-(j+4));
2299 j += 8;
2300 }
2301 else if(j<i & j<n)
2302 {
2303 kernel_dgemm_dtrsm_nt_rl_inv_8x4_vs_lib4(k, &pA[i*sda], sda, &pB[j*sdb], j, &pD[i*sdd], sdd, &pD[j*sdd], &pC[j*ps+i*sdc], sdc, &pD[j*ps+i*sdd], sdd, &pD[j*ps+j*sdd], &dD[j], m-i, n-j);
2304 j += 4;
2305 }
2306 if(j<n)
2307 {
2308 kernel_dsyrk_dpotrf_nt_l_8x4_vs_lib4(k, &pA[i*sda], sda, &pB[j*sdb], j, &pD[i*sdd], sdd, &pD[j*sdd], &pC[j*ps+j*sdc], sdc, &pD[j*ps+j*sdd], sdd, &dD[j], m-i, n-j);
2309 if(j<n-4)
2310 {
2311 kernel_dsyrk_dpotrf_nt_l_4x4_vs_lib4(k, &pA[(i+4)*sda], &pB[(j+4)*sdb], j+4, &pD[(i+4)*sdd], &pD[(j+4)*sdd], &pC[(j+4)*ps+(i+4)*sdc], &pD[(j+4)*ps+(i+4)*sdd], &dD[j+4], m-i-4, n-j-4);
2312 }
2313 }
2314 return;
2315#elif defined(TARGET_X64_INTEL_SANDY_BRIDGE)
2316 left_8:
2317 j = 0;
2318 for(; j<i & j<n; j+=4)
2319 {
2320 kernel_dgemm_dtrsm_nt_rl_inv_8x4_vs_lib4(k, &pA[i*sda], sda, &pB[j*sdb], j, &pD[i*sdd], sdd, &pD[j*sdd], &pC[j*ps+i*sdc], sdc, &pD[j*ps+i*sdd], sdd, &pD[j*ps+j*sdd], &dD[j], m-i, n-j);
2321 }
2322 if(j<n)
2323 {
2324 kernel_dsyrk_dpotrf_nt_l_8x4_vs_lib4(k, &pA[i*sda], sda, &pB[j*sdb], j, &pD[i*sdd], sdd, &pD[j*sdd], &pC[j*ps+j*sdc], sdc, &pD[j*ps+j*sdd], sdd, &dD[j], m-i, n-j);
2325 if(j<n-4)
2326 {
2327 kernel_dsyrk_dpotrf_nt_l_4x4_vs_lib4(k, &pA[(i+4)*sda], &pB[(j+4)*sdb], j+4, &pD[(i+4)*sdd], &pD[(j+4)*sdd], &pC[(j+4)*ps+(i+4)*sdc], &pD[(j+4)*ps+(i+4)*sdd], &dD[j+4], m-i-4, n-j-4);
2328 }
2329 }
2330 return;
2331#endif
2332
2333#if defined(TARGET_X64_INTEL_HASWELL)
2334 left_4:
2335 j = 0;
2336 for(; j<i-8 & j<n-8; j+=12)
2337 {
2338 kernel_dgemm_dtrsm_nt_rl_inv_4x12_vs_lib4(k, &pA[i*sda], &pB[j*sdb], sdb, j, &pD[i*sdd], &pD[j*sdd], sdd, &pC[j*ps+i*sdc], &pD[j*ps+i*sdd], &pD[j*ps+j*sdd], sdd, &dD[j], m-i, n-j);
2339 }
2340 if(j<i-4 & j<n-4)
2341 {
2342 kernel_dgemm_dtrsm_nt_rl_inv_4x8_vs_lib4(k, &pA[i*sda], &pB[j*sdb], sdb, j, &pD[i*sdd], &pD[j*sdd], sdd, &pC[j*ps+i*sdc], &pD[j*ps+i*sdd], &pD[j*ps+j*sdd], sdd, &dD[j], m-i, n-j);
2343 j += 8;
2344 }
2345 else if(j<i & j<n)
2346 {
2347 kernel_dgemm_dtrsm_nt_rl_inv_4x4_vs_lib4(k, &pA[i*sda], &pB[j*sdb], j, &pD[i*sdd], &pD[j*sdd], &pC[j*ps+i*sdc], &pD[j*ps+i*sdd], &pD[j*ps+j*sdd], &dD[j], m-i, n-j);
2348 j += 4;
2349 }
2350 if(j<n)
2351 {
2352 kernel_dsyrk_dpotrf_nt_l_4x4_vs_lib4(k, &pA[i*sda], &pB[j*sdb], j, &pD[i*sdd], &pD[j*sdd], &pC[j*ps+j*sdc], &pD[j*ps+j*sdd], &dD[j], m-i, n-j);
2353 }
2354#else
2355 left_4:
2356 j = 0;
2357 for(; j<i & j<n; j+=4)
2358 {
2359 kernel_dgemm_dtrsm_nt_rl_inv_4x4_vs_lib4(k, &pA[i*sda], &pB[j*sdb], j, &pD[i*sdd], &pD[j*sdd], &pC[j*ps+i*sdc], &pD[j*ps+i*sdd], &pD[j*ps+j*sdd], &dD[j], m-i, n-j);
2360 }
2361 if(j<n)
2362 {
2363 kernel_dsyrk_dpotrf_nt_l_4x4_vs_lib4(k, &pA[i*sda], &pB[j*sdb], j, &pD[i*sdd], &pD[j*sdd], &pC[j*ps+j*sdc], &pD[j*ps+j*sdd], &dD[j], m-i, n-j);
2364 }
2365#endif
2366
2367 return;
2368
2369 }
2370
2371
2372
2373// dgetrf without pivoting
2374void dgetrf_nopivot_libstr(int m, int n, struct d_strmat *sC, int ci, int cj, struct d_strmat *sD, int di, int dj)
2375 {
2376 if(ci!=0 | di!=0)
2377 {
2378 printf("\ndgetf_nopivot_libstr: feature not implemented yet: ci=%d, di=%d\n", ci, di);
2379 exit(1);
2380 }
2381 const int ps = 4;
2382 int sdc = sC->cn;
2383 int sdd = sD->cn;
2384 double *pC = sC->pA + cj*ps;
2385 double *pD = sD->pA + dj*ps;
2386 double *dD = sD->dA; // XXX what to do if di and dj are not zero
2387 dgetrf_nn_nopivot_lib(m, n, pC, sdc, pD, sdd, dD);
2388 if(di==0 && dj==0)
2389 sD->use_dA = 1;
2390 else
2391 sD->use_dA = 0;
2392 return;
2393 }
2394
2395
2396
2397
2398// dgetrf pivoting
2399void dgetrf_libstr(int m, int n, struct d_strmat *sC, int ci, int cj, struct d_strmat *sD, int di, int dj, int *ipiv)
2400 {
2401 if(ci!=0 | di!=0)
2402 {
2403 printf("\ndgetrf_libstr: feature not implemented yet: ci=%d, di=%d\n", ci, di);
2404 exit(1);
2405 }
2406 const int ps = 4;
2407 int sdc = sC->cn;
2408 int sdd = sD->cn;
2409 double *pC = sC->pA + cj*ps;
2410 double *pD = sD->pA + dj*ps;
2411 double *dD = sD->dA; // XXX what to do if di and dj are not zero
2412 dgetrf_nn_lib(m, n, pC, sdc, pD, sdd, dD, ipiv);
2413 if(di==0 && dj==0)
2414 sD->use_dA = 1;
2415 else
2416 sD->use_dA = 0;
2417 return;
2418 }
2419
2420
2421
2422int dgeqrf_work_size_libstr(int m, int n)
2423 {
2424 const int ps = 4;
2425 int cm = (m+ps-1)/ps*ps;
2426 int cn = (n+ps-1)/ps*ps;
2427 return ps*(cm+cn)*sizeof(double);
2428// return 0;
2429 }
2430
2431
2432
2433void dgeqrf_libstr(int m, int n, struct d_strmat *sC, int ci, int cj, struct d_strmat *sD, int di, int dj, void *v_work)
2434 {
2435 char *work = (char *) v_work;
2436 if(m<=0 | n<=0)
2437 return;
2438 const int ps = 4;
2439 int sdc = sC->cn;
2440 int sdd = sD->cn;
2441 double *pC = &(DMATEL_LIBSTR(sC,ci,cj));
2442 double *pD = &(DMATEL_LIBSTR(sD,di,dj));
2443 double *dD = sD->dA + di;
2444 int cm = (m+ps-1)/ps*ps;
2445 int cn = (n+ps-1)/ps*ps;
2446 double *pVt = (double *) work;
2447 work += ps*cm*sizeof(double);
2448 double *pW = (double *) work;
2449 work += ps*cn*sizeof(double);
2450 if(pC!=pD)
2451 dgecp_lib(m, n, 1.0, ci&(ps-1), pC, sdc, di&(ps-1), pD, sdd);
2452 int ii;
2453 int imax0 = (ps-(di&(ps-1)))&(ps-1);
2454 int imax = m<n ? m : n;
2455 imax0 = imax<imax0 ? imax : imax0;
2456 if(imax0>0)
2457 {
2458 kernel_dgeqrf_vs_lib4(m, n, imax0, di&(ps-1), pD, sdd, dD);
2459 pD += imax0-ps+ps*sdd+imax0*ps;
2460 dD += imax0;
2461 m -= imax0;
2462 n -= imax0;
2463 imax -= imax0;
2464 }
2465 for(ii=0; ii<imax-3; ii+=4)
2466 {
2467 kernel_dgeqrf_4_lib4(m-ii, pD+ii*sdd+ii*ps, sdd, dD+ii);
2468#if 0
2469 kernel_dlarf_4_lib4(m-ii, n-ii-4, pD+ii*sdd+ii*ps, sdd, dD+ii, pD+ii*sdd+(ii+4)*ps, sdd);
2470#else
2471 kernel_dgetr_4_0_lib4(m-ii, pD+ii*sdd+ii*ps, sdd, pVt);
2472 pVt[0+ps*0] = 1.0;
2473 pVt[1+ps*0] = 0.0;
2474 pVt[2+ps*0] = 0.0;
2475 pVt[3+ps*0] = 0.0;
2476 pVt[1+ps*1] = 1.0;
2477 pVt[2+ps*1] = 0.0;
2478 pVt[3+ps*1] = 0.0;
2479 pVt[2+ps*2] = 1.0;
2480 pVt[3+ps*2] = 0.0;
2481 pVt[3+ps*3] = 1.0;
2482 kernel_dlarf_t_4_lib4(m-ii, n-ii-4, pD+ii*sdd+ii*ps, sdd, pVt, dD+ii, pD+ii*sdd+(ii+4)*ps, sdd, pW);
2483#endif
2484 }
2485 if(ii<imax)
2486 {
2487 kernel_dgeqrf_vs_lib4(m-ii, n-ii, imax-ii, ii&(ps-1), pD+ii*sdd+ii*ps, sdd, dD+ii);
2488 }
2489 return;
2490 }
2491
2492
2493
2494int dgelqf_work_size_libstr(int m, int n)
2495 {
2496 return 0;
2497 }
2498
2499
2500
2501void dgelqf_libstr(int m, int n, struct d_strmat *sC, int ci, int cj, struct d_strmat *sD, int di, int dj, void *work)
2502 {
2503 if(m<=0 | n<=0)
2504 return;
2505 const int ps = 4;
2506 int sdc = sC->cn;
2507 int sdd = sD->cn;
2508 double *pC = &(DMATEL_LIBSTR(sC,ci,cj));
2509 double *pD = &(DMATEL_LIBSTR(sD,di,dj));
2510 double *dD = sD->dA + di;
2511#if defined(TARGET_X64_INTEL_HASWELL)
2512 double pT[144] __attribute__ ((aligned (64))) = {0};
2513 double pK[96] __attribute__ ((aligned (64))) = {0};
2514#else
2515 double pT[144] = {0};
2516 double pK[96] = {0};
2517#endif
2518 if(pC!=pD)
2519 dgecp_lib(m, n, 1.0, ci&(ps-1), pC, sdc, di&(ps-1), pD, sdd);
2520 int ii, jj, ll;
2521 int imax0 = (ps-(di&(ps-1)))&(ps-1);
2522 int imax = m<n ? m : n;
2523#if 0
2524 kernel_dgelqf_vs_lib4(m, n, imax, di&(ps-1), pD, sdd, dD);
2525#else
2526 imax0 = imax<imax0 ? imax : imax0;
2527 if(imax0>0)
2528 {
2529 kernel_dgelqf_vs_lib4(m, n, imax0, di&(ps-1), pD, sdd, dD);
2530 pD += imax0-ps+ps*sdd+imax0*ps;
2531 dD += imax0;
2532 m -= imax0;
2533 n -= imax0;
2534 imax -= imax0;
2535 }
2536 ii = 0;
2537#if defined(TARGET_X64_INTEL_HASWELL)
2538// for(; ii<imax-11; ii+=12)
2539 for(; ii<imax-127; ii+=12) // crossover point ~ ii=128
2540 {
2541 kernel_dgelqf_dlarft12_12_lib4(n-(ii+0), pD+(ii+0)*sdd+(ii+0)*ps, sdd, dD+(ii+0), &pT[0+0*12+0*ps]);
2542 jj = ii+12;
2543 for(; jj<m; jj+=4)
2544 {
2545 kernel_dlarfb12_r_4_lib4(n-ii, pD+ii*sdd+ii*ps, sdd, pT, pD+jj*sdd+ii*ps, pK, m-jj);
2546 }
2547 }
2548 for(; ii<imax-11; ii+=4)
2549 {
2550 kernel_dgelqf_dlarft4_12_lib4(n-ii, pD+ii*sdd+ii*ps, sdd, dD+ii, pT);
2551 jj = ii+12;
2552 for(; jj<m-11; jj+=12)
2553 {
2554 kernel_dlarfb4_r_12_lib4(n-ii, pD+ii*sdd+ii*ps, pT, pD+jj*sdd+ii*ps, sdd);
2555 }
2556 for(; jj<m-7; jj+=8)
2557 {
2558 kernel_dlarfb4_r_8_lib4(n-ii, pD+ii*sdd+ii*ps, pT, pD+jj*sdd+ii*ps, sdd);
2559 }
2560 for(; jj<m-3; jj+=4)
2561 {
2562 kernel_dlarfb4_r_4_lib4(n-ii, pD+ii*sdd+ii*ps, pT, pD+jj*sdd+ii*ps);
2563 }
2564 for(ll=0; ll<m-jj; ll++)
2565 {
2566 kernel_dlarfb4_r_1_lib4(n-ii, pD+ii*sdd+ii*ps, pT, pD+ll+jj*sdd+ii*ps);
2567 }
2568 }
2569 // 8 9 10 11
2570 if(ii<imax-7)
2571 {
2572 kernel_dgelqf_dlarft4_8_lib4(n-ii, pD+ii*sdd+ii*ps, sdd, dD+ii, pT);
2573 jj = ii+8;
2574 if(jj<m)
2575 {
2576 for(; jj<m-11; jj+=12)
2577 {
2578 kernel_dlarfb4_r_12_lib4(n-ii, pD+ii*sdd+ii*ps, pT, pD+jj*sdd+ii*ps, sdd);
2579 }
2580 for(; jj<m-7; jj+=8)
2581 {
2582 kernel_dlarfb4_r_8_lib4(n-ii, pD+ii*sdd+ii*ps, pT, pD+jj*sdd+ii*ps, sdd);
2583 }
2584 for(; jj<m-3; jj+=4)
2585 {
2586 kernel_dlarfb4_r_4_lib4(n-ii, pD+ii*sdd+ii*ps, pT, pD+jj*sdd+ii*ps);
2587 }
2588 for(ll=0; ll<m-jj; ll++)
2589 {
2590 kernel_dlarfb4_r_1_lib4(n-ii, pD+ii*sdd+ii*ps, pT, pD+ll+jj*sdd+ii*ps);
2591 }
2592 }
2593 ii += 4;
2594 }
2595 // 4 5 6 7
2596 if(ii<imax-3)
2597 {
2598 kernel_dgelqf_dlarft4_4_lib4(n-ii, pD+ii*sdd+ii*ps, dD+ii, pT);
2599 jj = ii+4;
2600 if(jj<m)
2601 {
2602 for(; jj<m-11; jj+=12)
2603 {
2604 kernel_dlarfb4_r_12_lib4(n-ii, pD+ii*sdd+ii*ps, pT, pD+jj*sdd+ii*ps, sdd);
2605 }
2606 for(; jj<m-7; jj+=8)
2607 {
2608 kernel_dlarfb4_r_8_lib4(n-ii, pD+ii*sdd+ii*ps, pT, pD+jj*sdd+ii*ps, sdd);
2609 }
2610 for(; jj<m-3; jj+=4)
2611 {
2612 kernel_dlarfb4_r_4_lib4(n-ii, pD+ii*sdd+ii*ps, pT, pD+jj*sdd+ii*ps);
2613 }
2614 for(ll=0; ll<m-jj; ll++)
2615 {
2616 kernel_dlarfb4_r_1_lib4(n-ii, pD+ii*sdd+ii*ps, pT, pD+ll+jj*sdd+ii*ps);
2617 }
2618 }
2619 ii += 4;
2620 }
2621 // 1 2 3
2622 if(ii<imax)
2623 {
2624 kernel_dgelqf_vs_lib4(m-ii, n-ii, imax-ii, ii&(ps-1), pD+ii*sdd+ii*ps, sdd, dD+ii);
2625 }
2626#else // no haswell
2627 for(ii=0; ii<imax-4; ii+=4)
2628 {
2629// kernel_dgelqf_vs_lib4(4, n-ii, 4, 0, pD+ii*sdd+ii*ps, sdd, dD+ii);
2630// kernel_dgelqf_4_lib4(n-ii, pD+ii*sdd+ii*ps, dD+ii);
2631// kernel_dlarft_4_lib4(n-ii, pD+ii*sdd+ii*ps, dD+ii, pT);
2632 kernel_dgelqf_dlarft4_4_lib4(n-ii, pD+ii*sdd+ii*ps, dD+ii, pT);
2633 jj = ii+4;
2634#if defined(TARGET_X64_INTEL_SANDY_BRIDGE)
2635 for(; jj<m-7; jj+=8)
2636 {
2637 kernel_dlarfb4_r_8_lib4(n-ii, pD+ii*sdd+ii*ps, pT, pD+jj*sdd+ii*ps, sdd);
2638 }
2639#endif
2640 for(; jj<m-3; jj+=4)
2641 {
2642 kernel_dlarfb4_r_4_lib4(n-ii, pD+ii*sdd+ii*ps, pT, pD+jj*sdd+ii*ps);
2643 }
2644 for(ll=0; ll<m-jj; ll++)
2645 {
2646 kernel_dlarfb4_r_1_lib4(n-ii, pD+ii*sdd+ii*ps, pT, pD+ll+jj*sdd+ii*ps);
2647 }
2648 }
2649 if(ii<imax)
2650 {
2651 if(ii==imax-4)
2652 {
2653 kernel_dgelqf_4_lib4(n-ii, pD+ii*sdd+ii*ps, dD+ii);
2654 }
2655 else
2656 {
2657 kernel_dgelqf_vs_lib4(m-ii, n-ii, imax-ii, ii&(ps-1), pD+ii*sdd+ii*ps, sdd, dD+ii);
2658 }
2659 }
2660#endif // no haswell
2661#endif
2662 return;
2663 }
2664
2665
2666
2667#else
2668
2669#error : wrong LA choice
2670
2671#endif