// Ceres Solver - A fast non-linear least squares minimizer
// Copyright 2018 Google Inc. All rights reserved.
// http://ceres-solver.org/
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are met:
//
// * Redistributions of source code must retain the above copyright notice,
//   this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright notice,
//   this list of conditions and the following disclaimer in the documentation
//   and/or other materials provided with the distribution.
// * Neither the name of Google Inc. nor the names of its contributors may be
//   used to endorse or promote products derived from this software without
//   specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
// POSSIBILITY OF SUCH DAMAGE.
//
// Author: yangfan34@lenovo.com (Lenovo Research Device+ Lab - Shanghai)
//
// Optimization for simple blas functions used in the Schur Eliminator.
// These are fairly basic implementations which already yield a significant
// speedup in the eliminator performance.
35#ifndef CERES_INTERNAL_SMALL_BLAS_GENERIC_H_
36#define CERES_INTERNAL_SMALL_BLAS_GENERIC_H_
37
38namespace ceres {
39namespace internal {
40
// The following macros are used to share code between the kernels below.
//
// Common prologue: four scalar accumulators (one per output entry of the
// 1x4 / 4x1 strip), cursors into the operands, and the split of the
// reduction length col_a into an unrolled part col_m (a multiple of
// span == 4) and a remainder col_r (< 4).  Expects `a`, `b` (const
// double*) and `col_a` (int) to be in scope at the expansion site.
#define CERES_GEMM_OPT_NAIVE_HEADER \
  double c0 = 0.0;                  \
  double c1 = 0.0;                  \
  double c2 = 0.0;                  \
  double c3 = 0.0;                  \
  const double* pa = a;             \
  const double* pb = b;             \
  const int span = 4;               \
  int col_r = col_a & (span - 1);   \
  int col_m = col_a - col_r;
52
// Common epilogue: commit the four accumulators c0..c3 to the output
// pointer `c` according to kOperation (> 0: +=, < 0: -=, == 0: =).
// Note: advances `c` past the four written entries as a side effect.
#define CERES_GEMM_OPT_STORE_MAT1X4 \
  if (kOperation > 0) {             \
    *c++ += c0;                     \
    *c++ += c1;                     \
    *c++ += c2;                     \
    *c++ += c3;                     \
  } else if (kOperation < 0) {      \
    *c++ -= c0;                     \
    *c++ -= c1;                     \
    *c++ -= c2;                     \
    *c++ -= c3;                     \
  } else {                          \
    *c++ = c0;                      \
    *c++ = c1;                      \
    *c++ = c2;                      \
    *c++ = c3;                      \
  }
70
// Matrix-Matrix Multiplication
// Figure out 1x4 of Matrix C in one batch
//
//   c op a * B;
// where op can be +=, -=, or =, indicated by kOperation (>0, <0, ==0).
//
//  Matrix C              Matrix A                   Matrix B
//
//  C0, C1, C2, C3   op   A0, A1, A2, A3, ...   *   B0, B1, B2, B3
//                                                  B4, B5, B6, B7
//                                                  B8, B9, Ba, Bb
//                                                  Bc, Bd, Be, Bf
//                                                  . , . , . , .
//
// `a` is a row of col_a entries; `b` points at a col_a x 4 slice of B
// stored row-major with row stride col_stride_b; `c` receives 4 doubles.
// The inner loop is unrolled by a factor of 4 to reuse data resident in
// cache.
// NOTE: col_a means the columns of A.
static inline void MMM_mat1x4(const int col_a,
                              const double* a,
                              const double* b,
                              const int col_stride_b,
                              double* c,
                              const int kOperation) {
  // One accumulator per output column of the 1x4 strip.
  double c0 = 0.0;
  double c1 = 0.0;
  double c2 = 0.0;
  double c3 = 0.0;
  // Split the reduction length into an unrolled multiple of 4 (col_m)
  // and a remainder (col_r < 4).
  const int span = 4;
  const int col_r = col_a & (span - 1);
  const int col_m = col_a - col_r;
  int bi = 0;  // offset of the current row of B

// One multiply-accumulate step: scale row k of B (4 entries) by a[k].
// Advances the loop counter k itself, so the loops below omit it.
#define CERES_GEMM_OPT_MMM_MAT1X4_MUL \
  {                                   \
    const double av = a[k];           \
    const double* pb = b + bi;        \
    c0 += av * pb[0];                 \
    c1 += av * pb[1];                 \
    c2 += av * pb[2];                 \
    c3 += av * pb[3];                 \
    bi += col_stride_b;               \
    ++k;                              \
  }

  // Main loop, unrolled x4.
  for (int k = 0; k < col_m;) {
    CERES_GEMM_OPT_MMM_MAT1X4_MUL
    CERES_GEMM_OPT_MMM_MAT1X4_MUL
    CERES_GEMM_OPT_MMM_MAT1X4_MUL
    CERES_GEMM_OPT_MMM_MAT1X4_MUL
  }

  // Remainder loop (at most 3 iterations).
  for (int k = col_m; k < col_a;) {
    CERES_GEMM_OPT_MMM_MAT1X4_MUL
  }
#undef CERES_GEMM_OPT_MMM_MAT1X4_MUL

  // Commit the accumulators to c according to kOperation.
  if (kOperation > 0) {
    c[0] += c0;
    c[1] += c1;
    c[2] += c2;
    c[3] += c3;
  } else if (kOperation < 0) {
    c[0] -= c0;
    c[1] -= c1;
    c[2] -= c2;
    c[3] -= c3;
  } else {
    c[0] = c0;
    c[1] = c1;
    c[2] = c2;
    c[3] = c3;
  }
}
125
// Matrix Transpose-Matrix multiplication
// Figure out 1x4 of Matrix C in one batch
//
//   c op a' * B;
// where op can be +=, -=, or =, indicated by kOperation (>0, <0, ==0).
//
//  Matrix A
//
//  A0
//  A1
//  A2
//  A3
//  .
//
//  Matrix C              Matrix A'                  Matrix B
//
//  C0, C1, C2, C3   op   A0, A1, A2, A3, ...   *   B0, B1, B2, B3
//                                                  B4, B5, B6, B7
//                                                  B8, B9, Ba, Bb
//                                                  Bc, Bd, Be, Bf
//                                                  . , . , . , .
//
// `a` points at a column of A, read with stride col_stride_a so that a'
// is traversed as a row vector; `b` is as in MMM_mat1x4.  The inner loop
// is unrolled by a factor of 4 to reuse data resident in cache.
// NOTE: col_a means the columns of A'.
static inline void MTM_mat1x4(const int col_a,
                              const double* a,
                              const int col_stride_a,
                              const double* b,
                              const int col_stride_b,
                              double* c,
                              const int kOperation) {
  // One accumulator per output column of the 1x4 strip.
  double c0 = 0.0;
  double c1 = 0.0;
  double c2 = 0.0;
  double c3 = 0.0;
  // Split the reduction length into an unrolled multiple of 4 (col_m)
  // and a remainder (col_r < 4).
  const int span = 4;
  const int col_r = col_a & (span - 1);
  const int col_m = col_a - col_r;
  int ai = 0;  // index into the strided column of A
  int bi = 0;  // offset of the current row of B

// One multiply-accumulate step: scale the current row of B (4 entries)
// by the current entry of the A column; both cursors step by one row.
#define CERES_GEMM_OPT_MTM_MAT1X4_MUL \
  {                                   \
    const double av = a[ai];          \
    const double* pb = b + bi;        \
    c0 += av * pb[0];                 \
    c1 += av * pb[1];                 \
    c2 += av * pb[2];                 \
    c3 += av * pb[3];                 \
    ai += col_stride_a;               \
    bi += col_stride_b;               \
  }

  // Main loop, unrolled x4.
  for (int k = 0; k < col_m; k += span) {
    CERES_GEMM_OPT_MTM_MAT1X4_MUL
    CERES_GEMM_OPT_MTM_MAT1X4_MUL
    CERES_GEMM_OPT_MTM_MAT1X4_MUL
    CERES_GEMM_OPT_MTM_MAT1X4_MUL
  }

  // Remainder loop (at most 3 iterations).
  for (int k = col_m; k < col_a; k++) {
    CERES_GEMM_OPT_MTM_MAT1X4_MUL
  }
#undef CERES_GEMM_OPT_MTM_MAT1X4_MUL

  // Commit the accumulators to c according to kOperation.
  if (kOperation > 0) {
    c[0] += c0;
    c[1] += c1;
    c[2] += c2;
    c[3] += c3;
  } else if (kOperation < 0) {
    c[0] -= c0;
    c[1] -= c1;
    c[2] -= c2;
    c[3] -= c3;
  } else {
    c[0] = c0;
    c[1] = c1;
    c[2] = c2;
    c[3] = c3;
  }
}
192
// Matrix-Vector Multiplication
// Figure out 4x1 of vector c in one batch
//
//   c op A * b;
// where op can be +=, -=, or =, indicated by kOperation (>0, <0, ==0).
//
//  Vector c      Matrix A                   Vector b
//
//  C0       op   A0, A1, A2, A3, ...   *    B0
//  C1            A4, A5, A6, A7, ...        B1
//  C2            A8, A9, Aa, Ab, ...        B2
//  C3            Ac, Ad, Ae, Af, ...        B3
//                                           .
//
// `a` points at a 4 x col_a slice of A stored row-major with row stride
// col_stride_a; `b` has col_a entries; `c` receives 4 doubles.  The
// inner loop is unrolled by a factor of 4 to reuse data resident in
// cache.
// NOTE: col_a means the columns of A.
static inline void MVM_mat4x1(const int col_a,
                              const double* a,
                              const int col_stride_a,
                              const double* b,
                              double* c,
                              const int kOperation) {
  // One accumulator per output row of the 4x1 strip.
  double c0 = 0.0;
  double c1 = 0.0;
  double c2 = 0.0;
  double c3 = 0.0;
  // Split the reduction length into an unrolled multiple of 4 (col_m)
  // and a remainder (col_r < 4).
  const int span = 4;
  const int col_r = col_a & (span - 1);
  const int col_m = col_a - col_r;
  const double* pa = a;  // cursor along the top row of the 4-row slice
  const double* pb = b;  // cursor along b

// One multiply-accumulate step: scale the current column of A (one
// entry from each of the 4 rows) by the current entry of b.
// clang-format off
#define CERES_GEMM_OPT_MVM_MAT4X1_MUL  \
  {                                    \
    const double bv = *pb;             \
    c0 += pa[0]                  * bv; \
    c1 += pa[col_stride_a]       * bv; \
    c2 += pa[col_stride_a * 2]   * bv; \
    c3 += pa[col_stride_a * 3]   * bv; \
    ++pa;                              \
    ++pb;                              \
  }
  // clang-format on

  // Main loop, unrolled x4.
  for (int k = 0; k < col_m; k += span) {
    CERES_GEMM_OPT_MVM_MAT4X1_MUL
    CERES_GEMM_OPT_MVM_MAT4X1_MUL
    CERES_GEMM_OPT_MVM_MAT4X1_MUL
    CERES_GEMM_OPT_MVM_MAT4X1_MUL
  }

  // Remainder loop (at most 3 iterations).
  for (int k = col_m; k < col_a; k++) {
    CERES_GEMM_OPT_MVM_MAT4X1_MUL
  }
#undef CERES_GEMM_OPT_MVM_MAT4X1_MUL

  // Commit the accumulators to c according to kOperation.
  if (kOperation > 0) {
    c[0] += c0;
    c[1] += c1;
    c[2] += c2;
    c[3] += c3;
  } else if (kOperation < 0) {
    c[0] -= c0;
    c[1] -= c1;
    c[2] -= c2;
    c[3] -= c3;
  } else {
    c[0] = c0;
    c[1] = c1;
    c[2] = c2;
    c[3] = c3;
  }
}
247
// Matrix Transpose-Vector multiplication
// Figure out 4x1 of vector c in one batch
//
//   c op A' * b;
// where op can be +=, -=, or =, indicated by kOperation (>0, <0, ==0).
//
//  Matrix A
//
//  A0, A4, A8, Ac
//  A1, A5, A9, Ad
//  A2, A6, Aa, Ae
//  A3, A7, Ab, Af
//  . , . , . , .
//
//  Vector c      Matrix A'                  Vector b
//
//  C0       op   A0, A1, A2, A3, ...   *    B0
//  C1            A4, A5, A6, A7, ...        B1
//  C2            A8, A9, Aa, Ab, ...        B2
//  C3            Ac, Ad, Ae, Af, ...        B3
//                                           .
//
// `a` points at a col_a x 4 slice of A stored row-major with row stride
// col_stride_a; `b` has col_a entries; `c` receives 4 doubles.  The
// inner loop is unrolled by a factor of 4 to reuse data resident in
// cache.
// NOTE: col_a means the columns of A'.
static inline void MTV_mat4x1(const int col_a,
                              const double* a,
                              const int col_stride_a,
                              const double* b,
                              double* c,
                              const int kOperation) {
  // One accumulator per output row of the 4x1 strip.
  double c0 = 0.0;
  double c1 = 0.0;
  double c2 = 0.0;
  double c3 = 0.0;
  // Split the reduction length into an unrolled multiple of 4 (col_m)
  // and a remainder (col_r < 4).
  const int span = 4;
  const int col_r = col_a & (span - 1);
  const int col_m = col_a - col_r;
  const double* pa = a;  // cursor stepping down the rows of A
  const double* pb = b;  // cursor along b

// One multiply-accumulate step: scale the current row of A (4 entries)
// by the current entry of b, then step to the next row.
#define CERES_GEMM_OPT_MTV_MAT4X1_MUL \
  {                                   \
    const double bv = *pb;            \
    c0 += pa[0] * bv;                 \
    c1 += pa[1] * bv;                 \
    c2 += pa[2] * bv;                 \
    c3 += pa[3] * bv;                 \
    pa += col_stride_a;               \
    ++pb;                             \
  }

  // Main loop, unrolled x4.
  for (int k = 0; k < col_m; k += span) {
    CERES_GEMM_OPT_MTV_MAT4X1_MUL
    CERES_GEMM_OPT_MTV_MAT4X1_MUL
    CERES_GEMM_OPT_MTV_MAT4X1_MUL
    CERES_GEMM_OPT_MTV_MAT4X1_MUL
  }

  // Remainder loop (at most 3 iterations).
  for (int k = col_m; k < col_a; k++) {
    CERES_GEMM_OPT_MTV_MAT4X1_MUL
  }
#undef CERES_GEMM_OPT_MTV_MAT4X1_MUL

  // Commit the accumulators to c according to kOperation.
  if (kOperation > 0) {
    c[0] += c0;
    c[1] += c1;
    c[2] += c2;
    c[3] += c3;
  } else if (kOperation < 0) {
    c[0] -= c0;
    c[1] -= c1;
    c[2] -= c2;
    c[3] -= c3;
  } else {
    c[0] = c0;
    c[1] = c1;
    c[2] = c2;
    c[3] = c3;
  }
}
312
313#undef CERES_GEMM_OPT_NAIVE_HEADER
314#undef CERES_GEMM_OPT_STORE_MAT1X4
315
316} // namespace internal
317} // namespace ceres
318
319#endif // CERES_INTERNAL_SMALL_BLAS_GENERIC_H_