blob: 93ee338813a2085b43de6999b7bde057d6780c49 [file] [log] [blame]
Austin Schuh70cc9552019-01-21 19:46:48 -08001// Ceres Solver - A fast non-linear least squares minimizer
Austin Schuh3de38b02024-06-25 18:25:10 -07002// Copyright 2023 Google Inc. All rights reserved.
Austin Schuh70cc9552019-01-21 19:46:48 -08003// http://ceres-solver.org/
4//
5// Redistribution and use in source and binary forms, with or without
6// modification, are permitted provided that the following conditions are met:
7//
8// * Redistributions of source code must retain the above copyright notice,
9// this list of conditions and the following disclaimer.
10// * Redistributions in binary form must reproduce the above copyright notice,
11// this list of conditions and the following disclaimer in the documentation
12// and/or other materials provided with the distribution.
13// * Neither the name of Google Inc. nor the names of its contributors may be
14// used to endorse or promote products derived from this software without
15// specific prior written permission.
16//
17// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
18// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
19// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
20// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
21// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
22// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
23// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
24// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
25// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
26// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
27// POSSIBILITY OF SUCH DAMAGE.
28//
29// Author: yangfan34@lenovo.com (Lenovo Research Device+ Lab - Shanghai)
30//
31// Optimization for simple blas functions used in the Schur Eliminator.
32// These are fairly basic implementations which already yield a significant
33// speedup in the eliminator performance.
34
35#ifndef CERES_INTERNAL_SMALL_BLAS_GENERIC_H_
36#define CERES_INTERNAL_SMALL_BLAS_GENERIC_H_
37
Austin Schuh3de38b02024-06-25 18:25:10 -070038namespace ceres::internal {
Austin Schuh70cc9552019-01-21 19:46:48 -080039
// Shared tail for the four kernels below: commits the locally
// accumulated 4-element panel `acc` into the output array `c`.
// The operation is selected by the sign of kOperation:
//
//   kOperation > 0   ->  c[i] += acc[i]
//   kOperation < 0   ->  c[i] -= acc[i]
//   kOperation == 0  ->  c[i]  = acc[i]
static inline void CeresGemmStorePanel4(const double acc[4],
                                        double* c,
                                        const int kOperation) {
  if (kOperation > 0) {
    c[0] += acc[0];
    c[1] += acc[1];
    c[2] += acc[2];
    c[3] += acc[3];
  } else if (kOperation < 0) {
    c[0] -= acc[0];
    c[1] -= acc[1];
    c[2] -= acc[2];
    c[3] -= acc[3];
  } else {
    c[0] = acc[0];
    c[1] = acc[1];
    c[2] = acc[2];
    c[3] = acc[3];
  }
}

// Matrix-Matrix Multiplication, producing a 1x4 strip of C per call:
//
//   c op a * B
//
// where op is +=, -=, or =, selected by the sign of kOperation.
//
// `a` is a dense row vector with col_a entries (col_a is the number of
// columns of A). `b` points at a matrix whose rows are col_stride_b
// doubles apart; the first col_a rows and the first 4 columns of each
// row are read. `c` receives 4 doubles.
//
// The loop accumulates terms in increasing k for each of the four
// accumulators, so the floating-point result does not depend on how the
// compiler chooses to unroll it.
static inline void MMM_mat1x4(const int col_a,
                              const double* a,
                              const double* b,
                              const int col_stride_b,
                              double* c,
                              const int kOperation) {
  double acc[4] = {0.0, 0.0, 0.0, 0.0};
  for (int k = 0; k < col_a; ++k) {
    const double ak = a[k];
    const double* brow = b + k * col_stride_b;
    acc[0] += ak * brow[0];
    acc[1] += ak * brow[1];
    acc[2] += ak * brow[2];
    acc[3] += ak * brow[3];
  }
  CeresGemmStorePanel4(acc, c, kOperation);
}

// Matrix Transpose-Matrix multiplication, producing a 1x4 strip of C:
//
//   c op a' * B
//
// where op is +=, -=, or =, selected by the sign of kOperation.
//
// `a` points at the first entry of a column of A; consecutive entries of
// that column are col_stride_a doubles apart. col_a is the number of
// columns of A', i.e. the length of the column. `b` and `c` are laid out
// exactly as in MMM_mat1x4.
static inline void MTM_mat1x4(const int col_a,
                              const double* a,
                              const int col_stride_a,
                              const double* b,
                              const int col_stride_b,
                              double* c,
                              const int kOperation) {
  double acc[4] = {0.0, 0.0, 0.0, 0.0};
  for (int k = 0; k < col_a; ++k) {
    const double ak = a[k * col_stride_a];
    const double* brow = b + k * col_stride_b;
    acc[0] += ak * brow[0];
    acc[1] += ak * brow[1];
    acc[2] += ak * brow[2];
    acc[3] += ak * brow[3];
  }
  CeresGemmStorePanel4(acc, c, kOperation);
}

// Matrix-Vector Multiplication, producing 4 entries of c per call:
//
//   c op A * b
//
// where op is +=, -=, or =, selected by the sign of kOperation.
//
// `a` points at the first of 4 consecutive rows of A, stored row-major
// with row stride col_stride_a; each row has col_a entries (col_a is the
// number of columns of A). `b` is a dense vector with col_a entries and
// `c` receives 4 doubles.
static inline void MVM_mat4x1(const int col_a,
                              const double* a,
                              const int col_stride_a,
                              const double* b,
                              double* c,
                              const int kOperation) {
  double acc[4] = {0.0, 0.0, 0.0, 0.0};
  for (int k = 0; k < col_a; ++k) {
    const double bk = b[k];
    acc[0] += a[k] * bk;
    acc[1] += a[k + col_stride_a] * bk;
    acc[2] += a[k + col_stride_a * 2] * bk;
    acc[3] += a[k + col_stride_a * 3] * bk;
  }
  CeresGemmStorePanel4(acc, c, kOperation);
}

// Matrix Transpose-Vector multiplication, producing 4 entries of c:
//
//   c op A' * b
//
// where op is +=, -=, or =, selected by the sign of kOperation.
//
// `a` points at a matrix A with col_a rows stored with row stride
// col_stride_a; only the first 4 entries of each row are read (they form
// the 4 rows of A' computed here). col_a is the number of columns of A'.
// `b` is a dense vector with col_a entries and `c` receives 4 doubles.
static inline void MTV_mat4x1(const int col_a,
                              const double* a,
                              const int col_stride_a,
                              const double* b,
                              double* c,
                              const int kOperation) {
  double acc[4] = {0.0, 0.0, 0.0, 0.0};
  for (int k = 0; k < col_a; ++k) {
    const double bk = b[k];
    const double* arow = a + k * col_stride_a;
    acc[0] += arow[0] * bk;
    acc[1] += arow[1] * bk;
    acc[2] += arow[2] * bk;
    acc[3] += arow[3] * bk;
  }
  CeresGemmStorePanel4(acc, c, kOperation);
}
309
310#undef CERES_GEMM_OPT_NAIVE_HEADER
311#undef CERES_GEMM_OPT_STORE_MAT1X4
312
Austin Schuh3de38b02024-06-25 18:25:10 -0700313} // namespace ceres::internal
Austin Schuh70cc9552019-01-21 19:46:48 -0800314
315#endif // CERES_INTERNAL_SMALL_BLAS_GENERIC_H_