blob: fb8d7fa58172dffb13ae028e1f81a2aa1372eb27 [file] [log] [blame]
Austin Schuh70cc9552019-01-21 19:46:48 -08001// Ceres Solver - A fast non-linear least squares minimizer
Austin Schuh3de38b02024-06-25 18:25:10 -07002// Copyright 2023 Google Inc. All rights reserved.
Austin Schuh70cc9552019-01-21 19:46:48 -08003// http://ceres-solver.org/
4//
5// Redistribution and use in source and binary forms, with or without
6// modification, are permitted provided that the following conditions are met:
7//
8// * Redistributions of source code must retain the above copyright notice,
9// this list of conditions and the following disclaimer.
10// * Redistributions in binary form must reproduce the above copyright notice,
11// this list of conditions and the following disclaimer in the documentation
12// and/or other materials provided with the distribution.
13// * Neither the name of Google Inc. nor the names of its contributors may be
14// used to endorse or promote products derived from this software without
15// specific prior written permission.
16//
17// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
18// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
19// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
20// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
21// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
22// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
23// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
24// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
25// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
26// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
27// POSSIBILITY OF SUCH DAMAGE.
28//
29// Author: sameeragarwal@google.com (Sameer Agarwal)
30//
31// Simple blas functions for use in the Schur Eliminator. These are
32// fairly basic implementations which already yield a significant
33// speedup in the eliminator performance.
34
35#ifndef CERES_INTERNAL_SMALL_BLAS_H_
36#define CERES_INTERNAL_SMALL_BLAS_H_
37
Austin Schuh70cc9552019-01-21 19:46:48 -080038#include "ceres/internal/eigen.h"
Austin Schuh3de38b02024-06-25 18:25:10 -070039#include "ceres/internal/export.h"
Austin Schuh70cc9552019-01-21 19:46:48 -080040#include "glog/logging.h"
41#include "small_blas_generic.h"
42
Austin Schuh3de38b02024-06-25 18:25:10 -070043namespace ceres::internal {
Austin Schuh70cc9552019-01-21 19:46:48 -080044
45// The following three macros are used to share code and reduce
46// template junk across the various GEMM variants.
// Emits the shared signature for all GEMM variants: the row-major
// matrices A (num_row_a x num_col_a) and B (num_row_b x num_col_b) are
// combined into the row-major matrix C, whose true size is
// row_stride_c x col_stride_c, at the block anchored at
// (start_row_c, start_col_c). kRowA/kColA/kRowB/kColB fix dimensions at
// compile time (or are Eigen::Dynamic); kOperation selects +=, -=, or =.
#define CERES_GEMM_BEGIN(name)                                          \
  template <int kRowA, int kColA, int kRowB, int kColB, int kOperation> \
  inline void name(const double* A,                                     \
                   const int num_row_a,                                 \
                   const int num_col_a,                                 \
                   const double* B,                                     \
                   const int num_row_b,                                 \
                   const int num_col_b,                                 \
                   double* C,                                           \
                   const int start_row_c,                               \
                   const int start_col_c,                               \
                   const int row_stride_c,                              \
                   const int col_stride_c)
60
// Shared prologue for the hand-rolled (naive) variants: validates the
// runtime dimensions against the compile-time template parameters, then
// binds NUM_* to the compile-time value whenever it is fixed so the
// compiler can fully unroll the loops that follow.
#define CERES_GEMM_NAIVE_HEADER                                        \
  DCHECK_GT(num_row_a, 0);                                             \
  DCHECK_GT(num_col_a, 0);                                             \
  DCHECK_GT(num_row_b, 0);                                             \
  DCHECK_GT(num_col_b, 0);                                             \
  DCHECK_GE(start_row_c, 0);                                           \
  DCHECK_GE(start_col_c, 0);                                           \
  DCHECK_GT(row_stride_c, 0);                                          \
  DCHECK_GT(col_stride_c, 0);                                          \
  DCHECK((kRowA == Eigen::Dynamic) || (kRowA == num_row_a));           \
  DCHECK((kColA == Eigen::Dynamic) || (kColA == num_col_a));           \
  DCHECK((kRowB == Eigen::Dynamic) || (kRowB == num_row_b));           \
  DCHECK((kColB == Eigen::Dynamic) || (kColB == num_col_b));           \
  const int NUM_ROW_A = (kRowA != Eigen::Dynamic ? kRowA : num_row_a); \
  const int NUM_COL_A = (kColA != Eigen::Dynamic ? kColA : num_col_a); \
  const int NUM_ROW_B = (kRowB != Eigen::Dynamic ? kRowB : num_row_b); \
  const int NUM_COL_B = (kColB != Eigen::Dynamic ? kColB : num_col_b);
78
// Shared prologue for the Eigen-backed variants: wraps the raw pointers
// in Eigen map types (no data is copied).
#define CERES_GEMM_EIGEN_HEADER                                 \
  const typename EigenTypes<kRowA, kColA>::ConstMatrixRef Aref( \
      A, num_row_a, num_col_a);                                 \
  const typename EigenTypes<kRowB, kColB>::ConstMatrixRef Bref( \
      B, num_row_b, num_col_b);                                 \
  MatrixRef Cref(C, row_stride_c, col_stride_c);
Austin Schuh70cc9552019-01-21 19:46:48 -080085
// Forwards the current argument list and template parameters to another
// GEMM variant; used by the dispatching entry points below.
// clang-format off
#define CERES_CALL_GEMM(name)                                    \
  name<kRowA, kColA, kRowB, kColB, kOperation>(                  \
      A, num_row_a, num_col_a,                                   \
      B, num_row_b, num_col_b,                                   \
      C, start_row_c, start_col_c, row_stride_c, col_stride_c);
// clang-format on
Austin Schuh70cc9552019-01-21 19:46:48 -080093
// Stores one scalar into p[index] honoring kOperation:
// > 0 accumulates (+=), < 0 subtracts (-=), == 0 overwrites (=).
#define CERES_GEMM_STORE_SINGLE(p, index, value) \
  if (kOperation > 0) {                          \
    p[index] += value;                           \
  } else if (kOperation < 0) {                   \
    p[index] -= value;                           \
  } else {                                       \
    p[index] = value;                            \
  }
102
// Stores two adjacent scalars into p[index] and p[index + 1] honoring
// kOperation, same convention as CERES_GEMM_STORE_SINGLE.
#define CERES_GEMM_STORE_PAIR(p, index, v1, v2) \
  if (kOperation > 0) {                         \
    p[index] += v1;                             \
    p[index + 1] += v2;                         \
  } else if (kOperation < 0) {                  \
    p[index] -= v1;                             \
    p[index + 1] -= v2;                         \
  } else {                                      \
    p[index] = v1;                              \
    p[index + 1] = v2;                          \
  }
114
115// For the matrix-matrix functions below, there are three variants for
116// each functionality. Foo, FooNaive and FooEigen. Foo is the one to
117// be called by the user. FooNaive is a basic loop based
118// implementation and FooEigen uses Eigen's implementation. Foo
119// chooses between FooNaive and FooEigen depending on how many of the
120// template arguments are fixed at compile time. Currently, FooEigen
121// is called if all matrix dimensions are compile time
122// constants. FooNaive is called otherwise. This leads to the best
123// performance currently.
124//
125// The MatrixMatrixMultiply variants compute:
126//
127// C op A * B;
128//
129// The MatrixTransposeMatrixMultiply variants compute:
130//
131// C op A' * B
132//
133// where op can be +=, -=, or =.
134//
135// The template parameters (kRowA, kColA, kRowB, kColB) allow
136// specialization of the loop at compile time. If this information is
137// not available, then Eigen::Dynamic should be used as the template
138// argument.
139//
140// kOperation = 1 -> C += A * B
141// kOperation = -1 -> C -= A * B
142// kOperation = 0 -> C = A * B
143//
144// The functions can write into matrices C which are larger than the
145// matrix A * B. This is done by specifying the true size of C via
146// row_stride_c and col_stride_c, and then indicating where A * B
147// should be written into by start_row_c and start_col_c.
148//
149// Graphically if row_stride_c = 10, col_stride_c = 12, start_row_c =
150// 4 and start_col_c = 5, then if A = 3x2 and B = 2x4, we get
151//
152// ------------
153// ------------
154// ------------
155// ------------
156// -----xxxx---
157// -----xxxx---
158// -----xxxx---
159// ------------
160// ------------
161// ------------
162//
163CERES_GEMM_BEGIN(MatrixMatrixMultiplyEigen) {
164 CERES_GEMM_EIGEN_HEADER
Austin Schuh1d1e6ea2020-12-23 21:56:30 -0800165 Eigen::Block<MatrixRef, kRowA, kColB> block(
166 Cref, start_row_c, start_col_c, num_row_a, num_col_b);
Austin Schuh70cc9552019-01-21 19:46:48 -0800167
168 if (kOperation > 0) {
169 block.noalias() += Aref * Bref;
170 } else if (kOperation < 0) {
171 block.noalias() -= Aref * Bref;
172 } else {
173 block.noalias() = Aref * Bref;
174 }
175}
176
// Loop-based implementation of C op A * B. The columns of C are handled
// in three stages so that the bulk of the work goes through the 4-wide
// kernel MMM_mat1x4: one trailing odd column, then a trailing pair of
// columns, then all remaining multiple-of-4 columns.
CERES_GEMM_BEGIN(MatrixMatrixMultiplyNaive) {
  CERES_GEMM_NAIVE_HEADER
  DCHECK_EQ(NUM_COL_A, NUM_ROW_B);

  const int NUM_ROW_C = NUM_ROW_A;
  const int NUM_COL_C = NUM_COL_B;
  DCHECK_LE(start_row_c + NUM_ROW_C, row_stride_c);
  DCHECK_LE(start_col_c + NUM_COL_C, col_stride_c);
  // Width of the vectorized kernel; a power of two, which the mask
  // arithmetic below relies on.
  const int span = 4;

  // Calculate the remainder part first.

  // Process the last odd column if present.
  if (NUM_COL_C & 1) {
    int col = NUM_COL_C - 1;
    const double* pa = &A[0];
    // pa walks the rows of A; pb walks column `col` of B with stride
    // NUM_COL_B (B is row-major).
    for (int row = 0; row < NUM_ROW_C; ++row, pa += NUM_COL_A) {
      const double* pb = &B[col];
      double tmp = 0.0;
      for (int k = 0; k < NUM_COL_A; ++k, pb += NUM_COL_B) {
        tmp += pa[k] * pb[0];
      }

      // Linear index of C(start_row_c + row, start_col_c + col).
      const int index = (row + start_row_c) * col_stride_c + start_col_c + col;
      CERES_GEMM_STORE_SINGLE(C, index, tmp);
    }

    // Return directly for efficiency of extremely small matrix multiply.
    if (NUM_COL_C == 1) {
      return;
    }
  }

  // Process the couple columns in remainder if present.
  if (NUM_COL_C & 2) {
    // First column of the trailing pair: the largest multiple of span
    // that fits (the odd column, if any, was handled above).
    int col = NUM_COL_C & (~(span - 1));
    const double* pa = &A[0];
    for (int row = 0; row < NUM_ROW_C; ++row, pa += NUM_COL_A) {
      const double* pb = &B[col];
      double tmp1 = 0.0, tmp2 = 0.0;
      for (int k = 0; k < NUM_COL_A; ++k, pb += NUM_COL_B) {
        double av = pa[k];
        tmp1 += av * pb[0];
        tmp2 += av * pb[1];
      }

      const int index = (row + start_row_c) * col_stride_c + start_col_c + col;
      CERES_GEMM_STORE_PAIR(C, index, tmp1, tmp2);
    }

    // Return directly for efficiency of extremely small matrix multiply.
    if (NUM_COL_C < span) {
      return;
    }
  }

  // Calculate the main part with multiples of 4.
  int col_m = NUM_COL_C & (~(span - 1));
  for (int col = 0; col < col_m; col += span) {
    for (int row = 0; row < NUM_ROW_C; ++row) {
      const int index = (row + start_row_c) * col_stride_c + start_col_c + col;
      // clang-format off
      MMM_mat1x4(NUM_COL_A, &A[row * NUM_COL_A],
                 &B[col], NUM_COL_B, &C[index], kOperation);
      // clang-format on
    }
  }
}
245
246CERES_GEMM_BEGIN(MatrixMatrixMultiply) {
247#ifdef CERES_NO_CUSTOM_BLAS
248
249 CERES_CALL_GEMM(MatrixMatrixMultiplyEigen)
250 return;
251
252#else
253
254 if (kRowA != Eigen::Dynamic && kColA != Eigen::Dynamic &&
255 kRowB != Eigen::Dynamic && kColB != Eigen::Dynamic) {
256 CERES_CALL_GEMM(MatrixMatrixMultiplyEigen)
257 } else {
258 CERES_CALL_GEMM(MatrixMatrixMultiplyNaive)
259 }
260
261#endif
262}
263
264CERES_GEMM_BEGIN(MatrixTransposeMatrixMultiplyEigen) {
265 CERES_GEMM_EIGEN_HEADER
Austin Schuh1d1e6ea2020-12-23 21:56:30 -0800266 // clang-format off
Austin Schuh70cc9552019-01-21 19:46:48 -0800267 Eigen::Block<MatrixRef, kColA, kColB> block(Cref,
268 start_row_c, start_col_c,
269 num_col_a, num_col_b);
Austin Schuh1d1e6ea2020-12-23 21:56:30 -0800270 // clang-format on
Austin Schuh70cc9552019-01-21 19:46:48 -0800271 if (kOperation > 0) {
272 block.noalias() += Aref.transpose() * Bref;
273 } else if (kOperation < 0) {
274 block.noalias() -= Aref.transpose() * Bref;
275 } else {
276 block.noalias() = Aref.transpose() * Bref;
277 }
278}
279
// Loop-based implementation of C op A' * B. A is traversed down its
// columns (stride NUM_COL_A) so no explicit transpose is ever formed.
// The columns of C are handled in three stages so the bulk of the work
// goes through the 4-wide kernel MTM_mat1x4: one trailing odd column,
// then a trailing pair of columns, then all multiple-of-4 columns.
CERES_GEMM_BEGIN(MatrixTransposeMatrixMultiplyNaive) {
  CERES_GEMM_NAIVE_HEADER
  DCHECK_EQ(NUM_ROW_A, NUM_ROW_B);

  // C is NUM_COL_A x NUM_COL_B because A is used transposed.
  const int NUM_ROW_C = NUM_COL_A;
  const int NUM_COL_C = NUM_COL_B;
  DCHECK_LE(start_row_c + NUM_ROW_C, row_stride_c);
  DCHECK_LE(start_col_c + NUM_COL_C, col_stride_c);
  // Width of the vectorized kernel; a power of two, which the mask
  // arithmetic below relies on.
  const int span = 4;

  // Process the remainder part first.

  // Process the last odd column if present.
  if (NUM_COL_C & 1) {
    int col = NUM_COL_C - 1;
    for (int row = 0; row < NUM_ROW_C; ++row) {
      // pa walks column `row` of A; pb walks column `col` of B; both
      // matrices are row-major, hence the strided advances below.
      const double* pa = &A[row];
      const double* pb = &B[col];
      double tmp = 0.0;
      for (int k = 0; k < NUM_ROW_A; ++k) {
        tmp += pa[0] * pb[0];
        pa += NUM_COL_A;
        pb += NUM_COL_B;
      }

      // Linear index of C(start_row_c + row, start_col_c + col).
      const int index = (row + start_row_c) * col_stride_c + start_col_c + col;
      CERES_GEMM_STORE_SINGLE(C, index, tmp);
    }

    // Return directly for efficiency of extremely small matrix multiply.
    if (NUM_COL_C == 1) {
      return;
    }
  }

  // Process the couple columns in remainder if present.
  if (NUM_COL_C & 2) {
    // First column of the trailing pair: the largest multiple of span
    // that fits (the odd column, if any, was handled above).
    int col = NUM_COL_C & (~(span - 1));
    for (int row = 0; row < NUM_ROW_C; ++row) {
      const double* pa = &A[row];
      const double* pb = &B[col];
      double tmp1 = 0.0, tmp2 = 0.0;
      for (int k = 0; k < NUM_ROW_A; ++k) {
        double av = *pa;
        tmp1 += av * pb[0];
        tmp2 += av * pb[1];
        pa += NUM_COL_A;
        pb += NUM_COL_B;
      }

      const int index = (row + start_row_c) * col_stride_c + start_col_c + col;
      CERES_GEMM_STORE_PAIR(C, index, tmp1, tmp2);
    }

    // Return directly for efficiency of extremely small matrix multiply.
    if (NUM_COL_C < span) {
      return;
    }
  }

  // Process the main part with multiples of 4.
  int col_m = NUM_COL_C & (~(span - 1));
  for (int col = 0; col < col_m; col += span) {
    for (int row = 0; row < NUM_ROW_C; ++row) {
      const int index = (row + start_row_c) * col_stride_c + start_col_c + col;
      // clang-format off
      MTM_mat1x4(NUM_ROW_A, &A[row], NUM_COL_A,
                 &B[col], NUM_COL_B, &C[index], kOperation);
      // clang-format on
    }
  }
}
352
353CERES_GEMM_BEGIN(MatrixTransposeMatrixMultiply) {
354#ifdef CERES_NO_CUSTOM_BLAS
355
356 CERES_CALL_GEMM(MatrixTransposeMatrixMultiplyEigen)
357 return;
358
359#else
360
361 if (kRowA != Eigen::Dynamic && kColA != Eigen::Dynamic &&
362 kRowB != Eigen::Dynamic && kColB != Eigen::Dynamic) {
363 CERES_CALL_GEMM(MatrixTransposeMatrixMultiplyEigen)
364 } else {
365 CERES_CALL_GEMM(MatrixTransposeMatrixMultiplyNaive)
366 }
367
368#endif
369}
370
371// Matrix-Vector multiplication
372//
373// c op A * b;
374//
375// where op can be +=, -=, or =.
376//
377// The template parameters (kRowA, kColA) allow specialization of the
378// loop at compile time. If this information is not available, then
379// Eigen::Dynamic should be used as the template argument.
380//
// kOperation = 1 -> c += A * b
// kOperation = -1 -> c -= A * b
// kOperation = 0 -> c = A * b
// With CERES_NO_CUSTOM_BLAS the Eigen path is used; otherwise the rows
// of A are processed with the same odd / pair / multiple-of-4 staging
// used by the GEMM routines above, with MVM_mat4x1 as the 4-row kernel.
template <int kRowA, int kColA, int kOperation>
inline void MatrixVectorMultiply(const double* A,
                                 const int num_row_a,
                                 const int num_col_a,
                                 const double* b,
                                 double* c) {
#ifdef CERES_NO_CUSTOM_BLAS
  const typename EigenTypes<kRowA, kColA>::ConstMatrixRef Aref(
      A, num_row_a, num_col_a);
  const typename EigenTypes<kColA>::ConstVectorRef bref(b, num_col_a);
  typename EigenTypes<kRowA>::VectorRef cref(c, num_row_a);

  // lazyProduct works better than .noalias() for matrix-vector
  // products.
  if (kOperation > 0) {
    cref += Aref.lazyProduct(bref);
  } else if (kOperation < 0) {
    cref -= Aref.lazyProduct(bref);
  } else {
    cref = Aref.lazyProduct(bref);
  }
#else

  DCHECK_GT(num_row_a, 0);
  DCHECK_GT(num_col_a, 0);
  DCHECK((kRowA == Eigen::Dynamic) || (kRowA == num_row_a));
  DCHECK((kColA == Eigen::Dynamic) || (kColA == num_col_a));

  // Bind to the compile-time dimension when it is fixed so the loops
  // below can be unrolled by the compiler.
  const int NUM_ROW_A = (kRowA != Eigen::Dynamic ? kRowA : num_row_a);
  const int NUM_COL_A = (kColA != Eigen::Dynamic ? kColA : num_col_a);
  // Width of the vectorized kernel; a power of two, which the mask
  // arithmetic below relies on.
  const int span = 4;

  // Calculate the remainder part first.

  // Process the last odd row if present.
  if (NUM_ROW_A & 1) {
    int row = NUM_ROW_A - 1;
    const double* pa = &A[row * NUM_COL_A];
    const double* pb = &b[0];
    double tmp = 0.0;
    for (int col = 0; col < NUM_COL_A; ++col) {
      tmp += (*pa++) * (*pb++);
    }
    CERES_GEMM_STORE_SINGLE(c, row, tmp);

    // Return directly for efficiency of extremely small matrix multiply.
    if (NUM_ROW_A == 1) {
      return;
    }
  }

  // Process the couple rows in remainder if present.
  if (NUM_ROW_A & 2) {
    // First row of the trailing pair: the largest multiple of span that
    // fits (the odd row, if any, was handled above).
    int row = NUM_ROW_A & (~(span - 1));
    const double* pa1 = &A[row * NUM_COL_A];
    const double* pa2 = pa1 + NUM_COL_A;
    const double* pb = &b[0];
    double tmp1 = 0.0, tmp2 = 0.0;
    for (int col = 0; col < NUM_COL_A; ++col) {
      double bv = *pb++;
      tmp1 += *(pa1++) * bv;
      tmp2 += *(pa2++) * bv;
    }
    CERES_GEMM_STORE_PAIR(c, row, tmp1, tmp2);

    // Return directly for efficiency of extremely small matrix multiply.
    if (NUM_ROW_A < span) {
      return;
    }
  }

  // Calculate the main part with multiples of 4.
  int row_m = NUM_ROW_A & (~(span - 1));
  for (int row = 0; row < row_m; row += span) {
    // clang-format off
    MVM_mat4x1(NUM_COL_A, &A[row * NUM_COL_A], NUM_COL_A,
               &b[0], &c[row], kOperation);
    // clang-format on
  }

#endif  // CERES_NO_CUSTOM_BLAS
}
466
467// Similar to MatrixVectorMultiply, except that A is transposed, i.e.,
468//
469// c op A' * b;
// Note: in the custom-kernel path below, `row` indexes entries of c,
// which correspond to columns of A (i.e. rows of A').
template <int kRowA, int kColA, int kOperation>
inline void MatrixTransposeVectorMultiply(const double* A,
                                          const int num_row_a,
                                          const int num_col_a,
                                          const double* b,
                                          double* c) {
#ifdef CERES_NO_CUSTOM_BLAS
  const typename EigenTypes<kRowA, kColA>::ConstMatrixRef Aref(
      A, num_row_a, num_col_a);
  const typename EigenTypes<kRowA>::ConstVectorRef bref(b, num_row_a);
  typename EigenTypes<kColA>::VectorRef cref(c, num_col_a);

  // lazyProduct works better than .noalias() for matrix-vector
  // products.
  if (kOperation > 0) {
    cref += Aref.transpose().lazyProduct(bref);
  } else if (kOperation < 0) {
    cref -= Aref.transpose().lazyProduct(bref);
  } else {
    cref = Aref.transpose().lazyProduct(bref);
  }
#else

  DCHECK_GT(num_row_a, 0);
  DCHECK_GT(num_col_a, 0);
  DCHECK((kRowA == Eigen::Dynamic) || (kRowA == num_row_a));
  DCHECK((kColA == Eigen::Dynamic) || (kColA == num_col_a));

  // Bind to the compile-time dimension when it is fixed so the loops
  // below can be unrolled by the compiler.
  const int NUM_ROW_A = (kRowA != Eigen::Dynamic ? kRowA : num_row_a);
  const int NUM_COL_A = (kColA != Eigen::Dynamic ? kColA : num_col_a);
  // Width of the vectorized kernel; a power of two, which the mask
  // arithmetic below relies on.
  const int span = 4;

  // Calculate the remainder part first.

  // Process the last odd column if present.
  if (NUM_COL_A & 1) {
    int row = NUM_COL_A - 1;
    // pa walks down column `row` of the row-major A, stride NUM_COL_A.
    const double* pa = &A[row];
    const double* pb = &b[0];
    double tmp = 0.0;
    for (int col = 0; col < NUM_ROW_A; ++col) {
      tmp += *pa * (*pb++);
      pa += NUM_COL_A;
    }
    CERES_GEMM_STORE_SINGLE(c, row, tmp);

    // Return directly for efficiency of extremely small matrix multiply.
    if (NUM_COL_A == 1) {
      return;
    }
  }

  // Process the couple columns in remainder if present.
  if (NUM_COL_A & 2) {
    // First column of the trailing pair: the largest multiple of span
    // that fits (the odd column, if any, was handled above).
    int row = NUM_COL_A & (~(span - 1));
    const double* pa = &A[row];
    const double* pb = &b[0];
    double tmp1 = 0.0, tmp2 = 0.0;
    for (int col = 0; col < NUM_ROW_A; ++col) {
      // clang-format off
      double bv = *pb++;
      tmp1 += *(pa    ) * bv;
      tmp2 += *(pa + 1) * bv;
      pa += NUM_COL_A;
      // clang-format on
    }
    CERES_GEMM_STORE_PAIR(c, row, tmp1, tmp2);

    // Return directly for efficiency of extremely small matrix multiply.
    if (NUM_COL_A < span) {
      return;
    }
  }

  // Calculate the main part with multiples of 4.
  int row_m = NUM_COL_A & (~(span - 1));
  for (int row = 0; row < row_m; row += span) {
    // clang-format off
    MTV_mat4x1(NUM_ROW_A, &A[row], NUM_COL_A,
               &b[0], &c[row], kOperation);
    // clang-format on
  }

#endif  // CERES_NO_CUSTOM_BLAS
}
555
556#undef CERES_GEMM_BEGIN
557#undef CERES_GEMM_EIGEN_HEADER
558#undef CERES_GEMM_NAIVE_HEADER
559#undef CERES_CALL_GEMM
560#undef CERES_GEMM_STORE_SINGLE
561#undef CERES_GEMM_STORE_PAIR
562
Austin Schuh3de38b02024-06-25 18:25:10 -0700563} // namespace ceres::internal
Austin Schuh70cc9552019-01-21 19:46:48 -0800564
565#endif // CERES_INTERNAL_SMALL_BLAS_H_