blob: 4ee9229f35fc290beaa4693f15637794cb7f13af [file] [log] [blame]
Austin Schuh70cc9552019-01-21 19:46:48 -08001// Ceres Solver - A fast non-linear least squares minimizer
2// Copyright 2015 Google Inc. All rights reserved.
3// http://ceres-solver.org/
4//
5// Redistribution and use in source and binary forms, with or without
6// modification, are permitted provided that the following conditions are met:
7//
8// * Redistributions of source code must retain the above copyright notice,
9// this list of conditions and the following disclaimer.
10// * Redistributions in binary form must reproduce the above copyright notice,
11// this list of conditions and the following disclaimer in the documentation
12// and/or other materials provided with the distribution.
13// * Neither the name of Google Inc. nor the names of its contributors may be
14// used to endorse or promote products derived from this software without
15// specific prior written permission.
16//
17// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
18// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
19// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
20// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
21// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
22// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
23// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
24// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
25// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
26// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
27// POSSIBILITY OF SUCH DAMAGE.
28//
29// Author: sameeragarwal@google.com (Sameer Agarwal)
30//
31// Simple blas functions for use in the Schur Eliminator. These are
32// fairly basic implementations which already yield a significant
33// speedup in the eliminator performance.
34
35#ifndef CERES_INTERNAL_SMALL_BLAS_H_
36#define CERES_INTERNAL_SMALL_BLAS_H_
37
Austin Schuh70cc9552019-01-21 19:46:48 -080038#include "ceres/internal/eigen.h"
Austin Schuh1d1e6ea2020-12-23 21:56:30 -080039#include "ceres/internal/port.h"
Austin Schuh70cc9552019-01-21 19:46:48 -080040#include "glog/logging.h"
41#include "small_blas_generic.h"
42
43namespace ceres {
44namespace internal {
45
46// The following three macros are used to share code and reduce
47// template junk across the various GEMM variants.
48#define CERES_GEMM_BEGIN(name) \
Austin Schuh1d1e6ea2020-12-23 21:56:30 -080049 template <int kRowA, int kColA, int kRowB, int kColB, int kOperation> \
Austin Schuh70cc9552019-01-21 19:46:48 -080050 inline void name(const double* A, \
51 const int num_row_a, \
52 const int num_col_a, \
53 const double* B, \
54 const int num_row_b, \
55 const int num_col_b, \
56 double* C, \
57 const int start_row_c, \
58 const int start_col_c, \
59 const int row_stride_c, \
60 const int col_stride_c)
61
Austin Schuh1d1e6ea2020-12-23 21:56:30 -080062#define CERES_GEMM_NAIVE_HEADER \
63 DCHECK_GT(num_row_a, 0); \
64 DCHECK_GT(num_col_a, 0); \
65 DCHECK_GT(num_row_b, 0); \
66 DCHECK_GT(num_col_b, 0); \
67 DCHECK_GE(start_row_c, 0); \
68 DCHECK_GE(start_col_c, 0); \
69 DCHECK_GT(row_stride_c, 0); \
70 DCHECK_GT(col_stride_c, 0); \
71 DCHECK((kRowA == Eigen::Dynamic) || (kRowA == num_row_a)); \
72 DCHECK((kColA == Eigen::Dynamic) || (kColA == num_col_a)); \
73 DCHECK((kRowB == Eigen::Dynamic) || (kRowB == num_row_b)); \
74 DCHECK((kColB == Eigen::Dynamic) || (kColB == num_col_b)); \
75 const int NUM_ROW_A = (kRowA != Eigen::Dynamic ? kRowA : num_row_a); \
76 const int NUM_COL_A = (kColA != Eigen::Dynamic ? kColA : num_col_a); \
77 const int NUM_ROW_B = (kRowB != Eigen::Dynamic ? kRowB : num_row_b); \
Austin Schuh70cc9552019-01-21 19:46:48 -080078 const int NUM_COL_B = (kColB != Eigen::Dynamic ? kColB : num_col_b);
79
Austin Schuh1d1e6ea2020-12-23 21:56:30 -080080#define CERES_GEMM_EIGEN_HEADER \
81 const typename EigenTypes<kRowA, kColA>::ConstMatrixRef Aref( \
82 A, num_row_a, num_col_a); \
83 const typename EigenTypes<kRowB, kColB>::ConstMatrixRef Bref( \
84 B, num_row_b, num_col_b); \
85 MatrixRef Cref(C, row_stride_c, col_stride_c);
Austin Schuh70cc9552019-01-21 19:46:48 -080086
Austin Schuh1d1e6ea2020-12-23 21:56:30 -080087// clang-format off
Austin Schuh70cc9552019-01-21 19:46:48 -080088#define CERES_CALL_GEMM(name) \
89 name<kRowA, kColA, kRowB, kColB, kOperation>( \
90 A, num_row_a, num_col_a, \
91 B, num_row_b, num_col_b, \
92 C, start_row_c, start_col_c, row_stride_c, col_stride_c);
Austin Schuh1d1e6ea2020-12-23 21:56:30 -080093// clang-format on
Austin Schuh70cc9552019-01-21 19:46:48 -080094
Austin Schuh1d1e6ea2020-12-23 21:56:30 -080095#define CERES_GEMM_STORE_SINGLE(p, index, value) \
96 if (kOperation > 0) { \
97 p[index] += value; \
98 } else if (kOperation < 0) { \
99 p[index] -= value; \
100 } else { \
101 p[index] = value; \
Austin Schuh70cc9552019-01-21 19:46:48 -0800102 }
103
Austin Schuh1d1e6ea2020-12-23 21:56:30 -0800104#define CERES_GEMM_STORE_PAIR(p, index, v1, v2) \
105 if (kOperation > 0) { \
106 p[index] += v1; \
107 p[index + 1] += v2; \
108 } else if (kOperation < 0) { \
109 p[index] -= v1; \
110 p[index + 1] -= v2; \
111 } else { \
112 p[index] = v1; \
113 p[index + 1] = v2; \
Austin Schuh70cc9552019-01-21 19:46:48 -0800114 }
115
116// For the matrix-matrix functions below, there are three variants for
117// each functionality. Foo, FooNaive and FooEigen. Foo is the one to
118// be called by the user. FooNaive is a basic loop based
119// implementation and FooEigen uses Eigen's implementation. Foo
120// chooses between FooNaive and FooEigen depending on how many of the
121// template arguments are fixed at compile time. Currently, FooEigen
122// is called if all matrix dimensions are compile time
123// constants. FooNaive is called otherwise. This leads to the best
124// performance currently.
125//
126// The MatrixMatrixMultiply variants compute:
127//
128// C op A * B;
129//
130// The MatrixTransposeMatrixMultiply variants compute:
131//
132// C op A' * B
133//
134// where op can be +=, -=, or =.
135//
136// The template parameters (kRowA, kColA, kRowB, kColB) allow
137// specialization of the loop at compile time. If this information is
138// not available, then Eigen::Dynamic should be used as the template
139// argument.
140//
141// kOperation = 1 -> C += A * B
142// kOperation = -1 -> C -= A * B
143// kOperation = 0 -> C = A * B
144//
145// The functions can write into matrices C which are larger than the
146// matrix A * B. This is done by specifying the true size of C via
147// row_stride_c and col_stride_c, and then indicating where A * B
148// should be written into by start_row_c and start_col_c.
149//
150// Graphically if row_stride_c = 10, col_stride_c = 12, start_row_c =
151// 4 and start_col_c = 5, then if A = 3x2 and B = 2x4, we get
152//
153// ------------
154// ------------
155// ------------
156// ------------
157// -----xxxx---
158// -----xxxx---
159// -----xxxx---
160// ------------
161// ------------
162// ------------
163//
164CERES_GEMM_BEGIN(MatrixMatrixMultiplyEigen) {
165 CERES_GEMM_EIGEN_HEADER
Austin Schuh1d1e6ea2020-12-23 21:56:30 -0800166 Eigen::Block<MatrixRef, kRowA, kColB> block(
167 Cref, start_row_c, start_col_c, num_row_a, num_col_b);
Austin Schuh70cc9552019-01-21 19:46:48 -0800168
169 if (kOperation > 0) {
170 block.noalias() += Aref * Bref;
171 } else if (kOperation < 0) {
172 block.noalias() -= Aref * Bref;
173 } else {
174 block.noalias() = Aref * Bref;
175 }
176}
177
178CERES_GEMM_BEGIN(MatrixMatrixMultiplyNaive) {
179 CERES_GEMM_NAIVE_HEADER
180 DCHECK_EQ(NUM_COL_A, NUM_ROW_B);
181
182 const int NUM_ROW_C = NUM_ROW_A;
183 const int NUM_COL_C = NUM_COL_B;
184 DCHECK_LE(start_row_c + NUM_ROW_C, row_stride_c);
185 DCHECK_LE(start_col_c + NUM_COL_C, col_stride_c);
186 const int span = 4;
187
188 // Calculate the remainder part first.
189
190 // Process the last odd column if present.
191 if (NUM_COL_C & 1) {
192 int col = NUM_COL_C - 1;
193 const double* pa = &A[0];
194 for (int row = 0; row < NUM_ROW_C; ++row, pa += NUM_COL_A) {
195 const double* pb = &B[col];
196 double tmp = 0.0;
197 for (int k = 0; k < NUM_COL_A; ++k, pb += NUM_COL_B) {
198 tmp += pa[k] * pb[0];
199 }
200
201 const int index = (row + start_row_c) * col_stride_c + start_col_c + col;
202 CERES_GEMM_STORE_SINGLE(C, index, tmp);
203 }
204
205 // Return directly for efficiency of extremely small matrix multiply.
206 if (NUM_COL_C == 1) {
207 return;
208 }
209 }
210
211 // Process the couple columns in remainder if present.
212 if (NUM_COL_C & 2) {
Austin Schuh1d1e6ea2020-12-23 21:56:30 -0800213 int col = NUM_COL_C & (int)(~(span - 1));
Austin Schuh70cc9552019-01-21 19:46:48 -0800214 const double* pa = &A[0];
215 for (int row = 0; row < NUM_ROW_C; ++row, pa += NUM_COL_A) {
216 const double* pb = &B[col];
217 double tmp1 = 0.0, tmp2 = 0.0;
218 for (int k = 0; k < NUM_COL_A; ++k, pb += NUM_COL_B) {
219 double av = pa[k];
220 tmp1 += av * pb[0];
221 tmp2 += av * pb[1];
222 }
223
224 const int index = (row + start_row_c) * col_stride_c + start_col_c + col;
225 CERES_GEMM_STORE_PAIR(C, index, tmp1, tmp2);
226 }
227
228 // Return directly for efficiency of extremely small matrix multiply.
229 if (NUM_COL_C < span) {
230 return;
231 }
232 }
233
234 // Calculate the main part with multiples of 4.
235 int col_m = NUM_COL_C & (int)(~(span - 1));
236 for (int col = 0; col < col_m; col += span) {
237 for (int row = 0; row < NUM_ROW_C; ++row) {
238 const int index = (row + start_row_c) * col_stride_c + start_col_c + col;
Austin Schuh1d1e6ea2020-12-23 21:56:30 -0800239 // clang-format off
Austin Schuh70cc9552019-01-21 19:46:48 -0800240 MMM_mat1x4(NUM_COL_A, &A[row * NUM_COL_A],
241 &B[col], NUM_COL_B, &C[index], kOperation);
Austin Schuh1d1e6ea2020-12-23 21:56:30 -0800242 // clang-format on
Austin Schuh70cc9552019-01-21 19:46:48 -0800243 }
244 }
Austin Schuh70cc9552019-01-21 19:46:48 -0800245}
246
247CERES_GEMM_BEGIN(MatrixMatrixMultiply) {
248#ifdef CERES_NO_CUSTOM_BLAS
249
250 CERES_CALL_GEMM(MatrixMatrixMultiplyEigen)
251 return;
252
253#else
254
255 if (kRowA != Eigen::Dynamic && kColA != Eigen::Dynamic &&
256 kRowB != Eigen::Dynamic && kColB != Eigen::Dynamic) {
257 CERES_CALL_GEMM(MatrixMatrixMultiplyEigen)
258 } else {
259 CERES_CALL_GEMM(MatrixMatrixMultiplyNaive)
260 }
261
262#endif
263}
264
265CERES_GEMM_BEGIN(MatrixTransposeMatrixMultiplyEigen) {
266 CERES_GEMM_EIGEN_HEADER
Austin Schuh1d1e6ea2020-12-23 21:56:30 -0800267 // clang-format off
Austin Schuh70cc9552019-01-21 19:46:48 -0800268 Eigen::Block<MatrixRef, kColA, kColB> block(Cref,
269 start_row_c, start_col_c,
270 num_col_a, num_col_b);
Austin Schuh1d1e6ea2020-12-23 21:56:30 -0800271 // clang-format on
Austin Schuh70cc9552019-01-21 19:46:48 -0800272 if (kOperation > 0) {
273 block.noalias() += Aref.transpose() * Bref;
274 } else if (kOperation < 0) {
275 block.noalias() -= Aref.transpose() * Bref;
276 } else {
277 block.noalias() = Aref.transpose() * Bref;
278 }
279}
280
281CERES_GEMM_BEGIN(MatrixTransposeMatrixMultiplyNaive) {
282 CERES_GEMM_NAIVE_HEADER
283 DCHECK_EQ(NUM_ROW_A, NUM_ROW_B);
284
285 const int NUM_ROW_C = NUM_COL_A;
286 const int NUM_COL_C = NUM_COL_B;
287 DCHECK_LE(start_row_c + NUM_ROW_C, row_stride_c);
288 DCHECK_LE(start_col_c + NUM_COL_C, col_stride_c);
289 const int span = 4;
290
291 // Process the remainder part first.
292
293 // Process the last odd column if present.
294 if (NUM_COL_C & 1) {
295 int col = NUM_COL_C - 1;
296 for (int row = 0; row < NUM_ROW_C; ++row) {
297 const double* pa = &A[row];
298 const double* pb = &B[col];
299 double tmp = 0.0;
300 for (int k = 0; k < NUM_ROW_A; ++k) {
301 tmp += pa[0] * pb[0];
302 pa += NUM_COL_A;
303 pb += NUM_COL_B;
304 }
305
306 const int index = (row + start_row_c) * col_stride_c + start_col_c + col;
307 CERES_GEMM_STORE_SINGLE(C, index, tmp);
308 }
309
310 // Return directly for efficiency of extremely small matrix multiply.
311 if (NUM_COL_C == 1) {
312 return;
313 }
314 }
315
316 // Process the couple columns in remainder if present.
317 if (NUM_COL_C & 2) {
Austin Schuh1d1e6ea2020-12-23 21:56:30 -0800318 int col = NUM_COL_C & (int)(~(span - 1));
Austin Schuh70cc9552019-01-21 19:46:48 -0800319 for (int row = 0; row < NUM_ROW_C; ++row) {
320 const double* pa = &A[row];
321 const double* pb = &B[col];
322 double tmp1 = 0.0, tmp2 = 0.0;
323 for (int k = 0; k < NUM_ROW_A; ++k) {
324 double av = *pa;
325 tmp1 += av * pb[0];
326 tmp2 += av * pb[1];
327 pa += NUM_COL_A;
328 pb += NUM_COL_B;
329 }
330
331 const int index = (row + start_row_c) * col_stride_c + start_col_c + col;
332 CERES_GEMM_STORE_PAIR(C, index, tmp1, tmp2);
333 }
334
335 // Return directly for efficiency of extremely small matrix multiply.
336 if (NUM_COL_C < span) {
337 return;
338 }
339 }
340
341 // Process the main part with multiples of 4.
342 int col_m = NUM_COL_C & (int)(~(span - 1));
343 for (int col = 0; col < col_m; col += span) {
344 for (int row = 0; row < NUM_ROW_C; ++row) {
345 const int index = (row + start_row_c) * col_stride_c + start_col_c + col;
Austin Schuh1d1e6ea2020-12-23 21:56:30 -0800346 // clang-format off
Austin Schuh70cc9552019-01-21 19:46:48 -0800347 MTM_mat1x4(NUM_ROW_A, &A[row], NUM_COL_A,
348 &B[col], NUM_COL_B, &C[index], kOperation);
Austin Schuh1d1e6ea2020-12-23 21:56:30 -0800349 // clang-format on
Austin Schuh70cc9552019-01-21 19:46:48 -0800350 }
351 }
Austin Schuh70cc9552019-01-21 19:46:48 -0800352}
353
354CERES_GEMM_BEGIN(MatrixTransposeMatrixMultiply) {
355#ifdef CERES_NO_CUSTOM_BLAS
356
357 CERES_CALL_GEMM(MatrixTransposeMatrixMultiplyEigen)
358 return;
359
360#else
361
362 if (kRowA != Eigen::Dynamic && kColA != Eigen::Dynamic &&
363 kRowB != Eigen::Dynamic && kColB != Eigen::Dynamic) {
364 CERES_CALL_GEMM(MatrixTransposeMatrixMultiplyEigen)
365 } else {
366 CERES_CALL_GEMM(MatrixTransposeMatrixMultiplyNaive)
367 }
368
369#endif
370}
371
372// Matrix-Vector multiplication
373//
374// c op A * b;
375//
376// where op can be +=, -=, or =.
377//
378// The template parameters (kRowA, kColA) allow specialization of the
379// loop at compile time. If this information is not available, then
380// Eigen::Dynamic should be used as the template argument.
381//
382// kOperation = 1 -> c += A' * b
383// kOperation = -1 -> c -= A' * b
384// kOperation = 0 -> c = A' * b
Austin Schuh1d1e6ea2020-12-23 21:56:30 -0800385template <int kRowA, int kColA, int kOperation>
Austin Schuh70cc9552019-01-21 19:46:48 -0800386inline void MatrixVectorMultiply(const double* A,
387 const int num_row_a,
388 const int num_col_a,
389 const double* b,
390 double* c) {
391#ifdef CERES_NO_CUSTOM_BLAS
Austin Schuh1d1e6ea2020-12-23 21:56:30 -0800392 const typename EigenTypes<kRowA, kColA>::ConstMatrixRef Aref(
393 A, num_row_a, num_col_a);
Austin Schuh70cc9552019-01-21 19:46:48 -0800394 const typename EigenTypes<kColA>::ConstVectorRef bref(b, num_col_a);
395 typename EigenTypes<kRowA>::VectorRef cref(c, num_row_a);
396
397 // lazyProduct works better than .noalias() for matrix-vector
398 // products.
399 if (kOperation > 0) {
400 cref += Aref.lazyProduct(bref);
401 } else if (kOperation < 0) {
402 cref -= Aref.lazyProduct(bref);
403 } else {
404 cref = Aref.lazyProduct(bref);
405 }
406#else
407
408 DCHECK_GT(num_row_a, 0);
409 DCHECK_GT(num_col_a, 0);
410 DCHECK((kRowA == Eigen::Dynamic) || (kRowA == num_row_a));
411 DCHECK((kColA == Eigen::Dynamic) || (kColA == num_col_a));
412
413 const int NUM_ROW_A = (kRowA != Eigen::Dynamic ? kRowA : num_row_a);
414 const int NUM_COL_A = (kColA != Eigen::Dynamic ? kColA : num_col_a);
415 const int span = 4;
416
417 // Calculate the remainder part first.
418
419 // Process the last odd row if present.
420 if (NUM_ROW_A & 1) {
Austin Schuh1d1e6ea2020-12-23 21:56:30 -0800421 int row = NUM_ROW_A - 1;
Austin Schuh70cc9552019-01-21 19:46:48 -0800422 const double* pa = &A[row * NUM_COL_A];
423 const double* pb = &b[0];
424 double tmp = 0.0;
425 for (int col = 0; col < NUM_COL_A; ++col) {
426 tmp += (*pa++) * (*pb++);
427 }
428 CERES_GEMM_STORE_SINGLE(c, row, tmp);
429
430 // Return directly for efficiency of extremely small matrix multiply.
431 if (NUM_ROW_A == 1) {
432 return;
433 }
434 }
435
436 // Process the couple rows in remainder if present.
437 if (NUM_ROW_A & 2) {
438 int row = NUM_ROW_A & (int)(~(span - 1));
439 const double* pa1 = &A[row * NUM_COL_A];
440 const double* pa2 = pa1 + NUM_COL_A;
441 const double* pb = &b[0];
442 double tmp1 = 0.0, tmp2 = 0.0;
443 for (int col = 0; col < NUM_COL_A; ++col) {
444 double bv = *pb++;
445 tmp1 += *(pa1++) * bv;
446 tmp2 += *(pa2++) * bv;
447 }
448 CERES_GEMM_STORE_PAIR(c, row, tmp1, tmp2);
449
450 // Return directly for efficiency of extremely small matrix multiply.
451 if (NUM_ROW_A < span) {
452 return;
453 }
454 }
455
456 // Calculate the main part with multiples of 4.
457 int row_m = NUM_ROW_A & (int)(~(span - 1));
458 for (int row = 0; row < row_m; row += span) {
Austin Schuh1d1e6ea2020-12-23 21:56:30 -0800459 // clang-format off
Austin Schuh70cc9552019-01-21 19:46:48 -0800460 MVM_mat4x1(NUM_COL_A, &A[row * NUM_COL_A], NUM_COL_A,
461 &b[0], &c[row], kOperation);
Austin Schuh1d1e6ea2020-12-23 21:56:30 -0800462 // clang-format on
Austin Schuh70cc9552019-01-21 19:46:48 -0800463 }
464
465#endif // CERES_NO_CUSTOM_BLAS
466}
467
468// Similar to MatrixVectorMultiply, except that A is transposed, i.e.,
469//
470// c op A' * b;
Austin Schuh1d1e6ea2020-12-23 21:56:30 -0800471template <int kRowA, int kColA, int kOperation>
Austin Schuh70cc9552019-01-21 19:46:48 -0800472inline void MatrixTransposeVectorMultiply(const double* A,
473 const int num_row_a,
474 const int num_col_a,
475 const double* b,
476 double* c) {
477#ifdef CERES_NO_CUSTOM_BLAS
Austin Schuh1d1e6ea2020-12-23 21:56:30 -0800478 const typename EigenTypes<kRowA, kColA>::ConstMatrixRef Aref(
479 A, num_row_a, num_col_a);
Austin Schuh70cc9552019-01-21 19:46:48 -0800480 const typename EigenTypes<kRowA>::ConstVectorRef bref(b, num_row_a);
481 typename EigenTypes<kColA>::VectorRef cref(c, num_col_a);
482
483 // lazyProduct works better than .noalias() for matrix-vector
484 // products.
485 if (kOperation > 0) {
486 cref += Aref.transpose().lazyProduct(bref);
487 } else if (kOperation < 0) {
488 cref -= Aref.transpose().lazyProduct(bref);
489 } else {
490 cref = Aref.transpose().lazyProduct(bref);
491 }
492#else
493
494 DCHECK_GT(num_row_a, 0);
495 DCHECK_GT(num_col_a, 0);
496 DCHECK((kRowA == Eigen::Dynamic) || (kRowA == num_row_a));
497 DCHECK((kColA == Eigen::Dynamic) || (kColA == num_col_a));
498
499 const int NUM_ROW_A = (kRowA != Eigen::Dynamic ? kRowA : num_row_a);
500 const int NUM_COL_A = (kColA != Eigen::Dynamic ? kColA : num_col_a);
501 const int span = 4;
502
503 // Calculate the remainder part first.
504
505 // Process the last odd column if present.
506 if (NUM_COL_A & 1) {
Austin Schuh1d1e6ea2020-12-23 21:56:30 -0800507 int row = NUM_COL_A - 1;
Austin Schuh70cc9552019-01-21 19:46:48 -0800508 const double* pa = &A[row];
509 const double* pb = &b[0];
510 double tmp = 0.0;
511 for (int col = 0; col < NUM_ROW_A; ++col) {
512 tmp += *pa * (*pb++);
513 pa += NUM_COL_A;
514 }
515 CERES_GEMM_STORE_SINGLE(c, row, tmp);
516
517 // Return directly for efficiency of extremely small matrix multiply.
518 if (NUM_COL_A == 1) {
519 return;
520 }
521 }
522
523 // Process the couple columns in remainder if present.
524 if (NUM_COL_A & 2) {
525 int row = NUM_COL_A & (int)(~(span - 1));
526 const double* pa = &A[row];
527 const double* pb = &b[0];
528 double tmp1 = 0.0, tmp2 = 0.0;
529 for (int col = 0; col < NUM_ROW_A; ++col) {
Austin Schuh1d1e6ea2020-12-23 21:56:30 -0800530 // clang-format off
Austin Schuh70cc9552019-01-21 19:46:48 -0800531 double bv = *pb++;
532 tmp1 += *(pa ) * bv;
533 tmp2 += *(pa + 1) * bv;
534 pa += NUM_COL_A;
Austin Schuh1d1e6ea2020-12-23 21:56:30 -0800535 // clang-format on
Austin Schuh70cc9552019-01-21 19:46:48 -0800536 }
537 CERES_GEMM_STORE_PAIR(c, row, tmp1, tmp2);
538
539 // Return directly for efficiency of extremely small matrix multiply.
540 if (NUM_COL_A < span) {
541 return;
542 }
543 }
544
545 // Calculate the main part with multiples of 4.
546 int row_m = NUM_COL_A & (int)(~(span - 1));
547 for (int row = 0; row < row_m; row += span) {
Austin Schuh1d1e6ea2020-12-23 21:56:30 -0800548 // clang-format off
Austin Schuh70cc9552019-01-21 19:46:48 -0800549 MTV_mat4x1(NUM_ROW_A, &A[row], NUM_COL_A,
550 &b[0], &c[row], kOperation);
Austin Schuh1d1e6ea2020-12-23 21:56:30 -0800551 // clang-format on
Austin Schuh70cc9552019-01-21 19:46:48 -0800552 }
553
554#endif // CERES_NO_CUSTOM_BLAS
555}
556
557#undef CERES_GEMM_BEGIN
558#undef CERES_GEMM_EIGEN_HEADER
559#undef CERES_GEMM_NAIVE_HEADER
560#undef CERES_CALL_GEMM
561#undef CERES_GEMM_STORE_SINGLE
562#undef CERES_GEMM_STORE_PAIR
563
564} // namespace internal
565} // namespace ceres
566
567#endif // CERES_INTERNAL_SMALL_BLAS_H_