blob: fb8d7fa58172dffb13ae028e1f81a2aa1372eb27 [file] [log] [blame]
Austin Schuh70cc9552019-01-21 19:46:48 -08001// Ceres Solver - A fast non-linear least squares minimizer
Austin Schuh3de38b02024-06-25 18:25:10 -07002// Copyright 2023 Google Inc. All rights reserved.
Austin Schuh70cc9552019-01-21 19:46:48 -08003// http://ceres-solver.org/
4//
5// Redistribution and use in source and binary forms, with or without
6// modification, are permitted provided that the following conditions are met:
7//
8// * Redistributions of source code must retain the above copyright notice,
9// this list of conditions and the following disclaimer.
10// * Redistributions in binary form must reproduce the above copyright notice,
11// this list of conditions and the following disclaimer in the documentation
12// and/or other materials provided with the distribution.
13// * Neither the name of Google Inc. nor the names of its contributors may be
14// used to endorse or promote products derived from this software without
15// specific prior written permission.
16//
17// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
18// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
19// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
20// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
21// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
22// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
23// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
24// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
25// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
26// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
27// POSSIBILITY OF SUCH DAMAGE.
28//
29// Author: sameeragarwal@google.com (Sameer Agarwal)
30//
31// Simple blas functions for use in the Schur Eliminator. These are
32// fairly basic implementations which already yield a significant
33// speedup in the eliminator performance.
34
35#ifndef CERES_INTERNAL_SMALL_BLAS_H_
36#define CERES_INTERNAL_SMALL_BLAS_H_
37
Austin Schuh70cc9552019-01-21 19:46:48 -080038#include "ceres/internal/eigen.h"
Austin Schuh3de38b02024-06-25 18:25:10 -070039#include "ceres/internal/export.h"
Austin Schuh70cc9552019-01-21 19:46:48 -080040#include "glog/logging.h"
41#include "small_blas_generic.h"
42
Austin Schuh3de38b02024-06-25 18:25:10 -070043namespace ceres::internal {
Austin Schuh70cc9552019-01-21 19:46:48 -080044
45// The following three macros are used to share code and reduce
46// template junk across the various GEMM variants.
// Emits the shared signature for all GEMM variants: the row-major
// matrices A (num_row_a x num_col_a) and B (num_row_b x num_col_b) are
// combined into the row-major matrix C, whose true size is
// row_stride_c x col_stride_c, at the block anchored at
// (start_row_c, start_col_c). kRowA/kColA/kRowB/kColB fix dimensions at
// compile time (or are Eigen::Dynamic); kOperation selects +=, -=, or =.
#define CERES_GEMM_BEGIN(name)                                          \
  template <int kRowA, int kColA, int kRowB, int kColB, int kOperation> \
  inline void name(const double* A,                                     \
                   const int num_row_a,                                 \
                   const int num_col_a,                                 \
                   const double* B,                                     \
                   const int num_row_b,                                 \
                   const int num_col_b,                                 \
                   double* C,                                           \
                   const int start_row_c,                               \
                   const int start_col_c,                               \
                   const int row_stride_c,                              \
                   const int col_stride_c)
60
// Shared prologue for the hand-rolled (naive) variants: validates the
// runtime dimensions against the compile-time template parameters, then
// binds NUM_* to the compile-time value whenever it is fixed so the
// compiler can fully unroll the loops that follow.
#define CERES_GEMM_NAIVE_HEADER                                        \
  DCHECK_GT(num_row_a, 0);                                             \
  DCHECK_GT(num_col_a, 0);                                             \
  DCHECK_GT(num_row_b, 0);                                             \
  DCHECK_GT(num_col_b, 0);                                             \
  DCHECK_GE(start_row_c, 0);                                           \
  DCHECK_GE(start_col_c, 0);                                           \
  DCHECK_GT(row_stride_c, 0);                                          \
  DCHECK_GT(col_stride_c, 0);                                          \
  DCHECK((kRowA == Eigen::Dynamic) || (kRowA == num_row_a));           \
  DCHECK((kColA == Eigen::Dynamic) || (kColA == num_col_a));           \
  DCHECK((kRowB == Eigen::Dynamic) || (kRowB == num_row_b));           \
  DCHECK((kColB == Eigen::Dynamic) || (kColB == num_col_b));           \
  const int NUM_ROW_A = (kRowA != Eigen::Dynamic ? kRowA : num_row_a); \
  const int NUM_COL_A = (kColA != Eigen::Dynamic ? kColA : num_col_a); \
  const int NUM_ROW_B = (kRowB != Eigen::Dynamic ? kRowB : num_row_b); \
  const int NUM_COL_B = (kColB != Eigen::Dynamic ? kColB : num_col_b);
78
// Shared prologue for the Eigen-backed variants: wraps the raw pointers
// in Eigen map types (no data is copied).
#define CERES_GEMM_EIGEN_HEADER                                 \
  const typename EigenTypes<kRowA, kColA>::ConstMatrixRef Aref( \
      A, num_row_a, num_col_a);                                 \
  const typename EigenTypes<kRowB, kColB>::ConstMatrixRef Bref( \
      B, num_row_b, num_col_b);                                 \
  MatrixRef Cref(C, row_stride_c, col_stride_c);
Austin Schuh70cc9552019-01-21 19:46:48 -080085
// Forwards the current argument list and template parameters to another
// GEMM variant; used by the dispatching entry points below.
// clang-format off
#define CERES_CALL_GEMM(name)                                    \
  name<kRowA, kColA, kRowB, kColB, kOperation>(                  \
      A, num_row_a, num_col_a,                                   \
      B, num_row_b, num_col_b,                                   \
      C, start_row_c, start_col_c, row_stride_c, col_stride_c);
// clang-format on
Austin Schuh70cc9552019-01-21 19:46:48 -080093
// Stores one scalar into p[index] honoring kOperation:
// > 0 accumulates (+=), < 0 subtracts (-=), == 0 overwrites (=).
#define CERES_GEMM_STORE_SINGLE(p, index, value) \
  if (kOperation > 0) {                          \
    p[index] += value;                           \
  } else if (kOperation < 0) {                   \
    p[index] -= value;                           \
  } else {                                       \
    p[index] = value;                            \
  }
102
// Stores two adjacent scalars into p[index] and p[index + 1] honoring
// kOperation, same convention as CERES_GEMM_STORE_SINGLE.
#define CERES_GEMM_STORE_PAIR(p, index, v1, v2) \
  if (kOperation > 0) {                         \
    p[index] += v1;                             \
    p[index + 1] += v2;                         \
  } else if (kOperation < 0) {                  \
    p[index] -= v1;                             \
    p[index + 1] -= v2;                         \
  } else {                                      \
    p[index] = v1;                              \
    p[index + 1] = v2;                          \
  }
114
115// For the matrix-matrix functions below, there are three variants for
116// each functionality. Foo, FooNaive and FooEigen. Foo is the one to
117// be called by the user. FooNaive is a basic loop based
118// implementation and FooEigen uses Eigen's implementation. Foo
119// chooses between FooNaive and FooEigen depending on how many of the
120// template arguments are fixed at compile time. Currently, FooEigen
121// is called if all matrix dimensions are compile time
122// constants. FooNaive is called otherwise. This leads to the best
123// performance currently.
124//
125// The MatrixMatrixMultiply variants compute:
126//
127// C op A * B;
128//
129// The MatrixTransposeMatrixMultiply variants compute:
130//
131// C op A' * B
132//
133// where op can be +=, -=, or =.
134//
135// The template parameters (kRowA, kColA, kRowB, kColB) allow
136// specialization of the loop at compile time. If this information is
137// not available, then Eigen::Dynamic should be used as the template
138// argument.
139//
140// kOperation = 1 -> C += A * B
141// kOperation = -1 -> C -= A * B
142// kOperation = 0 -> C = A * B
143//
144// The functions can write into matrices C which are larger than the
145// matrix A * B. This is done by specifying the true size of C via
146// row_stride_c and col_stride_c, and then indicating where A * B
147// should be written into by start_row_c and start_col_c.
148//
149// Graphically if row_stride_c = 10, col_stride_c = 12, start_row_c =
150// 4 and start_col_c = 5, then if A = 3x2 and B = 2x4, we get
151//
152// ------------
153// ------------
154// ------------
155// ------------
156// -----xxxx---
157// -----xxxx---
158// -----xxxx---
159// ------------
160// ------------
161// ------------
162//
163CERES_GEMM_BEGIN(MatrixMatrixMultiplyEigen) {
164 CERES_GEMM_EIGEN_HEADER
Austin Schuh1d1e6ea2020-12-23 21:56:30 -0800165 Eigen::Block<MatrixRef, kRowA, kColB> block(
166 Cref, start_row_c, start_col_c, num_row_a, num_col_b);
Austin Schuh70cc9552019-01-21 19:46:48 -0800167
168 if (kOperation > 0) {
169 block.noalias() += Aref * Bref;
170 } else if (kOperation < 0) {
171 block.noalias() -= Aref * Bref;
172 } else {
173 block.noalias() = Aref * Bref;
174 }
175}
176
// Loop-based implementation of C op A * B. The columns of C are handled
// in three stages so that the bulk of the work goes through the 4-wide
// kernel MMM_mat1x4: one trailing odd column, then a trailing pair of
// columns, then all remaining multiple-of-4 columns.
CERES_GEMM_BEGIN(MatrixMatrixMultiplyNaive) {
  CERES_GEMM_NAIVE_HEADER
  DCHECK_EQ(NUM_COL_A, NUM_ROW_B);

  const int NUM_ROW_C = NUM_ROW_A;
  const int NUM_COL_C = NUM_COL_B;
  DCHECK_LE(start_row_c + NUM_ROW_C, row_stride_c);
  DCHECK_LE(start_col_c + NUM_COL_C, col_stride_c);
  // Width of the vectorized kernel; a power of two, which the mask
  // arithmetic below relies on.
  const int span = 4;

  // Calculate the remainder part first.

  // Process the last odd column if present.
  if (NUM_COL_C & 1) {
    int col = NUM_COL_C - 1;
    const double* pa = &A[0];
    // pa walks the rows of A; pb walks column `col` of B with stride
    // NUM_COL_B (B is row-major).
    for (int row = 0; row < NUM_ROW_C; ++row, pa += NUM_COL_A) {
      const double* pb = &B[col];
      double tmp = 0.0;
      for (int k = 0; k < NUM_COL_A; ++k, pb += NUM_COL_B) {
        tmp += pa[k] * pb[0];
      }

      // Linear index of C(start_row_c + row, start_col_c + col).
      const int index = (row + start_row_c) * col_stride_c + start_col_c + col;
      CERES_GEMM_STORE_SINGLE(C, index, tmp);
    }

    // Return directly for efficiency of extremely small matrix multiply.
    if (NUM_COL_C == 1) {
      return;
    }
  }

  // Process the couple columns in remainder if present.
  if (NUM_COL_C & 2) {
    // First column of the trailing pair: the largest multiple of span
    // that fits (the odd column, if any, was handled above).
    int col = NUM_COL_C & (~(span - 1));
    const double* pa = &A[0];
    for (int row = 0; row < NUM_ROW_C; ++row, pa += NUM_COL_A) {
      const double* pb = &B[col];
      double tmp1 = 0.0, tmp2 = 0.0;
      for (int k = 0; k < NUM_COL_A; ++k, pb += NUM_COL_B) {
        double av = pa[k];
        tmp1 += av * pb[0];
        tmp2 += av * pb[1];
      }

      const int index = (row + start_row_c) * col_stride_c + start_col_c + col;
      CERES_GEMM_STORE_PAIR(C, index, tmp1, tmp2);
    }

    // Return directly for efficiency of extremely small matrix multiply.
    if (NUM_COL_C < span) {
      return;
    }
  }

  // Calculate the main part with multiples of 4.
  int col_m = NUM_COL_C & (~(span - 1));
  for (int col = 0; col < col_m; col += span) {
    for (int row = 0; row < NUM_ROW_C; ++row) {
      const int index = (row + start_row_c) * col_stride_c + start_col_c + col;
      // clang-format off
      MMM_mat1x4(NUM_COL_A, &A[row * NUM_COL_A],
                 &B[col], NUM_COL_B, &C[index], kOperation);
      // clang-format on
    }
  }
}
245
246CERES_GEMM_BEGIN(MatrixMatrixMultiply) {
247#ifdef CERES_NO_CUSTOM_BLAS
248
249 CERES_CALL_GEMM(MatrixMatrixMultiplyEigen)
250 return;
251
252#else
253
254 if (kRowA != Eigen::Dynamic && kColA != Eigen::Dynamic &&
255 kRowB != Eigen::Dynamic && kColB != Eigen::Dynamic) {
256 CERES_CALL_GEMM(MatrixMatrixMultiplyEigen)
257 } else {
258 CERES_CALL_GEMM(MatrixMatrixMultiplyNaive)
259 }
260
261#endif
262}
263
264CERES_GEMM_BEGIN(MatrixTransposeMatrixMultiplyEigen) {
265 CERES_GEMM_EIGEN_HEADER
Austin Schuh1d1e6ea2020-12-23 21:56:30 -0800266 // clang-format off
Austin Schuh70cc9552019-01-21 19:46:48 -0800267 Eigen::Block<MatrixRef, kColA, kColB> block(Cref,
268 start_row_c, start_col_c,
269 num_col_a, num_col_b);
Austin Schuh1d1e6ea2020-12-23 21:56:30 -0800270 // clang-format on
Austin Schuh70cc9552019-01-21 19:46:48 -0800271 if (kOperation > 0) {
272 block.noalias() += Aref.transpose() * Bref;
273 } else if (kOperation < 0) {
274 block.noalias() -= Aref.transpose() * Bref;
275 } else {
276 block.noalias() = Aref.transpose() * Bref;
277 }
278}
279
// Loop-based implementation of C op A' * B. A is traversed down its
// columns (stride NUM_COL_A) so no explicit transpose is ever formed.
// The columns of C are handled in three stages so the bulk of the work
// goes through the 4-wide kernel MTM_mat1x4: one trailing odd column,
// then a trailing pair of columns, then all multiple-of-4 columns.
CERES_GEMM_BEGIN(MatrixTransposeMatrixMultiplyNaive) {
  CERES_GEMM_NAIVE_HEADER
  DCHECK_EQ(NUM_ROW_A, NUM_ROW_B);

  // C is NUM_COL_A x NUM_COL_B because A is used transposed.
  const int NUM_ROW_C = NUM_COL_A;
  const int NUM_COL_C = NUM_COL_B;
  DCHECK_LE(start_row_c + NUM_ROW_C, row_stride_c);
  DCHECK_LE(start_col_c + NUM_COL_C, col_stride_c);
  // Width of the vectorized kernel; a power of two, which the mask
  // arithmetic below relies on.
  const int span = 4;

  // Process the remainder part first.

  // Process the last odd column if present.
  if (NUM_COL_C & 1) {
    int col = NUM_COL_C - 1;
    for (int row = 0; row < NUM_ROW_C; ++row) {
      // pa walks column `row` of A; pb walks column `col` of B; both
      // matrices are row-major, hence the strided advances below.
      const double* pa = &A[row];
      const double* pb = &B[col];
      double tmp = 0.0;
      for (int k = 0; k < NUM_ROW_A; ++k) {
        tmp += pa[0] * pb[0];
        pa += NUM_COL_A;
        pb += NUM_COL_B;
      }

      // Linear index of C(start_row_c + row, start_col_c + col).
      const int index = (row + start_row_c) * col_stride_c + start_col_c + col;
      CERES_GEMM_STORE_SINGLE(C, index, tmp);
    }

    // Return directly for efficiency of extremely small matrix multiply.
    if (NUM_COL_C == 1) {
      return;
    }
  }

  // Process the couple columns in remainder if present.
  if (NUM_COL_C & 2) {
    // First column of the trailing pair: the largest multiple of span
    // that fits (the odd column, if any, was handled above).
    int col = NUM_COL_C & (~(span - 1));
    for (int row = 0; row < NUM_ROW_C; ++row) {
      const double* pa = &A[row];
      const double* pb = &B[col];
      double tmp1 = 0.0, tmp2 = 0.0;
      for (int k = 0; k < NUM_ROW_A; ++k) {
        double av = *pa;
        tmp1 += av * pb[0];
        tmp2 += av * pb[1];
        pa += NUM_COL_A;
        pb += NUM_COL_B;
      }

      const int index = (row + start_row_c) * col_stride_c + start_col_c + col;
      CERES_GEMM_STORE_PAIR(C, index, tmp1, tmp2);
    }

    // Return directly for efficiency of extremely small matrix multiply.
    if (NUM_COL_C < span) {
      return;
    }
  }

  // Process the main part with multiples of 4.
  int col_m = NUM_COL_C & (~(span - 1));
  for (int col = 0; col < col_m; col += span) {
    for (int row = 0; row < NUM_ROW_C; ++row) {
      const int index = (row + start_row_c) * col_stride_c + start_col_c + col;
      // clang-format off
      MTM_mat1x4(NUM_ROW_A, &A[row], NUM_COL_A,
                 &B[col], NUM_COL_B, &C[index], kOperation);
      // clang-format on
    }
  }
}
352
353CERES_GEMM_BEGIN(MatrixTransposeMatrixMultiply) {
354#ifdef CERES_NO_CUSTOM_BLAS
355
356 CERES_CALL_GEMM(MatrixTransposeMatrixMultiplyEigen)
357 return;
358
359#else
360
361 if (kRowA != Eigen::Dynamic && kColA != Eigen::Dynamic &&
362 kRowB != Eigen::Dynamic && kColB != Eigen::Dynamic) {
363 CERES_CALL_GEMM(MatrixTransposeMatrixMultiplyEigen)
364 } else {
365 CERES_CALL_GEMM(MatrixTransposeMatrixMultiplyNaive)
366 }
367
368#endif
369}
370
371// Matrix-Vector multiplication
372//
373// c op A * b;
374//
375// where op can be +=, -=, or =.
376//
377// The template parameters (kRowA, kColA) allow specialization of the
378// loop at compile time. If this information is not available, then
379// Eigen::Dynamic should be used as the template argument.
380//
// kOperation = 1 -> c += A * b
// kOperation = -1 -> c -= A * b
// kOperation = 0 -> c = A * b
// With CERES_NO_CUSTOM_BLAS the Eigen path is used; otherwise the rows
// of A are processed with the same odd / pair / multiple-of-4 staging
// used by the GEMM routines above, with MVM_mat4x1 as the 4-row kernel.
template <int kRowA, int kColA, int kOperation>
inline void MatrixVectorMultiply(const double* A,
                                 const int num_row_a,
                                 const int num_col_a,
                                 const double* b,
                                 double* c) {
#ifdef CERES_NO_CUSTOM_BLAS
  const typename EigenTypes<kRowA, kColA>::ConstMatrixRef Aref(
      A, num_row_a, num_col_a);
  const typename EigenTypes<kColA>::ConstVectorRef bref(b, num_col_a);
  typename EigenTypes<kRowA>::VectorRef cref(c, num_row_a);

  // lazyProduct works better than .noalias() for matrix-vector
  // products.
  if (kOperation > 0) {
    cref += Aref.lazyProduct(bref);
  } else if (kOperation < 0) {
    cref -= Aref.lazyProduct(bref);
  } else {
    cref = Aref.lazyProduct(bref);
  }
#else

  DCHECK_GT(num_row_a, 0);
  DCHECK_GT(num_col_a, 0);
  DCHECK((kRowA == Eigen::Dynamic) || (kRowA == num_row_a));
  DCHECK((kColA == Eigen::Dynamic) || (kColA == num_col_a));

  // Bind to the compile-time dimension when it is fixed so the loops
  // below can be unrolled by the compiler.
  const int NUM_ROW_A = (kRowA != Eigen::Dynamic ? kRowA : num_row_a);
  const int NUM_COL_A = (kColA != Eigen::Dynamic ? kColA : num_col_a);
  // Width of the vectorized kernel; a power of two, which the mask
  // arithmetic below relies on.
  const int span = 4;

  // Calculate the remainder part first.

  // Process the last odd row if present.
  if (NUM_ROW_A & 1) {
    int row = NUM_ROW_A - 1;
    const double* pa = &A[row * NUM_COL_A];
    const double* pb = &b[0];
    double tmp = 0.0;
    for (int col = 0; col < NUM_COL_A; ++col) {
      tmp += (*pa++) * (*pb++);
    }
    CERES_GEMM_STORE_SINGLE(c, row, tmp);

    // Return directly for efficiency of extremely small matrix multiply.
    if (NUM_ROW_A == 1) {
      return;
    }
  }

  // Process the couple rows in remainder if present.
  if (NUM_ROW_A & 2) {
    // First row of the trailing pair: the largest multiple of span that
    // fits (the odd row, if any, was handled above).
    int row = NUM_ROW_A & (~(span - 1));
    const double* pa1 = &A[row * NUM_COL_A];
    const double* pa2 = pa1 + NUM_COL_A;
    const double* pb = &b[0];
    double tmp1 = 0.0, tmp2 = 0.0;
    for (int col = 0; col < NUM_COL_A; ++col) {
      double bv = *pb++;
      tmp1 += *(pa1++) * bv;
      tmp2 += *(pa2++) * bv;
    }
    CERES_GEMM_STORE_PAIR(c, row, tmp1, tmp2);

    // Return directly for efficiency of extremely small matrix multiply.
    if (NUM_ROW_A < span) {
      return;
    }
  }

  // Calculate the main part with multiples of 4.
  int row_m = NUM_ROW_A & (~(span - 1));
  for (int row = 0; row < row_m; row += span) {
    // clang-format off
    MVM_mat4x1(NUM_COL_A, &A[row * NUM_COL_A], NUM_COL_A,
               &b[0], &c[row], kOperation);
    // clang-format on
  }

#endif  // CERES_NO_CUSTOM_BLAS
}
466
467// Similar to MatrixVectorMultiply, except that A is transposed, i.e.,
468//
469// c op A' * b;
// Note: in the custom-kernel path below, `row` indexes entries of c,
// which correspond to columns of A (i.e. rows of A').
template <int kRowA, int kColA, int kOperation>
inline void MatrixTransposeVectorMultiply(const double* A,
                                          const int num_row_a,
                                          const int num_col_a,
                                          const double* b,
                                          double* c) {
#ifdef CERES_NO_CUSTOM_BLAS
  const typename EigenTypes<kRowA, kColA>::ConstMatrixRef Aref(
      A, num_row_a, num_col_a);
  const typename EigenTypes<kRowA>::ConstVectorRef bref(b, num_row_a);
  typename EigenTypes<kColA>::VectorRef cref(c, num_col_a);

  // lazyProduct works better than .noalias() for matrix-vector
  // products.
  if (kOperation > 0) {
    cref += Aref.transpose().lazyProduct(bref);
  } else if (kOperation < 0) {
    cref -= Aref.transpose().lazyProduct(bref);
  } else {
    cref = Aref.transpose().lazyProduct(bref);
  }
#else

  DCHECK_GT(num_row_a, 0);
  DCHECK_GT(num_col_a, 0);
  DCHECK((kRowA == Eigen::Dynamic) || (kRowA == num_row_a));
  DCHECK((kColA == Eigen::Dynamic) || (kColA == num_col_a));

  // Bind to the compile-time dimension when it is fixed so the loops
  // below can be unrolled by the compiler.
  const int NUM_ROW_A = (kRowA != Eigen::Dynamic ? kRowA : num_row_a);
  const int NUM_COL_A = (kColA != Eigen::Dynamic ? kColA : num_col_a);
  // Width of the vectorized kernel; a power of two, which the mask
  // arithmetic below relies on.
  const int span = 4;

  // Calculate the remainder part first.

  // Process the last odd column if present.
  if (NUM_COL_A & 1) {
    int row = NUM_COL_A - 1;
    // pa walks down column `row` of the row-major A, stride NUM_COL_A.
    const double* pa = &A[row];
    const double* pb = &b[0];
    double tmp = 0.0;
    for (int col = 0; col < NUM_ROW_A; ++col) {
      tmp += *pa * (*pb++);
      pa += NUM_COL_A;
    }
    CERES_GEMM_STORE_SINGLE(c, row, tmp);

    // Return directly for efficiency of extremely small matrix multiply.
    if (NUM_COL_A == 1) {
      return;
    }
  }

  // Process the couple columns in remainder if present.
  if (NUM_COL_A & 2) {
    // First column of the trailing pair: the largest multiple of span
    // that fits (the odd column, if any, was handled above).
    int row = NUM_COL_A & (~(span - 1));
    const double* pa = &A[row];
    const double* pb = &b[0];
    double tmp1 = 0.0, tmp2 = 0.0;
    for (int col = 0; col < NUM_ROW_A; ++col) {
      // clang-format off
      double bv = *pb++;
      tmp1 += *(pa    ) * bv;
      tmp2 += *(pa + 1) * bv;
      pa += NUM_COL_A;
      // clang-format on
    }
    CERES_GEMM_STORE_PAIR(c, row, tmp1, tmp2);

    // Return directly for efficiency of extremely small matrix multiply.
    if (NUM_COL_A < span) {
      return;
    }
  }

  // Calculate the main part with multiples of 4.
  int row_m = NUM_COL_A & (~(span - 1));
  for (int row = 0; row < row_m; row += span) {
    // clang-format off
    MTV_mat4x1(NUM_ROW_A, &A[row], NUM_COL_A,
               &b[0], &c[row], kOperation);
    // clang-format on
  }

#endif  // CERES_NO_CUSTOM_BLAS
}
555
556#undef CERES_GEMM_BEGIN
557#undef CERES_GEMM_EIGEN_HEADER
558#undef CERES_GEMM_NAIVE_HEADER
559#undef CERES_CALL_GEMM
560#undef CERES_GEMM_STORE_SINGLE
561#undef CERES_GEMM_STORE_PAIR
562
Austin Schuh3de38b02024-06-25 18:25:10 -0700563} // namespace ceres::internal
Austin Schuh70cc9552019-01-21 19:46:48 -0800564
565#endif // CERES_INTERNAL_SMALL_BLAS_H_