Blame - internal/ceres/small_blas.h - RealtimeRoboticsGroup/test

blob: 4ee9229f35fc290beaa4693f15637794cb7f13af [file] [log] [blame]

Austin Schuh	70cc955	2019-01-21 19:46:48 -0800	[diff] [blame]	1	// Ceres Solver - A fast non-linear least squares minimizer
				2	// Copyright 2015 Google Inc. All rights reserved.
				3	// http://ceres-solver.org/
				4	//
				5	// Redistribution and use in source and binary forms, with or without
				6	// modification, are permitted provided that the following conditions are met:
				7	//
				8	// * Redistributions of source code must retain the above copyright notice,
				9	// this list of conditions and the following disclaimer.
				10	// * Redistributions in binary form must reproduce the above copyright notice,
				11	// this list of conditions and the following disclaimer in the documentation
				12	// and/or other materials provided with the distribution.
				13	// * Neither the name of Google Inc. nor the names of its contributors may be
				14	// used to endorse or promote products derived from this software without
				15	// specific prior written permission.
				16	//
				17	// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
				18	// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
				19	// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
				20	// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
				21	// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
				22	// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
				23	// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
				24	// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
				25	// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
				26	// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
				27	// POSSIBILITY OF SUCH DAMAGE.
				28	//
				29	// Author: sameeragarwal@google.com (Sameer Agarwal)
				30	//
				31	// Simple blas functions for use in the Schur Eliminator. These are
				32	// fairly basic implementations which already yield a significant
				33	// speedup in the eliminator performance.
				34
				35	#ifndef CERES_INTERNAL_SMALL_BLAS_H_
				36	#define CERES_INTERNAL_SMALL_BLAS_H_
				37
Austin Schuh	70cc955	2019-01-21 19:46:48 -0800	[diff] [blame]	38	#include "ceres/internal/eigen.h"
Austin Schuh	1d1e6ea	2020-12-23 21:56:30 -0800	[diff] [blame^]	39	#include "ceres/internal/port.h"
Austin Schuh	70cc955	2019-01-21 19:46:48 -0800	[diff] [blame]	40	#include "glog/logging.h"
				41	#include "small_blas_generic.h"
				42
				43	namespace ceres {
				44	namespace internal {
				45
				46	// The following three macros are used to share code and reduce
				47	// template junk across the various GEMM variants.
				48	#define CERES_GEMM_BEGIN(name) \
Austin Schuh	1d1e6ea	2020-12-23 21:56:30 -0800	[diff] [blame^]	49	template <int kRowA, int kColA, int kRowB, int kColB, int kOperation> \
Austin Schuh	70cc955	2019-01-21 19:46:48 -0800	[diff] [blame]	50	inline void name(const double* A, \
				51	const int num_row_a, \
				52	const int num_col_a, \
				53	const double* B, \
				54	const int num_row_b, \
				55	const int num_col_b, \
				56	double* C, \
				57	const int start_row_c, \
				58	const int start_col_c, \
				59	const int row_stride_c, \
				60	const int col_stride_c)
				61
Austin Schuh	1d1e6ea	2020-12-23 21:56:30 -0800	[diff] [blame^]	62	#define CERES_GEMM_NAIVE_HEADER \
				63	DCHECK_GT(num_row_a, 0); \
				64	DCHECK_GT(num_col_a, 0); \
				65	DCHECK_GT(num_row_b, 0); \
				66	DCHECK_GT(num_col_b, 0); \
				67	DCHECK_GE(start_row_c, 0); \
				68	DCHECK_GE(start_col_c, 0); \
				69	DCHECK_GT(row_stride_c, 0); \
				70	DCHECK_GT(col_stride_c, 0); \
				71	DCHECK((kRowA == Eigen::Dynamic) \|\| (kRowA == num_row_a)); \
				72	DCHECK((kColA == Eigen::Dynamic) \|\| (kColA == num_col_a)); \
				73	DCHECK((kRowB == Eigen::Dynamic) \|\| (kRowB == num_row_b)); \
				74	DCHECK((kColB == Eigen::Dynamic) \|\| (kColB == num_col_b)); \
				75	const int NUM_ROW_A = (kRowA != Eigen::Dynamic ? kRowA : num_row_a); \
				76	const int NUM_COL_A = (kColA != Eigen::Dynamic ? kColA : num_col_a); \
				77	const int NUM_ROW_B = (kRowB != Eigen::Dynamic ? kRowB : num_row_b); \
Austin Schuh	70cc955	2019-01-21 19:46:48 -0800	[diff] [blame]	78	const int NUM_COL_B = (kColB != Eigen::Dynamic ? kColB : num_col_b);
				79
Austin Schuh	1d1e6ea	2020-12-23 21:56:30 -0800	[diff] [blame^]	80	#define CERES_GEMM_EIGEN_HEADER \
				81	const typename EigenTypes<kRowA, kColA>::ConstMatrixRef Aref( \
				82	A, num_row_a, num_col_a); \
				83	const typename EigenTypes<kRowB, kColB>::ConstMatrixRef Bref( \
				84	B, num_row_b, num_col_b); \
				85	MatrixRef Cref(C, row_stride_c, col_stride_c);
Austin Schuh	70cc955	2019-01-21 19:46:48 -0800	[diff] [blame]	86
Austin Schuh	1d1e6ea	2020-12-23 21:56:30 -0800	[diff] [blame^]	87	// clang-format off
Austin Schuh	70cc955	2019-01-21 19:46:48 -0800	[diff] [blame]	88	#define CERES_CALL_GEMM(name) \
				89	name<kRowA, kColA, kRowB, kColB, kOperation>( \
				90	A, num_row_a, num_col_a, \
				91	B, num_row_b, num_col_b, \
				92	C, start_row_c, start_col_c, row_stride_c, col_stride_c);
Austin Schuh	1d1e6ea	2020-12-23 21:56:30 -0800	[diff] [blame^]	93	// clang-format on
Austin Schuh	70cc955	2019-01-21 19:46:48 -0800	[diff] [blame]	94
Austin Schuh	1d1e6ea	2020-12-23 21:56:30 -0800	[diff] [blame^]	95	#define CERES_GEMM_STORE_SINGLE(p, index, value) \
				96	if (kOperation > 0) { \
				97	p[index] += value; \
				98	} else if (kOperation < 0) { \
				99	p[index] -= value; \
				100	} else { \
				101	p[index] = value; \
Austin Schuh	70cc955	2019-01-21 19:46:48 -0800	[diff] [blame]	102	}
				103
Austin Schuh	1d1e6ea	2020-12-23 21:56:30 -0800	[diff] [blame^]	104	#define CERES_GEMM_STORE_PAIR(p, index, v1, v2) \
				105	if (kOperation > 0) { \
				106	p[index] += v1; \
				107	p[index + 1] += v2; \
				108	} else if (kOperation < 0) { \
				109	p[index] -= v1; \
				110	p[index + 1] -= v2; \
				111	} else { \
				112	p[index] = v1; \
				113	p[index + 1] = v2; \
Austin Schuh	70cc955	2019-01-21 19:46:48 -0800	[diff] [blame]	114	}
				115
				116	// For the matrix-matrix functions below, there are three variants for
				117	// each functionality. Foo, FooNaive and FooEigen. Foo is the one to
				118	// be called by the user. FooNaive is a basic loop based
				119	// implementation and FooEigen uses Eigen's implementation. Foo
				120	// chooses between FooNaive and FooEigen depending on how many of the
				121	// template arguments are fixed at compile time. Currently, FooEigen
				122	// is called if all matrix dimensions are compile time
				123	// constants. FooNaive is called otherwise. This leads to the best
				124	// performance currently.
				125	//
				126	// The MatrixMatrixMultiply variants compute:
				127	//
				128	// C op A * B;
				129	//
				130	// The MatrixTransposeMatrixMultiply variants compute:
				131	//
				132	// C op A' * B
				133	//
				134	// where op can be +=, -=, or =.
				135	//
				136	// The template parameters (kRowA, kColA, kRowB, kColB) allow
				137	// specialization of the loop at compile time. If this information is
				138	// not available, then Eigen::Dynamic should be used as the template
				139	// argument.
				140	//
				141	// kOperation = 1 -> C += A * B
				142	// kOperation = -1 -> C -= A * B
				143	// kOperation = 0 -> C = A * B
				144	//
				145	// The functions can write into matrices C which are larger than the
				146	// matrix A * B. This is done by specifying the true size of C via
				147	// row_stride_c and col_stride_c, and then indicating where A * B
				148	// should be written into by start_row_c and start_col_c.
				149	//
				150	// Graphically if row_stride_c = 10, col_stride_c = 12, start_row_c =
				151	// 4 and start_col_c = 5, then if A = 3x2 and B = 2x4, we get
				152	//
				153	// ------------
				154	// ------------
				155	// ------------
				156	// ------------
				157	// -----xxxx---
				158	// -----xxxx---
				159	// -----xxxx---
				160	// ------------
				161	// ------------
				162	// ------------
				163	//
				164	CERES_GEMM_BEGIN(MatrixMatrixMultiplyEigen) {
				165	CERES_GEMM_EIGEN_HEADER
Austin Schuh	1d1e6ea	2020-12-23 21:56:30 -0800	[diff] [blame^]	166	Eigen::Block<MatrixRef, kRowA, kColB> block(
				167	Cref, start_row_c, start_col_c, num_row_a, num_col_b);
Austin Schuh	70cc955	2019-01-21 19:46:48 -0800	[diff] [blame]	168
				169	if (kOperation > 0) {
				170	block.noalias() += Aref * Bref;
				171	} else if (kOperation < 0) {
				172	block.noalias() -= Aref * Bref;
				173	} else {
				174	block.noalias() = Aref * Bref;
				175	}
				176	}
				177
				178	CERES_GEMM_BEGIN(MatrixMatrixMultiplyNaive) {
				179	CERES_GEMM_NAIVE_HEADER
				180	DCHECK_EQ(NUM_COL_A, NUM_ROW_B);
				181
				182	const int NUM_ROW_C = NUM_ROW_A;
				183	const int NUM_COL_C = NUM_COL_B;
				184	DCHECK_LE(start_row_c + NUM_ROW_C, row_stride_c);
				185	DCHECK_LE(start_col_c + NUM_COL_C, col_stride_c);
				186	const int span = 4;
				187
				188	// Calculate the remainder part first.
				189
				190	// Process the last odd column if present.
				191	if (NUM_COL_C & 1) {
				192	int col = NUM_COL_C - 1;
				193	const double* pa = &A[0];
				194	for (int row = 0; row < NUM_ROW_C; ++row, pa += NUM_COL_A) {
				195	const double* pb = &B[col];
				196	double tmp = 0.0;
				197	for (int k = 0; k < NUM_COL_A; ++k, pb += NUM_COL_B) {
				198	tmp += pa[k] * pb[0];
				199	}
				200
				201	const int index = (row + start_row_c) * col_stride_c + start_col_c + col;
				202	CERES_GEMM_STORE_SINGLE(C, index, tmp);
				203	}
				204
				205	// Return directly for efficiency of extremely small matrix multiply.
				206	if (NUM_COL_C == 1) {
				207	return;
				208	}
				209	}
				210
				211	// Process the couple columns in remainder if present.
				212	if (NUM_COL_C & 2) {
Austin Schuh	1d1e6ea	2020-12-23 21:56:30 -0800	[diff] [blame^]	213	int col = NUM_COL_C & (int)(~(span - 1));
Austin Schuh	70cc955	2019-01-21 19:46:48 -0800	[diff] [blame]	214	const double* pa = &A[0];
				215	for (int row = 0; row < NUM_ROW_C; ++row, pa += NUM_COL_A) {
				216	const double* pb = &B[col];
				217	double tmp1 = 0.0, tmp2 = 0.0;
				218	for (int k = 0; k < NUM_COL_A; ++k, pb += NUM_COL_B) {
				219	double av = pa[k];
				220	tmp1 += av * pb[0];
				221	tmp2 += av * pb[1];
				222	}
				223
				224	const int index = (row + start_row_c) * col_stride_c + start_col_c + col;
				225	CERES_GEMM_STORE_PAIR(C, index, tmp1, tmp2);
				226	}
				227
				228	// Return directly for efficiency of extremely small matrix multiply.
				229	if (NUM_COL_C < span) {
				230	return;
				231	}
				232	}
				233
				234	// Calculate the main part with multiples of 4.
				235	int col_m = NUM_COL_C & (int)(~(span - 1));
				236	for (int col = 0; col < col_m; col += span) {
				237	for (int row = 0; row < NUM_ROW_C; ++row) {
				238	const int index = (row + start_row_c) * col_stride_c + start_col_c + col;
Austin Schuh	1d1e6ea	2020-12-23 21:56:30 -0800	[diff] [blame^]	239	// clang-format off
Austin Schuh	70cc955	2019-01-21 19:46:48 -0800	[diff] [blame]	240	MMM_mat1x4(NUM_COL_A, &A[row * NUM_COL_A],
				241	&B[col], NUM_COL_B, &C[index], kOperation);
Austin Schuh	1d1e6ea	2020-12-23 21:56:30 -0800	[diff] [blame^]	242	// clang-format on
Austin Schuh	70cc955	2019-01-21 19:46:48 -0800	[diff] [blame]	243	}
				244	}
Austin Schuh	70cc955	2019-01-21 19:46:48 -0800	[diff] [blame]	245	}
				246
				247	CERES_GEMM_BEGIN(MatrixMatrixMultiply) {
				248	#ifdef CERES_NO_CUSTOM_BLAS
				249
				250	CERES_CALL_GEMM(MatrixMatrixMultiplyEigen)
				251	return;
				252
				253	#else
				254
				255	if (kRowA != Eigen::Dynamic && kColA != Eigen::Dynamic &&
				256	kRowB != Eigen::Dynamic && kColB != Eigen::Dynamic) {
				257	CERES_CALL_GEMM(MatrixMatrixMultiplyEigen)
				258	} else {
				259	CERES_CALL_GEMM(MatrixMatrixMultiplyNaive)
				260	}
				261
				262	#endif
				263	}
				264
				265	CERES_GEMM_BEGIN(MatrixTransposeMatrixMultiplyEigen) {
				266	CERES_GEMM_EIGEN_HEADER
Austin Schuh	1d1e6ea	2020-12-23 21:56:30 -0800	[diff] [blame^]	267	// clang-format off
Austin Schuh	70cc955	2019-01-21 19:46:48 -0800	[diff] [blame]	268	Eigen::Block<MatrixRef, kColA, kColB> block(Cref,
				269	start_row_c, start_col_c,
				270	num_col_a, num_col_b);
Austin Schuh	1d1e6ea	2020-12-23 21:56:30 -0800	[diff] [blame^]	271	// clang-format on
Austin Schuh	70cc955	2019-01-21 19:46:48 -0800	[diff] [blame]	272	if (kOperation > 0) {
				273	block.noalias() += Aref.transpose() * Bref;
				274	} else if (kOperation < 0) {
				275	block.noalias() -= Aref.transpose() * Bref;
				276	} else {
				277	block.noalias() = Aref.transpose() * Bref;
				278	}
				279	}
				280
				281	CERES_GEMM_BEGIN(MatrixTransposeMatrixMultiplyNaive) {
				282	CERES_GEMM_NAIVE_HEADER
				283	DCHECK_EQ(NUM_ROW_A, NUM_ROW_B);
				284
				285	const int NUM_ROW_C = NUM_COL_A;
				286	const int NUM_COL_C = NUM_COL_B;
				287	DCHECK_LE(start_row_c + NUM_ROW_C, row_stride_c);
				288	DCHECK_LE(start_col_c + NUM_COL_C, col_stride_c);
				289	const int span = 4;
				290
				291	// Process the remainder part first.
				292
				293	// Process the last odd column if present.
				294	if (NUM_COL_C & 1) {
				295	int col = NUM_COL_C - 1;
				296	for (int row = 0; row < NUM_ROW_C; ++row) {
				297	const double* pa = &A[row];
				298	const double* pb = &B[col];
				299	double tmp = 0.0;
				300	for (int k = 0; k < NUM_ROW_A; ++k) {
				301	tmp += pa[0] * pb[0];
				302	pa += NUM_COL_A;
				303	pb += NUM_COL_B;
				304	}
				305
				306	const int index = (row + start_row_c) * col_stride_c + start_col_c + col;
				307	CERES_GEMM_STORE_SINGLE(C, index, tmp);
				308	}
				309
				310	// Return directly for efficiency of extremely small matrix multiply.
				311	if (NUM_COL_C == 1) {
				312	return;
				313	}
				314	}
				315
				316	// Process the couple columns in remainder if present.
				317	if (NUM_COL_C & 2) {
Austin Schuh	1d1e6ea	2020-12-23 21:56:30 -0800	[diff] [blame^]	318	int col = NUM_COL_C & (int)(~(span - 1));
Austin Schuh	70cc955	2019-01-21 19:46:48 -0800	[diff] [blame]	319	for (int row = 0; row < NUM_ROW_C; ++row) {
				320	const double* pa = &A[row];
				321	const double* pb = &B[col];
				322	double tmp1 = 0.0, tmp2 = 0.0;
				323	for (int k = 0; k < NUM_ROW_A; ++k) {
				324	double av = *pa;
				325	tmp1 += av * pb[0];
				326	tmp2 += av * pb[1];
				327	pa += NUM_COL_A;
				328	pb += NUM_COL_B;
				329	}
				330
				331	const int index = (row + start_row_c) * col_stride_c + start_col_c + col;
				332	CERES_GEMM_STORE_PAIR(C, index, tmp1, tmp2);
				333	}
				334
				335	// Return directly for efficiency of extremely small matrix multiply.
				336	if (NUM_COL_C < span) {
				337	return;
				338	}
				339	}
				340
				341	// Process the main part with multiples of 4.
				342	int col_m = NUM_COL_C & (int)(~(span - 1));
				343	for (int col = 0; col < col_m; col += span) {
				344	for (int row = 0; row < NUM_ROW_C; ++row) {
				345	const int index = (row + start_row_c) * col_stride_c + start_col_c + col;
Austin Schuh	1d1e6ea	2020-12-23 21:56:30 -0800	[diff] [blame^]	346	// clang-format off
Austin Schuh	70cc955	2019-01-21 19:46:48 -0800	[diff] [blame]	347	MTM_mat1x4(NUM_ROW_A, &A[row], NUM_COL_A,
				348	&B[col], NUM_COL_B, &C[index], kOperation);
Austin Schuh	1d1e6ea	2020-12-23 21:56:30 -0800	[diff] [blame^]	349	// clang-format on
Austin Schuh	70cc955	2019-01-21 19:46:48 -0800	[diff] [blame]	350	}
				351	}
Austin Schuh	70cc955	2019-01-21 19:46:48 -0800	[diff] [blame]	352	}
				353
				354	CERES_GEMM_BEGIN(MatrixTransposeMatrixMultiply) {
				355	#ifdef CERES_NO_CUSTOM_BLAS
				356
				357	CERES_CALL_GEMM(MatrixTransposeMatrixMultiplyEigen)
				358	return;
				359
				360	#else
				361
				362	if (kRowA != Eigen::Dynamic && kColA != Eigen::Dynamic &&
				363	kRowB != Eigen::Dynamic && kColB != Eigen::Dynamic) {
				364	CERES_CALL_GEMM(MatrixTransposeMatrixMultiplyEigen)
				365	} else {
				366	CERES_CALL_GEMM(MatrixTransposeMatrixMultiplyNaive)
				367	}
				368
				369	#endif
				370	}
				371
				372	// Matrix-Vector multiplication
				373	//
				374	// c op A * b;
				375	//
				376	// where op can be +=, -=, or =.
				377	//
				378	// The template parameters (kRowA, kColA) allow specialization of the
				379	// loop at compile time. If this information is not available, then
				380	// Eigen::Dynamic should be used as the template argument.
				381	//
				382	// kOperation = 1 -> c += A' * b
				383	// kOperation = -1 -> c -= A' * b
				384	// kOperation = 0 -> c = A' * b
Austin Schuh	1d1e6ea	2020-12-23 21:56:30 -0800	[diff] [blame^]	385	template <int kRowA, int kColA, int kOperation>
Austin Schuh	70cc955	2019-01-21 19:46:48 -0800	[diff] [blame]	386	inline void MatrixVectorMultiply(const double* A,
				387	const int num_row_a,
				388	const int num_col_a,
				389	const double* b,
				390	double* c) {
				391	#ifdef CERES_NO_CUSTOM_BLAS
Austin Schuh	1d1e6ea	2020-12-23 21:56:30 -0800	[diff] [blame^]	392	const typename EigenTypes<kRowA, kColA>::ConstMatrixRef Aref(
				393	A, num_row_a, num_col_a);
Austin Schuh	70cc955	2019-01-21 19:46:48 -0800	[diff] [blame]	394	const typename EigenTypes<kColA>::ConstVectorRef bref(b, num_col_a);
				395	typename EigenTypes<kRowA>::VectorRef cref(c, num_row_a);
				396
				397	// lazyProduct works better than .noalias() for matrix-vector
				398	// products.
				399	if (kOperation > 0) {
				400	cref += Aref.lazyProduct(bref);
				401	} else if (kOperation < 0) {
				402	cref -= Aref.lazyProduct(bref);
				403	} else {
				404	cref = Aref.lazyProduct(bref);
				405	}
				406	#else
				407
				408	DCHECK_GT(num_row_a, 0);
				409	DCHECK_GT(num_col_a, 0);
				410	DCHECK((kRowA == Eigen::Dynamic) \|\| (kRowA == num_row_a));
				411	DCHECK((kColA == Eigen::Dynamic) \|\| (kColA == num_col_a));
				412
				413	const int NUM_ROW_A = (kRowA != Eigen::Dynamic ? kRowA : num_row_a);
				414	const int NUM_COL_A = (kColA != Eigen::Dynamic ? kColA : num_col_a);
				415	const int span = 4;
				416
				417	// Calculate the remainder part first.
				418
				419	// Process the last odd row if present.
				420	if (NUM_ROW_A & 1) {
Austin Schuh	1d1e6ea	2020-12-23 21:56:30 -0800	[diff] [blame^]	421	int row = NUM_ROW_A - 1;
Austin Schuh	70cc955	2019-01-21 19:46:48 -0800	[diff] [blame]	422	const double* pa = &A[row * NUM_COL_A];
				423	const double* pb = &b[0];
				424	double tmp = 0.0;
				425	for (int col = 0; col < NUM_COL_A; ++col) {
				426	tmp += (pa++) (*pb++);
				427	}
				428	CERES_GEMM_STORE_SINGLE(c, row, tmp);
				429
				430	// Return directly for efficiency of extremely small matrix multiply.
				431	if (NUM_ROW_A == 1) {
				432	return;
				433	}
				434	}
				435
				436	// Process the couple rows in remainder if present.
				437	if (NUM_ROW_A & 2) {
				438	int row = NUM_ROW_A & (int)(~(span - 1));
				439	const double* pa1 = &A[row * NUM_COL_A];
				440	const double* pa2 = pa1 + NUM_COL_A;
				441	const double* pb = &b[0];
				442	double tmp1 = 0.0, tmp2 = 0.0;
				443	for (int col = 0; col < NUM_COL_A; ++col) {
				444	double bv = *pb++;
				445	tmp1 += (pa1++) bv;
				446	tmp2 += (pa2++) bv;
				447	}
				448	CERES_GEMM_STORE_PAIR(c, row, tmp1, tmp2);
				449
				450	// Return directly for efficiency of extremely small matrix multiply.
				451	if (NUM_ROW_A < span) {
				452	return;
				453	}
				454	}
				455
				456	// Calculate the main part with multiples of 4.
				457	int row_m = NUM_ROW_A & (int)(~(span - 1));
				458	for (int row = 0; row < row_m; row += span) {
Austin Schuh	1d1e6ea	2020-12-23 21:56:30 -0800	[diff] [blame^]	459	// clang-format off
Austin Schuh	70cc955	2019-01-21 19:46:48 -0800	[diff] [blame]	460	MVM_mat4x1(NUM_COL_A, &A[row * NUM_COL_A], NUM_COL_A,
				461	&b[0], &c[row], kOperation);
Austin Schuh	1d1e6ea	2020-12-23 21:56:30 -0800	[diff] [blame^]	462	// clang-format on
Austin Schuh	70cc955	2019-01-21 19:46:48 -0800	[diff] [blame]	463	}
				464
				465	#endif // CERES_NO_CUSTOM_BLAS
				466	}
				467
				468	// Similar to MatrixVectorMultiply, except that A is transposed, i.e.,
				469	//
				470	// c op A' * b;
Austin Schuh	1d1e6ea	2020-12-23 21:56:30 -0800	[diff] [blame^]	471	template <int kRowA, int kColA, int kOperation>
Austin Schuh	70cc955	2019-01-21 19:46:48 -0800	[diff] [blame]	472	inline void MatrixTransposeVectorMultiply(const double* A,
				473	const int num_row_a,
				474	const int num_col_a,
				475	const double* b,
				476	double* c) {
				477	#ifdef CERES_NO_CUSTOM_BLAS
Austin Schuh	1d1e6ea	2020-12-23 21:56:30 -0800	[diff] [blame^]	478	const typename EigenTypes<kRowA, kColA>::ConstMatrixRef Aref(
				479	A, num_row_a, num_col_a);
Austin Schuh	70cc955	2019-01-21 19:46:48 -0800	[diff] [blame]	480	const typename EigenTypes<kRowA>::ConstVectorRef bref(b, num_row_a);
				481	typename EigenTypes<kColA>::VectorRef cref(c, num_col_a);
				482
				483	// lazyProduct works better than .noalias() for matrix-vector
				484	// products.
				485	if (kOperation > 0) {
				486	cref += Aref.transpose().lazyProduct(bref);
				487	} else if (kOperation < 0) {
				488	cref -= Aref.transpose().lazyProduct(bref);
				489	} else {
				490	cref = Aref.transpose().lazyProduct(bref);
				491	}
				492	#else
				493
				494	DCHECK_GT(num_row_a, 0);
				495	DCHECK_GT(num_col_a, 0);
				496	DCHECK((kRowA == Eigen::Dynamic) \|\| (kRowA == num_row_a));
				497	DCHECK((kColA == Eigen::Dynamic) \|\| (kColA == num_col_a));
				498
				499	const int NUM_ROW_A = (kRowA != Eigen::Dynamic ? kRowA : num_row_a);
				500	const int NUM_COL_A = (kColA != Eigen::Dynamic ? kColA : num_col_a);
				501	const int span = 4;
				502
				503	// Calculate the remainder part first.
				504
				505	// Process the last odd column if present.
				506	if (NUM_COL_A & 1) {
Austin Schuh	1d1e6ea	2020-12-23 21:56:30 -0800	[diff] [blame^]	507	int row = NUM_COL_A - 1;
Austin Schuh	70cc955	2019-01-21 19:46:48 -0800	[diff] [blame]	508	const double* pa = &A[row];
				509	const double* pb = &b[0];
				510	double tmp = 0.0;
				511	for (int col = 0; col < NUM_ROW_A; ++col) {
				512	tmp += pa (*pb++);
				513	pa += NUM_COL_A;
				514	}
				515	CERES_GEMM_STORE_SINGLE(c, row, tmp);
				516
				517	// Return directly for efficiency of extremely small matrix multiply.
				518	if (NUM_COL_A == 1) {
				519	return;
				520	}
				521	}
				522
				523	// Process the couple columns in remainder if present.
				524	if (NUM_COL_A & 2) {
				525	int row = NUM_COL_A & (int)(~(span - 1));
				526	const double* pa = &A[row];
				527	const double* pb = &b[0];
				528	double tmp1 = 0.0, tmp2 = 0.0;
				529	for (int col = 0; col < NUM_ROW_A; ++col) {
Austin Schuh	1d1e6ea	2020-12-23 21:56:30 -0800	[diff] [blame^]	530	// clang-format off
Austin Schuh	70cc955	2019-01-21 19:46:48 -0800	[diff] [blame]	531	double bv = *pb++;
				532	tmp1 += (pa ) bv;
				533	tmp2 += (pa + 1) bv;
				534	pa += NUM_COL_A;
Austin Schuh	1d1e6ea	2020-12-23 21:56:30 -0800	[diff] [blame^]	535	// clang-format on
Austin Schuh	70cc955	2019-01-21 19:46:48 -0800	[diff] [blame]	536	}
				537	CERES_GEMM_STORE_PAIR(c, row, tmp1, tmp2);
				538
				539	// Return directly for efficiency of extremely small matrix multiply.
				540	if (NUM_COL_A < span) {
				541	return;
				542	}
				543	}
				544
				545	// Calculate the main part with multiples of 4.
				546	int row_m = NUM_COL_A & (int)(~(span - 1));
				547	for (int row = 0; row < row_m; row += span) {
Austin Schuh	1d1e6ea	2020-12-23 21:56:30 -0800	[diff] [blame^]	548	// clang-format off
Austin Schuh	70cc955	2019-01-21 19:46:48 -0800	[diff] [blame]	549	MTV_mat4x1(NUM_ROW_A, &A[row], NUM_COL_A,
				550	&b[0], &c[row], kOperation);
Austin Schuh	1d1e6ea	2020-12-23 21:56:30 -0800	[diff] [blame^]	551	// clang-format on
Austin Schuh	70cc955	2019-01-21 19:46:48 -0800	[diff] [blame]	552	}
				553
				554	#endif // CERES_NO_CUSTOM_BLAS
				555	}
				556
				557	#undef CERES_GEMM_BEGIN
				558	#undef CERES_GEMM_EIGEN_HEADER
				559	#undef CERES_GEMM_NAIVE_HEADER
				560	#undef CERES_CALL_GEMM
				561	#undef CERES_GEMM_STORE_SINGLE
				562	#undef CERES_GEMM_STORE_PAIR
				563
				564	} // namespace internal
				565	} // namespace ceres
				566
				567	#endif // CERES_INTERNAL_SMALL_BLAS_H_