blob: 93ee338813a2085b43de6999b7bde057d6780c49 [file] [log] [blame]
Austin Schuh70cc9552019-01-21 19:46:48 -08001// Ceres Solver - A fast non-linear least squares minimizer
Austin Schuh3de38b02024-06-25 18:25:10 -07002// Copyright 2023 Google Inc. All rights reserved.
Austin Schuh70cc9552019-01-21 19:46:48 -08003// http://ceres-solver.org/
4//
5// Redistribution and use in source and binary forms, with or without
6// modification, are permitted provided that the following conditions are met:
7//
8// * Redistributions of source code must retain the above copyright notice,
9// this list of conditions and the following disclaimer.
10// * Redistributions in binary form must reproduce the above copyright notice,
11// this list of conditions and the following disclaimer in the documentation
12// and/or other materials provided with the distribution.
13// * Neither the name of Google Inc. nor the names of its contributors may be
14// used to endorse or promote products derived from this software without
15// specific prior written permission.
16//
17// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
18// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
19// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
20// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
21// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
22// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
23// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
24// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
25// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
26// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
27// POSSIBILITY OF SUCH DAMAGE.
28//
29// Author: yangfan34@lenovo.com (Lenovo Research Device+ Lab - Shanghai)
30//
31// Optimization for simple blas functions used in the Schur Eliminator.
32// These are fairly basic implementations which already yield a significant
33// speedup in the eliminator performance.
34
35#ifndef CERES_INTERNAL_SMALL_BLAS_GENERIC_H_
36#define CERES_INTERNAL_SMALL_BLAS_GENERIC_H_
37
Austin Schuh3de38b02024-06-25 18:25:10 -070038namespace ceres::internal {
Austin Schuh70cc9552019-01-21 19:46:48 -080039
// Shared tail for the four kernels below: commits the locally
// accumulated 4-element panel `acc` into the output array `c`.
// The operation is selected by the sign of kOperation:
//
//   kOperation > 0   ->  c[i] += acc[i]
//   kOperation < 0   ->  c[i] -= acc[i]
//   kOperation == 0  ->  c[i]  = acc[i]
static inline void CeresGemmStorePanel4(const double acc[4],
                                        double* c,
                                        const int kOperation) {
  if (kOperation > 0) {
    c[0] += acc[0];
    c[1] += acc[1];
    c[2] += acc[2];
    c[3] += acc[3];
  } else if (kOperation < 0) {
    c[0] -= acc[0];
    c[1] -= acc[1];
    c[2] -= acc[2];
    c[3] -= acc[3];
  } else {
    c[0] = acc[0];
    c[1] = acc[1];
    c[2] = acc[2];
    c[3] = acc[3];
  }
}

// Matrix-Matrix Multiplication, producing a 1x4 strip of C per call:
//
//   c op a * B
//
// where op is +=, -=, or =, selected by the sign of kOperation.
//
// `a` is a dense row vector with col_a entries (col_a is the number of
// columns of A). `b` points at a matrix whose rows are col_stride_b
// doubles apart; the first col_a rows and the first 4 columns of each
// row are read. `c` receives 4 doubles.
//
// The loop accumulates terms in increasing k for each of the four
// accumulators, so the floating-point result does not depend on how the
// compiler chooses to unroll it.
static inline void MMM_mat1x4(const int col_a,
                              const double* a,
                              const double* b,
                              const int col_stride_b,
                              double* c,
                              const int kOperation) {
  double acc[4] = {0.0, 0.0, 0.0, 0.0};
  for (int k = 0; k < col_a; ++k) {
    const double ak = a[k];
    const double* brow = b + k * col_stride_b;
    acc[0] += ak * brow[0];
    acc[1] += ak * brow[1];
    acc[2] += ak * brow[2];
    acc[3] += ak * brow[3];
  }
  CeresGemmStorePanel4(acc, c, kOperation);
}

// Matrix Transpose-Matrix multiplication, producing a 1x4 strip of C:
//
//   c op a' * B
//
// where op is +=, -=, or =, selected by the sign of kOperation.
//
// `a` points at the first entry of a column of A; consecutive entries of
// that column are col_stride_a doubles apart. col_a is the number of
// columns of A', i.e. the length of the column. `b` and `c` are laid out
// exactly as in MMM_mat1x4.
static inline void MTM_mat1x4(const int col_a,
                              const double* a,
                              const int col_stride_a,
                              const double* b,
                              const int col_stride_b,
                              double* c,
                              const int kOperation) {
  double acc[4] = {0.0, 0.0, 0.0, 0.0};
  for (int k = 0; k < col_a; ++k) {
    const double ak = a[k * col_stride_a];
    const double* brow = b + k * col_stride_b;
    acc[0] += ak * brow[0];
    acc[1] += ak * brow[1];
    acc[2] += ak * brow[2];
    acc[3] += ak * brow[3];
  }
  CeresGemmStorePanel4(acc, c, kOperation);
}

// Matrix-Vector Multiplication, producing 4 entries of c per call:
//
//   c op A * b
//
// where op is +=, -=, or =, selected by the sign of kOperation.
//
// `a` points at the first of 4 consecutive rows of A, stored row-major
// with row stride col_stride_a; each row has col_a entries (col_a is the
// number of columns of A). `b` is a dense vector with col_a entries and
// `c` receives 4 doubles.
static inline void MVM_mat4x1(const int col_a,
                              const double* a,
                              const int col_stride_a,
                              const double* b,
                              double* c,
                              const int kOperation) {
  double acc[4] = {0.0, 0.0, 0.0, 0.0};
  for (int k = 0; k < col_a; ++k) {
    const double bk = b[k];
    acc[0] += a[k] * bk;
    acc[1] += a[k + col_stride_a] * bk;
    acc[2] += a[k + col_stride_a * 2] * bk;
    acc[3] += a[k + col_stride_a * 3] * bk;
  }
  CeresGemmStorePanel4(acc, c, kOperation);
}

// Matrix Transpose-Vector multiplication, producing 4 entries of c:
//
//   c op A' * b
//
// where op is +=, -=, or =, selected by the sign of kOperation.
//
// `a` points at a matrix A with col_a rows stored with row stride
// col_stride_a; only the first 4 entries of each row are read (they form
// the 4 rows of A' computed here). col_a is the number of columns of A'.
// `b` is a dense vector with col_a entries and `c` receives 4 doubles.
static inline void MTV_mat4x1(const int col_a,
                              const double* a,
                              const int col_stride_a,
                              const double* b,
                              double* c,
                              const int kOperation) {
  double acc[4] = {0.0, 0.0, 0.0, 0.0};
  for (int k = 0; k < col_a; ++k) {
    const double bk = b[k];
    const double* arow = a + k * col_stride_a;
    acc[0] += arow[0] * bk;
    acc[1] += arow[1] * bk;
    acc[2] += arow[2] * bk;
    acc[3] += arow[3] * bk;
  }
  CeresGemmStorePanel4(acc, c, kOperation);
}
309
310#undef CERES_GEMM_OPT_NAIVE_HEADER
311#undef CERES_GEMM_OPT_STORE_MAT1X4
312
Austin Schuh3de38b02024-06-25 18:25:10 -0700313} // namespace ceres::internal
Austin Schuh70cc9552019-01-21 19:46:48 -0800314
315#endif // CERES_INTERNAL_SMALL_BLAS_GENERIC_H_