Blame - internal/ceres/small_blas_generic.h - RealtimeRoboticsGroup/test

blob: 3f3ea424c809dd6300dc8f4287dd5f190cd7c02e [file] [log] [blame]

Austin Schuh	70cc955	2019-01-21 19:46:48 -0800	[diff] [blame]	1	// Ceres Solver - A fast non-linear least squares minimizer
				2	// Copyright 2018 Google Inc. All rights reserved.
				3	// http://ceres-solver.org/
				4	//
				5	// Redistribution and use in source and binary forms, with or without
				6	// modification, are permitted provided that the following conditions are met:
				7	//
				8	// * Redistributions of source code must retain the above copyright notice,
				9	// this list of conditions and the following disclaimer.
				10	// * Redistributions in binary form must reproduce the above copyright notice,
				11	// this list of conditions and the following disclaimer in the documentation
				12	// and/or other materials provided with the distribution.
				13	// * Neither the name of Google Inc. nor the names of its contributors may be
				14	// used to endorse or promote products derived from this software without
				15	// specific prior written permission.
				16	//
				17	// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
				18	// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
				19	// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
				20	// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
				21	// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
				22	// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
				23	// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
				24	// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
				25	// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
				26	// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
				27	// POSSIBILITY OF SUCH DAMAGE.
				28	//
				29	// Author: yangfan34@lenovo.com (Lenovo Research Device+ Lab - Shanghai)
				30	//
				31	// Optimization for simple blas functions used in the Schur Eliminator.
				32	// These are fairly basic implementations which already yield a significant
				33	// speedup in the eliminator performance.
				34
				35	#ifndef CERES_INTERNAL_SMALL_BLAS_GENERIC_H_
				36	#define CERES_INTERNAL_SMALL_BLAS_GENERIC_H_
				37
				38	namespace ceres {
				39	namespace internal {
				40
				41	// The following macros are used to share code
Austin Schuh	1d1e6ea	2020-12-23 21:56:30 -0800	[diff] [blame^]	42	#define CERES_GEMM_OPT_NAIVE_HEADER \
				43	double c0 = 0.0; \
				44	double c1 = 0.0; \
				45	double c2 = 0.0; \
				46	double c3 = 0.0; \
				47	const double* pa = a; \
				48	const double* pb = b; \
				49	const int span = 4; \
				50	int col_r = col_a & (span - 1); \
Austin Schuh	70cc955	2019-01-21 19:46:48 -0800	[diff] [blame]	51	int col_m = col_a - col_r;
				52
Austin Schuh	1d1e6ea	2020-12-23 21:56:30 -0800	[diff] [blame^]	53	#define CERES_GEMM_OPT_STORE_MAT1X4 \
				54	if (kOperation > 0) { \
				55	*c++ += c0; \
				56	*c++ += c1; \
				57	*c++ += c2; \
				58	*c++ += c3; \
				59	} else if (kOperation < 0) { \
				60	*c++ -= c0; \
				61	*c++ -= c1; \
				62	*c++ -= c2; \
				63	*c++ -= c3; \
				64	} else { \
				65	*c++ = c0; \
				66	*c++ = c1; \
				67	*c++ = c2; \
				68	*c++ = c3; \
Austin Schuh	70cc955	2019-01-21 19:46:48 -0800	[diff] [blame]	69	}
				70
				71	// Matrix-Matrix Multiplication
				72	// Figure out 1x4 of Matrix C in one batch
				73	//
				74	// c op a * B;
				75	// where op can be +=, -=, or =, indicated by kOperation.
				76	//
				77	// Matrix C Matrix A Matrix B
				78	//
				79	// C0, C1, C2, C3 op A0, A1, A2, A3, ... * B0, B1, B2, B3
				80	// B4, B5, B6, B7
				81	// B8, B9, Ba, Bb
				82	// Bc, Bd, Be, Bf
				83	// . , . , . , .
				84	// . , . , . , .
				85	// . , . , . , .
				86	//
				87	// unroll for loops
				88	// utilize the data resided in cache
				89	// NOTE: col_a means the columns of A
				90	static inline void MMM_mat1x4(const int col_a,
				91	const double* a,
				92	const double* b,
				93	const int col_stride_b,
				94	double* c,
				95	const int kOperation) {
				96	CERES_GEMM_OPT_NAIVE_HEADER
				97	double av = 0.0;
				98	int bi = 0;
				99
Austin Schuh	1d1e6ea	2020-12-23 21:56:30 -0800	[diff] [blame^]	100	#define CERES_GEMM_OPT_MMM_MAT1X4_MUL \
				101	av = pa[k]; \
				102	pb = b + bi; \
				103	c0 += av * *pb++; \
				104	c1 += av * *pb++; \
				105	c2 += av * *pb++; \
				106	c3 += av * *pb++; \
				107	bi += col_stride_b; \
Austin Schuh	70cc955	2019-01-21 19:46:48 -0800	[diff] [blame]	108	k++;
				109
				110	for (int k = 0; k < col_m;) {
				111	CERES_GEMM_OPT_MMM_MAT1X4_MUL
				112	CERES_GEMM_OPT_MMM_MAT1X4_MUL
				113	CERES_GEMM_OPT_MMM_MAT1X4_MUL
				114	CERES_GEMM_OPT_MMM_MAT1X4_MUL
				115	}
				116
				117	for (int k = col_m; k < col_a;) {
				118	CERES_GEMM_OPT_MMM_MAT1X4_MUL
				119	}
				120
				121	CERES_GEMM_OPT_STORE_MAT1X4
				122
				123	#undef CERES_GEMM_OPT_MMM_MAT1X4_MUL
				124	}
				125
				126	// Matrix Transpose-Matrix multiplication
				127	// Figure out 1x4 of Matrix C in one batch
				128	//
				129	// c op a' * B;
				130	// where op can be +=, -=, or = indicated by kOperation.
				131	//
				132	// Matrix A
				133	//
				134	// A0
				135	// A1
				136	// A2
				137	// A3
				138	// .
				139	// .
				140	// .
				141	//
				142	// Matrix C Matrix A' Matrix B
				143	//
				144	// C0, C1, C2, C3 op A0, A1, A2, A3, ... * B0, B1, B2, B3
				145	// B4, B5, B6, B7
				146	// B8, B9, Ba, Bb
				147	// Bc, Bd, Be, Bf
				148	// . , . , . , .
				149	// . , . , . , .
				150	// . , . , . , .
				151	//
				152	// unroll for loops
				153	// utilize the data resided in cache
				154	// NOTE: col_a means the columns of A'
				155	static inline void MTM_mat1x4(const int col_a,
				156	const double* a,
				157	const int col_stride_a,
				158	const double* b,
				159	const int col_stride_b,
				160	double* c,
				161	const int kOperation) {
				162	CERES_GEMM_OPT_NAIVE_HEADER
				163	double av = 0.0;
				164	int ai = 0;
				165	int bi = 0;
				166
Austin Schuh	1d1e6ea	2020-12-23 21:56:30 -0800	[diff] [blame^]	167	#define CERES_GEMM_OPT_MTM_MAT1X4_MUL \
				168	av = pa[ai]; \
				169	pb = b + bi; \
				170	c0 += av * *pb++; \
				171	c1 += av * *pb++; \
				172	c2 += av * *pb++; \
				173	c3 += av * *pb++; \
				174	ai += col_stride_a; \
Austin Schuh	70cc955	2019-01-21 19:46:48 -0800	[diff] [blame]	175	bi += col_stride_b;
				176
				177	for (int k = 0; k < col_m; k += span) {
				178	CERES_GEMM_OPT_MTM_MAT1X4_MUL
				179	CERES_GEMM_OPT_MTM_MAT1X4_MUL
				180	CERES_GEMM_OPT_MTM_MAT1X4_MUL
				181	CERES_GEMM_OPT_MTM_MAT1X4_MUL
				182	}
				183
				184	for (int k = col_m; k < col_a; k++) {
				185	CERES_GEMM_OPT_MTM_MAT1X4_MUL
				186	}
				187
				188	CERES_GEMM_OPT_STORE_MAT1X4
				189
				190	#undef CERES_GEMM_OPT_MTM_MAT1X4_MUL
				191	}
				192
				193	// Matrix-Vector Multiplication
				194	// Figure out 4x1 of vector c in one batch
				195	//
				196	// c op A * b;
				197	// where op can be +=, -=, or =, indicated by kOperation.
				198	//
				199	// Vector c Matrix A Vector b
				200	//
				201	// C0 op A0, A1, A2, A3, ... * B0
				202	// C1 A4, A5, A6, A7, ... B1
				203	// C2 A8, A9, Aa, Ab, ... B2
				204	// C3 Ac, Ad, Ae, Af, ... B3
				205	// .
				206	// .
				207	// .
				208	//
				209	// unroll for loops
				210	// utilize the data resided in cache
				211	// NOTE: col_a means the columns of A
				212	static inline void MVM_mat4x1(const int col_a,
				213	const double* a,
				214	const int col_stride_a,
				215	const double* b,
				216	double* c,
				217	const int kOperation) {
				218	CERES_GEMM_OPT_NAIVE_HEADER
				219	double bv = 0.0;
				220
Austin Schuh	1d1e6ea	2020-12-23 21:56:30 -0800	[diff] [blame^]	221	// clang-format off
				222	#define CERES_GEMM_OPT_MVM_MAT4X1_MUL \
				223	bv = *pb; \
				224	c0 += (pa ) bv; \
				225	c1 += (pa + col_stride_a ) bv; \
				226	c2 += (pa + col_stride_a 2) * bv; \
				227	c3 += (pa + col_stride_a 3) * bv; \
				228	pa++; \
Austin Schuh	70cc955	2019-01-21 19:46:48 -0800	[diff] [blame]	229	pb++;
Austin Schuh	1d1e6ea	2020-12-23 21:56:30 -0800	[diff] [blame^]	230	// clang-format on
Austin Schuh	70cc955	2019-01-21 19:46:48 -0800	[diff] [blame]	231
				232	for (int k = 0; k < col_m; k += span) {
				233	CERES_GEMM_OPT_MVM_MAT4X1_MUL
				234	CERES_GEMM_OPT_MVM_MAT4X1_MUL
				235	CERES_GEMM_OPT_MVM_MAT4X1_MUL
				236	CERES_GEMM_OPT_MVM_MAT4X1_MUL
				237	}
				238
				239	for (int k = col_m; k < col_a; k++) {
				240	CERES_GEMM_OPT_MVM_MAT4X1_MUL
				241	}
				242
				243	CERES_GEMM_OPT_STORE_MAT1X4
				244
				245	#undef CERES_GEMM_OPT_MVM_MAT4X1_MUL
				246	}
				247
				248	// Matrix Transpose-Vector multiplication
				249	// Figure out 4x1 of vector c in one batch
				250	//
				251	// c op A' * b;
				252	// where op can be +=, -=, or =, indicated by kOperation.
				253	//
				254	// Matrix A
				255	//
				256	// A0, A4, A8, Ac
				257	// A1, A5, A9, Ad
				258	// A2, A6, Aa, Ae
				259	// A3, A7, Ab, Af
				260	// . , . , . , .
				261	// . , . , . , .
				262	// . , . , . , .
				263	//
				264	// Vector c Matrix A' Vector b
				265	//
				266	// C0 op A0, A1, A2, A3, ... * B0
				267	// C1 A4, A5, A6, A7, ... B1
				268	// C2 A8, A9, Aa, Ab, ... B2
				269	// C3 Ac, Ad, Ae, Af, ... B3
				270	// .
				271	// .
				272	// .
				273	//
				274	// unroll for loops
				275	// utilize the data resided in cache
				276	// NOTE: col_a means the columns of A'
				277	static inline void MTV_mat4x1(const int col_a,
				278	const double* a,
				279	const int col_stride_a,
				280	const double* b,
				281	double* c,
				282	const int kOperation) {
				283	CERES_GEMM_OPT_NAIVE_HEADER
				284	double bv = 0.0;
				285
Austin Schuh	1d1e6ea	2020-12-23 21:56:30 -0800	[diff] [blame^]	286	// clang-format off
				287	#define CERES_GEMM_OPT_MTV_MAT4X1_MUL \
				288	bv = *pb; \
				289	c0 += (pa ) bv; \
				290	c1 += (pa + 1) bv; \
				291	c2 += (pa + 2) bv; \
				292	c3 += (pa + 3) bv; \
				293	pa += col_stride_a; \
Austin Schuh	70cc955	2019-01-21 19:46:48 -0800	[diff] [blame]	294	pb++;
Austin Schuh	1d1e6ea	2020-12-23 21:56:30 -0800	[diff] [blame^]	295	// clang-format on
Austin Schuh	70cc955	2019-01-21 19:46:48 -0800	[diff] [blame]	296
				297	for (int k = 0; k < col_m; k += span) {
				298	CERES_GEMM_OPT_MTV_MAT4X1_MUL
				299	CERES_GEMM_OPT_MTV_MAT4X1_MUL
				300	CERES_GEMM_OPT_MTV_MAT4X1_MUL
				301	CERES_GEMM_OPT_MTV_MAT4X1_MUL
				302	}
				303
				304	for (int k = col_m; k < col_a; k++) {
				305	CERES_GEMM_OPT_MTV_MAT4X1_MUL
				306	}
				307
				308	CERES_GEMM_OPT_STORE_MAT1X4
				309
				310	#undef CERES_GEMM_OPT_MTV_MAT4X1_MUL
				311	}
				312
				313	#undef CERES_GEMM_OPT_NAIVE_HEADER
				314	#undef CERES_GEMM_OPT_STORE_MAT1X4
				315
				316	} // namespace internal
				317	} // namespace ceres
				318
				319	#endif // CERES_INTERNAL_SMALL_BLAS_GENERIC_H_