// Ceres Solver - A fast non-linear least squares minimizer
// Copyright 2023 Google Inc. All rights reserved.
// http://ceres-solver.org/
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are met:
//
// * Redistributions of source code must retain the above copyright notice,
//   this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright notice,
//   this list of conditions and the following disclaimer in the documentation
//   and/or other materials provided with the distribution.
// * Neither the name of Google Inc. nor the names of its contributors may be
//   used to endorse or promote products derived from this software without
//   specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
// POSSIBILITY OF SUCH DAMAGE.
//
// Authors: dmitriy.korchemkin@gmail.com (Dmitriy Korchemkin)

#include <memory>
#include <random>
#include <string>
#include <vector>

#include "benchmark/benchmark.h"
#include "ceres/block_sparse_matrix.h"
#include "ceres/bundle_adjustment_test_util.h"
#include "ceres/cuda_block_sparse_crs_view.h"
#include "ceres/cuda_partitioned_block_sparse_crs_view.h"
#include "ceres/cuda_sparse_matrix.h"
#include "ceres/cuda_vector.h"
#include "ceres/evaluator.h"
#include "ceres/implicit_schur_complement.h"
#include "ceres/partitioned_matrix_view.h"
#include "ceres/power_series_expansion_preconditioner.h"
#include "ceres/preprocessor.h"
#include "ceres/problem.h"
#include "ceres/problem_impl.h"
#include "ceres/program.h"
#include "ceres/sparse_matrix.h"

namespace ceres::internal {

template <typename Derived, typename Base>
std::unique_ptr<Derived> downcast_unique_ptr(std::unique_ptr<Base>& base) {
  return std::unique_ptr<Derived>(dynamic_cast<Derived*>(base.release()));
}
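// Example (mirroring the usage further below): extracting the concrete
// BlockSparseMatrix from the SparseMatrix jacobian returned by the Evaluator:
//
//   auto jacobian = evaluator->CreateJacobian();
//   auto block_sparse = downcast_unique_ptr<BlockSparseMatrix>(jacobian);
//
// Note that ownership is released before the dynamic_cast, so a failed cast
// leaks the object; callers below CHECK that the result is non-null.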

// The benchmark library might invoke a benchmark function multiple times.
// To save the time required to parse BAL data, we ensure that each dataset
// is loaded at most once. Each type of jacobian is also cached after its
// first creation.
struct BALData {
  using PartitionedView = PartitionedMatrixView<2, 3, 9>;
  explicit BALData(const std::string& path) {
    bal_problem = std::make_unique<BundleAdjustmentProblem>(path);
    CHECK(bal_problem != nullptr);

    auto problem_impl = bal_problem->mutable_problem()->mutable_impl();
    auto preprocessor = Preprocessor::Create(MinimizerType::TRUST_REGION);

    preprocessed_problem = std::make_unique<PreprocessedProblem>();
    Solver::Options options = bal_problem->options();
    options.linear_solver_type = ITERATIVE_SCHUR;
    CHECK(preprocessor->Preprocess(
        options, problem_impl, preprocessed_problem.get()));

    auto program = preprocessed_problem->reduced_program.get();

    parameters.resize(program->NumParameters());
    program->ParameterBlocksToStateVector(parameters.data());

    const int num_residuals = program->NumResiduals();
    b.resize(num_residuals);

    std::mt19937 rng;
    std::normal_distribution<double> rnorm;
    for (int i = 0; i < num_residuals; ++i) {
      b[i] = rnorm(rng);
    }

    const int num_parameters = program->NumParameters();
    D.resize(num_parameters);
    for (int i = 0; i < num_parameters; ++i) {
      D[i] = rnorm(rng);
    }
  }

  std::unique_ptr<BlockSparseMatrix> CreateBlockSparseJacobian(
      ContextImpl* context, bool sequential) {
    auto problem = bal_problem->mutable_problem();
    auto problem_impl = problem->mutable_impl();
    CHECK(problem_impl != nullptr);

    Evaluator::Options options;
    options.linear_solver_type = ITERATIVE_SCHUR;
    options.num_threads = 1;
    options.context = context;
    options.num_eliminate_blocks = bal_problem->num_points();

    std::string error;
    auto program = preprocessed_problem->reduced_program.get();
    auto evaluator = Evaluator::Create(options, program, &error);
    CHECK(evaluator != nullptr);

    auto jacobian = evaluator->CreateJacobian();
    auto block_sparse = downcast_unique_ptr<BlockSparseMatrix>(jacobian);
    CHECK(block_sparse != nullptr);

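    // When `sequential` is set, recompute the cell positions so that values
    // are laid out contiguously in row-block order (presumably the ordering
    // assumed by the CRS conversions and views benchmarked below).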
    if (sequential) {
      auto block_structure_sequential =
          std::make_unique<CompressedRowBlockStructure>(
              *block_sparse->block_structure());
      int num_nonzeros = 0;
      for (auto& row_block : block_structure_sequential->rows) {
        const int row_block_size = row_block.block.size;
        for (auto& cell : row_block.cells) {
          const int col_block_size =
              block_structure_sequential->cols[cell.block_id].size;
          cell.position = num_nonzeros;
          num_nonzeros += col_block_size * row_block_size;
        }
      }
      block_sparse = std::make_unique<BlockSparseMatrix>(
          block_structure_sequential.release(),
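          // The second constructor argument requests page-locked (pinned)
          // host memory when CUDA is enabled; presumably this speeds up
          // host-to-device transfers of the values.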
#ifndef CERES_NO_CUDA
          true
#else
          false
#endif
      );
    }

    std::mt19937 rng;
    std::normal_distribution<double> rnorm;
    const int nnz = block_sparse->num_nonzeros();
    auto values = block_sparse->mutable_values();
    for (int i = 0; i < nnz; ++i) {
      values[i] = rnorm(rng);
    }

    return block_sparse;
  }

  std::unique_ptr<CompressedRowSparseMatrix> CreateCompressedRowSparseJacobian(
      ContextImpl* context) {
    auto block_sparse = BlockSparseJacobian(context);
    return block_sparse->ToCompressedRowSparseMatrix();
  }

  const BlockSparseMatrix* BlockSparseJacobian(ContextImpl* context) {
    if (!block_sparse_jacobian) {
      block_sparse_jacobian = CreateBlockSparseJacobian(context, true);
    }
    return block_sparse_jacobian.get();
  }

  const BlockSparseMatrix* BlockSparseJacobianPartitioned(
      ContextImpl* context) {
    if (!block_sparse_jacobian_partitioned) {
      block_sparse_jacobian_partitioned =
          CreateBlockSparseJacobian(context, false);
    }
    return block_sparse_jacobian_partitioned.get();
  }

  const CompressedRowSparseMatrix* CompressedRowSparseJacobian(
      ContextImpl* context) {
    if (!crs_jacobian) {
      crs_jacobian = CreateCompressedRowSparseJacobian(context);
    }
    return crs_jacobian.get();
  }

  std::unique_ptr<PartitionedView> PartitionedMatrixViewJacobian(
      const LinearSolver::Options& options) {
    auto block_sparse = BlockSparseJacobianPartitioned(options.context);
    return std::make_unique<PartitionedView>(options, *block_sparse);
  }

  BlockSparseMatrix* BlockDiagonalEtE(const LinearSolver::Options& options) {
    if (!block_diagonal_ete) {
      auto partitioned_view = PartitionedMatrixViewJacobian(options);
      block_diagonal_ete = partitioned_view->CreateBlockDiagonalEtE();
    }
    return block_diagonal_ete.get();
  }

  BlockSparseMatrix* BlockDiagonalFtF(const LinearSolver::Options& options) {
    if (!block_diagonal_ftf) {
      auto partitioned_view = PartitionedMatrixViewJacobian(options);
      block_diagonal_ftf = partitioned_view->CreateBlockDiagonalFtF();
    }
    return block_diagonal_ftf.get();
  }

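  // Initializing with a nullptr diagonal yields the plain (unregularized)
  // implicit Schur complement; the variant below also supplies the diagonal
  // D, as a trust-region solve would.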
  const ImplicitSchurComplement* ImplicitSchurComplementWithoutDiagonal(
      const LinearSolver::Options& options) {
    auto block_sparse = BlockSparseJacobianPartitioned(options.context);
    implicit_schur_complement =
        std::make_unique<ImplicitSchurComplement>(options);
    implicit_schur_complement->Init(*block_sparse, nullptr, b.data());
    return implicit_schur_complement.get();
  }

  const ImplicitSchurComplement* ImplicitSchurComplementWithDiagonal(
      const LinearSolver::Options& options) {
    auto block_sparse = BlockSparseJacobianPartitioned(options.context);
    implicit_schur_complement_diag =
        std::make_unique<ImplicitSchurComplement>(options);
    implicit_schur_complement_diag->Init(*block_sparse, D.data(), b.data());
    return implicit_schur_complement_diag.get();
  }

  Vector parameters;
  Vector D;
  Vector b;
  std::unique_ptr<BundleAdjustmentProblem> bal_problem;
  std::unique_ptr<PreprocessedProblem> preprocessed_problem;
  std::unique_ptr<BlockSparseMatrix> block_sparse_jacobian_partitioned;
  std::unique_ptr<BlockSparseMatrix> block_sparse_jacobian;
  std::unique_ptr<CompressedRowSparseMatrix> crs_jacobian;
  std::unique_ptr<BlockSparseMatrix> block_diagonal_ete;
  std::unique_ptr<BlockSparseMatrix> block_diagonal_ftf;
  std::unique_ptr<ImplicitSchurComplement> implicit_schur_complement;
  std::unique_ptr<ImplicitSchurComplement> implicit_schur_complement_diag;
};

static void Residuals(benchmark::State& state,
                      BALData* data,
                      ContextImpl* context) {
  const int num_threads = static_cast<int>(state.range(0));

  Evaluator::Options options;
  options.linear_solver_type = SPARSE_NORMAL_CHOLESKY;
  options.num_threads = num_threads;
  options.context = context;
  options.num_eliminate_blocks = 0;

  std::string error;
  CHECK(data->preprocessed_problem != nullptr);
  auto program = data->preprocessed_problem->reduced_program.get();
  CHECK(program != nullptr);
  auto evaluator = Evaluator::Create(options, program, &error);
  CHECK(evaluator != nullptr);

  double cost = 0.;
  Vector residuals = Vector::Zero(program->NumResiduals());

  Evaluator::EvaluateOptions eval_options;
  for (auto _ : state) {
    CHECK(evaluator->Evaluate(eval_options,
                              data->parameters.data(),
                              &cost,
                              residuals.data(),
                              nullptr,
                              nullptr));
  }
}

static void ResidualsAndJacobian(benchmark::State& state,
                                 BALData* data,
                                 ContextImpl* context) {
  const int num_threads = static_cast<int>(state.range(0));

  Evaluator::Options options;
  options.linear_solver_type = SPARSE_NORMAL_CHOLESKY;
  options.num_threads = num_threads;
  options.context = context;
  options.num_eliminate_blocks = 0;

  std::string error;
  CHECK(data->preprocessed_problem != nullptr);
  auto program = data->preprocessed_problem->reduced_program.get();
  CHECK(program != nullptr);
  auto evaluator = Evaluator::Create(options, program, &error);
  CHECK(evaluator != nullptr);

  double cost = 0.;
  Vector residuals = Vector::Zero(program->NumResiduals());
  auto jacobian = evaluator->CreateJacobian();

  Evaluator::EvaluateOptions eval_options;
  for (auto _ : state) {
    CHECK(evaluator->Evaluate(eval_options,
                              data->parameters.data(),
                              &cost,
                              residuals.data(),
                              nullptr,
                              jacobian.get()));
  }
}

static void Plus(benchmark::State& state, BALData* data, ContextImpl* context) {
  const int num_threads = static_cast<int>(state.range(0));

  Evaluator::Options options;
  options.linear_solver_type = SPARSE_NORMAL_CHOLESKY;
  options.num_threads = num_threads;
  options.context = context;
  options.num_eliminate_blocks = 0;

  std::string error;
  CHECK(data->preprocessed_problem != nullptr);
  auto program = data->preprocessed_problem->reduced_program.get();
  CHECK(program != nullptr);
  auto evaluator = Evaluator::Create(options, program, &error);
  CHECK(evaluator != nullptr);

  Vector state_plus_delta = Vector::Zero(program->NumParameters());
  Vector delta = Vector::Random(program->NumEffectiveParameters());

  for (auto _ : state) {
    CHECK(evaluator->Plus(
        data->parameters.data(), delta.data(), state_plus_delta.data()));
  }
  CHECK_GT(state_plus_delta.squaredNorm(), 0.);
}

static void PSEPreconditioner(benchmark::State& state,
                              BALData* data,
                              ContextImpl* context) {
  LinearSolver::Options options;
  options.num_threads = static_cast<int>(state.range(0));
  options.elimination_groups.push_back(data->bal_problem->num_points());
  options.context = context;

  auto jacobian = data->ImplicitSchurComplementWithDiagonal(options);
  Preconditioner::Options preconditioner_options(options);

  PowerSeriesExpansionPreconditioner preconditioner(
      jacobian, 10, 0, preconditioner_options);

  Vector y = Vector::Zero(jacobian->num_cols());
  Vector x = Vector::Random(jacobian->num_cols());

  for (auto _ : state) {
    preconditioner.RightMultiplyAndAccumulate(x.data(), y.data());
  }
  CHECK_GT(y.squaredNorm(), 0.);
}

static void PMVRightMultiplyAndAccumulateF(benchmark::State& state,
                                           BALData* data,
                                           ContextImpl* context) {
  LinearSolver::Options options;
  options.num_threads = static_cast<int>(state.range(0));
  options.elimination_groups.push_back(data->bal_problem->num_points());
  options.context = context;
  auto jacobian = data->PartitionedMatrixViewJacobian(options);

  Vector y = Vector::Zero(jacobian->num_rows());
  Vector x = Vector::Random(jacobian->num_cols_f());

  for (auto _ : state) {
    jacobian->RightMultiplyAndAccumulateF(x.data(), y.data());
  }
  CHECK_GT(y.squaredNorm(), 0.);
}

static void PMVLeftMultiplyAndAccumulateF(benchmark::State& state,
                                          BALData* data,
                                          ContextImpl* context) {
  LinearSolver::Options options;
  options.num_threads = static_cast<int>(state.range(0));
  options.elimination_groups.push_back(data->bal_problem->num_points());
  options.context = context;
  auto jacobian = data->PartitionedMatrixViewJacobian(options);

  Vector y = Vector::Zero(jacobian->num_cols_f());
  Vector x = Vector::Random(jacobian->num_rows());

  for (auto _ : state) {
    jacobian->LeftMultiplyAndAccumulateF(x.data(), y.data());
  }
  CHECK_GT(y.squaredNorm(), 0.);
}

static void PMVRightMultiplyAndAccumulateE(benchmark::State& state,
                                           BALData* data,
                                           ContextImpl* context) {
  LinearSolver::Options options;
  options.num_threads = static_cast<int>(state.range(0));
  options.elimination_groups.push_back(data->bal_problem->num_points());
  options.context = context;
  auto jacobian = data->PartitionedMatrixViewJacobian(options);

  Vector y = Vector::Zero(jacobian->num_rows());
  Vector x = Vector::Random(jacobian->num_cols_e());

  for (auto _ : state) {
    jacobian->RightMultiplyAndAccumulateE(x.data(), y.data());
  }
  CHECK_GT(y.squaredNorm(), 0.);
}

static void PMVLeftMultiplyAndAccumulateE(benchmark::State& state,
                                          BALData* data,
                                          ContextImpl* context) {
  LinearSolver::Options options;
  options.num_threads = static_cast<int>(state.range(0));
  options.elimination_groups.push_back(data->bal_problem->num_points());
  options.context = context;
  auto jacobian = data->PartitionedMatrixViewJacobian(options);

  Vector y = Vector::Zero(jacobian->num_cols_e());
  Vector x = Vector::Random(jacobian->num_rows());

  for (auto _ : state) {
    jacobian->LeftMultiplyAndAccumulateE(x.data(), y.data());
  }
  CHECK_GT(y.squaredNorm(), 0.);
}

static void PMVUpdateBlockDiagonalEtE(benchmark::State& state,
                                      BALData* data,
                                      ContextImpl* context) {
  LinearSolver::Options options;
  options.num_threads = static_cast<int>(state.range(0));
  options.elimination_groups.push_back(data->bal_problem->num_points());
  options.context = context;
  auto jacobian = data->PartitionedMatrixViewJacobian(options);
  auto block_diagonal_ete = data->BlockDiagonalEtE(options);

  for (auto _ : state) {
    jacobian->UpdateBlockDiagonalEtE(block_diagonal_ete);
  }
}

static void PMVUpdateBlockDiagonalFtF(benchmark::State& state,
                                      BALData* data,
                                      ContextImpl* context) {
  LinearSolver::Options options;
  options.num_threads = static_cast<int>(state.range(0));
  options.elimination_groups.push_back(data->bal_problem->num_points());
  options.context = context;
  auto jacobian = data->PartitionedMatrixViewJacobian(options);
  auto block_diagonal_ftf = data->BlockDiagonalFtF(options);

  for (auto _ : state) {
    jacobian->UpdateBlockDiagonalFtF(block_diagonal_ftf);
  }
}

static void ISCRightMultiplyNoDiag(benchmark::State& state,
                                   BALData* data,
                                   ContextImpl* context) {
  LinearSolver::Options options;
  options.num_threads = static_cast<int>(state.range(0));
  options.elimination_groups.push_back(data->bal_problem->num_points());
  options.context = context;
  auto jacobian = data->ImplicitSchurComplementWithoutDiagonal(options);

  Vector y = Vector::Zero(jacobian->num_rows());
  Vector x = Vector::Random(jacobian->num_cols());
  for (auto _ : state) {
    jacobian->RightMultiplyAndAccumulate(x.data(), y.data());
  }
  CHECK_GT(y.squaredNorm(), 0.);
}

static void ISCRightMultiplyDiag(benchmark::State& state,
                                 BALData* data,
                                 ContextImpl* context) {
  LinearSolver::Options options;
  options.num_threads = static_cast<int>(state.range(0));
  options.elimination_groups.push_back(data->bal_problem->num_points());
  options.context = context;

  auto jacobian = data->ImplicitSchurComplementWithDiagonal(options);

  Vector y = Vector::Zero(jacobian->num_rows());
  Vector x = Vector::Random(jacobian->num_cols());
  for (auto _ : state) {
    jacobian->RightMultiplyAndAccumulate(x.data(), y.data());
  }
  CHECK_GT(y.squaredNorm(), 0.);
}

static void JacobianToCRS(benchmark::State& state,
                          BALData* data,
                          ContextImpl* context) {
  auto jacobian = data->BlockSparseJacobian(context);

  std::unique_ptr<CompressedRowSparseMatrix> matrix;
  for (auto _ : state) {
    matrix = jacobian->ToCompressedRowSparseMatrix();
  }
  CHECK(matrix != nullptr);
}

#ifndef CERES_NO_CUDA
static void PMVRightMultiplyAndAccumulateFCuda(benchmark::State& state,
                                               BALData* data,
                                               ContextImpl* context) {
  LinearSolver::Options options;
  options.elimination_groups.push_back(data->bal_problem->num_points());
  options.context = context;
  options.num_threads = 1;
  auto jacobian = data->PartitionedMatrixViewJacobian(options);
  auto underlying_matrix = data->BlockSparseJacobianPartitioned(context);
  CudaPartitionedBlockSparseCRSView view(
      *underlying_matrix, jacobian->num_col_blocks_e(), context);

  Vector x = Vector::Random(jacobian->num_cols_f());
  CudaVector cuda_x(context, x.size());
  CudaVector cuda_y(context, jacobian->num_rows());

  cuda_x.CopyFromCpu(x);
  cuda_y.SetZero();

  auto matrix = view.matrix_f();
  for (auto _ : state) {
    matrix->RightMultiplyAndAccumulate(cuda_x, &cuda_y);
  }
  CHECK_GT(cuda_y.Norm(), 0.);
}

static void PMVLeftMultiplyAndAccumulateFCuda(benchmark::State& state,
                                              BALData* data,
                                              ContextImpl* context) {
  LinearSolver::Options options;
  options.elimination_groups.push_back(data->bal_problem->num_points());
  options.context = context;
  options.num_threads = 1;
  auto jacobian = data->PartitionedMatrixViewJacobian(options);
  auto underlying_matrix = data->BlockSparseJacobianPartitioned(context);
  CudaPartitionedBlockSparseCRSView view(
      *underlying_matrix, jacobian->num_col_blocks_e(), context);

  Vector x = Vector::Random(jacobian->num_rows());
  CudaVector cuda_x(context, x.size());
  CudaVector cuda_y(context, jacobian->num_cols_f());

  cuda_x.CopyFromCpu(x);
  cuda_y.SetZero();

  auto matrix = view.matrix_f();
  for (auto _ : state) {
    matrix->LeftMultiplyAndAccumulate(cuda_x, &cuda_y);
  }
  CHECK_GT(cuda_y.Norm(), 0.);
}

static void PMVRightMultiplyAndAccumulateECuda(benchmark::State& state,
                                               BALData* data,
                                               ContextImpl* context) {
  LinearSolver::Options options;
  options.elimination_groups.push_back(data->bal_problem->num_points());
  options.context = context;
  options.num_threads = 1;
  auto jacobian = data->PartitionedMatrixViewJacobian(options);
  auto underlying_matrix = data->BlockSparseJacobianPartitioned(context);
  CudaPartitionedBlockSparseCRSView view(
      *underlying_matrix, jacobian->num_col_blocks_e(), context);

  Vector x = Vector::Random(jacobian->num_cols_e());
  CudaVector cuda_x(context, x.size());
  CudaVector cuda_y(context, jacobian->num_rows());

  cuda_x.CopyFromCpu(x);
  cuda_y.SetZero();

  auto matrix = view.matrix_e();
  for (auto _ : state) {
    matrix->RightMultiplyAndAccumulate(cuda_x, &cuda_y);
  }
  CHECK_GT(cuda_y.Norm(), 0.);
}

static void PMVLeftMultiplyAndAccumulateECuda(benchmark::State& state,
                                              BALData* data,
                                              ContextImpl* context) {
  LinearSolver::Options options;
  options.elimination_groups.push_back(data->bal_problem->num_points());
  options.context = context;
  options.num_threads = 1;
  auto jacobian = data->PartitionedMatrixViewJacobian(options);
  auto underlying_matrix = data->BlockSparseJacobianPartitioned(context);
  CudaPartitionedBlockSparseCRSView view(
      *underlying_matrix, jacobian->num_col_blocks_e(), context);

  Vector x = Vector::Random(jacobian->num_rows());
  CudaVector cuda_x(context, x.size());
  CudaVector cuda_y(context, jacobian->num_cols_e());

  cuda_x.CopyFromCpu(x);
  cuda_y.SetZero();

  auto matrix = view.matrix_e();
  for (auto _ : state) {
    matrix->LeftMultiplyAndAccumulate(cuda_x, &cuda_y);
  }
  CHECK_GT(cuda_y.Norm(), 0.);
}

// We want CudaBlockSparseCRSView to be no slower than an explicit conversion
// to CRS on the CPU.
static void JacobianToCRSView(benchmark::State& state,
                              BALData* data,
                              ContextImpl* context) {
  auto jacobian = data->BlockSparseJacobian(context);

  std::unique_ptr<CudaBlockSparseCRSView> matrix;
  for (auto _ : state) {
    matrix = std::make_unique<CudaBlockSparseCRSView>(*jacobian, context);
  }
  CHECK(matrix != nullptr);
}
static void JacobianToCRSMatrix(benchmark::State& state,
                                BALData* data,
                                ContextImpl* context) {
  auto jacobian = data->BlockSparseJacobian(context);

  std::unique_ptr<CudaSparseMatrix> matrix;
  std::unique_ptr<CompressedRowSparseMatrix> matrix_cpu;
  for (auto _ : state) {
    matrix_cpu = jacobian->ToCompressedRowSparseMatrix();
    matrix = std::make_unique<CudaSparseMatrix>(context, *matrix_cpu);
  }
  CHECK(matrix != nullptr);
}
// Updating the values in a CudaBlockSparseCRSView should be roughly as fast
// as just copying the values (the time spent permuting values has to be
// hidden by the PCIe transfer).
static void JacobianToCRSViewUpdate(benchmark::State& state,
                                    BALData* data,
                                    ContextImpl* context) {
  auto jacobian = data->BlockSparseJacobian(context);

  auto matrix = CudaBlockSparseCRSView(*jacobian, context);
  for (auto _ : state) {
    matrix.UpdateValues(*jacobian);
  }
}
static void JacobianToCRSMatrixUpdate(benchmark::State& state,
                                      BALData* data,
                                      ContextImpl* context) {
  auto jacobian = data->BlockSparseJacobian(context);

  auto matrix_cpu = jacobian->ToCompressedRowSparseMatrix();
  auto matrix = std::make_unique<CudaSparseMatrix>(context, *matrix_cpu);
  for (auto _ : state) {
    CHECK_EQ(cudaSuccess,
             cudaMemcpy(matrix->mutable_values(),
                        matrix_cpu->values(),
                        matrix->num_nonzeros() * sizeof(double),
                        cudaMemcpyHostToDevice));
  }
}
#endif

static void JacobianSquaredColumnNorm(benchmark::State& state,
                                      BALData* data,
                                      ContextImpl* context) {
  const int num_threads = static_cast<int>(state.range(0));

  auto jacobian = data->BlockSparseJacobian(context);

  Vector x = Vector::Zero(jacobian->num_cols());

  for (auto _ : state) {
    jacobian->SquaredColumnNorm(x.data(), context, num_threads);
  }
  CHECK_GT(x.squaredNorm(), 0.);
}

static void JacobianScaleColumns(benchmark::State& state,
                                 BALData* data,
                                 ContextImpl* context) {
  const int num_threads = static_cast<int>(state.range(0));

  auto jacobian_const = data->BlockSparseJacobian(context);
  auto jacobian = const_cast<BlockSparseMatrix*>(jacobian_const);
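  // The cached jacobian is handed out as const; ScaleColumns mutates the
  // values in place, hence the const_cast (harmless here, since the values
  // are random anyway).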

  Vector x = Vector::Ones(jacobian->num_cols());

  for (auto _ : state) {
    jacobian->ScaleColumns(x.data(), context, num_threads);
  }
}

static void JacobianRightMultiplyAndAccumulate(benchmark::State& state,
                                               BALData* data,
                                               ContextImpl* context) {
  const int num_threads = static_cast<int>(state.range(0));

  auto jacobian = data->BlockSparseJacobian(context);

  Vector y = Vector::Zero(jacobian->num_rows());
  Vector x = Vector::Random(jacobian->num_cols());

  for (auto _ : state) {
    jacobian->RightMultiplyAndAccumulate(
        x.data(), y.data(), context, num_threads);
  }
  CHECK_GT(y.squaredNorm(), 0.);
}

static void JacobianLeftMultiplyAndAccumulate(benchmark::State& state,
                                              BALData* data,
                                              ContextImpl* context) {
  const int num_threads = static_cast<int>(state.range(0));

  auto jacobian = data->BlockSparseJacobian(context);

  Vector y = Vector::Zero(jacobian->num_cols());
  Vector x = Vector::Random(jacobian->num_rows());

  for (auto _ : state) {
    jacobian->LeftMultiplyAndAccumulate(
        x.data(), y.data(), context, num_threads);
  }
  CHECK_GT(y.squaredNorm(), 0.);
}

#ifndef CERES_NO_CUDA
static void JacobianRightMultiplyAndAccumulateCuda(benchmark::State& state,
                                                   BALData* data,
                                                   ContextImpl* context) {
  auto crs_jacobian = data->CompressedRowSparseJacobian(context);
  CudaSparseMatrix cuda_jacobian(context, *crs_jacobian);
  CudaVector cuda_x(context, 0);
  CudaVector cuda_y(context, 0);

  Vector x(crs_jacobian->num_cols());
  Vector y(crs_jacobian->num_rows());
  x.setRandom();
  y.setRandom();

  cuda_x.CopyFromCpu(x);
  cuda_y.CopyFromCpu(y);
  double sum = 0;
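  // Accumulating the norm and synchronizing on each iteration keeps the
  // asynchronous kernel launch inside the timed region.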
  for (auto _ : state) {
    cuda_jacobian.RightMultiplyAndAccumulate(cuda_x, &cuda_y);
    sum += cuda_y.Norm();
    CHECK_EQ(cudaDeviceSynchronize(), cudaSuccess);
  }
  CHECK_NE(sum, 0.0);
}

static void JacobianLeftMultiplyAndAccumulateCuda(benchmark::State& state,
                                                  BALData* data,
                                                  ContextImpl* context) {
  auto crs_jacobian = data->CompressedRowSparseJacobian(context);
  CudaSparseMatrix cuda_jacobian(context, *crs_jacobian);
  CudaVector cuda_x(context, 0);
  CudaVector cuda_y(context, 0);

  Vector x(crs_jacobian->num_rows());
  Vector y(crs_jacobian->num_cols());
  x.setRandom();
  y.setRandom();

  cuda_x.CopyFromCpu(x);
  cuda_y.CopyFromCpu(y);
  double sum = 0;
  for (auto _ : state) {
    cuda_jacobian.LeftMultiplyAndAccumulate(cuda_x, &cuda_y);
    sum += cuda_y.Norm();
    CHECK_EQ(cudaDeviceSynchronize(), cudaSuccess);
  }
  CHECK_NE(sum, 0.0);
}
#endif

}  // namespace ceres::internal

// Older versions of the benchmark library might come without the
// ::benchmark::Shutdown function. We provide an empty fallback variant of
// Shutdown in order to support both older and newer versions.
namespace benchmark_shutdown_fallback {
template <typename... Args>
void Shutdown(Args... args) {}
};  // namespace benchmark_shutdown_fallback

int main(int argc, char** argv) {
  ::benchmark::Initialize(&argc, argv);

  std::vector<std::unique_ptr<ceres::internal::BALData>> benchmark_data;
  if (argc == 1) {
    LOG(FATAL) << "No input datasets specified. Usage: " << argv[0]
               << " [benchmark flags] path_to_BAL_data_1.txt ... "
                  "path_to_BAL_data_N.txt";
    return -1;
  }

  ceres::internal::ContextImpl context;
  context.EnsureMinimumThreads(16);
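  // 16 worker threads match the largest ->Arg() value registered below.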
#ifndef CERES_NO_CUDA
  std::string message;
  context.InitCuda(&message);
#endif

  for (int i = 1; i < argc; ++i) {
    const std::string path(argv[i]);
    const std::string name_residuals = "Residuals<" + path + ">";
    benchmark_data.emplace_back(
        std::make_unique<ceres::internal::BALData>(path));
    auto data = benchmark_data.back().get();
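    // Each ->Arg(n) below supplies the thread count that the benchmark reads
    // via state.range(0).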
    ::benchmark::RegisterBenchmark(
        name_residuals.c_str(), ceres::internal::Residuals, data, &context)
        ->Arg(1)
        ->Arg(2)
        ->Arg(4)
        ->Arg(8)
        ->Arg(16);

    const std::string name_jacobians = "ResidualsAndJacobian<" + path + ">";
    ::benchmark::RegisterBenchmark(name_jacobians.c_str(),
                                   ceres::internal::ResidualsAndJacobian,
                                   data,
                                   &context)
        ->Arg(1)
        ->Arg(2)
        ->Arg(4)
        ->Arg(8)
        ->Arg(16);

    const std::string name_plus = "Plus<" + path + ">";
    ::benchmark::RegisterBenchmark(
        name_plus.c_str(), ceres::internal::Plus, data, &context)
        ->Arg(1)
        ->Arg(2)
        ->Arg(4)
        ->Arg(8)
        ->Arg(16);

    const std::string name_right_product =
        "JacobianRightMultiplyAndAccumulate<" + path + ">";
    ::benchmark::RegisterBenchmark(
        name_right_product.c_str(),
        ceres::internal::JacobianRightMultiplyAndAccumulate,
        data,
        &context)
        ->Arg(1)
        ->Arg(2)
        ->Arg(4)
        ->Arg(8)
        ->Arg(16);

    const std::string name_right_product_partitioned_f =
        "PMVRightMultiplyAndAccumulateF<" + path + ">";
    ::benchmark::RegisterBenchmark(
        name_right_product_partitioned_f.c_str(),
        ceres::internal::PMVRightMultiplyAndAccumulateF,
        data,
        &context)
        ->Arg(1)
        ->Arg(2)
        ->Arg(4)
        ->Arg(8)
        ->Arg(16);

#ifndef CERES_NO_CUDA
    const std::string name_right_product_partitioned_f_cuda =
        "PMVRightMultiplyAndAccumulateFCuda<" + path + ">";
    ::benchmark::RegisterBenchmark(
        name_right_product_partitioned_f_cuda.c_str(),
        ceres::internal::PMVRightMultiplyAndAccumulateFCuda,
        data,
        &context);
#endif

    const std::string name_right_product_partitioned_e =
        "PMVRightMultiplyAndAccumulateE<" + path + ">";
    ::benchmark::RegisterBenchmark(
        name_right_product_partitioned_e.c_str(),
        ceres::internal::PMVRightMultiplyAndAccumulateE,
        data,
        &context)
        ->Arg(1)
        ->Arg(2)
        ->Arg(4)
        ->Arg(8)
        ->Arg(16);

#ifndef CERES_NO_CUDA
    const std::string name_right_product_partitioned_e_cuda =
        "PMVRightMultiplyAndAccumulateECuda<" + path + ">";
    ::benchmark::RegisterBenchmark(
        name_right_product_partitioned_e_cuda.c_str(),
        ceres::internal::PMVRightMultiplyAndAccumulateECuda,
        data,
        &context);
#endif

    const std::string name_update_block_diagonal_ftf =
        "PMVUpdateBlockDiagonalFtF<" + path + ">";
    ::benchmark::RegisterBenchmark(name_update_block_diagonal_ftf.c_str(),
                                   ceres::internal::PMVUpdateBlockDiagonalFtF,
                                   data,
                                   &context)
        ->Arg(1)
        ->Arg(2)
        ->Arg(4)
        ->Arg(8)
        ->Arg(16);

    const std::string name_pse =
        "PSEPreconditionerRightMultiplyAndAccumulate<" + path + ">";
    ::benchmark::RegisterBenchmark(
        name_pse.c_str(), ceres::internal::PSEPreconditioner, data, &context)
        ->Arg(1)
        ->Arg(2)
        ->Arg(4)
        ->Arg(8)
        ->Arg(16);

    const std::string name_isc_no_diag =
        "ISCRightMultiplyAndAccumulate<" + path + ">";
    ::benchmark::RegisterBenchmark(name_isc_no_diag.c_str(),
                                   ceres::internal::ISCRightMultiplyNoDiag,
                                   data,
                                   &context)
        ->Arg(1)
        ->Arg(2)
        ->Arg(4)
        ->Arg(8)
        ->Arg(16);

    const std::string name_update_block_diagonal_ete =
        "PMVUpdateBlockDiagonalEtE<" + path + ">";
    ::benchmark::RegisterBenchmark(name_update_block_diagonal_ete.c_str(),
                                   ceres::internal::PMVUpdateBlockDiagonalEtE,
                                   data,
                                   &context)
        ->Arg(1)
        ->Arg(2)
        ->Arg(4)
        ->Arg(8)
        ->Arg(16);
    const std::string name_isc_diag =
        "ISCRightMultiplyAndAccumulateDiag<" + path + ">";
    ::benchmark::RegisterBenchmark(name_isc_diag.c_str(),
                                   ceres::internal::ISCRightMultiplyDiag,
                                   data,
                                   &context)
        ->Arg(1)
        ->Arg(2)
        ->Arg(4)
        ->Arg(8)
        ->Arg(16);

#ifndef CERES_NO_CUDA
    const std::string name_right_product_cuda =
        "JacobianRightMultiplyAndAccumulateCuda<" + path + ">";
    ::benchmark::RegisterBenchmark(
        name_right_product_cuda.c_str(),
        ceres::internal::JacobianRightMultiplyAndAccumulateCuda,
        data,
        &context)
        ->Arg(1);
#endif

    const std::string name_left_product =
        "JacobianLeftMultiplyAndAccumulate<" + path + ">";
    ::benchmark::RegisterBenchmark(
        name_left_product.c_str(),
        ceres::internal::JacobianLeftMultiplyAndAccumulate,
        data,
        &context)
        ->Arg(1)
        ->Arg(2)
        ->Arg(4)
        ->Arg(8)
        ->Arg(16);

    const std::string name_left_product_partitioned_f =
        "PMVLeftMultiplyAndAccumulateF<" + path + ">";
    ::benchmark::RegisterBenchmark(
        name_left_product_partitioned_f.c_str(),
        ceres::internal::PMVLeftMultiplyAndAccumulateF,
        data,
        &context)
        ->Arg(1)
        ->Arg(2)
        ->Arg(4)
        ->Arg(8)
        ->Arg(16);

#ifndef CERES_NO_CUDA
    const std::string name_left_product_partitioned_f_cuda =
        "PMVLeftMultiplyAndAccumulateFCuda<" + path + ">";
    ::benchmark::RegisterBenchmark(
        name_left_product_partitioned_f_cuda.c_str(),
        ceres::internal::PMVLeftMultiplyAndAccumulateFCuda,
        data,
        &context);
#endif

    const std::string name_left_product_partitioned_e =
        "PMVLeftMultiplyAndAccumulateE<" + path + ">";
    ::benchmark::RegisterBenchmark(
        name_left_product_partitioned_e.c_str(),
        ceres::internal::PMVLeftMultiplyAndAccumulateE,
        data,
        &context)
        ->Arg(1)
        ->Arg(2)
        ->Arg(4)
        ->Arg(8)
        ->Arg(16);

#ifndef CERES_NO_CUDA
    const std::string name_left_product_partitioned_e_cuda =
        "PMVLeftMultiplyAndAccumulateECuda<" + path + ">";
    ::benchmark::RegisterBenchmark(
        name_left_product_partitioned_e_cuda.c_str(),
        ceres::internal::PMVLeftMultiplyAndAccumulateECuda,
        data,
        &context);
#endif

#ifndef CERES_NO_CUDA
    const std::string name_left_product_cuda =
        "JacobianLeftMultiplyAndAccumulateCuda<" + path + ">";
    ::benchmark::RegisterBenchmark(
        name_left_product_cuda.c_str(),
        ceres::internal::JacobianLeftMultiplyAndAccumulateCuda,
        data,
        &context)
        ->Arg(1);
#endif

    const std::string name_squared_column_norm =
        "JacobianSquaredColumnNorm<" + path + ">";
    ::benchmark::RegisterBenchmark(name_squared_column_norm.c_str(),
                                   ceres::internal::JacobianSquaredColumnNorm,
                                   data,
                                   &context)
        ->Arg(1)
        ->Arg(2)
        ->Arg(4)
        ->Arg(8)
        ->Arg(16);

    const std::string name_scale_columns = "JacobianScaleColumns<" + path + ">";
    ::benchmark::RegisterBenchmark(name_scale_columns.c_str(),
                                   ceres::internal::JacobianScaleColumns,
                                   data,
                                   &context)
        ->Arg(1)
        ->Arg(2)
        ->Arg(4)
        ->Arg(8)
        ->Arg(16);

    const std::string name_to_crs = "JacobianToCRS<" + path + ">";
    ::benchmark::RegisterBenchmark(
        name_to_crs.c_str(), ceres::internal::JacobianToCRS, data, &context);
#ifndef CERES_NO_CUDA
    const std::string name_to_crs_view = "JacobianToCRSView<" + path + ">";
    ::benchmark::RegisterBenchmark(name_to_crs_view.c_str(),
                                   ceres::internal::JacobianToCRSView,
                                   data,
                                   &context);
    const std::string name_to_crs_matrix = "JacobianToCRSMatrix<" + path + ">";
    ::benchmark::RegisterBenchmark(name_to_crs_matrix.c_str(),
                                   ceres::internal::JacobianToCRSMatrix,
                                   data,
                                   &context);
    const std::string name_to_crs_view_update =
        "JacobianToCRSViewUpdate<" + path + ">";
    ::benchmark::RegisterBenchmark(name_to_crs_view_update.c_str(),
                                   ceres::internal::JacobianToCRSViewUpdate,
                                   data,
                                   &context);
    const std::string name_to_crs_matrix_update =
        "JacobianToCRSMatrixUpdate<" + path + ">";
    ::benchmark::RegisterBenchmark(name_to_crs_matrix_update.c_str(),
                                   ceres::internal::JacobianToCRSMatrixUpdate,
                                   data,
                                   &context);
#endif
  }
  ::benchmark::RunSpecifiedBenchmarks();

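  // Unqualified lookup finds both ::benchmark::Shutdown (when the library
  // provides it) and the variadic fallback above; overload resolution prefers
  // the non-template function, so the real Shutdown is called if present.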
  using namespace ::benchmark;
  using namespace benchmark_shutdown_fallback;
  Shutdown();
  return 0;
}