// Ceres Solver - A fast non-linear least squares minimizer
// Copyright 2023 Google Inc. All rights reserved.
// http://ceres-solver.org/
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are met:
//
// * Redistributions of source code must retain the above copyright notice,
//   this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright notice,
//   this list of conditions and the following disclaimer in the documentation
//   and/or other materials provided with the distribution.
// * Neither the name of Google Inc. nor the names of its contributors may be
//   used to endorse or promote products derived from this software without
//   specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
// POSSIBILITY OF SUCH DAMAGE.
//
// Authors: dmitriy.korchemkin@gmail.com (Dmitriy Korchemkin)

#include <memory>
#include <random>
#include <string>
#include <vector>

#include "benchmark/benchmark.h"
#include "ceres/block_sparse_matrix.h"
#include "ceres/bundle_adjustment_test_util.h"
#include "ceres/cuda_block_sparse_crs_view.h"
#include "ceres/cuda_partitioned_block_sparse_crs_view.h"
#include "ceres/cuda_sparse_matrix.h"
#include "ceres/cuda_vector.h"
#include "ceres/evaluator.h"
#include "ceres/implicit_schur_complement.h"
#include "ceres/partitioned_matrix_view.h"
#include "ceres/power_series_expansion_preconditioner.h"
#include "ceres/preprocessor.h"
#include "ceres/problem.h"
#include "ceres/problem_impl.h"
#include "ceres/program.h"
#include "ceres/sparse_matrix.h"

namespace ceres::internal {

template <typename Derived, typename Base>
std::unique_ptr<Derived> downcast_unique_ptr(std::unique_ptr<Base>& base) {
  return std::unique_ptr<Derived>(dynamic_cast<Derived*>(base.release()));
}
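// Example (mirroring the usage further below): extracting the concrete
// BlockSparseMatrix from the SparseMatrix jacobian returned by the Evaluator:
//
//   auto jacobian = evaluator->CreateJacobian();
//   auto block_sparse = downcast_unique_ptr<BlockSparseMatrix>(jacobian);
//
// Note that ownership is released before the dynamic_cast, so a failed cast
// leaks the object; callers below CHECK that the result is non-null.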

// The benchmark library might invoke a benchmark function multiple times.
// To save the time required to parse BAL data, we ensure that each dataset
// is loaded at most once. Each type of jacobian is also cached after its
// first creation.
struct BALData {
  using PartitionedView = PartitionedMatrixView<2, 3, 9>;
  explicit BALData(const std::string& path) {
    bal_problem = std::make_unique<BundleAdjustmentProblem>(path);
    CHECK(bal_problem != nullptr);

    auto problem_impl = bal_problem->mutable_problem()->mutable_impl();
    auto preprocessor = Preprocessor::Create(MinimizerType::TRUST_REGION);

    preprocessed_problem = std::make_unique<PreprocessedProblem>();
    Solver::Options options = bal_problem->options();
    options.linear_solver_type = ITERATIVE_SCHUR;
    CHECK(preprocessor->Preprocess(
        options, problem_impl, preprocessed_problem.get()));

    auto program = preprocessed_problem->reduced_program.get();

    parameters.resize(program->NumParameters());
    program->ParameterBlocksToStateVector(parameters.data());

    const int num_residuals = program->NumResiduals();
    b.resize(num_residuals);

    std::mt19937 rng;
    std::normal_distribution<double> rnorm;
    for (int i = 0; i < num_residuals; ++i) {
      b[i] = rnorm(rng);
    }

    const int num_parameters = program->NumParameters();
    D.resize(num_parameters);
    for (int i = 0; i < num_parameters; ++i) {
      D[i] = rnorm(rng);
    }
  }

  std::unique_ptr<BlockSparseMatrix> CreateBlockSparseJacobian(
      ContextImpl* context, bool sequential) {
    auto problem = bal_problem->mutable_problem();
    auto problem_impl = problem->mutable_impl();
    CHECK(problem_impl != nullptr);

    Evaluator::Options options;
    options.linear_solver_type = ITERATIVE_SCHUR;
    options.num_threads = 1;
    options.context = context;
    options.num_eliminate_blocks = bal_problem->num_points();

    std::string error;
    auto program = preprocessed_problem->reduced_program.get();
    auto evaluator = Evaluator::Create(options, program, &error);
    CHECK(evaluator != nullptr);

    auto jacobian = evaluator->CreateJacobian();
    auto block_sparse = downcast_unique_ptr<BlockSparseMatrix>(jacobian);
    CHECK(block_sparse != nullptr);

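    // When `sequential` is set, recompute the cell positions so that values
    // are laid out contiguously in row-block order (presumably the ordering
    // assumed by the CRS conversions and views benchmarked below).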
    if (sequential) {
      auto block_structure_sequential =
          std::make_unique<CompressedRowBlockStructure>(
              *block_sparse->block_structure());
      int num_nonzeros = 0;
      for (auto& row_block : block_structure_sequential->rows) {
        const int row_block_size = row_block.block.size;
        for (auto& cell : row_block.cells) {
          const int col_block_size =
              block_structure_sequential->cols[cell.block_id].size;
          cell.position = num_nonzeros;
          num_nonzeros += col_block_size * row_block_size;
        }
      }
      block_sparse = std::make_unique<BlockSparseMatrix>(
          block_structure_sequential.release(),
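          // The second constructor argument requests page-locked (pinned)
          // host memory when CUDA is enabled; presumably this speeds up
          // host-to-device transfers of the values.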
#ifndef CERES_NO_CUDA
          true
#else
          false
#endif
      );
    }

    std::mt19937 rng;
    std::normal_distribution<double> rnorm;
    const int nnz = block_sparse->num_nonzeros();
    auto values = block_sparse->mutable_values();
    for (int i = 0; i < nnz; ++i) {
      values[i] = rnorm(rng);
    }

    return block_sparse;
  }

  std::unique_ptr<CompressedRowSparseMatrix> CreateCompressedRowSparseJacobian(
      ContextImpl* context) {
    auto block_sparse = BlockSparseJacobian(context);
    return block_sparse->ToCompressedRowSparseMatrix();
  }

  const BlockSparseMatrix* BlockSparseJacobian(ContextImpl* context) {
    if (!block_sparse_jacobian) {
      block_sparse_jacobian = CreateBlockSparseJacobian(context, true);
    }
    return block_sparse_jacobian.get();
  }

  const BlockSparseMatrix* BlockSparseJacobianPartitioned(
      ContextImpl* context) {
    if (!block_sparse_jacobian_partitioned) {
      block_sparse_jacobian_partitioned =
          CreateBlockSparseJacobian(context, false);
    }
    return block_sparse_jacobian_partitioned.get();
  }

  const CompressedRowSparseMatrix* CompressedRowSparseJacobian(
      ContextImpl* context) {
    if (!crs_jacobian) {
      crs_jacobian = CreateCompressedRowSparseJacobian(context);
    }
    return crs_jacobian.get();
  }

  std::unique_ptr<PartitionedView> PartitionedMatrixViewJacobian(
      const LinearSolver::Options& options) {
    auto block_sparse = BlockSparseJacobianPartitioned(options.context);
    return std::make_unique<PartitionedView>(options, *block_sparse);
  }

  BlockSparseMatrix* BlockDiagonalEtE(const LinearSolver::Options& options) {
    if (!block_diagonal_ete) {
      auto partitioned_view = PartitionedMatrixViewJacobian(options);
      block_diagonal_ete = partitioned_view->CreateBlockDiagonalEtE();
    }
    return block_diagonal_ete.get();
  }

  BlockSparseMatrix* BlockDiagonalFtF(const LinearSolver::Options& options) {
    if (!block_diagonal_ftf) {
      auto partitioned_view = PartitionedMatrixViewJacobian(options);
      block_diagonal_ftf = partitioned_view->CreateBlockDiagonalFtF();
    }
    return block_diagonal_ftf.get();
  }

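  // Initializing with a nullptr diagonal yields the plain (unregularized)
  // implicit Schur complement; the variant below also supplies the diagonal
  // D, as a trust-region solve would.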
  const ImplicitSchurComplement* ImplicitSchurComplementWithoutDiagonal(
      const LinearSolver::Options& options) {
    auto block_sparse = BlockSparseJacobianPartitioned(options.context);
    implicit_schur_complement =
        std::make_unique<ImplicitSchurComplement>(options);
    implicit_schur_complement->Init(*block_sparse, nullptr, b.data());
    return implicit_schur_complement.get();
  }

  const ImplicitSchurComplement* ImplicitSchurComplementWithDiagonal(
      const LinearSolver::Options& options) {
    auto block_sparse = BlockSparseJacobianPartitioned(options.context);
    implicit_schur_complement_diag =
        std::make_unique<ImplicitSchurComplement>(options);
    implicit_schur_complement_diag->Init(*block_sparse, D.data(), b.data());
    return implicit_schur_complement_diag.get();
  }

  Vector parameters;
  Vector D;
  Vector b;
  std::unique_ptr<BundleAdjustmentProblem> bal_problem;
  std::unique_ptr<PreprocessedProblem> preprocessed_problem;
  std::unique_ptr<BlockSparseMatrix> block_sparse_jacobian_partitioned;
  std::unique_ptr<BlockSparseMatrix> block_sparse_jacobian;
  std::unique_ptr<CompressedRowSparseMatrix> crs_jacobian;
  std::unique_ptr<BlockSparseMatrix> block_diagonal_ete;
  std::unique_ptr<BlockSparseMatrix> block_diagonal_ftf;
  std::unique_ptr<ImplicitSchurComplement> implicit_schur_complement;
  std::unique_ptr<ImplicitSchurComplement> implicit_schur_complement_diag;
};

static void Residuals(benchmark::State& state,
                      BALData* data,
                      ContextImpl* context) {
  const int num_threads = static_cast<int>(state.range(0));

  Evaluator::Options options;
  options.linear_solver_type = SPARSE_NORMAL_CHOLESKY;
  options.num_threads = num_threads;
  options.context = context;
  options.num_eliminate_blocks = 0;

  std::string error;
  CHECK(data->preprocessed_problem != nullptr);
  auto program = data->preprocessed_problem->reduced_program.get();
  CHECK(program != nullptr);
  auto evaluator = Evaluator::Create(options, program, &error);
  CHECK(evaluator != nullptr);

  double cost = 0.;
  Vector residuals = Vector::Zero(program->NumResiduals());

  Evaluator::EvaluateOptions eval_options;
  for (auto _ : state) {
    CHECK(evaluator->Evaluate(eval_options,
                              data->parameters.data(),
                              &cost,
                              residuals.data(),
                              nullptr,
                              nullptr));
  }
}

static void ResidualsAndJacobian(benchmark::State& state,
                                 BALData* data,
                                 ContextImpl* context) {
  const int num_threads = static_cast<int>(state.range(0));

  Evaluator::Options options;
  options.linear_solver_type = SPARSE_NORMAL_CHOLESKY;
  options.num_threads = num_threads;
  options.context = context;
  options.num_eliminate_blocks = 0;

  std::string error;
  CHECK(data->preprocessed_problem != nullptr);
  auto program = data->preprocessed_problem->reduced_program.get();
  CHECK(program != nullptr);
  auto evaluator = Evaluator::Create(options, program, &error);
  CHECK(evaluator != nullptr);

  double cost = 0.;
  Vector residuals = Vector::Zero(program->NumResiduals());
  auto jacobian = evaluator->CreateJacobian();

  Evaluator::EvaluateOptions eval_options;
  for (auto _ : state) {
    CHECK(evaluator->Evaluate(eval_options,
                              data->parameters.data(),
                              &cost,
                              residuals.data(),
                              nullptr,
                              jacobian.get()));
  }
}

static void Plus(benchmark::State& state, BALData* data, ContextImpl* context) {
  const int num_threads = static_cast<int>(state.range(0));

  Evaluator::Options options;
  options.linear_solver_type = SPARSE_NORMAL_CHOLESKY;
  options.num_threads = num_threads;
  options.context = context;
  options.num_eliminate_blocks = 0;

  std::string error;
  CHECK(data->preprocessed_problem != nullptr);
  auto program = data->preprocessed_problem->reduced_program.get();
  CHECK(program != nullptr);
  auto evaluator = Evaluator::Create(options, program, &error);
  CHECK(evaluator != nullptr);

  Vector state_plus_delta = Vector::Zero(program->NumParameters());
  Vector delta = Vector::Random(program->NumEffectiveParameters());

  for (auto _ : state) {
    CHECK(evaluator->Plus(
        data->parameters.data(), delta.data(), state_plus_delta.data()));
  }
  CHECK_GT(state_plus_delta.squaredNorm(), 0.);
}

static void PSEPreconditioner(benchmark::State& state,
                              BALData* data,
                              ContextImpl* context) {
  LinearSolver::Options options;
  options.num_threads = static_cast<int>(state.range(0));
  options.elimination_groups.push_back(data->bal_problem->num_points());
  options.context = context;

  auto jacobian = data->ImplicitSchurComplementWithDiagonal(options);
  Preconditioner::Options preconditioner_options(options);

  PowerSeriesExpansionPreconditioner preconditioner(
      jacobian, 10, 0, preconditioner_options);

  Vector y = Vector::Zero(jacobian->num_cols());
  Vector x = Vector::Random(jacobian->num_cols());

  for (auto _ : state) {
    preconditioner.RightMultiplyAndAccumulate(x.data(), y.data());
  }
  CHECK_GT(y.squaredNorm(), 0.);
}

static void PMVRightMultiplyAndAccumulateF(benchmark::State& state,
                                           BALData* data,
                                           ContextImpl* context) {
  LinearSolver::Options options;
  options.num_threads = static_cast<int>(state.range(0));
  options.elimination_groups.push_back(data->bal_problem->num_points());
  options.context = context;
  auto jacobian = data->PartitionedMatrixViewJacobian(options);

  Vector y = Vector::Zero(jacobian->num_rows());
  Vector x = Vector::Random(jacobian->num_cols_f());

  for (auto _ : state) {
    jacobian->RightMultiplyAndAccumulateF(x.data(), y.data());
  }
  CHECK_GT(y.squaredNorm(), 0.);
}

static void PMVLeftMultiplyAndAccumulateF(benchmark::State& state,
                                          BALData* data,
                                          ContextImpl* context) {
  LinearSolver::Options options;
  options.num_threads = static_cast<int>(state.range(0));
  options.elimination_groups.push_back(data->bal_problem->num_points());
  options.context = context;
  auto jacobian = data->PartitionedMatrixViewJacobian(options);

  Vector y = Vector::Zero(jacobian->num_cols_f());
  Vector x = Vector::Random(jacobian->num_rows());

  for (auto _ : state) {
    jacobian->LeftMultiplyAndAccumulateF(x.data(), y.data());
  }
  CHECK_GT(y.squaredNorm(), 0.);
}

static void PMVRightMultiplyAndAccumulateE(benchmark::State& state,
                                           BALData* data,
                                           ContextImpl* context) {
  LinearSolver::Options options;
  options.num_threads = static_cast<int>(state.range(0));
  options.elimination_groups.push_back(data->bal_problem->num_points());
  options.context = context;
  auto jacobian = data->PartitionedMatrixViewJacobian(options);

  Vector y = Vector::Zero(jacobian->num_rows());
  Vector x = Vector::Random(jacobian->num_cols_e());

  for (auto _ : state) {
    jacobian->RightMultiplyAndAccumulateE(x.data(), y.data());
  }
  CHECK_GT(y.squaredNorm(), 0.);
}

static void PMVLeftMultiplyAndAccumulateE(benchmark::State& state,
                                          BALData* data,
                                          ContextImpl* context) {
  LinearSolver::Options options;
  options.num_threads = static_cast<int>(state.range(0));
  options.elimination_groups.push_back(data->bal_problem->num_points());
  options.context = context;
  auto jacobian = data->PartitionedMatrixViewJacobian(options);

  Vector y = Vector::Zero(jacobian->num_cols_e());
  Vector x = Vector::Random(jacobian->num_rows());

  for (auto _ : state) {
    jacobian->LeftMultiplyAndAccumulateE(x.data(), y.data());
  }
  CHECK_GT(y.squaredNorm(), 0.);
}

static void PMVUpdateBlockDiagonalEtE(benchmark::State& state,
                                      BALData* data,
                                      ContextImpl* context) {
  LinearSolver::Options options;
  options.num_threads = static_cast<int>(state.range(0));
  options.elimination_groups.push_back(data->bal_problem->num_points());
  options.context = context;
  auto jacobian = data->PartitionedMatrixViewJacobian(options);
  auto block_diagonal_ete = data->BlockDiagonalEtE(options);

  for (auto _ : state) {
    jacobian->UpdateBlockDiagonalEtE(block_diagonal_ete);
  }
}

static void PMVUpdateBlockDiagonalFtF(benchmark::State& state,
                                      BALData* data,
                                      ContextImpl* context) {
  LinearSolver::Options options;
  options.num_threads = static_cast<int>(state.range(0));
  options.elimination_groups.push_back(data->bal_problem->num_points());
  options.context = context;
  auto jacobian = data->PartitionedMatrixViewJacobian(options);
  auto block_diagonal_ftf = data->BlockDiagonalFtF(options);

  for (auto _ : state) {
    jacobian->UpdateBlockDiagonalFtF(block_diagonal_ftf);
  }
}

static void ISCRightMultiplyNoDiag(benchmark::State& state,
                                   BALData* data,
                                   ContextImpl* context) {
  LinearSolver::Options options;
  options.num_threads = static_cast<int>(state.range(0));
  options.elimination_groups.push_back(data->bal_problem->num_points());
  options.context = context;
  auto jacobian = data->ImplicitSchurComplementWithoutDiagonal(options);

  Vector y = Vector::Zero(jacobian->num_rows());
  Vector x = Vector::Random(jacobian->num_cols());
  for (auto _ : state) {
    jacobian->RightMultiplyAndAccumulate(x.data(), y.data());
  }
  CHECK_GT(y.squaredNorm(), 0.);
}

static void ISCRightMultiplyDiag(benchmark::State& state,
                                 BALData* data,
                                 ContextImpl* context) {
  LinearSolver::Options options;
  options.num_threads = static_cast<int>(state.range(0));
  options.elimination_groups.push_back(data->bal_problem->num_points());
  options.context = context;

  auto jacobian = data->ImplicitSchurComplementWithDiagonal(options);

  Vector y = Vector::Zero(jacobian->num_rows());
  Vector x = Vector::Random(jacobian->num_cols());
  for (auto _ : state) {
    jacobian->RightMultiplyAndAccumulate(x.data(), y.data());
  }
  CHECK_GT(y.squaredNorm(), 0.);
}

static void JacobianToCRS(benchmark::State& state,
                          BALData* data,
                          ContextImpl* context) {
  auto jacobian = data->BlockSparseJacobian(context);

  std::unique_ptr<CompressedRowSparseMatrix> matrix;
  for (auto _ : state) {
    matrix = jacobian->ToCompressedRowSparseMatrix();
  }
  CHECK(matrix != nullptr);
}

#ifndef CERES_NO_CUDA
static void PMVRightMultiplyAndAccumulateFCuda(benchmark::State& state,
                                               BALData* data,
                                               ContextImpl* context) {
  LinearSolver::Options options;
  options.elimination_groups.push_back(data->bal_problem->num_points());
  options.context = context;
  options.num_threads = 1;
  auto jacobian = data->PartitionedMatrixViewJacobian(options);
  auto underlying_matrix = data->BlockSparseJacobianPartitioned(context);
  CudaPartitionedBlockSparseCRSView view(
      *underlying_matrix, jacobian->num_col_blocks_e(), context);

  Vector x = Vector::Random(jacobian->num_cols_f());
  CudaVector cuda_x(context, x.size());
  CudaVector cuda_y(context, jacobian->num_rows());

  cuda_x.CopyFromCpu(x);
  cuda_y.SetZero();

  auto matrix = view.matrix_f();
  for (auto _ : state) {
    matrix->RightMultiplyAndAccumulate(cuda_x, &cuda_y);
  }
  CHECK_GT(cuda_y.Norm(), 0.);
}

static void PMVLeftMultiplyAndAccumulateFCuda(benchmark::State& state,
                                              BALData* data,
                                              ContextImpl* context) {
  LinearSolver::Options options;
  options.elimination_groups.push_back(data->bal_problem->num_points());
  options.context = context;
  options.num_threads = 1;
  auto jacobian = data->PartitionedMatrixViewJacobian(options);
  auto underlying_matrix = data->BlockSparseJacobianPartitioned(context);
  CudaPartitionedBlockSparseCRSView view(
      *underlying_matrix, jacobian->num_col_blocks_e(), context);

  Vector x = Vector::Random(jacobian->num_rows());
  CudaVector cuda_x(context, x.size());
  CudaVector cuda_y(context, jacobian->num_cols_f());

  cuda_x.CopyFromCpu(x);
  cuda_y.SetZero();

  auto matrix = view.matrix_f();
  for (auto _ : state) {
    matrix->LeftMultiplyAndAccumulate(cuda_x, &cuda_y);
  }
  CHECK_GT(cuda_y.Norm(), 0.);
}

static void PMVRightMultiplyAndAccumulateECuda(benchmark::State& state,
                                               BALData* data,
                                               ContextImpl* context) {
  LinearSolver::Options options;
  options.elimination_groups.push_back(data->bal_problem->num_points());
  options.context = context;
  options.num_threads = 1;
  auto jacobian = data->PartitionedMatrixViewJacobian(options);
  auto underlying_matrix = data->BlockSparseJacobianPartitioned(context);
  CudaPartitionedBlockSparseCRSView view(
      *underlying_matrix, jacobian->num_col_blocks_e(), context);

  Vector x = Vector::Random(jacobian->num_cols_e());
  CudaVector cuda_x(context, x.size());
  CudaVector cuda_y(context, jacobian->num_rows());

  cuda_x.CopyFromCpu(x);
  cuda_y.SetZero();

  auto matrix = view.matrix_e();
  for (auto _ : state) {
    matrix->RightMultiplyAndAccumulate(cuda_x, &cuda_y);
  }
  CHECK_GT(cuda_y.Norm(), 0.);
}

static void PMVLeftMultiplyAndAccumulateECuda(benchmark::State& state,
                                              BALData* data,
                                              ContextImpl* context) {
  LinearSolver::Options options;
  options.elimination_groups.push_back(data->bal_problem->num_points());
  options.context = context;
  options.num_threads = 1;
  auto jacobian = data->PartitionedMatrixViewJacobian(options);
  auto underlying_matrix = data->BlockSparseJacobianPartitioned(context);
  CudaPartitionedBlockSparseCRSView view(
      *underlying_matrix, jacobian->num_col_blocks_e(), context);

  Vector x = Vector::Random(jacobian->num_rows());
  CudaVector cuda_x(context, x.size());
  CudaVector cuda_y(context, jacobian->num_cols_e());

  cuda_x.CopyFromCpu(x);
  cuda_y.SetZero();

  auto matrix = view.matrix_e();
  for (auto _ : state) {
    matrix->LeftMultiplyAndAccumulate(cuda_x, &cuda_y);
  }
  CHECK_GT(cuda_y.Norm(), 0.);
}

// We want CudaBlockSparseCRSView to be no slower than an explicit conversion
// to CRS on the CPU.
static void JacobianToCRSView(benchmark::State& state,
                              BALData* data,
                              ContextImpl* context) {
  auto jacobian = data->BlockSparseJacobian(context);

  std::unique_ptr<CudaBlockSparseCRSView> matrix;
  for (auto _ : state) {
    matrix = std::make_unique<CudaBlockSparseCRSView>(*jacobian, context);
  }
  CHECK(matrix != nullptr);
}
static void JacobianToCRSMatrix(benchmark::State& state,
                                BALData* data,
                                ContextImpl* context) {
  auto jacobian = data->BlockSparseJacobian(context);

  std::unique_ptr<CudaSparseMatrix> matrix;
  std::unique_ptr<CompressedRowSparseMatrix> matrix_cpu;
  for (auto _ : state) {
    matrix_cpu = jacobian->ToCompressedRowSparseMatrix();
    matrix = std::make_unique<CudaSparseMatrix>(context, *matrix_cpu);
  }
  CHECK(matrix != nullptr);
}
// Updating the values in a CudaBlockSparseCRSView should be roughly as fast
// as just copying the values (the time spent permuting values has to be
// hidden by the PCIe transfer).
static void JacobianToCRSViewUpdate(benchmark::State& state,
                                    BALData* data,
                                    ContextImpl* context) {
  auto jacobian = data->BlockSparseJacobian(context);

  auto matrix = CudaBlockSparseCRSView(*jacobian, context);
  for (auto _ : state) {
    matrix.UpdateValues(*jacobian);
  }
}
static void JacobianToCRSMatrixUpdate(benchmark::State& state,
                                      BALData* data,
                                      ContextImpl* context) {
  auto jacobian = data->BlockSparseJacobian(context);

  auto matrix_cpu = jacobian->ToCompressedRowSparseMatrix();
  auto matrix = std::make_unique<CudaSparseMatrix>(context, *matrix_cpu);
  for (auto _ : state) {
    CHECK_EQ(cudaSuccess,
             cudaMemcpy(matrix->mutable_values(),
                        matrix_cpu->values(),
                        matrix->num_nonzeros() * sizeof(double),
                        cudaMemcpyHostToDevice));
  }
}
#endif

static void JacobianSquaredColumnNorm(benchmark::State& state,
                                      BALData* data,
                                      ContextImpl* context) {
  const int num_threads = static_cast<int>(state.range(0));

  auto jacobian = data->BlockSparseJacobian(context);

  Vector x = Vector::Zero(jacobian->num_cols());

  for (auto _ : state) {
    jacobian->SquaredColumnNorm(x.data(), context, num_threads);
  }
  CHECK_GT(x.squaredNorm(), 0.);
}

static void JacobianScaleColumns(benchmark::State& state,
                                 BALData* data,
                                 ContextImpl* context) {
  const int num_threads = static_cast<int>(state.range(0));

  auto jacobian_const = data->BlockSparseJacobian(context);
  auto jacobian = const_cast<BlockSparseMatrix*>(jacobian_const);
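  // The cached jacobian is handed out as const; ScaleColumns mutates the
  // values in place, hence the const_cast (harmless here, since the values
  // are random anyway).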

  Vector x = Vector::Ones(jacobian->num_cols());

  for (auto _ : state) {
    jacobian->ScaleColumns(x.data(), context, num_threads);
  }
}

static void JacobianRightMultiplyAndAccumulate(benchmark::State& state,
                                               BALData* data,
                                               ContextImpl* context) {
  const int num_threads = static_cast<int>(state.range(0));

  auto jacobian = data->BlockSparseJacobian(context);

  Vector y = Vector::Zero(jacobian->num_rows());
  Vector x = Vector::Random(jacobian->num_cols());

  for (auto _ : state) {
    jacobian->RightMultiplyAndAccumulate(
        x.data(), y.data(), context, num_threads);
  }
  CHECK_GT(y.squaredNorm(), 0.);
}

static void JacobianLeftMultiplyAndAccumulate(benchmark::State& state,
                                              BALData* data,
                                              ContextImpl* context) {
  const int num_threads = static_cast<int>(state.range(0));

  auto jacobian = data->BlockSparseJacobian(context);

  Vector y = Vector::Zero(jacobian->num_cols());
  Vector x = Vector::Random(jacobian->num_rows());

  for (auto _ : state) {
    jacobian->LeftMultiplyAndAccumulate(
        x.data(), y.data(), context, num_threads);
  }
  CHECK_GT(y.squaredNorm(), 0.);
}

#ifndef CERES_NO_CUDA
static void JacobianRightMultiplyAndAccumulateCuda(benchmark::State& state,
                                                   BALData* data,
                                                   ContextImpl* context) {
  auto crs_jacobian = data->CompressedRowSparseJacobian(context);
  CudaSparseMatrix cuda_jacobian(context, *crs_jacobian);
  CudaVector cuda_x(context, 0);
  CudaVector cuda_y(context, 0);

  Vector x(crs_jacobian->num_cols());
  Vector y(crs_jacobian->num_rows());
  x.setRandom();
  y.setRandom();

  cuda_x.CopyFromCpu(x);
  cuda_y.CopyFromCpu(y);
  double sum = 0;
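  // Accumulating the norm and synchronizing on each iteration keeps the
  // asynchronous kernel launch inside the timed region.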
  for (auto _ : state) {
    cuda_jacobian.RightMultiplyAndAccumulate(cuda_x, &cuda_y);
    sum += cuda_y.Norm();
    CHECK_EQ(cudaDeviceSynchronize(), cudaSuccess);
  }
  CHECK_NE(sum, 0.0);
}

static void JacobianLeftMultiplyAndAccumulateCuda(benchmark::State& state,
                                                  BALData* data,
                                                  ContextImpl* context) {
  auto crs_jacobian = data->CompressedRowSparseJacobian(context);
  CudaSparseMatrix cuda_jacobian(context, *crs_jacobian);
  CudaVector cuda_x(context, 0);
  CudaVector cuda_y(context, 0);

  Vector x(crs_jacobian->num_rows());
  Vector y(crs_jacobian->num_cols());
  x.setRandom();
  y.setRandom();

  cuda_x.CopyFromCpu(x);
  cuda_y.CopyFromCpu(y);
  double sum = 0;
  for (auto _ : state) {
    cuda_jacobian.LeftMultiplyAndAccumulate(cuda_x, &cuda_y);
    sum += cuda_y.Norm();
    CHECK_EQ(cudaDeviceSynchronize(), cudaSuccess);
  }
  CHECK_NE(sum, 0.0);
}
#endif

}  // namespace ceres::internal

// Older versions of the benchmark library might come without the
// ::benchmark::Shutdown function. We provide an empty fallback variant of
// Shutdown in order to support both older and newer versions.
namespace benchmark_shutdown_fallback {
template <typename... Args>
void Shutdown(Args... args) {}
};  // namespace benchmark_shutdown_fallback

int main(int argc, char** argv) {
  ::benchmark::Initialize(&argc, argv);

  std::vector<std::unique_ptr<ceres::internal::BALData>> benchmark_data;
  if (argc == 1) {
    LOG(FATAL) << "No input datasets specified. Usage: " << argv[0]
               << " [benchmark flags] path_to_BAL_data_1.txt ... "
                  "path_to_BAL_data_N.txt";
    return -1;
  }

  ceres::internal::ContextImpl context;
  context.EnsureMinimumThreads(16);
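  // 16 worker threads match the largest ->Arg() value registered below.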
#ifndef CERES_NO_CUDA
  std::string message;
  context.InitCuda(&message);
#endif

  for (int i = 1; i < argc; ++i) {
    const std::string path(argv[i]);
    const std::string name_residuals = "Residuals<" + path + ">";
    benchmark_data.emplace_back(
        std::make_unique<ceres::internal::BALData>(path));
    auto data = benchmark_data.back().get();
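    // Each ->Arg(n) below supplies the thread count that the benchmark reads
    // via state.range(0).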
    ::benchmark::RegisterBenchmark(
        name_residuals.c_str(), ceres::internal::Residuals, data, &context)
        ->Arg(1)
        ->Arg(2)
        ->Arg(4)
        ->Arg(8)
        ->Arg(16);

    const std::string name_jacobians = "ResidualsAndJacobian<" + path + ">";
    ::benchmark::RegisterBenchmark(name_jacobians.c_str(),
                                   ceres::internal::ResidualsAndJacobian,
                                   data,
                                   &context)
        ->Arg(1)
        ->Arg(2)
        ->Arg(4)
        ->Arg(8)
        ->Arg(16);

    const std::string name_plus = "Plus<" + path + ">";
    ::benchmark::RegisterBenchmark(
        name_plus.c_str(), ceres::internal::Plus, data, &context)
        ->Arg(1)
        ->Arg(2)
        ->Arg(4)
        ->Arg(8)
        ->Arg(16);

    const std::string name_right_product =
        "JacobianRightMultiplyAndAccumulate<" + path + ">";
    ::benchmark::RegisterBenchmark(
        name_right_product.c_str(),
        ceres::internal::JacobianRightMultiplyAndAccumulate,
        data,
        &context)
        ->Arg(1)
        ->Arg(2)
        ->Arg(4)
        ->Arg(8)
        ->Arg(16);

    const std::string name_right_product_partitioned_f =
        "PMVRightMultiplyAndAccumulateF<" + path + ">";
    ::benchmark::RegisterBenchmark(
        name_right_product_partitioned_f.c_str(),
        ceres::internal::PMVRightMultiplyAndAccumulateF,
        data,
        &context)
        ->Arg(1)
        ->Arg(2)
        ->Arg(4)
        ->Arg(8)
        ->Arg(16);

#ifndef CERES_NO_CUDA
    const std::string name_right_product_partitioned_f_cuda =
        "PMVRightMultiplyAndAccumulateFCuda<" + path + ">";
    ::benchmark::RegisterBenchmark(
        name_right_product_partitioned_f_cuda.c_str(),
        ceres::internal::PMVRightMultiplyAndAccumulateFCuda,
        data,
        &context);
#endif

    const std::string name_right_product_partitioned_e =
        "PMVRightMultiplyAndAccumulateE<" + path + ">";
    ::benchmark::RegisterBenchmark(
        name_right_product_partitioned_e.c_str(),
        ceres::internal::PMVRightMultiplyAndAccumulateE,
        data,
        &context)
        ->Arg(1)
        ->Arg(2)
        ->Arg(4)
        ->Arg(8)
        ->Arg(16);

#ifndef CERES_NO_CUDA
    const std::string name_right_product_partitioned_e_cuda =
        "PMVRightMultiplyAndAccumulateECuda<" + path + ">";
    ::benchmark::RegisterBenchmark(
        name_right_product_partitioned_e_cuda.c_str(),
        ceres::internal::PMVRightMultiplyAndAccumulateECuda,
        data,
        &context);
#endif

    const std::string name_update_block_diagonal_ftf =
        "PMVUpdateBlockDiagonalFtF<" + path + ">";
    ::benchmark::RegisterBenchmark(name_update_block_diagonal_ftf.c_str(),
                                   ceres::internal::PMVUpdateBlockDiagonalFtF,
                                   data,
                                   &context)
        ->Arg(1)
        ->Arg(2)
        ->Arg(4)
        ->Arg(8)
        ->Arg(16);

    const std::string name_pse =
        "PSEPreconditionerRightMultiplyAndAccumulate<" + path + ">";
    ::benchmark::RegisterBenchmark(
        name_pse.c_str(), ceres::internal::PSEPreconditioner, data, &context)
        ->Arg(1)
        ->Arg(2)
        ->Arg(4)
        ->Arg(8)
        ->Arg(16);

    const std::string name_isc_no_diag =
        "ISCRightMultiplyAndAccumulate<" + path + ">";
    ::benchmark::RegisterBenchmark(name_isc_no_diag.c_str(),
                                   ceres::internal::ISCRightMultiplyNoDiag,
                                   data,
                                   &context)
        ->Arg(1)
        ->Arg(2)
        ->Arg(4)
        ->Arg(8)
        ->Arg(16);

    const std::string name_update_block_diagonal_ete =
        "PMVUpdateBlockDiagonalEtE<" + path + ">";
    ::benchmark::RegisterBenchmark(name_update_block_diagonal_ete.c_str(),
                                   ceres::internal::PMVUpdateBlockDiagonalEtE,
                                   data,
                                   &context)
        ->Arg(1)
        ->Arg(2)
        ->Arg(4)
        ->Arg(8)
        ->Arg(16);
    const std::string name_isc_diag =
        "ISCRightMultiplyAndAccumulateDiag<" + path + ">";
    ::benchmark::RegisterBenchmark(name_isc_diag.c_str(),
                                   ceres::internal::ISCRightMultiplyDiag,
                                   data,
                                   &context)
        ->Arg(1)
        ->Arg(2)
        ->Arg(4)
        ->Arg(8)
        ->Arg(16);

#ifndef CERES_NO_CUDA
    const std::string name_right_product_cuda =
        "JacobianRightMultiplyAndAccumulateCuda<" + path + ">";
    ::benchmark::RegisterBenchmark(
        name_right_product_cuda.c_str(),
        ceres::internal::JacobianRightMultiplyAndAccumulateCuda,
        data,
        &context)
        ->Arg(1);
#endif

    const std::string name_left_product =
        "JacobianLeftMultiplyAndAccumulate<" + path + ">";
    ::benchmark::RegisterBenchmark(
        name_left_product.c_str(),
        ceres::internal::JacobianLeftMultiplyAndAccumulate,
        data,
        &context)
        ->Arg(1)
        ->Arg(2)
        ->Arg(4)
        ->Arg(8)
        ->Arg(16);

    const std::string name_left_product_partitioned_f =
        "PMVLeftMultiplyAndAccumulateF<" + path + ">";
    ::benchmark::RegisterBenchmark(
        name_left_product_partitioned_f.c_str(),
        ceres::internal::PMVLeftMultiplyAndAccumulateF,
        data,
        &context)
        ->Arg(1)
        ->Arg(2)
        ->Arg(4)
        ->Arg(8)
        ->Arg(16);

#ifndef CERES_NO_CUDA
    const std::string name_left_product_partitioned_f_cuda =
        "PMVLeftMultiplyAndAccumulateFCuda<" + path + ">";
    ::benchmark::RegisterBenchmark(
        name_left_product_partitioned_f_cuda.c_str(),
        ceres::internal::PMVLeftMultiplyAndAccumulateFCuda,
        data,
        &context);
#endif

    const std::string name_left_product_partitioned_e =
        "PMVLeftMultiplyAndAccumulateE<" + path + ">";
    ::benchmark::RegisterBenchmark(
        name_left_product_partitioned_e.c_str(),
        ceres::internal::PMVLeftMultiplyAndAccumulateE,
        data,
        &context)
        ->Arg(1)
        ->Arg(2)
        ->Arg(4)
        ->Arg(8)
        ->Arg(16);

#ifndef CERES_NO_CUDA
    const std::string name_left_product_partitioned_e_cuda =
        "PMVLeftMultiplyAndAccumulateECuda<" + path + ">";
    ::benchmark::RegisterBenchmark(
        name_left_product_partitioned_e_cuda.c_str(),
        ceres::internal::PMVLeftMultiplyAndAccumulateECuda,
        data,
        &context);
#endif

#ifndef CERES_NO_CUDA
    const std::string name_left_product_cuda =
        "JacobianLeftMultiplyAndAccumulateCuda<" + path + ">";
    ::benchmark::RegisterBenchmark(
        name_left_product_cuda.c_str(),
        ceres::internal::JacobianLeftMultiplyAndAccumulateCuda,
        data,
        &context)
        ->Arg(1);
#endif

    const std::string name_squared_column_norm =
        "JacobianSquaredColumnNorm<" + path + ">";
    ::benchmark::RegisterBenchmark(name_squared_column_norm.c_str(),
                                   ceres::internal::JacobianSquaredColumnNorm,
                                   data,
                                   &context)
        ->Arg(1)
        ->Arg(2)
        ->Arg(4)
        ->Arg(8)
        ->Arg(16);

    const std::string name_scale_columns = "JacobianScaleColumns<" + path + ">";
    ::benchmark::RegisterBenchmark(name_scale_columns.c_str(),
                                   ceres::internal::JacobianScaleColumns,
                                   data,
                                   &context)
        ->Arg(1)
        ->Arg(2)
        ->Arg(4)
        ->Arg(8)
        ->Arg(16);

    const std::string name_to_crs = "JacobianToCRS<" + path + ">";
    ::benchmark::RegisterBenchmark(
        name_to_crs.c_str(), ceres::internal::JacobianToCRS, data, &context);
#ifndef CERES_NO_CUDA
    const std::string name_to_crs_view = "JacobianToCRSView<" + path + ">";
    ::benchmark::RegisterBenchmark(name_to_crs_view.c_str(),
                                   ceres::internal::JacobianToCRSView,
                                   data,
                                   &context);
    const std::string name_to_crs_matrix = "JacobianToCRSMatrix<" + path + ">";
    ::benchmark::RegisterBenchmark(name_to_crs_matrix.c_str(),
                                   ceres::internal::JacobianToCRSMatrix,
                                   data,
                                   &context);
    const std::string name_to_crs_view_update =
        "JacobianToCRSViewUpdate<" + path + ">";
    ::benchmark::RegisterBenchmark(name_to_crs_view_update.c_str(),
                                   ceres::internal::JacobianToCRSViewUpdate,
                                   data,
                                   &context);
    const std::string name_to_crs_matrix_update =
        "JacobianToCRSMatrixUpdate<" + path + ">";
    ::benchmark::RegisterBenchmark(name_to_crs_matrix_update.c_str(),
                                   ceres::internal::JacobianToCRSMatrixUpdate,
                                   data,
                                   &context);
#endif
  }
  ::benchmark::RunSpecifiedBenchmarks();

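  // Unqualified lookup finds both ::benchmark::Shutdown (when the library
  // provides it) and the variadic fallback above; overload resolution prefers
  // the non-template function, so the real Shutdown is called if present.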
  using namespace ::benchmark;
  using namespace benchmark_shutdown_fallback;
  Shutdown();
  return 0;
}