Blame - internal/ceres/cuda_block_structure.cc - RealtimeRoboticsGroup/test

blob: 3685775b60f381ef4df114deb3e8cc69e1ef50e9 [file] [log] [blame]

Austin Schuh	3de38b0	2024-06-25 18:25:10 -0700	[diff] [blame^]	1	// Ceres Solver - A fast non-linear least squares minimizer
				2	// Copyright 2023 Google Inc. All rights reserved.
				3	// http://ceres-solver.org/
				4	//
				5	// Redistribution and use in source and binary forms, with or without
				6	// modification, are permitted provided that the following conditions are met:
				7	//
				8	// * Redistributions of source code must retain the above copyright notice,
				9	// this list of conditions and the following disclaimer.
				10	// * Redistributions in binary form must reproduce the above copyright notice,
				11	// this list of conditions and the following disclaimer in the documentation
				12	// and/or other materials provided with the distribution.
				13	// * Neither the name of Google Inc. nor the names of its contributors may be
				14	// used to endorse or promote products derived from this software without
				15	// specific prior written permission.
				16	//
				17	// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
				18	// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
				19	// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
				20	// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
				21	// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
				22	// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
				23	// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
				24	// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
				25	// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
				26	// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
				27	// POSSIBILITY OF SUCH DAMAGE.
				28	//
				29	// Authors: dmitriy.korchemkin@gmail.com (Dmitriy Korchemkin)
				30
				31	#include "ceres/cuda_block_structure.h"
				32
				33	#ifndef CERES_NO_CUDA
				34
				35	namespace ceres::internal {
				36	namespace {
				37	// Dimension of a sorted array of blocks
				38	inline int Dimension(const std::vector<Block>& blocks) {
				39	if (blocks.empty()) {
				40	return 0;
				41	}
				42	const auto& last = blocks.back();
				43	return last.size + last.position;
				44	}
				45	} // namespace
				46	CudaBlockSparseStructure::CudaBlockSparseStructure(
				47	const CompressedRowBlockStructure& block_structure, ContextImpl* context)
				48	: CudaBlockSparseStructure(block_structure, 0, context) {}
				49
				50	CudaBlockSparseStructure::CudaBlockSparseStructure(
				51	const CompressedRowBlockStructure& block_structure,
				52	const int num_col_blocks_e,
				53	ContextImpl* context)
				54	: first_cell_in_row_block_(context),
				55	value_offset_row_block_f_(context),
				56	cells_(context),
				57	row_blocks_(context),
				58	col_blocks_(context) {
				59	// Row blocks extracted from CompressedRowBlockStructure::rows
				60	std::vector<Block> row_blocks;
				61	// Column blocks can be reused as-is
				62	const auto& col_blocks = block_structure.cols;
				63
				64	// Row block offset is an index of the first cell corresponding to row block
				65	std::vector<int> first_cell_in_row_block;
				66	// Offset of the first value in the first non-empty row-block of F sub-matrix
				67	std::vector<int> value_offset_row_block_f;
				68	// Flat array of all cells from all row-blocks
				69	std::vector<Cell> cells;
				70
				71	int f_values_offset = -1;
				72	num_nonzeros_e_ = 0;
				73	is_crs_compatible_ = true;
				74	num_row_blocks_ = block_structure.rows.size();
				75	num_col_blocks_ = col_blocks.size();
				76
				77	row_blocks.reserve(num_row_blocks_);
				78	first_cell_in_row_block.reserve(num_row_blocks_ + 1);
				79	value_offset_row_block_f.reserve(num_row_blocks_ + 1);
				80	num_nonzeros_ = 0;
				81	// Block-sparse matrices arising from block-jacobian writer are expected to
				82	// have sequential layout (for partitioned matrices - it is expected that both
				83	// E and F sub-matrices have sequential layout).
				84	bool sequential_layout = true;
				85	int row_block_id = 0;
				86	num_row_blocks_e_ = 0;
				87	for (; row_block_id < num_row_blocks_; ++row_block_id) {
				88	const auto& r = block_structure.rows[row_block_id];
				89	const int row_block_size = r.block.size;
				90	const int num_cells = r.cells.size();
				91
				92	if (num_col_blocks_e == 0 \|\| r.cells.size() == 0 \|\|
				93	r.cells[0].block_id >= num_col_blocks_e) {
				94	break;
				95	}
				96	num_row_blocks_e_ = row_block_id + 1;
				97	// In E sub-matrix there is exactly a single E cell in the row
				98	// since E cells are stored separately from F cells, crs-compatiblity of
				99	// F sub-matrix only breaks if there are more than 2 cells in row (that
				100	// is, more than 1 cell in F sub-matrix)
				101	if (num_cells > 2 && row_block_size > 1) {
				102	is_crs_compatible_ = false;
				103	}
				104	row_blocks.emplace_back(r.block);
				105	first_cell_in_row_block.push_back(cells.size());
				106
				107	for (int cell_id = 0; cell_id < num_cells; ++cell_id) {
				108	const auto& c = r.cells[cell_id];
				109	const int col_block_size = col_blocks[c.block_id].size;
				110	const int cell_size = col_block_size * row_block_size;
				111	cells.push_back(c);
				112	if (cell_id == 0) {
				113	DCHECK(c.position == num_nonzeros_e_);
				114	num_nonzeros_e_ += cell_size;
				115	} else {
				116	if (f_values_offset == -1) {
				117	num_nonzeros_ = c.position;
				118	f_values_offset = c.position;
				119	}
				120	sequential_layout &= c.position == num_nonzeros_;
				121	num_nonzeros_ += cell_size;
				122	if (cell_id == 1) {
				123	// Correct value_offset_row_block_f for empty row-blocks of F
				124	// preceding this one
				125	for (auto it = value_offset_row_block_f.rbegin();
				126	it != value_offset_row_block_f.rend();
				127	++it) {
				128	if (*it != -1) break;
				129	*it = c.position;
				130	}
				131	value_offset_row_block_f.push_back(c.position);
				132	}
				133	}
				134	}
				135	if (num_cells == 1) {
				136	value_offset_row_block_f.push_back(-1);
				137	}
				138	}
				139	for (; row_block_id < num_row_blocks_; ++row_block_id) {
				140	const auto& r = block_structure.rows[row_block_id];
				141	const int row_block_size = r.block.size;
				142	const int num_cells = r.cells.size();
				143	// After num_row_blocks_e_ row-blocks, there should be no cells in E
				144	// sub-matrix. Thus crs-compatibility of F sub-matrix breaks if there are
				145	// more than one cells in the row-block
				146	if (num_cells > 1 && row_block_size > 1) {
				147	is_crs_compatible_ = false;
				148	}
				149	row_blocks.emplace_back(r.block);
				150	first_cell_in_row_block.push_back(cells.size());
				151
				152	if (r.cells.empty()) {
				153	value_offset_row_block_f.push_back(-1);
				154	} else {
				155	for (auto it = value_offset_row_block_f.rbegin();
				156	it != value_offset_row_block_f.rend();
				157	--it) {
				158	if (*it != -1) break;
				159	*it = cells[0].position;
				160	}
				161	value_offset_row_block_f.push_back(r.cells[0].position);
				162	}
				163	for (const auto& c : r.cells) {
				164	const int col_block_size = col_blocks[c.block_id].size;
				165	const int cell_size = col_block_size * row_block_size;
				166	cells.push_back(c);
				167	DCHECK(c.block_id >= num_col_blocks_e);
				168	if (f_values_offset == -1) {
				169	num_nonzeros_ = c.position;
				170	f_values_offset = c.position;
				171	}
				172	sequential_layout &= c.position == num_nonzeros_;
				173	num_nonzeros_ += cell_size;
				174	}
				175	}
				176
				177	if (f_values_offset == -1) {
				178	f_values_offset = num_nonzeros_e_;
				179	num_nonzeros_ = num_nonzeros_e_;
				180	}
				181	// Fill non-zero offsets for the last rows of F submatrix
				182	for (auto it = value_offset_row_block_f.rbegin();
				183	it != value_offset_row_block_f.rend();
				184	++it) {
				185	if (*it != -1) break;
				186	*it = num_nonzeros_;
				187	}
				188	value_offset_row_block_f.push_back(num_nonzeros_);
				189	CHECK_EQ(num_nonzeros_e_, f_values_offset);
				190	first_cell_in_row_block.push_back(cells.size());
				191	num_cells_ = cells.size();
				192
				193	num_rows_ = Dimension(row_blocks);
				194	num_cols_ = Dimension(col_blocks);
				195
				196	CHECK(sequential_layout);
				197
				198	if (VLOG_IS_ON(3)) {
				199	const size_t first_cell_in_row_block_size =
				200	first_cell_in_row_block.size() * sizeof(int);
				201	const size_t cells_size = cells.size() * sizeof(Cell);
				202	const size_t row_blocks_size = row_blocks.size() * sizeof(Block);
				203	const size_t col_blocks_size = col_blocks.size() * sizeof(Block);
				204	const size_t total_size = first_cell_in_row_block_size + cells_size +
				205	col_blocks_size + row_blocks_size;
				206	const double ratio =
				207	(100. * total_size) / (num_nonzeros_ * (sizeof(int) + sizeof(double)) +
				208	num_rows_ * sizeof(int));
				209	VLOG(3) << "\nCudaBlockSparseStructure:\n"
				210	"\tRow block offsets: "
				211	<< first_cell_in_row_block_size
				212	<< " bytes\n"
				213	"\tColumn blocks: "
				214	<< col_blocks_size
				215	<< " bytes\n"
				216	"\tRow blocks: "
				217	<< row_blocks_size
				218	<< " bytes\n"
				219	"\tCells: "
				220	<< cells_size << " bytes\n\tTotal: " << total_size
				221	<< " bytes of GPU memory (" << ratio << "% of CRS matrix size)";
				222	}
				223
				224	first_cell_in_row_block_.CopyFromCpuVector(first_cell_in_row_block);
				225	cells_.CopyFromCpuVector(cells);
				226	row_blocks_.CopyFromCpuVector(row_blocks);
				227	col_blocks_.CopyFromCpuVector(col_blocks);
				228	if (num_col_blocks_e \|\| num_row_blocks_e_) {
				229	value_offset_row_block_f_.CopyFromCpuVector(value_offset_row_block_f);
				230	}
				231	}
				232	} // namespace ceres::internal
				233
				234	#endif // CERES_NO_CUDA