Blame - internal/ceres/cuda_sparse_matrix.cc - RealtimeRoboticsGroup/test

blob: 33685a400eb852505f424b219fa76729618f7bbf [file] [log] [blame]

Austin Schuh	3de38b0	2024-06-25 18:25:10 -0700	[diff] [blame^]	1	// Ceres Solver - A fast non-linear least squares minimizer
				2	// Copyright 2023 Google Inc. All rights reserved.
				3	// http://ceres-solver.org/
				4	//
				5	// Redistribution and use in source and binary forms, with or without
				6	// modification, are permitted provided that the following conditions are met:
				7	//
				8	// * Redistributions of source code must retain the above copyright notice,
				9	// this list of conditions and the following disclaimer.
				10	// * Redistributions in binary form must reproduce the above copyright notice,
				11	// this list of conditions and the following disclaimer in the documentation
				12	// and/or other materials provided with the distribution.
				13	// * Neither the name of Google Inc. nor the names of its contributors may be
				14	// used to endorse or promote products derived from this software without
				15	// specific prior written permission.
				16	//
				17	// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
				18	// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
				19	// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
				20	// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
				21	// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
				22	// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
				23	// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
				24	// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
				25	// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
				26	// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
				27	// POSSIBILITY OF SUCH DAMAGE.
				28	//
				29	// Author: joydeepb@cs.utexas.edu (Joydeep Biswas)
				30	//
				31	// A CUDA sparse matrix linear operator.
				32
				33	// This include must come before any #ifndef check on Ceres compile options.
				34	// clang-format off
				35	#include "ceres/internal/config.h"
				36	// clang-format on
				37
				38	#include "ceres/cuda_sparse_matrix.h"
				39
				40	#include <math.h>
				41
				42	#include <memory>
				43
				44	#include "ceres/block_sparse_matrix.h"
				45	#include "ceres/compressed_row_sparse_matrix.h"
				46	#include "ceres/context_impl.h"
				47	#include "ceres/crs_matrix.h"
				48	#include "ceres/internal/export.h"
				49	#include "ceres/types.h"
				50	#include "ceres/wall_time.h"
				51
				52	#ifndef CERES_NO_CUDA
				53
				54	#include "ceres/cuda_buffer.h"
				55	#include "ceres/cuda_kernels_vector_ops.h"
				56	#include "ceres/cuda_vector.h"
				57	#include "cuda_runtime_api.h"
				58	#include "cusparse.h"
				59
				60	namespace ceres::internal {
				61	namespace {
				62	// Starting in CUDA 11.2.1, CUSPARSE_MV_ALG_DEFAULT was deprecated in favor of
				63	// CUSPARSE_SPMV_ALG_DEFAULT.
				64	#if CUDART_VERSION >= 11021
				65	const auto kSpMVAlgorithm = CUSPARSE_SPMV_ALG_DEFAULT;
				66	#else // CUDART_VERSION >= 11021
				67	const auto kSpMVAlgorithm = CUSPARSE_MV_ALG_DEFAULT;
				68	#endif // CUDART_VERSION >= 11021
				69	size_t GetTempBufferSizeForOp(const cusparseHandle_t& handle,
				70	const cusparseOperation_t op,
				71	const cusparseDnVecDescr_t& x,
				72	const cusparseDnVecDescr_t& y,
				73	const cusparseSpMatDescr_t& A) {
				74	size_t buffer_size;
				75	const double alpha = 1.0;
				76	const double beta = 1.0;
				77	CHECK_NE(A, nullptr);
				78	CHECK_EQ(cusparseSpMV_bufferSize(handle,
				79	op,
				80	&alpha,
				81	A,
				82	x,
				83	&beta,
				84	y,
				85	CUDA_R_64F,
				86	kSpMVAlgorithm,
				87	&buffer_size),
				88	CUSPARSE_STATUS_SUCCESS);
				89	return buffer_size;
				90	}
				91
				92	size_t GetTempBufferSize(const cusparseHandle_t& handle,
				93	const cusparseDnVecDescr_t& left,
				94	const cusparseDnVecDescr_t& right,
				95	const cusparseSpMatDescr_t& A) {
				96	CHECK_NE(A, nullptr);
				97	return std::max(GetTempBufferSizeForOp(
				98	handle, CUSPARSE_OPERATION_NON_TRANSPOSE, right, left, A),
				99	GetTempBufferSizeForOp(
				100	handle, CUSPARSE_OPERATION_TRANSPOSE, left, right, A));
				101	}
				102	} // namespace
				103
				104	CudaSparseMatrix::CudaSparseMatrix(int num_cols,
				105	CudaBuffer<int32_t>&& rows,
				106	CudaBuffer<int32_t>&& cols,
				107	ContextImpl* context)
				108	: num_rows_(rows.size() - 1),
				109	num_cols_(num_cols),
				110	num_nonzeros_(cols.size()),
				111	context_(context),
				112	rows_(std::move(rows)),
				113	cols_(std::move(cols)),
				114	values_(context, num_nonzeros_),
				115	spmv_buffer_(context) {
				116	Initialize();
				117	}
				118
				119	CudaSparseMatrix::CudaSparseMatrix(ContextImpl* context,
				120	const CompressedRowSparseMatrix& crs_matrix)
				121	: num_rows_(crs_matrix.num_rows()),
				122	num_cols_(crs_matrix.num_cols()),
				123	num_nonzeros_(crs_matrix.num_nonzeros()),
				124	context_(context),
				125	rows_(context, num_rows_ + 1),
				126	cols_(context, num_nonzeros_),
				127	values_(context, num_nonzeros_),
				128	spmv_buffer_(context) {
				129	rows_.CopyFromCpu(crs_matrix.rows(), num_rows_ + 1);
				130	cols_.CopyFromCpu(crs_matrix.cols(), num_nonzeros_);
				131	values_.CopyFromCpu(crs_matrix.values(), num_nonzeros_);
				132	Initialize();
				133	}
				134
				135	CudaSparseMatrix::~CudaSparseMatrix() {
				136	CHECK_EQ(cusparseDestroySpMat(descr_), CUSPARSE_STATUS_SUCCESS);
				137	descr_ = nullptr;
				138	CHECK_EQ(CUSPARSE_STATUS_SUCCESS, cusparseDestroyDnVec(descr_vec_left_));
				139	CHECK_EQ(CUSPARSE_STATUS_SUCCESS, cusparseDestroyDnVec(descr_vec_right_));
				140	}
				141
				142	void CudaSparseMatrix::CopyValuesFromCpu(
				143	const CompressedRowSparseMatrix& crs_matrix) {
				144	// There is no quick and easy way to verify that the structure is unchanged,
				145	// but at least we can check that the size of the matrix and the number of
				146	// nonzeros is unchanged.
				147	CHECK_EQ(num_rows_, crs_matrix.num_rows());
				148	CHECK_EQ(num_cols_, crs_matrix.num_cols());
				149	CHECK_EQ(num_nonzeros_, crs_matrix.num_nonzeros());
				150	values_.CopyFromCpu(crs_matrix.values(), num_nonzeros_);
				151	}
				152
				153	void CudaSparseMatrix::Initialize() {
				154	CHECK(context_->IsCudaInitialized());
				155	CHECK_EQ(CUSPARSE_STATUS_SUCCESS,
				156	cusparseCreateCsr(&descr_,
				157	num_rows_,
				158	num_cols_,
				159	num_nonzeros_,
				160	rows_.data(),
				161	cols_.data(),
				162	values_.data(),
				163	CUSPARSE_INDEX_32I,
				164	CUSPARSE_INDEX_32I,
				165	CUSPARSE_INDEX_BASE_ZERO,
				166	CUDA_R_64F));
				167
				168	// Note: values_.data() is used as non-zero pointer to device memory
				169	// When there is no non-zero values, data-pointer of values_ array will be a
				170	// nullptr; but in this case left/right products are trivial and temporary
				171	// buffer (and vector descriptors) is not required
				172	if (!num_nonzeros_) return;
				173
				174	CHECK_EQ(CUSPARSE_STATUS_SUCCESS,
				175	cusparseCreateDnVec(
				176	&descr_vec_left_, num_rows_, values_.data(), CUDA_R_64F));
				177	CHECK_EQ(CUSPARSE_STATUS_SUCCESS,
				178	cusparseCreateDnVec(
				179	&descr_vec_right_, num_cols_, values_.data(), CUDA_R_64F));
				180	size_t buffer_size = GetTempBufferSize(
				181	context_->cusparse_handle_, descr_vec_left_, descr_vec_right_, descr_);
				182	spmv_buffer_.Reserve(buffer_size);
				183	}
				184
				185	void CudaSparseMatrix::SpMv(cusparseOperation_t op,
				186	const cusparseDnVecDescr_t& x,
				187	const cusparseDnVecDescr_t& y) const {
				188	const double alpha = 1.0;
				189	const double beta = 1.0;
				190
				191	CHECK_EQ(cusparseSpMV(context_->cusparse_handle_,
				192	op,
				193	&alpha,
				194	descr_,
				195	x,
				196	&beta,
				197	y,
				198	CUDA_R_64F,
				199	kSpMVAlgorithm,
				200	spmv_buffer_.data()),
				201	CUSPARSE_STATUS_SUCCESS);
				202	}
				203
				204	void CudaSparseMatrix::RightMultiplyAndAccumulate(const CudaVector& x,
				205	CudaVector* y) const {
				206	DCHECK(GetTempBufferSize(
				207	context_->cusparse_handle_, y->descr(), x.descr(), descr_) <=
				208	spmv_buffer_.size());
				209	SpMv(CUSPARSE_OPERATION_NON_TRANSPOSE, x.descr(), y->descr());
				210	}
				211
				212	void CudaSparseMatrix::LeftMultiplyAndAccumulate(const CudaVector& x,
				213	CudaVector* y) const {
				214	// TODO(Joydeep Biswas): We should consider storing a transposed copy of the
				215	// matrix by converting CSR to CSC. From the cuSPARSE documentation:
				216	// "In general, opA == CUSPARSE_OPERATION_NON_TRANSPOSE is 3x faster than opA
				217	// != CUSPARSE_OPERATION_NON_TRANSPOSE"
				218	DCHECK(GetTempBufferSize(
				219	context_->cusparse_handle_, x.descr(), y->descr(), descr_) <=
				220	spmv_buffer_.size());
				221	SpMv(CUSPARSE_OPERATION_TRANSPOSE, x.descr(), y->descr());
				222	}
				223
				224	} // namespace ceres::internal
				225
				226	#endif // CERES_NO_CUDA