blob: 33685a400eb852505f424b219fa76729618f7bbf [file] [log] [blame]
Austin Schuh3de38b02024-06-25 18:25:10 -07001// Ceres Solver - A fast non-linear least squares minimizer
2// Copyright 2023 Google Inc. All rights reserved.
3// http://ceres-solver.org/
4//
5// Redistribution and use in source and binary forms, with or without
6// modification, are permitted provided that the following conditions are met:
7//
8// * Redistributions of source code must retain the above copyright notice,
9// this list of conditions and the following disclaimer.
10// * Redistributions in binary form must reproduce the above copyright notice,
11// this list of conditions and the following disclaimer in the documentation
12// and/or other materials provided with the distribution.
13// * Neither the name of Google Inc. nor the names of its contributors may be
14// used to endorse or promote products derived from this software without
15// specific prior written permission.
16//
17// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
18// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
19// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
20// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
21// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
22// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
23// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
24// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
25// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
26// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
27// POSSIBILITY OF SUCH DAMAGE.
28//
29// Author: joydeepb@cs.utexas.edu (Joydeep Biswas)
30//
31// A CUDA sparse matrix linear operator.
32
33// This include must come before any #ifndef check on Ceres compile options.
34// clang-format off
35#include "ceres/internal/config.h"
36// clang-format on
37
38#include "ceres/cuda_sparse_matrix.h"
39
40#include <math.h>
41
42#include <memory>
43
44#include "ceres/block_sparse_matrix.h"
45#include "ceres/compressed_row_sparse_matrix.h"
46#include "ceres/context_impl.h"
47#include "ceres/crs_matrix.h"
48#include "ceres/internal/export.h"
49#include "ceres/types.h"
50#include "ceres/wall_time.h"
51
52#ifndef CERES_NO_CUDA
53
54#include "ceres/cuda_buffer.h"
55#include "ceres/cuda_kernels_vector_ops.h"
56#include "ceres/cuda_vector.h"
57#include "cuda_runtime_api.h"
58#include "cusparse.h"
59
60namespace ceres::internal {
61namespace {
62// Starting in CUDA 11.2.1, CUSPARSE_MV_ALG_DEFAULT was deprecated in favor of
63// CUSPARSE_SPMV_ALG_DEFAULT.
64#if CUDART_VERSION >= 11021
65const auto kSpMVAlgorithm = CUSPARSE_SPMV_ALG_DEFAULT;
66#else // CUDART_VERSION >= 11021
67const auto kSpMVAlgorithm = CUSPARSE_MV_ALG_DEFAULT;
68#endif // CUDART_VERSION >= 11021
69size_t GetTempBufferSizeForOp(const cusparseHandle_t& handle,
70 const cusparseOperation_t op,
71 const cusparseDnVecDescr_t& x,
72 const cusparseDnVecDescr_t& y,
73 const cusparseSpMatDescr_t& A) {
74 size_t buffer_size;
75 const double alpha = 1.0;
76 const double beta = 1.0;
77 CHECK_NE(A, nullptr);
78 CHECK_EQ(cusparseSpMV_bufferSize(handle,
79 op,
80 &alpha,
81 A,
82 x,
83 &beta,
84 y,
85 CUDA_R_64F,
86 kSpMVAlgorithm,
87 &buffer_size),
88 CUSPARSE_STATUS_SUCCESS);
89 return buffer_size;
90}
91
92size_t GetTempBufferSize(const cusparseHandle_t& handle,
93 const cusparseDnVecDescr_t& left,
94 const cusparseDnVecDescr_t& right,
95 const cusparseSpMatDescr_t& A) {
96 CHECK_NE(A, nullptr);
97 return std::max(GetTempBufferSizeForOp(
98 handle, CUSPARSE_OPERATION_NON_TRANSPOSE, right, left, A),
99 GetTempBufferSizeForOp(
100 handle, CUSPARSE_OPERATION_TRANSPOSE, left, right, A));
101}
102} // namespace
103
104CudaSparseMatrix::CudaSparseMatrix(int num_cols,
105 CudaBuffer<int32_t>&& rows,
106 CudaBuffer<int32_t>&& cols,
107 ContextImpl* context)
108 : num_rows_(rows.size() - 1),
109 num_cols_(num_cols),
110 num_nonzeros_(cols.size()),
111 context_(context),
112 rows_(std::move(rows)),
113 cols_(std::move(cols)),
114 values_(context, num_nonzeros_),
115 spmv_buffer_(context) {
116 Initialize();
117}
118
119CudaSparseMatrix::CudaSparseMatrix(ContextImpl* context,
120 const CompressedRowSparseMatrix& crs_matrix)
121 : num_rows_(crs_matrix.num_rows()),
122 num_cols_(crs_matrix.num_cols()),
123 num_nonzeros_(crs_matrix.num_nonzeros()),
124 context_(context),
125 rows_(context, num_rows_ + 1),
126 cols_(context, num_nonzeros_),
127 values_(context, num_nonzeros_),
128 spmv_buffer_(context) {
129 rows_.CopyFromCpu(crs_matrix.rows(), num_rows_ + 1);
130 cols_.CopyFromCpu(crs_matrix.cols(), num_nonzeros_);
131 values_.CopyFromCpu(crs_matrix.values(), num_nonzeros_);
132 Initialize();
133}
134
135CudaSparseMatrix::~CudaSparseMatrix() {
136 CHECK_EQ(cusparseDestroySpMat(descr_), CUSPARSE_STATUS_SUCCESS);
137 descr_ = nullptr;
138 CHECK_EQ(CUSPARSE_STATUS_SUCCESS, cusparseDestroyDnVec(descr_vec_left_));
139 CHECK_EQ(CUSPARSE_STATUS_SUCCESS, cusparseDestroyDnVec(descr_vec_right_));
140}
141
142void CudaSparseMatrix::CopyValuesFromCpu(
143 const CompressedRowSparseMatrix& crs_matrix) {
144 // There is no quick and easy way to verify that the structure is unchanged,
145 // but at least we can check that the size of the matrix and the number of
146 // nonzeros is unchanged.
147 CHECK_EQ(num_rows_, crs_matrix.num_rows());
148 CHECK_EQ(num_cols_, crs_matrix.num_cols());
149 CHECK_EQ(num_nonzeros_, crs_matrix.num_nonzeros());
150 values_.CopyFromCpu(crs_matrix.values(), num_nonzeros_);
151}
152
153void CudaSparseMatrix::Initialize() {
154 CHECK(context_->IsCudaInitialized());
155 CHECK_EQ(CUSPARSE_STATUS_SUCCESS,
156 cusparseCreateCsr(&descr_,
157 num_rows_,
158 num_cols_,
159 num_nonzeros_,
160 rows_.data(),
161 cols_.data(),
162 values_.data(),
163 CUSPARSE_INDEX_32I,
164 CUSPARSE_INDEX_32I,
165 CUSPARSE_INDEX_BASE_ZERO,
166 CUDA_R_64F));
167
168 // Note: values_.data() is used as non-zero pointer to device memory
169 // When there is no non-zero values, data-pointer of values_ array will be a
170 // nullptr; but in this case left/right products are trivial and temporary
171 // buffer (and vector descriptors) is not required
172 if (!num_nonzeros_) return;
173
174 CHECK_EQ(CUSPARSE_STATUS_SUCCESS,
175 cusparseCreateDnVec(
176 &descr_vec_left_, num_rows_, values_.data(), CUDA_R_64F));
177 CHECK_EQ(CUSPARSE_STATUS_SUCCESS,
178 cusparseCreateDnVec(
179 &descr_vec_right_, num_cols_, values_.data(), CUDA_R_64F));
180 size_t buffer_size = GetTempBufferSize(
181 context_->cusparse_handle_, descr_vec_left_, descr_vec_right_, descr_);
182 spmv_buffer_.Reserve(buffer_size);
183}
184
185void CudaSparseMatrix::SpMv(cusparseOperation_t op,
186 const cusparseDnVecDescr_t& x,
187 const cusparseDnVecDescr_t& y) const {
188 const double alpha = 1.0;
189 const double beta = 1.0;
190
191 CHECK_EQ(cusparseSpMV(context_->cusparse_handle_,
192 op,
193 &alpha,
194 descr_,
195 x,
196 &beta,
197 y,
198 CUDA_R_64F,
199 kSpMVAlgorithm,
200 spmv_buffer_.data()),
201 CUSPARSE_STATUS_SUCCESS);
202}
203
204void CudaSparseMatrix::RightMultiplyAndAccumulate(const CudaVector& x,
205 CudaVector* y) const {
206 DCHECK(GetTempBufferSize(
207 context_->cusparse_handle_, y->descr(), x.descr(), descr_) <=
208 spmv_buffer_.size());
209 SpMv(CUSPARSE_OPERATION_NON_TRANSPOSE, x.descr(), y->descr());
210}
211
212void CudaSparseMatrix::LeftMultiplyAndAccumulate(const CudaVector& x,
213 CudaVector* y) const {
214 // TODO(Joydeep Biswas): We should consider storing a transposed copy of the
215 // matrix by converting CSR to CSC. From the cuSPARSE documentation:
216 // "In general, opA == CUSPARSE_OPERATION_NON_TRANSPOSE is 3x faster than opA
217 // != CUSPARSE_OPERATION_NON_TRANSPOSE"
218 DCHECK(GetTempBufferSize(
219 context_->cusparse_handle_, x.descr(), y->descr(), descr_) <=
220 spmv_buffer_.size());
221 SpMv(CUSPARSE_OPERATION_TRANSPOSE, x.descr(), y->descr());
222}
223
224} // namespace ceres::internal
225
226#endif // CERES_NO_CUDA