// Ceres Solver - A fast non-linear least squares minimizer
// Copyright 2023 Google Inc. All rights reserved.
// http://ceres-solver.org/
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are met:
//
// * Redistributions of source code must retain the above copyright notice,
//   this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright notice,
//   this list of conditions and the following disclaimer in the documentation
//   and/or other materials provided with the distribution.
// * Neither the name of Google Inc. nor the names of its contributors may be
//   used to endorse or promote products derived from this software without
//   specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
// POSSIBILITY OF SUCH DAMAGE.
//
// Authors: dmitriy.korchemkin@gmail.com (Dmitriy Korchemkin)

#ifndef CERES_INTERNAL_CUDA_STREAMED_BUFFER_H_
#define CERES_INTERNAL_CUDA_STREAMED_BUFFER_H_

#include "ceres/internal/config.h"

#ifndef CERES_NO_CUDA

#include <algorithm>

#include "ceres/cuda_buffer.h"

namespace ceres::internal {

// Most contemporary CUDA devices are capable of executing kernels while
// simultaneously performing host-to-device transfers. This class copies
// batches of data to GPU memory and overlaps the processing of already-copied
// data with the transfer of the next batch (asynchronously).
// Data is copied into a fixed-size buffer on the GPU (holding at most
// max_buffer_size values), and this memory is re-used once the previous
// batch of values has been processed by the user-provided callback.
// The host-to-device copy goes through a temporary pinned buffer if required.
// Each batch of values has a size of kValuesPerBatch, except possibly the
// last one.
template <typename T>
class CERES_NO_EXPORT CudaStreamedBuffer {
 public:
  // If the hardware supports only one host-to-device copy engine, or if a
  // single host-to-device copy is able to reach peak bandwidth, two streams
  // are sufficient to reach maximum efficiency:
  // - If transferring a batch of values takes more time than processing it on
  //   the gpu, then at every moment in time one of the streams will be
  //   transferring data and the other stream will be either processing data
  //   or idle; the whole process will be bounded by the host-to-device copy.
  // - If transferring a batch of values takes less time than processing it on
  //   the gpu, then at every moment in time one of the streams will be
  //   processing data and the other stream will be either performing
  //   computations or transferring data; the whole process will be bounded by
  //   the computations.
  static constexpr int kNumBatches = 2;
  // max_buffer_size is the maximal size (in elements of type T) of the array
  // to be pre-allocated in gpu memory. The size of this array determines the
  // size of the batches of values used for simultaneous copying and
  // processing. It should be large enough to allow highly-parallel execution
  // of user kernels; making it too large increases latency.
  CudaStreamedBuffer(ContextImpl* context, const int max_buffer_size)
      : kValuesPerBatch(max_buffer_size / kNumBatches),
        context_(context),
        values_gpu_(context, kValuesPerBatch * kNumBatches) {
    static_assert(ContextImpl::kNumCudaStreams >= kNumBatches);
    CHECK_GE(max_buffer_size, kNumBatches);
    // Pre-allocate a buffer of page-locked memory for transfers from regular
    // cpu memory. Because we will only be writing into that buffer from the
    // cpu, the memory is allocated with the cudaHostAllocWriteCombined flag.
    CHECK_EQ(cudaSuccess,
             cudaHostAlloc(&values_cpu_pinned_,
                           sizeof(T) * kValuesPerBatch * kNumBatches,
                           cudaHostAllocWriteCombined));
    for (auto& e : copy_finished_) {
      CHECK_EQ(cudaSuccess,
               cudaEventCreateWithFlags(&e, cudaEventDisableTiming));
    }
  }

  CudaStreamedBuffer(const CudaStreamedBuffer&) = delete;

  ~CudaStreamedBuffer() {
    CHECK_EQ(cudaSuccess, cudaFreeHost(values_cpu_pinned_));
    for (auto& e : copy_finished_) {
      CHECK_EQ(cudaSuccess, cudaEventDestroy(e));
    }
  }

  // Transfers num_values values from the host-memory pointer 'from', calling
  //   callback(device_pointer, size_of_batch, offset_of_batch, stream_to_use)
  // after scheduling the transfer of each batch of data. The user-provided
  // callback should process the data at device_pointer only in the
  // stream_to_use stream (device_pointer will be re-used in the next
  // callback invocation with the same stream).
  //
  // The two diagrams below describe the operation in the two possible
  // scenarios, depending on whether the input data is stored in page-locked
  // memory. In this example we have max_buffer_size = 2 * K,
  // num_values = N * K and a callback that schedules a single asynchronous
  // launch of
  //   Kernel<<<..., stream_to_use>>>(device_pointer,
  //                                  size_of_batch,
  //                                  offset_of_batch)
  //
  // a. Copying from page-locked memory
  // In this case no copy on the host side is necessary, and this method just
  // schedules a sequence of interleaved memory copies and callback
  // invocations:
  //
  //    cudaStreamSynchronize(context->DefaultStream());
  //    - Iteration #0:
  //      - cudaMemcpyAsync(values_gpu_, from, K * sizeof(T), H->D, stream_0)
  //      - callback(values_gpu_, K, 0, stream_0)
  //    - Iteration #1:
  //      - cudaMemcpyAsync(values_gpu_ + K, from + K, K * sizeof(T), H->D,
  //        stream_1)
  //      - callback(values_gpu_ + K, K, K, stream_1)
  //    - Iteration #2:
  //      - cudaMemcpyAsync(values_gpu_, from + 2 * K, K * sizeof(T), H->D,
  //        stream_0)
  //      - callback(values_gpu_, K, 2 * K, stream_0)
  //    - Iteration #3:
  //      - cudaMemcpyAsync(values_gpu_ + K, from + 3 * K, K * sizeof(T), H->D,
  //        stream_1)
  //      - callback(values_gpu_ + K, K, 3 * K, stream_1)
  //    ...
  //    - Iteration #i:
  //      - cudaMemcpyAsync(values_gpu_ + (i % 2) * K, from + i * K,
  //        K * sizeof(T), H->D, stream_(i % 2))
  //      - callback(values_gpu_ + (i % 2) * K, K, i * K, stream_(i % 2))
  //    ...
  //    cudaStreamSynchronize(stream_0)
  //    cudaStreamSynchronize(stream_1)
  //
  // This sequence of calls results in the following activity on the gpu
  // (assuming that the kernel invoked by the callback takes less time than
  // the host-to-device copy):
  //  +-------------------+-------------------+
  //  | Stream #0         | Stream #1         |
  //  +-------------------+-------------------+
  //  | Copy host->device |                   |
  //  |                   |                   |
  //  |                   |                   |
  //  +-------------------+-------------------+
  //  | Kernel            | Copy host->device |
  //  +-------------------+                   |
  //  |                   |                   |
  //  +-------------------+-------------------+
  //  | Copy host->device | Kernel            |
  //  |                   +-------------------+
  //  |                   |                   |
  //  +-------------------+-------------------+
  //  | Kernel            | Copy host->device |
  //  |                  ...                  |
  //  +---------------------------------------+
  //
  // b. Copying from regular memory
  // In this case a copy from regular memory into page-locked memory is
  // required in order to get asynchronous operation. Because the pinned
  // memory on the host side is re-used, additional synchronization is
  // required. On each iteration the method performs the following actions:
  // - Wait until the previous copy operation in the stream has completed
  // - Copy a batch of values from the input array into pinned memory
  // - Asynchronously launch the host-to-device copy
  // - Set up an event for synchronization on copy completion
  // - Invoke the callback (which launches a kernel asynchronously)
  //
  // Invocations are performed with the following arguments:
  //    cudaStreamSynchronize(context->DefaultStream());
  //    - Iteration #0:
  //      - cudaEventSynchronize(copy_finished_0)
  //      - std::copy_n(from, K, values_cpu_pinned_)
  //      - cudaMemcpyAsync(values_gpu_, values_cpu_pinned_, K * sizeof(T),
  //        H->D, stream_0)
  //      - cudaEventRecord(copy_finished_0, stream_0)
  //      - callback(values_gpu_, K, 0, stream_0)
  //    - Iteration #1:
  //      - cudaEventSynchronize(copy_finished_1)
  //      - std::copy_n(from + K, K, values_cpu_pinned_ + K)
  //      - cudaMemcpyAsync(values_gpu_ + K, values_cpu_pinned_ + K,
  //        K * sizeof(T), H->D, stream_1)
  //      - cudaEventRecord(copy_finished_1, stream_1)
  //      - callback(values_gpu_ + K, K, K, stream_1)
  //    - Iteration #2:
  //      - cudaEventSynchronize(copy_finished_0)
  //      - std::copy_n(from + 2 * K, K, values_cpu_pinned_)
  //      - cudaMemcpyAsync(values_gpu_, values_cpu_pinned_, K * sizeof(T),
  //        H->D, stream_0)
  //      - cudaEventRecord(copy_finished_0, stream_0)
  //      - callback(values_gpu_, K, 2 * K, stream_0)
  //    - Iteration #3:
  //      - cudaEventSynchronize(copy_finished_1)
  //      - std::copy_n(from + 3 * K, K, values_cpu_pinned_ + K)
  //      - cudaMemcpyAsync(values_gpu_ + K, values_cpu_pinned_ + K,
  //        K * sizeof(T), H->D, stream_1)
  //      - cudaEventRecord(copy_finished_1, stream_1)
  //      - callback(values_gpu_ + K, K, 3 * K, stream_1)
  //    ...
  //    - Iteration #i:
  //      - cudaEventSynchronize(copy_finished_(i % 2))
  //      - std::copy_n(from + i * K, K, values_cpu_pinned_ + (i % 2) * K)
  //      - cudaMemcpyAsync(values_gpu_ + (i % 2) * K,
  //        values_cpu_pinned_ + (i % 2) * K, K * sizeof(T), H->D,
  //        stream_(i % 2))
  //      - cudaEventRecord(copy_finished_(i % 2), stream_(i % 2))
  //      - callback(values_gpu_ + (i % 2) * K, K, i * K, stream_(i % 2))
  //    ...
  //    cudaStreamSynchronize(stream_0)
  //    cudaStreamSynchronize(stream_1)
  //
  // This sequence of calls results in the following activity on the cpu and
  // gpu (assuming that the kernel invoked by the callback takes less time
  // than the host-to-device copy and the copy within cpu memory, and that the
  // copy within cpu memory is faster than the host-to-device copy):
  //  +----------------------------+-------------------+-------------------+
  //  | CPU thread                 | Stream #0         | Stream #1         |
  //  +----------------------------+-------------------+-------------------+
  //  | Copy to pinned memory      |                   |                   |
  //  |                            |                   |                   |
  //  +----------------------------+-------------------|                   |
  //  | Copy to pinned memory      | Copy host->device |                   |
  //  |                            |                   |                   |
  //  +----------------------------+                   |                   |
  //  | Waiting previous h->d copy |                   |                   |
  //  +----------------------------+-------------------+-------------------+
  //  | Copy to pinned memory      | Kernel            | Copy host->device |
  //  |                            +-------------------+                   |
  //  +----------------------------+                   |                   |
  //  | Waiting previous h->d copy |                   |                   |
  //  +----------------------------+-------------------+-------------------+
  //  | Copy to pinned memory      | Copy host->device | Kernel            |
  //  |                            |                   +-------------------+
  //  |             ...                       ...                          |
  //  +----------------------------+---------------------------------------+
  //
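  // A minimal usage sketch (not part of the Ceres API; SquareKernel, y_gpu,
  // host_values and num_values are hypothetical names used only for
  // illustration). The callback receives a device pointer to the batch, the
  // batch size, the offset of the batch in the input array, and the stream in
  // which to launch work:
  //
  //   __global__ void SquareKernel(const double* x, double* y, int size) {
  //     const int i = blockIdx.x * blockDim.x + threadIdx.x;
  //     if (i < size) y[i] = x[i] * x[i];
  //   }
  //
  //   CudaStreamedBuffer<double> buffer(context, 2 * 1024 * 1024);
  //   buffer.CopyToGpu(
  //       host_values, num_values,
  //       [y_gpu](double* device_pointer, int size_of_batch,
  //               int offset_of_batch, cudaStream_t stream) {
  //         const int kThreads = 256;
  //         const int num_blocks = (size_of_batch + kThreads - 1) / kThreads;
  //         SquareKernel<<<num_blocks, kThreads, 0, stream>>>(
  //             device_pointer, y_gpu + offset_of_batch, size_of_batch);
  //       });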
  template <typename Fun>
  void CopyToGpu(const T* from, const int num_values, Fun&& callback) {
    // This synchronization is not required in some cases, but we perform it
    // in order to avoid a situation where the user callback depends on data
    // that is still being computed in the default stream.
    CHECK_EQ(cudaSuccess, cudaStreamSynchronize(context_->DefaultStream()));

    // If the pointer to the input data does not correspond to page-locked
    // memory, the host-to-device memory copy might be executed synchronously
    // (with a copy to pinned memory happening inside the driver). In that
    // case we stage the copy through a pre-allocated array of page-locked
    // memory.
    const bool copy_to_pinned_memory =
        MemoryTypeResultsInSynchronousCopy(from);
    T* batch_values_gpu[kNumBatches];
    T* batch_values_cpu[kNumBatches];
    auto streams = context_->streams_;
    for (int i = 0; i < kNumBatches; ++i) {
      batch_values_gpu[i] = values_gpu_.data() + kValuesPerBatch * i;
      batch_values_cpu[i] = values_cpu_pinned_ + kValuesPerBatch * i;
    }
    int batch_id = 0;
    for (int offset = 0; offset < num_values; offset += kValuesPerBatch) {
      const int num_values_batch =
          std::min(num_values - offset, kValuesPerBatch);
      const T* batch_from = from + offset;
      T* batch_to = batch_values_gpu[batch_id];
      auto stream = streams[batch_id];
      auto copy_finished = copy_finished_[batch_id];

      if (copy_to_pinned_memory) {
        // Copying values into the temporary buffer may only start after the
        // previous copy from the temporary buffer to the device has
        // completed.
        CHECK_EQ(cudaSuccess, cudaEventSynchronize(copy_finished));
        std::copy_n(batch_from, num_values_batch, batch_values_cpu[batch_id]);
        batch_from = batch_values_cpu[batch_id];
      }
      CHECK_EQ(cudaSuccess,
               cudaMemcpyAsync(batch_to,
                               batch_from,
                               sizeof(T) * num_values_batch,
                               cudaMemcpyHostToDevice,
                               stream));
      if (copy_to_pinned_memory) {
        // The next copy into the temporary buffer can start as soon as the
        // asynchronous device copy has completed (and might start before the
        // kernels launched asynchronously in the stream by the user-supplied
        // callback have completed).
        // No explicit synchronization is required when copying data from
        // page-locked memory, because the memory copy and the user kernels
        // touching the corresponding part of the values_gpu_ array are
        // serialized by the stream.
        CHECK_EQ(cudaSuccess, cudaEventRecord(copy_finished, stream));
      }
      callback(batch_to, num_values_batch, offset, stream);
      batch_id = (batch_id + 1) % kNumBatches;
    }
    // Explicitly synchronize on all CUDA streams that were utilized.
    for (int i = 0; i < kNumBatches; ++i) {
      CHECK_EQ(cudaSuccess, cudaStreamSynchronize(streams[i]));
    }
  }

 private:
  // All host-to-device copies must be completely asynchronous. This requires
  // the source memory to be allocated in page-locked memory.
  static bool MemoryTypeResultsInSynchronousCopy(const void* ptr) {
    cudaPointerAttributes attributes;
    auto status = cudaPointerGetAttributes(&attributes, ptr);
#if CUDART_VERSION < 11000
    // In CUDA versions prior to 11, calling cudaPointerGetAttributes with a
    // host pointer returns cudaErrorInvalidValue.
    if (status == cudaErrorInvalidValue) {
      return true;
    }
#endif
    CHECK_EQ(status, cudaSuccess);
    // This class only supports cpu memory as a source.
    CHECK_NE(attributes.type, cudaMemoryTypeDevice);
    // If the host memory was allocated (or registered) with the CUDA API, or
    // is managed memory, then the call to cudaMemcpyAsync will be
    // asynchronous. In the case of managed memory it might be slightly better
    // to perform a single invocation of the user-provided callback (and hope
    // that page migration will provide similar throughput with zero effort on
    // our side).
    return attributes.type == cudaMemoryTypeUnregistered;
  }
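
  // Hedged aside, not a requirement of this class: a caller that re-uses the
  // same large host array across many CopyToGpu() calls could, in principle,
  // pin that array up front with the standard CUDA call cudaHostRegister, so
  // that MemoryTypeResultsInSynchronousCopy() returns false and the staging
  // copy through values_cpu_pinned_ is skipped. A sketch (host_values and
  // num_values are hypothetical):
  //
  //   CHECK_EQ(cudaSuccess,
  //            cudaHostRegister(host_values,
  //                             sizeof(double) * num_values,
  //                             cudaHostRegisterDefault));
  //   // ... any number of CopyToGpu(host_values, ...) calls ...
  //   CHECK_EQ(cudaSuccess, cudaHostUnregister(host_values));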

  const int kValuesPerBatch;
  ContextImpl* context_ = nullptr;
  CudaBuffer<T> values_gpu_;
  T* values_cpu_pinned_ = nullptr;
  cudaEvent_t copy_finished_[kNumBatches] = {nullptr};
};

}  // namespace ceres::internal

#endif  // CERES_NO_CUDA
#endif  // CERES_INTERNAL_CUDA_STREAMED_BUFFER_H_