// Ceres Solver - A fast non-linear least squares minimizer
// Copyright 2023 Google Inc. All rights reserved.
// http://ceres-solver.org/
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are met:
//
// * Redistributions of source code must retain the above copyright notice,
//   this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright notice,
//   this list of conditions and the following disclaimer in the documentation
//   and/or other materials provided with the distribution.
// * Neither the name of Google Inc. nor the names of its contributors may be
//   used to endorse or promote products derived from this software without
//   specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
// POSSIBILITY OF SUCH DAMAGE.
//
// Authors: dmitriy.korchemkin@gmail.com (Dmitriy Korchemkin)

#ifndef CERES_INTERNAL_CUDA_STREAMED_BUFFER_H_
#define CERES_INTERNAL_CUDA_STREAMED_BUFFER_H_

#include "ceres/internal/config.h"

#ifndef CERES_NO_CUDA

#include <algorithm>

#include "ceres/cuda_buffer.h"

namespace ceres::internal {

// Most contemporary CUDA devices are capable of simultaneous code execution
// and host-to-device transfer. This class copies batches of data to GPU memory
// and processes the copied data in parallel (asynchronously).
// Data is copied to a fixed-size buffer on the GPU (containing at most
// max_buffer_size values), and this memory is re-used once the previous
// batch of values has been processed by the user-provided callback.
// The host-to-device copy uses a temporary buffer if required. Each batch of
// values has a size of kValuesPerBatch, except possibly the last one.
template <typename T>
class CERES_NO_EXPORT CudaStreamedBuffer {
 public:
  // If the hardware supports only one concurrent host-to-device copy, or a
  // single host-to-device copy is able to reach peak bandwidth, two streams
  // are sufficient to reach maximum efficiency:
  // - If transferring a batch of values takes more time than processing it on
  //   the GPU, then at every moment in time one of the streams will be
  //   transferring data and the other stream will be either processing data
  //   or idle; the whole process will be bounded by the host-to-device copy.
  // - If transferring a batch of values takes less time than processing it on
  //   the GPU, then at every moment in time one of the streams will be
  //   processing data and the other stream will be either performing
  //   computations or transferring data; the whole process will be bounded by
  //   computations.
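  //
  // An illustrative example (hypothetical timings, not measurements): with a
  // 1.0 ms host-to-device copy and a 0.4 ms kernel per batch, the copy engine
  // never goes idle and N batches finish in roughly N * 1.0 ms; with a 0.4 ms
  // copy and a 1.0 ms kernel, the compute side stays busy and the total is
  // roughly N * 1.0 ms of kernel time. Under these assumptions, using more
  // than two streams would not improve either bound.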
  static constexpr int kNumBatches = 2;
  // max_buffer_size is the maximal size (in elements of type T) of the array
  // to be pre-allocated in GPU memory. The size of the array determines the
  // size of the batch of values for simultaneous copying and processing. It
  // should be large enough to allow highly parallel execution of user kernels;
  // making it too large increases latency.
  CudaStreamedBuffer(ContextImpl* context, const int max_buffer_size)
      : kValuesPerBatch(max_buffer_size / kNumBatches),
        context_(context),
        values_gpu_(context, kValuesPerBatch * kNumBatches) {
    static_assert(ContextImpl::kNumCudaStreams >= kNumBatches);
    CHECK_GE(max_buffer_size, kNumBatches);
    // Pre-allocate a buffer of page-locked memory for transfers from regular
    // cpu memory. Because we will only be writing into that buffer from the
    // cpu, the memory is allocated with the cudaHostAllocWriteCombined flag.
    CHECK_EQ(cudaSuccess,
             cudaHostAlloc(&values_cpu_pinned_,
                           sizeof(T) * kValuesPerBatch * kNumBatches,
                           cudaHostAllocWriteCombined));
    for (auto& e : copy_finished_) {
      CHECK_EQ(cudaSuccess,
               cudaEventCreateWithFlags(&e, cudaEventDisableTiming));
    }
  }

  CudaStreamedBuffer(const CudaStreamedBuffer&) = delete;

  ~CudaStreamedBuffer() {
    CHECK_EQ(cudaSuccess, cudaFreeHost(values_cpu_pinned_));
    for (auto& e : copy_finished_) {
      CHECK_EQ(cudaSuccess, cudaEventDestroy(e));
    }
  }

  // Transfer num_values values from the host memory pointed to by from,
  // calling
  //   callback(device_pointer, size_of_batch, offset_of_batch, stream_to_use)
  // after scheduling the transfer of each batch of data. The user-provided
  // callback should process the data at device_pointer only in the
  // stream_to_use stream (device_pointer will be re-used in the next callback
  // invocation with the same stream). A hypothetical usage sketch is given at
  // the end of this comment.
  //
  // The two diagrams below describe the operation in two possible scenarios,
  // depending on whether the input data is stored in page-locked memory. In
  // this example we will have max_buffer_size = 2 * K, num_values = N * K and
  // a callback that schedules a single asynchronous launch of
  //   Kernel<<<..., stream_to_use>>>(device_pointer,
  //                                  size_of_batch,
  //                                  offset_of_batch)
  //
  // a. Copying from page-locked memory
  // In this case no copy on the host side is necessary, and this method just
  // schedules a sequence of interleaved memory copies and callback
  // invocations:
  //
  //  cudaStreamSynchronize(context->DefaultStream());
  //  - Iteration #0:
  //    - cudaMemcpyAsync(values_gpu_, from, K * sizeof(T), H->D, stream_0)
  //    - callback(values_gpu_, K, 0, stream_0)
  //  - Iteration #1:
  //    - cudaMemcpyAsync(values_gpu_ + K, from + K, K * sizeof(T), H->D,
  //      stream_1)
  //    - callback(values_gpu_ + K, K, K, stream_1)
  //  - Iteration #2:
  //    - cudaMemcpyAsync(values_gpu_, from + 2 * K, K * sizeof(T), H->D,
  //      stream_0)
  //    - callback(values_gpu_, K, 2 * K, stream_0)
  //  - Iteration #3:
  //    - cudaMemcpyAsync(values_gpu_ + K, from + 3 * K, K * sizeof(T), H->D,
  //      stream_1)
  //    - callback(values_gpu_ + K, K, 3 * K, stream_1)
  //  ...
  //  - Iteration #i:
  //    - cudaMemcpyAsync(values_gpu_ + (i % 2) * K, from + i * K,
  //      K * sizeof(T), H->D, stream_(i % 2))
  //    - callback(values_gpu_ + (i % 2) * K, K, i * K, stream_(i % 2))
  //  ...
  //  cudaStreamSynchronize(stream_0)
  //  cudaStreamSynchronize(stream_1)
  //
  // This sequence of calls results in the following activity on the GPU
  // (assuming that the kernel invoked by the callback takes less time than
  // the host-to-device copy):
  //  +-------------------+-------------------+
  //  | Stream #0         | Stream #1         |
  //  +-------------------+-------------------+
  //  | Copy host->device |                   |
  //  |                   |                   |
  //  |                   |                   |
  //  +-------------------+-------------------+
  //  | Kernel            | Copy host->device |
  //  +-------------------+                   |
  //  |                   |                   |
  //  +-------------------+-------------------+
  //  | Copy host->device | Kernel            |
  //  |                   +-------------------+
  //  |                   |                   |
  //  +-------------------+-------------------+
  //  | Kernel            | Copy host->device |
  //  |                  ...                  |
  //  +---------------------------------------+
  //
  // b. Copying from regular memory
  // In this case a copy from regular memory to page-locked memory is required
  // in order to get asynchronous operation. Because the pinned memory on the
  // host side is reused, additional synchronization is required. On each
  // iteration the method performs the following actions:
  // - Wait until the previous copy operation in the stream has completed
  // - Copy a batch of values from the input array into pinned memory
  // - Asynchronously launch the host-to-device copy
  // - Record an event for synchronization on copy completion
  // - Invoke the callback (which launches a kernel asynchronously)
  //
  // Invocations are performed with the following arguments:
  //  cudaStreamSynchronize(context->DefaultStream());
  //  - Iteration #0:
  //    - cudaEventSynchronize(copy_finished_0)
  //    - std::copy_n(from, K, values_cpu_pinned_)
  //    - cudaMemcpyAsync(values_gpu_, values_cpu_pinned_, K * sizeof(T), H->D,
  //      stream_0)
  //    - cudaEventRecord(copy_finished_0, stream_0)
  //    - callback(values_gpu_, K, 0, stream_0)
  //  - Iteration #1:
  //    - cudaEventSynchronize(copy_finished_1)
  //    - std::copy_n(from + K, K, values_cpu_pinned_ + K)
  //    - cudaMemcpyAsync(values_gpu_ + K, values_cpu_pinned_ + K,
  //      K * sizeof(T), H->D, stream_1)
  //    - cudaEventRecord(copy_finished_1, stream_1)
  //    - callback(values_gpu_ + K, K, K, stream_1)
  //  - Iteration #2:
  //    - cudaEventSynchronize(copy_finished_0)
  //    - std::copy_n(from + 2 * K, K, values_cpu_pinned_)
  //    - cudaMemcpyAsync(values_gpu_, values_cpu_pinned_, K * sizeof(T), H->D,
  //      stream_0)
  //    - cudaEventRecord(copy_finished_0, stream_0)
  //    - callback(values_gpu_, K, 2 * K, stream_0)
  //  - Iteration #3:
  //    - cudaEventSynchronize(copy_finished_1)
  //    - std::copy_n(from + 3 * K, K, values_cpu_pinned_ + K)
  //    - cudaMemcpyAsync(values_gpu_ + K, values_cpu_pinned_ + K,
  //      K * sizeof(T), H->D, stream_1)
  //    - cudaEventRecord(copy_finished_1, stream_1)
  //    - callback(values_gpu_ + K, K, 3 * K, stream_1)
  //  ...
  //  - Iteration #i:
  //    - cudaEventSynchronize(copy_finished_(i % 2))
  //    - std::copy_n(from + i * K, K, values_cpu_pinned_ + (i % 2) * K)
  //    - cudaMemcpyAsync(values_gpu_ + (i % 2) * K,
  //      values_cpu_pinned_ + (i % 2) * K, K * sizeof(T), H->D,
  //      stream_(i % 2))
  //    - cudaEventRecord(copy_finished_(i % 2), stream_(i % 2))
  //    - callback(values_gpu_ + (i % 2) * K, K, i * K, stream_(i % 2))
  //  ...
  //  cudaStreamSynchronize(stream_0)
  //  cudaStreamSynchronize(stream_1)
  //
  // This sequence of calls results in the following activity on the CPU and
  // GPU (assuming that the kernel invoked by the callback takes less time
  // than the host-to-device copy and the copy in CPU memory, and that the
  // copy in CPU memory is faster than the host-to-device copy):
  //  +----------------------------+-------------------+-------------------+
  //  | CPU thread                 | Stream #0         | Stream #1         |
  //  +----------------------------+-------------------+-------------------+
  //  | Copy to pinned memory      |                   |                   |
  //  |                            |                   |                   |
  //  +----------------------------+-------------------|                   |
  //  | Copy to pinned memory      | Copy host->device |                   |
  //  |                            |                   |                   |
  //  +----------------------------+                   |                   |
  //  | Waiting previous h->d copy |                   |                   |
  //  +----------------------------+-------------------+-------------------+
  //  | Copy to pinned memory      | Kernel            | Copy host->device |
  //  |                            +-------------------+                   |
  //  +----------------------------+                   |                   |
  //  | Waiting previous h->d copy |                   |                   |
  //  +----------------------------+-------------------+-------------------+
  //  | Copy to pinned memory      | Copy host->device | Kernel            |
  //  |                            |                   +-------------------+
  //  |            ...             |                  ...                  |
  //  +----------------------------+---------------------------------------+
  //
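  // The following is a hypothetical usage sketch (not part of this header;
  // ProcessValues, values_host, num_values and the launch configuration are
  // made-up example names) showing how a caller might drive CopyToGpu with a
  // callback that launches a kernel in the provided stream:
  //
  //   __global__ void ProcessValues(const double* values,
  //                                 int size_of_batch,
  //                                 int offset_of_batch);
  //   ...
  //   CudaStreamedBuffer<double> buffer(context, max_buffer_size);
  //   buffer.CopyToGpu(values_host,
  //                    num_values,
  //                    [](const double* device_pointer,
  //                       int size_of_batch,
  //                       int offset_of_batch,
  //                       cudaStream_t stream_to_use) {
  //                      const int kThreads = 256;
  //                      const int blocks =
  //                          (size_of_batch + kThreads - 1) / kThreads;
  //                      ProcessValues<<<blocks, kThreads, 0, stream_to_use>>>(
  //                          device_pointer, size_of_batch, offset_of_batch);
  //                    });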
  template <typename Fun>
  void CopyToGpu(const T* from, const int num_values, Fun&& callback) {
    // This synchronization is not required in some cases, but we perform it
    // in order to avoid the situation where the user callback depends on data
    // that is still to be computed in the default stream.
    CHECK_EQ(cudaSuccess, cudaStreamSynchronize(context_->DefaultStream()));

    // If the pointer to the input data does not correspond to page-locked
    // memory, the host-to-device memory copy might be executed synchronously
    // (with a copy to pinned memory happening inside the driver). In that
    // case we perform a copy to a pre-allocated array of page-locked memory.
    const bool copy_to_pinned_memory =
        MemoryTypeResultsInSynchronousCopy(from);
    T* batch_values_gpu[kNumBatches];
    T* batch_values_cpu[kNumBatches];
    auto streams = context_->streams_;
    for (int i = 0; i < kNumBatches; ++i) {
      batch_values_gpu[i] = values_gpu_.data() + kValuesPerBatch * i;
      batch_values_cpu[i] = values_cpu_pinned_ + kValuesPerBatch * i;
    }
    int batch_id = 0;
    for (int offset = 0; offset < num_values; offset += kValuesPerBatch) {
      const int num_values_batch =
          std::min(num_values - offset, kValuesPerBatch);
      const T* batch_from = from + offset;
      T* batch_to = batch_values_gpu[batch_id];
      auto stream = streams[batch_id];
      auto copy_finished = copy_finished_[batch_id];

      if (copy_to_pinned_memory) {
        // Copying values to the temporary buffer should be started only after
        // the previous copy from the temporary buffer to the device has
        // completed.
        CHECK_EQ(cudaSuccess, cudaEventSynchronize(copy_finished));
        std::copy_n(batch_from, num_values_batch, batch_values_cpu[batch_id]);
        batch_from = batch_values_cpu[batch_id];
      }
      CHECK_EQ(cudaSuccess,
               cudaMemcpyAsync(batch_to,
                               batch_from,
                               sizeof(T) * num_values_batch,
                               cudaMemcpyHostToDevice,
                               stream));
      if (copy_to_pinned_memory) {
        // The next copy to the temporary buffer can start straight after the
        // asynchronous copy has completed (and might be started before the
        // kernels asynchronously executed in the stream by the user-supplied
        // callback have completed).
        // No explicit synchronization is required when copying data from
        // page-locked memory, because the memory copy and the user kernel
        // execution with the corresponding part of the values_gpu_ array are
        // serialized by the stream.
        CHECK_EQ(cudaSuccess, cudaEventRecord(copy_finished, stream));
      }
      callback(batch_to, num_values_batch, offset, stream);
      batch_id = (batch_id + 1) % kNumBatches;
    }
    // Explicitly synchronize on all CUDA streams that were utilized.
    for (int i = 0; i < kNumBatches; ++i) {
      CHECK_EQ(cudaSuccess, cudaStreamSynchronize(streams[i]));
    }
  }

 private:
  // All host-to-device copies need to be completely asynchronous. This
  // requires the source memory to be allocated in page-locked memory (a
  // sketch of how a caller could arrange this follows this function).
  static bool MemoryTypeResultsInSynchronousCopy(const void* ptr) {
    cudaPointerAttributes attributes;
    auto status = cudaPointerGetAttributes(&attributes, ptr);
#if CUDART_VERSION < 11000
    // In CUDA versions prior to 11, a call to cudaPointerGetAttributes with a
    // host pointer returns cudaErrorInvalidValue.
    if (status == cudaErrorInvalidValue) {
      return true;
    }
#endif
    CHECK_EQ(status, cudaSuccess);
    // This class only supports cpu memory as a source.
    CHECK_NE(attributes.type, cudaMemoryTypeDevice);
    // If the host memory was allocated (or registered) with the CUDA API, or
    // is managed memory, then the call to cudaMemcpyAsync will be
    // asynchronous. In the case of managed memory it might be slightly better
    // to perform a single call of the user-provided callback (and hope that
    // page migration will provide similar throughput with zero effort on our
    // side).
    return attributes.type == cudaMemoryTypeUnregistered;
  }
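  // Illustration only (this class never page-locks the caller's memory
  // itself): a caller that repeatedly streams the same host array could
  // page-lock it up front with the standard CUDA runtime API, so that
  // MemoryTypeResultsInSynchronousCopy() returns false and the staging copy
  // into values_cpu_pinned_ is skipped. values_host and num_values are
  // hypothetical names:
  //
  //   CHECK_EQ(cudaSuccess,
  //            cudaHostRegister(values_host,
  //                             sizeof(double) * num_values,
  //                             cudaHostRegisterDefault));
  //   ... repeated CopyToGpu(values_host, num_values, callback) calls ...
  //   CHECK_EQ(cudaSuccess, cudaHostUnregister(values_host));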

  const int kValuesPerBatch;
  ContextImpl* context_ = nullptr;
  CudaBuffer<T> values_gpu_;
  T* values_cpu_pinned_ = nullptr;
  cudaEvent_t copy_finished_[kNumBatches] = {nullptr};
};

}  // namespace ceres::internal

#endif  // CERES_NO_CUDA
#endif  // CERES_INTERNAL_CUDA_STREAMED_BUFFER_H_