// Ceres Solver - A fast non-linear least squares minimizer
// Copyright 2023 Google Inc. All rights reserved.
// http://ceres-solver.org/
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are met:
//
// * Redistributions of source code must retain the above copyright notice,
//   this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright notice,
//   this list of conditions and the following disclaimer in the documentation
//   and/or other materials provided with the distribution.
// * Neither the name of Google Inc. nor the names of its contributors may be
//   used to endorse or promote products derived from this software without
//   specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
// POSSIBILITY OF SUCH DAMAGE.
//
// Authors: dmitriy.korchemkin@gmail.com (Dmitriy Korchemkin)

#ifndef CERES_INTERNAL_CUDA_STREAMED_BUFFER_H_
#define CERES_INTERNAL_CUDA_STREAMED_BUFFER_H_

#include "ceres/internal/config.h"

#ifndef CERES_NO_CUDA

#include <algorithm>

#include "ceres/cuda_buffer.h"

namespace ceres::internal {

// Most contemporary CUDA devices are capable of executing kernels while
// simultaneously performing host-to-device transfers. This class copies
// batches of data to GPU memory and overlaps the processing of already-copied
// data with the transfer of the next batch (asynchronously).
// Data is copied into a fixed-size buffer on the GPU (holding at most
// max_buffer_size values), and this memory is re-used once the previous
// batch of values has been processed by the user-provided callback.
// The host-to-device copy goes through a temporary pinned buffer if required.
// Each batch of values has a size of kValuesPerBatch, except possibly the
// last one.
template <typename T>
class CERES_NO_EXPORT CudaStreamedBuffer {
 public:
  // If the hardware supports only one host-to-device copy engine, or if a
  // single host-to-device copy is able to reach peak bandwidth, two streams
  // are sufficient to reach maximum efficiency:
  // - If transferring a batch of values takes more time than processing it on
  //   the gpu, then at every moment in time one of the streams will be
  //   transferring data and the other stream will be either processing data
  //   or idle; the whole process will be bounded by the host-to-device copy.
  // - If transferring a batch of values takes less time than processing it on
  //   the gpu, then at every moment in time one of the streams will be
  //   processing data and the other stream will be either performing
  //   computations or transferring data; the whole process will be bounded by
  //   the computations.
  static constexpr int kNumBatches = 2;
  // max_buffer_size is the maximal size (in elements of type T) of the array
  // to be pre-allocated in gpu memory. The size of this array determines the
  // size of the batches of values used for simultaneous copying and
  // processing. It should be large enough to allow highly-parallel execution
  // of user kernels; making it too large increases latency.
  CudaStreamedBuffer(ContextImpl* context, const int max_buffer_size)
      : kValuesPerBatch(max_buffer_size / kNumBatches),
        context_(context),
        values_gpu_(context, kValuesPerBatch * kNumBatches) {
    static_assert(ContextImpl::kNumCudaStreams >= kNumBatches);
    CHECK_GE(max_buffer_size, kNumBatches);
    // Pre-allocate a buffer of page-locked memory for transfers from regular
    // cpu memory. Because we will only be writing into that buffer from the
    // cpu, the memory is allocated with the cudaHostAllocWriteCombined flag.
    CHECK_EQ(cudaSuccess,
             cudaHostAlloc(&values_cpu_pinned_,
                           sizeof(T) * kValuesPerBatch * kNumBatches,
                           cudaHostAllocWriteCombined));
    for (auto& e : copy_finished_) {
      CHECK_EQ(cudaSuccess,
               cudaEventCreateWithFlags(&e, cudaEventDisableTiming));
    }
  }

  CudaStreamedBuffer(const CudaStreamedBuffer&) = delete;

  ~CudaStreamedBuffer() {
    CHECK_EQ(cudaSuccess, cudaFreeHost(values_cpu_pinned_));
    for (auto& e : copy_finished_) {
      CHECK_EQ(cudaSuccess, cudaEventDestroy(e));
    }
  }

  // Transfers num_values values from the host-memory pointer 'from', calling
  //   callback(device_pointer, size_of_batch, offset_of_batch, stream_to_use)
  // after scheduling the transfer of each batch of data. The user-provided
  // callback should process the data at device_pointer only in the
  // stream_to_use stream (device_pointer will be re-used in the next
  // callback invocation with the same stream).
  //
  // The two diagrams below describe the operation in the two possible
  // scenarios, depending on whether the input data is stored in page-locked
  // memory. In this example we have max_buffer_size = 2 * K,
  // num_values = N * K and a callback that schedules a single asynchronous
  // launch of
  //   Kernel<<<..., stream_to_use>>>(device_pointer,
  //                                  size_of_batch,
  //                                  offset_of_batch)
  //
  // a. Copying from page-locked memory
  // In this case no copy on the host side is necessary, and this method just
  // schedules a sequence of interleaved memory copies and callback
  // invocations:
  //
  //    cudaStreamSynchronize(context->DefaultStream());
  //    - Iteration #0:
  //      - cudaMemcpyAsync(values_gpu_, from, K * sizeof(T), H->D, stream_0)
  //      - callback(values_gpu_, K, 0, stream_0)
  //    - Iteration #1:
  //      - cudaMemcpyAsync(values_gpu_ + K, from + K, K * sizeof(T), H->D,
  //        stream_1)
  //      - callback(values_gpu_ + K, K, K, stream_1)
  //    - Iteration #2:
  //      - cudaMemcpyAsync(values_gpu_, from + 2 * K, K * sizeof(T), H->D,
  //        stream_0)
  //      - callback(values_gpu_, K, 2 * K, stream_0)
  //    - Iteration #3:
  //      - cudaMemcpyAsync(values_gpu_ + K, from + 3 * K, K * sizeof(T), H->D,
  //        stream_1)
  //      - callback(values_gpu_ + K, K, 3 * K, stream_1)
  //    ...
  //    - Iteration #i:
  //      - cudaMemcpyAsync(values_gpu_ + (i % 2) * K, from + i * K,
  //        K * sizeof(T), H->D, stream_(i % 2))
  //      - callback(values_gpu_ + (i % 2) * K, K, i * K, stream_(i % 2))
  //    ...
  //    cudaStreamSynchronize(stream_0)
  //    cudaStreamSynchronize(stream_1)
  //
  // This sequence of calls results in the following activity on the gpu
  // (assuming that the kernel invoked by the callback takes less time than
  // the host-to-device copy):
  //  +-------------------+-------------------+
  //  | Stream #0         | Stream #1         |
  //  +-------------------+-------------------+
  //  | Copy host->device |                   |
  //  |                   |                   |
  //  |                   |                   |
  //  +-------------------+-------------------+
  //  | Kernel            | Copy host->device |
  //  +-------------------+                   |
  //  |                   |                   |
  //  +-------------------+-------------------+
  //  | Copy host->device | Kernel            |
  //  |                   +-------------------+
  //  |                   |                   |
  //  +-------------------+-------------------+
  //  | Kernel            | Copy host->device |
  //  |                  ...                  |
  //  +---------------------------------------+
  //
  // b. Copying from regular memory
  // In this case a copy from regular memory into page-locked memory is
  // required in order to get asynchronous operation. Because the pinned
  // memory on the host side is re-used, additional synchronization is
  // required. On each iteration the method performs the following actions:
  // - Wait until the previous copy operation in the stream has completed
  // - Copy a batch of values from the input array into pinned memory
  // - Asynchronously launch the host-to-device copy
  // - Set up an event for synchronization on copy completion
  // - Invoke the callback (which launches a kernel asynchronously)
  //
  // Invocations are performed with the following arguments:
  //    cudaStreamSynchronize(context->DefaultStream());
  //    - Iteration #0:
  //      - cudaEventSynchronize(copy_finished_0)
  //      - std::copy_n(from, K, values_cpu_pinned_)
  //      - cudaMemcpyAsync(values_gpu_, values_cpu_pinned_, K * sizeof(T),
  //        H->D, stream_0)
  //      - cudaEventRecord(copy_finished_0, stream_0)
  //      - callback(values_gpu_, K, 0, stream_0)
  //    - Iteration #1:
  //      - cudaEventSynchronize(copy_finished_1)
  //      - std::copy_n(from + K, K, values_cpu_pinned_ + K)
  //      - cudaMemcpyAsync(values_gpu_ + K, values_cpu_pinned_ + K,
  //        K * sizeof(T), H->D, stream_1)
  //      - cudaEventRecord(copy_finished_1, stream_1)
  //      - callback(values_gpu_ + K, K, K, stream_1)
  //    - Iteration #2:
  //      - cudaEventSynchronize(copy_finished_0)
  //      - std::copy_n(from + 2 * K, K, values_cpu_pinned_)
  //      - cudaMemcpyAsync(values_gpu_, values_cpu_pinned_, K * sizeof(T),
  //        H->D, stream_0)
  //      - cudaEventRecord(copy_finished_0, stream_0)
  //      - callback(values_gpu_, K, 2 * K, stream_0)
  //    - Iteration #3:
  //      - cudaEventSynchronize(copy_finished_1)
  //      - std::copy_n(from + 3 * K, K, values_cpu_pinned_ + K)
  //      - cudaMemcpyAsync(values_gpu_ + K, values_cpu_pinned_ + K,
  //        K * sizeof(T), H->D, stream_1)
  //      - cudaEventRecord(copy_finished_1, stream_1)
  //      - callback(values_gpu_ + K, K, 3 * K, stream_1)
  //    ...
  //    - Iteration #i:
  //      - cudaEventSynchronize(copy_finished_(i % 2))
  //      - std::copy_n(from + i * K, K, values_cpu_pinned_ + (i % 2) * K)
  //      - cudaMemcpyAsync(values_gpu_ + (i % 2) * K,
  //        values_cpu_pinned_ + (i % 2) * K, K * sizeof(T), H->D,
  //        stream_(i % 2))
  //      - cudaEventRecord(copy_finished_(i % 2), stream_(i % 2))
  //      - callback(values_gpu_ + (i % 2) * K, K, i * K, stream_(i % 2))
  //    ...
  //    cudaStreamSynchronize(stream_0)
  //    cudaStreamSynchronize(stream_1)
  //
  // This sequence of calls results in the following activity on the cpu and
  // gpu (assuming that the kernel invoked by the callback takes less time
  // than the host-to-device copy and the copy within cpu memory, and that the
  // copy within cpu memory is faster than the host-to-device copy):
  //  +----------------------------+-------------------+-------------------+
  //  | CPU thread                 | Stream #0         | Stream #1         |
  //  +----------------------------+-------------------+-------------------+
  //  | Copy to pinned memory      |                   |                   |
  //  |                            |                   |                   |
  //  +----------------------------+-------------------|                   |
  //  | Copy to pinned memory      | Copy host->device |                   |
  //  |                            |                   |                   |
  //  +----------------------------+                   |                   |
  //  | Waiting previous h->d copy |                   |                   |
  //  +----------------------------+-------------------+-------------------+
  //  | Copy to pinned memory      | Kernel            | Copy host->device |
  //  |                            +-------------------+                   |
  //  +----------------------------+                   |                   |
  //  | Waiting previous h->d copy |                   |                   |
  //  +----------------------------+-------------------+-------------------+
  //  | Copy to pinned memory      | Copy host->device | Kernel            |
  //  |                            |                   +-------------------+
  //  |             ...                       ...                          |
  //  +----------------------------+---------------------------------------+
  //
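  // A minimal usage sketch (not part of the Ceres API; SquareKernel, y_gpu,
  // host_values and num_values are hypothetical names used only for
  // illustration). The callback receives a device pointer to the batch, the
  // batch size, the offset of the batch in the input array, and the stream in
  // which to launch work:
  //
  //   __global__ void SquareKernel(const double* x, double* y, int size) {
  //     const int i = blockIdx.x * blockDim.x + threadIdx.x;
  //     if (i < size) y[i] = x[i] * x[i];
  //   }
  //
  //   CudaStreamedBuffer<double> buffer(context, 2 * 1024 * 1024);
  //   buffer.CopyToGpu(
  //       host_values, num_values,
  //       [y_gpu](double* device_pointer, int size_of_batch,
  //               int offset_of_batch, cudaStream_t stream) {
  //         const int kThreads = 256;
  //         const int num_blocks = (size_of_batch + kThreads - 1) / kThreads;
  //         SquareKernel<<<num_blocks, kThreads, 0, stream>>>(
  //             device_pointer, y_gpu + offset_of_batch, size_of_batch);
  //       });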
  template <typename Fun>
  void CopyToGpu(const T* from, const int num_values, Fun&& callback) {
    // This synchronization is not required in some cases, but we perform it
    // in order to avoid a situation where the user callback depends on data
    // that is still being computed in the default stream.
    CHECK_EQ(cudaSuccess, cudaStreamSynchronize(context_->DefaultStream()));

    // If the pointer to the input data does not correspond to page-locked
    // memory, the host-to-device memory copy might be executed synchronously
    // (with a copy to pinned memory happening inside the driver). In that
    // case we stage the copy through a pre-allocated array of page-locked
    // memory.
    const bool copy_to_pinned_memory =
        MemoryTypeResultsInSynchronousCopy(from);
    T* batch_values_gpu[kNumBatches];
    T* batch_values_cpu[kNumBatches];
    auto streams = context_->streams_;
    for (int i = 0; i < kNumBatches; ++i) {
      batch_values_gpu[i] = values_gpu_.data() + kValuesPerBatch * i;
      batch_values_cpu[i] = values_cpu_pinned_ + kValuesPerBatch * i;
    }
    int batch_id = 0;
    for (int offset = 0; offset < num_values; offset += kValuesPerBatch) {
      const int num_values_batch =
          std::min(num_values - offset, kValuesPerBatch);
      const T* batch_from = from + offset;
      T* batch_to = batch_values_gpu[batch_id];
      auto stream = streams[batch_id];
      auto copy_finished = copy_finished_[batch_id];

      if (copy_to_pinned_memory) {
        // Copying values into the temporary buffer may only start after the
        // previous copy from the temporary buffer to the device has
        // completed.
        CHECK_EQ(cudaSuccess, cudaEventSynchronize(copy_finished));
        std::copy_n(batch_from, num_values_batch, batch_values_cpu[batch_id]);
        batch_from = batch_values_cpu[batch_id];
      }
      CHECK_EQ(cudaSuccess,
               cudaMemcpyAsync(batch_to,
                               batch_from,
                               sizeof(T) * num_values_batch,
                               cudaMemcpyHostToDevice,
                               stream));
      if (copy_to_pinned_memory) {
        // The next copy into the temporary buffer can start as soon as the
        // asynchronous device copy has completed (and might start before the
        // kernels launched asynchronously in the stream by the user-supplied
        // callback have completed).
        // No explicit synchronization is required when copying data from
        // page-locked memory, because the memory copy and the user kernels
        // touching the corresponding part of the values_gpu_ array are
        // serialized by the stream.
        CHECK_EQ(cudaSuccess, cudaEventRecord(copy_finished, stream));
      }
      callback(batch_to, num_values_batch, offset, stream);
      batch_id = (batch_id + 1) % kNumBatches;
    }
    // Explicitly synchronize on all CUDA streams that were utilized.
    for (int i = 0; i < kNumBatches; ++i) {
      CHECK_EQ(cudaSuccess, cudaStreamSynchronize(streams[i]));
    }
  }

 private:
  // All host-to-device copies must be completely asynchronous. This requires
  // the source memory to be allocated in page-locked memory.
  static bool MemoryTypeResultsInSynchronousCopy(const void* ptr) {
    cudaPointerAttributes attributes;
    auto status = cudaPointerGetAttributes(&attributes, ptr);
#if CUDART_VERSION < 11000
    // In CUDA versions prior to 11, calling cudaPointerGetAttributes with a
    // host pointer returns cudaErrorInvalidValue.
    if (status == cudaErrorInvalidValue) {
      return true;
    }
#endif
    CHECK_EQ(status, cudaSuccess);
    // This class only supports cpu memory as a source.
    CHECK_NE(attributes.type, cudaMemoryTypeDevice);
    // If the host memory was allocated (or registered) with the CUDA API, or
    // is managed memory, then the call to cudaMemcpyAsync will be
    // asynchronous. In the case of managed memory it might be slightly better
    // to perform a single invocation of the user-provided callback (and hope
    // that page migration will provide similar throughput with zero effort on
    // our side).
    return attributes.type == cudaMemoryTypeUnregistered;
  }
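
  // Hedged aside, not a requirement of this class: a caller that re-uses the
  // same large host array across many CopyToGpu() calls could, in principle,
  // pin that array up front with the standard CUDA call cudaHostRegister, so
  // that MemoryTypeResultsInSynchronousCopy() returns false and the staging
  // copy through values_cpu_pinned_ is skipped. A sketch (host_values and
  // num_values are hypothetical):
  //
  //   CHECK_EQ(cudaSuccess,
  //            cudaHostRegister(host_values,
  //                             sizeof(double) * num_values,
  //                             cudaHostRegisterDefault));
  //   // ... any number of CopyToGpu(host_values, ...) calls ...
  //   CHECK_EQ(cudaSuccess, cudaHostUnregister(host_values));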

  const int kValuesPerBatch;
  ContextImpl* context_ = nullptr;
  CudaBuffer<T> values_gpu_;
  T* values_cpu_pinned_ = nullptr;
  cudaEvent_t copy_finished_[kNumBatches] = {nullptr};
};

}  // namespace ceres::internal

#endif  // CERES_NO_CUDA
#endif  // CERES_INTERNAL_CUDA_STREAMED_BUFFER_H_