blob: 951efdcd35c8617eb4eb57313950121bbe2da0cb [file] [log] [blame]
Austin Schuha36c8902019-12-30 18:07:15 -08001#ifndef AOS_EVENTS_LOGGING_LOGFILE_UTILS_H_
2#define AOS_EVENTS_LOGGING_LOGFILE_UTILS_H_
3
4#include <sys/uio.h>
5
Austin Schuh97789fc2020-08-01 14:42:45 -07006#include <chrono>
Austin Schuh05b70472020-01-01 17:11:17 -08007#include <deque>
Austin Schuh97789fc2020-08-01 14:42:45 -07008#include <limits>
9#include <memory>
Austin Schuh05b70472020-01-01 17:11:17 -080010#include <optional>
Austin Schuhfa895892020-01-07 20:07:41 -080011#include <string>
Austin Schuha36c8902019-12-30 18:07:15 -080012#include <string_view>
Brian Silverman98360e22020-04-28 16:51:20 -070013#include <tuple>
Austin Schuh97789fc2020-08-01 14:42:45 -070014#include <utility>
Austin Schuha36c8902019-12-30 18:07:15 -080015#include <vector>
16
Austin Schuh4b5c22a2020-11-30 22:58:43 -080017#include "absl/container/btree_set.h"
Austin Schuh05b70472020-01-01 17:11:17 -080018#include "absl/types/span.h"
Brian Silvermanf51499a2020-09-21 12:49:08 -070019#include "aos/containers/resizeable_buffer.h"
Austin Schuha36c8902019-12-30 18:07:15 -080020#include "aos/events/event_loop.h"
Austin Schuh2dc8c7d2021-07-01 17:41:28 -070021#include "aos/events/logging/boot_timestamp.h"
Brian Silvermanf51499a2020-09-21 12:49:08 -070022#include "aos/events/logging/buffer_encoder.h"
Austin Schuhc41603c2020-10-11 16:17:37 -070023#include "aos/events/logging/logfile_sorting.h"
Austin Schuha36c8902019-12-30 18:07:15 -080024#include "aos/events/logging/logger_generated.h"
Brian Silvermanf51499a2020-09-21 12:49:08 -070025#include "aos/flatbuffers.h"
Austin Schuha36c8902019-12-30 18:07:15 -080026#include "flatbuffers/flatbuffers.h"
27
Brian Silvermanf51499a2020-09-21 12:49:08 -070028namespace aos::logger {
Austin Schuha36c8902019-12-30 18:07:15 -080029
30enum class LogType : uint8_t {
31 // The message originated on this node and should be logged here.
32 kLogMessage,
33 // The message originated on another node, but only the delivery times are
34 // logged here.
35 kLogDeliveryTimeOnly,
36 // The message originated on another node. Log it and the delivery times
37 // together. The message_gateway is responsible for logging any messages
38 // which didn't get delivered.
Austin Schuh6f3babe2020-01-26 20:34:50 -080039 kLogMessageAndDeliveryTime,
40 // The message originated on the other node and should be logged on this node.
41 kLogRemoteMessage
Austin Schuha36c8902019-12-30 18:07:15 -080042};
43
Austin Schuha36c8902019-12-30 18:07:15 -080044// This class manages efficiently writing a sequence of detached buffers to a
Brian Silvermanf51499a2020-09-21 12:49:08 -070045// file. It encodes them, queues them up, and batches the write operation.
Austin Schuha36c8902019-12-30 18:07:15 -080046class DetachedBufferWriter {
47 public:
Brian Silvermana9f2ec92020-10-06 18:00:53 -070048 // Marker struct for one of our constructor overloads.
49 struct already_out_of_space_t {};
50
Brian Silvermanf51499a2020-09-21 12:49:08 -070051 DetachedBufferWriter(std::string_view filename,
52 std::unique_ptr<DetachedBufferEncoder> encoder);
Brian Silvermana9f2ec92020-10-06 18:00:53 -070053 // Creates a dummy instance which won't even open a file. It will act as if
54 // opening the file ran out of space immediately.
55 DetachedBufferWriter(already_out_of_space_t) : ran_out_of_space_(true) {}
Austin Schuh2f8fd752020-09-01 22:38:28 -070056 DetachedBufferWriter(DetachedBufferWriter &&other);
57 DetachedBufferWriter(const DetachedBufferWriter &) = delete;
58
Austin Schuha36c8902019-12-30 18:07:15 -080059 ~DetachedBufferWriter();
60
Austin Schuh2f8fd752020-09-01 22:38:28 -070061 DetachedBufferWriter &operator=(DetachedBufferWriter &&other);
Brian Silverman98360e22020-04-28 16:51:20 -070062 DetachedBufferWriter &operator=(const DetachedBufferWriter &) = delete;
63
Austin Schuh6f3babe2020-01-26 20:34:50 -080064 std::string_view filename() const { return filename_; }
65
Brian Silvermana9f2ec92020-10-06 18:00:53 -070066 // This will be true until Close() is called, unless the file couldn't be
67 // created due to running out of space.
68 bool is_open() const { return fd_ != -1; }
69
Brian Silvermanf51499a2020-09-21 12:49:08 -070070 // Queues up a finished FlatBufferBuilder to be encoded and written.
71 //
72 // Triggers a flush if there's enough data queued up.
73 //
74 // Steals the detached buffer from it.
Austin Schuhbd06ae42021-03-31 22:48:21 -070075 void QueueSizedFlatbuffer(flatbuffers::FlatBufferBuilder *fbb,
76 aos::monotonic_clock::time_point now) {
77 QueueSizedFlatbuffer(fbb->Release(), now);
Brian Silvermanf51499a2020-09-21 12:49:08 -070078 }
79 // May steal the backing storage of buffer, or may leave it alone.
Austin Schuhbd06ae42021-03-31 22:48:21 -070080 void QueueSizedFlatbuffer(flatbuffers::DetachedBuffer &&buffer,
81 aos::monotonic_clock::time_point now) {
82 QueueSizedFlatbuffer(std::move(buffer));
83 FlushAtThreshold(now);
84 }
85 // Unconditionally queues the buffer.
Brian Silvermanf51499a2020-09-21 12:49:08 -070086 void QueueSizedFlatbuffer(flatbuffers::DetachedBuffer &&buffer) {
Brian Silvermana9f2ec92020-10-06 18:00:53 -070087 if (ran_out_of_space_) {
88 return;
89 }
Brian Silvermanf51499a2020-09-21 12:49:08 -070090 encoder_->Encode(std::move(buffer));
Brian Silvermanf51499a2020-09-21 12:49:08 -070091 }
Austin Schuha36c8902019-12-30 18:07:15 -080092
Brian Silvermanf51499a2020-09-21 12:49:08 -070093 // Queues up data in span. May copy or may write it to disk immediately.
94 void QueueSpan(absl::Span<const uint8_t> span);
Austin Schuha36c8902019-12-30 18:07:15 -080095
Brian Silverman0465fcf2020-09-24 00:29:18 -070096 // Indicates we got ENOSPC when trying to write. After this returns true, no
97 // further data is written.
98 bool ran_out_of_space() const { return ran_out_of_space_; }
99
100 // To avoid silently failing to write logfiles, you must call this before
101 // destruction if ran_out_of_space() is true and the situation has been
102 // handled.
103 void acknowledge_out_of_space() {
104 CHECK(ran_out_of_space_);
105 acknowledge_ran_out_of_space_ = true;
106 }
107
108 // Fully flushes and closes the underlying file now. No additional data may be
109 // enqueued after calling this.
110 //
111 // This will be performed in the destructor automatically.
112 //
113 // Note that this may set ran_out_of_space().
114 void Close();
115
Brian Silvermanf51499a2020-09-21 12:49:08 -0700116 // Returns the total number of bytes written and currently queued.
Austin Schuha426f1f2021-03-31 22:27:41 -0700117 size_t total_bytes() const {
118 if (!encoder_) {
119 return 0;
120 }
121 return encoder_->total_bytes();
122 }
Austin Schuha36c8902019-12-30 18:07:15 -0800123
Brian Silvermanf51499a2020-09-21 12:49:08 -0700124 // The maximum time for a single write call, or 0 if none have been performed.
125 std::chrono::nanoseconds max_write_time() const { return max_write_time_; }
126 // The number of bytes in the longest write call, or -1 if none have been
127 // performed.
128 int max_write_time_bytes() const { return max_write_time_bytes_; }
129 // The number of buffers in the longest write call, or -1 if none have been
130 // performed.
131 int max_write_time_messages() const { return max_write_time_messages_; }
132 // The total time spent in write calls.
133 std::chrono::nanoseconds total_write_time() const {
134 return total_write_time_;
135 }
136 // The total number of writes which have been performed.
137 int total_write_count() const { return total_write_count_; }
138 // The total number of messages which have been written.
139 int total_write_messages() const { return total_write_messages_; }
140 // The total number of bytes which have been written.
141 int total_write_bytes() const { return total_write_bytes_; }
142 void ResetStatistics() {
143 max_write_time_ = std::chrono::nanoseconds::zero();
144 max_write_time_bytes_ = -1;
145 max_write_time_messages_ = -1;
146 total_write_time_ = std::chrono::nanoseconds::zero();
147 total_write_count_ = 0;
148 total_write_messages_ = 0;
149 total_write_bytes_ = 0;
150 }
Brian Silverman98360e22020-04-28 16:51:20 -0700151
Austin Schuha36c8902019-12-30 18:07:15 -0800152 private:
Brian Silvermanf51499a2020-09-21 12:49:08 -0700153 // Performs a single writev call with as much of the data we have queued up as
154 // possible.
155 //
156 // This will normally take all of the data we have queued up, unless an
157 // encoder has spit out a big enough chunk all at once that we can't manage
158 // all of it.
159 void Flush();
160
Brian Silverman0465fcf2020-09-24 00:29:18 -0700161 // write_return is what write(2) or writev(2) returned. write_size is the
162 // number of bytes we expected it to write.
163 void HandleWriteReturn(ssize_t write_return, size_t write_size);
164
Brian Silvermanf51499a2020-09-21 12:49:08 -0700165 void UpdateStatsForWrite(aos::monotonic_clock::duration duration,
166 ssize_t written, int iovec_size);
167
168 // Flushes data if we've reached the threshold to do that as part of normal
Austin Schuhbd06ae42021-03-31 22:48:21 -0700169 // operation either due to the outstanding queued data, or because we have
170 // passed our flush period. now is the current time to save some CPU grabbing
171 // the current time. It just needs to be close.
172 void FlushAtThreshold(aos::monotonic_clock::time_point now);
Brian Silvermanf51499a2020-09-21 12:49:08 -0700173
Austin Schuh2f8fd752020-09-01 22:38:28 -0700174 std::string filename_;
Brian Silvermanf51499a2020-09-21 12:49:08 -0700175 std::unique_ptr<DetachedBufferEncoder> encoder_;
Austin Schuh6f3babe2020-01-26 20:34:50 -0800176
Austin Schuha36c8902019-12-30 18:07:15 -0800177 int fd_ = -1;
Brian Silverman0465fcf2020-09-24 00:29:18 -0700178 bool ran_out_of_space_ = false;
179 bool acknowledge_ran_out_of_space_ = false;
Austin Schuha36c8902019-12-30 18:07:15 -0800180
Austin Schuha36c8902019-12-30 18:07:15 -0800181 // List of iovecs to use with writev. This is a member variable to avoid
182 // churn.
183 std::vector<struct iovec> iovec_;
Brian Silvermanf51499a2020-09-21 12:49:08 -0700184
185 std::chrono::nanoseconds max_write_time_ = std::chrono::nanoseconds::zero();
186 int max_write_time_bytes_ = -1;
187 int max_write_time_messages_ = -1;
188 std::chrono::nanoseconds total_write_time_ = std::chrono::nanoseconds::zero();
189 int total_write_count_ = 0;
190 int total_write_messages_ = 0;
191 int total_write_bytes_ = 0;
Austin Schuhbd06ae42021-03-31 22:48:21 -0700192
193 aos::monotonic_clock::time_point last_flush_time_ =
194 aos::monotonic_clock::min_time;
Austin Schuha36c8902019-12-30 18:07:15 -0800195};
196
197// Packes a message pointed to by the context into a MessageHeader.
198flatbuffers::Offset<MessageHeader> PackMessage(
199 flatbuffers::FlatBufferBuilder *fbb, const Context &context,
200 int channel_index, LogType log_type);
201
Austin Schuhfa30c352022-10-16 11:12:02 -0700202// Returns the size that the packed message from PackMessage or
203// PackMessageInline will be.
204flatbuffers::uoffset_t PackMessageSize(LogType log_type,
205 const Context &context);
206
207// Packs the provided message pointed to by context into the provided buffer.
208// This is equivalent to PackMessage, but doesn't require allocating a
209// FlatBufferBuilder underneath.
210size_t PackMessageInline(uint8_t *data, const Context &contex,
211 int channel_index, LogType log_type);
212
Austin Schuh05b70472020-01-01 17:11:17 -0800213// Class to read chunks out of a log file.
214class SpanReader {
215 public:
Austin Schuhcd368422021-11-22 21:23:29 -0800216 SpanReader(std::string_view filename, bool quiet = false);
Austin Schuha36c8902019-12-30 18:07:15 -0800217
Austin Schuh6f3babe2020-01-26 20:34:50 -0800218 std::string_view filename() const { return filename_; }
219
Brian Smarttea913d42021-12-10 15:02:38 -0800220 size_t TotalRead() const { return total_read_; }
221 size_t TotalConsumed() const { return total_consumed_; }
Austin Schuh60e77942022-05-16 17:48:24 -0700222 bool IsIncomplete() const {
223 return is_finished_ && total_consumed_ < total_read_;
224 }
Brian Smarttea913d42021-12-10 15:02:38 -0800225
Austin Schuhcf5f6442021-07-06 10:43:28 -0700226 // Returns a span with the data for the next message from the log file,
227 // including the size. The result is only guarenteed to be valid until
228 // ReadMessage() or PeekMessage() is called again.
Austin Schuh05b70472020-01-01 17:11:17 -0800229 absl::Span<const uint8_t> ReadMessage();
230
Austin Schuhcf5f6442021-07-06 10:43:28 -0700231 // Returns a span with the data for the next message without consuming it.
232 // Multiple calls to PeekMessage return the same data. ReadMessage or
233 // ConsumeMessage must be called to get the next message.
234 absl::Span<const uint8_t> PeekMessage();
235 // Consumes the message so the next call to ReadMessage or PeekMessage returns
236 // new data. This does not invalidate the data.
237 void ConsumeMessage();
238
Austin Schuh05b70472020-01-01 17:11:17 -0800239 private:
240 // TODO(austin): Optimization:
241 // Allocate the 256k blocks like we do today. But, refcount them with
242 // shared_ptr pointed to by the messageheader that is returned. This avoids
243 // the copy. Need to do more benchmarking.
Brian Silvermanf51499a2020-09-21 12:49:08 -0700244 // And (Brian): Consider just mmapping the file and handing out refcounted
245 // pointers into that too.
Austin Schuh05b70472020-01-01 17:11:17 -0800246
247 // Reads a chunk of data into data_. Returns false if no data was read.
248 bool ReadBlock();
249
Austin Schuhc41603c2020-10-11 16:17:37 -0700250 std::string filename_;
Austin Schuh6f3babe2020-01-26 20:34:50 -0800251
Brian Silvermanf51499a2020-09-21 12:49:08 -0700252 // File reader and data decoder.
253 std::unique_ptr<DataDecoder> decoder_;
Austin Schuh05b70472020-01-01 17:11:17 -0800254
Brian Silvermanf51499a2020-09-21 12:49:08 -0700255 // Vector to read into.
256 ResizeableBuffer data_;
Austin Schuh05b70472020-01-01 17:11:17 -0800257
258 // Amount of data consumed already in data_.
259 size_t consumed_data_ = 0;
Brian Smarttea913d42021-12-10 15:02:38 -0800260
261 // Accumulates the total volume of bytes read from filename_
262 size_t total_read_ = 0;
263
264 // Accumulates the total volume of read bytes that were 'consumed' into
265 // messages. May be less than total_read_, if the last message (span) is
266 // either truncated or somehow corrupt.
267 size_t total_consumed_ = 0;
268
269 // Reached the end, no more readable messages.
270 bool is_finished_ = false;
Austin Schuh05b70472020-01-01 17:11:17 -0800271};
272
Brian Silvermanfee16972021-09-14 12:06:38 -0700273// Reads the last header from a log file. This handles any duplicate headers
274// that were written.
275std::optional<SizePrefixedFlatbufferVector<LogFileHeader>> ReadHeader(
276 SpanReader *span_reader);
277std::optional<SizePrefixedFlatbufferVector<LogFileHeader>> ReadHeader(
278 std::string_view filename);
279// Reads the Nth message from a log file, excluding the header. Note: this
280// doesn't handle duplicate headers.
281std::optional<SizePrefixedFlatbufferVector<MessageHeader>> ReadNthMessage(
282 std::string_view filename, size_t n);
283
Tyler Chatowb7c6eba2021-07-28 14:43:23 -0700284class UnpackedMessageHeader;
285
Austin Schuh05b70472020-01-01 17:11:17 -0800286// Class which handles reading the header and messages from the log file. This
287// handles any per-file state left before merging below.
288class MessageReader {
289 public:
290 MessageReader(std::string_view filename);
291
Austin Schuh6f3babe2020-01-26 20:34:50 -0800292 std::string_view filename() const { return span_reader_.filename(); }
293
Austin Schuh05b70472020-01-01 17:11:17 -0800294 // Returns the header from the log file.
295 const LogFileHeader *log_file_header() const {
Austin Schuh97789fc2020-08-01 14:42:45 -0700296 return &raw_log_file_header_.message();
297 }
298
299 // Returns the raw data of the header from the log file.
Austin Schuhadd6eb32020-11-09 21:24:26 -0800300 const SizePrefixedFlatbufferVector<LogFileHeader> &raw_log_file_header()
301 const {
Austin Schuh97789fc2020-08-01 14:42:45 -0700302 return raw_log_file_header_;
Austin Schuh05b70472020-01-01 17:11:17 -0800303 }
304
305 // Returns the minimum maount of data needed to queue up for sorting before
306 // ware guarenteed to not see data out of order.
307 std::chrono::nanoseconds max_out_of_order_duration() const {
308 return max_out_of_order_duration_;
309 }
310
Austin Schuhcde938c2020-02-02 17:30:07 -0800311 // Returns the newest timestamp read out of the log file.
Austin Schuh05b70472020-01-01 17:11:17 -0800312 monotonic_clock::time_point newest_timestamp() const {
313 return newest_timestamp_;
314 }
315
316 // Returns the next message if there is one.
Tyler Chatowb7c6eba2021-07-28 14:43:23 -0700317 std::shared_ptr<UnpackedMessageHeader> ReadMessage();
Austin Schuh05b70472020-01-01 17:11:17 -0800318
319 // The time at which we need to read another chunk from the logfile.
320 monotonic_clock::time_point queue_data_time() const {
321 return newest_timestamp() - max_out_of_order_duration();
322 }
323
Brian Smarttea913d42021-12-10 15:02:38 -0800324 // Flag value setters for testing
325 void set_crash_on_corrupt_message_flag(bool b) {
326 crash_on_corrupt_message_flag_ = b;
327 }
328 void set_ignore_corrupt_messages_flag(bool b) {
329 ignore_corrupt_messages_flag_ = b;
330 }
331
Austin Schuh05b70472020-01-01 17:11:17 -0800332 private:
333 // Log chunk reader.
334 SpanReader span_reader_;
335
Austin Schuh97789fc2020-08-01 14:42:45 -0700336 // Vector holding the raw data for the log file header.
Austin Schuhadd6eb32020-11-09 21:24:26 -0800337 SizePrefixedFlatbufferVector<LogFileHeader> raw_log_file_header_;
Austin Schuh05b70472020-01-01 17:11:17 -0800338
339 // Minimum amount of data to queue up for sorting before we are guarenteed
340 // to not see data out of order.
341 std::chrono::nanoseconds max_out_of_order_duration_;
342
343 // Timestamp of the newest message in a channel queue.
344 monotonic_clock::time_point newest_timestamp_ = monotonic_clock::min_time;
Brian Smarttea913d42021-12-10 15:02:38 -0800345
346 // Total volume of verifiable messages from the beginning of the file.
347 // TODO - are message counts also useful?
348 size_t total_verified_before_ = 0;
349
350 // Total volume of messages with corrupted flatbuffer formatting, if any.
351 // Excludes corrupted message content.
352 // TODO - if the layout included something as simple as a CRC (relatively
353 // fast and robust enough) for each span, then corrupted content could be
354 // included in this check.
355 size_t total_corrupted_ = 0;
356
357 // Total volume of verifiable messages intermixed with corrupted messages,
358 // if any. Will be == 0 if total_corrupted_ == 0.
359 size_t total_verified_during_ = 0;
360
361 // Total volume of verifiable messages found after the last corrupted one,
362 // if any. Will be == 0 if total_corrupted_ == 0.
363 size_t total_verified_after_ = 0;
364
365 bool is_corrupted() const { return total_corrupted_ > 0; }
366
367 bool crash_on_corrupt_message_flag_ = true;
368 bool ignore_corrupt_messages_flag_ = false;
Austin Schuh05b70472020-01-01 17:11:17 -0800369};
370
Austin Schuhc41603c2020-10-11 16:17:37 -0700371// A class to seamlessly read messages from a list of part files.
372class PartsMessageReader {
373 public:
374 PartsMessageReader(LogParts log_parts);
375
376 std::string_view filename() const { return message_reader_.filename(); }
377
Austin Schuhd2f96102020-12-01 20:27:29 -0800378 // Returns the LogParts that holds the filenames we are reading.
379 const LogParts &parts() const { return parts_; }
380
Austin Schuh4b5c22a2020-11-30 22:58:43 -0800381 const LogFileHeader *log_file_header() const {
382 return message_reader_.log_file_header();
383 }
384
Austin Schuhc41603c2020-10-11 16:17:37 -0700385 // Returns the minimum amount of data needed to queue up for sorting before
386 // we are guarenteed to not see data out of order.
387 std::chrono::nanoseconds max_out_of_order_duration() const {
388 return message_reader_.max_out_of_order_duration();
389 }
390
391 // Returns the newest timestamp read out of the log file.
392 monotonic_clock::time_point newest_timestamp() const {
393 return newest_timestamp_;
394 }
395
396 // Returns the next message if there is one, or nullopt if we have reached the
397 // end of all the files.
398 // Note: reading the next message may change the max_out_of_order_duration().
Tyler Chatowb7c6eba2021-07-28 14:43:23 -0700399 std::shared_ptr<UnpackedMessageHeader> ReadMessage();
Austin Schuhc41603c2020-10-11 16:17:37 -0700400
Austin Schuh48507722021-07-17 17:29:24 -0700401 // Returns the boot count for the requested node, or std::nullopt if we don't
402 // know.
403 std::optional<size_t> boot_count(size_t node_index) const {
404 CHECK_GE(node_index, 0u);
405 CHECK_LT(node_index, boot_counts_.size());
406 return boot_counts_[node_index];
407 }
408
Austin Schuhc41603c2020-10-11 16:17:37 -0700409 private:
410 // Opens the next log and updates message_reader_. Sets done_ if there is
411 // nothing more to do.
412 void NextLog();
Austin Schuh48507722021-07-17 17:29:24 -0700413 void ComputeBootCounts();
Austin Schuhc41603c2020-10-11 16:17:37 -0700414
415 const LogParts parts_;
416 size_t next_part_index_ = 1u;
417 bool done_ = false;
418 MessageReader message_reader_;
Brian Silvermanfee16972021-09-14 12:06:38 -0700419 // We instantiate the next one early, to allow implementations to prefetch.
420 // TODO(Brian): To get optimal performance when downloading, this needs more
421 // communication with the implementation to prioritize the next part and add
422 // more parallelism when it helps. Maybe some kind of a queue of parts in
423 // order, and the implementation gets to pull however many make sense off the
424 // front?
425 std::optional<MessageReader> next_message_reader_;
Austin Schuhc41603c2020-10-11 16:17:37 -0700426
Austin Schuh315b96b2020-12-11 21:21:12 -0800427 // True after we have seen a message after the start of the log. The
428 // guarentees on logging essentially are that all data from before the
429 // starting time of the log may be arbitrarily out of order, but once we get
430 // max_out_of_order_duration past the start, everything will remain within
431 // max_out_of_order_duration. We shouldn't see anything before the start
432 // after we've seen a message that is at least max_out_of_order_duration after
433 // the start.
434 bool after_start_ = false;
435
Austin Schuhc41603c2020-10-11 16:17:37 -0700436 monotonic_clock::time_point newest_timestamp_ = monotonic_clock::min_time;
Austin Schuh48507722021-07-17 17:29:24 -0700437
438 // Per node boot counts.
439 std::vector<std::optional<size_t>> boot_counts_;
Austin Schuhc41603c2020-10-11 16:17:37 -0700440};
441
Tyler Chatowb7c6eba2021-07-28 14:43:23 -0700442// Stores MessageHeader as a flat header and inline, aligned block of data.
443class UnpackedMessageHeader {
444 public:
445 UnpackedMessageHeader(const UnpackedMessageHeader &) = delete;
446 UnpackedMessageHeader &operator=(const UnpackedMessageHeader &) = delete;
447
448 // The channel.
449 uint32_t channel_index = 0xffffffff;
450
451 monotonic_clock::time_point monotonic_sent_time;
452 realtime_clock::time_point realtime_sent_time;
453
454 // The local queue index.
455 uint32_t queue_index = 0xffffffff;
456
Austin Schuh826e6ce2021-11-18 20:33:10 -0800457 std::optional<aos::monotonic_clock::time_point> monotonic_remote_time;
Tyler Chatowb7c6eba2021-07-28 14:43:23 -0700458
459 std::optional<realtime_clock::time_point> realtime_remote_time;
460 std::optional<uint32_t> remote_queue_index;
461
462 // This field is defaulted in the flatbuffer, so we need to store both the
463 // possibly defaulted value and whether it is defaulted.
464 monotonic_clock::time_point monotonic_timestamp_time;
465 bool has_monotonic_timestamp_time;
466
467 static std::shared_ptr<UnpackedMessageHeader> MakeMessage(
468 const MessageHeader &message);
469
470 // Note: we are storing a span here because we need something to put in the
471 // SharedSpan pointer that RawSender takes. We are using the aliasing
472 // constructor of shared_ptr to avoid the allocation, and it needs a nice
473 // pointer to track.
474 absl::Span<const uint8_t> span;
475
476 char actual_data[];
477
478 private:
479 ~UnpackedMessageHeader() {}
480
481 static void DestroyAndFree(UnpackedMessageHeader *p) {
482 p->~UnpackedMessageHeader();
483 free(p);
484 }
485};
486
487std::ostream &operator<<(std::ostream &os,
488 const UnpackedMessageHeader &message);
489
Austin Schuh1be0ce42020-11-29 22:43:26 -0800490// Struct to hold a message as it gets sorted on a single node.
491struct Message {
492 // The channel.
493 uint32_t channel_index = 0xffffffff;
494 // The local queue index.
Austin Schuh58646e22021-08-23 23:51:46 -0700495 // TODO(austin): Technically the boot inside queue_index is redundant with
496 // timestamp. In practice, it is less error-prone to duplicate it. Maybe a
497 // function to return the combined struct?
498 BootQueueIndex queue_index;
Austin Schuh2dc8c7d2021-07-01 17:41:28 -0700499 // The local timestamp.
500 BootTimestamp timestamp;
Austin Schuhf16ef6a2021-06-30 21:48:17 -0700501
Austin Schuh48507722021-07-17 17:29:24 -0700502 // Remote boot when this is a timestamp.
503 size_t monotonic_remote_boot = 0xffffff;
504
505 size_t monotonic_timestamp_boot = 0xffffff;
506
Tyler Chatowb7c6eba2021-07-28 14:43:23 -0700507 std::shared_ptr<UnpackedMessageHeader> data;
Austin Schuh1be0ce42020-11-29 22:43:26 -0800508
509 bool operator<(const Message &m2) const;
510 bool operator>=(const Message &m2) const;
Austin Schuh8f52ed52020-11-30 23:12:39 -0800511 bool operator==(const Message &m2) const;
Austin Schuh1be0ce42020-11-29 22:43:26 -0800512};
513
514std::ostream &operator<<(std::ostream &os, const Message &m);
515
Austin Schuhd2f96102020-12-01 20:27:29 -0800516// Structure to hold a full message and all the timestamps, which may or may not
517// have been sent from a remote node. The remote_queue_index will be invalid if
518// this message is from the point of view of the node which sent it.
519struct TimestampedMessage {
520 uint32_t channel_index = 0xffffffff;
521
Austin Schuh58646e22021-08-23 23:51:46 -0700522 BootQueueIndex queue_index;
Austin Schuh2dc8c7d2021-07-01 17:41:28 -0700523 BootTimestamp monotonic_event_time;
Austin Schuhd2f96102020-12-01 20:27:29 -0800524 realtime_clock::time_point realtime_event_time = realtime_clock::min_time;
525
Austin Schuh58646e22021-08-23 23:51:46 -0700526 BootQueueIndex remote_queue_index;
Austin Schuh2dc8c7d2021-07-01 17:41:28 -0700527 BootTimestamp monotonic_remote_time;
Austin Schuhd2f96102020-12-01 20:27:29 -0800528 realtime_clock::time_point realtime_remote_time = realtime_clock::min_time;
529
Austin Schuh2dc8c7d2021-07-01 17:41:28 -0700530 BootTimestamp monotonic_timestamp_time;
Austin Schuh8bf1e632021-01-02 22:41:04 -0800531
Tyler Chatowb7c6eba2021-07-28 14:43:23 -0700532 std::shared_ptr<UnpackedMessageHeader> data;
Austin Schuhd2f96102020-12-01 20:27:29 -0800533};
534
535std::ostream &operator<<(std::ostream &os, const TimestampedMessage &m);
536
Austin Schuh4b5c22a2020-11-30 22:58:43 -0800537// Class to sort the resulting messages from a PartsMessageReader.
538class LogPartsSorter {
539 public:
540 LogPartsSorter(LogParts log_parts);
541
Austin Schuh0ca51f32020-12-25 21:51:45 -0800542 // Returns the parts that this is sorting messages from.
543 const LogParts &parts() const { return parts_message_reader_.parts(); }
Austin Schuh4b5c22a2020-11-30 22:58:43 -0800544
Austin Schuhd2f96102020-12-01 20:27:29 -0800545 monotonic_clock::time_point monotonic_start_time() const {
Austin Schuh0ca51f32020-12-25 21:51:45 -0800546 return parts().monotonic_start_time;
Austin Schuhd2f96102020-12-01 20:27:29 -0800547 }
548 realtime_clock::time_point realtime_start_time() const {
Austin Schuh0ca51f32020-12-25 21:51:45 -0800549 return parts().realtime_start_time;
Austin Schuhd2f96102020-12-01 20:27:29 -0800550 }
551
Austin Schuh4b5c22a2020-11-30 22:58:43 -0800552 // The time this data is sorted until.
553 monotonic_clock::time_point sorted_until() const { return sorted_until_; }
554
555 // Returns the next sorted message from the log file. It is safe to call
556 // std::move() on the result to move the data flatbuffer from it.
557 Message *Front();
558 // Pops the front message. This should only be called after a call to
559 // Front().
560 void PopFront();
561
562 // Returns a debug string representing the contents of this sorter.
563 std::string DebugString() const;
564
565 private:
566 // Log parts reader we are wrapping.
567 PartsMessageReader parts_message_reader_;
568 // Cache of the time we are sorted until.
569 aos::monotonic_clock::time_point sorted_until_ = monotonic_clock::min_time;
570
Austin Schuhb000de62020-12-03 22:00:40 -0800571 // Timestamp of the last message returned. Used to make sure nothing goes
572 // backwards.
573 monotonic_clock::time_point last_message_time_ = monotonic_clock::min_time;
574
Austin Schuh4b5c22a2020-11-30 22:58:43 -0800575 // Set used for efficient sorting of messages. We can benchmark and evaluate
576 // other data structures if this proves to be the bottleneck.
577 absl::btree_set<Message> messages_;
Austin Schuh48507722021-07-17 17:29:24 -0700578
579 // Mapping from channel to source node.
580 // TODO(austin): Should we put this in Boots so it can be cached for everyone?
581 std::vector<size_t> source_node_index_;
Austin Schuh4b5c22a2020-11-30 22:58:43 -0800582};
583
Austin Schuh8f52ed52020-11-30 23:12:39 -0800584// Class to run merge sort on the messages from multiple LogPartsSorter
585// instances.
586class NodeMerger {
587 public:
Austin Schuhd2f96102020-12-01 20:27:29 -0800588 NodeMerger(std::vector<LogParts> parts);
589
Austin Schuhf16ef6a2021-06-30 21:48:17 -0700590 // Copying and moving will mess up the internal raw pointers. Just don't do
591 // it.
592 NodeMerger(NodeMerger const &) = delete;
593 NodeMerger(NodeMerger &&) = delete;
594 void operator=(NodeMerger const &) = delete;
595 void operator=(NodeMerger &&) = delete;
596
Austin Schuhd2f96102020-12-01 20:27:29 -0800597 // Node index in the configuration of this node.
598 int node() const { return node_; }
Austin Schuh8f52ed52020-11-30 23:12:39 -0800599
Austin Schuh0ca51f32020-12-25 21:51:45 -0800600 // List of parts being sorted together.
601 std::vector<const LogParts *> Parts() const;
602
603 const Configuration *configuration() const {
604 return parts_sorters_[0].parts().config.get();
Austin Schuhd2f96102020-12-01 20:27:29 -0800605 }
606
607 monotonic_clock::time_point monotonic_start_time() const {
608 return monotonic_start_time_;
609 }
610 realtime_clock::time_point realtime_start_time() const {
611 return realtime_start_time_;
Austin Schuh8f52ed52020-11-30 23:12:39 -0800612 }
Austin Schuh5dd22842021-11-17 16:09:39 -0800613 monotonic_clock::time_point monotonic_oldest_time() const {
614 return monotonic_oldest_time_;
615 }
Austin Schuh8f52ed52020-11-30 23:12:39 -0800616
617 // The time this data is sorted until.
618 monotonic_clock::time_point sorted_until() const { return sorted_until_; }
619
620 // Returns the next sorted message from the set of log files. It is safe to
621 // call std::move() on the result to move the data flatbuffer from it.
622 Message *Front();
623 // Pops the front message. This should only be called after a call to
624 // Front().
625 void PopFront();
626
627 private:
628 // Unsorted list of all parts sorters.
Austin Schuhd2f96102020-12-01 20:27:29 -0800629 std::vector<LogPartsSorter> parts_sorters_;
Austin Schuh8f52ed52020-11-30 23:12:39 -0800630 // Pointer to the parts sorter holding the current Front message if one
631 // exists, or nullptr if a new one needs to be found.
632 LogPartsSorter *current_ = nullptr;
633 // Cached sorted_until value.
634 aos::monotonic_clock::time_point sorted_until_ = monotonic_clock::min_time;
Austin Schuhd2f96102020-12-01 20:27:29 -0800635
636 // Cached node.
637 int node_;
638
Austin Schuhb000de62020-12-03 22:00:40 -0800639 // Timestamp of the last message returned. Used to make sure nothing goes
640 // backwards.
641 monotonic_clock::time_point last_message_time_ = monotonic_clock::min_time;
642
Austin Schuhd2f96102020-12-01 20:27:29 -0800643 realtime_clock::time_point realtime_start_time_ = realtime_clock::max_time;
644 monotonic_clock::time_point monotonic_start_time_ = monotonic_clock::max_time;
Austin Schuh60e77942022-05-16 17:48:24 -0700645 monotonic_clock::time_point monotonic_oldest_time_ =
646 monotonic_clock::max_time;
Austin Schuhd2f96102020-12-01 20:27:29 -0800647};
648
Austin Schuhf16ef6a2021-06-30 21:48:17 -0700649// Class to concatenate multiple boots worth of logs into a single per-node
650// stream.
651class BootMerger {
652 public:
653 BootMerger(std::vector<LogParts> file);
654
655 // Copying and moving will mess up the internal raw pointers. Just don't do
656 // it.
657 BootMerger(BootMerger const &) = delete;
658 BootMerger(BootMerger &&) = delete;
659 void operator=(BootMerger const &) = delete;
660 void operator=(BootMerger &&) = delete;
661
662 // Node index in the configuration of this node.
663 int node() const { return node_mergers_[0]->node(); }
664
665 // List of parts being sorted together.
666 std::vector<const LogParts *> Parts() const;
667
668 const Configuration *configuration() const {
669 return node_mergers_[0]->configuration();
670 }
671
Austin Schuh2dc8c7d2021-07-01 17:41:28 -0700672 monotonic_clock::time_point monotonic_start_time(size_t boot) const {
673 CHECK_LT(boot, node_mergers_.size());
674 return node_mergers_[boot]->monotonic_start_time();
Austin Schuhf16ef6a2021-06-30 21:48:17 -0700675 }
Austin Schuh2dc8c7d2021-07-01 17:41:28 -0700676 realtime_clock::time_point realtime_start_time(size_t boot) const {
677 CHECK_LT(boot, node_mergers_.size());
678 return node_mergers_[boot]->realtime_start_time();
Austin Schuhf16ef6a2021-06-30 21:48:17 -0700679 }
Austin Schuh5dd22842021-11-17 16:09:39 -0800680 monotonic_clock::time_point monotonic_oldest_time(size_t boot) const {
681 CHECK_LT(boot, node_mergers_.size());
682 return node_mergers_[boot]->monotonic_oldest_time();
683 }
Austin Schuhf16ef6a2021-06-30 21:48:17 -0700684
685 bool started() const {
686 return node_mergers_[index_]->sorted_until() != monotonic_clock::min_time ||
687 index_ != 0;
688 }
689
690 // Returns the next sorted message from the set of log files. It is safe to
691 // call std::move() on the result to move the data flatbuffer from it.
692 Message *Front();
693 // Pops the front message. This should only be called after a call to
694 // Front().
695 void PopFront();
696
697 private:
698 int index_ = 0;
699
700 // TODO(austin): Sanjay points out this is pretty inefficient. Don't keep so
701 // many things open.
702 std::vector<std::unique_ptr<NodeMerger>> node_mergers_;
703};
704
Austin Schuhd2f96102020-12-01 20:27:29 -0800705// Class to match timestamps with the corresponding data from other nodes.
Austin Schuh79b30942021-01-24 22:32:21 -0800706//
707// This class also buffers data for the node it represents, and supports
708// notifying when new data is queued as well as queueing until a point in time.
Austin Schuhd2f96102020-12-01 20:27:29 -0800709class TimestampMapper {
710 public:
711 TimestampMapper(std::vector<LogParts> file);
712
713 // Copying and moving will mess up the internal raw pointers. Just don't do
714 // it.
715 TimestampMapper(TimestampMapper const &) = delete;
716 TimestampMapper(TimestampMapper &&) = delete;
717 void operator=(TimestampMapper const &) = delete;
718 void operator=(TimestampMapper &&) = delete;
719
720 // TODO(austin): It would be super helpful to provide a way to queue up to
721 // time X without matching timestamps, and to then be able to pull the
722 // timestamps out of this queue. This lets us bootstrap time estimation
723 // without exploding memory usage worst case.
724
Austin Schuh0ca51f32020-12-25 21:51:45 -0800725 const Configuration *configuration() const { return configuration_.get(); }
Austin Schuhd2f96102020-12-01 20:27:29 -0800726
727 // Returns which node this is sorting for.
Austin Schuh2dc8c7d2021-07-01 17:41:28 -0700728 size_t node() const { return boot_merger_.node(); }
Austin Schuhd2f96102020-12-01 20:27:29 -0800729
730 // The start time of this log.
Austin Schuh2dc8c7d2021-07-01 17:41:28 -0700731 monotonic_clock::time_point monotonic_start_time(size_t boot) const {
732 return boot_merger_.monotonic_start_time(boot);
Austin Schuhd2f96102020-12-01 20:27:29 -0800733 }
Austin Schuh2dc8c7d2021-07-01 17:41:28 -0700734 realtime_clock::time_point realtime_start_time(size_t boot) const {
735 return boot_merger_.realtime_start_time(boot);
Austin Schuhd2f96102020-12-01 20:27:29 -0800736 }
Austin Schuh5dd22842021-11-17 16:09:39 -0800737 // Returns the oldest timestamp on a message on this boot.
738 monotonic_clock::time_point monotonic_oldest_time(size_t boot) const {
739 return boot_merger_.monotonic_oldest_time(boot);
740 }
Austin Schuhd2f96102020-12-01 20:27:29 -0800741
742 // Uses timestamp_mapper as the peer for its node. Only one mapper may be set
743 // for each node. Peers are used to look up the data for timestamps on this
744 // node.
745 void AddPeer(TimestampMapper *timestamp_mapper);
746
Austin Schuh24bf4972021-06-29 22:09:08 -0700747 // Returns true if anything has been queued up.
Austin Schuh2dc8c7d2021-07-01 17:41:28 -0700748 bool started() const { return boot_merger_.started(); }
Austin Schuhd2f96102020-12-01 20:27:29 -0800749
750 // Returns the next message for this node.
751 TimestampedMessage *Front();
752 // Pops the next message. Front must be called first.
753 void PopFront();
754
755 // Returns debug information about this node.
756 std::string DebugString() const;
757
Austin Schuh79b30942021-01-24 22:32:21 -0800758 // Queues data the provided time.
Austin Schuh2dc8c7d2021-07-01 17:41:28 -0700759 void QueueUntil(BootTimestamp queue_time);
Austin Schuhe639ea12021-01-25 13:00:22 -0800760 // Queues until we have time_estimation_buffer of data in the queue.
761 void QueueFor(std::chrono::nanoseconds time_estimation_buffer);
Austin Schuh79b30942021-01-24 22:32:21 -0800762
Austin Schuh06601222021-01-26 17:02:50 -0800763 // Queues until the condition is met.
764 template <typename T>
765 void QueueUntilCondition(T fn) {
766 while (true) {
767 if (fn()) {
768 break;
769 }
770 if (!QueueMatched()) {
771 break;
772 }
773 }
774 }
775
Austin Schuh79b30942021-01-24 22:32:21 -0800776 // Sets a callback to be called whenever a full message is queued.
777 void set_timestamp_callback(std::function<void(TimestampedMessage *)> fn) {
778 timestamp_callback_ = fn;
779 }
780
Austin Schuhd2f96102020-12-01 20:27:29 -0800781 private:
782 // The state for a remote node. This holds the data that needs to be matched
783 // with the remote node's timestamps.
784 struct NodeData {
785 // True if we should save data here. This should be true if any of the
786 // bools in delivered below are true.
787 bool any_delivered = false;
788
Austin Schuh36c00932021-07-19 18:13:21 -0700789 // True if we have a peer and therefore should be saving data for it.
790 bool save_for_peer = false;
791
Austin Schuhd2f96102020-12-01 20:27:29 -0800792 // Peer pointer. This node is only to be considered if a peer is set.
793 TimestampMapper *peer = nullptr;
794
795 struct ChannelData {
796 // Deque per channel. This contains the data from the outside
797 // TimestampMapper node which is relevant for the node this NodeData
798 // points to.
799 std::deque<Message> messages;
800 // Bool tracking per channel if a message is delivered to the node this
801 // NodeData represents.
802 bool delivered = false;
Austin Schuh6a7358f2021-11-18 22:40:40 -0800803 // The TTL for delivery.
804 std::chrono::nanoseconds time_to_live = std::chrono::nanoseconds(0);
Austin Schuhd2f96102020-12-01 20:27:29 -0800805 };
806
807 // Vector with per channel data.
808 std::vector<ChannelData> channels;
809 };
810
811 // Returns (and forgets about) the data for the provided timestamp message
812 // showing when it was delivered to this node.
813 Message MatchingMessageFor(const Message &message);
814
815 // Queues up a single message into our message queue, and any nodes that this
816 // message is delivered to. Returns true if one was available, false
817 // otherwise.
818 bool Queue();
819
Austin Schuh79b30942021-01-24 22:32:21 -0800820 // Queues up a single matched message into our matched message queue. Returns
821 // true if one was queued, and false otherwise.
822 bool QueueMatched();
823
Austin Schuhd2f96102020-12-01 20:27:29 -0800824 // Queues up data until we have at least one message >= to time t.
825 // Useful for triggering a remote node to read enough data to have the
826 // timestamp you care about available.
Austin Schuh2dc8c7d2021-07-01 17:41:28 -0700827 void QueueUnmatchedUntil(BootTimestamp t);
Austin Schuhd2f96102020-12-01 20:27:29 -0800828
Austin Schuh79b30942021-01-24 22:32:21 -0800829 // Queues m into matched_messages_.
830 void QueueMessage(Message *m);
Austin Schuhd2f96102020-12-01 20:27:29 -0800831
Austin Schuh58646e22021-08-23 23:51:46 -0700832 // Returns the name of the node this class is sorting for.
833 std::string_view node_name() const {
834 return configuration_->has_nodes() ? configuration_->nodes()
835 ->Get(boot_merger_.node())
836 ->name()
837 ->string_view()
838 : "(single node)";
839 }
840
Austin Schuhd2f96102020-12-01 20:27:29 -0800841 // The node merger to source messages from.
Austin Schuh2dc8c7d2021-07-01 17:41:28 -0700842 BootMerger boot_merger_;
Austin Schuh0ca51f32020-12-25 21:51:45 -0800843
844 std::shared_ptr<const Configuration> configuration_;
845
Austin Schuhd2f96102020-12-01 20:27:29 -0800846 // The buffer of messages for this node. These are not matched with any
847 // remote data.
848 std::deque<Message> messages_;
849 // The node index for the source node for each channel.
850 std::vector<size_t> source_node_;
851
852 // Vector per node. Not all nodes will have anything.
853 std::vector<NodeData> nodes_data_;
854
855 // Latest message to return.
Austin Schuh79b30942021-01-24 22:32:21 -0800856 std::deque<TimestampedMessage> matched_messages_;
Austin Schuhd2f96102020-12-01 20:27:29 -0800857
Austin Schuh79b30942021-01-24 22:32:21 -0800858 // Tracks the state of the first message in matched_messages_. Do we need to
859 // update it, is it valid, or should we return nullptr?
Austin Schuhd2f96102020-12-01 20:27:29 -0800860 enum class FirstMessage {
861 kNeedsUpdate,
862 kInMessage,
863 kNullptr,
864 };
865 FirstMessage first_message_ = FirstMessage::kNeedsUpdate;
866
867 // Timestamp of the last message returned. Used to make sure nothing goes
868 // backwards.
Austin Schuh2dc8c7d2021-07-01 17:41:28 -0700869 BootTimestamp last_message_time_ = BootTimestamp::min_time();
Austin Schuh6a7358f2021-11-18 22:40:40 -0800870 BootTimestamp last_popped_message_time_ = BootTimestamp::min_time();
Austin Schuhd2f96102020-12-01 20:27:29 -0800871 // Time this node is queued up until. Used for caching.
Austin Schuh2dc8c7d2021-07-01 17:41:28 -0700872 BootTimestamp queued_until_ = BootTimestamp::min_time();
Austin Schuh79b30942021-01-24 22:32:21 -0800873
874 std::function<void(TimestampedMessage *)> timestamp_callback_;
Austin Schuh8f52ed52020-11-30 23:12:39 -0800875};
876
Austin Schuhee711052020-08-24 16:06:09 -0700877// Returns the node name with a trailing space, or an empty string if we are on
878// a single node.
879std::string MaybeNodeName(const Node *);
880
Brian Silvermanf51499a2020-09-21 12:49:08 -0700881} // namespace aos::logger
Austin Schuha36c8902019-12-30 18:07:15 -0800882
883#endif // AOS_EVENTS_LOGGING_LOGFILE_UTILS_H_