blob: 7105b0c4dd04b2650350beb81cb0bdecab50400b [file] [log] [blame]
Austin Schuha36c8902019-12-30 18:07:15 -08001#ifndef AOS_EVENTS_LOGGING_LOGFILE_UTILS_H_
2#define AOS_EVENTS_LOGGING_LOGFILE_UTILS_H_
3
4#include <sys/uio.h>
5
Austin Schuh97789fc2020-08-01 14:42:45 -07006#include <chrono>
Austin Schuh05b70472020-01-01 17:11:17 -08007#include <deque>
Austin Schuh97789fc2020-08-01 14:42:45 -07008#include <limits>
9#include <memory>
Austin Schuh05b70472020-01-01 17:11:17 -080010#include <optional>
Austin Schuhfa895892020-01-07 20:07:41 -080011#include <string>
Austin Schuha36c8902019-12-30 18:07:15 -080012#include <string_view>
Brian Silverman98360e22020-04-28 16:51:20 -070013#include <tuple>
Austin Schuh97789fc2020-08-01 14:42:45 -070014#include <utility>
Austin Schuha36c8902019-12-30 18:07:15 -080015#include <vector>
16
Austin Schuh4b5c22a2020-11-30 22:58:43 -080017#include "absl/container/btree_set.h"
Austin Schuh05b70472020-01-01 17:11:17 -080018#include "absl/types/span.h"
Brian Silvermanf51499a2020-09-21 12:49:08 -070019#include "aos/containers/resizeable_buffer.h"
Austin Schuha36c8902019-12-30 18:07:15 -080020#include "aos/events/event_loop.h"
Austin Schuh2dc8c7d2021-07-01 17:41:28 -070021#include "aos/events/logging/boot_timestamp.h"
Brian Silvermanf51499a2020-09-21 12:49:08 -070022#include "aos/events/logging/buffer_encoder.h"
Austin Schuhc41603c2020-10-11 16:17:37 -070023#include "aos/events/logging/logfile_sorting.h"
Austin Schuha36c8902019-12-30 18:07:15 -080024#include "aos/events/logging/logger_generated.h"
Brian Silvermanf51499a2020-09-21 12:49:08 -070025#include "aos/flatbuffers.h"
Austin Schuhf2d0e682022-10-16 14:20:58 -070026#include "aos/network/remote_message_generated.h"
Austin Schuha36c8902019-12-30 18:07:15 -080027#include "flatbuffers/flatbuffers.h"
28
Brian Silvermanf51499a2020-09-21 12:49:08 -070029namespace aos::logger {
Austin Schuha36c8902019-12-30 18:07:15 -080030
31enum class LogType : uint8_t {
32 // The message originated on this node and should be logged here.
33 kLogMessage,
34 // The message originated on another node, but only the delivery times are
35 // logged here.
36 kLogDeliveryTimeOnly,
37 // The message originated on another node. Log it and the delivery times
38 // together. The message_gateway is responsible for logging any messages
39 // which didn't get delivered.
Austin Schuh6f3babe2020-01-26 20:34:50 -080040 kLogMessageAndDeliveryTime,
41 // The message originated on the other node and should be logged on this node.
42 kLogRemoteMessage
Austin Schuha36c8902019-12-30 18:07:15 -080043};
44
Austin Schuha36c8902019-12-30 18:07:15 -080045// This class manages efficiently writing a sequence of detached buffers to a
Brian Silvermanf51499a2020-09-21 12:49:08 -070046// file. It encodes them, queues them up, and batches the write operation.
Austin Schuha36c8902019-12-30 18:07:15 -080047class DetachedBufferWriter {
48 public:
Brian Silvermana9f2ec92020-10-06 18:00:53 -070049 // Marker struct for one of our constructor overloads.
50 struct already_out_of_space_t {};
51
Brian Silvermanf51499a2020-09-21 12:49:08 -070052 DetachedBufferWriter(std::string_view filename,
Austin Schuh48d10d62022-10-16 22:19:23 -070053 std::unique_ptr<DataEncoder> encoder);
Brian Silvermana9f2ec92020-10-06 18:00:53 -070054 // Creates a dummy instance which won't even open a file. It will act as if
55 // opening the file ran out of space immediately.
56 DetachedBufferWriter(already_out_of_space_t) : ran_out_of_space_(true) {}
Austin Schuh2f8fd752020-09-01 22:38:28 -070057 DetachedBufferWriter(DetachedBufferWriter &&other);
58 DetachedBufferWriter(const DetachedBufferWriter &) = delete;
59
Austin Schuha36c8902019-12-30 18:07:15 -080060 ~DetachedBufferWriter();
61
Austin Schuh2f8fd752020-09-01 22:38:28 -070062 DetachedBufferWriter &operator=(DetachedBufferWriter &&other);
Brian Silverman98360e22020-04-28 16:51:20 -070063 DetachedBufferWriter &operator=(const DetachedBufferWriter &) = delete;
64
Austin Schuh6f3babe2020-01-26 20:34:50 -080065 std::string_view filename() const { return filename_; }
66
Brian Silvermana9f2ec92020-10-06 18:00:53 -070067 // This will be true until Close() is called, unless the file couldn't be
68 // created due to running out of space.
69 bool is_open() const { return fd_ != -1; }
70
Brian Silvermanf51499a2020-09-21 12:49:08 -070071 // Queues up a finished FlatBufferBuilder to be encoded and written.
72 //
73 // Triggers a flush if there's enough data queued up.
74 //
75 // Steals the detached buffer from it.
Austin Schuh48d10d62022-10-16 22:19:23 -070076 void CopyMessage(DataEncoder::Copier *coppier,
77 aos::monotonic_clock::time_point now);
Austin Schuha36c8902019-12-30 18:07:15 -080078
Brian Silvermanf51499a2020-09-21 12:49:08 -070079 // Queues up data in span. May copy or may write it to disk immediately.
80 void QueueSpan(absl::Span<const uint8_t> span);
Austin Schuha36c8902019-12-30 18:07:15 -080081
Brian Silverman0465fcf2020-09-24 00:29:18 -070082 // Indicates we got ENOSPC when trying to write. After this returns true, no
83 // further data is written.
84 bool ran_out_of_space() const { return ran_out_of_space_; }
85
86 // To avoid silently failing to write logfiles, you must call this before
87 // destruction if ran_out_of_space() is true and the situation has been
88 // handled.
89 void acknowledge_out_of_space() {
90 CHECK(ran_out_of_space_);
91 acknowledge_ran_out_of_space_ = true;
92 }
93
94 // Fully flushes and closes the underlying file now. No additional data may be
95 // enqueued after calling this.
96 //
97 // This will be performed in the destructor automatically.
98 //
99 // Note that this may set ran_out_of_space().
100 void Close();
101
Brian Silvermanf51499a2020-09-21 12:49:08 -0700102 // Returns the total number of bytes written and currently queued.
Austin Schuha426f1f2021-03-31 22:27:41 -0700103 size_t total_bytes() const {
104 if (!encoder_) {
105 return 0;
106 }
107 return encoder_->total_bytes();
108 }
Austin Schuha36c8902019-12-30 18:07:15 -0800109
Brian Silvermanf51499a2020-09-21 12:49:08 -0700110 // The maximum time for a single write call, or 0 if none have been performed.
111 std::chrono::nanoseconds max_write_time() const { return max_write_time_; }
112 // The number of bytes in the longest write call, or -1 if none have been
113 // performed.
114 int max_write_time_bytes() const { return max_write_time_bytes_; }
115 // The number of buffers in the longest write call, or -1 if none have been
116 // performed.
117 int max_write_time_messages() const { return max_write_time_messages_; }
118 // The total time spent in write calls.
119 std::chrono::nanoseconds total_write_time() const {
120 return total_write_time_;
121 }
122 // The total number of writes which have been performed.
123 int total_write_count() const { return total_write_count_; }
124 // The total number of messages which have been written.
125 int total_write_messages() const { return total_write_messages_; }
126 // The total number of bytes which have been written.
127 int total_write_bytes() const { return total_write_bytes_; }
128 void ResetStatistics() {
129 max_write_time_ = std::chrono::nanoseconds::zero();
130 max_write_time_bytes_ = -1;
131 max_write_time_messages_ = -1;
132 total_write_time_ = std::chrono::nanoseconds::zero();
133 total_write_count_ = 0;
134 total_write_messages_ = 0;
135 total_write_bytes_ = 0;
136 }
Brian Silverman98360e22020-04-28 16:51:20 -0700137
Austin Schuha36c8902019-12-30 18:07:15 -0800138 private:
Brian Silvermanf51499a2020-09-21 12:49:08 -0700139 // Performs a single writev call with as much of the data we have queued up as
140 // possible.
141 //
142 // This will normally take all of the data we have queued up, unless an
143 // encoder has spit out a big enough chunk all at once that we can't manage
144 // all of it.
145 void Flush();
146
Brian Silverman0465fcf2020-09-24 00:29:18 -0700147 // write_return is what write(2) or writev(2) returned. write_size is the
148 // number of bytes we expected it to write.
149 void HandleWriteReturn(ssize_t write_return, size_t write_size);
150
Brian Silvermanf51499a2020-09-21 12:49:08 -0700151 void UpdateStatsForWrite(aos::monotonic_clock::duration duration,
152 ssize_t written, int iovec_size);
153
154 // Flushes data if we've reached the threshold to do that as part of normal
Austin Schuhbd06ae42021-03-31 22:48:21 -0700155 // operation either due to the outstanding queued data, or because we have
156 // passed our flush period. now is the current time to save some CPU grabbing
157 // the current time. It just needs to be close.
158 void FlushAtThreshold(aos::monotonic_clock::time_point now);
Brian Silvermanf51499a2020-09-21 12:49:08 -0700159
Austin Schuh2f8fd752020-09-01 22:38:28 -0700160 std::string filename_;
Austin Schuh48d10d62022-10-16 22:19:23 -0700161 std::unique_ptr<DataEncoder> encoder_;
Austin Schuh6f3babe2020-01-26 20:34:50 -0800162
Austin Schuha36c8902019-12-30 18:07:15 -0800163 int fd_ = -1;
Brian Silverman0465fcf2020-09-24 00:29:18 -0700164 bool ran_out_of_space_ = false;
165 bool acknowledge_ran_out_of_space_ = false;
Austin Schuha36c8902019-12-30 18:07:15 -0800166
Austin Schuha36c8902019-12-30 18:07:15 -0800167 // List of iovecs to use with writev. This is a member variable to avoid
168 // churn.
169 std::vector<struct iovec> iovec_;
Brian Silvermanf51499a2020-09-21 12:49:08 -0700170
171 std::chrono::nanoseconds max_write_time_ = std::chrono::nanoseconds::zero();
172 int max_write_time_bytes_ = -1;
173 int max_write_time_messages_ = -1;
174 std::chrono::nanoseconds total_write_time_ = std::chrono::nanoseconds::zero();
175 int total_write_count_ = 0;
176 int total_write_messages_ = 0;
177 int total_write_bytes_ = 0;
Austin Schuhbd06ae42021-03-31 22:48:21 -0700178
179 aos::monotonic_clock::time_point last_flush_time_ =
180 aos::monotonic_clock::min_time;
Austin Schuha36c8902019-12-30 18:07:15 -0800181};
182
Austin Schuhf2d0e682022-10-16 14:20:58 -0700183// Repacks the provided RemoteMessage into fbb.
184flatbuffers::Offset<MessageHeader> PackRemoteMessage(
185 flatbuffers::FlatBufferBuilder *fbb,
186 const message_bridge::RemoteMessage *msg, int channel_index,
187 const aos::monotonic_clock::time_point monotonic_timestamp_time);
188
189constexpr flatbuffers::uoffset_t PackRemoteMessageSize() { return 96u; }
190size_t PackRemoteMessageInline(
191 uint8_t *data, const message_bridge::RemoteMessage *msg, int channel_index,
192 const aos::monotonic_clock::time_point monotonic_timestamp_time);
193
Austin Schuha36c8902019-12-30 18:07:15 -0800194// Packes a message pointed to by the context into a MessageHeader.
195flatbuffers::Offset<MessageHeader> PackMessage(
196 flatbuffers::FlatBufferBuilder *fbb, const Context &context,
197 int channel_index, LogType log_type);
198
Austin Schuhfa30c352022-10-16 11:12:02 -0700199// Returns the size that the packed message from PackMessage or
200// PackMessageInline will be.
Austin Schuh48d10d62022-10-16 22:19:23 -0700201flatbuffers::uoffset_t PackMessageSize(LogType log_type, size_t data_size);
Austin Schuhfa30c352022-10-16 11:12:02 -0700202
203// Packs the provided message pointed to by context into the provided buffer.
204// This is equivalent to PackMessage, but doesn't require allocating a
205// FlatBufferBuilder underneath.
206size_t PackMessageInline(uint8_t *data, const Context &contex,
207 int channel_index, LogType log_type);
208
Austin Schuh05b70472020-01-01 17:11:17 -0800209// Class to read chunks out of a log file.
210class SpanReader {
211 public:
Austin Schuhcd368422021-11-22 21:23:29 -0800212 SpanReader(std::string_view filename, bool quiet = false);
Austin Schuha36c8902019-12-30 18:07:15 -0800213
Austin Schuh6f3babe2020-01-26 20:34:50 -0800214 std::string_view filename() const { return filename_; }
215
Brian Smarttea913d42021-12-10 15:02:38 -0800216 size_t TotalRead() const { return total_read_; }
217 size_t TotalConsumed() const { return total_consumed_; }
Austin Schuh60e77942022-05-16 17:48:24 -0700218 bool IsIncomplete() const {
219 return is_finished_ && total_consumed_ < total_read_;
220 }
Brian Smarttea913d42021-12-10 15:02:38 -0800221
Austin Schuhcf5f6442021-07-06 10:43:28 -0700222 // Returns a span with the data for the next message from the log file,
223 // including the size. The result is only guarenteed to be valid until
224 // ReadMessage() or PeekMessage() is called again.
Austin Schuh05b70472020-01-01 17:11:17 -0800225 absl::Span<const uint8_t> ReadMessage();
226
Austin Schuhcf5f6442021-07-06 10:43:28 -0700227 // Returns a span with the data for the next message without consuming it.
228 // Multiple calls to PeekMessage return the same data. ReadMessage or
229 // ConsumeMessage must be called to get the next message.
230 absl::Span<const uint8_t> PeekMessage();
231 // Consumes the message so the next call to ReadMessage or PeekMessage returns
232 // new data. This does not invalidate the data.
233 void ConsumeMessage();
234
Austin Schuh05b70472020-01-01 17:11:17 -0800235 private:
236 // TODO(austin): Optimization:
237 // Allocate the 256k blocks like we do today. But, refcount them with
238 // shared_ptr pointed to by the messageheader that is returned. This avoids
239 // the copy. Need to do more benchmarking.
Brian Silvermanf51499a2020-09-21 12:49:08 -0700240 // And (Brian): Consider just mmapping the file and handing out refcounted
241 // pointers into that too.
Austin Schuh05b70472020-01-01 17:11:17 -0800242
243 // Reads a chunk of data into data_. Returns false if no data was read.
244 bool ReadBlock();
245
Austin Schuhc41603c2020-10-11 16:17:37 -0700246 std::string filename_;
Austin Schuh6f3babe2020-01-26 20:34:50 -0800247
Brian Silvermanf51499a2020-09-21 12:49:08 -0700248 // File reader and data decoder.
249 std::unique_ptr<DataDecoder> decoder_;
Austin Schuh05b70472020-01-01 17:11:17 -0800250
Brian Silvermanf51499a2020-09-21 12:49:08 -0700251 // Vector to read into.
252 ResizeableBuffer data_;
Austin Schuh05b70472020-01-01 17:11:17 -0800253
254 // Amount of data consumed already in data_.
255 size_t consumed_data_ = 0;
Brian Smarttea913d42021-12-10 15:02:38 -0800256
257 // Accumulates the total volume of bytes read from filename_
258 size_t total_read_ = 0;
259
260 // Accumulates the total volume of read bytes that were 'consumed' into
261 // messages. May be less than total_read_, if the last message (span) is
262 // either truncated or somehow corrupt.
263 size_t total_consumed_ = 0;
264
265 // Reached the end, no more readable messages.
266 bool is_finished_ = false;
Austin Schuh05b70472020-01-01 17:11:17 -0800267};
268
Brian Silvermanfee16972021-09-14 12:06:38 -0700269// Reads the last header from a log file. This handles any duplicate headers
270// that were written.
271std::optional<SizePrefixedFlatbufferVector<LogFileHeader>> ReadHeader(
272 SpanReader *span_reader);
273std::optional<SizePrefixedFlatbufferVector<LogFileHeader>> ReadHeader(
274 std::string_view filename);
275// Reads the Nth message from a log file, excluding the header. Note: this
276// doesn't handle duplicate headers.
277std::optional<SizePrefixedFlatbufferVector<MessageHeader>> ReadNthMessage(
278 std::string_view filename, size_t n);
279
Tyler Chatowb7c6eba2021-07-28 14:43:23 -0700280class UnpackedMessageHeader;
281
Austin Schuh05b70472020-01-01 17:11:17 -0800282// Class which handles reading the header and messages from the log file. This
283// handles any per-file state left before merging below.
284class MessageReader {
285 public:
286 MessageReader(std::string_view filename);
287
Austin Schuh6f3babe2020-01-26 20:34:50 -0800288 std::string_view filename() const { return span_reader_.filename(); }
289
Austin Schuh05b70472020-01-01 17:11:17 -0800290 // Returns the header from the log file.
291 const LogFileHeader *log_file_header() const {
Austin Schuh97789fc2020-08-01 14:42:45 -0700292 return &raw_log_file_header_.message();
293 }
294
295 // Returns the raw data of the header from the log file.
Austin Schuhadd6eb32020-11-09 21:24:26 -0800296 const SizePrefixedFlatbufferVector<LogFileHeader> &raw_log_file_header()
297 const {
Austin Schuh97789fc2020-08-01 14:42:45 -0700298 return raw_log_file_header_;
Austin Schuh05b70472020-01-01 17:11:17 -0800299 }
300
301 // Returns the minimum maount of data needed to queue up for sorting before
302 // ware guarenteed to not see data out of order.
303 std::chrono::nanoseconds max_out_of_order_duration() const {
304 return max_out_of_order_duration_;
305 }
306
Austin Schuhcde938c2020-02-02 17:30:07 -0800307 // Returns the newest timestamp read out of the log file.
Austin Schuh05b70472020-01-01 17:11:17 -0800308 monotonic_clock::time_point newest_timestamp() const {
309 return newest_timestamp_;
310 }
311
312 // Returns the next message if there is one.
Tyler Chatowb7c6eba2021-07-28 14:43:23 -0700313 std::shared_ptr<UnpackedMessageHeader> ReadMessage();
Austin Schuh05b70472020-01-01 17:11:17 -0800314
315 // The time at which we need to read another chunk from the logfile.
316 monotonic_clock::time_point queue_data_time() const {
317 return newest_timestamp() - max_out_of_order_duration();
318 }
319
Brian Smarttea913d42021-12-10 15:02:38 -0800320 // Flag value setters for testing
321 void set_crash_on_corrupt_message_flag(bool b) {
322 crash_on_corrupt_message_flag_ = b;
323 }
324 void set_ignore_corrupt_messages_flag(bool b) {
325 ignore_corrupt_messages_flag_ = b;
326 }
327
Austin Schuh05b70472020-01-01 17:11:17 -0800328 private:
329 // Log chunk reader.
330 SpanReader span_reader_;
331
Austin Schuh97789fc2020-08-01 14:42:45 -0700332 // Vector holding the raw data for the log file header.
Austin Schuhadd6eb32020-11-09 21:24:26 -0800333 SizePrefixedFlatbufferVector<LogFileHeader> raw_log_file_header_;
Austin Schuh05b70472020-01-01 17:11:17 -0800334
335 // Minimum amount of data to queue up for sorting before we are guarenteed
336 // to not see data out of order.
337 std::chrono::nanoseconds max_out_of_order_duration_;
338
339 // Timestamp of the newest message in a channel queue.
340 monotonic_clock::time_point newest_timestamp_ = monotonic_clock::min_time;
Brian Smarttea913d42021-12-10 15:02:38 -0800341
342 // Total volume of verifiable messages from the beginning of the file.
343 // TODO - are message counts also useful?
344 size_t total_verified_before_ = 0;
345
346 // Total volume of messages with corrupted flatbuffer formatting, if any.
347 // Excludes corrupted message content.
348 // TODO - if the layout included something as simple as a CRC (relatively
349 // fast and robust enough) for each span, then corrupted content could be
350 // included in this check.
351 size_t total_corrupted_ = 0;
352
353 // Total volume of verifiable messages intermixed with corrupted messages,
354 // if any. Will be == 0 if total_corrupted_ == 0.
355 size_t total_verified_during_ = 0;
356
357 // Total volume of verifiable messages found after the last corrupted one,
358 // if any. Will be == 0 if total_corrupted_ == 0.
359 size_t total_verified_after_ = 0;
360
361 bool is_corrupted() const { return total_corrupted_ > 0; }
362
363 bool crash_on_corrupt_message_flag_ = true;
364 bool ignore_corrupt_messages_flag_ = false;
Austin Schuh05b70472020-01-01 17:11:17 -0800365};
366
Austin Schuhc41603c2020-10-11 16:17:37 -0700367// A class to seamlessly read messages from a list of part files.
368class PartsMessageReader {
369 public:
370 PartsMessageReader(LogParts log_parts);
371
372 std::string_view filename() const { return message_reader_.filename(); }
373
Austin Schuhd2f96102020-12-01 20:27:29 -0800374 // Returns the LogParts that holds the filenames we are reading.
375 const LogParts &parts() const { return parts_; }
376
Austin Schuh4b5c22a2020-11-30 22:58:43 -0800377 const LogFileHeader *log_file_header() const {
378 return message_reader_.log_file_header();
379 }
380
Austin Schuhc41603c2020-10-11 16:17:37 -0700381 // Returns the minimum amount of data needed to queue up for sorting before
382 // we are guarenteed to not see data out of order.
383 std::chrono::nanoseconds max_out_of_order_duration() const {
384 return message_reader_.max_out_of_order_duration();
385 }
386
387 // Returns the newest timestamp read out of the log file.
388 monotonic_clock::time_point newest_timestamp() const {
389 return newest_timestamp_;
390 }
391
392 // Returns the next message if there is one, or nullopt if we have reached the
393 // end of all the files.
394 // Note: reading the next message may change the max_out_of_order_duration().
Tyler Chatowb7c6eba2021-07-28 14:43:23 -0700395 std::shared_ptr<UnpackedMessageHeader> ReadMessage();
Austin Schuhc41603c2020-10-11 16:17:37 -0700396
Austin Schuh48507722021-07-17 17:29:24 -0700397 // Returns the boot count for the requested node, or std::nullopt if we don't
398 // know.
399 std::optional<size_t> boot_count(size_t node_index) const {
400 CHECK_GE(node_index, 0u);
401 CHECK_LT(node_index, boot_counts_.size());
402 return boot_counts_[node_index];
403 }
404
Austin Schuhc41603c2020-10-11 16:17:37 -0700405 private:
406 // Opens the next log and updates message_reader_. Sets done_ if there is
407 // nothing more to do.
408 void NextLog();
Austin Schuh48507722021-07-17 17:29:24 -0700409 void ComputeBootCounts();
Austin Schuhc41603c2020-10-11 16:17:37 -0700410
411 const LogParts parts_;
412 size_t next_part_index_ = 1u;
413 bool done_ = false;
414 MessageReader message_reader_;
Brian Silvermanfee16972021-09-14 12:06:38 -0700415 // We instantiate the next one early, to allow implementations to prefetch.
416 // TODO(Brian): To get optimal performance when downloading, this needs more
417 // communication with the implementation to prioritize the next part and add
418 // more parallelism when it helps. Maybe some kind of a queue of parts in
419 // order, and the implementation gets to pull however many make sense off the
420 // front?
421 std::optional<MessageReader> next_message_reader_;
Austin Schuhc41603c2020-10-11 16:17:37 -0700422
Austin Schuh315b96b2020-12-11 21:21:12 -0800423 // True after we have seen a message after the start of the log. The
424 // guarentees on logging essentially are that all data from before the
425 // starting time of the log may be arbitrarily out of order, but once we get
426 // max_out_of_order_duration past the start, everything will remain within
427 // max_out_of_order_duration. We shouldn't see anything before the start
428 // after we've seen a message that is at least max_out_of_order_duration after
429 // the start.
430 bool after_start_ = false;
431
Austin Schuhc41603c2020-10-11 16:17:37 -0700432 monotonic_clock::time_point newest_timestamp_ = monotonic_clock::min_time;
Austin Schuh48507722021-07-17 17:29:24 -0700433
434 // Per node boot counts.
435 std::vector<std::optional<size_t>> boot_counts_;
Austin Schuhc41603c2020-10-11 16:17:37 -0700436};
437
Tyler Chatowb7c6eba2021-07-28 14:43:23 -0700438// Stores MessageHeader as a flat header and inline, aligned block of data.
439class UnpackedMessageHeader {
440 public:
James Kuszmaul9776b392023-01-14 14:08:08 -0800441 UnpackedMessageHeader(
442 uint32_t channel_index, monotonic_clock::time_point monotonic_sent_time,
443 realtime_clock::time_point realtime_sent_time, uint32_t queue_index,
444 std::optional<monotonic_clock::time_point> monotonic_remote_time,
445 std::optional<realtime_clock::time_point> realtime_remote_time,
446 std::optional<uint32_t> remote_queue_index,
447 monotonic_clock::time_point monotonic_timestamp_time,
448 bool has_monotonic_timestamp_time, absl::Span<const uint8_t> span)
449 : channel_index(channel_index),
450 monotonic_sent_time(monotonic_sent_time),
451 realtime_sent_time(realtime_sent_time),
452 queue_index(queue_index),
453 monotonic_remote_time(monotonic_remote_time),
454 realtime_remote_time(realtime_remote_time),
455 remote_queue_index(remote_queue_index),
456 monotonic_timestamp_time(monotonic_timestamp_time),
457 has_monotonic_timestamp_time(has_monotonic_timestamp_time),
458 span(span) {}
Tyler Chatowb7c6eba2021-07-28 14:43:23 -0700459 UnpackedMessageHeader(const UnpackedMessageHeader &) = delete;
460 UnpackedMessageHeader &operator=(const UnpackedMessageHeader &) = delete;
461
462 // The channel.
463 uint32_t channel_index = 0xffffffff;
464
465 monotonic_clock::time_point monotonic_sent_time;
466 realtime_clock::time_point realtime_sent_time;
467
468 // The local queue index.
469 uint32_t queue_index = 0xffffffff;
470
Austin Schuh826e6ce2021-11-18 20:33:10 -0800471 std::optional<aos::monotonic_clock::time_point> monotonic_remote_time;
Tyler Chatowb7c6eba2021-07-28 14:43:23 -0700472
473 std::optional<realtime_clock::time_point> realtime_remote_time;
474 std::optional<uint32_t> remote_queue_index;
475
476 // This field is defaulted in the flatbuffer, so we need to store both the
477 // possibly defaulted value and whether it is defaulted.
478 monotonic_clock::time_point monotonic_timestamp_time;
479 bool has_monotonic_timestamp_time;
480
481 static std::shared_ptr<UnpackedMessageHeader> MakeMessage(
482 const MessageHeader &message);
483
484 // Note: we are storing a span here because we need something to put in the
485 // SharedSpan pointer that RawSender takes. We are using the aliasing
486 // constructor of shared_ptr to avoid the allocation, and it needs a nice
487 // pointer to track.
488 absl::Span<const uint8_t> span;
489
490 char actual_data[];
491
492 private:
493 ~UnpackedMessageHeader() {}
494
495 static void DestroyAndFree(UnpackedMessageHeader *p) {
496 p->~UnpackedMessageHeader();
497 free(p);
498 }
499};
500
501std::ostream &operator<<(std::ostream &os,
502 const UnpackedMessageHeader &message);
503
Austin Schuh1be0ce42020-11-29 22:43:26 -0800504// Struct to hold a message as it gets sorted on a single node.
505struct Message {
506 // The channel.
507 uint32_t channel_index = 0xffffffff;
508 // The local queue index.
Austin Schuh58646e22021-08-23 23:51:46 -0700509 // TODO(austin): Technically the boot inside queue_index is redundant with
510 // timestamp. In practice, it is less error-prone to duplicate it. Maybe a
511 // function to return the combined struct?
512 BootQueueIndex queue_index;
Austin Schuh2dc8c7d2021-07-01 17:41:28 -0700513 // The local timestamp.
514 BootTimestamp timestamp;
Austin Schuhf16ef6a2021-06-30 21:48:17 -0700515
Austin Schuh48507722021-07-17 17:29:24 -0700516 // Remote boot when this is a timestamp.
517 size_t monotonic_remote_boot = 0xffffff;
518
519 size_t monotonic_timestamp_boot = 0xffffff;
520
Tyler Chatowb7c6eba2021-07-28 14:43:23 -0700521 std::shared_ptr<UnpackedMessageHeader> data;
Austin Schuh1be0ce42020-11-29 22:43:26 -0800522
523 bool operator<(const Message &m2) const;
524 bool operator>=(const Message &m2) const;
Austin Schuh8f52ed52020-11-30 23:12:39 -0800525 bool operator==(const Message &m2) const;
Austin Schuh1be0ce42020-11-29 22:43:26 -0800526};
527
528std::ostream &operator<<(std::ostream &os, const Message &m);
529
Austin Schuhd2f96102020-12-01 20:27:29 -0800530// Structure to hold a full message and all the timestamps, which may or may not
531// have been sent from a remote node. The remote_queue_index will be invalid if
532// this message is from the point of view of the node which sent it.
533struct TimestampedMessage {
534 uint32_t channel_index = 0xffffffff;
535
Austin Schuh58646e22021-08-23 23:51:46 -0700536 BootQueueIndex queue_index;
Austin Schuh2dc8c7d2021-07-01 17:41:28 -0700537 BootTimestamp monotonic_event_time;
Austin Schuhd2f96102020-12-01 20:27:29 -0800538 realtime_clock::time_point realtime_event_time = realtime_clock::min_time;
539
Austin Schuh58646e22021-08-23 23:51:46 -0700540 BootQueueIndex remote_queue_index;
Austin Schuh2dc8c7d2021-07-01 17:41:28 -0700541 BootTimestamp monotonic_remote_time;
Austin Schuhd2f96102020-12-01 20:27:29 -0800542 realtime_clock::time_point realtime_remote_time = realtime_clock::min_time;
543
Austin Schuh2dc8c7d2021-07-01 17:41:28 -0700544 BootTimestamp monotonic_timestamp_time;
Austin Schuh8bf1e632021-01-02 22:41:04 -0800545
Tyler Chatowb7c6eba2021-07-28 14:43:23 -0700546 std::shared_ptr<UnpackedMessageHeader> data;
Austin Schuhd2f96102020-12-01 20:27:29 -0800547};
548
549std::ostream &operator<<(std::ostream &os, const TimestampedMessage &m);
550
Austin Schuh4b5c22a2020-11-30 22:58:43 -0800551// Class to sort the resulting messages from a PartsMessageReader.
552class LogPartsSorter {
553 public:
554 LogPartsSorter(LogParts log_parts);
555
Austin Schuh0ca51f32020-12-25 21:51:45 -0800556 // Returns the parts that this is sorting messages from.
557 const LogParts &parts() const { return parts_message_reader_.parts(); }
Austin Schuh4b5c22a2020-11-30 22:58:43 -0800558
Austin Schuhd2f96102020-12-01 20:27:29 -0800559 monotonic_clock::time_point monotonic_start_time() const {
Austin Schuh0ca51f32020-12-25 21:51:45 -0800560 return parts().monotonic_start_time;
Austin Schuhd2f96102020-12-01 20:27:29 -0800561 }
562 realtime_clock::time_point realtime_start_time() const {
Austin Schuh0ca51f32020-12-25 21:51:45 -0800563 return parts().realtime_start_time;
Austin Schuhd2f96102020-12-01 20:27:29 -0800564 }
565
Austin Schuh4b5c22a2020-11-30 22:58:43 -0800566 // The time this data is sorted until.
567 monotonic_clock::time_point sorted_until() const { return sorted_until_; }
568
569 // Returns the next sorted message from the log file. It is safe to call
570 // std::move() on the result to move the data flatbuffer from it.
571 Message *Front();
572 // Pops the front message. This should only be called after a call to
573 // Front().
574 void PopFront();
575
576 // Returns a debug string representing the contents of this sorter.
577 std::string DebugString() const;
578
579 private:
580 // Log parts reader we are wrapping.
581 PartsMessageReader parts_message_reader_;
582 // Cache of the time we are sorted until.
583 aos::monotonic_clock::time_point sorted_until_ = monotonic_clock::min_time;
584
Austin Schuhb000de62020-12-03 22:00:40 -0800585 // Timestamp of the last message returned. Used to make sure nothing goes
586 // backwards.
587 monotonic_clock::time_point last_message_time_ = monotonic_clock::min_time;
588
Austin Schuh4b5c22a2020-11-30 22:58:43 -0800589 // Set used for efficient sorting of messages. We can benchmark and evaluate
590 // other data structures if this proves to be the bottleneck.
591 absl::btree_set<Message> messages_;
Austin Schuh48507722021-07-17 17:29:24 -0700592
593 // Mapping from channel to source node.
594 // TODO(austin): Should we put this in Boots so it can be cached for everyone?
595 std::vector<size_t> source_node_index_;
Austin Schuh4b5c22a2020-11-30 22:58:43 -0800596};
597
Austin Schuh8f52ed52020-11-30 23:12:39 -0800598// Class to run merge sort on the messages from multiple LogPartsSorter
599// instances.
600class NodeMerger {
601 public:
Austin Schuhd2f96102020-12-01 20:27:29 -0800602 NodeMerger(std::vector<LogParts> parts);
603
Austin Schuhf16ef6a2021-06-30 21:48:17 -0700604 // Copying and moving will mess up the internal raw pointers. Just don't do
605 // it.
606 NodeMerger(NodeMerger const &) = delete;
607 NodeMerger(NodeMerger &&) = delete;
608 void operator=(NodeMerger const &) = delete;
609 void operator=(NodeMerger &&) = delete;
610
Austin Schuhd2f96102020-12-01 20:27:29 -0800611 // Node index in the configuration of this node.
612 int node() const { return node_; }
Austin Schuh8f52ed52020-11-30 23:12:39 -0800613
Austin Schuh0ca51f32020-12-25 21:51:45 -0800614 // List of parts being sorted together.
615 std::vector<const LogParts *> Parts() const;
616
617 const Configuration *configuration() const {
618 return parts_sorters_[0].parts().config.get();
Austin Schuhd2f96102020-12-01 20:27:29 -0800619 }
620
621 monotonic_clock::time_point monotonic_start_time() const {
622 return monotonic_start_time_;
623 }
624 realtime_clock::time_point realtime_start_time() const {
625 return realtime_start_time_;
Austin Schuh8f52ed52020-11-30 23:12:39 -0800626 }
Austin Schuh5dd22842021-11-17 16:09:39 -0800627 monotonic_clock::time_point monotonic_oldest_time() const {
628 return monotonic_oldest_time_;
629 }
Austin Schuh8f52ed52020-11-30 23:12:39 -0800630
631 // The time this data is sorted until.
632 monotonic_clock::time_point sorted_until() const { return sorted_until_; }
633
634 // Returns the next sorted message from the set of log files. It is safe to
635 // call std::move() on the result to move the data flatbuffer from it.
636 Message *Front();
637 // Pops the front message. This should only be called after a call to
638 // Front().
639 void PopFront();
640
641 private:
642 // Unsorted list of all parts sorters.
Austin Schuhd2f96102020-12-01 20:27:29 -0800643 std::vector<LogPartsSorter> parts_sorters_;
Austin Schuh8f52ed52020-11-30 23:12:39 -0800644 // Pointer to the parts sorter holding the current Front message if one
645 // exists, or nullptr if a new one needs to be found.
646 LogPartsSorter *current_ = nullptr;
647 // Cached sorted_until value.
648 aos::monotonic_clock::time_point sorted_until_ = monotonic_clock::min_time;
Austin Schuhd2f96102020-12-01 20:27:29 -0800649
650 // Cached node.
651 int node_;
652
Austin Schuhb000de62020-12-03 22:00:40 -0800653 // Timestamp of the last message returned. Used to make sure nothing goes
654 // backwards.
655 monotonic_clock::time_point last_message_time_ = monotonic_clock::min_time;
656
Austin Schuhd2f96102020-12-01 20:27:29 -0800657 realtime_clock::time_point realtime_start_time_ = realtime_clock::max_time;
658 monotonic_clock::time_point monotonic_start_time_ = monotonic_clock::max_time;
Austin Schuh60e77942022-05-16 17:48:24 -0700659 monotonic_clock::time_point monotonic_oldest_time_ =
660 monotonic_clock::max_time;
Austin Schuhd2f96102020-12-01 20:27:29 -0800661};
662
Austin Schuhf16ef6a2021-06-30 21:48:17 -0700663// Class to concatenate multiple boots worth of logs into a single per-node
664// stream.
665class BootMerger {
666 public:
667 BootMerger(std::vector<LogParts> file);
668
669 // Copying and moving will mess up the internal raw pointers. Just don't do
670 // it.
671 BootMerger(BootMerger const &) = delete;
672 BootMerger(BootMerger &&) = delete;
673 void operator=(BootMerger const &) = delete;
674 void operator=(BootMerger &&) = delete;
675
676 // Node index in the configuration of this node.
677 int node() const { return node_mergers_[0]->node(); }
678
679 // List of parts being sorted together.
680 std::vector<const LogParts *> Parts() const;
681
682 const Configuration *configuration() const {
683 return node_mergers_[0]->configuration();
684 }
685
Austin Schuh2dc8c7d2021-07-01 17:41:28 -0700686 monotonic_clock::time_point monotonic_start_time(size_t boot) const {
687 CHECK_LT(boot, node_mergers_.size());
688 return node_mergers_[boot]->monotonic_start_time();
Austin Schuhf16ef6a2021-06-30 21:48:17 -0700689 }
Austin Schuh2dc8c7d2021-07-01 17:41:28 -0700690 realtime_clock::time_point realtime_start_time(size_t boot) const {
691 CHECK_LT(boot, node_mergers_.size());
692 return node_mergers_[boot]->realtime_start_time();
Austin Schuhf16ef6a2021-06-30 21:48:17 -0700693 }
Austin Schuh5dd22842021-11-17 16:09:39 -0800694 monotonic_clock::time_point monotonic_oldest_time(size_t boot) const {
695 CHECK_LT(boot, node_mergers_.size());
696 return node_mergers_[boot]->monotonic_oldest_time();
697 }
Austin Schuhf16ef6a2021-06-30 21:48:17 -0700698
699 bool started() const {
700 return node_mergers_[index_]->sorted_until() != monotonic_clock::min_time ||
701 index_ != 0;
702 }
703
704 // Returns the next sorted message from the set of log files. It is safe to
705 // call std::move() on the result to move the data flatbuffer from it.
706 Message *Front();
707 // Pops the front message. This should only be called after a call to
708 // Front().
709 void PopFront();
710
711 private:
712 int index_ = 0;
713
714 // TODO(austin): Sanjay points out this is pretty inefficient. Don't keep so
715 // many things open.
716 std::vector<std::unique_ptr<NodeMerger>> node_mergers_;
717};
718
Austin Schuhd2f96102020-12-01 20:27:29 -0800719// Class to match timestamps with the corresponding data from other nodes.
Austin Schuh79b30942021-01-24 22:32:21 -0800720//
721// This class also buffers data for the node it represents, and supports
722// notifying when new data is queued as well as queueing until a point in time.
Austin Schuhd2f96102020-12-01 20:27:29 -0800723class TimestampMapper {
724 public:
725 TimestampMapper(std::vector<LogParts> file);
726
727 // Copying and moving will mess up the internal raw pointers. Just don't do
728 // it.
729 TimestampMapper(TimestampMapper const &) = delete;
730 TimestampMapper(TimestampMapper &&) = delete;
731 void operator=(TimestampMapper const &) = delete;
732 void operator=(TimestampMapper &&) = delete;
733
734 // TODO(austin): It would be super helpful to provide a way to queue up to
735 // time X without matching timestamps, and to then be able to pull the
736 // timestamps out of this queue. This lets us bootstrap time estimation
737 // without exploding memory usage worst case.
738
Austin Schuh0ca51f32020-12-25 21:51:45 -0800739 const Configuration *configuration() const { return configuration_.get(); }
Austin Schuhd2f96102020-12-01 20:27:29 -0800740
741 // Returns which node this is sorting for.
Austin Schuh2dc8c7d2021-07-01 17:41:28 -0700742 size_t node() const { return boot_merger_.node(); }
Austin Schuhd2f96102020-12-01 20:27:29 -0800743
744 // The start time of this log.
Austin Schuh2dc8c7d2021-07-01 17:41:28 -0700745 monotonic_clock::time_point monotonic_start_time(size_t boot) const {
746 return boot_merger_.monotonic_start_time(boot);
Austin Schuhd2f96102020-12-01 20:27:29 -0800747 }
Austin Schuh2dc8c7d2021-07-01 17:41:28 -0700748 realtime_clock::time_point realtime_start_time(size_t boot) const {
749 return boot_merger_.realtime_start_time(boot);
Austin Schuhd2f96102020-12-01 20:27:29 -0800750 }
Austin Schuh5dd22842021-11-17 16:09:39 -0800751 // Returns the oldest timestamp on a message on this boot.
752 monotonic_clock::time_point monotonic_oldest_time(size_t boot) const {
753 return boot_merger_.monotonic_oldest_time(boot);
754 }
Austin Schuhd2f96102020-12-01 20:27:29 -0800755
756 // Uses timestamp_mapper as the peer for its node. Only one mapper may be set
757 // for each node. Peers are used to look up the data for timestamps on this
758 // node.
759 void AddPeer(TimestampMapper *timestamp_mapper);
760
Austin Schuh24bf4972021-06-29 22:09:08 -0700761 // Returns true if anything has been queued up.
Austin Schuh2dc8c7d2021-07-01 17:41:28 -0700762 bool started() const { return boot_merger_.started(); }
Austin Schuhd2f96102020-12-01 20:27:29 -0800763
764 // Returns the next message for this node.
765 TimestampedMessage *Front();
766 // Pops the next message. Front must be called first.
767 void PopFront();
768
769 // Returns debug information about this node.
770 std::string DebugString() const;
771
Austin Schuh79b30942021-01-24 22:32:21 -0800772 // Queues data the provided time.
Austin Schuh2dc8c7d2021-07-01 17:41:28 -0700773 void QueueUntil(BootTimestamp queue_time);
Austin Schuhe639ea12021-01-25 13:00:22 -0800774 // Queues until we have time_estimation_buffer of data in the queue.
775 void QueueFor(std::chrono::nanoseconds time_estimation_buffer);
Austin Schuh79b30942021-01-24 22:32:21 -0800776
Austin Schuh06601222021-01-26 17:02:50 -0800777 // Queues until the condition is met.
778 template <typename T>
779 void QueueUntilCondition(T fn) {
780 while (true) {
781 if (fn()) {
782 break;
783 }
784 if (!QueueMatched()) {
785 break;
786 }
787 }
788 }
789
Austin Schuh79b30942021-01-24 22:32:21 -0800790 // Sets a callback to be called whenever a full message is queued.
791 void set_timestamp_callback(std::function<void(TimestampedMessage *)> fn) {
792 timestamp_callback_ = fn;
793 }
794
Austin Schuhd2f96102020-12-01 20:27:29 -0800795 private:
796 // The state for a remote node. This holds the data that needs to be matched
797 // with the remote node's timestamps.
798 struct NodeData {
799 // True if we should save data here. This should be true if any of the
800 // bools in delivered below are true.
801 bool any_delivered = false;
802
Austin Schuh36c00932021-07-19 18:13:21 -0700803 // True if we have a peer and therefore should be saving data for it.
804 bool save_for_peer = false;
805
Austin Schuhd2f96102020-12-01 20:27:29 -0800806 // Peer pointer. This node is only to be considered if a peer is set.
807 TimestampMapper *peer = nullptr;
808
809 struct ChannelData {
810 // Deque per channel. This contains the data from the outside
811 // TimestampMapper node which is relevant for the node this NodeData
812 // points to.
813 std::deque<Message> messages;
814 // Bool tracking per channel if a message is delivered to the node this
815 // NodeData represents.
816 bool delivered = false;
Austin Schuh6a7358f2021-11-18 22:40:40 -0800817 // The TTL for delivery.
818 std::chrono::nanoseconds time_to_live = std::chrono::nanoseconds(0);
Austin Schuhd2f96102020-12-01 20:27:29 -0800819 };
820
821 // Vector with per channel data.
822 std::vector<ChannelData> channels;
823 };
824
825 // Returns (and forgets about) the data for the provided timestamp message
826 // showing when it was delivered to this node.
827 Message MatchingMessageFor(const Message &message);
828
829 // Queues up a single message into our message queue, and any nodes that this
830 // message is delivered to. Returns true if one was available, false
831 // otherwise.
832 bool Queue();
833
Austin Schuh79b30942021-01-24 22:32:21 -0800834 // Queues up a single matched message into our matched message queue. Returns
835 // true if one was queued, and false otherwise.
836 bool QueueMatched();
837
Austin Schuhd2f96102020-12-01 20:27:29 -0800838 // Queues up data until we have at least one message >= to time t.
839 // Useful for triggering a remote node to read enough data to have the
840 // timestamp you care about available.
Austin Schuh2dc8c7d2021-07-01 17:41:28 -0700841 void QueueUnmatchedUntil(BootTimestamp t);
Austin Schuhd2f96102020-12-01 20:27:29 -0800842
Austin Schuh79b30942021-01-24 22:32:21 -0800843 // Queues m into matched_messages_.
844 void QueueMessage(Message *m);
Austin Schuhd2f96102020-12-01 20:27:29 -0800845
Austin Schuh58646e22021-08-23 23:51:46 -0700846 // Returns the name of the node this class is sorting for.
847 std::string_view node_name() const {
848 return configuration_->has_nodes() ? configuration_->nodes()
849 ->Get(boot_merger_.node())
850 ->name()
851 ->string_view()
852 : "(single node)";
853 }
854
Austin Schuhd2f96102020-12-01 20:27:29 -0800855 // The node merger to source messages from.
Austin Schuh2dc8c7d2021-07-01 17:41:28 -0700856 BootMerger boot_merger_;
Austin Schuh0ca51f32020-12-25 21:51:45 -0800857
858 std::shared_ptr<const Configuration> configuration_;
859
Austin Schuhd2f96102020-12-01 20:27:29 -0800860 // The buffer of messages for this node. These are not matched with any
861 // remote data.
862 std::deque<Message> messages_;
863 // The node index for the source node for each channel.
864 std::vector<size_t> source_node_;
865
866 // Vector per node. Not all nodes will have anything.
867 std::vector<NodeData> nodes_data_;
868
869 // Latest message to return.
Austin Schuh79b30942021-01-24 22:32:21 -0800870 std::deque<TimestampedMessage> matched_messages_;
Austin Schuhd2f96102020-12-01 20:27:29 -0800871
Austin Schuh79b30942021-01-24 22:32:21 -0800872 // Tracks the state of the first message in matched_messages_. Do we need to
873 // update it, is it valid, or should we return nullptr?
Austin Schuhd2f96102020-12-01 20:27:29 -0800874 enum class FirstMessage {
875 kNeedsUpdate,
876 kInMessage,
877 kNullptr,
878 };
879 FirstMessage first_message_ = FirstMessage::kNeedsUpdate;
880
881 // Timestamp of the last message returned. Used to make sure nothing goes
882 // backwards.
Austin Schuh2dc8c7d2021-07-01 17:41:28 -0700883 BootTimestamp last_message_time_ = BootTimestamp::min_time();
Austin Schuh6a7358f2021-11-18 22:40:40 -0800884 BootTimestamp last_popped_message_time_ = BootTimestamp::min_time();
Austin Schuhd2f96102020-12-01 20:27:29 -0800885 // Time this node is queued up until. Used for caching.
Austin Schuh2dc8c7d2021-07-01 17:41:28 -0700886 BootTimestamp queued_until_ = BootTimestamp::min_time();
Austin Schuh79b30942021-01-24 22:32:21 -0800887
888 std::function<void(TimestampedMessage *)> timestamp_callback_;
Austin Schuh8f52ed52020-11-30 23:12:39 -0800889};
890
Austin Schuhee711052020-08-24 16:06:09 -0700891// Returns the node name with a trailing space, or an empty string if we are on
892// a single node.
893std::string MaybeNodeName(const Node *);
894
Brian Silvermanf51499a2020-09-21 12:49:08 -0700895} // namespace aos::logger
Austin Schuha36c8902019-12-30 18:07:15 -0800896
897#endif // AOS_EVENTS_LOGGING_LOGFILE_UTILS_H_