blob: ca4630f2c5c342a1b40c4405284aca31699efaa2 [file] [log] [blame]
Austin Schuhe309d2a2019-11-29 13:25:21 -08001#ifndef AOS_EVENTS_LOGGER_H_
2#define AOS_EVENTS_LOGGER_H_
3
Austin Schuh8bd96322020-02-13 21:18:22 -08004#include <chrono>
Austin Schuhe309d2a2019-11-29 13:25:21 -08005#include <deque>
Austin Schuh05b70472020-01-01 17:11:17 -08006#include <string_view>
Austin Schuh2f8fd752020-09-01 22:38:28 -07007#include <tuple>
Austin Schuh6f3babe2020-01-26 20:34:50 -08008#include <vector>
Austin Schuhe309d2a2019-11-29 13:25:21 -08009
Austin Schuh8bd96322020-02-13 21:18:22 -080010#include "Eigen/Dense"
11#include "absl/strings/str_cat.h"
Austin Schuhe309d2a2019-11-29 13:25:21 -080012#include "absl/types/span.h"
13#include "aos/events/event_loop.h"
Austin Schuh2f8fd752020-09-01 22:38:28 -070014#include "aos/events/logging/eigen_mpq.h"
Austin Schuhcb5601b2020-09-10 15:29:59 -070015#include "aos/events/logging/log_namer.h"
Austin Schuha36c8902019-12-30 18:07:15 -080016#include "aos/events/logging/logfile_utils.h"
James Kuszmaul38735e82019-12-07 16:42:06 -080017#include "aos/events/logging/logger_generated.h"
Austin Schuh64fab802020-09-09 22:47:47 -070018#include "aos/events/logging/uuid.h"
Austin Schuh92547522019-12-28 14:33:43 -080019#include "aos/events/simulated_event_loop.h"
Austin Schuh2f8fd752020-09-01 22:38:28 -070020#include "aos/network/message_bridge_server_generated.h"
Austin Schuh8bd96322020-02-13 21:18:22 -080021#include "aos/network/timestamp_filter.h"
Austin Schuhe309d2a2019-11-29 13:25:21 -080022#include "aos/time/time.h"
23#include "flatbuffers/flatbuffers.h"
Austin Schuh2f8fd752020-09-01 22:38:28 -070024#include "third_party/gmp/gmpxx.h"
Austin Schuhe309d2a2019-11-29 13:25:21 -080025
26namespace aos {
27namespace logger {
28
Austin Schuhe309d2a2019-11-29 13:25:21 -080029// Logs all channels available in the event loop to disk every 100 ms.
30// Start by logging one message per channel to capture any state and
// configuration that is sent rarely on a channel and would affect execution.
32class Logger {
33 public:
Austin Schuh0c297012020-09-16 18:41:59 -070034 // Constructs a logger.
Austin Schuh0c297012020-09-16 18:41:59 -070035 // event_loop: The event loop used to read the messages.
Austin Schuh0c297012020-09-16 18:41:59 -070036 // configuration: When provided, this is the configuration to log, and the
37 // configuration to use for the channel list to log. If not provided,
38 // this becomes the configuration from the event loop.
Brian Silverman1f345222020-09-24 21:14:48 -070039 // should_log: When provided, a filter for channels to log. If not provided,
40 // all available channels are logged.
41 Logger(EventLoop *event_loop)
42 : Logger(event_loop, event_loop->configuration()) {}
43 Logger(EventLoop *event_loop, const Configuration *configuration)
44 : Logger(event_loop, configuration,
45 [](const Channel *) { return true; }) {}
46 Logger(EventLoop *event_loop, const Configuration *configuration,
47 std::function<bool(const Channel *)> should_log);
Austin Schuh0c297012020-09-16 18:41:59 -070048 ~Logger();
49
50 // Overrides the name in the log file header.
51 void set_name(std::string_view name) { name_ = name; }
Austin Schuhe309d2a2019-11-29 13:25:21 -080052
Brian Silverman1f345222020-09-24 21:14:48 -070053 // Sets the callback to run after each period of data is logged. Defaults to
54 // doing nothing.
55 //
56 // This callback may safely do things like call Rotate().
57 void set_on_logged_period(std::function<void()> on_logged_period) {
58 on_logged_period_ = std::move(on_logged_period);
59 }
60
61 // Sets the period between polling the data. Defaults to 100ms.
62 //
63 // Changing this while a set of files is being written may result in
64 // unreadable files.
65 void set_polling_period(std::chrono::nanoseconds polling_period) {
66 polling_period_ = polling_period;
67 }
68
Brian Silvermanae7c0332020-09-30 16:58:23 -070069 std::string_view log_start_uuid() const { return log_start_uuid_; }
Brian Silverman035e4182020-10-06 17:13:00 -070070 UUID logger_instance_uuid() const { return logger_instance_uuid_; }
Brian Silvermanae7c0332020-09-30 16:58:23 -070071
Brian Silvermancb805822020-10-06 17:43:35 -070072 // The maximum time for a single fetch which returned a message, or 0 if none
73 // of those have happened.
74 std::chrono::nanoseconds max_message_fetch_time() const {
75 return max_message_fetch_time_;
76 }
77 // The channel for that longest fetch which returned a message, or -1 if none
78 // of those have happened.
79 int max_message_fetch_time_channel() const {
80 return max_message_fetch_time_channel_;
81 }
82 // The size of the message returned by that longest fetch, or -1 if none of
83 // those have happened.
84 int max_message_fetch_time_size() const {
85 return max_message_fetch_time_size_;
86 }
87 // The total time spent fetching messages.
88 std::chrono::nanoseconds total_message_fetch_time() const {
89 return total_message_fetch_time_;
90 }
91 // The total number of fetch calls which returned messages.
92 int total_message_fetch_count() const { return total_message_fetch_count_; }
93 // The total number of bytes fetched.
94 int64_t total_message_fetch_bytes() const {
95 return total_message_fetch_bytes_;
96 }
97
98 // The total time spent in fetches which did not return a message.
99 std::chrono::nanoseconds total_nop_fetch_time() const {
100 return total_nop_fetch_time_;
101 }
102 // The total number of fetches which did not return a message.
103 int total_nop_fetch_count() const { return total_nop_fetch_count_; }
104
105 // The maximum time for a single copy, or 0 if none of those have happened.
106 std::chrono::nanoseconds max_copy_time() const { return max_copy_time_; }
107 // The channel for that longest copy, or -1 if none of those have happened.
108 int max_copy_time_channel() const { return max_copy_time_channel_; }
109 // The size of the message for that longest copy, or -1 if none of those have
110 // happened.
111 int max_copy_time_size() const { return max_copy_time_size_; }
112 // The total time spent copying messages.
113 std::chrono::nanoseconds total_copy_time() const { return total_copy_time_; }
114 // The total number of messages copied.
115 int total_copy_count() const { return total_copy_count_; }
116 // The total number of bytes copied.
117 int64_t total_copy_bytes() const { return total_copy_bytes_; }
118
119 void ResetStatisics();
120
Austin Schuh2f8fd752020-09-01 22:38:28 -0700121 // Rotates the log file(s), triggering new part files to be written for each
122 // log file.
123 void Rotate();
Austin Schuhfa895892020-01-07 20:07:41 -0800124
Brian Silverman1f345222020-09-24 21:14:48 -0700125 // Starts logging to files with the given naming scheme.
Brian Silvermanae7c0332020-09-30 16:58:23 -0700126 //
127 // log_start_uuid may be used to tie this log event to other log events across
128 // multiple nodes. The default (empty string) indicates there isn't one
129 // available.
130 void StartLogging(std::unique_ptr<LogNamer> log_namer,
131 std::string_view log_start_uuid = "");
Brian Silverman1f345222020-09-24 21:14:48 -0700132
133 // Stops logging. Ensures any messages through end_time make it into the log.
134 //
135 // If you want to stop ASAP, pass min_time to avoid reading any more messages.
136 //
137 // Returns the LogNamer in case the caller wants to do anything else with it
138 // before destroying it.
139 std::unique_ptr<LogNamer> StopLogging(
140 aos::monotonic_clock::time_point end_time);
141
142 // Returns whether a log is currently being written.
143 bool is_started() const { return static_cast<bool>(log_namer_); }
144
145 // Shortcut to call StartLogging with a LocalLogNamer when event processing
146 // starts.
147 void StartLoggingLocalNamerOnRun(std::string base_name) {
148 event_loop_->OnRun([this, base_name]() {
149 StartLogging(
150 std::make_unique<LocalLogNamer>(base_name, event_loop_->node()));
151 });
152 }
153
Austin Schuhe309d2a2019-11-29 13:25:21 -0800154 private:
Austin Schuhe309d2a2019-11-29 13:25:21 -0800155 // Structure to track both a fetcher, and if the data fetched has been
156 // written. We may want to delay writing data to disk so that we don't let
157 // data get too far out of order when written to disk so we can avoid making
158 // it too hard to sort when reading.
159 struct FetcherStruct {
160 std::unique_ptr<RawFetcher> fetcher;
161 bool written = false;
Austin Schuh15649d62019-12-28 16:36:38 -0800162
Austin Schuh8d7e0bb2020-10-02 17:57:00 -0700163 // Channel index to log to.
Austin Schuh6f3babe2020-01-26 20:34:50 -0800164 int channel_index = -1;
Brian Silverman1f345222020-09-24 21:14:48 -0700165 const Channel *channel = nullptr;
166 const Node *timestamp_node = nullptr;
Austin Schuh6f3babe2020-01-26 20:34:50 -0800167
168 LogType log_type = LogType::kLogMessage;
169
Brian Silverman1f345222020-09-24 21:14:48 -0700170 // We fill out the metadata at construction, but the actual writers have to
171 // be updated each time we start logging. To avoid duplicating the complex
172 // logic determining whether each writer should be initialized, we just
173 // stash the answer in separate member variables.
174 bool wants_writer = false;
Austin Schuh6f3babe2020-01-26 20:34:50 -0800175 DetachedBufferWriter *writer = nullptr;
Brian Silverman1f345222020-09-24 21:14:48 -0700176 bool wants_timestamp_writer = false;
Austin Schuh6f3babe2020-01-26 20:34:50 -0800177 DetachedBufferWriter *timestamp_writer = nullptr;
Brian Silverman1f345222020-09-24 21:14:48 -0700178 bool wants_contents_writer = false;
Austin Schuh2f8fd752020-09-01 22:38:28 -0700179 DetachedBufferWriter *contents_writer = nullptr;
Brian Silverman1f345222020-09-24 21:14:48 -0700180
Austin Schuh2f8fd752020-09-01 22:38:28 -0700181 int node_index = 0;
Austin Schuhe309d2a2019-11-29 13:25:21 -0800182 };
183
Austin Schuh8d7e0bb2020-10-02 17:57:00 -0700184 // Vector mapping from the channel index from the event loop to the logged
185 // channel index.
186 std::vector<int> event_loop_to_logged_channel_index_;
187
Austin Schuh2f8fd752020-09-01 22:38:28 -0700188 struct NodeState {
189 aos::monotonic_clock::time_point monotonic_start_time =
190 aos::monotonic_clock::min_time;
191 aos::realtime_clock::time_point realtime_start_time =
192 aos::realtime_clock::min_time;
193
194 aos::SizePrefixedFlatbufferDetachedBuffer<LogFileHeader> log_file_header =
195 aos::SizePrefixedFlatbufferDetachedBuffer<LogFileHeader>::Empty();
196 };
Brian Silverman1f345222020-09-24 21:14:48 -0700197
198 void WriteHeader();
199 aos::SizePrefixedFlatbufferDetachedBuffer<LogFileHeader> MakeHeader(
200 const Node *node);
201
202 bool MaybeUpdateTimestamp(
203 const Node *node, int node_index,
204 aos::monotonic_clock::time_point monotonic_start_time,
205 aos::realtime_clock::time_point realtime_start_time);
206
207 void DoLogData(const monotonic_clock::time_point end_time);
208
209 void WriteMissingTimestamps();
210
211 // Fetches from each channel until all the data is logged.
212 void LogUntil(monotonic_clock::time_point t);
213
Brian Silvermancb805822020-10-06 17:43:35 -0700214 void RecordFetchResult(aos::monotonic_clock::time_point start,
215 aos::monotonic_clock::time_point end, bool got_new,
216 FetcherStruct *fetcher);
217
218 void RecordCreateMessageTime(aos::monotonic_clock::time_point start,
219 aos::monotonic_clock::time_point end,
220 FetcherStruct *fetcher);
221
Brian Silverman1f345222020-09-24 21:14:48 -0700222 // Sets the start time for a specific node.
223 void SetStartTime(size_t node_index,
224 aos::monotonic_clock::time_point monotonic_start_time,
225 aos::realtime_clock::time_point realtime_start_time);
226
Brian Silvermanae7c0332020-09-30 16:58:23 -0700227 EventLoop *const event_loop_;
Brian Silverman1f345222020-09-24 21:14:48 -0700228 // The configuration to place at the top of the log file.
229 const Configuration *const configuration_;
230
Brian Silvermanae7c0332020-09-30 16:58:23 -0700231 UUID log_event_uuid_ = UUID::Zero();
232 const UUID logger_instance_uuid_ = UUID::Random();
233 std::unique_ptr<LogNamer> log_namer_;
234 // Empty indicates there isn't one.
235 std::string log_start_uuid_;
236 const std::string boot_uuid_;
237
Brian Silverman1f345222020-09-24 21:14:48 -0700238 // Name to save in the log file. Defaults to hostname.
239 std::string name_;
240
241 std::function<void()> on_logged_period_ = []() {};
242
Brian Silvermancb805822020-10-06 17:43:35 -0700243 std::chrono::nanoseconds max_message_fetch_time_ =
244 std::chrono::nanoseconds::zero();
245 int max_message_fetch_time_channel_ = -1;
246 int max_message_fetch_time_size_ = -1;
247 std::chrono::nanoseconds total_message_fetch_time_ =
248 std::chrono::nanoseconds::zero();
249 int total_message_fetch_count_ = 0;
250 int64_t total_message_fetch_bytes_ = 0;
251
252 std::chrono::nanoseconds total_nop_fetch_time_ =
253 std::chrono::nanoseconds::zero();
254 int total_nop_fetch_count_ = 0;
255
256 std::chrono::nanoseconds max_copy_time_ = std::chrono::nanoseconds::zero();
257 int max_copy_time_channel_ = -1;
258 int max_copy_time_size_ = -1;
259 std::chrono::nanoseconds total_copy_time_ = std::chrono::nanoseconds::zero();
260 int total_copy_count_ = 0;
261 int64_t total_copy_bytes_ = 0;
262
Brian Silverman1f345222020-09-24 21:14:48 -0700263 std::vector<FetcherStruct> fetchers_;
264 TimerHandler *timer_handler_;
265
266 // Period to poll the channels.
267 std::chrono::nanoseconds polling_period_ = std::chrono::milliseconds(100);
268
269 // Last time that data was written for all channels to disk.
270 monotonic_clock::time_point last_synchronized_time_;
271
272 // Max size that the header has consumed. This much extra data will be
273 // reserved in the builder to avoid reallocating.
274 size_t max_header_size_ = 0;
275
276 // Fetcher for all the statistics from all the nodes.
277 aos::Fetcher<message_bridge::ServerStatistics> server_statistics_fetcher_;
278
Austin Schuh2f8fd752020-09-01 22:38:28 -0700279 std::vector<NodeState> node_state_;
Austin Schuhe309d2a2019-11-29 13:25:21 -0800280};
281
Austin Schuh11d43732020-09-21 17:28:30 -0700282// Datastructure to hold ordered parts.
283struct LogParts {
284 // Monotonic and realtime start times for this set of log files. For log
285 // files which started out unknown and then became known, this is the known
286 // start time.
287 aos::monotonic_clock::time_point monotonic_start_time;
288 aos::realtime_clock::time_point realtime_start_time;
289
290 // UUIDs if available.
Brian Silvermanae7c0332020-09-30 16:58:23 -0700291 std::string log_event_uuid;
Austin Schuh11d43732020-09-21 17:28:30 -0700292 std::string parts_uuid;
293
294 // The node this represents, or empty if we are in a single node world.
295 std::string node;
296
297 // Pre-sorted list of parts.
298 std::vector<std::string> parts;
299};
300
301// Datastructure to hold parts from the same run of the logger which have no
302// ordering constraints relative to each other.
303struct LogFile {
304 // The UUID tying them all together (if available)
Brian Silvermanae7c0332020-09-30 16:58:23 -0700305 std::string log_event_uuid;
Austin Schuh11d43732020-09-21 17:28:30 -0700306
307 // All the parts, unsorted.
308 std::vector<LogParts> parts;
309};
310
311std::ostream &operator<<(std::ostream &stream, const LogFile &file);
312std::ostream &operator<<(std::ostream &stream, const LogParts &parts);
313
Austin Schuh5212cad2020-09-09 23:12:09 -0700314// Takes a bunch of parts and sorts them based on part_uuid and part_index.
Austin Schuh11d43732020-09-21 17:28:30 -0700315std::vector<LogFile> SortParts(const std::vector<std::string> &parts);
316
317std::vector<std::vector<std::string>> ToLogReaderVector(
318 const std::vector<LogFile> &log_files);
Austin Schuh5212cad2020-09-09 23:12:09 -0700319
Austin Schuh6f3babe2020-01-26 20:34:50 -0800320// We end up with one of the following 3 log file types.
321//
322// Single node logged as the source node.
323// -> Replayed just on the source node.
324//
325// Forwarding timestamps only logged from the perspective of the destination
326// node.
327// -> Matched with data on source node and logged.
328//
329// Forwarding timestamps with data logged as the destination node.
330// -> Replayed just as the destination
331// -> Replayed as the source (Much harder, ordering is not defined)
332//
333// Duplicate data logged. -> CHECK that it matches and explode otherwise.
334//
335// This can be boiled down to a set of constraints and tools.
336//
337// 1) Forwarding timestamps and data need to be logged separately.
338// 2) Any forwarded data logged on the destination node needs to be logged
339// separately such that it can be sorted.
340//
341// 1) Log reader needs to be able to sort a list of log files.
342// 2) Log reader needs to be able to merge sorted lists of log files.
343// 3) Log reader needs to be able to match timestamps with messages.
344//
345// We also need to be able to generate multiple views of a log file depending on
346// the target.
347
Austin Schuhe309d2a2019-11-29 13:25:21 -0800348// Replays all the channels in the logfile to the event loop.
349class LogReader {
350 public:
James Kuszmaulc7bbb3e2020-01-03 20:01:00 -0800351 // If you want to supply a new configuration that will be used for replay
352 // (e.g., to change message rates, or to populate an updated schema), then
353 // pass it in here. It must provide all the channels that the original logged
354 // config did.
Austin Schuh6f3babe2020-01-26 20:34:50 -0800355 //
356 // Log filenames are in the following format:
357 //
358 // {
359 // {log1_part0, log1_part1, ...},
360 // {log2}
361 // }
362 // The inner vector is a list of log file chunks which form up a log file.
363 // The outer vector is a list of log files with subsets of the messages, or
364 // messages from different nodes.
365 //
366 // If the outer vector isn't provided, it is assumed to be of size 1.
James Kuszmaulc7bbb3e2020-01-03 20:01:00 -0800367 LogReader(std::string_view filename,
368 const Configuration *replay_configuration = nullptr);
Austin Schuh6f3babe2020-01-26 20:34:50 -0800369 LogReader(const std::vector<std::string> &filenames,
370 const Configuration *replay_configuration = nullptr);
371 LogReader(const std::vector<std::vector<std::string>> &filenames,
Austin Schuhfa895892020-01-07 20:07:41 -0800372 const Configuration *replay_configuration = nullptr);
Austin Schuh11d43732020-09-21 17:28:30 -0700373 LogReader(const std::vector<LogFile> &log_files,
374 const Configuration *replay_configuration = nullptr);
James Kuszmaul7daef362019-12-31 18:28:17 -0800375 ~LogReader();
Austin Schuhe309d2a2019-11-29 13:25:21 -0800376
Austin Schuh6331ef92020-01-07 18:28:09 -0800377 // Registers all the callbacks to send the log file data out on an event loop
378 // created in event_loop_factory. This also updates time to be at the start
379 // of the log file by running until the log file starts.
380 // Note: the configuration used in the factory should be configuration()
381 // below, but can be anything as long as the locations needed to send
382 // everything are available.
James Kuszmaul84ff3e52020-01-03 19:48:53 -0800383 void Register(SimulatedEventLoopFactory *event_loop_factory);
  // Creates a SimulatedEventLoopFactory accessible via event_loop_factory(),
385 // and then calls Register.
386 void Register();
387 // Registers callbacks for all the events after the log file starts. This is
388 // only useful when replaying live.
Austin Schuhe309d2a2019-11-29 13:25:21 -0800389 void Register(EventLoop *event_loop);
Austin Schuh6331ef92020-01-07 18:28:09 -0800390
James Kuszmaul84ff3e52020-01-03 19:48:53 -0800391 // Unregisters the senders. You only need to call this if you separately
392 // supplied an event loop or event loop factory and the lifetimes are such
393 // that they need to be explicitly destroyed before the LogReader destructor
394 // gets called.
Austin Schuhe309d2a2019-11-29 13:25:21 -0800395 void Deregister();
396
Austin Schuh0c297012020-09-16 18:41:59 -0700397 // Returns the configuration being used for replay from the log file.
398 // Note that this may be different from the configuration actually used for
399 // handling events. You should generally only use this to create a
400 // SimulatedEventLoopFactory, and then get the configuration from there for
401 // everything else.
James Kuszmaulc7bbb3e2020-01-03 20:01:00 -0800402 const Configuration *logged_configuration() const;
Austin Schuh11d43732020-09-21 17:28:30 -0700403 // Returns the configuration being used for replay from the log file.
404 // Note that this may be different from the configuration actually used for
405 // handling events. You should generally only use this to create a
406 // SimulatedEventLoopFactory, and then get the configuration from there for
407 // everything else.
Austin Schuh6f3babe2020-01-26 20:34:50 -0800408 // The pointer is invalidated whenever RemapLoggedChannel is called.
Austin Schuh15649d62019-12-28 16:36:38 -0800409 const Configuration *configuration() const;
410
Austin Schuh6f3babe2020-01-26 20:34:50 -0800411 // Returns the nodes that this log file was created on. This is a list of
412 // pointers to a node in the nodes() list inside configuration(). The
413 // pointers here are invalidated whenever RemapLoggedChannel is called.
414 std::vector<const Node *> Nodes() const;
Austin Schuhe309d2a2019-11-29 13:25:21 -0800415
416 // Returns the starting timestamp for the log file.
Austin Schuh11d43732020-09-21 17:28:30 -0700417 monotonic_clock::time_point monotonic_start_time(
418 const Node *node = nullptr) const;
419 realtime_clock::time_point realtime_start_time(
420 const Node *node = nullptr) const;
Austin Schuhe309d2a2019-11-29 13:25:21 -0800421
James Kuszmaulc7bbb3e2020-01-03 20:01:00 -0800422 // Causes the logger to publish the provided channel on a different name so
423 // that replayed applications can publish on the proper channel name without
424 // interference. This operates on raw channel names, without any node or
425 // application specific mappings.
426 void RemapLoggedChannel(std::string_view name, std::string_view type,
427 std::string_view add_prefix = "/original");
428 template <typename T>
429 void RemapLoggedChannel(std::string_view name,
430 std::string_view add_prefix = "/original") {
431 RemapLoggedChannel(name, T::GetFullyQualifiedName(), add_prefix);
432 }
433
Austin Schuh01b4c352020-09-21 23:09:39 -0700434 // Remaps the provided channel, though this respects node mappings, and
435 // preserves them too. This makes it so if /aos -> /pi1/aos on one node,
436 // /original/aos -> /original/pi1/aos on the same node after renaming, just
437 // like you would hope.
438 //
439 // TODO(austin): If you have 2 nodes remapping something to the same channel,
440 // this doesn't handle that. No use cases exist yet for that, so it isn't
441 // being done yet.
442 void RemapLoggedChannel(std::string_view name, std::string_view type,
443 const Node *node,
444 std::string_view add_prefix = "/original");
Brian Silvermande9f3ff2020-04-28 16:56:58 -0700445 template <typename T>
Austin Schuh01b4c352020-09-21 23:09:39 -0700446 void RemapLoggedChannel(std::string_view name, const Node *node,
447 std::string_view add_prefix = "/original") {
448 RemapLoggedChannel(name, T::GetFullyQualifiedName(), node, add_prefix);
449 }
450
451 template <typename T>
452 bool HasChannel(std::string_view name, const Node *node = nullptr) {
Brian Silvermande9f3ff2020-04-28 16:56:58 -0700453 return configuration::GetChannel(log_file_header()->configuration(), name,
454 T::GetFullyQualifiedName(), "",
Austin Schuh01b4c352020-09-21 23:09:39 -0700455 node) != nullptr;
Brian Silvermande9f3ff2020-04-28 16:56:58 -0700456 }
457
James Kuszmaul84ff3e52020-01-03 19:48:53 -0800458 SimulatedEventLoopFactory *event_loop_factory() {
459 return event_loop_factory_;
460 }
461
Brian Silvermande9f3ff2020-04-28 16:56:58 -0700462 const LogFileHeader *log_file_header() const {
463 return &log_file_header_.message();
464 }
465
Austin Schuh0c297012020-09-16 18:41:59 -0700466 std::string_view name() const {
467 return log_file_header()->name()->string_view();
468 }
469
Austin Schuhe309d2a2019-11-29 13:25:21 -0800470 private:
Austin Schuh6f3babe2020-01-26 20:34:50 -0800471 const Channel *RemapChannel(const EventLoop *event_loop,
472 const Channel *channel);
473
Austin Schuhe309d2a2019-11-29 13:25:21 -0800474 // Queues at least max_out_of_order_duration_ messages into channels_.
475 void QueueMessages();
James Kuszmaulc7bbb3e2020-01-03 20:01:00 -0800476 // Handle constructing a configuration with all the additional remapped
477 // channels from calls to RemapLoggedChannel.
478 void MakeRemappedConfig();
Austin Schuhe309d2a2019-11-29 13:25:21 -0800479
Austin Schuh2f8fd752020-09-01 22:38:28 -0700480 // Returns the number of nodes.
481 size_t nodes_count() const {
482 return !configuration::MultiNode(logged_configuration())
483 ? 1u
484 : logged_configuration()->nodes()->size();
485 }
486
Austin Schuh6f3babe2020-01-26 20:34:50 -0800487 const std::vector<std::vector<std::string>> filenames_;
488
489 // This is *a* log file header used to provide the logged config. The rest of
490 // the header is likely distracting.
491 FlatbufferVector<LogFileHeader> log_file_header_;
492
Austin Schuh2f8fd752020-09-01 22:38:28 -0700493 // Returns [ta; tb; ...] = tuple[0] * t + tuple[1]
494 std::tuple<Eigen::Matrix<double, Eigen::Dynamic, 1>,
495 Eigen::Matrix<double, Eigen::Dynamic, 1>>
496 SolveOffsets();
497
498 void LogFit(std::string_view prefix);
Austin Schuh8bd96322020-02-13 21:18:22 -0800499
Austin Schuh6f3babe2020-01-26 20:34:50 -0800500 // State per node.
Austin Schuh858c9f32020-08-31 16:56:12 -0700501 class State {
502 public:
503 State(std::unique_ptr<ChannelMerger> channel_merger);
Austin Schuh6f3babe2020-01-26 20:34:50 -0800504
Austin Schuh858c9f32020-08-31 16:56:12 -0700505 // Returns the timestamps, channel_index, and message from a channel.
506 // update_time (will be) set to true when popping this message causes the
507 // filter to change the time offset estimation function.
508 std::tuple<TimestampMerger::DeliveryTimestamp, int,
509 FlatbufferVector<MessageHeader>>
510 PopOldest(bool *update_time);
511
512 // Returns the monotonic time of the oldest message.
513 monotonic_clock::time_point OldestMessageTime() const;
514
515 // Primes the queues inside State. Should be called before calling
516 // OldestMessageTime.
517 void SeedSortedMessages();
Austin Schuh8bd96322020-02-13 21:18:22 -0800518
Austin Schuh858c9f32020-08-31 16:56:12 -0700519 // Returns the starting time for this node.
520 monotonic_clock::time_point monotonic_start_time() const {
521 return channel_merger_->monotonic_start_time();
522 }
523 realtime_clock::time_point realtime_start_time() const {
524 return channel_merger_->realtime_start_time();
525 }
526
527 // Sets the node event loop factory for replaying into a
528 // SimulatedEventLoopFactory. Returns the EventLoop to use.
529 EventLoop *SetNodeEventLoopFactory(
530 NodeEventLoopFactory *node_event_loop_factory);
531
532 // Sets and gets the event loop to use.
533 void set_event_loop(EventLoop *event_loop) { event_loop_ = event_loop; }
534 EventLoop *event_loop() { return event_loop_; }
535
Austin Schuh858c9f32020-08-31 16:56:12 -0700536 // Sets the current realtime offset from the monotonic clock for this node
537 // (if we are on a simulated event loop).
538 void SetRealtimeOffset(monotonic_clock::time_point monotonic_time,
539 realtime_clock::time_point realtime_time) {
540 if (node_event_loop_factory_ != nullptr) {
541 node_event_loop_factory_->SetRealtimeOffset(monotonic_time,
542 realtime_time);
543 }
544 }
545
Austin Schuh8d7e0bb2020-10-02 17:57:00 -0700546 // Returns the MessageHeader sender to log delivery timestamps to for the
547 // provided remote node.
548 aos::Sender<MessageHeader> *RemoteTimestampSender(
549 const Node *delivered_node);
550
Austin Schuh858c9f32020-08-31 16:56:12 -0700551 // Converts a timestamp from the monotonic clock on this node to the
552 // distributed clock.
553 distributed_clock::time_point ToDistributedClock(
554 monotonic_clock::time_point time) {
555 return node_event_loop_factory_->ToDistributedClock(time);
556 }
557
Austin Schuh2f8fd752020-09-01 22:38:28 -0700558 monotonic_clock::time_point FromDistributedClock(
559 distributed_clock::time_point time) {
560 return node_event_loop_factory_->FromDistributedClock(time);
561 }
562
Austin Schuh858c9f32020-08-31 16:56:12 -0700563 // Sets the offset (and slope) from the distributed clock.
564 void SetDistributedOffset(std::chrono::nanoseconds distributed_offset,
565 double distributed_slope) {
566 node_event_loop_factory_->SetDistributedOffset(distributed_offset,
567 distributed_slope);
568 }
569
570 // Returns the current time on the remote node which sends messages on
571 // channel_index.
572 monotonic_clock::time_point monotonic_remote_now(size_t channel_index) {
Austin Schuh8d7e0bb2020-10-02 17:57:00 -0700573 return channel_source_state_[channel_index]
574 ->node_event_loop_factory_->monotonic_now();
Austin Schuh858c9f32020-08-31 16:56:12 -0700575 }
576
Austin Schuh2f8fd752020-09-01 22:38:28 -0700577 distributed_clock::time_point RemoteToDistributedClock(
578 size_t channel_index, monotonic_clock::time_point time) {
Austin Schuh8d7e0bb2020-10-02 17:57:00 -0700579 return channel_source_state_[channel_index]
580 ->node_event_loop_factory_->ToDistributedClock(time);
Austin Schuh2f8fd752020-09-01 22:38:28 -0700581 }
582
583 const Node *remote_node(size_t channel_index) {
Austin Schuh8d7e0bb2020-10-02 17:57:00 -0700584 return channel_source_state_[channel_index]
585 ->node_event_loop_factory_->node();
Austin Schuh2f8fd752020-09-01 22:38:28 -0700586 }
587
588 monotonic_clock::time_point monotonic_now() {
589 return node_event_loop_factory_->monotonic_now();
590 }
591
Austin Schuh858c9f32020-08-31 16:56:12 -0700592 // Sets the node we will be merging as, and returns true if there is any
593 // data on it.
594 bool SetNode() { return channel_merger_->SetNode(event_loop_->node()); }
595
596 // Sets the number of channels.
597 void SetChannelCount(size_t count);
598
599 // Sets the sender, filter, and target factory for a channel.
Austin Schuh8d7e0bb2020-10-02 17:57:00 -0700600 void SetChannel(size_t logged_channel_index, size_t factory_channel_index,
601 std::unique_ptr<RawSender> sender,
Austin Schuh2f8fd752020-09-01 22:38:28 -0700602 message_bridge::NoncausalOffsetEstimator *filter,
Austin Schuh8d7e0bb2020-10-02 17:57:00 -0700603 aos::Sender<MessageHeader> *remote_timestamp_sender,
604 State *source_state);
Austin Schuh858c9f32020-08-31 16:56:12 -0700605
606 // Returns if we have read all the messages from all the logs.
607 bool at_end() const { return channel_merger_->at_end(); }
608
    // Unregisters everything so we can destroy the event loop.
610 void Deregister();
611
612 // Sets the current TimerHandle for the replay callback.
613 void set_timer_handler(TimerHandler *timer_handler) {
614 timer_handler_ = timer_handler;
615 }
616
617 // Sets the next wakeup time on the replay callback.
618 void Setup(monotonic_clock::time_point next_time) {
619 timer_handler_->Setup(next_time);
620 }
621
622 // Sends a buffer on the provided channel index.
623 bool Send(size_t channel_index, const void *data, size_t size,
Austin Schuh8d7e0bb2020-10-02 17:57:00 -0700624 const TimestampMerger::DeliveryTimestamp &delivery_timestamp);
Austin Schuh858c9f32020-08-31 16:56:12 -0700625
626 // Returns a debug string for the channel merger.
Austin Schuh2f8fd752020-09-01 22:38:28 -0700627 std::string DebugString() const {
628 std::stringstream messages;
629 size_t i = 0;
630 for (const auto &message : sorted_messages_) {
631 if (i < 7 || i + 7 > sorted_messages_.size()) {
632 messages << "sorted_messages[" << i
633 << "]: " << std::get<0>(message).monotonic_event_time << " "
634 << configuration::StrippedChannelToString(
635 event_loop_->configuration()->channels()->Get(
636 std::get<2>(message).message().channel_index()))
637 << "\n";
638 } else if (i == 7) {
639 messages << "...\n";
640 }
641 ++i;
642 }
643 return messages.str() + channel_merger_->DebugString();
644 }
Austin Schuh858c9f32020-08-31 16:56:12 -0700645
646 private:
647 // Log file.  The merger is what SetNode(), at_end(), and DebugString()
// forward to.
648 std::unique_ptr<ChannelMerger> channel_merger_;
649
// Deque of per-message tuples.  DebugString() reads the event time from
// element 0 (the delivery timestamp) and the channel index from element 2
// (the logged MessageHeader); element 3 is a filter pointer.
// NOTE(review): element 1 (int) is presumably a channel index -- confirm.
650 std::deque<std::tuple<TimestampMerger::DeliveryTimestamp, int,
Austin Schuh2f8fd752020-09-01 22:38:28 -0700651 FlatbufferVector<MessageHeader>,
652 message_bridge::NoncausalOffsetEstimator *>>
Austin Schuh858c9f32020-08-31 16:56:12 -0700653 sorted_messages_;
654
655 // Senders.
// NOTE(review): both vectors are presumably indexed by logged channel index,
// matching the arguments of SetChannel() -- confirm.
656 std::vector<std::unique_ptr<RawSender>> channels_;
Austin Schuh8d7e0bb2020-10-02 17:57:00 -0700657 std::vector<aos::Sender<MessageHeader> *> remote_timestamp_senders_;
658 // The mapping from logged channel index to sent channel index. Needed for
659 // sending out MessageHeaders.
660 std::vector<int> factory_channel_index_;
661
662 struct SentTimestamp {
663 monotonic_clock::time_point monotonic_event_time =
664 monotonic_clock::min_time;
665 realtime_clock::time_point realtime_event_time = realtime_clock::min_time;
666 uint32_t queue_index = 0xffffffff;
667
668 // The queue index that this message *actually* was sent with.
669 uint32_t actual_queue_index = 0xffffffff;
670 };
671
672 // Stores all the timestamps that have been sent on this channel. This is
673 // only done for channels which are forwarded and on the node which
674 // initially sends the message.
675 //
676 // TODO(austin): This whole concept is a hack. We should be able to
677 // associate state with the message as it gets sorted and recover it.
678 std::vector<std::unique_ptr<std::vector<SentTimestamp>>> queue_index_map_;
Austin Schuh858c9f32020-08-31 16:56:12 -0700679
680 // Factory (if we are in sim) that this loop was created on.
681 NodeEventLoopFactory *node_event_loop_factory_ = nullptr;
682 std::unique_ptr<EventLoop> event_loop_unique_ptr_;
683 // Event loop.
684 EventLoop *event_loop_ = nullptr;
685 // And timer used to send messages.
686 TimerHandler *timer_handler_;
687
Austin Schuh8bd96322020-02-13 21:18:22 -0800688 // Filters (or nullptr if it isn't a forwarded channel) for each channel.
689 // This corresponds to the object which is shared among all the channels
690 // going between 2 nodes. The second element in the tuple indicates if this
691 // is the primary direction or not.
Austin Schuh2f8fd752020-09-01 22:38:28 -0700692 std::vector<message_bridge::NoncausalOffsetEstimator *> filters_;
Austin Schuh8bd96322020-02-13 21:18:22 -0800693
694 // List of NodeEventLoopFactorys (or nullptr if it isn't a forwarded
695 // channel) which correspond to the originating node.
Austin Schuh8d7e0bb2020-10-02 17:57:00 -0700696 std::vector<State *> channel_source_state_;
697
698 std::map<const Node *, aos::Sender<MessageHeader>>
699 remote_timestamp_senders_map_;
Austin Schuh6f3babe2020-01-26 20:34:50 -0800700 };
701
Austin Schuh8bd96322020-02-13 21:18:22 -0800702 // Node index -> State.  Each State is held through a std::unique_ptr.
703 std::vector<std::unique_ptr<State>> states_;
704
705 // Creates the requested filter if it doesn't exist, regardless of whether
706 // these nodes can actually communicate directly.  Returns a pointer to the
707 // filter for the (node_a, node_b) pair; there is no second return value.
Austin Schuh2f8fd752020-09-01 22:38:28 -0700708 message_bridge::NoncausalOffsetEstimator *GetFilter(const Node *node_a,
709 const Node *node_b);
Austin Schuh8bd96322020-02-13 21:18:22 -0800710
711 // FILE to write offsets to (if populated).  Null by default.
712 FILE *offset_fp_ = nullptr;
713 // Timestamp of the first piece of data used for the horizontal axis on the
714 // plot.
715 aos::realtime_clock::time_point first_time_;
716
717 // List of filters for a connection.  The pointer to the first node will be
718 // less than the second node.  The map is keyed by the pair of node pointers
// and each value is a one-element std::tuple wrapping the estimator.
719 std::map<std::tuple<const Node *, const Node *>,
Austin Schuh2f8fd752020-09-01 22:38:28 -0700720 std::tuple<message_bridge::NoncausalOffsetEstimator>>
Austin Schuh8bd96322020-02-13 21:18:22 -0800721 filters_;
722
723 // Returns the offset from the monotonic clock for a node to the distributed
Austin Schuh2f8fd752020-09-01 22:38:28 -0700724 // clock. monotonic = distributed * slope() + offset();
725 double slope(int node_index) const {
726 CHECK_LT(node_index, time_slope_matrix_.rows())
James Kuszmaul46d82582020-05-09 19:50:09 -0700727 << ": Got too high of a node index.";
Austin Schuh2f8fd752020-09-01 22:38:28 -0700728 return time_slope_matrix_(node_index);
729 }
730 std::chrono::nanoseconds offset(int node_index) const {
731 CHECK_LT(node_index, time_offset_matrix_.rows())
732 << ": Got too high of a node index.";
733 return std::chrono::duration_cast<std::chrono::nanoseconds>(
734 std::chrono::duration<double>(time_offset_matrix_(node_index)));
Austin Schuh8bd96322020-02-13 21:18:22 -0800735 }
736
737 // Updates the offset matrix solution and sets the per-node distributed
738 // offsets in the factory.
// NOTE(review): defined out of line; presumably this is what refreshes
// time_slope_matrix_ and time_offset_matrix_ read by slope() and offset() --
// confirm against the definition.
739 void UpdateOffsets();
740
Austin Schuh2f8fd752020-09-01 22:38:28 -0700741 // We have 2 types of equations to do a least squares regression over to fully
742 // constrain our time function.
743 //
744 // One is simple. The distributed clock is the average of all the clocks.
Brian Silverman87ac0402020-09-17 14:47:01 -0700745 // (ta + tb + tc + td) / num_nodes = t_distributed
Austin Schuh2f8fd752020-09-01 22:38:28 -0700746 //
747 // The second is a bit more complicated. Our basic time conversion function
748 // is:
749 // tb = ta + (ta * slope + offset)
750 // We can rewrite this as follows
751 // tb - (1 + slope) * ta = offset
752 //
753 // From here, we have enough equations to solve for t{a,b,c,...} We want to
754 // take as an input the offsets and slope, and solve for the per-node times as
755 // a function of the distributed clock.
756 //
757 // We need to massage our equations to make this work. If we solve for the
758 // per-node times at two set distributed clock times, we will be able to
759 // recreate the linear function (we know it is linear). We can do a similar
760 // thing by breaking our equation up into:
Brian Silverman87ac0402020-09-17 14:47:01 -0700761 //
Austin Schuh2f8fd752020-09-01 22:38:28 -0700762 // [1/3 1/3 1/3 ] [ta] [t_distributed]
763 // [ 1 -1-m1 0 ] [tb] = [oab]
764 // [ 1 0 -1-m2 ] [tc] [oac]
765 //
766 // This solves to:
767 //
768 // [ta] [ a00 a01 a02] [t_distributed]
769 // [tb] = [ a10 a11 a12] * [oab]
770 // [tc] [ a20 a21 a22] [oac]
771 //
772 // and can be split into:
773 //
774 // [ta] [ a00 ] [a01 a02]
775 // [tb] = [ a10 ] * t_distributed + [a11 a12] * [oab]
776 // [tc] [ a20 ] [a21 a22] [oac]
777 //
778 // (map_matrix_ + slope_matrix_) * [ta; tb; tc] = [offset_matrix_];
779 // offset_matrix_ will be in nanoseconds.
780 Eigen::Matrix<mpq_class, Eigen::Dynamic, Eigen::Dynamic> map_matrix_;
781 Eigen::Matrix<mpq_class, Eigen::Dynamic, Eigen::Dynamic> slope_matrix_;
782 Eigen::Matrix<mpq_class, Eigen::Dynamic, 1> offset_matrix_;
783 // Matrix tracking which offsets are valid.
784 Eigen::Matrix<bool, Eigen::Dynamic, 1> valid_matrix_;
785 // Matrix tracking the last valid matrix we used to determine connected nodes.
786 Eigen::Matrix<bool, Eigen::Dynamic, 1> last_valid_matrix_;
// NOTE(review): presumably the number of valid nodes computed alongside
// last_valid_matrix_ -- confirm against UpdateOffsets().
787 size_t cached_valid_node_count_ = 0;
Austin Schuh8bd96322020-02-13 21:18:22 -0800788
Austin Schuh2f8fd752020-09-01 22:38:28 -0700789 // [ta; tb; tc] = time_slope_matrix_ * t + time_offset_matrix_;
790 // t is in seconds.  offset() also treats the entries of
// time_offset_matrix_ as seconds: it wraps them in
// std::chrono::duration<double> before converting to nanoseconds.
791 Eigen::Matrix<double, Eigen::Dynamic, 1> time_slope_matrix_;
792 Eigen::Matrix<double, Eigen::Dynamic, 1> time_offset_matrix_;
Austin Schuhe309d2a2019-11-29 13:25:21 -0800793
James Kuszmaul84ff3e52020-01-03 19:48:53 -0800794 std::unique_ptr<FlatbufferDetachedBuffer<Configuration>>
795 remapped_configuration_buffer_;
796
James Kuszmaul84ff3e52020-01-03 19:48:53 -0800797 std::unique_ptr<SimulatedEventLoopFactory> event_loop_factory_unique_ptr_;
798 SimulatedEventLoopFactory *event_loop_factory_ = nullptr;
James Kuszmaulc7bbb3e2020-01-03 20:01:00 -0800799
800 // Map of channel indices to new name.  The key is the channel index, which
801 // is an index into logged_configuration(); the string value is the name of
802 // the channel to send on instead of the logged channel name.
803 std::map<size_t, std::string> remapped_channels_;
Austin Schuh01b4c352020-09-21 23:09:39 -0700804 std::vector<MapT> maps_;
James Kuszmaulc7bbb3e2020-01-03 20:01:00 -0800805
Austin Schuh6f3babe2020-01-26 20:34:50 -0800806 // Number of nodes which still have data to send. This is used to figure out
807 // when to exit.
808 size_t live_nodes_ = 0;
809
James Kuszmaulc7bbb3e2020-01-03 20:01:00 -0800810 const Configuration *remapped_configuration_ = nullptr;
811 const Configuration *replay_configuration_ = nullptr;
Austin Schuhcde938c2020-02-02 17:30:07 -0800812
813 // If true, the replay timer will ignore any missing data. This is used
814 // during startup when we are bootstrapping everything and trying to get to
815 // the start of all the log files.
816 bool ignore_missing_data_ = false;
Austin Schuhe309d2a2019-11-29 13:25:21 -0800817};
818
819} // namespace logger
820} // namespace aos
821
822#endif // AOS_EVENTS_LOGGER_H_