#include "aos/events/logging/log_writer.h"

#include <dirent.h>

#include <functional>
#include <map>
#include <vector>

#include "aos/configuration.h"
#include "aos/events/event_loop.h"
#include "aos/network/message_bridge_server_generated.h"
#include "aos/network/team_number.h"
#include "aos/network/timestamp_channel.h"

namespace aos {
namespace logger {
namespace {
using message_bridge::RemoteMessage;
namespace chrono = std::chrono;
}  // namespace

Logger::Logger(EventLoop *event_loop, const Configuration *configuration,
               std::function<bool(const Channel *)> should_log)
    : event_loop_(event_loop),
      configuration_(configuration),
      node_(configuration::GetNode(configuration_, event_loop->node())),
      node_index_(configuration::GetNodeIndex(configuration_, node_)),
      name_(network::GetHostname()),
      timer_handler_(event_loop_->AddTimer(
          [this]() { DoLogData(event_loop_->monotonic_now(), true); })),
      server_statistics_fetcher_(
          configuration::MultiNode(event_loop_->configuration())
              ? event_loop_->MakeFetcher<message_bridge::ServerStatistics>(
                    "/aos")
              : aos::Fetcher<message_bridge::ServerStatistics>()) {
  timer_handler_->set_name("channel_poll");
  VLOG(1) << "Creating logger for " << FlatbufferToJson(node_);

  // When we are logging remote timestamps, we need to be able to translate
  // from the channel index that the event loop uses to the channel index in
  // the config in the log file.
  event_loop_to_logged_channel_index_.resize(
      event_loop->configuration()->channels()->size(), -1);
  for (size_t event_loop_channel_index = 0;
       event_loop_channel_index <
       event_loop->configuration()->channels()->size();
       ++event_loop_channel_index) {
    const Channel *event_loop_channel =
        event_loop->configuration()->channels()->Get(event_loop_channel_index);

    const Channel *logged_channel = aos::configuration::GetChannel(
        configuration_, event_loop_channel->name()->string_view(),
        event_loop_channel->type()->string_view(), "", node_);

    if (logged_channel != nullptr) {
      event_loop_to_logged_channel_index_[event_loop_channel_index] =
          configuration::ChannelIndex(configuration_, logged_channel);
    }
  }

  // Map to match source channels with the timestamp logger, whether the
  // contents should be reliable, and a list of all channels logged on it to
  // be treated as reliable.
  std::map<const Channel *, std::tuple<const Node *, bool, std::vector<bool>>>
      timestamp_logger_channels;

  message_bridge::ChannelTimestampFinder finder(event_loop_);
  for (const Channel *channel : *event_loop_->configuration()->channels()) {
    if (!configuration::ChannelIsSendableOnNode(channel, event_loop_->node())) {
      continue;
    }
    if (!channel->has_destination_nodes()) {
      continue;
    }
    const size_t channel_index =
        configuration::ChannelIndex(event_loop_->configuration(), channel);

    for (const Connection *connection : *channel->destination_nodes()) {
      if (configuration::ConnectionDeliveryTimeIsLoggedOnNode(
              connection, event_loop_->node())) {
        const Node *other_node = configuration::GetNode(
            configuration_, connection->name()->string_view());

        VLOG(1) << "Timestamps are logged from "
                << FlatbufferToJson(other_node);
        // True if each channel's remote timestamps are split into a separate
        // RemoteMessage channel.
        const bool is_split =
            finder.SplitChannelForChannel(channel, connection) != nullptr;

        const Channel *const timestamp_logger_channel =
            finder.ForChannel(channel, connection);

        auto it = timestamp_logger_channels.find(timestamp_logger_channel);
        if (it != timestamp_logger_channels.end()) {
          CHECK(!is_split);
          CHECK_LT(channel_index, std::get<2>(it->second).size());
          std::get<2>(it->second)[channel_index] =
              (connection->time_to_live() == 0);
        } else {
          if (is_split) {
            timestamp_logger_channels.insert(std::make_pair(
                timestamp_logger_channel,
                std::make_tuple(other_node, (connection->time_to_live() == 0),
                                std::vector<bool>())));
          } else {
            std::vector<bool> channel_reliable_contents(
                event_loop->configuration()->channels()->size(), false);
            channel_reliable_contents[channel_index] =
                (connection->time_to_live() == 0);

            timestamp_logger_channels.insert(std::make_pair(
                timestamp_logger_channel,
                std::make_tuple(other_node, false,
                                std::move(channel_reliable_contents))));
          }
        }
      }
    }
  }

  for (size_t channel_index = 0;
       channel_index < configuration_->channels()->size(); ++channel_index) {
    const Channel *const config_channel =
        configuration_->channels()->Get(channel_index);
    // The MakeRawFetcher method needs a channel which is in the event loop
    // configuration() object, not the configuration_ object.  Go look that up
    // from the config.
    const Channel *channel = aos::configuration::GetChannel(
        event_loop_->configuration(), config_channel->name()->string_view(),
        config_channel->type()->string_view(), "", event_loop_->node());
    CHECK(channel != nullptr)
        << ": Failed to look up channel "
        << aos::configuration::CleanedChannelToString(config_channel);
    if (!should_log(config_channel)) {
      continue;
    }

    FetcherStruct fs;
    fs.channel_index = channel_index;
    fs.channel = channel;

    const bool is_local =
        configuration::ChannelIsSendableOnNode(config_channel, node_);

    const bool is_readable =
        configuration::ChannelIsReadableOnNode(config_channel, node_);
    const bool is_logged =
        configuration::ChannelMessageIsLoggedOnNode(config_channel, node_);
    const bool log_message = is_logged && is_readable;

    bool log_delivery_times = false;
    if (configuration::MultiNode(configuration_)) {
      const aos::Connection *connection =
          configuration::ConnectionToNode(config_channel, node_);

      log_delivery_times = configuration::ConnectionDeliveryTimeIsLoggedOnNode(
          connection, event_loop_->node());

      CHECK_EQ(log_delivery_times,
               configuration::ConnectionDeliveryTimeIsLoggedOnNode(
                   config_channel, node_, node_));

      if (connection) {
        fs.reliable_forwarding = (connection->time_to_live() == 0);
      }
    }

    // Now, detect a RemoteMessage timestamp logger where we should just log
    // the contents to a file directly.
    const bool log_contents = timestamp_logger_channels.find(channel) !=
                              timestamp_logger_channels.end();

    if (log_message || log_delivery_times || log_contents) {
      fs.fetcher = event_loop->MakeRawFetcher(channel);
      VLOG(1) << "Logging channel "
              << configuration::CleanedChannelToString(channel);

      if (log_delivery_times) {
        VLOG(1) << " Delivery times";
        fs.wants_timestamp_writer = true;
        fs.timestamp_node_index = static_cast<int>(node_index_);
      }
      // Both the timestamp and data writers want data_node_index so they know
      // what the source node is.
      if (log_message || log_delivery_times) {
        if (!is_local) {
          const Node *source_node = configuration::GetNode(
              configuration_, channel->source_node()->string_view());
          fs.data_node_index =
              configuration::GetNodeIndex(configuration_, source_node);
        }
      }
      if (log_message) {
        VLOG(1) << " Data";
        fs.wants_writer = true;
        if (!is_local) {
          fs.log_type = LogType::kLogRemoteMessage;
        } else {
          fs.data_node_index = static_cast<int>(node_index_);
        }
      }
      if (log_contents) {
        VLOG(1) << "Timestamp logger channel "
                << configuration::CleanedChannelToString(channel);
        auto timestamp_logger_channel_info =
            timestamp_logger_channels.find(channel);
        CHECK(timestamp_logger_channel_info !=
              timestamp_logger_channels.end());
        fs.timestamp_node = std::get<0>(timestamp_logger_channel_info->second);
        fs.reliable_contents =
            std::get<1>(timestamp_logger_channel_info->second);
        fs.channel_reliable_contents =
            std::get<2>(timestamp_logger_channel_info->second);
        fs.wants_contents_writer = true;
        fs.contents_node_index =
            configuration::GetNodeIndex(configuration_, fs.timestamp_node);
      }
      fetchers_.emplace_back(std::move(fs));
    }
  }
}

Logger::~Logger() {
  if (log_namer_) {
    // If we are replaying a log file, or in simulation, we want to force the
    // last bit of data to be logged.  The easiest way to deal with this is to
    // poll everything as we go to destroy the class, i.e., shut down the
    // logger, and write it to disk.
    StopLogging(event_loop_->monotonic_now());
  }
}

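// Moves the log's base name to new_base_name.  Only the directory portion of
// the base may change; the trailing file-name component must match the
// existing one.  Returns false if the rename could not be completed.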
bool Logger::RenameLogBase(std::string new_base_name) {
  if (new_base_name == log_namer_->base_name()) {
    return true;
  }
  std::string current_directory = std::string(log_namer_->base_name());
  std::string new_directory = new_base_name;

  auto current_path_split = current_directory.rfind("/");
  auto new_path_split = new_directory.rfind("/");

  CHECK(new_base_name.substr(new_path_split) ==
        current_directory.substr(current_path_split))
      << "Rename of file base from " << current_directory << " to "
      << new_directory << " is not supported.";

  current_directory.resize(current_path_split);
  new_directory.resize(new_path_split);
  DIR *dir = opendir(current_directory.c_str());
  if (dir) {
    closedir(dir);
    const int result =
        rename(current_directory.c_str(), new_directory.c_str());
    if (result != 0) {
      PLOG(ERROR) << "Unable to rename " << current_directory << " to "
                  << new_directory;
      return false;
    }
  } else {
    // Handle the case where the directory was already renamed.
    dir = opendir(new_directory.c_str());
    if (!dir) {
      LOG(ERROR) << "Old directory " << current_directory
                 << " missing and new directory " << new_directory
                 << " not present.";
      return false;
    }
    closedir(dir);
  }

  log_namer_->set_base_name(new_base_name);
  Rotate();
  return true;
}

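// When the configuration is logged separately from the data, writes it out
// through the provided namer and returns its sha256; otherwise returns an
// empty string.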
std::string Logger::WriteConfiguration(LogNamer *log_namer) {
  std::string config_sha256;

  if (separate_config_) {
    flatbuffers::FlatBufferBuilder fbb;
    flatbuffers::Offset<aos::Configuration> configuration_offset =
        CopyFlatBuffer(configuration_, &fbb);
    LogFileHeader::Builder log_file_header_builder(fbb);
    log_file_header_builder.add_configuration(configuration_offset);
    fbb.FinishSizePrefixed(log_file_header_builder.Finish());
    aos::SizePrefixedFlatbufferDetachedBuffer<LogFileHeader> config_header(
        fbb.Release());
    config_sha256 = Sha256(config_header.span());
    LOG(INFO) << "Config sha256 of " << config_sha256;
    log_namer->WriteConfiguration(&config_header, config_sha256);
  }

  return config_sha256;
}

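// Starts logging to files named by the given namer.  A minimal usage sketch
// follows; the single-argument Logger constructor, set_polling_period(), and
// the MultiNodeLogNamer constructor arguments shown here are assumptions
// based on typical usage, not guarantees of this exact API:
//
//   aos::logger::Logger logger(event_loop);
//   logger.set_polling_period(std::chrono::milliseconds(100));
//   logger.StartLogging(std::make_unique<aos::logger::MultiNodeLogNamer>(
//       "/tmp/logs/log_base_", event_loop));
//   // ... run the event loop ...
//   std::unique_ptr<aos::logger::LogNamer> namer =
//       logger.StopLogging(event_loop->monotonic_now());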
void Logger::StartLogging(std::unique_ptr<LogNamer> log_namer,
                          std::optional<UUID> log_start_uuid) {
  CHECK(!log_namer_) << ": Already logging";

  VLOG(1) << "Starting logger for " << FlatbufferToJson(node_);

  auto config_sha256 = WriteConfiguration(log_namer.get());

  log_namer_ = std::move(log_namer);

  log_event_uuid_ = UUID::Random();
  log_start_uuid_ = log_start_uuid;

  // We want to do as much work as possible before the initial Fetch.  Time
  // between that and actually starting to log opens up the possibility of
  // falling off the end of the queue during that time.

  for (FetcherStruct &f : fetchers_) {
    if (f.wants_writer) {
      f.writer = log_namer_->MakeWriter(f.channel);
    }
    if (f.wants_timestamp_writer) {
      f.timestamp_writer = log_namer_->MakeTimestampWriter(f.channel);
    }
    if (f.wants_contents_writer) {
      f.contents_writer = log_namer_->MakeForwardedTimestampWriter(
          f.channel, CHECK_NOTNULL(f.timestamp_node));
    }
  }

  log_namer_->SetHeaderTemplate(MakeHeader(config_sha256));

  const aos::monotonic_clock::time_point beginning_time =
      event_loop_->monotonic_now();

  // Grab data from each channel right before we declare the log file started
  // so we can capture the latest message on each channel.  This lets us log
  // non-periodic messages, like configuration data sent once at startup,
  // that would otherwise be missed.
  for (FetcherStruct &f : fetchers_) {
    const auto start = event_loop_->monotonic_now();
    const bool got_new = f.fetcher->Fetch();
    const auto end = event_loop_->monotonic_now();
    RecordFetchResult(start, end, got_new, &f);

    // If there is a message, we want to write it.
    f.written = f.fetcher->context().data == nullptr;
  }

  // Clear out any old timestamps in case we are re-starting logging.
  for (size_t i = 0; i < configuration::NodesCount(configuration_); ++i) {
    log_namer_->ClearStartTimes();
  }

  const aos::monotonic_clock::time_point fetch_time =
      event_loop_->monotonic_now();
  WriteHeader();
  const aos::monotonic_clock::time_point header_time =
      event_loop_->monotonic_now();

  VLOG(1) << "Logging node as " << FlatbufferToJson(node_) << " start_time "
          << last_synchronized_time_ << ", took "
          << chrono::duration<double>(fetch_time - beginning_time).count()
          << " to fetch, "
          << chrono::duration<double>(header_time - fetch_time).count()
          << " to write headers, boot uuid " << event_loop_->boot_uuid();

  // Force logging up until the start of the log file now, so the messages at
  // the start are always ordered before the rest of the messages.
  // Note: this ship may have already sailed, but we don't have to make it
  // worse.
  // TODO(austin): Test...
  //
  // This is safe to call here since we have set last_synchronized_time_ as
  // the same time as in the header, and all the data before it should be
  // logged without ordering concerns.
  LogUntil(last_synchronized_time_);

  timer_handler_->Setup(event_loop_->monotonic_now() + polling_period_,
                        polling_period_);
}

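// Switches logging over to a new namer without dropping messages: flushes
// everything pending to the old writers, re-fetches so the last message on
// each channel also starts the new log, and returns the old namer so the
// caller can finish closing it out.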
std::unique_ptr<LogNamer> Logger::RestartLogging(
    std::unique_ptr<LogNamer> log_namer, std::optional<UUID> log_start_uuid) {
  CHECK(log_namer_) << ": Unexpected restart while not logging";

  VLOG(1) << "Restarting logger for " << FlatbufferToJson(node_);

  // Force out every currently pending message, pointing all fetchers at the
  // last (currently available) records.  Note that LogUntil() updates
  // last_synchronized_time_ to the time value that it receives.
  while (LogUntil(last_synchronized_time_ + polling_period_)) {
  }

  std::unique_ptr<LogNamer> old_log_namer = std::move(log_namer_);
  log_namer_ = std::move(log_namer);

  const aos::monotonic_clock::time_point beginning_time =
      event_loop_->monotonic_now();

  auto config_sha256 = WriteConfiguration(log_namer_.get());

  log_event_uuid_ = UUID::Random();
  log_start_uuid_ = log_start_uuid;

  log_namer_->SetHeaderTemplate(MakeHeader(config_sha256));

  // Note that WriteHeader updates last_synchronized_time_ to be the
  // current time when it is called, which is then the "start time"
  // of the new (restarted) log.  This timestamp will be after
  // the timestamp of the last message fetched on each channel.
  WriteHeader();

  const aos::monotonic_clock::time_point header_time =
      event_loop_->monotonic_now();

  // Write the transition record(s) for each channel ...
  for (FetcherStruct &f : fetchers_) {
    // Create writers from the new namer.
    NewDataWriter *next_writer = nullptr;
    NewDataWriter *next_timestamp_writer = nullptr;
    NewDataWriter *next_contents_writer = nullptr;

    if (f.wants_writer) {
      next_writer = log_namer_->MakeWriter(f.channel);
    }
    if (f.wants_timestamp_writer) {
      next_timestamp_writer = log_namer_->MakeTimestampWriter(f.channel);
    }
    if (f.wants_contents_writer) {
      next_contents_writer = log_namer_->MakeForwardedTimestampWriter(
          f.channel, CHECK_NOTNULL(f.timestamp_node));
    }

    if (f.fetcher->context().data != nullptr) {
      // Write the last message fetched as the first message of the new log of
      // this type.  The timestamps on these will all be before the new start
      // time.
      WriteData(next_writer, f);
      WriteTimestamps(next_timestamp_writer, f);
      WriteContent(next_contents_writer, f);

      // It is possible that a few more snuck in.  Write them all out also,
      // including any that should also be in the old log.
      while (true) {
        // Get the next message ...
        const auto start = event_loop_->monotonic_now();
        const bool got_new = f.fetcher->FetchNext();
        const auto end = event_loop_->monotonic_now();
        RecordFetchResult(start, end, got_new, &f);

        if (got_new) {
          if (f.fetcher->context().monotonic_event_time <
              last_synchronized_time_) {
            WriteFetchedRecord(f);
          }

          WriteData(next_writer, f);
          WriteTimestamps(next_timestamp_writer, f);
          WriteContent(next_contents_writer, f);

          if (f.fetcher->context().monotonic_event_time >
              last_synchronized_time_) {
            break;
          }
        } else {
          break;
        }
      }
    }

    // Switch fully over to the new writers.
    f.writer = next_writer;
    f.timestamp_writer = next_timestamp_writer;
    f.contents_writer = next_contents_writer;
    f.written = true;
  }

  const aos::monotonic_clock::time_point channel_time =
      event_loop_->monotonic_now();

  VLOG(1) << "Logging node as " << FlatbufferToJson(node_) << " restart_time "
          << last_synchronized_time_ << ", took "
          << chrono::duration<double>(header_time - beginning_time).count()
          << " to prepare and write header, "
          << chrono::duration<double>(channel_time - header_time).count()
          << " to write initial channel messages, boot uuid "
          << event_loop_->boot_uuid();

  return old_log_namer;
}

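// Stops logging, flushing any data buffered up to end_time, and hands the
// namer back to the caller so the last files can be finalized.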
std::unique_ptr<LogNamer> Logger::StopLogging(
    aos::monotonic_clock::time_point end_time) {
  CHECK(log_namer_) << ": Not logging right now";

  if (end_time != aos::monotonic_clock::min_time) {
    // Folks like to use the on_logged_period_ callback to trigger stop and
    // start events.  We can't have those then recurse and try to stop again.
    // Rather than making everything reentrant, let's just instead block the
    // callback here.
    DoLogData(end_time, false);
  }
  timer_handler_->Disable();

  for (FetcherStruct &f : fetchers_) {
    f.writer = nullptr;
    f.timestamp_writer = nullptr;
    f.contents_writer = nullptr;
  }

  log_event_uuid_ = UUID::Zero();
  log_start_uuid_ = std::nullopt;

  return std::move(log_namer_);
}

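// Declares the log started: picks the start time, records it as
// last_synchronized_time_, and pushes start times for every node the namer
// knows about.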
void Logger::WriteHeader() {
  if (configuration::MultiNode(configuration_)) {
    server_statistics_fetcher_.Fetch();
  }

  const aos::monotonic_clock::time_point monotonic_start_time =
      event_loop_->monotonic_now();
  const aos::realtime_clock::time_point realtime_start_time =
      event_loop_->realtime_now();

  // We need to pick a point in time to declare the log file "started".  This
  // starts here.  It needs to be after everything is fetched so that the
  // fetchers are all pointed at the most recent message before the start
  // time.
  last_synchronized_time_ = monotonic_start_time;

  for (const Node *node : log_namer_->nodes()) {
    const int node_index = configuration::GetNodeIndex(configuration_, node);
    MaybeUpdateTimestamp(node, node_index, monotonic_start_time,
                         realtime_start_time);
  }
}

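// Fills in start times for any remote nodes whose offsets were not known when
// the header was written, using the latest ServerStatistics message.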
void Logger::WriteMissingTimestamps() {
  if (configuration::MultiNode(configuration_)) {
    server_statistics_fetcher_.Fetch();
  } else {
    return;
  }

  if (server_statistics_fetcher_.get() == nullptr) {
    return;
  }

  for (const Node *node : log_namer_->nodes()) {
    const int node_index = configuration::GetNodeIndex(configuration_, node);
    if (MaybeUpdateTimestamp(
            node, node_index,
            server_statistics_fetcher_.context().monotonic_event_time,
            server_statistics_fetcher_.context().realtime_event_time)) {
      VLOG(1) << "Timestamps changed on " << aos::FlatbufferToJson(node);
    }
  }
}

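// Sets the start time for the given node if it is not already set.  For the
// local node this always succeeds; for remote nodes it requires a connected
// message bridge with a known monotonic offset.  Returns true if a start time
// was recorded.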
bool Logger::MaybeUpdateTimestamp(
    const Node *node, int node_index,
    aos::monotonic_clock::time_point monotonic_start_time,
    aos::realtime_clock::time_point realtime_start_time) {
  // Bail early if the start times are already set.
  if (node_ == node || !configuration::MultiNode(configuration_)) {
    if (log_namer_->monotonic_start_time(node_index,
                                         event_loop_->boot_uuid()) !=
        monotonic_clock::min_time) {
      return false;
    }

    // There are no offsets to compute for ourself, so always succeed.
    log_namer_->SetStartTimes(node_index, event_loop_->boot_uuid(),
                              monotonic_start_time, realtime_start_time,
                              monotonic_start_time, realtime_start_time);
    return true;
  } else if (server_statistics_fetcher_.get() != nullptr) {
    // We must be a remote node now.  Look for the connection and see if it is
    // connected.
    CHECK(server_statistics_fetcher_->has_connections());

    for (const message_bridge::ServerConnection *connection :
         *server_statistics_fetcher_->connections()) {
      if (connection->node()->name()->string_view() !=
          node->name()->string_view()) {
        continue;
      }

      if (connection->state() != message_bridge::State::CONNECTED) {
        VLOG(1) << node->name()->string_view()
                << " is not connected, can't start it yet.";
        break;
      }

      if (!connection->has_monotonic_offset()) {
        VLOG(1) << "Missing monotonic offset for setting start time for node "
                << aos::FlatbufferToJson(node);
        break;
      }

      CHECK(connection->has_boot_uuid());
      const UUID boot_uuid =
          UUID::FromString(connection->boot_uuid()->string_view());

      if (log_namer_->monotonic_start_time(node_index, boot_uuid) !=
          monotonic_clock::min_time) {
        break;
      }

      VLOG(1) << "Updating start time for "
              << aos::FlatbufferToJson(connection);

      // Found it and it is connected.  Compensate and go.
      log_namer_->SetStartTimes(
          node_index, boot_uuid,
          monotonic_start_time +
              std::chrono::nanoseconds(connection->monotonic_offset()),
          realtime_start_time, monotonic_start_time, realtime_start_time);
      return true;
    }
  }
  return false;
}

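// Builds the LogFileHeader flatbuffer used as the template for every part
// file: identifying UUIDs, the logger node, the max out-of-order duration,
// and either the full configuration or its sha256.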
aos::SizePrefixedFlatbufferDetachedBuffer<LogFileHeader> Logger::MakeHeader(
    std::string_view config_sha256) {
  flatbuffers::FlatBufferBuilder fbb;
  fbb.ForceDefaults(true);

  flatbuffers::Offset<aos::Configuration> configuration_offset;
  if (!separate_config_) {
    configuration_offset = CopyFlatBuffer(configuration_, &fbb);
  } else {
    CHECK(!config_sha256.empty());
  }

  const flatbuffers::Offset<flatbuffers::String> name_offset =
      fbb.CreateString(name_);

  CHECK(log_event_uuid_ != UUID::Zero());
  const flatbuffers::Offset<flatbuffers::String> log_event_uuid_offset =
      log_event_uuid_.PackString(&fbb);

  const flatbuffers::Offset<flatbuffers::String> logger_instance_uuid_offset =
      logger_instance_uuid_.PackString(&fbb);

  flatbuffers::Offset<flatbuffers::String> log_start_uuid_offset;
  if (log_start_uuid_) {
    log_start_uuid_offset = fbb.CreateString(log_start_uuid_->ToString());
  }

  flatbuffers::Offset<flatbuffers::String> config_sha256_offset;
  if (!config_sha256.empty()) {
    config_sha256_offset = fbb.CreateString(config_sha256);
  }

  const flatbuffers::Offset<flatbuffers::String> logger_node_boot_uuid_offset =
      event_loop_->boot_uuid().PackString(&fbb);

  flatbuffers::Offset<Node> logger_node_offset;

  if (configuration::MultiNode(configuration_)) {
    logger_node_offset = RecursiveCopyFlatBuffer(node_, &fbb);
  }

  aos::logger::LogFileHeader::Builder log_file_header_builder(fbb);

  log_file_header_builder.add_name(name_offset);

  // Only add the node if we are running in a multinode configuration.
  if (configuration::MultiNode(configuration_)) {
    log_file_header_builder.add_logger_node(logger_node_offset);
  }

  if (!configuration_offset.IsNull()) {
    log_file_header_builder.add_configuration(configuration_offset);
  }
  // The worst case theoretical out of order is the polling period times 2.
  // One message could get logged right after the boundary, but be for right
  // before the next boundary.  And the reverse could happen for another
  // message.  Report back 3x to be extra safe, and because the cost isn't
  // huge on the read side.
  log_file_header_builder.add_max_out_of_order_duration(
      std::chrono::nanoseconds(3 * polling_period_).count());

  log_file_header_builder.add_log_event_uuid(log_event_uuid_offset);
  log_file_header_builder.add_logger_instance_uuid(logger_instance_uuid_offset);
  if (!log_start_uuid_offset.IsNull()) {
    log_file_header_builder.add_log_start_uuid(log_start_uuid_offset);
  }
  log_file_header_builder.add_logger_node_boot_uuid(
      logger_node_boot_uuid_offset);

  if (!config_sha256_offset.IsNull()) {
    log_file_header_builder.add_configuration_sha256(config_sha256_offset);
  }

  fbb.FinishSizePrefixed(log_file_header_builder.Finish());
  aos::SizePrefixedFlatbufferDetachedBuffer<LogFileHeader> result(
      fbb.Release());

  CHECK(result.Verify()) << ": Built a corrupted header.";

  return result;
}

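// Clears the accumulated fetch and copy timing statistics.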
void Logger::ResetStatisics() {
  max_message_fetch_time_ = std::chrono::nanoseconds::zero();
  max_message_fetch_time_channel_ = -1;
  max_message_fetch_time_size_ = -1;
  total_message_fetch_time_ = std::chrono::nanoseconds::zero();
  total_message_fetch_count_ = 0;
  total_message_fetch_bytes_ = 0;
  total_nop_fetch_time_ = std::chrono::nanoseconds::zero();
  total_nop_fetch_count_ = 0;
  max_copy_time_ = std::chrono::nanoseconds::zero();
  max_copy_time_channel_ = -1;
  max_copy_time_size_ = -1;
  total_copy_time_ = std::chrono::nanoseconds::zero();
  total_copy_count_ = 0;
  total_copy_bytes_ = 0;
}

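// Rotates the underlying files for every node, starting a new log part for
// each.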
void Logger::Rotate() {
  for (const Node *node : log_namer_->nodes()) {
    log_namer_->Rotate(node);
  }
}

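// Packs the currently fetched message on f's channel and queues it on the
// given data writer, tracking how long the copy took.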
void Logger::WriteData(NewDataWriter *writer, const FetcherStruct &f) {
  if (writer != nullptr) {
    const UUID source_node_boot_uuid =
        static_cast<int>(node_index_) != f.data_node_index
            ? f.fetcher->context().source_boot_uuid
            : event_loop_->boot_uuid();
    // Write!
    const auto start = event_loop_->monotonic_now();
    flatbuffers::FlatBufferBuilder fbb(f.fetcher->context().size +
                                       max_header_size_);
    fbb.ForceDefaults(true);

    fbb.FinishSizePrefixed(PackMessage(&fbb, f.fetcher->context(),
                                       f.channel_index, f.log_type));
    const auto end = event_loop_->monotonic_now();
    RecordCreateMessageTime(start, end, f);

    max_header_size_ = std::max(max_header_size_,
                                fbb.GetSize() - f.fetcher->context().size);
    writer->QueueMessage(&fbb, source_node_boot_uuid, end);

    VLOG(2) << "Wrote data as node " << FlatbufferToJson(node_)
            << " for channel "
            << configuration::CleanedChannelToString(f.fetcher->channel())
            << " to " << writer->filename() << " data "
            << FlatbufferToJson(
                   flatbuffers::GetSizePrefixedRoot<MessageHeader>(
                       fbb.GetBufferPointer()));
  }
}

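// Packs a delivery-timestamp-only record for the currently fetched message
// and queues it on the given timestamp writer, updating the writer's view of
// the remote boot first.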
void Logger::WriteTimestamps(NewDataWriter *timestamp_writer,
                             const FetcherStruct &f) {
  if (timestamp_writer != nullptr) {
    // And now handle timestamps.
    const auto start = event_loop_->monotonic_now();
    flatbuffers::FlatBufferBuilder fbb;
    fbb.ForceDefaults(true);

    fbb.FinishSizePrefixed(PackMessage(&fbb, f.fetcher->context(),
                                       f.channel_index,
                                       LogType::kLogDeliveryTimeOnly));
    const auto end = event_loop_->monotonic_now();
    RecordCreateMessageTime(start, end, f);

    // Tell our writer that we know something about the remote boot.
    timestamp_writer->UpdateRemote(
        f.data_node_index, f.fetcher->context().source_boot_uuid,
        f.fetcher->context().monotonic_remote_time,
        f.fetcher->context().monotonic_event_time, f.reliable_forwarding);
    timestamp_writer->QueueMessage(&fbb, event_loop_->boot_uuid(), end);

    VLOG(2) << "Wrote timestamps as node " << FlatbufferToJson(node_)
            << " for channel "
            << configuration::CleanedChannelToString(f.fetcher->channel())
            << " to " << timestamp_writer->filename() << " timestamp "
            << FlatbufferToJson(
                   flatbuffers::GetSizePrefixedRoot<MessageHeader>(
                       fbb.GetBufferPointer()));
  }
}

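// Copies the contents of a RemoteMessage timestamp channel (remote delivery
// timestamps for data sent from this node) into the log, translating channel
// indices into the logged configuration's numbering.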
void Logger::WriteContent(NewDataWriter *contents_writer,
                          const FetcherStruct &f) {
  if (contents_writer != nullptr) {
    const auto start = event_loop_->monotonic_now();
    // And now handle the special message contents channel.  Copy the
    // message into a FlatBufferBuilder and save it to disk.
    // TODO(austin): We can be more efficient here when we start to
    // care...
    flatbuffers::FlatBufferBuilder fbb;
    fbb.ForceDefaults(true);

    const RemoteMessage *msg =
        flatbuffers::GetRoot<RemoteMessage>(f.fetcher->context().data);

    CHECK(msg->has_boot_uuid()) << ": " << aos::FlatbufferToJson(msg);

    logger::MessageHeader::Builder message_header_builder(fbb);

    // TODO(austin): This needs to check the channel_index and confirm
    // that it should be logged before squirreling away the timestamp to
    // disk.  We don't want to log irrelevant timestamps.

    // Note: this must match the same order as MessageBridgeServer and
    // PackMessage.  We want identical headers to have identical
    // on-the-wire formats to make comparing them easier.

    // Translate from the channel index that the event loop uses to the
    // channel index in the log file.
    message_header_builder.add_channel_index(
        event_loop_to_logged_channel_index_[msg->channel_index()]);

    message_header_builder.add_queue_index(msg->queue_index());
    message_header_builder.add_monotonic_sent_time(msg->monotonic_sent_time());
    message_header_builder.add_realtime_sent_time(msg->realtime_sent_time());

    message_header_builder.add_monotonic_remote_time(
        msg->monotonic_remote_time());
    message_header_builder.add_realtime_remote_time(
        msg->realtime_remote_time());
    message_header_builder.add_remote_queue_index(msg->remote_queue_index());

    const aos::monotonic_clock::time_point monotonic_timestamp_time =
        f.fetcher->context().monotonic_event_time;
    message_header_builder.add_monotonic_timestamp_time(
        monotonic_timestamp_time.time_since_epoch().count());

    fbb.FinishSizePrefixed(message_header_builder.Finish());
    const auto end = event_loop_->monotonic_now();
    RecordCreateMessageTime(start, end, f);

    // Timestamps tell us information about what happened too!
    // Capture any reboots so UpdateRemote is properly recorded.
    contents_writer->UpdateBoot(UUID::FromVector(msg->boot_uuid()));

    // Start with recording info about the data flowing from our node to the
    // remote.
    const bool reliable =
        f.channel_reliable_contents.size() != 0u
            ? f.channel_reliable_contents[msg->channel_index()]
            : f.reliable_contents;

    contents_writer->UpdateRemote(
        node_index_, event_loop_->boot_uuid(),
        monotonic_clock::time_point(
            chrono::nanoseconds(msg->monotonic_remote_time())),
        monotonic_clock::time_point(
            chrono::nanoseconds(msg->monotonic_sent_time())),
        reliable, monotonic_timestamp_time);

    contents_writer->QueueMessage(
        &fbb, UUID::FromVector(msg->boot_uuid()), end);
  }
}

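// Writes the currently fetched message on f to whichever of the data,
// timestamp, and contents writers are active for that channel.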
void Logger::WriteFetchedRecord(FetcherStruct &f) {
  WriteData(f.writer, f);
  WriteTimestamps(f.timestamp_writer, f);
  WriteContent(f.contents_writer, f);
}

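// Drains every channel up to (but not including) time t, advancing
// last_synchronized_time_ to t.  Returns true if any fetcher still has a
// message at or past t queued up.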
bool Logger::LogUntil(monotonic_clock::time_point t) {
  bool has_pending_messages = false;

  // Grab the latest ServerStatistics message.  This will always have the
  // opportunity to be >= to the current time, so it will always represent any
  // reboots which may have happened.
  WriteMissingTimestamps();

  // Write each channel to disk, one at a time.
  for (FetcherStruct &f : fetchers_) {
    while (true) {
      if (f.written) {
        const auto start = event_loop_->monotonic_now();
        const bool got_new = f.fetcher->FetchNext();
        const auto end = event_loop_->monotonic_now();
        RecordFetchResult(start, end, got_new, &f);
        if (!got_new) {
          VLOG(2) << "No new data on "
                  << configuration::CleanedChannelToString(
                         f.fetcher->channel());
          break;
        }
        f.written = false;
      }

      // TODO(james): Write tests to exercise this logic.
      if (f.fetcher->context().monotonic_event_time >= t) {
        has_pending_messages = true;
        break;
      }

      WriteFetchedRecord(f);

      f.written = true;
    }
  }
  last_synchronized_time_ = t;

  return has_pending_messages;
}

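// Polls all channels up to end_time in polling_period_ sized steps so every
// write cycle stays a sync point, optionally running the on_logged_period_
// callback after each step.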
void Logger::DoLogData(const monotonic_clock::time_point end_time,
                       bool run_on_logged) {
  // We want to guarantee that messages aren't out of order by more than
  // max_out_of_order_duration.  To do this, we need sync points.  Every write
  // cycle should be a sync point.

  do {
    // Move the sync point up by at most polling_period.  This forces one sync
    // per iteration, even if it is small.
    LogUntil(std::min(last_synchronized_time_ + polling_period_, end_time));

    if (run_on_logged) {
      on_logged_period_();
    }

    // If we missed cycles, we could be pretty far behind.  Spin until we are
    // caught up.
  } while (last_synchronized_time_ + polling_period_ < end_time);
}

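// Accumulates timing statistics for a single fetch attempt, split between
// fetches that returned new data and those that did not.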
void Logger::RecordFetchResult(aos::monotonic_clock::time_point start,
                               aos::monotonic_clock::time_point end,
                               bool got_new, FetcherStruct *fetcher) {
  const auto duration = end - start;
  if (!got_new) {
    ++total_nop_fetch_count_;
    total_nop_fetch_time_ += duration;
    return;
  }
  ++total_message_fetch_count_;
  total_message_fetch_bytes_ += fetcher->fetcher->context().size;
  total_message_fetch_time_ += duration;
  if (duration > max_message_fetch_time_) {
    max_message_fetch_time_ = duration;
    max_message_fetch_time_channel_ = fetcher->channel_index;
    max_message_fetch_time_size_ = fetcher->fetcher->context().size;
  }
}

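// Accumulates timing statistics for packing a fetched message into its
// on-disk flatbuffer representation.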
void Logger::RecordCreateMessageTime(aos::monotonic_clock::time_point start,
                                     aos::monotonic_clock::time_point end,
                                     const FetcherStruct &fetcher) {
  const auto duration = end - start;
  total_copy_time_ += duration;
  ++total_copy_count_;
  total_copy_bytes_ += fetcher.fetcher->context().size;
  if (duration > max_copy_time_) {
    max_copy_time_ = duration;
    max_copy_time_channel_ = fetcher.channel_index;
    max_copy_time_size_ = fetcher.fetcher->context().size;
  }
}

}  // namespace logger
}  // namespace aos