blob: e7d0d3b9040993f5526ebbb74c0018642da8b781 [file] [log] [blame]
James Kuszmaul38735e82019-12-07 16:42:06 -08001#include "aos/events/logging/logger.h"
Austin Schuhe309d2a2019-11-29 13:25:21 -08002
3#include <fcntl.h>
Austin Schuh4c4e0092019-12-22 16:18:03 -08004#include <limits.h>
Austin Schuhe309d2a2019-11-29 13:25:21 -08005#include <sys/stat.h>
6#include <sys/types.h>
7#include <sys/uio.h>
8#include <vector>
9
10#include "absl/strings/string_view.h"
11#include "absl/types/span.h"
12#include "aos/events/event_loop.h"
James Kuszmaul38735e82019-12-07 16:42:06 -080013#include "aos/events/logging/logger_generated.h"
Austin Schuhe309d2a2019-11-29 13:25:21 -080014#include "aos/flatbuffer_merge.h"
Austin Schuh288479d2019-12-18 19:47:52 -080015#include "aos/network/team_number.h"
Austin Schuhe309d2a2019-11-29 13:25:21 -080016#include "aos/time/time.h"
17#include "flatbuffers/flatbuffers.h"
18
Austin Schuh15649d62019-12-28 16:36:38 -080019DEFINE_bool(skip_missing_forwarding_entries, false,
20 "If true, drop any forwarding entries with missing data. If "
21 "false, CHECK.");
Austin Schuhe309d2a2019-11-29 13:25:21 -080022
23namespace aos {
24namespace logger {
25
26namespace chrono = std::chrono;
27
Austin Schuhe309d2a2019-11-29 13:25:21 -080028Logger::Logger(DetachedBufferWriter *writer, EventLoop *event_loop,
29 std::chrono::milliseconds polling_period)
30 : event_loop_(event_loop),
31 writer_(writer),
32 timer_handler_(event_loop_->AddTimer([this]() { DoLogData(); })),
33 polling_period_(polling_period) {
34 for (const Channel *channel : *event_loop_->configuration()->channels()) {
35 FetcherStruct fs;
Austin Schuh15649d62019-12-28 16:36:38 -080036 const bool is_readable =
37 configuration::ChannelIsReadableOnNode(channel, event_loop_->node());
38 const bool log_message = configuration::ChannelMessageIsLoggedOnNode(
39 channel, event_loop_->node()) &&
40 is_readable;
41
42 const bool log_delivery_times =
43 (event_loop_->node() == nullptr)
44 ? false
45 : configuration::ConnectionDeliveryTimeIsLoggedOnNode(
46 channel, event_loop_->node(), event_loop_->node());
47
48 if (log_message || log_delivery_times) {
49 fs.fetcher = event_loop->MakeRawFetcher(channel);
50 VLOG(1) << "Logging channel "
51 << configuration::CleanedChannelToString(channel);
52
53 if (log_delivery_times) {
54 if (log_message) {
55 VLOG(1) << " Logging message and delivery times";
56 fs.log_type = LogType::kLogMessageAndDeliveryTime;
57 } else {
58 VLOG(1) << " Logging delivery times only";
59 fs.log_type = LogType::kLogDeliveryTimeOnly;
60 }
61 } else {
62 // We don't have a particularly great use case right now for logging a
63 // forwarded message, but either not logging the delivery times, or
64 // logging them on another node. Fail rather than produce bad results.
65 CHECK(configuration::ChannelIsSendableOnNode(channel,
66 event_loop_->node()))
67 << ": Logger only knows how to log remote messages with "
68 "forwarding timestamps.";
69 VLOG(1) << " Logging message only";
70 fs.log_type = LogType::kLogMessage;
71 }
72 }
73
Austin Schuhe309d2a2019-11-29 13:25:21 -080074 fs.written = false;
75 fetchers_.emplace_back(std::move(fs));
76 }
77
78 // When things start, we want to log the header, then the most recent messages
79 // available on each fetcher to capture the previous state, then start
80 // polling.
81 event_loop_->OnRun([this, polling_period]() {
82 // Grab data from each channel right before we declare the log file started
83 // so we can capture the latest message on each channel. This lets us have
84 // non periodic messages with configuration that now get logged.
85 for (FetcherStruct &f : fetchers_) {
Austin Schuh15649d62019-12-28 16:36:38 -080086 if (f.fetcher.get() != nullptr) {
87 f.written = !f.fetcher->Fetch();
88 }
Austin Schuhe309d2a2019-11-29 13:25:21 -080089 }
90
91 // We need to pick a point in time to declare the log file "started". This
92 // starts here. It needs to be after everything is fetched so that the
93 // fetchers are all pointed at the most recent message before the start
94 // time.
95 const monotonic_clock::time_point monotonic_now =
96 event_loop_->monotonic_now();
97 const realtime_clock::time_point realtime_now = event_loop_->realtime_now();
98 last_synchronized_time_ = monotonic_now;
99
100 {
101 // Now write the header with this timestamp in it.
102 flatbuffers::FlatBufferBuilder fbb;
103 fbb.ForceDefaults(1);
104
105 flatbuffers::Offset<aos::Configuration> configuration_offset =
106 CopyFlatBuffer(event_loop_->configuration(), &fbb);
107
Austin Schuh288479d2019-12-18 19:47:52 -0800108 flatbuffers::Offset<flatbuffers::String> string_offset =
109 fbb.CreateString(network::GetHostname());
110
Austin Schuhfd960622020-01-01 13:22:55 -0800111 flatbuffers::Offset<Node> node_offset;
112 if (event_loop_->node() != nullptr) {
113 node_offset = CopyFlatBuffer(event_loop_->node(), &fbb);
114 }
Austin Schuh15649d62019-12-28 16:36:38 -0800115 LOG(INFO) << "Logging node as " << FlatbufferToJson(event_loop_->node());
116
Austin Schuhe309d2a2019-11-29 13:25:21 -0800117 aos::logger::LogFileHeader::Builder log_file_header_builder(fbb);
118
Austin Schuh288479d2019-12-18 19:47:52 -0800119 log_file_header_builder.add_name(string_offset);
120
Austin Schuhfd960622020-01-01 13:22:55 -0800121 // Only add the node if we are running in a multinode configuration.
122 if (event_loop_->node() != nullptr) {
123 log_file_header_builder.add_node(node_offset);
124 }
Austin Schuh15649d62019-12-28 16:36:38 -0800125
Austin Schuhe309d2a2019-11-29 13:25:21 -0800126 log_file_header_builder.add_configuration(configuration_offset);
127 // The worst case theoretical out of order is the polling period times 2.
128 // One message could get logged right after the boundary, but be for right
129 // before the next boundary. And the reverse could happen for another
130 // message. Report back 3x to be extra safe, and because the cost isn't
131 // huge on the read side.
132 log_file_header_builder.add_max_out_of_order_duration(
133 std::chrono::duration_cast<std::chrono::nanoseconds>(3 *
134 polling_period)
135 .count());
136
Austin Schuh629c9172019-12-23 20:34:43 -0800137 log_file_header_builder.add_monotonic_start_time(
Austin Schuhe309d2a2019-11-29 13:25:21 -0800138 std::chrono::duration_cast<std::chrono::nanoseconds>(
139 monotonic_now.time_since_epoch())
140 .count());
Austin Schuh629c9172019-12-23 20:34:43 -0800141 log_file_header_builder.add_realtime_start_time(
Austin Schuhe309d2a2019-11-29 13:25:21 -0800142 std::chrono::duration_cast<std::chrono::nanoseconds>(
143 realtime_now.time_since_epoch())
144 .count());
145
146 fbb.FinishSizePrefixed(log_file_header_builder.Finish());
147 writer_->QueueSizedFlatbuffer(&fbb);
148 }
149
150 timer_handler_->Setup(event_loop_->monotonic_now() + polling_period,
151 polling_period);
152 });
153}
154
155void Logger::DoLogData() {
156 // We want to guarentee that messages aren't out of order by more than
157 // max_out_of_order_duration. To do this, we need sync points. Every write
158 // cycle should be a sync point.
159 const monotonic_clock::time_point monotonic_now = monotonic_clock::now();
160
161 do {
162 // Move the sync point up by at most polling_period. This forces one sync
163 // per iteration, even if it is small.
164 last_synchronized_time_ =
165 std::min(last_synchronized_time_ + polling_period_, monotonic_now);
166 size_t channel_index = 0;
167 // Write each channel to disk, one at a time.
168 for (FetcherStruct &f : fetchers_) {
Austin Schuh15649d62019-12-28 16:36:38 -0800169 // Skip any channels which we aren't supposed to log.
170 if (f.fetcher.get() != nullptr) {
171 while (true) {
172 if (f.written) {
173 if (!f.fetcher->FetchNext()) {
174 VLOG(2) << "No new data on "
175 << configuration::CleanedChannelToString(
176 f.fetcher->channel());
177 break;
178 } else {
179 f.written = false;
180 }
Austin Schuhe309d2a2019-11-29 13:25:21 -0800181 }
Austin Schuhe309d2a2019-11-29 13:25:21 -0800182
Austin Schuh15649d62019-12-28 16:36:38 -0800183 CHECK(!f.written);
184
185 // TODO(james): Write tests to exercise this logic.
186 if (f.fetcher->context().monotonic_event_time <
187 last_synchronized_time_) {
188 // Write!
189 flatbuffers::FlatBufferBuilder fbb(f.fetcher->context().size +
190 max_header_size_);
191 fbb.ForceDefaults(1);
192
193 fbb.FinishSizePrefixed(PackMessage(&fbb, f.fetcher->context(),
194 channel_index, f.log_type));
195
196 VLOG(2) << "Writing data for channel "
197 << configuration::CleanedChannelToString(
198 f.fetcher->channel());
199
200 max_header_size_ = std::max(
201 max_header_size_, fbb.GetSize() - f.fetcher->context().size);
202 writer_->QueueSizedFlatbuffer(&fbb);
203
204 f.written = true;
Austin Schuhe309d2a2019-11-29 13:25:21 -0800205 } else {
Austin Schuh15649d62019-12-28 16:36:38 -0800206 break;
Austin Schuhe309d2a2019-11-29 13:25:21 -0800207 }
208 }
Austin Schuhe309d2a2019-11-29 13:25:21 -0800209 }
210
211 ++channel_index;
212 }
213
214 CHECK_EQ(channel_index, fetchers_.size());
215
216 // If we missed cycles, we could be pretty far behind. Spin until we are
217 // caught up.
218 } while (last_synchronized_time_ + polling_period_ < monotonic_now);
219
220 writer_->Flush();
221}
222
223LogReader::LogReader(absl::string_view filename)
224 : fd_(open(std::string(filename).c_str(), O_RDONLY | O_CLOEXEC)) {
225 PCHECK(fd_ != -1) << ": Failed to open " << filename;
226
227 // Make sure we have enough to read the size.
228 absl::Span<const uint8_t> config_data = ReadMessage();
229
230 // Make sure something was read.
231 CHECK(config_data != absl::Span<const uint8_t>());
232
233 // And copy the config so we have it forever.
234 configuration_ = std::vector<uint8_t>(config_data.begin(), config_data.end());
235
236 max_out_of_order_duration_ = std::chrono::nanoseconds(
237 flatbuffers::GetSizePrefixedRoot<LogFileHeader>(configuration_.data())
238 ->max_out_of_order_duration());
239
240 channels_.resize(configuration()->channels()->size());
241
242 QueueMessages();
243}
244
James Kuszmaul7daef362019-12-31 18:28:17 -0800245LogReader::~LogReader() {
246 CHECK(!event_loop_unique_ptr_) << "Did you remember to call Deregister?";
247}
248
Austin Schuhe309d2a2019-11-29 13:25:21 -0800249bool LogReader::ReadBlock() {
250 if (end_of_file_) {
251 return false;
252 }
253
254 // Appends 256k. This is enough that the read call is efficient. We don't
255 // want to spend too much time reading small chunks because the syscalls for
256 // that will be expensive.
257 constexpr size_t kReadSize = 256 * 1024;
258
259 // Strip off any unused data at the front.
260 if (consumed_data_ != 0) {
261 data_.erase(data_.begin(), data_.begin() + consumed_data_);
262 consumed_data_ = 0;
263 }
264
265 const size_t starting_size = data_.size();
266
267 // This should automatically grow the backing store. It won't shrink if we
268 // get a small chunk later. This reduces allocations when we want to append
269 // more data.
270 data_.resize(data_.size() + kReadSize);
271
272 ssize_t count = read(fd_, &data_[starting_size], kReadSize);
273 data_.resize(starting_size + std::max(count, static_cast<ssize_t>(0)));
274 if (count == 0) {
275 end_of_file_ = true;
276 return false;
277 }
278 PCHECK(count > 0);
279
280 return true;
281}
282
283bool LogReader::MessageAvailable() {
284 // Are we big enough to read the size?
285 if (data_.size() - consumed_data_ < sizeof(flatbuffers::uoffset_t)) {
286 return false;
287 }
288
289 // Then, are we big enough to read the full message?
290 const size_t data_size =
291 flatbuffers::GetPrefixedSize(data_.data() + consumed_data_) +
292 sizeof(flatbuffers::uoffset_t);
293 if (data_.size() < consumed_data_ + data_size) {
294 return false;
295 }
296
297 return true;
298}
299
300absl::Span<const uint8_t> LogReader::ReadMessage() {
301 // Make sure we have enough for the size.
302 if (data_.size() - consumed_data_ < sizeof(flatbuffers::uoffset_t)) {
303 if (!ReadBlock()) {
304 return absl::Span<const uint8_t>();
305 }
306 }
307
308 // Now make sure we have enough for the message.
309 const size_t data_size =
310 flatbuffers::GetPrefixedSize(data_.data() + consumed_data_) +
311 sizeof(flatbuffers::uoffset_t);
312 while (data_.size() < consumed_data_ + data_size) {
313 if (!ReadBlock()) {
314 return absl::Span<const uint8_t>();
315 }
316 }
317
318 // And return it, consuming the data.
319 const uint8_t *data_ptr = data_.data() + consumed_data_;
320
321 consumed_data_ += data_size;
322
323 return absl::Span<const uint8_t>(data_ptr, data_size);
324}
325
326void LogReader::QueueMessages() {
327 while (true) {
328 // Don't queue if we have enough data already.
329 // When a log file starts, there should be a message from each channel.
330 // Those messages might be very old. Make sure to read a chunk past the
331 // starting time.
332 if (channel_heap_.size() > 0 &&
333 newest_timestamp_ >
334 std::max(oldest_message().first, monotonic_start_time()) +
335 max_out_of_order_duration_) {
336 break;
337 }
338
339 absl::Span<const uint8_t> msg_data = ReadMessage();
340 if (msg_data == absl::Span<const uint8_t>()) {
341 break;
342 }
343
344 FlatbufferVector<MessageHeader> msg(std::vector<uint8_t>(
345 msg_data.begin() + sizeof(flatbuffers::uoffset_t), msg_data.end()));
346
347 EmplaceDataBack(std::move(msg));
348 }
349
350 queue_data_time_ = newest_timestamp_ - max_out_of_order_duration_;
351}
352
Austin Schuh15649d62019-12-28 16:36:38 -0800353const Configuration *LogReader::configuration() const {
Austin Schuhe309d2a2019-11-29 13:25:21 -0800354 return flatbuffers::GetSizePrefixedRoot<LogFileHeader>(configuration_.data())
355 ->configuration();
356}
357
Austin Schuh15649d62019-12-28 16:36:38 -0800358const Node *LogReader::node() const {
Austin Schuhfd960622020-01-01 13:22:55 -0800359 if (configuration()->has_nodes()) {
360 CHECK(flatbuffers::GetSizePrefixedRoot<LogFileHeader>(configuration_.data())
361 ->has_node());
362 CHECK(flatbuffers::GetSizePrefixedRoot<LogFileHeader>(configuration_.data())
363 ->node()
364 ->has_name());
365 return configuration::GetNode(
366 configuration(),
367 flatbuffers::GetSizePrefixedRoot<LogFileHeader>(configuration_.data())
368 ->node()
369 ->name()
370 ->string_view());
371 } else {
372 CHECK(
373 !flatbuffers::GetSizePrefixedRoot<LogFileHeader>(configuration_.data())
374 ->has_node());
375 return nullptr;
376 }
Austin Schuh15649d62019-12-28 16:36:38 -0800377}
378
Austin Schuhe309d2a2019-11-29 13:25:21 -0800379monotonic_clock::time_point LogReader::monotonic_start_time() {
380 return monotonic_clock::time_point(std::chrono::nanoseconds(
381 flatbuffers::GetSizePrefixedRoot<LogFileHeader>(configuration_.data())
Austin Schuh629c9172019-12-23 20:34:43 -0800382 ->monotonic_start_time()));
Austin Schuhe309d2a2019-11-29 13:25:21 -0800383}
384
385realtime_clock::time_point LogReader::realtime_start_time() {
386 return realtime_clock::time_point(std::chrono::nanoseconds(
387 flatbuffers::GetSizePrefixedRoot<LogFileHeader>(configuration_.data())
Austin Schuh629c9172019-12-23 20:34:43 -0800388 ->realtime_start_time()));
Austin Schuhe309d2a2019-11-29 13:25:21 -0800389}
390
Austin Schuh92547522019-12-28 14:33:43 -0800391void LogReader::Register(SimulatedEventLoopFactory *event_loop_factory) {
392 event_loop_unique_ptr_ = event_loop_factory->MakeEventLoop("log_reader");
393 event_loop_factory_ = event_loop_factory;
394 // We don't run timing reports when trying to print out logged data, because
395 // otherwise we would end up printing out the timing reports themselves...
396 // This is only really relevant when we are replaying into a simulation.
397 event_loop_unique_ptr_->SkipTimingReport();
398
399 Register(event_loop_unique_ptr_.get());
400 event_loop_factory_->RunFor(monotonic_start_time() -
401 event_loop_factory_->monotonic_now());
402}
403
Austin Schuhe309d2a2019-11-29 13:25:21 -0800404void LogReader::Register(EventLoop *event_loop) {
405 event_loop_ = event_loop;
406
Austin Schuh39788ff2019-12-01 18:22:57 -0800407 // Otherwise we replay the timing report and try to resend it...
408 event_loop_->SkipTimingReport();
409
Austin Schuhe309d2a2019-11-29 13:25:21 -0800410 for (size_t i = 0; i < channels_.size(); ++i) {
411 CHECK_EQ(configuration()->channels()->Get(i)->name(),
412 event_loop_->configuration()->channels()->Get(i)->name());
413 CHECK_EQ(configuration()->channels()->Get(i)->type(),
414 event_loop_->configuration()->channels()->Get(i)->type());
415
416 channels_[i].raw_sender = event_loop_->MakeRawSender(
417 event_loop_->configuration()->channels()->Get(i));
418 }
419
420 timer_handler_ = event_loop_->AddTimer([this]() {
421 std::pair<monotonic_clock::time_point, int> oldest_channel_index =
422 PopOldestChannel();
423 const monotonic_clock::time_point monotonic_now =
Austin Schuhad154822019-12-27 15:45:13 -0800424 event_loop_->context().monotonic_event_time;
Austin Schuhe309d2a2019-11-29 13:25:21 -0800425 CHECK(monotonic_now == oldest_channel_index.first)
426 << ": Now " << monotonic_now.time_since_epoch().count()
427 << " trying to send "
428 << oldest_channel_index.first.time_since_epoch().count();
429
430 struct LogReader::ChannelData &channel =
431 channels_[oldest_channel_index.second];
432
433 FlatbufferVector<MessageHeader> front = std::move(channel.front());
434
Austin Schuh15649d62019-12-28 16:36:38 -0800435 if (oldest_channel_index.first > monotonic_start_time() ||
436 event_loop_factory_ != nullptr) {
437 if (!FLAGS_skip_missing_forwarding_entries ||
438 front.message().data() != nullptr) {
439 CHECK(front.message().data() != nullptr)
440 << ": Got a message without data. Forwarding entry which was not "
441 "matched? Use --skip_missing_forwarding_entries to ignore "
442 "this.";
Austin Schuh92547522019-12-28 14:33:43 -0800443
Austin Schuh15649d62019-12-28 16:36:38 -0800444 // If we have access to the factory, use it to fix the realtime time.
445 if (event_loop_factory_ != nullptr) {
446 event_loop_factory_->SetRealtimeOffset(
447 monotonic_clock::time_point(
448 chrono::nanoseconds(front.message().monotonic_sent_time())),
449 realtime_clock::time_point(
450 chrono::nanoseconds(front.message().realtime_sent_time())));
451 }
452
453 channel.raw_sender->Send(
454 front.message().data()->Data(), front.message().data()->size(),
Austin Schuh92547522019-12-28 14:33:43 -0800455 monotonic_clock::time_point(
Austin Schuh15649d62019-12-28 16:36:38 -0800456 chrono::nanoseconds(front.message().monotonic_remote_time())),
Austin Schuh92547522019-12-28 14:33:43 -0800457 realtime_clock::time_point(
Austin Schuh15649d62019-12-28 16:36:38 -0800458 chrono::nanoseconds(front.message().realtime_remote_time())),
459 front.message().remote_queue_index());
Austin Schuh92547522019-12-28 14:33:43 -0800460 }
Austin Schuhe309d2a2019-11-29 13:25:21 -0800461 } else {
462 LOG(WARNING) << "Not sending data from before the start of the log file. "
463 << oldest_channel_index.first.time_since_epoch().count()
464 << " start "
465 << monotonic_start_time().time_since_epoch().count() << " "
466 << FlatbufferToJson(front);
467 }
468 channel.data.pop_front();
469
470 // Re-push it and update the oldest timestamp.
471 if (channel.data.size() != 0) {
472 const monotonic_clock::time_point timestamp = monotonic_clock::time_point(
473 chrono::nanoseconds(channel.front().message().monotonic_sent_time()));
474 PushChannelHeap(timestamp, oldest_channel_index.second);
475 channel.oldest_timestamp = timestamp;
476 } else {
477 channel.oldest_timestamp = monotonic_clock::min_time;
478 }
479
480 if (monotonic_now > queue_data_time_) {
481 QueueMessages();
482 }
483
484 if (channel_heap_.size() != 0) {
485 timer_handler_->Setup(oldest_message().first);
486 }
487 });
488
489 if (channel_heap_.size() > 0u) {
490 event_loop_->OnRun(
491 [this]() { timer_handler_->Setup(oldest_message().first); });
492 }
493}
494
495void LogReader::Deregister() {
496 for (size_t i = 0; i < channels_.size(); ++i) {
497 channels_[i].raw_sender.reset();
498 }
Austin Schuh92547522019-12-28 14:33:43 -0800499
500 event_loop_factory_ = nullptr;
501 event_loop_unique_ptr_.reset();
502 event_loop_ = nullptr;
Austin Schuhe309d2a2019-11-29 13:25:21 -0800503}
504
505void LogReader::EmplaceDataBack(FlatbufferVector<MessageHeader> &&new_data) {
506 const monotonic_clock::time_point timestamp = monotonic_clock::time_point(
507 chrono::nanoseconds(new_data.message().monotonic_sent_time()));
508 const size_t channel_index = new_data.message().channel_index();
509 CHECK_LT(channel_index, channels_.size());
510 newest_timestamp_ = std::max(newest_timestamp_, timestamp);
511 if (channels_[channel_index].data.size() == 0) {
512 channels_[channel_index].oldest_timestamp = timestamp;
513 PushChannelHeap(timestamp, channel_index);
514 }
515 channels_[channel_index].data.emplace_back(std::move(new_data));
516}
517
518namespace {
519
520bool ChannelHeapCompare(
521 const std::pair<monotonic_clock::time_point, int> first,
522 const std::pair<monotonic_clock::time_point, int> second) {
523 if (first.first > second.first) {
524 return true;
525 } else if (first.first == second.first) {
526 return first.second > second.second;
527 } else {
528 return false;
529 }
530}
531
532} // namespace
533
534void LogReader::PushChannelHeap(monotonic_clock::time_point timestamp,
535 int channel_index) {
536 channel_heap_.push_back(std::make_pair(timestamp, channel_index));
537
538 // The default sort puts the newest message first. Use a custom comparator to
539 // put the oldest message first.
540 std::push_heap(channel_heap_.begin(), channel_heap_.end(),
541 ChannelHeapCompare);
542}
543
544std::pair<monotonic_clock::time_point, int> LogReader::PopOldestChannel() {
545 std::pair<monotonic_clock::time_point, int> result = channel_heap_.front();
546 std::pop_heap(channel_heap_.begin(), channel_heap_.end(),
547 &ChannelHeapCompare);
548 channel_heap_.pop_back();
549 return result;
550}
551
552} // namespace logger
553} // namespace aos