#include "aos/events/logging/logger.h"

#include <fcntl.h>
#include <limits.h>
#include <sys/stat.h>
#include <sys/types.h>
#include <sys/uio.h>

#include <vector>

#include "Eigen/Dense"
#include "absl/strings/str_cat.h"
#include "absl/types/span.h"
#include "aos/events/event_loop.h"
#include "aos/events/logging/logger_generated.h"
#include "aos/flatbuffer_merge.h"
#include "aos/network/team_number.h"
#include "aos/time/time.h"
#include "flatbuffers/flatbuffers.h"
#include "gflags/gflags.h"
#include "glog/logging.h"

DEFINE_bool(skip_missing_forwarding_entries, false,
            "If true, drop any forwarding entries with missing data. If "
            "false, CHECK.");

DEFINE_bool(timestamps_to_csv, false,
            "If true, write all the time synchronization information to a set "
            "of CSV files in /tmp/. This should only be needed when debugging "
            "time synchronization.");
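
// Both flags are consumed through gflags at process startup, e.g.
// (hypothetical replay binary name):
//
//   log_replayer --timestamps_to_csv --skip_missing_forwarding_entries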

namespace aos {
namespace logger {

namespace chrono = std::chrono;

Logger::Logger(DetachedBufferWriter *writer, EventLoop *event_loop,
               std::chrono::milliseconds polling_period)
    : Logger(std::make_unique<LocalLogNamer>(writer, event_loop->node()),
             event_loop, polling_period) {}

Logger::Logger(std::unique_ptr<LogNamer> log_namer, EventLoop *event_loop,
               std::chrono::milliseconds polling_period)
    : event_loop_(event_loop),
      log_namer_(std::move(log_namer)),
      timer_handler_(event_loop_->AddTimer([this]() { DoLogData(); })),
      polling_period_(polling_period) {
  VLOG(1) << "Starting logger for " << FlatbufferToJson(event_loop_->node());
  int channel_index = 0;
  for (const Channel *channel : *event_loop_->configuration()->channels()) {
    FetcherStruct fs;
    const bool is_local =
        configuration::ChannelIsSendableOnNode(channel, event_loop_->node());

    const bool is_readable =
        configuration::ChannelIsReadableOnNode(channel, event_loop_->node());
    const bool log_message = configuration::ChannelMessageIsLoggedOnNode(
                                 channel, event_loop_->node()) &&
                             is_readable;

    const bool log_delivery_times =
        (event_loop_->node() == nullptr)
            ? false
            : configuration::ConnectionDeliveryTimeIsLoggedOnNode(
                  channel, event_loop_->node(), event_loop_->node());

    if (log_message || log_delivery_times) {
      fs.fetcher = event_loop->MakeRawFetcher(channel);
      VLOG(1) << "Logging channel "
              << configuration::CleanedChannelToString(channel);

      if (log_delivery_times) {
        VLOG(1) << "  Delivery times";
        fs.timestamp_writer = log_namer_->MakeTimestampWriter(channel);
      }
      if (log_message) {
        VLOG(1) << "  Data";
        fs.writer = log_namer_->MakeWriter(channel);
        if (!is_local) {
          fs.log_type = LogType::kLogRemoteMessage;
        }
      }
      fs.channel_index = channel_index;
      fs.written = false;
      fetchers_.emplace_back(std::move(fs));
    }
    ++channel_index;
  }

  // When things start, we want to log the header, then the most recent
  // messages available on each fetcher to capture the previous state, then
  // start polling.
  event_loop_->OnRun([this, polling_period]() {
    // Grab data from each channel right before we declare the log file
    // started so we can capture the latest message on each channel. This lets
    // us log the most recent state of non-periodic messages, such as
    // configuration, that were sent before startup.
    for (FetcherStruct &f : fetchers_) {
      f.written = !f.fetcher->Fetch();
    }

    // We need to pick a point in time to declare the log file "started". This
    // starts here. It needs to be after everything is fetched so that the
    // fetchers are all pointed at the most recent message before the start
    // time.
    monotonic_start_time_ = event_loop_->monotonic_now();
    realtime_start_time_ = event_loop_->realtime_now();
    last_synchronized_time_ = monotonic_start_time_;

    LOG(INFO) << "Logging node as " << FlatbufferToJson(event_loop_->node())
              << " start_time " << monotonic_start_time_;

    WriteHeader();

    timer_handler_->Setup(event_loop_->monotonic_now() + polling_period,
                          polling_period);
  });
}
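
// A minimal usage sketch (the event loop type and writer construction here
// are illustrative assumptions, not defined in this file):
//
//   aos::ShmEventLoop event_loop(&config.message());
//   aos::logger::DetachedBufferWriter writer("/tmp/fbs_log");
//   aos::logger::Logger logger(&writer, &event_loop,
//                              std::chrono::milliseconds(100));
//   event_loop.Run();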

// TODO(austin): Set the remote start time to the first time we see a remote
// message when we are logging those messages separately? Need to signal what
// to do, or how to get a good timestamp.
void Logger::WriteHeader() {
  for (const Node *node : log_namer_->nodes()) {
    WriteHeader(node);
  }
}

void Logger::WriteHeader(const Node *node) {
  // Now write the header with this timestamp in it.
  flatbuffers::FlatBufferBuilder fbb;
  fbb.ForceDefaults(true);

  flatbuffers::Offset<aos::Configuration> configuration_offset =
      CopyFlatBuffer(event_loop_->configuration(), &fbb);

  flatbuffers::Offset<flatbuffers::String> string_offset =
      fbb.CreateString(network::GetHostname());

  flatbuffers::Offset<Node> node_offset;
  if (node != nullptr) {
    node_offset = CopyFlatBuffer(node, &fbb);
  }

  aos::logger::LogFileHeader::Builder log_file_header_builder(fbb);

  log_file_header_builder.add_name(string_offset);

  // Only add the node if we are running in a multinode configuration.
  if (node != nullptr) {
    log_file_header_builder.add_node(node_offset);
  }

  log_file_header_builder.add_configuration(configuration_offset);
  // The worst case theoretical out of order is the polling period times 2.
  // One message could get logged right after the boundary, but be for right
  // before the next boundary. And the reverse could happen for another
  // message. Report back 3x to be extra safe, and because the cost isn't
  // huge on the read side.
  log_file_header_builder.add_max_out_of_order_duration(
      std::chrono::duration_cast<std::chrono::nanoseconds>(3 * polling_period_)
          .count());

  log_file_header_builder.add_monotonic_start_time(
      std::chrono::duration_cast<std::chrono::nanoseconds>(
          monotonic_start_time_.time_since_epoch())
          .count());
  log_file_header_builder.add_realtime_start_time(
      std::chrono::duration_cast<std::chrono::nanoseconds>(
          realtime_start_time_.time_since_epoch())
          .count());

  fbb.FinishSizePrefixed(log_file_header_builder.Finish());
  log_namer_->WriteHeader(&fbb, node);
}
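
// For example, with a 100 ms polling period the header advertises a
// max_out_of_order_duration of 300 ms: two messages fetched in adjacent
// polling cycles can straddle a sync boundary by up to one period in each
// direction (the theoretical 2x bound above), and the extra period is cheap
// read-side margin.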

void Logger::Rotate(DetachedBufferWriter *writer) {
  Rotate(std::make_unique<LocalLogNamer>(writer, event_loop_->node()));
}

void Logger::Rotate(std::unique_ptr<LogNamer> log_namer) {
  // Force data up until now to be written.
  DoLogData();

  // Swap the writer out, and re-write the header.
  log_namer_ = std::move(log_namer);

  // And then update the writers.
  for (FetcherStruct &f : fetchers_) {
    const Channel *channel =
        event_loop_->configuration()->channels()->Get(f.channel_index);
    if (f.timestamp_writer != nullptr) {
      f.timestamp_writer = log_namer_->MakeTimestampWriter(channel);
    }
    if (f.writer != nullptr) {
      f.writer = log_namer_->MakeWriter(channel);
    }
  }

  WriteHeader();
}
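
// Rotation swaps where future messages land without stopping the logger,
// e.g. (hypothetical writer construction):
//
//   aos::logger::DetachedBufferWriter next_writer("/tmp/fbs_log.part1");
//   logger.Rotate(&next_writer);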

void Logger::DoLogData() {
  // We want to guarantee that messages aren't out of order by more than
  // max_out_of_order_duration. To do this, we need sync points. Every write
  // cycle should be a sync point.
  const monotonic_clock::time_point monotonic_now =
      event_loop_->monotonic_now();

  do {
    // Move the sync point up by at most polling_period. This forces one sync
    // per iteration, even if it is small.
    last_synchronized_time_ =
        std::min(last_synchronized_time_ + polling_period_, monotonic_now);
    // Write each channel to disk, one at a time.
    for (FetcherStruct &f : fetchers_) {
      while (true) {
        if (f.written) {
          if (!f.fetcher->FetchNext()) {
            VLOG(2) << "No new data on "
                    << configuration::CleanedChannelToString(
                           f.fetcher->channel());
            break;
          } else {
            f.written = false;
          }
        }

        CHECK(!f.written);

        // TODO(james): Write tests to exercise this logic.
        if (f.fetcher->context().monotonic_event_time <
            last_synchronized_time_) {
          if (f.writer != nullptr) {
            // Write!
            flatbuffers::FlatBufferBuilder fbb(f.fetcher->context().size +
                                               max_header_size_);
            fbb.ForceDefaults(true);

            fbb.FinishSizePrefixed(PackMessage(&fbb, f.fetcher->context(),
                                               f.channel_index, f.log_type));

            VLOG(2) << "Writing data as node "
                    << FlatbufferToJson(event_loop_->node()) << " for channel "
                    << configuration::CleanedChannelToString(
                           f.fetcher->channel())
                    << " to " << f.writer->filename() << " data "
                    << FlatbufferToJson(
                           flatbuffers::GetSizePrefixedRoot<MessageHeader>(
                               fbb.GetBufferPointer()));

            max_header_size_ = std::max(
                max_header_size_, fbb.GetSize() - f.fetcher->context().size);
            f.writer->QueueSizedFlatbuffer(&fbb);
          }

          if (f.timestamp_writer != nullptr) {
            // And now handle timestamps.
            flatbuffers::FlatBufferBuilder fbb;
            fbb.ForceDefaults(true);

            fbb.FinishSizePrefixed(PackMessage(&fbb, f.fetcher->context(),
                                               f.channel_index,
                                               LogType::kLogDeliveryTimeOnly));

            VLOG(2) << "Writing timestamps as node "
                    << FlatbufferToJson(event_loop_->node()) << " for channel "
                    << configuration::CleanedChannelToString(
                           f.fetcher->channel())
                    << " to " << f.timestamp_writer->filename() << " timestamp "
                    << FlatbufferToJson(
                           flatbuffers::GetSizePrefixedRoot<MessageHeader>(
                               fbb.GetBufferPointer()));

            f.timestamp_writer->QueueSizedFlatbuffer(&fbb);
          }

          f.written = true;
        } else {
          break;
        }
      }
    }

    // If we missed cycles, we could be pretty far behind. Spin until we are
    // caught up.
  } while (last_synchronized_time_ + polling_period_ < monotonic_now);
}
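
// Note on DoLogData above: the do/while loop re-runs the fetch/write pass
// until last_synchronized_time_ catches up to the time the timer fired, so a
// late wakeup produces several small sync points instead of one large,
// badly-ordered batch.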

LogReader::LogReader(std::string_view filename,
                     const Configuration *replay_configuration)
    : LogReader(std::vector<std::string>{std::string(filename)},
                replay_configuration) {}

LogReader::LogReader(const std::vector<std::string> &filenames,
                     const Configuration *replay_configuration)
    : LogReader(std::vector<std::vector<std::string>>{filenames},
                replay_configuration) {}

LogReader::LogReader(const std::vector<std::vector<std::string>> &filenames,
                     const Configuration *replay_configuration)
    : filenames_(filenames),
      log_file_header_(ReadHeader(filenames[0][0])),
      replay_configuration_(replay_configuration) {
  MakeRemappedConfig();

  if (!configuration::MultiNode(configuration())) {
    states_.emplace_back(std::make_unique<State>());
    State *state = states_[0].get();

    state->channel_merger = std::make_unique<ChannelMerger>(filenames);
  } else {
    states_.resize(configuration()->nodes()->size());
  }
}

LogReader::~LogReader() {
  Deregister();
  if (offset_fp_ != nullptr) {
    fclose(offset_fp_);
  }
}

const Configuration *LogReader::logged_configuration() const {
  return log_file_header_.message().configuration();
}

const Configuration *LogReader::configuration() const {
  return remapped_configuration_;
}

std::vector<const Node *> LogReader::Nodes() const {
  // Because the Node pointer will only be valid if it actually points to
  // memory owned by remapped_configuration_, we need to wait for the
  // remapped_configuration_ to be populated before accessing it.
  //
  // Also note that whenever a map is changed, the nodes in here are
  // invalidated.
  CHECK(remapped_configuration_ != nullptr)
      << ": Need to call Register before the node() pointer will be valid.";
  return configuration::GetNodes(remapped_configuration_);
}

monotonic_clock::time_point LogReader::monotonic_start_time(const Node *node) {
  State *state =
      states_[configuration::GetNodeIndex(configuration(), node)].get();
  CHECK(state != nullptr) << ": Unknown node " << FlatbufferToJson(node);

  return state->channel_merger->monotonic_start_time();
}

realtime_clock::time_point LogReader::realtime_start_time(const Node *node) {
  State *state =
      states_[configuration::GetNodeIndex(configuration(), node)].get();
  CHECK(state != nullptr) << ": Unknown node " << FlatbufferToJson(node);

  return state->channel_merger->realtime_start_time();
}

void LogReader::Register() {
  event_loop_factory_unique_ptr_ =
      std::make_unique<SimulatedEventLoopFactory>(configuration());
  Register(event_loop_factory_unique_ptr_.get());
}
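
// A minimal replay sketch (assumes an accessor for the factory owned by
// event_loop_factory_unique_ptr_; names are illustrative):
//
//   aos::logger::LogReader reader("/tmp/fbs_log");
//   reader.Register();
//   reader.event_loop_factory()->Run();
//   reader.Deregister();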

void LogReader::Register(SimulatedEventLoopFactory *event_loop_factory) {
  event_loop_factory_ = event_loop_factory;

  for (const Node *node : configuration::GetNodes(configuration())) {
    const size_t node_index =
        configuration::GetNodeIndex(configuration(), node);
    states_[node_index] = std::make_unique<State>();
    State *state = states_[node_index].get();

    state->channel_merger = std::make_unique<ChannelMerger>(filenames_);

    state->node_event_loop_factory =
        event_loop_factory_->GetNodeEventLoopFactory(node);
    state->event_loop_unique_ptr =
        event_loop_factory->MakeEventLoop("log_reader", node);

    Register(state->event_loop_unique_ptr.get());
  }

  // We need to now seed our per-node time offsets and get everything set up
  // to run.
  const size_t num_nodes = !configuration::MultiNode(logged_configuration())
                               ? 1u
                               : logged_configuration()->nodes()->size();

  // It is easiest to solve for per node offsets with a matrix rather than
  // trying to solve the equations by hand. So let's get after it.
  //
  // Now, build up the map matrix.
  //
  // sample_matrix_ = map_matrix_ * offset_matrix_
  map_matrix_ = Eigen::MatrixXd::Zero(filters_.size() + 1, num_nodes);

  sample_matrix_ = Eigen::VectorXd::Zero(filters_.size() + 1);
  offset_matrix_ = Eigen::VectorXd::Zero(num_nodes);

  // And the base offset matrix, which will be a copy of the initial offset
  // matrix.
  base_offset_matrix_ =
      Eigen::Matrix<std::chrono::nanoseconds, Eigen::Dynamic, 1>::Zero(
          num_nodes);

  // All offsets should sum to 0. Add that as the first constraint in our
  // least squares.
  map_matrix_.row(0).setOnes();

  {
    // Now, add the a - b -> sample elements.
    size_t i = 1;
    for (std::pair<const std::tuple<const Node *, const Node *>,
                   message_bridge::ClippedAverageFilter> &filter : filters_) {
      const Node *const node_a = std::get<0>(filter.first);
      const Node *const node_b = std::get<1>(filter.first);

      const size_t node_a_index =
          configuration::GetNodeIndex(configuration(), node_a);
      const size_t node_b_index =
          configuration::GetNodeIndex(configuration(), node_b);

      // +a
      map_matrix_(i, node_a_index) = 1.0;
      // -b
      map_matrix_(i, node_b_index) = -1.0;

      // -> sample
      filter.second.set_sample_pointer(&sample_matrix_(i, 0));

      ++i;
    }
  }
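
  // Worked example of the shape above (illustrative): with two nodes and a
  // single filter between them,
  //
  //   map_matrix_ = [ 1   1 ]     sample_matrix_ = [   0   ]
  //                 [ 1  -1 ]                      [ a - b ]
  //
  // so solving sample_matrix_ = map_matrix_ * offset_matrix_ in a least
  // squares sense gives offset_a = (a - b) / 2 and offset_b = -(a - b) / 2,
  // which sum to zero as the first row demands.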

  // Rank of the map matrix tells you if all the nodes are in communication
  // with each other, which tells you if the offsets are observable.
  const size_t connected_nodes =
      Eigen::FullPivLU<Eigen::Matrix<double, Eigen::Dynamic, Eigen::Dynamic>>(
          map_matrix_)
          .rank();

  // We don't need to support isolated nodes until someone has a real use
  // case.
  CHECK_EQ(connected_nodes, num_nodes)
      << ": There is a node which isn't communicating with the rest.";

  // Now, iterate through all the timestamps from all the nodes and seed
  // everything.
  for (std::unique_ptr<State> &state : states_) {
    for (size_t i = 0; i < logged_configuration()->channels()->size(); ++i) {
      TimestampMerger::DeliveryTimestamp timestamp =
          state->channel_merger->OldestTimestampForChannel(i);
      if (timestamp.monotonic_event_time != monotonic_clock::min_time) {
        CHECK(state->MaybeUpdateTimestamp(timestamp, i));
      }
    }
  }

  // Make sure all the samples have been seeded. (sample_matrix_ is a column
  // vector, so walk its rows; iterating cols() would skip this check
  // entirely.)
  for (int i = 1; i < sample_matrix_.rows(); ++i) {
    // The seeding logic is pretty basic right now because we don't have great
    // use cases yet. It wants to see data from every node. Blow up for now,
    // and once we have a reason to do something different, update this logic.
    // Maybe read further in the log file? Or seed off the realtime time?
    CHECK_NE(sample_matrix_(i, 0), 0.0)
        << ": Sample " << i << " is not seeded.";
  }

  // And solve.
  offset_matrix_ = SolveOffsets();

  // Save off the base offsets so we can work in deltas from here out. That
  // will significantly simplify the numerical precision problems.
  for (size_t i = 0; i < num_nodes; ++i) {
    base_offset_matrix_(i, 0) =
        std::chrono::duration_cast<std::chrono::nanoseconds>(
            std::chrono::duration<double>(offset_matrix_(i, 0)));
  }

  {
    // Shift everything so we never could (reasonably) require the distributed
    // clock to have a large backwards jump in time. This makes it so the boot
    // time on the node up the longest will essentially start matching the
    // distributed clock.
    const chrono::nanoseconds offset = -base_offset_matrix_.maxCoeff();
    for (int i = 0; i < base_offset_matrix_.rows(); ++i) {
      base_offset_matrix_(i, 0) += offset;
    }
  }

  {
    // Re-compute the samples and setup all the filters so that they
    // subtract this base offset.

    size_t i = 1;
    for (std::pair<const std::tuple<const Node *, const Node *>,
                   message_bridge::ClippedAverageFilter> &filter : filters_) {
      CHECK(filter.second.sample_pointer() == &sample_matrix_(i, 0));

      const Node *const node_a = std::get<0>(filter.first);
      const Node *const node_b = std::get<1>(filter.first);

      const size_t node_a_index =
          configuration::GetNodeIndex(configuration(), node_a);
      const size_t node_b_index =
          configuration::GetNodeIndex(configuration(), node_b);

      filter.second.set_base_offset(base_offset_matrix_(node_a_index) -
                                    base_offset_matrix_(node_b_index));

      ++i;
    }
  }

  // Now, iterate again through all the offsets now that we have set the base
  // offset to something sane. This will seed everything with an accurate
  // initial offset.
  for (std::unique_ptr<State> &state : states_) {
    for (size_t i = 0; i < logged_configuration()->channels()->size(); ++i) {
      TimestampMerger::DeliveryTimestamp timestamp =
          state->channel_merger->OldestTimestampForChannel(i);
      if (timestamp.monotonic_event_time != monotonic_clock::min_time) {
        CHECK(state->MaybeUpdateTimestamp(timestamp, i));
      }
    }
  }

  UpdateOffsets();
  // We want to start the log file at the last start time of the log files
  // from all the nodes. Compute how long each node's simulation needs to run
  // to move time to this point.
  distributed_clock::time_point start_time = distributed_clock::min_time;

  for (std::unique_ptr<State> &state : states_) {
    // Setup the realtime clock to have something sane in it now.
    state->node_event_loop_factory->SetRealtimeOffset(
        state->channel_merger->monotonic_start_time(),
        state->channel_merger->realtime_start_time());
    // And, now that the clocks are sane, fold this node's start time into the
    // start time on the distributed clock.
    start_time = std::max(start_time,
                          state->node_event_loop_factory->ToDistributedClock(
                              state->channel_merger->monotonic_start_time()));
  }
  CHECK_GE(start_time, distributed_clock::epoch());

  // Forwarding is tracked per channel. If it is enabled, we want to turn it
  // off. Otherwise messages replayed will get forwarded across to the other
  // nodes, and also replayed on the other nodes. This may not satisfy all
  // our users, but it'll start the discussion.
  if (configuration::MultiNode(event_loop_factory_->configuration())) {
    for (size_t i = 0; i < logged_configuration()->channels()->size(); ++i) {
      const Channel *channel = logged_configuration()->channels()->Get(i);
      const Node *node = configuration::GetNode(
          configuration(), channel->source_node()->string_view());

      State *state =
          states_[configuration::GetNodeIndex(configuration(), node)].get();

      const Channel *remapped_channel =
          RemapChannel(state->event_loop, channel);

      event_loop_factory_->DisableForwarding(remapped_channel);
    }
  }

  // While we are starting the system up, we might be relying on matching data
  // to timestamps on log files where the timestamp log file starts before the
  // data. In this case, it is reasonable to expect missing data.
  ignore_missing_data_ = true;
  event_loop_factory_->RunFor(start_time.time_since_epoch());
  // Now that we are running for real, missing data means that the log file is
  // corrupted or went wrong.
  ignore_missing_data_ = false;
}

void LogReader::UpdateOffsets() {
  // TODO(austin): Evaluate less accurate inverses. We might be able to
  // do some tricks to keep the accuracy up.
  offset_matrix_ = SolveOffsets();

  size_t node_index = 0;
  for (std::unique_ptr<State> &state : states_) {
    state->node_event_loop_factory->SetDistributedOffset(offset(node_index));
    ++node_index;
  }
}

std::tuple<message_bridge::ClippedAverageFilter *, bool> LogReader::GetFilter(
    const Node *node_a, const Node *node_b) {
  CHECK_NE(node_a, node_b);
  CHECK_EQ(configuration::GetNode(configuration(), node_a), node_a);
  CHECK_EQ(configuration::GetNode(configuration(), node_b), node_b);

  if (node_a > node_b) {
    return std::make_tuple(std::get<0>(GetFilter(node_b, node_a)), false);
  }

  auto tuple = std::make_tuple(node_a, node_b);

  auto it = filters_.find(tuple);

  if (it == filters_.end()) {
    auto &x = filters_
                  .insert(std::make_pair(
                      tuple, message_bridge::ClippedAverageFilter()))
                  .first->second;
    if (FLAGS_timestamps_to_csv) {
      std::string fwd_name =
          absl::StrCat("/tmp/timestamp_", node_a->name()->string_view(), "_",
                       node_b->name()->string_view(), ".csv");
      x.fwd_fp = fopen(fwd_name.c_str(), "w");
      std::string rev_name =
          absl::StrCat("/tmp/timestamp_", node_b->name()->string_view(), "_",
                       node_a->name()->string_view(), ".csv");
      x.rev_fp = fopen(rev_name.c_str(), "w");
    }

    return std::make_tuple(&x, true);
  } else {
    return std::make_tuple(&(it->second), true);
  }
}
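
// GetFilter stores one ClippedAverageFilter per node pair, keyed in a
// canonical (pointer-ordered) direction. The bool in the returned tuple tells
// the caller whether its (a, b) query matched the stored forward direction,
// which is what routes samples to FwdSample vs RevSample in
// MaybeUpdateTimestamp below.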

bool LogReader::State::MaybeUpdateTimestamp(
    const TimestampMerger::DeliveryTimestamp &channel_timestamp,
    int channel_index) {
  if (channel_timestamp.monotonic_remote_time == monotonic_clock::min_time) {
    return false;
  }

  // Got a forwarding timestamp!
  CHECK(std::get<0>(filters[channel_index]) != nullptr);

  // Call the correct method depending on if we are the forward or reverse
  // direction here.
  if (std::get<1>(filters[channel_index])) {
    std::get<0>(filters[channel_index])
        ->FwdSample(channel_timestamp.monotonic_event_time,
                    channel_timestamp.monotonic_event_time -
                        channel_timestamp.monotonic_remote_time);
  } else {
    std::get<0>(filters[channel_index])
        ->RevSample(channel_timestamp.monotonic_event_time,
                    channel_timestamp.monotonic_event_time -
                        channel_timestamp.monotonic_remote_time);
  }
  return true;
}
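
// Note: the sample fed to each filter above is
//   monotonic_event_time (destination node) - monotonic_remote_time (source
//   node),
// a quantity that bundles the clock offset between the two nodes with the
// network latency of the forwarded message; the ClippedAverageFilter is
// responsible for smoothing that into a usable offset estimate.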

void LogReader::Register(EventLoop *event_loop) {
  State *state =
      states_[configuration::GetNodeIndex(configuration(), event_loop->node())]
          .get();

  state->event_loop = event_loop;

  // We don't run timing reports when trying to print out logged data, because
  // otherwise we would end up printing out the timing reports themselves...
  // This is only really relevant when we are replaying into a simulation.
  event_loop->SkipTimingReport();
  event_loop->SkipAosLog();

  state->channel_merger->SetNode(event_loop->node());

  state->channels.resize(logged_configuration()->channels()->size());
  state->filters.resize(state->channels.size());

  state->channel_target_event_loop_factory.resize(state->channels.size());

  for (size_t i = 0; i < state->channels.size(); ++i) {
    const Channel *channel =
        RemapChannel(event_loop, logged_configuration()->channels()->Get(i));

    state->channels[i] = event_loop->MakeRawSender(channel);

    state->filters[i] = std::make_tuple(nullptr, false);

    if (!configuration::ChannelIsSendableOnNode(channel, event_loop->node()) &&
        configuration::ChannelIsReadableOnNode(channel, event_loop->node())) {
      const Node *target_node = configuration::GetNode(
          event_loop->configuration(), channel->source_node()->string_view());
      state->filters[i] = GetFilter(event_loop->node(), target_node);

      if (event_loop_factory_ != nullptr) {
        state->channel_target_event_loop_factory[i] =
            event_loop_factory_->GetNodeEventLoopFactory(target_node);
      }
    }
  }

  state->timer_handler = event_loop->AddTimer([this, state]() {
    if (state->channel_merger->OldestMessage() == monotonic_clock::max_time) {
      --live_nodes_;
      if (live_nodes_ == 0) {
        event_loop_factory_->Exit();
      }
      return;
    }
    bool update_offsets = false;
    TimestampMerger::DeliveryTimestamp channel_timestamp;
    int channel_index;
    FlatbufferVector<MessageHeader> channel_data =
        FlatbufferVector<MessageHeader>::Empty();

    std::tie(channel_timestamp, channel_index, channel_data) =
        state->channel_merger->PopOldest();

    const monotonic_clock::time_point monotonic_now =
        state->event_loop->context().monotonic_event_time;
    CHECK(monotonic_now == channel_timestamp.monotonic_event_time)
        << ": " << FlatbufferToJson(state->event_loop->node()) << " Now "
        << monotonic_now << " trying to send "
        << channel_timestamp.monotonic_event_time << " failure "
        << state->channel_merger->DebugString();

    if (channel_timestamp.monotonic_event_time >
            state->channel_merger->monotonic_start_time() ||
        event_loop_factory_ != nullptr) {
      if ((!ignore_missing_data_ && !FLAGS_skip_missing_forwarding_entries &&
           !state->channel_merger->at_end()) ||
          channel_data.message().data() != nullptr) {
        CHECK(channel_data.message().data() != nullptr)
            << ": Got a message without data. Forwarding entry which was "
               "not matched? Use --skip_missing_forwarding_entries to ignore "
               "this.";

        if (state->MaybeUpdateTimestamp(channel_timestamp, channel_index)) {
          // Confirm that the message was sent on the sending node before the
          // destination node (this node). As a proxy, do this by making sure
          // that time on the source node is past when the message was sent.
          CHECK_LT(channel_timestamp.monotonic_remote_time,
                   state->channel_target_event_loop_factory[channel_index]
                       ->monotonic_now());

          update_offsets = true;

          if (FLAGS_timestamps_to_csv) {
            if (offset_fp_ == nullptr) {
              offset_fp_ = fopen("/tmp/offsets.csv", "w");
              fprintf(
                  offset_fp_,
                  "# time_since_start, offset node 0, offset node 1, ...\n");
              first_time_ = channel_timestamp.realtime_event_time;
            }

            fprintf(offset_fp_, "%.9f",
                    std::chrono::duration_cast<std::chrono::duration<double>>(
                        channel_timestamp.realtime_event_time - first_time_)
                        .count());
            for (int i = 0; i < base_offset_matrix_.rows(); ++i) {
              fprintf(
                  offset_fp_, ", %.9f",
                  offset_matrix_(i, 0) +
                      std::chrono::duration_cast<
                          std::chrono::duration<double>>(
                          base_offset_matrix_(i, 0))
                          .count());
            }
            fprintf(offset_fp_, "\n");
          }

        } else {
          CHECK(std::get<0>(state->filters[channel_index]) == nullptr);
        }

        // If we have access to the factory, use it to fix the realtime time.
        if (state->node_event_loop_factory != nullptr) {
          state->node_event_loop_factory->SetRealtimeOffset(
              channel_timestamp.monotonic_event_time,
              channel_timestamp.realtime_event_time);
        }

        state->channels[channel_index]->Send(
            channel_data.message().data()->Data(),
            channel_data.message().data()->size(),
            channel_timestamp.monotonic_remote_time,
            channel_timestamp.realtime_remote_time,
            channel_timestamp.remote_queue_index);
      } else if (state->channel_merger->at_end()) {
        // We are at the end of the log file and found missing data. Finish
        // reading the rest of the log file and call it quits. We don't want
        // to replay partial data.
        while (state->channel_merger->OldestMessage() !=
               monotonic_clock::max_time) {
          state->channel_merger->PopOldest();
        }
      }
    } else {
      LOG(WARNING)
          << "Not sending data from before the start of the log file. "
          << channel_timestamp.monotonic_event_time.time_since_epoch().count()
          << " start " << monotonic_start_time().time_since_epoch().count()
          << " " << FlatbufferToJson(channel_data);
    }

    const monotonic_clock::time_point next_time =
        state->channel_merger->OldestMessage();
    if (next_time != monotonic_clock::max_time) {
      state->timer_handler->Setup(next_time);
    } else {
      // Set a timer up immediately after now to die. If we don't do this,
      // then the senders waiting on the message we just read will never get
      // called.
      if (event_loop_factory_ != nullptr) {
        state->timer_handler->Setup(monotonic_now +
                                    event_loop_factory_->send_delay() +
                                    std::chrono::nanoseconds(1));
      }
    }

    // Once we make this call, the current time changes. So do everything
    // which involves time before changing it. That especially includes
    // sending the message.
    if (update_offsets) {
      UpdateOffsets();
    }
  });

  ++live_nodes_;

  if (state->channel_merger->OldestMessage() != monotonic_clock::max_time) {
    event_loop->OnRun([state]() {
      state->timer_handler->Setup(state->channel_merger->OldestMessage());
    });
  }
}

void LogReader::Deregister() {
  // Make sure that things get destroyed in the correct order, rather than
  // relying on getting the order correct in the class definition.
  for (std::unique_ptr<State> &state : states_) {
    for (size_t i = 0; i < state->channels.size(); ++i) {
      state->channels[i].reset();
    }
    state->event_loop_unique_ptr.reset();
    state->event_loop = nullptr;
    state->node_event_loop_factory = nullptr;
  }

  event_loop_factory_unique_ptr_.reset();
  event_loop_factory_ = nullptr;
}

void LogReader::RemapLoggedChannel(std::string_view name,
                                   std::string_view type,
                                   std::string_view add_prefix) {
  for (size_t ii = 0; ii < logged_configuration()->channels()->size(); ++ii) {
    const Channel *const channel = logged_configuration()->channels()->Get(ii);
    if (channel->name()->str() == name &&
        channel->type()->string_view() == type) {
      CHECK_EQ(0u, remapped_channels_.count(ii))
          << "Already remapped channel "
          << configuration::CleanedChannelToString(channel);
      remapped_channels_[ii] = std::string(add_prefix) + std::string(name);
      VLOG(1) << "Remapping channel "
              << configuration::CleanedChannelToString(channel)
              << " to have name " << remapped_channels_[ii];
      MakeRemappedConfig();
      return;
    }
  }
  LOG(FATAL) << "Unable to locate channel with name " << name << " and type "
             << type;
}
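
// A usage sketch (channel name and type here are hypothetical):
//
//   reader.RemapLoggedChannel("/camera", "aos.examples.Image", "/original");
//
// after which the logged "/camera" data replays on "/original/camera",
// leaving "/camera" free for live senders in the replayed configuration.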

void LogReader::MakeRemappedConfig() {
  for (std::unique_ptr<State> &state : states_) {
    CHECK(!state->event_loop)
        << ": Can't change the mapping after the events are scheduled.";
  }

  // If no remapping occurred and we are using the original config, then there
  // is nothing interesting to do here.
  if (remapped_channels_.empty() && replay_configuration_ == nullptr) {
    remapped_configuration_ = logged_configuration();
    return;
  }
  // Config to copy Channel definitions from. Use the specified
  // replay_configuration_ if it has been provided.
  const Configuration *const base_config = replay_configuration_ == nullptr
                                               ? logged_configuration()
                                               : replay_configuration_;
  // The remapped config will be identical to the base_config, except that it
  // will have a bunch of extra channels in the channel list, which are exact
  // copies of the remapped channels, but with different names.
  // Because the flatbuffers API is a pain to work with, this requires a bit
  // of a song-and-dance to get copied over.
  // The order of operations is to:
  // 1) Make a flatbuffer builder for a config that will just contain a list
  //    of the new channels that we want to add.
  // 2) For each channel that we are remapping:
  //    a) Make a buffer/builder and construct into it a Channel table that
  //       only contains the new name for the channel.
  //    b) Merge the new channel with just the name into the channel that we
  //       are trying to copy, built in the flatbuffer builder made in 1.
  //       This gives us the new channel definition that we need.
  // 3) Using this list of offsets, build the Configuration of just new
  //    Channels.
  // 4) Merge the Configuration with the new Channels into the base_config.
  // 5) Call MergeConfiguration() on that result to give MergeConfiguration a
  //    chance to sanitize the config.

  // This is the builder that we use for the config containing all the new
  // channels.
  flatbuffers::FlatBufferBuilder new_config_fbb;
  new_config_fbb.ForceDefaults(true);
  std::vector<flatbuffers::Offset<Channel>> channel_offsets;
  for (auto &pair : remapped_channels_) {
    // This is the builder that we use for creating the Channel with just the
    // new name.
    flatbuffers::FlatBufferBuilder new_name_fbb;
    new_name_fbb.ForceDefaults(true);
    const flatbuffers::Offset<flatbuffers::String> name_offset =
        new_name_fbb.CreateString(pair.second);
    ChannelBuilder new_name_builder(new_name_fbb);
    new_name_builder.add_name(name_offset);
    new_name_fbb.Finish(new_name_builder.Finish());
    const FlatbufferDetachedBuffer<Channel> new_name = new_name_fbb.Release();
    // Retrieve the channel that we want to copy, confirming that it is
    // actually present in base_config.
    const Channel *const base_channel =
        CHECK_NOTNULL(configuration::GetChannel(
            base_config, logged_configuration()->channels()->Get(pair.first),
            "", nullptr));
    // Actually create the new channel and put it into the vector of Offsets
    // that we will use to create the new Configuration.
    channel_offsets.emplace_back(MergeFlatBuffers<Channel>(
        reinterpret_cast<const flatbuffers::Table *>(base_channel),
        reinterpret_cast<const flatbuffers::Table *>(&new_name.message()),
        &new_config_fbb));
  }
  // Create the Configuration containing the new channels that we want to add.
  const auto new_name_vector_offsets =
      new_config_fbb.CreateVector(channel_offsets);
  ConfigurationBuilder new_config_builder(new_config_fbb);
  new_config_builder.add_channels(new_name_vector_offsets);
  new_config_fbb.Finish(new_config_builder.Finish());
  const FlatbufferDetachedBuffer<Configuration> new_name_config =
      new_config_fbb.Release();
  // Merge the new channels configuration into the base_config, giving us the
  // remapped configuration.
  remapped_configuration_buffer_ =
      std::make_unique<FlatbufferDetachedBuffer<Configuration>>(
          MergeFlatBuffers<Configuration>(base_config,
                                          &new_name_config.message()));
  // Call MergeConfiguration to deal with sanitizing the config.
  remapped_configuration_buffer_ =
      std::make_unique<FlatbufferDetachedBuffer<Configuration>>(
          configuration::MergeConfiguration(*remapped_configuration_buffer_));

  remapped_configuration_ = &remapped_configuration_buffer_->message();
}

const Channel *LogReader::RemapChannel(const EventLoop *event_loop,
                                       const Channel *channel) {
  std::string_view channel_name = channel->name()->string_view();
  std::string_view channel_type = channel->type()->string_view();
  const int channel_index =
      configuration::ChannelIndex(logged_configuration(), channel);
  // If the channel is remapped, find the correct channel name to use.
  if (remapped_channels_.count(channel_index) > 0) {
    VLOG(2) << "Got remapped channel on "
            << configuration::CleanedChannelToString(channel);
    channel_name = remapped_channels_[channel_index];
  }

  VLOG(1) << "Going to remap channel " << channel_name << " " << channel_type;
  const Channel *remapped_channel = configuration::GetChannel(
      event_loop->configuration(), channel_name, channel_type,
      event_loop->name(), event_loop->node());

  CHECK(remapped_channel != nullptr)
      << ": Unable to send {\"name\": \"" << channel_name << "\", \"type\": \""
      << channel_type << "\"} because it is not in the provided configuration.";

  return remapped_channel;
}

}  // namespace logger
}  // namespace aos