Merge "Adding scale option to viewer, to help with bandwidth issues"
diff --git a/WORKSPACE b/WORKSPACE
index 8db3b72..c603581 100644
--- a/WORKSPACE
+++ b/WORKSPACE
@@ -587,16 +587,16 @@
http_archive(
name = "rsync",
build_file = "@//debian:rsync.BUILD",
- sha256 = "53be65a9214aaa6d1b9176f135184fb4a78ccefd58f95ce0da37e6a392dfeb60",
- url = "https://software.frc971.org/Build-Dependencies/rsync.tar.gz",
+ sha256 = "75ea8ce442c94fd12c0d00eb24860ac1de5ea6af56154bb1b195a96018c9e8a2",
+ url = "https://software.frc971.org/Build-Dependencies/rsync-2023.09.06.tar.gz",
)
# //debian:ssh
http_archive(
name = "ssh",
build_file = "@//debian:ssh.BUILD",
- sha256 = "470fdc1252a2133a9d3c3da778e892a5b88f04f402cb04d8eb1cff7853242034",
- url = "https://software.frc971.org/Build-Dependencies/ssh_v3.tar.gz",
+ sha256 = "9c4a9eefa605283486fb15a44ef9977d4a95b55c3a41c4e71cfbacd1cf20a4b5",
+ url = "https://software.frc971.org/Build-Dependencies/ssh-2023.09.06.tar.gz",
)
http_archive(
diff --git a/aos/BUILD b/aos/BUILD
index 34af003..acef762 100644
--- a/aos/BUILD
+++ b/aos/BUILD
@@ -198,6 +198,25 @@
visibility = ["//visibility:public"],
)
+autocxx_library(
+ name = "test_init_rs",
+ testonly = True,
+ srcs = ["test_init.rs"],
+ crate_name = "aos_test_init",
+ libs = [
+ "//aos/testing:tmpdir",
+ ],
+ override_cc_toolchain = "@llvm_toolchain//:cc-clang-x86_64-linux",
+ target_compatible_with = select({
+ "//conditions:default": ["//tools/platforms/rust:has_support"],
+ "//tools:has_msan": ["@platforms//:incompatible"],
+ }),
+ visibility = ["//visibility:public"],
+ deps = [
+ ":init_rs",
+ ],
+)
+
cc_library(
name = "realtime",
srcs = [
@@ -790,7 +809,18 @@
target_compatible_with = ["@platforms//os:linux"],
visibility = ["//visibility:public"],
deps = [
+ "//aos/util:file",
"@boringssl//:crypto",
"@com_google_absl//absl/types:span",
],
)
+
+cc_test(
+ name = "sha256_test",
+ srcs = ["sha256_test.cc"],
+ deps = [
+ ":sha256",
+ "//aos/testing:googletest",
+ "//aos/testing:tmpdir",
+ ],
+)
diff --git a/aos/config.bzl b/aos/config.bzl
index d281027..faaf45a 100644
--- a/aos/config.bzl
+++ b/aos/config.bzl
@@ -41,22 +41,32 @@
all_files = flatbuffers_depset.to_list() + src_depset.to_list()
ctx.actions.run(
- outputs = [config, stripped_config, binary_config],
+ outputs = [stripped_config, binary_config],
inputs = all_files,
arguments = ctx.attr.flags + [
- config.path,
- stripped_config.path,
- binary_config.path,
+ "--stripped_output=" + stripped_config.path,
+ "--binary_output=" + binary_config.path,
(ctx.label.workspace_root or ".") + "/" + ctx.files.src[0].short_path,
ctx.bin_dir.path,
] + [f.path for f in flatbuffers_depset.to_list()],
progress_message = "Flattening config",
executable = ctx.executable._config_flattener,
)
- runfiles = ctx.runfiles(files = [config, stripped_config, binary_config])
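+    # Generate the full config in its own, second action, presumably so the
+    # large full output is only built when something actually requests it.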
+ ctx.actions.run(
+ outputs = [config],
+ inputs = all_files,
+ arguments = ctx.attr.flags + [
+ "--full_output=" + config.path,
+ (ctx.label.workspace_root or ".") + "/" + ctx.files.src[0].short_path,
+ ctx.bin_dir.path,
+ ] + [f.path for f in flatbuffers_depset.to_list()],
+ progress_message = "Flattening config",
+ executable = ctx.executable._config_flattener,
+ )
+ runfiles = ctx.runfiles(files = [stripped_config, binary_config])
return [
DefaultInfo(
- files = depset([config, stripped_config, binary_config]),
+ files = depset([stripped_config, binary_config]),
runfiles = runfiles,
),
AosConfigInfo(
diff --git a/aos/config_flattener.cc b/aos/config_flattener.cc
index b089d24..8170a89 100644
--- a/aos/config_flattener.cc
+++ b/aos/config_flattener.cc
@@ -9,19 +9,25 @@
#include "aos/json_to_flatbuffer.h"
#include "aos/util/file.h"
+DEFINE_string(full_output, "",
+ "If provided, the path to write the full json configuration to.");
+DEFINE_string(
+ stripped_output, "",
+ "If provided, the path to write the stripped json configuration to.");
+DEFINE_string(
+ binary_output, "",
+ "If provided, the path to write the binary flatbuffer configuration to.");
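+
+// Example invocation (illustrative; file names are hypothetical):
+//   config_flattener --stripped_output=stripped.json --binary_output=config.bfbs \
+//       config.json bazel-out ping.bfbs pong.bfbs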
+
namespace aos {
int Main(int argc, char **argv) {
- CHECK_GE(argc, 6) << ": Too few arguments";
+ CHECK_GE(argc, 3) << ": Too few arguments";
- const char *full_output = argv[1];
- const char *stripped_output = argv[2];
- const char *binary_output = argv[3];
- const char *config_path = argv[4];
+ const char *config_path = argv[1];
// In order to support not only importing things by absolute path, but also
// importing the outputs of genrules (rather than just manually written
// files), we need to tell ReadConfig where the generated files directory is.
- const char *bazel_outs_directory = argv[5];
+ const char *bazel_outs_directory = argv[2];
VLOG(1) << "Reading " << config_path;
FlatbufferDetachedBuffer<Configuration> config =
@@ -38,7 +44,7 @@
std::vector<aos::FlatbufferVector<reflection::Schema>> schemas;
- for (int i = 6; i < argc; ++i) {
+ for (int i = 3; i < argc; ++i) {
schemas.emplace_back(FileToFlatbuffer<reflection::Schema>(argv[i]));
}
@@ -55,11 +61,17 @@
// TODO(austin): Figure out how to squash the schemas onto 1 line so it is
// easier to read?
VLOG(1) << "Flattened config is " << merged_config_json;
- util::WriteStringToFileOrDie(full_output, merged_config_json);
- util::WriteStringToFileOrDie(
- stripped_output,
- FlatbufferToJson(&config.message(), {.multi_line = true}));
- aos::WriteFlatbufferToFile(binary_output, merged_config);
+ if (!FLAGS_full_output.empty()) {
+ util::WriteStringToFileOrDie(FLAGS_full_output, merged_config_json);
+ }
+ if (!FLAGS_stripped_output.empty()) {
+ util::WriteStringToFileOrDie(
+ FLAGS_stripped_output,
+ FlatbufferToJson(&config.message(), {.multi_line = true}));
+ }
+ if (!FLAGS_binary_output.empty()) {
+ aos::WriteFlatbufferToFile(FLAGS_binary_output, merged_config);
+ }
return 0;
}
diff --git a/aos/configuration.cc b/aos/configuration.cc
index 70e7b48..78f9eb2 100644
--- a/aos/configuration.cc
+++ b/aos/configuration.cc
@@ -1144,6 +1144,13 @@
return nullptr;
}
+std::string_view NodeName(const Configuration *config, size_t node_index) {
+ if (!configuration::MultiNode(config)) {
+ return "(singlenode)";
+ }
+ return config->nodes()->Get(node_index)->name()->string_view();
+}
+
const Node *GetMyNode(const Configuration *config) {
const std::string hostname = (FLAGS_override_hostname.size() > 0)
? FLAGS_override_hostname
@@ -1641,9 +1648,10 @@
chrono::nanoseconds channel_storage_duration) {
// Use integer arithmetic and round up at all cost.
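+  // Worked example: 100 Hz with a 2 s channel_storage_duration yields
+  // (999'999'999 + 100 * 2'000'000'000) / 1'000'000'000 = 200 messages.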
return static_cast<int>(
- (999999999 + static_cast<int64_t>(frequency) *
- static_cast<int64_t>(channel_storage_duration.count())) /
- static_cast<int64_t>(1000000000));
+ (999'999'999 +
+ static_cast<int64_t>(frequency) *
+ static_cast<int64_t>(channel_storage_duration.count())) /
+ static_cast<int64_t>(1'000'000'000));
}
int QueueScratchBufferSize(const Channel *channel) {
diff --git a/aos/configuration.h b/aos/configuration.h
index e35c0bf..01b9e7c 100644
--- a/aos/configuration.h
+++ b/aos/configuration.h
@@ -128,6 +128,10 @@
const Node *GetNodeFromHostname(const Configuration *config,
std::string_view name);
+// Returns a printable name for the node: "(singlenode)" on a single-node
+// system, and the node's name otherwise.
+std::string_view NodeName(const Configuration *config, size_t node_index);
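+// For example (sketch): LOG(INFO) << "node: " << NodeName(config, node_index);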
+
// Returns a vector of the nodes in the config. (nullptr is considered the node
// in a single node world.)
std::vector<const Node *> GetNodes(const Configuration *config);
diff --git a/aos/events/BUILD b/aos/events/BUILD
index 174df38..c398e7b 100644
--- a/aos/events/BUILD
+++ b/aos/events/BUILD
@@ -460,6 +460,7 @@
"//aos:init",
"//aos:realtime",
"//aos/ipc_lib:lockless_queue",
+ "//aos/ipc_lib:shm_base",
"//aos/ipc_lib:signalfd",
"//aos/stl_mutex",
"//aos/util:phased_loop",
@@ -471,7 +472,7 @@
name = "shm_event_loop_test",
srcs = ["shm_event_loop_test.cc"],
flaky = True,
- shard_count = 24,
+ shard_count = 50,
target_compatible_with = ["@platforms//os:linux"],
deps = [
":event_loop_param_test",
@@ -622,7 +623,7 @@
rustc_flags = ["-Crelocation-model=static"],
deps = [
":ping_rust_fbs",
- "//aos:init_rs",
+ "//aos:test_init_rs",
"@crate_index//:futures",
"@rules_rust//tools/runfiles",
],
@@ -672,7 +673,7 @@
}),
deps = [
":ping_rust_fbs",
- "//aos:init_rs",
+ "//aos:test_init_rs",
"@crate_index//:futures",
"@rules_rust//tools/runfiles",
],
diff --git a/aos/events/event_loop.cc b/aos/events/event_loop.cc
index 072630f..ba71b97 100644
--- a/aos/events/event_loop.cc
+++ b/aos/events/event_loop.cc
@@ -35,6 +35,8 @@
}
} // namespace
+std::optional<std::string> EventLoop::default_version_string_;
+
std::pair<SharedSpan, absl::Span<uint8_t>> MakeSharedSpan(size_t size) {
AlignedOwningSpan *const span = reinterpret_cast<AlignedOwningSpan *>(
malloc(sizeof(AlignedOwningSpan) + size + kChannelDataAlignment - 1));
@@ -60,8 +62,12 @@
}
void RawSender::CheckOk(const RawSender::Error err) {
- CHECK_EQ(err, Error::kOk) << "Messages were sent too fast on channel: "
- << configuration::CleanedChannelToString(channel_);
+ if (err != Error::kOk) {
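+    // Get one final timing report out before crashing so the most recent
+    // statistics for this channel are captured.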
+ event_loop_->SendTimingReport();
+ CHECK_EQ(err, Error::kOk)
+ << "Messages were sent too fast on channel: "
+ << configuration::CleanedChannelToString(channel_);
+ }
}
RawSender::RawSender(EventLoop *event_loop, const Channel *channel)
@@ -146,7 +152,8 @@
PhasedLoopHandler::~PhasedLoopHandler() {}
EventLoop::EventLoop(const Configuration *configuration)
- : timing_report_(flatbuffers::DetachedBuffer()),
+ : version_string_(default_version_string_),
+ timing_report_(flatbuffers::DetachedBuffer()),
configuration_(configuration) {}
EventLoop::~EventLoop() {
@@ -472,8 +479,13 @@
flatbuffers::Offset<flatbuffers::String> name_offset =
fbb.CreateString(name());
+ const flatbuffers::Offset<flatbuffers::String> version_offset =
+ version_string_.has_value() ? fbb.CreateString(version_string_.value())
+ : flatbuffers::Offset<flatbuffers::String>();
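+  // A null offset is skipped by add_version() below, so the version field is
+  // simply omitted from the report when no version string is set.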
+
timing::Report::Builder report_builder(fbb);
report_builder.add_name(name_offset);
+ report_builder.add_version(version_offset);
report_builder.add_pid(GetTid());
if (timer_offsets.size() > 0) {
report_builder.add_timers(timers_offset);
@@ -642,6 +654,18 @@
cpu_set_t EventLoop::DefaultAffinity() { return aos::DefaultAffinity(); }
+void EventLoop::SetDefaultVersionString(std::string_view version) {
+ default_version_string_ = version;
+}
+
+void EventLoop::SetVersionString(std::string_view version) {
+ CHECK(!is_running())
+ << ": Can't do things that might alter the timing report while running.";
+ version_string_ = version;
+
+ UpdateTimingReport();
+}
+
void WatcherState::set_timing_report(timing::Watcher *watcher) {
watcher_ = watcher;
if (!watcher) {
diff --git a/aos/events/event_loop.fbs b/aos/events/event_loop.fbs
index c0aaf19..15c6149 100644
--- a/aos/events/event_loop.fbs
+++ b/aos/events/event_loop.fbs
@@ -95,6 +95,11 @@
// Identifier for the event loop. This should change every time a process
// gets restarted.
pid:int (id: 1);
+ // Version string associated with this application. This can be empty or
+ // anything (an actual version, a git sha, etc.). This provides a convenient
+ // way for applications to self-report their version in a way that gets
+ // logged.
+ version:string (id: 8);
// List of statistics for each watcher, sender, fetcher, timer, and
// phased loop.
diff --git a/aos/events/event_loop.h b/aos/events/event_loop.h
index 17e3e00..1ed4337 100644
--- a/aos/events/event_loop.h
+++ b/aos/events/event_loop.h
@@ -48,9 +48,16 @@
// Fetches the next message in the queue without blocking. Returns true if
// there was a new message and we got it.
bool FetchNext();
+ // Fetches the next message if there is one, and the provided function returns
+ // true. The data and buffer_index are the only pieces of the Context which
+ // are zeroed out. The function must be valid.
+ bool FetchNextIf(std::function<bool(const Context &context)> fn);
// Fetches the latest message without blocking.
bool Fetch();
+ // Fetches the latest message conditionally without blocking. fn must be
+ // valid.
+ bool FetchIf(std::function<bool(const Context &context)> fn);
// Returns the channel this fetcher uses.
const Channel *channel() const { return channel_; }
@@ -67,7 +74,11 @@
friend class EventLoop;
// Implementation
virtual std::pair<bool, monotonic_clock::time_point> DoFetchNext() = 0;
+ virtual std::pair<bool, monotonic_clock::time_point> DoFetchNextIf(
+ std::function<bool(const Context &)> fn) = 0;
virtual std::pair<bool, monotonic_clock::time_point> DoFetch() = 0;
+ virtual std::pair<bool, monotonic_clock::time_point> DoFetchIf(
+ std::function<bool(const Context &)> fn) = 0;
EventLoop *const event_loop_;
const Channel *const channel_;
@@ -241,6 +252,18 @@
return result;
}
+ // Fetches the next message if there is one, and the provided function returns
+ // true. The data and buffer_index are the only pieces of the Context which
+ // are zeroed out. The function must be valid.
+ bool FetchNextIf(std::function<bool(const Context &)> fn) {
+ const bool result = CHECK_NOTNULL(fetcher_)->FetchNextIf(std::move(fn));
+ if (result) {
+ CheckChannelDataAlignment(fetcher_->context().data,
+ fetcher_->context().size);
+ }
+ return result;
+ }
+
// Fetches the most recent message. Returns true if it fetched a new message.
// This will return the latest message regardless of if it was sent before or
// after the fetcher was created.
@@ -253,6 +276,18 @@
return result;
}
+ // Fetches the most recent message conditionally. Returns true if it fetched a
+ // new message. This will return the latest message regardless of if it was
+ // sent before or after the fetcher was created. The function must be valid.
+ bool FetchIf(std::function<bool(const Context &)> fn) {
+ const bool result = CHECK_NOTNULL(fetcher_)->FetchIf(std::move(fn));
+ if (result) {
+ CheckChannelDataAlignment(fetcher_->context().data,
+ fetcher_->context().size);
+ }
+ return result;
+ }
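+
+  // Usage sketch (illustrative; `cutoff` is a hypothetical time point):
+  //   fetcher.FetchIf([cutoff](const Context &context) {
+  //     return context.monotonic_event_time <= cutoff;
+  //   });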
+
// Returns a pointer to the contained flatbuffer, or nullptr if there is no
// available message.
const T *get() const {
@@ -773,6 +808,16 @@
// Returns the boot UUID.
virtual const UUID &boot_uuid() const = 0;
+  // Sets the version string that will be used in any newly constructed
+  // EventLoop objects. This can be overridden for individual EventLoops by
+  // calling EventLoop::SetVersionString(). The version string is populated
+  // into the timing report message. Makes a copy of the provided string_view.
+ static void SetDefaultVersionString(std::string_view version);
+
+ // Overrides the version string for this event loop. Makes a copy of the
+ // provided string_view.
+ void SetVersionString(std::string_view version);
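+
+  // For example (sketch; the version values are arbitrary):
+  //   EventLoop::SetDefaultVersionString("git:abc123");
+  //   event_loop->SetVersionString("git:abc123-dirty");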
+
protected:
// Sets the name of the event loop. This is the application name.
virtual void set_name(const std::string_view name) = 0;
@@ -863,6 +908,13 @@
private:
virtual pid_t GetTid() = 0;
+ // Default version string to be used in the timing report for any newly
+ // created EventLoop objects.
+ static std::optional<std::string> default_version_string_;
+
+ // Timing report version string for this event loop.
+ std::optional<std::string> version_string_;
+
FlatbufferDetachedBuffer<timing::Report> timing_report_;
::std::atomic<bool> is_running_{false};
diff --git a/aos/events/event_loop_param_test.cc b/aos/events/event_loop_param_test.cc
index 0559aae..c861284 100644
--- a/aos/events/event_loop_param_test.cc
+++ b/aos/events/event_loop_param_test.cc
@@ -319,6 +319,93 @@
}
}
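+// Returns a callback for use with Fetch{Next,}If which returns should_fetch
+// and, if called_count is non-null, counts how many times it is invoked.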
+std::function<bool(const Context &)> MakeShouldFetch(
+ bool should_fetch, size_t *called_count = nullptr) {
+ return [should_fetch, called_count](const Context &) {
+ if (called_count != nullptr) {
+ (*called_count)++;
+ }
+ return should_fetch;
+ };
+}
+
+// Tests that a fetcher using FetchIf can fetch from a sender.
+TEST_P(AbstractEventLoopTest, FetchIfWithoutRun) {
+ auto loop1 = Make();
+ auto loop2 = Make();
+ auto loop3 = MakePrimary();
+
+ auto sender = loop1->MakeSender<TestMessage>("/test");
+
+ auto fetcher = loop2->MakeFetcher<TestMessage>("/test");
+
+ for (const bool should_fetch : {true, false}) {
+ EXPECT_FALSE(fetcher.FetchIf(MakeShouldFetch(should_fetch)));
+ EXPECT_EQ(fetcher.get(), nullptr);
+
+ EXPECT_EQ(fetcher.context().monotonic_event_time,
+ monotonic_clock::min_time);
+ EXPECT_EQ(fetcher.context().monotonic_remote_time,
+ monotonic_clock::min_time);
+ EXPECT_EQ(fetcher.context().realtime_event_time, realtime_clock::min_time);
+ EXPECT_EQ(fetcher.context().realtime_remote_time, realtime_clock::min_time);
+ EXPECT_EQ(fetcher.context().source_boot_uuid, UUID::Zero());
+ EXPECT_EQ(fetcher.context().queue_index, 0xffffffffu);
+ EXPECT_EQ(fetcher.context().size, 0u);
+ EXPECT_EQ(fetcher.context().data, nullptr);
+ EXPECT_EQ(fetcher.context().buffer_index, -1);
+ }
+
+ aos::Sender<TestMessage>::Builder msg = sender.MakeBuilder();
+ TestMessage::Builder builder = msg.MakeBuilder<TestMessage>();
+ builder.add_value(200);
+ msg.CheckOk(msg.Send(builder.Finish()));
+
+ // Make sure failing to fetch won't affect anything.
+ EXPECT_FALSE(fetcher.FetchIf(MakeShouldFetch(false)));
+ EXPECT_EQ(fetcher.get(), nullptr);
+
+ EXPECT_EQ(fetcher.context().monotonic_event_time, monotonic_clock::min_time);
+ EXPECT_EQ(fetcher.context().monotonic_remote_time, monotonic_clock::min_time);
+ EXPECT_EQ(fetcher.context().realtime_event_time, realtime_clock::min_time);
+ EXPECT_EQ(fetcher.context().realtime_remote_time, realtime_clock::min_time);
+ EXPECT_EQ(fetcher.context().source_boot_uuid, UUID::Zero());
+ EXPECT_EQ(fetcher.context().queue_index, 0xffffffffu);
+ EXPECT_EQ(fetcher.context().size, 0u);
+ EXPECT_EQ(fetcher.context().data, nullptr);
+ EXPECT_EQ(fetcher.context().buffer_index, -1);
+
+ // And now confirm we succeed and everything gets set right.
+ EXPECT_TRUE(fetcher.FetchIf(MakeShouldFetch(true)));
+ ASSERT_FALSE(fetcher.get() == nullptr);
+ EXPECT_EQ(fetcher.get()->value(), 200);
+
+ const chrono::milliseconds kEpsilon(100);
+
+ const aos::monotonic_clock::time_point monotonic_now = loop2->monotonic_now();
+ const aos::realtime_clock::time_point realtime_now = loop2->realtime_now();
+ EXPECT_EQ(fetcher.context().monotonic_event_time,
+ fetcher.context().monotonic_remote_time);
+ EXPECT_EQ(fetcher.context().realtime_event_time,
+ fetcher.context().realtime_remote_time);
+
+ EXPECT_GE(fetcher.context().monotonic_event_time, monotonic_now - kEpsilon);
+ EXPECT_LE(fetcher.context().monotonic_event_time, monotonic_now + kEpsilon);
+ EXPECT_GE(fetcher.context().realtime_event_time, realtime_now - kEpsilon);
+ EXPECT_LE(fetcher.context().realtime_event_time, realtime_now + kEpsilon);
+ EXPECT_EQ(fetcher.context().source_boot_uuid, loop2->boot_uuid());
+ EXPECT_EQ(fetcher.context().queue_index, 0x0u);
+ EXPECT_EQ(fetcher.context().size, 20u);
+ EXPECT_NE(fetcher.context().data, nullptr);
+ if (read_method() == ReadMethod::PIN) {
+ EXPECT_GE(fetcher.context().buffer_index, 0);
+ EXPECT_LT(fetcher.context().buffer_index,
+ loop2->NumberBuffers(fetcher.channel()));
+ } else {
+ EXPECT_EQ(fetcher.context().buffer_index, -1);
+ }
+}
+
// Tests that watcher will receive all messages sent if they are sent after
// initialization and before running.
TEST_P(AbstractEventLoopTest, DoubleSendAtStartup) {
@@ -345,16 +432,10 @@
}
loop2->OnRun([&]() {
- {
+ for (int i = 200; i < 202; ++i) {
aos::Sender<TestMessage>::Builder msg = sender.MakeBuilder();
TestMessage::Builder builder = msg.MakeBuilder<TestMessage>();
- builder.add_value(200);
- msg.CheckOk(msg.Send(builder.Finish()));
- }
- {
- aos::Sender<TestMessage>::Builder msg = sender.MakeBuilder();
- TestMessage::Builder builder = msg.MakeBuilder<TestMessage>();
- builder.add_value(201);
+ builder.add_value(i);
msg.CheckOk(msg.Send(builder.Finish()));
}
});
@@ -374,16 +455,10 @@
::std::vector<int> values;
- {
+ for (int i = 200; i < 202; ++i) {
aos::Sender<TestMessage>::Builder msg = sender.MakeBuilder();
TestMessage::Builder builder = msg.MakeBuilder<TestMessage>();
- builder.add_value(200);
- msg.CheckOk(msg.Send(builder.Finish()));
- }
- {
- aos::Sender<TestMessage>::Builder msg = sender.MakeBuilder();
- TestMessage::Builder builder = msg.MakeBuilder<TestMessage>();
- builder.add_value(201);
+ builder.add_value(i);
msg.CheckOk(msg.Send(builder.Finish()));
}
@@ -412,16 +487,10 @@
::std::vector<int> values;
- {
+ for (int i = 200; i < 202; ++i) {
aos::Sender<TestMessage>::Builder msg = sender.MakeBuilder();
TestMessage::Builder builder = msg.MakeBuilder<TestMessage>();
- builder.add_value(200);
- msg.CheckOk(msg.Send(builder.Finish()));
- }
- {
- aos::Sender<TestMessage>::Builder msg = sender.MakeBuilder();
- TestMessage::Builder builder = msg.MakeBuilder<TestMessage>();
- builder.add_value(201);
+ builder.add_value(i);
msg.CheckOk(msg.Send(builder.Finish()));
}
@@ -451,16 +520,10 @@
::std::vector<int> values;
- {
+ for (int i = 200; i < 202; ++i) {
aos::Sender<TestMessage>::Builder msg = sender.MakeBuilder();
TestMessage::Builder builder = msg.MakeBuilder<TestMessage>();
- builder.add_value(200);
- msg.CheckOk(msg.Send(builder.Finish()));
- }
- {
- aos::Sender<TestMessage>::Builder msg = sender.MakeBuilder();
- TestMessage::Builder builder = msg.MakeBuilder<TestMessage>();
- builder.add_value(201);
+ builder.add_value(i);
msg.CheckOk(msg.Send(builder.Finish()));
}
@@ -483,6 +546,41 @@
EXPECT_THAT(0, values.size());
}
+// Tests that FetchNextIf sees no messages sent before the fetcher was
+// constructed.
+ auto loop1 = Make();
+ auto loop2 = MakePrimary();
+
+ auto sender = loop1->MakeSender<TestMessage>("/test");
+
+ ::std::vector<int> values;
+
+ for (int i = 200; i < 202; ++i) {
+ aos::Sender<TestMessage>::Builder msg = sender.MakeBuilder();
+ TestMessage::Builder builder = msg.MakeBuilder<TestMessage>();
+ builder.add_value(i);
+ msg.CheckOk(msg.Send(builder.Finish()));
+ }
+
+ auto fetcher = loop2->MakeFetcher<TestMessage>("/test");
+
+ // Add a timer to actually quit.
+ auto test_timer = loop2->AddTimer([&fetcher, &values, this]() {
+ while (fetcher.FetchNextIf(MakeShouldFetch(true))) {
+ values.push_back(fetcher.get()->value());
+ }
+ this->Exit();
+ });
+
+ loop2->OnRun([&test_timer, &loop2]() {
+ test_timer->Schedule(loop2->monotonic_now(),
+ ::std::chrono::milliseconds(100));
+ });
+
+ Run();
+ EXPECT_EQ(0, values.size());
+}
+
// Tests that Fetch returns the last message created before the loop was
// started.
TEST_P(AbstractEventLoopTest, FetchDataFromBeforeCreation) {
@@ -493,16 +591,10 @@
::std::vector<int> values;
- {
+ for (int i = 200; i < 202; ++i) {
aos::Sender<TestMessage>::Builder msg = sender.MakeBuilder();
TestMessage::Builder builder = msg.MakeBuilder<TestMessage>();
- builder.add_value(200);
- msg.CheckOk(msg.Send(builder.Finish()));
- }
- {
- aos::Sender<TestMessage>::Builder msg = sender.MakeBuilder();
- TestMessage::Builder builder = msg.MakeBuilder<TestMessage>();
- builder.add_value(201);
+ builder.add_value(i);
msg.CheckOk(msg.Send(builder.Finish()));
}
@@ -529,6 +621,50 @@
EXPECT_THAT(values, ::testing::ElementsAreArray({201}));
}
+// Tests that FetchIf returns the last message created before the loop was
+// started.
+TEST_P(AbstractEventLoopTest, FetchIfDataFromBeforeCreation) {
+ auto loop1 = Make();
+ auto loop2 = MakePrimary();
+
+ auto sender = loop1->MakeSender<TestMessage>("/test");
+
+ ::std::vector<int> values;
+
+ for (int i = 200; i < 202; ++i) {
+ aos::Sender<TestMessage>::Builder msg = sender.MakeBuilder();
+ TestMessage::Builder builder = msg.MakeBuilder<TestMessage>();
+ builder.add_value(i);
+ msg.CheckOk(msg.Send(builder.Finish()));
+ }
+
+ auto fetcher = loop2->MakeFetcher<TestMessage>("/test");
+
+ // Add a timer to actually quit.
+ auto test_timer = loop2->AddTimer([&fetcher, &values, this]() {
+ if (fetcher.FetchIf(MakeShouldFetch(true))) {
+ values.push_back(fetcher.get()->value());
+ }
+
+ if (fetcher.FetchIf(MakeShouldFetch(false))) {
+ values.push_back(fetcher.get()->value());
+ }
+ // Do it again to make sure we don't double fetch.
+ if (fetcher.FetchIf(MakeShouldFetch(true))) {
+ values.push_back(fetcher.get()->value());
+ }
+ this->Exit();
+ });
+
+ loop2->OnRun([&test_timer, &loop2]() {
+ test_timer->Schedule(loop2->monotonic_now(),
+ ::std::chrono::milliseconds(100));
+ });
+
+ Run();
+ EXPECT_THAT(values, ::testing::ElementsAreArray({201}));
+}
+
// Tests that timer handler is enabled after setup (even if it is in the past)
// and is disabled after running
TEST_P(AbstractEventLoopTest, CheckTimerDisabled) {
@@ -636,16 +772,10 @@
::std::vector<int> values;
- {
+ for (int i = 200; i < 202; ++i) {
aos::Sender<TestMessage>::Builder msg = sender.MakeBuilder();
TestMessage::Builder builder = msg.MakeBuilder<TestMessage>();
- builder.add_value(200);
- msg.CheckOk(msg.Send(builder.Finish()));
- }
- {
- aos::Sender<TestMessage>::Builder msg = sender.MakeBuilder();
- TestMessage::Builder builder = msg.MakeBuilder<TestMessage>();
- builder.add_value(201);
+ builder.add_value(i);
msg.CheckOk(msg.Send(builder.Finish()));
}
@@ -657,22 +787,10 @@
values.push_back(fetcher.get()->value());
}
- {
+ for (int i = 202; i < 205; ++i) {
aos::Sender<TestMessage>::Builder msg = sender.MakeBuilder();
TestMessage::Builder builder = msg.MakeBuilder<TestMessage>();
- builder.add_value(202);
- msg.CheckOk(msg.Send(builder.Finish()));
- }
- {
- aos::Sender<TestMessage>::Builder msg = sender.MakeBuilder();
- TestMessage::Builder builder = msg.MakeBuilder<TestMessage>();
- builder.add_value(203);
- msg.CheckOk(msg.Send(builder.Finish()));
- }
- {
- aos::Sender<TestMessage>::Builder msg = sender.MakeBuilder();
- TestMessage::Builder builder = msg.MakeBuilder<TestMessage>();
- builder.add_value(204);
+ builder.add_value(i);
msg.CheckOk(msg.Send(builder.Finish()));
}
@@ -696,6 +814,62 @@
EXPECT_THAT(values, ::testing::ElementsAreArray({201, 202, 204}));
}
+// Tests that Fetch{If,} and FetchNext{If,} interleave as expected.
+TEST_P(AbstractEventLoopTest, FetchAndFetchNextIfTogether) {
+ auto loop1 = Make();
+ auto loop2 = MakePrimary();
+
+ auto sender = loop1->MakeSender<TestMessage>("/test");
+
+ ::std::vector<int> values;
+
+ for (int i = 200; i < 202; ++i) {
+ aos::Sender<TestMessage>::Builder msg = sender.MakeBuilder();
+ TestMessage::Builder builder = msg.MakeBuilder<TestMessage>();
+ builder.add_value(i);
+ msg.CheckOk(msg.Send(builder.Finish()));
+ }
+
+ auto fetcher = loop2->MakeFetcher<TestMessage>("/test");
+
+ // Add a timer to actually quit.
+ auto test_timer = loop2->AddTimer([&fetcher, &values, &sender, this]() {
+ if (fetcher.Fetch()) {
+ values.push_back(fetcher.get()->value());
+ }
+
+ for (int i = 202; i < 205; ++i) {
+ aos::Sender<TestMessage>::Builder msg = sender.MakeBuilder();
+ TestMessage::Builder builder = msg.MakeBuilder<TestMessage>();
+ builder.add_value(i);
+ msg.CheckOk(msg.Send(builder.Finish()));
+ }
+
+ EXPECT_FALSE(fetcher.FetchNextIf(MakeShouldFetch(false)));
+
+ if (fetcher.FetchNext()) {
+ values.push_back(fetcher.get()->value());
+ }
+
+ EXPECT_FALSE(fetcher.FetchNextIf(MakeShouldFetch(false)));
+ EXPECT_FALSE(fetcher.FetchIf(MakeShouldFetch(false)));
+
+ if (fetcher.FetchIf(MakeShouldFetch(true))) {
+ values.push_back(fetcher.get()->value());
+ }
+
+ this->Exit();
+ });
+
+ loop2->OnRun([&test_timer, &loop2]() {
+ test_timer->Schedule(loop2->monotonic_now(),
+ ::std::chrono::milliseconds(100));
+ });
+
+ Run();
+ EXPECT_THAT(values, ::testing::ElementsAreArray({201, 202, 204}));
+}
+
// Tests that FetchNext behaves correctly when we get two messages in the queue
// but don't consume the first until after the second has been sent.
TEST_P(AbstractEventLoopTest, FetchNextTest) {
@@ -732,6 +906,49 @@
EXPECT_EQ(200, fetcher.get()->value());
}
+// Tests that FetchNextIf behaves correctly when we get two messages in the
+// queue but don't consume the first until after the second has been sent.
+TEST_P(AbstractEventLoopTest, FetchNextIfTest) {
+ auto send_loop = Make();
+ auto fetch_loop = Make();
+ auto sender = send_loop->MakeSender<TestMessage>("/test");
+ Fetcher<TestMessage> fetcher = fetch_loop->MakeFetcher<TestMessage>("/test");
+
+ {
+ aos::Sender<TestMessage>::Builder msg = sender.MakeBuilder();
+ TestMessage::Builder builder = msg.MakeBuilder<TestMessage>();
+ builder.add_value(100);
+ msg.CheckOk(msg.Send(builder.Finish()));
+ }
+
+ {
+ aos::Sender<TestMessage>::Builder msg = sender.MakeBuilder();
+ TestMessage::Builder builder = msg.MakeBuilder<TestMessage>();
+ builder.add_value(200);
+ msg.CheckOk(msg.Send(builder.Finish()));
+ }
+
+ size_t called_count = 0;
+ ASSERT_TRUE(fetcher.FetchNextIf(MakeShouldFetch(true, &called_count)));
+ ASSERT_NE(nullptr, fetcher.get());
+ EXPECT_EQ(100, fetcher.get()->value());
+ EXPECT_EQ(called_count, 1u);
+
+ ASSERT_FALSE(fetcher.FetchNextIf(MakeShouldFetch(false, &called_count)));
+ EXPECT_EQ(called_count, 2u);
+
+ ASSERT_TRUE(fetcher.FetchNextIf(MakeShouldFetch(true, &called_count)));
+ ASSERT_NE(nullptr, fetcher.get());
+ EXPECT_EQ(200, fetcher.get()->value());
+ EXPECT_EQ(called_count, 3u);
+
+ // When we run off the end of the queue, expect to still have the old message:
+ ASSERT_FALSE(fetcher.FetchNextIf(MakeShouldFetch(false, &called_count)));
+ EXPECT_EQ(called_count, 3u);
+ ASSERT_NE(nullptr, fetcher.get());
+ EXPECT_EQ(200, fetcher.get()->value());
+}
+
// Verify that a fetcher still holds its data, even after falling behind.
TEST_P(AbstractEventLoopTest, FetcherBehindData) {
auto send_loop = Make();
@@ -1271,6 +1488,84 @@
}
}
+// Tests that setting a default version string results in it getting populated
+// correctly.
+TEST_P(AbstractEventLoopTest, DefaultVersionStringInTimingReport) {
+ gflags::FlagSaver flag_saver;
+ FLAGS_timing_report_ms = 1000;
+
+ EventLoop::SetDefaultVersionString("default_version_string");
+
+ auto loop = MakePrimary();
+
+ Fetcher<timing::Report> report_fetcher =
+ loop->MakeFetcher<timing::Report>("/aos");
+
+ TimerHandler *exit_timer = loop->AddTimer([this]() { Exit(); });
+ loop->OnRun([exit_timer, &loop, &report_fetcher]() {
+ report_fetcher.Fetch();
+ exit_timer->Schedule(loop->monotonic_now() + std::chrono::seconds(2));
+ });
+
+ Run();
+
+ bool found_primary_report = false;
+ while (report_fetcher.FetchNext()) {
+ if (report_fetcher->name()->string_view() == "primary") {
+ found_primary_report = true;
+ EXPECT_EQ("default_version_string",
+ report_fetcher->version()->string_view());
+ } else {
+ FAIL() << report_fetcher->name()->string_view();
+ }
+ }
+
+ if (do_timing_reports() == DoTimingReports::kYes) {
+ EXPECT_TRUE(found_primary_report);
+ } else {
+ EXPECT_FALSE(found_primary_report);
+ }
+}
+
+// Tests that overriding the default version string results in it getting
+// populated correctly.
+TEST_P(AbstractEventLoopTest, OverrideVersionStringInTimingReport) {
+ gflags::FlagSaver flag_saver;
+ FLAGS_timing_report_ms = 1000;
+
+ EventLoop::SetDefaultVersionString("default_version_string");
+
+ auto loop = MakePrimary();
+ loop->SetVersionString("override_version");
+
+ Fetcher<timing::Report> report_fetcher =
+ loop->MakeFetcher<timing::Report>("/aos");
+
+ TimerHandler *exit_timer = loop->AddTimer([this]() { Exit(); });
+ loop->OnRun([exit_timer, &loop, &report_fetcher]() {
+ report_fetcher.Fetch();
+ exit_timer->Schedule(loop->monotonic_now() + std::chrono::seconds(2));
+ });
+
+ Run();
+
+ bool found_primary_report = false;
+ while (report_fetcher.FetchNext()) {
+ if (report_fetcher->name()->string_view() == "primary") {
+ found_primary_report = true;
+ EXPECT_EQ("override_version", report_fetcher->version()->string_view());
+ } else {
+ FAIL() << report_fetcher->name()->string_view();
+ }
+ }
+
+ if (do_timing_reports() == DoTimingReports::kYes) {
+ EXPECT_TRUE(found_primary_report);
+ } else {
+ EXPECT_FALSE(found_primary_report);
+ }
+}
+
// Verify that we can change a timer's parameters during execution.
TEST_P(AbstractEventLoopTest, TimerChangeParameters) {
auto loop = MakePrimary();
@@ -3192,6 +3487,19 @@
"May only send the buffer detached from this Sender");
}
+// Tests that setting the version string while the event loop is running fails.
+TEST_P(AbstractEventLoopDeathTest, SetVersionWhileRunning) {
+ auto loop1 = MakePrimary();
+
+ loop1->OnRun([&loop1, this]() {
+ EXPECT_DEATH({ loop1->SetVersionString("abcdef"); },
+ "timing report while running");
+ Exit();
+ });
+
+ Run();
+}
+
int TestChannelFrequency(EventLoop *event_loop) {
return event_loop->GetChannel<TestMessage>("/test")->frequency();
}
diff --git a/aos/events/event_loop_tmpl.h b/aos/events/event_loop_tmpl.h
index 6c132b2..9a5fe19 100644
--- a/aos/events/event_loop_tmpl.h
+++ b/aos/events/event_loop_tmpl.h
@@ -102,6 +102,37 @@
return false;
}
+inline bool RawFetcher::FetchNextIf(std::function<bool(const Context &)> fn) {
+ DCHECK(fn);
+ const auto result = DoFetchNextIf(std::move(fn));
+ if (result.first) {
+ if (timing_.fetcher) {
+ timing_.fetcher->mutate_count(timing_.fetcher->count() + 1);
+ }
+ const monotonic_clock::time_point monotonic_time = result.second;
+ ftrace_.FormatMessage(
+ "%.*s: fetch next if: now=%" PRId64 " event=%" PRId64 " queue=%" PRIu32,
+ static_cast<int>(ftrace_prefix_.size()), ftrace_prefix_.data(),
+ static_cast<int64_t>(monotonic_time.time_since_epoch().count()),
+ static_cast<int64_t>(
+ context_.monotonic_event_time.time_since_epoch().count()),
+ context_.queue_index);
+ const float latency =
+ std::chrono::duration_cast<std::chrono::duration<float>>(
+ monotonic_time - context_.monotonic_event_time)
+ .count();
+ timing_.latency.Add(latency);
+ return true;
+ }
+ ftrace_.FormatMessage(
+ "%.*s: fetch next: still event=%" PRId64 " queue=%" PRIu32,
+ static_cast<int>(ftrace_prefix_.size()), ftrace_prefix_.data(),
+ static_cast<int64_t>(
+ context_.monotonic_event_time.time_since_epoch().count()),
+ context_.queue_index);
+ return false;
+}
+
inline bool RawFetcher::Fetch() {
const auto result = DoFetch();
if (result.first) {
@@ -132,6 +163,38 @@
return false;
}
+inline bool RawFetcher::FetchIf(std::function<bool(const Context &)> fn) {
+ DCHECK(fn);
+
+ const auto result = DoFetchIf(std::move(fn));
+ if (result.first) {
+ if (timing_.fetcher) {
+ timing_.fetcher->mutate_count(timing_.fetcher->count() + 1);
+ }
+ const monotonic_clock::time_point monotonic_time = result.second;
+ ftrace_.FormatMessage(
+ "%.*s: fetch latest: now=%" PRId64 " event=%" PRId64 " queue=%" PRIu32,
+ static_cast<int>(ftrace_prefix_.size()), ftrace_prefix_.data(),
+ static_cast<int64_t>(monotonic_time.time_since_epoch().count()),
+ static_cast<int64_t>(
+ context_.monotonic_event_time.time_since_epoch().count()),
+ context_.queue_index);
+ const float latency =
+ std::chrono::duration_cast<std::chrono::duration<float>>(
+ monotonic_time - context_.monotonic_event_time)
+ .count();
+ timing_.latency.Add(latency);
+ return true;
+ }
+ ftrace_.FormatMessage(
+ "%.*s: fetch latest: still event=%" PRId64 " queue=%" PRIu32,
+ static_cast<int>(ftrace_prefix_.size()), ftrace_prefix_.data(),
+ static_cast<int64_t>(
+ context_.monotonic_event_time.time_since_epoch().count()),
+ context_.queue_index);
+ return false;
+}
+
inline RawSender::Error RawSender::Send(size_t size) {
return Send(size, monotonic_clock::min_time, realtime_clock::min_time,
0xffffffffu, event_loop_->boot_uuid());
diff --git a/aos/events/event_scheduler.cc b/aos/events/event_scheduler.cc
index 115b2c7..ae50917 100644
--- a/aos/events/event_scheduler.cc
+++ b/aos/events/event_scheduler.cc
@@ -459,6 +459,10 @@
}
void EventSchedulerScheduler::TemporarilyStopAndRun(std::function<void()> fn) {
+ if (in_on_run_) {
+ LOG(FATAL)
+ << "Can't call AllowApplicationCreationDuring from an OnRun callback.";
+ }
const bool was_running = is_running_;
if (is_running_) {
is_running_ = false;
@@ -475,11 +479,13 @@
for (EventScheduler *scheduler : schedulers_) {
scheduler->MaybeRunOnStartup();
}
+ in_on_run_ = true;
// We must trigger all the OnRun's *after* all the OnStartup callbacks are
// triggered because that is the contract that we have stated.
for (EventScheduler *scheduler : schedulers_) {
scheduler->MaybeRunOnRun();
}
+ in_on_run_ = false;
}
} // namespace aos
diff --git a/aos/events/event_scheduler.h b/aos/events/event_scheduler.h
index 848a1cf..55fa9a4 100644
--- a/aos/events/event_scheduler.h
+++ b/aos/events/event_scheduler.h
@@ -357,6 +357,8 @@
// True if we are running.
bool is_running_ = false;
+
+ bool in_on_run_ = false;
// The current time.
distributed_clock::time_point now_ = distributed_clock::epoch();
// List of schedulers to run in sync.
diff --git a/aos/events/logging/BUILD b/aos/events/logging/BUILD
index 2516e6c..c33606c 100644
--- a/aos/events/logging/BUILD
+++ b/aos/events/logging/BUILD
@@ -28,14 +28,31 @@
)
flatbuffer_cc_library(
+ name = "log_replayer_stats_fbs",
+ srcs = ["log_replayer_stats.fbs"],
+ gen_reflections = True,
+ target_compatible_with = ["@platforms//os:linux"],
+ deps = [
+ ":replay_config_fbs",
+ ],
+)
+
+cc_static_flatbuffer(
+ name = "log_replayer_stats_schema",
+ function = "aos::LogReplayerStatsSchema",
+ target = ":log_replayer_stats_fbs_reflection_out",
+ visibility = ["//visibility:public"],
+)
+
+flatbuffer_cc_library(
name = "replay_config_fbs",
srcs = ["log_replayer_config.fbs"],
gen_reflections = True,
- includes = [
- "//aos:configuration_fbs_includes",
- ],
target_compatible_with = ["@platforms//os:linux"],
visibility = ["//visibility:public"],
+ deps = [
+ "//aos:configuration_fbs",
+ ],
)
cc_library(
@@ -59,6 +76,8 @@
deps = [
":log_reader",
":log_reader_utils",
+ ":log_replayer_stats_fbs",
+ ":log_replayer_stats_schema",
":replay_config_fbs",
":replay_timing_fbs",
":replay_timing_schema",
@@ -173,6 +192,7 @@
"//aos:sha256",
"//aos:uuid",
"//aos:configuration",
+ "//aos/containers:error_list",
"//aos:flatbuffer_merge",
"//aos:flatbuffers",
"//aos/containers:resizeable_buffer",
@@ -389,6 +409,8 @@
":logfile_utils",
":logger_fbs",
"//aos:uuid",
+ "//aos/containers:error_list",
+ "//aos/containers:sized_array",
"@com_github_google_flatbuffers//:flatbuffers",
"@com_github_google_glog//:glog",
],
@@ -420,10 +442,12 @@
],
hdrs = [
"log_reader.h",
+ "replay_channels.h",
],
target_compatible_with = ["@platforms//os:linux"],
visibility = ["//visibility:public"],
deps = [
+ ":config_remapper",
":log_namer",
":log_writer",
":logfile_utils",
@@ -449,6 +473,58 @@
],
)
+cc_library(
+ name = "config_remapper",
+ srcs = [
+ "config_remapper.cc",
+ ],
+ hdrs = [
+ "config_remapper.h",
+ "replay_channels.h",
+ ],
+ target_compatible_with = ["@platforms//os:linux"],
+ visibility = ["//visibility:public"],
+ deps = [
+ ":logfile_utils",
+ ":logger_fbs",
+ "//aos/events:event_loop",
+ "//aos/network:message_bridge_server_fbs",
+ "//aos/network:multinode_timestamp_filter",
+ "//aos/network:remote_message_fbs",
+ "//aos/network:remote_message_schema",
+ "//aos/network:team_number",
+ "//aos/network:timestamp_filter",
+ "@com_github_google_flatbuffers//:flatbuffers",
+ "@com_google_absl//absl/strings",
+ ],
+)
+
+cc_test(
+ name = "config_remapper_test",
+ srcs = ["config_remapper_test.cc"],
+ copts = select({
+ "//tools:cpu_k8": ["-DLZMA=1"],
+ "//tools:cpu_arm64": ["-DLZMA=1"],
+ "//conditions:default": [],
+ }),
+ data = [
+ ":multinode_pingpong_combined_config",
+ ":multinode_pingpong_split3_config",
+ ":multinode_pingpong_split4_config",
+ ":multinode_pingpong_split4_mixed1_config",
+ ":multinode_pingpong_split4_mixed2_config",
+ ":multinode_pingpong_split4_reliable_config",
+ ":multinode_pingpong_split_config",
+ ":multinode_pingpong_triangle_split_config",
+ "//aos/events:pingpong_config",
+ ],
+ target_compatible_with = ["@platforms//os:linux"],
+ deps = [
+ ":config_remapper",
+ ":multinode_logger_test_lib",
+ ],
+)
+
cc_binary(
name = "log_cat",
srcs = [
@@ -469,6 +545,24 @@
],
)
+cc_library(
+ name = "logfile_validator",
+ srcs = [
+ "logfile_validator.cc",
+ ],
+ hdrs = ["logfile_validator.h"],
+ target_compatible_with = ["@platforms//os:linux"],
+ visibility = ["//visibility:public"],
+ deps = [
+ ":logfile_utils",
+ "//aos:configuration",
+ "//aos/events:simulated_event_loop",
+ "//aos/network:multinode_timestamp_filter",
+ "@com_github_gflags_gflags//:gflags",
+ "@com_github_google_glog//:glog",
+ ],
+)
+
cc_binary(
name = "timestamp_extractor",
srcs = [
@@ -478,6 +572,7 @@
visibility = ["//visibility:public"],
deps = [
":logfile_utils",
+ ":logfile_validator",
"//aos:configuration",
"//aos:init",
"//aos/events:simulated_event_loop",
@@ -576,6 +671,7 @@
deps = [
":log_reader",
":log_writer",
+ ":logfile_validator",
":snappy_encoder",
"//aos/events:message_counter",
"//aos/events:ping_lib",
@@ -708,6 +804,49 @@
deps = ["//aos/events:aos_config"],
)
+aos_config(
+ name = "multinode_pingpong_reboot_ooo_config",
+ src = "multinode_pingpong_reboot_ooo.json",
+ flatbuffers = [
+ "//aos/events:ping_fbs",
+ "//aos/events:pong_fbs",
+ "//aos/network:message_bridge_client_fbs",
+ "//aos/network:message_bridge_server_fbs",
+ "//aos/network:remote_message_fbs",
+ "//aos/network:timestamp_fbs",
+ ],
+ target_compatible_with = ["@platforms//os:linux"],
+ deps = ["//aos/events:aos_config"],
+)
+
+aos_config(
+ name = "multinode_pingpong_pi3_pingpong_config",
+ src = "multinode_pingpong_pi3_pingpong.json",
+ flatbuffers = [
+ "//aos/events:ping_fbs",
+ "//aos/events:pong_fbs",
+ "//aos/network:message_bridge_client_fbs",
+ "//aos/network:message_bridge_server_fbs",
+ "//aos/network:remote_message_fbs",
+ "//aos/network:timestamp_fbs",
+ ],
+ target_compatible_with = ["@platforms//os:linux"],
+ deps = ["//aos/events:aos_config"],
+)
+
+aos_config(
+ name = "multinode_single_node_config",
+ src = "multinode_single_node.json",
+ flatbuffers = [
+ "//aos/network:message_bridge_client_fbs",
+ "//aos/network:message_bridge_server_fbs",
+ "//aos/network:remote_message_fbs",
+ "//aos/network:timestamp_fbs",
+ ],
+ target_compatible_with = ["@platforms//os:linux"],
+ deps = ["//aos/events:aos_config"],
+)
+
cc_test(
name = "realtime_replay_test",
srcs = ["realtime_replay_test.cc"],
@@ -752,6 +891,8 @@
}),
data = [
":multinode_pingpong_combined_config",
+ ":multinode_pingpong_pi3_pingpong_config",
+ ":multinode_pingpong_reboot_ooo_config",
":multinode_pingpong_split3_config",
":multinode_pingpong_split4_config",
":multinode_pingpong_split4_mixed1_config",
@@ -759,6 +900,7 @@
":multinode_pingpong_split4_reliable_config",
":multinode_pingpong_split_config",
":multinode_pingpong_triangle_split_config",
+ ":multinode_single_node_config",
"//aos/events:pingpong_config",
],
shard_count = 10,
diff --git a/aos/events/logging/config_remapper.cc b/aos/events/logging/config_remapper.cc
new file mode 100644
index 0000000..21f45d5
--- /dev/null
+++ b/aos/events/logging/config_remapper.cc
@@ -0,0 +1,679 @@
+#include "aos/events/logging/config_remapper.h"
+
+#include <vector>
+
+#include "absl/strings/escaping.h"
+#include "flatbuffers/flatbuffers.h"
+
+#include "aos/events/logging/logger_generated.h"
+#include "aos/flatbuffer_merge.h"
+#include "aos/json_to_flatbuffer.h"
+#include "aos/network/multinode_timestamp_filter.h"
+#include "aos/network/remote_message_generated.h"
+#include "aos/network/remote_message_schema.h"
+#include "aos/network/team_number.h"
+#include "aos/network/timestamp_channel.h"
+
+namespace aos {
+using message_bridge::RemoteMessage;
+
+namespace {
+// Checks if the specified channel name/type exists in the config and, depending
+// on the value of conflict_handling, calls conflict_handler or just dies.
+template <typename F>
+void CheckAndHandleRemapConflict(
+ std::string_view new_name, std::string_view new_type,
+ const Configuration *config,
+ ConfigRemapper::RemapConflict conflict_handling, F conflict_handler) {
+ const Channel *existing_channel =
+ configuration::GetChannel(config, new_name, new_type, "", nullptr, true);
+ if (existing_channel != nullptr) {
+ switch (conflict_handling) {
+ case ConfigRemapper::RemapConflict::kDisallow:
+ LOG(FATAL)
+ << "Channel "
+ << configuration::StrippedChannelToString(existing_channel)
+ << " is already used--you can't remap an original channel to it.";
+ break;
+ case ConfigRemapper::RemapConflict::kCascade:
+ VLOG(1) << "Automatically remapping "
+ << configuration::StrippedChannelToString(existing_channel)
+ << " to avoid conflicts.";
+ conflict_handler();
+ break;
+ }
+ }
+}
+} // namespace
+
+namespace configuration {
+// We don't really want to expose this publicly, but log reader doesn't really
+// want to re-implement it.
+void HandleMaps(const flatbuffers::Vector<flatbuffers::Offset<Map>> *maps,
+ std::string *name, std::string_view type, const Node *node);
+} // namespace configuration
+
+bool CompareChannels(const Channel *c,
+ ::std::pair<std::string_view, std::string_view> p) {
+ int name_compare = c->name()->string_view().compare(p.first);
+ if (name_compare == 0) {
+ return c->type()->string_view() < p.second;
+ } else if (name_compare < 0) {
+ return true;
+ } else {
+ return false;
+ }
+}
+
+bool EqualsChannels(const Channel *c,
+ ::std::pair<std::string_view, std::string_view> p) {
+ return c->name()->string_view() == p.first &&
+ c->type()->string_view() == p.second;
+}
+
+// Copies the channel, removing the schema as we go. If new_name is provided,
+// it is used instead of the name inside the channel. If new_type is provided,
+// it is used instead of the type in the channel.
+flatbuffers::Offset<Channel> CopyChannel(const Channel *c,
+ std::string_view new_name,
+ std::string_view new_type,
+ flatbuffers::FlatBufferBuilder *fbb) {
+ CHECK_EQ(Channel::MiniReflectTypeTable()->num_elems, 14u)
+ << ": Merging logic needs to be updated when the number of channel "
+ "fields changes.";
+
+ flatbuffers::Offset<flatbuffers::String> name_offset =
+ fbb->CreateSharedString(new_name.empty() ? c->name()->string_view()
+ : new_name);
+ flatbuffers::Offset<flatbuffers::String> type_offset =
+ fbb->CreateSharedString(new_type.empty() ? c->type()->str() : new_type);
+ flatbuffers::Offset<flatbuffers::String> source_node_offset =
+ c->has_source_node() ? fbb->CreateSharedString(c->source_node()->str())
+ : 0;
+
+ flatbuffers::Offset<flatbuffers::Vector<flatbuffers::Offset<Connection>>>
+ destination_nodes_offset =
+ RecursiveCopyVectorTable(c->destination_nodes(), fbb);
+
+ flatbuffers::Offset<
+ flatbuffers::Vector<flatbuffers::Offset<flatbuffers::String>>>
+ logger_nodes_offset = CopyVectorSharedString(c->logger_nodes(), fbb);
+
+ Channel::Builder channel_builder(*fbb);
+ channel_builder.add_name(name_offset);
+ channel_builder.add_type(type_offset);
+ if (c->has_frequency()) {
+ channel_builder.add_frequency(c->frequency());
+ }
+ if (c->has_max_size()) {
+ channel_builder.add_max_size(c->max_size());
+ }
+ if (c->has_num_senders()) {
+ channel_builder.add_num_senders(c->num_senders());
+ }
+ if (c->has_num_watchers()) {
+ channel_builder.add_num_watchers(c->num_watchers());
+ }
+ if (!source_node_offset.IsNull()) {
+ channel_builder.add_source_node(source_node_offset);
+ }
+ if (!destination_nodes_offset.IsNull()) {
+ channel_builder.add_destination_nodes(destination_nodes_offset);
+ }
+ if (c->has_logger()) {
+ channel_builder.add_logger(c->logger());
+ }
+ if (!logger_nodes_offset.IsNull()) {
+ channel_builder.add_logger_nodes(logger_nodes_offset);
+ }
+ if (c->has_read_method()) {
+ channel_builder.add_read_method(c->read_method());
+ }
+ if (c->has_num_readers()) {
+ channel_builder.add_num_readers(c->num_readers());
+ }
+ if (c->has_channel_storage_duration()) {
+ channel_builder.add_channel_storage_duration(c->channel_storage_duration());
+ }
+ return channel_builder.Finish();
+}
+
+ConfigRemapper::ConfigRemapper(const Configuration *config,
+ const Configuration *replay_config,
+ const logger::ReplayChannels *replay_channels)
+ : remapped_configuration_(config),
+ original_configuration_(config),
+ replay_configuration_(replay_config),
+ replay_channels_(replay_channels) {
+ MakeRemappedConfig();
+
+ // If any remote timestamp channel was not marked NOT_LOGGED, then remap that
+ // channel to avoid the redundant logged data. Also, this loop handles the
+  // MessageHeader to RemoteMessage name change.
+ // Note: This path is mainly for backwards compatibility reasons, and should
+ // not be necessary for any new logs.
+ for (const Node *node : configuration::GetNodes(original_configuration())) {
+ message_bridge::ChannelTimestampFinder finder(original_configuration(),
+ "log_reader", node);
+
+ absl::btree_set<std::string_view> remote_nodes;
+
+ for (const Channel *channel : *original_configuration()->channels()) {
+ if (!configuration::ChannelIsSendableOnNode(channel, node)) {
+ continue;
+ }
+ if (!channel->has_destination_nodes()) {
+ continue;
+ }
+ for (const Connection *connection : *channel->destination_nodes()) {
+ if (configuration::ConnectionDeliveryTimeIsLoggedOnNode(connection,
+ node)) {
+ // Start by seeing if the split timestamp channels are being used for
+ // this message.
+ const Channel *timestamp_channel = configuration::GetChannel(
+ original_configuration(),
+ finder.SplitChannelName(channel, connection),
+ RemoteMessage::GetFullyQualifiedName(), "", node, true);
+
+ if (timestamp_channel != nullptr) {
+ // If for some reason a timestamp channel is not NOT_LOGGED (which
+ // is unusual), then remap the channel so that the replayed channel
+ // doesn't overlap with the special separate replay we do for
+ // timestamps.
+ if (timestamp_channel->logger() != LoggerConfig::NOT_LOGGED) {
+ RemapOriginalChannel<RemoteMessage>(
+ timestamp_channel->name()->string_view(), node);
+ }
+ continue;
+ }
+
+ // Otherwise collect this one up as a node to look for a combined
+ // channel from. It is more efficient to compare nodes than channels.
+ LOG(WARNING) << "Failed to find channel "
+ << finder.SplitChannelName(channel, connection)
+ << " on node " << FlatbufferToJson(node);
+ remote_nodes.insert(connection->name()->string_view());
+ }
+ }
+ }
+
+ std::vector<const Node *> timestamp_logger_nodes =
+ configuration::TimestampNodes(original_configuration(), node);
+ for (const std::string_view remote_node : remote_nodes) {
+ const std::string channel = finder.CombinedChannelName(remote_node);
+
+ // See if the log file is an old log with logger::MessageHeader channels
+ // in it, or a newer log with RemoteMessage. If we find an older log,
+ // rename the type too along with the name.
+ if (HasChannel<logger::MessageHeader>(channel, node)) {
+ CHECK(!HasChannel<RemoteMessage>(channel, node))
+ << ": Can't have both a logger::MessageHeader and RemoteMessage "
+ "remote "
+ "timestamp channel.";
+ // In theory, we should check NOT_LOGGED like RemoteMessage and be more
+ // careful about updating the config, but there are fewer and fewer logs
+ // with logger::MessageHeader remote messages, so it isn't worth the
+ // effort.
+ RemapOriginalChannel<logger::MessageHeader>(
+ channel, node, "/original", "aos.message_bridge.RemoteMessage");
+ } else {
+ CHECK(HasChannel<RemoteMessage>(channel, node))
+ << ": Failed to find {\"name\": \"" << channel << "\", \"type\": \""
+ << RemoteMessage::GetFullyQualifiedName() << "\"} for node "
+ << node->name()->string_view();
+ // Only bother to remap if there's something on the channel. We can
+ // tell if the channel was marked NOT_LOGGED or not. This makes the
+        // config not change unnecessarily when we replay a log with NOT_LOGGED
+ // messages.
+ if (HasOriginalChannel<RemoteMessage>(channel, node)) {
+ RemapOriginalChannel<RemoteMessage>(channel, node);
+ }
+ }
+ }
+ }
+ if (replay_configuration_) {
+ CHECK_EQ(configuration::MultiNode(remapped_configuration()),
+ configuration::MultiNode(replay_configuration_))
+ << ": Log file and replay config need to both be multi or single "
+ "node.";
+ }
+}
+
+ConfigRemapper::~ConfigRemapper() {
+  // Zero out some buffers. It's easy to cause use-after-frees on these, so
+  // make any such bug more obvious.
+ if (remapped_configuration_buffer_) {
+ remapped_configuration_buffer_->Wipe();
+ }
+}
+
+const Configuration *ConfigRemapper::original_configuration() const {
+ return original_configuration_;
+}
+
+const Configuration *ConfigRemapper::remapped_configuration() const {
+ return remapped_configuration_;
+}
+
+void ConfigRemapper::set_configuration(const Configuration *configuration) {
+ remapped_configuration_ = configuration;
+}
+
+std::vector<const Channel *> ConfigRemapper::RemappedChannels() const {
+ std::vector<const Channel *> result;
+ result.reserve(remapped_channels_.size());
+ for (auto &pair : remapped_channels_) {
+ const Channel *const original_channel =
+ CHECK_NOTNULL(original_configuration()->channels()->Get(pair.first));
+
+ auto channel_iterator = std::lower_bound(
+ remapped_configuration_->channels()->cbegin(),
+ remapped_configuration_->channels()->cend(),
+ std::make_pair(std::string_view(pair.second.remapped_name),
+ original_channel->type()->string_view()),
+ CompareChannels);
+
+ CHECK(channel_iterator != remapped_configuration_->channels()->cend());
+ CHECK(EqualsChannels(
+ *channel_iterator,
+ std::make_pair(std::string_view(pair.second.remapped_name),
+ original_channel->type()->string_view())));
+ result.push_back(*channel_iterator);
+ }
+ return result;
+}
+
+const Channel *ConfigRemapper::RemapChannel(const EventLoop *event_loop,
+ const Node *node,
+ const Channel *channel) {
+ std::string_view channel_name = channel->name()->string_view();
+ std::string_view channel_type = channel->type()->string_view();
+ const int channel_index =
+ configuration::ChannelIndex(original_configuration(), channel);
+ // If the channel is remapped, find the correct channel name to use.
+ if (remapped_channels_.count(channel_index) > 0) {
+ VLOG(3) << "Got remapped channel on "
+ << configuration::CleanedChannelToString(channel);
+ channel_name = remapped_channels_[channel_index].remapped_name;
+ }
+
+ VLOG(2) << "Going to remap channel " << channel_name << " " << channel_type;
+ const Channel *remapped_channel = configuration::GetChannel(
+ remapped_configuration(), channel_name, channel_type,
+ event_loop ? event_loop->name() : "log_reader", node);
+
+ CHECK(remapped_channel != nullptr)
+ << ": Unable to send {\"name\": \"" << channel_name << "\", \"type\": \""
+ << channel_type << "\"} because it is not in the provided configuration.";
+
+ return remapped_channel;
+}
+
+void ConfigRemapper::RemapOriginalChannel(std::string_view name,
+ std::string_view type,
+ std::string_view add_prefix,
+ std::string_view new_type,
+ RemapConflict conflict_handling) {
+ RemapOriginalChannel(name, type, nullptr, add_prefix, new_type,
+ conflict_handling);
+}
+
+void ConfigRemapper::RemapOriginalChannel(std::string_view name,
+ std::string_view type,
+ const Node *node,
+ std::string_view add_prefix,
+ std::string_view new_type,
+ RemapConflict conflict_handling) {
+ if (node != nullptr) {
+ VLOG(1) << "Node is " << FlatbufferToJson(node);
+ }
+ if (replay_channels_ != nullptr) {
+ CHECK(std::find(replay_channels_->begin(), replay_channels_->end(),
+ std::make_pair(std::string{name}, std::string{type})) !=
+ replay_channels_->end())
+ << "Attempted to remap channel " << name << " " << type
+ << " which is not included in the replay channels passed to "
+ "ConfigRemapper.";
+ }
+ const Channel *remapped_channel =
+ configuration::GetChannel(original_configuration(), name, type, "", node);
+ CHECK(remapped_channel != nullptr) << ": Failed to find {\"name\": \"" << name
+ << "\", \"type\": \"" << type << "\"}";
+ VLOG(1) << "Original {\"name\": \"" << name << "\", \"type\": \"" << type
+ << "\"}";
+ VLOG(1) << "Remapped "
+ << configuration::StrippedChannelToString(remapped_channel);
+
+ // We want to make /spray on node 0 go to /0/spray by snooping the maps. And
+ // we want it to degrade if the heuristics fail to just work.
+ //
+ // The easiest way to do this is going to be incredibly specific and verbose.
+  // Look up /spray and map it to /0/spray. Then, prefix the result with
+  // /original to get /original/0/spray. Then, create a map from
+  // /original/spray to /original/0/spray for just the type we were asked for.
+ if (name != remapped_channel->name()->string_view()) {
+ MapT new_map;
+ new_map.match = std::make_unique<ChannelT>();
+ new_map.match->name = absl::StrCat(add_prefix, name);
+ new_map.match->type = type;
+ if (node != nullptr) {
+ new_map.match->source_node = node->name()->str();
+ }
+ new_map.rename = std::make_unique<ChannelT>();
+ new_map.rename->name =
+ absl::StrCat(add_prefix, remapped_channel->name()->string_view());
+ maps_.emplace_back(std::move(new_map));
+ }
+
+ // Then remap the original channel to the prefixed channel.
+ const size_t channel_index =
+ configuration::ChannelIndex(original_configuration(), remapped_channel);
+ CHECK_EQ(0u, remapped_channels_.count(channel_index))
+ << "Already remapped channel "
+ << configuration::CleanedChannelToString(remapped_channel);
+
+ RemappedChannel remapped_channel_struct;
+ remapped_channel_struct.remapped_name =
+ std::string(add_prefix) +
+ std::string(remapped_channel->name()->string_view());
+ remapped_channel_struct.new_type = new_type;
+ const std::string_view remapped_type = new_type.empty() ? type : new_type;
+ CheckAndHandleRemapConflict(
+ remapped_channel_struct.remapped_name, remapped_type,
+ remapped_configuration_, conflict_handling,
+ [this, &remapped_channel_struct, remapped_type, node, add_prefix,
+ conflict_handling]() {
+ RemapOriginalChannel(remapped_channel_struct.remapped_name,
+ remapped_type, node, add_prefix, "",
+ conflict_handling);
+ });
+ remapped_channels_[channel_index] = std::move(remapped_channel_struct);
+ MakeRemappedConfig();
+}
+
+void ConfigRemapper::RenameOriginalChannel(const std::string_view name,
+ const std::string_view type,
+ const std::string_view new_name,
+ const std::vector<MapT> &add_maps) {
+ RenameOriginalChannel(name, type, nullptr, new_name, add_maps);
+}
+
+void ConfigRemapper::RenameOriginalChannel(const std::string_view name,
+ const std::string_view type,
+ const Node *const node,
+ const std::string_view new_name,
+ const std::vector<MapT> &add_maps) {
+ if (node != nullptr) {
+ VLOG(1) << "Node is " << FlatbufferToJson(node);
+ }
+ // First find the channel and rename it.
+ const Channel *remapped_channel =
+ configuration::GetChannel(original_configuration(), name, type, "", node);
+ CHECK(remapped_channel != nullptr) << ": Failed to find {\"name\": \"" << name
+ << "\", \"type\": \"" << type << "\"}";
+ VLOG(1) << "Original {\"name\": \"" << name << "\", \"type\": \"" << type
+ << "\"}";
+ VLOG(1) << "Remapped "
+ << configuration::StrippedChannelToString(remapped_channel);
+
+ const size_t channel_index =
+ configuration::ChannelIndex(original_configuration(), remapped_channel);
+ CHECK_EQ(0u, remapped_channels_.count(channel_index))
+ << "Already remapped channel "
+ << configuration::CleanedChannelToString(remapped_channel);
+
+ RemappedChannel remapped_channel_struct;
+ remapped_channel_struct.remapped_name = new_name;
+ remapped_channel_struct.new_type.clear();
+ remapped_channels_[channel_index] = std::move(remapped_channel_struct);
+
+ // Then add any provided maps.
+ for (const MapT &map : add_maps) {
+ maps_.push_back(map);
+ }
+
+ // Finally rewrite the config.
+ MakeRemappedConfig();
+}
+
+void ConfigRemapper::MakeRemappedConfig() {
+ // If no remapping occurred and we are using the original config, then there
+ // is nothing interesting to do here.
+ if (remapped_channels_.empty() && replay_configuration_ == nullptr) {
+ remapped_configuration_ = original_configuration();
+ return;
+ }
+ // Config to copy Channel definitions from. Use the specified
+ // replay_configuration_ if it has been provided.
+ const Configuration *const base_config = replay_configuration_ == nullptr
+ ? original_configuration()
+ : replay_configuration_;
+
+ // Create a config with all the channels, but un-sorted/merged. Collect up
+ // the schemas while we do this. Call MergeConfiguration to sort everything,
+ // and then merge it all in together.
+
+ // This is the builder that we use for the config containing all the new
+ // channels.
+ flatbuffers::FlatBufferBuilder fbb;
+ fbb.ForceDefaults(true);
+ std::vector<flatbuffers::Offset<Channel>> channel_offsets;
+
+ CHECK_EQ(Channel::MiniReflectTypeTable()->num_elems, 14u)
+ << ": Merging logic needs to be updated when the number of channel "
+ "fields changes.";
+
+ // List of schemas.
+ std::map<std::string_view, FlatbufferVector<reflection::Schema>> schema_map;
+ // Make sure our new RemoteMessage schema is in there for old logs without it.
+ schema_map.insert(std::make_pair(
+ message_bridge::RemoteMessage::GetFullyQualifiedName(),
+ FlatbufferVector<reflection::Schema>(FlatbufferSpan<reflection::Schema>(
+ message_bridge::RemoteMessageSchema()))));
+
+ // Reconstruct the remapped channels.
+ for (auto &pair : remapped_channels_) {
+ const Channel *const c = CHECK_NOTNULL(configuration::GetChannel(
+ base_config, original_configuration()->channels()->Get(pair.first), "",
+ nullptr));
+ channel_offsets.emplace_back(
+ CopyChannel(c, pair.second.remapped_name, "", &fbb));
+
+ if (c->has_destination_nodes()) {
+ for (const Connection *connection : *c->destination_nodes()) {
+ switch (connection->timestamp_logger()) {
+ case LoggerConfig::LOCAL_LOGGER:
+ case LoggerConfig::NOT_LOGGED:
+ // There is no timestamp channel associated with this, so ignore it.
+ break;
+
+ case LoggerConfig::REMOTE_LOGGER:
+ case LoggerConfig::LOCAL_AND_REMOTE_LOGGER:
+ // We want to make a split timestamp channel regardless of what type
+ // of log this used to be. No sense propagating the single
+ // timestamp channel.
+
+ CHECK(connection->has_timestamp_logger_nodes());
+ for (const flatbuffers::String *timestamp_logger_node :
+ *connection->timestamp_logger_nodes()) {
+ const Node *node =
+ configuration::GetNode(original_configuration(),
+ timestamp_logger_node->string_view());
+ message_bridge::ChannelTimestampFinder finder(
+ original_configuration(), "log_reader", node);
+
+            // We are assuming here that all the maps are set up correctly to
+            // handle arbitrary timestamps. Apply the maps for this node to
+            // see what name this ends up with.
+ std::string name = finder.SplitChannelName(
+ pair.second.remapped_name, c->type()->str(), connection);
+ std::string unmapped_name = name;
+ configuration::HandleMaps(original_configuration()->maps(), &name,
+ "aos.message_bridge.RemoteMessage",
+ node);
+ CHECK_NE(name, unmapped_name)
+ << ": Remote timestamp channel was not remapped, this is "
+ "very fishy";
+ flatbuffers::Offset<flatbuffers::String> channel_name_offset =
+ fbb.CreateString(name);
+ flatbuffers::Offset<flatbuffers::String> channel_type_offset =
+ fbb.CreateString("aos.message_bridge.RemoteMessage");
+ flatbuffers::Offset<flatbuffers::String> source_node_offset =
+ fbb.CreateString(timestamp_logger_node->string_view());
+
+ // Now, build a channel. Don't log it, 2 senders, and match the
+ // source frequency.
+ Channel::Builder channel_builder(fbb);
+ channel_builder.add_name(channel_name_offset);
+ channel_builder.add_type(channel_type_offset);
+ channel_builder.add_source_node(source_node_offset);
+ channel_builder.add_logger(LoggerConfig::NOT_LOGGED);
+ channel_builder.add_num_senders(2);
+ if (c->has_frequency()) {
+ channel_builder.add_frequency(c->frequency());
+ }
+ if (c->has_channel_storage_duration()) {
+ channel_builder.add_channel_storage_duration(
+ c->channel_storage_duration());
+ }
+ channel_offsets.emplace_back(channel_builder.Finish());
+ }
+ break;
+ }
+ }
+ }
+ }
+
+  // Now reconstruct the original channels, translating types as needed.
+ for (const Channel *c : *base_config->channels()) {
+ // Search for a mapping channel.
+ std::string_view new_type = "";
+ for (auto &pair : remapped_channels_) {
+ const Channel *const remapped_channel =
+ original_configuration()->channels()->Get(pair.first);
+ if (remapped_channel->name()->string_view() == c->name()->string_view() &&
+ remapped_channel->type()->string_view() == c->type()->string_view()) {
+ new_type = pair.second.new_type;
+ break;
+ }
+ }
+
+ // Copy everything over.
+ channel_offsets.emplace_back(CopyChannel(c, "", new_type, &fbb));
+
+ // Add the schema if it doesn't exist.
+ if (schema_map.find(c->type()->string_view()) == schema_map.end()) {
+ CHECK(c->has_schema());
+ schema_map.insert(std::make_pair(c->type()->string_view(),
+ RecursiveCopyFlatBuffer(c->schema())));
+ }
+ }
+
+ // The MergeConfiguration API takes a vector, not a map. Convert.
+ std::vector<FlatbufferVector<reflection::Schema>> schemas;
+ while (!schema_map.empty()) {
+ schemas.emplace_back(std::move(schema_map.begin()->second));
+ schema_map.erase(schema_map.begin());
+ }
+
+ // Create the Configuration containing the new channels that we want to add.
+ const flatbuffers::Offset<flatbuffers::Vector<flatbuffers::Offset<Channel>>>
+ channels_offset =
+ channel_offsets.empty() ? 0 : fbb.CreateVector(channel_offsets);
+
+ // Copy over the old maps.
+ std::vector<flatbuffers::Offset<Map>> map_offsets;
+ if (base_config->maps()) {
+ for (const Map *map : *base_config->maps()) {
+ map_offsets.emplace_back(RecursiveCopyFlatBuffer(map, &fbb));
+ }
+ }
+
+ // Now create the new maps. These are second so they take effect first.
+ for (const MapT &map : maps_) {
+ CHECK(!map.match->name.empty());
+ const flatbuffers::Offset<flatbuffers::String> match_name_offset =
+ fbb.CreateString(map.match->name);
+ flatbuffers::Offset<flatbuffers::String> match_type_offset;
+ if (!map.match->type.empty()) {
+ match_type_offset = fbb.CreateString(map.match->type);
+ }
+ flatbuffers::Offset<flatbuffers::String> match_source_node_offset;
+ if (!map.match->source_node.empty()) {
+ match_source_node_offset = fbb.CreateString(map.match->source_node);
+ }
+ CHECK(!map.rename->name.empty());
+ const flatbuffers::Offset<flatbuffers::String> rename_name_offset =
+ fbb.CreateString(map.rename->name);
+ Channel::Builder match_builder(fbb);
+ match_builder.add_name(match_name_offset);
+ if (!match_type_offset.IsNull()) {
+ match_builder.add_type(match_type_offset);
+ }
+ if (!match_source_node_offset.IsNull()) {
+ match_builder.add_source_node(match_source_node_offset);
+ }
+ const flatbuffers::Offset<Channel> match_offset = match_builder.Finish();
+
+ Channel::Builder rename_builder(fbb);
+ rename_builder.add_name(rename_name_offset);
+ const flatbuffers::Offset<Channel> rename_offset = rename_builder.Finish();
+
+ Map::Builder map_builder(fbb);
+ map_builder.add_match(match_offset);
+ map_builder.add_rename(rename_offset);
+ map_offsets.emplace_back(map_builder.Finish());
+ }
+
+ flatbuffers::Offset<flatbuffers::Vector<flatbuffers::Offset<Map>>>
+ maps_offsets = map_offsets.empty() ? 0 : fbb.CreateVector(map_offsets);
+
+ // And copy everything else over.
+ flatbuffers::Offset<flatbuffers::Vector<flatbuffers::Offset<Node>>>
+ nodes_offset = RecursiveCopyVectorTable(base_config->nodes(), &fbb);
+
+ flatbuffers::Offset<flatbuffers::Vector<flatbuffers::Offset<Application>>>
+ applications_offset =
+ RecursiveCopyVectorTable(base_config->applications(), &fbb);
+
+ // Now insert everything else in unmodified.
+ ConfigurationBuilder configuration_builder(fbb);
+ if (!channels_offset.IsNull()) {
+ configuration_builder.add_channels(channels_offset);
+ }
+ if (!maps_offsets.IsNull()) {
+ configuration_builder.add_maps(maps_offsets);
+ }
+ if (!nodes_offset.IsNull()) {
+ configuration_builder.add_nodes(nodes_offset);
+ }
+ if (!applications_offset.IsNull()) {
+ configuration_builder.add_applications(applications_offset);
+ }
+
+ if (base_config->has_channel_storage_duration()) {
+ configuration_builder.add_channel_storage_duration(
+ base_config->channel_storage_duration());
+ }
+
+ CHECK_EQ(Configuration::MiniReflectTypeTable()->num_elems, 6u)
+ << ": Merging logic needs to be updated when the number of configuration "
+ "fields changes.";
+
+ fbb.Finish(configuration_builder.Finish());
+
+ // Clean it up and return it! By using MergeConfiguration here, we'll
+ // actually get a deduplicated config for free too.
+ FlatbufferDetachedBuffer<Configuration> new_merged_config =
+ configuration::MergeConfiguration(
+ FlatbufferDetachedBuffer<Configuration>(fbb.Release()));
+
+ remapped_configuration_buffer_ =
+ std::make_unique<FlatbufferDetachedBuffer<Configuration>>(
+ configuration::MergeConfiguration(new_merged_config, schemas));
+
+ remapped_configuration_ = &remapped_configuration_buffer_->message();
+
+ // TODO(austin): Lazily re-build to save CPU?
+}
+
+} // namespace aos
diff --git a/aos/events/logging/config_remapper.h b/aos/events/logging/config_remapper.h
new file mode 100644
index 0000000..d1eb0c6
--- /dev/null
+++ b/aos/events/logging/config_remapper.h
@@ -0,0 +1,196 @@
+#ifndef AOS_EVENTS_LOGGING_CONFIG_REMAPPER_H_
+#define AOS_EVENTS_LOGGING_CONFIG_REMAPPER_H_
+
+#include <map>
+#include <string_view>
+#include <tuple>
+#include <vector>
+
+#include "flatbuffers/flatbuffers.h"
+
+#include "aos/events/event_loop.h"
+#include "aos/events/logging/logger_generated.h"
+#include "aos/events/logging/replay_channels.h"
+
+namespace aos {
+
+// This class remaps and renames channels in the configuration passed to the
+// constructor. Typically, the templated versions of RemapOriginalChannel and
+// RenameOriginalChannel are the main functions to use, for type safety. After
+// remapping and renaming, the remapped configuration can be accessed through
+// remapped_configuration(), and the original configuration, which is never
+// mutated, can be accessed through original_configuration().
+//
+// This class assumes no ownership over any pointers provided to it.
+//
+// Timestamp channels are automatically remapped on construction.
+//
+// Note: Unlike LogReader, this class does not need logfiles to function. This
+// logic originally lived in LogReader and was refactored out into this class.
+// The same API for remapping and renaming still exists in LogReader, which
+// now just passes the arguments along to this class.
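+//
+// A minimal usage sketch (hedged; assumes a valid Configuration *config and a
+// Ping channel on "/test"):
+//
+//   ConfigRemapper remapper(config);
+//   remapper.RemapOriginalChannel<examples::Ping>("/test");
+//   const Configuration *remapped = remapper.remapped_configuration();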
+class ConfigRemapper {
+ public:
+ ConfigRemapper(const Configuration *config,
+ const Configuration *replay_config = nullptr,
+ const logger::ReplayChannels *replay_channels = nullptr);
+ ~ConfigRemapper();
+
+  // Value type for the map of channel indices to new names. The channel index
+  // is an index into original_configuration(), and remapped_name is the name
+  // of the channel to send on instead of the original channel name.
+ struct RemappedChannel {
+ std::string remapped_name;
+ std::string new_type;
+ };
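+  // For example (illustrative values), an entry mapping channel index 5 to
+  // {"/original/test", ""} sends what was channel index 5 on "/original/test"
+  // with its original type.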
+
+  // Enum indicating how RemapOriginalChannel behaves when there is already
+  // a channel with the remapped name (e.g., as may happen when replaying a
+  // logfile that was itself generated from replay).
+ enum class RemapConflict {
+ // LOG(FATAL) on conflicts in remappings.
+ kDisallow,
+ // If we run into a conflict, attempt to remap the channel we would be
+ // overriding (and continue to do so if remapping *that* channel also
+ // generates a conflict).
+ // This will mean that if we repeatedly replay a log, we will end up
+ // stacking more and more /original's on the start of the oldest version
+ // of the channels.
+ kCascade
+ };
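+  // Illustrative kCascade example: if "/original/test" already exists when
+  // remapping "/test", the existing channel is itself remapped to
+  // "/original/original/test", and so on for any further conflicts.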
+
+ // Remaps a channel from the original configuration passed to the constructor
+ // to the given one. This operates on raw channel names, without any node or
+ // application specific mappings.
+ void RemapOriginalChannel(
+ std::string_view name, std::string_view type,
+ std::string_view add_prefix = "/original", std::string_view new_type = "",
+ RemapConflict conflict_handling = RemapConflict::kCascade);
+ template <typename T>
+ void RemapOriginalChannel(
+ std::string_view name, std::string_view add_prefix = "/original",
+ std::string_view new_type = "",
+ RemapConflict conflict_handling = RemapConflict::kCascade) {
+ RemapOriginalChannel(name, T::GetFullyQualifiedName(), add_prefix, new_type,
+ conflict_handling);
+ }
+
+  // Remaps the provided channel, but this overload respects node mappings and
+  // preserves them too. This makes it so that if /aos -> /pi1/aos on one node,
+  // /original/aos -> /original/pi1/aos on the same node after remapping, just
+  // like you would hope. If new_type is not empty, the new channel will use
+  // the provided type instead, which allows the message type to be changed.
+ //
+ // TODO(austin): If you have 2 nodes remapping something to the same channel,
+ // this doesn't handle that. No use cases exist yet for that, so it isn't
+ // being done yet.
+ void RemapOriginalChannel(
+ std::string_view name, std::string_view type, const Node *node,
+ std::string_view add_prefix = "/original", std::string_view new_type = "",
+ RemapConflict conflict_handling = RemapConflict::kCascade);
+
+ template <typename T>
+ void RemapOriginalChannel(
+ std::string_view name, const Node *node,
+ std::string_view add_prefix = "/original", std::string_view new_type = "",
+ RemapConflict conflict_handling = RemapConflict::kCascade) {
+ RemapOriginalChannel(name, T::GetFullyQualifiedName(), node, add_prefix,
+ new_type, conflict_handling);
+ }
+
+ // Similar to RemapOriginalChannel(), but lets you specify a name for the new
+ // channel without constraints. By default, this will not add any maps for the
+ // new channel. Use add_maps to specify any maps you'd like added.
+ void RenameOriginalChannel(std::string_view name, std::string_view type,
+ std::string_view new_name,
+ const std::vector<MapT> &add_maps = {});
+ template <typename T>
+ void RenameOriginalChannel(std::string_view name, std::string_view new_name,
+ const std::vector<MapT> &add_maps = {}) {
+ RenameOriginalChannel(name, T::GetFullyQualifiedName(), new_name, add_maps);
+ }
+ // The following overloads are more suitable for multi-node configurations,
+ // and let you rename a channel on a specific node.
+ void RenameOriginalChannel(std::string_view name, std::string_view type,
+ const Node *node, std::string_view new_name,
+ const std::vector<MapT> &add_maps = {});
+ template <typename T>
+ void RenameOriginalChannel(std::string_view name, const Node *node,
+ std::string_view new_name,
+ const std::vector<MapT> &add_maps = {}) {
+ RenameOriginalChannel(name, T::GetFullyQualifiedName(), node, new_name,
+ add_maps);
+ }
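+  // A hedged sketch (the node pointer and channel names are illustrative):
+  //
+  //   remapper.RenameOriginalChannel<examples::Ping>("/test", pi1,
+  //                                                  "/renamed/test");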
+
+ template <typename T>
+ bool HasChannel(std::string_view name, const Node *node = nullptr) {
+ return HasChannel(name, T::GetFullyQualifiedName(), node);
+ }
+ bool HasChannel(std::string_view name, std::string_view type,
+ const Node *node) {
+ return configuration::GetChannel(original_configuration(), name, type, "",
+ node, true) != nullptr;
+ }
+
+  // Returns true if the channel exists on the node and is logged in the
+  // original config.
+ template <typename T>
+ bool HasOriginalChannel(std::string_view name, const Node *node = nullptr) {
+ const Channel *channel =
+ configuration::GetChannel(original_configuration(), name,
+ T::GetFullyQualifiedName(), "", node, true);
+ if (channel == nullptr) return false;
+ return channel->logger() != LoggerConfig::NOT_LOGGED;
+ }
+
+ template <typename T>
+ void MaybeRemapOriginalChannel(std::string_view name,
+ const Node *node = nullptr) {
+ if (HasChannel<T>(name, node)) {
+ RemapOriginalChannel<T>(name, node);
+ }
+ }
+ template <typename T>
+ void MaybeRenameOriginalChannel(std::string_view name, const Node *node,
+ std::string_view new_name,
+ const std::vector<MapT> &add_maps = {}) {
+ if (HasChannel<T>(name, node)) {
+ RenameOriginalChannel<T>(name, node, new_name, add_maps);
+ }
+ }
+
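+  // Returns the channel from the remapped configuration to use in place of
+  // the provided channel from the original configuration.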
+ const Channel *RemapChannel(const EventLoop *event_loop, const Node *node,
+ const Channel *channel);
+  // Returns a list of all of the original channels that have been remapped.
+ std::vector<const Channel *> RemappedChannels() const;
+
+ void set_configuration(const Configuration *configuration);
+
+ // Returns the configuration that was originally passed to the constructor.
+ // This class does not own this pointer.
+ const Configuration *original_configuration() const;
+
+  // Returns the configuration that contains the remappings and renamings done on
+ // the original configuration. The pointer is invalidated whenever
+ // RemapOriginalChannel is called.
+ const Configuration *remapped_configuration() const;
+
+ private:
+ // Handle constructing a configuration with all the additional remapped
+ // channels from calls to RemapOriginalChannel.
+ void MakeRemappedConfig();
+
+ std::map<size_t, RemappedChannel> remapped_channels_;
+ std::vector<MapT> maps_;
+ std::unique_ptr<FlatbufferDetachedBuffer<Configuration>>
+ remapped_configuration_buffer_;
+
+ const Configuration *remapped_configuration_ = nullptr;
+ const Configuration *original_configuration_ = nullptr;
+ const Configuration *replay_configuration_ = nullptr;
+
+ const logger::ReplayChannels *replay_channels_ = nullptr;
+}; // class ConfigRemapper
+
+} // namespace aos
+#endif // AOS_EVENTS_LOGGING_CONFIG_REMAPPER_H_
diff --git a/aos/events/logging/config_remapper_test.cc b/aos/events/logging/config_remapper_test.cc
new file mode 100644
index 0000000..a1fedb2
--- /dev/null
+++ b/aos/events/logging/config_remapper_test.cc
@@ -0,0 +1,101 @@
+#include "aos/events/logging/config_remapper.h"
+
+#include "gmock/gmock.h"
+#include "gtest/gtest.h"
+
+#include "aos/events/event_loop_generated.h"
+#include "aos/events/logging/log_reader.h"
+#include "aos/events/logging/multinode_logger_test_lib.h"
+#include "aos/events/message_counter.h"
+#include "aos/events/ping_lib.h"
+#include "aos/events/pong_lib.h"
+#include "aos/network/remote_message_generated.h"
+#include "aos/network/timestamp_generated.h"
+#include "aos/testing/tmpdir.h"
+#include "multinode_logger_test_lib.h"
+
+namespace aos {
+namespace testing {
+using namespace logger::testing;
+using namespace logger;
+namespace chrono = std::chrono;
+
+using ConfigRemapperTest = MultinodeLoggerTest;
+
+INSTANTIATE_TEST_SUITE_P(
+ All, ConfigRemapperTest,
+ ::testing::Combine(
+ ::testing::Values(
+ ConfigParams{"multinode_pingpong_combined_config.json", true,
+ kCombinedConfigSha1(), kCombinedConfigSha1(),
+ FileStrategy::kCombine,
+ ForceTimestampBuffering::kForceBufferTimestamps},
+ ConfigParams{"multinode_pingpong_split_config.json", false,
+ kSplitConfigSha1(), kReloggedSplitConfigSha1(),
+ FileStrategy::kCombine,
+ ForceTimestampBuffering::kForceBufferTimestamps}),
+ ::testing::ValuesIn(SupportedCompressionAlgorithms())));
+
+// Tests that we can read a config and remap a channel
+TEST_P(ConfigRemapperTest, RemapOriginalChannel) {
+ ConfigRemapper remapper(&config_.message());
+
+ remapper.RemapOriginalChannel<examples::Ping>("/test");
+
+ const Channel *channel = configuration::GetChannel<examples::Ping>(
+ remapper.remapped_configuration(), "/original/test", "pi1", nullptr);
+ EXPECT_NE(channel, nullptr);
+ EXPECT_EQ(channel->name()->string_view(), "/original/test");
+ EXPECT_EQ(channel->type()->string_view(), "aos.examples.Ping");
+}
+
+// Tests that we can read a config and rename a channel
+TEST_P(ConfigRemapperTest, RenameOriginalChannel) {
+ ConfigRemapper remapper(&config_.message());
+
+ remapper.RenameOriginalChannel<examples::Ping>("/test", "/original/test");
+
+ const Channel *channel = configuration::GetChannel<examples::Ping>(
+ remapper.remapped_configuration(), "/original/test", "pi1", nullptr);
+ EXPECT_NE(channel, nullptr);
+ EXPECT_EQ(channel->name()->string_view(), "/original/test");
+ EXPECT_EQ(channel->type()->string_view(), "aos.examples.Ping");
+}
+
+// Tests that we can remap a channel specifying a certain node
+TEST_P(ConfigRemapperTest, RemapOriginalChannelWithNode) {
+ ConfigRemapper remapper(&config_.message());
+
+ const Node *node =
+ configuration::GetNode(remapper.remapped_configuration(), "pi1");
+
+ // Remap just on pi1.
+ remapper.RemapOriginalChannel<aos::timing::Report>("/aos", node);
+
+ const Channel *channel = configuration::GetChannel<aos::timing::Report>(
+ remapper.remapped_configuration(), "/original/pi1/aos", "pi1", node);
+ EXPECT_NE(channel, nullptr);
+ EXPECT_EQ(channel->name()->string_view(), "/original/pi1/aos");
+ EXPECT_EQ(channel->type()->string_view(), "aos.timing.Report");
+}
+
+// Tests that we can rename a channel specifying a certain node
+TEST_P(ConfigRemapperTest, RenameOriginalChannelWithNode) {
+ ConfigRemapper remapper(&config_.message());
+
+ const Node *node =
+ configuration::GetNode(remapper.remapped_configuration(), "pi1");
+
+ // Rename just on pi1.
+ remapper.RenameOriginalChannel<aos::timing::Report>("/aos", node,
+ "/original/pi1/aos");
+
+ const Channel *channel = configuration::GetChannel<aos::timing::Report>(
+ remapper.remapped_configuration(), "/original/pi1/aos", "pi1", node);
+ EXPECT_NE(channel, nullptr);
+ EXPECT_EQ(channel->name()->string_view(), "/original/pi1/aos");
+ EXPECT_EQ(channel->type()->string_view(), "aos.timing.Report");
+}
+
+} // namespace testing
+} // namespace aos
diff --git a/aos/events/logging/log_backend.h b/aos/events/logging/log_backend.h
index 3a47f42..91f048c 100644
--- a/aos/events/logging/log_backend.h
+++ b/aos/events/logging/log_backend.h
@@ -269,10 +269,6 @@
// in memory. id is usually generated by log namer and looks like name of the
// file within a log folder.
virtual std::unique_ptr<LogSink> RequestFile(const std::string_view id) = 0;
-
- std::unique_ptr<LogSink> RequestFile(const LogSource::File &id) {
- return RequestFile(id.name);
- }
};
// Implements requesting log files from the file system.
diff --git a/aos/events/logging/log_namer.cc b/aos/events/logging/log_namer.cc
index 28223b2..6682ae3 100644
--- a/aos/events/logging/log_namer.cc
+++ b/aos/events/logging/log_namer.cc
@@ -10,6 +10,8 @@
#include "flatbuffers/flatbuffers.h"
#include "glog/logging.h"
+#include "aos/containers/error_list.h"
+#include "aos/containers/sized_array.h"
#include "aos/events/logging/logfile_utils.h"
#include "aos/events/logging/logger_generated.h"
#include "aos/flatbuffer_merge.h"
@@ -24,7 +26,8 @@
const Node *logger_node,
std::function<void(NewDataWriter *)> reopen,
std::function<void(NewDataWriter *)> close,
- size_t max_message_size)
+ size_t max_message_size,
+ std::initializer_list<StoredDataType> types)
: node_(node),
node_index_(configuration::GetNodeIndex(log_namer->configuration_, node)),
logger_node_index_(
@@ -32,9 +35,16 @@
log_namer_(log_namer),
reopen_(std::move(reopen)),
close_(std::move(close)),
- max_message_size_(max_message_size) {
+ max_message_size_(max_message_size),
+ max_out_of_order_duration_(log_namer_->base_max_out_of_order_duration()) {
+ allowed_data_types_.fill(false);
+
state_.resize(configuration::NodesCount(log_namer->configuration_));
CHECK_LT(node_index_, state_.size());
+ for (StoredDataType type : types) {
+ CHECK_LT(static_cast<size_t>(type), allowed_data_types_.size());
+ allowed_data_types_[static_cast<size_t>(type)] = true;
+ }
}
NewDataWriter::~NewDataWriter() {
@@ -48,9 +58,17 @@
if (header_written_) {
VLOG(1) << "Rotated " << name();
++parts_index_;
+
+ aos::SizePrefixedFlatbufferDetachedBuffer<LogFileHeader> header =
+ MakeHeader();
+
+ if (header.span().size() > max_message_size_) {
+ max_message_size_ = header.span().size();
+ }
+
reopen_(this);
header_written_ = false;
- QueueHeader(MakeHeader());
+ QueueHeader(std::move(header));
}
}
@@ -79,6 +97,11 @@
state_[node_index_].boot_uuid = source_node_boot_uuid;
VLOG(1) << "Rebooted " << name();
+ newest_message_time_ = monotonic_clock::min_time;
+ // When a node reboots, parts_uuid changes but the same writer continues to
+ // write the data, so we can reset the max out of order duration. If we don't
+ // do this, the max out of order duration can grow to an unreasonable value.
+ max_out_of_order_duration_ = log_namer_->base_max_out_of_order_duration();
}
void NewDataWriter::UpdateBoot(const UUID &source_node_boot_uuid) {
@@ -184,9 +207,38 @@
}
}
+void NewDataWriter::CopyDataMessage(
+ DataEncoder::Copier *coppier, const UUID &source_node_boot_uuid,
+ aos::monotonic_clock::time_point now,
+ aos::monotonic_clock::time_point message_time) {
+ CHECK(allowed_data_types_[static_cast<size_t>(StoredDataType::DATA)])
+ << ": Tried to write data on non-data writer.";
+ CopyMessage(coppier, source_node_boot_uuid, now, message_time);
+}
+
+void NewDataWriter::CopyTimestampMessage(
+ DataEncoder::Copier *coppier, const UUID &source_node_boot_uuid,
+ aos::monotonic_clock::time_point now,
+ aos::monotonic_clock::time_point message_time) {
+ CHECK(allowed_data_types_[static_cast<size_t>(StoredDataType::TIMESTAMPS)])
+ << ": Tried to write timestamps on non-timestamp writer.";
+ CopyMessage(coppier, source_node_boot_uuid, now, message_time);
+}
+
+void NewDataWriter::CopyRemoteTimestampMessage(
+ DataEncoder::Copier *coppier, const UUID &source_node_boot_uuid,
+ aos::monotonic_clock::time_point now,
+ aos::monotonic_clock::time_point message_time) {
+ CHECK(allowed_data_types_[static_cast<size_t>(
+ StoredDataType::REMOTE_TIMESTAMPS)])
+ << ": Tried to write remote timestamps on non-remote timestamp writer.";
+ CopyMessage(coppier, source_node_boot_uuid, now, message_time);
+}
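+
+// A hedged illustration of the type gating above: a writer constructed with
+// {StoredDataType::DATA} accepts CopyDataMessage(), while a call to
+// CopyTimestampMessage() on the same writer would fail its CHECK.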
+
void NewDataWriter::CopyMessage(DataEncoder::Copier *coppier,
const UUID &source_node_boot_uuid,
- aos::monotonic_clock::time_point now) {
+ aos::monotonic_clock::time_point now,
+ aos::monotonic_clock::time_point message_time) {
// Trigger a reboot if we detect the boot UUID change.
UpdateBoot(source_node_boot_uuid);
@@ -194,9 +246,65 @@
QueueHeader(MakeHeader());
}
+ bool max_out_of_order_duration_exceeded = false;
+  // Enforce the max out of order duration contract.
+  //
+  // Update the newest message time, and rotate the part file if the current
+  // message is more than max_out_of_order_duration behind the newest message
+  // we've logged so far.
+ if (message_time > newest_message_time_) {
+ newest_message_time_ = message_time;
+ }
+
+ // Don't consider messages before start up when checking for max out of order
+ // duration.
+ monotonic_clock::time_point monotonic_start_time =
+ log_namer_->monotonic_start_time(node_index_, source_node_boot_uuid);
+
+ if (std::chrono::nanoseconds((newest_message_time_ -
+ std::max(monotonic_start_time, message_time))) >
+ max_out_of_order_duration_) {
+    // If the new message is older than 2 * max_out_of_order_duration, doubling
+    // the duration won't be sufficient.
+    //
+    // Example: newest_message_time = 10, logged_message_time = 5,
+    // max_ooo_duration = 2.
+    //
+    // In this case the actual max_ooo_duration = 10 - 5 = 5, but if we double
+    // the existing max_ooo_duration we only get 4, which is not sufficient.
+    //
+    // So take the max of the two values.
+ max_out_of_order_duration_ =
+ 2 * std::max(max_out_of_order_duration_,
+ std::chrono::nanoseconds(
+ (newest_message_time_ - message_time)));
+ max_out_of_order_duration_exceeded = true;
+ }
+
// If the start time has changed for this node, trigger a rotation.
- if (log_namer_->monotonic_start_time(node_index_, source_node_boot_uuid) !=
- monotonic_start_time_) {
+ if ((monotonic_start_time != monotonic_start_time_) ||
+ max_out_of_order_duration_exceeded) {
+    // If we just received a start time now, we will rotate parts shortly. Use
+    // a reasonable max out of order duration in the new header based on the
+    // start time information available now.
+ if ((monotonic_start_time_ == monotonic_clock::min_time) &&
+ (monotonic_start_time != monotonic_clock::min_time)) {
+ // If we're writing current messages but we receive an older start time,
+ // we can pick a reasonable max ooo duration number for the next part.
+ //
+      // For example: our current max ooo duration is 0.3 seconds. We're
+      // writing messages at 20 seconds and receive a start time of 1 second.
+      // We don't need the max ooo duration to be (20 - 1) = 19 seconds,
+      // although that would still work.
+      //
+      // Pick the minimum max out of order duration value that satisfies the
+      // requirement, but bound it below by the base value we started with.
+ max_out_of_order_duration_ =
+ std::max(log_namer_->base_max_out_of_order_duration(),
+ std::min(max_out_of_order_duration_,
+ std::chrono::nanoseconds(newest_message_time_ -
+ monotonic_start_time)));
+ }
CHECK(header_written_);
Rotate();
}
@@ -207,6 +315,7 @@
CHECK(writer);
CHECK(header_written_) << ": Attempting to write message before header to "
<< writer->name();
+ CHECK_LE(coppier->size(), max_message_size_);
writer->CopyMessage(coppier, now);
}
@@ -221,8 +330,9 @@
} else {
CHECK_EQ(state_[logger_node_index].boot_uuid, logger_node_boot_uuid);
}
- return log_namer_->MakeHeader(node_index_, state_, parts_uuid(),
- parts_index_);
+ return log_namer_->MakeHeader(node_index_, state_, parts_uuid(), parts_index_,
+ max_out_of_order_duration_,
+ allowed_data_types_);
}
void NewDataWriter::QueueHeader(
@@ -252,6 +362,7 @@
CHECK(writer);
DataEncoder::SpanCopier coppier(header.span());
+ CHECK_LE(coppier.size(), max_message_size_);
writer->CopyMessage(&coppier, aos::monotonic_clock::now());
header_written_ = true;
monotonic_start_time_ = log_namer_->monotonic_start_time(
@@ -278,11 +389,14 @@
aos::SizePrefixedFlatbufferDetachedBuffer<LogFileHeader> LogNamer::MakeHeader(
size_t node_index, const std::vector<NewDataWriter::State> &state,
- const UUID &parts_uuid, int parts_index) {
+ const UUID &parts_uuid, int parts_index,
+ std::chrono::nanoseconds max_out_of_order_duration,
+ const std::array<bool, static_cast<size_t>(StoredDataType::MAX) + 1>
+ &allowed_data_types) {
const UUID &source_node_boot_uuid = state[node_index].boot_uuid;
const Node *const source_node =
configuration::GetNode(configuration_, node_index);
- CHECK_EQ(LogFileHeader::MiniReflectTypeTable()->num_elems, 34u);
+ CHECK_EQ(LogFileHeader::MiniReflectTypeTable()->num_elems, 35u);
flatbuffers::FlatBufferBuilder fbb;
fbb.ForceDefaults(true);
@@ -460,6 +574,18 @@
flatbuffers::Vector<flatbuffers::Offset<flatbuffers::String>>>
boot_uuids_offset = fbb.CreateVector(boot_uuid_offsets);
+ aos::ErrorList<StoredDataType> allowed_data_types_vector;
+ for (size_t type = static_cast<size_t>(StoredDataType::MIN);
+ type <= static_cast<size_t>(StoredDataType::MAX); ++type) {
+ if (allowed_data_types[type]) {
+ allowed_data_types_vector.Set(static_cast<StoredDataType>(type));
+ }
+ }
+
+ flatbuffers::Offset<flatbuffers::Vector<StoredDataType>> data_stored_offset =
+ fbb.CreateVector(allowed_data_types_vector.data(),
+ allowed_data_types_vector.size());
+
aos::logger::LogFileHeader::Builder log_file_header_builder(fbb);
log_file_header_builder.add_name(name_offset);
@@ -479,8 +605,9 @@
if (!configuration_offset.IsNull()) {
log_file_header_builder.add_configuration(configuration_offset);
}
+
log_file_header_builder.add_max_out_of_order_duration(
- header_.message().max_out_of_order_duration());
+ max_out_of_order_duration.count());
NodeState *node_state = GetNodeState(node_index, source_node_boot_uuid);
log_file_header_builder.add_monotonic_start_time(
@@ -552,6 +679,8 @@
log_file_header_builder
.add_oldest_logger_local_unreliable_monotonic_timestamps(
oldest_logger_local_unreliable_monotonic_timestamps_offset);
+
+ log_file_header_builder.add_data_stored(data_stored_offset);
fbb.FinishSizePrefixed(log_file_header_builder.Finish());
aos::SizePrefixedFlatbufferDetachedBuffer<LogFileHeader> result(
fbb.Release());
@@ -585,16 +714,10 @@
}
void MultiNodeLogNamer::Rotate(const Node *node) {
- if (node == this->node()) {
- if (data_writer_) {
- data_writer_->Rotate();
- }
- } else {
- for (std::pair<const Channel *const, NewDataWriter> &data_writer :
- data_writers_) {
- if (node == data_writer.second.node()) {
- data_writer.second.Rotate();
- }
+ for (auto &data_map : {&node_data_writers_, &node_timestamp_writers_}) {
+ auto it = data_map->find(node);
+ if (it != data_map->end()) {
+ it->second.Rotate();
}
}
}
@@ -622,6 +745,50 @@
CloseWriter(&writer);
}
+void MultiNodeLogNamer::NoticeNode(const Node *source_node) {
+ if (std::find(nodes_.begin(), nodes_.end(), source_node) == nodes_.end()) {
+ nodes_.emplace_back(source_node);
+ }
+}
+
+NewDataWriter *MultiNodeLogNamer::FindNodeDataWriter(const Node *source_node,
+ size_t max_message_size) {
+ NoticeNode(source_node);
+
+ auto it = node_data_writers_.find(source_node);
+ if (it != node_data_writers_.end()) {
+ it->second.UpdateMaxMessageSize(max_message_size);
+ return &(it->second);
+ }
+ return nullptr;
+}
+
+NewDataWriter *MultiNodeLogNamer::FindNodeTimestampWriter(
+ const Node *source_node, size_t max_message_size) {
+ NoticeNode(source_node);
+
+ auto it = node_timestamp_writers_.find(source_node);
+ if (it != node_timestamp_writers_.end()) {
+ it->second.UpdateMaxMessageSize(max_message_size);
+ return &(it->second);
+ }
+ return nullptr;
+}
+
+NewDataWriter *MultiNodeLogNamer::AddNodeDataWriter(const Node *source_node,
+ NewDataWriter &&writer) {
+ auto result = node_data_writers_.emplace(source_node, std::move(writer));
+ CHECK(result.second);
+ return &(result.first->second);
+}
+
+NewDataWriter *MultiNodeLogNamer::AddNodeTimestampWriter(
+ const Node *source_node, NewDataWriter &&writer) {
+ auto result = node_timestamp_writers_.emplace(source_node, std::move(writer));
+ CHECK(result.second);
+ return &(result.first->second);
+}
+
NewDataWriter *MultiNodeLogNamer::MakeWriter(const Channel *channel) {
// See if we can read the data on this node at all.
const bool is_readable =
@@ -638,39 +805,40 @@
return nullptr;
}
- // Now, sort out if this is data generated on this node, or not. It is
- // generated if it is sendable on this node.
- if (configuration::ChannelIsSendableOnNode(channel, this->node())) {
- if (!data_writer_) {
- MakeDataWriter();
- }
- data_writer_->UpdateMaxMessageSize(
- PackMessageSize(LogType::kLogRemoteMessage, channel->max_size()));
- return data_writer_.get();
- }
-
// Ok, we have data that is being forwarded to us that we are supposed to
// log. It needs to be logged with send timestamps, but be sorted enough
// to be able to be processed.
- CHECK(data_writers_.find(channel) == data_writers_.end());
- // Track that this node is being logged.
- const Node *source_node = configuration::GetNode(
- configuration_, channel->source_node()->string_view());
+ const Node *source_node =
+ configuration::MultiNode(configuration_)
+ ? configuration::GetNode(configuration_,
+ channel->source_node()->string_view())
+ : nullptr;
- if (std::find(nodes_.begin(), nodes_.end(), source_node) == nodes_.end()) {
- nodes_.emplace_back(source_node);
+ // If we already have a data writer for the node, then use the same writer for
+ // all channels of that node.
+ NewDataWriter *result = FindNodeDataWriter(
+ source_node,
+ PackMessageSize(LogType::kLogRemoteMessage, channel->max_size()));
+ if (result != nullptr) {
+ return result;
}
- NewDataWriter data_writer(
- this, source_node, node_,
- [this, channel](NewDataWriter *data_writer) {
- OpenWriter(channel, data_writer);
- },
- [this](NewDataWriter *data_writer) { CloseWriter(&data_writer->writer); },
- PackMessageSize(LogType::kLogRemoteMessage, channel->max_size()));
- return &(
- data_writers_.emplace(channel, std::move(data_writer)).first->second);
+ // If we don't have a data writer for the node, create one.
+ return AddNodeDataWriter(
+ source_node,
+ NewDataWriter{
+ this,
+ source_node,
+ node_,
+ [this, source_node](NewDataWriter *data_writer) {
+ OpenDataWriter(source_node, data_writer);
+ },
+ [this](NewDataWriter *data_writer) {
+ CloseWriter(&data_writer->writer);
+ },
+ PackMessageSize(LogType::kLogRemoteMessage, channel->max_size()),
+ {StoredDataType::DATA}});
}
NewDataWriter *MultiNodeLogNamer::MakeForwardedTimestampWriter(
@@ -680,21 +848,29 @@
configuration::ChannelIsReadableOnNode(channel, this->node());
CHECK(is_readable) << ": " << configuration::CleanedChannelToString(channel);
- CHECK(data_writers_.find(channel) == data_writers_.end());
+ CHECK_NE(node, this->node());
- if (std::find(nodes_.begin(), nodes_.end(), node) == nodes_.end()) {
- nodes_.emplace_back(node);
+ // If we have a remote timestamp writer for a particular node, use the same
+ // writer for all remote timestamp channels of that node.
+ NewDataWriter *result =
+ FindNodeTimestampWriter(node, PackRemoteMessageSize());
+ if (result != nullptr) {
+ return result;
}
- NewDataWriter data_writer(
- this, configuration::GetNode(configuration_, node), node_,
- [this, channel](NewDataWriter *data_writer) {
- OpenForwardedTimestampWriter(channel, data_writer);
- },
- [this](NewDataWriter *data_writer) { CloseWriter(&data_writer->writer); },
- PackRemoteMessageSize());
- return &(
- data_writers_.emplace(channel, std::move(data_writer)).first->second);
+ // If there are no remote timestamp writers for the node, create one.
+ return AddNodeTimestampWriter(
+ node, NewDataWriter{this,
+ configuration::GetNode(configuration_, node),
+ node_,
+ [this](NewDataWriter *data_writer) {
+ OpenForwardedTimestampWriter(node_, data_writer);
+ },
+ [this](NewDataWriter *data_writer) {
+ CloseWriter(&data_writer->writer);
+ },
+ PackRemoteMessageSize(),
+ {StoredDataType::REMOTE_TIMESTAMPS}});
}
NewDataWriter *MultiNodeLogNamer::MakeTimestampWriter(const Channel *channel) {
@@ -707,17 +883,30 @@
return nullptr;
}
- if (!data_writer_) {
- MakeDataWriter();
+ // There is only one of these.
+ NewDataWriter *result = FindNodeTimestampWriter(
+ this->node(), PackMessageSize(LogType::kLogDeliveryTimeOnly, 0));
+ if (result != nullptr) {
+ return result;
}
- data_writer_->UpdateMaxMessageSize(
- PackMessageSize(LogType::kLogDeliveryTimeOnly, 0));
- return data_writer_.get();
+
+ return AddNodeTimestampWriter(
+ node_, NewDataWriter{this,
+ node_,
+ node_,
+ [this](NewDataWriter *data_writer) {
+ OpenTimestampWriter(data_writer);
+ },
+ [this](NewDataWriter *data_writer) {
+ CloseWriter(&data_writer->writer);
+ },
+ PackMessageSize(LogType::kLogDeliveryTimeOnly, 0),
+ {StoredDataType::TIMESTAMPS}});
}
WriteCode MultiNodeLogNamer::Close() {
- data_writers_.clear();
- data_writer_.reset();
+ node_data_writers_.clear();
+ node_timestamp_writers_.clear();
if (ran_out_of_space_) {
return WriteCode::kOutOfSpace;
}
@@ -725,13 +914,15 @@
}
void MultiNodeLogNamer::ResetStatistics() {
- for (std::pair<const Channel *const, NewDataWriter> &data_writer :
- data_writers_) {
+ for (std::pair<const Node *const, NewDataWriter> &data_writer :
+ node_data_writers_) {
if (!data_writer.second.writer) continue;
data_writer.second.writer->WriteStatistics()->ResetStats();
}
- if (data_writer_ != nullptr && data_writer_->writer != nullptr) {
- data_writer_->writer->WriteStatistics()->ResetStats();
+ for (std::pair<const Node *const, NewDataWriter> &data_writer :
+ node_timestamp_writers_) {
+ if (!data_writer.second.writer) continue;
+ data_writer.second.writer->WriteStatistics()->ResetStats();
}
max_write_time_ = std::chrono::nanoseconds::zero();
max_write_time_bytes_ = -1;
@@ -743,47 +934,40 @@
}
void MultiNodeLogNamer::OpenForwardedTimestampWriter(
- const Channel *channel, NewDataWriter *data_writer) {
+ const Node * /*source_node*/, NewDataWriter *data_writer) {
+ const std::string filename = absl::StrCat(
+ "timestamps/remote_", data_writer->node()->name()->string_view(), ".part",
+ data_writer->parts_index(), ".bfbs", extension_);
+ CreateBufferWriter(filename, data_writer->max_message_size(),
+ &data_writer->writer);
+}
+
+void MultiNodeLogNamer::OpenDataWriter(const Node *source_node,
+ NewDataWriter *data_writer) {
+ std::string filename;
+
+ if (source_node != nullptr) {
+ if (source_node == node_) {
+ filename = absl::StrCat(source_node->name()->string_view(), "_");
+ } else {
+ filename = absl::StrCat("data/", source_node->name()->string_view(), "_");
+ }
+ }
+
+ absl::StrAppend(&filename, "data.part", data_writer->parts_index(), ".bfbs",
+ extension_);
+ CreateBufferWriter(filename, data_writer->max_message_size(),
+ &data_writer->writer);
+}
+
+void MultiNodeLogNamer::OpenTimestampWriter(NewDataWriter *data_writer) {
std::string filename =
- absl::StrCat("timestamps", channel->name()->string_view(), "/",
- channel->type()->string_view(), ".part",
+ absl::StrCat(node()->name()->string_view(), "_timestamps.part",
data_writer->parts_index(), ".bfbs", extension_);
CreateBufferWriter(filename, data_writer->max_message_size(),
&data_writer->writer);
}
-void MultiNodeLogNamer::OpenWriter(const Channel *channel,
- NewDataWriter *data_writer) {
- const std::string filename = absl::StrCat(
- CHECK_NOTNULL(channel->source_node())->string_view(), "_data",
- channel->name()->string_view(), "/", channel->type()->string_view(),
- ".part", data_writer->parts_index(), ".bfbs", extension_);
- CreateBufferWriter(filename, data_writer->max_message_size(),
- &data_writer->writer);
-}
-
-void MultiNodeLogNamer::MakeDataWriter() {
- if (!data_writer_) {
- data_writer_ = std::make_unique<NewDataWriter>(
- this, node_, node_,
- [this](NewDataWriter *writer) {
- std::string name;
- if (node() != nullptr) {
- name = absl::StrCat(name, node()->name()->string_view(), "_");
- }
- absl::StrAppend(&name, "data.part", writer->parts_index(), ".bfbs",
- extension_);
- CreateBufferWriter(name, writer->max_message_size(), &writer->writer);
- },
- [this](NewDataWriter *data_writer) {
- CloseWriter(&data_writer->writer);
- },
- // Default size is 0 so it will be obvious if something doesn't fix it
- // afterwards.
- 0);
- }
-}
-
void MultiNodeLogNamer::CreateBufferWriter(
std::string_view path, size_t max_message_size,
std::unique_ptr<DetachedBufferWriter> *destination) {
@@ -846,5 +1030,167 @@
}
}
+NewDataWriter *MinimalFileMultiNodeLogNamer::MakeWriter(
+ const Channel *channel) {
+ // See if we can read the data on this node at all.
+ const bool is_readable =
+ configuration::ChannelIsReadableOnNode(channel, this->node());
+ if (!is_readable) {
+ return nullptr;
+ }
+
+ // Then, see if we are supposed to log the data here.
+ const bool log_message =
+ configuration::ChannelMessageIsLoggedOnNode(channel, this->node());
+
+ if (!log_message) {
+ return nullptr;
+ }
+
+ // Ok, we have data that is being forwarded to us that we are supposed to
+ // log. It needs to be logged with send timestamps, but be sorted enough
+ // to be able to be processed.
+
+ const Node *source_node =
+ configuration::MultiNode(configuration_)
+ ? configuration::GetNode(configuration_,
+ channel->source_node()->string_view())
+ : nullptr;
+
+  // Pick the log type and stored data types based on where the data originates.
+ if (this->node() == source_node) {
+ // If we already have a data writer for the node, then use the same writer
+ // for all channels of that node.
+ NewDataWriter *result = FindNodeDataWriter(
+ source_node,
+ PackMessageSize(LogType::kLogMessage, channel->max_size()));
+ if (result != nullptr) {
+ return result;
+ }
+
+ return AddNodeDataWriter(
+ source_node,
+ NewDataWriter{
+ this,
+ source_node,
+ node_,
+ [this, source_node](NewDataWriter *data_writer) {
+ OpenNodeWriter(source_node, data_writer);
+ },
+ [this](NewDataWriter *data_writer) {
+ CloseWriter(&data_writer->writer);
+ },
+ PackMessageSize(LogType::kLogMessage, channel->max_size()),
+ {StoredDataType::DATA, StoredDataType::TIMESTAMPS}});
+ } else {
+ // If we already have a data writer for the node, then use the same writer
+ // for all channels of that node.
+ NewDataWriter *result = FindNodeDataWriter(
+ source_node,
+ PackMessageSize(LogType::kLogRemoteMessage, channel->max_size()));
+ if (result != nullptr) {
+ return result;
+ }
+
+ return AddNodeDataWriter(
+ source_node,
+ NewDataWriter{
+ this,
+ source_node,
+ node_,
+ [this, source_node](NewDataWriter *data_writer) {
+ OpenNodeWriter(source_node, data_writer);
+ },
+ [this](NewDataWriter *data_writer) {
+ CloseWriter(&data_writer->writer);
+ },
+ PackMessageSize(LogType::kLogRemoteMessage, channel->max_size()),
+ {StoredDataType::DATA, StoredDataType::REMOTE_TIMESTAMPS}});
+ }
+}
+
+NewDataWriter *MinimalFileMultiNodeLogNamer::MakeTimestampWriter(
+ const Channel *channel) {
+ bool log_delivery_times = false;
+ if (this->node() != nullptr) {
+ log_delivery_times = configuration::ConnectionDeliveryTimeIsLoggedOnNode(
+ channel, this->node(), this->node());
+ }
+ if (!log_delivery_times) {
+ return nullptr;
+ }
+
+ // There is only one of these.
+ NewDataWriter *result = FindNodeDataWriter(
+ this->node(), PackMessageSize(LogType::kLogDeliveryTimeOnly, 0));
+ if (result != nullptr) {
+ return result;
+ }
+
+ return AddNodeDataWriter(
+ node_, NewDataWriter{this,
+ node_,
+ node_,
+ [this](NewDataWriter *data_writer) {
+ OpenNodeWriter(node_, data_writer);
+ },
+ [this](NewDataWriter *data_writer) {
+ CloseWriter(&data_writer->writer);
+ },
+ PackMessageSize(LogType::kLogDeliveryTimeOnly, 0),
+ {StoredDataType::DATA, StoredDataType::TIMESTAMPS}});
+}
+
+NewDataWriter *MinimalFileMultiNodeLogNamer::MakeForwardedTimestampWriter(
+ const Channel *channel, const Node *node) {
+ // See if we can read the data on this node at all.
+ const bool is_readable =
+ configuration::ChannelIsReadableOnNode(channel, this->node());
+ CHECK(is_readable) << ": " << configuration::CleanedChannelToString(channel);
+
+ CHECK_NE(node, this->node());
+
+ // If we have a remote timestamp writer for a particular node, use the same
+ // writer for all remote timestamp channels of that node.
+ NewDataWriter *result = FindNodeDataWriter(node, PackRemoteMessageSize());
+ if (result != nullptr) {
+ return result;
+ }
+
+ // If there are no remote timestamp writers for the node, create one.
+ return AddNodeDataWriter(
+ node,
+ NewDataWriter{this,
+ configuration::GetNode(configuration_, node),
+ node_,
+ [this, node](NewDataWriter *data_writer) {
+ OpenNodeWriter(node, data_writer);
+ },
+ [this](NewDataWriter *data_writer) {
+ CloseWriter(&data_writer->writer);
+ },
+ PackRemoteMessageSize(),
+ {StoredDataType::DATA, StoredDataType::REMOTE_TIMESTAMPS}});
+}
+
+void MinimalFileMultiNodeLogNamer::OpenNodeWriter(const Node *source_node,
+ NewDataWriter *data_writer) {
+ std::string filename;
+
+ if (node() != nullptr) {
+ filename = absl::StrCat(node()->name()->string_view(), "_");
+ }
+
+ if (source_node != nullptr) {
+ absl::StrAppend(&filename, source_node->name()->string_view(), "_");
+ }
+
+ absl::StrAppend(&filename, "all.part", data_writer->parts_index(), ".bfbs",
+ extension_);
+ VLOG(1) << "Going to open " << filename;
+ CreateBufferWriter(filename, data_writer->max_message_size(),
+ &data_writer->writer);
+}
+
} // namespace logger
} // namespace aos
diff --git a/aos/events/logging/log_namer.h b/aos/events/logging/log_namer.h
index 8d69fbb..5374acf 100644
--- a/aos/events/logging/log_namer.h
+++ b/aos/events/logging/log_namer.h
@@ -41,7 +41,8 @@
NewDataWriter(LogNamer *log_namer, const Node *node, const Node *logger_node,
std::function<void(NewDataWriter *)> reopen,
std::function<void(NewDataWriter *)> close,
- size_t max_message_size);
+ size_t max_message_size,
+ std::initializer_list<StoredDataType> types);
void UpdateMaxMessageSize(size_t new_size) {
if (new_size > max_message_size_) {
@@ -52,6 +53,10 @@
}
size_t max_message_size() const { return max_message_size_; }
+ std::chrono::nanoseconds max_out_of_order_duration() const {
+ return max_out_of_order_duration_;
+ }
+
NewDataWriter(NewDataWriter &&other) = default;
aos::logger::NewDataWriter &operator=(NewDataWriter &&other) = default;
NewDataWriter(const NewDataWriter &) = delete;
@@ -70,10 +75,20 @@
bool reliable,
monotonic_clock::time_point monotonic_timestamp_time =
monotonic_clock::min_time);
+
// Copies a message with the provided boot UUID.
- void CopyMessage(DataEncoder::Copier *coppier,
- const UUID &source_node_boot_uuid,
- aos::monotonic_clock::time_point now);
+ void CopyDataMessage(DataEncoder::Copier *copier,
+ const UUID &source_node_boot_uuid,
+ aos::monotonic_clock::time_point now,
+ aos::monotonic_clock::time_point message_time);
+ void CopyTimestampMessage(DataEncoder::Copier *copier,
+ const UUID &source_node_boot_uuid,
+ aos::monotonic_clock::time_point now,
+ aos::monotonic_clock::time_point message_time);
+ void CopyRemoteTimestampMessage(
+ DataEncoder::Copier *copier, const UUID &source_node_boot_uuid,
+ aos::monotonic_clock::time_point now,
+ aos::monotonic_clock::time_point message_time);
// Updates the current boot for the source node. This is useful when you want
// to queue a message that may trigger a reboot rotation, but then need to
@@ -139,6 +154,11 @@
// Signals that a node has rebooted.
void Reboot(const UUID &source_node_boot_uuid);
+ void CopyMessage(DataEncoder::Copier *copier,
+ const UUID &source_node_boot_uuid,
+ aos::monotonic_clock::time_point now,
+ aos::monotonic_clock::time_point message_time);
+
void QueueHeader(
aos::SizePrefixedFlatbufferDetachedBuffer<LogFileHeader> &&header);
@@ -160,6 +180,28 @@
std::vector<State> state_;
size_t max_message_size_;
+
+  // Each data writer logs the channels for one node, i.e. each data writer
+  // writes one file. We may encounter messages which violate the max out of
+  // order duration specified in the header of that file, in which case we
+  // rotate the data writer and start a new part for that particular file.
+  // This shouldn't affect the headers of other data writers, so make this a
+  // property of the individual data writer instead of the overall log.
+ std::chrono::nanoseconds max_out_of_order_duration_;
+
+  // Monotonic time point of the newest message we've logged so far, i.e.
+  // Message X - time Z
+  // Message Y - time Z + 1
+  // newest_message_time_ = Z + 1 (even if X was logged after Y)
+  //
+  // Since messages can be logged out of order, this helps determine if the
+  // max out of order duration was violated.
+ monotonic_clock::time_point newest_message_time_ = monotonic_clock::min_time;
+
+ // An array with a bool for each value of StoredDataType representing if that
+ // data type is allowed to be logged by this object.
+ std::array<bool, static_cast<size_t>(StoredDataType::MAX) + 1>
+ allowed_data_types_;
};
// Interface describing how to name, track, and add headers to log file parts.
@@ -252,6 +294,14 @@
return node_state->monotonic_start_time;
}
+  // This returns the initial max out of order duration set in the header
+  // template by the logger based on the polling period. It may be different
+  // from the actual duration used by the data writer.
+ std::chrono::nanoseconds base_max_out_of_order_duration() const {
+ return std::chrono::nanoseconds(
+ header_.message().max_out_of_order_duration());
+ }
+
protected:
// Structure with state per node about times and such.
struct NodeState {
@@ -271,7 +321,10 @@
// them with the arguments provided.
aos::SizePrefixedFlatbufferDetachedBuffer<LogFileHeader> MakeHeader(
size_t node_index, const std::vector<NewDataWriter::State> &state,
- const UUID &parts_uuid, int parts_index);
+ const UUID &parts_uuid, int parts_index,
+ std::chrono::nanoseconds max_out_of_order_duration,
+ const std::array<bool, static_cast<size_t>(StoredDataType::MAX) + 1>
+ &allowed_data_types);
EventLoop *event_loop_;
const Configuration *const configuration_;
@@ -451,33 +504,50 @@
LogBackend *log_backend() { return log_backend_.get(); }
const LogBackend *log_backend() const { return log_backend_.get(); }
- private:
- // Opens up a writer for timestamps forwarded back.
- void OpenForwardedTimestampWriter(const Channel *channel,
- NewDataWriter *data_writer);
+ // Returns the data writer or timestamp writer if we find one for the provided
+ // node.
+ NewDataWriter *FindNodeDataWriter(const Node *node, size_t max_message_size);
+ NewDataWriter *FindNodeTimestampWriter(const Node *node,
+ size_t max_message_size);
- // Opens up a writer for remote data.
- void OpenWriter(const Channel *channel, NewDataWriter *data_writer);
+ // Saves the data writer or timestamp writer for the provided node.
+ NewDataWriter *AddNodeDataWriter(const Node *node, NewDataWriter &&writer);
+ NewDataWriter *AddNodeTimestampWriter(const Node *node,
+ NewDataWriter &&writer);
- // Opens the main data writer file for this node responsible for data_writer_.
- void MakeDataWriter();
+ void CloseWriter(std::unique_ptr<DetachedBufferWriter> *writer_pointer);
void CreateBufferWriter(std::string_view path, size_t max_message_size,
std::unique_ptr<DetachedBufferWriter> *destination);
- void CloseWriter(std::unique_ptr<DetachedBufferWriter> *writer_pointer);
+ std::string extension_;
+
+ private:
+ // Opens up a writer for timestamps forwarded back.
+ void OpenForwardedTimestampWriter(const Node *source_node,
+ NewDataWriter *data_writer);
+
+ // Opens up a writer for remote data.
+ void OpenDataWriter(const Node *source_node, NewDataWriter *data_writer);
+ void OpenTimestampWriter(NewDataWriter *data_writer);
+
+ // Tracks the node in nodes_.
+ void NoticeNode(const Node *source_node);
// A version of std::accumulate which operates over all of our DataWriters.
template <typename T, typename BinaryOperation>
T accumulate_data_writers(T t, BinaryOperation op) const {
- for (const std::pair<const Channel *const, NewDataWriter> &data_writer :
- data_writers_) {
+ for (const std::pair<const Node *const, NewDataWriter> &data_writer :
+ node_data_writers_) {
if (data_writer.second.writer != nullptr) {
t = op(std::move(t), data_writer.second);
}
}
- if (data_writer_ != nullptr && data_writer_->writer != nullptr) {
- t = op(std::move(t), *data_writer_);
+ for (const std::pair<const Node *const, NewDataWriter> &data_writer :
+ node_timestamp_writers_) {
+ if (data_writer.second.writer != nullptr) {
+ t = op(std::move(t), data_writer.second);
+ }
}
return t;
}
@@ -488,7 +558,6 @@
std::vector<std::string> all_filenames_;
std::function<std::unique_ptr<DataEncoder>(size_t)> encoder_factory_;
- std::string extension_;
// Storage for statistics from previously-rotated DetachedBufferWriters.
std::chrono::nanoseconds max_write_time_ = std::chrono::nanoseconds::zero();
@@ -499,10 +568,10 @@
int total_write_messages_ = 0;
int total_write_bytes_ = 0;
- // File to write both delivery timestamps and local data to.
- std::unique_ptr<NewDataWriter> data_writer_;
-
- std::map<const Channel *, NewDataWriter> data_writers_;
+ // Data writer per remote node.
+ std::map<const Node *, NewDataWriter> node_data_writers_;
+ // Remote timestamp writers per node.
+ std::map<const Node *, NewDataWriter> node_timestamp_writers_;
};
// This is specialized log namer that deals with directory centric log events.
@@ -557,6 +626,30 @@
}
};
+// Log namer which dumps all the data for each node into a single file per
+// node. This is mostly interesting for testing.
+class MinimalFileMultiNodeLogNamer : public MultiNodeFilesLogNamer {
+ public:
+ MinimalFileMultiNodeLogNamer(std::string_view base_name,
+ EventLoop *event_loop)
+ : MultiNodeFilesLogNamer(base_name, event_loop) {}
+ MinimalFileMultiNodeLogNamer(std::string_view base_name,
+ const Configuration *configuration,
+ EventLoop *event_loop, const Node *node)
+ : MultiNodeFilesLogNamer(base_name, configuration, event_loop, node) {}
+
+ NewDataWriter *MakeWriter(const Channel *channel) override;
+
+ NewDataWriter *MakeForwardedTimestampWriter(const Channel *channel,
+ const Node *node) override;
+
+ NewDataWriter *MakeTimestampWriter(const Channel *channel) override;
+
+ private:
+ // Names and opens the data writer for the provided source node.
+ void OpenNodeWriter(const Node *source_node, NewDataWriter *data_writer);
+};
+
} // namespace logger
} // namespace aos
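As a rough usage sketch (hedged: Logger::StartLogging taking a unique_ptr to a log namer is assumed from the surrounding AOS API, and the base path is illustrative), the new namer plugs in like any other MultiNodeFilesLogNamer:

    // Minimal sketch: write one file per node, which is mostly useful in
    // tests. "/tmp/minimal_log" is an illustrative base path.
    aos::logger::Logger logger(event_loop);
    logger.StartLogging(
        std::make_unique<aos::logger::MinimalFileMultiNodeLogNamer>(
            "/tmp/minimal_log", event_loop));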
diff --git a/aos/events/logging/log_reader.cc b/aos/events/logging/log_reader.cc
index e3dd904..5b437ba 100644
--- a/aos/events/logging/log_reader.cc
+++ b/aos/events/logging/log_reader.cc
@@ -35,6 +35,19 @@
"false, CHECK.");
DECLARE_bool(timestamps_to_csv);
+DEFINE_bool(
+ enable_timestamp_loading, true,
+ "Enable loading all the timestamps into RAM at startup if they are in "
+ "separate files. This fixes any timestamp queueing problems for the cost "
+ "of storing timestamps in RAM only on logs with timestamps logged in "
+ "separate files from data. Only disable this if you are reading a really "
+ "long log file and are experiencing memory problems due to loading all the "
+ "timestamps into RAM.");
+DEFINE_bool(
+ force_timestamp_loading, false,
+ "Force loading all the timestamps into RAM at startup. This fixes any "
+ "timestamp queueing problems for the cost of storing timestamps in RAM and "
+ "potentially reading each log twice.");
DEFINE_bool(skip_order_validation, false,
"If true, ignore any out of orderness in replay");
@@ -73,91 +86,6 @@
namespace logger {
namespace {
-bool CompareChannels(const Channel *c,
- ::std::pair<std::string_view, std::string_view> p) {
- int name_compare = c->name()->string_view().compare(p.first);
- if (name_compare == 0) {
- return c->type()->string_view() < p.second;
- } else if (name_compare < 0) {
- return true;
- } else {
- return false;
- }
-}
-
-bool EqualsChannels(const Channel *c,
- ::std::pair<std::string_view, std::string_view> p) {
- return c->name()->string_view() == p.first &&
- c->type()->string_view() == p.second;
-}
-
-// Copies the channel, removing the schema as we go. If new_name is provided,
-// it is used instead of the name inside the channel. If new_type is provided,
-// it is used instead of the type in the channel.
-flatbuffers::Offset<Channel> CopyChannel(const Channel *c,
- std::string_view new_name,
- std::string_view new_type,
- flatbuffers::FlatBufferBuilder *fbb) {
- CHECK_EQ(Channel::MiniReflectTypeTable()->num_elems, 14u)
- << ": Merging logic needs to be updated when the number of channel "
- "fields changes.";
-
- flatbuffers::Offset<flatbuffers::String> name_offset =
- fbb->CreateSharedString(new_name.empty() ? c->name()->string_view()
- : new_name);
- flatbuffers::Offset<flatbuffers::String> type_offset =
- fbb->CreateSharedString(new_type.empty() ? c->type()->str() : new_type);
- flatbuffers::Offset<flatbuffers::String> source_node_offset =
- c->has_source_node() ? fbb->CreateSharedString(c->source_node()->str())
- : 0;
-
- flatbuffers::Offset<flatbuffers::Vector<flatbuffers::Offset<Connection>>>
- destination_nodes_offset =
- aos::RecursiveCopyVectorTable(c->destination_nodes(), fbb);
-
- flatbuffers::Offset<
- flatbuffers::Vector<flatbuffers::Offset<flatbuffers::String>>>
- logger_nodes_offset = aos::CopyVectorSharedString(c->logger_nodes(), fbb);
-
- Channel::Builder channel_builder(*fbb);
- channel_builder.add_name(name_offset);
- channel_builder.add_type(type_offset);
- if (c->has_frequency()) {
- channel_builder.add_frequency(c->frequency());
- }
- if (c->has_max_size()) {
- channel_builder.add_max_size(c->max_size());
- }
- if (c->has_num_senders()) {
- channel_builder.add_num_senders(c->num_senders());
- }
- if (c->has_num_watchers()) {
- channel_builder.add_num_watchers(c->num_watchers());
- }
- if (!source_node_offset.IsNull()) {
- channel_builder.add_source_node(source_node_offset);
- }
- if (!destination_nodes_offset.IsNull()) {
- channel_builder.add_destination_nodes(destination_nodes_offset);
- }
- if (c->has_logger()) {
- channel_builder.add_logger(c->logger());
- }
- if (!logger_nodes_offset.IsNull()) {
- channel_builder.add_logger_nodes(logger_nodes_offset);
- }
- if (c->has_read_method()) {
- channel_builder.add_read_method(c->read_method());
- }
- if (c->has_num_readers()) {
- channel_builder.add_num_readers(c->num_readers());
- }
- if (c->has_channel_storage_duration()) {
- channel_builder.add_channel_storage_duration(c->channel_storage_duration());
- }
- return channel_builder.Finish();
-}
-
namespace chrono = std::chrono;
using message_bridge::RemoteMessage;
} // namespace
@@ -259,13 +187,15 @@
const ReplayChannels *replay_channels)
: log_files_(std::move(log_files)),
replay_configuration_(replay_configuration),
- replay_channels_(replay_channels) {
+ replay_channels_(replay_channels),
+ config_remapper_(log_files_.config().get(), replay_configuration_,
+ replay_channels_) {
SetStartTime(FLAGS_start_time);
SetEndTime(FLAGS_end_time);
{
// Log files container validates that log files shared the same config.
- const Configuration *config = log_files_.config();
+ const Configuration *config = log_files_.config().get();
CHECK_NOTNULL(config);
}
@@ -274,95 +204,6 @@
"no messages will get replayed.";
}
- MakeRemappedConfig();
-
- // Remap all existing remote timestamp channels. They will be recreated, and
- // the data logged isn't relevant anymore.
- for (const Node *node : configuration::GetNodes(logged_configuration())) {
- message_bridge::ChannelTimestampFinder finder(logged_configuration(),
- "log_reader", node);
-
- absl::btree_set<std::string_view> remote_nodes;
-
- for (const Channel *channel : *logged_configuration()->channels()) {
- if (!configuration::ChannelIsSendableOnNode(channel, node)) {
- continue;
- }
- if (!channel->has_destination_nodes()) {
- continue;
- }
- for (const Connection *connection : *channel->destination_nodes()) {
- if (configuration::ConnectionDeliveryTimeIsLoggedOnNode(connection,
- node)) {
- // Start by seeing if the split timestamp channels are being used for
- // this message. If so, remap them.
- const Channel *timestamp_channel = configuration::GetChannel(
- logged_configuration(),
- finder.SplitChannelName(channel, connection),
- RemoteMessage::GetFullyQualifiedName(), "", node, true);
-
- if (timestamp_channel != nullptr) {
- // If for some reason a timestamp channel is not NOT_LOGGED (which
- // is unusual), then remap the channel so that the replayed channel
- // doesn't overlap with the special separate replay we do for
- // timestamps.
- if (timestamp_channel->logger() != LoggerConfig::NOT_LOGGED) {
- RemapLoggedChannel<RemoteMessage>(
- timestamp_channel->name()->string_view(), node);
- }
- continue;
- }
-
- // Otherwise collect this one up as a node to look for a combined
- // channel from. It is more efficient to compare nodes than channels.
- LOG(WARNING) << "Failed to find channel "
- << finder.SplitChannelName(channel, connection)
- << " on node " << aos::FlatbufferToJson(node);
- remote_nodes.insert(connection->name()->string_view());
- }
- }
- }
-
- std::vector<const Node *> timestamp_logger_nodes =
- configuration::TimestampNodes(logged_configuration(), node);
- for (const std::string_view remote_node : remote_nodes) {
- const std::string channel = finder.CombinedChannelName(remote_node);
-
- // See if the log file is an old log with MessageHeader channels in it, or
- // a newer log with RemoteMessage. If we find an older log, rename the
- // type too along with the name.
- if (HasChannel<MessageHeader>(channel, node)) {
- CHECK(!HasChannel<RemoteMessage>(channel, node))
- << ": Can't have both a MessageHeader and RemoteMessage remote "
- "timestamp channel.";
- // In theory, we should check NOT_LOGGED like RemoteMessage and be more
- // careful about updating the config, but there are fewer and fewer logs
- // with MessageHeader remote messages, so it isn't worth the effort.
- RemapLoggedChannel<MessageHeader>(channel, node, "/original",
- "aos.message_bridge.RemoteMessage");
- } else {
- CHECK(HasChannel<RemoteMessage>(channel, node))
- << ": Failed to find {\"name\": \"" << channel << "\", \"type\": \""
- << RemoteMessage::GetFullyQualifiedName() << "\"} for node "
- << node->name()->string_view();
- // Only bother to remap if there's something on the channel. We can
- // tell if the channel was marked NOT_LOGGED or not. This makes the
- // config not change un-necesarily when we replay a log with NOT_LOGGED
- // messages.
- if (HasLoggedChannel<RemoteMessage>(channel, node)) {
- RemapLoggedChannel<RemoteMessage>(channel, node);
- }
- }
- }
- }
-
- if (replay_configuration) {
- CHECK_EQ(configuration::MultiNode(configuration()),
- configuration::MultiNode(replay_configuration))
- << ": Log file and replay config need to both be multi or single "
- "node.";
- }
-
if (!configuration::MultiNode(configuration())) {
states_.resize(1);
} else {
@@ -392,19 +233,14 @@
LOG(FATAL) << "Must call Deregister before the SimulatedEventLoopFactory "
"is destroyed";
}
- // Zero out some buffers. It's easy to do use-after-frees on these, so make
- // it more obvious.
- if (remapped_configuration_buffer_) {
- remapped_configuration_buffer_->Wipe();
- }
}
const Configuration *LogReader::logged_configuration() const {
- return log_files_.config();
+ return config_remapper_.original_configuration();
}
const Configuration *LogReader::configuration() const {
- return remapped_configuration_;
+ return config_remapper_.remapped_configuration();
}
std::vector<const Node *> LogReader::LoggedNodes() const {
@@ -475,7 +311,7 @@
BootTimestamp>::PushResult result{
*message, queue_until >= last_queued_message_, false};
timestamp_mapper_->PopFront();
- SeedSortedMessages();
+ MaybeSeedSortedMessages();
return result;
},
time);
@@ -545,9 +381,6 @@
stopped_ = true;
started_ = true;
- if (message_queuer_.has_value()) {
- message_queuer_->StopPushing();
- }
}
std::vector<
@@ -601,13 +434,21 @@
void LogReader::RegisterWithoutStarting(
SimulatedEventLoopFactory *event_loop_factory) {
event_loop_factory_ = event_loop_factory;
- remapped_configuration_ = event_loop_factory_->configuration();
+ config_remapper_.set_configuration(event_loop_factory_->configuration());
+
+ const TimestampQueueStrategy timestamp_queue_strategy =
+ ComputeTimestampQueueStrategy();
+
filters_ =
std::make_unique<message_bridge::MultiNodeNoncausalOffsetEstimator>(
event_loop_factory_->configuration(), logged_configuration(),
log_files_.boots(), FLAGS_skip_order_validation,
- chrono::duration_cast<chrono::nanoseconds>(
- chrono::duration<double>(FLAGS_time_estimation_buffer_seconds)));
+ timestamp_queue_strategy ==
+ TimestampQueueStrategy::kQueueTimestampsAtStartup
+ ? chrono::seconds(0)
+ : chrono::duration_cast<chrono::nanoseconds>(
+ chrono::duration<double>(
+ FLAGS_time_estimation_buffer_seconds)));
std::vector<TimestampMapper *> timestamp_mappers;
for (const Node *node : configuration::GetNodes(configuration())) {
@@ -621,8 +462,10 @@
states_[node_index] = std::make_unique<State>(
!log_files_.ContainsPartsForNode(node_name)
? nullptr
- : std::make_unique<TimestampMapper>(node_name, log_files_),
- filters_.get(), std::bind(&LogReader::NoticeRealtimeEnd, this), node,
+ : std::make_unique<TimestampMapper>(node_name, log_files_,
+ timestamp_queue_strategy),
+ timestamp_queue_strategy, filters_.get(),
+ std::bind(&LogReader::NoticeRealtimeEnd, this), node,
State::ThreadedBuffering::kNo, MaybeMakeReplayChannelIndices(node),
before_send_callbacks_);
State *state = states_[node_index].get();
@@ -653,6 +496,14 @@
}
}
+ // Now that every state has a peer, load all the timestamps into RAM.
+ if (timestamp_queue_strategy ==
+ TimestampQueueStrategy::kQueueTimestampsAtStartup) {
+ for (std::unique_ptr<State> &state : states_) {
+ state->ReadTimestamps();
+ }
+ }
+
// Register after making all the State objects so we can build references
// between them.
for (const Node *node : configuration::GetNodes(configuration())) {
@@ -688,14 +539,14 @@
filters_->CheckGraph();
for (std::unique_ptr<State> &state : states_) {
- state->SeedSortedMessages();
+ state->MaybeSeedSortedMessages();
}
// Forwarding is tracked per channel. If it is enabled, we want to turn it
// off. Otherwise messages replayed will get forwarded across to the other
// nodes, and also replayed on the other nodes. This may not satisfy all
// our users, but it'll start the discussion.
- if (configuration::MultiNode(event_loop_factory_->configuration())) {
+ if (configuration::NodesCount(event_loop_factory_->configuration()) > 1u) {
for (size_t i = 0; i < logged_configuration()->channels()->size(); ++i) {
const Channel *channel = logged_configuration()->channels()->Get(i);
const Node *node = configuration::GetNode(
@@ -705,7 +556,7 @@
states_[configuration::GetNodeIndex(configuration(), node)].get();
const Channel *remapped_channel =
- RemapChannel(state->event_loop(), node, channel);
+ config_remapper_.RemapChannel(state->event_loop(), node, channel);
event_loop_factory_->DisableForwarding(remapped_channel);
}
@@ -738,7 +589,8 @@
CHECK(state);
VLOG(1) << "Start time is " << state->monotonic_start_time(0)
<< " for node '" << MaybeNodeName(state->node()) << "' now "
- << state->monotonic_now();
+ << (state->event_loop() != nullptr ? state->monotonic_now()
+ : monotonic_clock::min_time);
if (state->monotonic_start_time(0) == monotonic_clock::min_time) {
continue;
}
@@ -768,8 +620,9 @@
state->realtime_start_time(0));
}
VLOG(1) << "Start time is " << state->monotonic_start_time(0)
- << " for node '" << MaybeNodeName(state->event_loop()->node())
- << "' now " << state->monotonic_now();
+ << " for node '" << MaybeNodeName(state->node()) << "' now "
+ << (state->event_loop() != nullptr ? state->monotonic_now()
+ : monotonic_clock::min_time);
}
if (FLAGS_timestamps_to_csv) {
@@ -785,16 +638,33 @@
return nullptr;
}
+TimestampQueueStrategy LogReader::ComputeTimestampQueueStrategy() const {
+ if ((log_files_.TimestampsStoredSeparately() &&
+ FLAGS_enable_timestamp_loading) ||
+ FLAGS_force_timestamp_loading) {
+ return TimestampQueueStrategy::kQueueTimestampsAtStartup;
+ } else {
+ return TimestampQueueStrategy::kQueueTogether;
+ }
+}
+
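A short sketch of forcing the new strategy from user code; it assumes the caller has a matching DECLARE_bool(force_timestamp_loading) and sets the flag before Register() computes the strategy:

    // Sketch: load every timestamp into RAM at startup, even when the
    // timestamps are stored in the same files as the data.
    FLAGS_force_timestamp_loading = true;
    aos::logger::LogReader reader(aos::logger::SortParts(filenames));
    reader.Register();
    reader.event_loop_factory()->Run();
    reader.Deregister();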
// TODO(jkuszmaul): Make in-line modifications to
// ServerStatistics/ClientStatistics messages for ShmEventLoop-based replay to
// avoid messing up anything that depends on them having valid offsets.
void LogReader::Register(EventLoop *event_loop) {
+ const TimestampQueueStrategy timestamp_queue_strategy =
+ ComputeTimestampQueueStrategy();
+
filters_ =
std::make_unique<message_bridge::MultiNodeNoncausalOffsetEstimator>(
event_loop->configuration(), logged_configuration(),
log_files_.boots(), FLAGS_skip_order_validation,
- chrono::duration_cast<chrono::nanoseconds>(
- chrono::duration<double>(FLAGS_time_estimation_buffer_seconds)));
+ timestamp_queue_strategy ==
+ TimestampQueueStrategy::kQueueTimestampsAtStartup
+ ? chrono::seconds(0)
+ : chrono::duration_cast<chrono::nanoseconds>(
+ chrono::duration<double>(
+ FLAGS_time_estimation_buffer_seconds)));
std::vector<TimestampMapper *> timestamp_mappers;
for (const Node *node : configuration::GetNodes(configuration())) {
@@ -805,8 +675,10 @@
states_[node_index] = std::make_unique<State>(
!log_files_.ContainsPartsForNode(node_name)
? nullptr
- : std::make_unique<TimestampMapper>(node_name, log_files_),
- filters_.get(), std::bind(&LogReader::NoticeRealtimeEnd, this), node,
+ : std::make_unique<TimestampMapper>(node_name, log_files_,
+ timestamp_queue_strategy),
+ timestamp_queue_strategy, filters_.get(),
+ std::bind(&LogReader::NoticeRealtimeEnd, this), node,
State::ThreadedBuffering::kYes, MaybeMakeReplayChannelIndices(node),
before_send_callbacks_);
State *state = states_[node_index].get();
@@ -830,6 +702,16 @@
}
}
}
+
+ // Now that all the peers are added, we can buffer up all the timestamps if
+ // needed.
+ if (timestamp_queue_strategy ==
+ TimestampQueueStrategy::kQueueTimestampsAtStartup) {
+ for (std::unique_ptr<State> &state : states_) {
+ state->ReadTimestamps();
+ }
+ }
+
for (const Node *node : configuration::GetNodes(configuration())) {
if (node == nullptr || node->name()->string_view() ==
event_loop->node()->name()->string_view()) {
@@ -889,7 +771,7 @@
for (size_t logged_channel_index = 0;
logged_channel_index < logged_configuration()->channels()->size();
++logged_channel_index) {
- const Channel *channel = RemapChannel(
+ const Channel *channel = config_remapper_.RemapChannel(
event_loop, node,
logged_configuration()->channels()->Get(logged_channel_index));
@@ -1140,6 +1022,9 @@
"from one of the nodes it is logged on.";
}
}
+ // The log file is now done, prod the callbacks.
+ state->NotifyLogfileEnd();
+
// Now that we found the end of one channel, artificially stop the
// rest by setting the found_last_message bit. It is confusing when
// part of your data gets replayed but not all. The rest of them will
@@ -1225,7 +1110,7 @@
<< state->monotonic_now();
}));
- state->SeedSortedMessages();
+ state->MaybeSeedSortedMessages();
if (state->SingleThreadedOldestMessageTime() != BootTimestamp::max_time()) {
state->set_startup_timer(
@@ -1303,22 +1188,21 @@
// Checks if the specified channel name/type exists in the config and, depending
// on the value of conflict_handling, calls conflict_handler or just dies.
template <typename F>
-void CheckAndHandleRemapConflict(std::string_view new_name,
- std::string_view new_type,
- const Configuration *config,
- LogReader::RemapConflict conflict_handling,
- F conflict_handler) {
+void CheckAndHandleRemapConflict(
+ std::string_view new_name, std::string_view new_type,
+ const Configuration *config,
+ ConfigRemapper::RemapConflict conflict_handling, F conflict_handler) {
const Channel *existing_channel =
configuration::GetChannel(config, new_name, new_type, "", nullptr, true);
if (existing_channel != nullptr) {
switch (conflict_handling) {
- case LogReader::RemapConflict::kDisallow:
+ case ConfigRemapper::RemapConflict::kDisallow:
LOG(FATAL)
<< "Channel "
<< configuration::StrippedChannelToString(existing_channel)
<< " is already used--you can't remap a logged channel to it.";
break;
- case LogReader::RemapConflict::kCascade:
+ case ConfigRemapper::RemapConflict::kCascade:
LOG(INFO) << "Automatically remapping "
<< configuration::StrippedChannelToString(existing_channel)
<< " to avoid conflicts.";
@@ -1329,88 +1213,29 @@
}
} // namespace
-void LogReader::RemapLoggedChannel(std::string_view name, std::string_view type,
- std::string_view add_prefix,
- std::string_view new_type,
- RemapConflict conflict_handling) {
- RemapLoggedChannel(name, type, nullptr, add_prefix, new_type,
- conflict_handling);
+void LogReader::RemapLoggedChannel(
+ std::string_view name, std::string_view type, std::string_view add_prefix,
+ std::string_view new_type,
+ ConfigRemapper::RemapConflict conflict_handling) {
+ CheckEventsAreNotScheduled();
+ config_remapper_.RemapOriginalChannel(name, type, nullptr, add_prefix,
+ new_type, conflict_handling);
}
-void LogReader::RemapLoggedChannel(std::string_view name, std::string_view type,
- const Node *node,
- std::string_view add_prefix,
- std::string_view new_type,
- RemapConflict conflict_handling) {
- if (node != nullptr) {
- VLOG(1) << "Node is " << aos::FlatbufferToJson(node);
- }
- if (replay_channels_ != nullptr) {
- CHECK(std::find(replay_channels_->begin(), replay_channels_->end(),
- std::make_pair(std::string{name}, std::string{type})) !=
- replay_channels_->end())
- << "Attempted to remap channel " << name << " " << type
- << " which is not included in the replay channels passed to LogReader.";
- }
- const Channel *remapped_channel =
- configuration::GetChannel(logged_configuration(), name, type, "", node);
- CHECK(remapped_channel != nullptr) << ": Failed to find {\"name\": \"" << name
- << "\", \"type\": \"" << type << "\"}";
- VLOG(1) << "Original {\"name\": \"" << name << "\", \"type\": \"" << type
- << "\"}";
- VLOG(1) << "Remapped "
- << aos::configuration::StrippedChannelToString(remapped_channel);
-
- // We want to make /spray on node 0 go to /0/spray by snooping the maps. And
- // we want it to degrade if the heuristics fail to just work.
- //
- // The easiest way to do this is going to be incredibly specific and verbose.
- // Look up /spray, to /0/spray. Then, prefix the result with /original to get
- // /original/0/spray. Then, create a map from /original/spray to
- // /original/0/spray for just the type we were asked for.
- if (name != remapped_channel->name()->string_view()) {
- MapT new_map;
- new_map.match = std::make_unique<ChannelT>();
- new_map.match->name = absl::StrCat(add_prefix, name);
- new_map.match->type = type;
- if (node != nullptr) {
- new_map.match->source_node = node->name()->str();
- }
- new_map.rename = std::make_unique<ChannelT>();
- new_map.rename->name =
- absl::StrCat(add_prefix, remapped_channel->name()->string_view());
- maps_.emplace_back(std::move(new_map));
- }
-
- // Then remap the logged channel to the prefixed channel.
- const size_t channel_index =
- configuration::ChannelIndex(logged_configuration(), remapped_channel);
- CHECK_EQ(0u, remapped_channels_.count(channel_index))
- << "Already remapped channel "
- << configuration::CleanedChannelToString(remapped_channel);
-
- RemappedChannel remapped_channel_struct;
- remapped_channel_struct.remapped_name =
- std::string(add_prefix) +
- std::string(remapped_channel->name()->string_view());
- remapped_channel_struct.new_type = new_type;
- const std::string_view remapped_type = new_type.empty() ? type : new_type;
- CheckAndHandleRemapConflict(
- remapped_channel_struct.remapped_name, remapped_type,
- remapped_configuration_, conflict_handling,
- [this, &remapped_channel_struct, remapped_type, node, add_prefix,
- conflict_handling]() {
- RemapLoggedChannel(remapped_channel_struct.remapped_name, remapped_type,
- node, add_prefix, "", conflict_handling);
- });
- remapped_channels_[channel_index] = std::move(remapped_channel_struct);
- MakeRemappedConfig();
+void LogReader::RemapLoggedChannel(
+ std::string_view name, std::string_view type, const Node *node,
+ std::string_view add_prefix, std::string_view new_type,
+ ConfigRemapper::RemapConflict conflict_handling) {
+ CheckEventsAreNotScheduled();
+ config_remapper_.RemapOriginalChannel(name, type, node, add_prefix, new_type,
+ conflict_handling);
}
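For context, the public remapping behavior is unchanged by the move into ConfigRemapper; a typical call still looks like this (the channel name and message type are illustrative):

    aos::logger::LogReader reader(aos::logger::SortParts(filenames));
    // Replay /pong under /original/pong so a live application can own /pong.
    reader.RemapLoggedChannel<aos::examples::Pong>("/pong");
    reader.Register();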
void LogReader::RenameLoggedChannel(const std::string_view name,
const std::string_view type,
const std::string_view new_name,
const std::vector<MapT> &add_maps) {
+ CheckEventsAreNotScheduled();
RenameLoggedChannel(name, type, nullptr, new_name, add_maps);
}
@@ -1419,284 +1244,17 @@
const Node *const node,
const std::string_view new_name,
const std::vector<MapT> &add_maps) {
- if (node != nullptr) {
- VLOG(1) << "Node is " << aos::FlatbufferToJson(node);
- }
- // First find the channel and rename it.
- const Channel *remapped_channel =
- configuration::GetChannel(logged_configuration(), name, type, "", node);
- CHECK(remapped_channel != nullptr) << ": Failed to find {\"name\": \"" << name
- << "\", \"type\": \"" << type << "\"}";
- VLOG(1) << "Original {\"name\": \"" << name << "\", \"type\": \"" << type
- << "\"}";
- VLOG(1) << "Remapped "
- << aos::configuration::StrippedChannelToString(remapped_channel);
-
- const size_t channel_index =
- configuration::ChannelIndex(logged_configuration(), remapped_channel);
- CHECK_EQ(0u, remapped_channels_.count(channel_index))
- << "Already remapped channel "
- << configuration::CleanedChannelToString(remapped_channel);
-
- RemappedChannel remapped_channel_struct;
- remapped_channel_struct.remapped_name = new_name;
- remapped_channel_struct.new_type.clear();
- remapped_channels_[channel_index] = std::move(remapped_channel_struct);
-
- // Then add any provided maps.
- for (const MapT &map : add_maps) {
- maps_.push_back(map);
- }
-
- // Finally rewrite the config.
- MakeRemappedConfig();
+ CheckEventsAreNotScheduled();
+ config_remapper_.RenameOriginalChannel(name, type, node, new_name, add_maps);
}
-void LogReader::MakeRemappedConfig() {
+void LogReader::CheckEventsAreNotScheduled() {
for (std::unique_ptr<State> &state : states_) {
if (state) {
CHECK(!state->event_loop())
<< ": Can't change the mapping after the events are scheduled.";
}
}
-
- // If no remapping occurred and we are using the original config, then there
- // is nothing interesting to do here.
- if (remapped_channels_.empty() && replay_configuration_ == nullptr) {
- remapped_configuration_ = logged_configuration();
- return;
- }
- // Config to copy Channel definitions from. Use the specified
- // replay_configuration_ if it has been provided.
- const Configuration *const base_config = replay_configuration_ == nullptr
- ? logged_configuration()
- : replay_configuration_;
-
- // Create a config with all the channels, but un-sorted/merged. Collect up
- // the schemas while we do this. Call MergeConfiguration to sort everything,
- // and then merge it all in together.
-
- // This is the builder that we use for the config containing all the new
- // channels.
- flatbuffers::FlatBufferBuilder fbb;
- fbb.ForceDefaults(true);
- std::vector<flatbuffers::Offset<Channel>> channel_offsets;
-
- CHECK_EQ(Channel::MiniReflectTypeTable()->num_elems, 14u)
- << ": Merging logic needs to be updated when the number of channel "
- "fields changes.";
-
- // List of schemas.
- std::map<std::string_view, FlatbufferVector<reflection::Schema>> schema_map;
- // Make sure our new RemoteMessage schema is in there for old logs without it.
- schema_map.insert(std::make_pair(
- RemoteMessage::GetFullyQualifiedName(),
- FlatbufferVector<reflection::Schema>(FlatbufferSpan<reflection::Schema>(
- message_bridge::RemoteMessageSchema()))));
-
- // Reconstruct the remapped channels.
- for (auto &pair : remapped_channels_) {
- const Channel *const c = CHECK_NOTNULL(configuration::GetChannel(
- base_config, logged_configuration()->channels()->Get(pair.first), "",
- nullptr));
- channel_offsets.emplace_back(
- CopyChannel(c, pair.second.remapped_name, "", &fbb));
-
- if (c->has_destination_nodes()) {
- for (const Connection *connection : *c->destination_nodes()) {
- switch (connection->timestamp_logger()) {
- case LoggerConfig::LOCAL_LOGGER:
- case LoggerConfig::NOT_LOGGED:
- // There is no timestamp channel associated with this, so ignore it.
- break;
-
- case LoggerConfig::REMOTE_LOGGER:
- case LoggerConfig::LOCAL_AND_REMOTE_LOGGER:
- // We want to make a split timestamp channel regardless of what type
- // of log this used to be. No sense propagating the single
- // timestamp channel.
-
- CHECK(connection->has_timestamp_logger_nodes());
- for (const flatbuffers::String *timestamp_logger_node :
- *connection->timestamp_logger_nodes()) {
- const Node *node = configuration::GetNode(
- logged_configuration(), timestamp_logger_node->string_view());
- message_bridge::ChannelTimestampFinder finder(
- logged_configuration(), "log_reader", node);
-
- // We are assuming here that all the maps are setup correctly to
- // handle arbitrary timestamps. Apply the maps for this node to
- // see what name this ends up with.
- std::string name = finder.SplitChannelName(
- pair.second.remapped_name, c->type()->str(), connection);
- std::string unmapped_name = name;
- configuration::HandleMaps(logged_configuration()->maps(), &name,
- "aos.message_bridge.RemoteMessage",
- node);
- CHECK_NE(name, unmapped_name)
- << ": Remote timestamp channel was not remapped, this is "
- "very fishy";
- flatbuffers::Offset<flatbuffers::String> channel_name_offset =
- fbb.CreateString(name);
- flatbuffers::Offset<flatbuffers::String> channel_type_offset =
- fbb.CreateString("aos.message_bridge.RemoteMessage");
- flatbuffers::Offset<flatbuffers::String> source_node_offset =
- fbb.CreateString(timestamp_logger_node->string_view());
-
- // Now, build a channel. Don't log it, 2 senders, and match the
- // source frequency.
- Channel::Builder channel_builder(fbb);
- channel_builder.add_name(channel_name_offset);
- channel_builder.add_type(channel_type_offset);
- channel_builder.add_source_node(source_node_offset);
- channel_builder.add_logger(LoggerConfig::NOT_LOGGED);
- channel_builder.add_num_senders(2);
- if (c->has_frequency()) {
- channel_builder.add_frequency(c->frequency());
- }
- if (c->has_channel_storage_duration()) {
- channel_builder.add_channel_storage_duration(
- c->channel_storage_duration());
- }
- channel_offsets.emplace_back(channel_builder.Finish());
- }
- break;
- }
- }
- }
- }
-
- // Now reconstruct the original channels, translating types as needed
- for (const Channel *c : *base_config->channels()) {
- // Search for a mapping channel.
- std::string_view new_type = "";
- for (auto &pair : remapped_channels_) {
- const Channel *const remapped_channel =
- logged_configuration()->channels()->Get(pair.first);
- if (remapped_channel->name()->string_view() == c->name()->string_view() &&
- remapped_channel->type()->string_view() == c->type()->string_view()) {
- new_type = pair.second.new_type;
- break;
- }
- }
-
- // Copy everything over.
- channel_offsets.emplace_back(CopyChannel(c, "", new_type, &fbb));
-
- // Add the schema if it doesn't exist.
- if (schema_map.find(c->type()->string_view()) == schema_map.end()) {
- CHECK(c->has_schema());
- schema_map.insert(std::make_pair(c->type()->string_view(),
- RecursiveCopyFlatBuffer(c->schema())));
- }
- }
-
- // The MergeConfiguration API takes a vector, not a map. Convert.
- std::vector<FlatbufferVector<reflection::Schema>> schemas;
- while (!schema_map.empty()) {
- schemas.emplace_back(std::move(schema_map.begin()->second));
- schema_map.erase(schema_map.begin());
- }
-
- // Create the Configuration containing the new channels that we want to add.
- const flatbuffers::Offset<flatbuffers::Vector<flatbuffers::Offset<Channel>>>
- channels_offset =
- channel_offsets.empty() ? 0 : fbb.CreateVector(channel_offsets);
-
- // Copy over the old maps.
- std::vector<flatbuffers::Offset<Map>> map_offsets;
- if (base_config->maps()) {
- for (const Map *map : *base_config->maps()) {
- map_offsets.emplace_back(aos::RecursiveCopyFlatBuffer(map, &fbb));
- }
- }
-
- // Now create the new maps. These are second so they take effect first.
- for (const MapT &map : maps_) {
- CHECK(!map.match->name.empty());
- const flatbuffers::Offset<flatbuffers::String> match_name_offset =
- fbb.CreateString(map.match->name);
- flatbuffers::Offset<flatbuffers::String> match_type_offset;
- if (!map.match->type.empty()) {
- match_type_offset = fbb.CreateString(map.match->type);
- }
- flatbuffers::Offset<flatbuffers::String> match_source_node_offset;
- if (!map.match->source_node.empty()) {
- match_source_node_offset = fbb.CreateString(map.match->source_node);
- }
- CHECK(!map.rename->name.empty());
- const flatbuffers::Offset<flatbuffers::String> rename_name_offset =
- fbb.CreateString(map.rename->name);
- Channel::Builder match_builder(fbb);
- match_builder.add_name(match_name_offset);
- if (!match_type_offset.IsNull()) {
- match_builder.add_type(match_type_offset);
- }
- if (!match_source_node_offset.IsNull()) {
- match_builder.add_source_node(match_source_node_offset);
- }
- const flatbuffers::Offset<Channel> match_offset = match_builder.Finish();
-
- Channel::Builder rename_builder(fbb);
- rename_builder.add_name(rename_name_offset);
- const flatbuffers::Offset<Channel> rename_offset = rename_builder.Finish();
-
- Map::Builder map_builder(fbb);
- map_builder.add_match(match_offset);
- map_builder.add_rename(rename_offset);
- map_offsets.emplace_back(map_builder.Finish());
- }
-
- flatbuffers::Offset<flatbuffers::Vector<flatbuffers::Offset<Map>>>
- maps_offsets = map_offsets.empty() ? 0 : fbb.CreateVector(map_offsets);
-
- // And copy everything else over.
- flatbuffers::Offset<flatbuffers::Vector<flatbuffers::Offset<Node>>>
- nodes_offset = aos::RecursiveCopyVectorTable(base_config->nodes(), &fbb);
-
- flatbuffers::Offset<flatbuffers::Vector<flatbuffers::Offset<Application>>>
- applications_offset =
- aos::RecursiveCopyVectorTable(base_config->applications(), &fbb);
-
- // Now insert everything else in unmodified.
- ConfigurationBuilder configuration_builder(fbb);
- if (!channels_offset.IsNull()) {
- configuration_builder.add_channels(channels_offset);
- }
- if (!maps_offsets.IsNull()) {
- configuration_builder.add_maps(maps_offsets);
- }
- if (!nodes_offset.IsNull()) {
- configuration_builder.add_nodes(nodes_offset);
- }
- if (!applications_offset.IsNull()) {
- configuration_builder.add_applications(applications_offset);
- }
-
- if (base_config->has_channel_storage_duration()) {
- configuration_builder.add_channel_storage_duration(
- base_config->channel_storage_duration());
- }
-
- CHECK_EQ(Configuration::MiniReflectTypeTable()->num_elems, 6u)
- << ": Merging logic needs to be updated when the number of configuration "
- "fields changes.";
-
- fbb.Finish(configuration_builder.Finish());
-
- // Clean it up and return it! By using MergeConfiguration here, we'll
- // actually get a deduplicated config for free too.
- FlatbufferDetachedBuffer<Configuration> new_merged_config =
- configuration::MergeConfiguration(
- FlatbufferDetachedBuffer<Configuration>(fbb.Release()));
-
- remapped_configuration_buffer_ =
- std::make_unique<FlatbufferDetachedBuffer<Configuration>>(
- configuration::MergeConfiguration(new_merged_config, schemas));
-
- remapped_configuration_ = &remapped_configuration_buffer_->message();
-
- // TODO(austin): Lazily re-build to save CPU?
}
std::unique_ptr<const ReplayChannelIndices>
@@ -1725,57 +1283,18 @@
}
std::vector<const Channel *> LogReader::RemappedChannels() const {
- std::vector<const Channel *> result;
- result.reserve(remapped_channels_.size());
- for (auto &pair : remapped_channels_) {
- const Channel *const logged_channel =
- CHECK_NOTNULL(logged_configuration()->channels()->Get(pair.first));
-
- auto channel_iterator = std::lower_bound(
- remapped_configuration_->channels()->cbegin(),
- remapped_configuration_->channels()->cend(),
- std::make_pair(std::string_view(pair.second.remapped_name),
- logged_channel->type()->string_view()),
- CompareChannels);
-
- CHECK(channel_iterator != remapped_configuration_->channels()->cend());
- CHECK(EqualsChannels(
- *channel_iterator,
- std::make_pair(std::string_view(pair.second.remapped_name),
- logged_channel->type()->string_view())));
- result.push_back(*channel_iterator);
- }
- return result;
+ return config_remapper_.RemappedChannels();
}
const Channel *LogReader::RemapChannel(const EventLoop *event_loop,
const Node *node,
const Channel *channel) {
- std::string_view channel_name = channel->name()->string_view();
- std::string_view channel_type = channel->type()->string_view();
- const int channel_index =
- configuration::ChannelIndex(logged_configuration(), channel);
- // If the channel is remapped, find the correct channel name to use.
- if (remapped_channels_.count(channel_index) > 0) {
- VLOG(3) << "Got remapped channel on "
- << configuration::CleanedChannelToString(channel);
- channel_name = remapped_channels_[channel_index].remapped_name;
- }
-
- VLOG(2) << "Going to remap channel " << channel_name << " " << channel_type;
- const Channel *remapped_channel = configuration::GetChannel(
- configuration(), channel_name, channel_type,
- event_loop ? event_loop->name() : "log_reader", node);
-
- CHECK(remapped_channel != nullptr)
- << ": Unable to send {\"name\": \"" << channel_name << "\", \"type\": \""
- << channel_type << "\"} because it is not in the provided configuration.";
-
- return remapped_channel;
+ return config_remapper_.RemapChannel(event_loop, node, channel);
}
LogReader::State::State(
std::unique_ptr<TimestampMapper> timestamp_mapper,
+ TimestampQueueStrategy timestamp_queue_strategy,
message_bridge::MultiNodeNoncausalOffsetEstimator *multinode_filters,
std::function<void()> notice_realtime_end, const Node *node,
LogReader::State::ThreadedBuffering threading,
@@ -1783,6 +1302,7 @@
const std::vector<std::function<void(void *message)>>
&before_send_callbacks)
: timestamp_mapper_(std::move(timestamp_mapper)),
+ timestamp_queue_strategy_(timestamp_queue_strategy),
notice_realtime_end_(notice_realtime_end),
node_(node),
multinode_filters_(multinode_filters),
@@ -2247,6 +1767,10 @@
message.value().monotonic_event_time +
std::chrono::duration_cast<std::chrono::nanoseconds>(
std::chrono::duration<double>(FLAGS_threaded_look_ahead_seconds)));
+ VLOG(1) << "Popped " << message.value()
+ << configuration::CleanedChannelToString(
+ event_loop_->configuration()->channels()->Get(
+ factory_channel_index_[message->channel_index]));
return message.value();
} else { // single threaded
CHECK(timestamp_mapper_ != nullptr);
@@ -2258,7 +1782,7 @@
VLOG(2) << "Node '" << MaybeNodeName(event_loop_->node())
<< "': PopOldest Popping " << result.monotonic_event_time;
timestamp_mapper_->PopFront();
- SeedSortedMessages();
+ MaybeSeedSortedMessages();
CHECK_EQ(result.monotonic_event_time.boot, boot_count());
@@ -2305,8 +1829,27 @@
return result_ptr->monotonic_event_time;
}
-void LogReader::State::SeedSortedMessages() {
+void LogReader::State::ReadTimestamps() {
if (!timestamp_mapper_) return;
+ timestamp_mapper_->QueueTimestamps();
+
+ // TODO(austin): Maybe make timestamp mapper do this so we don't need to clear
+ // it out?
+ //
+ // If we don't clear the timestamp callback, we end up getting a callback both
+ // when SplitTimestampBootMerger sees the timestamp, and when TimestampMapper
+ // sees it.
+ timestamp_mapper_->set_timestamp_callback([](TimestampedMessage *) {});
+}
+
+void LogReader::State::MaybeSeedSortedMessages() {
+ if (!timestamp_mapper_) return;
+
+ // The whole purpose of seeding is to make timestamp_mapper load timestamps.
+ // So if we have already loaded them, skip seeding.
+ if (timestamp_queue_strategy_ ==
+ TimestampQueueStrategy::kQueueTimestampsAtStartup)
+ return;
timestamp_mapper_->QueueFor(chrono::duration_cast<chrono::seconds>(
chrono::duration<double>(FLAGS_time_estimation_buffer_seconds)));
@@ -2413,6 +1956,10 @@
SetFoundLastMessage(true);
CHECK(notice_realtime_end_);
notice_realtime_end_();
+
+ if (message_queuer_.has_value()) {
+ message_queuer_->StopPushing();
+ }
}
}
diff --git a/aos/events/logging/log_reader.h b/aos/events/logging/log_reader.h
index 0486665..d36db70 100644
--- a/aos/events/logging/log_reader.h
+++ b/aos/events/logging/log_reader.h
@@ -15,9 +15,11 @@
#include "aos/condition.h"
#include "aos/events/event_loop.h"
#include "aos/events/event_loop_tmpl.h"
+#include "aos/events/logging/config_remapper.h"
#include "aos/events/logging/logfile_sorting.h"
#include "aos/events/logging/logfile_utils.h"
#include "aos/events/logging/logger_generated.h"
+#include "aos/events/logging/replay_channels.h"
#include "aos/events/logging/replay_timing_generated.h"
#include "aos/events/shm_event_loop.h"
#include "aos/events/simulated_event_loop.h"
@@ -35,11 +37,6 @@
class EventNotifier;
-// Vector of pair of name and type of the channel
-using ReplayChannels = std::vector<std::pair<std::string, std::string>>;
-// Vector of channel indices
-using ReplayChannelIndices = std::vector<size_t>;
-
// We end up with one of the following 3 log file types.
//
// Single node logged as the source node.
@@ -217,34 +214,21 @@
void SetEndTime(std::string end_time);
void SetEndTime(realtime_clock::time_point end_time);
- // Enum to use for indicating how RemapLoggedChannel behaves when there is
- // already a channel with the remapped name (e.g., as may happen when
- // replaying a logfile that was itself generated from replay).
- enum class RemapConflict {
- // LOG(FATAL) on conflicts in remappings.
- kDisallow,
- // If we run into a conflict, attempt to remap the channel we would be
- // overriding (and continue to do so if remapping *that* channel also
- // generates a conflict).
- // This will mean that if we repeatedly replay a log, we will end up
- // stacking more and more /original's on the start of the oldest version
- // of the channels.
- kCascade
- };
-
// Causes the logger to publish the provided channel on a different name so
// that replayed applications can publish on the proper channel name without
// interference. This operates on raw channel names, without any node or
// application specific mappings.
- void RemapLoggedChannel(
- std::string_view name, std::string_view type,
- std::string_view add_prefix = "/original", std::string_view new_type = "",
- RemapConflict conflict_handling = RemapConflict::kCascade);
+ void RemapLoggedChannel(std::string_view name, std::string_view type,
+ std::string_view add_prefix = "/original",
+ std::string_view new_type = "",
+ ConfigRemapper::RemapConflict conflict_handling =
+ ConfigRemapper::RemapConflict::kCascade);
template <typename T>
- void RemapLoggedChannel(
- std::string_view name, std::string_view add_prefix = "/original",
- std::string_view new_type = "",
- RemapConflict conflict_handling = RemapConflict::kCascade) {
+ void RemapLoggedChannel(std::string_view name,
+ std::string_view add_prefix = "/original",
+ std::string_view new_type = "",
+ ConfigRemapper::RemapConflict conflict_handling =
+ ConfigRemapper::RemapConflict::kCascade) {
RemapLoggedChannel(name, T::GetFullyQualifiedName(), add_prefix, new_type,
conflict_handling);
}
@@ -257,15 +241,18 @@
// TODO(austin): If you have 2 nodes remapping something to the same channel,
// this doesn't handle that. No use cases exist yet for that, so it isn't
// being done yet.
- void RemapLoggedChannel(
- std::string_view name, std::string_view type, const Node *node,
- std::string_view add_prefix = "/original", std::string_view new_type = "",
- RemapConflict conflict_handling = RemapConflict::kCascade);
+ void RemapLoggedChannel(std::string_view name, std::string_view type,
+ const Node *node,
+ std::string_view add_prefix = "/original",
+ std::string_view new_type = "",
+ ConfigRemapper::RemapConflict conflict_handling =
+ ConfigRemapper::RemapConflict::kCascade);
template <typename T>
- void RemapLoggedChannel(
- std::string_view name, const Node *node,
- std::string_view add_prefix = "/original", std::string_view new_type = "",
- RemapConflict conflict_handling = RemapConflict::kCascade) {
+ void RemapLoggedChannel(std::string_view name, const Node *node,
+ std::string_view add_prefix = "/original",
+ std::string_view new_type = "",
+ ConfigRemapper::RemapConflict conflict_handling =
+ ConfigRemapper::RemapConflict::kCascade) {
RemapLoggedChannel(name, T::GetFullyQualifiedName(), node, add_prefix,
new_type, conflict_handling);
}
@@ -325,11 +312,7 @@
// Returns true if the channel exists on the node and was logged.
template <typename T>
bool HasLoggedChannel(std::string_view name, const Node *node = nullptr) {
- const Channel *channel =
- configuration::GetChannel(logged_configuration(), name,
- T::GetFullyQualifiedName(), "", node, true);
- if (channel == nullptr) return false;
- return channel->logger() != LoggerConfig::NOT_LOGGED;
+ return config_remapper_.HasOriginalChannel<T>(name, node);
}
// Returns a list of all the original channels from remapping.
@@ -407,9 +390,10 @@
// Queues at least max_out_of_order_duration_ messages into channels_.
void QueueMessages();
- // Handle constructing a configuration with all the additional remapped
- // channels from calls to RemapLoggedChannel.
- void MakeRemappedConfig();
+
+ // Checks if any states have their event loops initialized which indicates
+ // events have been scheduled
+ void CheckEventsAreNotScheduled();
// Returns the number of nodes.
size_t nodes_count() const {
@@ -475,6 +459,7 @@
// details.
enum class ThreadedBuffering { kYes, kNo };
State(std::unique_ptr<TimestampMapper> timestamp_mapper,
+ TimestampQueueStrategy timestamp_queue_strategy,
message_bridge::MultiNodeNoncausalOffsetEstimator *multinode_filters,
std::function<void()> notice_realtime_end, const Node *node,
ThreadedBuffering threading,
@@ -514,17 +499,36 @@
return node_event_loop_factory_->boot_count();
}
+ // Reads all the timestamps into RAM so we don't need to manage buffering
+ // them. For logs where the timestamps are in separate files, this
+ // minimizes RAM usage in the cases where the log reader decides to buffer
+ // to the end of the file, or where the time estimation buffer needs to be
+ // set high to sort. This means we devote our RAM to holding lots of
+ // timestamps instead of timestamps and much larger data for a shorter
+ // period. For logs where timestamps are stored with the data, this
+ // triggers those files to be read twice.
+ void ReadTimestamps();
+
// Primes the queues inside State. Should be called before calling
// OldestMessageTime.
- void SeedSortedMessages();
+ void MaybeSeedSortedMessages();
void SetUpStartupTimer() {
const monotonic_clock::time_point start_time =
monotonic_start_time(boot_count());
if (start_time == monotonic_clock::min_time) {
- LOG(ERROR)
- << "No start time, skipping, please figure out when this happens";
- NotifyLogfileStart();
+ if (event_loop_->node()) {
+ LOG(ERROR) << "No start time for "
+ << event_loop_->node()->name()->string_view()
+ << ", skipping.";
+ } else {
+ LOG(ERROR) << "No start time, skipping.";
+ }
+
+ // This is called from OnRun. There is too much complexity in supporting
+ // OnStartup callbacks from inside OnRun. Instead, schedule a timer for
+ // "now", and have that do what we need.
+ startup_timer_->Schedule(event_loop_->monotonic_now());
return;
}
if (node_event_loop_factory_) {
@@ -757,6 +761,7 @@
void SendMessageTimings();
// Log file.
std::unique_ptr<TimestampMapper> timestamp_mapper_;
+ const TimestampQueueStrategy timestamp_queue_strategy_;
// Senders.
std::vector<std::unique_ptr<RawSender>> channels_;
@@ -907,26 +912,16 @@
message_bridge::NoncausalOffsetEstimator *GetFilter(const Node *node_a,
const Node *node_b);
+ // Returns the timestamp queueing strategy to use.
+ TimestampQueueStrategy ComputeTimestampQueueStrategy() const;
+
// List of filters for a connection. The pointer to the first node will be
// less than the second node.
std::unique_ptr<message_bridge::MultiNodeNoncausalOffsetEstimator> filters_;
- std::unique_ptr<FlatbufferDetachedBuffer<Configuration>>
- remapped_configuration_buffer_;
-
std::unique_ptr<SimulatedEventLoopFactory> event_loop_factory_unique_ptr_;
SimulatedEventLoopFactory *event_loop_factory_ = nullptr;
- // Map of channel indices to new name. The channel index will be an index into
- // logged_configuration(), and the string key will be the name of the channel
- // to send on instead of the logged channel name.
- struct RemappedChannel {
- std::string remapped_name;
- std::string new_type;
- };
- std::map<size_t, RemappedChannel> remapped_channels_;
- std::vector<MapT> maps_;
-
// Number of nodes which still have data to send. This is used to figure out
// when to exit.
size_t live_nodes_ = 0;
@@ -935,7 +930,6 @@
// running and have yet to hit the realtime end time, if any.
size_t live_nodes_with_realtime_time_end_ = 0;
- const Configuration *remapped_configuration_ = nullptr;
const Configuration *replay_configuration_ = nullptr;
// If a ReplayChannels was passed to LogReader, this will hold the
@@ -956,6 +950,7 @@
realtime_clock::time_point start_time_ = realtime_clock::min_time;
realtime_clock::time_point end_time_ = realtime_clock::max_time;
+ ConfigRemapper config_remapper_;
};
} // namespace logger
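Since the ReplayChannels alias now lives in replay_channels.h, here is a hedged sketch of limiting replay to a single channel; the constructor ordering follows the LogReader constructor earlier in this change, and the channel name and type are illustrative:

    #include "aos/events/logging/replay_channels.h"

    // Sketch: only replay aos.examples.Ping messages sent on /test.
    aos::logger::ReplayChannels channels{{"/test", "aos.examples.Ping"}};
    aos::logger::LogReader reader(aos::logger::SortParts(filenames),
                                  /*replay_configuration=*/nullptr, &channels);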
diff --git a/aos/events/logging/log_reader_utils_test.cc b/aos/events/logging/log_reader_utils_test.cc
index b61c9de..9cd64f2 100644
--- a/aos/events/logging/log_reader_utils_test.cc
+++ b/aos/events/logging/log_reader_utils_test.cc
@@ -17,7 +17,9 @@
All, MultinodeLoggerOneConfigTest,
::testing::Combine(::testing::Values(ConfigParams{
"multinode_pingpong_combined_config.json", true,
- kCombinedConfigSha1(), kCombinedConfigSha1()}),
+ kCombinedConfigSha1(), kCombinedConfigSha1(),
+ FileStrategy::kCombine,
+ ForceTimestampBuffering::kForceBufferTimestamps}),
::testing::ValuesIn(SupportedCompressionAlgorithms())));
// This test is to check if we are able to get the right channels from a log
@@ -91,7 +93,8 @@
{
LoggerState pi1_logger = MakeLoggerState(
- pi1_, &event_loop_factory_, SupportedCompressionAlgorithms()[0]);
+ pi1_, &event_loop_factory_, SupportedCompressionAlgorithms()[0],
+ FileStrategy::kKeepSeparate);
pi2_->DisableStatistics();
pi2_->Disconnect(pi1_->node());
pi1_->Disconnect(pi2_->node());
diff --git a/aos/events/logging/log_replayer.cc b/aos/events/logging/log_replayer.cc
index b69df4a..22d8b39 100644
--- a/aos/events/logging/log_replayer.cc
+++ b/aos/events/logging/log_replayer.cc
@@ -16,6 +16,8 @@
#include "aos/events/logging/log_reader.h"
#include "aos/events/logging/log_reader_utils.h"
#include "aos/events/logging/log_replayer_config_generated.h"
+#include "aos/events/logging/log_replayer_stats_generated.h"
+#include "aos/events/logging/log_replayer_stats_schema.h"
#include "aos/events/logging/logfile_sorting.h"
#include "aos/events/logging/logfile_utils.h"
#include "aos/events/logging/replay_timing_generated.h"
@@ -48,6 +50,8 @@
DEFINE_string(merge_with_config, "",
"A valid json string to be merged with config. This is used to "
"add extra applications needed to run only for log_replayer");
+DEFINE_bool(print_stats, true,
+ "if set, prints the LogReplayerStats message as JSON to stdout");
namespace aos::logger {
@@ -75,6 +79,16 @@
aos::configuration::GetMyNode(raw_config), channel_overrides);
}
+ // Add the LogReplayerStats channel.
+ const aos::Configuration *raw_config = &config.message();
+ aos::ChannelT channel_overrides;
+ channel_overrides.max_size = 10000;
+ channel_overrides.frequency = 1;
+ config = aos::configuration::AddChannelToConfiguration(
+ raw_config, "/replay",
+ aos::FlatbufferSpan<reflection::Schema>(aos::LogReplayerStatsSchema()),
+ aos::configuration::GetMyNode(raw_config), channel_overrides);
+
if (!FLAGS_merge_with_config.empty()) {
config = aos::configuration::MergeWithConfig(&config.message(),
FLAGS_merge_with_config);
@@ -139,7 +153,49 @@
event_loop.SkipAosLog();
event_loop.SkipTimingReport();
+ aos::Sender<aos::LogReplayerStats> stats_sender =
+ event_loop.MakeSender<aos::LogReplayerStats>("/replay");
+ auto builder = stats_sender.MakeBuilder();
+ auto node_name = builder.fbb()->CreateString(event_loop.node()->name());
+ flatbuffers::Offset<aos::ReplayConfig> replay_config_offset;
+ if (replay_config.has_value()) {
+ replay_config_offset =
+ aos::CopyFlatBuffer(&replay_config.value().message(), builder.fbb());
+ }
+
+ auto stats_builder = builder.MakeBuilder<aos::LogReplayerStats>();
+ if (replay_config.has_value()) {
+ stats_builder.add_replay_config(replay_config_offset);
+ }
+
reader.Register(&event_loop);
+
+ // Save off the start and end times of replay.
+ reader.OnStart(event_loop.node(), [&event_loop, &stats_builder,
+ &node_name]() {
+ stats_builder.add_node(node_name);
+ stats_builder.add_realtime_start_time(
+ std::chrono::nanoseconds(event_loop.realtime_now().time_since_epoch())
+ .count());
+
+ stats_builder.add_monotonic_start_time(
+ std::chrono::nanoseconds(
+ event_loop.monotonic_now().time_since_epoch())
+ .count());
+ });
+
+ reader.OnEnd(event_loop.node(), [&event_loop, &stats_builder, &builder]() {
+ stats_builder.add_realtime_end_time(
+ std::chrono::nanoseconds(event_loop.realtime_now().time_since_epoch())
+ .count());
+
+ stats_builder.add_monotonic_end_time(
+ std::chrono::nanoseconds(
+ event_loop.monotonic_now().time_since_epoch())
+ .count());
+ builder.CheckOk(builder.Send(stats_builder.Finish()));
+ });
+
reader.OnEnd(event_loop.node(), [&event_loop]() { event_loop.Exit(); });
if (FLAGS_plot_timing) {
@@ -152,6 +208,13 @@
event_loop.Run();
reader.Deregister();
+
+ if (FLAGS_print_stats) {
+ aos::Fetcher<aos::LogReplayerStats> stats_fetcher =
+ event_loop.MakeFetcher<aos::LogReplayerStats>("/replay");
+ CHECK(stats_fetcher.Fetch()) << "Failed to fetch LogReplayerStats!";
+ std::cout << aos::FlatbufferToJson(stats_fetcher.get());
+ }
}
return EXIT_SUCCESS;
diff --git a/aos/events/logging/log_replayer_stats.fbs b/aos/events/logging/log_replayer_stats.fbs
new file mode 100644
index 0000000..8e54fdb
--- /dev/null
+++ b/aos/events/logging/log_replayer_stats.fbs
@@ -0,0 +1,19 @@
+include "log_replayer_config.fbs";
+
+namespace aos;
+
+table LogReplayerStats {
+ // The ReplayConfig passed to log_replayer.
+ replay_config:ReplayConfig (id: 0);
+ // Realtime start and end times of log replay, in nanoseconds.
+ realtime_start_time:int64 (id: 1);
+ realtime_end_time:int64 (id: 2);
+ // Monotonic start and end times of log replay, in nanoseconds.
+ monotonic_start_time:int64 (id: 3);
+ monotonic_end_time:int64 (id: 4);
+ // Name of the node the log originated from.
+ // Note: Currently, only single node replay is supported.
+ node:string (id: 5);
+}
+
+root_type LogReplayerStats;
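For illustration only, the JSON printed by --print_stats would have roughly this shape; every value below is invented, and replay_config is omitted when no ReplayConfig was passed:

    {
      "realtime_start_time": 1694000000000000000,
      "realtime_end_time": 1694000030000000000,
      "monotonic_start_time": 100000000000,
      "monotonic_end_time": 130000000000,
      "node": "pi1"
    }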
diff --git a/aos/events/logging/log_stats.cc b/aos/events/logging/log_stats.cc
index bbed18f..2819a0b 100644
--- a/aos/events/logging/log_stats.cc
+++ b/aos/events/logging/log_stats.cc
@@ -21,6 +21,17 @@
"Only print channels that have a set max message size that is more "
"than double of the max message size.");
+DEFINE_double(
+ run_for, 0.0,
+ "If set to a positive value, only process the log for this many seconds. "
+ "Otherwise, process the log until the end of the log.");
+
+DEFINE_bool(
+ print_repack_size_diffs, false,
+ "Analyze how many bytes could be saved in each message when converted to "
+ "JSON and back. This can be helpful to identify code that is generating "
+ "inefficiently packed flatbuffer messages.");
+
// This class implements a histogram for tracking message period
// percentiles.
class Histogram {
@@ -112,11 +123,14 @@
class ChannelStats {
public:
ChannelStats(const aos::Channel *channel, const aos::Node *destination_node,
- aos::SimulatedEventLoopFactory *factory)
+ aos::SimulatedEventLoopFactory *factory,
+ const reflection::Schema *schema)
: channel_(channel),
config_(factory->configuration()),
factory_(factory),
- destination_node_(destination_node) {
+ schema_(CHECK_NOTNULL(schema)),
+ destination_node_(destination_node),
+ flatbuffer_type_(schema) {
// Multi-node channel
if (channel_->has_source_node() && channel_->has_destination_nodes() &&
channel_->destination_nodes()->size() > 0) {
@@ -132,6 +146,22 @@
// Adds a sample to the statistics.
void Add(const aos::Context &context) {
+ if (context.size > 0 && context.data != nullptr) {
+ // Perform a very naive repacking of the message. Ideally, we'd use
+ // something like RecursiveCopyFlatBuffer that works with schemas.
+ // For now, this is good enough.
+ const std::string json = aos::FlatbufferToJson(
+ schema_, static_cast<const uint8_t *>(context.data));
+ flatbuffers::DetachedBuffer buffer =
+ aos::JsonToFlatbuffer(json, flatbuffer_type_);
+
+ const ssize_t packed_size_reduction = static_cast<ssize_t>(context.size) -
+ static_cast<ssize_t>(buffer.size());
+ max_packed_size_reduction_ =
+ std::max(max_packed_size_reduction_, packed_size_reduction);
+ total_packed_size_reduction_ += packed_size_reduction;
+ }
+
max_message_size_ = std::max(max_message_size_, context.size);
total_message_size_ += context.size;
total_num_messages_++;
@@ -182,6 +212,10 @@
size_t max_message_size() const { return max_message_size_; }
size_t total_num_messages() const { return total_num_messages_; }
+ ssize_t max_packed_size_reduction() const {
+ return max_packed_size_reduction_;
+ }
+
double avg_messages_per_sec() const {
return total_num_messages_ / SecondsActive();
}
@@ -200,6 +234,10 @@
return total_message_size_ / SecondsActive();
}
+ ssize_t avg_packed_size_reduction() const {
+ return total_packed_size_reduction_ / total_num_messages_;
+ }
+
aos::realtime_clock::time_point channel_end_time() const {
return channel_end_time_;
}
@@ -229,6 +267,7 @@
const aos::Channel *channel_;
const aos::Configuration *config_;
aos::SimulatedEventLoopFactory *factory_;
+ const reflection::Schema *const schema_;
aos::realtime_clock::time_point channel_end_time_ =
aos::realtime_clock::min_time;
aos::monotonic_clock::time_point first_message_time_ =
@@ -247,6 +286,12 @@
size_t max_message_size_ = 0;
size_t total_message_size_ = 0;
+ // The size reduction (in bytes) after a naive repacking. A negative number
+ // indicates that the repacking generated a _bigger_ message than the
+ // original message.
+ ssize_t max_packed_size_reduction_ = 0;
+ ssize_t total_packed_size_reduction_ = 0;
+
// Count of messages which had remote timestamps
size_t num_messages_with_remote_ = 0;
// Sum of latencies in all messages sent on this channel if multinode
@@ -256,6 +301,8 @@
const aos::Node *source_node_ = nullptr;
const aos::Node *destination_node_;
+
+ const aos::FlatbufferType flatbuffer_type_;
};
struct LogfileStats {
@@ -335,16 +382,24 @@
}
// Add a record to the stats vector.
- channel_stats.push_back({channel, node, &log_reader_factory});
+ channel_stats.push_back(ChannelStats{
+ channel, node, &log_reader_factory,
+ aos::configuration::GetSchema(reader.configuration(),
+ channel->type()->string_view())});
// Lambda to read messages and parse for information
- stats_event_loop->MakeRawNoArgWatcher(
- channel,
- [&logfile_stats, &channel_stats, it](const aos::Context &context) {
- channel_stats[it].Add(context);
+ auto watcher = [&logfile_stats, &channel_stats,
+ it](const aos::Context &context) {
+ channel_stats[it].Add(context);
- // Update the overall logfile statistics
- logfile_stats.logfile_length += context.size;
- });
+ // Update the overall logfile statistics
+ logfile_stats.logfile_length += context.size;
+ };
+ if (FLAGS_print_repack_size_diffs) {
+ stats_event_loop->MakeRawWatcher(
+ channel, std::bind(watcher, ::std::placeholders::_1));
+ } else {
+ stats_event_loop->MakeRawNoArgWatcher(channel, watcher);
+ }
it++;
// TODO (Stephan): Frequency of messages per second
// - Sliding window
@@ -355,7 +410,13 @@
LOG(FATAL) << "Could not find any channels";
}
- log_reader_factory.Run();
+ if (FLAGS_run_for > 0.0) {
+ log_reader_factory.RunFor(
+ std::chrono::duration_cast<std::chrono::nanoseconds>(
+ std::chrono::duration<double>(FLAGS_run_for)));
+ } else {
+ log_reader_factory.Run();
+ }
std::cout << std::endl;
@@ -388,6 +449,13 @@
<< "bytes, " << channel_stats[i].Percentile() << ", "
<< channel_stats[i].AvgLatency();
std::cout << std::endl;
+ if (FLAGS_print_repack_size_diffs) {
+ std::cout << " " << channel_stats[i].avg_packed_size_reduction()
+ << " bytes packed reduction avg, "
+ << channel_stats[i].max_packed_size_reduction()
+ << " bytes packed reduction max";
+ std::cout << std::endl;
+ }
}
}
}
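
The `--print_repack_size_diffs` analysis above boils down to one round trip per message: unpack to JSON, repack, and compare byte counts. Below is a minimal sketch of that measurement, assuming the `aos::FlatbufferToJson`/`aos::JsonToFlatbuffer` helpers used in the hunk (the `aos/json_to_flatbuffer.h` header path is an assumption here).

```cpp
// Sketch of the repack measurement ChannelStats::Add() performs above.
#include <sys/types.h>  // ssize_t

#include <string>

#include "aos/json_to_flatbuffer.h"  // assumed header for the JSON helpers

// Returns how many bytes a naive JSON round-trip saves. A negative value
// means the repacked message came out *bigger* than the original.
ssize_t PackedSizeReduction(const reflection::Schema *schema,
                            const aos::FlatbufferType &type,
                            const uint8_t *data, size_t size) {
  const std::string json = aos::FlatbufferToJson(schema, data);
  const flatbuffers::DetachedBuffer repacked =
      aos::JsonToFlatbuffer(json, type);
  return static_cast<ssize_t>(size) - static_cast<ssize_t>(repacked.size());
}
```
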
diff --git a/aos/events/logging/log_writer.cc b/aos/events/logging/log_writer.cc
index db6bcc1..bc81a4b 100644
--- a/aos/events/logging/log_writer.cc
+++ b/aos/events/logging/log_writer.cc
@@ -27,10 +27,11 @@
node_(configuration::GetNode(configuration_, event_loop->node())),
node_index_(configuration::GetNodeIndex(configuration_, node_)),
name_(network::GetHostname()),
- timer_handler_(event_loop_->AddTimer(
- [this]() { DoLogData(event_loop_->monotonic_now(), true); })),
+ timer_handler_(event_loop_->AddTimer([this]() {
+ DoLogData(event_loop_->monotonic_now() - logging_delay_, true);
+ })),
server_statistics_fetcher_(
- configuration::MultiNode(event_loop_->configuration())
+ configuration::NodesCount(event_loop_->configuration()) > 1u
? event_loop_->MakeFetcher<message_bridge::ServerStatistics>(
"/aos")
: aos::Fetcher<message_bridge::ServerStatistics>()) {
@@ -151,7 +152,7 @@
const bool log_message = is_logged && is_readable;
bool log_delivery_times = false;
- if (configuration::MultiNode(configuration_)) {
+ if (configuration::NodesCount(configuration_) > 1u) {
const aos::Connection *connection =
configuration::ConnectionToNode(config_channel, node_);
@@ -249,7 +250,7 @@
aos::SizePrefixedFlatbufferDetachedBuffer<LogFileHeader> config_header =
PackConfiguration(configuration_);
config_sha256 = Sha256(config_header.span());
- LOG(INFO) << "Config sha256 of " << config_sha256;
+ VLOG(1) << "Config sha256 of " << config_sha256;
log_namer->WriteConfiguration(&config_header, config_sha256);
}
@@ -268,6 +269,7 @@
log_event_uuid_ = UUID::Random();
log_start_uuid_ = log_start_uuid;
+ log_namer_->SetHeaderTemplate(MakeHeader(config_sha256));
// We want to do as much work as possible before the initial Fetch. Time
// between that and actually starting to log opens up the possibility of
@@ -286,11 +288,11 @@
}
}
- log_namer_->SetHeaderTemplate(MakeHeader(config_sha256));
-
const aos::monotonic_clock::time_point beginning_time =
event_loop_->monotonic_now();
+ log_until_time_ = beginning_time;
+
// Grab data from each channel right before we declare the log file started
// so we can capture the latest message on each channel. This lets us have
// non periodic messages with configuration that now get logged.
@@ -338,77 +340,45 @@
}
std::unique_ptr<LogNamer> Logger::RestartLogging(
- std::unique_ptr<LogNamer> log_namer, std::optional<UUID> log_start_uuid) {
+ std::unique_ptr<LogNamer> log_namer, std::optional<UUID> log_start_uuid,
+ std::optional<monotonic_clock::time_point> end_time) {
CHECK(log_namer_) << ": Unexpected restart while not logging";
VLOG(1) << "Restarting logger for " << FlatbufferToJson(node_);
- // Make sure not to write past now so we don't risk out of order problems. We
- // don't want to get into a situation where we write out up to now + 0.1 sec,
- // and that operation takes ~0.1 seconds, so we end up writing a different
- // amount of the early and late channels. That would then result in the next
- // go around finding more than 0.1 sec of data on the early channels.
- //
- // Make sure we read up until "now" and log it. This sets us up so that we
- // are unlikely to fetch a message far in the future and have a ton of data
- // before the offical start time.
- monotonic_clock::time_point newest_record = monotonic_clock::min_time;
- while (true) {
- aos::monotonic_clock::time_point next_time =
- last_synchronized_time_ + polling_period_;
- const aos::monotonic_clock::time_point monotonic_now =
- event_loop_->monotonic_now();
- if (next_time > monotonic_now) {
- next_time = monotonic_now;
- }
+ // Grab a representative time on both the RT and monotonic clock.
+ // Average two monotonic clock readings, one taken before and one after, to
+ // reduce the error.
+ const aos::monotonic_clock::time_point monotonic_now1 =
+ event_loop_->monotonic_now();
+ const aos::realtime_clock::time_point realtime_now =
+ event_loop_->realtime_now();
+ const aos::monotonic_clock::time_point monotonic_now2 =
+ event_loop_->monotonic_now();
- bool wrote_messages = false;
- std::tie(wrote_messages, newest_record) = LogUntil(next_time);
-
- if (next_time == monotonic_now &&
- (!wrote_messages || newest_record < monotonic_now + polling_period_)) {
- // If we stopped writing messages, then we probably have stopped making
- // progress. If the newest record (unwritten or written) on a channel is
- // very close to the current time, then there won't be much data
- // officially after the end of the last log but before the start of the
- // current one. We need to pick the start of the current log to be after
- // the last message on record so we don't have holes in the log.
- break;
+ // Log until the provided end time.
+ if (end_time) {
+ CHECK_LE(*end_time, monotonic_now1) << ": Can't log into the future.";
+ // DoLogData is a bit fragile; only call it when time actually moves forward.
+ if (*end_time > last_synchronized_time_) {
+ DoLogData(*end_time, false);
}
}
- // We are now synchronized up to last_synchronized_time_. Our start time can
- // safely be "newest_record". But, we need to guarentee that the start time
- // is after the newest message we have a record of, and that we don't skip any
- // messages as we rotate. This means we can't call Fetch anywhere.
+ // We are now synchronized up to last_synchronized_time_. We only have a
+ // record of messages from before last_synchronized_time_, so it is a safe
+ // start time.
std::unique_ptr<LogNamer> old_log_namer = std::move(log_namer_);
log_namer_ = std::move(log_namer);
- // Now grab a representative time on both the RT and monotonic clock.
- // Average a monotonic clock before and after to reduce the error.
- const aos::monotonic_clock::time_point beginning_time =
- event_loop_->monotonic_now();
- const aos::realtime_clock::time_point beginning_time_rt =
- event_loop_->realtime_now();
- const aos::monotonic_clock::time_point beginning_time2 =
- event_loop_->monotonic_now();
-
- if (beginning_time > last_synchronized_time_) {
- LOG(WARNING) << "Took over " << polling_period_.count()
- << "ns to swap log_namer";
- }
-
- // Our start time is now the newest message we have a record of. We will
- // declare the old log "done", and start in on the new one, double-logging
- // anything we have a record of so we have all the messages from before the
- // start.
- const aos::monotonic_clock::time_point monotonic_start_time = newest_record;
+ // Our start time is now the point we just finished logging up to.
+ const aos::monotonic_clock::time_point monotonic_start_time =
+ last_synchronized_time_;
const aos::realtime_clock::time_point realtime_start_time =
- (beginning_time_rt + (monotonic_start_time.time_since_epoch() -
- ((beginning_time.time_since_epoch() +
- beginning_time2.time_since_epoch()) /
- 2)));
+ (realtime_now + (last_synchronized_time_.time_since_epoch() -
+ ((monotonic_now1.time_since_epoch() +
+ monotonic_now2.time_since_epoch()) /
+ 2)));
auto config_sha256 = WriteConfiguration(log_namer_.get());
@@ -459,7 +429,7 @@
VLOG(1) << "Logging node as " << FlatbufferToJson(node_) << " restart_time "
<< last_synchronized_time_ << ", took "
- << chrono::duration<double>(header_time - beginning_time).count()
+ << chrono::duration<double>(header_time - monotonic_now1).count()
<< " to prepare and write header, "
<< chrono::duration<double>(channel_time - header_time).count()
<< " to write initial channel messages, boot uuid "
@@ -497,7 +467,7 @@
void Logger::WriteHeader(aos::monotonic_clock::time_point monotonic_start_time,
aos::realtime_clock::time_point realtime_start_time) {
- if (configuration::MultiNode(configuration_)) {
+ if (configuration::NodesCount(configuration_) > 1u) {
server_statistics_fetcher_.Fetch();
}
@@ -520,7 +490,7 @@
}
void Logger::WriteMissingTimestamps() {
- if (configuration::MultiNode(configuration_)) {
+ if (configuration::NodesCount(configuration_) > 1u) {
server_statistics_fetcher_.Fetch();
} else {
return;
@@ -714,6 +684,8 @@
total_copy_time_ = std::chrono::nanoseconds::zero();
total_copy_count_ = 0;
total_copy_bytes_ = 0;
+ max_log_delay_ = std::chrono::nanoseconds::zero();
+ max_log_delay_channel_ = -1;
}
void Logger::Rotate() {
@@ -734,7 +706,12 @@
ContextDataCopier coppier(f.fetcher->context(), f.channel_index, f.log_type,
event_loop_);
- writer->CopyMessage(&coppier, source_node_boot_uuid, start);
+ aos::monotonic_clock::time_point message_time =
+ static_cast<int>(node_index_) != f.data_node_index
+ ? f.fetcher->context().monotonic_remote_time
+ : f.fetcher->context().monotonic_event_time;
+ writer->CopyDataMessage(&coppier, source_node_boot_uuid, start,
+ message_time);
RecordCreateMessageTime(start, coppier.end_time(), f);
VLOG(2) << "Wrote data as node " << FlatbufferToJson(node_)
@@ -759,7 +736,9 @@
ContextDataCopier coppier(f.fetcher->context(), f.channel_index,
LogType::kLogDeliveryTimeOnly, event_loop_);
- timestamp_writer->CopyMessage(&coppier, event_loop_->boot_uuid(), start);
+ timestamp_writer->CopyTimestampMessage(
+ &coppier, event_loop_->boot_uuid(), start,
+ f.fetcher->context().monotonic_event_time);
RecordCreateMessageTime(start, coppier.end_time(), f);
VLOG(2) << "Wrote timestamps as node " << FlatbufferToJson(node_)
@@ -809,8 +788,10 @@
RemoteMessageCopier coppier(msg, channel_index, monotonic_timestamp_time,
event_loop_);
- contents_writer->CopyMessage(&coppier, UUID::FromVector(msg->boot_uuid()),
- start);
+ contents_writer->CopyRemoteTimestampMessage(
+ &coppier, UUID::FromVector(msg->boot_uuid()), start,
+ monotonic_clock::time_point(
+ chrono::nanoseconds(msg->monotonic_sent_time())));
RecordCreateMessageTime(start, coppier.end_time(), f);
}
@@ -827,6 +808,8 @@
bool wrote_messages = false;
monotonic_clock::time_point newest_record = monotonic_clock::min_time;
+ log_until_time_ = t;
+
// Grab the latest ServerStatistics message. This will always have the
+ // opportunity to be >= the current time, so it will always represent any
// reboots which may have happened.
@@ -842,7 +825,8 @@
while (true) {
if (f.written) {
const auto start = event_loop_->monotonic_now();
- const bool got_new = f.fetcher->FetchNext();
+ const bool got_new =
+ f.fetcher->FetchNextIf(std::ref(fetch_next_if_fn_));
const auto end = event_loop_->monotonic_now();
RecordFetchResult(start, end, got_new, &f);
if (!got_new) {
@@ -857,7 +841,11 @@
}
// TODO(james): Write tests to exercise this logic.
- if (f.fetcher->context().monotonic_event_time >= t) {
+ CHECK_LE(f.fetcher->context().monotonic_event_time, t);
+
+ // At startup, we can end up grabbing a message at the current time.
+ // Ignore it.
+ if (f.fetcher->context().monotonic_event_time == t) {
break;
}
@@ -874,6 +862,9 @@
void Logger::DoLogData(const monotonic_clock::time_point end_time,
bool run_on_logged) {
+ if (end_time < last_synchronized_time_) return;
+
+ DCHECK(is_started());
// We want to guarantee that messages aren't out of order by more than
// max_out_of_order_duration. To do this, we need sync points. Every write
// cycle should be a sync point.
@@ -884,7 +875,7 @@
LogUntil(std::min(last_synchronized_time_ + polling_period_, end_time));
if (run_on_logged) {
- on_logged_period_();
+ on_logged_period_(last_synchronized_time_);
}
// If we missed cycles, we could be pretty far behind. Spin until we are
@@ -923,6 +914,11 @@
max_copy_time_channel_ = fetcher.channel_index;
max_copy_time_size_ = fetcher.fetcher->context().size;
}
+ const auto log_delay = end - fetcher.fetcher->context().monotonic_event_time;
+ if (log_delay > max_log_delay_) {
+ max_log_delay_ = log_delay;
+ max_log_delay_channel_ = fetcher.channel_index;
+ }
}
} // namespace logger
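
The clock handling in `RestartLogging` relies on a small correlation trick: one realtime sample bracketed by two monotonic samples, with the monotonic midpoint treated as the instant the realtime sample was taken. Here is a standalone `std::chrono` sketch of the same idea; the AOS clocks behave equivalently for this purpose.

```cpp
#include <chrono>

// Translates a monotonic time_point from the past into the realtime clock,
// using the bracketed-sample technique from RestartLogging above.
std::chrono::system_clock::time_point RealtimeForMonotonic(
    std::chrono::steady_clock::time_point past_monotonic) {
  const auto mono1 = std::chrono::steady_clock::now();
  const auto rt = std::chrono::system_clock::now();
  const auto mono2 = std::chrono::steady_clock::now();
  // The midpoint of the two monotonic reads approximates when rt was sampled.
  const auto mono_mid = mono1 + (mono2 - mono1) / 2;
  return rt + std::chrono::duration_cast<std::chrono::system_clock::duration>(
                  past_monotonic - mono_mid);
}
```
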
diff --git a/aos/events/logging/log_writer.h b/aos/events/logging/log_writer.h
index 837a02a..ec02b7d 100644
--- a/aos/events/logging/log_writer.h
+++ b/aos/events/logging/log_writer.h
@@ -54,11 +54,14 @@
logger_version_ = version;
}
- // Sets the callback to run after each period of data is logged. Defaults to
- // doing nothing.
+ // Sets the callback to run *after* each period of data is logged. Defaults
+ // to doing nothing. The argument to the callback is the time up to which we
+ // just wrote. This is not called when rotating or finishing logs.
//
// This callback may safely do things like call Rotate().
- void set_on_logged_period(std::function<void()> on_logged_period) {
+ void set_on_logged_period(
+ std::function<void(aos::monotonic_clock::time_point t)>
+ on_logged_period) {
on_logged_period_ = std::move(on_logged_period);
}
@@ -66,6 +69,17 @@
separate_config_ = separate_config;
}
+ // Sets the amount to run the logger behind the current time. This lets us
+ // make decisions about rotating or stopping logging before something happens.
+ // Using this to start logging in the past isn't yet supported. This can be
+ // changed at runtime, but will only influence future writes, not what is
+ // already written.
+ void set_logging_delay(std::chrono::nanoseconds logging_delay) {
+ logging_delay_ = logging_delay;
+ }
+ // Returns the current logging delay.
+ std::chrono::nanoseconds logging_delay() const { return logging_delay_; }
+
// Sets the period between polling the data. Defaults to 100ms.
//
// Changing this while a set of files is being written may result in
@@ -125,6 +139,13 @@
// The total number of bytes copied.
int64_t total_copy_bytes() const { return total_copy_bytes_; }
+ // The maximum time between when a message was sent and when it was logged.
+ // This is 0 if no message has been logged.
+ std::chrono::nanoseconds max_log_delay() const { return max_log_delay_; }
+ // The channel with the longest logging delay, or -1 if no messages have
+ // been logged.
+ int max_log_delay_channel() const { return max_log_delay_channel_; }
+
void ResetStatisics();
// Rotates the log file(s), triggering new part files to be written for each
@@ -139,11 +160,14 @@
void StartLogging(std::unique_ptr<LogNamer> log_namer,
std::optional<UUID> log_start_uuid = std::nullopt);
- // Restart logging using a new naming scheme. Intended for log rotation.
- // Returns a unique_ptr to the prior log_namer instance.
+ // Restarts logging using a new naming scheme. Intended for log rotation.
+ // Returns a unique_ptr to the prior log_namer instance. If provided,
+ // end_time is the time to log until. It must be in the past. Times before
+ // the last_synchronized_time are ignored.
std::unique_ptr<LogNamer> RestartLogging(
std::unique_ptr<LogNamer> log_namer,
- std::optional<UUID> log_start_uuid = std::nullopt);
+ std::optional<UUID> log_start_uuid = std::nullopt,
+ std::optional<monotonic_clock::time_point> end_time = std::nullopt);
// Stops logging. Ensures any messages through end_time make it into the log.
//
@@ -167,6 +191,10 @@
});
}
+ // Returns the current log event UUID. This is randomly assigned when the log
+ // starts or restarts.
+ const UUID &log_event_uuid() const { return log_event_uuid_; }
+
private:
// Structure to track both a fetcher, and if the data fetched has been
// written. We may want to delay writing data to disk so that we don't let
@@ -295,7 +323,10 @@
std::string logger_sha1_;
std::string logger_version_;
- std::function<void()> on_logged_period_ = []() {};
+ // The callback to get called on each logged period. See
+ // set_on_logged_period() above for more details.
+ std::function<void(aos::monotonic_clock::time_point t)> on_logged_period_ =
+ [](aos::monotonic_clock::time_point) {};
std::chrono::nanoseconds max_message_fetch_time_ =
std::chrono::nanoseconds::zero();
@@ -306,6 +337,9 @@
int total_message_fetch_count_ = 0;
int64_t total_message_fetch_bytes_ = 0;
+ std::chrono::nanoseconds max_log_delay_ = std::chrono::nanoseconds::zero();
+ int max_log_delay_channel_ = -1;
+
std::chrono::nanoseconds total_nop_fetch_time_ =
std::chrono::nanoseconds::zero();
int total_nop_fetch_count_ = 0;
@@ -331,6 +365,16 @@
// Fetcher for all the statistics from all the nodes.
aos::Fetcher<message_bridge::ServerStatistics> server_statistics_fetcher_;
+
+ monotonic_clock::time_point log_until_time_ = monotonic_clock::min_time;
+
+ std::function<bool(const Context &)> fetch_next_if_fn_ =
+ [this](const Context &context) {
+ return context.monotonic_event_time < log_until_time_;
+ };
+
+ // Amount of time to run the logger behind now.
+ std::chrono::nanoseconds logging_delay_ = std::chrono::nanoseconds(0);
};
} // namespace logger
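
One hypothetical way the new knobs could be wired together; `ConfigureLogger` and `next_rotation_time` are placeholders invented for illustration, not part of this change.

```cpp
#include <chrono>

// Runs the logger slightly behind realtime and rotates once a boundary of
// interest has been fully written. Uses only methods declared in the header
// above.
void ConfigureLogger(aos::logger::Logger &logger,
                     aos::monotonic_clock::time_point next_rotation_time) {
  // Write 100ms behind now so rotation decisions can be made before the
  // corresponding messages are committed to disk.
  logger.set_logging_delay(std::chrono::milliseconds(100));
  logger.set_on_logged_period(
      [&logger, next_rotation_time](aos::monotonic_clock::time_point t) {
        // t is how far the log has been written; rotate once the boundary
        // is on disk.
        if (t >= next_rotation_time) {
          logger.Rotate();
        }
      });
}
```

Note this toy callback would rotate on every period once past the boundary; a real caller would advance or clear `next_rotation_time` after the first rotation.
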
diff --git a/aos/events/logging/logfile_sorting.cc b/aos/events/logging/logfile_sorting.cc
index 59d18d0..b8530fb 100644
--- a/aos/events/logging/logfile_sorting.cc
+++ b/aos/events/logging/logfile_sorting.cc
@@ -11,6 +11,7 @@
#include "absl/container/btree_map.h"
#include "absl/strings/str_join.h"
+#include "aos/containers/error_list.h"
#include "aos/events/logging/file_operations.h"
#include "aos/events/logging/logfile_utils.h"
#include "aos/flatbuffer_merge.h"
@@ -48,7 +49,7 @@
}
bool ConfigOnly(const LogFileHeader *header) {
- CHECK_EQ(LogFileHeader::MiniReflectTypeTable()->num_elems, 34u);
+ CHECK_EQ(LogFileHeader::MiniReflectTypeTable()->num_elems, 35u);
if (header->has_monotonic_start_time()) return false;
if (header->has_realtime_start_time()) return false;
if (header->has_max_out_of_order_duration()) return false;
@@ -80,6 +81,7 @@
return false;
if (header->has_logger_sha1()) return false;
if (header->has_logger_version()) return false;
+ if (header->has_data_stored()) return false;
return header->has_configuration();
}
@@ -138,7 +140,7 @@
// Start by grouping all parts by UUID, and extracting the part index.
// Datastructure to hold all the info extracted from a set of parts which go
-// together so we can sort them afterwords.
+// together so we can sort them afterwards.
struct UnsortedLogParts {
// Start times.
aos::monotonic_clock::time_point monotonic_start_time;
@@ -165,6 +167,18 @@
std::vector<std::pair<std::string, int>> parts;
std::string config_sha256;
+
+ // Largest max out of order duration among parts with monotonic start
+ // time greater than min_time.
+ std::optional<std::chrono::nanoseconds>
+ max_out_of_order_duration_valid_start_time = std::nullopt;
+ // Largest max out of order duration among parts with monotonic start time
+ // equal to min_time.
+ std::optional<std::chrono::nanoseconds>
+ max_out_of_order_duration_min_start_time = std::nullopt;
+
+ // All the data types we've seen so far, or all of them if we don't know.
+ aos::ErrorList<StoredDataType> data_stored;
};
// Struct to hold both the node, and the parts associated with it.
@@ -370,6 +384,9 @@
const realtime_clock::time_point logger_realtime_start_time(
chrono::nanoseconds(
log_header->message().logger_realtime_start_time()));
+ const std::chrono::nanoseconds max_out_of_order_duration =
+ std::chrono::nanoseconds(
+ log_header->message().max_out_of_order_duration());
const std::string_view node =
log_header->message().has_node()
@@ -505,7 +522,12 @@
old_parts.back().parts.config_sha256 = configuration_sha256;
old_parts.back().unsorted_parts.emplace_back(
std::make_pair(first_message_time, part.name));
+ old_parts.back().parts.max_out_of_order_duration =
+ max_out_of_order_duration;
old_parts.back().name = name;
+ old_parts.back().parts.data_stored = {
+ StoredDataType::DATA, StoredDataType::TIMESTAMPS,
+ StoredDataType::REMOTE_TIMESTAMPS};
} else {
result->unsorted_parts.emplace_back(
std::make_pair(first_message_time, part.name));
@@ -583,6 +605,33 @@
} else {
CHECK_EQ(it->second.config_sha256, configuration_sha256);
}
+ if (log_header->message().has_data_stored()) {
+ for (const StoredDataType type : *log_header->message().data_stored()) {
+ it->second.data_stored.Set(type);
+ }
+ } else {
+ it->second.data_stored.Set(StoredDataType::DATA);
+ it->second.data_stored.Set(StoredDataType::TIMESTAMPS);
+ it->second.data_stored.Set(StoredDataType::REMOTE_TIMESTAMPS);
+ }
+ // Keep track of the largest max out of order durations based on
+ // whether monotonic start time is available. We'll decide which value to
+ // use later when creating log files.
+ if (monotonic_start_time == monotonic_clock::min_time) {
+ it->second.max_out_of_order_duration_min_start_time =
+ it->second.max_out_of_order_duration_min_start_time.has_value()
+ ? std::max(
+ it->second.max_out_of_order_duration_min_start_time.value(),
+ max_out_of_order_duration)
+ : max_out_of_order_duration;
+ } else {
+ it->second.max_out_of_order_duration_valid_start_time =
+ it->second.max_out_of_order_duration_valid_start_time.has_value()
+ ? std::max(it->second.max_out_of_order_duration_valid_start_time
+ .value(),
+ max_out_of_order_duration)
+ : max_out_of_order_duration;
+ }
// We've got a newer log with boot_uuids, and oldest timestamps. Fill in
// this->boot_times with the info we have found.
@@ -1885,7 +1934,18 @@
new_parts.parts_uuid = parts.first.first;
new_parts.node = std::move(parts.second.node);
new_parts.boots = boot_counts;
-
+ new_parts.data_stored.reserve(parts.second.data_stored.size());
+ for (const StoredDataType data : parts.second.data_stored) {
+ new_parts.data_stored.push_back(data);
+ }
+ // If no part files have a monotonic start time greater than min_time, use
+ // the max of whatever we have; otherwise choose the max out of order
+ // duration of parts with a monotonic start time greater than min_time.
+ new_parts.max_out_of_order_duration =
+ parts.second.max_out_of_order_duration_valid_start_time.has_value()
+ ? parts.second.max_out_of_order_duration_valid_start_time.value()
+ : parts.second.max_out_of_order_duration_min_start_time.value();
{
auto boot_count_it =
boot_counts->boot_count_map.find(new_parts.source_boot_uuid);
@@ -2096,9 +2156,19 @@
stream << ",\n \"logger_realtime_start_time\": \""
<< parts.logger_realtime_start_time << "\"";
}
+ stream << ",\n \"max_out_of_order_duration\": \""
+ << parts.max_out_of_order_duration.count() << "\"";
+
stream << ",\n \"monotonic_start_time\": \"" << parts.monotonic_start_time
<< "\",\n \"realtime_start_time\": \"" << parts.realtime_start_time
- << "\",\n \"parts\": [";
+ << "\",\n \"data_stored\": [";
+ for (size_t i = 0; i < parts.data_stored.size(); ++i) {
+ if (i != 0u) {
+ stream << ", ";
+ }
+ stream << "\"" << EnumNameStoredDataType(parts.data_stored[i]) << "\"";
+ }
+ stream << "],\n \"parts\": [";
for (size_t i = 0; i < parts.parts.size(); ++i) {
if (i != 0u) {
@@ -2113,7 +2183,7 @@
// Validates that collection of log files or log parts shares the same configs.
template <typename TCollection>
-bool CheckMatchingConfigs(const TCollection &items) {
+bool HasMatchingConfigsTemplate(const TCollection &items) {
const Configuration *config = nullptr;
for (const auto &item : items) {
VLOG(1) << item;
@@ -2134,12 +2204,23 @@
return true;
}
+// Validates that collection of log files or log parts shares the same configs.
+bool HasMatchingConfigs(const std::vector<LogFile> &items) {
+ return HasMatchingConfigsTemplate(items);
+}
+
// Provides unified access to config field stored in LogFile. It is used in
-// CheckMatchingConfigs.
+// HasMatchingConfigs.
inline const Configuration *GetConfig(const LogFile &log_file) {
return log_file.config.get();
}
+// Provides unified access to config field stored in LogPartsAccess. It is used
+// in HasMatchingConfigs.
+inline const Configuration *GetConfig(const LogPartsAccess &log_parts_access) {
+ return log_parts_access.config().get();
+}
+
// Output of LogPartsAccess for debug purposes.
std::ostream &operator<<(std::ostream &stream,
const LogPartsAccess &log_parts_access) {
@@ -2153,11 +2234,18 @@
: node_name_(node_name),
boot_index_(boot_index),
log_parts_(std::move(log_parts)) {
- CHECK_GT(log_parts_.size(), 0u) << ": Nothing was selected for node "
- << node_name_ << " boot " << boot_index_;
- CHECK(CheckMatchingConfigs(log_parts_));
+ if (log_parts_.empty()) {
+ VLOG(1) << "Nothing was selected for node " << node_name_ << " boot "
+ << boot_index_;
+ return;
+ }
+ CHECK(HasMatchingConfigsTemplate(log_parts_));
config_ = log_parts_.front().config();
+ for (LogPartsAccess &part : log_parts_) {
+ CHECK_EQ(config_.get(), part.config().get());
+ }
+
// Enforce that we are sorting things only from a single node from a single
// boot.
const std::string_view part0_source_boot_uuid =
@@ -2174,14 +2262,15 @@
std::optional<const LogSource *> log_source, std::vector<LogFile> log_files)
: log_source_(log_source), log_files_(std::move(log_files)) {
CHECK_GT(log_files_.size(), 0u);
- CHECK(CheckMatchingConfigs(log_files_));
- config_ = log_files_.front().config.get();
+ CHECK(HasMatchingConfigsTemplate(log_files_));
+ config_ = log_files_.front().config;
boots_ = log_files_.front().boots;
std::unordered_set<std::string> logger_nodes;
// Scan and collect all related nodes and number of reboots per node.
for (const LogFile &log_file : log_files_) {
+ CHECK_EQ(config_.get(), log_file.config.get());
for (const LogParts &part : log_file.parts) {
auto node_item = nodes_boots_.find(part.node);
if (node_item != nodes_boots_.end()) {
@@ -2206,12 +2295,28 @@
return node_item->second;
}
-SelectedLogParts LogFilesContainer::SelectParts(std::string_view node_name,
- size_t boot_index) const {
+SelectedLogParts LogFilesContainer::SelectParts(
+ std::string_view node_name, size_t boot_index,
+ const std::vector<StoredDataType> &types) const {
std::vector<LogPartsAccess> result;
for (const LogFile &log_file : log_files_) {
for (const LogParts &part : log_file.parts) {
if (part.node == node_name && part.boot_count == boot_index) {
+ bool found = false;
+ for (const StoredDataType type : types) {
+ for (const StoredDataType part_type : part.data_stored) {
+ if (type == part_type) {
+ found = true;
+ break;
+ }
+ }
+ if (found) {
+ break;
+ }
+ }
+ if (!found) {
+ continue;
+ }
result.emplace_back(log_source_, part);
}
}
@@ -2219,5 +2324,54 @@
return SelectedLogParts(node_name, boot_index, result);
}
+bool LogFilesContainer::TimestampsStoredSeparately() const {
+ for (const LogFile &log_file : log_files_) {
+ for (const LogParts &part : log_file.parts) {
+ bool has_data = false;
+ bool has_timestamps = false;
+ bool has_remote_timestamps = false;
+ for (StoredDataType type : part.data_stored) {
+ switch (type) {
+ case StoredDataType::DATA:
+ has_data = true;
+ break;
+ case StoredDataType::TIMESTAMPS:
+ has_timestamps = true;
+ break;
+ case StoredDataType::REMOTE_TIMESTAMPS:
+ has_remote_timestamps = true;
+ break;
+ }
+ }
+
+ if (has_data && (has_timestamps || has_remote_timestamps)) {
+ return false;
+ }
+ }
+ }
+ return true;
+}
+
+bool LogFilesContainer::HasTimestamps(std::string_view node_name) const {
+ for (const LogFile &log_file : log_files_) {
+ for (const LogParts &part : log_file.parts) {
+ if (part.node != node_name) {
+ continue;
+ }
+
+ for (StoredDataType type : part.data_stored) {
+ switch (type) {
+ case StoredDataType::DATA:
+ break;
+ case StoredDataType::TIMESTAMPS:
+ case StoredDataType::REMOTE_TIMESTAMPS:
+ return true;
+ }
+ }
+ }
+ }
+ return false;
+}
+
} // namespace logger
} // namespace aos
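
The new `SelectParts` filter above keeps a part only when its `data_stored` list intersects the requested types. The same test restated with `<algorithm>`, to make the intersection explicit; the unqualified `StoredDataType` namespacing is assumed from the surrounding code.

```cpp
#include <algorithm>
#include <vector>

// True iff the part stores at least one of the requested data types.
bool StoresAnyOf(const std::vector<StoredDataType> &part_types,
                 const std::vector<StoredDataType> &wanted) {
  return std::any_of(wanted.begin(), wanted.end(), [&](StoredDataType type) {
    return std::find(part_types.begin(), part_types.end(), type) !=
           part_types.end();
  });
}
```
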
diff --git a/aos/events/logging/logfile_sorting.h b/aos/events/logging/logfile_sorting.h
index b7f8297..e1aa620 100644
--- a/aos/events/logging/logfile_sorting.h
+++ b/aos/events/logging/logfile_sorting.h
@@ -84,6 +84,13 @@
// Information about all the boots that the system has observed.
std::shared_ptr<const Boots> boots;
+
+ // Highest max out of order duration among all parts.
+ std::chrono::nanoseconds max_out_of_order_duration;
+
+ // The types of data we know are in this log file. If we have no info, this
+ // will be a vector of all the potential types.
+ std::vector<StoredDataType> data_stored;
};
// Datastructure to hold parts from the same run of the logger which have no
@@ -143,6 +150,9 @@
// Sort parts of a single log.
std::vector<LogFile> SortParts(const LogSource &log_source);
+// Validates that collection of log files or log parts shares the same configs.
+bool HasMatchingConfigs(const std::vector<LogFile> &items);
+
// Recursively searches the file/folder for .bfbs and .bfbs.xz files and adds
// them to the vector.
void FindLogs(std::vector<internal::FileOperations::File> *files,
@@ -173,10 +183,16 @@
size_t boot_count() const { return log_parts_.boot_count; }
- const Configuration *config() const { return log_parts_.config.get(); }
+ std::shared_ptr<const Configuration> config() const {
+ return log_parts_.config;
+ }
std::optional<const LogSource *> log_source() const { return log_source_; }
+ std::chrono::nanoseconds max_out_of_order_duration() const {
+ return log_parts_.max_out_of_order_duration;
+ }
+
std::string GetPartAt(size_t index) const {
CHECK_LT(index, log_parts_.parts.size());
return log_parts_.parts[index];
@@ -192,12 +208,6 @@
LogParts log_parts_;
};
-// Provides unified access to config field stored in LogPartsAccess. It is used
-// in CheckMatchingConfigs.
-inline const Configuration *GetConfig(const LogPartsAccess &log_parts_access) {
- return log_parts_access.config();
-}
-
// Output of LogPartsAccess for debug purposes.
std::ostream &operator<<(std::ostream &stream,
const LogPartsAccess &log_parts_access);
@@ -212,20 +222,22 @@
auto begin() const { return log_parts_.begin(); }
auto end() const { return log_parts_.end(); }
+ bool empty() const { return log_parts_.empty(); }
+
// Config that is shared across all log parts.
- const Configuration *config() const { return config_; }
+ std::shared_ptr<const Configuration> config() const { return config_; }
const std::string &node_name() const { return node_name_; }
- // Number of boots found in the log parts.
- size_t boot_count() const { return boot_index_; }
+ // Returns the boot index found in the log parts.
+ size_t boot_index() const { return boot_index_; }
private:
std::string node_name_;
- size_t boot_index_;
+ size_t boot_index_ = 0u;
std::vector<LogPartsAccess> log_parts_;
- const Configuration *config_;
+ std::shared_ptr<const Configuration> config_ = nullptr;
};
// Container that keeps a sorted list of log files and provides functions that
@@ -241,7 +253,7 @@
explicit LogFilesContainer(const LogSource *log_source)
: LogFilesContainer(log_source, SortParts(*log_source)) {}
- // Returns true when at least on of the log files associated with node.
+ // Returns true when at least one of the log files is associated with the node.
bool ContainsPartsForNode(std::string_view node_name) const {
// TODO (Alexei): Implement
// https://en.cppreference.com/w/cpp/container/unordered_map/find with C++20
@@ -251,15 +263,16 @@
// Returns numbers of reboots found in log files associated with the node.
size_t BootsForNode(std::string_view node_name) const;
- // Get only log parts that associated with node and boot number.
- SelectedLogParts SelectParts(std::string_view node_name,
- size_t boot_index) const;
+ // Get only log parts that are associated with the node and boot number, and
+ // which contain data of any of the types provided.
+ SelectedLogParts SelectParts(std::string_view node_name, size_t boot_index,
+ const std::vector<StoredDataType> &types) const;
// It provides access to boots logged by all log files in the container.
const std::shared_ptr<const Boots> &boots() const { return boots_; }
// Access the configuration shared with all log files in the container.
- const Configuration *config() const { return config_; }
+ std::shared_ptr<const Configuration> config() const { return config_; }
// List of logger nodes for given set of log files.
const auto &logger_nodes() const { return logger_nodes_; }
@@ -268,6 +281,15 @@
// Review its usage.
std::string_view name() const { return log_files_[0].name; }
+ // Returns true if the timestamps (remote and local) are stored only in files
+ // distinct from the data.
+ bool TimestampsStoredSeparately() const;
+
+ // Returns true if we have timestamps in any of the files.
+ bool HasTimestamps(std::string_view node_name) const;
+
+ const std::vector<LogFile> &log_files() const { return log_files_; }
+
private:
LogFilesContainer(std::optional<const LogSource *> log_source,
std::vector<LogFile> log_files);
@@ -275,7 +297,7 @@
std::optional<const LogSource *> log_source_;
std::vector<LogFile> log_files_;
- const Configuration *config_;
+ std::shared_ptr<const Configuration> config_;
std::shared_ptr<const Boots> boots_;
// Keeps information about nodes and number of reboots per node.
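
The `max_out_of_order_duration` plumbing added in logfile_sorting.cc tracks two running maxima, one for parts with a real monotonic start time and one for parts stuck at `min_time`, then prefers the former when filling in this field. A self-contained sketch of that bookkeeping:

```cpp
#include <algorithm>
#include <chrono>
#include <optional>

struct MaxOutOfOrderTracker {
  std::optional<std::chrono::nanoseconds> valid_start_max;
  std::optional<std::chrono::nanoseconds> min_start_max;

  // Accumulate one part's max out of order duration into the right bucket.
  void Add(bool has_valid_start_time, std::chrono::nanoseconds duration) {
    std::optional<std::chrono::nanoseconds> &slot =
        has_valid_start_time ? valid_start_max : min_start_max;
    slot = slot.has_value() ? std::max(*slot, duration) : duration;
  }

  // Fall back to the min-start bucket only when no part had a real start
  // time, matching the selection when the LogFile entries are created.
  std::chrono::nanoseconds Resolve() const {
    return valid_start_max.has_value() ? *valid_start_max
                                       : min_start_max.value();
  }
};
```
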
diff --git a/aos/events/logging/logfile_utils.cc b/aos/events/logging/logfile_utils.cc
index ec6bff3..02cff6d 100644
--- a/aos/events/logging/logfile_utils.cc
+++ b/aos/events/logging/logfile_utils.cc
@@ -1598,7 +1598,9 @@
PartsMessageReader::PartsMessageReader(LogPartsAccess log_parts_access)
: log_parts_access_(std::move(log_parts_access)),
- message_reader_(MakeSpanReader(log_parts_access_, 0)) {
+ message_reader_(MakeSpanReader(log_parts_access_, 0)),
+ max_out_of_order_duration_(
+ log_parts_access_.max_out_of_order_duration()) {
if (log_parts_access_.size() >= 2) {
next_message_reader_.emplace(MakeSpanReader(log_parts_access_, 1));
}
@@ -1606,8 +1608,9 @@
}
void PartsMessageReader::ComputeBootCounts() {
- boot_counts_.assign(configuration::NodesCount(log_parts_access_.config()),
- std::nullopt);
+ boot_counts_.assign(
+ configuration::NodesCount(log_parts_access_.config().get()),
+ std::nullopt);
const auto boots = log_parts_access_.parts().boots;
@@ -1634,7 +1637,7 @@
// Older multi-node logs which are guaranteed to have UUIDs logged, or
// single node log files with boot UUIDs in the header. We only know how to
// order certain boots in certain circumstances.
- if (configuration::MultiNode(log_parts_access_.config()) || boots) {
+ if (configuration::MultiNode(log_parts_access_.config().get()) || boots) {
for (size_t node_index = 0; node_index < boot_counts_.size();
++node_index) {
if (boots->boots[node_index].size() == 1u) {
@@ -1702,11 +1705,9 @@
}
bool Message::operator<(const Message &m2) const {
- CHECK_EQ(this->timestamp.boot, m2.timestamp.boot);
-
- if (this->timestamp.time < m2.timestamp.time) {
+ if (this->timestamp < m2.timestamp) {
return true;
- } else if (this->timestamp.time > m2.timestamp.time) {
+ } else if (this->timestamp > m2.timestamp) {
return false;
}
@@ -1721,66 +1722,69 @@
bool Message::operator>=(const Message &m2) const { return !(*this < m2); }
bool Message::operator==(const Message &m2) const {
- CHECK_EQ(this->timestamp.boot, m2.timestamp.boot);
-
- return timestamp.time == m2.timestamp.time &&
- channel_index == m2.channel_index && queue_index == m2.queue_index;
+ return timestamp == m2.timestamp && channel_index == m2.channel_index &&
+ queue_index == m2.queue_index;
}
-std::ostream &operator<<(std::ostream &os, const UnpackedMessageHeader &m) {
- os << "{.channel_index=" << m.channel_index
- << ", .monotonic_sent_time=" << m.monotonic_sent_time
- << ", .realtime_sent_time=" << m.realtime_sent_time
- << ", .queue_index=" << m.queue_index;
- if (m.monotonic_remote_time) {
- os << ", .monotonic_remote_time=" << *m.monotonic_remote_time;
+bool Message::operator<=(const Message &m2) const {
+ return *this == m2 || *this < m2;
+}
+
+std::ostream &operator<<(std::ostream &os, const UnpackedMessageHeader &msg) {
+ os << "{.channel_index=" << msg.channel_index
+ << ", .monotonic_sent_time=" << msg.monotonic_sent_time
+ << ", .realtime_sent_time=" << msg.realtime_sent_time
+ << ", .queue_index=" << msg.queue_index;
+ if (msg.monotonic_remote_time) {
+ os << ", .monotonic_remote_time=" << *msg.monotonic_remote_time;
}
os << ", .realtime_remote_time=";
- PrintOptionalOrNull(&os, m.realtime_remote_time);
+ PrintOptionalOrNull(&os, msg.realtime_remote_time);
os << ", .remote_queue_index=";
- PrintOptionalOrNull(&os, m.remote_queue_index);
- if (m.has_monotonic_timestamp_time) {
- os << ", .monotonic_timestamp_time=" << m.monotonic_timestamp_time;
+ PrintOptionalOrNull(&os, msg.remote_queue_index);
+ if (msg.has_monotonic_timestamp_time) {
+ os << ", .monotonic_timestamp_time=" << msg.monotonic_timestamp_time;
}
os << "}";
return os;
}
-std::ostream &operator<<(std::ostream &os, const Message &m) {
- os << "{.channel_index=" << m.channel_index
- << ", .queue_index=" << m.queue_index << ", .timestamp=" << m.timestamp;
- if (m.data != nullptr) {
- if (m.data->remote_queue_index.has_value()) {
- os << ", .remote_queue_index=" << *m.data->remote_queue_index;
+std::ostream &operator<<(std::ostream &os, const Message &msg) {
+ os << "{.channel_index=" << msg.channel_index
+ << ", .queue_index=" << msg.queue_index
+ << ", .timestamp=" << msg.timestamp;
+ if (msg.data != nullptr) {
+ if (msg.data->remote_queue_index.has_value()) {
+ os << ", .remote_queue_index=" << *msg.data->remote_queue_index;
}
- if (m.data->monotonic_remote_time.has_value()) {
- os << ", .monotonic_remote_time=" << *m.data->monotonic_remote_time;
+ if (msg.data->monotonic_remote_time.has_value()) {
+ os << ", .monotonic_remote_time=" << *msg.data->monotonic_remote_time;
}
- os << ", .data=" << m.data;
+ os << ", .data=" << msg.data;
}
os << "}";
return os;
}
-std::ostream &operator<<(std::ostream &os, const TimestampedMessage &m) {
- os << "{.channel_index=" << m.channel_index
- << ", .queue_index=" << m.queue_index
- << ", .monotonic_event_time=" << m.monotonic_event_time
- << ", .realtime_event_time=" << m.realtime_event_time;
- if (m.remote_queue_index != BootQueueIndex::Invalid()) {
- os << ", .remote_queue_index=" << m.remote_queue_index;
+std::ostream &operator<<(std::ostream &os, const TimestampedMessage &msg) {
+ os << "{.channel_index=" << msg.channel_index
+ << ", .queue_index=" << msg.queue_index
+ << ", .monotonic_event_time=" << msg.monotonic_event_time
+ << ", .realtime_event_time=" << msg.realtime_event_time;
+ if (msg.remote_queue_index != BootQueueIndex::Invalid()) {
+ os << ", .remote_queue_index=" << msg.remote_queue_index;
}
- if (m.monotonic_remote_time != BootTimestamp::min_time()) {
- os << ", .monotonic_remote_time=" << m.monotonic_remote_time;
+ if (msg.monotonic_remote_time != BootTimestamp::min_time()) {
+ os << ", .monotonic_remote_time=" << msg.monotonic_remote_time;
}
- if (m.realtime_remote_time != realtime_clock::min_time) {
- os << ", .realtime_remote_time=" << m.realtime_remote_time;
+ if (msg.realtime_remote_time != realtime_clock::min_time) {
+ os << ", .realtime_remote_time=" << msg.realtime_remote_time;
}
- if (m.monotonic_timestamp_time != BootTimestamp::min_time()) {
- os << ", .monotonic_timestamp_time=" << m.monotonic_timestamp_time;
+ if (msg.monotonic_timestamp_time != BootTimestamp::min_time()) {
+ os << ", .monotonic_timestamp_time=" << msg.monotonic_timestamp_time;
}
- if (m.data != nullptr) {
- os << ", .data=" << *m.data;
+ if (msg.data != nullptr) {
+ os << ", .data=" << *msg.data;
} else {
os << ", .data=nullptr";
}
@@ -1805,40 +1809,41 @@
break;
}
- std::shared_ptr<UnpackedMessageHeader> m =
+ std::shared_ptr<UnpackedMessageHeader> msg =
parts_message_reader_.ReadMessage();
// No data left, sorted forever, work through what is left.
- if (!m) {
+ if (!msg) {
sorted_until_ = monotonic_clock::max_time;
break;
}
size_t monotonic_timestamp_boot = 0;
- if (m->has_monotonic_timestamp_time) {
+ if (msg->has_monotonic_timestamp_time) {
monotonic_timestamp_boot = parts().logger_boot_count;
}
size_t monotonic_remote_boot = 0xffffff;
- if (m->monotonic_remote_time.has_value()) {
- const Node *node =
- parts().config->nodes()->Get(source_node_index_[m->channel_index]);
+ if (msg->monotonic_remote_time.has_value()) {
+ const Node *node = parts().config->nodes()->Get(
+ source_node_index_[msg->channel_index]);
std::optional<size_t> boot = parts_message_reader_.boot_count(
- source_node_index_[m->channel_index]);
+ source_node_index_[msg->channel_index]);
CHECK(boot) << ": Failed to find boot for node '" << MaybeNodeName(node)
- << "', with index " << source_node_index_[m->channel_index];
+ << "', with index "
+ << source_node_index_[msg->channel_index];
monotonic_remote_boot = *boot;
}
messages_.insert(
- Message{.channel_index = m->channel_index,
+ Message{.channel_index = msg->channel_index,
.queue_index = BootQueueIndex{.boot = parts().boot_count,
- .index = m->queue_index},
+ .index = msg->queue_index},
.timestamp = BootTimestamp{.boot = parts().boot_count,
- .time = m->monotonic_sent_time},
+ .time = msg->monotonic_sent_time},
.monotonic_remote_boot = monotonic_remote_boot,
.monotonic_timestamp_boot = monotonic_timestamp_boot,
- .data = std::move(m)});
+ .data = std::move(msg)});
// Now, update sorted_until_ to match the new message.
if (parts_message_reader_.newest_timestamp() >
@@ -1862,6 +1867,8 @@
CHECK_GE(messages_.begin()->timestamp.time, last_message_time_)
<< DebugString() << " reading " << parts_message_reader_.filename();
last_message_time_ = messages_.begin()->timestamp.time;
+ VLOG(1) << this << " Front, sorted until " << sorted_until_ << " for "
+ << (*messages_.begin()) << " on " << parts_message_reader_.filename();
return &(*messages_.begin());
}
@@ -1872,9 +1879,9 @@
ss << "messages: [\n";
int count = 0;
bool no_dots = true;
- for (const Message &m : messages_) {
+ for (const Message &msg : messages_) {
if (count < 15 || count > static_cast<int>(messages_.size()) - 15) {
- ss << m << "\n";
+ ss << msg << "\n";
} else if (no_dots) {
ss << "...\n";
no_dots = false;
@@ -1885,31 +1892,25 @@
return ss.str();
}
-PartsMerger::PartsMerger(std::string_view node_name, size_t boot_count,
- const LogFilesContainer &log_files) {
- const auto parts = log_files.SelectParts(node_name, boot_count);
- node_ = configuration::GetNodeIndex(parts.config(), node_name);
-
- for (LogPartsAccess part : parts) {
- message_sorters_.emplace_back(std::move(part));
- }
-
- monotonic_start_time_ = monotonic_clock::max_time;
- realtime_start_time_ = realtime_clock::min_time;
- for (const MessageSorter &message_sorter : message_sorters_) {
+// Class to merge start times cleanly, reusably, and incrementally.
+class StartTimes {
+ public:
+ void Update(monotonic_clock::time_point new_monotonic_start_time,
+ realtime_clock::time_point new_realtime_start_time) {
// We want to capture the earliest meaningful start time here. The start
// time defaults to min_time when there's no meaningful value to report, so
// let's ignore those.
- if (message_sorter.monotonic_start_time() != monotonic_clock::min_time) {
+ if (new_monotonic_start_time != monotonic_clock::min_time) {
bool accept = false;
// We want to prioritize start times from the logger node. Really, we
// want to prioritize start times with a valid realtime_clock time. So,
// if we have a start time without a RT clock, prefer a start time with a
// RT clock, even it if is later.
- if (message_sorter.realtime_start_time() != realtime_clock::min_time) {
+ if (new_realtime_start_time != realtime_clock::min_time) {
// We've got a good one. See if the current start time has a good RT
// clock, or if we should use this one instead.
- if (message_sorter.monotonic_start_time() < monotonic_start_time_) {
+ if (new_monotonic_start_time < monotonic_start_time_ ||
+ monotonic_start_time_ == monotonic_clock::min_time) {
accept = true;
} else if (realtime_start_time_ == realtime_clock::min_time) {
// The previous start time doesn't have a good RT time, so it is very
@@ -1919,23 +1920,45 @@
}
} else if (realtime_start_time_ == realtime_clock::min_time) {
// We don't have a RT time, so take the oldest.
- if (message_sorter.monotonic_start_time() < monotonic_start_time_) {
+ if (new_monotonic_start_time < monotonic_start_time_ ||
+ monotonic_start_time_ == monotonic_clock::min_time) {
accept = true;
}
}
if (accept) {
- monotonic_start_time_ = message_sorter.monotonic_start_time();
- realtime_start_time_ = message_sorter.realtime_start_time();
+ monotonic_start_time_ = new_monotonic_start_time;
+ realtime_start_time_ = new_realtime_start_time;
}
}
}
- // If there was no meaningful start time reported, just use min_time.
- if (monotonic_start_time_ == monotonic_clock::max_time) {
- monotonic_start_time_ = monotonic_clock::min_time;
- realtime_start_time_ = realtime_clock::min_time;
+ monotonic_clock::time_point monotonic_start_time() const {
+ return monotonic_start_time_;
}
+ realtime_clock::time_point realtime_start_time() const {
+ return realtime_start_time_;
+ }
+
+ private:
+ monotonic_clock::time_point monotonic_start_time_ = monotonic_clock::min_time;
+ realtime_clock::time_point realtime_start_time_ = realtime_clock::min_time;
+};
+
+PartsMerger::PartsMerger(SelectedLogParts &&parts) {
+ node_ = configuration::GetNodeIndex(parts.config().get(), parts.node_name());
+
+ for (LogPartsAccess part : parts) {
+ message_sorters_.emplace_back(std::move(part));
+ }
+
+ StartTimes start_times;
+ for (const MessageSorter &message_sorter : message_sorters_) {
+ start_times.Update(message_sorter.monotonic_start_time(),
+ message_sorter.realtime_start_time());
+ }
+ monotonic_start_time_ = start_times.monotonic_start_time();
+ realtime_start_time_ = start_times.realtime_start_time();
}
std::vector<const LogParts *> PartsMerger::Parts() const {
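
For a concrete feel of the `StartTimes` preference rules: an earlier start with no realtime clock loses to a later start that carries one. `StartTimes` is file-local to logfile_utils.cc, so this illustrative check assumes access to it, and the values are invented.

```cpp
#include <cassert>
#include <chrono>

void StartTimesExample() {
  StartTimes times;
  // An early boot start recorded before any realtime clock was available.
  times.Update(monotonic_clock::time_point(std::chrono::seconds(1)),
               realtime_clock::min_time);
  // A later start that carries a realtime clock; per the rules above it
  // displaces the earlier RT-less one even though it is newer.
  times.Update(monotonic_clock::time_point(std::chrono::seconds(5)),
               realtime_clock::time_point(std::chrono::seconds(1700000000)));
  assert(times.realtime_start_time() != realtime_clock::min_time);
}
```
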
@@ -1952,6 +1975,8 @@
if (current_ != nullptr) {
Message *result = current_->Front();
CHECK_GE(result->timestamp.time, last_message_time_);
+ VLOG(1) << this << " PartsMerger::Front for node " << node_name() << " "
+ << *result;
return result;
}
@@ -1960,25 +1985,25 @@
Message *oldest = nullptr;
sorted_until_ = monotonic_clock::max_time;
for (MessageSorter &message_sorter : message_sorters_) {
- Message *m = message_sorter.Front();
- if (!m) {
+ Message *msg = message_sorter.Front();
+ if (!msg) {
sorted_until_ = std::min(sorted_until_, message_sorter.sorted_until());
continue;
}
- if (oldest == nullptr || *m < *oldest) {
- oldest = m;
+ if (oldest == nullptr || *msg < *oldest) {
+ oldest = msg;
current_ = &message_sorter;
- } else if (*m == *oldest) {
+ } else if (*msg == *oldest) {
// Found a duplicate. If there is a choice, we want the one which has
// the timestamp time.
- if (!m->data->has_monotonic_timestamp_time) {
+ if (!msg->data->has_monotonic_timestamp_time) {
message_sorter.PopFront();
} else if (!oldest->data->has_monotonic_timestamp_time) {
current_->PopFront();
current_ = &message_sorter;
- oldest = m;
+ oldest = msg;
} else {
- CHECK_EQ(m->data->monotonic_timestamp_time,
+ CHECK_EQ(msg->data->monotonic_timestamp_time,
oldest->data->monotonic_timestamp_time);
message_sorter.PopFront();
}
@@ -1991,6 +2016,11 @@
if (oldest) {
CHECK_GE(oldest->timestamp.time, last_message_time_);
last_message_time_ = oldest->timestamp.time;
+ if (monotonic_oldest_time_ > oldest->timestamp.time) {
+ VLOG(1) << this << " Updating oldest to " << oldest->timestamp.time
+ << " for node " << node_name() << " with a start time of "
+ << monotonic_start_time_ << " " << *oldest;
+ }
monotonic_oldest_time_ =
std::min(monotonic_oldest_time_, oldest->timestamp.time);
} else {
@@ -1999,6 +2029,12 @@
// Return the oldest message found. This will be nullptr if nothing was
// found, indicating there is nothing left.
+ if (oldest) {
+ VLOG(1) << this << " PartsMerger::Front for node " << node_name() << " "
+ << *oldest;
+ } else {
+ VLOG(1) << this << " PartsMerger::Front for node " << node_name();
+ }
return oldest;
}
@@ -2009,29 +2045,52 @@
}
BootMerger::BootMerger(std::string_view node_name,
- const LogFilesContainer &log_files) {
+ const LogFilesContainer &log_files,
+ const std::vector<StoredDataType> &types)
+ : configuration_(log_files.config()),
+ node_(configuration::GetNodeIndex(configuration_.get(), node_name)) {
size_t number_of_boots = log_files.BootsForNode(node_name);
parts_mergers_.reserve(number_of_boots);
for (size_t i = 0; i < number_of_boots; ++i) {
VLOG(2) << "Boot " << i;
- parts_mergers_.emplace_back(
- std::make_unique<PartsMerger>(node_name, i, log_files));
+ SelectedLogParts selected_parts =
+ log_files.SelectParts(node_name, i, types);
+ // We are guaranteed to have something each boot, but not guaranteed to have
+ // both timestamps and data for each boot. If we don't have anything, don't
+ // create a parts merger. The rest of this class will detect that and
+ // ignore it as required.
+ if (selected_parts.empty()) {
+ parts_mergers_.emplace_back(nullptr);
+ } else {
+ parts_mergers_.emplace_back(
+ std::make_unique<PartsMerger>(std::move(selected_parts)));
+ }
}
}
-Message *BootMerger::Front() {
- Message *result = parts_mergers_[index_]->Front();
+std::string_view BootMerger::node_name() const {
+ return configuration::NodeName(configuration().get(), node());
+}
- if (result != nullptr) {
- return result;
+Message *BootMerger::Front() {
+ if (parts_mergers_[index_].get() != nullptr) {
+ Message *result = parts_mergers_[index_]->Front();
+
+ if (result != nullptr) {
+ VLOG(1) << this << " BootMerger::Front " << node_name() << " " << *result;
+ return result;
+ }
}
if (index_ + 1u == parts_mergers_.size()) {
// At the end of the last node merger, just return.
+ VLOG(1) << this << " BootMerger::Front " << node_name() << " nullptr";
return nullptr;
} else {
++index_;
- return Front();
+ Message *result = Front();
+ if (result != nullptr) {
+ VLOG(1) << this << " BootMerger::Front " << node_name() << " " << *result;
+ }
+ return result;
}
}
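
`BootMerger::Front()` now has to step over boots whose parts merger is null because no files matched the requested `StoredDataType`s. The traversal, restated generically (assumes at least one boot, as the real class does):

```cpp
#include <memory>
#include <vector>

template <typename Merger>
auto FrontAcrossBoots(const std::vector<std::unique_ptr<Merger>> &mergers,
                      size_t &index) -> decltype(mergers[index]->Front()) {
  while (true) {
    if (mergers[index] != nullptr) {
      auto *result = mergers[index]->Front();
      if (result != nullptr) {
        return result;
      }
    }
    if (index + 1 == mergers.size()) {
      // At the end of the last boot; nothing left.
      return nullptr;
    }
    // This boot is exhausted (or produced no parts); advance to the next.
    ++index;
  }
}
```
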
@@ -2040,6 +2099,8 @@
std::vector<const LogParts *> BootMerger::Parts() const {
std::vector<const LogParts *> results;
for (const std::unique_ptr<PartsMerger> &parts_merger : parts_mergers_) {
+ if (!parts_merger) continue;
+
std::vector<const LogParts *> node_parts = parts_merger->Parts();
results.insert(results.end(), std::make_move_iterator(node_parts.begin()),
@@ -2049,17 +2110,241 @@
return results;
}
-TimestampMapper::TimestampMapper(std::string_view node_name,
- const LogFilesContainer &log_files)
- : boot_merger_(node_name, log_files),
- timestamp_callback_([](TimestampedMessage *) {}) {
- for (const LogParts *part : boot_merger_.Parts()) {
- if (!configuration_) {
- configuration_ = part->config;
- } else {
- CHECK_EQ(configuration_.get(), part->config.get());
- }
+monotonic_clock::time_point BootMerger::monotonic_start_time(
+ size_t boot) const {
+ CHECK_LT(boot, parts_mergers_.size());
+ if (parts_mergers_[boot]) {
+ return parts_mergers_[boot]->monotonic_start_time();
}
+ return monotonic_clock::min_time;
+}
+
+realtime_clock::time_point BootMerger::realtime_start_time(size_t boot) const {
+ CHECK_LT(boot, parts_mergers_.size());
+ if (parts_mergers_[boot]) {
+ return parts_mergers_[boot]->realtime_start_time();
+ }
+ return realtime_clock::min_time;
+}
+
+monotonic_clock::time_point BootMerger::monotonic_oldest_time(
+ size_t boot) const {
+ CHECK_LT(boot, parts_mergers_.size());
+ if (parts_mergers_[boot]) {
+ return parts_mergers_[boot]->monotonic_oldest_time();
+ }
+ return monotonic_clock::max_time;
+}
+
+bool BootMerger::started() const {
+ if (index_ == 0) {
+ if (!parts_mergers_[0]) {
+ return false;
+ }
+ return parts_mergers_[index_]->sorted_until() != monotonic_clock::min_time;
+ }
+ return true;
+}
+
+SplitTimestampBootMerger::SplitTimestampBootMerger(
+ std::string_view node_name, const LogFilesContainer &log_files,
+ TimestampQueueStrategy timestamp_queue_strategy)
+ : boot_merger_(node_name, log_files,
+ (timestamp_queue_strategy ==
+ TimestampQueueStrategy::kQueueTimestampsAtStartup)
+ ? std::vector<StoredDataType>{StoredDataType::DATA}
+ : std::vector<StoredDataType>{
+ StoredDataType::DATA, StoredDataType::TIMESTAMPS,
+ StoredDataType::REMOTE_TIMESTAMPS}) {
+ // Create the timestamp_boot_merger_ only if we are asked to, and only if
+ // there are files to put in it. We don't need it for a data-only log.
+ if (timestamp_queue_strategy ==
+ TimestampQueueStrategy::kQueueTimestampsAtStartup &&
+ log_files.HasTimestamps(node_name)) {
+ timestamp_boot_merger_ = std::make_unique<BootMerger>(
+ node_name, log_files,
+ std::vector<StoredDataType>{StoredDataType::TIMESTAMPS,
+ StoredDataType::REMOTE_TIMESTAMPS});
+ }
+
+ size_t number_of_boots = log_files.BootsForNode(node_name);
+ monotonic_start_time_.reserve(number_of_boots);
+ realtime_start_time_.reserve(number_of_boots);
+
+ // Start times are split across the timestamp boot merger and the data boot
+ // merger. Pull from both and combine them to get the same answer as before.
+ for (size_t i = 0u; i < number_of_boots; ++i) {
+ StartTimes start_times;
+
+ if (timestamp_boot_merger_) {
+ start_times.Update(timestamp_boot_merger_->monotonic_start_time(i),
+ timestamp_boot_merger_->realtime_start_time(i));
+ }
+
+ start_times.Update(boot_merger_.monotonic_start_time(i),
+ boot_merger_.realtime_start_time(i));
+
+ monotonic_start_time_.push_back(start_times.monotonic_start_time());
+ realtime_start_time_.push_back(start_times.realtime_start_time());
+ }
+}
+
+void SplitTimestampBootMerger::QueueTimestamps(
+ std::function<void(TimestampedMessage *)> fn,
+ const std::vector<size_t> &source_node) {
+ if (!timestamp_boot_merger_) {
+ return;
+ }
+
+ while (true) {
+ // Load all the timestamps. If we find data, ignore it and drop it on the
+ // floor. It will be read when boot_merger_ is used.
+ Message *msg = timestamp_boot_merger_->Front();
+ if (!msg) {
+ queue_timestamps_ran_ = true;
+ return;
+ }
+ if (source_node[msg->channel_index] != static_cast<size_t>(node())) {
+ timestamp_messages_.emplace_back(TimestampedMessage{
+ .channel_index = msg->channel_index,
+ .queue_index = msg->queue_index,
+ .monotonic_event_time = msg->timestamp,
+ .realtime_event_time = msg->data->realtime_sent_time,
+ .remote_queue_index =
+ BootQueueIndex{.boot = msg->monotonic_remote_boot,
+ .index = msg->data->remote_queue_index.value()},
+ .monotonic_remote_time = {msg->monotonic_remote_boot,
+ msg->data->monotonic_remote_time.value()},
+ .realtime_remote_time = msg->data->realtime_remote_time.value(),
+ .monotonic_timestamp_time = {msg->monotonic_timestamp_boot,
+ msg->data->monotonic_timestamp_time},
+ .data = std::move(msg->data)});
+
+ VLOG(2) << this << " Queued timestamp of " << timestamp_messages_.back();
+ fn(×tamp_messages_.back());
+ } else {
+ VLOG(2) << this << " Dropped data";
+ }
+ timestamp_boot_merger_->PopFront();
+ }
+
+ // TODO(austin): Push the queue into TimestampMapper instead. Have it pull
+ // all the timestamps. That will also make it so we don't have to clear the
+ // function.
+}
+
+std::string_view SplitTimestampBootMerger::node_name() const {
+ return configuration::NodeName(configuration().get(), node());
+}
+
+monotonic_clock::time_point SplitTimestampBootMerger::monotonic_start_time(
+ size_t boot) const {
+ CHECK_LT(boot, monotonic_start_time_.size());
+ return monotonic_start_time_[boot];
+}
+
+realtime_clock::time_point SplitTimestampBootMerger::realtime_start_time(
+ size_t boot) const {
+ CHECK_LT(boot, realtime_start_time_.size());
+ return realtime_start_time_[boot];
+}
+
+monotonic_clock::time_point SplitTimestampBootMerger::monotonic_oldest_time(
+ size_t boot) const {
+ if (!timestamp_boot_merger_) {
+ return boot_merger_.monotonic_oldest_time(boot);
+ }
+ return std::min(boot_merger_.monotonic_oldest_time(boot),
+ timestamp_boot_merger_->monotonic_oldest_time(boot));
+}
+
+Message *SplitTimestampBootMerger::Front() {
+ Message *boot_merger_front = boot_merger_.Front();
+
+ if (timestamp_boot_merger_) {
+ CHECK(queue_timestamps_ran_);
+ }
+
+ // timestamp_messages_ is a queue of TimestampedMessage, but we are supposed
+ // to return a Message. We need to convert the first message in the list
+ // before returning it (and comparing, honestly). Fill next_timestamp_ in if
+ // it is empty so the rest of the logic here can just look at next_timestamp_
+ // and use that instead.
+ if (!next_timestamp_ && !timestamp_messages_.empty()) {
+ auto &front = timestamp_messages_.front();
+ next_timestamp_ = Message{
+ .channel_index = front.channel_index,
+ .queue_index = front.queue_index,
+ .timestamp = front.monotonic_event_time,
+ .monotonic_remote_boot = front.remote_queue_index.boot,
+ .monotonic_timestamp_boot = front.monotonic_timestamp_time.boot,
+ .data = std::move(front.data),
+ };
+ timestamp_messages_.pop_front();
+ }
+
+ if (!next_timestamp_) {
+ message_source_ = MessageSource::kBootMerger;
+ if (boot_merger_front != nullptr) {
+ VLOG(1) << this << " SplitTimestampBootMerger::Front " << node_name()
+ << " " << *boot_merger_front;
+ } else {
+ VLOG(1) << this << " SplitTimestampBootMerger::Front " << node_name()
+ << " nullptr";
+ }
+ return boot_merger_front;
+ }
+
+ if (boot_merger_front == nullptr) {
+ message_source_ = MessageSource::kTimestampMessage;
+
+ VLOG(1) << this << " SplitTimestampBootMerger::Front " << node_name() << " "
+ << next_timestamp_.value();
+ return &next_timestamp_.value();
+ }
+
+ if (*boot_merger_front <= next_timestamp_.value()) {
+ if (*boot_merger_front == next_timestamp_.value()) {
+ VLOG(1) << this << " SplitTimestampBootMerger::Front " << node_name()
+ << " Dropping duplicate timestamp.";
+ next_timestamp_.reset();
+ }
+ message_source_ = MessageSource::kBootMerger;
+ // boot_merger_front is non-null here; the nullptr case returned above.
+ VLOG(1) << this << " SplitTimestampBootMerger::Front " << node_name()
+ << " " << *boot_merger_front;
+ return boot_merger_front;
+ } else {
+ message_source_ = MessageSource::kTimestampMessage;
+ VLOG(1) << this << " SplitTimestampBootMerger::Front " << node_name() << " "
+ << next_timestamp_.value();
+ return &next_timestamp_.value();
+ }
+}
+
+void SplitTimestampBootMerger::PopFront() {
+ switch (message_source_) {
+ case MessageSource::kTimestampMessage:
+ CHECK(next_timestamp_.has_value());
+ next_timestamp_.reset();
+ break;
+ case MessageSource::kBootMerger:
+ boot_merger_.PopFront();
+ break;
+ }
+}
+
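Front() and PopFront() above implement a two-way merge between the data stream and the buffered timestamps: return the smaller front, and when both fronts compare equal, drop the timestamp copy so the message comes out once. A minimal sketch of that discipline over plain ints; the real code additionally records which side produced the front so PopFront() pops the right queue.

    #include <deque>
    #include <iostream>

    // Sketch of the Front()/PopFront() merge: take the smaller front, and when
    // both fronts are equal, drop the timestamp-side copy before returning.
    int main() {
      std::deque<int> data = {1, 3, 3, 7};
      std::deque<int> timestamps = {2, 3, 8};

      while (!data.empty() || !timestamps.empty()) {
        if (timestamps.empty() ||
            (!data.empty() && data.front() <= timestamps.front())) {
          if (!timestamps.empty() && data.front() == timestamps.front()) {
            timestamps.pop_front();  // duplicate: keep only the data copy
          }
          std::cout << data.front() << " (data)\n";
          data.pop_front();
        } else {
          std::cout << timestamps.front() << " (timestamp)\n";
          timestamps.pop_front();
        }
      }
    }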
+TimestampMapper::TimestampMapper(
+ std::string_view node_name, const LogFilesContainer &log_files,
+ TimestampQueueStrategy timestamp_queue_strategy)
+ : boot_merger_(node_name, log_files, timestamp_queue_strategy),
+ timestamp_callback_([](TimestampedMessage *) {}) {
+ configuration_ = boot_merger_.configuration();
+
const Configuration *config = configuration_.get();
// Only fill out nodes_data_ if there are nodes. Otherwise, everything is
// pretty simple.
@@ -2118,17 +2403,18 @@
}
}
-void TimestampMapper::QueueMessage(Message *m) {
+void TimestampMapper::QueueMessage(Message *msg) {
matched_messages_.emplace_back(
- TimestampedMessage{.channel_index = m->channel_index,
- .queue_index = m->queue_index,
- .monotonic_event_time = m->timestamp,
- .realtime_event_time = m->data->realtime_sent_time,
+ TimestampedMessage{.channel_index = msg->channel_index,
+ .queue_index = msg->queue_index,
+ .monotonic_event_time = msg->timestamp,
+ .realtime_event_time = msg->data->realtime_sent_time,
.remote_queue_index = BootQueueIndex::Invalid(),
.monotonic_remote_time = BootTimestamp::min_time(),
.realtime_remote_time = realtime_clock::min_time,
.monotonic_timestamp_time = BootTimestamp::min_time(),
- .data = std::move(m->data)});
+ .data = std::move(msg->data)});
+ VLOG(1) << node_name() << " Inserted " << matched_messages_.back();
}
TimestampedMessage *TimestampMapper::Front() {
@@ -2137,18 +2423,26 @@
case FirstMessage::kNeedsUpdate:
break;
case FirstMessage::kInMessage:
+ VLOG(1) << this << " TimestampMapper::Front " << node_name() << " "
+ << matched_messages_.front();
return &matched_messages_.front();
case FirstMessage::kNullptr:
+ VLOG(1) << this << " TimestampMapper::Front " << node_name()
+ << " nullptr";
return nullptr;
}
if (matched_messages_.empty()) {
if (!QueueMatched()) {
first_message_ = FirstMessage::kNullptr;
+ VLOG(1) << this << " TimestampMapper::Front " << node_name()
+ << " nullptr";
return nullptr;
}
}
first_message_ = FirstMessage::kInMessage;
+ VLOG(1) << this << " TimestampMapper::Front " << node_name() << " "
+ << matched_messages_.front();
return &matched_messages_.front();
}
@@ -2164,6 +2458,7 @@
const TimestampedMessage & /*message*/) {
if (replay_channels_callback_ &&
!replay_channels_callback_(matched_messages_.back())) {
+ VLOG(1) << node_name() << " Popped " << matched_messages_.back();
matched_messages_.pop_back();
return true;
}
@@ -2174,15 +2469,16 @@
if (nodes_data_.empty()) {
// Simple path. We are single node, so there are no timestamps to match!
CHECK_EQ(messages_.size(), 0u);
- Message *m = boot_merger_.Front();
- if (!m) {
+ Message *msg = boot_merger_.Front();
+ if (!msg) {
return MatchResult::kEndOfFile;
}
// Enqueue this message into matched_messages_ so we have a place to
// associate remote timestamps, and return it.
- QueueMessage(m);
+ QueueMessage(msg);
- CHECK_GE(matched_messages_.back().monotonic_event_time, last_message_time_);
+ CHECK_GE(matched_messages_.back().monotonic_event_time, last_message_time_)
+ << " on " << node_name();
last_message_time_ = matched_messages_.back().monotonic_event_time;
 // We are a thin wrapper around parts_merger. Call it directly.
@@ -2208,12 +2504,13 @@
boot_merger_.PopFront();
}
- Message *m = &(messages_.front());
+ Message *msg = &(messages_.front());
- if (source_node_[m->channel_index] == node()) {
+ if (source_node_[msg->channel_index] == node()) {
// From us, just forward it on, filling the remote data in as invalid.
- QueueMessage(m);
- CHECK_GE(matched_messages_.back().monotonic_event_time, last_message_time_);
+ QueueMessage(msg);
+ CHECK_GE(matched_messages_.back().monotonic_event_time, last_message_time_)
+ << " on " << node_name();
last_message_time_ = matched_messages_.back().monotonic_event_time;
messages_.pop_front();
timestamp_callback_(&matched_messages_.back());
@@ -2224,25 +2521,28 @@
} else {
// Got a timestamp, find the matching remote data, match it, and return
// it.
- Message data = MatchingMessageFor(*m);
+ Message data = MatchingMessageFor(*msg);
// Return the data from the remote. The local message only has timestamp
// info which isn't relevant anymore once extracted.
matched_messages_.emplace_back(TimestampedMessage{
- .channel_index = m->channel_index,
- .queue_index = m->queue_index,
- .monotonic_event_time = m->timestamp,
- .realtime_event_time = m->data->realtime_sent_time,
+ .channel_index = msg->channel_index,
+ .queue_index = msg->queue_index,
+ .monotonic_event_time = msg->timestamp,
+ .realtime_event_time = msg->data->realtime_sent_time,
.remote_queue_index =
- BootQueueIndex{.boot = m->monotonic_remote_boot,
- .index = m->data->remote_queue_index.value()},
- .monotonic_remote_time = {m->monotonic_remote_boot,
- m->data->monotonic_remote_time.value()},
- .realtime_remote_time = m->data->realtime_remote_time.value(),
- .monotonic_timestamp_time = {m->monotonic_timestamp_boot,
- m->data->monotonic_timestamp_time},
+ BootQueueIndex{.boot = msg->monotonic_remote_boot,
+ .index = msg->data->remote_queue_index.value()},
+ .monotonic_remote_time = {msg->monotonic_remote_boot,
+ msg->data->monotonic_remote_time.value()},
+ .realtime_remote_time = msg->data->realtime_remote_time.value(),
+ .monotonic_timestamp_time = {msg->monotonic_timestamp_boot,
+ msg->data->monotonic_timestamp_time},
.data = std::move(data.data)});
- CHECK_GE(matched_messages_.back().monotonic_event_time, last_message_time_);
+ VLOG(1) << node_name() << " Inserted timestamp "
+ << matched_messages_.back();
+ CHECK_GE(matched_messages_.back().monotonic_event_time, last_message_time_)
+ << " on " << node_name() << " " << matched_messages_.back();
last_message_time_ = matched_messages_.back().monotonic_event_time;
// Since messages_ holds the data, drop it.
messages_.pop_front();
@@ -2298,6 +2598,7 @@
last_popped_message_time_ = Front()->monotonic_event_time;
first_message_ = FirstMessage::kNeedsUpdate;
+ VLOG(1) << node_name() << " Popped " << matched_messages_.back();
matched_messages_.pop_front();
}
@@ -2391,9 +2692,9 @@
auto it = std::find_if(
data_queue->begin(), data_queue->end(),
[remote_queue_index,
- remote_boot = monotonic_remote_time.boot](const Message &m) {
- return m.queue_index == remote_queue_index &&
- m.timestamp.boot == remote_boot;
+ remote_boot = monotonic_remote_time.boot](const Message &msg) {
+ return msg.queue_index == remote_queue_index &&
+ msg.timestamp.boot == remote_boot;
});
if (it == data_queue->end()) {
return Message{.channel_index = message.channel_index,
@@ -2444,14 +2745,14 @@
}
bool TimestampMapper::Queue() {
- Message *m = boot_merger_.Front();
- if (m == nullptr) {
+ Message *msg = boot_merger_.Front();
+ if (msg == nullptr) {
return false;
}
for (NodeData &node_data : nodes_data_) {
if (!node_data.any_delivered) continue;
if (!node_data.save_for_peer) continue;
- if (node_data.channels[m->channel_index].delivered) {
+ if (node_data.channels[msg->channel_index].delivered) {
// If we have data but no timestamps (logs where the timestamps didn't get
// logged are classic), we can grow this indefinitely. We don't need to
// keep anything that is older than the last message returned.
@@ -2459,11 +2760,11 @@
// We have the time on the source node.
// We care to wait until we have the time on the destination node.
std::deque<Message> &messages =
- node_data.channels[m->channel_index].messages;
+ node_data.channels[msg->channel_index].messages;
// Max delay over the network is the TTL, so let's take the queue time and
// add TTL to it. Don't forget any messages which are reliable until
// someone can come up with a good reason to forget those too.
- if (node_data.channels[m->channel_index].time_to_live >
+ if (node_data.channels[msg->channel_index].time_to_live >
chrono::nanoseconds(0)) {
// We need to make *some* assumptions about network delay for this to
// work. We want to only look at the RX side. This means we need to
@@ -2477,21 +2778,25 @@
// messages getting sent twice.
while (messages.size() > 1u &&
messages.begin()->timestamp +
- node_data.channels[m->channel_index].time_to_live +
+ node_data.channels[msg->channel_index].time_to_live +
chrono::duration_cast<chrono::nanoseconds>(
chrono::duration<double>(FLAGS_max_network_delay)) <
last_popped_message_time_) {
messages.pop_front();
}
}
- node_data.channels[m->channel_index].messages.emplace_back(*m);
+ node_data.channels[msg->channel_index].messages.emplace_back(*msg);
}
}
- messages_.emplace_back(std::move(*m));
+ messages_.emplace_back(std::move(*msg));
return true;
}
+void TimestampMapper::QueueTimestamps() {
+ boot_merger_.QueueTimestamps(std::ref(timestamp_callback_), source_node_);
+}
+
std::string TimestampMapper::DebugString() const {
std::stringstream ss;
ss << "node " << node() << " (" << node_name() << ") [\n";
@@ -2510,8 +2815,8 @@
}
ss << " channel " << channel_index << " [\n";
- for (const Message &m : channel_data.messages) {
- ss << " " << m << "\n";
+ for (const Message &msg : channel_data.messages) {
+ ss << " " << msg << "\n";
}
ss << " ]\n";
++channel_index;
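One detail worth calling out from the Queue() hunk above: buffered peer messages on lossy channels are pruned once they can no longer match any future timestamp, using send time plus TTL plus a network-delay fudge factor (FLAGS_max_network_delay). A toy sketch of that rule over plain nanosecond counts; the size() > 1 guard mirrors the real loop, which always keeps at least one message.

    #include <cstdint>
    #include <deque>
    #include <iostream>

    int main() {
      const int64_t ttl_ns = 100;               // channel time_to_live
      const int64_t max_network_delay_ns = 50;  // stand-in for the flag
      const int64_t last_popped_ns = 500;       // newest returned message

      std::deque<int64_t> send_times = {100, 300, 400, 480};
      while (send_times.size() > 1 &&
             send_times.front() + ttl_ns + max_network_delay_ns <
                 last_popped_ns) {
        send_times.pop_front();  // too old to match any timestamp still coming
      }
      for (int64_t t : send_times) std::cout << t << "\n";  // 400, 480 remain
    }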
diff --git a/aos/events/logging/logfile_utils.h b/aos/events/logging/logfile_utils.h
index 18902d3..fb3fcbf 100644
--- a/aos/events/logging/logfile_utils.h
+++ b/aos/events/logging/logfile_utils.h
@@ -18,6 +18,7 @@
#include "absl/types/span.h"
#include "flatbuffers/flatbuffers.h"
+#include "aos/configuration.h"
#include "aos/containers/resizeable_buffer.h"
#include "aos/events/event_loop.h"
#include "aos/events/logging/boot_timestamp.h"
@@ -288,8 +289,8 @@
return raw_log_file_header_;
}
- // Returns the minimum maount of data needed to queue up for sorting before
- // ware guarenteed to not see data out of order.
+ // Returns the minimum amount of data needed to queue up for sorting before
+ // we're guaranteed to not see data out of order.
std::chrono::nanoseconds max_out_of_order_duration() const {
return max_out_of_order_duration_;
}
@@ -376,7 +377,7 @@
// Returns the minimum amount of data needed to queue up for sorting before
 // we are guaranteed to not see data out of order.
std::chrono::nanoseconds max_out_of_order_duration() const {
- return message_reader_.max_out_of_order_duration();
+ return max_out_of_order_duration_;
}
// Returns the newest timestamp read out of the log file.
@@ -432,6 +433,8 @@
// Per node boot counts.
std::vector<std::optional<size_t>> boot_counts_;
+
+ const std::chrono::nanoseconds max_out_of_order_duration_;
};
// Stores MessageHeader as a flat header and inline, aligned block of data.
@@ -526,6 +529,7 @@
std::shared_ptr<UnpackedMessageHeader> data;
bool operator<(const Message &m2) const;
+ bool operator<=(const Message &m2) const;
bool operator>=(const Message &m2) const;
bool operator==(const Message &m2) const;
};
@@ -608,8 +612,7 @@
// boot.
class PartsMerger {
public:
- PartsMerger(std::string_view node_name, size_t boot_count,
- const LogFilesContainer &log_files);
+ PartsMerger(SelectedLogParts &&selected_parts);
// Copying and moving will mess up the internal raw pointers. Just don't do
// it.
@@ -621,11 +624,15 @@
// Node index in the configuration of this node.
int node() const { return node_; }
+ std::string_view node_name() const {
+ return configuration::NodeName(configuration().get(), node());
+ }
+
// List of parts being sorted together.
std::vector<const LogParts *> Parts() const;
- const Configuration *configuration() const {
- return message_sorters_[0].parts().config.get();
+ const std::shared_ptr<const Configuration> configuration() const {
+ return message_sorters_[0].parts().config;
}
monotonic_clock::time_point monotonic_start_time() const {
@@ -634,7 +641,15 @@
realtime_clock::time_point realtime_start_time() const {
return realtime_start_time_;
}
- monotonic_clock::time_point monotonic_oldest_time() const {
+
+ // Returns the oldest message observed in this set of parts. This could be
+ // before the start time if a message from long before was fetched at the
+ // start of logging.
+ monotonic_clock::time_point monotonic_oldest_time() {
+ if (monotonic_oldest_time_ == monotonic_clock::max_time) {
+ VLOG(1) << "No oldest message time, fetching " << node_name();
+ (void)Front();
+ }
return monotonic_oldest_time_;
}
@@ -675,7 +690,8 @@
// stream.
class BootMerger {
public:
- BootMerger(std::string_view node_name, const LogFilesContainer &log_files);
+ BootMerger(std::string_view node_name, const LogFilesContainer &log_files,
+ const std::vector<StoredDataType> &types);
// Copying and moving will mess up the internal raw pointers. Just don't do
// it.
@@ -685,33 +701,21 @@
void operator=(BootMerger &&) = delete;
// Node index in the configuration of this node.
- int node() const { return parts_mergers_[0]->node(); }
+ int node() const { return node_; }
+ std::string_view node_name() const;
// List of parts being sorted together.
std::vector<const LogParts *> Parts() const;
- const Configuration *configuration() const {
- return parts_mergers_[0]->configuration();
+ const std::shared_ptr<const Configuration> configuration() const {
+ return configuration_;
}
- monotonic_clock::time_point monotonic_start_time(size_t boot) const {
- CHECK_LT(boot, parts_mergers_.size());
- return parts_mergers_[boot]->monotonic_start_time();
- }
- realtime_clock::time_point realtime_start_time(size_t boot) const {
- CHECK_LT(boot, parts_mergers_.size());
- return parts_mergers_[boot]->realtime_start_time();
- }
- monotonic_clock::time_point monotonic_oldest_time(size_t boot) const {
- CHECK_LT(boot, parts_mergers_.size());
- return parts_mergers_[boot]->monotonic_oldest_time();
- }
+ monotonic_clock::time_point monotonic_start_time(size_t boot) const;
+ realtime_clock::time_point realtime_start_time(size_t boot) const;
+ monotonic_clock::time_point monotonic_oldest_time(size_t boot) const;
- bool started() const {
- return parts_mergers_[index_]->sorted_until() !=
- monotonic_clock::min_time ||
- index_ != 0;
- }
+ bool started() const;
// Returns the next sorted message from the set of log files. It is safe to
// call std::move() on the result to move the data flatbuffer from it.
@@ -725,7 +729,101 @@
// TODO(austin): Sanjay points out this is pretty inefficient. Don't keep so
// many things open.
+ // A list of all the parts mergers. Only the boots with something to sort are
+ // instantiated.
std::vector<std::unique_ptr<PartsMerger>> parts_mergers_;
+
+ std::shared_ptr<const Configuration> configuration_;
+ int node_;
+};
+
+enum class TimestampQueueStrategy {
+ // Read the timestamps at the same time as all the other data.
+ kQueueTogether,
+ // Read the timestamps first.
+ kQueueTimestampsAtStartup,
+};
+
+// Class to manage queueing up timestamps from BootMerger and notifying
+// TimestampMapper of them.
+class SplitTimestampBootMerger {
+ public:
+ SplitTimestampBootMerger(std::string_view node_name,
+ const LogFilesContainer &log_files,
+ TimestampQueueStrategy timestamp_queue_strategy);
+
+ // Copying and moving will mess up the internal raw pointers. Just don't do
+ // it.
+ SplitTimestampBootMerger(SplitTimestampBootMerger const &) = delete;
+ SplitTimestampBootMerger(SplitTimestampBootMerger &&) = delete;
+ void operator=(SplitTimestampBootMerger const &) = delete;
+ void operator=(SplitTimestampBootMerger &&) = delete;
+
+ // Reads all timestamps into a member variable queue and calls the provided
+ // function on each one. Only timestamps are saved: messages sent on this
+ // node but not originating from this node. To make that distinction,
+ // source_node maps each channel index to the node index of that channel's
+ // source node.
+ void QueueTimestamps(std::function<void(TimestampedMessage *)> fn,
+ const std::vector<size_t> &source_node);
+
+ // Node index in the configuration of this node.
+ int node() const { return boot_merger_.node(); }
+ // Returns the name of the node this class is sorting for.
+ std::string_view node_name() const;
+
+ std::shared_ptr<const Configuration> configuration() const {
+ return boot_merger_.configuration();
+ }
+
+ monotonic_clock::time_point monotonic_start_time(size_t boot) const;
+ realtime_clock::time_point realtime_start_time(size_t boot) const;
+ monotonic_clock::time_point monotonic_oldest_time(size_t boot) const;
+
+ // Returns true if the log has been started.
+ bool started() const {
+ // Timestamps don't count, so only track boot_merger_.
+ return boot_merger_.started();
+ }
+
+ // Returns the next sorted message from the set of log files. It is safe to
+ // call std::move() on the result to move the data flatbuffer from it.
+ Message *Front();
+
+ // Pops the front message. This should only be called after a call to
+ // Front().
+ void PopFront();
+
+ private:
+ enum class MessageSource {
+ kTimestampMessage,
+ kBootMerger,
+ };
+
+ MessageSource message_source_ = MessageSource::kBootMerger;
+
+ // Boot merger for data and potentially timestamps.
+ BootMerger boot_merger_;
+
+ // Boot merger for just timestamps. Any data read from here is to be ignored.
+ std::unique_ptr<BootMerger> timestamp_boot_merger_;
+
+ // The callback requires us to convert each message to a TimestampedMessage.
+ std::deque<TimestampedMessage> timestamp_messages_;
+
+ // Storage for the next timestamp message to return. This is separate so we
+ // can convert a TimestampedMessage back into a Message.
+ //
+ // TODO(austin): It would be nice to not have to convert...
+ std::optional<Message> next_timestamp_;
+
+ // Start times for each boot.
+ std::vector<monotonic_clock::time_point> monotonic_start_time_;
+ std::vector<realtime_clock::time_point> realtime_start_time_;
+
+ // Tracks if QueueTimestamps loaded any timestamps.
+ bool queue_timestamps_ran_ = false;
};
// Class to match timestamps with the corresponding data from other nodes.
@@ -735,7 +833,8 @@
class TimestampMapper {
public:
TimestampMapper(std::string_view node_name,
- const LogFilesContainer &log_files);
+ const LogFilesContainer &log_files,
+ TimestampQueueStrategy timestamp_queue_strategy);
// Copying and moving will mess up the internal raw pointers. Just don't do
// it.
@@ -782,6 +881,11 @@
// Returns debug information about this node.
std::string DebugString() const;
+ // Queues just the timestamps so that the timestamp callback gets called.
+ // Note: the timestamp callback also fires when the timestamps are returned
+ // later, so unset it if you don't want to be called twice.
+ void QueueTimestamps();
+
 // Queues data until the provided time.
void QueueUntil(BootTimestamp queue_time);
// Queues until we have time_estimation_buffer of data in the queue.
@@ -880,15 +984,11 @@
// Returns the name of the node this class is sorting for.
std::string_view node_name() const {
- return configuration_->has_nodes() ? configuration_->nodes()
- ->Get(boot_merger_.node())
- ->name()
- ->string_view()
- : "(single node)";
+ return configuration::NodeName(configuration(), node());
}
// The node merger to source messages from.
- BootMerger boot_merger_;
+ SplitTimestampBootMerger boot_merger_;
std::shared_ptr<const Configuration> configuration_;
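To tie the new pieces of this header together, here is a hypothetical wiring of TimestampMapper with the startup-queueing strategy. Every name used appears in this diff; building the LogFilesContainer is assumed to have happened elsewhere, and the final set_timestamp_callback call addresses the double-call caveat documented on QueueTimestamps().

    #include <cstddef>

    #include "aos/events/logging/logfile_utils.h"

    // Hypothetical: queue all timestamps up front, then unset the callback so
    // it doesn't fire a second time when the same timestamps come back out of
    // Front()/PopFront().
    void PresolveTimestamps(const aos::logger::LogFilesContainer &log_files) {
      aos::logger::TimestampMapper mapper(
          "pi1", log_files,
          aos::logger::TimestampQueueStrategy::kQueueTimestampsAtStartup);

      size_t count = 0;
      mapper.set_timestamp_callback(
          [&](aos::logger::TimestampedMessage *) { ++count; });
      mapper.QueueTimestamps();

      // Avoid double-counting: the callback would otherwise also run as the
      // queued timestamps are replayed through Front().
      mapper.set_timestamp_callback([](aos::logger::TimestampedMessage *) {});
    }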
diff --git a/aos/events/logging/logfile_utils_test.cc b/aos/events/logging/logfile_utils_test.cc
index f4071e4..ea56d8d 100644
--- a/aos/events/logging/logfile_utils_test.cc
+++ b/aos/events/logging/logfile_utils_test.cc
@@ -216,8 +216,13 @@
writer.QueueSpan(m2.span());
}
+ // When parts are sorted, we choose the highest max out of order duration for
+ // all parts with the same part uuid.
const std::vector<LogFile> parts = SortParts({logfile0, logfile1});
+ EXPECT_EQ(parts.size(), 1);
+ EXPECT_EQ(parts[0].parts.size(), 1);
+
PartsMessageReader reader(parts[0].parts[0]);
EXPECT_EQ(reader.filename(), logfile0);
@@ -225,16 +230,18 @@
// Confirm that the timestamps track, and the filename also updates.
// Read the first message.
EXPECT_EQ(reader.newest_timestamp(), monotonic_clock::min_time);
+ // Since config1 has a higher max out of order duration, that will be used to
+ // read part files with the same part uuid, i.e. logfile0 and logfile1.
EXPECT_EQ(
reader.max_out_of_order_duration(),
- std::chrono::nanoseconds(config0.message().max_out_of_order_duration()));
+ std::chrono::nanoseconds(config1.message().max_out_of_order_duration()));
EXPECT_TRUE(reader.ReadMessage());
EXPECT_EQ(reader.filename(), logfile0);
EXPECT_EQ(reader.newest_timestamp(),
monotonic_clock::time_point(chrono::nanoseconds(1)));
EXPECT_EQ(
reader.max_out_of_order_duration(),
- std::chrono::nanoseconds(config0.message().max_out_of_order_duration()));
+ std::chrono::nanoseconds(config1.message().max_out_of_order_duration()));
// Read the second message.
EXPECT_TRUE(reader.ReadMessage());
@@ -252,6 +259,11 @@
reader.max_out_of_order_duration(),
std::chrono::nanoseconds(config1.message().max_out_of_order_duration()));
EXPECT_EQ(reader.newest_timestamp(), monotonic_clock::max_time);
+
+ // Verify that the parts metadata has the correct max out of order duration.
+ EXPECT_EQ(
+ parts[0].parts[0].max_out_of_order_duration,
+ std::chrono::nanoseconds(config1.message().max_out_of_order_duration()));
}
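The rule this test exercises, sketched as a plain reduction: across part files sharing a part uuid, sorting advertises the largest max_out_of_order_duration, so a reader buffering that much is safe for every part in the set.

    #include <algorithm>
    #include <chrono>
    #include <iostream>
    #include <vector>

    // Sketch of the selection rule: the largest per-part value wins, so one
    // buffer size covers every part file with the same part uuid.
    int main() {
      using std::chrono::nanoseconds;
      const std::vector<nanoseconds> per_part = {
          nanoseconds(100000000),    // e.g. config0
          nanoseconds(2000000000)};  // e.g. config1
      nanoseconds merged = nanoseconds(0);
      for (nanoseconds d : per_part) merged = std::max(merged, d);
      std::cout << merged.count() << "ns\n";  // 2000000000ns: config1 wins
    }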
// Tests that Message's operator < works as expected.
@@ -767,7 +779,10 @@
LogFilesContainer log_files(parts);
ASSERT_EQ(parts.size(), 1u);
- PartsMerger merger("pi1", 0, log_files);
+ PartsMerger merger(
+ log_files.SelectParts("pi1", 0,
+ {StoredDataType::DATA, StoredDataType::TIMESTAMPS,
+ StoredDataType::REMOTE_TIMESTAMPS}));
EXPECT_EQ(merger.sorted_until(), monotonic_clock::min_time);
@@ -869,7 +884,10 @@
LogFilesContainer log_files(parts);
ASSERT_EQ(parts.size(), 1u);
- PartsMerger merger("pi1", 0, log_files);
+ PartsMerger merger(
+ log_files.SelectParts("pi1", 0,
+ {StoredDataType::DATA, StoredDataType::TIMESTAMPS,
+ StoredDataType::REMOTE_TIMESTAMPS}));
EXPECT_EQ(merger.sorted_until(), monotonic_clock::min_time);
@@ -937,11 +955,13 @@
size_t mapper0_count = 0;
- TimestampMapper mapper0("pi1", log_files);
+ TimestampMapper mapper0("pi1", log_files,
+ TimestampQueueStrategy::kQueueTogether);
mapper0.set_timestamp_callback(
[&](TimestampedMessage *) { ++mapper0_count; });
size_t mapper1_count = 0;
- TimestampMapper mapper1("pi2", log_files);
+ TimestampMapper mapper1("pi2", log_files,
+ TimestampQueueStrategy::kQueueTogether);
mapper1.set_timestamp_callback(
[&](TimestampedMessage *) { ++mapper1_count; });
@@ -1082,13 +1102,15 @@
// messages due to the channel filter callbacks used
size_t mapper0_count = 0;
- TimestampMapper mapper0("pi1", log_files);
+ TimestampMapper mapper0("pi1", log_files,
+ TimestampQueueStrategy::kQueueTogether);
mapper0.set_timestamp_callback(
[&](TimestampedMessage *) { ++mapper0_count; });
mapper0.set_replay_channels_callback(
[&](const TimestampedMessage &) -> bool { return mapper0_count != 2; });
size_t mapper1_count = 0;
- TimestampMapper mapper1("pi2", log_files);
+ TimestampMapper mapper1("pi2", log_files,
+ TimestampQueueStrategy::kQueueTogether);
mapper1.set_timestamp_callback(
[&](TimestampedMessage *) { ++mapper1_count; });
mapper1.set_replay_channels_callback(
@@ -1219,11 +1241,13 @@
ASSERT_EQ(parts.size(), 1u);
size_t mapper0_count = 0;
- TimestampMapper mapper0("pi1", log_files);
+ TimestampMapper mapper0("pi1", log_files,
+ TimestampQueueStrategy::kQueueTogether);
mapper0.set_timestamp_callback(
[&](TimestampedMessage *) { ++mapper0_count; });
size_t mapper1_count = 0;
- TimestampMapper mapper1("pi2", log_files);
+ TimestampMapper mapper1("pi2", log_files,
+ TimestampQueueStrategy::kQueueTogether);
mapper1.set_timestamp_callback(
[&](TimestampedMessage *) { ++mapper1_count; });
@@ -1341,11 +1365,13 @@
ASSERT_EQ(parts[1].logger_node, "pi2");
size_t mapper0_count = 0;
- TimestampMapper mapper0("pi1", log_files);
+ TimestampMapper mapper0("pi1", log_files,
+ TimestampQueueStrategy::kQueueTogether);
mapper0.set_timestamp_callback(
[&](TimestampedMessage *) { ++mapper0_count; });
size_t mapper1_count = 0;
- TimestampMapper mapper1("pi2", log_files);
+ TimestampMapper mapper1("pi2", log_files,
+ TimestampQueueStrategy::kQueueTogether);
mapper1.set_timestamp_callback(
[&](TimestampedMessage *) { ++mapper1_count; });
@@ -1461,11 +1487,13 @@
ASSERT_EQ(parts[1].logger_node, "pi2");
size_t mapper0_count = 0;
- TimestampMapper mapper0("pi1", log_files);
+ TimestampMapper mapper0("pi1", log_files,
+ TimestampQueueStrategy::kQueueTogether);
mapper0.set_timestamp_callback(
[&](TimestampedMessage *) { ++mapper0_count; });
size_t mapper1_count = 0;
- TimestampMapper mapper1("pi2", log_files);
+ TimestampMapper mapper1("pi2", log_files,
+ TimestampQueueStrategy::kQueueTogether);
mapper1.set_timestamp_callback(
[&](TimestampedMessage *) { ++mapper1_count; });
@@ -1545,11 +1573,13 @@
ASSERT_EQ(parts[1].logger_node, "pi2");
size_t mapper0_count = 0;
- TimestampMapper mapper0("pi1", log_files);
+ TimestampMapper mapper0("pi1", log_files,
+ TimestampQueueStrategy::kQueueTogether);
mapper0.set_timestamp_callback(
[&](TimestampedMessage *) { ++mapper0_count; });
size_t mapper1_count = 0;
- TimestampMapper mapper1("pi2", log_files);
+ TimestampMapper mapper1("pi2", log_files,
+ TimestampQueueStrategy::kQueueTogether);
mapper1.set_timestamp_callback(
[&](TimestampedMessage *) { ++mapper1_count; });
@@ -1630,11 +1660,13 @@
ASSERT_EQ(parts[1].logger_node, "pi2");
size_t mapper0_count = 0;
- TimestampMapper mapper0("pi1", log_files);
+ TimestampMapper mapper0("pi1", log_files,
+ TimestampQueueStrategy::kQueueTogether);
mapper0.set_timestamp_callback(
[&](TimestampedMessage *) { ++mapper0_count; });
size_t mapper1_count = 0;
- TimestampMapper mapper1("pi2", log_files);
+ TimestampMapper mapper1("pi2", log_files,
+ TimestampQueueStrategy::kQueueTogether);
mapper1.set_timestamp_callback(
[&](TimestampedMessage *) { ++mapper1_count; });
@@ -1705,11 +1737,13 @@
ASSERT_EQ(parts[1].logger_node, "pi2");
size_t mapper0_count = 0;
- TimestampMapper mapper0("pi1", log_files);
+ TimestampMapper mapper0("pi1", log_files,
+ TimestampQueueStrategy::kQueueTogether);
mapper0.set_timestamp_callback(
[&](TimestampedMessage *) { ++mapper0_count; });
size_t mapper1_count = 0;
- TimestampMapper mapper1("pi2", log_files);
+ TimestampMapper mapper1("pi2", log_files,
+ TimestampQueueStrategy::kQueueTogether);
mapper1.set_timestamp_callback(
[&](TimestampedMessage *) { ++mapper1_count; });
@@ -1769,7 +1803,8 @@
LogFilesContainer log_files(parts);
size_t mapper0_count = 0;
- TimestampMapper mapper0("pi1", log_files);
+ TimestampMapper mapper0("pi1", log_files,
+ TimestampQueueStrategy::kQueueTogether);
mapper0.set_timestamp_callback(
[&](TimestampedMessage *) { ++mapper0_count; });
@@ -1811,7 +1846,8 @@
ASSERT_EQ(parts[1].logger_node, "pi2");
size_t mapper1_count = 0;
- TimestampMapper mapper1("pi2", log_files);
+ TimestampMapper mapper1("pi2", log_files,
+ TimestampQueueStrategy::kQueueTogether);
mapper1.set_timestamp_callback(
[&](TimestampedMessage *) { ++mapper1_count; });
@@ -1885,11 +1921,13 @@
ASSERT_EQ(parts[1].logger_node, "pi2");
size_t mapper0_count = 0;
- TimestampMapper mapper0("pi1", log_files);
+ TimestampMapper mapper0("pi1", log_files,
+ TimestampQueueStrategy::kQueueTogether);
mapper0.set_timestamp_callback(
[&](TimestampedMessage *) { ++mapper0_count; });
size_t mapper1_count = 0;
- TimestampMapper mapper1("pi2", log_files);
+ TimestampMapper mapper1("pi2", log_files,
+ TimestampQueueStrategy::kQueueTogether);
mapper1.set_timestamp_callback(
[&](TimestampedMessage *) { ++mapper1_count; });
@@ -2142,7 +2180,9 @@
ASSERT_EQ(parts.size(), 1u);
ASSERT_EQ(parts[0].parts.size(), 2u);
- BootMerger merger("pi2", log_files);
+ BootMerger merger("pi2", log_files,
+ {StoredDataType::DATA, StoredDataType::TIMESTAMPS,
+ StoredDataType::REMOTE_TIMESTAMPS});
EXPECT_EQ(merger.node(), 1u);
@@ -2326,11 +2366,13 @@
ASSERT_EQ(parts[0].logger_node, "pi1");
size_t mapper0_count = 0;
- TimestampMapper mapper0("pi1", log_files);
+ TimestampMapper mapper0("pi1", log_files,
+ TimestampQueueStrategy::kQueueTogether);
mapper0.set_timestamp_callback(
[&](TimestampedMessage *) { ++mapper0_count; });
size_t mapper1_count = 0;
- TimestampMapper mapper1("pi2", log_files);
+ TimestampMapper mapper1("pi2", log_files,
+ TimestampQueueStrategy::kQueueTogether);
mapper1.set_timestamp_callback(
[&](TimestampedMessage *) { ++mapper1_count; });
@@ -2545,11 +2587,13 @@
ASSERT_EQ(parts[0].logger_node, "pi1");
size_t mapper0_count = 0;
- TimestampMapper mapper0("pi1", log_files);
+ TimestampMapper mapper0("pi1", log_files,
+ TimestampQueueStrategy::kQueueTogether);
mapper0.set_timestamp_callback(
[&](TimestampedMessage *) { ++mapper0_count; });
size_t mapper1_count = 0;
- TimestampMapper mapper1("pi2", log_files);
+ TimestampMapper mapper1("pi2", log_files,
+ TimestampQueueStrategy::kQueueTogether);
mapper1.set_timestamp_callback(
[&](TimestampedMessage *) { ++mapper1_count; });
diff --git a/aos/events/logging/logfile_validator.cc b/aos/events/logging/logfile_validator.cc
new file mode 100644
index 0000000..19d11c8
--- /dev/null
+++ b/aos/events/logging/logfile_validator.cc
@@ -0,0 +1,162 @@
+#include "aos/events/logging/logfile_validator.h"
+
+#include "aos/events/logging/logfile_sorting.h"
+#include "aos/events/logging/logfile_utils.h"
+#include "aos/events/logging/logfile_validator.h"
+#include "aos/network/multinode_timestamp_filter.h"
+
+namespace aos::logger {
+bool MultiNodeLogIsReadable(const LogFilesContainer &log_files,
+ bool skip_order_validation) {
+ const Configuration *config = log_files.config().get();
+
+ CHECK(configuration::MultiNode(config))
+ << ": Timestamps only make sense in a multi-node world.";
+
+ // Now, build up all the TimestampMapper classes to read and sort the data.
+ std::vector<std::unique_ptr<TimestampMapper>> mappers;
+
+ for (const Node *node : configuration::GetNodes(config)) {
+ auto node_name = MaybeNodeName(node);
+ // Confirm that all the parts are from the same boot if there are enough
+ // parts to not be from the same boot.
+ if (log_files.ContainsPartsForNode(node_name)) {
+ // Filter the parts relevant to each node when building the mapper.
+ mappers.emplace_back(std::make_unique<TimestampMapper>(
+ node_name, log_files, TimestampQueueStrategy::kQueueTogether));
+ } else {
+ mappers.emplace_back(nullptr);
+ }
+ }
+
+ // Now, build up the estimator used to solve for time.
+ message_bridge::MultiNodeNoncausalOffsetEstimator multinode_estimator(
+ config, config, log_files.boots(), skip_order_validation,
+ std::chrono::seconds(0));
+ multinode_estimator.set_reboot_found(
+ [config](distributed_clock::time_point reboot_time,
+ const std::vector<logger::BootTimestamp> &node_times) {
+ LOG(INFO) << "Rebooted at distributed " << reboot_time;
+ size_t node_index = 0;
+ for (const logger::BootTimestamp &time : node_times) {
+ LOG(INFO) << " "
+ << config->nodes()->Get(node_index)->name()->string_view()
+ << " " << time;
+ ++node_index;
+ }
+ });
+
+ // Because RAII doesn't let us do non-fatal/non-exception things, use this
+ // when returning to handle certain cleanup-related checks that would normally
+ // happen fatally in the estimator destructor.
+ auto preempt_destructor = [&multinode_estimator](bool success) {
+ if (!multinode_estimator.RunDestructorChecks()) {
+ return false;
+ }
+ return success;
+ };
+
+ {
+ std::vector<TimestampMapper *> timestamp_mappers;
+ for (std::unique_ptr<TimestampMapper> &mapper : mappers) {
+ timestamp_mappers.emplace_back(mapper.get());
+ }
+ multinode_estimator.SetTimestampMappers(std::move(timestamp_mappers));
+ }
+
+ // To make things more like the logger and faster, cache the node + channel ->
+ // filter mapping in a set of vectors.
+ std::vector<std::vector<message_bridge::NoncausalOffsetEstimator *>> filters;
+ filters.resize(configuration::NodesCount(config));
+
+ for (const Node *node : configuration::GetNodes(config)) {
+ const size_t node_index = configuration::GetNodeIndex(config, node);
+ filters[node_index].resize(config->channels()->size(), nullptr);
+ for (size_t channel_index = 0; channel_index < config->channels()->size();
+ ++channel_index) {
+ const Channel *channel = config->channels()->Get(channel_index);
+
+ if (!configuration::ChannelIsSendableOnNode(channel, node) &&
+ configuration::ChannelIsReadableOnNode(channel, node)) {
+ // We've got a message which is being forwarded to this node.
+ const Node *source_node = configuration::GetNode(
+ config, channel->source_node()->string_view());
+ filters[node_index][channel_index] =
+ multinode_estimator.GetFilter(node, source_node);
+ }
+ }
+ }
+
+ multinode_estimator.CheckGraph();
+
+ // Now, read all the timestamps for each node. This is deliberately simpler
+ // than the logger. It loads *all* the timestamps in one go per node,
+ // ignoring memory usage.
+ for (const Node *node : configuration::GetNodes(config)) {
+ LOG(INFO) << "Reading all data for " << node->name()->string_view();
+ const size_t node_index = configuration::GetNodeIndex(config, node);
+ TimestampMapper *timestamp_mapper = mappers[node_index].get();
+ if (timestamp_mapper == nullptr) {
+ continue;
+ }
+ while (true) {
+ TimestampedMessage *m = timestamp_mapper->Front();
+ if (m == nullptr) {
+ break;
+ }
+ timestamp_mapper->PopFront();
+ }
+ }
+
+ // Don't get clever. Use the first time as the start time. Note: this is
+ // different than how log_cat and others work.
+ std::optional<std::optional<const std::tuple<distributed_clock::time_point,
+ std::vector<BootTimestamp>> *>>
+ next_timestamp = multinode_estimator.QueueNextTimestamp();
+ if (!next_timestamp.has_value() || !next_timestamp.value().has_value()) {
+ return preempt_destructor(false);
+ }
+ LOG(INFO) << "Starting at:";
+ for (const Node *node : configuration::GetNodes(config)) {
+ const size_t node_index = configuration::GetNodeIndex(config, node);
+ LOG(INFO) << " " << node->name()->string_view() << " -> "
+ << std::get<1>(*next_timestamp.value().value())[node_index].time;
+ }
+
+ std::vector<monotonic_clock::time_point> just_monotonic(
+ std::get<1>(*next_timestamp.value().value()).size());
+ for (size_t i = 0; i < just_monotonic.size(); ++i) {
+ CHECK_EQ(std::get<1>(*next_timestamp.value().value())[i].boot, 0u);
+ just_monotonic[i] = std::get<1>(*next_timestamp.value().value())[i].time;
+ }
+ multinode_estimator.Start(just_monotonic);
+
+ // As we pull off all the timestamps, the time problem is continually solved,
+ // filling in the CSV files.
+ while (true) {
+ std::optional<std::optional<const std::tuple<distributed_clock::time_point,
+ std::vector<BootTimestamp>> *>>
+ next_timestamp = multinode_estimator.QueueNextTimestamp();
+ if (!next_timestamp.has_value()) {
+ return preempt_destructor(false);
+ }
+ if (!next_timestamp.value().has_value()) {
+ break;
+ }
+ multinode_estimator.ObserveTimePassed(
+ std::get<0>(*next_timestamp.value().value()));
+ }
+
+ LOG(INFO) << "Done";
+
+ return preempt_destructor(true);
+}
+
+bool LogIsReadableIfMultiNode(const LogFilesContainer &log_files) {
+ if (aos::configuration::NodesCount(log_files.config().get()) == 1u) {
+ return true;
+ }
+ return MultiNodeLogIsReadable(log_files);
+}
+} // namespace aos::logger
diff --git a/aos/events/logging/logfile_validator.h b/aos/events/logging/logfile_validator.h
new file mode 100644
index 0000000..814b89e
--- /dev/null
+++ b/aos/events/logging/logfile_validator.h
@@ -0,0 +1,16 @@
+#ifndef AOS_EVENTS_LOGGING_LOGFILE_VALIDATOR_H_
+#define AOS_EVENTS_LOGGING_LOGFILE_VALIDATOR_H_
+#include "aos/events/logging/logfile_sorting.h"
+namespace aos::logger {
+// Attempts to validate that a log is readable without actually running the full
+// LogReader. This aims to allow the user to preempt fatal crashes that can
+// occur when trying to replay a log.
+// Returns true if successful.
+bool MultiNodeLogIsReadable(const LogFilesContainer &log_files,
+ bool skip_order_validation = false);
+
+// Returns true if the requested log is a single-node log, or if
+// MultiNodeLogIsReadable() returns true.
+bool LogIsReadableIfMultiNode(const LogFilesContainer &log_files);
+} // namespace aos::logger
+#endif // AOS_EVENTS_LOGGING_LOGFILE_VALIDATOR_H_
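A hypothetical pre-flight caller of this new API, assuming SortParts() and LogFilesContainer as they appear elsewhere in this change; flag parsing and error detail are elided.

    #include <iostream>
    #include <string>
    #include <vector>

    #include "aos/events/logging/logfile_sorting.h"
    #include "aos/events/logging/logfile_validator.h"

    // Hypothetical: check that a log can be replayed before handing it to
    // LogReader, so a bad log fails fast instead of crashing mid-replay.
    int main(int argc, char **argv) {
      const std::vector<std::string> files(argv + 1, argv + argc);
      const std::vector<aos::logger::LogFile> parts =
          aos::logger::SortParts(files);
      aos::logger::LogFilesContainer log_files(parts);

      if (!aos::logger::LogIsReadableIfMultiNode(log_files)) {
        std::cerr << "Log is not cleanly replayable.\n";
        return 1;
      }
      std::cout << "Log looks replayable.\n";
      return 0;
    }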
diff --git a/aos/events/logging/logger.fbs b/aos/events/logging/logger.fbs
index ec1a450..5733fa0 100644
--- a/aos/events/logging/logger.fbs
+++ b/aos/events/logging/logger.fbs
@@ -2,6 +2,15 @@
namespace aos.logger;
+enum StoredDataType : uint8 {
+ // We have message data in this file.
+ DATA = 0,
+ // We have timestamps of delivered data in this file.
+ TIMESTAMPS = 1,
+ // We have remote timestamps in this file.
+ REMOTE_TIMESTAMPS = 2,
+}
+
// A log file is a sequence of size prefixed flatbuffers.
// The first flatbuffer will be the LogFileHeader, followed by an arbitrary
// number of MessageHeaders.
@@ -32,6 +41,10 @@
// at most this duration (in nanoseconds). If the log reader buffers until
// it finds messages this much newer than it's simulation time, it will never
// find a message out of order.
+ //
+ // By definition, this is the time for the log reader to buffer messages after
+ // startup, i.e. messages sent before startup are not taken into account when
+ // checking for max out of order duration.
max_out_of_order_duration:long (id: 2);
// The configuration of the channels. It is valid to have a log file with
@@ -205,6 +218,10 @@
// Logger textual version. This is normally the release name stamped into
// the binary.
logger_version:string (id:33);
+
+ // The types of data stored in this log file. This can be used to find logs
+ // with only timestamps in them to pre-solve the time problem.
+ data_stored:[StoredDataType] (id:34);
}
// Table holding a message.
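The new data_stored field makes it possible to classify a part from its header alone. Below is a sketch of how a reader might use it, assuming the generated C++ accessor behaves the way the updated multinode_logger_test expectations later in this change suggest (data_stored() returning a flatbuffer vector of StoredDataType, null on logs written before this field existed); treat it as illustrative, not the exact API.

    #include "aos/events/logging/logger_generated.h"

    // Illustrative only: true if this part holds timestamps and nothing else,
    // i.e. it can be used to pre-solve the time problem without reading the
    // bulky data parts.
    bool OnlyTimestamps(const aos::logger::LogFileHeader &header) {
      if (header.data_stored() == nullptr) {
        return false;  // older log without the field; assume it has data
      }
      for (const auto type : *header.data_stored()) {
        if (type == aos::logger::StoredDataType::DATA) {
          return false;
        }
      }
      return true;
    }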
diff --git a/aos/events/logging/logger_main.cc b/aos/events/logging/logger_main.cc
index afc166f..f99fe4c 100644
--- a/aos/events/logging/logger_main.cc
+++ b/aos/events/logging/logger_main.cc
@@ -79,12 +79,11 @@
aos::logger::Logger logger(&event_loop);
if (FLAGS_rotate_every != 0.0) {
- logger.set_on_logged_period([&] {
- const auto now = event_loop.monotonic_now();
- if (now > last_rotation_time +
- std::chrono::duration<double>(FLAGS_rotate_every)) {
+ logger.set_on_logged_period([&](aos::monotonic_clock::time_point t) {
+ if (t > last_rotation_time +
+ std::chrono::duration<double>(FLAGS_rotate_every)) {
logger.Rotate();
- last_rotation_time = now;
+ last_rotation_time = t;
}
});
}
diff --git a/aos/events/logging/multinode_logger_test.cc b/aos/events/logging/multinode_logger_test.cc
index 677fc72..b8d07a2 100644
--- a/aos/events/logging/multinode_logger_test.cc
+++ b/aos/events/logging/multinode_logger_test.cc
@@ -26,9 +26,29 @@
::testing::Combine(
::testing::Values(
ConfigParams{"multinode_pingpong_combined_config.json", true,
- kCombinedConfigSha1(), kCombinedConfigSha1()},
+ kCombinedConfigSha1(), kCombinedConfigSha1(),
+ FileStrategy::kKeepSeparate,
+ ForceTimestampBuffering::kForceBufferTimestamps},
+ ConfigParams{"multinode_pingpong_combined_config.json", true,
+ kCombinedConfigSha1(), kCombinedConfigSha1(),
+ FileStrategy::kKeepSeparate,
+ ForceTimestampBuffering::kAutoBuffer},
ConfigParams{"multinode_pingpong_split_config.json", false,
- kSplitConfigSha1(), kReloggedSplitConfigSha1()}),
+ kSplitConfigSha1(), kReloggedSplitConfigSha1(),
+ FileStrategy::kKeepSeparate,
+ ForceTimestampBuffering::kForceBufferTimestamps},
+ ConfigParams{"multinode_pingpong_split_config.json", false,
+ kSplitConfigSha1(), kReloggedSplitConfigSha1(),
+ FileStrategy::kKeepSeparate,
+ ForceTimestampBuffering::kAutoBuffer},
+ ConfigParams{"multinode_pingpong_split_config.json", false,
+ kSplitConfigSha1(), kReloggedSplitConfigSha1(),
+ FileStrategy::kCombine,
+ ForceTimestampBuffering::kForceBufferTimestamps},
+ ConfigParams{"multinode_pingpong_split_config.json", false,
+ kSplitConfigSha1(), kReloggedSplitConfigSha1(),
+ FileStrategy::kCombine,
+ ForceTimestampBuffering::kAutoBuffer}),
::testing::ValuesIn(SupportedCompressionAlgorithms())));
INSTANTIATE_TEST_SUITE_P(
@@ -36,13 +56,37 @@
::testing::Combine(
::testing::Values(
ConfigParams{"multinode_pingpong_combined_config.json", true,
- kCombinedConfigSha1(), kCombinedConfigSha1()},
+ kCombinedConfigSha1(), kCombinedConfigSha1(),
+ FileStrategy::kKeepSeparate,
+ ForceTimestampBuffering::kForceBufferTimestamps},
+ ConfigParams{"multinode_pingpong_combined_config.json", true,
+ kCombinedConfigSha1(), kCombinedConfigSha1(),
+ FileStrategy::kKeepSeparate,
+ ForceTimestampBuffering::kAutoBuffer},
ConfigParams{"multinode_pingpong_split_config.json", false,
- kSplitConfigSha1(), kReloggedSplitConfigSha1()}),
+ kSplitConfigSha1(), kReloggedSplitConfigSha1(),
+ FileStrategy::kKeepSeparate,
+ ForceTimestampBuffering::kForceBufferTimestamps},
+ ConfigParams{"multinode_pingpong_split_config.json", false,
+ kSplitConfigSha1(), kReloggedSplitConfigSha1(),
+ FileStrategy::kKeepSeparate,
+ ForceTimestampBuffering::kAutoBuffer},
+ ConfigParams{"multinode_pingpong_split_config.json", false,
+ kSplitConfigSha1(), kReloggedSplitConfigSha1(),
+ FileStrategy::kCombine,
+ ForceTimestampBuffering::kForceBufferTimestamps},
+ ConfigParams{"multinode_pingpong_split_config.json", false,
+ kSplitConfigSha1(), kReloggedSplitConfigSha1(),
+ FileStrategy::kCombine,
+ ForceTimestampBuffering::kAutoBuffer}),
::testing::ValuesIn(SupportedCompressionAlgorithms())));
// Tests that we can write and read simple multi-node log files.
TEST_P(MultinodeLoggerTest, SimpleMultiNode) {
+ if (file_strategy() == FileStrategy::kCombine) {
+ GTEST_SKIP() << "We don't need to test the combined file writer this deep.";
+ }
+
std::vector<std::string> actual_filenames;
time_converter_.StartEqual();
@@ -79,11 +123,7 @@
}
EXPECT_EQ(logfile_uuids.size(), 2u);
- if (shared()) {
- EXPECT_EQ(parts_uuids.size(), 7u);
- } else {
- EXPECT_EQ(parts_uuids.size(), 8u);
- }
+ EXPECT_EQ(parts_uuids.size(), 8u);
// And confirm everything is on the correct node.
EXPECT_EQ(log_header[2].message().node()->name()->string_view(), "pi1");
@@ -92,122 +132,128 @@
EXPECT_EQ(log_header[5].message().node()->name()->string_view(), "pi2");
EXPECT_EQ(log_header[6].message().node()->name()->string_view(), "pi2");
-
EXPECT_EQ(log_header[7].message().node()->name()->string_view(), "pi2");
- EXPECT_EQ(log_header[8].message().node()->name()->string_view(), "pi2");
- EXPECT_EQ(log_header[9].message().node()->name()->string_view(), "pi2");
- EXPECT_EQ(log_header[10].message().node()->name()->string_view(), "pi1");
- EXPECT_EQ(log_header[11].message().node()->name()->string_view(), "pi1");
+ EXPECT_EQ(log_header[8].message().node()->name()->string_view(), "pi1");
+ EXPECT_EQ(log_header[9].message().node()->name()->string_view(), "pi1");
+
+ EXPECT_EQ(log_header[10].message().node()->name()->string_view(), "pi2");
+ EXPECT_EQ(log_header[11].message().node()->name()->string_view(), "pi2");
EXPECT_EQ(log_header[12].message().node()->name()->string_view(), "pi2");
EXPECT_EQ(log_header[13].message().node()->name()->string_view(), "pi2");
+ EXPECT_EQ(log_header[14].message().node()->name()->string_view(), "pi2");
- if (shared()) {
- EXPECT_EQ(log_header[14].message().node()->name()->string_view(), "pi2");
- EXPECT_EQ(log_header[15].message().node()->name()->string_view(), "pi2");
- EXPECT_EQ(log_header[16].message().node()->name()->string_view(), "pi2");
-
- EXPECT_EQ(log_header[17].message().node()->name()->string_view(), "pi1");
- EXPECT_EQ(log_header[18].message().node()->name()->string_view(), "pi1");
- } else {
- EXPECT_EQ(log_header[14].message().node()->name()->string_view(), "pi2");
- EXPECT_EQ(log_header[15].message().node()->name()->string_view(), "pi2");
-
- EXPECT_EQ(log_header[16].message().node()->name()->string_view(), "pi1");
- EXPECT_EQ(log_header[17].message().node()->name()->string_view(), "pi1");
-
- EXPECT_EQ(log_header[18].message().node()->name()->string_view(), "pi2");
- EXPECT_EQ(log_header[19].message().node()->name()->string_view(), "pi2");
- }
+ EXPECT_EQ(log_header[15].message().node()->name()->string_view(), "pi1");
+ EXPECT_EQ(log_header[16].message().node()->name()->string_view(), "pi1");
// And the parts index matches.
EXPECT_EQ(log_header[2].message().parts_index(), 0);
- EXPECT_EQ(log_header[3].message().parts_index(), 1);
- EXPECT_EQ(log_header[4].message().parts_index(), 2);
+
+ EXPECT_EQ(log_header[3].message().parts_index(), 0);
+ EXPECT_EQ(log_header[4].message().parts_index(), 1);
EXPECT_EQ(log_header[5].message().parts_index(), 0);
- EXPECT_EQ(log_header[6].message().parts_index(), 1);
- EXPECT_EQ(log_header[7].message().parts_index(), 0);
- EXPECT_EQ(log_header[8].message().parts_index(), 1);
- EXPECT_EQ(log_header[9].message().parts_index(), 2);
+ EXPECT_EQ(log_header[6].message().parts_index(), 0);
+ EXPECT_EQ(log_header[7].message().parts_index(), 1);
+
+ EXPECT_EQ(log_header[8].message().parts_index(), 0);
+ EXPECT_EQ(log_header[9].message().parts_index(), 1);
EXPECT_EQ(log_header[10].message().parts_index(), 0);
EXPECT_EQ(log_header[11].message().parts_index(), 1);
EXPECT_EQ(log_header[12].message().parts_index(), 0);
EXPECT_EQ(log_header[13].message().parts_index(), 1);
+ EXPECT_EQ(log_header[14].message().parts_index(), 2);
- if (shared()) {
- EXPECT_EQ(log_header[14].message().parts_index(), 0);
- EXPECT_EQ(log_header[15].message().parts_index(), 1);
- EXPECT_EQ(log_header[16].message().parts_index(), 2);
+ EXPECT_EQ(log_header[15].message().parts_index(), 0);
+ EXPECT_EQ(log_header[16].message().parts_index(), 1);
- EXPECT_EQ(log_header[17].message().parts_index(), 0);
- EXPECT_EQ(log_header[18].message().parts_index(), 1);
- } else {
- EXPECT_EQ(log_header[14].message().parts_index(), 0);
- EXPECT_EQ(log_header[15].message().parts_index(), 1);
+ // And that the data_stored field is right.
+ EXPECT_THAT(*log_header[2].message().data_stored(),
+ ::testing::ElementsAre(StoredDataType::DATA));
+ EXPECT_THAT(*log_header[3].message().data_stored(),
+ ::testing::ElementsAre(StoredDataType::TIMESTAMPS));
+ EXPECT_THAT(*log_header[4].message().data_stored(),
+ ::testing::ElementsAre(StoredDataType::TIMESTAMPS));
- EXPECT_EQ(log_header[16].message().parts_index(), 0);
- EXPECT_EQ(log_header[17].message().parts_index(), 1);
+ EXPECT_THAT(*log_header[5].message().data_stored(),
+ ::testing::ElementsAre(StoredDataType::DATA));
+ EXPECT_THAT(*log_header[6].message().data_stored(),
+ ::testing::ElementsAre(StoredDataType::TIMESTAMPS));
+ EXPECT_THAT(*log_header[7].message().data_stored(),
+ ::testing::ElementsAre(StoredDataType::TIMESTAMPS));
- EXPECT_EQ(log_header[18].message().parts_index(), 0);
- EXPECT_EQ(log_header[19].message().parts_index(), 1);
- }
+ EXPECT_THAT(*log_header[8].message().data_stored(),
+ ::testing::ElementsAre(StoredDataType::DATA));
+ EXPECT_THAT(*log_header[9].message().data_stored(),
+ ::testing::ElementsAre(StoredDataType::DATA));
+
+ EXPECT_THAT(*log_header[10].message().data_stored(),
+ ::testing::ElementsAre(StoredDataType::DATA));
+ EXPECT_THAT(*log_header[11].message().data_stored(),
+ ::testing::ElementsAre(StoredDataType::DATA));
+
+ EXPECT_THAT(*log_header[12].message().data_stored(),
+ ::testing::ElementsAre(StoredDataType::REMOTE_TIMESTAMPS));
+ EXPECT_THAT(*log_header[13].message().data_stored(),
+ ::testing::ElementsAre(StoredDataType::REMOTE_TIMESTAMPS));
+ EXPECT_THAT(*log_header[14].message().data_stored(),
+ ::testing::ElementsAre(StoredDataType::REMOTE_TIMESTAMPS));
+
+ EXPECT_THAT(*log_header[15].message().data_stored(),
+ ::testing::ElementsAre(StoredDataType::REMOTE_TIMESTAMPS));
+ EXPECT_THAT(*log_header[16].message().data_stored(),
+ ::testing::ElementsAre(StoredDataType::REMOTE_TIMESTAMPS));
}
- const std::vector<LogFile> sorted_log_files = SortParts(logfiles_);
+ const std::vector<LogFile> sorted_parts = SortParts(logfiles_);
+ EXPECT_TRUE(AllPartsMatchOutOfOrderDuration(sorted_parts));
{
using ::testing::UnorderedElementsAre;
- std::shared_ptr<const aos::Configuration> config =
- sorted_log_files[0].config;
+ std::shared_ptr<const aos::Configuration> config = sorted_parts[0].config;
// Timing reports, pings
- EXPECT_THAT(CountChannelsData(config, logfiles_[2]),
- UnorderedElementsAre(
- std::make_tuple("/pi1/aos",
- "aos.message_bridge.ServerStatistics", 1),
- std::make_tuple("/test", "aos.examples.Ping", 1),
- std::make_tuple("/pi1/aos", "aos.examples.Ping", 1)))
- << " : " << logfiles_[2];
- {
- std::vector<std::tuple<std::string, std::string, int>> channel_counts = {
- std::make_tuple("/pi1/aos", "aos.examples.Ping", 10),
- std::make_tuple("/pi1/aos", "aos.message_bridge.Timestamp", 1),
- std::make_tuple("/pi1/aos", "aos.message_bridge.ClientStatistics",
- 1)};
- if (!std::get<0>(GetParam()).shared) {
- channel_counts.push_back(
- std::make_tuple("/pi1/aos/remote_timestamps/pi2/pi1/aos/"
- "aos-message_bridge-Timestamp",
- "aos.message_bridge.RemoteMessage", 1));
- }
- EXPECT_THAT(CountChannelsData(config, logfiles_[3]),
- ::testing::UnorderedElementsAreArray(channel_counts))
- << " : " << logfiles_[3];
+ if (shared()) {
+ EXPECT_THAT(
+ CountChannelsData(config, logfiles_[2]),
+ UnorderedElementsAre(
+ std::make_tuple("/pi1/aos", "aos.examples.Ping", 2001),
+ std::make_tuple("/pi1/aos", "aos.message_bridge.ClientStatistics",
+ 200),
+ std::make_tuple("/pi1/aos", "aos.message_bridge.ServerStatistics",
+ 21),
+ std::make_tuple("/pi1/aos", "aos.message_bridge.Timestamp", 200),
+ std::make_tuple("/pi1/aos", "aos.timing.Report", 60),
+ std::make_tuple("/test", "aos.examples.Ping", 2001)))
+ << " : " << logfiles_[2];
+ } else {
+ EXPECT_THAT(
+ CountChannelsData(config, logfiles_[2]),
+ UnorderedElementsAre(
+ std::make_tuple("/pi1/aos", "aos.examples.Ping", 2001),
+ std::make_tuple("/pi1/aos", "aos.message_bridge.ClientStatistics",
+ 200),
+ std::make_tuple("/pi1/aos", "aos.message_bridge.ServerStatistics",
+ 21),
+ std::make_tuple("/pi1/aos", "aos.message_bridge.Timestamp", 200),
+ std::make_tuple("/pi1/aos", "aos.timing.Report", 60),
+ std::make_tuple("/test", "aos.examples.Ping", 2001),
+ std::make_tuple("/pi1/aos/remote_timestamps/pi2/pi1/aos/"
+ "aos-message_bridge-Timestamp",
+ "aos.message_bridge.RemoteMessage", 200)))
+ << " : " << logfiles_[2];
}
- {
- std::vector<std::tuple<std::string, std::string, int>> channel_counts = {
- std::make_tuple("/pi1/aos", "aos.examples.Ping", 1990),
- std::make_tuple("/pi1/aos", "aos.message_bridge.Timestamp", 199),
- std::make_tuple("/pi1/aos", "aos.message_bridge.ServerStatistics",
- 20),
- std::make_tuple("/pi1/aos", "aos.message_bridge.ClientStatistics",
- 199),
- std::make_tuple("/pi1/aos", "aos.timing.Report", 60),
- std::make_tuple("/test", "aos.examples.Ping", 2000)};
- if (!std::get<0>(GetParam()).shared) {
- channel_counts.push_back(
- std::make_tuple("/pi1/aos/remote_timestamps/pi2/pi1/aos/"
- "aos-message_bridge-Timestamp",
- "aos.message_bridge.RemoteMessage", 199));
- }
- EXPECT_THAT(CountChannelsData(config, logfiles_[4]),
- ::testing::UnorderedElementsAreArray(channel_counts))
- << " : " << logfiles_[4];
- }
+
+ EXPECT_THAT(CountChannelsData(config, logfiles_[3]),
+ ::testing::UnorderedElementsAre())
+ << " : " << logfiles_[3];
+ EXPECT_THAT(CountChannelsData(config, logfiles_[4]),
+ ::testing::UnorderedElementsAre())
+ << " : " << logfiles_[4];
+
// Timestamps for pong
EXPECT_THAT(CountChannelsTimestamp(config, logfiles_[2]),
UnorderedElementsAre())
@@ -223,185 +269,121 @@
std::make_tuple("/pi2/aos", "aos.message_bridge.Timestamp", 200)))
<< " : " << logfiles_[4];
- // Pong data.
+ // Timing reports and pongs.
EXPECT_THAT(
CountChannelsData(config, logfiles_[5]),
- UnorderedElementsAre(std::make_tuple("/test", "aos.examples.Pong", 91)))
+ UnorderedElementsAre(
+ std::make_tuple("/pi2/aos", "aos.examples.Ping", 2001),
+ std::make_tuple("/pi2/aos", "aos.message_bridge.ClientStatistics",
+ 200),
+ std::make_tuple("/pi2/aos", "aos.message_bridge.ServerStatistics",
+ 21),
+ std::make_tuple("/pi2/aos", "aos.message_bridge.Timestamp", 200),
+ std::make_tuple("/pi2/aos", "aos.timing.Report", 60),
+ std::make_tuple("/test", "aos.examples.Pong", 2001)))
<< " : " << logfiles_[5];
- EXPECT_THAT(CountChannelsData(config, logfiles_[6]),
- UnorderedElementsAre(
- std::make_tuple("/test", "aos.examples.Pong", 1910)))
+ EXPECT_THAT(CountChannelsData(config, logfiles_[6]), UnorderedElementsAre())
<< " : " << logfiles_[6];
-
- // No timestamps
+ EXPECT_THAT(CountChannelsData(config, logfiles_[7]), UnorderedElementsAre())
+ << " : " << logfiles_[7];
+ // And ping timestamps.
EXPECT_THAT(CountChannelsTimestamp(config, logfiles_[5]),
UnorderedElementsAre())
<< " : " << logfiles_[5];
- EXPECT_THAT(CountChannelsTimestamp(config, logfiles_[6]),
- UnorderedElementsAre())
- << " : " << logfiles_[6];
-
- // Timing reports and pongs.
- EXPECT_THAT(CountChannelsData(config, logfiles_[7]),
- UnorderedElementsAre(
- std::make_tuple("/pi2/aos", "aos.examples.Ping", 1),
- std::make_tuple("/pi2/aos",
- "aos.message_bridge.ServerStatistics", 1)))
- << " : " << logfiles_[7];
EXPECT_THAT(
- CountChannelsData(config, logfiles_[8]),
- UnorderedElementsAre(std::make_tuple("/test", "aos.examples.Pong", 1)))
- << " : " << logfiles_[8];
- EXPECT_THAT(
- CountChannelsData(config, logfiles_[9]),
- UnorderedElementsAre(
- std::make_tuple("/pi2/aos", "aos.examples.Ping", 2000),
- std::make_tuple("/pi2/aos", "aos.message_bridge.Timestamp", 200),
- std::make_tuple("/pi2/aos", "aos.message_bridge.ServerStatistics",
- 20),
- std::make_tuple("/pi2/aos", "aos.message_bridge.ClientStatistics",
- 200),
- std::make_tuple("/pi2/aos", "aos.timing.Report", 60),
- std::make_tuple("/test", "aos.examples.Pong", 2000)))
- << " : " << logfiles_[9];
- // And ping timestamps.
- EXPECT_THAT(CountChannelsTimestamp(config, logfiles_[7]),
- UnorderedElementsAre())
- << " : " << logfiles_[7];
- EXPECT_THAT(
- CountChannelsTimestamp(config, logfiles_[8]),
+ CountChannelsTimestamp(config, logfiles_[6]),
UnorderedElementsAre(std::make_tuple("/test", "aos.examples.Ping", 1)))
- << " : " << logfiles_[8];
+ << " : " << logfiles_[6];
EXPECT_THAT(
- CountChannelsTimestamp(config, logfiles_[9]),
+ CountChannelsTimestamp(config, logfiles_[7]),
UnorderedElementsAre(
std::make_tuple("/test", "aos.examples.Ping", 2000),
std::make_tuple("/pi1/aos", "aos.message_bridge.Timestamp", 200)))
- << " : " << logfiles_[9];
+ << " : " << logfiles_[7];
// And then test that the remotely logged timestamp data files only have
// timestamps in them.
+ EXPECT_THAT(CountChannelsTimestamp(config, logfiles_[8]),
+ UnorderedElementsAre())
+ << " : " << logfiles_[8];
+ EXPECT_THAT(CountChannelsTimestamp(config, logfiles_[9]),
+ UnorderedElementsAre())
+ << " : " << logfiles_[9];
EXPECT_THAT(CountChannelsTimestamp(config, logfiles_[10]),
UnorderedElementsAre())
<< " : " << logfiles_[10];
EXPECT_THAT(CountChannelsTimestamp(config, logfiles_[11]),
UnorderedElementsAre())
<< " : " << logfiles_[11];
- EXPECT_THAT(CountChannelsTimestamp(config, logfiles_[12]),
- UnorderedElementsAre())
- << " : " << logfiles_[12];
- EXPECT_THAT(CountChannelsTimestamp(config, logfiles_[13]),
- UnorderedElementsAre())
- << " : " << logfiles_[13];
- EXPECT_THAT(CountChannelsData(config, logfiles_[10]),
+ EXPECT_THAT(CountChannelsData(config, logfiles_[8]),
UnorderedElementsAre(std::make_tuple(
"/pi1/aos", "aos.message_bridge.Timestamp", 9)))
- << " : " << logfiles_[10];
- EXPECT_THAT(CountChannelsData(config, logfiles_[11]),
+ << " : " << logfiles_[8];
+ EXPECT_THAT(CountChannelsData(config, logfiles_[9]),
UnorderedElementsAre(std::make_tuple(
"/pi1/aos", "aos.message_bridge.Timestamp", 191)))
+ << " : " << logfiles_[9];
+
+ // Pong and timestamp data.
+ EXPECT_THAT(
+ CountChannelsData(config, logfiles_[10]),
+ UnorderedElementsAre(
+ std::make_tuple("/pi2/aos", "aos.message_bridge.Timestamp", 9),
+ std::make_tuple("/test", "aos.examples.Pong", 91)))
+ << " : " << logfiles_[10];
+ EXPECT_THAT(
+ CountChannelsData(config, logfiles_[11]),
+ UnorderedElementsAre(
+ std::make_tuple("/pi2/aos", "aos.message_bridge.Timestamp", 191),
+ std::make_tuple("/test", "aos.examples.Pong", 1910)))
<< " : " << logfiles_[11];
+ // Timestamps from pi2 on pi1, and the other way.
EXPECT_THAT(CountChannelsData(config, logfiles_[12]),
- UnorderedElementsAre(std::make_tuple(
- "/pi2/aos", "aos.message_bridge.Timestamp", 9)))
+ UnorderedElementsAre())
<< " : " << logfiles_[12];
EXPECT_THAT(CountChannelsData(config, logfiles_[13]),
+ UnorderedElementsAre())
+ << " : " << logfiles_[13];
+ EXPECT_THAT(CountChannelsData(config, logfiles_[14]),
+ UnorderedElementsAre())
+ << " : " << logfiles_[14];
+ EXPECT_THAT(CountChannelsData(config, logfiles_[15]),
+ UnorderedElementsAre())
+ << " : " << logfiles_[15];
+ EXPECT_THAT(CountChannelsData(config, logfiles_[16]),
+ UnorderedElementsAre())
+ << " : " << logfiles_[16];
+
+ EXPECT_THAT(
+ CountChannelsTimestamp(config, logfiles_[12]),
+ UnorderedElementsAre(std::make_tuple("/test", "aos.examples.Ping", 1)))
+ << " : " << logfiles_[12];
+ EXPECT_THAT(
+ CountChannelsTimestamp(config, logfiles_[13]),
+ UnorderedElementsAre(
+ std::make_tuple("/pi1/aos", "aos.message_bridge.Timestamp", 9),
+ std::make_tuple("/test", "aos.examples.Ping", 90)))
+ << " : " << logfiles_[13];
+ EXPECT_THAT(
+ CountChannelsTimestamp(config, logfiles_[14]),
+ UnorderedElementsAre(
+ std::make_tuple("/pi1/aos", "aos.message_bridge.Timestamp", 191),
+ std::make_tuple("/test", "aos.examples.Ping", 1910)))
+ << " : " << logfiles_[14];
+ EXPECT_THAT(CountChannelsTimestamp(config, logfiles_[15]),
+ UnorderedElementsAre(std::make_tuple(
+ "/pi2/aos", "aos.message_bridge.Timestamp", 9)))
+ << " : " << logfiles_[15];
+ EXPECT_THAT(CountChannelsTimestamp(config, logfiles_[16]),
UnorderedElementsAre(std::make_tuple(
"/pi2/aos", "aos.message_bridge.Timestamp", 191)))
- << " : " << logfiles_[13];
-
- // Timestamps from pi2 on pi1, and the other way.
- if (shared()) {
- EXPECT_THAT(CountChannelsData(config, logfiles_[14]),
- UnorderedElementsAre())
- << " : " << logfiles_[14];
- EXPECT_THAT(CountChannelsData(config, logfiles_[15]),
- UnorderedElementsAre())
- << " : " << logfiles_[15];
- EXPECT_THAT(CountChannelsData(config, logfiles_[16]),
- UnorderedElementsAre())
- << " : " << logfiles_[16];
- EXPECT_THAT(CountChannelsData(config, logfiles_[17]),
- UnorderedElementsAre())
- << " : " << logfiles_[17];
- EXPECT_THAT(CountChannelsData(config, logfiles_[18]),
- UnorderedElementsAre())
- << " : " << logfiles_[18];
-
- EXPECT_THAT(CountChannelsTimestamp(config, logfiles_[14]),
- UnorderedElementsAre(
- std::make_tuple("/test", "aos.examples.Ping", 1)))
- << " : " << logfiles_[14];
- EXPECT_THAT(
- CountChannelsTimestamp(config, logfiles_[15]),
- UnorderedElementsAre(
- std::make_tuple("/pi1/aos", "aos.message_bridge.Timestamp", 9),
- std::make_tuple("/test", "aos.examples.Ping", 90)))
- << " : " << logfiles_[15];
- EXPECT_THAT(
- CountChannelsTimestamp(config, logfiles_[16]),
- UnorderedElementsAre(
- std::make_tuple("/pi1/aos", "aos.message_bridge.Timestamp", 191),
- std::make_tuple("/test", "aos.examples.Ping", 1910)))
- << " : " << logfiles_[16];
- EXPECT_THAT(CountChannelsTimestamp(config, logfiles_[17]),
- UnorderedElementsAre(std::make_tuple(
- "/pi2/aos", "aos.message_bridge.Timestamp", 9)))
- << " : " << logfiles_[17];
- EXPECT_THAT(CountChannelsTimestamp(config, logfiles_[18]),
- UnorderedElementsAre(std::make_tuple(
- "/pi2/aos", "aos.message_bridge.Timestamp", 191)))
- << " : " << logfiles_[18];
- } else {
- EXPECT_THAT(CountChannelsData(config, logfiles_[14]),
- UnorderedElementsAre())
- << " : " << logfiles_[14];
- EXPECT_THAT(CountChannelsData(config, logfiles_[15]),
- UnorderedElementsAre())
- << " : " << logfiles_[15];
- EXPECT_THAT(CountChannelsData(config, logfiles_[16]),
- UnorderedElementsAre())
- << " : " << logfiles_[16];
- EXPECT_THAT(CountChannelsData(config, logfiles_[17]),
- UnorderedElementsAre())
- << " : " << logfiles_[17];
- EXPECT_THAT(CountChannelsData(config, logfiles_[18]),
- UnorderedElementsAre())
- << " : " << logfiles_[18];
- EXPECT_THAT(CountChannelsData(config, logfiles_[19]),
- UnorderedElementsAre())
- << " : " << logfiles_[19];
-
- EXPECT_THAT(CountChannelsTimestamp(config, logfiles_[14]),
- UnorderedElementsAre(std::make_tuple(
- "/pi1/aos", "aos.message_bridge.Timestamp", 9)))
- << " : " << logfiles_[14];
- EXPECT_THAT(CountChannelsTimestamp(config, logfiles_[15]),
- UnorderedElementsAre(std::make_tuple(
- "/pi1/aos", "aos.message_bridge.Timestamp", 191)))
- << " : " << logfiles_[15];
- EXPECT_THAT(CountChannelsTimestamp(config, logfiles_[16]),
- UnorderedElementsAre(std::make_tuple(
- "/pi2/aos", "aos.message_bridge.Timestamp", 9)))
- << " : " << logfiles_[16];
- EXPECT_THAT(CountChannelsTimestamp(config, logfiles_[17]),
- UnorderedElementsAre(std::make_tuple(
- "/pi2/aos", "aos.message_bridge.Timestamp", 191)))
- << " : " << logfiles_[17];
- EXPECT_THAT(CountChannelsTimestamp(config, logfiles_[18]),
- UnorderedElementsAre(
- std::make_tuple("/test", "aos.examples.Ping", 91)))
- << " : " << logfiles_[18];
- EXPECT_THAT(CountChannelsTimestamp(config, logfiles_[19]),
- UnorderedElementsAre(
- std::make_tuple("/test", "aos.examples.Ping", 1910)))
- << " : " << logfiles_[19];
- }
+ << " : " << logfiles_[16];
}
- LogReader reader(sorted_log_files);
+ LogReader reader(sorted_parts);
SimulatedEventLoopFactory log_reader_factory(reader.configuration());
log_reader_factory.set_send_delay(chrono::microseconds(0));
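For orientation, the matchers above compare against the full multiset of (channel, type, count) tuples found in one log part. A hypothetical shape for the helpers they rely on, inferred from usage only (the real ones live in the logger test utilities):

    // Assumed signatures, sketched from how the matchers consume them:
    std::vector<std::tuple<std::string, std::string, int>> CountChannelsData(
        const Configuration *config, std::string_view filename);
    std::vector<std::tuple<std::string, std::string, int>>
    CountChannelsTimestamp(const Configuration *config,
                           std::string_view filename);

An empty UnorderedElementsAre() therefore asserts that the part contains no data (or no timestamps) at all.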
@@ -571,7 +553,8 @@
pi2_logger.AppendAllFilenames(&actual_filenames);
}
- const std::vector<LogFile> sorted_parts = SortParts(logfiles_);
+ const std::vector<LogFile> sorted_parts = SortParts(actual_filenames);
+ EXPECT_TRUE(AllPartsMatchOutOfOrderDuration(sorted_parts));
LogReader reader(sorted_parts, &config_.message());
// Remap just on pi1.
@@ -644,7 +627,8 @@
pi2_logger.AppendAllFilenames(&actual_filenames);
}
- const std::vector<LogFile> sorted_parts = SortParts(logfiles_);
+ const std::vector<LogFile> sorted_parts = SortParts(actual_filenames);
+ EXPECT_TRUE(AllPartsMatchOutOfOrderDuration(sorted_parts));
LogReader reader(sorted_parts, &config_.message());
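This is the pattern the change applies throughout: read back the filenames the loggers actually produced instead of the precomputed logfiles_ list, validate the sorted parts, and only then build the reader. A minimal sketch under those assumptions:

    std::vector<std::string> filenames;
    pi1_logger.AppendAllFilenames(&filenames);
    pi2_logger.AppendAllFilenames(&filenames);
    const std::vector<LogFile> sorted_parts = SortParts(filenames);
    // New invariant checked everywhere in this change: every sorted part
    // agrees on its max out-of-order duration.
    EXPECT_TRUE(AllPartsMatchOutOfOrderDuration(sorted_parts));
    LogReader reader(sorted_parts, &config_.message());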
@@ -698,6 +682,8 @@
// is forwarded
TEST_P(MultinodeLoggerTest, OnlyDoBeforeSendCallbackOnSenderNode) {
time_converter_.StartEqual();
+
+ std::vector<std::string> filenames;
{
LoggerState pi1_logger = MakeLogger(pi1_);
LoggerState pi2_logger = MakeLogger(pi2_);
@@ -708,9 +694,14 @@
StartLogger(&pi2_logger);
event_loop_factory_.RunFor(chrono::milliseconds(20000));
+
+ pi1_logger.AppendAllFilenames(&filenames);
+ pi2_logger.AppendAllFilenames(&filenames);
}
- LogReader reader(SortParts(logfiles_));
+ const std::vector<LogFile> sorted_parts = SortParts(filenames);
+ EXPECT_TRUE(AllPartsMatchOutOfOrderDuration(sorted_parts));
+ LogReader reader(sorted_parts);
int ping_count = 0;
// Adds a callback which mutates the value of the pong message before the
@@ -783,7 +774,8 @@
pi2_logger.AppendAllFilenames(&actual_filenames);
}
- const std::vector<LogFile> sorted_parts = SortParts(logfiles_);
+ const std::vector<LogFile> sorted_parts = SortParts(actual_filenames);
+ EXPECT_TRUE(AllPartsMatchOutOfOrderDuration(sorted_parts));
LogReader reader(sorted_parts, &config_.message());
SimulatedEventLoopFactory log_reader_factory(reader.configuration());
@@ -802,6 +794,8 @@
// the LogReader constructor.
TEST_P(MultinodeLoggerDeathTest, MultiNodeBadReplayConfig) {
time_converter_.StartEqual();
+
+ std::vector<std::string> filenames;
{
LoggerState pi1_logger = MakeLogger(pi1_);
LoggerState pi2_logger = MakeLogger(pi2_);
@@ -812,6 +806,9 @@
StartLogger(&pi2_logger);
event_loop_factory_.RunFor(chrono::milliseconds(20000));
+
+ pi1_logger.AppendAllFilenames(&filenames);
+ pi2_logger.AppendAllFilenames(&filenames);
}
// Test that, if we add an additional node to the replay config that the
@@ -826,7 +823,8 @@
}
)");
- const std::vector<LogFile> sorted_parts = SortParts(logfiles_);
+ const std::vector<LogFile> sorted_parts = SortParts(filenames);
+ EXPECT_TRUE(AllPartsMatchOutOfOrderDuration(sorted_parts));
EXPECT_DEATH(LogReader(sorted_parts, &extra_nodes_config.message()),
"Log file and replay config need to have matching nodes lists.");
}
@@ -856,7 +854,9 @@
// Since we delay starting pi2, it already knows about all the timestamps so
// we don't end up with extra parts.
- LogReader reader(SortParts(actual_filenames));
+ const std::vector<LogFile> sorted_parts = SortParts(actual_filenames);
+ EXPECT_TRUE(AllPartsMatchOutOfOrderDuration(sorted_parts));
+ LogReader reader(sorted_parts);
SimulatedEventLoopFactory log_reader_factory(reader.configuration());
log_reader_factory.set_send_delay(chrono::microseconds(0));
@@ -969,6 +969,7 @@
const chrono::nanoseconds logger_run3 = time_converter_.AddMonotonic(
{chrono::milliseconds(400), chrono::milliseconds(400)});
+ std::vector<std::string> actual_filenames;
{
LoggerState pi2_logger = MakeLogger(pi2_);
@@ -1009,14 +1010,19 @@
(pi2_->monotonic_now() - pi1_->monotonic_now()) - initial_pi2_offset,
event_loop_factory_.send_delay() +
event_loop_factory_.network_delay());
+
+ pi1_logger.AppendAllFilenames(&actual_filenames);
}
// And log a bit more on pi2.
event_loop_factory_.RunFor(logger_run3);
+
+ pi2_logger.AppendAllFilenames(&actual_filenames);
}
- LogReader reader(
- SortParts(MakeLogFiles(logfile_base1_, logfile_base2_, 3, 3)));
+ const std::vector<LogFile> sorted_parts = SortParts(actual_filenames);
+ EXPECT_TRUE(AllPartsMatchOutOfOrderDuration(sorted_parts));
+ LogReader reader(sorted_parts);
SimulatedEventLoopFactory log_reader_factory(reader.configuration());
log_reader_factory.set_send_delay(chrono::microseconds(0));
@@ -1140,6 +1146,8 @@
// Tests that we can sort a bunch of parts with an empty part. We should ignore
// it and remove it from the sorted list.
TEST_P(MultinodeLoggerTest, SortEmptyParts) {
+ std::vector<std::string> actual_filenames;
+
time_converter_.StartEqual();
// Make a bunch of parts.
{
@@ -1152,21 +1160,27 @@
StartLogger(&pi2_logger);
event_loop_factory_.RunFor(chrono::milliseconds(2000));
+ pi1_logger.AppendAllFilenames(&actual_filenames);
+ pi2_logger.AppendAllFilenames(&actual_filenames);
}
// TODO(austin): Should we flip out if the file can't open?
const std::string kEmptyFile("foobarinvalidfiledoesnotexist" + Extension());
aos::util::WriteStringToFileOrDie(kEmptyFile, "");
- logfiles_.emplace_back(kEmptyFile);
+ actual_filenames.emplace_back(kEmptyFile);
- const std::vector<LogFile> sorted_parts = SortParts(logfiles_);
+ const std::vector<LogFile> sorted_parts = SortParts(actual_filenames);
VerifyParts(sorted_parts, {kEmptyFile});
}
// Tests that we can sort a bunch of parts with the end missing off a
// file. We should use the part we can read.
TEST_P(MultinodeLoggerTest, SortTruncatedParts) {
+ if (file_strategy() == FileStrategy::kCombine) {
+ GTEST_SKIP() << "We don't need to test the combined file writer this deep.";
+ }
+
std::vector<std::string> actual_filenames;
time_converter_.StartEqual();
// Make a bunch of parts.
@@ -1192,10 +1206,10 @@
// For snappy, needs to have enough data to be >1 chunk of compressed data so
// that we don't corrupt the entire log part.
::std::string compressed_contents =
- aos::util::ReadFileToStringOrDie(logfiles_[4]);
+ aos::util::ReadFileToStringOrDie(logfiles_[2]);
aos::util::WriteStringToFileOrDie(
- logfiles_[4],
+ logfiles_[2],
compressed_contents.substr(0, compressed_contents.size() - 100));
const std::vector<LogFile> sorted_parts = SortParts(logfiles_);
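The truncation simulates a part whose tail was lost mid-write; for snappy the cut has to land inside the final compressed chunk so only that chunk is dropped rather than corrupting the whole part. A condensed sketch using the same aos::util file helpers:

    // Chop 100 bytes off the end of the chosen part, then re-sort; the
    // readable prefix should still be used.
    std::string contents = aos::util::ReadFileToStringOrDie(logfiles_[2]);
    aos::util::WriteStringToFileOrDie(
        logfiles_[2], contents.substr(0, contents.size() - 100));
    const std::vector<LogFile> sorted_parts = SortParts(logfiles_);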
@@ -1205,6 +1219,8 @@
// Tests that if we remap a logged channel, it shows up correctly.
TEST_P(MultinodeLoggerTest, RemapLoggedChannel) {
time_converter_.StartEqual();
+
+ std::vector<std::string> filenames;
{
LoggerState pi1_logger = MakeLogger(pi1_);
LoggerState pi2_logger = MakeLogger(pi2_);
@@ -1215,9 +1231,14 @@
StartLogger(&pi2_logger);
event_loop_factory_.RunFor(chrono::milliseconds(20000));
+
+ pi1_logger.AppendAllFilenames(&filenames);
+ pi2_logger.AppendAllFilenames(&filenames);
}
- LogReader reader(SortParts(logfiles_));
+ const std::vector<LogFile> sorted_parts = SortParts(filenames);
+ EXPECT_TRUE(AllPartsMatchOutOfOrderDuration(sorted_parts));
+ LogReader reader(sorted_parts);
// Remap just on pi1.
reader.RemapLoggedChannel<aos::timing::Report>(
@@ -1300,7 +1321,9 @@
pi2_logger.AppendAllFilenames(&actual_filenames);
}
- LogReader reader(SortParts(actual_filenames));
+ const std::vector<LogFile> sorted_parts = SortParts(actual_filenames);
+ EXPECT_TRUE(AllPartsMatchOutOfOrderDuration(sorted_parts));
+ LogReader reader(sorted_parts);
// Rename just on pi2. Add some global maps just to verify they get added in
// the config and used correctly.
@@ -1407,7 +1430,9 @@
event_loop_factory_.RunFor(chrono::milliseconds(20000));
}
- LogReader reader(SortParts(logfiles_));
+ const std::vector<LogFile> sorted_parts = SortParts(logfiles_);
+ EXPECT_TRUE(AllPartsMatchOutOfOrderDuration(sorted_parts));
+ LogReader reader(sorted_parts);
reader.RemapLoggedChannel<examples::Ping>("/test");
@@ -1488,7 +1513,9 @@
pi2_logger.AppendAllFilenames(&actual_filenames);
}
- LogReader reader(SortParts(actual_filenames));
+ const std::vector<LogFile> sorted_parts = SortParts(actual_filenames);
+ EXPECT_TRUE(AllPartsMatchOutOfOrderDuration(sorted_parts));
+ LogReader reader(sorted_parts);
std::vector<MapT> maps;
{
@@ -1578,6 +1605,8 @@
TEST_P(MultinodeLoggerTest, SingleNodeReplay) {
time_converter_.StartEqual();
constexpr chrono::milliseconds kStartupDelay(95);
+ std::vector<std::string> filenames;
+
{
LoggerState pi1_logger = MakeLogger(pi1_);
LoggerState pi2_logger = MakeLogger(pi2_);
@@ -1588,10 +1617,13 @@
StartLogger(&pi2_logger);
event_loop_factory_.RunFor(chrono::milliseconds(20000));
+
+ pi1_logger.AppendAllFilenames(&filenames);
+ pi2_logger.AppendAllFilenames(&filenames);
}
- LogReader full_reader(SortParts(logfiles_));
- LogReader single_node_reader(SortParts(logfiles_));
+ LogReader full_reader(SortParts(filenames));
+ LogReader single_node_reader(SortParts(filenames));
SimulatedEventLoopFactory full_factory(full_reader.configuration());
SimulatedEventLoopFactory single_node_factory(
@@ -1971,8 +2003,8 @@
LoggerState pi2_logger = MakeLogger(
log_reader_factory.GetNodeEventLoopFactory("pi2"), &log_reader_factory);
- StartLogger(&pi1_logger, tmp_dir_ + "/relogged1");
- StartLogger(&pi2_logger, tmp_dir_ + "/relogged2");
+ StartLogger(&pi1_logger, tmp_dir_ + "/logs/relogged1");
+ StartLogger(&pi2_logger, tmp_dir_ + "/logs/relogged2");
log_reader_factory.Run();
}
@@ -1982,8 +2014,9 @@
// And verify that we can run the LogReader over the relogged files without
// hitting any fatal errors.
{
- LogReader relogged_reader(SortParts(MakeLogFiles(
- tmp_dir_ + "/relogged1", tmp_dir_ + "/relogged2", 3, 3, true)));
+ LogReader relogged_reader(SortParts(
+ MakeLogFiles(tmp_dir_ + "/logs/relogged1", tmp_dir_ + "/logs/relogged2",
+ 1, 1, 2, 2, true)));
relogged_reader.Register();
relogged_reader.event_loop_factory()->Run();
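Note the MakeLogFiles arity change from (base1, base2, 3, 3, true) to (base1, base2, 1, 1, 2, 2, true): with data and timestamps now written to separate files, part counts are specified per file kind. One assumed reading of the new signature (a sketch only; the fixture defines the real one):

    std::vector<std::string> MakeLogFiles(
        std::string logfile_base1, std::string logfile_base2,
        size_t pi1_data_count, size_t pi2_data_count,
        size_t pi1_timestamps_count, size_t pi2_timestamps_count,
        bool relogged_config = false);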
@@ -1992,8 +2025,8 @@
// configuration.
{
LogReader relogged_reader(
- SortParts(MakeLogFiles(tmp_dir_ + "/relogged1", tmp_dir_ + "/relogged2",
- 3, 3, true)),
+ SortParts(MakeLogFiles(tmp_dir_ + "/logs/relogged1",
+ tmp_dir_ + "/logs/relogged2", 1, 1, 2, 2, true)),
reader.configuration());
relogged_reader.Register();
@@ -2023,7 +2056,7 @@
ASSERT_THAT(actual_filenames,
::testing::UnorderedElementsAreArray(logfiles_));
- for (const LogFile &log_file : SortParts(logfiles_)) {
+ for (const LogFile &log_file : SortParts(actual_filenames)) {
for (const LogParts &log_part : log_file.parts) {
if (log_part.node == log_file.logger_node) {
EXPECT_EQ(log_part.logger_monotonic_start_time,
@@ -2047,13 +2080,11 @@
// Test that renaming the base, renames the folder.
TEST_P(MultinodeLoggerTest, LoggerRenameFolder) {
- util::UnlinkRecursive(tmp_dir_ + "/renamefolder");
- util::UnlinkRecursive(tmp_dir_ + "/new-good");
time_converter_.AddMonotonic(
{BootTimestamp::epoch(), BootTimestamp::epoch() + chrono::seconds(1000)});
- logfile_base1_ = tmp_dir_ + "/renamefolder/multi_logfile1";
- logfile_base2_ = tmp_dir_ + "/renamefolder/multi_logfile2";
- logfiles_ = MakeLogFiles(logfile_base1_, logfile_base2_);
+ logfile_base1_ = tmp_dir_ + "/logs/renamefolder/multi_logfile1";
+ logfile_base2_ = tmp_dir_ + "/logs/renamefolder/multi_logfile2";
+
LoggerState pi1_logger = MakeLogger(pi1_);
LoggerState pi2_logger = MakeLogger(pi2_);
@@ -2061,8 +2092,8 @@
StartLogger(&pi2_logger);
event_loop_factory_.RunFor(chrono::milliseconds(10000));
- logfile_base1_ = tmp_dir_ + "/new-good/multi_logfile1";
- logfile_base2_ = tmp_dir_ + "/new-good/multi_logfile2";
+ logfile_base1_ = tmp_dir_ + "/logs/new-good/multi_logfile1";
+ logfile_base2_ = tmp_dir_ + "/logs/new-good/multi_logfile2";
logfiles_ = MakeLogFiles(logfile_base1_, logfile_base2_);
// Sequence of set_base_name and Rotate simulates rename operation. Since
@@ -2084,14 +2115,13 @@
TEST_P(MultinodeLoggerDeathTest, LoggerRenameFile) {
time_converter_.AddMonotonic(
{BootTimestamp::epoch(), BootTimestamp::epoch() + chrono::seconds(1000)});
- util::UnlinkRecursive(tmp_dir_ + "/renamefile");
- logfile_base1_ = tmp_dir_ + "/renamefile/multi_logfile1";
- logfile_base2_ = tmp_dir_ + "/renamefile/multi_logfile2";
- logfiles_ = MakeLogFiles(logfile_base1_, logfile_base2_);
+ logfile_base1_ = tmp_dir_ + "/logs/renamefile/multi_logfile1";
+ logfile_base2_ = tmp_dir_ + "/logs/renamefile/multi_logfile2";
+
LoggerState pi1_logger = MakeLogger(pi1_);
StartLogger(&pi1_logger);
event_loop_factory_.RunFor(chrono::milliseconds(10000));
- logfile_base1_ = tmp_dir_ + "/new-renamefile/new_multi_logfile1";
+ logfile_base1_ = tmp_dir_ + "/logs/new-renamefile/new_multi_logfile1";
EXPECT_DEATH({ pi1_logger.log_namer->set_base_name(logfile_base1_); },
"Rename of file base from");
}
@@ -2103,6 +2133,9 @@
// This should be enough that we can then re-run the logger and get a valid log
// back.
TEST_P(MultinodeLoggerTest, RemoteReboot) {
+ if (file_strategy() == FileStrategy::kCombine) {
+ GTEST_SKIP() << "We don't need to test the combined file writer this deep.";
+ }
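Several tests below gain the same guard, presumably because the combined file writer shares the sorting and replay machinery already covered elsewhere, so exercising it in these deep per-file header checks would only duplicate coverage. The guard, assuming file_strategy() decodes the test parameter:

    if (file_strategy() == FileStrategy::kCombine) {
      GTEST_SKIP() << "We don't need to test the combined file writer this deep.";
    }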
std::vector<std::string> actual_filenames;
const UUID pi1_boot0 = UUID::Random();
@@ -2179,8 +2212,7 @@
//
// TODO(austin): I'm not the most thrilled with this test pattern... It
// feels brittle in a different way.
- if (file.find("aos.message_bridge.RemoteMessage") == std::string::npos ||
- !shared()) {
+ if (file.find("timestamps/remote_pi2") == std::string::npos) {
switch (log_header->message().parts_index()) {
case 0:
EXPECT_EQ(source_node_boot_uuid, pi2_boot0);
@@ -2317,99 +2349,115 @@
monotonic_clock::max_time);
EXPECT_EQ(oldest_logger_local_unreliable_monotonic_timestamps,
monotonic_clock::max_time);
- switch (log_header->message().parts_index()) {
- case 0:
- EXPECT_EQ(oldest_remote_monotonic_timestamps,
- monotonic_clock::max_time);
- EXPECT_EQ(oldest_local_monotonic_timestamps, monotonic_clock::max_time);
- EXPECT_EQ(oldest_remote_unreliable_monotonic_timestamps,
- monotonic_clock::max_time);
- EXPECT_EQ(oldest_local_unreliable_monotonic_timestamps,
- monotonic_clock::max_time);
- EXPECT_EQ(oldest_remote_reliable_monotonic_timestamps,
- monotonic_clock::max_time);
- EXPECT_EQ(oldest_local_reliable_monotonic_timestamps,
- monotonic_clock::max_time);
- break;
- case 1:
- EXPECT_EQ(oldest_remote_monotonic_timestamps,
- monotonic_clock::time_point(chrono::microseconds(90200)));
- EXPECT_EQ(oldest_local_monotonic_timestamps,
- monotonic_clock::time_point(chrono::microseconds(90350)));
- EXPECT_EQ(oldest_remote_unreliable_monotonic_timestamps,
- monotonic_clock::time_point(chrono::microseconds(90200)));
- EXPECT_EQ(oldest_local_unreliable_monotonic_timestamps,
- monotonic_clock::time_point(chrono::microseconds(90350)));
- EXPECT_EQ(oldest_remote_reliable_monotonic_timestamps,
- monotonic_clock::max_time);
- EXPECT_EQ(oldest_local_reliable_monotonic_timestamps,
- monotonic_clock::max_time);
- break;
- case 2:
- EXPECT_EQ(oldest_remote_monotonic_timestamps,
- monotonic_clock::time_point(chrono::microseconds(90200)))
- << file;
- EXPECT_EQ(oldest_local_monotonic_timestamps,
- monotonic_clock::time_point(chrono::microseconds(90350)))
- << file;
- EXPECT_EQ(oldest_remote_unreliable_monotonic_timestamps,
- monotonic_clock::time_point(chrono::microseconds(90200)))
- << file;
- EXPECT_EQ(oldest_local_unreliable_monotonic_timestamps,
- monotonic_clock::time_point(chrono::microseconds(90350)))
- << file;
- EXPECT_EQ(oldest_remote_reliable_monotonic_timestamps,
- monotonic_clock::time_point(chrono::microseconds(100000)))
- << file;
- EXPECT_EQ(oldest_local_reliable_monotonic_timestamps,
- monotonic_clock::time_point(chrono::microseconds(100150)))
- << file;
- break;
- case 3:
- EXPECT_EQ(oldest_remote_monotonic_timestamps,
- monotonic_clock::time_point(chrono::milliseconds(1323) +
- chrono::microseconds(200)));
- EXPECT_EQ(oldest_local_monotonic_timestamps,
- monotonic_clock::time_point(chrono::microseconds(10100350)));
- EXPECT_EQ(oldest_remote_unreliable_monotonic_timestamps,
- monotonic_clock::time_point(chrono::milliseconds(1323) +
- chrono::microseconds(200)));
- EXPECT_EQ(oldest_local_unreliable_monotonic_timestamps,
- monotonic_clock::time_point(chrono::microseconds(10100350)));
- EXPECT_EQ(oldest_remote_reliable_monotonic_timestamps,
- monotonic_clock::max_time)
- << file;
- EXPECT_EQ(oldest_local_reliable_monotonic_timestamps,
- monotonic_clock::max_time)
- << file;
- break;
- case 4:
- EXPECT_EQ(oldest_remote_monotonic_timestamps,
- monotonic_clock::time_point(chrono::milliseconds(1323) +
- chrono::microseconds(200)));
- EXPECT_EQ(oldest_local_monotonic_timestamps,
- monotonic_clock::time_point(chrono::microseconds(10100350)));
- EXPECT_EQ(oldest_remote_unreliable_monotonic_timestamps,
- monotonic_clock::time_point(chrono::milliseconds(1323) +
- chrono::microseconds(200)));
- EXPECT_EQ(oldest_local_unreliable_monotonic_timestamps,
- monotonic_clock::time_point(chrono::microseconds(10100350)));
- EXPECT_EQ(oldest_remote_reliable_monotonic_timestamps,
- monotonic_clock::time_point(chrono::microseconds(1423000)))
- << file;
- EXPECT_EQ(oldest_local_reliable_monotonic_timestamps,
- monotonic_clock::time_point(chrono::microseconds(10200150)))
- << file;
- break;
- default:
- FAIL();
- break;
+ if (log_header->message().data_stored()->Get(0) == StoredDataType::DATA) {
+ switch (log_header->message().parts_index()) {
+ case 0:
+ ASSERT_EQ(oldest_remote_monotonic_timestamps,
+ monotonic_clock::max_time);
+ EXPECT_EQ(oldest_local_monotonic_timestamps,
+ monotonic_clock::max_time);
+ EXPECT_EQ(oldest_remote_unreliable_monotonic_timestamps,
+ monotonic_clock::max_time);
+ EXPECT_EQ(oldest_local_unreliable_monotonic_timestamps,
+ monotonic_clock::max_time);
+ EXPECT_EQ(oldest_remote_reliable_monotonic_timestamps,
+ monotonic_clock::max_time);
+ EXPECT_EQ(oldest_local_reliable_monotonic_timestamps,
+ monotonic_clock::max_time);
+ break;
+ default:
+ FAIL();
+ break;
+ }
+ } else if (log_header->message().data_stored()->Get(0) ==
+ StoredDataType::TIMESTAMPS) {
+ switch (log_header->message().parts_index()) {
+ case 0:
+ ASSERT_EQ(oldest_remote_monotonic_timestamps,
+ monotonic_clock::time_point(chrono::microseconds(90200)));
+ EXPECT_EQ(oldest_local_monotonic_timestamps,
+ monotonic_clock::time_point(chrono::microseconds(90350)));
+ EXPECT_EQ(oldest_remote_unreliable_monotonic_timestamps,
+ monotonic_clock::time_point(chrono::microseconds(90200)));
+ EXPECT_EQ(oldest_local_unreliable_monotonic_timestamps,
+ monotonic_clock::time_point(chrono::microseconds(90350)));
+ EXPECT_EQ(oldest_remote_reliable_monotonic_timestamps,
+ monotonic_clock::max_time);
+ EXPECT_EQ(oldest_local_reliable_monotonic_timestamps,
+ monotonic_clock::max_time);
+ break;
+ case 1:
+ ASSERT_EQ(oldest_remote_monotonic_timestamps,
+ monotonic_clock::time_point(chrono::microseconds(90200)))
+ << file;
+ EXPECT_EQ(oldest_local_monotonic_timestamps,
+ monotonic_clock::time_point(chrono::microseconds(90350)))
+ << file;
+ EXPECT_EQ(oldest_remote_unreliable_monotonic_timestamps,
+ monotonic_clock::time_point(chrono::microseconds(90200)))
+ << file;
+ EXPECT_EQ(oldest_local_unreliable_monotonic_timestamps,
+ monotonic_clock::time_point(chrono::microseconds(90350)))
+ << file;
+ EXPECT_EQ(oldest_remote_reliable_monotonic_timestamps,
+ monotonic_clock::time_point(chrono::microseconds(100000)))
+ << file;
+ EXPECT_EQ(oldest_local_reliable_monotonic_timestamps,
+ monotonic_clock::time_point(chrono::microseconds(100150)))
+ << file;
+ break;
+ case 2:
+ ASSERT_EQ(oldest_remote_monotonic_timestamps,
+ monotonic_clock::time_point(chrono::milliseconds(1323) +
+ chrono::microseconds(200)));
+ EXPECT_EQ(
+ oldest_local_monotonic_timestamps,
+ monotonic_clock::time_point(chrono::microseconds(10100350)));
+ EXPECT_EQ(oldest_remote_unreliable_monotonic_timestamps,
+ monotonic_clock::time_point(chrono::milliseconds(1323) +
+ chrono::microseconds(200)));
+ EXPECT_EQ(
+ oldest_local_unreliable_monotonic_timestamps,
+ monotonic_clock::time_point(chrono::microseconds(10100350)));
+ EXPECT_EQ(oldest_remote_reliable_monotonic_timestamps,
+ monotonic_clock::max_time)
+ << file;
+ EXPECT_EQ(oldest_local_reliable_monotonic_timestamps,
+ monotonic_clock::max_time)
+ << file;
+ break;
+ case 3:
+ ASSERT_EQ(oldest_remote_monotonic_timestamps,
+ monotonic_clock::time_point(chrono::milliseconds(1323) +
+ chrono::microseconds(200)));
+ EXPECT_EQ(
+ oldest_local_monotonic_timestamps,
+ monotonic_clock::time_point(chrono::microseconds(10100350)));
+ EXPECT_EQ(oldest_remote_unreliable_monotonic_timestamps,
+ monotonic_clock::time_point(chrono::milliseconds(1323) +
+ chrono::microseconds(200)));
+ EXPECT_EQ(
+ oldest_local_unreliable_monotonic_timestamps,
+ monotonic_clock::time_point(chrono::microseconds(10100350)));
+ EXPECT_EQ(oldest_remote_reliable_monotonic_timestamps,
+ monotonic_clock::time_point(chrono::microseconds(1423000)))
+ << file;
+ EXPECT_EQ(oldest_local_reliable_monotonic_timestamps,
+ monotonic_clock::time_point(chrono::microseconds(10200150)))
+ << file;
+ break;
+ default:
+ FAIL();
+ break;
+ }
}
}
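The header expectations above are now keyed off what the part actually stores: the first data_stored entry distinguishes data-only parts, whose oldest-timestamp bookkeeping should stay untouched, from timestamp parts, which step through the reboot timeline. A condensed sketch of that dispatch:

    const StoredDataType type = log_header->message().data_stored()->Get(0);
    if (type == StoredDataType::DATA) {
      // Data parts: all oldest_*_monotonic_timestamps remain at max_time.
    } else if (type == StoredDataType::TIMESTAMPS) {
      // Timestamp parts: parts_index 0..3 walk through the expected
      // pre- and post-reboot oldest timestamps checked above.
    }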
// Confirm that we refuse to replay logs with missing boot uuids.
{
- LogReader reader(SortParts(pi1_reboot_logfiles_));
+ auto sorted_parts = SortParts(pi1_reboot_logfiles_);
+ EXPECT_TRUE(AllPartsMatchOutOfOrderDuration(sorted_parts));
+ LogReader reader(sorted_parts);
SimulatedEventLoopFactory log_reader_factory(reader.configuration());
log_reader_factory.set_send_delay(chrono::microseconds(0));
@@ -2427,6 +2475,10 @@
// Tests that we can sort a log which only has timestamps from the remote
// because the local message_bridge_client failed to connect.
TEST_P(MultinodeLoggerTest, RemoteRebootOnlyTimestamps) {
+ if (file_strategy() == FileStrategy::kCombine) {
+ GTEST_SKIP() << "We don't need to test the combined file writer this deep.";
+ }
+
const UUID pi1_boot0 = UUID::Random();
const UUID pi2_boot0 = UUID::Random();
const UUID pi2_boot1 = UUID::Random();
@@ -2541,8 +2593,7 @@
2u);
if (log_header->message().node()->name()->string_view() != "pi1") {
- ASSERT_TRUE(file.find("aos.message_bridge.RemoteMessage") !=
- std::string::npos);
+ ASSERT_TRUE(file.find("timestamps/remote_pi2") != std::string::npos);
const std::optional<SizePrefixedFlatbufferVector<MessageHeader>> msg =
ReadNthMessage(file, 0);
@@ -2629,234 +2680,150 @@
SCOPED_TRACE(aos::FlatbufferToJson(
*log_header, {.multi_line = true, .max_vector_size = 100}));
- if (shared()) {
- // Confirm that the oldest timestamps match what we expect. Based on
- // what we are doing, we know that the oldest time is the first
- // message's time.
- //
- // This makes the test robust to both the split and combined config
- // tests.
- switch (log_header->message().parts_index()) {
- case 0:
- EXPECT_EQ(oldest_remote_monotonic_timestamps,
+ // Confirm that the oldest timestamps match what we expect. Based on
+ // what we are doing, we know that the oldest time is the first
+ // message's time.
+ //
+ // This makes the test robust to both the split and combined config
+ // tests.
+ switch (log_header->message().parts_index()) {
+ case 0:
+ EXPECT_EQ(oldest_remote_monotonic_timestamps,
+ expected_oldest_remote_monotonic_timestamps);
+ EXPECT_EQ(oldest_local_monotonic_timestamps,
+ expected_oldest_local_monotonic_timestamps);
+ EXPECT_EQ(oldest_logger_remote_unreliable_monotonic_timestamps,
+ expected_oldest_local_monotonic_timestamps)
+ << file;
+ EXPECT_EQ(oldest_logger_local_unreliable_monotonic_timestamps,
+ expected_oldest_timestamp_monotonic_timestamps)
+ << file;
+
+ if (reliable) {
+ EXPECT_EQ(oldest_remote_reliable_monotonic_timestamps,
expected_oldest_remote_monotonic_timestamps);
- EXPECT_EQ(oldest_local_monotonic_timestamps,
+ EXPECT_EQ(oldest_local_reliable_monotonic_timestamps,
expected_oldest_local_monotonic_timestamps);
- EXPECT_EQ(oldest_logger_remote_unreliable_monotonic_timestamps,
- expected_oldest_local_monotonic_timestamps)
- << file;
- EXPECT_EQ(oldest_logger_local_unreliable_monotonic_timestamps,
- expected_oldest_timestamp_monotonic_timestamps)
- << file;
-
- if (reliable) {
- EXPECT_EQ(oldest_remote_reliable_monotonic_timestamps,
- expected_oldest_remote_monotonic_timestamps);
- EXPECT_EQ(oldest_local_reliable_monotonic_timestamps,
- expected_oldest_local_monotonic_timestamps);
- EXPECT_EQ(oldest_remote_unreliable_monotonic_timestamps,
- monotonic_clock::max_time);
- EXPECT_EQ(oldest_local_unreliable_monotonic_timestamps,
- monotonic_clock::max_time);
- } else {
- EXPECT_EQ(oldest_remote_reliable_monotonic_timestamps,
- monotonic_clock::max_time);
- EXPECT_EQ(oldest_local_reliable_monotonic_timestamps,
- monotonic_clock::max_time);
- EXPECT_EQ(oldest_remote_unreliable_monotonic_timestamps,
- expected_oldest_remote_monotonic_timestamps);
- EXPECT_EQ(oldest_local_unreliable_monotonic_timestamps,
- expected_oldest_local_monotonic_timestamps);
- }
- break;
- case 1:
- EXPECT_EQ(oldest_remote_monotonic_timestamps,
- monotonic_clock::epoch() + chrono::nanoseconds(90000000));
- EXPECT_EQ(oldest_local_monotonic_timestamps,
- monotonic_clock::epoch() + chrono::nanoseconds(90150000));
- EXPECT_EQ(oldest_logger_remote_unreliable_monotonic_timestamps,
- monotonic_clock::epoch() + chrono::nanoseconds(90150000));
- EXPECT_EQ(oldest_logger_local_unreliable_monotonic_timestamps,
- monotonic_clock::epoch() + chrono::nanoseconds(90250000));
- if (reliable) {
- EXPECT_EQ(oldest_remote_reliable_monotonic_timestamps,
- expected_oldest_remote_monotonic_timestamps);
- EXPECT_EQ(oldest_local_reliable_monotonic_timestamps,
- expected_oldest_local_monotonic_timestamps);
- EXPECT_EQ(
- oldest_remote_unreliable_monotonic_timestamps,
- monotonic_clock::epoch() + chrono::nanoseconds(90000000));
- EXPECT_EQ(
- oldest_local_unreliable_monotonic_timestamps,
- monotonic_clock::epoch() + chrono::nanoseconds(90150000));
- } else {
- EXPECT_EQ(oldest_remote_reliable_monotonic_timestamps,
- monotonic_clock::max_time);
- EXPECT_EQ(oldest_local_reliable_monotonic_timestamps,
- monotonic_clock::max_time);
- EXPECT_EQ(oldest_remote_unreliable_monotonic_timestamps,
- expected_oldest_remote_monotonic_timestamps);
- EXPECT_EQ(oldest_local_unreliable_monotonic_timestamps,
- expected_oldest_local_monotonic_timestamps);
- }
- break;
- case 2:
- EXPECT_EQ(
- oldest_remote_monotonic_timestamps,
- monotonic_clock::epoch() + chrono::nanoseconds(10000000000));
- EXPECT_EQ(
- oldest_local_monotonic_timestamps,
- monotonic_clock::epoch() + chrono::nanoseconds(1323100000));
- EXPECT_EQ(oldest_logger_remote_unreliable_monotonic_timestamps,
- expected_oldest_local_monotonic_timestamps)
- << file;
- EXPECT_EQ(oldest_logger_local_unreliable_monotonic_timestamps,
- expected_oldest_timestamp_monotonic_timestamps)
- << file;
- if (reliable) {
- EXPECT_EQ(oldest_remote_reliable_monotonic_timestamps,
- expected_oldest_remote_monotonic_timestamps);
- EXPECT_EQ(oldest_local_reliable_monotonic_timestamps,
- expected_oldest_local_monotonic_timestamps);
- EXPECT_EQ(oldest_remote_unreliable_monotonic_timestamps,
- monotonic_clock::max_time);
- EXPECT_EQ(oldest_local_unreliable_monotonic_timestamps,
- monotonic_clock::max_time);
- } else {
- EXPECT_EQ(oldest_remote_reliable_monotonic_timestamps,
- monotonic_clock::max_time);
- EXPECT_EQ(oldest_local_reliable_monotonic_timestamps,
- monotonic_clock::max_time);
- EXPECT_EQ(oldest_remote_unreliable_monotonic_timestamps,
- expected_oldest_remote_monotonic_timestamps);
- EXPECT_EQ(oldest_local_unreliable_monotonic_timestamps,
- expected_oldest_local_monotonic_timestamps);
- }
- break;
-
- case 3:
- EXPECT_EQ(
- oldest_remote_monotonic_timestamps,
- monotonic_clock::epoch() + chrono::nanoseconds(10000000000));
- EXPECT_EQ(
- oldest_local_monotonic_timestamps,
- monotonic_clock::epoch() + chrono::nanoseconds(1323100000));
+ EXPECT_EQ(oldest_remote_unreliable_monotonic_timestamps,
+ monotonic_clock::max_time);
+ EXPECT_EQ(oldest_local_unreliable_monotonic_timestamps,
+ monotonic_clock::max_time);
+ } else {
+ EXPECT_EQ(oldest_remote_reliable_monotonic_timestamps,
+ monotonic_clock::max_time);
+ EXPECT_EQ(oldest_local_reliable_monotonic_timestamps,
+ monotonic_clock::max_time);
EXPECT_EQ(oldest_remote_unreliable_monotonic_timestamps,
expected_oldest_remote_monotonic_timestamps);
EXPECT_EQ(oldest_local_unreliable_monotonic_timestamps,
expected_oldest_local_monotonic_timestamps);
- EXPECT_EQ(
- oldest_logger_remote_unreliable_monotonic_timestamps,
- monotonic_clock::epoch() + chrono::nanoseconds(1323100000));
- EXPECT_EQ(
- oldest_logger_local_unreliable_monotonic_timestamps,
- monotonic_clock::epoch() + chrono::nanoseconds(10100200000));
- break;
- default:
- FAIL();
- break;
- }
+ }
+ break;
+ case 1:
+ EXPECT_EQ(oldest_remote_monotonic_timestamps,
+ monotonic_clock::epoch() + chrono::nanoseconds(90000000));
+ EXPECT_EQ(oldest_local_monotonic_timestamps,
+ monotonic_clock::epoch() + chrono::nanoseconds(90150000));
+ EXPECT_EQ(oldest_logger_remote_unreliable_monotonic_timestamps,
+ monotonic_clock::epoch() + chrono::nanoseconds(90150000));
+ EXPECT_EQ(oldest_logger_local_unreliable_monotonic_timestamps,
+ monotonic_clock::epoch() + chrono::nanoseconds(90250000));
+ if (reliable) {
+ EXPECT_EQ(oldest_remote_reliable_monotonic_timestamps,
+ expected_oldest_remote_monotonic_timestamps);
+ EXPECT_EQ(oldest_local_reliable_monotonic_timestamps,
+ expected_oldest_local_monotonic_timestamps);
+ EXPECT_EQ(oldest_remote_unreliable_monotonic_timestamps,
+ monotonic_clock::epoch() + chrono::nanoseconds(90000000));
+ EXPECT_EQ(oldest_local_unreliable_monotonic_timestamps,
+ monotonic_clock::epoch() + chrono::nanoseconds(90150000));
+ } else {
+ EXPECT_EQ(oldest_remote_reliable_monotonic_timestamps,
+ monotonic_clock::max_time);
+ EXPECT_EQ(oldest_local_reliable_monotonic_timestamps,
+ monotonic_clock::max_time);
+ EXPECT_EQ(oldest_remote_unreliable_monotonic_timestamps,
+ expected_oldest_remote_monotonic_timestamps);
+ EXPECT_EQ(oldest_local_unreliable_monotonic_timestamps,
+ expected_oldest_local_monotonic_timestamps);
+ }
+ break;
+ case 2:
+ EXPECT_EQ(
+ oldest_remote_monotonic_timestamps,
+ monotonic_clock::epoch() + chrono::nanoseconds(10000000000));
+ EXPECT_EQ(oldest_local_monotonic_timestamps,
+ monotonic_clock::epoch() + chrono::nanoseconds(1323100000));
+ EXPECT_EQ(oldest_logger_remote_unreliable_monotonic_timestamps,
+ expected_oldest_local_monotonic_timestamps)
+ << file;
+ EXPECT_EQ(oldest_logger_local_unreliable_monotonic_timestamps,
+ expected_oldest_timestamp_monotonic_timestamps)
+ << file;
+ if (reliable) {
+ EXPECT_EQ(oldest_remote_reliable_monotonic_timestamps,
+ expected_oldest_remote_monotonic_timestamps);
+ EXPECT_EQ(oldest_local_reliable_monotonic_timestamps,
+ expected_oldest_local_monotonic_timestamps);
+ EXPECT_EQ(oldest_remote_unreliable_monotonic_timestamps,
+ monotonic_clock::max_time);
+ EXPECT_EQ(oldest_local_unreliable_monotonic_timestamps,
+ monotonic_clock::max_time);
+ } else {
+ EXPECT_EQ(oldest_remote_reliable_monotonic_timestamps,
+ monotonic_clock::max_time);
+ EXPECT_EQ(oldest_local_reliable_monotonic_timestamps,
+ monotonic_clock::max_time);
+ EXPECT_EQ(oldest_remote_unreliable_monotonic_timestamps,
+ expected_oldest_remote_monotonic_timestamps);
+ EXPECT_EQ(oldest_local_unreliable_monotonic_timestamps,
+ expected_oldest_local_monotonic_timestamps);
+ }
+ break;
- switch (log_header->message().parts_index()) {
- case 0:
- EXPECT_EQ(source_node_boot_uuid, pi2_boot0);
- EXPECT_EQ(monotonic_start_time, monotonic_clock::min_time);
- break;
- case 1:
- EXPECT_EQ(source_node_boot_uuid, pi2_boot0);
- EXPECT_EQ(monotonic_start_time, monotonic_clock::min_time);
- break;
- case 2:
- EXPECT_EQ(source_node_boot_uuid, pi2_boot1);
- EXPECT_EQ(monotonic_start_time, monotonic_clock::min_time);
- break;
- case 3:
- if (shared()) {
- EXPECT_EQ(source_node_boot_uuid, pi2_boot1);
- EXPECT_EQ(monotonic_start_time, monotonic_clock::min_time);
- break;
- }
- [[fallthrough]];
- default:
- FAIL();
- break;
- }
- } else {
- switch (log_header->message().parts_index()) {
- case 0:
- if (reliable) {
- EXPECT_EQ(oldest_remote_unreliable_monotonic_timestamps,
- monotonic_clock::max_time);
- EXPECT_EQ(oldest_local_unreliable_monotonic_timestamps,
- monotonic_clock::max_time);
- EXPECT_EQ(
- oldest_logger_remote_unreliable_monotonic_timestamps,
- monotonic_clock::epoch() + chrono::nanoseconds(100150000))
- << file;
- EXPECT_EQ(
- oldest_logger_local_unreliable_monotonic_timestamps,
- monotonic_clock::epoch() + chrono::nanoseconds(100250000))
- << file;
- } else {
- EXPECT_EQ(oldest_remote_unreliable_monotonic_timestamps,
- expected_oldest_remote_monotonic_timestamps);
- EXPECT_EQ(oldest_local_unreliable_monotonic_timestamps,
- expected_oldest_local_monotonic_timestamps);
- EXPECT_EQ(
- oldest_logger_remote_unreliable_monotonic_timestamps,
- monotonic_clock::epoch() + chrono::nanoseconds(90150000))
- << file;
- EXPECT_EQ(
- oldest_logger_local_unreliable_monotonic_timestamps,
- monotonic_clock::epoch() + chrono::nanoseconds(90250000))
- << file;
- }
- break;
- case 1:
- if (reliable) {
- EXPECT_EQ(oldest_remote_unreliable_monotonic_timestamps,
- monotonic_clock::max_time);
- EXPECT_EQ(oldest_local_unreliable_monotonic_timestamps,
- monotonic_clock::max_time);
- EXPECT_EQ(
- oldest_logger_remote_unreliable_monotonic_timestamps,
- monotonic_clock::epoch() + chrono::nanoseconds(1323100000));
- EXPECT_EQ(
- oldest_logger_local_unreliable_monotonic_timestamps,
- monotonic_clock::epoch() + chrono::nanoseconds(10100200000));
- } else {
- EXPECT_EQ(oldest_remote_unreliable_monotonic_timestamps,
- expected_oldest_remote_monotonic_timestamps);
- EXPECT_EQ(oldest_local_unreliable_monotonic_timestamps,
- expected_oldest_local_monotonic_timestamps);
- EXPECT_EQ(
- oldest_logger_remote_unreliable_monotonic_timestamps,
- monotonic_clock::epoch() + chrono::nanoseconds(1323150000));
- EXPECT_EQ(
- oldest_logger_local_unreliable_monotonic_timestamps,
- monotonic_clock::epoch() + chrono::nanoseconds(10100250000));
- }
- break;
- default:
- FAIL();
- break;
- }
-
- switch (log_header->message().parts_index()) {
- case 0:
- EXPECT_EQ(source_node_boot_uuid, pi2_boot0);
- EXPECT_EQ(monotonic_start_time, monotonic_clock::min_time);
- break;
- case 1:
- EXPECT_EQ(source_node_boot_uuid, pi2_boot1);
- EXPECT_EQ(monotonic_start_time, monotonic_clock::min_time);
- break;
- default:
- FAIL();
- break;
- }
+ case 3:
+ EXPECT_EQ(
+ oldest_remote_monotonic_timestamps,
+ monotonic_clock::epoch() + chrono::nanoseconds(10000000000));
+ EXPECT_EQ(oldest_local_monotonic_timestamps,
+ monotonic_clock::epoch() + chrono::nanoseconds(1323100000));
+ EXPECT_EQ(oldest_remote_unreliable_monotonic_timestamps,
+ expected_oldest_remote_monotonic_timestamps);
+ EXPECT_EQ(oldest_local_unreliable_monotonic_timestamps,
+ expected_oldest_local_monotonic_timestamps);
+ EXPECT_EQ(oldest_logger_remote_unreliable_monotonic_timestamps,
+ monotonic_clock::epoch() + chrono::nanoseconds(1323100000));
+ EXPECT_EQ(
+ oldest_logger_local_unreliable_monotonic_timestamps,
+ monotonic_clock::epoch() + chrono::nanoseconds(10100200000));
+ break;
+ default:
+ FAIL();
+ break;
}
+ switch (log_header->message().parts_index()) {
+ case 0:
+ EXPECT_EQ(source_node_boot_uuid, pi2_boot0);
+ EXPECT_EQ(monotonic_start_time, monotonic_clock::min_time);
+ break;
+ case 1:
+ EXPECT_EQ(source_node_boot_uuid, pi2_boot0);
+ EXPECT_EQ(monotonic_start_time, monotonic_clock::min_time);
+ break;
+ case 2:
+ EXPECT_EQ(source_node_boot_uuid, pi2_boot1);
+ EXPECT_EQ(monotonic_start_time, monotonic_clock::min_time);
+ break;
+ case 3:
+ EXPECT_EQ(source_node_boot_uuid, pi2_boot1);
+ EXPECT_EQ(monotonic_start_time, monotonic_clock::min_time);
+ break;
+ default:
+ FAIL();
+ break;
+ }
continue;
}
EXPECT_EQ(
@@ -2908,15 +2875,13 @@
}
}
- if (shared()) {
- EXPECT_EQ(timestamp_file_count, 4u);
- } else {
- EXPECT_EQ(timestamp_file_count, 4u);
- }
+ EXPECT_EQ(timestamp_file_count, 4u);
// Confirm that we can actually sort the resulting log and read it.
{
- LogReader reader(SortParts(filenames));
+ auto sorted_parts = SortParts(filenames);
+ EXPECT_TRUE(AllPartsMatchOutOfOrderDuration(sorted_parts));
+ LogReader reader(sorted_parts);
SimulatedEventLoopFactory log_reader_factory(reader.configuration());
log_reader_factory.set_send_delay(chrono::microseconds(0));
@@ -2934,6 +2899,8 @@
// Tests that we properly handle one direction of message_bridge being
// unavailable.
TEST_P(MultinodeLoggerTest, OneDirectionWithNegativeSlope) {
+ std::vector<std::string> actual_filenames;
+
pi1_->Disconnect(pi2_->node());
time_converter_.AddMonotonic(
{BootTimestamp::epoch(), BootTimestamp::epoch() + chrono::seconds(1000)});
@@ -2949,11 +2916,12 @@
StartLogger(&pi1_logger);
event_loop_factory_.RunFor(chrono::milliseconds(10000));
+ pi1_logger.AppendAllFilenames(&actual_filenames);
}
- // Confirm that we can parse the result. LogReader has enough internal CHECKs
- // to confirm the right thing happened.
- ConfirmReadable(pi1_single_direction_logfiles_);
+ // Confirm that we can parse the result. LogReader has enough internal
+ // CHECKs to confirm the right thing happened.
+ ConfirmReadable(actual_filenames);
}
// Tests that we properly handle one direction of message_bridge being
@@ -2966,6 +2934,8 @@
time_converter_.AddMonotonic(
{chrono::milliseconds(10000),
chrono::milliseconds(10000) + chrono::milliseconds(1)});
+
+ std::vector<std::string> filenames;
{
LoggerState pi1_logger = MakeLogger(pi1_);
@@ -2974,11 +2944,12 @@
StartLogger(&pi1_logger);
event_loop_factory_.RunFor(chrono::milliseconds(10000));
+ pi1_logger.AppendAllFilenames(&filenames);
}
- // Confirm that we can parse the result. LogReader has enough internal CHECKs
- // to confirm the right thing happened.
- ConfirmReadable(pi1_single_direction_logfiles_);
+ // Confirm that we can parse the result. LogReader has enough internal
+ // CHECKs to confirm the right thing happened.
+ ConfirmReadable(filenames);
}
// Tests that we explode if someone passes in a part file twice with a better
@@ -2986,6 +2957,8 @@
TEST_P(MultinodeLoggerTest, DuplicateLogFiles) {
time_converter_.AddMonotonic(
{BootTimestamp::epoch(), BootTimestamp::epoch() + chrono::seconds(1000)});
+
+ std::vector<std::string> filenames;
{
LoggerState pi1_logger = MakeLogger(pi1_);
@@ -2994,10 +2967,12 @@
StartLogger(&pi1_logger);
event_loop_factory_.RunFor(chrono::milliseconds(10000));
+
+ pi1_logger.AppendAllFilenames(&filenames);
}
std::vector<std::string> duplicates;
- for (const std::string &f : pi1_single_direction_logfiles_) {
+ for (const std::string &f : filenames) {
duplicates.emplace_back(f);
duplicates.emplace_back(f);
}
@@ -3006,6 +2981,9 @@
// Tests that we explode if someone loses a part out of the middle of a log.
TEST_P(MultinodeLoggerTest, MissingPartsFromMiddle) {
+ if (file_strategy() == FileStrategy::kCombine) {
+ GTEST_SKIP() << "We don't need to test the combined file writer this deep.";
+ }
time_converter_.AddMonotonic(
{BootTimestamp::epoch(), BootTimestamp::epoch() + chrono::seconds(1000)});
{
@@ -3016,21 +2994,24 @@
StartLogger(&pi1_logger);
aos::monotonic_clock::time_point last_rotation_time =
pi1_logger.event_loop->monotonic_now();
- pi1_logger.logger->set_on_logged_period([&] {
- const auto now = pi1_logger.event_loop->monotonic_now();
- if (now > last_rotation_time + std::chrono::seconds(5)) {
- pi1_logger.logger->Rotate();
- last_rotation_time = now;
- }
- });
+ pi1_logger.logger->set_on_logged_period(
+ [&](aos::monotonic_clock::time_point) {
+ const auto now = pi1_logger.event_loop->monotonic_now();
+ if (now > last_rotation_time + std::chrono::seconds(5)) {
+ pi1_logger.logger->Rotate();
+ last_rotation_time = now;
+ }
+ });
event_loop_factory_.RunFor(chrono::milliseconds(10000));
}
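The on_logged_period callback now receives the monotonic time of the period being logged. This test ignores the argument and re-reads the event-loop clock, but the parameter could drive the rotation directly; a sketch of that variant, assuming the same Logger API:

    pi1_logger.logger->set_on_logged_period(
        [&](aos::monotonic_clock::time_point t) {
          // Rotate at most once every 5 seconds of logged time.
          if (t > last_rotation_time + std::chrono::seconds(5)) {
            pi1_logger.logger->Rotate();
            last_rotation_time = t;
          }
        });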
std::vector<std::string> missing_parts;
- missing_parts.emplace_back(logfile_base1_ + "_pi1_data.part0" + Extension());
- missing_parts.emplace_back(logfile_base1_ + "_pi1_data.part2" + Extension());
+ missing_parts.emplace_back(logfile_base1_ + "_pi1_timestamps.part0" +
+ Extension());
+ missing_parts.emplace_back(logfile_base1_ + "_pi1_timestamps.part2" +
+ Extension());
missing_parts.emplace_back(absl::StrCat(
logfile_base1_, "_", std::get<0>(GetParam()).sha256, Extension()));
@@ -3038,8 +3019,8 @@
"Broken log, missing part files between");
}
-// Tests that we properly handle a dead node. Do this by just disconnecting it
-// and only using one nodes of logs.
+// Tests that we properly handle a dead node. Do this by just disconnecting
+// it and only using one node's logs.
TEST_P(MultinodeLoggerTest, DeadNode) {
pi1_->Disconnect(pi2_->node());
pi2_->Disconnect(pi1_->node());
@@ -3055,14 +3036,14 @@
event_loop_factory_.RunFor(chrono::milliseconds(10000));
}
- // Confirm that we can parse the result. LogReader has enough internal CHECKs
- // to confirm the right thing happened.
+ // Confirm that we can parse the result. LogReader has enough internal
+ // CHECKs to confirm the right thing happened.
ConfirmReadable(MakePi1DeadNodeLogfiles());
}
-// Tests that we can relog with a different config. This makes most sense when
-// you are trying to edit a log and want to use channel renaming + the original
-// config in the new log.
+// Tests that we can relog with a different config. This makes most sense
+// when you are trying to edit a log and want to use channel renaming + the
+// original config in the new log.
TEST_P(MultinodeLoggerTest, LogDifferentConfig) {
time_converter_.StartEqual();
{
@@ -3077,7 +3058,9 @@
event_loop_factory_.RunFor(chrono::milliseconds(20000));
}
- LogReader reader(SortParts(logfiles_));
+ auto sorted_parts = SortParts(logfiles_);
+ EXPECT_TRUE(AllPartsMatchOutOfOrderDuration(sorted_parts));
+ LogReader reader(sorted_parts);
reader.RemapLoggedChannel<aos::examples::Ping>("/test", "/original");
SimulatedEventLoopFactory log_reader_factory(reader.configuration());
@@ -3116,16 +3099,16 @@
MakeLogger(log_reader_factory.GetNodeEventLoopFactory("pi2"),
&log_reader_factory, reader.logged_configuration());
- pi1_logger.StartLogger(tmp_dir_ + "/relogged1");
- pi2_logger.StartLogger(tmp_dir_ + "/relogged2");
+ pi1_logger.StartLogger(tmp_dir_ + "/logs/relogged1");
+ pi2_logger.StartLogger(tmp_dir_ + "/logs/relogged2");
log_reader_factory.Run();
for (auto &x : pi1_logger.log_namer->all_filenames()) {
- log_files.emplace_back(absl::StrCat(tmp_dir_, "/relogged1_", x));
+ log_files.emplace_back(absl::StrCat(tmp_dir_, "/logs/relogged1_", x));
}
for (auto &x : pi2_logger.log_namer->all_filenames()) {
- log_files.emplace_back(absl::StrCat(tmp_dir_, "/relogged2_", x));
+ log_files.emplace_back(absl::StrCat(tmp_dir_, "/logs/relogged2_", x));
}
}
@@ -3134,18 +3117,23 @@
// And verify that we can run the LogReader over the relogged files without
// hitting any fatal errors.
{
- LogReader relogged_reader(SortParts(log_files));
+ auto sorted_parts = SortParts(log_files);
+ EXPECT_TRUE(AllPartsMatchOutOfOrderDuration(sorted_parts));
+ LogReader relogged_reader(sorted_parts);
relogged_reader.Register();
relogged_reader.event_loop_factory()->Run();
}
}
-// Tests that we properly replay a log where the start time for a node is before
-// any data on the node. This can happen if the logger starts before data is
-// published. While the scenario below is a bit convoluted, we have seen logs
-// like this generated out in the wild.
+// Tests that we properly replay a log where the start time for a node is
+// before any data on the node. This can happen if the logger starts before
+// data is published. While the scenario below is a bit convoluted, we have
+// seen logs like this generated out in the wild.
TEST(MultinodeRebootLoggerTest, StartTimeBeforeData) {
+ util::UnlinkRecursive(aos::testing::TestTmpDir() + "/logs");
+ std::filesystem::create_directory(aos::testing::TestTmpDir() + "/logs");
+
aos::FlatbufferDetachedBuffer<aos::Configuration> config =
aos::configuration::ReadConfig(ArtifactPath(
"aos/events/logging/multinode_pingpong_split3_config.json"));
@@ -3167,17 +3155,14 @@
event_loop_factory.configuration(), pi3->node());
const std::string kLogfile1_1 =
- aos::testing::TestTmpDir() + "/multi_logfile1/";
+ aos::testing::TestTmpDir() + "/logs/multi_logfile1/";
const std::string kLogfile2_1 =
- aos::testing::TestTmpDir() + "/multi_logfile2.1/";
+ aos::testing::TestTmpDir() + "/logs/multi_logfile2.1/";
const std::string kLogfile2_2 =
- aos::testing::TestTmpDir() + "/multi_logfile2.2/";
+ aos::testing::TestTmpDir() + "/logs/multi_logfile2.2/";
const std::string kLogfile3_1 =
- aos::testing::TestTmpDir() + "/multi_logfile3/";
- util::UnlinkRecursive(kLogfile1_1);
- util::UnlinkRecursive(kLogfile2_1);
- util::UnlinkRecursive(kLogfile2_2);
- util::UnlinkRecursive(kLogfile3_1);
+ aos::testing::TestTmpDir() + "/logs/multi_logfile3/";
+
const UUID pi1_boot0 = UUID::Random();
const UUID pi2_boot0 = UUID::Random();
const UUID pi2_boot1 = UUID::Random();
@@ -3213,13 +3198,16 @@
std::vector<std::string> filenames;
{
LoggerState pi1_logger = MakeLoggerState(
- pi1, &event_loop_factory, SupportedCompressionAlgorithms()[0]);
+ pi1, &event_loop_factory, SupportedCompressionAlgorithms()[0],
+ FileStrategy::kKeepSeparate);
LoggerState pi3_logger = MakeLoggerState(
- pi3, &event_loop_factory, SupportedCompressionAlgorithms()[0]);
+ pi3, &event_loop_factory, SupportedCompressionAlgorithms()[0],
+ FileStrategy::kKeepSeparate);
{
// And now start the logger.
LoggerState pi2_logger = MakeLoggerState(
- pi2, &event_loop_factory, SupportedCompressionAlgorithms()[0]);
+ pi2, &event_loop_factory, SupportedCompressionAlgorithms()[0],
+ FileStrategy::kKeepSeparate);
event_loop_factory.RunFor(chrono::milliseconds(1000));
@@ -3254,7 +3242,8 @@
// Start logging again on pi2 after it is up.
LoggerState pi2_logger = MakeLoggerState(
- pi2, &event_loop_factory, SupportedCompressionAlgorithms()[0]);
+ pi2, &event_loop_factory, SupportedCompressionAlgorithms()[0],
+ FileStrategy::kKeepSeparate);
pi2_logger.StartLogger(kLogfile2_2);
event_loop_factory.RunFor(chrono::milliseconds(10000));
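MakeLoggerState gains an explicit FileStrategy argument; these reboot tests pin FileStrategy::kKeepSeparate so data and timestamp parts stay in separate files, which the per-file expectations in these tests rely on. Usage as in this change:

    LoggerState pi2_logger = MakeLoggerState(
        pi2, &event_loop_factory, SupportedCompressionAlgorithms()[0],
        FileStrategy::kKeepSeparate);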
@@ -3277,9 +3266,10 @@
pi3_logger.AppendAllFilenames(&filenames);
}
- // Confirm that we can parse the result. LogReader has enough internal CHECKs
- // to confirm the right thing happened.
+ // Confirm that we can parse the result. LogReader has enough internal
+ // CHECKs to confirm the right thing happened.
const std::vector<LogFile> sorted_parts = SortParts(filenames);
+ EXPECT_TRUE(AllPartsMatchOutOfOrderDuration(sorted_parts));
auto result = ConfirmReadable(filenames);
EXPECT_THAT(result[0].first, ::testing::ElementsAre(realtime_clock::epoch() +
chrono::seconds(1)));
@@ -3304,16 +3294,20 @@
}
// Tests that local data before remote data after reboot is properly replayed.
-// We only trigger a reboot in the timestamp interpolation function when solving
-// the timestamp problem when we actually have a point in the function. This
-// originally only happened when a point passes the noncausal filter. At the
-// start of time for the second boot, if we aren't careful, we will have
-// messages which need to be published at times before the boot. This happens
-// when a local message is in the log before a forwarded message, so there is no
-// point in the interpolation function. This delays the reboot. So, we need to
-// recreate that situation and make sure it doesn't come back.
+// We only trigger a reboot in the timestamp interpolation function when
+// solving the timestamp problem when we actually have a point in the
+// function. This originally only happened when a point passes the noncausal
+// filter. At the start of time for the second boot, if we aren't careful, we
+// will have messages which need to be published at times before the boot.
+// This happens when a local message is in the log before a forwarded message,
+// so there is no point in the interpolation function. This delays the
+// reboot. So, we need to recreate that situation and make sure it doesn't
+// come back.
TEST(MultinodeRebootLoggerTest,
LocalMessageBeforeRemoteBeforeStartAfterReboot) {
+ util::UnlinkRecursive(aos::testing::TestTmpDir() + "/logs");
+ std::filesystem::create_directory(aos::testing::TestTmpDir() + "/logs");
+
aos::FlatbufferDetachedBuffer<aos::Configuration> config =
aos::configuration::ReadConfig(ArtifactPath(
"aos/events/logging/multinode_pingpong_split3_config.json"));
@@ -3335,17 +3329,13 @@
event_loop_factory.configuration(), pi3->node());
const std::string kLogfile1_1 =
- aos::testing::TestTmpDir() + "/multi_logfile1/";
+ aos::testing::TestTmpDir() + "/logs/multi_logfile1/";
const std::string kLogfile2_1 =
- aos::testing::TestTmpDir() + "/multi_logfile2.1/";
+ aos::testing::TestTmpDir() + "/logs/multi_logfile2.1/";
const std::string kLogfile2_2 =
- aos::testing::TestTmpDir() + "/multi_logfile2.2/";
+ aos::testing::TestTmpDir() + "/logs/multi_logfile2.2/";
const std::string kLogfile3_1 =
- aos::testing::TestTmpDir() + "/multi_logfile3/";
- util::UnlinkRecursive(kLogfile1_1);
- util::UnlinkRecursive(kLogfile2_1);
- util::UnlinkRecursive(kLogfile2_2);
- util::UnlinkRecursive(kLogfile3_1);
+ aos::testing::TestTmpDir() + "/logs/multi_logfile3/";
const UUID pi1_boot0 = UUID::Random();
const UUID pi2_boot0 = UUID::Random();
const UUID pi2_boot1 = UUID::Random();
@@ -3377,13 +3367,16 @@
std::vector<std::string> filenames;
{
LoggerState pi1_logger = MakeLoggerState(
- pi1, &event_loop_factory, SupportedCompressionAlgorithms()[0]);
+ pi1, &event_loop_factory, SupportedCompressionAlgorithms()[0],
+ FileStrategy::kKeepSeparate);
LoggerState pi3_logger = MakeLoggerState(
- pi3, &event_loop_factory, SupportedCompressionAlgorithms()[0]);
+ pi3, &event_loop_factory, SupportedCompressionAlgorithms()[0],
+ FileStrategy::kKeepSeparate);
{
// And now start the logger.
LoggerState pi2_logger = MakeLoggerState(
- pi2, &event_loop_factory, SupportedCompressionAlgorithms()[0]);
+ pi2, &event_loop_factory, SupportedCompressionAlgorithms()[0],
+ FileStrategy::kKeepSeparate);
pi1_logger.StartLogger(kLogfile1_1);
pi3_logger.StartLogger(kLogfile3_1);
@@ -3411,7 +3404,8 @@
{
event_loop_factory.RunFor(chrono::milliseconds(1000));
- // Make local stuff happen before we start logging and connect the remote.
+ // Make local stuff happen before we start logging and connect the
+ // remote.
pi2->AlwaysStart<Pong>("pong");
std::unique_ptr<aos::EventLoop> ping_event_loop =
pi1->MakeEventLoop("ping");
@@ -3420,7 +3414,8 @@
// Start logging again on pi2 after it is up.
LoggerState pi2_logger = MakeLoggerState(
- pi2, &event_loop_factory, SupportedCompressionAlgorithms()[0]);
+ pi2, &event_loop_factory, SupportedCompressionAlgorithms()[0],
+ FileStrategy::kKeepSeparate);
pi2_logger.StartLogger(kLogfile2_2);
// And allow remote messages now that we have some local ones.
@@ -3438,9 +3433,10 @@
pi3_logger.AppendAllFilenames(&filenames);
}
- // Confirm that we can parse the result. LogReader has enough internal CHECKs
- // to confirm the right thing happened.
+ // Confirm that we can parse the result. LogReader has enough internal
+ // CHECKs to confirm the right thing happened.
const std::vector<LogFile> sorted_parts = SortParts(filenames);
+ EXPECT_TRUE(AllPartsMatchOutOfOrderDuration(sorted_parts));
auto result = ConfirmReadable(filenames);
EXPECT_THAT(result[0].first, ::testing::ElementsAre(realtime_clock::epoch()));
@@ -3489,6 +3485,9 @@
// Tests that setting the start and stop flags across a reboot works as
// expected.
TEST(MultinodeRebootLoggerTest, RebootStartStopTimes) {
+ util::UnlinkRecursive(aos::testing::TestTmpDir() + "/logs");
+ std::filesystem::create_directory(aos::testing::TestTmpDir() + "/logs");
+
aos::FlatbufferDetachedBuffer<aos::Configuration> config =
aos::configuration::ReadConfig(ArtifactPath(
"aos/events/logging/multinode_pingpong_split3_config.json"));
@@ -3510,17 +3509,13 @@
event_loop_factory.configuration(), pi3->node());
const std::string kLogfile1_1 =
- aos::testing::TestTmpDir() + "/multi_logfile1/";
+ aos::testing::TestTmpDir() + "/logs/multi_logfile1/";
const std::string kLogfile2_1 =
- aos::testing::TestTmpDir() + "/multi_logfile2.1/";
+ aos::testing::TestTmpDir() + "/logs/multi_logfile2.1/";
const std::string kLogfile2_2 =
- aos::testing::TestTmpDir() + "/multi_logfile2.2/";
+ aos::testing::TestTmpDir() + "/logs/multi_logfile2.2/";
const std::string kLogfile3_1 =
- aos::testing::TestTmpDir() + "/multi_logfile3/";
- util::UnlinkRecursive(kLogfile1_1);
- util::UnlinkRecursive(kLogfile2_1);
- util::UnlinkRecursive(kLogfile2_2);
- util::UnlinkRecursive(kLogfile3_1);
+ aos::testing::TestTmpDir() + "/logs/multi_logfile3/";
{
CHECK_EQ(pi1_index, 0u);
CHECK_EQ(pi2_index, 1u);
@@ -3542,13 +3537,16 @@
std::vector<std::string> filenames;
{
LoggerState pi1_logger = MakeLoggerState(
- pi1, &event_loop_factory, SupportedCompressionAlgorithms()[0]);
+ pi1, &event_loop_factory, SupportedCompressionAlgorithms()[0],
+ FileStrategy::kKeepSeparate);
LoggerState pi3_logger = MakeLoggerState(
- pi3, &event_loop_factory, SupportedCompressionAlgorithms()[0]);
+ pi3, &event_loop_factory, SupportedCompressionAlgorithms()[0],
+ FileStrategy::kKeepSeparate);
{
// And now start the logger.
LoggerState pi2_logger = MakeLoggerState(
- pi2, &event_loop_factory, SupportedCompressionAlgorithms()[0]);
+ pi2, &event_loop_factory, SupportedCompressionAlgorithms()[0],
+ FileStrategy::kKeepSeparate);
pi1_logger.StartLogger(kLogfile1_1);
pi3_logger.StartLogger(kLogfile3_1);
@@ -3572,7 +3570,8 @@
{
event_loop_factory.RunFor(chrono::milliseconds(1000));
- // Make local stuff happen before we start logging and connect the remote.
+ // Make local stuff happen before we start logging and connect the
+ // remote.
pi2->AlwaysStart<Pong>("pong");
std::unique_ptr<aos::EventLoop> ping_event_loop =
pi1->MakeEventLoop("ping");
@@ -3581,7 +3580,8 @@
// Start logging again on pi2 after it is up.
LoggerState pi2_logger = MakeLoggerState(
- pi2, &event_loop_factory, SupportedCompressionAlgorithms()[0]);
+ pi2, &event_loop_factory, SupportedCompressionAlgorithms()[0],
+ FileStrategy::kKeepSeparate);
pi2_logger.StartLogger(kLogfile2_2);
event_loop_factory.RunFor(chrono::milliseconds(5000));
@@ -3594,6 +3594,7 @@
}
const std::vector<LogFile> sorted_parts = SortParts(filenames);
+ EXPECT_TRUE(AllPartsMatchOutOfOrderDuration(sorted_parts));
auto result = ConfirmReadable(filenames);
EXPECT_THAT(result[0].first, ::testing::ElementsAre(realtime_clock::epoch()));
@@ -3645,6 +3646,9 @@
// Tests that we properly handle one direction being down.
TEST(MissingDirectionTest, OneDirection) {
+ util::UnlinkRecursive(aos::testing::TestTmpDir() + "/logs");
+ std::filesystem::create_directory(aos::testing::TestTmpDir() + "/logs");
+
aos::FlatbufferDetachedBuffer<aos::Configuration> config =
aos::configuration::ReadConfig(ArtifactPath(
"aos/events/logging/multinode_pingpong_split4_config.json"));
@@ -3679,11 +3683,9 @@
}
const std::string kLogfile2_1 =
- aos::testing::TestTmpDir() + "/multi_logfile2.1/";
+ aos::testing::TestTmpDir() + "/logs/multi_logfile2.1/";
const std::string kLogfile1_1 =
- aos::testing::TestTmpDir() + "/multi_logfile1.1/";
- util::UnlinkRecursive(kLogfile2_1);
- util::UnlinkRecursive(kLogfile1_1);
+ aos::testing::TestTmpDir() + "/logs/multi_logfile1.1/";
pi2->Disconnect(pi1->node());
@@ -3692,7 +3694,8 @@
{
LoggerState pi2_logger = MakeLoggerState(
- pi2, &event_loop_factory, SupportedCompressionAlgorithms()[0]);
+ pi2, &event_loop_factory, SupportedCompressionAlgorithms()[0],
+ FileStrategy::kKeepSeparate);
event_loop_factory.RunFor(chrono::milliseconds(95));
@@ -3703,7 +3706,8 @@
pi2->Connect(pi1->node());
LoggerState pi1_logger = MakeLoggerState(
- pi1, &event_loop_factory, SupportedCompressionAlgorithms()[0]);
+ pi1, &event_loop_factory, SupportedCompressionAlgorithms()[0],
+ FileStrategy::kKeepSeparate);
pi1_logger.StartLogger(kLogfile1_1);
event_loop_factory.RunFor(chrono::milliseconds(5000));
@@ -3712,12 +3716,16 @@
}
const std::vector<LogFile> sorted_parts = SortParts(filenames);
+ EXPECT_TRUE(AllPartsMatchOutOfOrderDuration(sorted_parts));
ConfirmReadable(filenames);
}
// Tests that we properly handle only one direction ever existing after a
// reboot.
TEST(MissingDirectionTest, OneDirectionAfterReboot) {
+ util::UnlinkRecursive(aos::testing::TestTmpDir() + "/logs");
+ std::filesystem::create_directory(aos::testing::TestTmpDir() + "/logs");
+
aos::FlatbufferDetachedBuffer<aos::Configuration> config =
aos::configuration::ReadConfig(ArtifactPath(
"aos/events/logging/multinode_pingpong_split4_config.json"));
@@ -3752,8 +3760,7 @@
}
const std::string kLogfile2_1 =
- aos::testing::TestTmpDir() + "/multi_logfile2.1/";
- util::UnlinkRecursive(kLogfile2_1);
+ aos::testing::TestTmpDir() + "/logs/multi_logfile2.1/";
pi1->AlwaysStart<Ping>("ping");
@@ -3762,7 +3769,8 @@
// second boot.
{
LoggerState pi2_logger = MakeLoggerState(
- pi2, &event_loop_factory, SupportedCompressionAlgorithms()[0]);
+ pi2, &event_loop_factory, SupportedCompressionAlgorithms()[0],
+ FileStrategy::kKeepSeparate);
event_loop_factory.RunFor(chrono::milliseconds(95));
@@ -3780,15 +3788,20 @@
}
const std::vector<LogFile> sorted_parts = SortParts(filenames);
+ EXPECT_TRUE(AllPartsMatchOutOfOrderDuration(sorted_parts));
ConfirmReadable(filenames);
}
-// Tests that we properly handle only one direction ever existing after a reboot
-// with only reliable data.
+// Tests that we properly handle only one direction ever existing after a
+// reboot with only reliable data.
TEST(MissingDirectionTest, OneDirectionAfterRebootReliable) {
+ util::UnlinkRecursive(aos::testing::TestTmpDir() + "/logs");
+ std::filesystem::create_directory(aos::testing::TestTmpDir() + "/logs");
+
aos::FlatbufferDetachedBuffer<aos::Configuration> config =
- aos::configuration::ReadConfig(ArtifactPath(
- "aos/events/logging/multinode_pingpong_split4_reliable_config.json"));
+ aos::configuration::ReadConfig(
+ ArtifactPath("aos/events/logging/"
+ "multinode_pingpong_split4_reliable_config.json"));
message_bridge::TestingTimeConverter time_converter(
configuration::NodesCount(&config.message()));
SimulatedEventLoopFactory event_loop_factory(&config.message());
@@ -3820,8 +3833,7 @@
}
const std::string kLogfile2_1 =
- aos::testing::TestTmpDir() + "/multi_logfile2.1/";
- util::UnlinkRecursive(kLogfile2_1);
+ aos::testing::TestTmpDir() + "/logs/multi_logfile2.1/";
pi1->AlwaysStart<Ping>("ping");
@@ -3830,7 +3842,8 @@
// second boot.
{
LoggerState pi2_logger = MakeLoggerState(
- pi2, &event_loop_factory, SupportedCompressionAlgorithms()[0]);
+ pi2, &event_loop_factory, SupportedCompressionAlgorithms()[0],
+ FileStrategy::kKeepSeparate);
event_loop_factory.RunFor(chrono::milliseconds(95));
@@ -3848,13 +3861,17 @@
}
const std::vector<LogFile> sorted_parts = SortParts(filenames);
+ EXPECT_TRUE(AllPartsMatchOutOfOrderDuration(sorted_parts));
ConfirmReadable(filenames);
}
-// Tests that we properly handle only one direction ever existing after a reboot
-// with mixed unreliable vs reliable, where reliable has an earlier timestamp
-// than unreliable.
+// Tests that we properly handle only one direction ever existing after a
+// reboot with mixed unreliable vs reliable, where reliable has an earlier
+// timestamp than unreliable.
TEST(MissingDirectionTest, OneDirectionAfterRebootMixedCase1) {
+ util::UnlinkRecursive(aos::testing::TestTmpDir() + "/logs");
+ std::filesystem::create_directory(aos::testing::TestTmpDir() + "/logs");
+
aos::FlatbufferDetachedBuffer<aos::Configuration> config =
aos::configuration::ReadConfig(ArtifactPath(
"aos/events/logging/multinode_pingpong_split4_mixed1_config.json"));
@@ -3889,8 +3906,7 @@
}
const std::string kLogfile2_1 =
- aos::testing::TestTmpDir() + "/multi_logfile2.1/";
- util::UnlinkRecursive(kLogfile2_1);
+ aos::testing::TestTmpDir() + "/logs/multi_logfile2.1/";
// The following sequence using the above reference config creates
// a reliable message timestamp < unreliable message timestamp.
@@ -3909,7 +3925,8 @@
event_loop_factory.RunFor(chrono::milliseconds(1000));
LoggerState pi2_logger = MakeLoggerState(
- pi2, &event_loop_factory, SupportedCompressionAlgorithms()[0]);
+ pi2, &event_loop_factory, SupportedCompressionAlgorithms()[0],
+ FileStrategy::kKeepSeparate);
pi2_logger.StartLogger(kLogfile2_1);
@@ -3918,13 +3935,17 @@
}
const std::vector<LogFile> sorted_parts = SortParts(filenames);
+ EXPECT_TRUE(AllPartsMatchOutOfOrderDuration(sorted_parts));
ConfirmReadable(filenames);
}
-// Tests that we properly handle only one direction ever existing after a reboot
-// with mixed unreliable vs reliable, where unreliable has an earlier timestamp
-// than reliable.
+// Tests that we properly handle only one direction ever existing after a
+// reboot with mixed unreliable vs reliable, where unreliable has an earlier
+// timestamp than reliable.
TEST(MissingDirectionTest, OneDirectionAfterRebootMixedCase2) {
+ util::UnlinkRecursive(aos::testing::TestTmpDir() + "/logs");
+ std::filesystem::create_directory(aos::testing::TestTmpDir() + "/logs");
+
aos::FlatbufferDetachedBuffer<aos::Configuration> config =
aos::configuration::ReadConfig(ArtifactPath(
"aos/events/logging/multinode_pingpong_split4_mixed2_config.json"));
@@ -3959,8 +3980,7 @@
}
const std::string kLogfile2_1 =
- aos::testing::TestTmpDir() + "/multi_logfile2.1/";
- util::UnlinkRecursive(kLogfile2_1);
+ aos::testing::TestTmpDir() + "/logs/multi_logfile2.1/";
// The following sequence using the above reference config creates
// an unreliable message timestamp < reliable message timestamp.
@@ -3979,7 +3999,8 @@
event_loop_factory.RunFor(chrono::milliseconds(1000));
LoggerState pi2_logger = MakeLoggerState(
- pi2, &event_loop_factory, SupportedCompressionAlgorithms()[0]);
+ pi2, &event_loop_factory, SupportedCompressionAlgorithms()[0],
+ FileStrategy::kKeepSeparate);
pi2_logger.StartLogger(kLogfile2_1);
@@ -3988,15 +4009,17 @@
}
const std::vector<LogFile> sorted_parts = SortParts(filenames);
+ EXPECT_TRUE(AllPartsMatchOutOfOrderDuration(sorted_parts));
ConfirmReadable(filenames);
}
// Tests that we properly handle what used to be a time violation in one
// direction. This can occur when one direction goes down after sending some
-// data, but the other keeps working. The down direction ends up resolving to a
-// straight line in the noncausal filter, where the direction which is still up
-// can cross that line. Really, time progressed along just fine but we assumed
-// that the offset was a line when it could have deviated by up to 1ms/second.
+// data, but the other keeps working. The down direction ends up resolving to
+// a straight line in the noncausal filter, where the direction which is still
+// up can cross that line. Really, time progressed along just fine but we
+// assumed that the offset was a line when it could have deviated by up to
+// 1ms/second.
TEST_P(MultinodeLoggerTest, OneDirectionTimeDrift) {
std::vector<std::string> filenames;
@@ -4022,8 +4045,7 @@
chrono::milliseconds(10000) + chrono::milliseconds(5)});
const std::string kLogfile =
- aos::testing::TestTmpDir() + "/multi_logfile2.1/";
- util::UnlinkRecursive(kLogfile);
+ aos::testing::TestTmpDir() + "/logs/multi_logfile2.1/";
{
LoggerState pi2_logger = MakeLogger(pi2_);
@@ -4040,12 +4062,13 @@
}
const std::vector<LogFile> sorted_parts = SortParts(filenames);
+ EXPECT_TRUE(AllPartsMatchOutOfOrderDuration(sorted_parts));
ConfirmReadable(filenames);
}
-// Tests that we can replay a logfile that has timestamps such that at least one
-// node's epoch is at a positive distributed_clock (and thus will have to be
-// booted after the other node(s)).
+// Tests that we can replay a logfile that has timestamps such that at least
+// one node's epoch is at a positive distributed_clock (and thus will have to
+// be booted after the other node(s)).
TEST_P(MultinodeLoggerTest, StartOneNodeBeforeOther) {
std::vector<std::string> filenames;
@@ -4064,8 +4087,7 @@
{chrono::milliseconds(10000), chrono::milliseconds(10000)});
const std::string kLogfile =
- aos::testing::TestTmpDir() + "/multi_logfile2.1/";
- util::UnlinkRecursive(kLogfile);
+ aos::testing::TestTmpDir() + "/logs/multi_logfile2.1/";
pi2_->Disconnect(pi1_->node());
pi1_->Disconnect(pi2_->node());
@@ -4085,6 +4107,7 @@
}
const std::vector<LogFile> sorted_parts = SortParts(filenames);
+ EXPECT_TRUE(AllPartsMatchOutOfOrderDuration(sorted_parts));
ConfirmReadable(filenames);
{
@@ -4110,12 +4133,16 @@
}
}
-// Tests that when we have a loop without all the logs at all points in time, we
-// can sort it properly.
+// Tests that when we have a loop without all the logs at all points in time,
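+// Like AllPartsMatchOutOfOrderDuration above, but parts from boot 0 of
+// 'node' are exempt from the check.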
+// we can sort it properly.
TEST(MultinodeLoggerLoopTest, Loop) {
+ util::UnlinkRecursive(aos::testing::TestTmpDir() + "/logs");
+ std::filesystem::create_directory(aos::testing::TestTmpDir() + "/logs");
+
aos::FlatbufferDetachedBuffer<aos::Configuration> config =
- aos::configuration::ReadConfig(ArtifactPath(
- "aos/events/logging/multinode_pingpong_triangle_split_config.json"));
+ aos::configuration::ReadConfig(
+ ArtifactPath("aos/events/logging/"
+ "multinode_pingpong_triangle_split_config.json"));
message_bridge::TestingTimeConverter time_converter(
configuration::NodesCount(&config.message()));
SimulatedEventLoopFactory event_loop_factory(&config.message());
@@ -4129,14 +4156,11 @@
event_loop_factory.GetNodeEventLoopFactory("pi3");
const std::string kLogfile1_1 =
- aos::testing::TestTmpDir() + "/multi_logfile1/";
+ aos::testing::TestTmpDir() + "/logs/multi_logfile1/";
const std::string kLogfile2_1 =
- aos::testing::TestTmpDir() + "/multi_logfile2/";
+ aos::testing::TestTmpDir() + "/logs/multi_logfile2/";
const std::string kLogfile3_1 =
- aos::testing::TestTmpDir() + "/multi_logfile3/";
- util::UnlinkRecursive(kLogfile1_1);
- util::UnlinkRecursive(kLogfile2_1);
- util::UnlinkRecursive(kLogfile3_1);
+ aos::testing::TestTmpDir() + "/logs/multi_logfile3/";
{
// Make pi1 boot before everything else.
@@ -4147,9 +4171,9 @@
BootTimestamp::epoch() - chrono::milliseconds(300)});
}
- // We want to setup a situation such that 2 of the 3 legs of the loop are very
- // confident about time being X, and the third leg is pulling the average off
- // to one side.
+ // We want to set up a situation such that 2 of the 3 legs of the loop are
+ // very confident about time being X, and the third leg is pulling the
+ // average off to one side.
//
// It's easiest to visualize this in timestamp_plotter.
@@ -4168,24 +4192,28 @@
CHECK_EQ(builder.Send(ping_builder.Finish()), RawSender::Error::kOk);
}
- // Wait a while so there's enough data to let the worst case be rather off.
+ // Wait a while so there's enough data to let the worst case be rather
+ // off.
event_loop_factory.RunFor(chrono::seconds(1000));
- // Now start a receiving node first. This sets up 2 tight bounds between 2
- // of the nodes.
+ // Now start a receiving node first. This sets up 2 tight bounds between
+ // 2 of the nodes.
LoggerState pi2_logger = MakeLoggerState(
- pi2, &event_loop_factory, SupportedCompressionAlgorithms()[0]);
+ pi2, &event_loop_factory, SupportedCompressionAlgorithms()[0],
+ FileStrategy::kKeepSeparate);
pi2_logger.StartLogger(kLogfile2_1);
event_loop_factory.RunFor(chrono::seconds(100));
// And now start the third leg.
LoggerState pi3_logger = MakeLoggerState(
- pi3, &event_loop_factory, SupportedCompressionAlgorithms()[0]);
+ pi3, &event_loop_factory, SupportedCompressionAlgorithms()[0],
+ FileStrategy::kKeepSeparate);
pi3_logger.StartLogger(kLogfile3_1);
LoggerState pi1_logger = MakeLoggerState(
- pi1, &event_loop_factory, SupportedCompressionAlgorithms()[0]);
+ pi1, &event_loop_factory, SupportedCompressionAlgorithms()[0],
+ FileStrategy::kKeepSeparate);
pi1_logger.StartLogger(kLogfile1_1);
event_loop_factory.RunFor(chrono::seconds(100));
@@ -4197,13 +4225,14 @@
// Make sure we can read this.
const std::vector<LogFile> sorted_parts = SortParts(filenames);
+ EXPECT_TRUE(AllPartsMatchOutOfOrderDuration(sorted_parts));
auto result = ConfirmReadable(filenames);
}
// Tests that RestartLogging works in the simple case. Unfortunately, the
-// failure cases involve simulating time elapsing in callbacks, which is really
-// hard. The best we can reasonably do is make sure 2 back to back logs are
-// parseable together.
+// failure cases involve simulating time elapsing in callbacks, which is
+// really hard. The best we can reasonably do is make sure 2 back-to-back
+// logs are parseable together.
TEST_P(MultinodeLoggerTest, RestartLogging) {
time_converter_.AddMonotonic(
{BootTimestamp::epoch(), BootTimestamp::epoch() + chrono::seconds(1000)});
@@ -4216,18 +4245,19 @@
StartLogger(&pi1_logger, logfile_base1_);
aos::monotonic_clock::time_point last_rotation_time =
pi1_logger.event_loop->monotonic_now();
- pi1_logger.logger->set_on_logged_period([&] {
- const auto now = pi1_logger.event_loop->monotonic_now();
- if (now > last_rotation_time + std::chrono::seconds(5)) {
- pi1_logger.AppendAllFilenames(&filenames);
- std::unique_ptr<MultiNodeFilesLogNamer> namer =
- pi1_logger.MakeLogNamer(logfile_base2_);
- pi1_logger.log_namer = namer.get();
+ pi1_logger.logger->set_on_logged_period(
+ [&](aos::monotonic_clock::time_point) {
+ const auto now = pi1_logger.event_loop->monotonic_now();
+ if (now > last_rotation_time + std::chrono::seconds(5)) {
+ pi1_logger.AppendAllFilenames(&filenames);
+ std::unique_ptr<MultiNodeFilesLogNamer> namer =
+ pi1_logger.MakeLogNamer(logfile_base2_);
+ pi1_logger.log_namer = namer.get();
- pi1_logger.logger->RestartLogging(std::move(namer));
- last_rotation_time = now;
- }
- });
+ pi1_logger.logger->RestartLogging(std::move(namer));
+ last_rotation_time = now;
+ }
+ });
event_loop_factory_.RunFor(chrono::milliseconds(7000));
@@ -4242,8 +4272,414 @@
ConfirmReadable(filenames);
- // TODO(austin): It would be good to confirm that any one time messages end up
- // in both logs correctly.
+ // TODO(austin): It would be good to confirm that any one-time messages end
+ // up in both logs correctly.
+}
+
+// Tests that we call OnEnd without --skip_missing_forwarding_entries.
+TEST_P(MultinodeLoggerTest, SkipMissingForwardingEntries) {
+ if (file_strategy() == FileStrategy::kCombine) {
+ GTEST_SKIP() << "We don't need to test the combined file writer this deep.";
+ }
+ time_converter_.AddMonotonic(
+ {BootTimestamp::epoch(), BootTimestamp::epoch() + chrono::seconds(1000)});
+
+ std::vector<std::string> filenames;
+ {
+ LoggerState pi1_logger = MakeLogger(pi1_);
+
+ event_loop_factory_.RunFor(chrono::milliseconds(95));
+
+ StartLogger(&pi1_logger);
+ aos::monotonic_clock::time_point last_rotation_time =
+ pi1_logger.event_loop->monotonic_now();
+ pi1_logger.logger->set_on_logged_period(
+ [&](aos::monotonic_clock::time_point) {
+ const auto now = pi1_logger.event_loop->monotonic_now();
+ if (now > last_rotation_time + std::chrono::seconds(5)) {
+ pi1_logger.logger->Rotate();
+ last_rotation_time = now;
+ }
+ });
+
+ event_loop_factory_.RunFor(chrono::milliseconds(15000));
+ pi1_logger.AppendAllFilenames(&filenames);
+ }
+
+ // If we remove the last remote data part, we'll trigger missing data for
+ // timestamps.
+ filenames.erase(std::remove_if(filenames.begin(), filenames.end(),
+ [](const std::string &s) {
+ return s.find("data/pi2_data.part3.bfbs") !=
+ std::string::npos;
+ }),
+ filenames.end());
+
+ auto result = ConfirmReadable(filenames);
+}
+
+// Tests that logging and reading back work with a multi-node style config
+// that contains only a single node.
+TEST(MultinodeLoggerConfigTest, SingleNode) {
+ util::UnlinkRecursive(aos::testing::TestTmpDir() + "/logs");
+ std::filesystem::create_directory(aos::testing::TestTmpDir() + "/logs");
+
+ aos::FlatbufferDetachedBuffer<aos::Configuration> config =
+ aos::configuration::ReadConfig(
+ ArtifactPath("aos/events/logging/multinode_single_node_config.json"));
+ message_bridge::TestingTimeConverter time_converter(
+ configuration::NodesCount(&config.message()));
+ SimulatedEventLoopFactory event_loop_factory(&config.message());
+ event_loop_factory.SetTimeConverter(&time_converter);
+
+ time_converter.StartEqual();
+
+ const std::string kLogfile1_1 =
+ aos::testing::TestTmpDir() + "/logs/multi_logfile1/";
+
+ NodeEventLoopFactory *const pi1 =
+ event_loop_factory.GetNodeEventLoopFactory("pi1");
+
+ std::vector<std::string> filenames;
+
+ {
+ // Start logging on pi1, the only node in this config.
+ LoggerState pi1_logger = MakeLoggerState(
+ pi1, &event_loop_factory, SupportedCompressionAlgorithms()[0],
+ FileStrategy::kKeepSeparate);
+ pi1_logger.StartLogger(kLogfile1_1);
+
+ event_loop_factory.RunFor(chrono::seconds(10));
+
+ pi1_logger.AppendAllFilenames(&filenames);
+ }
+
+ // Make sure we can read this.
+ const std::vector<LogFile> sorted_parts = SortParts(filenames);
+ EXPECT_TRUE(AllPartsMatchOutOfOrderDuration(sorted_parts));
+ auto result = ConfirmReadable(filenames);
+
+ // TODO(austin): Probably want to stop caring about ServerStatistics,
+ // ClientStatistics, and Timestamp since they don't really make sense.
+}
+
+// Tests that when we have evidence of 2 boots, and then start logging, the
+// max_out_of_order_duration ends up reasonable on the boot with the start time.
+TEST(MultinodeLoggerLoopTest, PreviousBootData) {
+ util::UnlinkRecursive(aos::testing::TestTmpDir() + "/logs");
+ std::filesystem::create_directory(aos::testing::TestTmpDir() + "/logs");
+
+ aos::FlatbufferDetachedBuffer<aos::Configuration> config =
+ aos::configuration::ReadConfig(ArtifactPath(
+ "aos/events/logging/multinode_pingpong_reboot_ooo_config.json"));
+ message_bridge::TestingTimeConverter time_converter(
+ configuration::NodesCount(&config.message()));
+ SimulatedEventLoopFactory event_loop_factory(&config.message());
+ event_loop_factory.SetTimeConverter(&time_converter);
+
+ const UUID pi1_boot0 = UUID::Random();
+ const UUID pi2_boot0 = UUID::Random();
+ const UUID pi2_boot1 = UUID::Random();
+
+ const std::string kLogfile1_1 =
+ aos::testing::TestTmpDir() + "/logs/multi_logfile1/";
+
+ {
+ constexpr size_t kPi1Index = 0;
+ constexpr size_t kPi2Index = 1;
+ time_converter.set_boot_uuid(kPi1Index, 0, pi1_boot0);
+ time_converter.set_boot_uuid(kPi2Index, 0, pi2_boot0);
+ time_converter.set_boot_uuid(kPi2Index, 1, pi2_boot1);
+
+ // Make pi1 boot before everything else.
+ time_converter.AddNextTimestamp(
+ distributed_clock::epoch(),
+ {BootTimestamp::epoch(),
+ BootTimestamp::epoch() - chrono::milliseconds(100)});
+
+ const chrono::nanoseconds reboot_time = chrono::seconds(1005);
+ time_converter.AddNextTimestamp(
+ distributed_clock::epoch() + reboot_time,
+ {BootTimestamp::epoch() + reboot_time,
+ BootTimestamp{.boot = 1, .time = monotonic_clock::epoch()}});
+ }
+
+ NodeEventLoopFactory *const pi1 =
+ event_loop_factory.GetNodeEventLoopFactory("pi1");
+ NodeEventLoopFactory *const pi2 =
+ event_loop_factory.GetNodeEventLoopFactory("pi2");
+
+ // What we want is for pi2 to send a message at t=1000 on the first channel
+ // (/atest1 pong), and t=1 on the second channel (/atest3 pong). That'll
+ // make the max out of order duration large.
+ //
+ // Then, we reboot, and only send messages on a third channel (/atest2
+ // pong). The order is key: they need to sort in this order in the config.
+
+ std::vector<std::string> filenames;
+ {
+ {
+ std::unique_ptr<EventLoop> pi2_event_loop = pi2->MakeEventLoop("pong");
+ aos::Sender<examples::Pong> pong_sender =
+ pi2_event_loop->MakeSender<examples::Pong>("/atest3");
+
+ pi2_event_loop->OnRun([&]() {
+ aos::Sender<examples::Pong>::Builder builder =
+ pong_sender.MakeBuilder();
+ examples::Pong::Builder pong_builder =
+ builder.MakeBuilder<examples::Pong>();
+ CHECK_EQ(builder.Send(pong_builder.Finish()), RawSender::Error::kOk);
+ });
+
+ event_loop_factory.RunFor(chrono::seconds(1000));
+ }
+
+ {
+ std::unique_ptr<EventLoop> pi2_event_loop = pi2->MakeEventLoop("pong");
+ aos::Sender<examples::Pong> pong_sender =
+ pi2_event_loop->MakeSender<examples::Pong>("/atest1");
+
+ aos::Sender<examples::Pong>::Builder builder = pong_sender.MakeBuilder();
+ examples::Pong::Builder pong_builder =
+ builder.MakeBuilder<examples::Pong>();
+ CHECK_EQ(builder.Send(pong_builder.Finish()), RawSender::Error::kOk);
+ }
+
+ event_loop_factory.RunFor(chrono::seconds(10));
+
+ // Now start logging on pi1, after the reboot and the existing traffic.
+ LoggerState pi1_logger = MakeLoggerState(
+ pi1, &event_loop_factory, SupportedCompressionAlgorithms()[0],
+ FileStrategy::kKeepSeparate);
+ pi1_logger.StartLogger(kLogfile1_1);
+
+ std::unique_ptr<EventLoop> pi2_event_loop = pi2->MakeEventLoop("pong");
+ aos::Sender<examples::Pong> pong_sender =
+ pi2_event_loop->MakeSender<examples::Pong>("/atest2");
+
+ pi2_event_loop->AddPhasedLoop(
+ [&pong_sender](int) {
+ aos::Sender<examples::Pong>::Builder builder =
+ pong_sender.MakeBuilder();
+ examples::Pong::Builder pong_builder =
+ builder.MakeBuilder<examples::Pong>();
+ CHECK_EQ(builder.Send(pong_builder.Finish()), RawSender::Error::kOk);
+ },
+ chrono::milliseconds(10));
+
+ event_loop_factory.RunFor(chrono::seconds(100));
+
+ pi1_logger.AppendAllFilenames(&filenames);
+ }
+
+ // Make sure we can read this.
+ const std::vector<LogFile> sorted_parts = SortParts(filenames);
+ EXPECT_TRUE(AllRebootPartsMatchOutOfOrderDuration(sorted_parts, "pi2"));
+ auto result = ConfirmReadable(filenames);
+}
+
+// Tests that when we start without a connection, and then start logging, the
+// max_out_of_order_duration ends up reasonable.
+TEST(MultinodeLoggerLoopTest, StartDisconnected) {
+ util::UnlinkRecursive(aos::testing::TestTmpDir() + "/logs");
+ std::filesystem::create_directory(aos::testing::TestTmpDir() + "/logs");
+
+ aos::FlatbufferDetachedBuffer<aos::Configuration> config =
+ aos::configuration::ReadConfig(ArtifactPath(
+ "aos/events/logging/multinode_pingpong_reboot_ooo_config.json"));
+ message_bridge::TestingTimeConverter time_converter(
+ configuration::NodesCount(&config.message()));
+ SimulatedEventLoopFactory event_loop_factory(&config.message());
+ event_loop_factory.SetTimeConverter(&time_converter);
+
+ time_converter.StartEqual();
+
+ const std::string kLogfile1_1 =
+ aos::testing::TestTmpDir() + "/logs/multi_logfile1/";
+
+ NodeEventLoopFactory *const pi1 =
+ event_loop_factory.GetNodeEventLoopFactory("pi1");
+ NodeEventLoopFactory *const pi2 =
+ event_loop_factory.GetNodeEventLoopFactory("pi2");
+
+ // What we want is for pi2 to send a message at t=1000 on the first channel
+ // (/atest1 pong), and t=1 on the second channel (/atest3 pong). That'll
+ // make the max out of order duration large.
+ //
+ // Then, we disconnect, and only send messages on a third channel
+ // (/atest2 pong). The order is key: they need to sort in this order in the
+ // config so we observe them in the order which grows the
+ // max_out_of_order_duration.
+
+ std::vector<std::string> filenames;
+ {
+ {
+ std::unique_ptr<EventLoop> pi2_event_loop = pi2->MakeEventLoop("pong");
+ aos::Sender<examples::Pong> pong_sender =
+ pi2_event_loop->MakeSender<examples::Pong>("/atest3");
+
+ pi2_event_loop->OnRun([&]() {
+ aos::Sender<examples::Pong>::Builder builder =
+ pong_sender.MakeBuilder();
+ examples::Pong::Builder pong_builder =
+ builder.MakeBuilder<examples::Pong>();
+ CHECK_EQ(builder.Send(pong_builder.Finish()), RawSender::Error::kOk);
+ });
+
+ event_loop_factory.RunFor(chrono::seconds(1000));
+ }
+
+ {
+ std::unique_ptr<EventLoop> pi2_event_loop = pi2->MakeEventLoop("pong");
+ aos::Sender<examples::Pong> pong_sender =
+ pi2_event_loop->MakeSender<examples::Pong>("/atest1");
+
+ aos::Sender<examples::Pong>::Builder builder = pong_sender.MakeBuilder();
+ examples::Pong::Builder pong_builder =
+ builder.MakeBuilder<examples::Pong>();
+ CHECK_EQ(builder.Send(pong_builder.Finish()), RawSender::Error::kOk);
+ }
+
+ event_loop_factory.RunFor(chrono::seconds(10));
+
+ pi1->Disconnect(pi2->node());
+ pi2->Disconnect(pi1->node());
+
+ // Make data flow.
+ std::unique_ptr<EventLoop> pi2_event_loop = pi2->MakeEventLoop("pong");
+ aos::Sender<examples::Pong> pong_sender =
+ pi2_event_loop->MakeSender<examples::Pong>("/atest2");
+
+ pi2_event_loop->AddPhasedLoop(
+ [&pong_sender](int) {
+ aos::Sender<examples::Pong>::Builder builder =
+ pong_sender.MakeBuilder();
+ examples::Pong::Builder pong_builder =
+ builder.MakeBuilder<examples::Pong>();
+ CHECK_EQ(builder.Send(pong_builder.Finish()), RawSender::Error::kOk);
+ },
+ chrono::milliseconds(10));
+
+ event_loop_factory.RunFor(chrono::seconds(10));
+
+ // Now start logging on pi1, while the nodes are still disconnected.
+ LoggerState pi1_logger = MakeLoggerState(
+ pi1, &event_loop_factory, SupportedCompressionAlgorithms()[0],
+ FileStrategy::kKeepSeparate);
+ pi1_logger.StartLogger(kLogfile1_1);
+
+ event_loop_factory.RunFor(chrono::seconds(10));
+
+ // Now, reconnect, and everything should recover.
+ pi1->Connect(pi2->node());
+ pi2->Connect(pi1->node());
+
+ event_loop_factory.RunFor(chrono::seconds(10));
+
+ pi1_logger.AppendAllFilenames(&filenames);
+ }
+
+ // Make sure we can read this.
+ const std::vector<LogFile> sorted_parts = SortParts(filenames);
+ EXPECT_TRUE(AllPartsMatchOutOfOrderDuration(sorted_parts));
+ auto result = ConfirmReadable(filenames);
+}
+
+// Class to spam Pong messages blindly.
+class PongSender {
+ public:
+ PongSender(EventLoop *loop, std::string_view channel_name)
+ : sender_(loop->MakeSender<examples::Pong>(channel_name)) {
+ loop->AddPhasedLoop(
+ [this](int) {
+ aos::Sender<examples::Pong>::Builder builder = sender_.MakeBuilder();
+ examples::Pong::Builder pong_builder =
+ builder.MakeBuilder<examples::Pong>();
+ CHECK_EQ(builder.Send(pong_builder.Finish()), RawSender::Error::kOk);
+ },
+ chrono::milliseconds(10));
+ }
+
+ private:
+ aos::Sender<examples::Pong> sender_;
+};
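+// PongSender is handed to NodeEventLoopFactory::AlwaysStart below, so a
+// fresh instance comes up (and keeps sending) on each boot of the node.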
+
+// Tests that we log correctly as nodes connect slowly.
+TEST(MultinodeLoggerLoopTest, StaggeredConnect) {
+ util::UnlinkRecursive(aos::testing::TestTmpDir() + "/logs");
+ std::filesystem::create_directory(aos::testing::TestTmpDir() + "/logs");
+
+ aos::FlatbufferDetachedBuffer<aos::Configuration> config =
+ aos::configuration::ReadConfig(ArtifactPath(
+ "aos/events/logging/multinode_pingpong_pi3_pingpong_config.json"));
+ message_bridge::TestingTimeConverter time_converter(
+ configuration::NodesCount(&config.message()));
+ SimulatedEventLoopFactory event_loop_factory(&config.message());
+ event_loop_factory.SetTimeConverter(&time_converter);
+
+ time_converter.StartEqual();
+
+ const std::string kLogfile1_1 =
+ aos::testing::TestTmpDir() + "/logs/multi_logfile1/";
+
+ NodeEventLoopFactory *const pi1 =
+ event_loop_factory.GetNodeEventLoopFactory("pi1");
+ NodeEventLoopFactory *const pi2 =
+ event_loop_factory.GetNodeEventLoopFactory("pi2");
+ NodeEventLoopFactory *const pi3 =
+ event_loop_factory.GetNodeEventLoopFactory("pi3");
+
+ // Start with pi2 and pi3 disconnected from pi1 while PongSenders spam
+ // /test2 and /test3. Then start logging on pi1 and reconnect the nodes one
+ // at a time, confirming the log stays readable with connections arriving
+ // staggered.
+
+ pi1->Disconnect(pi2->node());
+ pi2->Disconnect(pi1->node());
+
+ pi1->Disconnect(pi3->node());
+ pi3->Disconnect(pi1->node());
+
+ std::vector<std::string> filenames;
+ pi2->AlwaysStart<PongSender>("pongsender", "/test2");
+ pi3->AlwaysStart<PongSender>("pongsender", "/test3");
+
+ event_loop_factory.RunFor(chrono::seconds(10));
+
+ {
+ // Now start logging on pi1, while pi2 and pi3 are still disconnected.
+ LoggerState pi1_logger = MakeLoggerState(
+ pi1, &event_loop_factory, SupportedCompressionAlgorithms()[0],
+ FileStrategy::kKeepSeparate);
+ pi1_logger.StartLogger(kLogfile1_1);
+
+ event_loop_factory.RunFor(chrono::seconds(10));
+
+ // Now, reconnect, and everything should recover.
+ pi1->Connect(pi2->node());
+ pi2->Connect(pi1->node());
+
+ event_loop_factory.RunFor(chrono::seconds(10));
+
+ pi1->Connect(pi3->node());
+ pi3->Connect(pi1->node());
+
+ event_loop_factory.RunFor(chrono::seconds(10));
+
+ pi1_logger.AppendAllFilenames(&filenames);
+ }
+
+ // Make sure we can read this.
+ const std::vector<LogFile> sorted_parts = SortParts(filenames);
+ EXPECT_TRUE(AllPartsMatchOutOfOrderDuration(sorted_parts));
+ auto result = ConfirmReadable(filenames);
}
} // namespace testing
diff --git a/aos/events/logging/multinode_logger_test_lib.cc b/aos/events/logging/multinode_logger_test_lib.cc
index fdee4d8..64dd1cd 100644
--- a/aos/events/logging/multinode_logger_test_lib.cc
+++ b/aos/events/logging/multinode_logger_test_lib.cc
@@ -3,11 +3,14 @@
#include "aos/events/event_loop.h"
#include "aos/events/logging/log_reader.h"
#include "aos/events/logging/logfile_utils.h"
+#include "aos/events/logging/logfile_validator.h"
#include "aos/events/ping_lib.h"
#include "aos/events/pong_lib.h"
#include "aos/events/simulated_event_loop.h"
#include "aos/testing/tmpdir.h"
+DECLARE_bool(force_timestamp_loading);
+
namespace aos {
namespace logger {
namespace testing {
@@ -17,6 +20,7 @@
LoggerState MakeLoggerState(NodeEventLoopFactory *node,
SimulatedEventLoopFactory *factory,
CompressionParams params,
+ FileStrategy file_strategy,
const Configuration *configuration) {
if (configuration == nullptr) {
configuration = factory->configuration();
@@ -26,14 +30,18 @@
configuration,
configuration::GetNode(configuration, node->node()),
nullptr,
- params};
+ params,
+ file_strategy};
}
std::unique_ptr<MultiNodeFilesLogNamer> LoggerState::MakeLogNamer(
std::string logfile_base) {
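+ // kCombine exercises MinimalFileMultiNodeLogNamer, which folds data and
+ // timestamps into fewer files per node, while kKeepSeparate uses the
+ // per-stream MultiNodeFilesLogNamer layout.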
std::unique_ptr<MultiNodeFilesLogNamer> namer =
- std::make_unique<MultiNodeFilesLogNamer>(logfile_base, configuration,
- event_loop.get(), node);
+ file_strategy == FileStrategy::kCombine
+ ? std::make_unique<MinimalFileMultiNodeLogNamer>(
+ logfile_base, configuration, event_loop.get(), node)
+ : std::make_unique<MultiNodeFilesLogNamer>(
+ logfile_base, configuration, event_loop.get(), node);
namer->set_extension(params.extension);
namer->set_encoder_factory(params.encoder_factory);
return namer;
@@ -94,32 +102,24 @@
pi2_index_(configuration::GetNodeIndex(
event_loop_factory_.configuration(), pi2_->node())),
tmp_dir_(aos::testing::TestTmpDir()),
- logfile_base1_(tmp_dir_ + "/multi_logfile1"),
- logfile_base2_(tmp_dir_ + "/multi_logfile2"),
+ logfile_base1_(tmp_dir_ + "/logs/multi_logfile1"),
+ logfile_base2_(tmp_dir_ + "/logs/multi_logfile2"),
pi1_reboot_logfiles_(MakePi1RebootLogfiles()),
logfiles_(MakeLogFiles(logfile_base1_, logfile_base2_)),
- pi1_single_direction_logfiles_(MakePi1SingleDirectionLogfiles()),
structured_logfiles_(StructureLogFiles()) {
+ FLAGS_force_timestamp_loading =
+ std::get<0>(GetParam()).timestamp_buffering ==
+ ForceTimestampBuffering::kForceBufferTimestamps;
+
+ util::UnlinkRecursive(tmp_dir_ + "/logs");
+ std::filesystem::create_directory(tmp_dir_ + "/logs");
+
LOG(INFO) << "Config " << std::get<0>(GetParam()).config;
event_loop_factory_.SetTimeConverter(&time_converter_);
- // Go through and remove the logfiles if they already exist.
- for (const auto &file : logfiles_) {
- unlink(file.c_str());
- unlink((file + ".xz").c_str());
- }
-
- for (const auto &file : MakeLogFiles(tmp_dir_ + "/relogged1",
- tmp_dir_ + "/relogged2", 3, 3, true)) {
- unlink(file.c_str());
- }
-
- for (const auto &file : pi1_reboot_logfiles_) {
- unlink(file.c_str());
- }
-
LOG(INFO) << "Logging data to " << logfiles_[0] << ", " << logfiles_[1]
- << " and " << logfiles_[2];
+ << " and " << logfiles_[2] << " shared? " << shared()
+ << " combine? " << (file_strategy() == FileStrategy::kCombine);
pi1_->OnStartup([this]() {
pi1_->AlwaysStart<Ping>("ping");
@@ -135,90 +135,72 @@
return std::get<0>(GetParam()).shared;
}
+FileStrategy MultinodeLoggerTest::file_strategy() const {
+ return std::get<0>(GetParam()).file_strategy;
+}
+
std::vector<std::string> MultinodeLoggerTest::MakeLogFiles(
std::string logfile_base1, std::string logfile_base2, size_t pi1_data_count,
- size_t pi2_data_count, bool relogged_config) {
+ size_t pi2_data_count, size_t pi1_timestamps_count,
+ size_t pi2_timestamps_count, bool relogged_config) {
std::string_view sha256 = relogged_config
? std::get<0>(GetParam()).relogged_sha256
: std::get<0>(GetParam()).sha256;
std::vector<std::string> result;
result.emplace_back(absl::StrCat(logfile_base1, "_", sha256, Extension()));
result.emplace_back(absl::StrCat(logfile_base2, "_", sha256, Extension()));
- for (size_t i = 0; i < pi1_data_count; ++i) {
- result.emplace_back(
- absl::StrCat(logfile_base1, "_pi1_data.part", i, Extension()));
- }
- result.emplace_back(logfile_base1 + "_pi2_data/test/aos.examples.Pong.part0" +
- Extension());
- result.emplace_back(logfile_base1 + "_pi2_data/test/aos.examples.Pong.part1" +
- Extension());
- for (size_t i = 0; i < pi2_data_count; ++i) {
- result.emplace_back(
- absl::StrCat(logfile_base2, "_pi2_data.part", i, Extension()));
- }
- result.emplace_back(logfile_base2 +
- "_pi1_data/pi1/aos/aos.message_bridge.Timestamp.part0" +
- Extension());
- result.emplace_back(logfile_base2 +
- "_pi1_data/pi1/aos/aos.message_bridge.Timestamp.part1" +
- Extension());
- result.emplace_back(logfile_base1 +
- "_pi2_data/pi2/aos/aos.message_bridge.Timestamp.part0" +
- Extension());
- result.emplace_back(logfile_base1 +
- "_pi2_data/pi2/aos/aos.message_bridge.Timestamp.part1" +
- Extension());
- if (shared()) {
- result.emplace_back(logfile_base1 +
- "_timestamps/pi1/aos/remote_timestamps/pi2/"
- "aos.message_bridge.RemoteMessage.part0" +
- Extension());
- result.emplace_back(logfile_base1 +
- "_timestamps/pi1/aos/remote_timestamps/pi2/"
- "aos.message_bridge.RemoteMessage.part1" +
- Extension());
- result.emplace_back(logfile_base1 +
- "_timestamps/pi1/aos/remote_timestamps/pi2/"
- "aos.message_bridge.RemoteMessage.part2" +
- Extension());
- result.emplace_back(logfile_base2 +
- "_timestamps/pi2/aos/remote_timestamps/pi1/"
- "aos.message_bridge.RemoteMessage.part0" +
- Extension());
- result.emplace_back(logfile_base2 +
- "_timestamps/pi2/aos/remote_timestamps/pi1/"
- "aos.message_bridge.RemoteMessage.part1" +
- Extension());
+
+ if (file_strategy() == FileStrategy::kCombine) {
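+ // The combined writer emits one "<base>_<logger node>_<source node>_all"
+ // stream per source node, holding both data and timestamps.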
+ for (size_t i = 0; i < pi1_data_count + pi1_timestamps_count; ++i) {
+ result.emplace_back(
+ absl::StrCat(logfile_base1, "_pi1_pi1_all.part", i, Extension()));
+ }
+ for (size_t i = 0; i < 3; ++i) {
+ result.emplace_back(
+ absl::StrCat(logfile_base1, "_pi1_pi2_all.part", i, Extension()));
+ }
+
+ for (size_t i = 0; i < pi2_data_count + pi2_timestamps_count; ++i) {
+ result.emplace_back(
+ absl::StrCat(logfile_base2, "_pi2_pi2_all.part", i, Extension()));
+ }
+
+ for (size_t i = 0; i < 3; ++i) {
+ result.emplace_back(
+ absl::StrCat(logfile_base2, "_pi2_pi1_all.part", i, Extension()));
+ }
} else {
- result.emplace_back(logfile_base1 +
- "_timestamps/pi1/aos/remote_timestamps/pi2/pi1/aos/"
- "aos-message_bridge-Timestamp/"
- "aos.message_bridge.RemoteMessage.part0" +
+ for (size_t i = 0; i < pi1_data_count; ++i) {
+ result.emplace_back(
+ absl::StrCat(logfile_base1, "_pi1_data.part", i, Extension()));
+ }
+ for (size_t i = 0; i < pi1_timestamps_count; ++i) {
+ result.emplace_back(
+ absl::StrCat(logfile_base1, "_pi1_timestamps.part", i, Extension()));
+ }
+ for (size_t i = 0; i < pi2_data_count; ++i) {
+ result.emplace_back(
+ absl::StrCat(logfile_base2, "_pi2_data.part", i, Extension()));
+ }
+ for (size_t i = 0; i < pi2_timestamps_count; ++i) {
+ result.emplace_back(
+ absl::StrCat(logfile_base2, "_pi2_timestamps.part", i, Extension()));
+ }
+ result.emplace_back(logfile_base2 + "_data/pi1_data.part0" + Extension());
+ result.emplace_back(logfile_base2 + "_data/pi1_data.part1" + Extension());
+ result.emplace_back(logfile_base1 + "_data/pi2_data.part0" + Extension());
+ result.emplace_back(logfile_base1 + "_data/pi2_data.part1" + Extension());
+ // Shared and not-shared config types will have the same output since the
+ // data writers are consolidated per node instead of per channel.
+ result.emplace_back(logfile_base1 + "_timestamps/remote_pi2.part0" +
Extension());
- result.emplace_back(logfile_base1 +
- "_timestamps/pi1/aos/remote_timestamps/pi2/pi1/aos/"
- "aos-message_bridge-Timestamp/"
- "aos.message_bridge.RemoteMessage.part1" +
+ result.emplace_back(logfile_base1 + "_timestamps/remote_pi2.part1" +
Extension());
- result.emplace_back(logfile_base2 +
- "_timestamps/pi2/aos/remote_timestamps/pi1/pi2/aos/"
- "aos-message_bridge-Timestamp/"
- "aos.message_bridge.RemoteMessage.part0" +
+ result.emplace_back(logfile_base1 + "_timestamps/remote_pi2.part2" +
Extension());
- result.emplace_back(logfile_base2 +
- "_timestamps/pi2/aos/remote_timestamps/pi1/pi2/aos/"
- "aos-message_bridge-Timestamp/"
- "aos.message_bridge.RemoteMessage.part1" +
+ result.emplace_back(logfile_base2 + "_timestamps/remote_pi1.part0" +
Extension());
- result.emplace_back(logfile_base1 +
- "_timestamps/pi1/aos/remote_timestamps/pi2/test/"
- "aos-examples-Ping/"
- "aos.message_bridge.RemoteMessage.part0" +
- Extension());
- result.emplace_back(logfile_base1 +
- "_timestamps/pi1/aos/remote_timestamps/pi2/test/"
- "aos-examples-Ping/"
- "aos.message_bridge.RemoteMessage.part1" +
+ result.emplace_back(logfile_base2 + "_timestamps/remote_pi1.part1" +
Extension());
}
@@ -228,125 +210,56 @@
std::vector<std::string> MultinodeLoggerTest::MakePi1RebootLogfiles() {
std::vector<std::string> result;
result.emplace_back(logfile_base1_ + "_pi1_data.part0" + Extension());
- result.emplace_back(logfile_base1_ + "_pi1_data.part1" + Extension());
- result.emplace_back(logfile_base1_ + "_pi1_data.part2" + Extension());
- result.emplace_back(logfile_base1_ + "_pi1_data.part3" + Extension());
- result.emplace_back(logfile_base1_ + "_pi1_data.part4" + Extension());
- result.emplace_back(logfile_base1_ +
- "_pi2_data/test/aos.examples.Pong.part0" + Extension());
- result.emplace_back(logfile_base1_ +
- "_pi2_data/test/aos.examples.Pong.part1" + Extension());
- result.emplace_back(logfile_base1_ +
- "_pi2_data/test/aos.examples.Pong.part2" + Extension());
- result.emplace_back(logfile_base1_ +
- "_pi2_data/test/aos.examples.Pong.part3" + Extension());
- result.emplace_back(logfile_base1_ +
- "_pi2_data/pi2/aos/aos.message_bridge.Timestamp.part0" +
- Extension());
- result.emplace_back(logfile_base1_ +
- "_pi2_data/pi2/aos/aos.message_bridge.Timestamp.part1" +
- Extension());
- result.emplace_back(logfile_base1_ +
- "_pi2_data/pi2/aos/aos.message_bridge.Timestamp.part2" +
- Extension());
- result.emplace_back(logfile_base1_ +
- "_pi2_data/pi2/aos/aos.message_bridge.Timestamp.part3" +
- Extension());
+ result.emplace_back(logfile_base1_ + "_pi1_timestamps.part0" + Extension());
+ result.emplace_back(logfile_base1_ + "_pi1_timestamps.part1" + Extension());
+ result.emplace_back(logfile_base1_ + "_pi1_timestamps.part2" + Extension());
+ result.emplace_back(logfile_base1_ + "_pi1_timestamps.part3" + Extension());
+
+ result.emplace_back(logfile_base1_ + "_data/pi2_data.part0" + Extension());
+ result.emplace_back(logfile_base1_ + "_data/pi2_data.part1" + Extension());
+ result.emplace_back(logfile_base1_ + "_data/pi2_data.part2" + Extension());
+ result.emplace_back(logfile_base1_ + "_data/pi2_data.part3" + Extension());
result.emplace_back(absl::StrCat(
logfile_base1_, "_", std::get<0>(GetParam()).sha256, Extension()));
- if (shared()) {
- for (size_t i = 0; i < 6; ++i) {
- result.emplace_back(
- absl::StrCat(logfile_base1_,
- "_timestamps/pi1/aos/remote_timestamps/pi2/"
- "aos.message_bridge.RemoteMessage.part",
- i, Extension()));
- }
- } else {
- result.emplace_back(logfile_base1_ +
- "_timestamps/pi1/aos/remote_timestamps/pi2/pi1/aos/"
- "aos-message_bridge-Timestamp/"
- "aos.message_bridge.RemoteMessage.part0" +
- Extension());
- result.emplace_back(logfile_base1_ +
- "_timestamps/pi1/aos/remote_timestamps/pi2/pi1/aos/"
- "aos-message_bridge-Timestamp/"
- "aos.message_bridge.RemoteMessage.part1" +
- Extension());
- result.emplace_back(logfile_base1_ +
- "_timestamps/pi1/aos/remote_timestamps/pi2/pi1/aos/"
- "aos-message_bridge-Timestamp/"
- "aos.message_bridge.RemoteMessage.part2" +
- Extension());
- result.emplace_back(logfile_base1_ +
- "_timestamps/pi1/aos/remote_timestamps/pi2/pi1/aos/"
- "aos-message_bridge-Timestamp/"
- "aos.message_bridge.RemoteMessage.part3" +
- Extension());
-
- result.emplace_back(logfile_base1_ +
- "_timestamps/pi1/aos/remote_timestamps/pi2/test/"
- "aos-examples-Ping/"
- "aos.message_bridge.RemoteMessage.part0" +
- Extension());
- result.emplace_back(logfile_base1_ +
- "_timestamps/pi1/aos/remote_timestamps/pi2/test/"
- "aos-examples-Ping/"
- "aos.message_bridge.RemoteMessage.part1" +
- Extension());
- result.emplace_back(logfile_base1_ +
- "_timestamps/pi1/aos/remote_timestamps/pi2/test/"
- "aos-examples-Ping/"
- "aos.message_bridge.RemoteMessage.part2" +
- Extension());
- result.emplace_back(logfile_base1_ +
- "_timestamps/pi1/aos/remote_timestamps/pi2/test/"
- "aos-examples-Ping/"
- "aos.message_bridge.RemoteMessage.part3" +
- Extension());
+ for (size_t i = 0; i < 6; ++i) {
+ result.emplace_back(absl::StrCat(
+ logfile_base1_, "_timestamps/remote_pi2.part", i, Extension()));
}
return result;
}
-std::vector<std::string> MultinodeLoggerTest::MakePi1SingleDirectionLogfiles() {
- std::vector<std::string> result;
- result.emplace_back(logfile_base1_ + "_pi1_data.part0" + Extension());
- result.emplace_back(logfile_base1_ + "_pi1_data.part1" + Extension());
- result.emplace_back(logfile_base1_ +
- "_pi2_data/pi2/aos/aos.message_bridge.Timestamp.part0" +
- Extension());
- result.emplace_back(absl::StrCat(
- logfile_base1_, "_", std::get<0>(GetParam()).sha256, Extension()));
- return result;
-}
-
std::vector<std::string> MultinodeLoggerTest::MakePi1DeadNodeLogfiles() {
std::vector<std::string> result;
- result.emplace_back(logfile_base1_ + "_pi1_data.part0" + Extension());
- result.emplace_back(absl::StrCat(
- logfile_base1_, "_", std::get<0>(GetParam()).sha256, Extension()));
+ if (file_strategy() == FileStrategy::kCombine) {
+ result.emplace_back(logfile_base1_ + "_pi1_pi1_all.part0" + Extension());
+ result.emplace_back(absl::StrCat(
+ logfile_base1_, "_", std::get<0>(GetParam()).sha256, Extension()));
+ } else {
+ result.emplace_back(logfile_base1_ + "_pi1_data.part0" + Extension());
+ result.emplace_back(absl::StrCat(
+ logfile_base1_, "_", std::get<0>(GetParam()).sha256, Extension()));
+ }
return result;
}
std::vector<std::vector<std::string>> MultinodeLoggerTest::StructureLogFiles() {
- std::vector<std::vector<std::string>> result{
- std::vector<std::string>{logfiles_[2], logfiles_[3], logfiles_[4]},
- std::vector<std::string>{logfiles_[5], logfiles_[6]},
- std::vector<std::string>{logfiles_[7], logfiles_[8], logfiles_[9]},
- std::vector<std::string>{logfiles_[10], logfiles_[11]},
- std::vector<std::string>{logfiles_[12], logfiles_[13]}};
-
- if (shared()) {
- result.emplace_back(
- std::vector<std::string>{logfiles_[14], logfiles_[15], logfiles_[16]});
- result.emplace_back(std::vector<std::string>{logfiles_[17], logfiles_[18]});
+ if (file_strategy() == FileStrategy::kCombine) {
+ return std::vector<std::vector<std::string>>{
+ std::vector<std::string>{logfiles_[2], logfiles_[3], logfiles_[4]},
+ std::vector<std::string>{logfiles_[5], logfiles_[6], logfiles_[7]},
+ std::vector<std::string>{logfiles_[8], logfiles_[9], logfiles_[10]},
+ std::vector<std::string>{logfiles_[11], logfiles_[12], logfiles_[13]}};
} else {
- result.emplace_back(std::vector<std::string>{logfiles_[14], logfiles_[15]});
- result.emplace_back(std::vector<std::string>{logfiles_[16], logfiles_[17]});
- result.emplace_back(std::vector<std::string>{logfiles_[18], logfiles_[19]});
+ return std::vector<std::vector<std::string>>{
+ std::vector<std::string>{logfiles_[2]},
+ std::vector<std::string>{logfiles_[3], logfiles_[4]},
+ std::vector<std::string>{logfiles_[5]},
+ std::vector<std::string>{logfiles_[6], logfiles_[7]},
+ std::vector<std::string>{logfiles_[8], logfiles_[9]},
+ std::vector<std::string>{logfiles_[10], logfiles_[11]},
+ std::vector<std::string>{logfiles_[12], logfiles_[13], logfiles_[14]},
+ std::vector<std::string>{logfiles_[15], logfiles_[16]}};
}
-
- return result;
}
std::string MultinodeLoggerTest::Extension() {
@@ -359,7 +272,8 @@
if (factory == nullptr) {
factory = &event_loop_factory_;
}
- return MakeLoggerState(node, factory, std::get<1>(GetParam()), configuration);
+ return MakeLoggerState(node, factory, std::get<1>(GetParam()),
+ file_strategy(), configuration);
}
void MultinodeLoggerTest::StartLogger(LoggerState *logger,
@@ -422,7 +336,8 @@
// depends on if we have the remote timestamps split across 2 files, or just
// across 1, depending on if we are using a split or combined timestamp
// channel config.
- EXPECT_EQ(missing_rt_count, shared() ? 5u : 6u);
+ EXPECT_EQ(missing_rt_count,
+ file_strategy() == FileStrategy::kCombine ? 2u : 4u);
EXPECT_EQ(log_event_uuids.size(), 2u);
EXPECT_EQ(parts_uuids.size(), ToLogReaderVector(sorted_parts).size());
@@ -446,6 +361,8 @@
EXPECT_THAT(sorted_parts[0].corrupted, ::testing::Eq(corrupted_parts));
EXPECT_THAT(sorted_parts[1].corrupted, ::testing::Eq(corrupted_parts));
+
+ EXPECT_TRUE(AllPartsMatchOutOfOrderDuration(sorted_parts));
}
void MultinodeLoggerTest::AddExtension(std::string_view extension) {
@@ -526,6 +443,7 @@
reader.Deregister();
}
+ CHECK(LogIsReadableIfMultiNode(LogFilesContainer{SortParts(files)}));
{
std::vector<std::pair<std::vector<realtime_clock::time_point>,
std::vector<realtime_clock::time_point>>>
@@ -578,6 +496,9 @@
reader.Deregister();
for (auto x : result) {
+ EXPECT_EQ(x.first.size(), x.second.size())
+ << ": Got a different number of start and end times, that is very "
+ "bad.";
for (auto y : x.first) {
VLOG(1) << "Start " << y;
}
@@ -589,6 +510,20 @@
}
}
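+// Wraps testing::ConfirmReadable, first verifying that timestamps are stored
+// separately exactly when we are not using the combined writer.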
+std::vector<std::pair<std::vector<realtime_clock::time_point>,
+ std::vector<realtime_clock::time_point>>>
+MultinodeLoggerTest::ConfirmReadable(const std::vector<std::string> &files,
+ realtime_clock::time_point start_time,
+ realtime_clock::time_point end_time) {
+ LogFilesContainer c(SortParts(files));
+ if (file_strategy() == FileStrategy::kCombine) {
+ EXPECT_FALSE(c.TimestampsStoredSeparately());
+ } else {
+ EXPECT_TRUE(c.TimestampsStoredSeparately());
+ }
+ return testing::ConfirmReadable(files, start_time, end_time);
+}
+
// Counts the number of messages on a channel. Returns (channel name, channel
// type, count) for every message matching matcher()
std::vector<std::tuple<std::string, std::string, int>> CountChannelsMatching(
@@ -654,6 +589,45 @@
});
}
+bool AllPartsMatchOutOfOrderDuration(
+ const std::vector<LogFile> &files,
+ std::chrono::nanoseconds max_out_of_order_duration) {
+ bool result = true;
+ for (const LogFile &file : files) {
+ for (const LogParts &parts : file.parts) {
+ if (parts.max_out_of_order_duration != max_out_of_order_duration) {
+ LOG(ERROR) << "Found an out of order duration of "
+ << parts.max_out_of_order_duration.count()
+ << "ns instead of " << max_out_of_order_duration.count()
+ << "ns for " << parts;
+ result = false;
+ }
+ }
+ }
+ return result;
+}
+
+bool AllRebootPartsMatchOutOfOrderDuration(
+ const std::vector<LogFile> &files, const std::string node,
+ std::chrono::nanoseconds max_out_of_order_duration) {
+ bool result = true;
+ for (const LogFile &file : files) {
+ for (const LogParts &parts : file.parts) {
+ if (parts.node == node && parts.boot_count == 0) {
+ continue;
+ }
+ if (parts.max_out_of_order_duration != max_out_of_order_duration) {
+ LOG(ERROR) << "Found an out of order duration of "
+ << parts.max_out_of_order_duration.count()
+ << "ns instead of " << max_out_of_order_duration.count()
+ << "ns for " << parts;
+ result = false;
+ }
+ }
+ }
+ return result;
+}
+
} // namespace testing
} // namespace logger
} // namespace aos
diff --git a/aos/events/logging/multinode_logger_test_lib.h b/aos/events/logging/multinode_logger_test_lib.h
index e207179..d6d3f60 100644
--- a/aos/events/logging/multinode_logger_test_lib.h
+++ b/aos/events/logging/multinode_logger_test_lib.h
@@ -27,6 +27,18 @@
encoder_factory;
};
+enum class FileStrategy {
+ // Use MinimalFileMultiNodeLogNamer
+ kCombine,
+ // Use MultiNodeFilesLogNamer
+ kKeepSeparate,
+};
+
+enum class ForceTimestampBuffering {
+ kForceBufferTimestamps,
+ kAutoBuffer,
+};
+
// Parameters to run all the tests with.
struct ConfigParams {
// The config file to use.
@@ -39,6 +51,11 @@
std::string_view sha256;
// sha256 of the relogged config
std::string_view relogged_sha256;
+ // If kCombine, use MinimalFileMultiNodeLogNamer.
+ FileStrategy file_strategy;
+ // If kForceBufferTimestamps, set --force_timestamp_loading to force buffering
+ // timestamps at the start.
+ ForceTimestampBuffering timestamp_buffering;
};
struct LoggerState {
@@ -53,6 +70,7 @@
const Node *node;
MultiNodeFilesLogNamer *log_namer;
CompressionParams params;
+ FileStrategy file_strategy;
void AppendAllFilenames(std::vector<std::string> *filenames);
@@ -60,18 +78,19 @@
};
constexpr std::string_view kCombinedConfigSha1() {
- return "d018002a9b780d45a69172a1e5dd1d6df49a7c6c63b9bae9125cdc0458ddc6ca";
+ return "8f8a0fd505b5cd8ffc553b2af2796232450a2db88c6db974c5e3406b38eecb93";
}
constexpr std::string_view kSplitConfigSha1() {
- return "562f80087c0e95d9304127c4cb46962659b4bfc11def84253c67702b4213e6cf";
+ return "0a2443f8fe49815de16723c48881f3a78c4e6399273d15da2d77d35588c8cae8";
}
constexpr std::string_view kReloggedSplitConfigSha1() {
- return "cb560559ee3111d7c67314e3e1a5fd7fc88e8b4cfd9d15ea71c8d1cae1c0480b";
+ return "5e4f4784b890c1fdca7aeeadad8bd0354325f3ed8f11dbc9a81112df87407931";
}
LoggerState MakeLoggerState(NodeEventLoopFactory *node,
SimulatedEventLoopFactory *factory,
CompressionParams params,
+ FileStrategy file_strategy,
const Configuration *configuration = nullptr);
std::vector<std::vector<std::string>> ToLogReaderVector(
const std::vector<LogFile> &log_files);
@@ -99,23 +118,37 @@
std::shared_ptr<const aos::Configuration> config,
std::string_view filename);
+// Returns true if all of the parts' max_out_of_order_durations match the
+// provided max_out_of_order_duration.
+bool AllPartsMatchOutOfOrderDuration(
+ const std::vector<LogFile> &files,
+ std::chrono::nanoseconds max_out_of_order_duration =
+ std::chrono::milliseconds(300));
+
+// Same as AllPartsMatchOutOfOrderDuration, but skips checking the parts with
+// boot_count 0 for 'node'.
+bool AllRebootPartsMatchOutOfOrderDuration(
+ const std::vector<LogFile> &files, const std::string node,
+ std::chrono::nanoseconds max_out_of_order_duration =
+ std::chrono::milliseconds(300));
+
class MultinodeLoggerTest : public ::testing::TestWithParam<
std::tuple<ConfigParams, CompressionParams>> {
public:
MultinodeLoggerTest();
bool shared() const;
+ FileStrategy file_strategy() const;
std::vector<std::string> MakeLogFiles(std::string logfile_base1,
std::string logfile_base2,
- size_t pi1_data_count = 3,
- size_t pi2_data_count = 3,
+ size_t pi1_data_count = 1,
+ size_t pi2_data_count = 1,
+ size_t pi1_timestamp_count = 2,
+ size_t pi2_timestamp_count = 2,
bool relogged_config = false);
std::vector<std::string> MakePi1RebootLogfiles();
- std::vector<std::string> MakePi1SingleDirectionLogfiles();
-
std::vector<std::string> MakePi1DeadNodeLogfiles();
std::vector<std::vector<std::string>> StructureLogFiles();
@@ -131,8 +164,17 @@
void VerifyParts(const std::vector<LogFile> &sorted_parts,
const std::vector<std::string> &corrupted_parts = {});
+ std::vector<std::pair<std::vector<realtime_clock::time_point>,
+ std::vector<realtime_clock::time_point>>>
+ ConfirmReadable(
+ const std::vector<std::string> &files,
+ realtime_clock::time_point start_time = realtime_clock::min_time,
+ realtime_clock::time_point end_time = realtime_clock::max_time);
+
void AddExtension(std::string_view extension);
+ gflags::FlagSaver flag_saver_;
+
// Config and factory.
aos::FlatbufferDetachedBuffer<aos::Configuration> config_;
message_bridge::TestingTimeConverter time_converter_;
@@ -148,7 +190,6 @@
std::string logfile_base2_;
std::vector<std::string> pi1_reboot_logfiles_;
std::vector<std::string> logfiles_;
- std::vector<std::string> pi1_single_direction_logfiles_;
std::vector<std::vector<std::string>> structured_logfiles_;
};
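
As a usage sketch (not part of this change): two hypothetical helpers showing how a test body can act on the new ConfigParams fields. FLAGS_force_timestamp_loading comes from the comment on ConfigParams, and flag_saver_ is the gflags::FlagSaver member added to MultinodeLoggerTest above; the helper names themselves are illustrative.

    #include "gflags/gflags.h"

    DECLARE_bool(force_timestamp_loading);  // Assumed to be defined by the logging code.

    // Hypothetical helper: kCombine selects MinimalFileMultiNodeLogNamer and
    // kKeepSeparate selects MultiNodeFilesLogNamer, per the enum comments above.
    bool UseMinimalNamer(const ConfigParams &params) {
      return params.file_strategy == FileStrategy::kCombine;
    }

    // Hypothetical helper: translate the enum into the flag named in the
    // ConfigParams comment. The flag_saver_ member restores it between tests.
    void ApplyTimestampBuffering(const ConfigParams &params) {
      if (params.timestamp_buffering ==
          ForceTimestampBuffering::kForceBufferTimestamps) {
        FLAGS_force_timestamp_loading = true;
      }
    }
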
diff --git a/aos/events/logging/multinode_pingpong_pi3_pingpong.json b/aos/events/logging/multinode_pingpong_pi3_pingpong.json
new file mode 100644
index 0000000..336b328
--- /dev/null
+++ b/aos/events/logging/multinode_pingpong_pi3_pingpong.json
@@ -0,0 +1,312 @@
+{
+ "channels": [
+ {
+ "name": "/pi1/aos",
+ "type": "aos.logging.LogMessageFbs",
+ "source_node": "pi1",
+ "frequency": 200,
+ "num_senders": 20,
+ "max_size": 2048
+ },
+ {
+ "name": "/pi2/aos",
+ "type": "aos.logging.LogMessageFbs",
+ "source_node": "pi2",
+ "frequency": 200,
+ "num_senders": 20,
+ "max_size": 2048
+ },
+ {
+ "name": "/pi3/aos",
+ "type": "aos.logging.LogMessageFbs",
+ "source_node": "pi3",
+ "frequency": 200,
+ "num_senders": 20,
+ "max_size": 2048
+ },
+ /* Logged on pi1 locally */
+ {
+ "name": "/pi1/aos",
+ "type": "aos.timing.Report",
+ "source_node": "pi1",
+ "frequency": 50,
+ "num_senders": 20,
+ "max_size": 2048
+ },
+ {
+ "name": "/pi2/aos",
+ "type": "aos.timing.Report",
+ "source_node": "pi2",
+ "frequency": 50,
+ "num_senders": 20,
+ "max_size": 2048
+ },
+ {
+ "name": "/pi3/aos",
+ "type": "aos.timing.Report",
+ "source_node": "pi3",
+ "frequency": 50,
+ "num_senders": 20,
+ "max_size": 2048
+ },
+ {
+ "name": "/pi1/aos",
+ "type": "aos.message_bridge.ServerStatistics",
+ "logger": "LOCAL_LOGGER",
+ "source_node": "pi1"
+ },
+ {
+ "name": "/pi2/aos",
+ "type": "aos.message_bridge.ServerStatistics",
+ "logger": "LOCAL_LOGGER",
+ "source_node": "pi2"
+ },
+ {
+ "name": "/pi3/aos",
+ "type": "aos.message_bridge.ServerStatistics",
+ "logger": "LOCAL_LOGGER",
+ "source_node": "pi3"
+ },
+ {
+ "name": "/pi1/aos",
+ "type": "aos.logging.DynamicLogCommand",
+ "logger": "LOCAL_LOGGER",
+ "source_node": "pi1"
+ },
+ {
+ "name": "/pi2/aos",
+ "type": "aos.logging.DynamicLogCommand",
+ "logger": "LOCAL_LOGGER",
+ "source_node": "pi2"
+ },
+ {
+ "name": "/pi3/aos",
+ "type": "aos.logging.DynamicLogCommand",
+ "logger": "LOCAL_LOGGER",
+ "source_node": "pi3"
+ },
+ {
+ "name": "/pi1/aos",
+ "type": "aos.message_bridge.ClientStatistics",
+ "logger": "LOCAL_LOGGER",
+ "source_node": "pi1"
+ },
+ {
+ "name": "/pi2/aos",
+ "type": "aos.message_bridge.ClientStatistics",
+ "logger": "LOCAL_LOGGER",
+ "source_node": "pi2"
+ },
+ {
+ "name": "/pi3/aos",
+ "type": "aos.message_bridge.ClientStatistics",
+ "logger": "LOCAL_LOGGER",
+ "source_node": "pi3"
+ },
+ {
+ "name": "/pi1/aos",
+ "type": "aos.message_bridge.Timestamp",
+ "logger": "LOCAL_AND_REMOTE_LOGGER",
+ "logger_nodes": ["pi2"],
+ "source_node": "pi1",
+ "destination_nodes": [
+ {
+ "name": "pi2",
+ "timestamp_logger": "LOCAL_AND_REMOTE_LOGGER",
+ "timestamp_logger_nodes": ["pi1"]
+ },
+ {
+ "name": "pi3",
+ "timestamp_logger": "LOCAL_AND_REMOTE_LOGGER",
+ "timestamp_logger_nodes": ["pi1"]
+ }
+ ]
+ },
+ {
+ "name": "/pi2/aos",
+ "type": "aos.message_bridge.Timestamp",
+ "logger": "LOCAL_AND_REMOTE_LOGGER",
+ "logger_nodes": ["pi1"],
+ "source_node": "pi2",
+ "destination_nodes": [
+ {
+ "name": "pi1",
+ "timestamp_logger": "LOCAL_AND_REMOTE_LOGGER",
+ "timestamp_logger_nodes": ["pi2"]
+ }
+ ]
+ },
+ {
+ "name": "/pi3/aos",
+ "type": "aos.message_bridge.Timestamp",
+ "logger": "LOCAL_AND_REMOTE_LOGGER",
+ "logger_nodes": ["pi1"],
+ "source_node": "pi3",
+ "destination_nodes": [
+ {
+ "name": "pi1",
+ "timestamp_logger": "LOCAL_AND_REMOTE_LOGGER",
+ "timestamp_logger_nodes": ["pi3"]
+ }
+ ]
+ },
+ {
+ "name": "/pi1/aos/remote_timestamps/pi2/pi1/aos/aos-message_bridge-Timestamp",
+ "type": "aos.message_bridge.RemoteMessage",
+ "logger": "NOT_LOGGED",
+ "num_senders": 2,
+ "source_node": "pi1"
+ },
+ {
+ "name": "/pi1/aos/remote_timestamps/pi3/pi1/aos/aos-message_bridge-Timestamp",
+ "type": "aos.message_bridge.RemoteMessage",
+ "logger": "NOT_LOGGED",
+ "num_senders": 2,
+ "source_node": "pi1"
+ },
+ {
+ "name": "/pi2/aos/remote_timestamps/pi1/pi2/aos/aos-message_bridge-Timestamp",
+ "type": "aos.message_bridge.RemoteMessage",
+ "logger": "NOT_LOGGED",
+ "num_senders": 2,
+ "source_node": "pi2"
+ },
+ {
+ "name": "/pi3/aos/remote_timestamps/pi1/pi3/aos/aos-message_bridge-Timestamp",
+ "type": "aos.message_bridge.RemoteMessage",
+ "logger": "NOT_LOGGED",
+ "num_senders": 2,
+ "source_node": "pi3"
+ },
+ {
+ "name": "/pi1/aos/remote_timestamps/pi2/test2/aos-examples-Ping",
+ "type": "aos.message_bridge.RemoteMessage",
+ "logger": "NOT_LOGGED",
+ "num_senders": 2,
+ "source_node": "pi1",
+ "frequency": 150
+ },
+ {
+ "name": "/pi1/aos/remote_timestamps/pi3/test3/aos-examples-Ping",
+ "type": "aos.message_bridge.RemoteMessage",
+ "logger": "NOT_LOGGED",
+ "num_senders": 2,
+ "source_node": "pi1",
+ "frequency": 150
+ },
+ /* Forwarded to pi2 */
+ {
+ "name": "/test2",
+ "type": "aos.examples.Ping",
+ "source_node": "pi1",
+ "destination_nodes": [
+ {
+ "name": "pi2",
+ "priority": 1,
+ "timestamp_logger": "LOCAL_AND_REMOTE_LOGGER",
+ "timestamp_logger_nodes": [
+ "pi1"
+ ],
+ "time_to_live": 5000000
+ }
+ ],
+ "frequency": 150
+ },
+ {
+ "name": "/test2",
+ "type": "aos.examples.Pong",
+ "source_node": "pi2",
+ "logger": "LOCAL_AND_REMOTE_LOGGER",
+ "logger_nodes": ["pi1"],
+ "destination_nodes": [
+ {
+ "name": "pi1",
+ "priority": 1,
+ "timestamp_logger": "LOCAL_LOGGER",
+ "time_to_live": 5000000
+ }
+ ],
+ "frequency": 150
+ },
+ /* Forwarded to pi3 */
+ {
+ "name": "/test3",
+ "type": "aos.examples.Ping",
+ "source_node": "pi1",
+ "destination_nodes": [
+ {
+ "name": "pi3",
+ "priority": 1,
+ "timestamp_logger": "LOCAL_AND_REMOTE_LOGGER",
+ "timestamp_logger_nodes": [
+ "pi1"
+ ],
+ "time_to_live": 5000000
+ }
+ ],
+ "frequency": 150
+ },
+ {
+ "name": "/test3",
+ "type": "aos.examples.Pong",
+ "source_node": "pi3",
+ "logger": "LOCAL_AND_REMOTE_LOGGER",
+ "logger_nodes": ["pi1"],
+ "destination_nodes": [
+ {
+ "name": "pi1",
+ "priority": 1,
+ "timestamp_logger": "LOCAL_LOGGER",
+ "time_to_live": 5000000
+ }
+ ],
+ "frequency": 150
+ }
+ ],
+ "maps": [
+ {
+ "match": {
+ "name": "/aos*",
+ "source_node": "pi1"
+ },
+ "rename": {
+ "name": "/pi1/aos"
+ }
+ },
+ {
+ "match": {
+ "name": "/aos*",
+ "source_node": "pi2"
+ },
+ "rename": {
+ "name": "/pi2/aos"
+ }
+ },
+ {
+ "match": {
+ "name": "/aos*",
+ "source_node": "pi3"
+ },
+ "rename": {
+ "name": "/pi3/aos"
+ }
+ }
+ ],
+ "nodes": [
+ {
+ "name": "pi1",
+ "hostname": "raspberrypi",
+ "port": 9971
+ },
+ {
+ "name": "pi2",
+ "hostname": "raspberrypi2",
+ "port": 9971
+ },
+ {
+ "name": "pi3",
+ "hostname": "raspberrypi3",
+ "port": 9971
+ }
+ ]
+}
diff --git a/aos/events/logging/multinode_pingpong_reboot_ooo.json b/aos/events/logging/multinode_pingpong_reboot_ooo.json
new file mode 100644
index 0000000..80d2972
--- /dev/null
+++ b/aos/events/logging/multinode_pingpong_reboot_ooo.json
@@ -0,0 +1,240 @@
+{
+ "channels": [
+ {
+ "name": "/pi1/aos",
+ "type": "aos.logging.LogMessageFbs",
+ "source_node": "pi1",
+ "frequency": 200,
+ "num_senders": 20,
+ "max_size": 2048
+ },
+ {
+ "name": "/pi2/aos",
+ "type": "aos.logging.LogMessageFbs",
+ "source_node": "pi2",
+ "frequency": 200,
+ "num_senders": 20,
+ "max_size": 2048
+ },
+ /* Logged on pi1 locally */
+ {
+ "name": "/pi1/aos",
+ "type": "aos.timing.Report",
+ "source_node": "pi1",
+ "frequency": 50,
+ "num_senders": 20,
+ "max_size": 2048
+ },
+ {
+ "name": "/pi2/aos",
+ "type": "aos.timing.Report",
+ "source_node": "pi2",
+ "frequency": 50,
+ "num_senders": 20,
+ "max_size": 2048
+ },
+ {
+ "name": "/pi1/aos",
+ "type": "aos.message_bridge.ServerStatistics",
+ "logger": "LOCAL_LOGGER",
+ "source_node": "pi1"
+ },
+ {
+ "name": "/pi2/aos",
+ "type": "aos.message_bridge.ServerStatistics",
+ "logger": "LOCAL_LOGGER",
+ "source_node": "pi2"
+ },
+ {
+ "name": "/pi1/aos",
+ "type": "aos.logging.DynamicLogCommand",
+ "logger": "LOCAL_LOGGER",
+ "source_node": "pi1"
+ },
+ {
+ "name": "/pi2/aos",
+ "type": "aos.logging.DynamicLogCommand",
+ "logger": "LOCAL_LOGGER",
+ "source_node": "pi2"
+ },
+ {
+ "name": "/pi1/aos",
+ "type": "aos.message_bridge.ClientStatistics",
+ "logger": "LOCAL_LOGGER",
+ "source_node": "pi1"
+ },
+ {
+ "name": "/pi2/aos",
+ "type": "aos.message_bridge.ClientStatistics",
+ "logger": "LOCAL_LOGGER",
+ "source_node": "pi2"
+ },
+ {
+ "name": "/pi1/aos",
+ "type": "aos.message_bridge.Timestamp",
+ "logger": "LOCAL_AND_REMOTE_LOGGER",
+ "logger_nodes": ["pi2"],
+ "source_node": "pi1",
+ "destination_nodes": [
+ {
+ "name": "pi2",
+ "timestamp_logger": "LOCAL_AND_REMOTE_LOGGER",
+ "timestamp_logger_nodes": ["pi1"]
+ }
+ ]
+ },
+ {
+ "name": "/pi2/aos",
+ "type": "aos.message_bridge.Timestamp",
+ "logger": "LOCAL_AND_REMOTE_LOGGER",
+ "logger_nodes": ["pi1"],
+ "source_node": "pi2",
+ "destination_nodes": [
+ {
+ "name": "pi1",
+ "timestamp_logger": "LOCAL_AND_REMOTE_LOGGER",
+ "timestamp_logger_nodes": ["pi2"]
+ }
+ ]
+ },
+ {
+ "name": "/pi1/aos/remote_timestamps/pi2/pi1/aos/aos-message_bridge-Timestamp",
+ "type": "aos.message_bridge.RemoteMessage",
+ /* Log one remote timestamp channel to ensure everything still works. */
+ "logger": "LOCAL_LOGGER",
+ "num_senders": 2,
+ "source_node": "pi1"
+ },
+ {
+ "name": "/pi2/aos/remote_timestamps/pi1/pi2/aos/aos-message_bridge-Timestamp",
+ "type": "aos.message_bridge.RemoteMessage",
+ "logger": "NOT_LOGGED",
+ "num_senders": 2,
+ "source_node": "pi2"
+ },
+
+
+ {
+ "name": "/atest1",
+ "type": "aos.examples.Ping",
+ "source_node": "pi1",
+ "destination_nodes": [
+ {
+ "name": "pi2",
+ "priority": 1,
+ "timestamp_logger": "LOCAL_LOGGER",
+ "time_to_live": 5000000
+ }
+ ],
+ "frequency": 150
+ },
+ {
+ "name": "/atest1",
+ "type": "aos.examples.Pong",
+ "source_node": "pi2",
+ "logger": "LOCAL_AND_REMOTE_LOGGER",
+ "logger_nodes": ["pi1"],
+ "destination_nodes": [
+ {
+ "name": "pi1",
+ "priority": 1,
+ "timestamp_logger": "LOCAL_LOGGER",
+ "time_to_live": 5000000
+ }
+ ],
+ "frequency": 150
+ },
+ {
+ "name": "/atest2",
+ "type": "aos.examples.Ping",
+ "source_node": "pi1",
+ "destination_nodes": [
+ {
+ "name": "pi2",
+ "priority": 1,
+ "timestamp_logger": "LOCAL_LOGGER",
+ "time_to_live": 5000000
+ }
+ ],
+ "frequency": 150
+ },
+ {
+ "name": "/atest2",
+ "type": "aos.examples.Pong",
+ "source_node": "pi2",
+ "logger": "LOCAL_AND_REMOTE_LOGGER",
+ "logger_nodes": ["pi1"],
+ "destination_nodes": [
+ {
+ "name": "pi1",
+ "priority": 1,
+ "timestamp_logger": "LOCAL_LOGGER",
+ "time_to_live": 5000000
+ }
+ ],
+ "frequency": 150
+ },
+ {
+ "name": "/atest3",
+ "type": "aos.examples.Ping",
+ "source_node": "pi1",
+ "destination_nodes": [
+ {
+ "name": "pi2",
+ "priority": 1,
+ "timestamp_logger": "LOCAL_LOGGER",
+ "time_to_live": 5000000
+ }
+ ],
+ "frequency": 150
+ },
+ {
+ "name": "/atest3",
+ "type": "aos.examples.Pong",
+ "source_node": "pi2",
+ "logger": "LOCAL_AND_REMOTE_LOGGER",
+ "logger_nodes": ["pi1"],
+ "destination_nodes": [
+ {
+ "name": "pi1",
+ "priority": 1,
+ "timestamp_logger": "LOCAL_LOGGER",
+ "time_to_live": 5000000
+ }
+ ],
+ "frequency": 150
+ }
+ ],
+ "maps": [
+ {
+ "match": {
+ "name": "/aos*",
+ "source_node": "pi1"
+ },
+ "rename": {
+ "name": "/pi1/aos"
+ }
+ },
+ {
+ "match": {
+ "name": "/aos*",
+ "source_node": "pi2"
+ },
+ "rename": {
+ "name": "/pi2/aos"
+ }
+ }
+ ],
+ "nodes": [
+ {
+ "name": "pi1",
+ "hostname": "raspberrypi",
+ "port": 9971
+ },
+ {
+ "name": "pi2",
+ "hostname": "raspberrypi2",
+ "port": 9971
+ }
+ ]
+}
diff --git a/aos/events/logging/multinode_single_node.json b/aos/events/logging/multinode_single_node.json
new file mode 100644
index 0000000..d3a1090
--- /dev/null
+++ b/aos/events/logging/multinode_single_node.json
@@ -0,0 +1,33 @@
+{
+ "channels": [
+ {
+ "name": "/aos",
+ "type": "aos.logging.LogMessageFbs",
+ "source_node": "pi1",
+ "frequency": 200,
+ "num_senders": 20,
+ "max_size": 2048
+ },
+ {
+ "name": "/aos",
+ "type": "aos.timing.Report",
+ "source_node": "pi1",
+ "frequency": 50,
+ "num_senders": 20,
+ "max_size": 2048
+ },
+ {
+ "name": "/aos",
+ "type": "aos.logging.DynamicLogCommand",
+ "logger": "LOCAL_LOGGER",
+ "source_node": "pi1"
+ }
+ ],
+ "nodes": [
+ {
+ "name": "pi1",
+ "hostname": "raspberrypi",
+ "port": 9971
+ }
+ ]
+}
diff --git a/aos/events/logging/realtime_replay_test.cc b/aos/events/logging/realtime_replay_test.cc
index 8c7751e..4118550 100644
--- a/aos/events/logging/realtime_replay_test.cc
+++ b/aos/events/logging/realtime_replay_test.cc
@@ -363,7 +363,7 @@
&config_.message(), &replay_channels);
EXPECT_DEATH(
reader.RemapLoggedChannel<aos::examples::Ping>("/fake", "/original"),
- "which is not included in the replay channels passed to LogReader");
+ "which is not included in the replay channels passed to");
}
} // namespace aos::logger::testing
diff --git a/aos/events/logging/replay_channels.h b/aos/events/logging/replay_channels.h
new file mode 100644
index 0000000..f9ec144
--- /dev/null
+++ b/aos/events/logging/replay_channels.h
@@ -0,0 +1,15 @@
+#ifndef AOS_EVENTS_LOGGING_REPLAY_CHANNELS_H_
+#define AOS_EVENTS_LOGGING_REPLAY_CHANNELS_H_
+
+#include <string>
+#include <vector>
+
+namespace aos {
+namespace logger {
+// Vector of (channel name, channel type) pairs.
+using ReplayChannels = std::vector<std::pair<std::string, std::string>>;
+// Vector of channel indices.
+using ReplayChannelIndices = std::vector<size_t>;
+} // namespace logger
+} // namespace aos
+#endif // AOS_EVENTS_LOGGING_REPLAY_CHANNELS_H_
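
A short sketch of how ReplayChannels is meant to be used, inferred from the realtime_replay_test.cc change above; the channel pair and the LogReader overload taking a ReplayChannels pointer are assumptions based on that test, not shown in this file.

    #include "aos/events/logging/log_reader.h"
    #include "aos/events/logging/replay_channels.h"

    // Replay only the Ping channel; `logfiles` and `config` are assumed to be
    // in scope, as in the test above.
    aos::logger::ReplayChannels replay_channels{{"/test", "aos.examples.Ping"}};
    aos::logger::LogReader reader(logfiles, &config.message(), &replay_channels);
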
diff --git a/aos/events/logging/single_node_merge.cc b/aos/events/logging/single_node_merge.cc
index aee2c5e..0fa3567 100644
--- a/aos/events/logging/single_node_merge.cc
+++ b/aos/events/logging/single_node_merge.cc
@@ -21,7 +21,7 @@
int Main(int argc, char **argv) {
const LogFilesContainer log_files(SortParts(FindLogs(argc, argv)));
- const Configuration *config = log_files.config();
+ const Configuration *config = log_files.config().get();
// Haven't tested this on a single node log, and don't really see a need to
// right now. The higher layers just work.
@@ -37,8 +37,8 @@
// parts to not be from the same boot.
if (log_files.ContainsPartsForNode(node_name)) {
// Filter the parts relevant to each node when building the mapper.
- mappers.emplace_back(
- std::make_unique<TimestampMapper>(node_name, log_files));
+ mappers.emplace_back(std::make_unique<TimestampMapper>(
+ node_name, log_files, TimestampQueueStrategy::kQueueTogether));
if (node_name == FLAGS_node) {
node_mapper = mappers.back().get();
}
diff --git a/aos/events/logging/timestamp_extractor.cc b/aos/events/logging/timestamp_extractor.cc
index 0b22f93..3cf96f6 100644
--- a/aos/events/logging/timestamp_extractor.cc
+++ b/aos/events/logging/timestamp_extractor.cc
@@ -6,8 +6,8 @@
#include "aos/events/logging/logfile_sorting.h"
#include "aos/events/logging/logfile_utils.h"
+#include "aos/events/logging/logfile_validator.h"
#include "aos/init.h"
-#include "aos/network/multinode_timestamp_filter.h"
DECLARE_bool(timestamps_to_csv);
DEFINE_bool(skip_order_validation, false,
@@ -15,135 +15,9 @@
namespace aos::logger {
-namespace chrono = std::chrono;
-
int Main(int argc, char **argv) {
const LogFilesContainer log_files(SortParts(FindLogs(argc, argv)));
- const Configuration *config = log_files.config();
-
- CHECK(configuration::MultiNode(config))
- << ": Timestamps only make sense in a multi-node world.";
-
- // Now, build up all the TimestampMapper classes to read and sort the data.
- std::vector<std::unique_ptr<TimestampMapper>> mappers;
-
- for (const Node *node : configuration::GetNodes(config)) {
- auto node_name = MaybeNodeName(node);
- // Confirm that all the parts are from the same boot if there are enough
- // parts to not be from the same boot.
- if (log_files.ContainsPartsForNode(node_name)) {
- // Filter the parts relevant to each node when building the mapper.
- mappers.emplace_back(
- std::make_unique<TimestampMapper>(node_name, log_files));
- } else {
- mappers.emplace_back(nullptr);
- }
- }
-
- // Now, build up the estimator used to solve for time.
- message_bridge::MultiNodeNoncausalOffsetEstimator multinode_estimator(
- config, config, log_files.boots(), FLAGS_skip_order_validation,
- chrono::seconds(0));
- multinode_estimator.set_reboot_found(
- [config](distributed_clock::time_point reboot_time,
- const std::vector<logger::BootTimestamp> &node_times) {
- LOG(INFO) << "Rebooted at distributed " << reboot_time;
- size_t node_index = 0;
- for (const logger::BootTimestamp &time : node_times) {
- LOG(INFO) << " "
- << config->nodes()->Get(node_index)->name()->string_view()
- << " " << time;
- ++node_index;
- }
- });
-
- {
- std::vector<TimestampMapper *> timestamp_mappers;
- for (std::unique_ptr<TimestampMapper> &mapper : mappers) {
- timestamp_mappers.emplace_back(mapper.get());
- }
- multinode_estimator.SetTimestampMappers(std::move(timestamp_mappers));
- }
-
- // To make things more like the logger and faster, cache the node + channel ->
- // filter mapping in a set of vectors.
- std::vector<std::vector<message_bridge::NoncausalOffsetEstimator *>> filters;
- filters.resize(configuration::NodesCount(config));
-
- for (const Node *node : configuration::GetNodes(config)) {
- const size_t node_index = configuration::GetNodeIndex(config, node);
- filters[node_index].resize(config->channels()->size(), nullptr);
- for (size_t channel_index = 0; channel_index < config->channels()->size();
- ++channel_index) {
- const Channel *channel = config->channels()->Get(channel_index);
-
- if (!configuration::ChannelIsSendableOnNode(channel, node) &&
- configuration::ChannelIsReadableOnNode(channel, node)) {
- // We've got a message which is being forwarded to this node.
- const Node *source_node = configuration::GetNode(
- config, channel->source_node()->string_view());
- filters[node_index][channel_index] =
- multinode_estimator.GetFilter(node, source_node);
- }
- }
- }
-
- multinode_estimator.CheckGraph();
-
- // Now, read all the timestamps for each node. This is simpler than the
- // logger on purpose. It loads in *all* the timestamps in 1 go per node,
- // ignoring memory usage.
- for (const Node *node : configuration::GetNodes(config)) {
- LOG(INFO) << "Reading all data for " << node->name()->string_view();
- const size_t node_index = configuration::GetNodeIndex(config, node);
- TimestampMapper *timestamp_mapper = mappers[node_index].get();
- if (timestamp_mapper == nullptr) {
- continue;
- }
- while (true) {
- TimestampedMessage *m = timestamp_mapper->Front();
- if (m == nullptr) {
- break;
- }
- timestamp_mapper->PopFront();
- }
- }
-
- // Don't get clever. Use the first time as the start time. Note: this is
- // different than how log_cat and others work.
- std::optional<const std::tuple<distributed_clock::time_point,
- std::vector<BootTimestamp>> *>
- next_timestamp = multinode_estimator.QueueNextTimestamp();
- CHECK(next_timestamp);
- LOG(INFO) << "Starting at:";
- for (const Node *node : configuration::GetNodes(config)) {
- const size_t node_index = configuration::GetNodeIndex(config, node);
- LOG(INFO) << " " << node->name()->string_view() << " -> "
- << std::get<1>(*next_timestamp.value())[node_index].time;
- }
-
- std::vector<monotonic_clock::time_point> just_monotonic(
- std::get<1>(*next_timestamp.value()).size());
- for (size_t i = 0; i < just_monotonic.size(); ++i) {
- CHECK_EQ(std::get<1>(*next_timestamp.value())[i].boot, 0u);
- just_monotonic[i] = std::get<1>(*next_timestamp.value())[i].time;
- }
- multinode_estimator.Start(just_monotonic);
-
- // As we pull off all the timestamps, the time problem is continually solved,
- // filling in the CSV files.
- while (true) {
- std::optional<const std::tuple<distributed_clock::time_point,
- std::vector<BootTimestamp>> *>
- next_timestamp = multinode_estimator.QueueNextTimestamp();
- if (!next_timestamp) {
- break;
- }
- multinode_estimator.ObserveTimePassed(std::get<0>(*next_timestamp.value()));
- }
-
- LOG(INFO) << "Done";
-
+ CHECK(MultiNodeLogIsReadable(log_files, FLAGS_skip_order_validation));
return 0;
}
diff --git a/aos/events/ping.cc b/aos/events/ping.cc
index 83a066b..975f2e4 100644
--- a/aos/events/ping.cc
+++ b/aos/events/ping.cc
@@ -11,6 +11,7 @@
int main(int argc, char **argv) {
aos::InitGoogle(&argc, &argv);
+ aos::EventLoop::SetDefaultVersionString("ping_version");
aos::FlatbufferDetachedBuffer<aos::Configuration> config =
aos::configuration::ReadConfig(FLAGS_config);
diff --git a/aos/events/ping_lib.cc b/aos/events/ping_lib.cc
index a7a8dd4..56dde2f 100644
--- a/aos/events/ping_lib.cc
+++ b/aos/events/ping_lib.cc
@@ -7,7 +7,7 @@
#include "aos/events/pong_generated.h"
#include "aos/json_to_flatbuffer.h"
-DEFINE_int32(sleep_ms, 10, "Time to sleep between pings");
+DEFINE_int32(sleep_us, 10000, "Time to sleep between pings");
namespace aos {
@@ -26,7 +26,7 @@
event_loop_->OnRun([this]() {
timer_handle_->Schedule(event_loop_->monotonic_now(),
- chrono::milliseconds(FLAGS_sleep_ms));
+ chrono::microseconds(FLAGS_sleep_us));
});
event_loop_->SetRuntimeRealtimePriority(5);
@@ -35,7 +35,7 @@
void Ping::SendPing() {
if (last_pong_value_ != count_ && (!quiet_ || VLOG_IS_ON(1))) {
LOG(WARNING) << "Did not receive response to " << count_ << " within "
- << FLAGS_sleep_ms << "ms.";
+ << FLAGS_sleep_us << "us.";
}
++count_;
aos::Sender<examples::Ping>::Builder builder = sender_.MakeBuilder();
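
One migration note on the renamed flag, with the equivalence implied by the defaults above:

    // The default period is unchanged by the rename:
    //   before: ping --sleep_ms=10
    //   after:  ping --sleep_us=10000
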
diff --git a/aos/events/pong.cc b/aos/events/pong.cc
index 6a22b6b..9162567 100644
--- a/aos/events/pong.cc
+++ b/aos/events/pong.cc
@@ -11,6 +11,7 @@
int main(int argc, char **argv) {
aos::InitGoogle(&argc, &argv);
+ aos::EventLoop::SetDefaultVersionString("pong_version");
aos::FlatbufferDetachedBuffer<aos::Configuration> config =
aos::configuration::ReadConfig(FLAGS_config);
diff --git a/aos/events/pong_lib.cc b/aos/events/pong_lib.cc
index 93a551e..e8bf171 100644
--- a/aos/events/pong_lib.cc
+++ b/aos/events/pong_lib.cc
@@ -6,27 +6,45 @@
#include "aos/events/ping_generated.h"
#include "aos/events/pong_generated.h"
+DEFINE_bool(fetch, false, "Poll & fetch messages instead of using a watcher.");
+DEFINE_uint32(fetch_period_ms, 10, "Period in milliseconds at which to fetch.");
+
namespace aos {
Pong::Pong(EventLoop *event_loop)
: event_loop_(event_loop),
+ fetcher_(event_loop_->MakeFetcher<examples::Ping>("/test")),
sender_(event_loop_->MakeSender<examples::Pong>("/test")) {
- event_loop_->MakeWatcher("/test", [this](const examples::Ping &ping) {
- if (last_value_ == ping.value() && (!quiet_ || VLOG_IS_ON(1))) {
- LOG(WARNING) << "Duplicate ping value at " << last_value_
- << " time difference " << ping.send_time() - last_send_time_;
- }
- last_value_ = ping.value();
- last_send_time_ = ping.send_time();
- aos::Sender<examples::Pong>::Builder builder = sender_.MakeBuilder();
- examples::Pong::Builder pong_builder =
- builder.MakeBuilder<examples::Pong>();
- pong_builder.add_value(ping.value());
- pong_builder.add_initial_send_time(ping.send_time());
- builder.CheckOk(builder.Send(pong_builder.Finish()));
- });
+ if (FLAGS_fetch) {
+ event_loop_
+ ->AddPhasedLoop(
+ [this](int) {
+ while (fetcher_.FetchNext()) {
+ HandlePing(*fetcher_.get());
+ }
+ },
+ std::chrono::milliseconds(FLAGS_fetch_period_ms))
+ ->set_name("pong");
+ } else {
+ event_loop_->MakeWatcher(
+ "/test", [this](const examples::Ping &ping) { HandlePing(ping); });
+ }
event_loop_->SetRuntimeRealtimePriority(5);
}
+void Pong::HandlePing(const examples::Ping &ping) {
+ if (last_value_ == ping.value() && (!quiet_ || VLOG_IS_ON(1))) {
+ LOG(WARNING) << "Duplicate ping value at " << last_value_
+ << " time difference " << ping.send_time() - last_send_time_;
+ }
+ last_value_ = ping.value();
+ last_send_time_ = ping.send_time();
+ aos::Sender<examples::Pong>::Builder builder = sender_.MakeBuilder();
+ examples::Pong::Builder pong_builder = builder.MakeBuilder<examples::Pong>();
+ pong_builder.add_value(ping.value());
+ pong_builder.add_initial_send_time(ping.send_time());
+ builder.CheckOk(builder.Send(pong_builder.Finish()));
+}
+
} // namespace aos
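
A hedged sketch of driving the new polling path. The event-loop setup mirrors pong.cc above, `config` is assumed to come from aos::configuration::ReadConfig, and the flag names are the DEFINE_bool/DEFINE_uint32 lines in this file.

    // Run Pong in polling mode, e.g.:  pong --fetch --fetch_period_ms=5
    aos::ShmEventLoop event_loop(&config.message());
    aos::Pong pong(&event_loop);  // Picks watcher vs. phased loop from FLAGS_fetch.
    event_loop.Run();
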
diff --git a/aos/events/pong_lib.h b/aos/events/pong_lib.h
index 8b6641a..a12dad0 100644
--- a/aos/events/pong_lib.h
+++ b/aos/events/pong_lib.h
@@ -15,7 +15,9 @@
void set_quiet(bool quiet) { quiet_ = quiet; }
private:
+ void HandlePing(const examples::Ping &ping);
EventLoop *event_loop_;
+ aos::Fetcher<examples::Ping> fetcher_;
aos::Sender<examples::Pong> sender_;
int32_t last_value_ = 0;
int32_t last_send_time_ = 0;
diff --git a/aos/events/shm_event_loop.cc b/aos/events/shm_event_loop.cc
index 8523cdf..3126265 100644
--- a/aos/events/shm_event_loop.cc
+++ b/aos/events/shm_event_loop.cc
@@ -38,8 +38,6 @@
} // namespace
-DEFINE_string(shm_base, "/dev/shm/aos",
- "Directory to place queue backing mmaped files in.");
// This value is affected by the umask of the process which is calling it
// and is set to the user's value by default (check yours running `umask` on
// the command line).
@@ -59,10 +57,6 @@
using namespace shm_event_loop_internal;
-void SetShmBase(const std::string_view base) {
- FLAGS_shm_base = std::string(base) + "/aos";
-}
-
namespace {
const Node *MaybeMyNode(const Configuration *configuration) {
@@ -165,14 +159,16 @@
}
}
- bool FetchNext() {
+ bool FetchNext() { return FetchNextIf(std::ref(should_fetch_)); }
+
+ bool FetchNextIf(std::function<bool(const Context &)> fn) {
const ipc_lib::LocklessQueueReader::Result read_result =
- DoFetch(actual_queue_index_);
+ DoFetch(actual_queue_index_, std::move(fn));
return read_result == ipc_lib::LocklessQueueReader::Result::GOOD;
}
- bool Fetch() {
+ bool FetchIf(std::function<bool(const Context &)> fn) {
const ipc_lib::QueueIndex queue_index = reader_.LatestIndex();
// actual_queue_index_ is only meaningful if it was set by Fetch or
// FetchNext. This happens when valid_data_ has been set. So, only
@@ -187,7 +183,7 @@
}
const ipc_lib::LocklessQueueReader::Result read_result =
- DoFetch(queue_index);
+ DoFetch(queue_index, std::move(fn));
CHECK(read_result != ipc_lib::LocklessQueueReader::Result::NOTHING_NEW)
<< ": Queue index went backwards. This should never happen. "
@@ -196,6 +192,8 @@
return read_result == ipc_lib::LocklessQueueReader::Result::GOOD;
}
+ bool Fetch() { return FetchIf(std::ref(should_fetch_)); }
+
Context context() const { return context_; }
bool RegisterWakeup(int priority) {
@@ -229,7 +227,8 @@
private:
ipc_lib::LocklessQueueReader::Result DoFetch(
- ipc_lib::QueueIndex queue_index) {
+ ipc_lib::QueueIndex queue_index,
+ std::function<bool(const Context &context)> fn) {
// TODO(austin): Get behind and make sure it dies.
char *copy_buffer = nullptr;
if (copy_data()) {
@@ -239,8 +238,7 @@
queue_index.index(), &context_.monotonic_event_time,
&context_.realtime_event_time, &context_.monotonic_remote_time,
&context_.realtime_remote_time, &context_.remote_queue_index,
- &context_.source_boot_uuid, &context_.size, copy_buffer,
- std::ref(should_fetch_));
+ &context_.source_boot_uuid, &context_.size, copy_buffer, std::move(fn));
if (read_result == ipc_lib::LocklessQueueReader::Result::GOOD) {
if (pin_data()) {
@@ -336,9 +334,7 @@
Context context_;
// Pre-allocated should_fetch function so we don't allocate.
- std::function<bool(const Context &)> should_fetch_ = [](const Context &) {
- return true;
- };
+ const std::function<bool(const Context &)> should_fetch_;
};
class ShmFetcher : public RawFetcher {
@@ -364,6 +360,16 @@
return std::make_pair(false, monotonic_clock::min_time);
}
+ std::pair<bool, monotonic_clock::time_point> DoFetchNextIf(
+ std::function<bool(const Context &context)> fn) override {
+ shm_event_loop()->CheckCurrentThread();
+ if (simple_shm_fetcher_.FetchNextIf(std::move(fn))) {
+ context_ = simple_shm_fetcher_.context();
+ return std::make_pair(true, monotonic_clock::now());
+ }
+ return std::make_pair(false, monotonic_clock::min_time);
+ }
+
std::pair<bool, monotonic_clock::time_point> DoFetch() override {
shm_event_loop()->CheckCurrentThread();
if (simple_shm_fetcher_.Fetch()) {
@@ -373,6 +379,16 @@
return std::make_pair(false, monotonic_clock::min_time);
}
+ std::pair<bool, monotonic_clock::time_point> DoFetchIf(
+ std::function<bool(const Context &context)> fn) override {
+ shm_event_loop()->CheckCurrentThread();
+ if (simple_shm_fetcher_.FetchIf(std::move(fn))) {
+ context_ = simple_shm_fetcher_.context();
+ return std::make_pair(true, monotonic_clock::now());
+ }
+ return std::make_pair(false, monotonic_clock::min_time);
+ }
+
absl::Span<const char> GetPrivateMemory() const {
return simple_shm_fetcher_.GetPrivateMemory();
}
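
A hedged sketch of the conditional fetch these DoFetchIf/DoFetchNextIf overrides implement; the public Fetcher::FetchIf wrapper is assumed from the RawFetcher virtuals rather than shown in this diff.

    // Fetch the latest Ping only if it was sent at or before `deadline`. The
    // predicate runs on a metadata-only Context (data == nullptr), matching
    // the DoFetchIf implementations above.
    aos::Fetcher<aos::examples::Ping> fetcher =
        event_loop->MakeFetcher<aos::examples::Ping>("/test");
    const aos::monotonic_clock::time_point deadline = event_loop->monotonic_now();
    const bool fetched = fetcher.FetchIf([&](const aos::Context &context) {
      return context.monotonic_event_time <= deadline;
    });
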
diff --git a/aos/events/shm_event_loop.h b/aos/events/shm_event_loop.h
index 7b7e68f..0e71f96 100644
--- a/aos/events/shm_event_loop.h
+++ b/aos/events/shm_event_loop.h
@@ -8,11 +8,11 @@
#include "aos/events/epoll.h"
#include "aos/events/event_loop.h"
#include "aos/events/event_loop_generated.h"
+#include "aos/ipc_lib/shm_base.h"
#include "aos/ipc_lib/signalfd.h"
#include "aos/stl_mutex/stl_mutex.h"
DECLARE_string(application_name);
-DECLARE_string(shm_base);
namespace aos {
namespace shm_event_loop_internal {
diff --git a/aos/events/shm_event_loop.rs b/aos/events/shm_event_loop.rs
index 3e387ef..880f72b 100644
--- a/aos/events/shm_event_loop.rs
+++ b/aos/events/shm_event_loop.rs
@@ -233,7 +233,7 @@
use aos_configuration::read_config_from;
use aos_events_event_loop_runtime::{Sender, Watcher};
- use aos_init::test_init;
+ use aos_test_init::test_init;
use ping_rust_fbs::aos::examples as ping;
use std::sync::atomic::{AtomicUsize, Ordering};
use std::sync::Barrier;
diff --git a/aos/events/simulated_event_loop.cc b/aos/events/simulated_event_loop.cc
index 7e469f5..7b84380 100644
--- a/aos/events/simulated_event_loop.cc
+++ b/aos/events/simulated_event_loop.cc
@@ -393,6 +393,11 @@
~SimulatedFetcher() { simulated_channel_->UnregisterFetcher(this); }
std::pair<bool, monotonic_clock::time_point> DoFetchNext() override {
+ return DoFetchNextIf(std::function<bool(const Context &context)>());
+ }
+
+ std::pair<bool, monotonic_clock::time_point> DoFetchNextIf(
+ std::function<bool(const Context &context)> fn) override {
// The allocations in here are due to infrastructure and don't count in the
// no mallocs in RT code.
ScopedNotRealtime nrt;
@@ -404,12 +409,27 @@
<< configuration::StrippedChannelToString(
simulated_channel_->channel());
- SetMsg(msgs_.front());
+ if (fn) {
+ Context context = msgs_.front()->context;
+ context.data = nullptr;
+ context.buffer_index = -1;
+
+ if (!fn(context)) {
+ return std::make_pair(false, monotonic_clock::min_time);
+ }
+ }
+
+ SetMsg(std::move(msgs_.front()));
msgs_.pop_front();
return std::make_pair(true, event_loop()->monotonic_now());
}
std::pair<bool, monotonic_clock::time_point> DoFetch() override {
+ return DoFetchIf(std::function<bool(const Context &context)>());
+ }
+
+ std::pair<bool, monotonic_clock::time_point> DoFetchIf(
+ std::function<bool(const Context &context)> fn) override {
// The allocations in here are due to infrastructure and don't count in the
// no mallocs in RT code.
ScopedNotRealtime nrt;
@@ -417,13 +437,35 @@
// TODO(austin): Can we just do this logic unconditionally? It is a lot
// simpler. And call clear, obviously.
if (!msg_ && simulated_channel_->latest_message()) {
- SetMsg(simulated_channel_->latest_message());
+ std::shared_ptr<SimulatedMessage> latest_message =
+ simulated_channel_->latest_message();
+
+ if (fn) {
+ Context context = latest_message->context;
+ context.data = nullptr;
+ context.buffer_index = -1;
+
+ if (!fn(context)) {
+ return std::make_pair(false, monotonic_clock::min_time);
+ }
+ }
+ SetMsg(std::move(latest_message));
return std::make_pair(true, event_loop()->monotonic_now());
} else {
return std::make_pair(false, monotonic_clock::min_time);
}
}
+ if (fn) {
+ Context context = msgs_.back()->context;
+ context.data = nullptr;
+ context.buffer_index = -1;
+
+ if (!fn(context)) {
+ return std::make_pair(false, monotonic_clock::min_time);
+ }
+ }
+
// We've had a message enqueued, so we don't need to go looking for the
// latest message from before we started.
SetMsg(msgs_.back());
@@ -1300,7 +1342,7 @@
new NodeEventLoopFactory(&scheduler_scheduler_, this, node));
}
- if (configuration::MultiNode(configuration)) {
+ if (configuration::NodesCount(configuration) > 1u) {
bridge_ = std::make_unique<message_bridge::SimulatedMessageBridge>(this);
}
}
diff --git a/aos/events/simulated_event_loop.rs b/aos/events/simulated_event_loop.rs
index ea3a154..90c4c86 100644
--- a/aos/events/simulated_event_loop.rs
+++ b/aos/events/simulated_event_loop.rs
@@ -143,7 +143,7 @@
use runfiles::Runfiles;
use aos_configuration::read_config_from;
- use aos_init::test_init;
+ use aos_test_init::test_init;
use ping_rust_fbs::aos::examples::PingBuilder;
// A really basic test of the functionality here.
diff --git a/aos/events/simulated_event_loop_test.cc b/aos/events/simulated_event_loop_test.cc
index 0afa22e..23badae 100644
--- a/aos/events/simulated_event_loop_test.cc
+++ b/aos/events/simulated_event_loop_test.cc
@@ -2101,6 +2101,18 @@
"All ExitHandles must be destroyed before the factory");
}
+// Test that AllowApplicationCreationDuring can't happen in OnRun callbacks.
+TEST(SimulatedEventLoopDeathTest, AllowApplicationCreationDuringInOnRun) {
+ aos::FlatbufferDetachedBuffer<aos::Configuration> config =
+ aos::configuration::ReadConfig(
+ ArtifactPath("aos/events/multinode_pingpong_test_split_config.json"));
+ auto factory = std::make_unique<SimulatedEventLoopFactory>(&config.message());
+ NodeEventLoopFactory *pi1 = factory->GetNodeEventLoopFactory("pi1");
+ std::unique_ptr<EventLoop> loop = pi1->MakeEventLoop("foo");
+ loop->OnRun([&]() { factory->AllowApplicationCreationDuring([]() {}); });
+ EXPECT_DEATH(factory->RunFor(chrono::seconds(1)), "OnRun");
+}
+
// Tests that messages don't survive a reboot of a node.
TEST(SimulatedEventLoopTest, ChannelClearedOnReboot) {
aos::FlatbufferDetachedBuffer<aos::Configuration> config =
diff --git a/aos/events/timing_report_dump_lib.cc b/aos/events/timing_report_dump_lib.cc
index 86936f1..15a8eb9 100644
--- a/aos/events/timing_report_dump_lib.cc
+++ b/aos/events/timing_report_dump_lib.cc
@@ -181,10 +181,16 @@
LOG(INFO) << "Failed to send " << report.send_failures()
<< " timing report(s) in " << report.name()->string_view();
}
+ std::string version_string;
+ if (report.has_version()) {
+ version_string =
+ absl::StrCat("version: \"", report.version()->string_view(), "\" ");
+ }
std::cout << report.name()->string_view() << "[" << report.pid() << "] ("
- << MaybeNodeName("", event_loop_->node()) << ") ("
- << event_loop_->context().monotonic_event_time << ","
+ << MaybeNodeName("", event_loop_->node()) << ") " << version_string
+ << "(" << event_loop_->context().monotonic_event_time << ","
<< event_loop_->context().realtime_event_time << "):" << std::endl;
+
if (report.has_watchers() && report.watchers()->size() > 0) {
PrintWatchers(&std::cout, *report.watchers());
}
@@ -314,6 +320,9 @@
if (accumulated_statistics_.count(map_key) == 0) {
accumulated_statistics_[map_key].name = raw_report.name()->str();
accumulated_statistics_[map_key].pid = raw_report.pid();
+ if (raw_report.has_version()) {
+ accumulated_statistics_[map_key].version = raw_report.version()->str();
+ }
}
timing::ReportT report;
diff --git a/aos/init.rs b/aos/init.rs
index 9ac62f1..13066f1 100644
--- a/aos/init.rs
+++ b/aos/init.rs
@@ -8,19 +8,7 @@
generate!("aos::InitFromRust")
);
-/// Initializes things for a test.
-///
-/// TODO(Brian): Should we provide a proc macro attribute that handles calling this?
-///
-/// # Panics
-///
-/// Panics if non-test initialization has already been performed.
-pub fn test_init() {
- init();
- // TODO(Brian): Do we want any of the other stuff that `:gtest_main` has?
- // TODO(Brian): Call `aos::SetShmBase` like `:gtest_main` does.
-}
-
+/// Initializes AOS.
pub fn init() {
static ONCE: Once = Once::new();
ONCE.call_once(|| {
diff --git a/aos/ipc_lib/BUILD b/aos/ipc_lib/BUILD
index 53d30cd..d67616f 100644
--- a/aos/ipc_lib/BUILD
+++ b/aos/ipc_lib/BUILD
@@ -233,6 +233,22 @@
],
)
+cc_library(
+ name = "lockless_queue_stepping",
+ testonly = True,
+ srcs = [
+ "lockless_queue_stepping.cc",
+ ],
+ hdrs = ["lockless_queue_stepping.h"],
+ deps = [
+ ":lockless_queue",
+ ":shm_observers",
+ "//aos/libc:aos_strsignal",
+ "//aos/testing:prevent_exit",
+ "@com_google_googletest//:gtest",
+ ],
+)
+
cc_test(
name = "lockless_queue_test",
timeout = "eternal",
@@ -242,11 +258,11 @@
deps = [
":event",
":lockless_queue",
+ ":lockless_queue_stepping",
":queue_racer",
":signalfd",
"//aos/events:epoll",
"//aos/testing:googletest",
- "//aos/testing:prevent_exit",
"//aos/util:phased_loop",
],
)
@@ -258,6 +274,7 @@
deps = [
":event",
":lockless_queue",
+ ":lockless_queue_stepping",
":queue_racer",
":shm_observers",
":signalfd",
@@ -404,3 +421,14 @@
"shm_observers.h",
],
)
+
+cc_library(
+ name = "shm_base",
+ srcs = ["shm_base.cc"],
+ hdrs = ["shm_base.h"],
+ target_compatible_with = ["@platforms//os:linux"],
+ visibility = ["//visibility:public"],
+ deps = [
+ "@com_github_gflags_gflags//:gflags",
+ ],
+)
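
A hedged usage sketch of the extracted target: SetShmBase() and the --shm_base flag are assumed to have moved verbatim from shm_event_loop.cc (see the lines removed from it above) into shm_base.h/.cc.

    #include "aos/ipc_lib/shm_base.h"

    // Redirect shared-memory queue backing files, e.g. in a test:
    aos::SetShmBase("/tmp/my_test");  // Leaves FLAGS_shm_base == "/tmp/my_test/aos".
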
diff --git a/aos/ipc_lib/lockless_queue.cc b/aos/ipc_lib/lockless_queue.cc
index 01706ba..fd3a305 100644
--- a/aos/ipc_lib/lockless_queue.cc
+++ b/aos/ipc_lib/lockless_queue.cc
@@ -1,6 +1,7 @@
#include "aos/ipc_lib/lockless_queue.h"
#include <linux/futex.h>
+#include <pwd.h>
#include <sys/types.h>
#include <syscall.h>
#include <unistd.h>
@@ -647,7 +648,20 @@
// redo initialization.
memory->initialized = true;
} else {
- CHECK_EQ(uid, memory->uid) << ": UIDs must match for all processes";
+ if (memory->uid != uid) {
+ // Subsequent calls to getpwuid() overwrite the returned pointer, so pull
+ // the names we care about into strings first.
+ struct passwd const *user_pw = getpwuid(uid);
+ std::string user_username = user_pw->pw_name;
+ struct passwd const *memory_pw = getpwuid(memory->uid);
+ std::string memory_username = memory_pw->pw_name;
+ LOG(FATAL) << "Current user " << user_username << " (uid:" << uid << ") "
+ << "doesn't match shared memory user " << memory_username
+ << " (uid:" << memory->uid << "). "
+ << "Log in as " << memory_username
+ << " user to access this channel.";
+ }
}
return memory;
@@ -1269,19 +1283,20 @@
monotonic_clock::time_point *monotonic_remote_time,
realtime_clock::time_point *realtime_remote_time,
uint32_t *remote_queue_index, UUID *source_boot_uuid, size_t *length,
- char *data, std::function<bool(const Context &)> should_read) const {
- const size_t queue_size = memory_->queue_size();
+ char *data,
+ std::function<bool(const Context &)> should_read_callback) const {
+ const size_t queue_size = const_memory_->queue_size();
// Build up the QueueIndex.
const QueueIndex queue_index =
QueueIndex::Zero(queue_size).IncrementBy(uint32_queue_index);
// Read the message stored at the requested location.
- Index mi = memory_->LoadIndex(queue_index);
- const Message *m = memory_->GetMessage(mi);
+ Index mi = const_memory_->LoadIndex(queue_index);
+ const Message *m = const_memory_->GetMessage(mi);
while (true) {
- DCHECK(!CheckBothRedzones(memory_, m))
+ DCHECK(!CheckBothRedzones(const_memory_, m))
<< ": Invalid message found in shared memory";
// We need to confirm that the data doesn't change while we are reading it.
// Do that by first confirming that the message points to the queue index we
@@ -1299,7 +1314,7 @@
// Someone has re-used this message between when we pulled it out of the
// queue and when we grabbed its index. It is pretty hard to deduce
// what happened. Just try again.
- const Message *const new_m = memory_->GetMessage(queue_index);
+ const Message *const new_m = const_memory_->GetMessage(queue_index);
if (m != new_m) {
m = new_m;
VLOG(3) << "Retrying, m doesn't match";
@@ -1328,7 +1343,7 @@
// then they need to wait. They got ahead. Otherwise, they are
// asking for something crazy, like something before the beginning of
// the queue. Tell them that they are behind.
- if (uint32_queue_index < memory_->queue_size()) {
+ if (uint32_queue_index < const_memory_->queue_size()) {
VLOG(3) << "Near zero, " << std::hex << uint32_queue_index;
return Result::NOTHING_NEW;
} else {
@@ -1343,37 +1358,24 @@
// Then read the data out. Copy it all out to be deterministic and so we can
// make length be from either end.
- if (!should_read) {
- *monotonic_sent_time = m->header.monotonic_sent_time;
- *realtime_sent_time = m->header.realtime_sent_time;
- if (m->header.remote_queue_index == 0xffffffffu) {
- *remote_queue_index = queue_index.index();
- } else {
- *remote_queue_index = m->header.remote_queue_index;
- }
- *monotonic_remote_time = m->header.monotonic_remote_time;
- *realtime_remote_time = m->header.realtime_remote_time;
- *source_boot_uuid = m->header.source_boot_uuid;
- *length = m->header.length;
+ Context context;
+ context.monotonic_event_time = m->header.monotonic_sent_time;
+ context.realtime_event_time = m->header.realtime_sent_time;
+ context.monotonic_remote_time = m->header.monotonic_remote_time;
+ context.realtime_remote_time = m->header.realtime_remote_time;
+ context.queue_index = queue_index.index();
+ if (m->header.remote_queue_index == 0xffffffffu) {
+ context.remote_queue_index = context.queue_index;
} else {
- // Cache the header results so we don't modify the outputs unless the filter
- // function says "go".
- Context context;
- context.monotonic_event_time = m->header.monotonic_sent_time;
- context.realtime_event_time = m->header.realtime_sent_time;
- context.monotonic_remote_time = m->header.monotonic_remote_time;
- context.realtime_remote_time = m->header.realtime_remote_time;
- context.queue_index = queue_index.index();
- if (m->header.remote_queue_index == 0xffffffffu) {
- context.remote_queue_index = context.queue_index;
- } else {
- context.remote_queue_index = m->header.remote_queue_index;
- }
- context.source_boot_uuid = m->header.source_boot_uuid;
- context.size = m->header.length;
- context.data = nullptr;
- context.buffer_index = -1;
+ context.remote_queue_index = m->header.remote_queue_index;
+ }
+ context.source_boot_uuid = m->header.source_boot_uuid;
+ context.size = m->header.length;
+ context.data = nullptr;
+ context.buffer_index = -1;
+ // If the callback is provided, use it.
+ if (should_read_callback) {
// And finally, confirm that the message *still* points to the queue index
// we want. This means it didn't change out from under us. If something
// changed out from under us, we were reading it much too late in its
@@ -1390,24 +1392,22 @@
// We now know that the context is safe to use. See if we are supposed to
// take the message or not.
- if (!should_read(context)) {
+ if (!should_read_callback(context)) {
return Result::FILTERED;
}
-
- // And now take it.
- *monotonic_sent_time = context.monotonic_event_time;
- *realtime_sent_time = context.realtime_event_time;
- *remote_queue_index = context.remote_queue_index;
- *monotonic_remote_time = context.monotonic_remote_time;
- *realtime_remote_time = context.realtime_remote_time;
- *source_boot_uuid = context.source_boot_uuid;
- *length = context.size;
}
- if (data) {
- memcpy(data, m->data(memory_->message_data_size()),
- memory_->message_data_size());
- // Check again since we touched the message again.
+ // Read the data if requested.
+ if (data) {
+ memcpy(data, m->data(const_memory_->message_data_size()),
+ const_memory_->message_data_size());
+ }
+
+ // Now, confirm that nothing changed while we were reading by re-reading the
+ // queue index from the header. We only need to do this if we read anything
+ // new after the previous check above, which happens if we copied the data
+ // out or if we skipped the filtered-case check.
+ if (data || !should_read_callback) {
aos_compiler_memory_barrier();
const QueueIndex final_queue_index = m->header.queue_index.Load(queue_size);
if (final_queue_index != queue_index) {
@@ -1419,18 +1419,75 @@
}
}
+ // And now take it and make it visible to the user. By doing it here, we will
+ // never present partial or corrupted state to the user in the output
+ // pointers.
+ *monotonic_sent_time = context.monotonic_event_time;
+ *realtime_sent_time = context.realtime_event_time;
+ *remote_queue_index = context.remote_queue_index;
+ *monotonic_remote_time = context.monotonic_remote_time;
+ *realtime_remote_time = context.realtime_remote_time;
+ *source_boot_uuid = context.source_boot_uuid;
+ *length = context.size;
+
return Result::GOOD;
}
QueueIndex LocklessQueueReader::LatestIndex() const {
- const size_t queue_size = memory_->queue_size();
+ const size_t queue_size = const_memory_->queue_size();
- // There is only one interesting case. We need to know if the queue is empty.
- // That is done with a sentinel value. At worst, this will be off by one.
- const QueueIndex next_queue_index =
- memory_->next_queue_index.Load(queue_size);
- if (next_queue_index.valid()) {
- const QueueIndex current_queue_index = next_queue_index.DecrementBy(1u);
+ // There are 2 main cases. Either the next queue index is right, or it is
+ // behind by 1 and wrong. If nothing has been published, the next queue index
+ // will be the reserved "Invalid" value, otherwise it will point to the next
+ // place to write. We need to figure out if it is right or wrong, and if it
+ // is wrong, fix it. If we don't, a caller hammering on Read() until it
+ // returns NOTHING_NEW can observe the next message before LatestIndex()
+ // sees it, which means watchers and fetchers may disagree on when a message
+ // is published.
+ QueueIndex actual_next_queue_index =
+ const_memory_->next_queue_index.Load(queue_size);
+
+ // Handle the "nothing has been published" case by making next_queue_index
+ // point to the 0th index.
+ const QueueIndex next_queue_index = ZeroOrValid(actual_next_queue_index);
+
+ // This needs to synchronize with whoever the previous writer at this
+ // location was. Read what is there to see if the message has been published
+ // and next_queue_index is just behind.
+ Index to_replace = const_memory_->LoadIndex(next_queue_index);
+
+ // See if next_queue_index is consistent with the state of the queue. If it
+ // is not, try to atomically update next_queue_index in case the previous
+ // writer failed and retry.
+ if (to_replace.IsPlausible(next_queue_index)) {
+ // If next_queue_index ends up pointing to a message with a matching index,
+ // this is the value that next_queue_index needs to be updated to.
+ const QueueIndex incremented_queue_index = next_queue_index.Increment();
+
+ // We don't care about the result. It will either succeed, or we got
+ // beat in fixing it. The way the Send logic works, the pointer can never
+ // get more than 1 behind or the next send will repair it. So, if we fail,
+ // that means that someone else got there first and fixed it up (and
+ // potentially someone further continued to send).
+ //
+ // Both require no further action from us. Worst case, our next pointer
+ // will not point to the latest message, but there will always be a point after
+ // which the index can change. We just need a consistent snapshot where
+ // there is nothing in the queue that isn't accounted for by
+ // next_queue_index.
+ memory_->next_queue_index.CompareAndExchangeStrong(actual_next_queue_index,
+ incremented_queue_index);
+
+ VLOG(3) << "next_queue_index is lagging, fixed it. Found " << std::hex
+ << to_replace.get() << ", expected "
+ << next_queue_index.DecrementBy(queue_size).index();
+
+ actual_next_queue_index = incremented_queue_index;
+ }
+
+ if (actual_next_queue_index.valid()) {
+ const QueueIndex current_queue_index =
+ actual_next_queue_index.DecrementBy(1u);
return current_queue_index;
}
return QueueIndex::Invalid();
diff --git a/aos/ipc_lib/lockless_queue.h b/aos/ipc_lib/lockless_queue.h
index 2cafb48..01f2b7c 100644
--- a/aos/ipc_lib/lockless_queue.h
+++ b/aos/ipc_lib/lockless_queue.h
@@ -414,14 +414,15 @@
GOOD,
// The message is in the future and we haven't written it yet.
NOTHING_NEW,
- // There is a message, but should_read() returned false so we didn't fetch
- // it.
+ // There is a message, but should_read_callback() returned false so we
+ // didn't fetch it.
FILTERED,
// The message got overwritten while we were reading it.
OVERWROTE,
};
- LocklessQueueReader(LocklessQueue queue) : memory_(queue.const_memory()) {
+ LocklessQueueReader(LocklessQueue queue)
+ : memory_(queue.memory()), const_memory_(queue.const_memory()) {
queue.Initialize();
}
@@ -433,14 +434,14 @@
// message, but the filter function returned false, return FILTERED.
//
// data may be nullptr to indicate the data should not be copied.
- Result Read(uint32_t queue_index,
- monotonic_clock::time_point *monotonic_sent_time,
- realtime_clock::time_point *realtime_sent_time,
- monotonic_clock::time_point *monotonic_remote_time,
- realtime_clock::time_point *realtime_remote_time,
- uint32_t *remote_queue_index, UUID *source_boot_uuid,
- size_t *length, char *data,
- std::function<bool(const Context &context)> should_read) const;
+ Result Read(
+ uint32_t queue_index, monotonic_clock::time_point *monotonic_sent_time,
+ realtime_clock::time_point *realtime_sent_time,
+ monotonic_clock::time_point *monotonic_remote_time,
+ realtime_clock::time_point *realtime_remote_time,
+ uint32_t *remote_queue_index, UUID *source_boot_uuid, size_t *length,
+ char *data,
+ std::function<bool(const Context &context)> should_read_callback) const;
// Returns the index to the latest queue message. Returns empty_queue_index()
// if there are no messages in the queue. Do note that this index wraps if
@@ -448,7 +449,8 @@
QueueIndex LatestIndex() const;
private:
- const LocklessQueueMemory *const memory_;
+ LocklessQueueMemory *const memory_;
+ const LocklessQueueMemory *const_memory_;
};
// Returns the number of messages which are logically in the queue at a time.
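
To make the FILTERED result concrete, a hedged sketch of calling Read() with a should_read_callback. The out-parameter list follows the declaration above; `reader` (a LocklessQueueReader) and `index` are assumed to be set up elsewhere.

    aos::monotonic_clock::time_point monotonic_sent, monotonic_remote;
    aos::realtime_clock::time_point realtime_sent, realtime_remote;
    uint32_t remote_queue_index;
    aos::UUID source_boot_uuid;
    size_t length;
    const auto result = reader.Read(
        index, &monotonic_sent, &realtime_sent, &monotonic_remote,
        &realtime_remote, &remote_queue_index, &source_boot_uuid, &length,
        /*data=*/nullptr,  // Skip copying the payload.
        [](const aos::Context &context) { return context.size > 64; });
    if (result == aos::ipc_lib::LocklessQueueReader::Result::FILTERED) {
      // A message exists but the callback declined it; no outputs were written.
    }
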
diff --git a/aos/ipc_lib/lockless_queue_death_test.cc b/aos/ipc_lib/lockless_queue_death_test.cc
index c37dc63..2a95b31 100644
--- a/aos/ipc_lib/lockless_queue_death_test.cc
+++ b/aos/ipc_lib/lockless_queue_death_test.cc
@@ -22,10 +22,9 @@
#include "aos/ipc_lib/aos_sync.h"
#include "aos/ipc_lib/lockless_queue.h"
#include "aos/ipc_lib/lockless_queue_memory.h"
+#include "aos/ipc_lib/lockless_queue_stepping.h"
#include "aos/ipc_lib/shm_observers.h"
-#include "aos/libc/aos_strsignal.h"
#include "aos/realtime.h"
-#include "aos/testing/prevent_exit.h"
#include "aos/testing/test_logging.h"
namespace aos {
@@ -34,529 +33,9 @@
namespace chrono = ::std::chrono;
-#if ((defined(AOS_SANITIZER_address) || defined(AOS_SANITIZER_thread)) && \
- !defined(__clang__) && __GNUC__ <= 4 && __GNUC_MINOR__ <= 8) || \
- defined(__ARM_EABI__)
-// There are various reasons why we might not actually be able to do this
-// testing, but we still want to run the functions to test anything they test
-// other than shm robustness.
-//
-// GCC 4.8 has too old of a version of asan to allow SIGSEGV handler
-// chaining.
-//
-// GCC 4.8's tsan doesn't work with the code for calling the real sigaction to
-// work arounds its weirdness of the SIGTRAP handler.
-//
-// ARM doesn't have a nice way to do single-stepping, and I don't feel like
-// dealing with editing the next instruction to be a breakpoint and then
-// changing it back.
-#else
-
-// This code currently supports amd64 only, but it
-// shouldn't be hard to port to i386 (it should just be using the name for
-// only the smaller part of the flags register), but that's not tested, and
-// porting it to any other architecture is more work.
-// Currently, we skip doing anything exciting on arm (just run the code without
-// any robustness testing) and refuse to compile anywhere else.
-
-#define SIMPLE_ASSERT(condition, message) \
- do { \
- if (!(condition)) { \
- static const char kMessage[] = message "\n"; \
- if (write(STDERR_FILENO, kMessage, sizeof(kMessage) - 1) != \
- (sizeof(kMessage) - 1)) { \
- static const char kFailureMessage[] = "writing failed\n"; \
- __attribute__((unused)) int ignore = write( \
- STDERR_FILENO, kFailureMessage, sizeof(kFailureMessage) - 1); \
- } \
- abort(); \
- } \
- } while (false)
-
-// Array to track writes to memory, and make sure they happen in the right
-// order.
-class WritesArray {
- public:
- uintptr_t At(size_t location) const {
- SIMPLE_ASSERT(location < size_, "too far into writes array");
- return writes_[location];
- }
- void Add(uintptr_t pointer) {
- SIMPLE_ASSERT(size_ < kSize, "too many writes");
- writes_[size_++] = pointer;
- }
-
- size_t size() const { return size_; }
-
- private:
- static const size_t kSize = 20000;
-
- uintptr_t writes_[kSize];
- size_t size_ = 0;
-};
-
-enum class DieAtState {
- // No SEGVs should be happening.
- kDisabled,
- // SEGVs are fine. Normal operation.
- kRunning,
- // We are manipulating a mutex. No SEGVs should be happening.
- kWriting,
-};
-
-// What we exit with when we're exiting in the middle.
-const int kExitEarlyValue = 123;
-
-// We have to keep track of everything in a global variable because there's no
-// other way for the signal handlers to find it.
-struct GlobalState {
- // Pointer to the queue memory, and its size.
- void *lockless_queue_memory;
- size_t lockless_queue_memory_size;
-
- // Pointer to a second block of memory the same size. This (on purpose) has
- // the same size as lockless_queue_memory so we can point the robust mutexes
- // here.
- void *lockless_queue_memory_lock_backup;
-
- // Expected writes.
- const WritesArray *writes_in;
- // Actual writes.
- WritesArray *writes_out;
- // Location to die at, and how far we have gotten.
- size_t die_at, current_location;
- // State.
- DieAtState state;
-};
-::std::atomic<GlobalState *> global_state;
-
-// Returns true if the address is in the queue memory chunk.
-bool IsInLocklessQueueMemory(void *address) {
- GlobalState *my_global_state = global_state.load(::std::memory_order_relaxed);
- void *lockless_queue_memory = my_global_state->lockless_queue_memory;
- if (address < lockless_queue_memory) {
- return false;
- if (reinterpret_cast<uintptr_t>(address) >
- reinterpret_cast<uintptr_t>(lockless_queue_memory) +
- my_global_state->lockless_queue_memory_size)
- return false;
- }
- return true;
-}
-
-// Calls mprotect(2) for the entire shared memory region with the given prot.
-void ShmProtectOrDie(int prot) {
- GlobalState *my_global_state = global_state.load(::std::memory_order_relaxed);
- PCHECK(mprotect(my_global_state->lockless_queue_memory,
- my_global_state->lockless_queue_memory_size, prot) != -1)
- << ": mprotect(" << my_global_state->lockless_queue_memory << ", "
- << my_global_state->lockless_queue_memory_size << ", 0x" << std::hex
- << prot << ") failed";
-}
-
-// Checks a write into the queue and conditionally dies. Tracks the write.
-void HandleWrite(void *address) {
- GlobalState *my_global_state = global_state.load(::std::memory_order_relaxed);
- uintptr_t address_offset =
- reinterpret_cast<uintptr_t>(address) -
- reinterpret_cast<uintptr_t>(my_global_state->lockless_queue_memory);
- if (my_global_state->writes_in != nullptr) {
- SIMPLE_ASSERT(my_global_state->writes_in->At(
- my_global_state->current_location) == address_offset,
- "wrong write order");
- }
- if (my_global_state->writes_out != nullptr) {
- my_global_state->writes_out->Add(address_offset);
- }
- if (my_global_state->die_at != 0) {
- if (my_global_state->die_at == my_global_state->current_location) {
- _exit(kExitEarlyValue);
- }
- }
- ++my_global_state->current_location;
-}
-
-struct sigaction old_segv_handler, old_trap_handler;
-
-// Calls the original signal handler.
-bool CallChainedAction(const struct sigaction &action, int signal,
- siginfo_t *siginfo, void *context) {
- if (action.sa_handler == SIG_IGN || action.sa_handler == SIG_DFL) {
- return false;
- }
- if (action.sa_flags & SA_SIGINFO) {
- action.sa_sigaction(signal, siginfo, context);
- } else {
- action.sa_handler(signal);
- }
- return true;
-}
-
-void segv_handler(int signal, siginfo_t *siginfo, void *context_void) {
- GlobalState *my_global_state = global_state.load(::std::memory_order_relaxed);
- const int saved_errno = errno;
- SIMPLE_ASSERT(signal == SIGSEGV, "wrong signal for SIGSEGV handler");
-
- // Only process memory addresses in our shared memory block.
- if (!IsInLocklessQueueMemory(siginfo->si_addr)) {
- if (CallChainedAction(old_segv_handler, signal, siginfo, context_void)) {
- errno = saved_errno;
- return;
- } else {
- SIMPLE_ASSERT(false, "actual SIGSEGV");
- }
- }
- SIMPLE_ASSERT(my_global_state->state == DieAtState::kRunning,
- "bad state for SIGSEGV");
-
- HandleWrite(siginfo->si_addr);
-
- ShmProtectOrDie(PROT_READ | PROT_WRITE);
- my_global_state->state = DieAtState::kWriting;
- errno = saved_errno;
-
-#if defined(__x86_64__)
- __asm__ __volatile__("int $3" ::: "memory", "cc");
-#elif defined(__aarch64__)
- __asm__ __volatile__("brk #0" ::: "memory", "cc");
-#else
-#error Unhandled architecture
-#endif
-}
-
-// A mutex lock is about to happen. Mark the memory rw, and check to see if we
-// should die.
-void futex_before(void *address, bool) {
- GlobalState *my_global_state = global_state.load(::std::memory_order_relaxed);
- if (IsInLocklessQueueMemory(address)) {
- assert(my_global_state->state == DieAtState::kRunning);
- HandleWrite(address);
- ShmProtectOrDie(PROT_READ | PROT_WRITE);
- my_global_state->state = DieAtState::kWriting;
- }
-}
-
-// The SEGV handler has set a breakpoint 1 instruction in the future. This
-// clears it, marks memory readonly, and continues.
-void trap_handler(int signal, siginfo_t *, void * /*context*/) {
- GlobalState *my_global_state = global_state.load(::std::memory_order_relaxed);
- const int saved_errno = errno;
- SIMPLE_ASSERT(signal == SIGTRAP, "wrong signal for SIGTRAP handler");
-
- my_global_state->state = DieAtState::kWriting;
- SIMPLE_ASSERT(my_global_state->state == DieAtState::kWriting,
- "bad state for SIGTRAP");
- ShmProtectOrDie(PROT_READ);
- my_global_state->state = DieAtState::kRunning;
- errno = saved_errno;
-}
-
-// We have a manual trap for mutexes. Check to see if we were supposed to die
-// on this write (the compare/exchange for the mutex), and mark the memory ro
-// again.
-void futex_after(void *address, bool) {
- GlobalState *my_global_state = global_state.load(::std::memory_order_relaxed);
- if (IsInLocklessQueueMemory(address)) {
- assert(my_global_state->state == DieAtState::kWriting);
- ShmProtectOrDie(PROT_READ);
- my_global_state->state = DieAtState::kRunning;
- }
-}
-
-// Installs the signal handler.
-void InstallHandler(int signal, void (*handler)(int, siginfo_t *, void *),
- struct sigaction *old_action) {
- struct sigaction action;
- memset(&action, 0, sizeof(action));
- action.sa_sigaction = handler;
- // We don't do a full normal signal handler exit with ptrace, so SA_NODEFER is
- // necessary to keep our signal handler active.
- action.sa_flags = SA_RESTART | SA_SIGINFO | SA_NODEFER;
-#ifdef AOS_SANITIZER_thread
- // Tsan messes with signal handlers to check for race conditions, and it
- // causes problems, so we have to work around it for SIGTRAP.
- if (signal == SIGTRAP) {
- typedef int (*SigactionType)(int, const struct sigaction *,
- struct sigaction *);
- SigactionType real_sigaction =
- reinterpret_cast<SigactionType>(dlsym(RTLD_NEXT, "sigaction"));
- if (sigaction == real_sigaction) {
- LOG(WARNING) << "failed to work around tsan signal handling weirdness";
- }
- PCHECK(real_sigaction(signal, &action, old_action) == 0);
- return;
- }
-#endif
- PCHECK(sigaction(signal, &action, old_action) == 0);
-}
-
-// gtest only allows creating fatal failures in functions returning void...
-// status is from wait(2).
-void DetectFatalFailures(int status) {
- if (WIFEXITED(status)) {
- FAIL() << " child returned status "
- << ::std::to_string(WEXITSTATUS(status));
- } else if (WIFSIGNALED(status)) {
- FAIL() << " child exited because of signal "
- << aos_strsignal(WTERMSIG(status));
- } else {
- FAIL() << " child exited with status " << ::std::hex << status;
- }
-}
-
-// Returns true if it runs all the way through.
-bool RunFunctionDieAt(::std::function<void(void *)> prepare,
- ::std::function<void(void *)> function,
- bool *test_failure, size_t die_at, bool prepare_in_child,
- uintptr_t writable_offset, const WritesArray *writes_in,
- WritesArray *writes_out) {
- GlobalState *my_global_state = global_state.load(::std::memory_order_relaxed);
- my_global_state->writes_in = writes_in;
- my_global_state->writes_out = writes_out;
- my_global_state->die_at = die_at;
- my_global_state->current_location = 0;
- my_global_state->state = DieAtState::kDisabled;
-
- if (!prepare_in_child) prepare(my_global_state->lockless_queue_memory);
-
- const pid_t pid = fork();
- PCHECK(pid != -1) << ": fork() failed";
- if (pid == 0) {
- // Run the test.
- ::aos::testing::PreventExit();
-
- if (prepare_in_child) prepare(my_global_state->lockless_queue_memory);
-
- // Update the robust list offset.
- linux_code::ipc_lib::SetRobustListOffset(writable_offset);
- // Install a segv handler (to detect writes to the memory block), and a trap
- // handler so we can single step.
- InstallHandler(SIGSEGV, segv_handler, &old_segv_handler);
- InstallHandler(SIGTRAP, trap_handler, &old_trap_handler);
- CHECK_EQ(old_trap_handler.sa_handler, SIG_DFL);
- linux_code::ipc_lib::SetShmAccessorObservers(futex_before, futex_after);
-
- PCHECK(ptrace(PTRACE_TRACEME, 0, 0, 0) == 0);
- ShmProtectOrDie(PROT_READ);
- my_global_state->state = DieAtState::kRunning;
-
- function(my_global_state->lockless_queue_memory);
- my_global_state->state = DieAtState::kDisabled;
- ShmProtectOrDie(PROT_READ | PROT_WRITE);
- _exit(0);
- } else {
- // Annoying wrapper type because elf_gregset_t is an array, which C++
- // handles poorly.
- struct RestoreState {
- RestoreState(elf_gregset_t regs_in) {
- memcpy(regs, regs_in, sizeof(regs));
- }
- elf_gregset_t regs;
- };
- std::optional<RestoreState> restore_regs;
- bool pass_trap = false;
- // Wait until the child process dies.
- while (true) {
- int status;
- pid_t waited_on = waitpid(pid, &status, 0);
- if (waited_on == -1) {
- if (errno == EINTR) continue;
- PCHECK(false) << ": waitpid(" << pid << ", " << &status
- << ", 0) failed";
- }
- CHECK_EQ(waited_on, pid)
- << ": waitpid got child " << waited_on << " instead of " << pid;
- if (WIFSTOPPED(status)) {
- // The child was stopped via ptrace.
- const int stop_signal = WSTOPSIG(status);
- elf_gregset_t regs;
- {
- struct iovec iov;
- iov.iov_base = &regs;
- iov.iov_len = sizeof(regs);
- PCHECK(ptrace(PTRACE_GETREGSET, pid, NT_PRSTATUS, &iov) == 0);
- CHECK_EQ(iov.iov_len, sizeof(regs))
- << ": ptrace regset is the wrong size";
- }
- if (stop_signal == SIGSEGV) {
- // It's a SEGV, hopefully due to writing to the shared memory which is
- // marked read-only. We record the instruction that faulted so we can
- // look for it while single-stepping, then deliver the signal so the
- // child can mark it read-write and then poke us to single-step that
- // instruction.
-
- CHECK(!restore_regs)
- << ": Traced child got a SEGV while single-stepping";
- // Save all the registers to resume execution at the current location
- // in the child.
- restore_regs = RestoreState(regs);
- PCHECK(ptrace(PTRACE_CONT, pid, nullptr, SIGSEGV) == 0);
- continue;
- }
- if (stop_signal == SIGTRAP) {
- if (pass_trap) {
- // This is the new SIGTRAP we generated, which we just want to pass
- // through so the child's signal handler can restore the memory to
- // read-only
- PCHECK(ptrace(PTRACE_CONT, pid, nullptr, SIGTRAP) == 0);
- pass_trap = false;
- continue;
- }
- if (restore_regs) {
- // Restore the state we saved before delivering the SEGV, and then
- // single-step that one instruction.
- struct iovec iov;
- iov.iov_base = &restore_regs->regs;
- iov.iov_len = sizeof(restore_regs->regs);
- PCHECK(ptrace(PTRACE_SETREGSET, pid, NT_PRSTATUS, &iov) == 0);
- restore_regs = std::nullopt;
- PCHECK(ptrace(PTRACE_SINGLESTEP, pid, nullptr, nullptr) == 0);
- continue;
- }
- // We executed the single instruction that originally faulted, so
- // now deliver a SIGTRAP to the child so it can mark the memory
- // read-only again.
- pass_trap = true;
- PCHECK(kill(pid, SIGTRAP) == 0);
- PCHECK(ptrace(PTRACE_CONT, pid, nullptr, nullptr) == 0);
- continue;
- }
- LOG(FATAL) << "Traced child was stopped with unexpected signal: "
- << static_cast<int>(WSTOPSIG(status));
- }
- if (WIFEXITED(status)) {
- if (WEXITSTATUS(status) == 0) return true;
- if (WEXITSTATUS(status) == kExitEarlyValue) return false;
- }
- DetectFatalFailures(status);
- if (test_failure) *test_failure = true;
- return false;
- }
- }
-}
-
-bool RunFunctionDieAtAndCheck(const LocklessQueueConfiguration &config,
- ::std::function<void(void *)> prepare,
- ::std::function<void(void *)> function,
- ::std::function<void(void *)> check,
- bool *test_failure, size_t die_at,
- bool prepare_in_child,
- const WritesArray *writes_in,
- WritesArray *writes_out) {
- // Allocate shared memory.
- GlobalState *my_global_state = global_state.load(::std::memory_order_relaxed);
- my_global_state->lockless_queue_memory_size = LocklessQueueMemorySize(config);
- my_global_state->lockless_queue_memory = static_cast<void *>(
- mmap(nullptr, my_global_state->lockless_queue_memory_size,
- PROT_READ | PROT_WRITE, MAP_SHARED | MAP_ANONYMOUS, -1, 0));
- CHECK_NE(MAP_FAILED, my_global_state->lockless_queue_memory);
-
- // And the backup used to point the robust list at.
- my_global_state->lockless_queue_memory_lock_backup = static_cast<void *>(
- mmap(nullptr, my_global_state->lockless_queue_memory_size,
- PROT_READ | PROT_WRITE, MAP_SHARED | MAP_ANONYMOUS, -1, 0));
- CHECK_NE(MAP_FAILED, my_global_state->lockless_queue_memory_lock_backup);
-
- // The writable offset tells us how to convert from a pointer in the queue to
- // a pointer that is safe to write. This is so robust futexes don't spin the
- // kernel when the user maps a page PROT_READ, and the kernel tries to clear
- // the futex there.
- const uintptr_t writable_offset =
- reinterpret_cast<uintptr_t>(
- my_global_state->lockless_queue_memory_lock_backup) -
- reinterpret_cast<uintptr_t>(my_global_state->lockless_queue_memory);
-
- bool r;
- // Do the actual test in a new thread so any locked mutexes will be cleaned up
- // nicely with owner-died at the end.
- ::std::thread test_thread([&prepare, &function, &check, test_failure, die_at,
- prepare_in_child, writes_in, writes_out,
- writable_offset, &r]() {
- r = RunFunctionDieAt(prepare, function, test_failure, die_at,
- prepare_in_child, writable_offset, writes_in,
- writes_out);
- if (::testing::Test::HasFailure()) {
- r = false;
- if (test_failure) *test_failure = true;
- return;
- }
-
- check(
- global_state.load(::std::memory_order_relaxed)->lockless_queue_memory);
- });
- test_thread.join();
- return r;
-}
-
-// Tests function to make sure it handles dying after each store it makes to
-// shared memory. check should make sure function behaved correctly.
-// This will repeatedly create a new TestSharedMemory, run prepare, run
-// function, and then
-// run check, killing the process function is running in at various points. It
-// will stop if anything reports a fatal gtest failure.
-//
-// prepare_in_child being true means the prepare function will be run in the
-// child instead of the parent which doesn't die. This means that reference
-// counts on any objects it allocates won't be duplicated, but it also means
-// that any variables it sets will not be visible in check etc.
-void TestShmRobustness(const LocklessQueueConfiguration &config,
- ::std::function<void(void *)> prepare,
- ::std::function<void(void *)> function,
- ::std::function<void(void *)> check,
- bool prepare_in_child) {
- // Map the global state and memory for the Writes array so it exists across
- // the process boundary.
- void *shared_allocations = static_cast<GlobalState *>(
- mmap(nullptr, sizeof(GlobalState) + sizeof(WritesArray),
- PROT_READ | PROT_WRITE, MAP_SHARED | MAP_ANONYMOUS, -1, 0));
- CHECK_NE(MAP_FAILED, shared_allocations);
-
- global_state.store(static_cast<GlobalState *>(shared_allocations));
- shared_allocations = static_cast<void *>(
- static_cast<uint8_t *>(shared_allocations) + sizeof(GlobalState));
- WritesArray *expected_writes = static_cast<WritesArray *>(shared_allocations);
- new (expected_writes) WritesArray();
-
- bool test_failed = false;
- ASSERT_TRUE(RunFunctionDieAtAndCheck(config, prepare, function, check,
- &test_failed, 0, prepare_in_child,
- nullptr, expected_writes));
- if (test_failed) {
- ADD_FAILURE();
- return;
- }
-
- size_t die_at = 1;
- while (true) {
- SCOPED_TRACE("dying at " + ::std::to_string(die_at) + "/" +
- ::std::to_string(expected_writes->size()));
- if (RunFunctionDieAtAndCheck(config, prepare, function, check, &test_failed,
- die_at, prepare_in_child, expected_writes,
- nullptr)) {
- LOG(INFO) << "Tested " << die_at << " death points";
- return;
- }
- if (test_failed) {
- ADD_FAILURE();
- }
- if (::testing::Test::HasFailure()) return;
- ++die_at;
- }
-}
+#ifdef SUPPORTS_SHM_ROBUSTNESS_TEST
namespace {
-pid_t gettid() { return syscall(SYS_gettid); }
-
-// Sets FUTEX_OWNER_DIED if the owner was tid. This fakes what the kernel does
-// with a robust mutex.
-bool PretendOwnerDied(aos_mutex *mutex, pid_t tid) {
- if ((mutex->futex & FUTEX_TID_MASK) == tid) {
- mutex->futex = FUTEX_OWNER_DIED;
- return true;
- }
- return false;
-}
-
static int kPinnedMessageIndex = 0;
constexpr monotonic_clock::duration kChannelStorageDuration =
@@ -567,12 +46,8 @@
// Tests that death during sends is recovered from correctly.
TEST(LocklessQueueTest, Death) {
::aos::testing::EnableTestLogging();
- // Capture the tid in the child so we can tell if it died. Use mmap so it
- // works across the process boundary.
- pid_t *tid =
- static_cast<pid_t *>(mmap(nullptr, sizeof(pid_t), PROT_READ | PROT_WRITE,
- MAP_SHARED | MAP_ANONYMOUS, -1, 0));
- CHECK_NE(MAP_FAILED, tid);
+
+ SharedTid tid;
// Make a small queue so it is easier to debug.
LocklessQueueConfiguration config;
@@ -584,14 +59,14 @@
TestShmRobustness(
config,
- [config, tid](void *memory) {
+ [config, &tid](void *memory) {
// Initialize the queue and grab the tid.
LocklessQueue(
reinterpret_cast<aos::ipc_lib::LocklessQueueMemory *>(memory),
reinterpret_cast<aos::ipc_lib::LocklessQueueMemory *>(memory),
config)
.Initialize();
- *tid = gettid();
+ tid.Set();
},
[config](void *memory) {
LocklessQueue queue(
@@ -617,7 +92,7 @@
}
}
},
- [config, tid](void *raw_memory) {
+ [config, &tid](void *raw_memory) {
::aos::ipc_lib::LocklessQueueMemory *const memory =
reinterpret_cast<::aos::ipc_lib::LocklessQueueMemory *>(raw_memory);
// Confirm that we can create 2 senders (the number in the queue), and
@@ -629,10 +104,10 @@
// TestShmRobustness doesn't handle robust futexes. It is happy to just
// not crash with them. We know what they are, and what the tid of the
// holder is. So go pretend to be the kernel and fix it for it.
- PretendOwnerDied(&memory->queue_setup_lock, *tid);
+ PretendOwnerDied(&memory->queue_setup_lock, tid.Get());
for (size_t i = 0; i < config.num_senders; ++i) {
- if (PretendOwnerDied(&memory->GetSender(i)->tid, *tid)) {
+ if (PretendOwnerDied(&memory->GetSender(i)->tid, tid.Get())) {
// Print out before and after results if a sender died. That is the
// more fun case.
print = true;
@@ -735,8 +210,7 @@
// Confirm our message got through.
EXPECT_EQ(last_data, '9') << ": Got through " << i;
- },
- /* prepare_in_child = true */ true);
+ });
}
#endif
diff --git a/aos/ipc_lib/lockless_queue_stepping.cc b/aos/ipc_lib/lockless_queue_stepping.cc
new file mode 100644
index 0000000..0ec5214
--- /dev/null
+++ b/aos/ipc_lib/lockless_queue_stepping.cc
@@ -0,0 +1,473 @@
+#include "aos/ipc_lib/lockless_queue_stepping.h"
+
+#include <dlfcn.h>
+#include <elf.h>
+#include <linux/futex.h>
+#include <sys/mman.h>
+#include <sys/procfs.h>
+#include <sys/ptrace.h>
+#include <sys/syscall.h>
+#include <sys/uio.h>
+#include <unistd.h>
+#include <wait.h>
+
+#include <memory>
+#include <thread>
+
+#include "glog/logging.h"
+#include "gtest/gtest.h"
+
+#include "aos/ipc_lib/aos_sync.h"
+#include "aos/ipc_lib/lockless_queue_memory.h"
+#include "aos/ipc_lib/shm_observers.h"
+#include "aos/libc/aos_strsignal.h"
+#include "aos/testing/prevent_exit.h"
+
+#ifdef SUPPORTS_SHM_ROBUSTNESS_TEST
+
+namespace aos {
+namespace ipc_lib {
+namespace testing {
+
+namespace {
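+// Wrapper for the gettid(2) syscall; glibc only ships a gettid() wrapper
+// starting with version 2.30, so we call it directly.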
+pid_t gettid() { return syscall(SYS_gettid); }
+
+::std::atomic<GlobalState *> global_state;
+
+struct sigaction old_segv_handler, old_trap_handler;
+
+// Calls the original signal handler.
+bool CallChainedAction(const struct sigaction &action, int signal,
+ siginfo_t *siginfo, void *context) {
+ if (action.sa_handler == SIG_IGN || action.sa_handler == SIG_DFL) {
+ return false;
+ }
+ if (action.sa_flags & SA_SIGINFO) {
+ action.sa_sigaction(signal, siginfo, context);
+ } else {
+ action.sa_handler(signal);
+ }
+ return true;
+}
+
+void segv_handler(int signal, siginfo_t *siginfo, void *context_void) {
+ GlobalState *my_global_state = GlobalState::Get();
+ const int saved_errno = errno;
+ SIMPLE_ASSERT(signal == SIGSEGV, "wrong signal for SIGSEGV handler");
+
+ // Only process memory addresses in our shared memory block.
+ if (!my_global_state->IsInLocklessQueueMemory(siginfo->si_addr)) {
+ if (CallChainedAction(old_segv_handler, signal, siginfo, context_void)) {
+ errno = saved_errno;
+ return;
+ } else {
+ SIMPLE_ASSERT(false, "actual SIGSEGV");
+ }
+ }
+ SIMPLE_ASSERT(my_global_state->state == DieAtState::kRunning,
+ "bad state for SIGSEGV");
+
+ my_global_state->HandleWrite(siginfo->si_addr);
+
+ my_global_state->ShmProtectOrDie(PROT_READ | PROT_WRITE);
+ my_global_state->state = DieAtState::kWriting;
+ errno = saved_errno;
+
+#if defined(__x86_64__)
+ __asm__ __volatile__("int $3" ::: "memory", "cc");
+#elif defined(__aarch64__)
+ __asm__ __volatile__("brk #0" ::: "memory", "cc");
+#else
+#error Unhandled architecture
+#endif
+}
+
+// The SEGV handler has set a breakpoint 1 instruction in the future. This
+// clears it, marks memory readonly, and continues.
+void trap_handler(int signal, siginfo_t *, void * /*context*/) {
+ GlobalState *my_global_state = GlobalState::Get();
+ const int saved_errno = errno;
+ SIMPLE_ASSERT(signal == SIGTRAP, "wrong signal for SIGTRAP handler");
+
+ SIMPLE_ASSERT(my_global_state->state == DieAtState::kWriting,
+ "bad state for SIGTRAP");
+ my_global_state->ShmProtectOrDie(PROT_READ);
+ my_global_state->state = DieAtState::kRunning;
+ errno = saved_errno;
+}
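+
+// Taken together, segv_handler and trap_handler implement the child's half of
+// the write-stepping handshake with the tracing parent (sketch):
+//
+//   1. The child stores to the PROT_READ queue memory and takes a SIGSEGV.
+//   2. The parent saves the registers and delivers the SIGSEGV; segv_handler
+//      records the write, maps the memory read-write, and hits int3/brk.
+//   3. The parent catches that trap, restores the saved registers, and
+//      single-steps the faulting store, which now succeeds.
+//   4. The parent then sends a SIGTRAP, so trap_handler can map the memory
+//      back to read-only before execution continues.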
+
+// Installs the signal handler.
+void InstallHandler(int signal, void (*handler)(int, siginfo_t *, void *),
+ struct sigaction *old_action) {
+ struct sigaction action;
+ memset(&action, 0, sizeof(action));
+ action.sa_sigaction = handler;
+ // We don't do a full normal signal handler exit with ptrace, so SA_NODEFER is
+ // necessary to keep our signal handler active.
+ action.sa_flags = SA_RESTART | SA_SIGINFO | SA_NODEFER;
+#ifdef AOS_SANITIZER_thread
+ // Tsan messes with signal handlers to check for race conditions, and it
+ // causes problems, so we have to work around it for SIGTRAP.
+ if (signal == SIGTRAP) {
+ typedef int (*SigactionType)(int, const struct sigaction *,
+ struct sigaction *);
+ SigactionType real_sigaction =
+ reinterpret_cast<SigactionType>(dlsym(RTLD_NEXT, "sigaction"));
+ if (sigaction == real_sigaction) {
+ LOG(WARNING) << "failed to work around tsan signal handling weirdness";
+ }
+ PCHECK(real_sigaction(signal, &action, old_action) == 0);
+ return;
+ }
+#endif
+ PCHECK(sigaction(signal, &action, old_action) == 0);
+}
+
+// A mutex lock is about to happen. Mark the memory rw, and check to see if we
+// should die.
+void futex_before(void *address, bool) {
+ GlobalState *my_global_state = GlobalState::Get();
+ if (my_global_state->IsInLocklessQueueMemory(address)) {
+ assert(my_global_state->state == DieAtState::kRunning);
+ my_global_state->HandleWrite(address);
+ my_global_state->ShmProtectOrDie(PROT_READ | PROT_WRITE);
+ my_global_state->state = DieAtState::kWriting;
+ }
+}
+
+// We have a manual trap for mutexes. Check to see if we were supposed to die
+// on this write (the compare/exchange for the mutex), and mark the memory ro
+// again.
+void futex_after(void *address, bool) {
+ GlobalState *my_global_state = GlobalState::Get();
+ if (my_global_state->IsInLocklessQueueMemory(address)) {
+ assert(my_global_state->state == DieAtState::kWriting);
+ my_global_state->ShmProtectOrDie(PROT_READ);
+ my_global_state->state = DieAtState::kRunning;
+ }
+}
+
+} // namespace
+
+void GlobalState::HandleWrite(void *address) {
+ uintptr_t address_offset = reinterpret_cast<uintptr_t>(address) -
+ reinterpret_cast<uintptr_t>(lockless_queue_memory);
+ if (writes_in != nullptr) {
+ SIMPLE_ASSERT(writes_in->At(current_location) == address_offset,
+ "wrong write order");
+ }
+ if (writes_out != nullptr) {
+ writes_out->Add(address_offset);
+ }
+ if (die_at != 0) {
+ if (die_at == current_location) {
+ _exit(kExitEarlyValue);
+ }
+ }
+ ++current_location;
+}
+
+GlobalState *GlobalState::Get() {
+ return global_state.load(::std::memory_order_relaxed);
+}
+
+std::tuple<GlobalState *, WritesArray *> GlobalState::MakeGlobalState() {
+ // Map the global state and memory for the Writes array so it exists across
+ // the process boundary.
+ void *shared_allocations = static_cast<GlobalState *>(
+ mmap(nullptr, sizeof(GlobalState) + sizeof(WritesArray),
+ PROT_READ | PROT_WRITE, MAP_SHARED | MAP_ANONYMOUS, -1, 0));
+ CHECK_NE(MAP_FAILED, shared_allocations);
+
+ global_state.store(static_cast<GlobalState *>(shared_allocations));
+ void *expected_writes_shared_allocations = static_cast<void *>(
+ static_cast<uint8_t *>(shared_allocations) + sizeof(GlobalState));
+ WritesArray *expected_writes =
+ static_cast<WritesArray *>(expected_writes_shared_allocations);
+ new (expected_writes) WritesArray();
+ return std::make_tuple(static_cast<GlobalState *>(shared_allocations),
+ expected_writes);
+}
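+
+// MAP_SHARED | MAP_ANONYMOUS is what makes this work across fork(): the
+// parent and the traced child share the same pages, so the child's updates
+// to current_location and the writes arrays stay visible to the parent after
+// the child dies.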
+
+bool GlobalState::IsInLocklessQueueMemory(void *address) {
+ void *read_lockless_queue_memory = lockless_queue_memory;
+ if (address < read_lockless_queue_memory) {
+ return false;
+ }
+ if (reinterpret_cast<uintptr_t>(address) >=
+ reinterpret_cast<uintptr_t>(read_lockless_queue_memory) +
+ lockless_queue_memory_size) {
+ return false;
+ }
+ return true;
+}
+
+void GlobalState::ShmProtectOrDie(int prot) {
+ PCHECK(mprotect(lockless_queue_memory, lockless_queue_memory_size, prot) !=
+ -1)
+ << ": mprotect(" << lockless_queue_memory << ", "
+ << lockless_queue_memory_size << ", 0x" << std::hex << prot << ") failed";
+}
+
+void GlobalState::RegisterSegvAndTrapHandlers() {
+ InstallHandler(SIGSEGV, segv_handler, &old_segv_handler);
+ InstallHandler(SIGTRAP, trap_handler, &old_trap_handler);
+ CHECK_EQ(old_trap_handler.sa_handler, SIG_DFL);
+ linux_code::ipc_lib::SetShmAccessorObservers(futex_before, futex_after);
+}
+
+// gtest only allows creating fatal failures in functions returning void...
+// status is from wait(2).
+void DetectFatalFailures(int status) {
+ if (WIFEXITED(status)) {
+ FAIL() << " child returned status "
+ << ::std::to_string(WEXITSTATUS(status));
+ } else if (WIFSIGNALED(status)) {
+ FAIL() << " child exited because of signal "
+ << aos_strsignal(WTERMSIG(status));
+ } else {
+ FAIL() << " child exited with status " << ::std::hex << status;
+ }
+}
+
+// Returns true if it runs all the way through.
+bool RunFunctionDieAt(::std::function<void(void *)> prepare,
+ ::std::function<void(void *)> function,
+ bool *test_failure, size_t die_at,
+ uintptr_t writable_offset, const WritesArray *writes_in,
+ WritesArray *writes_out) {
+ GlobalState *my_global_state = GlobalState::Get();
+ my_global_state->writes_in = writes_in;
+ my_global_state->writes_out = writes_out;
+ my_global_state->die_at = die_at;
+ my_global_state->current_location = 0;
+ my_global_state->state = DieAtState::kDisabled;
+
+ const pid_t pid = fork();
+ PCHECK(pid != -1) << ": fork() failed";
+ if (pid == 0) {
+ // Run the test.
+ ::aos::testing::PreventExit();
+
+ prepare(my_global_state->lockless_queue_memory);
+
+ // Update the robust list offset.
+ linux_code::ipc_lib::SetRobustListOffset(writable_offset);
+ // Install a segv handler (to detect writes to the memory block), and a trap
+ // handler so we can single step.
+ my_global_state->RegisterSegvAndTrapHandlers();
+
+ PCHECK(ptrace(PTRACE_TRACEME, 0, 0, 0) == 0);
+ my_global_state->ShmProtectOrDie(PROT_READ);
+ my_global_state->state = DieAtState::kRunning;
+
+ function(my_global_state->lockless_queue_memory);
+ my_global_state->state = DieAtState::kDisabled;
+ my_global_state->ShmProtectOrDie(PROT_READ | PROT_WRITE);
+ _exit(0);
+ } else {
+ // Annoying wrapper type because elf_gregset_t is an array, which C++
+ // handles poorly.
+ struct RestoreState {
+ RestoreState(elf_gregset_t regs_in) {
+ memcpy(regs, regs_in, sizeof(regs));
+ }
+ elf_gregset_t regs;
+ };
+ std::optional<RestoreState> restore_regs;
+ bool pass_trap = false;
+ // Wait until the child process dies.
+ while (true) {
+ int status;
+ pid_t waited_on = waitpid(pid, &status, 0);
+ if (waited_on == -1) {
+ if (errno == EINTR) continue;
+ PCHECK(false) << ": waitpid(" << pid << ", " << &status
+ << ", 0) failed";
+ }
+ CHECK_EQ(waited_on, pid)
+ << ": waitpid got child " << waited_on << " instead of " << pid;
+ if (WIFSTOPPED(status)) {
+ // The child was stopped via ptrace.
+ const int stop_signal = WSTOPSIG(status);
+ elf_gregset_t regs;
+ {
+ struct iovec iov;
+ iov.iov_base = &regs;
+ iov.iov_len = sizeof(regs);
+ PCHECK(ptrace(PTRACE_GETREGSET, pid, NT_PRSTATUS, &iov) == 0);
+ CHECK_EQ(iov.iov_len, sizeof(regs))
+ << ": ptrace regset is the wrong size";
+ }
+ if (stop_signal == SIGSEGV) {
+ // It's a SEGV, hopefully due to writing to the shared memory which is
+ // marked read-only. We record the instruction that faulted so we can
+ // look for it while single-stepping, then deliver the signal so the
+ // child can mark it read-write and then poke us to single-step that
+ // instruction.
+
+ CHECK(!restore_regs)
+ << ": Traced child got a SEGV while single-stepping";
+ // Save all the registers to resume execution at the current location
+ // in the child.
+ restore_regs = RestoreState(regs);
+ PCHECK(ptrace(PTRACE_CONT, pid, nullptr, SIGSEGV) == 0);
+ continue;
+ }
+ if (stop_signal == SIGTRAP) {
+ if (pass_trap) {
+ // This is the new SIGTRAP we generated, which we just want to pass
+ // through so the child's signal handler can restore the memory to
+ // read-only
+ PCHECK(ptrace(PTRACE_CONT, pid, nullptr, SIGTRAP) == 0);
+ pass_trap = false;
+ continue;
+ }
+ if (restore_regs) {
+ // Restore the state we saved before delivering the SEGV, and then
+ // single-step that one instruction.
+ struct iovec iov;
+ iov.iov_base = &restore_regs->regs;
+ iov.iov_len = sizeof(restore_regs->regs);
+ PCHECK(ptrace(PTRACE_SETREGSET, pid, NT_PRSTATUS, &iov) == 0);
+ restore_regs = std::nullopt;
+ PCHECK(ptrace(PTRACE_SINGLESTEP, pid, nullptr, nullptr) == 0);
+ continue;
+ }
+ // We executed the single instruction that originally faulted, so
+ // now deliver a SIGTRAP to the child so it can mark the memory
+ // read-only again.
+ pass_trap = true;
+ PCHECK(kill(pid, SIGTRAP) == 0);
+ PCHECK(ptrace(PTRACE_CONT, pid, nullptr, nullptr) == 0);
+ continue;
+ }
+ LOG(FATAL) << "Traced child was stopped with unexpected signal: "
+ << static_cast<int>(WSTOPSIG(status));
+ }
+ if (WIFEXITED(status)) {
+ if (WEXITSTATUS(status) == 0) return true;
+ if (WEXITSTATUS(status) == kExitEarlyValue) return false;
+ }
+ DetectFatalFailures(status);
+ if (test_failure) *test_failure = true;
+ return false;
+ }
+ }
+}
+
+bool RunFunctionDieAtAndCheck(const LocklessQueueConfiguration &config,
+ ::std::function<void(void *)> prepare,
+ ::std::function<void(void *)> function,
+ ::std::function<void(void *)> check,
+ bool *test_failure, size_t die_at,
+ const WritesArray *writes_in,
+ WritesArray *writes_out) {
+ // Allocate shared memory.
+ GlobalState *my_global_state = GlobalState::Get();
+ my_global_state->lockless_queue_memory_size = LocklessQueueMemorySize(config);
+ my_global_state->lockless_queue_memory = static_cast<void *>(
+ mmap(nullptr, my_global_state->lockless_queue_memory_size,
+ PROT_READ | PROT_WRITE, MAP_SHARED | MAP_ANONYMOUS, -1, 0));
+ CHECK_NE(MAP_FAILED, my_global_state->lockless_queue_memory);
+
+ // And the backup used to point the robust list at.
+ my_global_state->lockless_queue_memory_lock_backup = static_cast<void *>(
+ mmap(nullptr, my_global_state->lockless_queue_memory_size,
+ PROT_READ | PROT_WRITE, MAP_SHARED | MAP_ANONYMOUS, -1, 0));
+ CHECK_NE(MAP_FAILED, my_global_state->lockless_queue_memory_lock_backup);
+
+ // The writable offset tells us how to convert from a pointer in the queue to
+ // a pointer that is safe to write. This is so robust futexes don't spin the
+ // kernel when the user maps a page PROT_READ, and the kernel tries to clear
+ // the futex there.
+ const uintptr_t writable_offset =
+ reinterpret_cast<uintptr_t>(
+ my_global_state->lockless_queue_memory_lock_backup) -
+ reinterpret_cast<uintptr_t>(my_global_state->lockless_queue_memory);
+
+ bool r;
+ // Do the actual test in a new thread so any locked mutexes will be cleaned up
+ // nicely with owner-died at the end.
+ ::std::thread test_thread([&prepare, &function, &check, test_failure, die_at,
+ writes_in, writes_out, writable_offset, &r]() {
+ r = RunFunctionDieAt(prepare, function, test_failure, die_at,
+ writable_offset, writes_in, writes_out);
+ if (::testing::Test::HasFailure()) {
+ r = false;
+ if (test_failure) *test_failure = true;
+ return;
+ }
+
+ check(GlobalState::Get()->lockless_queue_memory);
+ });
+ test_thread.join();
+ return r;
+}
+
+// Tests function to make sure it handles dying after each store it makes to
+// shared memory. check should make sure function behaved correctly.
+// This will repeatedly create a new TestSharedMemory, run prepare, run
+// function, and then run check, killing the process function is running in
+// at various points. It will stop if anything reports a fatal gtest failure.
+void TestShmRobustness(const LocklessQueueConfiguration &config,
+ ::std::function<void(void *)> prepare,
+ ::std::function<void(void *)> function,
+ ::std::function<void(void *)> check) {
+ auto [my_global_state, expected_writes] = GlobalState::MakeGlobalState();
+
+ bool test_failed = false;
+ ASSERT_TRUE(RunFunctionDieAtAndCheck(config, prepare, function, check,
+ &test_failed, 0, nullptr,
+ expected_writes));
+ if (test_failed) {
+ ADD_FAILURE();
+ return;
+ }
+
+ size_t die_at = 1;
+ while (true) {
+ SCOPED_TRACE("dying at " + ::std::to_string(die_at) + "/" +
+ ::std::to_string(expected_writes->size()));
+ if (RunFunctionDieAtAndCheck(config, prepare, function, check, &test_failed,
+ die_at, expected_writes, nullptr)) {
+ LOG(INFO) << "Tested " << die_at << " death points";
+ return;
+ }
+ if (test_failed) {
+ ADD_FAILURE();
+ }
+ if (::testing::Test::HasFailure()) return;
+ ++die_at;
+ }
+}
+
+SharedTid::SharedTid() {
+ // Capture the tid in the child so we can tell if it died. Use mmap so it
+ // works across the process boundary.
+ tid_ =
+ static_cast<pid_t *>(mmap(nullptr, sizeof(pid_t), PROT_READ | PROT_WRITE,
+ MAP_SHARED | MAP_ANONYMOUS, -1, 0));
+ CHECK_NE(MAP_FAILED, tid_);
+}
+
+SharedTid::~SharedTid() { CHECK_EQ(munmap(tid_, sizeof(pid_t)), 0); }
+
+void SharedTid::Set() { *tid_ = gettid(); }
+
+pid_t SharedTid::Get() { return *tid_; }
+
+bool PretendOwnerDied(aos_mutex *mutex, pid_t tid) {
+ if ((mutex->futex & FUTEX_TID_MASK) == tid) {
+ mutex->futex = FUTEX_OWNER_DIED;
+ return true;
+ }
+ return false;
+}
+
+} // namespace testing
+} // namespace ipc_lib
+} // namespace aos
+
+#endif // SUPPORTS_SHM_ROBUSTNESS_TEST
diff --git a/aos/ipc_lib/lockless_queue_stepping.h b/aos/ipc_lib/lockless_queue_stepping.h
new file mode 100644
index 0000000..b3eb6e3
--- /dev/null
+++ b/aos/ipc_lib/lockless_queue_stepping.h
@@ -0,0 +1,153 @@
+#ifndef AOS_IPC_LIB_LOCKLESS_QUEUE_STEPPING_H_
+#define AOS_IPC_LIB_LOCKLESS_QUEUE_STEPPING_H_
+
+#include <cinttypes>
+#include <functional>
+
+#include "aos/ipc_lib/lockless_queue.h"
+#include "aos/ipc_lib/lockless_queue_memory.h"
+
+namespace aos {
+namespace ipc_lib {
+namespace testing {
+
+#if defined(__ARM_EABI__)
+// There are various reasons why we might not actually be able to do this
+// testing, but we still want to run the functions to test anything they test
+// other than shm robustness.
+//
+// ARM doesn't have a nice way to do single-stepping, and I don't feel like
+// dealing with editing the next instruction to be a breakpoint and then
+// changing it back.
+#else
+
+#define SUPPORTS_SHM_ROBUSTNESS_TEST
+
+// This code currently supports amd64 only. It shouldn't be hard to port to
+// i386 (it should just use the name for only the smaller part of the flags
+// register), but that's not tested, and porting it to any other architecture
+// is more work. Currently, we skip doing anything exciting on arm (just run
+// the code without any robustness testing) and refuse to compile anywhere
+// else.
+
+#define SIMPLE_ASSERT(condition, message) \
+ do { \
+ if (!(condition)) { \
+ static const char kMessage[] = message "\n"; \
+ if (write(STDERR_FILENO, kMessage, sizeof(kMessage) - 1) != \
+ (sizeof(kMessage) - 1)) { \
+ static const char kFailureMessage[] = "writing failed\n"; \
+ __attribute__((unused)) int ignore = write( \
+ STDERR_FILENO, kFailureMessage, sizeof(kFailureMessage) - 1); \
+ } \
+ abort(); \
+ } \
+ } while (false)
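+
+// SIMPLE_ASSERT is used instead of CHECK/LOG inside the signal handlers
+// because write(2) and abort(3) are async-signal-safe while the logging
+// machinery is not, e.g.:
+//
+//   SIMPLE_ASSERT(signal == SIGTRAP, "wrong signal for SIGTRAP handler");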
+
+// Array to track writes to memory, and make sure they happen in the right
+// order.
+class WritesArray {
+ public:
+ uintptr_t At(size_t location) const {
+ SIMPLE_ASSERT(location < size_, "too far into writes array");
+ return writes_[location];
+ }
+ void Add(uintptr_t pointer) {
+ SIMPLE_ASSERT(size_ < kSize, "too many writes");
+ writes_[size_++] = pointer;
+ }
+
+ size_t size() const { return size_; }
+
+ private:
+ static const size_t kSize = 20000;
+
+ uintptr_t writes_[kSize];
+ size_t size_ = 0;
+};
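+
+// WritesArray is used in a record/replay pattern: the first run passes one
+// as writes_out to record the offset of every store to the queue memory, and
+// later runs pass that same array back as writes_in so the harness can
+// assert the stores happen in the identical order while it kills the process
+// at successive write numbers. A sketch of the API (offsets are made up):
+//
+//   WritesArray writes;
+//   writes.Add(0x40);  // Record: first store hit offset 0x40.
+//   writes.Add(0x48);  // Record: second store hit offset 0x48.
+//   CHECK_EQ(writes.At(0), 0x40u);  // Replay: verify the order.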
+
+enum class DieAtState {
+ // No SEGVs should be happening.
+ kDisabled,
+ // SEGVs are fine. Normal operation.
+ kRunning,
+ // We are manipulating a mutex. No SEGVs should be happening.
+ kWriting,
+};
+
+// What we exit with when we're exiting in the middle.
+const int kExitEarlyValue = 123;
+
+// We have to keep track of everything in a global variable because there's no
+// other way for the signal handlers to find it.
+struct GlobalState {
+ // Constructs the singleton GlobalState.
+ static std::tuple<GlobalState *, WritesArray *> MakeGlobalState();
+
+ // Returns the global state. Atomic and safe from signal handlers.
+ static GlobalState *Get();
+
+ // Pointer to the queue memory, and its size.
+ void *lockless_queue_memory;
+ size_t lockless_queue_memory_size;
+
+ // Pointer to a second block of memory the same size. This (on purpose) has
+ // the same size as lockless_queue_memory so we can point the robust mutexes
+ // here.
+ void *lockless_queue_memory_lock_backup;
+
+ // Expected writes.
+ const WritesArray *writes_in;
+ // Actual writes.
+ WritesArray *writes_out;
+ // Location to die at, and how far we have gotten.
+ size_t die_at, current_location;
+ // State.
+ DieAtState state;
+
+ // Returns true if the address is in the queue memory chunk.
+ bool IsInLocklessQueueMemory(void *address);
+
+ // Calls mprotect(2) for the entire shared memory region with the given prot.
+ void ShmProtectOrDie(int prot);
+
+ // Checks a write into the queue and conditionally dies. Tracks the write.
+ void HandleWrite(void *address);
+
+ // Registers the handlers required to trap writes.
+ void RegisterSegvAndTrapHandlers();
+};
+
+void TestShmRobustness(const LocklessQueueConfiguration &config,
+ ::std::function<void(void *)> prepare,
+ ::std::function<void(void *)> function,
+ ::std::function<void(void *)> check);
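+
+// A minimal usage sketch (hypothetical lambdas and config values): prepare
+// runs in the forked child to initialize the shared memory, function is the
+// code under test that gets killed after each store, and check verifies the
+// invariants afterwards in the parent.
+//
+//   LocklessQueueConfiguration config;
+//   config.num_watchers = 1;
+//   config.num_senders = 1;
+//   config.num_pinners = 0;
+//   config.queue_size = 2;
+//   config.message_data_size = 32;
+//   TestShmRobustness(
+//       config, [](void *memory) { /* initialize the queue */ },
+//       [](void *memory) { /* do the traced writes */ },
+//       [](void *memory) { /* verify the invariants */ });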
+
+// Capture the tid in the child so we can tell if it died. Uses mmap so it
+// works across the process boundary.
+class SharedTid {
+ public:
+ SharedTid();
+ ~SharedTid();
+
+ // Captures the tid.
+ void Set();
+
+ // Returns the captured tid.
+ pid_t Get();
+
+ private:
+ pid_t *tid_;
+};
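+
+// A sketch of the typical flow: construct in the parent, call Set() from the
+// process whose death is being tested, and read Get() in the parent
+// afterwards to clean up any robust mutexes the dead process held.
+//
+//   SharedTid tid;
+//   if (fork() == 0) {
+//     tid.Set();  // Runs in the child; visible to the parent via the mmap.
+//     _exit(0);
+//   }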
+
+// Sets FUTEX_OWNER_DIED if the owner was tid. This fakes what the kernel does
+// with a robust mutex.
+bool PretendOwnerDied(aos_mutex *mutex, pid_t tid);
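+
+// On a real exit, the kernel walks the dead thread's robust list and sets
+// FUTEX_OWNER_DIED on each futex it held. The stepping harness points the
+// robust list at the writable backup mapping instead, so tests call this
+// with the tid captured via SharedTid to apply the same transition by hand:
+//
+//   PretendOwnerDied(&memory->queue_setup_lock, tid.Get());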
+
+#endif
+
+} // namespace testing
+} // namespace ipc_lib
+} // namespace aos
+
+#endif // AOS_IPC_LIB_LOCKLESS_QUEUE_STEPPING_H_
diff --git a/aos/ipc_lib/lockless_queue_test.cc b/aos/ipc_lib/lockless_queue_test.cc
index 93ae2a3..58e093f 100644
--- a/aos/ipc_lib/lockless_queue_test.cc
+++ b/aos/ipc_lib/lockless_queue_test.cc
@@ -1,5 +1,6 @@
#include "aos/ipc_lib/lockless_queue.h"
+#include <sys/mman.h>
#include <unistd.h>
#include <wait.h>
@@ -16,6 +17,8 @@
#include "aos/events/epoll.h"
#include "aos/ipc_lib/aos_sync.h"
#include "aos/ipc_lib/event.h"
+#include "aos/ipc_lib/lockless_queue_memory.h"
+#include "aos/ipc_lib/lockless_queue_stepping.h"
#include "aos/ipc_lib/queue_racer.h"
#include "aos/ipc_lib/signalfd.h"
#include "aos/realtime.h"
@@ -421,6 +424,148 @@
static_cast<double>(kNumMessages) / elapsed_seconds);
}
+#if defined(SUPPORTS_SHM_ROBUSTNESS_TEST)
+
+// Verifies that LatestIndex points to the same message as the logic from
+// "FetchNext", which increments the index until it gets "NOTHING_NEW" back.
+// This is so we can confirm fetchers and watchers all see the same message at
+// the same point in time.
+int VerifyMessages(LocklessQueue *queue, LocklessQueueMemory *memory) {
+ LocklessQueueReader reader(*queue);
+
+ const ipc_lib::QueueIndex queue_index = reader.LatestIndex();
+ if (!queue_index.valid()) {
+ return 0;
+ }
+
+ // Now loop through the queue and make sure the number in the snprintf
+ // increments.
+ char last_data = '0';
+ int i = 0;
+
+ // Callback which isn't set so we don't exercise the conditional reading code.
+ std::function<bool(const Context &)> should_read_callback;
+
+ // Now, read as far as we can until we get NOTHING_NEW. This simulates
+ // FetchNext.
+ while (true) {
+ monotonic_clock::time_point monotonic_sent_time;
+ realtime_clock::time_point realtime_sent_time;
+ monotonic_clock::time_point monotonic_remote_time;
+ realtime_clock::time_point realtime_remote_time;
+ uint32_t remote_queue_index;
+ UUID source_boot_uuid;
+ char read_data[1024];
+ size_t length;
+
+ LocklessQueueReader::Result read_result = reader.Read(
+ i, &monotonic_sent_time, &realtime_sent_time, &monotonic_remote_time,
+ &realtime_remote_time, &remote_queue_index, &source_boot_uuid, &length,
+ &(read_data[0]), std::ref(should_read_callback));
+
+ if (read_result != LocklessQueueReader::Result::GOOD) {
+ if (read_result == LocklessQueueReader::Result::TOO_OLD) {
+ ++i;
+ continue;
+ }
+ CHECK(read_result == LocklessQueueReader::Result::NOTHING_NEW)
+ << ": " << static_cast<int>(read_result);
+ break;
+ }
+
+ EXPECT_GT(read_data[LocklessQueueMessageDataSize(memory) - length + 6],
+ last_data)
+ << ": Got " << read_data << " for " << i;
+ last_data = read_data[LocklessQueueMessageDataSize(memory) - length + 6];
+
+ ++i;
+ }
+
+ // The latest queue index should match the fetched queue index.
+ if (i == 0) {
+ EXPECT_FALSE(queue_index.valid());
+ } else {
+ EXPECT_EQ(queue_index.index(), i - 1);
+ }
+ return i;
+}
+
+// Tests that at all points in the publish step, fetch == fetch next. This
+// means that there is an atomic point at which the message is viewed as visible
+// to consumers. Do this by killing the writer after each change to shared
+// memory, and confirming fetch == fetch next each time.
+TEST_F(LocklessQueueTest, FetchEqFetchNext) {
+ SharedTid tid;
+
+ // Make a small queue so it is easier to debug.
+ LocklessQueueConfiguration config;
+ config.num_watchers = 1;
+ config.num_senders = 2;
+ config.num_pinners = 0;
+ config.queue_size = 3;
+ config.message_data_size = 32;
+
+ TestShmRobustness(
+ config,
+ [config, &tid](void *memory) {
+ // Initialize the queue.
+ LocklessQueue(
+ reinterpret_cast<aos::ipc_lib::LocklessQueueMemory *>(memory),
+ reinterpret_cast<aos::ipc_lib::LocklessQueueMemory *>(memory),
+ config)
+ .Initialize();
+ tid.Set();
+ },
+ [config](void *memory) {
+ LocklessQueue queue(
+ reinterpret_cast<aos::ipc_lib::LocklessQueueMemory *>(memory),
+ reinterpret_cast<aos::ipc_lib::LocklessQueueMemory *>(memory),
+ config);
+ // Now try to write some messages. We will get killed a bunch as this
+ // tries to happen.
+ LocklessQueueSender sender =
+ LocklessQueueSender::Make(queue, chrono::nanoseconds(1)).value();
+ for (int i = 0; i < 5; ++i) {
+ char data[100];
+ size_t s = snprintf(data, sizeof(data), "foobar%d", i + 1);
+ ASSERT_EQ(sender.Send(data, s + 1, monotonic_clock::min_time,
+ realtime_clock::min_time, 0xffffffffl,
+ UUID::Zero(), nullptr, nullptr, nullptr),
+ LocklessQueueSender::Result::GOOD);
+ }
+ },
+ [config, &tid](void *raw_memory) {
+ ::aos::ipc_lib::LocklessQueueMemory *const memory =
+ reinterpret_cast<::aos::ipc_lib::LocklessQueueMemory *>(raw_memory);
+ LocklessQueue queue(memory, memory, config);
+ PretendOwnerDied(&memory->queue_setup_lock, tid.Get());
+
+ if (VLOG_IS_ON(1)) {
+ PrintLocklessQueueMemory(memory);
+ }
+
+ const int i = VerifyMessages(&queue, memory);
+
+ LocklessQueueSender sender =
+ LocklessQueueSender::Make(queue, chrono::nanoseconds(1)).value();
+ {
+ char data[100];
+ size_t s = snprintf(data, sizeof(data), "foobar%d", i + 1);
+ ASSERT_EQ(sender.Send(data, s + 1, monotonic_clock::min_time,
+ realtime_clock::min_time, 0xffffffffl,
+ UUID::Zero(), nullptr, nullptr, nullptr),
+ LocklessQueueSender::Result::GOOD);
+ }
+
+ // Now, make sure we can send 1 message and receive it to confirm we
+ // haven't corrupted next_queue_index irrevocably.
+ const int newi = VerifyMessages(&queue, memory);
+ EXPECT_EQ(newi, i + 1);
+ });
+}
+
+#endif
+
} // namespace testing
} // namespace ipc_lib
} // namespace aos
diff --git a/aos/ipc_lib/queue_racer.h b/aos/ipc_lib/queue_racer.h
index 87f2cce..f0a2684 100644
--- a/aos/ipc_lib/queue_racer.h
+++ b/aos/ipc_lib/queue_racer.h
@@ -62,7 +62,8 @@
private:
// Wipes the queue memory out so we get a clean start.
void Reset() {
- memset(queue_.memory(), 0, LocklessQueueMemorySize(queue_.config()));
+ memset(reinterpret_cast<void *>(queue_.memory()), 0,
+ LocklessQueueMemorySize(queue_.config()));
}
// This is a separate method so that when all the ASSERT_* methods, we still
diff --git a/aos/ipc_lib/shm_base.cc b/aos/ipc_lib/shm_base.cc
new file mode 100644
index 0000000..22bf915
--- /dev/null
+++ b/aos/ipc_lib/shm_base.cc
@@ -0,0 +1,9 @@
+#include "aos/ipc_lib/shm_base.h"
+
+DEFINE_string(shm_base, "/dev/shm/aos",
+ "Directory to place queue backing mmaped files in.");
+namespace aos::testing {
+void SetShmBase(const std::string_view base) {
+ FLAGS_shm_base = std::string(base) + "/aos";
+}
+} // namespace aos::testing
diff --git a/aos/ipc_lib/shm_base.h b/aos/ipc_lib/shm_base.h
new file mode 100644
index 0000000..b70df06
--- /dev/null
+++ b/aos/ipc_lib/shm_base.h
@@ -0,0 +1,11 @@
+#ifndef AOS_IPC_LIB_SHM_BASE_H_
+#define AOS_IPC_LIB_SHM_BASE_H_
+
+#include "gflags/gflags.h"
+
+DECLARE_string(shm_base);
+
+namespace aos::testing {
+void SetShmBase(const std::string_view base);
+} // namespace aos::testing
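+
+// Usage sketch: point the queue backing files at a (hypothetical) per-test
+// directory. After aos::testing::SetShmBase("/tmp/my_test"), FLAGS_shm_base
+// is "/tmp/my_test/aos".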
+#endif // AOS_IPC_LIB_SHM_BASE_H_
diff --git a/aos/json_to_flatbuffer.h b/aos/json_to_flatbuffer.h
index 01a3c3f..b664036 100644
--- a/aos/json_to_flatbuffer.h
+++ b/aos/json_to_flatbuffer.h
@@ -86,14 +86,19 @@
// Writes a Flatbuffer to a file, or dies.
template <typename T>
-inline void WriteFlatbufferToJson(std::string_view filename,
- const Flatbuffer<T> &msg) {
+inline void WriteFlatbufferToJson(std::string_view filename, const T *msg) {
std::ofstream json_file(std::string(filename), std::ios::out);
CHECK(json_file) << ": Couldn't open " << filename;
json_file << FlatbufferToJson(msg);
json_file.close();
}
+template <typename T>
+inline void WriteFlatbufferToJson(std::string_view filename,
+ const Flatbuffer<T> &msg) {
+ WriteFlatbufferToJson(filename, &msg.message());
+}
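+
+// Usage sketch (MyMessage and MakeMyMessage are hypothetical): both
+// overloads funnel through the raw-pointer version, so these are equivalent:
+//
+//   aos::FlatbufferDetachedBuffer<MyMessage> fb = MakeMyMessage();
+//   WriteFlatbufferToJson("/tmp/out.json", fb);             // wrapper
+//   WriteFlatbufferToJson("/tmp/out.json", &fb.message());  // raw pointer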
+
// Writes a NonSizePrefixedFlatbuffer to a binary file, or dies.
template <typename T>
inline void WriteFlatbufferToFile(std::string_view filename,
diff --git a/aos/network/BUILD b/aos/network/BUILD
index 75173e7..d1a79d0 100644
--- a/aos/network/BUILD
+++ b/aos/network/BUILD
@@ -385,6 +385,7 @@
":sctp_config_fbs",
":sctp_config_request_fbs",
":timestamp_fbs",
+ "//aos/containers:ring_buffer",
"//aos/events:shm_event_loop",
"//aos/events/logging:log_reader",
],
diff --git a/aos/network/message_bridge_client.fbs b/aos/network/message_bridge_client.fbs
index 40c0698..2f6461d 100644
--- a/aos/network/message_bridge_client.fbs
+++ b/aos/network/message_bridge_client.fbs
@@ -35,6 +35,9 @@
// Number of times we've established a connection to the server.
connection_count:uint (id: 8);
+
+ // Number of timestamps we have failed to send.
+ timestamp_send_failures: uint (id: 9);
}
// Statistics for all clients.
diff --git a/aos/network/message_bridge_client_lib.cc b/aos/network/message_bridge_client_lib.cc
index e42eaa0..19f3420 100644
--- a/aos/network/message_bridge_client_lib.cc
+++ b/aos/network/message_bridge_client_lib.cc
@@ -134,6 +134,10 @@
connect_timer_ = event_loop_->AddTimer([this]() { SendConnect(); });
connect_timer_->set_name(std::string("connect_") +
remote_node_->name()->str());
+ timestamp_retry_buffer_ =
+ event_loop_->AddTimer([this]() { SendTimestamps(); });
+ timestamp_retry_buffer_->set_name(std::string("timestamp_retry_") +
+ remote_node_->name()->str());
event_loop_->OnRun(
[this]() { connect_timer_->Schedule(event_loop_->monotonic_now()); });
@@ -265,6 +269,11 @@
connect_timer_->Schedule(
event_loop_->monotonic_now() + chrono::milliseconds(100),
chrono::milliseconds(100));
+
+ // Don't try resending timestamps since there is nobody to send them to. Let
+ // Connect handle that instead. HandleData will retry when they reply.
+ timestamp_retry_buffer_->Disable();
+
client_status_->Disconnect(client_index_);
client_status_->SampleReset(client_index_);
}
@@ -304,6 +313,7 @@
chrono::nanoseconds(remote_data->monotonic_sent_time()));
// Publish the message.
+ UUID remote_boot_uuid = UUID::FromVector(remote_data->boot_uuid());
RawSender *sender = channel_state->sender.get();
sender->CheckOk(sender->Send(
remote_data->data()->data(), remote_data->data()->size(),
@@ -311,48 +321,60 @@
chrono::nanoseconds(remote_data->monotonic_sent_time())),
realtime_clock::time_point(
chrono::nanoseconds(remote_data->realtime_sent_time())),
- remote_data->queue_index(),
- UUID::FromVector(remote_data->boot_uuid())));
+ remote_data->queue_index(), remote_boot_uuid));
client_status_->SampleFilter(
client_index_,
monotonic_clock::time_point(
chrono::nanoseconds(remote_data->monotonic_sent_time())),
- sender->monotonic_sent_time(),
- UUID::FromVector(remote_data->boot_uuid()));
+ sender->monotonic_sent_time(), remote_boot_uuid);
if (stream_reply_with_timestamp_[stream]) {
- // TODO(austin): Send back less if we are only acking. Maybe only a
- // stream id? Nothing if we are only forwarding?
+ SavedTimestamp timestamp{
+ .channel_index = remote_data->channel_index(),
+ .monotonic_sent_time = remote_data->monotonic_sent_time(),
+ .realtime_sent_time = remote_data->realtime_sent_time(),
+ .queue_index = remote_data->queue_index(),
+ .monotonic_remote_time =
+ sender->monotonic_sent_time().time_since_epoch().count(),
+ .realtime_remote_time =
+ sender->realtime_sent_time().time_since_epoch().count(),
+ .remote_queue_index = sender->sent_queue_index(),
+ };
- // Now fill out the message received reply. This uses a MessageHeader
- // container so it can be directly logged.
- message_reception_reply_.mutable_message()->mutate_channel_index(
- remote_data->channel_index());
- message_reception_reply_.mutable_message()->mutate_monotonic_sent_time(
- remote_data->monotonic_sent_time());
- message_reception_reply_.mutable_message()->mutate_realtime_sent_time(
- remote_data->realtime_sent_time());
- message_reception_reply_.mutable_message()->mutate_queue_index(
- remote_data->queue_index());
+ // Sort out if we should try to queue the timestamp or just send it
+ // directly. We don't want to get too far ahead.
+ //
+ // If the remote boot UUID changed, the other side rebooted. There is no
+ // sense telling it about timestamps from before the reboot, so wipe them
+ // and track the new boot instead.
+ if (remote_boot_uuid != timestamps_uuid_) {
+ timestamp_buffer_.Reset();
+ timestamps_uuid_ = remote_boot_uuid;
+ }
- // And capture the relevant data needed to generate the forwarding
- // MessageHeader.
- message_reception_reply_.mutable_message()->mutate_monotonic_remote_time(
- sender->monotonic_sent_time().time_since_epoch().count());
- message_reception_reply_.mutable_message()->mutate_realtime_remote_time(
- sender->realtime_sent_time().time_since_epoch().count());
- message_reception_reply_.mutable_message()->mutate_remote_queue_index(
- sender->sent_queue_index());
+ // Then, we can only send directly if there are no timestamps already queued.
+ bool push_timestamp = false;
+ if (timestamp_buffer_.empty()) {
+ if (!SendTimestamp(timestamp)) {
+ // Whoops, we failed to send; queue it and try again.
+ push_timestamp = true;
+ }
+ } else {
+ push_timestamp = true;
+ }
- // Unique ID is channel_index and monotonic clock.
- // TODO(austin): Depending on if we are the logger node or not, we need to
- // guarentee that this ack gets received too... Same path as the logger.
- client_.Send(kTimestampStream(),
- std::string_view(reinterpret_cast<const char *>(
- message_reception_reply_.span().data()),
- message_reception_reply_.span().size()),
- 0);
+ if (push_timestamp) {
+ // Trigger the timer if we are the first timestamp added, or if the
+ // timer was disabled because the far side disconnected.
+ if (timestamp_buffer_.empty() ||
+ timestamp_retry_buffer_->IsDisabled()) {
+ timestamp_retry_buffer_->Schedule(event_loop_->monotonic_now() +
+ chrono::milliseconds(100));
+ }
+ VLOG(1) << this << " Queued timestamp " << timestamp.channel_index
+ << " " << timestamp.queue_index;
+ timestamp_buffer_.Push(timestamp);
+ }
}
}
@@ -371,6 +393,62 @@
<< " cumtsn=" << message->header.rcvinfo.rcv_cumtsn << ")";
}
+bool SctpClientConnection::SendTimestamp(SavedTimestamp timestamp) {
+ // Now fill out the message received reply. This uses a MessageHeader
+ // container so it can be directly logged.
+ message_reception_reply_.mutable_message()->mutate_channel_index(
+ timestamp.channel_index);
+ message_reception_reply_.mutable_message()->mutate_monotonic_sent_time(
+ timestamp.monotonic_sent_time);
+ message_reception_reply_.mutable_message()->mutate_realtime_sent_time(
+ timestamp.realtime_sent_time);
+ message_reception_reply_.mutable_message()->mutate_queue_index(
+ timestamp.queue_index);
+
+ // And capture the relevant data needed to generate the forwarding
+ // MessageHeader.
+ message_reception_reply_.mutable_message()->mutate_monotonic_remote_time(
+ timestamp.monotonic_remote_time);
+ message_reception_reply_.mutable_message()->mutate_realtime_remote_time(
+ timestamp.realtime_remote_time);
+ message_reception_reply_.mutable_message()->mutate_remote_queue_index(
+ timestamp.remote_queue_index);
+
+ // Unique ID is channel_index and monotonic clock.
+ VLOG(1) << this << " Sent timestamp " << timestamp.channel_index << " "
+ << timestamp.queue_index;
+ if (!client_.Send(
+ kTimestampStream(),
+ std::string_view(reinterpret_cast<const char *>(
+ message_reception_reply_.span().data()),
+ message_reception_reply_.span().size()),
+ 0)) {
+ // Count the failure.
+ connection_->mutate_timestamp_send_failures(
+ connection_->timestamp_send_failures() + 1);
+ return false;
+ }
+ return true;
+}
+
+void SctpClientConnection::SendTimestamps() {
+ // This is only called from the timer, and the timer is only enabled when
+ // there is something in the buffer. Explode if that assumption is false.
+ CHECK(!timestamp_buffer_.empty());
+ do {
+ if (!SendTimestamp(timestamp_buffer_[0])) {
+ timestamp_retry_buffer_->Schedule(event_loop_->monotonic_now() +
+ chrono::milliseconds(100));
+ return;
+ } else {
+ VLOG(1) << this << " Resent timestamp "
+ << timestamp_buffer_[0].channel_index << " "
+ << timestamp_buffer_[0].queue_index;
+ timestamp_buffer_.Shift();
+ }
+ } while (!timestamp_buffer_.empty());
+}
+
MessageBridgeClient::MessageBridgeClient(
aos::ShmEventLoop *event_loop, std::string config_sha256,
SctpAuthMethod requested_authentication)
diff --git a/aos/network/message_bridge_client_lib.h b/aos/network/message_bridge_client_lib.h
index e4b5e84..2df3599 100644
--- a/aos/network/message_bridge_client_lib.h
+++ b/aos/network/message_bridge_client_lib.h
@@ -3,6 +3,7 @@
#include <string_view>
+#include "aos/containers/ring_buffer.h"
#include "aos/events/event_loop.h"
#include "aos/events/logging/logger_generated.h"
#include "aos/events/shm_event_loop.h"
@@ -49,6 +50,18 @@
}
private:
+ // Data structure to hold the timestamps we failed to deliver. All the
+ // fields match the reply exactly.
+ struct SavedTimestamp {
+ size_t channel_index;
+ int64_t monotonic_sent_time;
+ int64_t realtime_sent_time;
+ uint32_t queue_index;
+ int64_t monotonic_remote_time;
+ int64_t realtime_remote_time;
+ uint32_t remote_queue_index;
+ };
+
// Reads a message from the socket. Could be a notification.
void MessageReceived();
@@ -61,6 +74,9 @@
void NodeDisconnected();
void HandleData(const Message *message);
+ // Sends a timestamp, and returns true on success, false on failure.
+ bool SendTimestamp(SavedTimestamp timestamp);
+
// Schedules connect_timer_ for a ways in the future. If one of our messages
// gets dropped, the server might be waiting for this, so if we don't hear
// from the server for a while we'll try sending it again.
@@ -69,6 +85,9 @@
kReconnectTimeout);
}
+ // Sends out as many queued timestamps as we have and can send.
+ void SendTimestamps();
+
// Event loop to register the server on.
aos::ShmEventLoop *const event_loop_;
@@ -97,11 +116,20 @@
// Timer which fires to handle reconnections.
aos::TimerHandler *connect_timer_;
+ // Timer which fires to handle resending timestamps.
+ aos::TimerHandler *timestamp_retry_buffer_;
+
// ClientConnection statistics message to modify. This will be published
// periodicially.
MessageBridgeClientStatus *client_status_;
int client_index_;
ClientConnection *connection_;
+
+ // Boot UUID for the remote for any timestamps in the buffer.
+ UUID timestamps_uuid_ = UUID::Zero();
+
+ // Any timestamps that we were unable to send back.
+ aos::RingBuffer<SavedTimestamp, 1024> timestamp_buffer_;
};
// This encapsulates the state required to talk to *all* the servers from this
diff --git a/aos/network/message_bridge_client_status.cc b/aos/network/message_bridge_client_status.cc
index 2ba2ae0..272e49e 100644
--- a/aos/network/message_bridge_client_status.cc
+++ b/aos/network/message_bridge_client_status.cc
@@ -37,6 +37,7 @@
connection_builder.add_connected_since_time(
monotonic_clock::min_time.time_since_epoch().count());
connection_builder.add_connection_count(0);
+ connection_builder.add_timestamp_send_failures(0);
connection_offsets.emplace_back(connection_builder.Finish());
}
flatbuffers::Offset<
@@ -144,6 +145,11 @@
client_connection_builder.add_partial_deliveries(
connection->partial_deliveries());
+ if (connection->timestamp_send_failures() != 0) {
+ client_connection_builder.add_timestamp_send_failures(
+ connection->timestamp_send_failures());
+ }
+
if (!uuid_offset.IsNull()) {
client_connection_builder.add_boot_uuid(uuid_offset);
}
diff --git a/aos/network/message_bridge_test.cc b/aos/network/message_bridge_test.cc
index 4eb5e8b..cc46926 100644
--- a/aos/network/message_bridge_test.cc
+++ b/aos/network/message_bridge_test.cc
@@ -733,7 +733,9 @@
{
MakePi2Server();
- RunPi2Server(chrono::milliseconds(3050));
+ // Wait long enough for the client to connect again. It currently takes 3
+ // seconds of connection to estimate the time offset.
+ RunPi2Server(chrono::milliseconds(4050));
// And confirm we are synchronized again.
EXPECT_TRUE(pi1_server_statistics_fetcher.Fetch());
diff --git a/aos/network/message_bridge_test_lib.cc b/aos/network/message_bridge_test_lib.cc
index d5a5f8f..16e4a4b 100644
--- a/aos/network/message_bridge_test_lib.cc
+++ b/aos/network/message_bridge_test_lib.cc
@@ -3,8 +3,6 @@
DECLARE_string(boot_uuid);
namespace aos {
-void SetShmBase(const std::string_view base);
-
namespace message_bridge::testing {
namespace chrono = std::chrono;
@@ -20,7 +18,7 @@
}
void DoSetShmBase(const std::string_view node) {
- aos::SetShmBase(ShmBase(node));
+ aos::testing::SetShmBase(ShmBase(node));
}
ThreadedEventLoopRunner::ThreadedEventLoopRunner(aos::ShmEventLoop *event_loop)
@@ -75,6 +73,7 @@
void MessageBridgeParameterizedTest::MakePi1Server(
std::string server_config_sha256) {
OnPi1();
+ LOG(INFO) << "Making pi1 server";
FLAGS_application_name = "pi1_message_bridge_server";
pi1_server_event_loop =
std::make_unique<aos::ShmEventLoop>(&config.message());
@@ -87,6 +86,7 @@
void MessageBridgeParameterizedTest::RunPi1Server(
chrono::nanoseconds duration) {
+ LOG(INFO) << "Running pi1 server";
// Set up a shutdown callback.
aos::TimerHandler *const quit = pi1_server_event_loop->AddTimer(
[this]() { pi1_server_event_loop->Exit(); });
@@ -99,11 +99,13 @@
}
void MessageBridgeParameterizedTest::StartPi1Server() {
+ LOG(INFO) << "Starting pi1 server";
pi1_server_thread =
std::make_unique<ThreadedEventLoopRunner>(pi1_server_event_loop.get());
}
void MessageBridgeParameterizedTest::StopPi1Server() {
+ LOG(INFO) << "Stopping pi1 server";
pi1_server_thread.reset();
pi1_message_bridge_server.reset();
pi1_server_event_loop.reset();
@@ -111,6 +113,7 @@
void MessageBridgeParameterizedTest::MakePi1Client() {
OnPi1();
+ LOG(INFO) << "Making pi1 client";
FLAGS_application_name = "pi1_message_bridge_client";
pi1_client_event_loop =
std::make_unique<aos::ShmEventLoop>(&config.message());
@@ -120,11 +123,13 @@
}
void MessageBridgeParameterizedTest::StartPi1Client() {
+ LOG(INFO) << "Starting pi1 client";
pi1_client_thread =
std::make_unique<ThreadedEventLoopRunner>(pi1_client_event_loop.get());
}
void MessageBridgeParameterizedTest::StopPi1Client() {
+ LOG(INFO) << "Stopping pi1 client";
pi1_client_thread.reset();
pi1_message_bridge_client.reset();
pi1_client_event_loop.reset();
@@ -132,6 +137,7 @@
void MessageBridgeParameterizedTest::MakePi1Test() {
OnPi1();
+ LOG(INFO) << "Making pi1 test";
FLAGS_application_name = "test1";
pi1_test_event_loop = std::make_unique<aos::ShmEventLoop>(&config.message());
@@ -156,14 +162,19 @@
}
void MessageBridgeParameterizedTest::StartPi1Test() {
+ LOG(INFO) << "Starting pi1 test";
pi1_test_thread =
std::make_unique<ThreadedEventLoopRunner>(pi1_test_event_loop.get());
}
-void MessageBridgeParameterizedTest::StopPi1Test() { pi1_test_thread.reset(); }
+void MessageBridgeParameterizedTest::StopPi1Test() {
+ LOG(INFO) << "Stopping pi1 test";
+ pi1_test_thread.reset();
+}
void MessageBridgeParameterizedTest::MakePi2Server() {
OnPi2();
+ LOG(INFO) << "Making pi2 server";
FLAGS_application_name = "pi2_message_bridge_server";
pi2_server_event_loop =
std::make_unique<aos::ShmEventLoop>(&config.message());
@@ -174,6 +185,7 @@
void MessageBridgeParameterizedTest::RunPi2Server(
chrono::nanoseconds duration) {
+ LOG(INFO) << "Running pi2 server";
// Schedule a shutdown callback.
aos::TimerHandler *const quit = pi2_server_event_loop->AddTimer(
[this]() { pi2_server_event_loop->Exit(); });
@@ -186,11 +198,13 @@
}
void MessageBridgeParameterizedTest::StartPi2Server() {
+ LOG(INFO) << "Starting pi2 server";
pi2_server_thread =
std::make_unique<ThreadedEventLoopRunner>(pi2_server_event_loop.get());
}
void MessageBridgeParameterizedTest::StopPi2Server() {
+ LOG(INFO) << "Stopping pi2 server";
pi2_server_thread.reset();
pi2_message_bridge_server.reset();
pi2_server_event_loop.reset();
@@ -198,6 +212,7 @@
void MessageBridgeParameterizedTest::MakePi2Client() {
OnPi2();
+ LOG(INFO) << "Making pi2 client";
FLAGS_application_name = "pi2_message_bridge_client";
pi2_client_event_loop =
std::make_unique<aos::ShmEventLoop>(&config.message());
@@ -208,6 +223,7 @@
void MessageBridgeParameterizedTest::RunPi2Client(
chrono::nanoseconds duration) {
+ LOG(INFO) << "Running pi2 client";
// Run for 5 seconds to make sure we have time to estimate the offset.
aos::TimerHandler *const quit = pi2_client_event_loop->AddTimer(
[this]() { pi2_client_event_loop->Exit(); });
@@ -221,11 +237,13 @@
}
void MessageBridgeParameterizedTest::StartPi2Client() {
+ LOG(INFO) << "Starting pi2 client";
pi2_client_thread =
std::make_unique<ThreadedEventLoopRunner>(pi2_client_event_loop.get());
}
void MessageBridgeParameterizedTest::StopPi2Client() {
+ LOG(INFO) << "Stopping pi2 client";
pi2_client_thread.reset();
pi2_message_bridge_client.reset();
pi2_client_event_loop.reset();
@@ -233,6 +251,7 @@
void MessageBridgeParameterizedTest::MakePi2Test() {
OnPi2();
+ LOG(INFO) << "Making pi2 test";
FLAGS_application_name = "test2";
pi2_test_event_loop = std::make_unique<aos::ShmEventLoop>(&config.message());
@@ -257,10 +276,15 @@
}
void MessageBridgeParameterizedTest::StartPi2Test() {
+ LOG(INFO) << "Starting pi2 test";
pi2_test_thread =
std::make_unique<ThreadedEventLoopRunner>(pi2_test_event_loop.get());
}
-void MessageBridgeParameterizedTest::StopPi2Test() { pi2_test_thread.reset(); }
+void MessageBridgeParameterizedTest::StopPi2Test() {
+ LOG(INFO) << "Stopping pi2 test";
+ pi2_test_thread.reset();
+}
+
} // namespace message_bridge::testing
} // namespace aos
diff --git a/aos/network/multinode_timestamp_filter.cc b/aos/network/multinode_timestamp_filter.cc
index 417845b..85ee12c 100644
--- a/aos/network/multinode_timestamp_filter.cc
+++ b/aos/network/multinode_timestamp_filter.cc
@@ -877,20 +877,27 @@
return updated;
}
-std::tuple<std::vector<BootTimestamp>, Eigen::VectorXd, size_t, size_t>
+std::optional<
+ std::tuple<std::vector<BootTimestamp>, Eigen::VectorXd, size_t, size_t>>
SolveConstrainedNewton(NewtonSolver *solver, TimestampProblem *problem,
const size_t max_iterations) {
std::vector<size_t> active_constraints;
while (true) {
- Eigen::VectorXd y;
size_t iteration;
size_t solution_node;
- std::vector<size_t> new_active_constraints;
- std::tie(y, solution_node, iteration, new_active_constraints) =
- solver->SolveConstrainedNewton(problem, max_iterations,
- active_constraints);
+ auto solver_result = solver->SolveConstrainedNewton(problem, max_iterations,
+ active_constraints);
+ if (!solver_result.has_value()) {
+ return std::nullopt;
+ }
+
+ std::tie(std::ignore, solution_node, iteration, std::ignore) =
+ *solver_result;
+ Eigen::VectorXd y = std::move(std::get<0>(*solver_result));
+ std::vector<size_t> new_active_constraints =
+ std::move(std::get<3>(*solver_result));
std::vector<BootTimestamp> result =
problem->PackResults(iteration > max_iterations ||
@@ -913,7 +920,7 @@
}
}
-std::tuple<Eigen::VectorXd, size_t, size_t, std::vector<size_t>>
+std::optional<std::tuple<Eigen::VectorXd, size_t, size_t, std::vector<size_t>>>
NewtonSolver::SolveConstrainedNewton(
Problem *problem, const size_t max_iterations,
const absl::Span<const size_t> original_active_constraints) {
@@ -1197,8 +1204,9 @@
}
}
- if (iteration > max_iterations) {
- LOG(FATAL) << "Failed to converge on solve " << my_solve_number_;
+ if (iteration > max_iterations && FLAGS_crash_on_solve_failure) {
+ LOG(ERROR) << "Failed to converge on solve " << my_solve_number_;
+ return std::nullopt;
}
return std::make_tuple(std::move(y), solution_node, iteration,
@@ -1345,45 +1353,63 @@
}
}
-std::optional<const std::tuple<distributed_clock::time_point,
- std::vector<BootTimestamp>> *>
+std::optional<std::optional<const std::tuple<distributed_clock::time_point,
+ std::vector<BootTimestamp>> *>>
InterpolatedTimeConverter::QueueNextTimestamp() {
- std::optional<
- std::tuple<distributed_clock::time_point, std::vector<BootTimestamp>>>
+ std::optional<std::optional<
+ std::tuple<distributed_clock::time_point, std::vector<BootTimestamp>>>>
next_time = NextTimestamp();
- if (!next_time) {
- VLOG(1) << "Last timestamp, calling it quits";
- at_end_ = true;
+ if (!next_time.has_value()) {
+ VLOG(1) << "Error in processing timestamps.";
return std::nullopt;
}
+ if (!next_time.value().has_value()) {
+ VLOG(1) << "Last timestamp, calling it quits";
+ std::optional<std::optional<const std::tuple<distributed_clock::time_point,
+ std::vector<BootTimestamp>> *>>
+ result;
+ result.emplace(std::nullopt);
+ // Check that C++ actually works how we think it does...
+ CHECK(result.has_value());
+ at_end_ = true;
+ return result;
+ }
- VLOG(1) << "Fetched next timestamp while solving: " << std::get<0>(*next_time)
- << " ->";
- for (BootTimestamp t : std::get<1>(*next_time)) {
+ VLOG(1) << "Fetched next timestamp while solving: "
+ << std::get<0>(**next_time) << " ->";
+ for (BootTimestamp t : std::get<1>(**next_time)) {
VLOG(1) << " " << t;
}
- CHECK_EQ(node_count_, std::get<1>(*next_time).size());
+ CHECK_EQ(node_count_, std::get<1>(**next_time).size());
if (times_.empty()) {
- for (BootTimestamp t : std::get<1>(*next_time)) {
+ for (BootTimestamp t : std::get<1>(**next_time)) {
CHECK_EQ(t.boot, 0u);
}
} else {
bool rebooted = false;
for (size_t i = 0; i < node_count_; ++i) {
if (std::get<1>(times_.back())[i].boot !=
- std::get<1>(*next_time)[i].boot) {
+ std::get<1>(**next_time)[i].boot) {
rebooted = true;
break;
}
}
if (rebooted) {
CHECK(reboot_found_);
- reboot_found_(std::get<0>(*next_time), std::get<1>(*next_time));
+ if (VLOG_IS_ON(2)) {
+ VLOG(2) << "Notified reboot of";
+ size_t node_index = 0;
+ for (logger::BootTimestamp t : std::get<1>(**next_time)) {
+ VLOG(2) << " Node " << node_index << " " << t;
+ ++node_index;
+ }
+ }
+ reboot_found_(std::get<0>(**next_time), std::get<1>(**next_time));
}
}
- times_.emplace_back(std::move(*next_time));
+ times_.emplace_back(std::move(**next_time));
return &times_.back();
}
@@ -1685,7 +1711,7 @@
}
}
-void MultiNodeNoncausalOffsetEstimator::FlushAndClose(bool destructor) {
+bool MultiNodeNoncausalOffsetEstimator::FlushAndClose(bool destructor) {
// Write out all the data in our filters.
FlushAllSamples(true);
if (fp_) {
@@ -1718,6 +1744,7 @@
}
}
}
+ filter_fps_.clear();
}
if (sample_fps_.size() != 0) {
for (std::vector<FILE *> &filter_fp : sample_fps_) {
@@ -1727,6 +1754,7 @@
}
}
}
+ sample_fps_.clear();
}
if (all_done_) {
@@ -1736,13 +1764,14 @@
std::optional<std::tuple<BootTimestamp, BootDuration>> next =
filter.filter->Consume();
if (next) {
- skip_order_validation_
- ? LOG(WARNING)
- : LOG(FATAL) << "MultiNodeNoncausalOffsetEstimator reported all "
- "done, but "
- << node_a_index << " -> " << filter.b_index
- << " found more data at time " << std::get<0>(*next)
- << ". Time estimation was silently wrong.";
+ LOG(ERROR) << "MultiNodeNoncausalOffsetEstimator reported all "
+ "done, but "
+ << node_a_index << " -> " << filter.b_index
+ << " found more data at time " << std::get<0>(*next)
+ << ". Time estimation was silently wrong.";
+ if (!skip_order_validation_) {
+ return false;
+ }
}
}
++node_a_index;
@@ -1753,14 +1782,21 @@
if (!node_samples_.empty()) {
for (NodeSamples &node : node_samples_) {
for (SingleNodeSamples &timestamps : node.nodes) {
- CHECK(timestamps.messages.empty());
+ if (!timestamps.messages.empty()) {
+ LOG(ERROR) << "Timestamps still remaining.";
+ return false;
+ }
}
}
}
+ return true;
}
MultiNodeNoncausalOffsetEstimator::~MultiNodeNoncausalOffsetEstimator() {
- FlushAndClose(true);
+ const bool success = FlushAndClose(true);
+ if (!non_fatal_destructor_checks_) {
+ CHECK(success);
+ }
}
UUID MultiNodeNoncausalOffsetEstimator::boot_uuid(size_t node_index,
@@ -2163,12 +2199,15 @@
BitSet64 all_live_nodes(problem.size());
const BitSet64 all_nodes = ~BitSet64(problem.size());
- for (size_t node_index = 0; node_index < timestamp_mappers_.size();
- ++node_index) {
- if (timestamp_mappers_[node_index] != nullptr) {
- // Make sure we have enough data queued such that if we are going to
- // have a timestamp on this filter, we do have a timestamp queued.
- timestamp_mappers_[node_index]->QueueFor(time_estimation_buffer_seconds_);
+ if (time_estimation_buffer_seconds_ != chrono::seconds(0)) {
+ for (size_t node_index = 0; node_index < timestamp_mappers_.size();
+ ++node_index) {
+ if (timestamp_mappers_[node_index] != nullptr) {
+ // Make sure we have enough data queued such that if we are going to
+ // have a timestamp on this filter, we do have a timestamp queued.
+ timestamp_mappers_[node_index]->QueueFor(
+ time_estimation_buffer_seconds_);
+ }
}
}
@@ -2188,6 +2227,10 @@
problem.add_clock_offset_filter(node_a_index, filter.filter,
filter.b_index, filter.b_filter);
+ if (time_estimation_buffer_seconds_ == chrono::seconds(0)) {
+ continue;
+ }
+
if (timestamp_mappers_[node_a_index] != nullptr) {
// Now, we have cases at startup where we have a couple of points
// followed by a long gap, followed by the body of the data. We are
@@ -2423,7 +2466,8 @@
return std::make_tuple(candidate_times, boots_all_match);
}
-std::tuple<NoncausalTimestampFilter *, std::vector<BootTimestamp>, int>
+std::optional<
+ std::tuple<NoncausalTimestampFilter *, std::vector<BootTimestamp>, int>>
MultiNodeNoncausalOffsetEstimator::SimultaneousSolution(
TimestampProblem *problem,
const std::vector<CandidateTimes> candidate_times,
@@ -2482,8 +2526,11 @@
if (iterations > kMaxIterations && FLAGS_crash_on_solve_failure) {
UpdateSolution(std::move(solution));
- FlushAndClose(false);
- LOG(FATAL) << "Failed to converge.";
+ if (!FlushAndClose(false)) {
+ return std::nullopt;
+ }
+ LOG(ERROR) << "Failed to converge.";
+ return std::nullopt;
}
if (!problem->ValidateSolution(solution, true)) {
@@ -2497,13 +2544,16 @@
problem->Debug();
if (!skip_order_validation_) {
UpdateSolution(solution);
- FlushAndClose(false);
- LOG(FATAL) << "Bailing, use --skip_order_validation to continue. "
+ if (!FlushAndClose(false)) {
+ return std::nullopt;
+ }
+ LOG(ERROR) << "Bailing, use --skip_order_validation to continue. "
"Use at your own risk.";
+ return std::nullopt;
}
} else {
// Aw, our initial attempt to solve failed. Now, let's try again with
- // constriants.
+ // constraints.
for (size_t node_index = 0; node_index < base_times.size();
++node_index) {
problem->set_base_clock(node_index, solution[node_index]);
@@ -2515,14 +2565,24 @@
}
problem->set_points(points);
- std::tie(solution, solution_y, solution_index, iterations) =
+ auto solver_result =
SolveConstrainedNewton(&solver, problem, kMaxIterations);
+ if (!solver_result.has_value()) {
+ return std::nullopt;
+ }
+ solution = std::move(std::get<0>(solver_result.value()));
+ solution_y = std::move(std::get<1>(solver_result.value()));
+ std::tie(std::ignore, std::ignore, solution_index, iterations) =
+ *solver_result;
if (iterations > kMaxIterations && FLAGS_crash_on_solve_failure) {
UpdateSolution(std::move(solution));
- FlushAndClose(false);
- LOG(FATAL) << "Failed to converge for problem "
+ if (!FlushAndClose(false)) {
+ return std::nullopt;
+ }
+ LOG(ERROR) << "Failed to converge for problem "
<< solver.my_solve_number();
+ return std::nullopt;
}
if (!problem->ValidateSolution(solution, false)) {
LOG(WARNING) << "Invalid solution, constraints not met for problem "
@@ -2533,9 +2593,12 @@
problem->Debug();
if (!skip_order_validation_) {
UpdateSolution(solution);
- FlushAndClose(false);
- LOG(FATAL) << "Bailing, use --skip_order_validation to continue. "
+ if (!FlushAndClose(false)) {
+ return std::nullopt;
+ }
+ LOG(ERROR) << "Bailing, use --skip_order_validation to continue. "
"Use at your own risk.";
+ return std::nullopt;
}
}
}
@@ -2548,7 +2611,7 @@
return std::make_tuple(next_filter, std::move(result_times), solution_index);
}
-void MultiNodeNoncausalOffsetEstimator::CheckInvalidDistance(
+bool MultiNodeNoncausalOffsetEstimator::CheckInvalidDistance(
const std::vector<BootTimestamp> &result_times,
const std::vector<BootTimestamp> &solution) {
// If times are close enough, drop the invalid time.
@@ -2562,7 +2625,7 @@
<< (result_times[i].time - solution[i].time).count() << "ns";
}
VLOG(1) << "Ignoring because it is close enough.";
- return;
+ return true;
}
// Somehow the new solution is better *and* worse than the old
// solution... This is an internal failure because that means time
@@ -2579,13 +2642,18 @@
LOG(ERROR) << "Skipping because --skip_order_validation";
} else {
UpdateSolution(solution);
- FlushAndClose(false);
- LOG(FATAL) << "Please investigate. Use --max_invalid_distance_ns="
+ if (!FlushAndClose(false)) {
+ return false;
+ }
+ LOG(ERROR) << "Please investigate. Use --max_invalid_distance_ns="
<< invalid_distance.count() << " to ignore this.";
+ return false;
}
+ return true;
}
-std::tuple<NoncausalTimestampFilter *, std::vector<BootTimestamp>, int>
+std::optional<
+ std::tuple<NoncausalTimestampFilter *, std::vector<BootTimestamp>, int>>
MultiNodeNoncausalOffsetEstimator::SequentialSolution(
TimestampProblem *problem,
const std::vector<CandidateTimes> candidate_times,
@@ -2689,9 +2757,12 @@
if (iterations > kMaxIterations && FLAGS_crash_on_solve_failure) {
UpdateSolution(std::move(solution));
- FlushAndClose(false);
- LOG(FATAL) << "Failed to converge for problem "
+ if (!FlushAndClose(false)) {
+ return std::nullopt;
+ }
+ LOG(ERROR) << "Failed to converge for problem "
<< solver.my_solve_number();
+ return std::nullopt;
}
// Bypass checking if order validation is turned off. This lets us dump a
@@ -2710,9 +2781,12 @@
problem->Debug();
if (!skip_order_validation_) {
UpdateSolution(solution);
- FlushAndClose(false);
- LOG(FATAL) << "Bailing, use --skip_order_validation to continue. "
+ if (!FlushAndClose(false)) {
+ return std::nullopt;
+ }
+ LOG(ERROR) << "Bailing, use --skip_order_validation to continue. "
"Use at your own risk.";
+ return std::nullopt;
}
} else {
// Seed with our last solution.
@@ -2722,14 +2796,24 @@
}
// And now resolve constrained.
problem->set_points(points);
- std::tie(solution, solution_y, solution_index, iterations) =
+ auto solver_result =
SolveConstrainedNewton(&solver, problem, kMaxIterations);
+ if (!solver_result.has_value()) {
+ return std::nullopt;
+ }
+ solution = std::move(std::get<0>(solver_result.value()));
+ solution_y = std::move(std::get<1>(solver_result.value()));
+ std::tie(std::ignore, std::ignore, solution_index, iterations) =
+ *solver_result;
if (iterations > kMaxIterations && FLAGS_crash_on_solve_failure) {
UpdateSolution(std::move(solution));
- FlushAndClose(false);
- LOG(FATAL) << "Failed to converge for problem "
+ if (!FlushAndClose(false)) {
+ return std::nullopt;
+ }
+ LOG(ERROR) << "Failed to converge for problem "
<< solver.my_solve_number();
+ return std::nullopt;
}
if (!problem->ValidateSolution(solution, false)) {
LOG(WARNING) << "Invalid solution, constraints not met for problem "
@@ -2740,9 +2824,12 @@
problem->Debug();
if (!skip_order_validation_) {
UpdateSolution(solution);
- FlushAndClose(false);
- LOG(FATAL) << "Bailing, use --skip_order_validation to continue. "
+ if (!FlushAndClose(false)) {
+ return std::nullopt;
+ }
+ LOG(ERROR) << "Bailing, use --skip_order_validation to continue. "
"Use at your own risk.";
+ return std::nullopt;
}
}
}
@@ -2776,7 +2863,9 @@
solution_index = node_a_index;
break;
case TimeComparison::kInvalid: {
- CheckInvalidDistance(result_times, solution);
+ if (!CheckInvalidDistance(result_times, solution)) {
+ return std::nullopt;
+ }
if (next_node_filter) {
std::optional<std::tuple<logger::BootTimestamp, logger::BootDuration>>
result = next_node_filter->Consume();
@@ -2799,18 +2888,18 @@
return std::make_tuple(next_filter, std::move(result_times), solution_index);
}
-std::tuple<NoncausalTimestampFilter *, std::vector<BootTimestamp>, int>
+std::optional<
+ std::tuple<NoncausalTimestampFilter *, std::vector<BootTimestamp>, int>>
MultiNodeNoncausalOffsetEstimator::NextSolution(
TimestampProblem *problem, const std::vector<BootTimestamp> &base_times) {
// Ok, now solve for the minimum time on each channel.
- std::vector<BootTimestamp> result_times;
-
bool boots_all_match = true;
std::vector<CandidateTimes> candidate_times;
std::tie(candidate_times, boots_all_match) = MakeCandidateTimes();
- NoncausalTimestampFilter *next_filter = nullptr;
- size_t solution_index = 0;
+ std::optional<
+ std::tuple<NoncausalTimestampFilter *, std::vector<BootTimestamp>, int>>
+ result;
// We can significantly speed things up if we know that all the boots match by
// solving for everything at once. If the boots don't match, the combined min
@@ -2818,21 +2907,21 @@
// actually using the boot from the candidate times to figure out which
// interpolation function to use under the hood.
if (boots_all_match) {
- std::tie(next_filter, result_times, solution_index) =
+ result =
SimultaneousSolution(problem, std::move(candidate_times), base_times);
} else {
// If all the boots don't match, fall back to the old method of comparing
// all the solutions individually.
- std::tie(next_filter, result_times, solution_index) =
+ result =
SequentialSolution(problem, std::move(candidate_times), base_times);
}
- if (VLOG_IS_ON(1)) {
- VLOG(1) << "Best solution is for node " << solution_index;
- for (size_t i = 0; i < result_times.size(); ++i) {
- VLOG(1) << " " << result_times[i];
+ if (VLOG_IS_ON(1) && result.has_value()) {
+ VLOG(1) << "Best solution is for node " << std::get<2>(*result);
+ for (size_t i = 0; i < std::get<1>(*result).size(); ++i) {
+ VLOG(1) << " " << std::get<1>(*result)[i];
}
}
- return std::make_tuple(next_filter, std::move(result_times), solution_index);
+ return result;
}
void MultiNodeNoncausalOffsetEstimator::UpdateSolution(
@@ -2902,19 +2991,24 @@
}
}
-std::optional<
- std::tuple<distributed_clock::time_point, std::vector<BootTimestamp>>>
+std::optional<std::optional<
+ std::tuple<distributed_clock::time_point, std::vector<BootTimestamp>>>>
MultiNodeNoncausalOffsetEstimator::NextTimestamp() {
// TODO(austin): Detect and handle there being fewer nodes in the log file
// than in replay, or them being in a different order.
TimestampProblem problem = MakeProblem();
// Ok, now solve for the minimum time on each channel.
- std::vector<BootTimestamp> result_times;
+ auto next_solution = NextSolution(&problem, last_monotonics_);
+ if (!next_solution.has_value()) {
+ return std::nullopt;
+ }
+ std::vector<BootTimestamp> result_times =
+ std::move(std::get<1>(next_solution.value()));
NoncausalTimestampFilter *next_filter = nullptr;
int solution_node_index = 0;
- std::tie(next_filter, result_times, solution_node_index) =
- NextSolution(&problem, last_monotonics_);
+ std::tie(next_filter, std::ignore, solution_node_index) =
+ next_solution.value();
CHECK(!all_done_);
@@ -2954,7 +3048,13 @@
if (fp_) {
fflush(fp_);
}
- return std::nullopt;
+ std::optional<std::optional<
+ std::tuple<distributed_clock::time_point, std::vector<BootTimestamp>>>>
+ result;
+ result.emplace(std::nullopt);
+ CHECK(result.has_value());
+ CHECK(!result.value().has_value());
+ return result;
}
{
@@ -3011,11 +3111,13 @@
}
UpdateSolution(std::move(result_times));
WriteFilter(next_filter, sample);
- FlushAndClose(false);
- LOG(FATAL)
+ if (!FlushAndClose(false)) {
+ return std::nullopt;
+ }
+ LOG(ERROR)
<< "Found a solution before the last returned solution on node "
<< solution_node_index;
- break;
+ return std::nullopt;
case TimeComparison::kEq:
WriteFilter(next_filter, sample);
return NextTimestamp();
@@ -3038,9 +3140,12 @@
}
UpdateSolution(std::move(result_times));
WriteFilter(next_filter, sample);
- FlushAndClose(false);
- LOG(FATAL) << "Please investigate. Use --max_invalid_distance_ns="
+ if (!FlushAndClose(false)) {
+ return std::nullopt;
+ }
+ LOG(ERROR) << "Please investigate. Use --max_invalid_distance_ns="
<< invalid_distance.count() << " to ignore this.";
+ return std::nullopt;
} break;
}
}
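
The nested std::optional return types introduced throughout this file encode three distinct states. A self-contained illustration of the convention (outer nullopt = error, engaged outer with empty inner = clean end of data, fully engaged = a value):

#include <iostream>
#include <optional>
#include <utility>

// Tri-state result, as used by NextTimestamp() above.
using Result = std::optional<std::optional<int>>;

Result NextValue(bool error, bool done) {
  if (error) return std::nullopt;   // Error: outer optional empty.
  if (done) {
    Result result;
    result.emplace(std::nullopt);   // End of data: inner optional empty.
    return result;
  }
  return 42;                        // Normal value.
}

int main() {
  for (const auto &[error, done] : {std::pair{false, false},
                                    std::pair{false, true},
                                    std::pair{true, false}}) {
    Result r = NextValue(error, done);
    if (!r.has_value()) {
      std::cout << "error\n";
    } else if (!r->has_value()) {
      std::cout << "end of data\n";
    } else {
      std::cout << "value " << **r << "\n";
    }
  }
}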
diff --git a/aos/network/multinode_timestamp_filter.h b/aos/network/multinode_timestamp_filter.h
index e3bfde2..959fe85 100644
--- a/aos/network/multinode_timestamp_filter.h
+++ b/aos/network/multinode_timestamp_filter.h
@@ -121,7 +121,8 @@
// Returns the solution, the solution_node field from the final derivatives,
// the number of iterations it took, and a list of the constraints actually
// used in the solution.
- std::tuple<Eigen::VectorXd, size_t, size_t, std::vector<size_t>>
+ std::optional<
+ std::tuple<Eigen::VectorXd, size_t, size_t, std::vector<size_t>>>
SolveConstrainedNewton(Problem *problem, size_t max_iterations,
const absl::Span<const size_t> constraint_indices);
@@ -354,8 +355,9 @@
// Queues 1 more timestamp in the interpolation list. This is public for
// timestamp_extractor so it can hammer on the log until everything is queued.
- std::optional<const std::tuple<distributed_clock::time_point,
- std::vector<logger::BootTimestamp>> *>
+ // Return type matches that of NextTimestamp().
+ [[nodiscard]] std::optional<std::optional<const std::tuple<
+ distributed_clock::time_point, std::vector<logger::BootTimestamp>> *>>
QueueNextTimestamp();
private:
@@ -364,8 +366,12 @@
// A timestamp is a sample of the distributed clock and a corresponding point
// on every monotonic clock for all the nodes in the factory that this will be
// hooked up to.
- virtual std::optional<std::tuple<distributed_clock::time_point,
- std::vector<logger::BootTimestamp>>>
+ // If !NextTimestamp().has_value(), then an error occurred. If
+ // !NextTimestamp().value().has_value(), then there is merely no next
+ // timestamp available.
+ // TODO(james): Swap this to std::expected when available.
+ virtual std::optional<std::optional<std::tuple<
+ distributed_clock::time_point, std::vector<logger::BootTimestamp>>>>
NextTimestamp() = 0;
// Queues timestamps until the last time in the queue matches the provided
@@ -373,7 +379,9 @@
template <typename F>
void QueueUntil(F not_done) {
while (!at_end_ && (times_.empty() || not_done(times_.back()))) {
- QueueNextTimestamp();
+ // Check the outer std::optional to watch for errors.
+ CHECK(QueueNextTimestamp().has_value())
+ << ": An error occurred when queueing timestamps.";
}
CHECK(!times_.empty())
@@ -472,10 +480,6 @@
void SetTimestampMappers(
std::vector<logger::TimestampMapper *> timestamp_mappers);
- std::optional<std::tuple<distributed_clock::time_point,
- std::vector<logger::BootTimestamp>>>
- NextTimestamp() override;
-
UUID boot_uuid(size_t node_index, size_t boot_count) override;
// Checks that all the nodes in the graph are connected. Needs all filters to
@@ -504,6 +508,14 @@
// Returns the configuration that we are replaying into.
const aos::Configuration *configuration() const { return configuration_; }
+ // Runs some checks that normally run with fatal CHECK's in the destructor.
+ // Returns false if any checks failed. This is used to allow the
+ // logfile_validator to non-fatally identify certain log sorting issues.
+ [[nodiscard]] bool RunDestructorChecks() {
+ non_fatal_destructor_checks_ = true;
+ return FlushAndClose(false);
+ }
+
private:
static constexpr int kMaxIterations = 400;
@@ -516,35 +528,39 @@
TimestampProblem MakeProblem();
+ std::optional<std::optional<std::tuple<distributed_clock::time_point,
+ std::vector<logger::BootTimestamp>>>>
+ NextTimestamp() override;
+
// Returns the list of candidate times to solve for.
std::tuple<std::vector<CandidateTimes>, bool> MakeCandidateTimes() const;
// Returns the next solution, the filter which has the control point for it
// (or nullptr if a start time triggered this to be returned), and the node
// which triggered it.
- std::tuple<NoncausalTimestampFilter *, std::vector<logger::BootTimestamp>,
- int>
+ std::optional<std::tuple<NoncausalTimestampFilter *,
+ std::vector<logger::BootTimestamp>, int>>
NextSolution(TimestampProblem *problem,
const std::vector<logger::BootTimestamp> &base_times);
// Returns the solution (if there is one) for the list of candidate times by
// solving all the problems simultaneously. They must be from the same boot.
- std::tuple<NoncausalTimestampFilter *, std::vector<logger::BootTimestamp>,
- int>
+ std::optional<std::tuple<NoncausalTimestampFilter *,
+ std::vector<logger::BootTimestamp>, int>>
SimultaneousSolution(TimestampProblem *problem,
const std::vector<CandidateTimes> candidate_times,
const std::vector<logger::BootTimestamp> &base_times);
// Returns the solution (if there is one) for the list of candidate times by
// solving the problems one after another. They can be from any boot.
- std::tuple<NoncausalTimestampFilter *, std::vector<logger::BootTimestamp>,
- int>
+ std::optional<std::tuple<NoncausalTimestampFilter *,
+ std::vector<logger::BootTimestamp>, int>>
SequentialSolution(TimestampProblem *problem,
const std::vector<CandidateTimes> candidate_times,
const std::vector<logger::BootTimestamp> &base_times);
- // Explodes if the invalid distance is too far.
- void CheckInvalidDistance(
+ // Returns false if the invalid distance is too far.
+ [[nodiscard]] bool CheckInvalidDistance(
const std::vector<logger::BootTimestamp> &result_times,
const std::vector<logger::BootTimestamp> &solution);
@@ -560,15 +576,18 @@
// Writes everything to disk and closes it all out in preparation for either
// destruction or crashing.
- void FlushAndClose(bool destructor);
+ // Returns true if everything is healthy; returns false if we discovered
+ // sorting issues when closing things out.
+ [[nodiscard]] bool FlushAndClose(bool destructor);
const Configuration *configuration_;
const Configuration *logged_configuration_;
std::shared_ptr<const logger::Boots> boots_;
- // If true, skip any validation which would trigger if we see evidence that
- // time estimation between nodes was incorrect.
+ // If true, don't fatally die on any order validation failures which would
+ // trigger if we see evidence that time estimation between nodes was
+ // incorrect.
const bool skip_order_validation_;
// List of filters for a connection. The pointer to the first node will be
@@ -598,6 +617,7 @@
bool first_solution_ = true;
bool all_done_ = false;
+ bool non_fatal_destructor_checks_ = false;
// Optional file pointers to save the results of the noncausal filter in. This
// lives here so we can give each sample a distributed clock.
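
The header changes above convert fatal destructor-time checks into [[nodiscard]] bool results that the destructor enforces only by default. A minimal sketch of that shape, assuming glog-style macros and a hypothetical Estimator class:

#include "glog/logging.h"

class Estimator {
 public:
  ~Estimator() {
    const bool success = FlushAndClose();
    if (!non_fatal_destructor_checks_) {
      CHECK(success) << ": Estimator shut down with unflushed errors.";
    }
  }

  // Lets a validator tool observe failures without crashing.
  [[nodiscard]] bool RunDestructorChecks() {
    non_fatal_destructor_checks_ = true;
    return FlushAndClose();
  }

 private:
  [[nodiscard]] bool FlushAndClose() {
    if (leftover_data_) {
      LOG(ERROR) << "Data remaining after close; sorting was likely wrong.";
      return false;
    }
    return true;
  }

  bool non_fatal_destructor_checks_ = false;
  bool leftover_data_ = false;
};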
diff --git a/aos/network/multinode_timestamp_filter_test.cc b/aos/network/multinode_timestamp_filter_test.cc
index 9b37806..9aa76c0 100644
--- a/aos/network/multinode_timestamp_filter_test.cc
+++ b/aos/network/multinode_timestamp_filter_test.cc
@@ -407,7 +407,7 @@
std::vector<size_t> used_constraints;
std::tie(y, solution_node, iterations, used_constraints) =
- solver.SolveConstrainedNewton(&problem, 20, constraints);
+ solver.SolveConstrainedNewton(&problem, 20, constraints).value();
LOG(INFO) << y.transpose();
diff --git a/aos/network/testing_time_converter.cc b/aos/network/testing_time_converter.cc
index c175fe6..9dc5fff 100644
--- a/aos/network/testing_time_converter.cc
+++ b/aos/network/testing_time_converter.cc
@@ -22,7 +22,10 @@
TestingTimeConverter::~TestingTimeConverter() {
if (at_end_) {
- CHECK(!NextTimestamp()) << ": At the end but there is more data.";
+ auto next_timestamp = NextTimestamp();
+ CHECK(next_timestamp.has_value()) << ": Unexpected error";
+ CHECK(!next_timestamp.value().has_value())
+ << ": At the end but there is more data.";
}
}
@@ -105,13 +108,17 @@
ts_.emplace_back(std::make_tuple(time, std::move(times)));
}
-std::optional<std::tuple<distributed_clock::time_point,
- std::vector<logger::BootTimestamp>>>
+std::optional<std::optional<std::tuple<distributed_clock::time_point,
+ std::vector<logger::BootTimestamp>>>>
TestingTimeConverter::NextTimestamp() {
CHECK(!first_) << ": Tried to pull a timestamp before one was added. This "
"is unlikely to be what you want.";
if (ts_.empty()) {
- return std::nullopt;
+ std::optional<std::optional<std::tuple<distributed_clock::time_point,
+ std::vector<logger::BootTimestamp>>>>
+ result;
+ result.emplace(std::nullopt);
+ return result;
}
auto result = ts_.front();
ts_.pop_front();
diff --git a/aos/network/testing_time_converter.h b/aos/network/testing_time_converter.h
index 20b8e16..ae85cee 100644
--- a/aos/network/testing_time_converter.h
+++ b/aos/network/testing_time_converter.h
@@ -41,8 +41,8 @@
void AddNextTimestamp(distributed_clock::time_point time,
std::vector<logger::BootTimestamp> times);
- std::optional<std::tuple<distributed_clock::time_point,
- std::vector<logger::BootTimestamp>>>
+ std::optional<std::optional<std::tuple<distributed_clock::time_point,
+ std::vector<logger::BootTimestamp>>>>
NextTimestamp() override;
void set_boot_uuid(size_t node_index, size_t boot_count, UUID uuid) {
diff --git a/aos/network/timestamp_filter.cc b/aos/network/timestamp_filter.cc
index 77d3726..4a29dfc 100644
--- a/aos/network/timestamp_filter.cc
+++ b/aos/network/timestamp_filter.cc
@@ -1279,7 +1279,8 @@
<< " seconds after the last sample at " << std::get<0>(timestamps_[0])
<< ". Increase --time_estimation_buffer_seconds to greater than "
<< chrono::duration<double>(monotonic_now - std::get<0>(timestamps_[0]))
- .count();
+ .count()
+ << ", or set --force_timestamp_loading";
return;
}
CHECK_GT(monotonic_now, frozen_time_)
@@ -1287,7 +1288,8 @@
<< " before the frozen time of " << frozen_time_
<< ". Increase "
"--time_estimation_buffer_seconds to greater than "
- << chrono::duration<double>(frozen_time_ - monotonic_now).count();
+ << chrono::duration<double>(frozen_time_ - monotonic_now).count()
+ << ", or set --force_timestamp_loading";
// Future samples get quite a bit harder. We want the line to track the
// highest point without violating the slope constraint.
@@ -1333,7 +1335,8 @@
<< " seconds after the last sample at " << std::get<0>(timestamps_[0])
<< ". Increase --time_estimation_buffer_seconds to greater than "
<< chrono::duration<double>(monotonic_now - std::get<0>(timestamps_[0]))
- .count();
+ .count()
+ << ", or set --force_timestamp_loading";
// Back propagate the max velocity and remove any elements violating the
// velocity constraint. This is to handle the case where the offsets were
@@ -1357,8 +1360,8 @@
<< chrono::duration<double>(monotonic_now - std::get<0>(back)).count()
<< " seconds in the past. Increase --time_estimation_buffer_seconds "
"to greater than "
- << chrono::duration<double>(monotonic_now - std::get<0>(back))
- .count();
+ << chrono::duration<double>(monotonic_now - std::get<0>(back)).count()
+ << ", or set --force_timestamp_loading";
VLOG(1) << node_names_
<< " Removing now invalid sample during back propegation of "
<< TimeString(back);
@@ -1554,7 +1557,8 @@
<< ": " << node_names_
<< " Can't pop an already frozen sample. Increase "
"--time_estimation_buffer_seconds to greater than "
- << chrono::duration<double>(prior_dt).count();
+ << chrono::duration<double>(prior_dt).count()
+ << ", or set --force_timestamp_loading";
VLOG(1) << "Prior slope is too positive, removing prior point "
<< TimeString(*prior_it);
diff --git a/aos/sha256.cc b/aos/sha256.cc
index ae83792..cfeccde 100644
--- a/aos/sha256.cc
+++ b/aos/sha256.cc
@@ -7,6 +7,8 @@
#include "absl/types/span.h"
#include "openssl/sha.h"
+#include "aos/util/file.h"
+
namespace aos {
std::string Sha256(const absl::Span<const uint8_t> str) {
@@ -23,4 +25,13 @@
return ss.str();
}
+std::string Sha256(std::string_view str) {
+ return Sha256({reinterpret_cast<const uint8_t *>(str.data()), str.size()});
+}
+
+std::string Sha256OfFile(std::filesystem::path file) {
+ const std::string contents = aos::util::ReadFileToStringOrDie(file.string());
+ return Sha256(contents);
+}
+
} // namespace aos
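
The diff above only adds the string_view and file overloads; for reference, a hex-encoded one-shot digest with the OpenSSL/BoringSSL SHA256() call (which sha256.cc links against) can be sketched as follows. The shape of the real Sha256() body is an assumption here:

#include <cstddef>
#include <cstdint>
#include <iomanip>
#include <sstream>
#include <string>

#include "openssl/sha.h"

// One-shot SHA-256, hex-encoded lowercase, matching the helper's output
// format (64 hex characters for a 32-byte digest).
std::string Sha256Hex(const uint8_t *data, size_t size) {
  unsigned char digest[SHA256_DIGEST_LENGTH];
  SHA256(data, size, digest);
  std::stringstream out;
  for (unsigned char byte : digest) {
    out << std::hex << std::setw(2) << std::setfill('0')
        << static_cast<int>(byte);
  }
  return out.str();
}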
diff --git a/aos/sha256.h b/aos/sha256.h
index 7fb9b2f..117bc14 100644
--- a/aos/sha256.h
+++ b/aos/sha256.h
@@ -1,6 +1,7 @@
#ifndef AOS_SHA256_H_
#define AOS_SHA256_H_
+#include <filesystem>
#include <string>
#include "absl/types/span.h"
@@ -9,6 +10,10 @@
// Returns the sha256 of a span.
std::string Sha256(const absl::Span<const uint8_t> str);
+std::string Sha256(std::string_view str);
+
+// Returns the Sha256 of the specified file. Dies on failure to read the file.
+std::string Sha256OfFile(std::filesystem::path file);
} // namespace aos
diff --git a/aos/sha256_test.cc b/aos/sha256_test.cc
new file mode 100644
index 0000000..33ab5ec
--- /dev/null
+++ b/aos/sha256_test.cc
@@ -0,0 +1,27 @@
+#include "aos/sha256.h"
+
+#include "gtest/gtest.h"
+
+#include "aos/testing/tmpdir.h"
+#include "aos/util/file.h"
+
+namespace aos::testing {
+
+constexpr const char *kTestString = "Test String";
+constexpr const char *kTestStringSha =
+ "30c6ff7a44f7035af933babaea771bf177fc38f06482ad06434cbcc04de7ac14";
+
+TEST(Sha256Test, ChecksumString) {
+ EXPECT_EQ(kTestStringSha, Sha256(kTestString));
+ EXPECT_EQ("2b4da12a4bfe66a061c24440521a9e5b994e4f0bcc47a17436275f5283bb6852",
+ Sha256("Test String 2"));
+}
+
+TEST(Sha256Test, ChecksumFile) {
+ const std::filesystem::path test_file =
+ aos::testing::TestTmpDir() + "/test.txt";
+ util::WriteStringToFileOrDie(test_file.string(), kTestString);
+ EXPECT_EQ(kTestStringSha, Sha256OfFile(test_file));
+}
+
+} // namespace aos::testing
diff --git a/aos/starter/BUILD b/aos/starter/BUILD
index 7ef3777..732cfe3 100644
--- a/aos/starter/BUILD
+++ b/aos/starter/BUILD
@@ -201,11 +201,13 @@
"$(rootpath :aos_starter)",
"$(rootpath //aos:aos_dump)",
"$(rootpath //aos/events/logging:logger_main)",
+ "$(rootpath //aos/events:aos_timing_report_streamer)",
],
data = [
":aos_starter",
":starterd",
"//aos:aos_dump",
+ "//aos/events:aos_timing_report_streamer",
"//aos/events:ping",
"//aos/events:pingpong_config",
"//aos/events:pong",
diff --git a/aos/starter/starter.fbs b/aos/starter/starter.fbs
index 7285281..d2cd573 100644
--- a/aos/starter/starter.fbs
+++ b/aos/starter/starter.fbs
@@ -82,6 +82,12 @@
// actually stopping and the parent process receiving the signal indicating that the application
// finished stopping.
process_info: util.ProcessInfo (id: 7);
+
+ // Indicates whether we have observed a recent AOS timing report from
+ // the application. Staleness is calculated based on the timing report period
+ // specified for the starterd application (defaults to 1 Hz, can be overridden
+ // by --timing_report_ms).
+ has_active_timing_report: bool (id: 8);
}
root_type Status;
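
A sketch of the staleness rule described in the new field's comment, using std::chrono. The function name is hypothetical; the 3x margin matches the PopulateStatus() change to subprocess.cc later in this diff:

#include <chrono>

// A report is "active" if it arrived within a few timing-report periods of
// now, leaving margin for scheduling jitter.
bool HasActiveTimingReport(
    std::chrono::steady_clock::time_point last_report,
    std::chrono::steady_clock::time_point now,
    std::chrono::milliseconds timing_report_period) {
  return last_report + 3 * timing_report_period > now;
}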
diff --git a/aos/starter/starter_demo.py b/aos/starter/starter_demo.py
index 5a50890..89d06a0 100755
--- a/aos/starter/starter_demo.py
+++ b/aos/starter/starter_demo.py
@@ -7,9 +7,9 @@
DESCRIPTION = """
This script provides a convenient way to experiment with the starter
-and starter_cmd in particular. To run this, run:
+and aos_starter in particular. To run this, run:
$ bazel run //aos/starter:starter_demo
-This will then print out instructions for running starter_cmd.
+This will then print out instructions for running aos_starter.
If running via bazel, you should not need to specify the positional
arguments.
@@ -21,11 +21,7 @@
formatter_class=argparse.RawDescriptionHelpFormatter)
parser.add_argument("starterd", help="Location of starterd")
parser.add_argument("configs", help="Location of the config files")
- parser.add_argument("ping", help="Location of ping")
- parser.add_argument("pong", help="Location of pong")
- parser.add_argument("starter_cmd", help="Location of starter_cmd")
- parser.add_argument("aos_dump", help="Location of aos_dump")
- parser.add_argument("logger_main", help="Location of logger_main")
+ parser.add_argument("binaries", nargs='+', help="Binaries to provide")
args = parser.parse_args()
# Copy all the interesting files into a temporary directory and run
@@ -37,14 +33,11 @@
# can take a new --shm_base to allow cleaner running on shared systems.
with tempfile.TemporaryDirectory() as tmpdir:
shutil.copy(args.starterd, tmpdir + "/starterd")
- shutil.copy(args.ping, tmpdir + "/ping")
- shutil.copy(args.pong, tmpdir + "/pong")
- shutil.copy(args.starter_cmd, tmpdir + "/starter_cmd")
- shutil.copy(args.aos_dump, tmpdir + "/aos_dump")
- shutil.copy(args.logger_main, tmpdir + "/logger_main")
+ for binary in args.binaries:
+ shutil.copy(binary, tmpdir + "/" + os.path.basename(binary))
print(f"Running starter from {tmpdir}")
- print(f"\n\nTo run starter_cmd, do:\ncd {tmpdir}\n./starter_cmd\n\n")
+ print(f"\n\nTo run aos_starter, do:\ncd {tmpdir}\n./aos_starter\n\n")
for config in args.configs.split(' '):
basename = os.path.basename(config)
diff --git a/aos/starter/starter_test.cc b/aos/starter/starter_test.cc
index 3070f8e..cbb83f6 100644
--- a/aos/starter/starter_test.cc
+++ b/aos/starter/starter_test.cc
@@ -358,6 +358,8 @@
ASSERT_EQ(pong_app_status->pid(), pong_app_status->process_info()->pid());
ASSERT_TRUE(pong_app_status->process_info()->has_cpu_usage());
ASSERT_LE(0.0, pong_app_status->process_info()->cpu_usage());
+ ASSERT_TRUE(pong_app_status->has_has_active_timing_report());
+ ASSERT_TRUE(pong_app_status->has_active_timing_report());
watcher_loop.Exit();
SUCCEED();
}
diff --git a/aos/starter/starterd_lib.cc b/aos/starter/starterd_lib.cc
index f08ecc2..bf0fc31 100644
--- a/aos/starter/starterd_lib.cc
+++ b/aos/starter/starterd_lib.cc
@@ -36,10 +36,13 @@
: config_msg_(event_loop_config),
event_loop_(event_loop_config),
status_sender_(event_loop_.MakeSender<aos::starter::Status>("/aos")),
- status_timer_(event_loop_.AddTimer([this] {
- SendStatus();
- status_count_ = 0;
- })),
+ status_timer_(event_loop_.AddPhasedLoop(
+ [this](int elapsed_cycles) {
+ ServiceTimingReportFetcher(elapsed_cycles);
+ SendStatus();
+ status_count_ = 0;
+ },
+ std::chrono::milliseconds(1000))),
cleanup_timer_(event_loop_.AddTimer([this] {
event_loop_.Exit();
LOG(INFO) << "Starter event loop exit finished.";
@@ -47,17 +50,14 @@
max_status_count_(
event_loop_.GetChannel<aos::starter::Status>("/aos")->frequency() -
1),
+ timing_report_fetcher_(
+ event_loop_.MakeFetcher<aos::timing::Report>("/aos")),
shm_base_(FLAGS_shm_base),
listener_(&event_loop_,
[this](signalfd_siginfo signal) { OnSignal(signal); }),
top_(&event_loop_) {
event_loop_.SkipAosLog();
- event_loop_.OnRun([this] {
- status_timer_->Schedule(event_loop_.monotonic_now(),
- std::chrono::milliseconds(1000));
- });
-
if (!aos::configuration::MultiNode(config_msg_)) {
event_loop_.MakeWatcher(
"/aos",
@@ -266,6 +266,23 @@
event_loop_.Run();
}
+void Starter::ServiceTimingReportFetcher(int elapsed_cycles) {
+ // If there is any chance that it has been longer than one cycle since we last
+ // serviced the fetcher, call Fetch(). This reduces the chances that the
+ // fetcher falls behind when the system is under heavy load. Dropping a few
+ // timing report messages when the system is under stress is fine.
+ if (timing_report_fetcher_.get() == nullptr || elapsed_cycles > 1) {
+ timing_report_fetcher_.Fetch();
+ }
+ while (timing_report_fetcher_.FetchNext()) {
+ for (auto &application : applications_) {
+ application.second.ObserveTimingReport(
+ timing_report_fetcher_.context().monotonic_event_time,
+ timing_report_fetcher_.get());
+ }
+ }
+}
+
void Starter::SendStatus() {
aos::Sender<aos::starter::Status>::Builder builder =
status_sender_.MakeBuilder();
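
Restating the catch-up pattern from ServiceTimingReportFetcher() in isolation: Fetch() skips to the latest message, while FetchNext() walks forward one message at a time. DrainFetcher below is a hypothetical generic wrapper around the AOS Fetcher interface used above, not AOS API:

// If we may have fallen more than one phased-loop cycle behind, jump to the
// newest message rather than replaying a backlog under load; then consume
// anything that arrives after the point we skipped to.
template <typename Fetcher, typename Handler>
void DrainFetcher(Fetcher &fetcher, int elapsed_cycles, Handler &&handle) {
  if (fetcher.get() == nullptr || elapsed_cycles > 1) {
    fetcher.Fetch();
  }
  while (fetcher.FetchNext()) {
    handle(fetcher.context().monotonic_event_time, fetcher.get());
  }
}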
diff --git a/aos/starter/starterd_lib.h b/aos/starter/starterd_lib.h
index 775cdbc..92b20fa 100644
--- a/aos/starter/starterd_lib.h
+++ b/aos/starter/starterd_lib.h
@@ -59,6 +59,10 @@
// limit.
void HandleStateChange();
+ // Called periodically to run through the timing report fetcher and alert all
// the Applications to the new messages.
+ void ServiceTimingReportFetcher(int elapsed_cycles);
+
void SendStatus();
// Creates a MemoryMappedQueue for the given channel, to pre-allocate shared
@@ -71,12 +75,14 @@
aos::ShmEventLoop event_loop_;
aos::Sender<aos::starter::Status> status_sender_;
- aos::TimerHandler *status_timer_;
+ aos::PhasedLoopHandler *status_timer_;
aos::TimerHandler *cleanup_timer_;
int status_count_ = 0;
const int max_status_count_;
+ aos::Fetcher<aos::timing::Report> timing_report_fetcher_;
+
std::unordered_map<std::string, Application> applications_;
// Lock and list of all the queues. This makes it so we can initialize the
diff --git a/aos/starter/subprocess.cc b/aos/starter/subprocess.cc
index b1320f4..7b5fce6 100644
--- a/aos/starter/subprocess.cc
+++ b/aos/starter/subprocess.cc
@@ -8,6 +8,8 @@
#include "glog/logging.h"
+#include "aos/flatbuffer_merge.h"
+
namespace aos::starter {
// RAII class to become root and restore back to the original user and group
@@ -36,22 +38,25 @@
gid_t rgid_, egid_, sgid_;
};
-MemoryCGroup::MemoryCGroup(std::string_view name)
- : cgroup_(absl::StrCat("/sys/fs/cgroup/memory/aos_", name)) {
- Sudo sudo;
- int ret = mkdir(cgroup_.c_str(), 0755);
+MemoryCGroup::MemoryCGroup(std::string_view name, Create should_create)
+ : cgroup_(absl::StrCat("/sys/fs/cgroup/memory/aos_", name)),
+ should_create_(should_create) {
+ if (should_create_ == Create::kDoCreate) {
+ Sudo sudo;
+ int ret = mkdir(cgroup_.c_str(), 0755);
- if (ret != 0) {
- if (errno == EEXIST) {
- PCHECK(rmdir(cgroup_.c_str()) == 0)
- << ": Failed to remove previous cgroup " << cgroup_;
- ret = mkdir(cgroup_.c_str(), 0755);
+ if (ret != 0) {
+ if (errno == EEXIST) {
+ PCHECK(rmdir(cgroup_.c_str()) == 0)
+ << ": Failed to remove previous cgroup " << cgroup_;
+ ret = mkdir(cgroup_.c_str(), 0755);
+ }
}
- }
- if (ret != 0) {
- PLOG(FATAL) << ": Failed to create cgroup aos_" << cgroup_
- << ", do you have permission?";
+ if (ret != 0) {
+ PLOG(FATAL) << ": Failed to create cgroup aos_" << cgroup_
+ << ", do you have permission?";
+ }
}
}
@@ -59,20 +64,32 @@
if (pid == 0) {
pid = getpid();
}
- Sudo sudo;
- util::WriteStringToFileOrDie(absl::StrCat(cgroup_, "/tasks").c_str(),
- std::to_string(pid));
+ if (should_create_ == Create::kDoCreate) {
+ Sudo sudo;
+ util::WriteStringToFileOrDie(absl::StrCat(cgroup_, "/tasks").c_str(),
+ std::to_string(pid));
+ } else {
+ util::WriteStringToFileOrDie(absl::StrCat(cgroup_, "/tasks").c_str(),
+ std::to_string(pid));
+ }
}
void MemoryCGroup::SetLimit(std::string_view limit_name, uint64_t limit_value) {
- Sudo sudo;
- util::WriteStringToFileOrDie(absl::StrCat(cgroup_, "/", limit_name).c_str(),
- std::to_string(limit_value));
+ if (should_create_ == Create::kDoCreate) {
+ Sudo sudo;
+ util::WriteStringToFileOrDie(absl::StrCat(cgroup_, "/", limit_name).c_str(),
+ std::to_string(limit_value));
+ } else {
+ util::WriteStringToFileOrDie(absl::StrCat(cgroup_, "/", limit_name).c_str(),
+ std::to_string(limit_value));
+ }
}
MemoryCGroup::~MemoryCGroup() {
- Sudo sudo;
- PCHECK(rmdir(absl::StrCat(cgroup_).c_str()) == 0);
+ if (should_create_ == Create::kDoCreate) {
+ Sudo sudo;
+ PCHECK(rmdir(absl::StrCat(cgroup_).c_str()) == 0);
+ }
}
SignalListener::SignalListener(aos::ShmEventLoop *loop,
@@ -127,6 +144,11 @@
LOG_IF(WARNING, quiet_flag_ == QuietLogging::kNo)
<< "Failed to stop, sending SIGKILL to '" << name_
<< "' pid: " << pid_;
+ } else {
+ PLOG_IF(WARNING, quiet_flag_ == QuietLogging::kNo)
+ << "Failed to send SIGKILL to '" << name_ << "' pid: " << pid_;
+ stop_timer_->Schedule(event_loop_->monotonic_now() +
+ std::chrono::seconds(1));
}
})),
pipe_timer_(event_loop_->AddTimer([this]() { FetchOutputs(); })),
@@ -134,13 +156,11 @@
event_loop_->AddTimer([this]() { MaybeHandleSignal(); })),
on_change_({on_change}),
quiet_flag_(quiet_flag) {
- event_loop_->OnRun([this]() {
- // Every second poll to check if the child is dead. This is used as a
- // default for the case where the user is not directly catching SIGCHLD and
- // calling MaybeHandleSignal for us.
- child_status_handler_->Schedule(event_loop_->monotonic_now(),
- std::chrono::seconds(1));
- });
+ // Every second poll to check if the child is dead. This is used as a
+ // default for the case where the user is not directly catching SIGCHLD and
+ // calling MaybeHandleSignal for us.
+ child_status_handler_->Schedule(event_loop_->monotonic_now(),
+ std::chrono::seconds(1));
}
Application::Application(const aos::Application *application,
@@ -202,6 +222,7 @@
id_ = next_id_++;
start_time_ = event_loop_->monotonic_now();
status_ = aos::starter::State::STARTING;
+ latest_timing_report_version_.reset();
LOG_IF(INFO, quiet_flag_ == QuietLogging::kNo)
<< "Starting '" << name_ << "' pid " << pid_;
@@ -308,6 +329,16 @@
_exit(EXIT_FAILURE);
}
+void Application::ObserveTimingReport(
+ const aos::monotonic_clock::time_point send_time,
+ const aos::timing::Report *msg) {
+ if (msg->name()->string_view() == name_ && msg->pid() == pid_ &&
+ msg->has_version()) {
+ latest_timing_report_version_ = msg->version()->str();
+ last_timing_report_ = send_time;
+ }
+}
+
void Application::FetchOutputs() {
if (capture_stdout_) {
stdout_pipes_.read->Read(&stdout_);
@@ -462,6 +493,12 @@
if (exit_code_.has_value()) {
status_builder.add_last_exit_code(exit_code_.value());
}
+ status_builder.add_has_active_timing_report(
+ last_timing_report_ +
+ // Leave a bit of margin on the timing report receipt time, to allow
+ // for timing errors.
+ 3 * std::chrono::milliseconds(FLAGS_timing_report_ms) >
+ event_loop_->monotonic_now());
status_builder.add_last_stop_reason(stop_reason_);
if (pid_ != -1) {
status_builder.add_pid(pid_);
@@ -572,9 +609,17 @@
<< "Application '" << name_ << "' pid " << pid_
<< " exited with status " << exit_code_.value();
} else {
- LOG_IF(WARNING, quiet_flag_ == QuietLogging::kNo)
- << "Application '" << name_ << "' pid " << pid_
- << " exited unexpectedly with status " << exit_code_.value();
+ if (quiet_flag_ == QuietLogging::kNo) {
+ std::string version_string =
+ latest_timing_report_version_.has_value()
+ ? absl::StrCat("'", latest_timing_report_version_.value(),
+ "'")
+ : "unknown";
+ LOG_IF(WARNING, quiet_flag_ == QuietLogging::kNo)
+ << "Application '" << name_ << "' pid " << pid_ << " version "
+ << version_string << " exited unexpectedly with status "
+ << exit_code_.value();
+ }
}
if (autorestart()) {
QueueStart();
@@ -621,4 +666,12 @@
}
}
+Application::~Application() {
+ start_timer_->Disable();
+ restart_timer_->Disable();
+ stop_timer_->Disable();
+ pipe_timer_->Disable();
+ child_status_handler_->Disable();
+}
+
} // namespace aos::starter
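
The new else branch above retries SIGKILL delivery instead of silently giving up. A self-contained sketch of the escalation pattern with plain POSIX signals (TryStop and the bool flag are hypothetical; the real code re-arms an AOS TimerHandler roughly every second):

#include <signal.h>
#include <sys/types.h>

#include "glog/logging.h"

// Returns true once SIGKILL was delivered; otherwise the caller should
// re-arm its timer and call this again.
bool TryStop(pid_t pid, bool &sent_sigint) {
  if (!sent_sigint) {
    sent_sigint = true;
    kill(pid, SIGINT);   // Ask politely first.
    return false;        // Check again in ~1s; escalate if still alive.
  }
  if (kill(pid, SIGKILL) != 0) {
    PLOG(WARNING) << "Failed to send SIGKILL to pid " << pid;
    return false;        // Re-arm the timer and retry SIGKILL.
  }
  return true;
}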
diff --git a/aos/starter/subprocess.h b/aos/starter/subprocess.h
index ff62117..784c544 100644
--- a/aos/starter/subprocess.h
+++ b/aos/starter/subprocess.h
@@ -43,7 +43,14 @@
// Class to use the V1 cgroup API to limit memory usage.
class MemoryCGroup {
public:
- MemoryCGroup(std::string_view name);
+ // Enum to control if MemoryCGroup should create the cgroup and remove it on
+ // its own, or if it should assume it already exists and just use it.
+ enum class Create {
+ kDoCreate,
+ kDoNotCreate,
+ };
+
+ MemoryCGroup(std::string_view name, Create should_create = Create::kDoCreate);
~MemoryCGroup();
// Adds a thread ID to be managed by the cgroup.
@@ -54,6 +61,7 @@
private:
std::string cgroup_;
+ Create should_create_;
};
// Manages a running process, allowing starting and stopping, and restarting
@@ -73,6 +81,8 @@
aos::EventLoop *event_loop, std::function<void()> on_change,
QuietLogging quiet_flag = QuietLogging::kNo);
+ ~Application();
+
flatbuffers::Offset<aos::starter::ApplicationStatus> PopulateStatus(
flatbuffers::FlatBufferBuilder *builder, util::Top *top);
aos::starter::State status() const { return status_; };
@@ -92,8 +102,12 @@
void Start() { HandleCommand(aos::starter::Command::START); }
+ // Stops the command by sending a SIGINT first, followed by a SIGKILL if it's
+ // still alive in 1s.
void Stop() { HandleCommand(aos::starter::Command::STOP); }
+ // Stops the command the same way as Stop() does, but updates internal state
+ // to reflect that the application was terminated.
void Terminate();
// Adds a callback which gets notified when the application changes state.
@@ -111,6 +125,7 @@
bool autostart() const { return autostart_; }
bool autorestart() const { return autorestart_; }
+ void set_autorestart(bool autorestart) { autorestart_ = autorestart; }
const std::string &GetStdout();
const std::string &GetStderr();
@@ -124,6 +139,23 @@
memory_cgroup_->SetLimit("memory.limit_in_bytes", limit);
}
+ // Sets the cgroup and memory limit to a pre-existing cgroup which is
+ // externally managed. This lets us configure the cgroup of an application
+ // without root access.
+ void SetExistingCgroupMemoryLimit(std::string_view name, size_t limit) {
+ if (!memory_cgroup_) {
+ memory_cgroup_ = std::make_unique<MemoryCGroup>(
+ name, MemoryCGroup::Create::kDoNotCreate);
+ }
+ memory_cgroup_->SetLimit("memory.limit_in_bytes", limit);
+ }
+
+ // Observe a timing report message, and save it if it is relevant to us.
+ // It is the responsibility of the caller to manage this, because the lifetime
+ // of the Application itself is such that it cannot own Fetchers readily.
+ void ObserveTimingReport(const aos::monotonic_clock::time_point send_time,
+ const aos::timing::Report *msg);
+
private:
typedef aos::util::ScopedPipe::PipePair PipePair;
@@ -192,6 +224,12 @@
aos::TimerHandler *start_timer_, *restart_timer_, *stop_timer_, *pipe_timer_,
*child_status_handler_;
+ // Version string from the most recent valid timing report for this
+ // application. Cleared when the application restarts.
+ std::optional<std::string> latest_timing_report_version_;
+ aos::monotonic_clock::time_point last_timing_report_ =
+ aos::monotonic_clock::min_time;
+
std::vector<std::function<void()>> on_change_;
std::unique_ptr<MemoryCGroup> memory_cgroup_;
diff --git a/aos/starter/subprocess_test.cc b/aos/starter/subprocess_test.cc
index 4aa371b..cab22b4 100644
--- a/aos/starter/subprocess_test.cc
+++ b/aos/starter/subprocess_test.cc
@@ -1,5 +1,8 @@
#include "aos/starter/subprocess.h"
+#include <signal.h>
+#include <sys/types.h>
+
#include "gtest/gtest.h"
#include "aos/events/shm_event_loop.h"
@@ -179,4 +182,47 @@
EXPECT_EQ(aos::starter::State::STOPPED, error_out.status());
}
+// Tests that Nothing Bad™ happens if the event loop outlives the Application.
+//
+// Note that this is a bit of a hope test, as there is no guarantee that we
+// will trigger a crash even if the resources tied to the event loop in the
+// aos::Application aren't properly released.
+TEST_F(SubprocessTest, ShortLivedApp) {
+ const std::string config_file =
+ ::aos::testing::ArtifactPath("aos/events/pingpong_config.json");
+
+ aos::FlatbufferDetachedBuffer<aos::Configuration> config =
+ aos::configuration::ReadConfig(config_file);
+ aos::ShmEventLoop event_loop(&config.message());
+
+ auto application =
+ std::make_unique<Application>("sleep", "sleep", &event_loop, []() {});
+ application->set_args({"10"});
+ application->Start();
+ pid_t pid = application->get_pid();
+
+ int ticks = 0;
+ aos::TimerHandler *exit_timer = event_loop.AddTimer([&event_loop, &ticks,
+ &application, pid]() {
+ ticks++;
+ if (application && application->status() == aos::starter::State::RUNNING) {
+ // Kill the application, it will autorestart.
+ kill(pid, SIGTERM);
+ application.reset();
+ }
+
+ // event loop lives for longer.
+ if (ticks >= 5) {
+ // Now we exit.
+ event_loop.Exit();
+ }
+ });
+
+ event_loop.OnRun([&event_loop, exit_timer]() {
+ exit_timer->Schedule(event_loop.monotonic_now(),
+ std::chrono::milliseconds(1000));
+ });
+
+ event_loop.Run();
+}
} // namespace aos::starter::testing
diff --git a/aos/test_init.rs b/aos/test_init.rs
new file mode 100644
index 0000000..a8380e2
--- /dev/null
+++ b/aos/test_init.rs
@@ -0,0 +1,21 @@
+use aos_init::init;
+
+autocxx::include_cpp! (
+#include "aos/testing/tmpdir.h"
+
+safety!(unsafe)
+
+generate!("aos::testing::SetTestShmBase")
+);
+
+// TODO(Brian): Should we provide a proc macro attribute that handles calling this?
+/// Initializes things for a test.
+///
+/// # Panics
+///
+/// Panics if non-test initialization has already been performed.
+pub fn test_init() {
+ init();
+ ffi::aos::testing::SetTestShmBase();
+ // TODO(Brian): Do we want any of the other stuff that `:gtest_main` has?
+}
diff --git a/aos/testdata/BUILD b/aos/testdata/BUILD
index 8b87682..076e6c9 100644
--- a/aos/testdata/BUILD
+++ b/aos/testdata/BUILD
@@ -27,7 +27,7 @@
"invalid_source_node.json",
"self_forward.json",
],
- visibility = ["//aos:__pkg__"],
+ visibility = ["//visibility:public"],
)
genrule(
diff --git a/aos/testing/BUILD b/aos/testing/BUILD
index 787fdda..02459e4 100644
--- a/aos/testing/BUILD
+++ b/aos/testing/BUILD
@@ -8,6 +8,7 @@
visibility = ["//visibility:public"],
deps = [
"//aos:init",
+ "//aos/testing:tmpdir",
"@com_github_gflags_gflags//:gflags",
"@com_github_google_glog//:glog",
"@com_google_googletest//:gtest",
@@ -100,6 +101,9 @@
hdrs = ["tmpdir.h"],
target_compatible_with = ["@platforms//os:linux"],
visibility = ["//visibility:public"],
+ deps = [
+ "//aos/ipc_lib:shm_base",
+ ],
)
cc_library(
diff --git a/aos/testing/gtest_main.cc b/aos/testing/gtest_main.cc
index 21f141f..2fb77fe 100644
--- a/aos/testing/gtest_main.cc
+++ b/aos/testing/gtest_main.cc
@@ -7,6 +7,7 @@
#include "gtest/gtest.h"
#include "aos/init.h"
+#include "aos/testing/tmpdir.h"
DEFINE_bool(print_logs, false,
"Print the log messages as they are being generated.");
@@ -14,7 +15,6 @@
"Print all log messages to FILE instead of standard output.");
namespace aos {
-void SetShmBase(const std::string_view base) __attribute__((weak));
namespace testing {
@@ -48,12 +48,7 @@
// Point shared memory away from /dev/shm if we are testing. We don't care
// about RT in this case, so if it is backed by disk, we are fine.
- if (::aos::SetShmBase) {
- const char *tmpdir_c_str = getenv("TEST_TMPDIR");
- if (tmpdir_c_str != nullptr) {
- aos::SetShmBase(tmpdir_c_str);
- }
- }
+ aos::testing::SetTestShmBase();
return RUN_ALL_TESTS();
}
diff --git a/aos/testing/test_shm.cc b/aos/testing/test_shm.cc
index 0787e5a..8dc189a 100644
--- a/aos/testing/test_shm.cc
+++ b/aos/testing/test_shm.cc
@@ -2,6 +2,8 @@
#include <sys/mman.h>
+#include "gflags/gflags.h"
+
#include "aos/logging/logging.h"
#include "aos/testing/test_logging.h"
diff --git a/aos/testing/tmpdir.cc b/aos/testing/tmpdir.cc
index 15e3c13..7287603 100644
--- a/aos/testing/tmpdir.cc
+++ b/aos/testing/tmpdir.cc
@@ -3,16 +3,24 @@
#include <cstdlib>
#include <string>
+#include "aos/ipc_lib/shm_base.h"
+
namespace aos {
namespace testing {
-std::string TestTmpDir() {
+namespace {
+std::string TestTmpDirOr(std::string fallback) {
const char *tmp_dir = std::getenv("TEST_TMPDIR");
if (tmp_dir != nullptr) {
return tmp_dir;
}
- return "/tmp";
+ return fallback;
}
+} // namespace
+
+std::string TestTmpDir() { return TestTmpDirOr("/tmp"); }
+
+void SetTestShmBase() { SetShmBase(TestTmpDirOr(FLAGS_shm_base)); }
} // namespace testing
} // namespace aos
diff --git a/aos/testing/tmpdir.h b/aos/testing/tmpdir.h
index 7e64342..8eabf86 100644
--- a/aos/testing/tmpdir.h
+++ b/aos/testing/tmpdir.h
@@ -9,6 +9,10 @@
// Returns a usable temporary directory.
std::string TestTmpDir();
+// Sets shm_base to a folder inside of TEST_TMPDIR if set, or --shm_base
+// otherwise.
+void SetTestShmBase();
+
} // namespace testing
} // namespace aos
diff --git a/aos/util/file.cc b/aos/util/file.cc
index 52657a9..973ab3c 100644
--- a/aos/util/file.cc
+++ b/aos/util/file.cc
@@ -200,11 +200,19 @@
PCHECK(file_.get() != -1) << ": opening " << filename;
}
-absl::Span<char> FileReader::ReadContents(absl::Span<char> buffer) {
+std::optional<absl::Span<char>> FileReader::ReadContents(
+ absl::Span<char> buffer) {
PCHECK(0 == lseek(file_.get(), 0, SEEK_SET));
const ssize_t result = read(file_.get(), buffer.data(), buffer.size());
+ if (result < 0) {
+ // Read timeout for an i2c request returns this.
+ if (errno == EREMOTEIO) {
+ return std::nullopt;
+ }
+ }
+
PCHECK(result >= 0);
- return {buffer.data(), static_cast<size_t>(result)};
+ return absl::Span<char>{buffer.data(), static_cast<size_t>(result)};
}
FileWriter::FileWriter(std::string_view filename, mode_t permissions)
@@ -216,21 +224,25 @@
// absl::SimpleAtoi doesn't interpret a leading 0x as hex, which we need here.
// Instead, we use the flatbuffers API, which unfortunately relies on NUL
// termination.
-int32_t FileReader::ReadInt32() {
+std::optional<int32_t> FileReader::ReadInt32() {
// Maximum characters for a 32-bit integer, +1 for the NUL.
// Hex is the same size with the leading 0x.
std::array<char, 11> buffer;
int32_t result;
- const auto string_span =
+ const std::optional<absl::Span<char>> string_span =
ReadContents(absl::Span<char>(buffer.data(), buffer.size())
.subspan(0, buffer.size() - 1));
+ if (!string_span.has_value()) {
+ return std::nullopt;
+ }
+
// Verify we found the newline.
- CHECK_EQ(buffer[string_span.size() - 1], '\n');
+ CHECK_EQ(buffer[string_span->size() - 1], '\n');
// Truncate the newline.
- buffer[string_span.size() - 1] = '\0';
+ buffer[string_span->size() - 1] = '\0';
CHECK(flatbuffers::StringToNumber(buffer.data(), &result))
<< ": Error parsing string to integer: "
- << std::string_view(string_span.data(), string_span.size());
+ << std::string_view(string_span->data(), string_span->size());
return result;
}
diff --git a/aos/util/file.h b/aos/util/file.h
index da5b29f..e825e4c 100644
--- a/aos/util/file.h
+++ b/aos/util/file.h
@@ -64,18 +64,22 @@
public:
FileReader(std::string_view filename);
// Reads the entire contents of the file into the internal buffer and returns
- // a string_view of it.
- // Note: The result may not be null-terminated.
- absl::Span<char> ReadContents(absl::Span<char> buffer);
+ // a string_view of it. Returns nullopt if we failed to read the contents
+ // (currently only on EREMOTEIO). Note: The result may not be null-terminated.
+ std::optional<absl::Span<char>> ReadContents(absl::Span<char> buffer);
// Returns the value of the file as a string, for a fixed-length file.
// Returns nullopt if the result is smaller than kSize. Ignores any
// bytes beyond kSize.
template <int kSize>
std::optional<std::array<char, kSize>> ReadString() {
std::array<char, kSize> result;
- const absl::Span<char> used_span =
+ const std::optional<absl::Span<char>> used_span =
ReadContents(absl::Span<char>(result.data(), result.size()));
- if (used_span.size() == kSize) {
+ if (!used_span.has_value()) {
+ return std::nullopt;
+ }
+
+ if (used_span->size() == kSize) {
return result;
} else {
return std::nullopt;
@@ -83,8 +87,8 @@
}
// Returns the value of the file as an integer. Crashes if it doesn't fit in a
// 32-bit integer. The value may start with 0x for a hex value, otherwise it
- // must be base 10.
- int32_t ReadInt32();
+ // must be base 10. Returns nullopt if we failed to read the file.
+ std::optional<int32_t> ReadInt32();
private:
aos::ScopedFD file_;
diff --git a/aos/util/file_test.cc b/aos/util/file_test.cc
index ec4bfe4..905a2cd 100644
--- a/aos/util/file_test.cc
+++ b/aos/util/file_test.cc
@@ -91,10 +91,10 @@
aos::ScopedRealtime realtime;
{
std::array<char, 20> contents;
- absl::Span<char> read_result =
+ std::optional<absl::Span<char>> read_result =
reader.ReadContents({contents.data(), contents.size()});
EXPECT_EQ("123456789\n",
- std::string_view(read_result.data(), read_result.size()));
+ std::string_view(read_result->data(), read_result->size()));
}
{
std::optional<std::array<char, 10>> read_result = reader.ReadString<10>();
diff --git a/aos/util/scoped_pipe.cc b/aos/util/scoped_pipe.cc
index e2e0ca5..d64fad1 100644
--- a/aos/util/scoped_pipe.cc
+++ b/aos/util/scoped_pipe.cc
@@ -55,8 +55,8 @@
read(fd(), buffer->data() + buffer->size() - kBufferSize, kBufferSize);
if (result == -1) {
if (errno == EAGAIN || errno == EWOULDBLOCK) {
- buffer->resize(original_size);
- return 0;
+ buffer->resize(original_size + read_bytes);
+ return read_bytes;
}
PLOG(FATAL) << "Error on reading pipe.";
} else if (result < kBufferSize) {
diff --git a/aos/util/scoped_pipe_test.cc b/aos/util/scoped_pipe_test.cc
index c71e272..0e024c2 100644
--- a/aos/util/scoped_pipe_test.cc
+++ b/aos/util/scoped_pipe_test.cc
@@ -44,6 +44,21 @@
}
}
+// Tests using string read/write methods on the ScopedPipe objects.
+TEST(ScopedPipeTest, StringPipe2048) {
+ ScopedPipe::PipePair pipe = ScopedPipe::MakePipe();
+ std::string buffer;
+ ASSERT_EQ(0u, pipe.read->Read(&buffer))
+ << "Shouldn't get anything on empty read.";
+ ASSERT_TRUE(buffer.empty());
+
+ std::string a(2048, 'a');
+ pipe.write->Write(absl::Span<const uint8_t>(
+ reinterpret_cast<const uint8_t *>(a.data()), a.size()));
+ ASSERT_EQ(2048u, pipe.read->Read(&buffer));
+ ASSERT_EQ(a, buffer);
+}
+
// Tests that calling SetCloexec succeeds and does indeed set FD_CLOEXEC.
TEST(ScopedPipeTest, SetCloexec) {
ScopedPipe::PipePair pipe = ScopedPipe::MakePipe();
diff --git a/debian/BUILD b/debian/BUILD
index e2a8541..d8f1753 100644
--- a/debian/BUILD
+++ b/debian/BUILD
@@ -112,6 +112,27 @@
)
download_packages(
+ name = "download_rsync_packages",
+ excludes = [
+ ],
+ packages = [
+ "rsync",
+ ],
+)
+
+download_packages(
+ name = "download_ssh_packages",
+ excludes = [
+ "libcbor0.8",
+ "libsemanage2",
+ "libsepol2",
+ ],
+ packages = [
+ "ssh",
+ ],
+)
+
+download_packages(
name = "download_gtk_runtime",
excludes = [
"libstdc++6",
@@ -425,6 +446,7 @@
exports_files([
"ssh_wrapper.sh",
+ "rsync_wrapper.sh",
"curl.BUILD",
"BUILD.zlib.bazel",
])
diff --git a/debian/packages.bzl b/debian/packages.bzl
index e6e34a6..babc0c0 100644
--- a/debian/packages.bzl
+++ b/debian/packages.bzl
@@ -24,7 +24,7 @@
# 6. Add a new "new_http_archive" entry to the WORKSPACE file for the tarball
# you just uploaded.
-def download_packages(name, packages, excludes = [], force_includes = [], force_excludes = [], target_compatible_with = None):
+def download_packages(name, packages, excludes = [], force_includes = [], force_excludes = [], target_compatible_with = None, release = "bullseye"):
"""Downloads a set of packages as well as their dependencies.
You can also specify excludes in case some of the dependencies are meta
@@ -65,8 +65,8 @@
# --- end runfiles.bash initialization v2 ---
-exec "$$(rlocation org_frc971/debian/download_packages)" %s %s %s %s "$$@"
-END""" % (force_includes, force_excludes, excludes_list, package_list),
+exec "$$(rlocation org_frc971/debian/download_packages)" %s %s %s %s --release=%s "$$@"
+END""" % (force_includes, force_excludes, excludes_list, package_list, release),
target_compatible_with = target_compatible_with,
)
native.sh_binary(
diff --git a/debian/rsync.BUILD b/debian/rsync.BUILD
index 4fe45b1..d761ace 100644
--- a/debian/rsync.BUILD
+++ b/debian/rsync.BUILD
@@ -1,5 +1,26 @@
-filegroup(
+genrule(
+ name = "copy_rsync_wrapper",
+ srcs = ["@//debian:rsync_wrapper.sh"],
+ outs = ["rsync_wrapper.sh"],
+ cmd = "cp $< $@",
+)
+
+sh_binary(
name = "rsync",
- srcs = ["usr/bin/rsync"],
+ srcs = [
+ "rsync_wrapper.sh",
+ ],
+ data = [
+ "usr/bin/rsync",
+ ":libs",
+ "@bazel_tools//tools/bash/runfiles",
+ ],
visibility = ["//visibility:public"],
)
+
+filegroup(
+ name = "libs",
+ srcs = glob([
+ "usr/lib/x86_64-linux-gnu/**",
+ ]),
+)
diff --git a/debian/rsync.bzl b/debian/rsync.bzl
index 6be610d..768a98c 100644
--- a/debian/rsync.bzl
+++ b/debian/rsync.bzl
@@ -1,3 +1,9 @@
files = {
- "rsync_3.1.2-1+deb9u2_amd64.deb": "f2987623a6e5b5aedf56e679bedd2e6f7d54cdb4815ac4149c4d135da16ff9c2",
+ "init-system-helpers_1.60_all.deb": "43420922c5e3aa747f8854236bf381a35179bba3885b242edb104751dad20644",
+ "liblz4-1_1.9.3-2_amd64.deb": "79ac6e9ca19c483f2e8effcc3401d723dd9dbb3a4ae324714de802adb21a8117",
+ "libpopt0_1.18-2_amd64.deb": "2f9fca8afbc5a18211ad46fa5f5df323662ba4d5212fa36bfc30dd551ae86b28",
+ "libssl1.1_1.1.1n-0+deb11u5_amd64.deb": "08be73a6a5454a8978c5a71ea5ca4b3a6909ce6cc927890729ebd6f9af12d9d8",
+ "libxxhash0_0.8.0-2_amd64.deb": "3fb82550a71d27d05672472508548576dfb34486847bc860d3066cda5aaf186f",
+ "lsb-base_11.1.0_all.deb": "89ed6332074d827a65305f9a51e591dff20641d61ff5e11f4e1822a9987e96fe",
+ "rsync_3.2.3-4+deb11u1_amd64.deb": "2ba613ac761266dcfc8518b06c257c790c4d02ea311163829871dc38fbf08ba0",
}
diff --git a/debian/rsync_wrapper.sh b/debian/rsync_wrapper.sh
new file mode 100755
index 0000000..9a8d4ba
--- /dev/null
+++ b/debian/rsync_wrapper.sh
@@ -0,0 +1,21 @@
+#!/bin/bash
+
+# --- begin runfiles.bash initialization v2 ---
+# Copy-pasted from the Bazel Bash runfiles library v2.
+set -uo pipefail; f=bazel_tools/tools/bash/runfiles/runfiles.bash
+source "${RUNFILES_DIR:-/dev/null}/$f" 2>/dev/null || \
+ source "$(grep -sm1 "^$f " "${RUNFILES_MANIFEST_FILE:-/dev/null}" | cut -f2- -d' ')" 2>/dev/null || \
+ source "$0.runfiles/$f" 2>/dev/null || \
+ source "$(grep -sm1 "^$f " "$0.runfiles_manifest" | cut -f2- -d' ')" 2>/dev/null || \
+ source "$(grep -sm1 "^$f " "$0.exe.runfiles_manifest" | cut -f2- -d' ')" 2>/dev/null || \
+ { echo>&2 "ERROR: cannot find $f"; exit 1; }; f=; set -e
+# --- end runfiles.bash initialization v2 ---
+
+LIB_PATH="$(rlocation rsync/usr/lib/x86_64-linux-gnu/libcrypto.so.1.1)"
+LIB_PATH="${LIB_PATH%/libcrypto.so.1.1}"
+export LD_LIBRARY_PATH="${LIB_PATH}"
+
+TOOL_PATH="$(rlocation rsync/usr/bin/rsync)"
+
+exec "${TOOL_PATH}" "$@"
diff --git a/debian/ssh.bzl b/debian/ssh.bzl
index 77b913c..9150540 100644
--- a/debian/ssh.bzl
+++ b/debian/ssh.bzl
@@ -1,4 +1,42 @@
files = {
- "openssh-client_7.9p1-10+deb10u1_amd64.deb": "1c30bcaf37dafe198783cf691096fe557e8eacbc9435631f51af62b3f705ee12",
- "libssl1.1_1.1.1d-0+deb10u2_amd64.deb": "31c15130e0e4b2c907ef7cd92e50be23320a22c0c3b54e130b5258fe6bd8df2d",
+ "adduser_3.118_all.deb": "bd71dd1ab8dcd6005390708f23741d07f1913877affb7604dfd55f85d009aa2b",
+ "cdebconf_0.260_amd64.deb": "f54b3308b019b3d4246f5ae114d7d8a489e26dcdc80db85fc8eec089fc31acb6",
+ "coreutils_8.32-4+b1_amd64.deb": "3558a412ab51eee4b60641327cb145bb91415f127769823b68f9335585b308d4",
+ "init-system-helpers_1.60_all.deb": "43420922c5e3aa747f8854236bf381a35179bba3885b242edb104751dad20644",
+ "libattr1_2.4.48-6_amd64.deb": "af3c3562eb2802481a2b9558df1b389f3c6d9b1bf3b4219e000e05131372ebaf",
+ "libbsd0_0.11.3-1_amd64.deb": "284a7b8dcfcad74770f57360721365317448b38ab773db542bf630e94e60c13e",
+ "libdb5.3_5.3.28+dfsg1-0.8_amd64.deb": "00b9e63e287f45300d4a4f59b6b88e25918443c932ae3e5845d5761ae193c530",
+ "libdebian-installer4_0.121_amd64.deb": "b97d270ea3588bcff281b2595ff89752e0f829e54c8c7a4cc9a46492692cd0fd",
+ "libedit2_3.1-20191231-2+b1_amd64.deb": "ac545f6ad10ba791aca24b09255ad1d6d943e6bc7c5511d5998e104aee51c943",
+ "libfido2-1_1.6.0-2_amd64.deb": "e9efa10b4e45e8c941883b4bea29bc68dc5a298b8619bd786a7ca1675ca0c197",
+ "libgcrypt20_1.8.7-6_amd64.deb": "7a2e0eef8e0c37f03f3a5fcf7102a2e3dc70ba987f696ab71949f9abf36f35ef",
+ "libgmp10_6.2.1+dfsg-1+deb11u1_amd64.deb": "fc117ccb084a98d25021f7e01e4dfedd414fa2118fdd1e27d2d801d7248aebbc",
+ "libgpg-error0_1.38-2_amd64.deb": "16a507fb20cc58b5a524a0dc254a9cb1df02e1ce758a2d8abde0bc4a3c9b7c26",
+ "liblz4-1_1.9.3-2_amd64.deb": "79ac6e9ca19c483f2e8effcc3401d723dd9dbb3a4ae324714de802adb21a8117",
+ "libmd0_1.0.3-3_amd64.deb": "9e425b3c128b69126d95e61998e1b5ef74e862dd1fc953d91eebcc315aea62ea",
+ "libncurses6_6.2+20201114-2+deb11u1_amd64.deb": "030173bda939906c849a022823b8b90a1984ccc249a8ee9b54ea1f08acc24e9e",
+ "libncursesw6_6.2+20201114-2+deb11u1_amd64.deb": "4a53efece402caaa798649f74bfb63fe21ac2fbcacf302554e46fbd17e5b30fc",
+ "libnewt0.52_0.52.21-4+b3_amd64.deb": "5a59623010dd29b006335c1bb989303031f1148080b90e76e857cd6298aa546e",
+ "libpam-modules-bin_1.4.0-9+deb11u1_amd64.deb": "abbbd181329c236676222d3e912df13f8d1d90a117559edd997d90006369e5c8",
+ "libpam-modules_1.4.0-9+deb11u1_amd64.deb": "ca1e121700bf4b3eb33e30e0774d3e63e1adae9d4b6a940ea3501225db3cc287",
+ "libpam-runtime_1.4.0-9+deb11u1_all.deb": "d98a68a56386d0992446417a4ee9fa685ebe841e81303a7d4f45cdd4c133c3f6",
+ "libpam0g_1.4.0-9+deb11u1_amd64.deb": "496771218fb585bb716fdae6ef8824dbfb5d544b4fa2f3cd4d0e4d7158ae2220",
+ "libprocps8_3.3.17-5_amd64.deb": "0a60017f0229cd4eec95b9f354c68312cc4ca4770ba8c01f545fd9c02b34e8a0",
+ "libslang2_2.3.2-5_amd64.deb": "107ad70aba3dc4dab2bc0fe11c3dd7c2afe9549dd45f4da3f4cf9d360e171eba",
+ "libssl1.1_1.1.1n-0+deb11u5_amd64.deb": "08be73a6a5454a8978c5a71ea5ca4b3a6909ce6cc927890729ebd6f9af12d9d8",
+ "libsystemd0_247.3-7+deb11u4_amd64.deb": "e6f3e65e388196a399c1a36564c38ad987337350358732056227db1b6e708878",
+ "libtextwrap1_0.1-14.2_amd64.deb": "6626eee49a3ad10c596955f1180bee6c937f5e9ea1404085516a29010ab8bd23",
+ "libtinfo6_6.2+20201114-2+deb11u1_amd64.deb": "92ac8a8c12f02a6fe08c47f33ea6fb313e1b9480484e26eab34e2058ea59fdb4",
+ "libudev1_247.3-7+deb11u4_amd64.deb": "9274ca1aa37fcdf5895dad1de0895162351099ef8dff8a62f2f4c9eb181a8fce",
+ "libwrap0_7.6.q-31_amd64.deb": "c6aa9c653857d807cff31682b5158722e8b16eeb3cec443d34d6eba52312e701",
+ "lsb-base_11.1.0_all.deb": "89ed6332074d827a65305f9a51e591dff20641d61ff5e11f4e1822a9987e96fe",
+ "openssh-client_8.4p1-5+deb11u1_amd64.deb": "5b908fa946a425c6f2b4e4b234f4e42ae61c35c6655dc06b9746145e09b6cb2f",
+ "openssh-server_8.4p1-5+deb11u1_amd64.deb": "7118fe69fee753a08480c97d9d873dfca7c7a6972366515ae26d8428301a0dea",
+ "openssh-sftp-server_8.4p1-5+deb11u1_amd64.deb": "b77715c95ef748aad458d1291df307bbe4ed2e6099d02f2c47dae961bd175cb7",
+ "passwd_4.8.1-1_amd64.deb": "542593f26502e87b4276fa778e6e3ae52e66b973979986fff77803d9fcb2c2e8",
+ "procps_3.3.17-5_amd64.deb": "ac8edf0517abe09637c36651cb6a59e10948b2879f3af9003b9145b2128a7a08",
+ "runit-helper_2.10.3_all.deb": "fb8add1955628b2ad896318553ac1e3bc7cfa2a7058e9c9dbaa23baa21fc53a7",
+ "sensible-utils_0.0.14_all.deb": "b9a447dc4ec8714196b037e20a2209e62cd669f5450222952f259bda4416b71f",
+ "ssh_8.4p1-5+deb11u1_all.deb": "3ec5c5a0ea62d534bed2b339a669920d90cf4777899fb90d3a9a45f747c107eb",
+ "ucf_3.0043_all.deb": "ebef6bcd777b5c0cc2699926f2159db08433aed07c50cb321fd828b28c5e8d53",
}
diff --git a/debian/ssh_wrapper.sh b/debian/ssh_wrapper.sh
index 687ed87..3343887 100755
--- a/debian/ssh_wrapper.sh
+++ b/debian/ssh_wrapper.sh
@@ -11,8 +11,8 @@
{ echo>&2 "ERROR: cannot find $f"; exit 1; }; f=; set -e
# --- end runfiles.bash initialization v2 ---
-LIB_PATH="$(rlocation ssh/usr/lib/x86_64-linux-gnu/libcrypto.so.1.0.0)"
-LIB_PATH="${LIB_PATH%/libcrypto.so.1.0.0}"
+LIB_PATH="$(rlocation ssh/usr/lib/x86_64-linux-gnu/libcrypto.so.1.1)"
+LIB_PATH="${LIB_PATH%/libcrypto.so.1.1}"
export LD_LIBRARY_PATH="${LIB_PATH}"
TOOL_PATH="$(rlocation ssh/%(TOOL))"
diff --git a/documentation/aos/docs/reference.md b/documentation/aos/docs/reference.md
index 05099c7..917b410 100644
--- a/documentation/aos/docs/reference.md
+++ b/documentation/aos/docs/reference.md
@@ -158,7 +158,23 @@
// Pointer to the data.
const void *data;
- // buffer_index and source_boot_uuid members omitted.
+ // Index of the message buffer. This will be in [0, NumberBuffers) on
+ // read_method=PIN channels, and -1 for other channels.
+ //
+ // This only tells you about the underlying storage for this message, not
+ // anything about its position in the queue. This is only useful for advanced
+ // zero-copy use cases, on read_method=PIN channels.
+ //
+ // This will uniquely identify a message on this channel at a point in time.
+ // For senders, this point in time is while the sender has the message. With
+ // read_method==PIN, this point in time includes while the caller has access
+ // to this context. For other read_methods, this point in time may be before
+ // the caller has access to this context, which makes this pretty useless.
+ int buffer_index;
+
+ // UUID of the remote node which sent this message, or this node in the case
+ // of events which are local to this node.
+ UUID source_boot_uuid = UUID::Zero();
// Efficiently copies the flatbuffer into a FlatbufferVector, allocating
// memory in the process. It is vital that T matches the type of the
@@ -781,14 +797,18 @@
3. You need to access the most-recently-sent message that may have been sent
before your process started.
-There are two main options for fetching data with a Fetcher:
+There are four main options for fetching data with a Fetcher:
1. `Fetch()`: Allows you to fetch the latest message on a channel.
2. `FetchNext()`: Allows you to fetch the message immediately after the
previously fetched message. This lets you use a fetcher to observe
every single message on the channel.
+3. `FetchIf()` and `FetchNextIf()`: Identical to the above, but take a
+   predicate, and will only fetch if the next message causes the predicate
+   to return true (e.g., if you only want to fetch
+ messages up until a certain point in time).
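+
+As a minimal sketch (assuming a `fetcher` made with `MakeFetcher`, and that
+the predicate receives the candidate message's `Context`), using
+`FetchNextIf()` to consume only messages sent before a cutoff time might look
+like:
+
+```cpp
+// Drain all messages sent before `cutoff`, leaving newer messages queued.
+const aos::monotonic_clock::time_point cutoff = event_loop->monotonic_now();
+while (fetcher.FetchNextIf([cutoff](const aos::Context &context) {
+  return context.monotonic_event_time < cutoff;
+})) {
+  // Process *fetcher here; every message seen predates `cutoff`.
+}
+```
+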
-Both `Fetch*()` calls return true if they got a new message and false otherwise,
+All `Fetch*()` calls return true if they got a new message and false otherwise,
making
```cpp
if (fetcher_.Fetch()) {
@@ -1050,6 +1070,10 @@
schedulers, as well as for managing the malloc hook that intercepts attempted
memory allocations.
+Additionally, the ROS2 design docs website has a [reasonable
+introduction](https://design.ros2.org/articles/realtime_background.html)
+to working with realtime code.
+
### Timing Reports
Timing reports are a FlatBuffer message sent out periodically by an
@@ -1252,14 +1276,523 @@
## ShmEventLoop
-## Realtime Code
+The `ShmEventLoop` is an implementation of the `EventLoop` API that is intended
+to run in realtime on a Linux system and uses shared memory to communicate
+between processes. This is currently the only `EventLoop` implementation for
+actually running realtime processes (the `SimulatedEventLoop` being the only
+other implementation of the `EventLoop` API, and it is only meant for
+simulation).
-The ROS2 design docs website has a [reasonable
-introduction](https://design.ros2.org/articles/realtime_background.html)
-to working with realtime code.
+Additionally, there are a set of common utilities & applications that go with
+the `ShmEventLoop` to form a complete realtime AOS system:
+
+* The `message_bridge_server` and `message_bridge_client` processes use the
+ `ShmEventLoop` to read data from shared memory channels, forward it to other
+ node(s) over SCTP, and then republish the data onto the shared memory channels
+ on the other node(s).
+* The `starterd` manages the applications running on a node, and allows
+ dynamically starting, stopping, and viewing the status of applications by
+ way of AOS channels.
+* The logger (which is sometimes configured in specialized ways, but generally
+ uses the `LogWriter` class).
+* `aos_dump`, `aos_send`, `aos_starter`, `aos_timing_report_streamer`, and other
+ utilities allow engineers to debug the system at the command line by exposing
+ AOS channels in various ways (see [Command-line Utilities](#command-line-utilities)).
+
+The `ShmEventLoop` does a few key things beyond just implementing what is
+required by the `EventLoop` API:
+
+* Has `Run()` and `Exit()` calls to directly start/stop execution of the event
+ loop.
+* Does signal handling to capture `SIGINT` and automatically exit.
+* Allows controlling the scheduling of the process (realtime priority, CPU
+ affinity)---technically these are actually part of the `EventLoop` API,
+ but are meaningless for any other `EventLoop` implementations.
+* Exposes the `EPoll` implementation used to back the `EventLoop` (this
+ becomes useful when wanting to interact with sockets, files, and sometimes
+ even other event loops/schedulers).
+
+### Design of the `ShmEventLoop`
+
+The goal of the `ShmEventLoop` is to provide an `EventLoop` implementation that
+can run on most Linux systems, and which has good enough performance to support
+soft/firm real-time systems where large amounts of data (e.g., raw images) may
+need to be moved over the pub-sub channels. In order to implement the core
+`EventLoop` API, we use two things:
+
+1. An IPC (InterProcess Communication) library using a lockless shared-memory
+ queue for managing the pub-sub channels.
+2. `epoll` for managing event scheduling.
+
+For each `EventLoop` feature, this means:
+
+* `Watcher`s are implemented by having the sending event loop generate
+  signals; each `Watcher` then has an associated `signalfd` that we use
+  `epoll` to watch and wake up on when new messages arrive.
+* `Timer`s and `PhasedLoop`s are implemented using `timerfd` and `epoll` to
+  wake up when the timers expire (see the sketch after this list).
+* `Sender`s and `Fetcher`s do not have to directly interact with the event
+ scheduling (beyond generating signals on sends, in the case of `Sender`s),
+ but do use the IPC library.
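+
+To make this concrete, here is a minimal, self-contained sketch of the
+`timerfd` + `epoll` pattern that backs `Timer`s. This is plain Linux API
+usage shown for illustration; it is not AOS code:
+
+```cpp
+#include <sys/epoll.h>
+#include <sys/timerfd.h>
+#include <unistd.h>
+
+#include <cstdint>
+#include <cstdio>
+
+int main() {
+  // Create a timer that first fires after 1 second, then every 1 second.
+  const int tfd = timerfd_create(CLOCK_MONOTONIC, 0);
+  itimerspec spec{};
+  spec.it_value.tv_sec = 1;
+  spec.it_interval.tv_sec = 1;
+  timerfd_settime(tfd, 0, &spec, nullptr);
+
+  const int epfd = epoll_create1(0);
+  epoll_event ev{};
+  ev.events = EPOLLIN;
+  ev.data.fd = tfd;
+  epoll_ctl(epfd, EPOLL_CTL_ADD, tfd, &ev);
+
+  for (int i = 0; i < 3; ++i) {
+    epoll_event out;
+    epoll_wait(epfd, &out, 1, -1);  // Block until the timer expires.
+    uint64_t expirations;
+    read(tfd, &expirations, sizeof(expirations));  // Acknowledge the wakeup.
+    printf("timer fired %llu time(s)\n",
+           static_cast<unsigned long long>(expirations));
+  }
+  close(epfd);
+  close(tfd);
+  return 0;
+}
+```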
+
+Outside of the "core" `EventLoop` API, the `ShmEventLoop` is also responsible
+for setting the current process priority, pinning it to the requested CPU
+core(s), and doing things like preventing calls to `malloc` while the
+`ShmEventLoop` is [Running](#running).
+
+For additional detail on the underlying IPC design, reference the [Design
+Doc](https://docs.google.com/document/d/10xulameLtEqjBFkm54UcN-5N-w5Q_XFNILvNf1Jl1Y4/edit#heading=h.y9blqcmsacou)
+and the code at `//aos/ipc_lib:lockless_queue`.
+
+### IPC Performance Considerations
+
+This section provides a discussion of how to use the `EventLoop` API in
+situations where you may have strong performance constraints (e.g.,
+you are processing large numbers of camera images that require transferring
+large amounts of data).
+
+Some discussion in this section can theoretically apply to any `EventLoop`
+implementation---everything discussed here just uses the generic `EventLoop`
+API and does not actually require doing anything that is specific to the
+`ShmEventLoop`. However, it is useful to discuss how the actual implementation
+works to understand the performance implications of different options.
+
+At a high level, the most common issues which we have observed regarding
+performance are:
+
+1. Copying messages can be very expensive, particularly for raw camera images;
+ for this reason, the `EventLoop` API provides zero-copy constructs that
+ allow the sender to construct messages in-place in shared memory and for
+ readers to read the message directly from shared memory without copying
+ it first. There are a variety of places where the "normal" APIs will do
+   copies by default, as well as some convenience APIs that are more rarely
+   used but which perform extraneous copies.
+2. For processes which must consume large numbers of individual messages (e.g.,
+ a logger), using `Watcher`s may get expensive due to the cost of waking
+ up for every signal. Using `Fetcher`s + a polling timer can mitigate this.
+3. Be aware that querying the clocks on the `ShmEventLoop` with
+ `monotonic_now()` and `realtime_now()` calls will actually query the system
+ clocks---if you want the time of the current event, use the [Event Loop
+ Context](#event-loop-context) (note that "doing too many clock queries"
+ is not typically a major issue).
+
+#### Avoiding Copies
+
+Copying around large chunks of memory can be expensive. As such, the `EventLoop`
+API is designed to let you avoid extraneous copies when you do not want them.
+This does mean paying attention to what you are doing at each step of the
+process.
+
+*Sending*: When sending a message, you should make sure to use the
+`aos::Sender::MakeBuilder()` call, which provides a `FlatBufferBuilder` that
+constructs your message in-place in shared-memory (the `Sender` will acquire
+ownership of a single slot in the shared memory queue, and leave the message
+unpublished until you call `Send`). If you need to fill your message with
+a large blob (e.g., an image), you can use the
+[CreateUninitializedVector](https://flatbuffers.dev/classflatbuffers_1_1_flat_buffer_builder.html#a2305b63d367845972b51669dd995cc50)
+method to get a pointer to a fixed-length buffer where you can fill in your
+data. Be aware that the `aos::Sender::Send()` method which takes a
+`NonSizePrefixedFlatbuffer` will do a copy, because it takes a FlatBuffer
+which has been constructed outside of its shared memory buffer. This is distinct
+from the `aos::Sender::Builder::Send()` calls, which assume that you have built
+your flatbuffer up using the provided `Builder` and so don't need to do extra
+copies---the `aos::Sender::Builder::Send()` calls are what almost all existing
+code uses. As an aside,
+because `Sender`s must construct messages in-place, there is a configurable
+limit on the maximum number of senders per-channel. If we allowed arbitrarily
+many senders per channel, then they could consume all of the slots in the
+shared memory queue and prevent any messages from actually flowing.
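+
+As a minimal sketch of the in-place construction pattern (using the
+`SensorData` example message from later in this document; the `/input`
+channel name is a placeholder and error handling is elided):
+
+```cpp
+// Build the message directly in a shared memory slot and send it; no copy
+// of the flatbuffer happens on Send().
+aos::Sender<examples::SensorData> sender =
+    event_loop->MakeSender<examples::SensorData>("/input");
+
+aos::Sender<examples::SensorData>::Builder builder = sender.MakeBuilder();
+examples::SensorData::Builder sensor_builder =
+    builder.MakeBuilder<examples::SensorData>();
+sensor_builder.add_hardware_capture_time_ns(12345);
+builder.CheckOk(builder.Send(sensor_builder.Finish()));
+```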
+
+*Receiving*: By default, `Fetcher`s and `Watcher`s will copy a message on
+receipt. This allows arbitrarily many processes to be fetching on a
+given channel by default. However, if the `read_method` for a channel is set to
+`PIN` in the [configuration](#configurations), then each reader will acquire a
+slot in the shared memory queue (this causes there to be a limit on the maximum
+number of allowed readers). If `PIN`d, then when you read the message you are
+reading directly from shared memory. Note that there is an independent
+`num_watchers` setting in the configuration for a channel; this maximum exists
+because the shared memory queue must have a fixed-length list of processes to
+notify when a message is sent.
+
+*`NoArgWatcher`s*: Regardless of whether you want zero-copy performance or not,
+being aware of `NoArgWatcher`s is useful as they allow you to receive a
+callback when a message is received without needing to actually copy the message
+out. This is typically paired with a `Fetcher` for that message, although there
+may also be situations where you just need to trigger some work or event
+whenever a channel is sent on, without caring about the message contents. See
+[Watchers](#watchers) for more discussion.
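+
+A short sketch of the `NoArgWatcher` + `Fetcher` pairing (the channel and
+message type are placeholders):
+
+```cpp
+// Wake up on every send without copying the message out; only fetch the
+// latest message when we actually need its contents.
+aos::Fetcher<examples::SensorData> fetcher =
+    event_loop->MakeFetcher<examples::SensorData>("/input");
+event_loop->MakeNoArgWatcher<examples::SensorData>("/input", [&fetcher]() {
+  if (fetcher.Fetch()) {
+    LOG(INFO) << "Capture time: " << fetcher->hardware_capture_time_ns();
+  }
+});
+```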
+
+#### High-rate Messages
+
+When dealing with high-rate (usually starting at ~1 kHz) messages, the context
+switches associated with using a watcher to wake up your process on every single
+message can become non-trivial. In applications where you really do want to wake
+up for every single message, this may still be appropriate, but if you instead
+just need to ensure that you are handling every message in a reasonably timely
+manner, it can be much cheaper to set up a periodic timer at a lower frequency
+(e.g., the logger generally polls at ~10 Hz, but even just getting below a
+kilohertz is typically good). In order to ensure that you read every message,
+you can then use a [Fetcher](#fetchers) with `FetchNext()` to iterate over
+all the messages received in the last polling period.
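+
+A hedged sketch of that poll-and-drain pattern (`HandleSensorData` and the
+channel are placeholders; the 100 ms period is illustrative):
+
+```cpp
+// Drain every queued message at ~10 Hz instead of waking once per message.
+aos::Fetcher<examples::SensorData> fetcher =
+    event_loop->MakeFetcher<examples::SensorData>("/input");
+aos::TimerHandler *poll_timer = event_loop->AddTimer([&fetcher]() {
+  // FetchNext() returns true until every queued message has been consumed.
+  while (fetcher.FetchNext()) {
+    HandleSensorData(*fetcher);
+  }
+});
+event_loop->OnRun([event_loop, poll_timer]() {
+  poll_timer->Schedule(event_loop->monotonic_now(),
+                       std::chrono::milliseconds(100));
+});
+```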
+
+### Typical `ShmEventLoop` usage
+
+There is a good sample `main()` in the [Getting Started](/getting_started/#writing-the-ping-main);
+the key pattern that is typically followed is:
+
+```cpp
+// We pull in a config that is stored somewhere on disk (as pointed to by
+// a --config flag).
+aos::FlatbufferDetachedBuffer<aos::Configuration> config =
+ aos::configuration::ReadConfig(FLAGS_config);
+
+aos::ShmEventLoop event_loop(&config.message());
+
+// The application being run is provided with an EventLoop*, which enables it to
+// access the EventLoop API without knowing whether it is on a realtime system
+// or just running in simulation.
+aos::Ping ping(&event_loop);
+
+// Actually run the EventLoop. This will block until event_loop.Exit() is
+// called or we receive a signal to exit (e.g., a Ctrl-C on the command line).
+event_loop.Run();
+```
+
+### Running (and Exiting) the `ShmEventLoop`
+
+In the vast majority of cases, your code will simply call `Run()` on the
+`ShmEventLoop` and not have to think about it any more. However, there are
+situations where you need to worry about more complexity.
+
+A few things to be aware of:
+
+* When the `ShmEventLoop` is [Running](#running), the process priority will be
+  elevated, and if the priority is real-time the process will die on any
+  attempted `malloc`. I.e., "Running" == Real-time.
+* If you wish to programmatically exit your process (e.g., it is a one-shot
+ process that just does some work and exits), use the `ExitHandle` API.
+ This way you can use `ShmEventLoop::MakeExitHandle` to provide your
+ application with a way to exit the process at runtime, and then use a
+ different `ExitHandle` implementation for testing.
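+
+A minimal sketch of that pattern, assuming a hypothetical one-shot
+application class that takes an `ExitHandle`:
+
+```cpp
+// A one-shot application that exits the event loop when its work is done.
+class OneShotWorker {
+ public:
+  OneShotWorker(aos::EventLoop *event_loop,
+                std::unique_ptr<aos::ExitHandle> exit_handle)
+      : exit_handle_(std::move(exit_handle)) {
+    event_loop->OnRun([this]() {
+      DoWork();  // Placeholder for the actual work.
+      exit_handle_->Exit();
+    });
+  }
+
+ private:
+  void DoWork() {}
+
+  std::unique_ptr<aos::ExitHandle> exit_handle_;
+};
+
+// At runtime (in tests, a different ExitHandle implementation can be
+// injected instead):
+aos::ShmEventLoop event_loop(&config.message());
+OneShotWorker worker(&event_loop, event_loop.MakeExitHandle());
+event_loop.Run();  // Returns once Exit() has been called.
+```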
+
+### `EPoll` interface
+
+As previously mentioned, the `ShmEventLoop` uses `epoll` under the hood to
+trigger wakeups on new events. Typically, you do not need to care about this;
+however, the `EPoll` class is exposed from the `ShmEventLoop` in order to enable
+you to interact with the system outside of the standard `EventLoop` API while
+still having your callbacks and events occur within the main thread of the
+`EventLoop`. In order to enable testing & simulation, most applications that do
+this will take an abstract `EventLoop*` and an `EPoll*` in their constructor;
+however, for cases where the application truly only makes sense to run against
+a `ShmEventLoop`, it may take a `ShmEventLoop` directly. By breaking out of
+the normal `EventLoop` API, your application will become harder to simulate,
+replay, etc. However, this is useful in some situations, e.g.:
+
+* An application that must provide a web server of some sort to external
+ applications, such as for a live debugging webpage.
+* Interacting with a CAN bus, UDP/TCP socket, etc. in order to interface with
+ and control some external system.
+* Building a camera driver that must read/send out camera images when they
+ become available.
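+
+For example, a hedged sketch of servicing an external socket from the event
+loop thread via the exposed `EPoll` object (`OpenExternalSocket()` is a
+hypothetical helper):
+
+```cpp
+aos::ShmEventLoop event_loop(&config.message());
+const int socket_fd = OpenExternalSocket();
+// Run a callback on the event loop thread whenever socket_fd is readable.
+event_loop.epoll()->OnReadable(socket_fd, [socket_fd]() {
+  char buffer[1024];
+  const ssize_t bytes = read(socket_fd, buffer, sizeof(buffer));
+  if (bytes > 0) {
+    // Handle the external data alongside the normal EventLoop callbacks.
+  }
+});
+event_loop.Run();
+event_loop.epoll()->DeleteFd(socket_fd);
+```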
+
+## Multi-Node
+
+AOS is built to support distributed systems where applications are running on
+separate devices which communicate with one another. The API is designed on the
+theory that typical applications should not need to care about which nodes
+a channel they send or listen on is forwarded to or from. However,
+it is expected that some applications will care and that you will need to care
+about the data flow for the system as a whole.
+
+There are two key things that change when you begin working with a multi-node
+AOS system as compared to a single-node system (or when working within a single
+node on a multi-node system):
+
+1. Clocks & time may not be fully synchronized across nodes. By default, AOS
+ also allows individual nodes on a system to reboot, complicating clock
+ management further.
+2. Message delivery between nodes can be unreliable, and will generally have
+ significantly worse latency than message delivery within a node. Note that
+   there are options to, e.g., allow for reliable message delivery, but any
+ network-based messaging will be constrained by the networking setup of your
+ system.
+
+See [Configuring Multi-Node Systems](#configuring-multi-node-systems) for
+information on how to set up an AOS configuration to be multi-node.
+
+Note: Currently, while "single-node" is technically a distinct mode from
+"multi-node" in AOS configurations, it is possible that we will deprecate the
+single-node mode in the future in favor of simply requiring that any
+single-node systems be multi-node systems with one node.
+
+### Message Forwarding & Reliability
+
+Any given channel can be configured to be forwarded between nodes. That channel
+may only be sent on from one node (the `source_node`), and can be forwarded to
+arbitrarily many other nodes. Each individual forwarding connection can be
+configured separately, such that forwarding channel `/foo` from node A to node B
+may be at a higher priority or reliability than from node A to node C. For each
+forwarding connection, you may configure:
+
+* Where & how to log timestamp information associated with the forwarding.
+* The SCTP priority with which to forward the message.
+* Whether to guarantee delivery of the message, akin to TCP ("reliable forwarding").
+* The time to live to set for messages with unreliable forwarding.
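+
+As a rough sketch (the node and channel names are illustrative; note that in
+the AOS configuration schema a `time_to_live` of 0 denotes reliable
+forwarding, and is otherwise a duration in nanoseconds), a forwarded channel
+might be configured like:
+
+```json
+{
+  "name": "/input",
+  "type": "examples.SensorData",
+  "source_node": "sensor",
+  "destination_nodes": [
+    {
+      "name": "compute",
+      "priority": 1,
+      "time_to_live": 5000000,
+      "timestamp_logger": "NOT_LOGGED"
+    }
+  ]
+}
+```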
+
+As alluded to, message forwarding is currently assumed to be implemented by
+[SCTP](https://en.wikipedia.org/wiki/Stream_Control_Transmission_Protocol) at
+runtime. In simulation, forwarding is entirely deterministic and reliable
+unless the user configures the `SimulatedNetworkBridge` otherwise. Should
+any additional `EventLoop` implementations be added in the future that do
+_not_ use SCTP, they would be expected to provide similar options for
+configuration, although one could reasonably create an implementation that did
+not implement all the possible configurability. For that matter, some SCTP
+implementations may not implement the priority option.
+
+Ordering is guaranteed within a channel---an observer on a channel will never
+observe a newer message before an older message. Ordering between channels is
+not guaranteed. Consider, for instance, two channels:
+
+* Channel `/big_and_slow` sends infrequent but large messages. At t=0sec it
+ sends a 100MB message which takes ~1 sec to transfer across the network.
+* Channel `/small_and_fast` sends frequent but small and high priority messages.
+ At t=0.01sec it sends a 100 byte message which crosses the network in <1ms.
+
+Depending on the SCTP implementation, it is entirely possible that the
+`/small_and_fast` message will arrive at the destination node before the
+`/big_and_slow` message. This is desired behavior (we wouldn't want a
+low-priority channel to be able to block the delivery of high-priority messages).
+
+Once the messages have arrived on the destination node, ordering will again be
+~guaranteed[^ordering-guarantee] across channels, but with respect to the time
+at which the message was sent in shared memory by the message bridge on
+the current node.
+
+[^ordering-guarantee]: There is actually an outstanding bug where in some
+ corner-cases, ordering is not actually guaranteed. See
+ https://github.com/frc971/971-Robot-Code/issues/29
+
+#### Forwarded Message Metadata
+
+Forwarded messages have additional metadata populated in the [Event Loop
+Context](#event-loop-context). The following fields become relevant for
+forwarded messages:
+
+```cpp
+// For a single-node configuration, these two are identical to *_event_time.
+// In a multinode configuration, these are the times that the message was
+// sent on the original node.
+monotonic_clock::time_point monotonic_remote_time;
+realtime_clock::time_point realtime_remote_time;
+
+// Index into the remote queue. Useful to determine if data was lost. In a
+// single-node configuration, this will match queue_index.
+uint32_t remote_queue_index;
+
+// UUID of the remote node which sent this message, or this node in the case
+// of events which are local to this node.
+UUID source_boot_uuid = UUID::Zero();
+```
+
+The remote times tell you the time at which the message was sent on the original
+node, and represent the clocks _on that node_. As such, in order to meaningfully
+compare the remote times to local times (e.g., if you want to figure out how
+long ago the message was actually sent) you must either (a) trust the realtime
+clocks; or (b) [compensate for the monotonic clock
+offsets](#correlating-monotonic-clocks-across-nodes). If you want the "event"
+time for when your current watcher or the like got triggered, use the regular
+`monotonic_event_time`. The only real use of the `remote_queue_index` would be
+to detect when messages were dropped over the network.
+
+The `source_boot_uuid` can be used to determine if the source node rebooted
+between observing messages. Arbitrarily many messages may have been dropped by
+the forwarding during a reboot, as we cannot guarantee that every single
+message sent by a node while it is rebooting gets forwarded.
+
+#### Reliable vs. Unreliable Forwarding
+
+"Unreliable" forwarding is essentially what it sounds like---messages will be
+forwarded; if they do not make it within the `time_to_live`, then they will not
+get delivered to the receiving node. Unreliable messages that were sent
+before the recipient node connected to the sending node are simply dropped.
+Unreliable forwarding is generally the default state
+for most channels.
+
+"Reliable" forwarding, on the other hand, carries two primary guarantees beyond
+the normal unreliable messages:
+
+* As long as two nodes are connected, all messages on reliable channels will be
+ forwarded, regardless of how much time must be spent retrying (this generally
+ makes reliable forwarding a poor choice for high-rate, latency-critical
+ messages).
+* When two nodes connect, the most recently sent message (if any) on each
+ reliable channel will get forwarded to the receiving node. This makes it so
+ that, e.g., if you have a central node that sends out a configuration message
+ once on startup then you can make it so that whenever the edge nodes connect
+ they will get the configuration message forwarded to them, even if they were
+ not online when the configuration message was originally published.
+
+Note that if the message bridges on two nodes disconnect and then reconnect in
+the future (without a node reboot occurring), then the semantics of reliable
+messages are similar to what happens on boot. Namely:
+
+* If no reliable messages were sent during the disconnect, then nothing happens.
+* If 1 or more reliable messages were sent during the disconnect, then the
+ latest message will get forwarded to the destination node.
+
+#### Message Bridge
+
+The applications that manage the forwarding of messages between nodes are
+collectively known as the "message bridge." In simulation, this is managed by
+the `SimulatedMessageBridge`, which at its core is a bunch of
+`std::deque`'s that are used to introduce simulated network latency (and/or
+unreliability) to the simulation. At runtime, the `message_bridge_client` and
+`message_bridge_server` handle forwarding messages from shared memory channels
+to SCTP. The server is responsible for listening to messages sent on the
+`source_node` and forwarding them to the network, while the client is
+responsible for subscribing to the server, receiving messages, and republishing
+them onto the shared memory channels locally.
+
+The message bridge processes themselves are "just" regular AOS processes. They
+use a `ShmEventLoop` to interface with AOS, and do need to break out of the
+normal abstractions to interact with SCTP. The main ways in which they break
+out of the abstractions, and which most users should be aware of, are:
+
+* The `message_bridge_client` uses `RawSender`s to allow itself to send on
+ channels which other applications cannot (since you shouldn't be able to send
+ on channels where you are not the `source_node`). Additionally, when sending
+ these messages, the client manually populates the various remote message
+ metadata.
+* The `message_bridge_server` sends special `RemoteMessage` messages on the
+ remote timestamp channels (the sending of these does not actually require
+ anything special, but the logger treats these messages specially).
+
+Typically, the operation of message bridge should be reasonably transparent to
+most users; however, there are times when it is useful to watch the message
+bridge status messages. The status messages contain a variety of information,
+but the main pieces of information that you are likely to care about are:
+
+* The connection state. Unless the state for a given node is `CONNECTED`,
+ messages will not be flowing. Note that states for the message bridge client
+ and server are tracked separately, so it is possible that you may be connected
+ to a node such that you are receiving messages from it but not successfully
+ sending to it, or vice-versa.
+* The current estimates of the monotonic clock offsets between the nodes. See
+ [Correlating Monotonic Clocks Across
+  Nodes](#correlating-monotonic-clocks-across-nodes).
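+
+For example, a hedged sketch of checking whether the connection to a node
+named "sensor" is currently `CONNECTED` (using the same
+`aos.message_bridge.ServerStatistics` message as the longer example below):
+
+```cpp
+aos::Fetcher<aos::message_bridge::ServerStatistics> status_fetcher =
+    event_loop->MakeFetcher<aos::message_bridge::ServerStatistics>("/aos");
+status_fetcher.Fetch();
+if (status_fetcher.get() != nullptr) {
+  for (const auto connection : *status_fetcher->connections()) {
+    if (connection->has_node() && connection->node()->has_name() &&
+        connection->node()->name()->string_view() == "sensor") {
+      // State is the connection-state enum from the message bridge
+      // flatbuffers.
+      if (connection->state() == aos::message_bridge::State::CONNECTED) {
+        LOG(INFO) << "Messages to the sensor node are flowing.";
+      }
+    }
+  }
+}
+```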
+
+### Cross-node Clocks
+
+Dealing with time & clocks across nodes can be tricky. While we do require that
+the monotonic clock on each device (generally corresponding to `CLOCK_MONOTONIC`
+on the system, and virtually always a time since boot) be monotonic, the
+realtime clock (i.e., the clock providing the time since the Unix Epoch) may not
+be monotonic, or even if it is monotonic, it may jump or speed up/slow down
+significantly. If you have NTP (or some similar protocol) set up on your system,
+then you may be able to control the behavior of the clocks more tightly than we
+guarantee, but AOS itself does not provide any guarantees around the realtime
+clock.
+
+Additionally, the log reading code currently makes assumptions about how quickly
+the monotonic clocks on different nodes can drift apart from one another. This
+is currently a [hard-coded value of 1ms /
+sec](https://github.com/frc971/971-Robot-Code/blob/790cb54590e4f28f61e2f1bcd2e6e12ca47d7713/aos/network/timestamp_filter.h#L21-L26)
+(i.e., we assume that over one second of time on one node's clock, the other
+node's clock will have advanced by somewhere between 999 and 1001 milliseconds).
+This number could plausibly be changed, but we have not yet encountered clocks
+actually drifting by enough to require that.
+
+#### Correlating Monotonic Clocks Across Nodes
+
+When dealing with time across nodes, we rely on the ongoing flow of messages to
+try and estimate the offsets between different clocks. The results of this
+estimation are published in the `aos.message_bridge.ServerStatistics` and
+`aos.message_bridge.ClientStatistics` messages (note that the offsets provided
+in these messages may be slightly different, as they are estimated independently).
+These estimates should be reasonable, but are currently not well validated in
+all possible corner cases. If you discover a situation where they are too
+unreliable for your use-case, that would be something we would want to fix.
+
+As an example, consider a situation where you are receiving sensor data from a
+node named `sensor`. This sensor data is on the channel `/input` with a type of
+`SensorData`. You wish to determine how old the sensor data is, but do not have
+accurate realtime clocks set up on your machine. As such, you would have
+something like:
+
+```cpp
+class SensorAgeReader {
+ public:
+ SensorAgeReader(aos::EventLoop *event_loop)
+ : event_loop_(event_loop),
+ clock_offset_fetcher_(
+ event_loop->MakeFetcher<aos::message_bridge::ServerStatistics>(
+ "/aos")) {
+ event_loop_->MakeWatcher(
+ "/input", [this](const SensorData &msg) { HandleSensorData(msg); });
+ }
+
+ void HandleSensorData(const SensorData &msg) {
+ std::chrono::nanoseconds monotonic_offset{0};
+ clock_offset_fetcher_.Fetch();
+ if (clock_offset_fetcher_.get() != nullptr) {
+ for (const auto connection : *clock_offset_fetcher_->connections()) {
+ if (connection->has_node() && connection->node()->has_name() &&
+ connection->node()->name()->string_view() == "sensor") {
+ if (connection->has_monotonic_offset()) {
+ monotonic_offset =
+ std::chrono::nanoseconds(connection->monotonic_offset());
+ } else {
+ // If we don't have a monotonic offset, that means we aren't
+ // connected, in which case we should just exit early.
+ // The ServerStatistics message will always populate statuses for
+ // every node, so we don't have to worry about missing the "sensor"
+ // node (although it can be good practice to check that the node you
+ // are looking for actually exists, to protect against programming
+ // errors).
+ LOG(WARNING) << "Message bridge disconnected.";
+ return;
+ }
+ break;
+ }
+ }
+ } else {
+ LOG(WARNING) << "No message bridge status available.";
+ return;
+ }
+ const aos::monotonic_clock::time_point now = event_loop_->monotonic_now();
+ // The monotonic_remote_time will be the time that the message was sent on
+ // the source node; by offsetting it by the monotonic_offset, we should get
+ // a reasonable estimate of when it was sent. This does not account for any
+ // delays between the sensor reading and when it actually got sent.
+ const aos::monotonic_clock::time_point send_time(
+ event_loop_->context().monotonic_remote_time - monotonic_offset);
+ // Many sensors may include some sort of hardware timestamp indicating when
+ // the measurement was taken, which is likely before the sent time. This can
+ // be populated as a data field inside of the message, and if it is using
+ // the same monotonic clock as AOS is then we can do the same offset
+ // computation, but get a timestamp for when the data was actually captured.
+ const aos::monotonic_clock::time_point capture_time(
+ std::chrono::nanoseconds(msg.hardware_capture_time_ns()) -
+ monotonic_offset);
+ LOG(INFO) << "The sensor data was sent "
+ << aos::time::DurationInSeconds(now - send_time)
+ << " seconds ago.";
+ LOG(INFO) << "The sensor data was read off of the hardware "
+ << aos::time::DurationInSeconds(now - capture_time)
+ << " seconds ago.";
+ }
+
+ aos::EventLoop *event_loop_;
+ aos::Fetcher<aos::message_bridge::ServerStatistics> clock_offset_fetcher_;
+};
+```
## Configurations
+### Configuring Multi-Node Systems
+
## FlatBuffers
See [FlatBuffers](/flatbuffers).
@@ -1272,12 +1805,6 @@
### Sent Too Fast
-## Multi-Node
-
-### Message Bridge
-
-### Correlating Monotonic Clocks Across Nodes
-
## Simulation
## Logging
diff --git a/documentation/aos/examples/BUILD b/documentation/aos/examples/BUILD
new file mode 100644
index 0000000..f224409
--- /dev/null
+++ b/documentation/aos/examples/BUILD
@@ -0,0 +1,18 @@
+load("@com_github_google_flatbuffers//:build_defs.bzl", "flatbuffer_cc_library")
+
+flatbuffer_cc_library(
+ name = "sensor_data_fbs",
+ srcs = ["sensor_data.fbs"],
+ gen_reflections = True,
+)
+
+cc_library(
+ name = "clock_offset_reader",
+ srcs = ["clock_offset_reader.cc"],
+ hdrs = ["clock_offset_reader.h"],
+ deps = [
+ ":sensor_data_fbs",
+ "//aos/events:event_loop",
+ "//aos/network:message_bridge_server_fbs",
+ ],
+)
diff --git a/documentation/aos/examples/clock_offset_reader.cc b/documentation/aos/examples/clock_offset_reader.cc
new file mode 100644
index 0000000..7859a65
--- /dev/null
+++ b/documentation/aos/examples/clock_offset_reader.cc
@@ -0,0 +1 @@
+#include "documentation/aos/examples/clock_offset_reader.h"
diff --git a/documentation/aos/examples/clock_offset_reader.h b/documentation/aos/examples/clock_offset_reader.h
new file mode 100644
index 0000000..330da4f
--- /dev/null
+++ b/documentation/aos/examples/clock_offset_reader.h
@@ -0,0 +1,80 @@
+#ifndef DOCUMENTATION_AOS_EXAMPLES_CLOCK_OFFSET_READER_H_
+#define DOCUMENTATION_AOS_EXAMPLES_CLOCK_OFFSET_READER_H_
+#include "aos/events/event_loop.h"
+#include "aos/network/message_bridge_server_generated.h"
+#include "documentation/aos/examples/sensor_data_generated.h"
+
+namespace examples {
+
+// This class is a sample that is shown in the markdown documentation.
+// If it needs to get updated, the sample should get updated as well.
+// TODO(james): Get a convenient way to directly include portions of files in
+// markdown so that we don't just manually copy-and-paste the code between
+// spots.
+class SensorAgeReader {
+ public:
+ SensorAgeReader(aos::EventLoop *event_loop)
+ : event_loop_(event_loop),
+ clock_offset_fetcher_(
+ event_loop->MakeFetcher<aos::message_bridge::ServerStatistics>(
+ "/aos")) {
+ event_loop_->MakeWatcher(
+ "/input", [this](const SensorData &msg) { HandleSensorData(msg); });
+ }
+
+ void HandleSensorData(const SensorData &msg) {
+ std::chrono::nanoseconds monotonic_offset{0};
+ clock_offset_fetcher_.Fetch();
+ if (clock_offset_fetcher_.get() != nullptr) {
+ for (const auto connection : *clock_offset_fetcher_->connections()) {
+ if (connection->has_node() && connection->node()->has_name() &&
+ connection->node()->name()->string_view() == "sensor") {
+ if (connection->has_monotonic_offset()) {
+ monotonic_offset =
+ std::chrono::nanoseconds(connection->monotonic_offset());
+ } else {
+ // If we don't have a monotonic offset, that means we aren't
+ // connected, in which case we should just exit early.
+ // The ServerStatistics message will always populate statuses for
+ // every node, so we don't have to worry about missing the "sensor"
+ // node (although it can be good practice to check that the node you
+ // are looking for actually exists, to protect against programming
+ // errors).
+ LOG(WARNING) << "Message bridge disconnected.";
+ return;
+ }
+ break;
+ }
+ }
+ } else {
+ LOG(WARNING) << "No message bridge status available.";
+ return;
+ }
+ const aos::monotonic_clock::time_point now = event_loop_->monotonic_now();
+ // The monotonic_remote_time will be the time that the message was sent on
+ // the source node; by offsetting it by the monotonic_offset, we should get
+ // a reasonable estimate of when it was sent. This does not account for any
+ // delays between the sensor reading and when it actually got sent.
+ const aos::monotonic_clock::time_point send_time(
+ event_loop_->context().monotonic_remote_time - monotonic_offset);
+ // Many sensors may include some sort of hardware timestamp indicating when
+ // the measurement was taken, which is likely before the sent time. This can
+ // be populated as a data field inside of the message, and if it is using
+ // the same monotonic clock as AOS is then we can do the same offset
+ // computation, but get a timestamp for when the data was actually captured.
+ const aos::monotonic_clock::time_point capture_time(
+ std::chrono::nanoseconds(msg.hardware_capture_time_ns()) -
+ monotonic_offset);
+ LOG(INFO) << "The sensor data was sent "
+ << aos::time::DurationInSeconds(now - send_time)
+ << " seconds ago.";
+ LOG(INFO) << "The sensor data was read off of the hardware "
+ << aos::time::DurationInSeconds(now - capture_time)
+ << " seconds ago.";
+ }
+
+ aos::EventLoop *event_loop_;
+ aos::Fetcher<aos::message_bridge::ServerStatistics> clock_offset_fetcher_;
+};
+} // namespace examples
+#endif // DOCUMENTATION_AOS_EXAMPLES_CLOCK_OFFSET_READER_H_
diff --git a/documentation/aos/examples/sensor_data.fbs b/documentation/aos/examples/sensor_data.fbs
new file mode 100644
index 0000000..c90472c
--- /dev/null
+++ b/documentation/aos/examples/sensor_data.fbs
@@ -0,0 +1,7 @@
+namespace examples;
+
+table SensorData {
+ hardware_capture_time_ns:uint64 (id: 0);
+}
+
+root_type SensorData;
diff --git a/frc971/downloader/downloader.py b/frc971/downloader/downloader.py
index d21d8bd..a962d5b 100644
--- a/frc971/downloader/downloader.py
+++ b/frc971/downloader/downloader.py
@@ -126,7 +126,7 @@
os.chmod(os.path.join(temp_dir, "starterd"), 0o775 | stat.S_ISUID)
rsync_cmd = ([
- "external/rsync/usr/bin/rsync",
+ "external/rsync/rsync",
"-e",
ssh_path,
"-c",
diff --git a/y2023/constants.cc b/y2023/constants.cc
index 3d49cf8..c17ffd0 100644
--- a/y2023/constants.cc
+++ b/y2023/constants.cc
@@ -102,7 +102,7 @@
arm_distal->zeroing.one_revolution_distance =
M_PI * 2.0 * constants::Values::kDistalEncoderRatio();
- roll_joint->zeroing.measured_absolute_position = 0.424187348328397;
+ roll_joint->zeroing.measured_absolute_position = 0.919380272775513;
roll_joint->potentiometer_offset =
-(3.87038557084874 - 0.0241774522172967 + 0.0711345168020632 -
0.866186131631967 - 0.0256788357596952 + 0.18101759154572017 -
@@ -111,7 +111,7 @@
0.11972765117321 - 0.318724743041507) +
0.0201047336425017 - 1.0173426655158 - 0.186085272847293 -
0.0317706563397807 - 2.6357823523782 + 0.871932806570122 +
- 1.09682107821155;
+ 1.09682107821155 - 0.193945964842277;
wrist->subsystem_params.zeroing_constants.measured_absolute_position =
0.886183343417664;
diff --git a/y2023/vision/image_logger.cc b/y2023/vision/image_logger.cc
index 1b457f9..45e25f6 100644
--- a/y2023/vision/image_logger.cc
+++ b/y2023/vision/image_logger.cc
@@ -57,7 +57,7 @@
aos::logger::Logger logger(&event_loop);
if (FLAGS_rotate_every != 0.0) {
- logger.set_on_logged_period([&] {
+ logger.set_on_logged_period([&](aos::monotonic_clock::time_point) {
const auto now = event_loop.monotonic_now();
if (logging && now > last_rotation_time + std::chrono::duration<double>(
FLAGS_rotate_every)) {