Add a test reproducing a log reading failure
We saw a case where we only had a reliable message for the second boot.
on pi2 from pi1 { "name": "/atest1", "type": "aos.examples.Ping" }
.monotonic_event_time={.boot=0, .time=99.000150000sec},
.monotonic_remote_time={.boot=0, .time=99.000000000sec},
.monotonic_timestamp_time={.boot=0, .time=99.000250000sec}
on pi2 from pi1 { "name": "/atest2", "type": "aos.examples.Ping" }
.monotonic_event_time={.boot=1, .time=0.000100000sec},
.monotonic_remote_time={.boot=0, .time=1.000000000sec},
.monotonic_timestamp_time={.boot=0, .time=100.000200000sec},
This ends up resulting in it looking like the second boot happened 50
seconds before the end of the first boot, and time going backwards.
Add a test to recreate this situation so we can fix it.
Change-Id: I4615e9c85ba57dcd7e656af1f94d3ca90f2c2126
Signed-off-by: James Kuszmaul <james.kuszmaul@bluerivertech.com>
diff --git a/aos/events/logging/BUILD b/aos/events/logging/BUILD
index a8979ee..eadefa8 100644
--- a/aos/events/logging/BUILD
+++ b/aos/events/logging/BUILD
@@ -807,6 +807,20 @@
)
aos_config(
+ name = "multinode_pingpong_reboot_reliable_only_config",
+ src = "multinode_pingpong_reboot_reliable_only.json",
+ flatbuffers = [
+ "//aos/events:ping_fbs",
+ "//aos/network:message_bridge_client_fbs",
+ "//aos/network:message_bridge_server_fbs",
+ "//aos/network:remote_message_fbs",
+ "//aos/network:timestamp_fbs",
+ ],
+ target_compatible_with = ["@platforms//os:linux"],
+ deps = ["//aos/events:aos_config"],
+)
+
+aos_config(
name = "multinode_pingpong_reboot_ooo_config",
src = "multinode_pingpong_reboot_ooo.json",
flatbuffers = [
@@ -895,6 +909,7 @@
":multinode_pingpong_combined_config",
":multinode_pingpong_pi3_pingpong_config",
":multinode_pingpong_reboot_ooo_config",
+ ":multinode_pingpong_reboot_reliable_only_config",
":multinode_pingpong_split3_config",
":multinode_pingpong_split4_config",
":multinode_pingpong_split4_mixed1_config",
diff --git a/aos/events/logging/multinode_logger_test.cc b/aos/events/logging/multinode_logger_test.cc
index 427347d..a3cda43 100644
--- a/aos/events/logging/multinode_logger_test.cc
+++ b/aos/events/logging/multinode_logger_test.cc
@@ -79,6 +79,44 @@
ForceTimestampBuffering::kAutoBuffer}),
::testing::ValuesIn(SupportedCompressionAlgorithms())));
+// Class to spam Pong messages blindly.
+class PongSender {
+ public:
+ PongSender(EventLoop *loop, std::string_view channel_name)
+ : sender_(loop->MakeSender<examples::Pong>(channel_name)) {
+ loop->AddPhasedLoop(
+ [this](int) {
+ aos::Sender<examples::Pong>::Builder builder = sender_.MakeBuilder();
+ examples::Pong::Builder pong_builder =
+ builder.MakeBuilder<examples::Pong>();
+ CHECK_EQ(builder.Send(pong_builder.Finish()), RawSender::Error::kOk);
+ },
+ chrono::milliseconds(10));
+ }
+
+ private:
+ aos::Sender<examples::Pong> sender_;
+};
+
+// Class to spam Ping messages blindly.
+class PingSender {
+ public:
+ PingSender(EventLoop *loop, std::string_view channel_name)
+ : sender_(loop->MakeSender<examples::Ping>(channel_name)) {
+ loop->AddPhasedLoop(
+ [this](int) {
+ aos::Sender<examples::Ping>::Builder builder = sender_.MakeBuilder();
+ examples::Ping::Builder ping_builder =
+ builder.MakeBuilder<examples::Ping>();
+ CHECK_EQ(builder.Send(ping_builder.Finish()), RawSender::Error::kOk);
+ },
+ chrono::milliseconds(10));
+ }
+
+ private:
+ aos::Sender<examples::Ping> sender_;
+};
+
// Tests that we can write and read simple multi-node log files.
TEST_P(MultinodeLoggerTest, SimpleMultiNode) {
if (file_strategy() == FileStrategy::kCombine) {
@@ -4686,24 +4724,71 @@
auto result = ConfirmReadable(filenames);
}
-// Class to spam Pong messages blindly.
-class PongSender {
- public:
- PongSender(EventLoop *loop, std::string_view channel_name)
- : sender_(loop->MakeSender<examples::Pong>(channel_name)) {
- loop->AddPhasedLoop(
- [this](int) {
- aos::Sender<examples::Pong>::Builder builder = sender_.MakeBuilder();
- examples::Pong::Builder pong_builder =
- builder.MakeBuilder<examples::Pong>();
- CHECK_EQ(builder.Send(pong_builder.Finish()), RawSender::Error::kOk);
- },
- chrono::milliseconds(10));
+// Tests that only having a delayed, reliable message from a boot results in a
+// readable log.
+//
+// Note: this is disabled since it doesn't work yet. Un-disable this when the
+// code is fixed!
+TEST(MultinodeLoggerLoopTest, DISABLED_ReliableOnlyTimestamps) {
+ util::UnlinkRecursive(aos::testing::TestTmpDir() + "/logs");
+ std::filesystem::create_directory(aos::testing::TestTmpDir() + "/logs");
+
+ aos::FlatbufferDetachedBuffer<aos::Configuration> config =
+ aos::configuration::ReadConfig(
+ ArtifactPath("aos/events/logging/"
+ "multinode_pingpong_reboot_reliable_only_config.json"));
+ message_bridge::TestingTimeConverter time_converter(
+ configuration::NodesCount(&config.message()));
+ SimulatedEventLoopFactory event_loop_factory(&config.message());
+ event_loop_factory.SetTimeConverter(&time_converter);
+
+ constexpr chrono::nanoseconds kRebootTime = chrono::seconds(100);
+ {
+ time_converter.AddNextTimestamp(
+ distributed_clock::epoch(),
+ {BootTimestamp::epoch(), BootTimestamp::epoch()});
+ time_converter.AddNextTimestamp(
+ distributed_clock::epoch() + kRebootTime,
+ {BootTimestamp::epoch() + kRebootTime,
+ BootTimestamp{.boot = 1, .time = monotonic_clock::epoch()}});
}
- private:
- aos::Sender<examples::Pong> sender_;
-};
+ const std::string kLogfile1_1 =
+ aos::testing::TestTmpDir() + "/logs/multi_logfile1/";
+
+ NodeEventLoopFactory *const pi1 =
+ event_loop_factory.GetNodeEventLoopFactory("pi1");
+
+ // We want unreliable timestamps from one boot, a reliable timestamp from the
+ // same boot, and then a long delayed reliable timestamp from the second boot.
+ // This produces conflicting information about when the second boot happened.
+ std::vector<std::string> filenames;
+ PingSender *app1 = pi1->AlwaysStart<PingSender>("pingsender", "/atest1");
+ PingSender *app2 = pi1->AlwaysStart<PingSender>("pingsender", "/atest2");
+ event_loop_factory.RunFor(chrono::seconds(1));
+ pi1->Stop(app2);
+ event_loop_factory.RunFor(kRebootTime - chrono::seconds(2));
+ pi1->Stop(app1);
+
+ event_loop_factory.RunFor(chrono::seconds(1) + kRebootTime * 2);
+
+ {
+ // Collect a small log after reboot.
+ LoggerState pi1_logger = MakeLoggerState(
+ pi1, &event_loop_factory, SupportedCompressionAlgorithms()[0],
+ FileStrategy::kKeepSeparate);
+ pi1_logger.StartLogger(kLogfile1_1);
+
+ event_loop_factory.RunFor(chrono::seconds(1));
+
+ pi1_logger.AppendAllFilenames(&filenames);
+ }
+
+ // Make sure we can read this.
+ const std::vector<LogFile> sorted_parts = SortParts(filenames);
+ EXPECT_TRUE(AllPartsMatchOutOfOrderDuration(sorted_parts));
+ auto result = ConfirmReadable(filenames);
+}
// Tests that we log correctly as nodes connect slowly.
TEST(MultinodeLoggerLoopTest, StaggeredConnect) {
diff --git a/aos/events/logging/multinode_pingpong_reboot_reliable_only.json b/aos/events/logging/multinode_pingpong_reboot_reliable_only.json
new file mode 100644
index 0000000..cfe421d
--- /dev/null
+++ b/aos/events/logging/multinode_pingpong_reboot_reliable_only.json
@@ -0,0 +1,214 @@
+{
+ "channels": [
+ {
+ "name": "/pi1/aos",
+ "type": "aos.logging.LogMessageFbs",
+ "source_node": "pi1",
+ "frequency": 200,
+ "num_senders": 20,
+ "max_size": 2048
+ },
+ {
+ "name": "/pi2/aos",
+ "type": "aos.logging.LogMessageFbs",
+ "source_node": "pi2",
+ "frequency": 200,
+ "num_senders": 20,
+ "max_size": 2048
+ },
+ /* Logged on pi1 locally */
+ {
+ "name": "/pi1/aos",
+ "type": "aos.timing.Report",
+ "source_node": "pi1",
+ "frequency": 50,
+ "num_senders": 20,
+ "max_size": 2048
+ },
+ {
+ "name": "/pi2/aos",
+ "type": "aos.timing.Report",
+ "source_node": "pi2",
+ "frequency": 50,
+ "num_senders": 20,
+ "max_size": 2048
+ },
+ {
+ "name": "/pi1/aos",
+ "type": "aos.message_bridge.ServerStatistics",
+ "logger": "LOCAL_LOGGER",
+ "source_node": "pi1"
+ },
+ {
+ "name": "/pi2/aos",
+ "type": "aos.message_bridge.ServerStatistics",
+ "logger": "LOCAL_LOGGER",
+ "source_node": "pi2"
+ },
+ {
+ "name": "/pi1/aos",
+ "type": "aos.logging.DynamicLogCommand",
+ "logger": "LOCAL_LOGGER",
+ "source_node": "pi1"
+ },
+ {
+ "name": "/pi2/aos",
+ "type": "aos.logging.DynamicLogCommand",
+ "logger": "LOCAL_LOGGER",
+ "source_node": "pi2"
+ },
+ {
+ "name": "/pi1/aos",
+ "type": "aos.message_bridge.ClientStatistics",
+ "logger": "LOCAL_LOGGER",
+ "source_node": "pi1"
+ },
+ {
+ "name": "/pi2/aos",
+ "type": "aos.message_bridge.ClientStatistics",
+ "logger": "LOCAL_LOGGER",
+ "source_node": "pi2"
+ },
+ {
+ "name": "/pi1/aos",
+ "type": "aos.message_bridge.Timestamp",
+ "logger": "NOT_LOGGED",
+ "source_node": "pi1",
+ "destination_nodes": [
+ {
+ "name": "pi2",
+ "timestamp_logger": "NOT_LOGGED"
+ }
+ ]
+ },
+ {
+ "name": "/pi2/aos",
+ "type": "aos.message_bridge.Timestamp",
+ "logger": "NOT_LOGGED",
+ "source_node": "pi2",
+ "destination_nodes": [
+ {
+ "name": "pi1",
+ "timestamp_logger": "NOT_LOGGED"
+ }
+ ]
+ },
+ {
+ "name": "/pi1/aos/remote_timestamps/pi2/pi1/aos/aos-message_bridge-Timestamp",
+ "type": "aos.message_bridge.RemoteMessage",
+ "logger": "NOT_LOGGED",
+ "num_senders": 2,
+ "source_node": "pi1"
+ },
+ {
+ "name": "/pi2/aos/remote_timestamps/pi1/pi2/aos/aos-message_bridge-Timestamp",
+ "type": "aos.message_bridge.RemoteMessage",
+ "logger": "NOT_LOGGED",
+ "num_senders": 2,
+ "source_node": "pi2"
+ },
+ {
+ "name": "/pi1/aos/remote_timestamps/pi2/atest1/aos-examples-Ping",
+ "type": "aos.message_bridge.RemoteMessage",
+ "logger": "NOT_LOGGED",
+ "num_senders": 2,
+ "source_node": "pi1",
+ "frequency": 150
+ },
+ {
+ "name": "/pi1/aos/remote_timestamps/pi2/atest2/aos-examples-Ping",
+ "type": "aos.message_bridge.RemoteMessage",
+ "logger": "NOT_LOGGED",
+ "num_senders": 2,
+ "source_node": "pi1",
+ "frequency": 150
+ },
+ {
+ "name": "/pi1/aos/remote_timestamps/pi2/atest3/aos-examples-Ping",
+ "type": "aos.message_bridge.RemoteMessage",
+ "logger": "NOT_LOGGED",
+ "num_senders": 2,
+ "source_node": "pi1",
+ "frequency": 150
+ },
+ {
+ "name": "/atest1",
+ "type": "aos.examples.Ping",
+ "source_node": "pi1",
+ "logger": "LOCAL_AND_REMOTE_LOGGER",
+ "logger_nodes": ["pi2"],
+ "destination_nodes": [
+ {
+ "name": "pi2",
+ "priority": 1,
+ "timestamp_logger": "LOCAL_AND_REMOTE_LOGGER",
+ "timestamp_logger_nodes": ["pi1"],
+ "time_to_live": 5000000
+ }
+ ],
+ "frequency": 150
+ },
+ {
+ "name": "/atest2",
+ "type": "aos.examples.Ping",
+ "source_node": "pi1",
+ "logger": "LOCAL_AND_REMOTE_LOGGER",
+ "logger_nodes": ["pi2"],
+ "destination_nodes": [
+ {
+ "name": "pi2",
+ "priority": 1,
+ "timestamp_logger": "LOCAL_AND_REMOTE_LOGGER",
+ "timestamp_logger_nodes": ["pi1"]
+ }
+ ],
+ "frequency": 150
+ },
+ {
+ "name": "/atest3",
+ "type": "aos.examples.Ping",
+ "source_node": "pi1",
+ "destination_nodes": [
+ {
+ "name": "pi2",
+ "priority": 1,
+ "timestamp_logger": "LOCAL_AND_REMOTE_LOGGER",
+ "timestamp_logger_nodes": ["pi1"]
+ }
+ ],
+ "frequency": 150
+ }
+ ],
+ "maps": [
+ {
+ "match": {
+ "name": "/aos*",
+ "source_node": "pi1"
+ },
+ "rename": {
+ "name": "/pi1/aos"
+ }
+ },
+ {
+ "match": {
+ "name": "/aos*",
+ "source_node": "pi2"
+ },
+ "rename": {
+ "name": "/pi2/aos"
+ }
+ }
+ ],
+ "nodes": [
+ {
+ "name": "pi1",
+ "hostname": "raspberrypi",
+ "port": 9971
+ },
+ {
+ "name": "pi2",
+ "hostname": "raspberrypi2",
+ "port": 9971
+ }
+ ]
+}
diff --git a/aos/events/simulated_event_loop.h b/aos/events/simulated_event_loop.h
index 6d88aa6..e976544 100644
--- a/aos/events/simulated_event_loop.h
+++ b/aos/events/simulated_event_loop.h
@@ -245,6 +245,10 @@
template <class Main, class... Args>
Main *AlwaysStart(std::string_view name, Args &&...args);
+ // Stops an application given the pointer to the application started.
+ template <class Main>
+ void Stop(Main *application);
+
// Returns the simulated network delay for messages forwarded between nodes.
std::chrono::nanoseconds network_delay() const {
return factory_->network_delay();
@@ -341,6 +345,8 @@
: event_loop(node_factory->MakeEventLoop(name)) {}
virtual ~Application() {}
+ virtual void *application() = 0;
+
std::unique_ptr<EventLoop> event_loop;
};
@@ -365,6 +371,8 @@
}
~TypedApplication() override {}
+ void *application() override { return &main; }
+
Main main;
};
@@ -392,6 +400,16 @@
return main_ptr;
}
+template <class Main>
+void NodeEventLoopFactory::Stop(Main *application) {
+ auto it = std::remove_if(
+ applications_.begin(), applications_.end(),
+ [application](const std::unique_ptr<Application> &app) {
+ return app->application() == static_cast<void *>(application);
+ });
+ applications_.erase(it, applications_.end());
+}
+
inline monotonic_clock::time_point NodeEventLoopFactory::monotonic_now() const {
// TODO(austin): Confirm that time never goes backwards?
return scheduler_.monotonic_now();