Add a test reproducing a log reading failure

We saw a case where we only had a reliable message for the second boot.

on pi2 from pi1 { "name": "/atest1", "type": "aos.examples.Ping" }
    .monotonic_event_time={.boot=0, .time=99.000150000sec},
    .monotonic_remote_time={.boot=0, .time=99.000000000sec},
    .monotonic_timestamp_time={.boot=0, .time=99.000250000sec}

on pi2 from pi1 { "name": "/atest2", "type": "aos.examples.Ping" }
    .monotonic_event_time={.boot=1, .time=0.000100000sec},
    .monotonic_remote_time={.boot=0, .time=1.000000000sec},
    .monotonic_timestamp_time={.boot=0, .time=100.000200000sec},

This ends up resulting in it looking like the second boot happened 50
seconds before the end of the first boot, and time going backwards.

Add a test to recreate this situation so we can fix it.

Change-Id: I4615e9c85ba57dcd7e656af1f94d3ca90f2c2126
Signed-off-by: James Kuszmaul <james.kuszmaul@bluerivertech.com>
diff --git a/aos/events/logging/BUILD b/aos/events/logging/BUILD
index a8979ee..eadefa8 100644
--- a/aos/events/logging/BUILD
+++ b/aos/events/logging/BUILD
@@ -807,6 +807,20 @@
 )
 
 aos_config(
+    name = "multinode_pingpong_reboot_reliable_only_config",
+    src = "multinode_pingpong_reboot_reliable_only.json",
+    flatbuffers = [
+        "//aos/events:ping_fbs",
+        "//aos/network:message_bridge_client_fbs",
+        "//aos/network:message_bridge_server_fbs",
+        "//aos/network:remote_message_fbs",
+        "//aos/network:timestamp_fbs",
+    ],
+    target_compatible_with = ["@platforms//os:linux"],
+    deps = ["//aos/events:aos_config"],
+)
+
+aos_config(
     name = "multinode_pingpong_reboot_ooo_config",
     src = "multinode_pingpong_reboot_ooo.json",
     flatbuffers = [
@@ -895,6 +909,7 @@
         ":multinode_pingpong_combined_config",
         ":multinode_pingpong_pi3_pingpong_config",
         ":multinode_pingpong_reboot_ooo_config",
+        ":multinode_pingpong_reboot_reliable_only_config",
         ":multinode_pingpong_split3_config",
         ":multinode_pingpong_split4_config",
         ":multinode_pingpong_split4_mixed1_config",
diff --git a/aos/events/logging/multinode_logger_test.cc b/aos/events/logging/multinode_logger_test.cc
index 427347d..a3cda43 100644
--- a/aos/events/logging/multinode_logger_test.cc
+++ b/aos/events/logging/multinode_logger_test.cc
@@ -79,6 +79,44 @@
                          ForceTimestampBuffering::kAutoBuffer}),
         ::testing::ValuesIn(SupportedCompressionAlgorithms())));
 
+// Class to spam Pong messages blindly.
+class PongSender {
+ public:
+  PongSender(EventLoop *loop, std::string_view channel_name)
+      : sender_(loop->MakeSender<examples::Pong>(channel_name)) {
+    loop->AddPhasedLoop(
+        [this](int) {
+          aos::Sender<examples::Pong>::Builder builder = sender_.MakeBuilder();
+          examples::Pong::Builder pong_builder =
+              builder.MakeBuilder<examples::Pong>();
+          CHECK_EQ(builder.Send(pong_builder.Finish()), RawSender::Error::kOk);
+        },
+        chrono::milliseconds(10));
+  }
+
+ private:
+  aos::Sender<examples::Pong> sender_;
+};
+
+// Class to spam Ping messages blindly.
+class PingSender {
+ public:
+  PingSender(EventLoop *loop, std::string_view channel_name)
+      : sender_(loop->MakeSender<examples::Ping>(channel_name)) {
+    loop->AddPhasedLoop(
+        [this](int) {
+          aos::Sender<examples::Ping>::Builder builder = sender_.MakeBuilder();
+          examples::Ping::Builder ping_builder =
+              builder.MakeBuilder<examples::Ping>();
+          CHECK_EQ(builder.Send(ping_builder.Finish()), RawSender::Error::kOk);
+        },
+        chrono::milliseconds(10));
+  }
+
+ private:
+  aos::Sender<examples::Ping> sender_;
+};
+
 // Tests that we can write and read simple multi-node log files.
 TEST_P(MultinodeLoggerTest, SimpleMultiNode) {
   if (file_strategy() == FileStrategy::kCombine) {
@@ -4686,24 +4724,71 @@
   auto result = ConfirmReadable(filenames);
 }
 
-// Class to spam Pong messages blindly.
-class PongSender {
- public:
-  PongSender(EventLoop *loop, std::string_view channel_name)
-      : sender_(loop->MakeSender<examples::Pong>(channel_name)) {
-    loop->AddPhasedLoop(
-        [this](int) {
-          aos::Sender<examples::Pong>::Builder builder = sender_.MakeBuilder();
-          examples::Pong::Builder pong_builder =
-              builder.MakeBuilder<examples::Pong>();
-          CHECK_EQ(builder.Send(pong_builder.Finish()), RawSender::Error::kOk);
-        },
-        chrono::milliseconds(10));
+// Tests that only having a delayed, reliable message from a boot results in a
+// readable log.
+//
+// Note: this is disabled since it doesn't work yet.  Un-disable this when the
+// code is fixed!
+TEST(MultinodeLoggerLoopTest, DISABLED_ReliableOnlyTimestamps) {
+  util::UnlinkRecursive(aos::testing::TestTmpDir() + "/logs");
+  std::filesystem::create_directory(aos::testing::TestTmpDir() + "/logs");
+
+  aos::FlatbufferDetachedBuffer<aos::Configuration> config =
+      aos::configuration::ReadConfig(
+          ArtifactPath("aos/events/logging/"
+                       "multinode_pingpong_reboot_reliable_only_config.json"));
+  message_bridge::TestingTimeConverter time_converter(
+      configuration::NodesCount(&config.message()));
+  SimulatedEventLoopFactory event_loop_factory(&config.message());
+  event_loop_factory.SetTimeConverter(&time_converter);
+
+  constexpr chrono::nanoseconds kRebootTime = chrono::seconds(100);
+  {
+    time_converter.AddNextTimestamp(
+        distributed_clock::epoch(),
+        {BootTimestamp::epoch(), BootTimestamp::epoch()});
+    time_converter.AddNextTimestamp(
+        distributed_clock::epoch() + kRebootTime,
+        {BootTimestamp::epoch() + kRebootTime,
+         BootTimestamp{.boot = 1, .time = monotonic_clock::epoch()}});
   }
 
- private:
-  aos::Sender<examples::Pong> sender_;
-};
+  const std::string kLogfile1_1 =
+      aos::testing::TestTmpDir() + "/logs/multi_logfile1/";
+
+  NodeEventLoopFactory *const pi1 =
+      event_loop_factory.GetNodeEventLoopFactory("pi1");
+
+  // We want unreliable timestamps from one boot, a reliable timestamp from the
+  // same boot, and then a long delayed reliable timestamp from the second boot.
+  // This produces conflicting information about when the second boot happened.
+  std::vector<std::string> filenames;
+  PingSender *app1 = pi1->AlwaysStart<PingSender>("pingsender", "/atest1");
+  PingSender *app2 = pi1->AlwaysStart<PingSender>("pingsender", "/atest2");
+  event_loop_factory.RunFor(chrono::seconds(1));
+  pi1->Stop(app2);
+  event_loop_factory.RunFor(kRebootTime - chrono::seconds(2));
+  pi1->Stop(app1);
+
+  event_loop_factory.RunFor(chrono::seconds(1) + kRebootTime * 2);
+
+  {
+    // Collect a small log after reboot.
+    LoggerState pi1_logger = MakeLoggerState(
+        pi1, &event_loop_factory, SupportedCompressionAlgorithms()[0],
+        FileStrategy::kKeepSeparate);
+    pi1_logger.StartLogger(kLogfile1_1);
+
+    event_loop_factory.RunFor(chrono::seconds(1));
+
+    pi1_logger.AppendAllFilenames(&filenames);
+  }
+
+  // Make sure we can read this.
+  const std::vector<LogFile> sorted_parts = SortParts(filenames);
+  EXPECT_TRUE(AllPartsMatchOutOfOrderDuration(sorted_parts));
+  auto result = ConfirmReadable(filenames);
+}
 
 // Tests that we log correctly as nodes connect slowly.
 TEST(MultinodeLoggerLoopTest, StaggeredConnect) {
diff --git a/aos/events/logging/multinode_pingpong_reboot_reliable_only.json b/aos/events/logging/multinode_pingpong_reboot_reliable_only.json
new file mode 100644
index 0000000..cfe421d
--- /dev/null
+++ b/aos/events/logging/multinode_pingpong_reboot_reliable_only.json
@@ -0,0 +1,214 @@
+{
+  "channels": [
+    {
+      "name": "/pi1/aos",
+      "type": "aos.logging.LogMessageFbs",
+      "source_node": "pi1",
+      "frequency": 200,
+      "num_senders": 20,
+      "max_size": 2048
+    },
+    {
+      "name": "/pi2/aos",
+      "type": "aos.logging.LogMessageFbs",
+      "source_node": "pi2",
+      "frequency": 200,
+      "num_senders": 20,
+      "max_size": 2048
+    },
+    /* Logged on pi1 locally */
+    {
+      "name": "/pi1/aos",
+      "type": "aos.timing.Report",
+      "source_node": "pi1",
+      "frequency": 50,
+      "num_senders": 20,
+      "max_size": 2048
+    },
+    {
+      "name": "/pi2/aos",
+      "type": "aos.timing.Report",
+      "source_node": "pi2",
+      "frequency": 50,
+      "num_senders": 20,
+      "max_size": 2048
+    },
+    {
+      "name": "/pi1/aos",
+      "type": "aos.message_bridge.ServerStatistics",
+      "logger": "LOCAL_LOGGER",
+      "source_node": "pi1"
+    },
+    {
+      "name": "/pi2/aos",
+      "type": "aos.message_bridge.ServerStatistics",
+      "logger": "LOCAL_LOGGER",
+      "source_node": "pi2"
+    },
+    {
+      "name": "/pi1/aos",
+      "type": "aos.logging.DynamicLogCommand",
+      "logger": "LOCAL_LOGGER",
+      "source_node": "pi1"
+    },
+    {
+      "name": "/pi2/aos",
+      "type": "aos.logging.DynamicLogCommand",
+      "logger": "LOCAL_LOGGER",
+      "source_node": "pi2"
+    },
+    {
+      "name": "/pi1/aos",
+      "type": "aos.message_bridge.ClientStatistics",
+      "logger": "LOCAL_LOGGER",
+      "source_node": "pi1"
+    },
+    {
+      "name": "/pi2/aos",
+      "type": "aos.message_bridge.ClientStatistics",
+      "logger": "LOCAL_LOGGER",
+      "source_node": "pi2"
+    },
+    {
+      "name": "/pi1/aos",
+      "type": "aos.message_bridge.Timestamp",
+      "logger": "NOT_LOGGED",
+      "source_node": "pi1",
+      "destination_nodes": [
+        {
+          "name": "pi2",
+          "timestamp_logger": "NOT_LOGGED"
+        }
+      ]
+    },
+    {
+      "name": "/pi2/aos",
+      "type": "aos.message_bridge.Timestamp",
+      "logger": "NOT_LOGGED",
+      "source_node": "pi2",
+      "destination_nodes": [
+        {
+          "name": "pi1",
+          "timestamp_logger": "NOT_LOGGED"
+        }
+      ]
+    },
+    {
+      "name": "/pi1/aos/remote_timestamps/pi2/pi1/aos/aos-message_bridge-Timestamp",
+      "type": "aos.message_bridge.RemoteMessage",
+      "logger": "NOT_LOGGED",
+      "num_senders": 2,
+      "source_node": "pi1"
+    },
+    {
+      "name": "/pi2/aos/remote_timestamps/pi1/pi2/aos/aos-message_bridge-Timestamp",
+      "type": "aos.message_bridge.RemoteMessage",
+      "logger": "NOT_LOGGED",
+      "num_senders": 2,
+      "source_node": "pi2"
+    },
+    {
+      "name": "/pi1/aos/remote_timestamps/pi2/atest1/aos-examples-Ping",
+      "type": "aos.message_bridge.RemoteMessage",
+      "logger": "NOT_LOGGED",
+      "num_senders": 2,
+      "source_node": "pi1",
+      "frequency": 150
+    },
+    {
+      "name": "/pi1/aos/remote_timestamps/pi2/atest2/aos-examples-Ping",
+      "type": "aos.message_bridge.RemoteMessage",
+      "logger": "NOT_LOGGED",
+      "num_senders": 2,
+      "source_node": "pi1",
+      "frequency": 150
+    },
+    {
+      "name": "/pi1/aos/remote_timestamps/pi2/atest3/aos-examples-Ping",
+      "type": "aos.message_bridge.RemoteMessage",
+      "logger": "NOT_LOGGED",
+      "num_senders": 2,
+      "source_node": "pi1",
+      "frequency": 150
+    },
+    {
+      "name": "/atest1",
+      "type": "aos.examples.Ping",
+      "source_node": "pi1",
+      "logger": "LOCAL_AND_REMOTE_LOGGER",
+      "logger_nodes": ["pi2"],
+      "destination_nodes": [
+        {
+          "name": "pi2",
+          "priority": 1,
+          "timestamp_logger": "LOCAL_AND_REMOTE_LOGGER",
+          "timestamp_logger_nodes": ["pi1"],
+          "time_to_live": 5000000
+        }
+      ],
+      "frequency": 150
+    },
+    {
+      "name": "/atest2",
+      "type": "aos.examples.Ping",
+      "source_node": "pi1",
+      "logger": "LOCAL_AND_REMOTE_LOGGER",
+      "logger_nodes": ["pi2"],
+      "destination_nodes": [
+        {
+          "name": "pi2",
+          "priority": 1,
+          "timestamp_logger": "LOCAL_AND_REMOTE_LOGGER",
+          "timestamp_logger_nodes": ["pi1"]
+        }
+      ],
+      "frequency": 150
+    },
+    {
+      "name": "/atest3",
+      "type": "aos.examples.Ping",
+      "source_node": "pi1",
+      "destination_nodes": [
+        {
+          "name": "pi2",
+          "priority": 1,
+          "timestamp_logger": "LOCAL_AND_REMOTE_LOGGER",
+          "timestamp_logger_nodes": ["pi1"]
+        }
+      ],
+      "frequency": 150
+    }
+  ],
+  "maps": [
+    {
+      "match": {
+        "name": "/aos*",
+        "source_node": "pi1"
+      },
+      "rename": {
+        "name": "/pi1/aos"
+      }
+    },
+    {
+      "match": {
+        "name": "/aos*",
+        "source_node": "pi2"
+      },
+      "rename": {
+        "name": "/pi2/aos"
+      }
+    }
+  ],
+  "nodes": [
+    {
+      "name": "pi1",
+      "hostname": "raspberrypi",
+      "port": 9971
+    },
+    {
+      "name": "pi2",
+      "hostname": "raspberrypi2",
+      "port": 9971
+    }
+  ]
+}
diff --git a/aos/events/simulated_event_loop.h b/aos/events/simulated_event_loop.h
index 6d88aa6..e976544 100644
--- a/aos/events/simulated_event_loop.h
+++ b/aos/events/simulated_event_loop.h
@@ -245,6 +245,10 @@
   template <class Main, class... Args>
   Main *AlwaysStart(std::string_view name, Args &&...args);
 
+  // Stops an application given the pointer to the application started.
+  template <class Main>
+  void Stop(Main *application);
+
   // Returns the simulated network delay for messages forwarded between nodes.
   std::chrono::nanoseconds network_delay() const {
     return factory_->network_delay();
@@ -341,6 +345,8 @@
         : event_loop(node_factory->MakeEventLoop(name)) {}
     virtual ~Application() {}
 
+    virtual void *application() = 0;
+
     std::unique_ptr<EventLoop> event_loop;
   };
 
@@ -365,6 +371,8 @@
     }
     ~TypedApplication() override {}
 
+    void *application() override { return &main; }
+
     Main main;
   };
 
@@ -392,6 +400,16 @@
   return main_ptr;
 }
 
+template <class Main>
+void NodeEventLoopFactory::Stop(Main *application) {
+  auto it = std::remove_if(
+      applications_.begin(), applications_.end(),
+      [application](const std::unique_ptr<Application> &app) {
+        return app->application() == static_cast<void *>(application);
+      });
+  applications_.erase(it, applications_.end());
+}
+
 inline monotonic_clock::time_point NodeEventLoopFactory::monotonic_now() const {
   // TODO(austin): Confirm that time never goes backwards?
   return scheduler_.monotonic_now();