Validate that RobustOwnershipTracker catches all the deaths right

This adds tests for our new robust death tracker.

Change-Id: I66e1d03df13ad873bcaeb878afe118a78d95a3a1
Co-authored-by: Austin Schuh <austin.schuh@bluerivertech.com>
Signed-off-by: James Kuszmaul <james.kuszmaul@bluerivertech.com>
diff --git a/aos/ipc_lib/BUILD b/aos/ipc_lib/BUILD
index d51ee25..75bbd4d 100644
--- a/aos/ipc_lib/BUILD
+++ b/aos/ipc_lib/BUILD
@@ -253,6 +253,16 @@
 )
 
 cc_test(
+    name = "robust_ownership_tracker_test",
+    srcs = ["robust_ownership_tracker_test.cc"],
+    target_compatible_with = ["@platforms//os:linux"],
+    deps = [
+        ":lockless_queue",
+        "//aos/testing:googletest",
+    ],
+)
+
+cc_test(
     name = "lockless_queue_test",
     timeout = "eternal",
     srcs = ["lockless_queue_test.cc"],
diff --git a/aos/ipc_lib/lockless_queue_death_test.cc b/aos/ipc_lib/lockless_queue_death_test.cc
index 2f4e08a..92ad8a8 100644
--- a/aos/ipc_lib/lockless_queue_death_test.cc
+++ b/aos/ipc_lib/lockless_queue_death_test.cc
@@ -102,14 +102,13 @@
         bool print = false;
 
         // TestShmRobustness doesn't handle robust futexes.  It is happy to just
-        // not crash with them.  We know what they are, and what the tid of the
-        // holder is.  So go pretend to be the kernel and fix it for it.
+        // not crash with them.  We know what the futexes are, and what the tid
+        // of the corresponding holder is.  So go pretend to be the kernel and
+        // fix the futex.
         PretendThatOwnerIsDeadForTesting(&memory->queue_setup_lock, tid.Get());
 
         for (size_t i = 0; i < config.num_senders; ++i) {
-          if (memory->GetSender(i)
-                  ->ownership_tracker.PretendThatOwnerIsDeadForTesting(
-                      tid.Get())) {
+          if (memory->GetSender(i)->ownership_tracker.IsHeldBy(tid.Get())) {
             // Print out before and after results if a sender died.  That is the
             // more fun case.
             print = true;
diff --git a/aos/ipc_lib/robust_ownership_tracker.cc b/aos/ipc_lib/robust_ownership_tracker.cc
index 3b6d32a..f106113 100644
--- a/aos/ipc_lib/robust_ownership_tracker.cc
+++ b/aos/ipc_lib/robust_ownership_tracker.cc
@@ -5,10 +5,6 @@
 namespace aos {
 namespace ipc_lib {
 
-bool RobustOwnershipTracker::PretendThatOwnerIsDeadForTesting(pid_t died_tid) {
-  return ipc_lib::PretendThatOwnerIsDeadForTesting(&mutex_, died_tid);
-}
-
 ::std::string RobustOwnershipTracker::DebugString() const {
   ::std::stringstream s;
   s << "{.tid=aos_mutex(" << ::std::hex << mutex_.futex;
diff --git a/aos/ipc_lib/robust_ownership_tracker.h b/aos/ipc_lib/robust_ownership_tracker.h
index 6f0d222..a8bbca3 100644
--- a/aos/ipc_lib/robust_ownership_tracker.h
+++ b/aos/ipc_lib/robust_ownership_tracker.h
@@ -10,6 +10,9 @@
 #include "aos/util/top.h"
 
 namespace aos::ipc_lib {
+namespace testing {
+class RobustOwnershipTrackerTest;
+}  // namespace testing
 
 // Results of atomically loading the ownership state via RobustOwnershipTracker
 // below. This allows the state to be compared and queried later.
@@ -122,6 +125,11 @@
   // Returns true if this thread holds ownership.
   bool IsHeldBySelf() { return death_notification_is_held(&mutex_); }
 
+  // Returns true if the mutex is held by the provided tid.  This is primarily
+  // intended for testing. There should be no need to call this in production
+  // code.
+  bool IsHeldBy(pid_t tid) { return LoadRelaxed().tid() == tid; }
+
   // Acquires ownership. Other threads will know that this thread holds the
   // ownership or be notified if this thread dies.
   void Acquire() {
@@ -143,18 +151,12 @@
     start_time_ticks_ = kNoStartTimeTicks;
   }
 
-  // Marks the owner as dead if the specified tid is the current owner. In other
-  // words, after this call, a call to `LoadAcquire().OwnerIsDead()` may start
-  // returning true.
-  //
-  // The motivation here is for use in testing. DO NOT USE in production code.
-  // The logic here is only good enough for testing.
-  bool PretendThatOwnerIsDeadForTesting(pid_t tid);
-
   // Returns a string representing this object.
   std::string DebugString() const;
 
  private:
+  friend class testing::RobustOwnershipTrackerTest;
+
   // Robust futex to track ownership the normal way. The futex is inside the
   // mutex here. We use the wrapper mutex because the death_notification_*
   // functions operate on that instead of the futex directly.
diff --git a/aos/ipc_lib/robust_ownership_tracker_test.cc b/aos/ipc_lib/robust_ownership_tracker_test.cc
new file mode 100644
index 0000000..b204886
--- /dev/null
+++ b/aos/ipc_lib/robust_ownership_tracker_test.cc
@@ -0,0 +1,155 @@
+#include "aos/ipc_lib/robust_ownership_tracker.h"
+
+#include <sys/mman.h>
+#include <sys/wait.h>
+
+#include "gtest/gtest.h"
+
+namespace aos::ipc_lib::testing {
+
+// Capture RobustOwnershipTracker in shared memory so it is shared across a
+// fork.
+class SharedRobustOwnershipTracker {
+ public:
+  SharedRobustOwnershipTracker() {
+    tracker_ = static_cast<RobustOwnershipTracker *>(
+        mmap(nullptr, sizeof(RobustOwnershipTracker), PROT_READ | PROT_WRITE,
+             MAP_SHARED | MAP_ANONYMOUS, -1, 0));
+    PCHECK(MAP_FAILED != tracker_);
+  };
+  ~SharedRobustOwnershipTracker() {
+    PCHECK(munmap(tracker_, sizeof(RobustOwnershipTracker)) != -1);
+  }
+
+  // Captures the tid.
+  RobustOwnershipTracker &tracker() const { return *tracker_; }
+
+ private:
+  RobustOwnershipTracker *tracker_;
+};
+
+class RobustOwnershipTrackerTest : public ::testing::Test {
+ public:
+  // Runs a function in a child process, and then exits afterwards.  Waits for
+  // the child to finish before resuming.
+  template <typename T>
+  void RunInChildAndBlockUntilComplete(T fn) {
+    pid_t pid = fork();
+    if (pid == 0) {
+      fn();
+      LOG(INFO) << "Child exiting normally.";
+      exit(0);
+      return;
+    }
+
+    LOG(INFO) << "Child has pid " << pid;
+
+    while (true) {
+      LOG(INFO) << "Waiting for child.";
+      int status;
+      const pid_t waited_on = waitpid(pid, &status, 0);
+      // Check for failure.
+      if (waited_on == -1) {
+        if (errno == EINTR) continue;
+        PLOG(FATAL) << ": waitpid(" << pid << ", " << &status << ", 0) failed";
+      }
+      CHECK_EQ(waited_on, pid)
+          << ": waitpid() got child " << waited_on << " instead of " << pid;
+      CHECK(WIFEXITED(status));
+      LOG(INFO) << "Status " << WEXITSTATUS(status);
+      CHECK(WEXITSTATUS(status) == 0);
+      return;
+    }
+  }
+
+  // Returns the robust mutex.
+  aos_mutex &GetMutex(RobustOwnershipTracker &tracker) {
+    return tracker.mutex_;
+  }
+
+  // Returns the current start time in ticks.
+  uint64_t GetStartTimeTicks(RobustOwnershipTracker &tracker) {
+    return tracker.start_time_ticks_.load();
+  }
+
+  // Sets the current start time in ticks.
+  void SetStartTimeTicks(RobustOwnershipTracker &tracker, uint64_t start_time) {
+    tracker.start_time_ticks_ = start_time;
+  }
+};
+
+// Tests that acquiring the futex doesn't erroneously report the owner (i.e.
+// "us") as dead.
+TEST_F(RobustOwnershipTrackerTest, AcquireWorks) {
+  SharedRobustOwnershipTracker shared_tracker;
+
+  EXPECT_FALSE(shared_tracker.tracker().OwnerIsDefinitelyAbsolutelyDead());
+
+  // Run acquire in the this process, and expect it should not be dead until
+  // after the test finishes.
+  shared_tracker.tracker().Acquire();
+
+  // We have ownership. Since we are alive, the owner should not be marked as
+  // dead. We can use relaxed ordering since we are the only ones touching the
+  // data here.
+  EXPECT_FALSE(shared_tracker.tracker().LoadRelaxed().OwnerIsDead());
+  EXPECT_FALSE(shared_tracker.tracker().OwnerIsDefinitelyAbsolutelyDead());
+}
+
+// Tests that child death without unlocking results in the futex being marked as
+// dead, and the owner being very dead.
+TEST_F(RobustOwnershipTrackerTest, FutexRecovers) {
+  SharedRobustOwnershipTracker shared_tracker;
+
+  RunInChildAndBlockUntilComplete(
+      [&]() { shared_tracker.tracker().Acquire(); });
+
+  // Since the child that took ownership died, we expect that death to be
+  // reported.
+  EXPECT_TRUE(shared_tracker.tracker().LoadRelaxed().OwnerIsDead());
+  EXPECT_TRUE(shared_tracker.tracker().OwnerIsDefinitelyAbsolutelyDead());
+}
+
+// Tests that a PID which doesn't exist results in the process being noticed as
+// dead when we inspect /proc.
+TEST_F(RobustOwnershipTrackerTest, NoMatchingPID) {
+  SharedRobustOwnershipTracker shared_tracker;
+
+  shared_tracker.tracker().Acquire();
+  EXPECT_FALSE(shared_tracker.tracker().LoadRelaxed().OwnerIsDead());
+  EXPECT_FALSE(shared_tracker.tracker().OwnerIsDefinitelyAbsolutelyDead());
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wignored-attributes"
+  GetMutex(shared_tracker.tracker()).futex =
+      std::numeric_limits<aos_futex>::max() & FUTEX_TID_MASK;
+#pragma GCC diagnostic pop
+
+  // Since we're only pretending that the owner died (by changing the TID in the
+  // futex), we only notice that the owner is dead when spending the time
+  // walking through /proc.
+  EXPECT_FALSE(shared_tracker.tracker().LoadRelaxed().OwnerIsDead());
+  EXPECT_TRUE(shared_tracker.tracker().OwnerIsDefinitelyAbsolutelyDead());
+}
+
+// Tests that a mismatched start time results in the process being marked as
+// dead.
+TEST_F(RobustOwnershipTrackerTest, NoMatchingStartTime) {
+  SharedRobustOwnershipTracker shared_tracker;
+
+  shared_tracker.tracker().Acquire();
+  EXPECT_FALSE(shared_tracker.tracker().LoadRelaxed().OwnerIsDead());
+  EXPECT_FALSE(shared_tracker.tracker().OwnerIsDefinitelyAbsolutelyDead());
+
+  EXPECT_NE(GetStartTimeTicks(shared_tracker.tracker()), 0);
+  EXPECT_NE(GetStartTimeTicks(shared_tracker.tracker()),
+            RobustOwnershipTracker::kNoStartTimeTicks);
+  SetStartTimeTicks(shared_tracker.tracker(), 1);
+
+  // Since we're only pretending that the owner died (by changing the tracked
+  // start time ticks in the tracker), we only notice that the owner is dead
+  // when spending the time walking through /proc.
+  EXPECT_FALSE(shared_tracker.tracker().LoadRelaxed().OwnerIsDead());
+  EXPECT_TRUE(shared_tracker.tracker().OwnerIsDefinitelyAbsolutelyDead());
+}
+
+}  // namespace aos::ipc_lib::testing