aos: Detect lockless queue owner death more reliably

When the OOM killer kills us, or the process otherwise dies
aggressively, the robust futex cleanup doesn't happen. This results in
senders, watchers, or pinners getting leaked until reboot.

Fix this by checking that the tid still exists, and by recording the
owner's start time and confirming that the tid's current start time
matches it.  That should let us catch the PID collision case reliably.
We only need to do the exhaustive check when constructing the queue, so
it is OK for it to be expensive.

Because we're changing the format in the SHMEM files here (i.e. adding
the `start_time_ticks` field) we need to bump the queue version
number.

Mostly written by Phil Schrader.

Change-Id: I6bff78b6933fed2e0163bcee26138b6a8af857ad
Co-authored-by: Austin Schuh <austin.schuh@bluerivertech.com>
Signed-off-by: James Kuszmaul <james.kuszmaul@bluerivertech.com>
diff --git a/aos/ipc_lib/robust_ownership_tracker.h b/aos/ipc_lib/robust_ownership_tracker.h
index 968c2f7..6f0d222 100644
--- a/aos/ipc_lib/robust_ownership_tracker.h
+++ b/aos/ipc_lib/robust_ownership_tracker.h
@@ -2,10 +2,12 @@
 #define AOS_IPC_LIB_ROBUST_OWNERSHIP_TRACKER_H_
 
 #include <linux/futex.h>
+#include <sys/syscall.h>
 
 #include <string>
 
 #include "aos/ipc_lib/aos_sync.h"
+#include "aos/util/top.h"
 
 namespace aos::ipc_lib {
 
@@ -28,11 +30,6 @@
   // Returns true if no one has claimed ownership.
   bool IsUnclaimed() const { return futex_ == 0; }
 
-  // Returns true if either ownership hasn't been acquired or the owner died.
-  bool IsUnclaimedOrOwnerIsDead() const {
-    return IsUnclaimed() || OwnerIsDead();
-  }
-
   // Returns the thread ID (a.k.a. "tid") of the owning thread. Use this when
   // trying to access the /proc entry that corresponds to the owning thread for
   // example. Do not use the futex value directly.
@@ -55,19 +52,59 @@
 // All methods other than Load* must be accessed under a mutex.
 class RobustOwnershipTracker {
  public:
-  // Loads all the contents of the ownership tracker with Acquire memory
-  // ordering.
+  // Sentinel returned when no start time is available for a tid.
+  static constexpr uint64_t kNoStartTimeTicks =
+      std::numeric_limits<uint64_t>::max();
+
+  // Returns `tid`'s start time, or kNoStartTimeTicks if tid is 0 or unreadable.
+  static uint64_t ReadStartTimeTicks(pid_t tid) {
+    if (tid != 0) {
+      if (const auto stat = util::ReadProcStat(tid); stat.has_value()) {
+        return stat->start_time_ticks;
+      }
+    }
+    return kNoStartTimeTicks;
+  }
+
+  // Snapshots only the futex word (not start_time_ticks_) with Acquire memory
+  // ordering; this is the realtime-compatible part of the ownership state.
   ThreadOwnerStatusSnapshot LoadAcquire() const {
     return ThreadOwnerStatusSnapshot(
         __atomic_load_n(&(mutex_.futex), __ATOMIC_ACQUIRE));
   }
 
-  // Loads all the contents of the ownership tracker with Relaxed memory order.
+  // Snapshots only the futex word (not start_time_ticks_) with Relaxed memory
+  // ordering; this is the realtime-compatible part of the ownership state.
   ThreadOwnerStatusSnapshot LoadRelaxed() const {
     return ThreadOwnerStatusSnapshot(
         __atomic_load_n(&(mutex_.futex), __ATOMIC_RELAXED));
   }
 
+  // Slow-path liveness check: returns true if the owner is known to be dead,
+  // either because the futex says so or because /proc disagrees with the
+  // recorded start time. Must hold the mutex; never call from realtime code.
+  bool OwnerIsDefinitelyAbsolutelyDead() const {
+    auto loaded = LoadAcquire();
+    if (loaded.OwnerIsDead()) {
+      return true;
+    }
+    if (loaded.IsUnclaimed()) {
+      return false;
+    }
+    const uint64_t proc_start_time_ticks = ReadStartTimeTicks(loaded.tid());
+    if (proc_start_time_ticks == kNoStartTimeTicks) {
+      LOG(ERROR) << "Detected that PID " << loaded.tid() << " died.";
+      return true;
+    }
+    // The tid exists, but a different start time means it has been reused.
+    if (proc_start_time_ticks != start_time_ticks_) {
+      LOG(ERROR) << "Detected that PID " << loaded.tid()
+                 << " died from a starttime mismatch.";
+      return true;
+    }
+    return false;
+  }
+
   // Clears all ownership state.
   //
   // This should only really be called if you are 100% certain that the owner is
@@ -79,6 +116,7 @@
     // want the kernel to know about this element via the linked list the next
     // time someone takes ownership.
     __atomic_store_n(&(mutex_.futex), 0, __ATOMIC_RELEASE);
+    start_time_ticks_ = kNoStartTimeTicks;
   }
 
   // Returns true if this thread holds ownership.
@@ -86,7 +124,15 @@
 
   // Acquires ownership. Other threads will know that this thread holds the
   // ownership or be notified if this thread dies.
-  void Acquire() { death_notification_init(&mutex_); }
+  void Acquire() {
+    // Record this thread's start time so a reused tid can be told apart later.
+    const pid_t tid = syscall(SYS_gettid);
+    CHECK_GT(tid, 0);  // Unlike assert(), still enforced in NDEBUG builds.
+    const uint64_t proc_start_time_ticks = ReadStartTimeTicks(tid);
+    CHECK_NE(proc_start_time_ticks, kNoStartTimeTicks);
+    start_time_ticks_ = proc_start_time_ticks;
+    death_notification_init(&mutex_);
+  }
 
   // Releases ownership.
   //
@@ -94,6 +140,7 @@
   void Release() {
     // Must be opposite order of Acquire.
     death_notification_release(&mutex_);
+    start_time_ticks_ = kNoStartTimeTicks;  // Last: reverse of Acquire().
   }
 
   // Marks the owner as dead if the specified tid is the current owner. In other
@@ -118,6 +165,9 @@
   //   appropriately.
   // - Owners can clean up after dead threads.
   aos_mutex mutex_;
+
+  // Owner's start time in ticks: set by Acquire(), reset on release or clear.
+  std::atomic<uint64_t> start_time_ticks_;
 };
 
 }  // namespace aos::ipc_lib