Implement robust mutex support

This allows making everything in shared memory robust to
processes dying at any point in time (not actually done yet in this
commit).

This includes using FUTEX_REQUEUE_PI, but currently only on ARM because
that's the only place we can rely on the kernel not corrupting random
memory due to a bug (fix has been merged upstream).

Change-Id: Id5bda1dc3185a1aac759510934bce6fd9121ad3f
diff --git a/aos/common/BUILD b/aos/common/BUILD
index 9bcf811..658a2e6 100644
--- a/aos/common/BUILD
+++ b/aos/common/BUILD
@@ -266,6 +266,7 @@
     '//aos/common/util:thread',
     '//aos/common:time',
     '//aos/testing:test_logging',
+    '//aos/testing:test_shm',
   ],
 )
 
diff --git a/aos/common/condition.h b/aos/common/condition.h
index eccc0da..84f0542 100644
--- a/aos/common/condition.h
+++ b/aos/common/condition.h
@@ -54,7 +54,7 @@
   // and will be locked when this method returns.
   // NOTE: The relocking of the mutex is not performed atomically with waking
   // up.
-  // Returns false.
+  // Returns true if the previous owner of the mutex died before we relocked it.
   bool Wait() __attribute__((warn_unused_result));
 
   // Signals approximately 1 other process currently Wait()ing on this condition
diff --git a/aos/common/condition_test.cc b/aos/common/condition_test.cc
index dbaa546..0e7bfe4 100644
--- a/aos/common/condition_test.cc
+++ b/aos/common/condition_test.cc
@@ -5,6 +5,7 @@
 #include <sys/wait.h>
 
 #include <atomic>
+#include <thread>
 
 #include "gtest/gtest.h"
 
@@ -75,6 +76,22 @@
   EXPECT_TRUE(child_finished.load());
 }
 
+// Tests that contention on the associated mutex doesn't break anything.
+// This seems likely to cause issues with AddressSanitizer in particular.
+TEST_F(SimpleConditionTest, MutexContention) {
+  for (int i = 0; i < 1000; ++i) {
+    ASSERT_FALSE(mutex_.Lock());
+    ::std::thread thread([this]() {
+      ASSERT_FALSE(mutex_.Lock());
+      condition_.Signal();
+      mutex_.Unlock();
+    });
+    ASSERT_FALSE(condition_.Wait());
+    mutex_.Unlock();
+    thread.join();
+  }
+}
+
 class ConditionTest : public ConditionTestCommon {
  public:
   struct Shared {
@@ -307,7 +324,8 @@
   Settle();
   shared_->condition.Signal();
   EXPECT_FALSE(child.Hung());
-  EXPECT_EQ(Mutex::State::kLockFailed, shared_->mutex.TryLock());
+  EXPECT_EQ(Mutex::State::kOwnerDied, shared_->mutex.TryLock());
+  shared_->mutex.Unlock();
 }
 
 // Tests that Signal() stops exactly 1 Wait()er.
diff --git a/aos/common/mutex.cc b/aos/common/mutex.cc
index c5b8652..eed5b0b 100644
--- a/aos/common/mutex.cc
+++ b/aos/common/mutex.cc
@@ -28,6 +28,8 @@
   const int ret = mutex_grab(&impl_);
   if (ret == 0) {
     return false;
+  } else if (ret == 1) {
+    return true;
   } else {
     LOG(FATAL, "mutex_grab(%p(=%" PRIu32 ")) failed with %d\n",
         &impl_, impl_.futex, ret);
@@ -43,6 +45,8 @@
   switch (ret) {
     case 0:
       return State::kLocked;
+    case 1:
+      return State::kOwnerDied;
     case 4:
       return State::kLockFailed;
     default:
diff --git a/aos/common/mutex.h b/aos/common/mutex.h
index ce10a08..cc180f5 100644
--- a/aos/common/mutex.h
+++ b/aos/common/mutex.h
@@ -25,7 +25,9 @@
     // The mutex was acquired successfully.
     kLocked,
     // TryLock tried to grab the mutex and failed.
-    kLockFailed
+    kLockFailed,
+    // The previous owner of the mutex died.
+    kOwnerDied,
   };
 
   // Creates an unlocked mutex.
@@ -37,7 +39,7 @@
   ~Mutex();
 
   // Locks the mutex. If it fails, it calls LOG(FATAL).
-  // Returns false.
+  // Returns true if the previous owner died instead of unlocking nicely.
   bool Lock() __attribute__((warn_unused_result));
   // Unlocks the mutex. Fails like Lock.
   // Multiple unlocking is undefined.
diff --git a/aos/common/mutex_test.cc b/aos/common/mutex_test.cc
index af80bd1..8cc4e73 100644
--- a/aos/common/mutex_test.cc
+++ b/aos/common/mutex_test.cc
@@ -14,6 +14,7 @@
 #include "aos/common/util/thread.h"
 #include "aos/common/time.h"
 #include "aos/testing/test_logging.h"
+#include "aos/testing/test_shm.h"
 #include "aos/linux_code/ipc_lib/core_lib.h"
 
 namespace aos {
@@ -84,7 +85,7 @@
       ".*multiple unlock.*");
 }
 
-// Sees what happens with multiple locks.
+// Tests that locking a mutex multiple times from the same thread fails nicely.
 TEST_F(MutexDeathTest, RepeatLock) {
   EXPECT_DEATH(
       {
@@ -95,6 +96,7 @@
       ".*multiple lock.*");
 }
 
+// Tests that destroying a locked mutex fails nicely.
 TEST_F(MutexDeathTest, DestroyLocked) {
   EXPECT_DEATH(
       {
@@ -105,6 +107,64 @@
       ".*destroying locked mutex.*");
 }
 
+// Tests that Lock behaves correctly when the previous owner exits with the lock
+// held (which is the same as dying any other way).
+TEST_F(MutexTest, OwnerDiedDeathLock) {
+  testing::TestSharedMemory my_shm;
+  Mutex *mutex =
+      static_cast<Mutex *>(shm_malloc_aligned(sizeof(Mutex), alignof(Mutex)));
+  new (mutex) Mutex();
+
+  util::FunctionThread::RunInOtherThread([&]() {
+    ASSERT_FALSE(mutex->Lock());
+  });
+  EXPECT_TRUE(mutex->Lock());
+
+  mutex->Unlock();
+  mutex->~Mutex();
+}
+
+// Tests that TryLock behaves correctly when the previous owner dies.
+TEST_F(MutexTest, OwnerDiedDeathTryLock) {
+  testing::TestSharedMemory my_shm;
+  Mutex *mutex =
+      static_cast<Mutex *>(shm_malloc_aligned(sizeof(Mutex), alignof(Mutex)));
+  new (mutex) Mutex();
+
+  util::FunctionThread::RunInOtherThread([&]() {
+    ASSERT_FALSE(mutex->Lock());
+  });
+  EXPECT_EQ(Mutex::State::kOwnerDied, mutex->TryLock());
+
+  mutex->Unlock();
+  mutex->~Mutex();
+}
+
+// TODO(brians): Test owner dying by being SIGKILLed and SIGTERMed.
+
+// This sequence of mutex operations used to mess up the robust list and cause
+// one of the mutexes to not get owner-died like it should.
+TEST_F(MutexTest, DontCorruptRobustList) {
+  // I think this was the allocator lock in the original failure.
+  Mutex mutex1;
+  // This one should get owner-died afterwards (iff the kernel accepts the
+  // robust list and uses it). I think it was the task_death_notification lock
+  // in the original failure.
+  Mutex mutex2;
+
+  util::FunctionThread::RunInOtherThread([&]() {
+    ASSERT_FALSE(mutex1.Lock());
+    ASSERT_FALSE(mutex2.Lock());
+    mutex1.Unlock();
+  });
+
+  EXPECT_EQ(Mutex::State::kLocked, mutex1.TryLock());
+  EXPECT_EQ(Mutex::State::kOwnerDied, mutex2.TryLock());
+
+  mutex1.Unlock();
+  mutex2.Unlock();
+}
+
 namespace {
 
 class AdderThread : public ::aos::util::Thread {
@@ -115,6 +175,8 @@
         mutex_(mutex),
         sleep_before_time_(sleep_before_time),
         sleep_after_time_(sleep_after_time) {}
+
+ private:
   virtual void Run() override {
     ::aos::time::SleepFor(sleep_before_time_);
     MutexLocker locker(mutex_);
@@ -122,7 +184,6 @@
     ::aos::time::SleepFor(sleep_after_time_);
   }
 
- private:
   int *const counter_;
   Mutex *const mutex_;
   const ::aos::time::Time sleep_before_time_, sleep_after_time_;
@@ -150,20 +211,23 @@
 
 // Verifiers that ThreadSanitizer understands how a mutex works.
 // For some reason this used to fail when the other tests didn't...
+// The loops make it fail more reliably when it's going to.
 TEST_F(MutexTest, ThreadSanitizerMutexLocker) {
-  int counter = 0;
-  ::std::thread thread([&counter, this]() {
-    for (int i = 0; i < 1000; ++i) {
+  for (int i = 0; i < 100; ++i) {
+    int counter = 0;
+    ::std::thread thread([&counter, this]() {
+      for (int i = 0; i < 300; ++i) {
+        MutexLocker locker(&test_mutex_);
+        ++counter;
+      }
+    });
+    for (int i = 0; i < 300; ++i) {
       MutexLocker locker(&test_mutex_);
-      ++counter;
+      --counter;
     }
-  });
-  for (int i = 0; i < 1000; ++i) {
-    MutexLocker locker(&test_mutex_);
-    --counter;
+    thread.join();
+    EXPECT_EQ(0, counter);
   }
-  thread.join();
-  EXPECT_EQ(0, counter);
 }
 
 // Verifies that ThreadSanitizer understands that an uncontended mutex
@@ -171,9 +235,9 @@
 TEST_F(MutexTest, ThreadSanitizerUncontended) {
   int counter = 0;
   AdderThread threads[2]{
-      {&counter, &test_mutex_, ::aos::time::Time::InSeconds(0.2),
-       ::aos::time::Time::InSeconds(0)},
       {&counter, &test_mutex_, ::aos::time::Time::InSeconds(0),
+       ::aos::time::Time::InSeconds(0)},
+      {&counter, &test_mutex_, ::aos::time::Time::InSeconds(0.2),
        ::aos::time::Time::InSeconds(0)}, };
   for (auto &c : threads) {
     c.Start();
@@ -222,6 +286,26 @@
   test_mutex_.Unlock();
 }
 
+// Tests that MutexLocker behaves correctly when the previous owner dies.
+TEST_F(MutexLockerDeathTest, OwnerDied) {
+  testing::TestSharedMemory my_shm;
+  Mutex *mutex =
+      static_cast<Mutex *>(shm_malloc_aligned(sizeof(Mutex), alignof(Mutex)));
+  new (mutex) Mutex();
+
+  util::FunctionThread::RunInOtherThread([&]() {
+    ASSERT_FALSE(mutex->Lock());
+  });
+  EXPECT_DEATH(
+      {
+        logging::AddImplementation(new util::DeathTestLogImplementation());
+        MutexLocker locker(mutex);
+      },
+      ".*previous owner of mutex [^ ]+ died.*");
+
+  mutex->~Mutex();
+}
+
 TEST_F(IPCMutexLockerTest, Basic) {
   {
     aos::IPCMutexLocker locker(&test_mutex_);
@@ -269,5 +353,26 @@
   test_mutex_.Unlock();
 }
 
+// Tests that IPCMutexLocker behaves correctly when the previous owner dies.
+TEST_F(IPCMutexLockerTest, OwnerDied) {
+  testing::TestSharedMemory my_shm;
+  Mutex *mutex =
+      static_cast<Mutex *>(shm_malloc_aligned(sizeof(Mutex), alignof(Mutex)));
+  new (mutex) Mutex();
+
+  util::FunctionThread::RunInOtherThread([&]() {
+    ASSERT_FALSE(mutex->Lock());
+  });
+  {
+    aos::IPCMutexLocker locker(mutex);
+    EXPECT_EQ(Mutex::State::kLockFailed, mutex->TryLock());
+    EXPECT_TRUE(locker.owner_died());
+  }
+  EXPECT_EQ(Mutex::State::kLocked, mutex->TryLock());
+
+  mutex->Unlock();
+  mutex->~Mutex();
+}
+
 }  // namespace testing
 }  // namespace aos