blob: bc2db172dfed5c2be007f378248bd22d3fd1b8a1 [file] [log] [blame]
#ifndef AOS_IPC_LIB_ROBUST_OWNERSHIP_TRACKER_H_
#define AOS_IPC_LIB_ROBUST_OWNERSHIP_TRACKER_H_
#include <assert.h>
#include <linux/futex.h>
#include <stdint.h>
#include <sys/syscall.h>
#include <unistd.h>
#include <atomic>
#include <limits>
#include <optional>
#include <ostream>
#include <string>
#include "absl/log/check.h"
#include "absl/log/log.h"
#include "aos/ipc_lib/aos_sync.h"
#include "aos/util/top.h"
namespace aos::ipc_lib {
// Forward declaration of the test fixture. RobustOwnershipTracker below names
// this class in a friend declaration, so the name has to be visible here.
namespace testing {
class RobustOwnershipTrackerTest;
} // namespace testing
// Results of atomically loading the ownership state via RobustOwnershipTracker
// below. This allows the state to be compared and queried later.
class ThreadOwnerStatusSnapshot {
public:
ThreadOwnerStatusSnapshot() : futex_(0) {}
ThreadOwnerStatusSnapshot(aos_futex futex) : futex_(futex) {}
ThreadOwnerStatusSnapshot(const ThreadOwnerStatusSnapshot &) = default;
ThreadOwnerStatusSnapshot &operator=(const ThreadOwnerStatusSnapshot &) =
default;
ThreadOwnerStatusSnapshot(ThreadOwnerStatusSnapshot &&) = default;
ThreadOwnerStatusSnapshot &operator=(ThreadOwnerStatusSnapshot &&) = default;
// Returns if the owner died as noticed by the robust futex using Acquire
// memory ordering.
bool OwnerIsDead() const { return (futex_ & FUTEX_OWNER_DIED) != 0; }
// Returns true if no one has claimed ownership.
bool IsUnclaimed() const { return futex_ == 0; }
// Returns the thread ID (a.k.a. "tid") of the owning thread. Use this when
// trying to access the /proc entry that corresponds to the owning thread for
// example. Do not use the futex value directly.
pid_t tid() const { return futex_ & FUTEX_TID_MASK; }
bool operator==(const ThreadOwnerStatusSnapshot &other) const {
return other.futex_ == futex_;
}
private:
aos_futex futex_;
};
// This object reliably tracks a thread owning a resource. A single thread may
// possess multiple resources like senders and receivers. Each resource can have
// its own instance of this class. These instances are responsible for
// monitoring the thread that owns them. Each resource can use its instance of
// this class to reliably check whether the owning thread is no longer alive.
//
// All methods other than Load* must be accessed under a mutex.
class RobustOwnershipTracker {
public:
static constexpr uint64_t kNoStartTimeTicks =
std::numeric_limits<uint64_t>::max();
static uint64_t ReadStartTimeTicks(pid_t tid) {
if (tid == 0) {
return kNoStartTimeTicks;
}
std::optional<aos::util::ProcStat> proc_stat = util::ReadProcStat(tid);
if (!proc_stat.has_value()) {
return kNoStartTimeTicks;
}
return proc_stat->start_time_ticks;
}
// Loads the realtime-compatible contents of the ownership tracker with
// Acquire memory ordering.
ThreadOwnerStatusSnapshot LoadAcquire() const {
return ThreadOwnerStatusSnapshot(
__atomic_load_n(&(mutex_.futex), __ATOMIC_ACQUIRE));
}
// Loads all the realtime-compatible contents of the ownership tracker with
// Relaxed memory order.
ThreadOwnerStatusSnapshot LoadRelaxed() const {
return ThreadOwnerStatusSnapshot(
__atomic_load_n(&(mutex_.futex), __ATOMIC_RELAXED));
}
// Checks both the robust futex and dredges through /proc to see if the thread
// is alive. As per the class description, this must only be called under a
// mutex. This must not be called in a realtime context and it is slow.
bool OwnerIsDefinitelyAbsolutelyDead() const {
auto loaded = LoadAcquire();
if (loaded.OwnerIsDead()) {
return true;
}
if (loaded.IsUnclaimed()) {
return false;
}
const uint64_t proc_start_time_ticks = ReadStartTimeTicks(loaded.tid());
if (proc_start_time_ticks == kNoStartTimeTicks) {
LOG(ERROR) << "Detected that PID " << loaded.tid() << " died.";
return true;
}
if (proc_start_time_ticks != start_time_ticks_) {
LOG(ERROR) << "Detected that PID " << loaded.tid()
<< " died from a starttime missmatch.";
return true;
}
return false;
}
// Clears all ownership state.
//
// This should only really be called if you are 100% certain that the owner is
// dead. Use `LoadAquire().OwnerIsDead()` to determine this.
void ForceClear() {
// Must be opposite order of Acquire.
// We only deal with the futex here because we don't want to change anything
// about the linked list. We just want to release ownership here. We still
// want the kernel to know about this element via the linked list the next
// time someone takes ownership.
__atomic_store_n(&(mutex_.futex), 0, __ATOMIC_RELEASE);
start_time_ticks_ = kNoStartTimeTicks;
}
// Returns true if this thread holds ownership.
bool IsHeldBySelf() { return death_notification_is_held(&mutex_); }
// Returns true if the mutex is held by the provided tid. This is primarily
// intended for testing. There should be no need to call this in production
// code.
bool IsHeldBy(pid_t tid) { return LoadRelaxed().tid() == tid; }
// Acquires ownership. Other threads will know that this thread holds the
// ownership or be notified if this thread dies.
void Acquire() {
pid_t tid = syscall(SYS_gettid);
assert(tid > 0);
const uint64_t proc_start_time_ticks = ReadStartTimeTicks(tid);
CHECK_NE(proc_start_time_ticks, kNoStartTimeTicks);
start_time_ticks_ = proc_start_time_ticks;
death_notification_init(&mutex_);
}
// Releases ownership.
//
// This should only be called from the owning thread.
void Release() {
// Must be opposite order of Acquire.
death_notification_release(&mutex_);
start_time_ticks_ = kNoStartTimeTicks;
}
// Returns a string representing this object.
std::string DebugString() const;
private:
friend class testing::RobustOwnershipTrackerTest;
// Robust futex to track ownership the normal way. The futex is inside the
// mutex here. We use the wrapper mutex because the death_notification_*
// functions operate on that instead of the futex directly.
//
// We use a futex here because:
// - futexes are fast.
// - The kernel can atomically clean up a dead thread and mark the futex
// appropriately.
// - Owners can clean up after dead threads.
aos_mutex mutex_;
// Thread's start time ticks.
std::atomic<uint64_t> start_time_ticks_;
};
} // namespace aos::ipc_lib
#endif // AOS_IPC_LIB_ROBUST_OWNERSHIP_TRACKER_H_