blob: a8bbca3552edb82cfe3da1ea93d14ec5d66e1ec0 [file] [log] [blame]
Philipp Schraderab2f8432023-09-17 18:58:06 -07001#ifndef AOS_IPC_LIB_ROBUST_OWNERSHIP_TRACKER_H_
2#define AOS_IPC_LIB_ROBUST_OWNERSHIP_TRACKER_H_
3
4#include <linux/futex.h>
Philipp Schrader81fa3fb2023-09-17 18:58:35 -07005#include <sys/syscall.h>
Philipp Schraderab2f8432023-09-17 18:58:06 -07006
7#include <string>
8
9#include "aos/ipc_lib/aos_sync.h"
Philipp Schrader81fa3fb2023-09-17 18:58:35 -070010#include "aos/util/top.h"
Philipp Schraderab2f8432023-09-17 18:58:06 -070011
12namespace aos::ipc_lib {
Philipp Schrader23eabd62023-09-19 14:59:49 -070013namespace testing {
14class RobustOwnershipTrackerTest;
15} // namespace testing
Philipp Schraderab2f8432023-09-17 18:58:06 -070016
17// Results of atomically loading the ownership state via RobustOwnershipTracker
18// below. This allows the state to be compared and queried later.
19class ThreadOwnerStatusSnapshot {
20 public:
21 ThreadOwnerStatusSnapshot() : futex_(0) {}
22 ThreadOwnerStatusSnapshot(aos_futex futex) : futex_(futex) {}
23 ThreadOwnerStatusSnapshot(const ThreadOwnerStatusSnapshot &) = default;
24 ThreadOwnerStatusSnapshot &operator=(const ThreadOwnerStatusSnapshot &) =
25 default;
26 ThreadOwnerStatusSnapshot(ThreadOwnerStatusSnapshot &&) = default;
27 ThreadOwnerStatusSnapshot &operator=(ThreadOwnerStatusSnapshot &&) = default;
28
29 // Returns if the owner died as noticed by the robust futex using Acquire
30 // memory ordering.
31 bool OwnerIsDead() const { return (futex_ & FUTEX_OWNER_DIED) != 0; }
32
33 // Returns true if no one has claimed ownership.
34 bool IsUnclaimed() const { return futex_ == 0; }
35
Philipp Schraderab2f8432023-09-17 18:58:06 -070036 // Returns the thread ID (a.k.a. "tid") of the owning thread. Use this when
37 // trying to access the /proc entry that corresponds to the owning thread for
38 // example. Do not use the futex value directly.
39 pid_t tid() const { return futex_ & FUTEX_TID_MASK; }
40
41 bool operator==(const ThreadOwnerStatusSnapshot &other) const {
42 return other.futex_ == futex_;
43 }
44
45 private:
46 aos_futex futex_;
47};
48
49// This object reliably tracks a thread owning a resource. A single thread may
50// possess multiple resources like senders and receivers. Each resource can have
51// its own instance of this class. These instances are responsible for
52// monitoring the thread that owns them. Each resource can use its instance of
53// this class to reliably check whether the owning thread is no longer alive.
54//
55// All methods other than Load* must be accessed under a mutex.
56class RobustOwnershipTracker {
57 public:
Philipp Schrader81fa3fb2023-09-17 18:58:35 -070058 static constexpr uint64_t kNoStartTimeTicks =
59 std::numeric_limits<uint64_t>::max();
60
61 static uint64_t ReadStartTimeTicks(pid_t tid) {
62 if (tid == 0) {
63 return kNoStartTimeTicks;
64 }
65 std::optional<aos::util::ProcStat> proc_stat = util::ReadProcStat(tid);
66 if (!proc_stat.has_value()) {
67 return kNoStartTimeTicks;
68 }
69 return proc_stat->start_time_ticks;
70 }
71
72 // Loads the realtime-compatible contents of the ownership tracker with
73 // Acquire memory ordering.
Philipp Schraderab2f8432023-09-17 18:58:06 -070074 ThreadOwnerStatusSnapshot LoadAcquire() const {
75 return ThreadOwnerStatusSnapshot(
76 __atomic_load_n(&(mutex_.futex), __ATOMIC_ACQUIRE));
77 }
78
Philipp Schrader81fa3fb2023-09-17 18:58:35 -070079 // Loads all the realtime-compatible contents of the ownership tracker with
80 // Relaxed memory order.
Philipp Schraderab2f8432023-09-17 18:58:06 -070081 ThreadOwnerStatusSnapshot LoadRelaxed() const {
82 return ThreadOwnerStatusSnapshot(
83 __atomic_load_n(&(mutex_.futex), __ATOMIC_RELAXED));
84 }
85
Philipp Schrader81fa3fb2023-09-17 18:58:35 -070086 // Checks both the robust futex and dredges through /proc to see if the thread
87 // is alive. As per the class description, this must only be called under a
88 // mutex. This must not be called in a realtime context and it is slow.
89 bool OwnerIsDefinitelyAbsolutelyDead() const {
90 auto loaded = LoadAcquire();
91 if (loaded.OwnerIsDead()) {
92 return true;
93 }
94 if (loaded.IsUnclaimed()) {
95 return false;
96 }
97 const uint64_t proc_start_time_ticks = ReadStartTimeTicks(loaded.tid());
98 if (proc_start_time_ticks == kNoStartTimeTicks) {
99 LOG(ERROR) << "Detected that PID " << loaded.tid() << " died.";
100 return true;
101 }
102
103 if (proc_start_time_ticks != start_time_ticks_) {
104 LOG(ERROR) << "Detected that PID " << loaded.tid()
105 << " died from a starttime missmatch.";
106 return true;
107 }
108 return false;
109 }
110
Philipp Schraderab2f8432023-09-17 18:58:06 -0700111 // Clears all ownership state.
112 //
113 // This should only really be called if you are 100% certain that the owner is
114 // dead. Use `LoadAquire().OwnerIsDead()` to determine this.
115 void ForceClear() {
116 // Must be opposite order of Acquire.
117 // We only deal with the futex here because we don't want to change anything
118 // about the linked list. We just want to release ownership here. We still
119 // want the kernel to know about this element via the linked list the next
120 // time someone takes ownership.
121 __atomic_store_n(&(mutex_.futex), 0, __ATOMIC_RELEASE);
Philipp Schrader81fa3fb2023-09-17 18:58:35 -0700122 start_time_ticks_ = kNoStartTimeTicks;
Philipp Schraderab2f8432023-09-17 18:58:06 -0700123 }
124
125 // Returns true if this thread holds ownership.
126 bool IsHeldBySelf() { return death_notification_is_held(&mutex_); }
127
Philipp Schrader23eabd62023-09-19 14:59:49 -0700128 // Returns true if the mutex is held by the provided tid. This is primarily
129 // intended for testing. There should be no need to call this in production
130 // code.
131 bool IsHeldBy(pid_t tid) { return LoadRelaxed().tid() == tid; }
132
Philipp Schraderab2f8432023-09-17 18:58:06 -0700133 // Acquires ownership. Other threads will know that this thread holds the
134 // ownership or be notified if this thread dies.
Philipp Schrader81fa3fb2023-09-17 18:58:35 -0700135 void Acquire() {
136 pid_t tid = syscall(SYS_gettid);
137 assert(tid > 0);
138 const uint64_t proc_start_time_ticks = ReadStartTimeTicks(tid);
139 CHECK_NE(proc_start_time_ticks, kNoStartTimeTicks);
140
141 start_time_ticks_ = proc_start_time_ticks;
142 death_notification_init(&mutex_);
143 }
Philipp Schraderab2f8432023-09-17 18:58:06 -0700144
145 // Releases ownership.
146 //
147 // This should only be called from the owning thread.
148 void Release() {
149 // Must be opposite order of Acquire.
150 death_notification_release(&mutex_);
Philipp Schrader81fa3fb2023-09-17 18:58:35 -0700151 start_time_ticks_ = kNoStartTimeTicks;
Philipp Schraderab2f8432023-09-17 18:58:06 -0700152 }
153
Philipp Schraderab2f8432023-09-17 18:58:06 -0700154 // Returns a string representing this object.
155 std::string DebugString() const;
156
157 private:
Philipp Schrader23eabd62023-09-19 14:59:49 -0700158 friend class testing::RobustOwnershipTrackerTest;
159
Philipp Schraderab2f8432023-09-17 18:58:06 -0700160 // Robust futex to track ownership the normal way. The futex is inside the
161 // mutex here. We use the wrapper mutex because the death_notification_*
162 // functions operate on that instead of the futex directly.
163 //
164 // We use a futex here because:
165 // - futexes are fast.
166 // - The kernel can atomically clean up a dead thread and mark the futex
167 // appropriately.
168 // - Owners can clean up after dead threads.
169 aos_mutex mutex_;
Philipp Schrader81fa3fb2023-09-17 18:58:35 -0700170
171 // Thread's start time ticks.
172 std::atomic<uint64_t> start_time_ticks_;
Philipp Schraderab2f8432023-09-17 18:58:06 -0700173};
174
175} // namespace aos::ipc_lib
176
177#endif // AOS_IPC_LIB_ROBUST_OWNERSHIP_TRACKER_H_