blob: 34c8b7a3aa80d9b823bc42cf1ecfd676c6dcfbf2 [file] [log] [blame]
Philipp Schraderab2f8432023-09-17 18:58:06 -07001#ifndef AOS_IPC_LIB_ROBUST_OWNERSHIP_TRACKER_H_
2#define AOS_IPC_LIB_ROBUST_OWNERSHIP_TRACKER_H_
3
Stephan Pleines682928d2024-05-31 20:43:48 -07004#include <assert.h>
Philipp Schraderab2f8432023-09-17 18:58:06 -07005#include <linux/futex.h>
Stephan Pleines682928d2024-05-31 20:43:48 -07006#include <stdint.h>
Philipp Schrader81fa3fb2023-09-17 18:58:35 -07007#include <sys/syscall.h>
Stephan Pleines682928d2024-05-31 20:43:48 -07008#include <unistd.h>
Philipp Schraderab2f8432023-09-17 18:58:06 -07009
Stephan Pleines682928d2024-05-31 20:43:48 -070010#include <atomic>
11#include <limits>
12#include <optional>
13#include <ostream>
Philipp Schraderab2f8432023-09-17 18:58:06 -070014#include <string>
15
Stephan Pleines682928d2024-05-31 20:43:48 -070016#include "glog/logging.h"
17
Philipp Schraderab2f8432023-09-17 18:58:06 -070018#include "aos/ipc_lib/aos_sync.h"
Philipp Schrader81fa3fb2023-09-17 18:58:35 -070019#include "aos/util/top.h"
Philipp Schraderab2f8432023-09-17 18:58:06 -070020
21namespace aos::ipc_lib {
// Forward declaration of the test fixture. RobustOwnershipTracker names this
// class as a friend, so it has to be declared before the tracker is defined.
namespace testing {
class RobustOwnershipTrackerTest;
}  // namespace testing
Philipp Schraderab2f8432023-09-17 18:58:06 -070025
26// Results of atomically loading the ownership state via RobustOwnershipTracker
27// below. This allows the state to be compared and queried later.
28class ThreadOwnerStatusSnapshot {
29 public:
30 ThreadOwnerStatusSnapshot() : futex_(0) {}
31 ThreadOwnerStatusSnapshot(aos_futex futex) : futex_(futex) {}
32 ThreadOwnerStatusSnapshot(const ThreadOwnerStatusSnapshot &) = default;
33 ThreadOwnerStatusSnapshot &operator=(const ThreadOwnerStatusSnapshot &) =
34 default;
35 ThreadOwnerStatusSnapshot(ThreadOwnerStatusSnapshot &&) = default;
36 ThreadOwnerStatusSnapshot &operator=(ThreadOwnerStatusSnapshot &&) = default;
37
38 // Returns if the owner died as noticed by the robust futex using Acquire
39 // memory ordering.
40 bool OwnerIsDead() const { return (futex_ & FUTEX_OWNER_DIED) != 0; }
41
42 // Returns true if no one has claimed ownership.
43 bool IsUnclaimed() const { return futex_ == 0; }
44
Philipp Schraderab2f8432023-09-17 18:58:06 -070045 // Returns the thread ID (a.k.a. "tid") of the owning thread. Use this when
46 // trying to access the /proc entry that corresponds to the owning thread for
47 // example. Do not use the futex value directly.
48 pid_t tid() const { return futex_ & FUTEX_TID_MASK; }
49
50 bool operator==(const ThreadOwnerStatusSnapshot &other) const {
51 return other.futex_ == futex_;
52 }
53
54 private:
55 aos_futex futex_;
56};
57
58// This object reliably tracks a thread owning a resource. A single thread may
59// possess multiple resources like senders and receivers. Each resource can have
60// its own instance of this class. These instances are responsible for
61// monitoring the thread that owns them. Each resource can use its instance of
62// this class to reliably check whether the owning thread is no longer alive.
63//
64// All methods other than Load* must be accessed under a mutex.
65class RobustOwnershipTracker {
66 public:
Philipp Schrader81fa3fb2023-09-17 18:58:35 -070067 static constexpr uint64_t kNoStartTimeTicks =
68 std::numeric_limits<uint64_t>::max();
69
70 static uint64_t ReadStartTimeTicks(pid_t tid) {
71 if (tid == 0) {
72 return kNoStartTimeTicks;
73 }
74 std::optional<aos::util::ProcStat> proc_stat = util::ReadProcStat(tid);
75 if (!proc_stat.has_value()) {
76 return kNoStartTimeTicks;
77 }
78 return proc_stat->start_time_ticks;
79 }
80
81 // Loads the realtime-compatible contents of the ownership tracker with
82 // Acquire memory ordering.
Philipp Schraderab2f8432023-09-17 18:58:06 -070083 ThreadOwnerStatusSnapshot LoadAcquire() const {
84 return ThreadOwnerStatusSnapshot(
85 __atomic_load_n(&(mutex_.futex), __ATOMIC_ACQUIRE));
86 }
87
Philipp Schrader81fa3fb2023-09-17 18:58:35 -070088 // Loads all the realtime-compatible contents of the ownership tracker with
89 // Relaxed memory order.
Philipp Schraderab2f8432023-09-17 18:58:06 -070090 ThreadOwnerStatusSnapshot LoadRelaxed() const {
91 return ThreadOwnerStatusSnapshot(
92 __atomic_load_n(&(mutex_.futex), __ATOMIC_RELAXED));
93 }
94
Philipp Schrader81fa3fb2023-09-17 18:58:35 -070095 // Checks both the robust futex and dredges through /proc to see if the thread
96 // is alive. As per the class description, this must only be called under a
97 // mutex. This must not be called in a realtime context and it is slow.
98 bool OwnerIsDefinitelyAbsolutelyDead() const {
99 auto loaded = LoadAcquire();
100 if (loaded.OwnerIsDead()) {
101 return true;
102 }
103 if (loaded.IsUnclaimed()) {
104 return false;
105 }
106 const uint64_t proc_start_time_ticks = ReadStartTimeTicks(loaded.tid());
107 if (proc_start_time_ticks == kNoStartTimeTicks) {
108 LOG(ERROR) << "Detected that PID " << loaded.tid() << " died.";
109 return true;
110 }
111
112 if (proc_start_time_ticks != start_time_ticks_) {
113 LOG(ERROR) << "Detected that PID " << loaded.tid()
114 << " died from a starttime missmatch.";
115 return true;
116 }
117 return false;
118 }
119
Philipp Schraderab2f8432023-09-17 18:58:06 -0700120 // Clears all ownership state.
121 //
122 // This should only really be called if you are 100% certain that the owner is
123 // dead. Use `LoadAquire().OwnerIsDead()` to determine this.
124 void ForceClear() {
125 // Must be opposite order of Acquire.
126 // We only deal with the futex here because we don't want to change anything
127 // about the linked list. We just want to release ownership here. We still
128 // want the kernel to know about this element via the linked list the next
129 // time someone takes ownership.
130 __atomic_store_n(&(mutex_.futex), 0, __ATOMIC_RELEASE);
Philipp Schrader81fa3fb2023-09-17 18:58:35 -0700131 start_time_ticks_ = kNoStartTimeTicks;
Philipp Schraderab2f8432023-09-17 18:58:06 -0700132 }
133
134 // Returns true if this thread holds ownership.
135 bool IsHeldBySelf() { return death_notification_is_held(&mutex_); }
136
Philipp Schrader23eabd62023-09-19 14:59:49 -0700137 // Returns true if the mutex is held by the provided tid. This is primarily
138 // intended for testing. There should be no need to call this in production
139 // code.
140 bool IsHeldBy(pid_t tid) { return LoadRelaxed().tid() == tid; }
141
Philipp Schraderab2f8432023-09-17 18:58:06 -0700142 // Acquires ownership. Other threads will know that this thread holds the
143 // ownership or be notified if this thread dies.
Philipp Schrader81fa3fb2023-09-17 18:58:35 -0700144 void Acquire() {
145 pid_t tid = syscall(SYS_gettid);
146 assert(tid > 0);
147 const uint64_t proc_start_time_ticks = ReadStartTimeTicks(tid);
148 CHECK_NE(proc_start_time_ticks, kNoStartTimeTicks);
149
150 start_time_ticks_ = proc_start_time_ticks;
151 death_notification_init(&mutex_);
152 }
Philipp Schraderab2f8432023-09-17 18:58:06 -0700153
154 // Releases ownership.
155 //
156 // This should only be called from the owning thread.
157 void Release() {
158 // Must be opposite order of Acquire.
159 death_notification_release(&mutex_);
Philipp Schrader81fa3fb2023-09-17 18:58:35 -0700160 start_time_ticks_ = kNoStartTimeTicks;
Philipp Schraderab2f8432023-09-17 18:58:06 -0700161 }
162
Philipp Schraderab2f8432023-09-17 18:58:06 -0700163 // Returns a string representing this object.
164 std::string DebugString() const;
165
166 private:
Philipp Schrader23eabd62023-09-19 14:59:49 -0700167 friend class testing::RobustOwnershipTrackerTest;
168
Philipp Schraderab2f8432023-09-17 18:58:06 -0700169 // Robust futex to track ownership the normal way. The futex is inside the
170 // mutex here. We use the wrapper mutex because the death_notification_*
171 // functions operate on that instead of the futex directly.
172 //
173 // We use a futex here because:
174 // - futexes are fast.
175 // - The kernel can atomically clean up a dead thread and mark the futex
176 // appropriately.
177 // - Owners can clean up after dead threads.
178 aos_mutex mutex_;
Philipp Schrader81fa3fb2023-09-17 18:58:35 -0700179
180 // Thread's start time ticks.
181 std::atomic<uint64_t> start_time_ticks_;
Philipp Schraderab2f8432023-09-17 18:58:06 -0700182};
183
184} // namespace aos::ipc_lib
185
186#endif // AOS_IPC_LIB_ROBUST_OWNERSHIP_TRACKER_H_