aos: Detect lockless queue owner death more reliably
When the OOM killer kills us, or the process otherwise dies
aggressively, the robust futex cleanup doesn't happen. This results in
senders, watchers, or pinners getting leaked until reboot.
Fix this by checking that the tid exists and by tracking and confirming
that its start time matches the original start time. That
should let us catch the PID collision case reliably. We only need to do
the exhaustive check when constructing the queue, so it is OK for that
check to be expensive.
Because we're changing the format in the SHMEM files here (i.e. adding
the `start_time_ticks` field) we need to bump the queue version
number.
Mostly written by Phil Schrader.
Change-Id: I6bff78b6933fed2e0163bcee26138b6a8af857ad
Co-authored-by: Austin Schuh <austin.schuh@bluerivertech.com>
Signed-off-by: James Kuszmaul <james.kuszmaul@bluerivertech.com>
diff --git a/aos/ipc_lib/lockless_queue.cc b/aos/ipc_lib/lockless_queue.cc
index 0752e2f..f033e60 100644
--- a/aos/ipc_lib/lockless_queue.cc
+++ b/aos/ipc_lib/lockless_queue.cc
@@ -172,7 +172,7 @@
size_t valid_senders = 0;
for (size_t i = 0; i < num_senders; ++i) {
Sender *sender = memory->GetSender(i);
- if (!sender->ownership_tracker.LoadAcquire().OwnerIsDead()) {
+ if (!sender->ownership_tracker.OwnerIsDefinitelyAbsolutelyDead()) {
// Not dead.
++valid_senders;
continue;
@@ -257,7 +257,7 @@
// read it before it's set.
for (size_t i = 0; i < num_pinners; ++i) {
Pinner *const pinner = memory->GetPinner(i);
- if (!pinner->ownership_tracker.LoadAcquire().OwnerIsDead()) {
+ if (!pinner->ownership_tracker.OwnerIsDefinitelyAbsolutelyDead()) {
continue;
}
pinner->pinned.Invalidate();
@@ -280,7 +280,7 @@
num_missing = 0;
for (size_t i = 0; i < num_senders; ++i) {
Sender *const sender = memory->GetSender(i);
- if (sender->ownership_tracker.LoadAcquire().OwnerIsDead()) {
+ if (sender->ownership_tracker.OwnerIsDefinitelyAbsolutelyDead()) {
if (!need_recovery[i]) {
return false;
}
@@ -325,7 +325,7 @@
const size_t starting_num_missing = num_missing;
for (size_t i = 0; i < num_senders; ++i) {
Sender *sender = memory->GetSender(i);
- if (!sender->ownership_tracker.LoadAcquire().OwnerIsDead()) {
+ if (!sender->ownership_tracker.OwnerIsDefinitelyAbsolutelyDead()) {
CHECK(!need_recovery[i]) << ": Somebody else recovered a sender: " << i;
continue;
}
@@ -733,7 +733,8 @@
// it needs to happen-after whatever that process did before dying.
auto *const ownership_tracker =
&(memory_->GetWatcher(i)->ownership_tracker);
- if (ownership_tracker->LoadAcquire().IsUnclaimedOrOwnerIsDead()) {
+ if (ownership_tracker->LoadAcquire().IsUnclaimed() ||
+ ownership_tracker->OwnerIsDefinitelyAbsolutelyDead()) {
watcher_index_ = i;
// Relaxed is OK here because we're the only task going to touch it
// between here and the write in death_notification_init below (other