Add (default off) support for dying on malloc in RT code

Our malloc is not realtime.  We shouldn't be using it in RT code.  To
enforce this, add hooks into tcmalloc which RAW_LOG(FATAL) whenever an
application tries to allocate (or free) memory inside code running at
the RT priority level.

Our drivetrain code still allocates memory while realtime, which
prevents us from enabling this check by default yet.

Change-Id: I7679bb11fc9ef0cc676c77f5ef7b041427e1f32a
diff --git a/aos/events/shm_event_loop_test.cc b/aos/events/shm_event_loop_test.cc
index 2aeefb4..8e7ad93 100644
--- a/aos/events/shm_event_loop_test.cc
+++ b/aos/events/shm_event_loop_test.cc
@@ -3,6 +3,7 @@
 #include <string_view>
 
 #include "aos/events/event_loop_param_test.h"
+#include "aos/realtime.h"
 #include "glog/logging.h"
 #include "gtest/gtest.h"
 
@@ -95,8 +96,22 @@
   int scheduler;
   PCHECK((scheduler = sched_getscheduler(0)) != -1);
 
-  LOG(INFO) << "scheduler is " << scheduler;
-  return scheduler == SCHED_FIFO || scheduler == SCHED_RR;
+  {
+    // If we are RT, logging the scheduler will crash us.  Mark that we just
+    // don't care.
+    aos::ScopedNotRealtime nrt;
+    LOG(INFO) << "scheduler is " << scheduler;
+  }
+
+  const bool result = scheduler == SCHED_FIFO || scheduler == SCHED_RR;
+  // Confirm that the scheduler matches AOS's notion of whether we are
+  // realtime or not.
+  if (result) {
+    aos::CheckRealtime();
+  } else {
+    aos::CheckNotRealtime();
+  }
+  return result;
 }
 
 class ShmEventLoopTest : public ::testing::TestWithParam<ReadMethod> {
diff --git a/aos/events/simulated_event_loop.cc b/aos/events/simulated_event_loop.cc
index d86d86c..bc15643 100644
--- a/aos/events/simulated_event_loop.cc
+++ b/aos/events/simulated_event_loop.cc
@@ -245,6 +245,9 @@
 
 std::shared_ptr<SimulatedMessage> SimulatedMessage::Make(
     SimulatedChannel *channel) {
+  // The allocations in here are due to infrastructure and don't count against
+  // the rule of no mallocs in RT code.
+  ScopedNotRealtime nrt;
   const size_t size = channel->max_size();
   SimulatedMessage *const message = reinterpret_cast<SimulatedMessage *>(
       malloc(sizeof(SimulatedMessage) + size + kChannelDataAlignment - 1));
@@ -288,6 +291,9 @@
               aos::monotonic_clock::time_point monotonic_remote_time,
               aos::realtime_clock::time_point realtime_remote_time,
               uint32_t remote_queue_index) override {
+    // The allocations in here are due to infrastructure and don't count
+    // against the rule of no mallocs in RT code.
+    ScopedNotRealtime nrt;
     CHECK_LE(length, size()) << ": Attempting to send too big a message.";
     message_->context.monotonic_event_time = event_loop_->monotonic_now();
     message_->context.monotonic_remote_time = monotonic_remote_time;
@@ -353,6 +359,9 @@
   ~SimulatedFetcher() { simulated_channel_->UnregisterFetcher(this); }
 
   std::pair<bool, monotonic_clock::time_point> DoFetchNext() override {
+    // The allocations in here are due to infrastructure and don't count
+    // against the rule of no mallocs in RT code.
+    ScopedNotRealtime nrt;
     if (msgs_.size() == 0) {
       return std::make_pair(false, monotonic_clock::min_time);
     }
@@ -367,6 +376,9 @@
   }
 
   std::pair<bool, monotonic_clock::time_point> DoFetch() override {
+    // The allocations in here are due to infrastructure and don't count
+    // against the rule of no mallocs in RT code.
+    ScopedNotRealtime nrt;
     if (msgs_.size() == 0) {
       // TODO(austin): Can we just do this logic unconditionally?  It is a lot
       // simpler.  And call clear, obviously.
@@ -837,6 +849,9 @@
 
 void SimulatedTimerHandler::Setup(monotonic_clock::time_point base,
                                   monotonic_clock::duration repeat_offset) {
+  // The allocations in here are due to infrastructure and don't count against
+  // the rule of no mallocs in RT code.
+  ScopedNotRealtime nrt;
   Disable();
   const ::aos::monotonic_clock::time_point monotonic_now =
       simulated_event_loop_->monotonic_now();
@@ -925,6 +940,9 @@
 
 void SimulatedPhasedLoopHandler::Schedule(
     monotonic_clock::time_point sleep_time) {
+  // The allocations in here are due to infrastructure and don't count against
+  // the rule of no mallocs in RT code.
+  ScopedNotRealtime nrt;
   if (token_ != scheduler_->InvalidToken()) {
     scheduler_->Deschedule(token_);
     token_ = scheduler_->InvalidToken();
diff --git a/aos/init.cc b/aos/init.cc
index 51e22cf..71eeabd 100644
--- a/aos/init.cc
+++ b/aos/init.cc
@@ -39,6 +39,7 @@
 // Common stuff that needs to happen at the beginning of both the realtime and
 // non-realtime initialization sequences. May be called twice.
 void InitStart() {
+  RegisterMallocHook();
   if (FLAGS_coredump) {
     WriteCoreDumps();
   }
@@ -58,6 +59,8 @@
   google::InitGoogleLogging((*argv)[0]);
   gflags::ParseCommandLineFlags(argc, argv, true);
   google::InstallFailureSignalHandler();
+
+  RegisterMallocHook();
 }
 
 void InitNRT() {
diff --git a/aos/realtime.cc b/aos/realtime.cc
index ef7c9d7..a9628ee 100644
--- a/aos/realtime.cc
+++ b/aos/realtime.cc
@@ -15,13 +15,27 @@
 
 #include "aos/thread_local.h"
 #include "glog/logging.h"
+#include "glog/raw_logging.h"
 
+DEFINE_bool(
+    die_on_malloc, false,
+    "If true, die when the application allocates memory in a RT section.");
 DEFINE_bool(skip_realtime_scheduler, false,
             "If true, skip changing the scheduler.  Pretend that we changed "
             "the scheduler instead.");
 DEFINE_bool(skip_locking_memory, false,
             "If true, skip locking memory.  Pretend that we did it instead.");
 
+extern "C" {
+typedef void (*MallocHook_NewHook)(const void* ptr, size_t size);
+int MallocHook_AddNewHook(MallocHook_NewHook hook) __attribute__((weak));
+int MallocHook_RemoveNewHook(MallocHook_NewHook hook) __attribute__((weak));
+
+typedef void (*MallocHook_DeleteHook)(const void* ptr);
+int MallocHook_AddDeleteHook(MallocHook_DeleteHook hook) __attribute__((weak));
+int MallocHook_RemoveDeleteHook(MallocHook_DeleteHook hook) __attribute__((weak));
+}   // extern "C"
+
 namespace FLAG__namespace_do_not_use_directly_use_DECLARE_double_instead {
 extern double FLAGS_tcmalloc_release_rate __attribute__((weak));
 }
@@ -208,4 +222,35 @@
 
 ScopedRealtimeRestorer::ScopedRealtimeRestorer() : prior_(is_realtime) {}
 
+void NewHook(const void *ptr, size_t size) {
+  if (is_realtime) {
+    is_realtime = false;
+    RAW_LOG(FATAL, "Malloced %p -> %zu bytes", ptr, size);
+  }
+}
+
+void DeleteHook(const void *ptr) {
+  if (is_realtime) {
+    is_realtime = false;
+    RAW_LOG(FATAL, "Delete Hook %p", ptr);
+  }
+}
+
+void RegisterMallocHook() {
+  if (FLAGS_die_on_malloc) {
+    if (&MallocHook_AddNewHook != nullptr) {
+      CHECK(MallocHook_AddNewHook(&NewHook));
+    } else {
+      LOG(FATAL) << "Failed to register required malloc hooks, disable "
+                    "--die_on_malloc to continue.";
+    }
+    if (&MallocHook_AddDeleteHook != nullptr) {
+      CHECK(MallocHook_AddDeleteHook(&DeleteHook));
+    } else {
+      LOG(FATAL) << "Failed to register required malloc hooks, disable "
+                    "--die_on_malloc to continue.";
+    }
+  }
+}
+
 }  // namespace aos
diff --git a/aos/realtime.h b/aos/realtime.h
index 52db4a8..bb5ea84 100644
--- a/aos/realtime.h
+++ b/aos/realtime.h
@@ -36,6 +36,9 @@
 
 void ExpandStackSize();
 
+// Registers our hooks which crash on RT malloc.
+void RegisterMallocHook();
+
 // CHECKs that we are (or are not) running on the RT scheduler.  Useful for
 // enforcing that operations which are or are not bounded shouldn't be run. This
 // works both in simulation and when running against the real target.
diff --git a/aos/testing/BUILD b/aos/testing/BUILD
index 1fd5380..4b2631a 100644
--- a/aos/testing/BUILD
+++ b/aos/testing/BUILD
@@ -6,6 +6,7 @@
     ],
     visibility = ["//visibility:public"],
     deps = [
+        "//aos:init",
         "@com_github_gflags_gflags//:gflags",
         "@com_github_google_glog//:glog",
         "@com_google_googletest//:gtest",
diff --git a/aos/testing/gtest_main.cc b/aos/testing/gtest_main.cc
index adb516c..c8e3172 100644
--- a/aos/testing/gtest_main.cc
+++ b/aos/testing/gtest_main.cc
@@ -1,6 +1,7 @@
 #include <iostream>
 #include <getopt.h>
 
+#include "aos/init.h"
 #include "gflags/gflags.h"
 #include "glog/logging.h"
 #include "gtest/gtest.h"
@@ -25,9 +26,8 @@
 GTEST_API_ int main(int argc, char **argv) {
   ::testing::InitGoogleTest(&argc, argv);
   FLAGS_logtostderr = true;
-  google::InitGoogleLogging(argv[0]);
-  ::gflags::ParseCommandLineFlags(&argc, &argv, false);
-  google::InstallFailureSignalHandler();
+
+  aos::InitGoogle(&argc, &argv);
 
   if (FLAGS_print_logs) {
     if (::aos::testing::ForcePrintLogsDuringTests) {
diff --git a/frc971/wpilib/loop_output_handler_test.cc b/frc971/wpilib/loop_output_handler_test.cc
index a74b762..fcbcbb5 100644
--- a/frc971/wpilib/loop_output_handler_test.cc
+++ b/frc971/wpilib/loop_output_handler_test.cc
@@ -5,7 +5,7 @@
 #include "gtest/gtest.h"
 
 #include "aos/events/simulated_event_loop.h"
-#include "aos/logging/logging.h"
+#include "aos/realtime.h"
 #include "aos/testing/test_logging.h"
 #include "aos/time/time.h"
 #include "frc971/wpilib/loop_output_handler_test_generated.h"
@@ -43,8 +43,6 @@
   TestLoopOutputHandler(::aos::EventLoop *event_loop, const ::std::string &name)
       : LoopOutputHandler(event_loop, name) {}
 
-  ~TestLoopOutputHandler() { Stop(); }
-
   int count() const { return count_; }
 
   ::aos::monotonic_clock::time_point last_time() const { return last_time_; }
@@ -52,12 +50,18 @@
 
  protected:
   void Write(const LoopOutputHandlerTestOutput &output) override {
+    aos::CheckRealtime();
+    // We don't care if this is RT if we are testing.
+    aos::ScopedNotRealtime nrt;
     LOG(INFO) << "output " << aos::FlatbufferToJson(&output);
     ++count_;
     last_time_ = event_loop()->monotonic_now();
   }
 
   void Stop() override {
+    aos::CheckRealtime();
+    // We don't care if this is RT if we are testing.
+    aos::ScopedNotRealtime nrt;
     stop_time_ = event_loop()->monotonic_now();
     LOG(INFO) << "Stopping";
   }