[orin] Add health check monitor for argus_camera and nvargus_daemon
Signed-off-by: Tushar Pankaj <tushar.s.pankaj@gmail.com>
Change-Id: Ifbbec3150cf9a553b51e4942bdad27449d636ae2
diff --git a/frc971/orin/BUILD b/frc971/orin/BUILD
index 8c4e9ac..e483d65 100644
--- a/frc971/orin/BUILD
+++ b/frc971/orin/BUILD
@@ -176,3 +176,17 @@
"@com_google_absl//absl/strings",
],
)
+
+cc_binary(
+ name = "argus_monitor",
+ srcs = [
+ "argus_monitor.cc",
+ ],
+ target_compatible_with = ["@platforms//cpu:arm64"],
+ visibility = ["//visibility:public"],
+ deps = [
+ "//aos:aos_cli_utils",
+ "//aos:init",
+ "//aos/events:shm_event_loop",
+ ],
+)
diff --git a/frc971/orin/argus_camera.cc b/frc971/orin/argus_camera.cc
index 811efdd..f886fab 100644
--- a/frc971/orin/argus_camera.cc
+++ b/frc971/orin/argus_camera.cc
@@ -575,13 +575,6 @@
ArgusCamera::MappedBuffer buffer = camera.NextImageBlocking();
if (buffer.nvbuf_surf() == nullptr) {
- // TODO(austin): Control-C isn't working for some reason, debug it...
- // We're restarting nvargus-daemon here because if we exit like this its
- // likely that nvargus-daemon has run into an error that it can't
- // recover from. Which means even if this program restarts it can't get
- // new camera images.
- CHECK_EQ(std::system("sudo systemctl restart nvargus-daemon.service"),
- 0);
event_loop.Exit();
return;
}
diff --git a/frc971/orin/argus_monitor.cc b/frc971/orin/argus_monitor.cc
new file mode 100644
index 0000000..8f129b2
--- /dev/null
+++ b/frc971/orin/argus_monitor.cc
@@ -0,0 +1,152 @@
+#include <unistd.h>
+
+#include <chrono>
+#include <cstdlib>
+#include <iostream>
+#include <memory>
+#include <string>
+#include <vector>
+
+#include "gflags/gflags.h"
+
+#include "aos/aos_cli_utils.h"
+#include "aos/configuration.h"
+#include "aos/init.h"
+#include "aos/json_to_flatbuffer.h"
+#include "aos/realtime.h"
+
+DEFINE_int32(priority, -1, "If set, the RT priority to run at.");
+DEFINE_double(max_jitter, 5.00,
+              "The max time in seconds between messages before considering the "
+              "camera processes dead.");
+DEFINE_double(grace_period, 10.00,
+              "The grace period at startup before enforcing that messages must "
+              "flow from the camera processes.");
+
+namespace aos {
+namespace {
+
+// Converts a duration given in (possibly fractional) seconds, as the flags
+// above are, into nanoseconds.
+std::chrono::nanoseconds SecondsToDuration(double seconds) {
+  return std::chrono::duration_cast<std::chrono::nanoseconds>(
+      std::chrono::duration<double>(seconds));
+}
+
+}  // namespace
+
+// Watches one channel and restarts the camera stack when messages stop
+// arriving on it.
+//
+// A watcher records the event time of each message on the channel. A timer
+// that first fires --grace_period seconds after the event loop starts running,
+// and every second after that, checks that the newest message is at most
+// --max_jitter seconds old. If it is older, argus_camera0 and argus_camera1
+// are stopped through aos_starter, nvargus-daemon is restarted through
+// systemctl, both cameras are started again, and this process exits.
+class State {
+ public:
+  // The callbacks registered here capture `this` and event_loop, so both must
+  // stay alive for as long as event_loop may invoke them.
+  State(aos::EventLoop *event_loop, const Channel *channel)
+      : channel_(channel),
+        channel_name_(aos::configuration::StrippedChannelToString(channel_)) {
+    LOG(INFO) << "Watching for healthy message sends on " << channel_name_;
+
+    event_loop->MakeRawNoArgWatcher(
+        channel_,
+        [this](const aos::Context &context) { HandleMessage(context); });
+
+    timer_handle_ = event_loop->AddTimer(
+        [this, event_loop]() { RunHealthCheck(event_loop); });
+    timer_handle_->set_name("jitter");
+    event_loop->OnRun([this, event_loop]() {
+      // Skip the startup grace period, then check once per second.
+      timer_handle_->Schedule(
+          event_loop->monotonic_now() + SecondsToDuration(FLAGS_grace_period),
+          std::chrono::milliseconds(1000));
+    });
+  }
+
+  // The callbacks above hold a raw `this`, so copying would leave them
+  // pointing at the wrong object.
+  State(const State &) = delete;
+  State &operator=(const State &) = delete;
+
+  // Remembers when the most recent message on the channel was seen.
+  void HandleMessage(const aos::Context &context) {
+    last_time_ = context.monotonic_event_time;
+  }
+
+  // Restarts the camera services and exits if the newest message is more than
+  // --max_jitter seconds old; otherwise does nothing.
+  void RunHealthCheck(aos::EventLoop *event_loop) {
+    const aos::monotonic_clock::time_point deadline =
+        last_time_ + SecondsToDuration(FLAGS_max_jitter);
+    if (deadline >= event_loop->monotonic_now()) {
+      return;
+    }
+
+    LOG(WARNING) << "No message on " << channel_name_ << " in the last "
+                 << FLAGS_max_jitter << " seconds";
+    LOG(INFO) << "Restarting camera services";
+    // Stop both cameras before restarting nvargus-daemon, then bring them
+    // back. Any failing command aborts the monitor via CHECK.
+    CHECK_EQ(std::system("aos_starter stop argus_camera0"), 0);
+    CHECK_EQ(std::system("aos_starter stop argus_camera1"), 0);
+    CHECK_EQ(std::system("sudo systemctl restart nvargus-daemon.service"), 0);
+    CHECK_EQ(std::system("aos_starter start argus_camera0"), 0);
+    CHECK_EQ(std::system("aos_starter start argus_camera1"), 0);
+
+    // NOTE(review): exiting here presumably relies on the starter relaunching
+    // this monitor so it begins with a fresh grace period -- confirm.
+    std::exit(0);
+  }
+
+ private:
+  const Channel *channel_;
+
+  std::string channel_name_;
+
+  // Event time of the newest message; min_time until the first one arrives.
+  aos::monotonic_clock::time_point last_time_ = aos::monotonic_clock::min_time;
+
+  aos::TimerHandler *timer_handle_ = nullptr;
+};
+
+}  // namespace aos
+
+int main(int argc, char **argv) {
+  aos::InitGoogle(&argc, &argv);
+
+  aos::CliUtilInfo cli_info;
+  // NOTE(review): a true return from Initialize() is treated as "nothing to
+  // run" -- confirm against aos_cli_utils.
+  if (cli_info.Initialize(
+          &argc, &argv,
+          [&cli_info](const aos::Channel *channel) {
+            return aos::configuration::ChannelIsReadableOnNode(
+                channel, cli_info.event_loop->node());
+          },
+          "channel is readable on node", true)) {
+    return 0;
+  }
+
+  // One monitor per entry in found_channels. Each is heap-allocated so the
+  // `this` captured by its callbacks stays valid as the vector grows.
+  std::vector<std::unique_ptr<aos::State>> states;
+
+  for (const aos::Channel *channel : cli_info.found_channels) {
+    states.emplace_back(
+        std::make_unique<aos::State>(&(cli_info.event_loop.value()), channel));
+  }
+
+  if (FLAGS_priority > 0) {
+    cli_info.event_loop->SetRuntimeRealtimePriority(FLAGS_priority);
+  }
+
+  cli_info.event_loop->Run();
+
+  return 0;
+}
diff --git a/y2024/BUILD b/y2024/BUILD
index 6687dac..2727ac7 100644
--- a/y2024/BUILD
+++ b/y2024/BUILD
@@ -56,6 +56,7 @@
"//aos/util:foxglove_websocket",
"//frc971/image_streamer:image_streamer",
"//frc971/orin:hardware_monitor",
+ "//frc971/orin:argus_monitor",
"//frc971/vision:intrinsics_calibration",
"//aos/util:filesystem_monitor",
"//y2024/vision:viewer",
diff --git a/y2024/y2024_imu.json b/y2024/y2024_imu.json
index 2680856..aa8b042 100644
--- a/y2024/y2024_imu.json
+++ b/y2024/y2024_imu.json
@@ -503,6 +503,20 @@
]
},
{
+ "name": "argus_monitor_imu",
+ "executable_name": "argus_monitor",
+ "args": [
+ "/imu/camera0",
+ "frc971.vision.TargetMap",
+ "/imu/camera1",
+ "frc971.vision.TargetMap"
+ ],
+ "user": "pi",
+ "nodes": [
+ "imu"
+ ]
+ },
+ {
"name": "argus_camera0",
"executable_name": "argus_camera",
"args": [
diff --git a/y2024/y2024_orin1.json b/y2024/y2024_orin1.json
index 6072da8..e9c8093 100644
--- a/y2024/y2024_orin1.json
+++ b/y2024/y2024_orin1.json
@@ -27,7 +27,7 @@
"source_node": "orin1",
"frequency": 50,
"num_senders": 20,
- "max_size": 2000
+ "max_size": 3000
},
{
"name": "/orin1/aos",
@@ -371,6 +371,20 @@
]
},
{
+ "name": "argus_monitor_orin1",
+ "executable_name": "argus_monitor",
+ "args": [
+ "/orin1/camera0",
+ "frc971.vision.TargetMap",
+ "/orin1/camera1",
+ "frc971.vision.TargetMap"
+ ],
+ "user": "pi",
+ "nodes": [
+ "orin1"
+ ]
+ },
+ {
"name": "argus_camera0",
"executable_name": "argus_camera",
"args": [