Add utility for tracking process CPU usage

Being able to log this sort of information can be helpful for debugging
things when something weird happened on the system.

References: PRO-13362
Change-Id: Ie2847536fdc58279f62c9b7b0208d7fe51a90a5c
Signed-off-by: James Kuszmaul <james.kuszmaul@bluerivertech.com>
diff --git a/aos/util/top.cc b/aos/util/top.cc
new file mode 100644
index 0000000..4882af7
--- /dev/null
+++ b/aos/util/top.cc
@@ -0,0 +1,254 @@
+#include "aos/util/top.h"
+
+#include <dirent.h>
+#include <unistd.h>
+
+#include <queue>
+#include <string>
+
+#include "absl/strings/numbers.h"
+#include "absl/strings/str_format.h"
+#include "absl/strings/str_split.h"
+
+namespace aos::util {
+namespace {
+// Reads all of file_name with a single read() of at most 4096 bytes. Returns
+// nullopt if the file can't be opened or if that read did not reach its end.
+std::optional<std::string> ReadShortFile(std::string_view file_name) {
+  constexpr size_t kMaxLineLength = 4096;
+  std::ifstream stream(std::string(file_name), std::ios_base::in);
+  if (!stream.good()) {
+    VLOG(1) << "Can't read " << file_name;
+    return std::nullopt;
+  }
+  char contents[kMaxLineLength];
+  stream.read(contents, kMaxLineLength);
+  // eof() is only raised when the read came up short of kMaxLineLength bytes.
+  if (stream.eof()) return std::string(contents, stream.gcount());
+  return std::nullopt;
+}
+}  // namespace
+
+// Parses /proc/<pid>/stat into a ProcStat; nullopt on any read/format error.
+std::optional<ProcStat> ReadProcStat(pid_t pid) {
+  const std::optional<std::string> stat_line =
+      ReadShortFile(absl::StrFormat("/proc/%d/stat", pid));
+  if (!stat_line.has_value()) {
+    return std::nullopt;
+  }
+  // The name spans first '(' to last ')', tolerating parentheses inside it.
+  const size_t open_paren = stat_line->find_first_of('(');
+  const size_t close_paren = stat_line->find_last_of(')');
+  if (open_paren == std::string::npos || close_paren == std::string::npos ||
+      close_paren < open_paren) {
+    VLOG(1) << "No name found in stat line " << stat_line.value();
+    return std::nullopt;
+  }
+  const std::string_view line(*stat_line);
+  const std::string_view name =
+      line.substr(open_paren + 1, close_paren - open_paren - 1);
+  const std::vector<std::string_view> fields =
+      absl::StrSplit(line.substr(close_paren + 1), ' ', absl::SkipWhitespace());
+  constexpr int kNumFieldsAfterName = 50;
+  if (fields.size() != kNumFieldsAfterName) {
+    VLOG(1) << "Incorrect number of fields " << fields.size();
+    return std::nullopt;
+  }
+  // fields[0] is the one-character process state; the rest must be integers.
+  if (fields[0].size() != 1) {
+    VLOG(1) << "State field is too long: " << fields[0];
+    return std::nullopt;
+  }
+  std::array<absl::int128, kNumFieldsAfterName - 1> numbers;
+  for (size_t index = 0; index < numbers.size(); ++index) {
+    if (!absl::SimpleAtoi(fields[index + 1], &numbers[index])) {
+      VLOG(1) << "Failed to parse field " << index + 1 << " as number: "
+              << fields[index + 1];
+      return std::nullopt;
+    }
+  }
+  return ProcStat{
+      .pid = pid,
+      .name = std::string(name),
+      .state = fields.at(0).at(0),
+      .parent_pid = static_cast<int64_t>(numbers.at(0)),
+      .group_id = static_cast<int64_t>(numbers.at(1)),
+      .session_id = static_cast<int64_t>(numbers.at(2)),
+      .tty = static_cast<int64_t>(numbers.at(3)),
+      .tpgid = static_cast<int64_t>(numbers.at(4)),
+      .kernel_flags = static_cast<uint64_t>(numbers.at(5)),
+      .minor_faults = static_cast<uint64_t>(numbers.at(6)),
+      .children_minor_faults = static_cast<uint64_t>(numbers.at(7)),
+      .major_faults = static_cast<uint64_t>(numbers.at(8)),
+      .children_major_faults = static_cast<uint64_t>(numbers.at(9)),
+      .user_mode_ticks = static_cast<uint64_t>(numbers.at(10)),
+      .kernel_mode_ticks = static_cast<uint64_t>(numbers.at(11)),
+      .children_user_mode_ticks = static_cast<int64_t>(numbers.at(12)),
+      .children_kernel_mode_ticks = static_cast<int64_t>(numbers.at(13)),
+      .priority = static_cast<int64_t>(numbers.at(14)),
+      .nice = static_cast<int64_t>(numbers.at(15)),
+      .num_threads = static_cast<int64_t>(numbers.at(16)),
+      .itrealvalue = static_cast<int64_t>(numbers.at(17)),
+      .start_time_ticks = static_cast<uint64_t>(numbers.at(18)),
+      .virtual_memory_size = static_cast<uint64_t>(numbers.at(19)),
+      .resident_set_size = static_cast<int64_t>(numbers.at(20)),
+      .rss_soft_limit = static_cast<uint64_t>(numbers.at(21)),
+      .start_code_address = static_cast<uint64_t>(numbers.at(22)),
+      .end_code_address = static_cast<uint64_t>(numbers.at(23)),
+      .start_stack_address = static_cast<uint64_t>(numbers.at(24)),
+      .stack_pointer = static_cast<uint64_t>(numbers.at(25)),
+      .instruction_pointer = static_cast<uint64_t>(numbers.at(26)),
+      .signal_bitmask = static_cast<uint64_t>(numbers.at(27)),
+      .blocked_signals = static_cast<uint64_t>(numbers.at(28)),
+      .ignored_signals = static_cast<uint64_t>(numbers.at(29)),
+      .caught_signals = static_cast<uint64_t>(numbers.at(30)),
+      .wchan = static_cast<uint64_t>(numbers.at(31)),
+      .swap_pages = static_cast<uint64_t>(numbers.at(32)),
+      .children_swap_pages = static_cast<uint64_t>(numbers.at(33)),
+      .exit_signal = static_cast<int64_t>(numbers.at(34)),
+      .processor = static_cast<int64_t>(numbers.at(35)),
+      .rt_priority = static_cast<uint64_t>(numbers.at(36)),
+      .scheduling_policy = static_cast<uint64_t>(numbers.at(37)),
+      .block_io_delay_ticks = static_cast<uint64_t>(numbers.at(38)),
+      .guest_ticks = static_cast<uint64_t>(numbers.at(39)),
+      .children_guest_ticks = static_cast<uint64_t>(numbers.at(40)),
+      .start_data_address = static_cast<uint64_t>(numbers.at(41)),
+      .end_data_address = static_cast<uint64_t>(numbers.at(42)),
+      .start_brk_address = static_cast<uint64_t>(numbers.at(43)),
+      .start_arg_address = static_cast<uint64_t>(numbers.at(44)),
+      .end_arg_address = static_cast<uint64_t>(numbers.at(45)),
+      .start_env_address = static_cast<uint64_t>(numbers.at(46)),
+      .end_env_address = static_cast<uint64_t>(numbers.at(47)),
+      .exit_code = static_cast<int64_t>(numbers.at(48))};
+}
+
+Top::Top(aos::EventLoop *event_loop)
+    : event_loop_(event_loop),
+      clock_tick_(std::chrono::nanoseconds(1000000000 / sysconf(_SC_CLK_TCK))),
+      page_size_(sysconf(_SC_PAGESIZE)) {
+  TimerHandler *sampler = event_loop_->AddTimer([this]() { UpdateReadings(); });
+  event_loop_->OnRun([this, sampler]() {  // The timer is set up from OnRun().
+    sampler->Setup(event_loop_->monotonic_now(), kSamplePeriod);
+  });
+}
+
+std::chrono::nanoseconds Top::TotalProcessTime(const ProcStat &proc_stat) {
+  const auto ticks = proc_stat.user_mode_ticks + proc_stat.kernel_mode_ticks;
+  return ticks * clock_tick_;  // User + kernel ticks, scaled by clock_tick_.
+}
+
+aos::monotonic_clock::time_point Top::ProcessStartTime(
+    const ProcStat &proc_stat) {
+  // The start time is recorded in ticks; scale it by clock_tick_ first.
+  const auto start_offset = proc_stat.start_time_ticks * clock_tick_;
+  return aos::monotonic_clock::time_point(start_offset);
+}
+
+uint64_t Top::RealMemoryUsage(const ProcStat &proc_stat) {
+  return page_size_ * proc_stat.resident_set_size;  // RSS scaled by page size.
+}
+
+// Takes one sample of every candidate process and refreshes its cpu_percent.
+void Top::UpdateReadings() {
+  aos::monotonic_clock::time_point now = event_loop_->monotonic_now();
+  // Get all the processes that we *might* care about.
+  std::set<pid_t> pids = pids_to_track_;
+  // Also revisit every PID we already track so dead ones get erased below.
+  for (const auto &reading : readings_) pids.insert(reading.first);
+  if (track_all_) {
+    DIR *const dir = opendir("/proc");
+    if (dir == nullptr) {
+      PLOG(FATAL) << "Failed to open /proc";
+    }
+    while (struct dirent *const dir_entry = readdir(dir)) {
+      pid_t pid;
+      if (dir_entry->d_type == DT_DIR &&
+          absl::SimpleAtoi(dir_entry->d_name, &pid)) {
+        pids.insert(pid);
+      }
+    }
+    closedir(dir);  // Otherwise one directory stream leaks on every sample.
+  }
+
+  for (const pid_t pid : pids) {
+    std::optional<ProcStat> proc_stat = ReadProcStat(pid);
+    // Stop tracking processes that have died.
+    if (!proc_stat.has_value()) {
+      readings_.erase(pid);
+      continue;
+    }
+    const aos::monotonic_clock::time_point start_time =
+        ProcessStartTime(*proc_stat);
+    auto reading_iter = readings_.find(pid);
+    if (reading_iter == readings_.end()) {
+      reading_iter = readings_
+                         .insert(std::make_pair(
+                             pid, ProcessReadings{.name = proc_stat->name,
+                                                  .start_time = start_time,
+                                                  .cpu_percent = 0.0,
+                                                  .readings = {}}))
+                         .first;
+    }
+    ProcessReadings &process = reading_iter->second;
+    // The process associated with the PID has changed; reset the state.
+    if (process.start_time != start_time) {
+      process.name = proc_stat->name;
+      process.start_time = start_time;
+      process.readings.Reset();
+    }
+
+    process.readings.Push(Reading{now, TotalProcessTime(*proc_stat),
+                                  RealMemoryUsage(*proc_stat)});
+    if (process.readings.size() == 2) {
+      process.cpu_percent =
+          aos::time::DurationInSeconds(process.readings[1].total_run_time -
+                                       process.readings[0].total_run_time) /
+          aos::time::DurationInSeconds(process.readings[1].reading_time -
+                                       process.readings[0].reading_time);
+    } else {
+      process.cpu_percent = 0.0;
+    }
+  }
+}
+
+// Serializes the stored state of pid; yields a default Offset if untracked.
+flatbuffers::Offset<ProcessInfo> Top::InfoForProcess(
+    flatbuffers::FlatBufferBuilder *fbb, pid_t pid) {
+  const auto entry = readings_.find(pid);
+  if (entry == readings_.end()) {
+    return {};
+  }
+  const ProcessReadings &process = entry->second;
+  const auto name_offset = fbb->CreateString(process.name);
+  ProcessInfo::Builder info_builder(*fbb);
+  info_builder.add_pid(pid);
+  info_builder.add_name(name_offset);
+  info_builder.add_cpu_usage(process.cpu_percent);
+  const auto last_index = process.readings.size() - 1;
+  info_builder.add_physical_memory(process.readings[last_index].memory_usage);
+  return info_builder.Finish();
+}
+
+// Serializes up to n tracked processes into a TopProcessesFbs, ordered from
+// highest cpu_percent to lowest.
+flatbuffers::Offset<TopProcessesFbs> Top::TopProcesses(
+    flatbuffers::FlatBufferBuilder *fbb, int n) {
+  // Max-heap of {cpu_usage, pid}. Entries at 0.0 percent are kept on purpose,
+  // so that a caller asking for an arbitrarily large n gets every process.
+  std::priority_queue<std::pair<double, pid_t>> by_usage;
+  for (const auto &[pid, process] : readings_) {
+    by_usage.emplace(process.cpu_percent, pid);
+  }
+  std::vector<flatbuffers::Offset<ProcessInfo>> process_offsets;
+  for (int taken = 0; taken < n && !by_usage.empty(); ++taken) {
+    process_offsets.push_back(InfoForProcess(fbb, by_usage.top().second));
+    by_usage.pop();
+  }
+  const flatbuffers::Offset<
+      flatbuffers::Vector<flatbuffers::Offset<ProcessInfo>>>
+      processes_vector = fbb->CreateVector(process_offsets);
+  TopProcessesFbs::Builder top_builder(*fbb);
+  top_builder.add_processes(processes_vector);
+  return top_builder.Finish();
+}
+
+}  // namespace aos::util