Add utility for tracking process CPU usage

Being able to log this sort of information can be helpful for debugging
things when something weird happened on the system.

References: PRO-13362
Change-Id: Ie2847536fdc58279f62c9b7b0208d7fe51a90a5c
Signed-off-by: James Kuszmaul <james.kuszmaul@bluerivertech.com>
diff --git a/aos/util/top.h b/aos/util/top.h
new file mode 100644
index 0000000..32ff65d
--- /dev/null
+++ b/aos/util/top.h
@@ -0,0 +1,157 @@
+#ifndef AOS_UTIL_TOP_H_
+#define AOS_UTIL_TOP_H_
+
+#include <map>
+#include <string>
+
+#include "aos/containers/ring_buffer.h"
+#include "aos/events/event_loop.h"
+#include "aos/util/process_info_generated.h"
+
+namespace aos::util {
+
+// ProcStat is a struct to hold all the fields available in /proc/[pid]/stat.
+// Currently we only use a small subset of the fields. See man 5 proc for
+// details on what the fields are--these are in the same order as they appear in
+// the stat file.
+//
+// Things are signed or unsigned based on whether they are listed
+// as signed/unsigned in man 5 proc. We just make everything 64 bits wide
+// because otherwise we have to write out way too many casts everywhere.
+struct ProcStat {
+  int pid;
+  std::string name;
+  char state;
+  int64_t parent_pid;
+  int64_t group_id;
+  int64_t session_id;
+  int64_t tty;
+  int64_t tpgid;
+  uint64_t kernel_flags;
+  uint64_t minor_faults;
+  uint64_t children_minor_faults;
+  uint64_t major_faults;
+  uint64_t children_major_faults;
+  uint64_t user_mode_ticks;
+  uint64_t kernel_mode_ticks;
+  int64_t children_user_mode_ticks;
+  int64_t children_kernel_mode_ticks;
+  int64_t priority;
+  int64_t nice;
+  int64_t num_threads;
+  int64_t itrealvalue;  // always zero.
+  uint64_t start_time_ticks;
+  uint64_t virtual_memory_size;
+  // Number of pages in real memory.
+  int64_t resident_set_size;
+  uint64_t rss_soft_limit;
+  uint64_t start_code_address;
+  uint64_t end_code_address;
+  uint64_t start_stack_address;
+  uint64_t stack_pointer;
+  uint64_t instruction_pointer;
+  uint64_t signal_bitmask;
+  uint64_t blocked_signals;
+  uint64_t ignored_signals;
+  uint64_t caught_signals;
+  uint64_t wchan;
+  // The two swap_pages fields are not maintained (see man 5 proc).
+  uint64_t swap_pages;
+  uint64_t children_swap_pages;
+  int64_t exit_signal;
+  // CPU number last executed on.
+  int64_t processor;
+  // Zero for non-realtime processes.
+  uint64_t rt_priority;
+  uint64_t scheduling_policy;
+  // Aggregated block I/O delay.
+  uint64_t block_io_delay_ticks;
+  uint64_t guest_ticks;
+  uint64_t children_guest_ticks;
+  uint64_t start_data_address;
+  uint64_t end_data_address;
+  uint64_t start_brk_address;
+  uint64_t start_arg_address;
+  uint64_t end_arg_address;
+  uint64_t start_env_address;
+  uint64_t end_env_address;
+  int64_t exit_code;
+};
+
+// Retrieves the stats for a particular process from /proc/[pid]/stat (note that
+// there also exists a /proc/[pid]/task/[tid]/stat with the same format for
+// per-thread information; we currently do not read that).
+// Returns nullopt if unable to read/parse the file.
+std::optional<ProcStat> ReadProcStat(int pid);
+
+// This class provides a basic utility for retrieving general performance
+// information on running processes (named after the top utility). It can either
+// be used to directly get information on individual processes (via
+// set_track_pids()) or used to track a list of the top N processes with the
+// highest CPU usage.
+// Note that this currently relies on sampling processes in /proc every second
+// and using the differences between the two readings to calculate CPU usage.
+// For crash-looping processes or other situations with highly variable or
+// extremely short-lived loads, this may do a poor job of capturing information.
+class Top {
+ public:
+  explicit Top(aos::EventLoop *event_loop);  // No implicit conversions.
+
+  // Set whether to track all the top processes (this will result in us having
+  // to track every single process on the system, so that we can sort them).
+  void set_track_top_processes(bool track_all) { track_all_ = track_all; }
+
+  // Specify a set of individual processes to track statistics for.
+  // This can be changed at run-time, although it may take up to kSamplePeriod
+  // to have full statistics on all the relevant processes, since we need at
+  // least two samples to estimate CPU usage.
+  void set_track_pids(const std::set<pid_t> &pids) { pids_to_track_ = pids; }
+
+  // Retrieve statistics for the specified process. Will return the null offset
+  // if no such pid is being tracked.
+  flatbuffers::Offset<ProcessInfo> InfoForProcess(
+      flatbuffers::FlatBufferBuilder *fbb, pid_t pid);
+
+  // Returns information on up to n processes, sorted by CPU usage.
+  flatbuffers::Offset<TopProcessesFbs> TopProcesses(
+      flatbuffers::FlatBufferBuilder *fbb, int n);
+
+ private:
+  // Rate at which to sample /proc/[pid]/stat.
+  static constexpr std::chrono::seconds kSamplePeriod{1};
+
+  struct Reading {
+    aos::monotonic_clock::time_point reading_time;
+    std::chrono::nanoseconds total_run_time;
+    uint64_t memory_usage;
+  };
+
+  struct ProcessReadings {
+    std::string name;
+    aos::monotonic_clock::time_point start_time;
+    // CPU usage is based on the past two readings.
+    double cpu_percent;
+    aos::RingBuffer<Reading, 2> readings;
+  };
+
+  std::chrono::nanoseconds TotalProcessTime(const ProcStat &proc_stat);
+  aos::monotonic_clock::time_point ProcessStartTime(const ProcStat &proc_stat);
+  uint64_t RealMemoryUsage(const ProcStat &proc_stat);
+  void UpdateReadings();
+
+  aos::EventLoop *event_loop_;
+
+  // Length of a clock tick (used to convert from raw numbers in /proc to actual
+  // times).
+  const std::chrono::nanoseconds clock_tick_;
+  // Page size, in bytes, on the current system.
+  const long page_size_;
+
+  std::set<pid_t> pids_to_track_;
+  bool track_all_ = false;
+
+  std::map<pid_t, ProcessReadings> readings_;
+};
+
+}  // namespace aos::util
+#endif  // AOS_UTIL_TOP_H_