Track send failures in timing reports

Previously, there was no external indication of the number of
failed sends. Start tracking failures by cause, so that
we have at least some visibility.

Change-Id: Ia66c9a8fa7bb580b786d435ce3dbdca242e2c80f
Signed-off-by: James Kuszmaul <james.kuszmaul@bluerivertech.com>
diff --git a/aos/events/event_loop.cc b/aos/events/event_loop.cc
index 08b0064..3679062 100644
--- a/aos/events/event_loop.cc
+++ b/aos/events/event_loop.cc
@@ -374,10 +374,21 @@
     flatbuffers::Offset<timing::Statistic> size_offset =
         timing::CreateStatistic(fbb);
 
+    std::vector<flatbuffers::Offset<timing::SendErrorCount>>
+        error_count_offsets;
+    for (size_t ii = 0; ii < internal::RawSenderTiming::kNumErrors; ++ii) {
+      error_count_offsets.push_back(timing::CreateSendErrorCount(
+          fbb, timing::EnumValuesSendError()[ii], 0));
+    }
+    const flatbuffers::Offset<
+        flatbuffers::Vector<flatbuffers::Offset<timing::SendErrorCount>>>
+        error_counts_offset = fbb.CreateVector(error_count_offsets);
+
     timing::Sender::Builder sender_builder(fbb);
 
     sender_builder.add_channel_index(sender->timing_.channel_index);
     sender_builder.add_size(size_offset);
+    sender_builder.add_error_counts(error_counts_offset);
     sender_builder.add_count(0);
     sender_offsets.emplace_back(sender_builder.Finish());
   }
diff --git a/aos/events/event_loop.fbs b/aos/events/event_loop.fbs
index 051f575..c0aaf19 100644
--- a/aos/events/event_loop.fbs
+++ b/aos/events/event_loop.fbs
@@ -2,6 +2,21 @@
 
 namespace aos.timing;
 
+// Enum indicating why a message failed to send.
+enum SendError : byte {
+  // Messages were sent faster than allowed on a given channel.
+  MESSAGE_SENT_TOO_FAST = 0,
+  // The redzone for a message was invalid, implying some form of memory
+  // corruption.
+  INVALID_REDZONE = 1,
+}
+
+// Table used to track how many times a given error occurred.
+table SendErrorCount {
+  error:SendError (id: 0);
+  count:uint (id: 1);
+}
+
 // Holds statistics for a time or size sample.
 table Statistic {
   average:float = nan (id: 0);
@@ -18,6 +33,11 @@
   count:uint (id: 1);
   // Statistics on the size of messages published.
   size:Statistic (id: 2);
+  // Counts of how often different errors occurred. There
+  // should be an entry for all possible errors, even if no errors occurred
+  // since the last timing report. The absence of an entry implies that the
+  // error in question was not supported when the log was generated.
+  error_counts:[SendErrorCount] (id: 4);
 
   // Channel for this sender.  Not filled out by default.
   channel:Channel (id: 3);
diff --git a/aos/events/event_loop_param_test.cc b/aos/events/event_loop_param_test.cc
index 18f5f96..c8ceeb5 100644
--- a/aos/events/event_loop_param_test.cc
+++ b/aos/events/event_loop_param_test.cc
@@ -1816,13 +1816,26 @@
 
   auto sender = loop1->MakeSender<TestMessage>("/test");
 
+  // Sanity check channel frequencies to ensure that we've designed the test
+  // correctly.
+  ASSERT_EQ(800, sender.channel()->frequency());
+  ASSERT_EQ(2000000000, loop1->configuration()->channel_storage_duration());
+  constexpr int kMaxAllowedMessages = 800 * 2;
+  constexpr int kSendMessages = kMaxAllowedMessages * 2;
+  constexpr int kDroppedMessages = kSendMessages - kMaxAllowedMessages;
+
   // Add a timer to actually quit.
   auto test_timer = loop1->AddTimer([&sender]() {
-    for (int i = 0; i < 10; ++i) {
+    for (int i = 0; i < kSendMessages; ++i) {
       aos::Sender<TestMessage>::Builder msg = sender.MakeBuilder();
       TestMessage::Builder builder = msg.MakeBuilder<TestMessage>();
       builder.add_value(200 + i);
-      msg.CheckOk(msg.Send(builder.Finish()));
+      if (i < kMaxAllowedMessages) {
+        msg.CheckOk(msg.Send(builder.Finish()));
+      } else {
+        EXPECT_EQ(RawSender::Error::kMessagesSentTooFast,
+                  msg.Send(builder.Finish()));
+      }
     }
   });
 
@@ -1862,7 +1875,27 @@
             ->name()
             ->string_view(),
         "/test");
-    EXPECT_EQ(primary_report.message().senders()->Get(0)->count(), 10);
+    EXPECT_EQ(primary_report.message().senders()->Get(0)->count(),
+              kMaxAllowedMessages);
+    ASSERT_TRUE(primary_report.message().senders()->Get(0)->has_error_counts());
+    ASSERT_EQ(
+        primary_report.message().senders()->Get(0)->error_counts()->size(), 2u);
+    EXPECT_EQ(
+        primary_report.message()
+            .senders()
+            ->Get(0)
+            ->error_counts()
+            ->Get(static_cast<size_t>(timing::SendError::MESSAGE_SENT_TOO_FAST))
+            ->count(),
+        kDroppedMessages)
+        << aos::FlatbufferToJson(primary_report);
+    EXPECT_EQ(primary_report.message()
+                  .senders()
+                  ->Get(0)
+                  ->error_counts()
+                  ->Get(static_cast<size_t>(timing::SendError::INVALID_REDZONE))
+                  ->count(),
+              0);
 
     // Confirm that the timing primary_report sender looks sane.
     EXPECT_EQ(
diff --git a/aos/events/event_loop_tmpl.h b/aos/events/event_loop_tmpl.h
index 0b08dc0..1382332 100644
--- a/aos/events/event_loop_tmpl.h
+++ b/aos/events/event_loop_tmpl.h
@@ -143,16 +143,26 @@
     uint32_t remote_queue_index, const UUID &uuid) {
   const auto err = DoSend(size, monotonic_remote_time, realtime_remote_time,
                           remote_queue_index, uuid);
-  if (err == RawSender::Error::kOk) {
-    if (timing_.sender) {
-      timing_.size.Add(size);
-      timing_.sender->mutate_count(timing_.sender->count() + 1);
+  switch (err) {
+    case Error::kOk: {
+      if (timing_.sender) {
+        timing_.size.Add(size);
+        timing_.sender->mutate_count(timing_.sender->count() + 1);
+      }
+      ftrace_.FormatMessage(
+          "%.*s: sent internal: event=%" PRId64 " queue=%" PRIu32,
+          static_cast<int>(ftrace_prefix_.size()), ftrace_prefix_.data(),
+          static_cast<int64_t>(
+              monotonic_sent_time().time_since_epoch().count()),
+          sent_queue_index());
+      break;
     }
-    ftrace_.FormatMessage(
-        "%.*s: sent internal: event=%" PRId64 " queue=%" PRIu32,
-        static_cast<int>(ftrace_prefix_.size()), ftrace_prefix_.data(),
-        static_cast<int64_t>(monotonic_sent_time().time_since_epoch().count()),
-        sent_queue_index());
+    case Error::kMessagesSentTooFast:
+      timing_.IncrementError(timing::SendError::MESSAGE_SENT_TOO_FAST);
+      break;
+    case Error::kInvalidRedzone:
+      timing_.IncrementError(timing::SendError::INVALID_REDZONE);
+      break;
   }
   return err;
 }
diff --git a/aos/events/logging/logger_test.cc b/aos/events/logging/logger_test.cc
index c346854..42ba12f 100644
--- a/aos/events/logging/logger_test.cc
+++ b/aos/events/logging/logger_test.cc
@@ -3393,9 +3393,9 @@
 }
 
 constexpr std::string_view kCombinedConfigSha1(
-    "b33e522a2d0d746f5e514c3a9799987d9c79b2cc45c66f0a91c7a879d5248f0e");
+    "ad71114d104afea6cc400f3a968bb69e394681183fd42655d4efcdc8f4cd8911");
 constexpr std::string_view kSplitConfigSha1(
-    "0f36fe738e357c65082ced1b5d8179e70cb9e2ac6638e748279619db523aec1c");
+    "cdd60ecc4423ef2450bf67bed059f2f28f0e8ff6819b94a2194d825a6b15fe91");
 
 INSTANTIATE_TEST_SUITE_P(
     All, MultinodeLoggerTest,
diff --git a/aos/events/timing_statistics.cc b/aos/events/timing_statistics.cc
index 52f2048..fbc49f4 100644
--- a/aos/events/timing_statistics.cc
+++ b/aos/events/timing_statistics.cc
@@ -40,6 +40,19 @@
 
   size.Reset();
   sender->mutate_count(0);
+  for (size_t ii = 0; ii < kNumErrors; ++ii) {
+    sender->mutable_error_counts()->GetMutableObject(ii)->mutate_count(0);
+  }
+}
+
+void RawSenderTiming::IncrementError(timing::SendError error) {
+  if (!sender) {
+    return;
+  }
+  const size_t index = static_cast<size_t>(error);
+  timing::SendErrorCount *counter =
+      sender->mutable_error_counts()->GetMutableObject(index);
+  counter->mutate_count(counter->count() + 1);
 }
 
 void TimerTiming::set_timing_report(timing::Timer *new_timer) {
diff --git a/aos/events/timing_statistics.h b/aos/events/timing_statistics.h
index 4d9a524..e9edff2 100644
--- a/aos/events/timing_statistics.h
+++ b/aos/events/timing_statistics.h
@@ -79,10 +79,24 @@
 
 // Class to hold timing information for a raw sender.
 struct RawSenderTiming {
+  static constexpr size_t kNumErrors =
+      static_cast<int>(timing::SendError::MAX) -
+      static_cast<int>(timing::SendError::MIN) + 1;
+
   RawSenderTiming(int new_channel_index) : channel_index(new_channel_index) {}
 
   void set_timing_report(timing::Sender *sender);
   void ResetTimingReport();
+  void IncrementError(timing::SendError error);
+  // Sanity check that the enum values are such that we can just use the enum
+  // values themselves as array indices without anything weird happening.
+  static_assert(0 == static_cast<int>(timing::SendError::MIN),
+                "Expected error enum values to start at zero.");
+  static_assert(
+      sizeof(std::invoke_result<decltype(timing::EnumValuesSendError)>::type) /
+              sizeof(timing::SendError) ==
+          kNumErrors,
+      "Expected continguous enum values.");
 
   const int channel_index;
   timing::Sender *sender = nullptr;