Sort logs into structures rather than simple lists

This lets us return start times, UUIDs, etc.  The end result is that the
user can more easily figure out which logs go together without having to
reproduce this logic themselves.
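
For illustration, a caller could walk the new structured result roughly
like this (a minimal sketch assuming glog-style LOG(INFO); the field
names come from the LogFile/LogParts structs added below):

  const std::vector<aos::logger::LogFile> logfiles =
      aos::logger::SortParts(unsorted_logfiles);
  for (const aos::logger::LogFile &log_file : logfiles) {
    // Each LogFile corresponds to one run of the logger.
    LOG(INFO) << "logger_uuid: " << log_file.logger_uuid;
    for (const aos::logger::LogParts &parts : log_file.parts) {
      // Each LogParts is one node's pre-sorted list of part files.
      LOG(INFO) << "  node " << parts.node << " started at "
                << parts.monotonic_start_time << " with "
                << parts.parts.size() << " parts";
    }
  }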

Change-Id: I2a0ee2c17492ddfdddda80f20563b3fccfed0f7e
diff --git a/aos/events/logging/log_cat.cc b/aos/events/logging/log_cat.cc
index f406d24..0afdc03 100644
--- a/aos/events/logging/log_cat.cc
+++ b/aos/events/logging/log_cat.cc
@@ -127,7 +127,7 @@
     unsorted_logfiles.emplace_back(std::string(argv[i]));
   }
 
-  std::vector<std::vector<std::string>> logfiles =
+  const std::vector<aos::logger::LogFile> logfiles =
       aos::logger::SortParts(unsorted_logfiles);
 
   aos::logger::LogReader reader(logfiles);
diff --git a/aos/events/logging/logger.cc b/aos/events/logging/logger.cc
index 424153a..f7251f1 100644
--- a/aos/events/logging/logger.cc
+++ b/aos/events/logging/logger.cc
@@ -545,95 +545,233 @@
   } while (last_synchronized_time_ + polling_period_ < monotonic_now);
 }
 
-std::vector<std::vector<std::string>> SortParts(
-    const std::vector<std::string> &parts) {
+std::vector<LogFile> SortParts(const std::vector<std::string> &parts) {
   // Start by grouping all parts by UUID, and extracting the part index.
-  std::map<std::string, std::vector<std::pair<std::string, int>>> parts_list;
+  // Datastructure to hold all the info extracted from a set of parts which go
+  // together so we can sort them afterwards.
+  struct UnsortedLogParts {
+    // Start times.
+    aos::monotonic_clock::time_point monotonic_start_time;
+    aos::realtime_clock::time_point realtime_start_time;
+
+    // Node to save.
+    std::string node;
+
+    // Pairs of the filename and the part index for sorting.
+    std::vector<std::pair<std::string, int>> parts;
+  };
+
+  // Map from logger_uuid to a second map.  The second map maps
+  // parts_uuid -> list of parts for sorting.
+  std::map<std::string, std::map<std::string, UnsortedLogParts>> parts_list;
 
   // Sort part files without UUIDs and part indexes as well.  Extract everything
   // useful from the log in the first pass, then sort later.
-  struct LogPart {
-    std::string filename;
-    monotonic_clock::time_point start_time;
-    monotonic_clock::time_point first_message_time;
+  struct UnsortedOldParts {
+    // Part information with everything but the list of parts.
+    LogParts parts;
+
+    // Pairs of the first message time in each file and the filename, used for
+    // sorting after extraction.
+    std::vector<std::pair<monotonic_clock::time_point, std::string>> unsorted_parts;
   };
 
-  std::vector<LogPart> old_parts;
+  // A list of all the old parts which we don't know how to sort using uuids.
+  // There are enough of these in the wild that this is worth supporting.
+  std::vector<UnsortedOldParts> old_parts;
 
+  // Now extract everything into our datastructures above for sorting.
   for (const std::string &part : parts) {
     FlatbufferVector<LogFileHeader> log_header = ReadHeader(part);
 
+    const monotonic_clock::time_point monotonic_start_time(
+        chrono::nanoseconds(log_header.message().monotonic_start_time()));
+    const realtime_clock::time_point realtime_start_time(
+        chrono::nanoseconds(log_header.message().realtime_start_time()));
+
+    const std::string_view node =
+        log_header.message().has_node()
+            ? log_header.message().node()->name()->string_view()
+            : "";
+
     // Looks like an old log.  No UUID, index, and also single node.  We have
     // little to no multi-node log files in the wild without part UUIDs and
     // indexes which we care much about.
     if (!log_header.message().has_parts_uuid() &&
         !log_header.message().has_parts_index() &&
         !log_header.message().has_node()) {
-      LogPart log_part;
-      log_part.filename = part;
-      log_part.start_time = monotonic_clock::time_point(
-          chrono::nanoseconds(log_header.message().monotonic_start_time()));
       FlatbufferVector<MessageHeader> first_message = ReadNthMessage(part, 0);
-      log_part.first_message_time = monotonic_clock::time_point(
+      const monotonic_clock::time_point first_message_time(
           chrono::nanoseconds(first_message.message().monotonic_sent_time()));
-      old_parts.emplace_back(std::move(log_part));
+
+      // Find anything with a matching start time.  They all go together.
+      auto result = std::find_if(
+          old_parts.begin(), old_parts.end(),
+          [&](const UnsortedOldParts &parts) {
+            return parts.parts.monotonic_start_time == monotonic_start_time &&
+                   parts.parts.realtime_start_time == realtime_start_time;
+          });
+
+      if (result == old_parts.end()) {
+        old_parts.emplace_back();
+        old_parts.back().parts.monotonic_start_time = monotonic_start_time;
+        old_parts.back().parts.realtime_start_time = realtime_start_time;
+        old_parts.back().unsorted_parts.emplace_back(
+            std::make_pair(first_message_time, part));
+      } else {
+        result->unsorted_parts.emplace_back(
+            std::make_pair(first_message_time, part));
+      }
       continue;
     }
 
+    CHECK(log_header.message().has_logger_uuid());
     CHECK(log_header.message().has_parts_uuid());
     CHECK(log_header.message().has_parts_index());
 
+    const std::string logger_uuid = log_header.message().logger_uuid()->str();
     const std::string parts_uuid = log_header.message().parts_uuid()->str();
-    auto it = parts_list.find(parts_uuid);
-    if (it == parts_list.end()) {
-      it = parts_list
+    int32_t parts_index = log_header.message().parts_index();
+
+    auto log_it = parts_list.find(logger_uuid);
+    if (log_it == parts_list.end()) {
+      log_it = parts_list
                .insert(std::make_pair(
-                   parts_uuid, std::vector<std::pair<std::string, int>>{}))
+                   logger_uuid, std::map<std::string, UnsortedLogParts>()))
                .first;
     }
-    it->second.emplace_back(
-        std::make_pair(part, log_header.message().parts_index()));
+
+    auto it = log_it->second.find(parts_uuid);
+    if (it == log_it->second.end()) {
+      it = log_it->second.insert(std::make_pair(parts_uuid, UnsortedLogParts()))
+               .first;
+      it->second.monotonic_start_time = monotonic_start_time;
+      it->second.realtime_start_time = realtime_start_time;
+      it->second.node = std::string(node);
+    }
+
+    // First part might be min_time.  If it is, try to put a better time on it.
+    if (it->second.monotonic_start_time == monotonic_clock::min_time) {
+      it->second.monotonic_start_time = monotonic_start_time;
+    } else if (monotonic_start_time != monotonic_clock::min_time) {
+      CHECK_EQ(it->second.monotonic_start_time, monotonic_start_time);
+    }
+    if (it->second.realtime_start_time == realtime_clock::min_time) {
+      it->second.realtime_start_time = realtime_start_time;
+    } else if (realtime_start_time != realtime_clock::min_time) {
+      CHECK_EQ(it->second.realtime_start_time, realtime_start_time);
+    }
+
+    it->second.parts.emplace_back(std::make_pair(part, parts_index));
   }
 
   CHECK_NE(old_parts.empty(), parts_list.empty())
       << ": Can't have a mix of old and new parts.";
 
+  // Now repackage old_parts into the datastructure we return.
   if (!old_parts.empty()) {
-    // Confirm they all have the same start time.  Old loggers always used the
-    // same start time.
-    for (const LogPart &p : old_parts) {
-      CHECK_EQ(old_parts[0].start_time, p.start_time);
+    std::vector<LogFile> result;
+    for (UnsortedOldParts &p : old_parts) {
+      // Sort by the oldest message in each file.
+      std::sort(
+          p.unsorted_parts.begin(), p.unsorted_parts.end(),
+          [](const std::pair<monotonic_clock::time_point, std::string> &a,
+             const std::pair<monotonic_clock::time_point, std::string> &b) {
+            return a.first < b.first;
+          });
+      LogFile log_file;
+      for (std::pair<monotonic_clock::time_point, std::string> &f :
+           p.unsorted_parts) {
+        p.parts.parts.emplace_back(std::move(f.second));
+      }
+      log_file.parts.emplace_back(std::move(p.parts));
+      result.emplace_back(std::move(log_file));
     }
-    // Sort by the oldest message in each file.
-    std::sort(old_parts.begin(), old_parts.end(),
-              [](const LogPart &a, const LogPart &b) {
-                return a.first_message_time < b.first_message_time;
-              });
 
-    // Produce the final form.
-    std::vector<std::string> sorted_old_parts;
-    sorted_old_parts.reserve(old_parts.size());
-    for (LogPart &p : old_parts) {
-      sorted_old_parts.emplace_back(std::move(p.filename));
-    }
-    return std::vector<std::vector<std::string>>{std::move(sorted_old_parts)};
+    return result;
   }
 
   // Now, sort them and produce the final vector form.
-  std::vector<std::vector<std::string>> result;
+  std::vector<LogFile> result;
   result.reserve(parts_list.size());
-  for (auto &part : parts_list) {
-    std::sort(part.second.begin(), part.second.end(),
-              [](const std::pair<std::string, int> &a,
-                 const std::pair<std::string, int> &b) {
-                return a.second < b.second;
-              });
-    std::vector<std::string> result_line;
-    result_line.reserve(part.second.size());
-    for (std::pair<std::string, int> &p : part.second) {
-      result_line.emplace_back(std::move(p.first));
+  for (std::pair<const std::string, std::map<std::string, UnsortedLogParts>> &logs : parts_list) {
+    LogFile new_file;
+    new_file.logger_uuid = logs.first;
+    for (std::pair<const std::string, UnsortedLogParts> &parts : logs.second) {
+      LogParts new_parts;
+      new_parts.monotonic_start_time = parts.second.monotonic_start_time;
+      new_parts.realtime_start_time = parts.second.realtime_start_time;
+      new_parts.logger_uuid = logs.first;
+      new_parts.parts_uuid = parts.first;
+      new_parts.node = std::move(parts.second.node);
+
+      std::sort(parts.second.parts.begin(), parts.second.parts.end(),
+                [](const std::pair<std::string, int> &a,
+                   const std::pair<std::string, int> &b) {
+                  return a.second < b.second;
+                });
+      new_parts.parts.reserve(parts.second.parts.size());
+      for (std::pair<std::string, int> &p : parts.second.parts) {
+        new_parts.parts.emplace_back(std::move(p.first));
+      }
+      new_file.parts.emplace_back(std::move(new_parts));
     }
-    result.emplace_back(std::move(result_line));
+    result.emplace_back(std::move(new_file));
+  }
+  return result;
+}
+
+std::ostream &operator<<(std::ostream &stream, const LogFile &file) {
+  stream << "{";
+  if (!file.logger_uuid.empty()) {
+    stream << "\"logger_uuid\": \"" << file.logger_uuid << "\", ";
+  }
+  stream << "\"parts\": [";
+  for (size_t i = 0; i < file.parts.size(); ++i) {
+    if (i != 0u) {
+      stream << ", ";
+    }
+    stream << file.parts[i];
+  }
+  stream << "]}";
+  return stream;
+}
+std::ostream &operator<<(std::ostream &stream, const LogParts &parts) {
+  stream << "{";
+  if (!parts.logger_uuid.empty()) {
+    stream << "\"logger_uuid\": \"" << parts.logger_uuid << "\", ";
+  }
+  if (!parts.parts_uuid.empty()) {
+    stream << "\"parts_uuid\": \"" << parts.parts_uuid << "\", ";
+  }
+  if (!parts.node.empty()) {
+    stream << "\"node\": \"" << parts.node << "\", ";
+  }
+  stream << "\"monotonic_start_time\": " << parts.monotonic_start_time
+         << ", \"realtime_start_time\": " << parts.realtime_start_time << ", [";
+
+  for (size_t i = 0; i < parts.parts.size(); ++i) {
+    if (i != 0u) {
+      stream << ", ";
+    }
+    stream << parts.parts[i];
+  }
+
+  stream << "]}";
+  return stream;
+}
+
+std::vector<std::vector<std::string>> ToLogReaderVector(
+    const std::vector<LogFile> &log_files) {
+  std::vector<std::vector<std::string>> result;
+  for (const LogFile &log_file : log_files) {
+    for (const LogParts &log_parts : log_file.parts) {
+      std::vector<std::string> parts;
+      for (const std::string &part : log_parts.parts) {
+        parts.emplace_back(part);
+      }
+      result.emplace_back(std::move(parts));
+    }
   }
   return result;
 }
@@ -648,6 +786,12 @@
     : LogReader(std::vector<std::vector<std::string>>{filenames},
                 replay_configuration) {}
 
+// TODO(austin): Make this the base and kill the others.  This has much better
+// context for sorting.
+LogReader::LogReader(const std::vector<LogFile> &log_files,
+                     const Configuration *replay_configuration)
+    : LogReader(ToLogReaderVector(log_files), replay_configuration) {}
+
 LogReader::LogReader(const std::vector<std::vector<std::string>> &filenames,
                      const Configuration *replay_configuration)
     : filenames_(filenames),
@@ -721,7 +865,8 @@
   return configuration::GetNodes(remapped_configuration_);
 }
 
-monotonic_clock::time_point LogReader::monotonic_start_time(const Node *node) {
+monotonic_clock::time_point LogReader::monotonic_start_time(
+    const Node *node) const {
   State *state =
       states_[configuration::GetNodeIndex(configuration(), node)].get();
   CHECK(state != nullptr) << ": Unknown node " << FlatbufferToJson(node);
@@ -729,7 +874,8 @@
   return state->monotonic_start_time();
 }
 
-realtime_clock::time_point LogReader::realtime_start_time(const Node *node) {
+realtime_clock::time_point LogReader::realtime_start_time(
+    const Node *node) const {
   State *state =
       states_[configuration::GetNodeIndex(configuration(), node)].get();
   CHECK(state != nullptr) << ": Unknown node " << FlatbufferToJson(node);
diff --git a/aos/events/logging/logger.h b/aos/events/logging/logger.h
index a3189db..2f6f931 100644
--- a/aos/events/logging/logger.h
+++ b/aos/events/logging/logger.h
@@ -148,9 +148,43 @@
   std::vector<NodeState> node_state_;
 };
 
+// Datastructure to hold ordered parts.
+struct LogParts {
+  // Monotonic and realtime start times for this set of log files.  For log
+  // files which started out unknown and then became known, this is the known
+  // start time.
+  aos::monotonic_clock::time_point monotonic_start_time;
+  aos::realtime_clock::time_point realtime_start_time;
+
+  // UUIDs if available.
+  std::string logger_uuid;
+  std::string parts_uuid;
+
+  // The node this represents, or empty if we are in a single node world.
+  std::string node;
+
+  // Pre-sorted list of parts.
+  std::vector<std::string> parts;
+};
+
+// Datastructure to hold parts from the same run of the logger which have no
+// ordering constraints relative to each other.
+struct LogFile {
+  // The UUID tying them all together (if available).
+  std::string logger_uuid;
+
+  // All the parts, unsorted.
+  std::vector<LogParts> parts;
+};
+
+std::ostream &operator<<(std::ostream &stream, const LogFile &file);
+std::ostream &operator<<(std::ostream &stream, const LogParts &parts);
+
 // Takes a bunch of parts and sorts them based on part_uuid and part_index.
-std::vector<std::vector<std::string>> SortParts(
-    const std::vector<std::string> &parts);
+std::vector<LogFile> SortParts(const std::vector<std::string> &parts);
+
+std::vector<std::vector<std::string>> ToLogReaderVector(
+    const std::vector<LogFile> &log_files);
 
 // We end up with one of the following 3 log file types.
 //
@@ -205,6 +239,8 @@
             const Configuration *replay_configuration = nullptr);
   LogReader(const std::vector<std::vector<std::string>> &filenames,
             const Configuration *replay_configuration = nullptr);
+  LogReader(const std::vector<LogFile> &log_files,
+            const Configuration *replay_configuration = nullptr);
   ~LogReader();
 
   // Registers all the callbacks to send the log file data out on an event loop
@@ -233,7 +269,11 @@
   // SimulatedEventLoopFactory, and then get the configuration from there for
   // everything else.
   const Configuration *logged_configuration() const;
-  // Returns the configuration being used for replay.
+  // Returns the configuration being used for replay from the log file.
+  // Note that this may be different from the configuration actually used for
+  // handling events. You should generally only use this to create a
+  // SimulatedEventLoopFactory, and then get the configuration from there for
+  // everything else.
   // The pointer is invalidated whenever RemapLoggedChannel is called.
   const Configuration *configuration() const;
 
@@ -243,8 +283,10 @@
   std::vector<const Node *> Nodes() const;
 
   // Returns the starting timestamp for the log file.
-  monotonic_clock::time_point monotonic_start_time(const Node *node = nullptr);
-  realtime_clock::time_point realtime_start_time(const Node *node = nullptr);
+  monotonic_clock::time_point monotonic_start_time(
+      const Node *node = nullptr) const;
+  realtime_clock::time_point realtime_start_time(
+      const Node *node = nullptr) const;
 
   // Causes the logger to publish the provided channel on a different name so
   // that replayed applications can publish on the proper channel name without
diff --git a/aos/events/logging/logger_test.cc b/aos/events/logging/logger_test.cc
index 47fe9e2..be77f38 100644
--- a/aos/events/logging/logger_test.cc
+++ b/aos/events/logging/logger_test.cc
@@ -973,13 +973,48 @@
     event_loop_factory_.RunFor(chrono::milliseconds(2000));
   }
 
-  const std::vector<std::vector<std::string>> sorted_parts =
-      SortParts(logfiles_);
+  const std::vector<LogFile> sorted_parts = SortParts(logfiles_);
+
+  EXPECT_EQ(sorted_parts.size(), 2u);
+
+  // Count up the number of UUIDs and make sure they are what we expect as a
+  // sanity check.
+  std::set<std::string> logger_uuids;
+  std::set<std::string> parts_uuids;
+  std::set<std::string> both_uuids;
+
+  size_t missing_rt_count = 0;
+
+  for (const LogFile &log_file : sorted_parts) {
+    EXPECT_FALSE(log_file.logger_uuid.empty());
+    logger_uuids.insert(log_file.logger_uuid);
+    both_uuids.insert(log_file.logger_uuid);
+
+    for (const LogParts &part : log_file.parts) {
+      EXPECT_NE(part.monotonic_start_time, aos::monotonic_clock::min_time)
+          << ": " << part;
+      missing_rt_count += part.realtime_start_time == aos::realtime_clock::min_time;
+
+      EXPECT_TRUE(logger_uuids.find(part.logger_uuid) != logger_uuids.end());
+      EXPECT_NE(part.node, "");
+      parts_uuids.insert(part.parts_uuid);
+      both_uuids.insert(part.parts_uuid);
+    }
+  }
+
+  // We won't have RT timestamps for 5 log files.  We don't log the RT start
+  // time on remote nodes because we don't know it and would be guessing.  And
+  // the log reader can actually do a better job.
+  EXPECT_EQ(missing_rt_count, 5u);
+
+  EXPECT_EQ(logger_uuids.size(), 2u);
+  EXPECT_EQ(parts_uuids.size(), ToLogReaderVector(sorted_parts).size());
+  EXPECT_EQ(logger_uuids.size() + parts_uuids.size(), both_uuids.size());
 
   // Test that each list of parts is in order.  Don't worry about the ordering
   // between part file lists though.
   // (inner vectors all need to be in order, but outer one doesn't matter).
-  EXPECT_THAT(sorted_parts,
+  EXPECT_THAT(ToLogReaderVector(sorted_parts),
               ::testing::UnorderedElementsAreArray(structured_logfiles_));
 }