Error more gracefully in logfile sorting if 2 servers fight
If 2 computers have the same IP address, the client will flip flop
between them. This could happen if 2 laptops are configured as the
logger node and are swapped back and forth, or various other patterns.
In this case, the timestamp problem is poorly formed and ends up failing
pretty horribly. While we could try to figure out how to replay the
world during that time, in reality this isn't a problem we should be
solving. We can catch this during logfile sorting and explode with a
much better error.
Connected to node A
Connected to node B
10sec - 20sec
20sec - 30sec
30sec - 40sec
In the above example, we are printing out the times on the logger node
which stays booted through this whole event. If we track the times we
receive unreliable messages on the logger node, we can see that the
boots overlap. A has time from 10-40, and B has time from 20-30. So
there's overlap so we have this rebooting case.
Change-Id: I24746625c5fc4838a97ec7929cf1bd9ad6d7ffd6
Signed-off-by: Austin Schuh <austin.linux@gmail.com>
diff --git a/aos/events/logging/logfile_sorting.cc b/aos/events/logging/logfile_sorting.cc
index 7eb17a0..7f98fa8 100644
--- a/aos/events/logging/logfile_sorting.cc
+++ b/aos/events/logging/logfile_sorting.cc
@@ -295,12 +295,13 @@
// Map from a observed pair of boots to the associated timestamps.
// logger_node -> logger_node_boot_uuid -> destination_node index ->
- // destination_boot_uuid -> times.
+ // destination_boot_uuid -> list of all times from all parts.
absl::btree_map<
std::string,
absl::btree_map<
std::string,
- absl::btree_map<size_t, absl::btree_map<std::string, BootPairTimes>>>>
+ absl::btree_map<size_t, absl::btree_map<std::string,
+ std::vector<BootPairTimes>>>>>
boot_times;
// Map holding the log_event_uuid -> second map. The second map holds the
@@ -571,7 +572,8 @@
std::string,
absl::btree_map<
size_t,
- absl::btree_map<std::string, BootPairTimes>>>())
+ absl::btree_map<std::string,
+ std::vector<BootPairTimes>>>>())
.first;
}
auto source_boot_times_it =
@@ -583,7 +585,8 @@
.emplace(
logger_boot_uuid,
absl::btree_map<
- size_t, absl::btree_map<std::string, BootPairTimes>>())
+ size_t, absl::btree_map<std::string,
+ std::vector<BootPairTimes>>>())
.first;
}
@@ -646,7 +649,8 @@
destination_boot_times_it =
source_boot_times_it->second
.emplace(node_index,
- absl::btree_map<std::string, BootPairTimes>())
+ absl::btree_map<std::string,
+ std::vector<BootPairTimes>>())
.first;
}
@@ -656,7 +660,17 @@
if (boot_times_it == destination_boot_times_it->second.end()) {
// We have a new boot UUID pairing. Copy over the data we have.
destination_boot_times_it->second.emplace(
- boot_uuid,
+ boot_uuid, std::vector<BootPairTimes>{BootPairTimes{
+ .oldest_remote_monotonic_timestamp =
+ oldest_remote_monotonic_timestamp,
+ .oldest_local_monotonic_timestamp =
+ oldest_local_monotonic_timestamp,
+ .oldest_remote_unreliable_monotonic_timestamp =
+ oldest_remote_unreliable_monotonic_timestamp,
+ .oldest_local_unreliable_monotonic_timestamp =
+ oldest_local_unreliable_monotonic_timestamp}});
+ } else {
+ boot_times_it->second.emplace_back(
BootPairTimes{.oldest_remote_monotonic_timestamp =
oldest_remote_monotonic_timestamp,
.oldest_local_monotonic_timestamp =
@@ -665,24 +679,6 @@
oldest_remote_unreliable_monotonic_timestamp,
.oldest_local_unreliable_monotonic_timestamp =
oldest_local_unreliable_monotonic_timestamp});
- } else {
- // If we found an existing entry, update the min to be the min of all
- // records. This lets us combine info from multiple part files.
- if (oldest_remote_monotonic_timestamp <
- boot_times_it->second.oldest_remote_monotonic_timestamp) {
- boot_times_it->second.oldest_remote_monotonic_timestamp =
- oldest_remote_monotonic_timestamp;
- boot_times_it->second.oldest_local_monotonic_timestamp =
- oldest_local_monotonic_timestamp;
- }
- if (oldest_remote_unreliable_monotonic_timestamp <
- boot_times_it->second
- .oldest_remote_unreliable_monotonic_timestamp) {
- boot_times_it->second.oldest_remote_unreliable_monotonic_timestamp =
- oldest_remote_unreliable_monotonic_timestamp;
- boot_times_it->second.oldest_local_unreliable_monotonic_timestamp =
- oldest_local_unreliable_monotonic_timestamp;
- }
}
}
}
@@ -800,38 +796,139 @@
// Now, we have a bunch of remote boots for the same local boot and
// remote node. We want to sort them by observed local time. This will
- // tell us which ones happened first.
- std::vector<std::pair<std::string, BootPairTimes>> source_boot_times;
- for (const auto &boot_time : source_nodes.second) {
- source_boot_times.emplace_back(boot_time);
+ // tell us which ones happened first. Hold on to the max time on that
+ // node too so we can check for overlapping boots.
+ std::vector<std::tuple<std::string, BootPairTimes, BootPairTimes>>
+ source_boot_times;
+ for (const auto &boot_time_list : source_nodes.second) {
+ // Track the first boot time we have evidence of.
+ BootPairTimes boot_time = boot_time_list.second[0];
+ // And the last one so we can look for overlapping boots.
+ BootPairTimes max_boot_time = boot_time_list.second[0];
+ for (size_t i = 0; i < boot_time_list.second.size(); ++i) {
+ const BootPairTimes &next_boot_time = boot_time_list.second[i];
+ if (next_boot_time.oldest_local_unreliable_monotonic_timestamp !=
+ aos::monotonic_clock::max_time) {
+ VLOG(1)
+ << "Remote time "
+ << next_boot_time.oldest_remote_unreliable_monotonic_timestamp
+ << " " << boot_time_list.first;
+ VLOG(1)
+ << "Local time "
+ << next_boot_time.oldest_local_unreliable_monotonic_timestamp
+ << " " << boot_time_list.first;
+ }
+ // If we found an existing entry, update the min to be the min of
+ // all records. This lets us combine info from multiple part files.
+ if (next_boot_time.oldest_remote_monotonic_timestamp <
+ boot_time.oldest_remote_monotonic_timestamp) {
+ boot_time.oldest_remote_monotonic_timestamp =
+ next_boot_time.oldest_remote_monotonic_timestamp;
+ boot_time.oldest_local_monotonic_timestamp =
+ next_boot_time.oldest_local_monotonic_timestamp;
+ }
+ if ((next_boot_time.oldest_remote_monotonic_timestamp >
+ max_boot_time.oldest_remote_monotonic_timestamp ||
+ max_boot_time.oldest_remote_monotonic_timestamp ==
+ aos::monotonic_clock::max_time) &&
+ next_boot_time.oldest_remote_monotonic_timestamp !=
+ aos::monotonic_clock::max_time) {
+ max_boot_time.oldest_remote_monotonic_timestamp =
+ next_boot_time.oldest_remote_monotonic_timestamp;
+ max_boot_time.oldest_local_monotonic_timestamp =
+ next_boot_time.oldest_local_monotonic_timestamp;
+ }
+ if (next_boot_time.oldest_remote_unreliable_monotonic_timestamp <
+ boot_time.oldest_remote_unreliable_monotonic_timestamp) {
+ boot_time.oldest_remote_unreliable_monotonic_timestamp =
+ next_boot_time.oldest_remote_unreliable_monotonic_timestamp;
+ boot_time.oldest_local_unreliable_monotonic_timestamp =
+ next_boot_time.oldest_local_unreliable_monotonic_timestamp;
+ }
+ if ((next_boot_time.oldest_remote_unreliable_monotonic_timestamp >
+ max_boot_time
+ .oldest_remote_unreliable_monotonic_timestamp ||
+ max_boot_time.oldest_remote_unreliable_monotonic_timestamp ==
+ aos::monotonic_clock::max_time) &&
+ next_boot_time.oldest_remote_unreliable_monotonic_timestamp !=
+ aos::monotonic_clock::max_time) {
+ max_boot_time.oldest_remote_unreliable_monotonic_timestamp =
+ next_boot_time.oldest_remote_unreliable_monotonic_timestamp;
+ max_boot_time.oldest_local_unreliable_monotonic_timestamp =
+ next_boot_time.oldest_local_unreliable_monotonic_timestamp;
+ }
+ }
+ source_boot_times.emplace_back(
+ std::make_tuple(boot_time_list.first, boot_time, max_boot_time));
}
std::sort(
source_boot_times.begin(), source_boot_times.end(),
- [](const std::pair<std::string, BootPairTimes> &a,
- const std::pair<std::string, BootPairTimes> &b) {
+ [](const std::tuple<std::string, BootPairTimes, BootPairTimes> &a,
+ const std::tuple<std::string, BootPairTimes, BootPairTimes> &b) {
// There are cases where we will only have a reliable timestamp.
// In that case, we need to use oldest_local_monotonic_timestamp.
// But, that may result in collisions if the same message gets
// forwarded to both boots, so it will have the same timestamp.
// Solve that by breaking the tie with the unreliable messages.
- if (a.second.oldest_local_monotonic_timestamp ==
- b.second.oldest_local_monotonic_timestamp) {
- CHECK_NE(a.second.oldest_local_unreliable_monotonic_timestamp,
- b.second.oldest_local_unreliable_monotonic_timestamp);
- return a.second.oldest_local_unreliable_monotonic_timestamp <
- b.second.oldest_local_unreliable_monotonic_timestamp;
+ if (std::get<1>(a).oldest_local_monotonic_timestamp ==
+ std::get<1>(b).oldest_local_monotonic_timestamp) {
+ CHECK_NE(
+ std::get<1>(a).oldest_local_unreliable_monotonic_timestamp,
+ std::get<1>(b).oldest_local_unreliable_monotonic_timestamp);
+ return std::get<1>(a)
+ .oldest_local_unreliable_monotonic_timestamp <
+ std::get<1>(b)
+ .oldest_local_unreliable_monotonic_timestamp;
} else {
- return a.second.oldest_local_monotonic_timestamp <
- b.second.oldest_local_monotonic_timestamp;
+ return std::get<1>(a).oldest_local_monotonic_timestamp <
+ std::get<1>(b).oldest_local_monotonic_timestamp;
}
});
+ // The last time from the source node on the logger node.
+ // This is used to track overlapping boots since this should always
+ // increase.
+ aos::monotonic_clock::time_point last_boot_time =
+ aos::monotonic_clock::min_time;
+
// Now take our sorted list and build up constraints so we can solve.
for (size_t boot_id = 0; boot_id < source_boot_times.size();
++boot_id) {
- const std::pair<std::string, BootPairTimes> &boot_time =
- source_boot_times[boot_id];
- const std::string &source_boot_uuid = boot_time.first;
+ const std::tuple<std::string, BootPairTimes, BootPairTimes>
+ &boot_time = source_boot_times[boot_id];
+ const std::string &source_boot_uuid = std::get<0>(boot_time);
+
+ // Enforce that the last time observed in the headers on the previous
+ // boot is less than the first time on the next boot. This equates to
+ // there being no overlap between the two boots.
+ if (std::get<1>(boot_time)
+ .oldest_local_unreliable_monotonic_timestamp <
+ last_boot_time) {
+ for (size_t fatal_boot_id = 0;
+ fatal_boot_id < source_boot_times.size(); ++fatal_boot_id) {
+ const std::tuple<std::string, BootPairTimes, BootPairTimes>
+ &fatal_boot_time = source_boot_times[fatal_boot_id];
+ const std::string &fatal_source_boot_uuid =
+ std::get<0>(fatal_boot_time);
+ LOG(ERROR) << "Boot " << fatal_boot_id << ", "
+ << fatal_source_boot_uuid << " on " << source_node_name
+ << " spans ["
+ << std::get<1>(fatal_boot_time)
+ .oldest_local_unreliable_monotonic_timestamp
+ << ", "
+ << std::get<2>(fatal_boot_time)
+ .oldest_local_unreliable_monotonic_timestamp
+ << "] on logger " << logger_node_name;
+ }
+ LOG(FATAL) << "Found overlapping boots on " << source_node_name
+ << " logged on " << logger_node_name << ", "
+ << std::get<1>(boot_time)
+ .oldest_local_unreliable_monotonic_timestamp
+ << " < " << last_boot_time;
+ }
+
+ last_boot_time = std::get<2>(boot_time)
+ .oldest_local_unreliable_monotonic_timestamp;
auto source_node_boot_constraints_it =
boot_constraints.find(source_node_name);
@@ -854,9 +951,9 @@
&per_node_boot_constraints =
source_node_boot_constraints_it->second.constraints;
- const std::pair<std::string, BootPairTimes> &prior_boot_time =
- source_boot_times[boot_id - 1];
- const std::string &prior_boot_uuid = prior_boot_time.first;
+ const std::tuple<std::string, BootPairTimes, BootPairTimes>
+ &prior_boot_time = source_boot_times[boot_id - 1];
+ const std::string &prior_boot_uuid = std::get<0>(prior_boot_time);
auto first_per_boot_constraints =
per_node_boot_constraints.find(prior_boot_uuid);
diff --git a/aos/events/logging/logfile_utils_test.cc b/aos/events/logging/logfile_utils_test.cc
index 9be70c6..c9ec37e 100644
--- a/aos/events/logging/logfile_utils_test.cc
+++ b/aos/events/logging/logfile_utils_test.cc
@@ -2506,6 +2506,220 @@
}
}
+class SortingDeathTest : public SortingElementTest {
+ public:
+ SortingDeathTest()
+ : SortingElementTest(),
+ part0_(MakeHeader(config_, R"({
+ /* 100ms */
+ "max_out_of_order_duration": 100000000,
+ "node": {
+ "name": "pi1"
+ },
+ "logger_node": {
+ "name": "pi1"
+ },
+ "monotonic_start_time": 1000000,
+ "realtime_start_time": 1000000000000,
+ "logger_monotonic_start_time": 1000000,
+ "logger_realtime_start_time": 1000000000000,
+ "log_event_uuid": "30ef1283-81d7-4004-8c36-1c162dbcb2b2",
+ "parts_uuid": "ee4f5a98-77d0-4e01-af2f-bbb29e098ede",
+ "parts_index": 0,
+ "logger_instance_uuid": "1c3142ad-10a5-408d-a760-b63b73d3b904",
+ "logger_node_boot_uuid": "a570df8b-5cc2-4dbe-89bd-286f9ddd02b7",
+ "source_node_boot_uuid": "a570df8b-5cc2-4dbe-89bd-286f9ddd02b7",
+ "boot_uuids": [
+ "a570df8b-5cc2-4dbe-89bd-286f9ddd02b7",
+ "6ba4f28d-21a2-4d7f-83f4-ee365cf86464",
+ ""
+ ],
+ "oldest_remote_monotonic_timestamps": [
+ 9223372036854775807,
+ 9223372036854775807,
+ 9223372036854775807
+ ],
+ "oldest_local_monotonic_timestamps": [
+ 9223372036854775807,
+ 9223372036854775807,
+ 9223372036854775807
+ ],
+ "oldest_remote_unreliable_monotonic_timestamps": [
+ 9223372036854775807,
+ 0,
+ 9223372036854775807
+ ],
+ "oldest_local_unreliable_monotonic_timestamps": [
+ 9223372036854775807,
+ 0,
+ 9223372036854775807
+ ]
+})")),
+ part1_(MakeHeader(config_, R"({
+ /* 100ms */
+ "max_out_of_order_duration": 100000000,
+ "node": {
+ "name": "pi1"
+ },
+ "logger_node": {
+ "name": "pi1"
+ },
+ "monotonic_start_time": 1000000,
+ "realtime_start_time": 1000000000000,
+ "logger_monotonic_start_time": 1000000,
+ "logger_realtime_start_time": 1000000000000,
+ "log_event_uuid": "30ef1283-81d7-4004-8c36-1c162dbcb2b2",
+ "parts_uuid": "ee4f5a98-77d0-4e01-af2f-bbb29e098ede",
+ "parts_index": 1,
+ "logger_instance_uuid": "1c3142ad-10a5-408d-a760-b63b73d3b904",
+ "logger_node_boot_uuid": "a570df8b-5cc2-4dbe-89bd-286f9ddd02b7",
+ "source_node_boot_uuid": "a570df8b-5cc2-4dbe-89bd-286f9ddd02b7",
+ "boot_uuids": [
+ "a570df8b-5cc2-4dbe-89bd-286f9ddd02b7",
+ "b728d27a-9181-4eac-bfc1-5d09b80469d2",
+ ""
+ ],
+ "oldest_remote_monotonic_timestamps": [
+ 9223372036854775807,
+ 9223372036854775807,
+ 9223372036854775807
+ ],
+ "oldest_local_monotonic_timestamps": [
+ 9223372036854775807,
+ 9223372036854775807,
+ 9223372036854775807
+ ],
+ "oldest_remote_unreliable_monotonic_timestamps": [
+ 9223372036854775807,
+ 100000,
+ 9223372036854775807
+ ],
+ "oldest_local_unreliable_monotonic_timestamps": [
+ 9223372036854775807,
+ 100000,
+ 9223372036854775807
+ ]
+})")),
+ part2_(MakeHeader(config_, R"({
+ /* 100ms */
+ "max_out_of_order_duration": 100000000,
+ "node": {
+ "name": "pi1"
+ },
+ "logger_node": {
+ "name": "pi1"
+ },
+ "monotonic_start_time": 1000000,
+ "realtime_start_time": 1000000000000,
+ "logger_monotonic_start_time": 1000000,
+ "logger_realtime_start_time": 1000000000000,
+ "log_event_uuid": "30ef1283-81d7-4004-8c36-1c162dbcb2b2",
+ "parts_uuid": "ee4f5a98-77d0-4e01-af2f-bbb29e098ede",
+ "parts_index": 2,
+ "logger_instance_uuid": "1c3142ad-10a5-408d-a760-b63b73d3b904",
+ "logger_node_boot_uuid": "a570df8b-5cc2-4dbe-89bd-286f9ddd02b7",
+ "source_node_boot_uuid": "a570df8b-5cc2-4dbe-89bd-286f9ddd02b7",
+ "boot_uuids": [
+ "a570df8b-5cc2-4dbe-89bd-286f9ddd02b7",
+ "6ba4f28d-21a2-4d7f-83f4-ee365cf86464",
+ ""
+ ],
+ "oldest_remote_monotonic_timestamps": [
+ 9223372036854775807,
+ 9223372036854775807,
+ 9223372036854775807
+ ],
+ "oldest_local_monotonic_timestamps": [
+ 9223372036854775807,
+ 9223372036854775807,
+ 9223372036854775807
+ ],
+ "oldest_remote_unreliable_monotonic_timestamps": [
+ 9223372036854775807,
+ 200000,
+ 9223372036854775807
+ ],
+ "oldest_local_unreliable_monotonic_timestamps": [
+ 9223372036854775807,
+ 200000,
+ 9223372036854775807
+ ]
+})")),
+ part3_(MakeHeader(config_, R"({
+ /* 100ms */
+ "max_out_of_order_duration": 100000000,
+ "node": {
+ "name": "pi1"
+ },
+ "logger_node": {
+ "name": "pi1"
+ },
+ "monotonic_start_time": 1000000,
+ "realtime_start_time": 1000000000000,
+ "logger_monotonic_start_time": 1000000,
+ "logger_realtime_start_time": 1000000000000,
+ "log_event_uuid": "30ef1283-81d7-4004-8c36-1c162dbcb2b2",
+ "parts_uuid": "ee4f5a98-77d0-4e01-af2f-bbb29e098ede",
+ "parts_index": 3,
+ "logger_instance_uuid": "1c3142ad-10a5-408d-a760-b63b73d3b904",
+ "logger_node_boot_uuid": "a570df8b-5cc2-4dbe-89bd-286f9ddd02b7",
+ "source_node_boot_uuid": "a570df8b-5cc2-4dbe-89bd-286f9ddd02b7",
+ "boot_uuids": [
+ "a570df8b-5cc2-4dbe-89bd-286f9ddd02b7",
+ "b728d27a-9181-4eac-bfc1-5d09b80469d2",
+ ""
+ ],
+ "oldest_remote_monotonic_timestamps": [
+ 9223372036854775807,
+ 9223372036854775807,
+ 9223372036854775807
+ ],
+ "oldest_local_monotonic_timestamps": [
+ 9223372036854775807,
+ 9223372036854775807,
+ 9223372036854775807
+ ],
+ "oldest_remote_unreliable_monotonic_timestamps": [
+ 9223372036854775807,
+ 300000,
+ 9223372036854775807
+ ],
+ "oldest_local_unreliable_monotonic_timestamps": [
+ 9223372036854775807,
+ 300000,
+ 9223372036854775807
+ ]
+})")) {}
+
+ protected:
+ const aos::SizePrefixedFlatbufferDetachedBuffer<LogFileHeader> part0_;
+ const aos::SizePrefixedFlatbufferDetachedBuffer<LogFileHeader> part1_;
+ const aos::SizePrefixedFlatbufferDetachedBuffer<LogFileHeader> part2_;
+ const aos::SizePrefixedFlatbufferDetachedBuffer<LogFileHeader> part3_;
+};
+
+// Tests that if 2 computers go back and forth trying to be the same node, we
+// die in sorting instead of failing to estimate time.
+TEST_F(SortingDeathTest, FightingNodes) {
+ {
+ DetachedBufferWriter writer0(logfile0_, std::make_unique<DummyEncoder>());
+ writer0.QueueSpan(part0_.span());
+ DetachedBufferWriter writer1(logfile1_, std::make_unique<DummyEncoder>());
+ writer1.QueueSpan(part1_.span());
+ DetachedBufferWriter writer2(logfile2_, std::make_unique<DummyEncoder>());
+ writer2.QueueSpan(part2_.span());
+ DetachedBufferWriter writer3(logfile3_, std::make_unique<DummyEncoder>());
+ writer3.QueueSpan(part3_.span());
+ }
+
+ EXPECT_DEATH(
+ {
+ const std::vector<LogFile> parts =
+ SortParts({logfile0_, logfile1_, logfile2_, logfile3_});
+ },
+ "Found overlapping boots on");
+}
+
} // namespace testing
} // namespace logger
} // namespace aos