// blob: ec1a45018b8be4a056297fc92b3730caef734d3a [file] [log] [blame]
// LogFileHeader below references aos.Configuration and Node; presumably both
// are declared in this included schema -- confirm against configuration.fbs.
include "aos/configuration.fbs";
// Every table declared in this file lives in the aos.logger namespace.
namespace aos.logger;
// A log file is a sequence of size prefixed flatbuffers.
// The first flatbuffer will be the LogFileHeader, followed by an arbitrary
// number of MessageHeaders.
//
// The log file starts at the time demarcated in the header on the monotonic
// clock. There will be any number of messages per channel logged before the
// start time. These messages are logged so that fetchers can retrieve the
// state of the system at the start of the logfile for things like capturing
// parameters. In replay, they should be made available to fetchers, but not
// trigger watchers.
table LogFileHeader {
  // Monotonic clock time, in nanoseconds, at which this log file started.
  //
  // When the start time isn't known (the log file is being recorded from
  // another node where we don't know the time offset), both start timestamps
  // will be min_time.
  //
  // Data from before the start times (e.g., fetched message data) may be
  // present in this file, but the file should guarantee that all data
  // *after* start_time is present until the end of the file.  (Note that the
  // very end of a log may still hold incomplete data if it was truncated
  // poorly.)
  //
  // These timestamps are from the perspective of `node`.
  monotonic_start_time:int64 = -9223372036854775808 (id: 0);

  // Realtime clock time, in nanoseconds, at which this log file started.
  // Only populated if logger_node == node.
  realtime_start_time:int64 = -9223372036854775808 (id: 1);

  // Messages are not written to disk in order.  They will be out of order by
  // at most this duration (in nanoseconds).  A log reader which buffers until
  // it finds messages this much newer than its simulation time will never
  // find a message out of order.
  max_out_of_order_duration:int64 (id: 2);

  // The configuration of the channels.  A log file with just this field
  // filled out is valid: that is a config-only file, which other files point
  // to using configuration_sha256 and optionally configuration_path.
  configuration:aos.Configuration (id: 3);

  // sha256 of the configuration used.  When this is set, configuration will
  // not be set.
  configuration_sha256:string (id: 16);

  // Name of the device which this log file is for.
  name:string (id: 4);

  // The current node, if known and running in a multi-node configuration.
  node:Node (id: 5);

  // All UUIDs are uuid4.

  // A log file is made up of a bunch of log files and parts.  These build up
  // a tree.  Every .bfbs file has a LogFileHeader at the start.
  //
  //  /-- basename_pi1_data.part0.bfbs, basename_pi1_data.part1.bfbs, etc.
  // ---- basename_timestamps/pi1/aos/remote_timestamps/pi2/aos.logger.MessageHeader.part0.bfbs, etc.
  //  \-- basename_pi2_data/pi2/aos/aos.message_bridge.Timestamp.part0.bfbs, etc.

  // Shared by all log files and parts from a single logging event.  This
  // should be all the files generated on a single node.  Used to correlate
  // files recorded together.
  log_event_uuid:string (id: 6);

  // Shared by all log parts generated by a single Logger instance.
  logger_instance_uuid:string (id: 10);

  // Shared by all log parts generated on a single node while it is booted.
  // It also matches with the one used by systemd.
  logger_node_boot_uuid:string (id: 11);

  // Empty if we didn't have one at the time.
  source_node_boot_uuid:string (id: 13);

  // Timestamps that this logfile started at on the logger's clocks.  This is
  // mostly useful when trying to deduce the order of node reboots.  These
  // timestamps don't change on reboot, so they can't be used reliably.
  logger_monotonic_start_time:int64 = -9223372036854775808 (id: 14);
  logger_realtime_start_time:int64 = -9223372036854775808 (id: 15);

  // Shared by all log events across all nodes produced by a single
  // high-level start event.
  log_start_uuid:string (id: 12);

  // Part files which go together all have headers.  When creating a log file
  // with multiple parts, the logger should stop writing to part n-1 as soon
  // as it starts writing to part n, and write messages as though there was
  // just 1 big file.  Therefore, part files won't be self standing, since
  // they won't have data fetched at the beginning.
  //
  // If data is logged before the time offset can be established to the other
  // node, the start time will be monotonic_clock::min_time, and a new part
  // file will be created when the start time is known.

  // All the parts which go together have the same uuid.
  parts_uuid:string (id: 7);

  // Position of this part in the sequence of parts.  The index should start
  // at 0.
  parts_index:int32 (id: 8);

  // The node the data was logged on, if known and running in a multi-node
  // configuration.
  logger_node:Node (id: 9);

  // The boot UUIDs for all nodes we know them for, or "" for the ones we
  // don't.
  boot_uuids:[string] (id: 17);

  // Timestamps that the header on this part file was written on the logger
  // node.
  logger_part_monotonic_start_time:int64 = -9223372036854775808 (id: 18);
  logger_part_realtime_start_time:int64 = -9223372036854775808 (id: 19);

  // The timestamp vectors below provide summary information about the oldest
  // messages we know which crossed the network.  The goal is to enable log
  // file sorting to determine the order of all boots by observing
  // corresponding times across the network and using those to determine
  // constraints so we can sort a DAG.
  //
  // There are 5 main cases.  Let's say we have 2 channels: /r which is
  // reliable, and /u which isn't, both sent from the same remote node.  The
  // examples below are listed as the remote node sending the message, and
  // then the local node receiving and logging the message.
  //
  // case 0:  /r -> boot 0 received on boot 0.
  //          /u -> boot 0 received on boot 0.
  //          We log for a bit, then the remote reboots.
  //          /r -> boot 1 received on boot 0.
  //          /u -> boot 1 received on boot 0.
  //
  // case 1:  /r -> boot 0 received on boot 0.
  //          /u -> boot 1 received on boot 0.
  //          We start logging after both messages arrive.
  //
  // case 2:  /r -> boot 0 received on boot 0.
  //          /u -> boot 0 received on boot 0.
  //          We log for a bit, then reboot.  More messages show up when we
  //          reconnect.
  //          /r -> boot 0 received on boot 1.
  //          /u -> boot 0 received on boot 1.
  //
  // case 3:  /u -> boot 0 received on boot 0.
  //          /r -> boot 1 received on boot 0.
  //          /u -> boot 1 received on boot 0.
  //          We start logging after all three messages arrive.
  //
  // case 4:  /u -> boot 0 received on boot 0.
  //          /r -> boot 1 received on boot 0.
  //
  // In case 0, we have all the messages showing up and a reboot of the
  // remote.
  //
  // In case 1: we only have a reliable timestamp from boot 0, but that
  // reliable timestamp makes it clear that /r was before /u, so boot 0 was
  // before boot 1.
  //
  // In case 2: we have the same reliable timestamp, so that tells us nothing.
  // The unreliable timestamps though tell a different story.  /u will be
  // after /r, since any messages on /u generated before the reboot won't get
  // delivered.  So, we get an ordering constraint saying that any sent /u's
  // on the second boot were after /u on the first boot.
  //
  // In case 3: we only got the reliable message on the second boot for some
  // reason.  Reliable messages aren't 100% reliable.  In this case, the
  // reliable timestamps are actually a distraction and are misleading.  We
  // want to use the unreliable timestamps here.
  //
  // In case 4: we have utter madness...
  //
  // We expect the nominal case to be case 0, or the first half of case 0 if
  // there are no reboots.
  //
  // We believe that any other cases are covered by the same mechanism.
  // TODO(austin/brian): Shore up this and capture the cases that are 100%
  // ambiguous and we can't sort.

  // For all channels sent from a specific node, these vectors hold the
  // timestamp of the oldest known message from that node, and the
  // corresponding monotonic timestamp for when that was received on our node.
  //
  // The local node is the node that this log file is from the perspective of
  // (the `node` field, id: 5).
  corrupted_oldest_remote_monotonic_timestamps:[int64] (id: 20, deprecated);
  corrupted_oldest_local_monotonic_timestamps:[int64] (id: 21, deprecated);
  oldest_remote_monotonic_timestamps:[int64] (id: 24);
  oldest_local_monotonic_timestamps:[int64] (id: 25);

  // The same quantity, recorded for all channels *excluding* the reliable
  // channels (ttl == 0).
  corrupted_oldest_remote_unreliable_monotonic_timestamps:[int64] (id: 22, deprecated);
  corrupted_oldest_local_unreliable_monotonic_timestamps:[int64] (id: 23, deprecated);
  oldest_remote_unreliable_monotonic_timestamps:[int64] (id: 26);
  oldest_local_unreliable_monotonic_timestamps:[int64] (id: 27);

  // The same quantity, recorded for all channels *excluding* the unreliable
  // channels (ttl != 0).
  oldest_remote_reliable_monotonic_timestamps:[int64] (id: 28);
  oldest_local_reliable_monotonic_timestamps:[int64] (id: 29);

  // For all the remote timestamps which come back to the logger.  The
  // "local" time here is the logger node boot, and "remote" is the node which
  // sent the timestamps.
  oldest_logger_remote_unreliable_monotonic_timestamps:[int64] (id: 30);
  oldest_logger_local_unreliable_monotonic_timestamps:[int64] (id: 31);

  // Logger build version.  This is normally the git sha1 that the logger was
  // built from.
  logger_sha1:string (id: 32);

  // Logger textual version.  This is normally the release name stamped into
  // the binary.
  logger_version:string (id: 33);
}
// Table holding a single logged message and its metadata.
table MessageHeader {
  // Index into the channel datastructure in the log file header.  This
  // provides the data type.
  channel_index:uint32 (id: 0);

  // Monotonic clock time, in nanoseconds, at which this message was sent on
  // this node.
  monotonic_sent_time:int64 (id: 1);

  // Realtime clock time, in nanoseconds, at which this message was sent on
  // this node.
  realtime_sent_time:int64 (id: 2);

  // Index into the ipc queue of this message.  This should start with 0 and
  // always monotonically increment if no messages were ever lost.  It will
  // wrap at a multiple of the queue size.
  queue_index:uint32 (id: 3);

  // TODO(austin): Format?  Compressed?

  // The nested flatbuffer.
  data:[ubyte] (id: 4);

  // Monotonic clock time of the remote node, in nanoseconds, at which this
  // message was sent.
  monotonic_remote_time:int64 = -9223372036854775808 (id: 5);

  // Realtime clock time of the remote node, in nanoseconds, at which this
  // message was sent.
  realtime_remote_time:int64 = -9223372036854775808 (id: 6);

  // Queue index of this message on the remote node.
  remote_queue_index:uint32 = 4294967295 (id: 7);

  // Monotonic clock time of the logger node, in nanoseconds, at which this
  // timestamp was received.
  monotonic_timestamp_time:int64 = -9223372036854775808 (id: 8);
}
// MessageHeader (not LogFileHeader) is declared as this schema's root type.
root_type MessageHeader;