Support graceful logging failures when running out of disk space

Running out of disk space is the most obvious failure mode when logging,
so let the higher-level code report a status instead of just
crash-looping.

Change-Id: Iff223fd8b6b0f7f4b21d154a4dda5cce80fa6af2
diff --git a/aos/events/logging/logfile_utils.cc b/aos/events/logging/logfile_utils.cc
index 7ddc658..5226154 100644
--- a/aos/events/logging/logfile_utils.cc
+++ b/aos/events/logging/logfile_utils.cc
@@ -46,12 +46,11 @@
 }
 
 DetachedBufferWriter::~DetachedBufferWriter() {
-  encoder_->Finish();
-  while (encoder_->queue_size() > 0) {
-    Flush();
+  Close();
+  if (ran_out_of_space_) {
+    CHECK(acknowledge_ran_out_of_space_)
+        << ": Unacknowledged out of disk space, log file was not completed";
   }
-  PLOG_IF(ERROR, close(fd_) == -1) << " Failed to close logfile";
-  VLOG(1) << "Closed " << filename_;
 }
 
 DetachedBufferWriter::DetachedBufferWriter(DetachedBufferWriter &&other) {
@@ -66,6 +65,8 @@
   std::swap(filename_, other.filename_);
   std::swap(encoder_, other.encoder_);
   std::swap(fd_, other.fd_);
+  std::swap(ran_out_of_space_, other.ran_out_of_space_);
+  std::swap(acknowledge_ran_out_of_space_, other.acknowledge_ran_out_of_space_);
   std::swap(iovec_, other.iovec_);
   std::swap(max_write_time_, other.max_write_time_);
   std::swap(max_write_time_bytes_, other.max_write_time_bytes_);
@@ -83,6 +84,14 @@
     // syscall to write the data immediately instead of copying it to
     // enqueue.
 
+    if (ran_out_of_space_) {
+      // We don't want any later data to be written after space becomes
+      // available, so refuse to write anything more once we've dropped data
+      // because we ran out of space.
+      VLOG(1) << "Ignoring span: " << span.size();
+      return;
+    }
+
     // First, flush everything.
     while (encoder_->queue_size() > 0u) {
       Flush();
@@ -92,9 +101,7 @@
     const auto start = aos::monotonic_clock::now();
     const ssize_t written = write(fd_, span.data(), span.size());
     const auto end = aos::monotonic_clock::now();
-    PCHECK(written >= 0) << ": write failed";
-    CHECK_EQ(written, static_cast<ssize_t>(span.size()))
-        << ": Wrote " << written << " expected " << span.size();
+    HandleWriteReturn(written, span.size());
     UpdateStatsForWrite(end - start, written, 1);
   } else {
     encoder_->Encode(CopySpanAsDetachedBuffer(span));
@@ -103,11 +110,38 @@
   FlushAtThreshold();
 }
 
+void DetachedBufferWriter::Close() {
+  if (fd_ == -1) {
+    return;
+  }
+  encoder_->Finish();
+  while (encoder_->queue_size() > 0) {
+    Flush();
+  }
+  if (close(fd_) == -1) {
+    if (errno == ENOSPC) {
+      ran_out_of_space_ = true;
+    } else {
+      PLOG(ERROR) << "Closing log file failed";
+    }
+  }
+  fd_ = -1;
+  VLOG(1) << "Closed " << filename_;
+}
+
 void DetachedBufferWriter::Flush() {
   const auto queue = encoder_->queue();
   if (queue.empty()) {
     return;
   }
+  if (ran_out_of_space_) {
+    // We don't want any later data to be written after space becomes available,
+    // so refuse to write anything more once we've dropped data because we ran
+    // out of space.
+    VLOG(1) << "Ignoring queue: " << queue.size();
+    encoder_->Clear(queue.size());
+    return;
+  }
 
   iovec_.clear();
   const size_t iovec_size = std::min<size_t>(queue.size(), IOV_MAX);
@@ -122,16 +156,30 @@
   const auto start = aos::monotonic_clock::now();
   const ssize_t written = writev(fd_, iovec_.data(), iovec_.size());
   const auto end = aos::monotonic_clock::now();
-  PCHECK(written >= 0) << ": writev failed";
-  // TODO(austin): Handle partial writes in some way other than crashing...
-  CHECK_EQ(written, static_cast<ssize_t>(counted_size))
-      << ": Wrote " << written << " expected " << counted_size;
+  HandleWriteReturn(written, counted_size);
 
   encoder_->Clear(iovec_size);
 
   UpdateStatsForWrite(end - start, written, iovec_size);
 }
 
+void DetachedBufferWriter::HandleWriteReturn(ssize_t write_return,
+                                             size_t write_size) {
+  if (write_return == -1 && errno == ENOSPC) {
+    ran_out_of_space_ = true;
+    return;
+  }
+  PCHECK(write_return >= 0) << ": write failed";
+  if (write_return < static_cast<ssize_t>(write_size)) {
+    // A short write sometimes happens instead of ENOSPC. On a real filesystem,
+    // it never seems to happen in any other case. If we ever want to log to a
+    // socket, short writes will be more common. However, until we get there,
+    // we'll just assume a short write means we ran out of space.
+    ran_out_of_space_ = true;
+    return;
+  }
+}
+
 void DetachedBufferWriter::UpdateStatsForWrite(
     aos::monotonic_clock::duration duration, ssize_t written, int iovec_size) {
   if (duration > max_write_time_) {