Use O_DIRECT when possible for faster log writing O_DIRECT skips the kernel buffers. On the rockpi, I get 2x the write performance for significantly less CPU utilization. It is disabled by default. Change-Id: If0938e2720286f168383ecca2401f28c1ac121b8 Signed-off-by: Austin Schuh <austin.linux@gmail.com>

commit: 4c3cdb790872ec1830b6024541278effde7151f5 [log] [tgz]
author: Austin Schuh <austin.linux@gmail.com> Sat Feb 11 15:05:26 2023 -0800
committer: Austin Schuh <austin.linux@gmail.com> Sat Feb 11 18:09:48 2023 -0800
tree: 0fda88bdd9c38d82a84a6019f1fe8f0687a7acb4
parent: fce6c7906c33d30896dedb09839e2953cd9d7c58 [diff] [blame]
diff --git a/aos/events/logging/logfile_utils.cc b/aos/events/logging/logfile_utils.cc
index 86c89a8..02a9fdd 100644
--- a/aos/events/logging/logfile_utils.cc
+++ b/aos/events/logging/logfile_utils.cc

@@ -64,6 +64,10 @@
             "corrupt message found by MessageReader be silently ignored, "
             "providing access to all uncorrupted messages in a logfile.");
 
+DEFINE_bool(direct, false,
+            "If true, write using O_DIRECT and write 512 byte aligned blocks "
+            "whenever possible.");
+
 namespace aos::logger {
 namespace {
 
@@ -81,7 +85,10 @@
 
 DetachedBufferWriter::DetachedBufferWriter(std::string_view filename,
                                            std::unique_ptr<DataEncoder> encoder)
-    : filename_(filename), encoder_(std::move(encoder)) {
+    : filename_(filename),
+      encoder_(std::move(encoder)),
+      supports_odirect_(FLAGS_direct) {
+  iovec_.reserve(10);
   if (!util::MkdirPIfSpace(filename, 0777)) {
     ran_out_of_space_ = true;
   } else {
@@ -92,10 +99,38 @@
       PCHECK(fd_ != -1) << ": Failed to open " << this->filename()
                         << " for writing";
       VLOG(1) << "Opened " << this->filename() << " for writing";
+
+      flags_ = fcntl(fd_, F_GETFL, 0);
+      PCHECK(flags_ >= 0) << ": Failed to get flags for " << this->filename();
+
+      EnableDirect();
     }
   }
 }
 
+void DetachedBufferWriter::EnableDirect() {
+  if (supports_odirect_ && !ODirectEnabled()) {
+    const int new_flags = flags_ | O_DIRECT;
+    // Track if we failed to set O_DIRECT.  Note: Austin hasn't seen this call
+    // fail.  The write call tends to fail instead.
+    if (fcntl(fd_, F_SETFL, new_flags) == -1) {
+      PLOG(WARNING) << "Failed to set O_DIRECT on " << filename();
+      supports_odirect_ = false;
+    } else {
+      VLOG(1) << "Enabled O_DIRECT on " << filename();
+      flags_ = new_flags;
+    }
+  }
+}
+
+void DetachedBufferWriter::DisableDirect() {
+  if (supports_odirect_ && ODirectEnabled()) {
+    flags_ = flags_ & (~O_DIRECT);
+    PCHECK(fcntl(fd_, F_SETFL, flags_) != -1) << ": Failed to disable O_DIRECT";
+    VLOG(1) << "Disabled O_DIRECT on " << filename();
+  }
+}
+
 DetachedBufferWriter::~DetachedBufferWriter() {
   Close();
   if (ran_out_of_space_) {
@@ -126,6 +161,10 @@
   std::swap(total_write_count_, other.total_write_count_);
   std::swap(total_write_messages_, other.total_write_messages_);
   std::swap(total_write_bytes_, other.total_write_bytes_);
+  std::swap(last_synced_bytes_, other.last_synced_bytes_);
+  std::swap(supports_odirect_, other.supports_odirect_);
+  std::swap(flags_, other.flags_);
+  std::swap(last_flush_time_, other.last_flush_time_);
   return *this;
 }
 
@@ -198,23 +237,110 @@
   }
 
   iovec_.clear();
-  const size_t iovec_size = std::min<size_t>(queue.size(), IOV_MAX);
+  size_t iovec_size = std::min<size_t>(queue.size(), IOV_MAX);
   iovec_.resize(iovec_size);
   size_t counted_size = 0;
+
+  // Ok, we now need to figure out if we were aligned, and if we were, how much
+  // of the data we are being asked to write is aligned.
+  //
+  // The file is aligned if it is a multiple of kSector in length.  The data is
+  // aligned if it's memory is kSector aligned, and the length is a multiple of
+  // kSector in length.
+  bool aligned = (total_write_bytes_ % kSector) == 0;
+  size_t write_index = 0;
   for (size_t i = 0; i < iovec_size; ++i) {
-    iovec_[i].iov_base = const_cast<uint8_t *>(queue[i].data());
-    iovec_[i].iov_len = queue[i].size();
-    counted_size += iovec_[i].iov_len;
+    iovec_[write_index].iov_base = const_cast<uint8_t *>(queue[i].data());
+
+    // Make sure the address is aligned, or give up.  This should be uncommon,
+    // but is always possible.
+    if ((reinterpret_cast<size_t>(iovec_[write_index].iov_base) &
+         (kSector - 1)) != 0) {
+      aligned = false;
+    }
+
+    // Now, see if the length is a multiple of kSector.  The goal is to figure
+    // out if/how much memory we can write out with O_DIRECT so that only the
+    // last little bit is done with non-direct IO to keep it fast.
+    iovec_[write_index].iov_len = queue[i].size();
+    if ((iovec_[write_index].iov_len % kSector) != 0) {
+      VLOG(1) << "Unaligned length on " << filename();
+      // If we've got over a sector of data to write, write it out with O_DIRECT
+      // and then continue writing the rest unaligned.
+      if (aligned && iovec_[write_index].iov_len > kSector) {
+        const size_t aligned_size =
+            iovec_[write_index].iov_len & (~(kSector - 1));
+        VLOG(1) << "Was aligned, writing last chunk rounded from "
+                << queue[i].size() << " to " << aligned_size;
+        iovec_[write_index].iov_len = aligned_size;
+
+        WriteV(iovec_.data(), i + 1, true, counted_size + aligned_size);
+
+        // Now, everything before here has been written.  Make an iovec out of
+        // the last bytes, and keep going.
+        iovec_size -= write_index;
+        iovec_.resize(iovec_size);
+        write_index = 0;
+        counted_size = 0;
+
+        iovec_[write_index].iov_base =
+            const_cast<uint8_t *>(queue[i].data() + aligned_size);
+        iovec_[write_index].iov_len = queue[i].size() - aligned_size;
+      }
+      aligned = false;
+    }
+    VLOG(1) << "Writing " << iovec_[write_index].iov_len << " to "
+            << filename();
+    counted_size += iovec_[write_index].iov_len;
+    ++write_index;
+  }
+
+  // Either write the aligned data if it is all aligned, or write the rest
+  // unaligned if we wrote aligned up above.
+  WriteV(iovec_.data(), iovec_.size(), aligned, counted_size);
+
+  encoder_->Clear(iovec_size);
+}
+
+size_t DetachedBufferWriter::WriteV(struct iovec *iovec_data, size_t iovec_size,
+                                    bool aligned, size_t counted_size) {
+  // Configure the file descriptor to match the mode we should be in.  This is
+  // safe to over-call since it only does the syscall if needed.
+  if (aligned) {
+    EnableDirect();
+  } else {
+    DisableDirect();
   }
 
   const auto start = aos::monotonic_clock::now();
-  const ssize_t written = writev(fd_, iovec_.data(), iovec_.size());
+  const ssize_t written = writev(fd_, iovec_data, iovec_size);
+
+  if (written > 0) {
+    // Flush asynchronously and force the data out of the cache.
+    sync_file_range(fd_, total_write_bytes_, written, SYNC_FILE_RANGE_WRITE);
+    if (last_synced_bytes_ != 0) {
+      // Per Linus' recommendation online on how to do fast file IO, do a
+      // blocking flush of the previous write chunk, and then tell the kernel to
+      // drop the pages from the cache.  This makes sure we can't get too far
+      // ahead.
+      sync_file_range(fd_, last_synced_bytes_,
+                      total_write_bytes_ - last_synced_bytes_,
+                      SYNC_FILE_RANGE_WAIT_BEFORE | SYNC_FILE_RANGE_WRITE |
+                          SYNC_FILE_RANGE_WAIT_AFTER);
+      posix_fadvise(fd_, last_synced_bytes_,
+                    total_write_bytes_ - last_synced_bytes_,
+                    POSIX_FADV_DONTNEED);
+
+      last_synced_bytes_ = total_write_bytes_;
+    }
+  }
+
   const auto end = aos::monotonic_clock::now();
   HandleWriteReturn(written, counted_size);
 
-  encoder_->Clear(iovec_size);
-
   UpdateStatsForWrite(end - start, written, iovec_size);
+
+  return written;
 }
 
 void DetachedBufferWriter::HandleWriteReturn(ssize_t write_return,
@@ -223,7 +349,7 @@
     ran_out_of_space_ = true;
     return;
   }
-  PCHECK(write_return >= 0) << ": write failed";
+  PCHECK(write_return >= 0) << ": write failed, got " << write_return;
   if (write_return < static_cast<ssize_t>(write_size)) {
     // Sometimes this happens instead of ENOSPC. On a real filesystem, this
     // never seems to happen in any other case. If we ever want to log to a
commit	4c3cdb790872ec1830b6024541278effde7151f5	[log] [tgz]
author	Austin Schuh <austin.linux@gmail.com>	Sat Feb 11 15:05:26 2023 -0800
committer	Austin Schuh <austin.linux@gmail.com>	Sat Feb 11 18:09:48 2023 -0800
tree	0fda88bdd9c38d82a84a6019f1fe8f0687a7acb4
parent	fce6c7906c33d30896dedb09839e2953cd9d7c58 [diff] [blame]