Implement a workaround for thread_local on aarch64

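Some versions of lld break thread_local on aarch64
(https://bugs.llvm.org/show_bug.cgi?id=41527). There's a minimally-painful
workaround for it, so let's apply that: declare thread-local variables with
the new AOS_THREAD_LOCAL macro, which adds 0x40 alignment on aarch64 and is
plain thread_local everywhere else.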

Change-Id: I706de9e280cc15006e5767ee10d961cb2d99352c
diff --git a/aos/BUILD b/aos/BUILD
index a46911e..04c0548 100644
--- a/aos/BUILD
+++ b/aos/BUILD
@@ -535,3 +535,11 @@
         "@com_google_absl//absl/strings:str_format",
     ],
 )
+
+cc_library(
+    name = "thread_local",
+    hdrs = [
+        "thread_local.h",
+    ],
+    visibility = ["//visibility:public"],
+)
diff --git a/aos/ipc_lib/BUILD b/aos/ipc_lib/BUILD
index fed9a8a..6ee8ab4 100644
--- a/aos/ipc_lib/BUILD
+++ b/aos/ipc_lib/BUILD
@@ -12,6 +12,7 @@
     visibility = ["//visibility:public"],
     deps = [
         "//aos:macros",
+        "//aos:thread_local",
         "//aos/util:compiler_memory_barrier",
         "@com_github_google_glog//:glog",
         "@com_google_absl//absl/base",
diff --git a/aos/ipc_lib/aos_sync.cc b/aos/ipc_lib/aos_sync.cc
index d5c3d4e..34d9489 100644
--- a/aos/ipc_lib/aos_sync.cc
+++ b/aos/ipc_lib/aos_sync.cc
@@ -27,10 +27,12 @@
 #include <type_traits>
 
 #include "absl/base/call_once.h"
-#include "aos/macros.h"
-#include "aos/util/compiler_memory_barrier.h"
 #include "glog/logging.h"
 
+#include "aos/macros.h"
+#include "aos/thread_local.h"
+#include "aos/util/compiler_memory_barrier.h"
+
 using ::aos::linux_code::ipc_lib::FutexAccessorObserver;
 
 // This code was originally based on <https://www.akkadia.org/drepper/futex.pdf>,
@@ -369,7 +371,7 @@
 
 // Starts off at 0 in each new thread (because that's what it gets initialized
 // to in most of them or it gets to reset to 0 after a fork by atfork_child()).
-thread_local pid_t my_tid = 0;
+AOS_THREAD_LOCAL pid_t my_tid = 0;
 
 // Gets called before the fork(2) wrapper function returns in the child.
 void atfork_child() {
@@ -428,7 +430,7 @@
 static_assert(sizeof(aos_robust_list_head) == sizeof(robust_list_head),
               "Our aos_robust_list_head doesn't match the kernel's");
 
-thread_local aos_robust_list_head robust_head;
+AOS_THREAD_LOCAL aos_robust_list_head robust_head;
 
 // Extra offset between mutex values and where we point to for their robust list
 // entries (from SetRobustListOffset).
diff --git a/aos/libc/BUILD b/aos/libc/BUILD
index 234c29a..35b6629 100644
--- a/aos/libc/BUILD
+++ b/aos/libc/BUILD
@@ -9,6 +9,7 @@
         "aos_strsignal.h",
     ],
     deps = [
+        "//aos:thread_local",
         "@com_github_google_glog//:glog",
     ],
 )
@@ -53,6 +54,9 @@
     hdrs = [
         "aos_strerror.h",
     ],
+    deps = [
+        "//aos:thread_local",
+    ],
 )
 
 cc_test(
diff --git a/aos/libc/aos_strerror.cc b/aos/libc/aos_strerror.cc
index 1353242..b045f24 100644
--- a/aos/libc/aos_strerror.cc
+++ b/aos/libc/aos_strerror.cc
@@ -1,9 +1,11 @@
 #include "aos/libc/aos_strerror.h"
 
 #include <assert.h>
-#include <sys/types.h>
-#include <string.h>
 #include <stdio.h>
+#include <string.h>
+#include <sys/types.h>
+
+#include "aos/thread_local.h"
 
 // This code uses an overloaded function to handle the result from either
 // version of strerror_r correctly without needing a way to get the choice out
@@ -15,14 +17,15 @@
 
 // Handle the result from the GNU version of strerror_r. It never fails, so
 // that's pretty easy...
-__attribute__((unused))
-char *aos_strerror_handle_result(int /*error*/, char *ret, char * /*buffer*/) {
+__attribute__((unused)) char *aos_strerror_handle_result(int /*error*/,
+                                                         char *ret,
+                                                         char * /*buffer*/) {
   return ret;
 }
 
 // Handle the result from the POSIX version of strerror_r.
-__attribute__((unused))
-char *aos_strerror_handle_result(int error, int ret, char *buffer) {
+__attribute__((unused)) char *aos_strerror_handle_result(int error, int ret,
+                                                         char *buffer) {
   if (ret != 0) {
 #ifndef NDEBUG
     // assert doesn't use the return value when building optimized.
@@ -37,7 +40,7 @@
 }  // namespace
 
 const char *aos_strerror(int error) {
-  static thread_local char buffer[kBufferSize];
+  AOS_THREAD_LOCAL char buffer[kBufferSize];
 
   // Call the overload for whichever version we're using.
   return aos_strerror_handle_result(
diff --git a/aos/libc/aos_strsignal.cc b/aos/libc/aos_strsignal.cc
index 9262ac6..91c2211 100644
--- a/aos/libc/aos_strsignal.cc
+++ b/aos/libc/aos_strsignal.cc
@@ -4,8 +4,10 @@
 
 #include "glog/logging.h"
 
+#include "aos/thread_local.h"
+
 const char *aos_strsignal(int signal) {
-  static thread_local char buffer[512];
+  AOS_THREAD_LOCAL char buffer[512];
 
   if (signal >= SIGRTMIN && signal <= SIGRTMAX) {
     CHECK_GT(snprintf(buffer, sizeof(buffer), "Real-time signal %d",
diff --git a/aos/logging/BUILD b/aos/logging/BUILD
index 6686a3c..45a9921 100644
--- a/aos/logging/BUILD
+++ b/aos/logging/BUILD
@@ -21,6 +21,7 @@
         "//aos:complex_thread_local",
         "//aos:die",
         "//aos:macros",
+        "//aos:thread_local",
         "//aos/libc:aos_strerror",
         "//aos/mutex",
         "//aos/stl_mutex",
diff --git a/aos/logging/context.cc b/aos/logging/context.cc
index 2c689ed..c6f1063 100644
--- a/aos/logging/context.cc
+++ b/aos/logging/context.cc
@@ -22,6 +22,7 @@
 #include "aos/complex_thread_local.h"
 #include "aos/die.h"
 #include "aos/logging/implementations.h"
+#include "aos/thread_local.h"
 
 namespace aos {
 namespace logging {
@@ -68,7 +69,7 @@
 // reason for doing this instead of just deleting them is that tsan (at least)
 // doesn't like it when pthread_atfork handlers do complicated stuff and it's
 // not a great idea anyways.
-thread_local bool delete_current_context(false);
+AOS_THREAD_LOCAL bool delete_current_context(false);
 
 }  // namespace
 
diff --git a/aos/testing/BUILD b/aos/testing/BUILD
index 052cabe..ce67b84 100644
--- a/aos/testing/BUILD
+++ b/aos/testing/BUILD
@@ -24,6 +24,7 @@
     visibility = ["//visibility:public"],
     deps = [
         ":googletest",
+        "//aos:thread_local",
         "//aos/logging:implementations",
         "//aos/mutex",
         "@com_google_absl//absl/base",
diff --git a/aos/testing/test_logging.cc b/aos/testing/test_logging.cc
index db6246c..77bc4e9 100644
--- a/aos/testing/test_logging.cc
+++ b/aos/testing/test_logging.cc
@@ -7,8 +7,10 @@
 #include "gtest/gtest.h"
 
 #include "absl/base/call_once.h"
+
 #include "aos/logging/implementations.h"
 #include "aos/mutex/mutex.h"
+#include "aos/thread_local.h"
 
 using ::aos::logging::LogMessage;
 
@@ -97,12 +99,12 @@
   // Thread local storage for mock time.  This is thread local because if
   // someone spawns a thread and goes to town in parallel with a simulated event
   // loop, we want to just print the actual monotonic clock out.
-  static thread_local bool mock_time_;
-  static thread_local ::aos::monotonic_clock::time_point monotonic_now_;
+  static AOS_THREAD_LOCAL bool mock_time_;
+  static AOS_THREAD_LOCAL ::aos::monotonic_clock::time_point monotonic_now_;
 };
 
-thread_local bool TestLogImplementation::mock_time_ = false;
-thread_local ::aos::monotonic_clock::time_point
+AOS_THREAD_LOCAL bool TestLogImplementation::mock_time_ = false;
+AOS_THREAD_LOCAL ::aos::monotonic_clock::time_point
     TestLogImplementation::monotonic_now_ = ::aos::monotonic_clock::min_time;
 
 class MyTestEventListener : public ::testing::EmptyTestEventListener {
diff --git a/aos/thread_local.h b/aos/thread_local.h
new file mode 100644
index 0000000..e8c8854
--- /dev/null
+++ b/aos/thread_local.h
@@ -0,0 +1,36 @@
+#ifndef AOS_THREAD_LOCAL_H_
+#define AOS_THREAD_LOCAL_H_
+
+// Use AOS_THREAD_LOCAL instead of thread_local to pick up workarounds for
+// specific compilers/platforms.
+
+#ifdef __aarch64__
+// Workaround for https://bugs.llvm.org/show_bug.cgi?id=41527.
+// TODO(Brian): Remove this once we upgrade past LLVM 9.0.0.
+// 9.0.1 might have the fix, but I can't find prebuilt binaries for it, for some
+// reason. Going by release dates, 10.0.0 should definitely have the fix.
+//
+// https://reviews.llvm.org/D53906 broke it, https://reviews.llvm.org/D62055
+// reverted it, https://reviews.llvm.org/D61825 re-enabled it for only Android.
+//
+// Basically, LLD hacks the program header, but fails to change enough of the
+// values to be self-consistent. The resulting values cause glibc's dynamic
+// linker to do something different than lld is expecting, so then things
+// overlap at runtime and break horribly.
+//
+// This workaround ensures that the program header has the alignment lld wants
+// already, which ensures it's set there early enough in lld's processing that
+// it actually gets the proper alignment. This makes the hack a NOP so
+// everything works correctly.
+//
+// To check for the problem, build a binary (a complete binary, not a test
+// binary which references shared objects for all the code) and run `readelf
+// -aW` on it. Look for the TLS program header. If its alignment is 0x40, this
+// workaround is probably needed. To confirm the workaround is effective, check
+// that the TLS program header's address is a multiple of 0x40.
+#define AOS_THREAD_LOCAL __attribute__((aligned(0x40))) thread_local
+#else
+#define AOS_THREAD_LOCAL thread_local
+#endif
+
+#endif  // AOS_THREAD_LOCAL_H_