Squashed 'third_party/gperftools/' content from commit 54505f1
Change-Id: Id02e833828732b0efe7dac722b8485279e67c5fa
git-subtree-dir: third_party/gperftools
git-subtree-split: 54505f1d50c2d1f4676f5e87090b64a117fd980e
diff --git a/src/thread_cache.cc b/src/thread_cache.cc
new file mode 100644
index 0000000..444a09f
--- /dev/null
+++ b/src/thread_cache.cc
@@ -0,0 +1,474 @@
+// -*- Mode: C++; c-basic-offset: 2; indent-tabs-mode: nil -*-
+// Copyright (c) 2008, Google Inc.
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+// * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+// * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+// ---
+// Author: Ken Ashcraft <opensource@google.com>
+
+#include <config.h>
+#include "thread_cache.h"
+#include <errno.h>
+#include <string.h> // for memcpy
+#include <algorithm> // for max, min
+#include "base/commandlineflags.h" // for SpinLockHolder
+#include "base/spinlock.h" // for SpinLockHolder
+#include "getenv_safe.h" // for TCMallocGetenvSafe
+#include "central_freelist.h" // for CentralFreeListPadded
+#include "maybe_threads.h"
+
+using std::min;
+using std::max;
+
+// Note: this is initialized manually in InitModule to ensure that
+// it's configured at the right time.
+//
+// DEFINE_int64(tcmalloc_max_total_thread_cache_bytes,
+// EnvToInt64("TCMALLOC_MAX_TOTAL_THREAD_CACHE_BYTES",
+// kDefaultOverallThreadCacheSize),
+// "Bound on the total amount of bytes allocated to "
+// "thread caches. This bound is not strict, so it is possible "
+// "for the cache to go over this bound in certain circumstances. "
+// "Maximum value of this flag is capped to 1 GB.");
+
+
+namespace tcmalloc {
+
+static bool phinited = false;
+
+volatile size_t ThreadCache::per_thread_cache_size_ = kMaxThreadCacheSize;
+size_t ThreadCache::overall_thread_cache_size_ = kDefaultOverallThreadCacheSize;
+ssize_t ThreadCache::unclaimed_cache_space_ = kDefaultOverallThreadCacheSize;
+PageHeapAllocator<ThreadCache> threadcache_allocator;
+ThreadCache* ThreadCache::thread_heaps_ = NULL;
+int ThreadCache::thread_heap_count_ = 0;
+ThreadCache* ThreadCache::next_memory_steal_ = NULL;
+#ifdef HAVE_TLS
+__thread ThreadCache::ThreadLocalData ThreadCache::threadlocal_data_
+ ATTR_INITIAL_EXEC
+ = {0, 0};
+#endif
+bool ThreadCache::tsd_inited_ = false;
+pthread_key_t ThreadCache::heap_key_;
+
+void ThreadCache::Init(pthread_t tid) {
+ size_ = 0;
+
+ max_size_ = 0;
+ IncreaseCacheLimitLocked();
+ if (max_size_ == 0) {
+ // There isn't enough memory to go around. Just give the minimum to
+ // this thread.
+ max_size_ = kMinThreadCacheSize;
+
+ // Take unclaimed_cache_space_ negative.
+ unclaimed_cache_space_ -= kMinThreadCacheSize;
+ ASSERT(unclaimed_cache_space_ < 0);
+ }
+
+ next_ = NULL;
+ prev_ = NULL;
+ tid_ = tid;
+ in_setspecific_ = false;
+ for (size_t cl = 0; cl < kNumClasses; ++cl) {
+ list_[cl].Init();
+ }
+
+ uint32_t sampler_seed;
+ memcpy(&sampler_seed, &tid, sizeof(sampler_seed));
+ sampler_.Init(sampler_seed);
+}
+
+void ThreadCache::Cleanup() {
+ // Put unused memory back into central cache
+ for (int cl = 0; cl < kNumClasses; ++cl) {
+ if (list_[cl].length() > 0) {
+ ReleaseToCentralCache(&list_[cl], cl, list_[cl].length());
+ }
+ }
+}
+
+// Remove some objects of class "cl" from central cache and add to thread heap.
+// On success, return the first object for immediate use; otherwise return NULL.
+void* ThreadCache::FetchFromCentralCache(size_t cl, size_t byte_size) {
+ FreeList* list = &list_[cl];
+ ASSERT(list->empty());
+ const int batch_size = Static::sizemap()->num_objects_to_move(cl);
+
+ const int num_to_move = min<int>(list->max_length(), batch_size);
+ void *start, *end;
+ int fetch_count = Static::central_cache()[cl].RemoveRange(
+ &start, &end, num_to_move);
+
+ ASSERT((start == NULL) == (fetch_count == 0));
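+  // The first fetched object ("start") is returned directly to the caller,
+  // so only the remaining fetch_count - 1 objects are pushed onto the list;
+  // hence the pre-decrement below.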
+ if (--fetch_count >= 0) {
+ size_ += byte_size * fetch_count;
+ list->PushRange(fetch_count, SLL_Next(start), end);
+ }
+
+ // Increase max length slowly up to batch_size. After that,
+ // increase by batch_size in one shot so that the length is a
+ // multiple of batch_size.
+ if (list->max_length() < batch_size) {
+ list->set_max_length(list->max_length() + 1);
+ } else {
+    // Don't let the list get too long. In 32-bit builds, the length
+    // is represented by a 16-bit int, so we need to watch out for
+    // integer overflow.
+ int new_length = min<int>(list->max_length() + batch_size,
+ kMaxDynamicFreeListLength);
+ // The list's max_length must always be a multiple of batch_size,
+ // and kMaxDynamicFreeListLength is not necessarily a multiple
+ // of batch_size.
+ new_length -= new_length % batch_size;
+ ASSERT(new_length % batch_size == 0);
+ list->set_max_length(new_length);
+ }
+ return start;
+}
+
+void ThreadCache::ListTooLong(FreeList* list, size_t cl) {
+ const int batch_size = Static::sizemap()->num_objects_to_move(cl);
+ ReleaseToCentralCache(list, cl, batch_size);
+
+ // If the list is too long, we need to transfer some number of
+ // objects to the central cache. Ideally, we would transfer
+ // num_objects_to_move, so the code below tries to make max_length
+ // converge on num_objects_to_move.
+
+ if (list->max_length() < batch_size) {
+ // Slow start the max_length so we don't overreserve.
+ list->set_max_length(list->max_length() + 1);
+ } else if (list->max_length() > batch_size) {
+ // If we consistently go over max_length, shrink max_length. If we don't
+ // shrink it, some amount of memory will always stay in this freelist.
+ list->set_length_overages(list->length_overages() + 1);
+ if (list->length_overages() > kMaxOverages) {
+ ASSERT(list->max_length() > batch_size);
+ list->set_max_length(list->max_length() - batch_size);
+ list->set_length_overages(0);
+ }
+ }
+}
+
+// Remove some objects of class "cl" from thread heap and add to central cache
+void ThreadCache::ReleaseToCentralCache(FreeList* src, size_t cl, int N) {
+ ASSERT(src == &list_[cl]);
+ if (N > src->length()) N = src->length();
+ size_t delta_bytes = N * Static::sizemap()->ByteSizeForClass(cl);
+
+ // We return prepackaged chains of the correct size to the central cache.
+ // TODO: Use the same format internally in the thread caches?
+ int batch_size = Static::sizemap()->num_objects_to_move(cl);
+ while (N > batch_size) {
+ void *tail, *head;
+ src->PopRange(batch_size, &head, &tail);
+ Static::central_cache()[cl].InsertRange(head, tail, batch_size);
+ N -= batch_size;
+ }
+ void *tail, *head;
+ src->PopRange(N, &head, &tail);
+ Static::central_cache()[cl].InsertRange(head, tail, N);
+ size_ -= delta_bytes;
+}
+
+// Release idle memory to the central cache
+void ThreadCache::Scavenge() {
+ // If the low-water mark for the free list is L, it means we would
+ // not have had to allocate anything from the central cache even if
+ // we had reduced the free list size by L. We aim to get closer to
+ // that situation by dropping L/2 nodes from the free list. This
+  // may not release much memory; if it does not, we will call Scavenge
+  // again soon and the low-water marks will be high on that call.
+ //int64 start = CycleClock::Now();
+ for (int cl = 0; cl < kNumClasses; cl++) {
+ FreeList* list = &list_[cl];
+ const int lowmark = list->lowwatermark();
+ if (lowmark > 0) {
+ const int drop = (lowmark > 1) ? lowmark/2 : 1;
+ ReleaseToCentralCache(list, cl, drop);
+
+ // Shrink the max length if it isn't used. Only shrink down to
+ // batch_size -- if the thread was active enough to get the max_length
+ // above batch_size, it will likely be that active again. If
+      // max_length shrinks below batch_size, the thread will have to
+ // go through the slow-start behavior again. The slow-start is useful
+ // mainly for threads that stay relatively idle for their entire
+ // lifetime.
+ const int batch_size = Static::sizemap()->num_objects_to_move(cl);
+ if (list->max_length() > batch_size) {
+ list->set_max_length(
+ max<int>(list->max_length() - batch_size, batch_size));
+ }
+ }
+ list->clear_lowwatermark();
+ }
+
+ IncreaseCacheLimit();
+}
+
+void ThreadCache::IncreaseCacheLimit() {
+ SpinLockHolder h(Static::pageheap_lock());
+ IncreaseCacheLimitLocked();
+}
+
+void ThreadCache::IncreaseCacheLimitLocked() {
+ if (unclaimed_cache_space_ > 0) {
+ // Possibly make unclaimed_cache_space_ negative.
+ unclaimed_cache_space_ -= kStealAmount;
+ max_size_ += kStealAmount;
+ return;
+ }
+ // Don't hold pageheap_lock too long. Try to steal from 10 other
+ // threads before giving up. The i < 10 condition also prevents an
+ // infinite loop in case none of the existing thread heaps are
+ // suitable places to steal from.
+ for (int i = 0; i < 10;
+ ++i, next_memory_steal_ = next_memory_steal_->next_) {
+ // Reached the end of the linked list. Start at the beginning.
+ if (next_memory_steal_ == NULL) {
+ ASSERT(thread_heaps_ != NULL);
+ next_memory_steal_ = thread_heaps_;
+ }
+ if (next_memory_steal_ == this ||
+ next_memory_steal_->max_size_ <= kMinThreadCacheSize) {
+ continue;
+ }
+ next_memory_steal_->max_size_ -= kStealAmount;
+ max_size_ += kStealAmount;
+
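+    // Advance past the heap we just stole from so the next steal
+    // starts at a different thread cache.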
+ next_memory_steal_ = next_memory_steal_->next_;
+ return;
+ }
+}
+
+int ThreadCache::GetSamplePeriod() {
+ return sampler_.GetSamplePeriod();
+}
+
+void ThreadCache::InitModule() {
+ SpinLockHolder h(Static::pageheap_lock());
+ if (!phinited) {
+ const char *tcb = TCMallocGetenvSafe("TCMALLOC_MAX_TOTAL_THREAD_CACHE_BYTES");
+ if (tcb) {
+ set_overall_thread_cache_size(strtoll(tcb, NULL, 10));
+ }
+ Static::InitStaticVars();
+ threadcache_allocator.Init();
+    phinited = true;
+ }
+}
+
+void ThreadCache::InitTSD() {
+ ASSERT(!tsd_inited_);
+ perftools_pthread_key_create(&heap_key_, DestroyThreadCache);
+ tsd_inited_ = true;
+
+#ifdef PTHREADS_CRASHES_IF_RUN_TOO_EARLY
+ // We may have used a fake pthread_t for the main thread. Fix it.
+ pthread_t zero;
+ memset(&zero, 0, sizeof(zero));
+ SpinLockHolder h(Static::pageheap_lock());
+ for (ThreadCache* h = thread_heaps_; h != NULL; h = h->next_) {
+ if (h->tid_ == zero) {
+ h->tid_ = pthread_self();
+ }
+ }
+#endif
+}
+
+ThreadCache* ThreadCache::CreateCacheIfNecessary() {
+ // Initialize per-thread data if necessary
+ ThreadCache* heap = NULL;
+ {
+ SpinLockHolder h(Static::pageheap_lock());
+    // On some old glibc versions, and on FreeBSD's libc (as of FreeBSD 8.1),
+ // calling pthread routines (even pthread_self) too early could
+ // cause a segfault. Since we can call pthreads quite early, we
+ // have to protect against that in such situations by making a
+ // 'fake' pthread. This is not ideal since it doesn't work well
+ // when linking tcmalloc statically with apps that create threads
+ // before main, so we only do it if we have to.
+#ifdef PTHREADS_CRASHES_IF_RUN_TOO_EARLY
+ pthread_t me;
+ if (!tsd_inited_) {
+ memset(&me, 0, sizeof(me));
+ } else {
+ me = pthread_self();
+ }
+#else
+ const pthread_t me = pthread_self();
+#endif
+
+ // This may be a recursive malloc call from pthread_setspecific()
+ // In that case, the heap for this thread has already been created
+ // and added to the linked list. So we search for that first.
+ for (ThreadCache* h = thread_heaps_; h != NULL; h = h->next_) {
+ if (h->tid_ == me) {
+ heap = h;
+ break;
+ }
+ }
+
+ if (heap == NULL) heap = NewHeap(me);
+ }
+
+ // We call pthread_setspecific() outside the lock because it may
+ // call malloc() recursively. We check for the recursive call using
+ // the "in_setspecific_" flag so that we can avoid calling
+ // pthread_setspecific() if we are already inside pthread_setspecific().
+ if (!heap->in_setspecific_ && tsd_inited_) {
+ heap->in_setspecific_ = true;
+ perftools_pthread_setspecific(heap_key_, heap);
+#ifdef HAVE_TLS
+ // Also keep a copy in __thread for faster retrieval
+ threadlocal_data_.heap = heap;
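+    // With the heap installed, allow the fast path for all sizes up
+    // to kMaxSize.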
+ SetMinSizeForSlowPath(kMaxSize + 1);
+#endif
+ heap->in_setspecific_ = false;
+ }
+ return heap;
+}
+
+ThreadCache* ThreadCache::NewHeap(pthread_t tid) {
+ // Create the heap and add it to the linked list
+ ThreadCache *heap = threadcache_allocator.New();
+ heap->Init(tid);
+ heap->next_ = thread_heaps_;
+ heap->prev_ = NULL;
+ if (thread_heaps_ != NULL) {
+ thread_heaps_->prev_ = heap;
+ } else {
+    // This is the only thread heap at the moment.
+ ASSERT(next_memory_steal_ == NULL);
+ next_memory_steal_ = heap;
+ }
+ thread_heaps_ = heap;
+ thread_heap_count_++;
+ return heap;
+}
+
+void ThreadCache::BecomeIdle() {
+ if (!tsd_inited_) return; // No caches yet
+ ThreadCache* heap = GetThreadHeap();
+ if (heap == NULL) return; // No thread cache to remove
+ if (heap->in_setspecific_) return; // Do not disturb the active caller
+
+ heap->in_setspecific_ = true;
+ perftools_pthread_setspecific(heap_key_, NULL);
+#ifdef HAVE_TLS
+ // Also update the copy in __thread
+ threadlocal_data_.heap = NULL;
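+  // Force every allocation back onto the slow path so a new cache can
+  // be created if this thread allocates again.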
+ SetMinSizeForSlowPath(0);
+#endif
+ heap->in_setspecific_ = false;
+ if (GetThreadHeap() == heap) {
+ // Somehow heap got reinstated by a recursive call to malloc
+ // from pthread_setspecific. We give up in this case.
+ return;
+ }
+
+ // We can now get rid of the heap
+ DeleteCache(heap);
+}
+
+void ThreadCache::DestroyThreadCache(void* ptr) {
+ // Note that "ptr" cannot be NULL since pthread promises not
+ // to invoke the destructor on NULL values, but for safety,
+ // we check anyway.
+ if (ptr == NULL) return;
+#ifdef HAVE_TLS
+ // Prevent fast path of GetThreadHeap() from returning heap.
+ threadlocal_data_.heap = NULL;
+ SetMinSizeForSlowPath(0);
+#endif
+ DeleteCache(reinterpret_cast<ThreadCache*>(ptr));
+}
+
+void ThreadCache::DeleteCache(ThreadCache* heap) {
+ // Remove all memory from heap
+ heap->Cleanup();
+
+ // Remove from linked list
+ SpinLockHolder h(Static::pageheap_lock());
+ if (heap->next_ != NULL) heap->next_->prev_ = heap->prev_;
+ if (heap->prev_ != NULL) heap->prev_->next_ = heap->next_;
+ if (thread_heaps_ == heap) thread_heaps_ = heap->next_;
+ thread_heap_count_--;
+
+ if (next_memory_steal_ == heap) next_memory_steal_ = heap->next_;
+ if (next_memory_steal_ == NULL) next_memory_steal_ = thread_heaps_;
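+  // Return this thread's cache quota to the shared pool so other
+  // threads can claim it.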
+ unclaimed_cache_space_ += heap->max_size_;
+
+ threadcache_allocator.Delete(heap);
+}
+
+void ThreadCache::RecomputePerThreadCacheSize() {
+ // Divide available space across threads
+ int n = thread_heap_count_ > 0 ? thread_heap_count_ : 1;
+ size_t space = overall_thread_cache_size_ / n;
+
+ // Limit to allowed range
+ if (space < kMinThreadCacheSize) space = kMinThreadCacheSize;
+ if (space > kMaxThreadCacheSize) space = kMaxThreadCacheSize;
+
+ double ratio = space / max<double>(1, per_thread_cache_size_);
+ size_t claimed = 0;
+ for (ThreadCache* h = thread_heaps_; h != NULL; h = h->next_) {
+ // Increasing the total cache size should not circumvent the
+ // slow-start growth of max_size_.
+ if (ratio < 1.0) {
+ h->max_size_ = static_cast<size_t>(h->max_size_ * ratio);
+ }
+ claimed += h->max_size_;
+ }
+ unclaimed_cache_space_ = overall_thread_cache_size_ - claimed;
+ per_thread_cache_size_ = space;
+}
+
+void ThreadCache::GetThreadStats(uint64_t* total_bytes, uint64_t* class_count) {
+ for (ThreadCache* h = thread_heaps_; h != NULL; h = h->next_) {
+ *total_bytes += h->Size();
+ if (class_count) {
+ for (int cl = 0; cl < kNumClasses; ++cl) {
+ class_count[cl] += h->freelist_length(cl);
+ }
+ }
+ }
+}
+
+void ThreadCache::set_overall_thread_cache_size(size_t new_size) {
+ // Clip the value to a reasonable range
+ if (new_size < kMinThreadCacheSize) new_size = kMinThreadCacheSize;
+ if (new_size > (1<<30)) new_size = (1<<30); // Limit to 1GB
+ overall_thread_cache_size_ = new_size;
+
+ RecomputePerThreadCacheSize();
+}
+
+} // namespace tcmalloc