// -*- Mode: C++; c-basic-offset: 2; indent-tabs-mode: nil -*-
// Copyright (c) 2008, Google Inc.
// All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
//     * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//     * Redistributions in binary form must reproduce the above
// copyright notice, this list of conditions and the following disclaimer
// in the documentation and/or other materials provided with the
// distribution.
//     * Neither the name of Google Inc. nor the names of its
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

// ---
// Author: Sanjay Ghemawat <opensource@google.com>

#ifndef TCMALLOC_THREAD_CACHE_H_
#define TCMALLOC_THREAD_CACHE_H_

#include <config.h>
#ifdef HAVE_PTHREAD
#include <pthread.h>                    // for pthread_t, pthread_key_t
#endif
#include <stddef.h>                     // for size_t, NULL
#ifdef HAVE_STDINT_H
#include <stdint.h>                     // for uint32_t, uint64_t
#endif
#include <sys/types.h>                  // for ssize_t
#include "common.h"               // for SizeMap, kMaxSize, etc
#include "internal_logging.h"     // for ASSERT, etc
#include "linked_list.h"          // for SLL_Pop, SLL_PopRange, etc
#include "maybe_threads.h"
#include "page_heap_allocator.h"  // for PageHeapAllocator
#include "sampler.h"              // for Sampler
#include "static_vars.h"          // for Static

namespace tcmalloc {

//-------------------------------------------------------------------
// Data kept per thread
//-------------------------------------------------------------------

class ThreadCache {
 public:
#ifdef HAVE_TLS
  enum { have_tls = true };
#else
  enum { have_tls = false };
#endif

  // All ThreadCache objects are kept in a linked list (for stats collection)
  ThreadCache* next_;
  ThreadCache* prev_;

  void Init(pthread_t tid);
  void Cleanup();

  // Accessors (mostly just for printing stats)
  int freelist_length(size_t cl) const { return list_[cl].length(); }

  // Total byte size in cache
  size_t Size() const { return size_; }

  // Allocate an object of the given size class 'cl'. 'size' must equal
  // the byte size of that class, Static::sizemap()->ByteSizeForClass(cl).
  void* Allocate(size_t size, size_t cl);
  void Deallocate(void* ptr, size_t cl);
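
  // A minimal usage sketch (hypothetical caller; SizeClass() here is an
  // assumption about SizeMap's interface, and the NULL check on
  // GetCacheIfPresent() is omitted for brevity):
  //
  //   size_t cl = Static::sizemap()->SizeClass(bytes);
  //   size_t alloc_size = Static::sizemap()->ByteSizeForClass(cl);
  //   void* obj = ThreadCache::GetCache()->Allocate(alloc_size, cl);
  //   ...
  //   ThreadCache::GetCacheIfPresent()->Deallocate(obj, cl);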

  void Scavenge();

  int GetSamplePeriod();

  // Record allocation of "k" bytes. Return true iff allocation
  // should be sampled.
  bool SampleAllocation(size_t k);

  static void InitModule();
  static void InitTSD();
  static ThreadCache* GetThreadHeap();
  static ThreadCache* GetCache();
  static ThreadCache* GetCacheIfPresent();
  static ThreadCache* GetCacheWhichMustBePresent();
  static ThreadCache* CreateCacheIfNecessary();
  static void BecomeIdle();
  static size_t MinSizeForSlowPath();
  static void SetMinSizeForSlowPath(size_t size);

  static bool IsFastPathAllowed() { return MinSizeForSlowPath() != 0; }

  // Return the number of thread heaps in use.
  static inline int HeapsInUse();

  // Adds to *total_bytes the total number of bytes used by all thread heaps.
  // Also, if class_count is not NULL, it must be an array of size kNumClasses,
  // and this function will increment each element of class_count by the number
  // of items in all thread-local freelists of the corresponding size class.
  // REQUIRES: Static::pageheap_lock is held.
  static void GetThreadStats(uint64_t* total_bytes, uint64_t* class_count);
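
  // Usage sketch (hypothetical caller; per the REQUIRES clause above,
  // Static::pageheap_lock must already be held):
  //
  //   uint64_t total_bytes = 0;
  //   uint64_t class_count[kNumClasses] = { 0 };
  //   ThreadCache::GetThreadStats(&total_bytes, class_count);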

  // Sets the total thread cache size to new_size, recomputing the
  // individual thread cache sizes as necessary.
  // REQUIRES: Static::pageheap_lock is held.
  static void set_overall_thread_cache_size(size_t new_size);
  static size_t overall_thread_cache_size() {
    return overall_thread_cache_size_;
  }
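
  // Callers typically adjust this via MallocExtension rather than
  // calling it directly; a sketch (property name per the gperftools
  // documentation):
  //
  //   MallocExtension::instance()->SetNumericProperty(
  //       "tcmalloc.max_total_thread_cache_bytes", 64 << 20);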

 private:
  class FreeList {
   private:
    void*    list_;       // Linked list of nodes

#ifdef _LP64
    // On 64-bit hardware, manipulating 16-bit values may be slightly slow.
    uint32_t length_;      // Current length.
    uint32_t lowater_;     // Low water mark for list length.
    uint32_t max_length_;  // Dynamic max list length based on usage.
    // Tracks the number of times a deallocation has caused
    // length_ > max_length_. After the kMaxOverages'th time, max_length_
    // shrinks and length_overages_ is reset to zero.
    uint32_t length_overages_;
#else
    // If we aren't using 64-bit pointers then pack these into less space.
    uint16_t length_;
    uint16_t lowater_;
    uint16_t max_length_;
    uint16_t length_overages_;
#endif

   public:
    void Init() {
      list_ = NULL;
      length_ = 0;
      lowater_ = 0;
      max_length_ = 1;
      length_overages_ = 0;
    }

    // Return current length of list
    size_t length() const {
      return length_;
    }

    // Return the maximum length of the list.
    size_t max_length() const {
      return max_length_;
    }
    // Set the maximum length of the list. If 'new_max' < length(), the
    // client is responsible for removing objects from the list.
    void set_max_length(size_t new_max) {
      max_length_ = new_max;
    }

    // Return the number of times that length() has gone over max_length().
    size_t length_overages() const {
      return length_overages_;
    }

    void set_length_overages(size_t new_count) {
      length_overages_ = new_count;
    }

    // Is list empty?
    bool empty() const {
      return list_ == NULL;
    }

    // Low-water mark management
    int lowwatermark() const { return lowater_; }
    void clear_lowwatermark() { lowater_ = length_; }

    void Push(void* ptr) {
      SLL_Push(&list_, ptr);
      length_++;
    }

    void* Pop() {
      ASSERT(list_ != NULL);
      length_--;
      if (length_ < lowater_) lowater_ = length_;
      return SLL_Pop(&list_);
    }

    void* Next() {
      return SLL_Next(&list_);
    }

    void PushRange(int N, void *start, void *end) {
      SLL_PushRange(&list_, start, end);
      length_ += N;
    }

    void PopRange(int N, void **start, void **end) {
      ASSERT(length_ >= N);  // check the precondition before walking the list
      SLL_PopRange(&list_, N, start, end);
      length_ -= N;
      if (length_ < lowater_) lowater_ = length_;
    }
  };

  // Gets and returns an object from the central cache, and, if possible,
  // also adds some objects of that size class to this thread cache.
  void* FetchFromCentralCache(size_t cl, size_t byte_size);

  // Releases some number of items from src. Adjusts the list's max_length
  // to eventually converge on num_objects_to_move(cl).
  void ListTooLong(FreeList* src, size_t cl);
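
  // A hedged sketch of the adjustment ListTooLong() makes (the real
  // logic lives in thread_cache.cc; kMaxOverages is a constant defined
  // there):
  //
  //   const int batch = Static::sizemap()->num_objects_to_move(cl);
  //   ReleaseToCentralCache(src, cl, batch);
  //   if (src->max_length() < batch) {
  //     src->set_max_length(src->max_length() + 1);        // slow start
  //   } else if (src->max_length() > batch) {
  //     src->set_length_overages(src->length_overages() + 1);
  //     if (src->length_overages() > kMaxOverages) {
  //       src->set_max_length(src->max_length() - batch);  // back off
  //       src->set_length_overages(0);
  //     }
  //   }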

  // Releases N items from this thread cache.
  void ReleaseToCentralCache(FreeList* src, size_t cl, int N);

  // Increase max_size_ by reducing unclaimed_cache_space_ or by
  // reducing the max_size_ of some other thread. In both cases,
  // the delta is kStealAmount.
  void IncreaseCacheLimit();
  // Same as above but requires Static::pageheap_lock() is held.
  void IncreaseCacheLimitLocked();

  // If TLS is available, we also store a copy of the per-thread object
  // in a __thread variable since __thread variables are faster to read
  // than pthread_getspecific(). We still need pthread_setspecific()
  // because __thread variables provide no way to run cleanup code when
  // a thread is destroyed.
  // We also give a hint to the compiler to use the "initial exec" TLS
  // model. This is faster than the default TLS model, at the cost that
  // you cannot dlopen this library. (To see the difference, look at
  // the CPU use of __tls_get_addr with and without this attribute.)
  // Since we don't really use dlopen in google code -- and using dlopen
  // on a malloc replacement is asking for trouble in any case -- that's
  // a good tradeoff for us.
#ifdef HAVE___ATTRIBUTE__
#define ATTR_INITIAL_EXEC __attribute__ ((tls_model ("initial-exec")))
#else
#define ATTR_INITIAL_EXEC
#endif
#ifdef HAVE_TLS
  struct ThreadLocalData {
    ThreadCache* heap;
    // min_size_for_slow_path is 0 if heap is NULL, or kMaxSize + 1
    // otherwise. The latter is the common case and allows allocation to be
    // faster than it would be otherwise: typically a single branch will
    // determine that the requested allocation is no more than kMaxSize
    // and we can then proceed, knowing that global and thread-local tcmalloc
    // state is initialized.
    size_t min_size_for_slow_path;
  };
  static __thread ThreadLocalData threadlocal_data_ ATTR_INITIAL_EXEC;
#endif
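
  // A hedged sketch of the fast-path test this arrangement enables (the
  // real check lives in the allocation path in tcmalloc.cc and may
  // differ in detail):
  //
  //   if (LIKELY(size < ThreadCache::MinSizeForSlowPath())) {
  //     // heap is non-NULL and initialized, and size <= kMaxSize, so
  //     // the thread-local freelists can be used directly.
  //   }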

  // Thread-specific key. Initialization here is somewhat tricky
  // because some Linux startup code invokes malloc() before it
  // is in a good enough state to handle pthread_key_create().
  // Therefore, we use TSD keys only after tsd_inited is set to true.
  // Until then, we use a slow path to get the heap object.
  static bool tsd_inited_;
  static pthread_key_t heap_key_;

  // Linked list of heap objects. Protected by Static::pageheap_lock.
  static ThreadCache* thread_heaps_;
  static int thread_heap_count_;

  // A pointer to one of the objects in thread_heaps_. Points to the
  // next ThreadCache from which a thread that has gone over its
  // max_size_ should steal cache space, cycling round-robin through
  // all of the objects in thread_heaps_. Protected by
  // Static::pageheap_lock.
  static ThreadCache* next_memory_steal_;

  // Overall thread cache size. Protected by Static::pageheap_lock.
  static size_t overall_thread_cache_size_;

  // Global per-thread cache size. Writes are protected by
  // Static::pageheap_lock. Reads are done without any locking, which should be
  // fine as long as size_t can be written atomically and we don't place
  // invariants between this variable and other pieces of state.
  static volatile size_t per_thread_cache_size_;

  // Represents overall_thread_cache_size_ minus the sum of max_size_
  // across all ThreadCaches. Protected by Static::pageheap_lock.
  static ssize_t unclaimed_cache_space_;

  // This class is laid out with the most frequently used fields
  // first so that hot elements are placed on the same cache line.

  size_t        size_;                  // Combined size of data
  size_t        max_size_;              // size_ > max_size_ --> Scavenge()

  // We sample allocations, biased by the size of the allocation
  Sampler       sampler_;               // A sampler

  FreeList      list_[kNumClasses];     // Array indexed by size-class

  pthread_t     tid_;                   // Which thread owns it
  bool          in_setspecific_;        // In call to pthread_setspecific?

  // Allocate a new heap. REQUIRES: Static::pageheap_lock is held.
  static ThreadCache* NewHeap(pthread_t tid);

  // Use only as pthread thread-specific destructor function.
  static void DestroyThreadCache(void* ptr);

  static void DeleteCache(ThreadCache* heap);
  static void RecomputePerThreadCacheSize();

  // Ensure that this class is cacheline-aligned. This is critical for
  // performance, as false sharing would negate many of the benefits
  // of a per-thread cache.
} CACHELINE_ALIGNED;

// Allocator for thread heaps
// This is logically part of the ThreadCache class, but MSVC, at
// least, does not like using ThreadCache as a template argument
// before the class is fully defined. So we put it outside the class.
extern PageHeapAllocator<ThreadCache> threadcache_allocator;

inline int ThreadCache::HeapsInUse() {
  return threadcache_allocator.inuse();
}

inline bool ThreadCache::SampleAllocation(size_t k) {
  return sampler_.SampleAllocation(k);
}

inline void* ThreadCache::Allocate(size_t size, size_t cl) {
  ASSERT(size <= kMaxSize);
  ASSERT(size == Static::sizemap()->ByteSizeForClass(cl));

  FreeList* list = &list_[cl];
  if (UNLIKELY(list->empty())) {
    return FetchFromCentralCache(cl, size);
  }
  size_ -= size;
  return list->Pop();
}

inline void ThreadCache::Deallocate(void* ptr, size_t cl) {
  FreeList* list = &list_[cl];
  size_ += Static::sizemap()->ByteSizeForClass(cl);
  ssize_t size_headroom = max_size_ - size_ - 1;

  // This catches back-to-back frees of allocs in the same size
  // class. A more comprehensive (and expensive) test would be to walk
  // the entire freelist. But this might be enough to find some bugs.
  ASSERT(ptr != list->Next());

  list->Push(ptr);
  ssize_t list_headroom =
      static_cast<ssize_t>(list->max_length()) - list->length();

  // Two relatively uncommon conditions require further work; in the
  // common case neither holds and we are done. The bitwise-or trick
  // below lets a single branch test both conditions at once, since the
  // or of two ssize_t values is negative iff at least one of them is.
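  //
  // Worked example (illustrative numbers only): if max_size_ == 64 and
  // size_ == 72, then size_headroom == -9 and its sign bit is set; any
  // negative operand sets the sign bit of (list_headroom | size_headroom),
  // so the single test below catches either condition.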
  if (UNLIKELY((list_headroom | size_headroom) < 0)) {
    if (list_headroom < 0) {
      ListTooLong(list, cl);
    }
    if (size_ >= max_size_) Scavenge();
  }
}

inline ThreadCache* ThreadCache::GetThreadHeap() {
#ifdef HAVE_TLS
  return threadlocal_data_.heap;
#else
  return reinterpret_cast<ThreadCache *>(
      perftools_pthread_getspecific(heap_key_));
#endif
}

inline ThreadCache* ThreadCache::GetCacheWhichMustBePresent() {
#ifdef HAVE_TLS
  ASSERT(threadlocal_data_.heap);
  return threadlocal_data_.heap;
#else
  ASSERT(perftools_pthread_getspecific(heap_key_));
  return reinterpret_cast<ThreadCache *>(
      perftools_pthread_getspecific(heap_key_));
#endif
}

inline ThreadCache* ThreadCache::GetCache() {
  ThreadCache* ptr = NULL;
  if (!tsd_inited_) {
    InitModule();
  } else {
    ptr = GetThreadHeap();
  }
  if (ptr == NULL) ptr = CreateCacheIfNecessary();
  return ptr;
}

// In deletion paths, we do not try to create a thread-cache. This is
// because we may be in the thread destruction code and may have
// already cleaned up the cache for this thread.
inline ThreadCache* ThreadCache::GetCacheIfPresent() {
  if (!tsd_inited_) return NULL;
  return GetThreadHeap();
}

inline size_t ThreadCache::MinSizeForSlowPath() {
#ifdef HAVE_TLS
  return threadlocal_data_.min_size_for_slow_path;
#else
  return 0;
#endif
}

inline void ThreadCache::SetMinSizeForSlowPath(size_t size) {
#ifdef HAVE_TLS
  threadlocal_data_.min_size_for_slow_path = size;
#endif
}

}  // namespace tcmalloc

#endif  // TCMALLOC_THREAD_CACHE_H_