// -*- Mode: C++; c-basic-offset: 2; indent-tabs-mode: nil -*-
// Copyright (c) 2008, Google Inc.
// All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
//     * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//     * Redistributions in binary form must reproduce the above
// copyright notice, this list of conditions and the following disclaimer
// in the documentation and/or other materials provided with the
// distribution.
//     * Neither the name of Google Inc. nor the names of its
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

// ---
// Author: Sanjay Ghemawat <opensource@google.com>

#ifndef TCMALLOC_THREAD_CACHE_H_
#define TCMALLOC_THREAD_CACHE_H_

#include <config.h>
#ifdef HAVE_PTHREAD
#include <pthread.h>              // for pthread_t, pthread_key_t
#endif
#include <stddef.h>               // for size_t, NULL
#ifdef HAVE_STDINT_H
#include <stdint.h>               // for uint32_t, uint64_t
#endif
#include <sys/types.h>            // for ssize_t

#include "common.h"               // for SizeMap, kMaxSize, etc
#include "internal_logging.h"     // for ASSERT, etc
#include "linked_list.h"          // for SLL_Pop, SLL_PopRange, etc
#include "maybe_threads.h"
#include "page_heap_allocator.h"  // for PageHeapAllocator
#include "sampler.h"              // for Sampler
#include "static_vars.h"          // for Static

namespace tcmalloc {

//-------------------------------------------------------------------
// Data kept per thread
//-------------------------------------------------------------------

class ThreadCache {
 public:
#ifdef HAVE_TLS
  enum { have_tls = true };
#else
  enum { have_tls = false };
#endif

  // All ThreadCache objects are kept in a linked list (for stats collection)
  ThreadCache* next_;
  ThreadCache* prev_;

  void Init(pthread_t tid);
  void Cleanup();

  // Accessors (mostly just for printing stats)
  int freelist_length(size_t cl) const { return list_[cl].length(); }

  // Total byte size in cache
  size_t Size() const { return size_; }

  // Allocate an object of the given size and class.  The size given
  // must be the same as the size of the class in the size map.
  void* Allocate(size_t size, size_t cl);
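  // Illustrative call (a sketch; it mirrors the ASSERT in the inline
  // definition below rather than code from this file):
  //   Allocate(Static::sizemap()->ByteSizeForClass(cl), cl);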
  void Deallocate(void* ptr, size_t size_class);

  void Scavenge();

  int GetSamplePeriod();

  // Record allocation of "k" bytes.  Return true iff allocation
  // should be sampled
  bool SampleAllocation(size_t k);

  static void InitModule();
  static void InitTSD();
  static ThreadCache* GetThreadHeap();
  static ThreadCache* GetCache();
  static ThreadCache* GetCacheIfPresent();
  static ThreadCache* GetCacheWhichMustBePresent();
  static ThreadCache* CreateCacheIfNecessary();
  static void BecomeIdle();
  static size_t MinSizeForSlowPath();
  static void SetMinSizeForSlowPath(size_t size);

  static bool IsFastPathAllowed() { return MinSizeForSlowPath() != 0; }

  // Return the number of thread heaps in use.
  static inline int HeapsInUse();

  // Adds to *total_bytes the total number of bytes used by all thread heaps.
  // Also, if class_count is not NULL, it must be an array of size kNumClasses,
  // and this function will increment each element of class_count by the number
  // of items in all thread-local freelists of the corresponding size class.
  // REQUIRES: Static::pageheap_lock is held.
  static void GetThreadStats(uint64_t* total_bytes, uint64_t* class_count);
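  // Illustrative use (a sketch; assumes the caller already holds
  // Static::pageheap_lock):
  //   uint64_t total_bytes = 0;
  //   uint64_t class_count[kNumClasses] = { 0 };
  //   ThreadCache::GetThreadStats(&total_bytes, class_count);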

  // Sets the total thread cache size to new_size, recomputing the
  // individual thread cache sizes as necessary.
  // REQUIRES: Static::pageheap_lock is held.
  static void set_overall_thread_cache_size(size_t new_size);
  static size_t overall_thread_cache_size() {
    return overall_thread_cache_size_;
  }

 private:
  class FreeList {
   private:
    void*    list_;       // Linked list of nodes

#ifdef _LP64
    // On 64-bit hardware, manipulating 16-bit values may be slightly slow.
    uint32_t length_;      // Current length.
    uint32_t lowater_;     // Low water mark for list length.
    uint32_t max_length_;  // Dynamic max list length based on usage.
    // Tracks the number of times a deallocation has caused
    // length_ > max_length_.  After the kMaxOverages'th time, max_length_
    // shrinks and length_overages_ is reset to zero.
    uint32_t length_overages_;
#else
    // If we aren't using 64-bit pointers then pack these into less space.
    uint16_t length_;
    uint16_t lowater_;
    uint16_t max_length_;
    uint16_t length_overages_;
#endif

   public:
    void Init() {
      list_ = NULL;
      length_ = 0;
      lowater_ = 0;
      max_length_ = 1;
      length_overages_ = 0;
    }

    // Return current length of list
    size_t length() const {
      return length_;
    }

    // Return the maximum length of the list.
    size_t max_length() const {
      return max_length_;
    }

    // Set the maximum length of the list.  If 'new_max' < length(), the
    // client is responsible for removing objects from the list.
    void set_max_length(size_t new_max) {
      max_length_ = new_max;
    }

    // Return the number of times that length() has gone over max_length().
    size_t length_overages() const {
      return length_overages_;
    }

    void set_length_overages(size_t new_count) {
      length_overages_ = new_count;
    }

    // Is list empty?
    bool empty() const {
      return list_ == NULL;
    }

    // Low-water mark management
    int lowwatermark() const { return lowater_; }
    void clear_lowwatermark() { lowater_ = length_; }

    void Push(void* ptr) {
      SLL_Push(&list_, ptr);
      length_++;
    }

    void* Pop() {
      ASSERT(list_ != NULL);
      length_--;
      if (length_ < lowater_) lowater_ = length_;
      return SLL_Pop(&list_);
    }

    void* Next() {
      return SLL_Next(&list_);
    }

    void PushRange(int N, void *start, void *end) {
      SLL_PushRange(&list_, start, end);
      length_ += N;
    }

    void PopRange(int N, void **start, void **end) {
      SLL_PopRange(&list_, N, start, end);
      ASSERT(length_ >= N);
      length_ -= N;
      if (length_ < lowater_) lowater_ = length_;
    }
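
    // Illustrative use (a sketch): PopRange(num, &head, &tail) detaches the
    // first 'num' objects as a singly linked span [head .. tail], which the
    // owning ThreadCache can hand back to the central cache in one batch.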
  };

  // Gets and returns an object from the central cache, and, if possible,
  // also adds some objects of that size class to this thread cache.
  void* FetchFromCentralCache(size_t cl, size_t byte_size);

  // Releases some number of items from src.  Adjusts the list's max_length
  // to eventually converge on num_objects_to_move(cl).
  void ListTooLong(FreeList* src, size_t cl);

  // Releases N items from this thread cache.
  void ReleaseToCentralCache(FreeList* src, size_t cl, int N);

  // Increase max_size_ by reducing unclaimed_cache_space_ or by
  // reducing the max_size_ of some other thread.  In both cases,
  // the delta is kStealAmount.
  void IncreaseCacheLimit();
  // Same as above but requires Static::pageheap_lock() is held.
  void IncreaseCacheLimitLocked();

  // If TLS is available, we also store a copy of the per-thread object
  // in a __thread variable since __thread variables are faster to read
  // than pthread_getspecific().  We still need pthread_setspecific()
  // because __thread variables provide no way to run cleanup code when
  // a thread is destroyed.
  // We also give a hint to the compiler to use the "initial exec" TLS
  // model.  This is faster than the default TLS model, at the cost that
  // you cannot dlopen this library.  (To see the difference, look at
  // the CPU use of __tls_get_addr with and without this attribute.)
  // Since we don't really use dlopen in google code -- and using dlopen
  // on a malloc replacement is asking for trouble in any case -- that's
  // a good tradeoff for us.
#ifdef HAVE___ATTRIBUTE__
#define ATTR_INITIAL_EXEC __attribute__ ((tls_model ("initial-exec")))
#else
#define ATTR_INITIAL_EXEC
#endif

#ifdef HAVE_TLS
  struct ThreadLocalData {
    ThreadCache* heap;
    // min_size_for_slow_path is 0 if heap is NULL or kMaxSize + 1 otherwise.
    // The latter is the common case and allows allocation to be faster
    // than it would be otherwise: typically a single branch will
    // determine that the requested allocation is no more than kMaxSize
    // and we can then proceed, knowing that global and thread-local tcmalloc
    // state is initialized.
    size_t min_size_for_slow_path;
  };
  static __thread ThreadLocalData threadlocal_data_ ATTR_INITIAL_EXEC;
#endif
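  // Illustrative fast-path test (a sketch of how a caller can consult this
  // field through MinSizeForSlowPath(); the real check lives in the
  // allocator's fast path, not in this header):
  //   if (size < ThreadCache::MinSizeForSlowPath()) {
  //     // heap is initialized and size <= kMaxSize, so the thread cache
  //     // can be used directly.
  //   }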

  // Thread-specific key.  Initialization here is somewhat tricky
  // because some Linux startup code invokes malloc() before it
  // is in a good enough state to handle pthread_key_create().
  // Therefore, we use TSD keys only after tsd_inited_ is set to true.
  // Until then, we use a slow path to get the heap object.
  static bool tsd_inited_;
  static pthread_key_t heap_key_;

  // Linked list of heap objects.  Protected by Static::pageheap_lock.
  static ThreadCache* thread_heaps_;
  static int thread_heap_count_;

  // A pointer to one of the objects in thread_heaps_.  It names the next
  // ThreadCache from which a thread that has exceeded its max_size_ should
  // steal cache space.  We round-robin through all of the objects in
  // thread_heaps_.  Protected by Static::pageheap_lock.
  static ThreadCache* next_memory_steal_;

  // Overall thread cache size.  Protected by Static::pageheap_lock.
  static size_t overall_thread_cache_size_;

  // Global per-thread cache size.  Writes are protected by
  // Static::pageheap_lock.  Reads are done without any locking, which should
  // be fine as long as size_t can be written atomically and we don't place
  // invariants between this variable and other pieces of state.
  static volatile size_t per_thread_cache_size_;

  // Represents overall_thread_cache_size_ minus the sum of max_size_
  // across all ThreadCaches.  Protected by Static::pageheap_lock.
  static ssize_t unclaimed_cache_space_;

  // This class is laid out with the most frequently used fields
  // first so that hot elements are placed on the same cache line.

  size_t        size_;                  // Combined size of data
  size_t        max_size_;              // size_ > max_size_ --> Scavenge()

  // We sample allocations, biased by the size of the allocation
  Sampler       sampler_;               // A sampler

  FreeList      list_[kNumClasses];     // Array indexed by size-class

  pthread_t     tid_;                   // Which thread owns it
  bool          in_setspecific_;        // In call to pthread_setspecific?

  // Allocate a new heap.  REQUIRES: Static::pageheap_lock is held.
  static ThreadCache* NewHeap(pthread_t tid);

  // Use only as pthread thread-specific destructor function.
  static void DestroyThreadCache(void* ptr);

  static void DeleteCache(ThreadCache* heap);
  static void RecomputePerThreadCacheSize();

  // Ensure that this class is cacheline-aligned.  This is critical for
  // performance, as false sharing would negate many of the benefits
  // of a per-thread cache.
} CACHELINE_ALIGNED;

// Allocator for thread heaps
// This is logically part of the ThreadCache class, but MSVC, at
// least, does not like using ThreadCache as a template argument
// before the class is fully defined.  So we put it outside the class.
extern PageHeapAllocator<ThreadCache> threadcache_allocator;

inline int ThreadCache::HeapsInUse() {
  return threadcache_allocator.inuse();
}

inline bool ThreadCache::SampleAllocation(size_t k) {
  return sampler_.SampleAllocation(k);
}

inline void* ThreadCache::Allocate(size_t size, size_t cl) {
  ASSERT(size <= kMaxSize);
  ASSERT(size == Static::sizemap()->ByteSizeForClass(cl));

  FreeList* list = &list_[cl];
  if (UNLIKELY(list->empty())) {
    return FetchFromCentralCache(cl, size);
  }
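  // Handing one cached object to the caller: the cache's byte count shrinks
  // by the object's size before the object is popped off the freelist.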
  size_ -= size;
  return list->Pop();
}

inline void ThreadCache::Deallocate(void* ptr, size_t cl) {
  FreeList* list = &list_[cl];
  size_ += Static::sizemap()->ByteSizeForClass(cl);
  ssize_t size_headroom = max_size_ - size_ - 1;

  // This catches back-to-back frees of allocs in the same size
  // class.  A more comprehensive (and expensive) test would be to walk
  // the entire freelist.  But this might be enough to find some bugs.
  ASSERT(ptr != list->Next());

  list->Push(ptr);
  ssize_t list_headroom =
      static_cast<ssize_t>(list->max_length()) - list->length();

  // There are two relatively uncommon things that require further work.
  // In the common case we're done, and in that case we need a single branch
  // because of the bitwise-or trick that follows.
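  // Both headrooms are signed; if either is negative its sign bit is set, so
  // OR-ing them and testing the result for < 0 checks both "list too long"
  // and "cache too big" with a single comparison.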
  if (UNLIKELY((list_headroom | size_headroom) < 0)) {
    if (list_headroom < 0) {
      ListTooLong(list, cl);
    }
    if (size_ >= max_size_) Scavenge();
  }
}

inline ThreadCache* ThreadCache::GetThreadHeap() {
#ifdef HAVE_TLS
  return threadlocal_data_.heap;
#else
  return reinterpret_cast<ThreadCache *>(
      perftools_pthread_getspecific(heap_key_));
#endif
}

inline ThreadCache* ThreadCache::GetCacheWhichMustBePresent() {
#ifdef HAVE_TLS
  ASSERT(threadlocal_data_.heap);
  return threadlocal_data_.heap;
#else
  ASSERT(perftools_pthread_getspecific(heap_key_));
  return reinterpret_cast<ThreadCache *>(
      perftools_pthread_getspecific(heap_key_));
#endif
}

inline ThreadCache* ThreadCache::GetCache() {
  ThreadCache* ptr = NULL;
  if (!tsd_inited_) {
    InitModule();
  } else {
    ptr = GetThreadHeap();
  }
  if (ptr == NULL) ptr = CreateCacheIfNecessary();
  return ptr;
}

// In deletion paths, we do not try to create a thread-cache.  This is
// because we may be in the thread destruction code and may have
// already cleaned up the cache for this thread.
inline ThreadCache* ThreadCache::GetCacheIfPresent() {
  if (!tsd_inited_) return NULL;
  return GetThreadHeap();
}

inline size_t ThreadCache::MinSizeForSlowPath() {
#ifdef HAVE_TLS
  return threadlocal_data_.min_size_for_slow_path;
#else
  return 0;
#endif
}

inline void ThreadCache::SetMinSizeForSlowPath(size_t size) {
#ifdef HAVE_TLS
  threadlocal_data_.min_size_for_slow_path = size;
#endif
}

}  // namespace tcmalloc

#endif  // TCMALLOC_THREAD_CACHE_H_