// -*- Mode: C++; c-basic-offset: 2; indent-tabs-mode: nil -*-
// Copyright (c) 2008, Google Inc.
// All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
//     * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//     * Redistributions in binary form must reproduce the above
// copyright notice, this list of conditions and the following disclaimer
// in the documentation and/or other materials provided with the
// distribution.
//     * Neither the name of Google Inc. nor the names of its
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

// ---
// Author: Sanjay Ghemawat <opensource@google.com>

#ifndef TCMALLOC_THREAD_CACHE_H_
#define TCMALLOC_THREAD_CACHE_H_

#include <config.h>
#ifdef HAVE_PTHREAD
#include <pthread.h>              // for pthread_t, pthread_key_t
#endif
#include <stddef.h>               // for size_t, NULL
#ifdef HAVE_STDINT_H
#include <stdint.h>               // for uint32_t, uint64_t
#endif
#include <sys/types.h>            // for ssize_t

#include "common.h"               // for SizeMap, kMaxSize, etc
#include "internal_logging.h"     // for ASSERT, etc
#include "linked_list.h"          // for SLL_Pop, SLL_PopRange, etc
#include "maybe_threads.h"
#include "page_heap_allocator.h"  // for PageHeapAllocator
#include "sampler.h"              // for Sampler
#include "static_vars.h"          // for Static

namespace tcmalloc {

//-------------------------------------------------------------------
// Data kept per thread
//-------------------------------------------------------------------

class ThreadCache {
 public:
#ifdef HAVE_TLS
  enum { have_tls = true };
#else
  enum { have_tls = false };
#endif

  // All ThreadCache objects are kept in a linked list (for stats collection)
  ThreadCache* next_;
  ThreadCache* prev_;

  void Init(pthread_t tid);
  void Cleanup();

  // Accessors (mostly just for printing stats)
  int freelist_length(size_t cl) const { return list_[cl].length(); }

  // Total byte size in cache
  size_t Size() const { return size_; }

  // Allocate an object of the given size and class.  The size given
  // must be the same as the size of the class in the size map.
  void* Allocate(size_t size, size_t cl);
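  // Illustrative call (a sketch; it mirrors the ASSERT in the inline
  // definition below rather than code from this file):
  //   Allocate(Static::sizemap()->ByteSizeForClass(cl), cl);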
  void Deallocate(void* ptr, size_t size_class);

  void Scavenge();

  int GetSamplePeriod();

  // Record allocation of "k" bytes.  Return true iff allocation
  // should be sampled
  bool SampleAllocation(size_t k);

  static void InitModule();
  static void InitTSD();
  static ThreadCache* GetThreadHeap();
  static ThreadCache* GetCache();
  static ThreadCache* GetCacheIfPresent();
  static ThreadCache* GetCacheWhichMustBePresent();
  static ThreadCache* CreateCacheIfNecessary();
  static void BecomeIdle();
  static size_t MinSizeForSlowPath();
  static void SetMinSizeForSlowPath(size_t size);

  static bool IsFastPathAllowed() { return MinSizeForSlowPath() != 0; }

  // Return the number of thread heaps in use.
  static inline int HeapsInUse();

  // Adds to *total_bytes the total number of bytes used by all thread heaps.
  // Also, if class_count is not NULL, it must be an array of size kNumClasses,
  // and this function will increment each element of class_count by the number
  // of items in all thread-local freelists of the corresponding size class.
  // REQUIRES: Static::pageheap_lock is held.
  static void GetThreadStats(uint64_t* total_bytes, uint64_t* class_count);
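  // Illustrative use (a sketch; assumes the caller already holds
  // Static::pageheap_lock):
  //   uint64_t total_bytes = 0;
  //   uint64_t class_count[kNumClasses] = { 0 };
  //   ThreadCache::GetThreadStats(&total_bytes, class_count);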

  // Sets the total thread cache size to new_size, recomputing the
  // individual thread cache sizes as necessary.
  // REQUIRES: Static::pageheap_lock is held.
  static void set_overall_thread_cache_size(size_t new_size);
  static size_t overall_thread_cache_size() {
    return overall_thread_cache_size_;
  }

 private:
  class FreeList {
   private:
    void*    list_;       // Linked list of nodes

#ifdef _LP64
    // On 64-bit hardware, manipulating 16-bit values may be slightly slow.
    uint32_t length_;      // Current length.
    uint32_t lowater_;     // Low water mark for list length.
    uint32_t max_length_;  // Dynamic max list length based on usage.
    // Tracks the number of times a deallocation has caused
    // length_ > max_length_.  After the kMaxOverages'th time, max_length_
    // shrinks and length_overages_ is reset to zero.
    uint32_t length_overages_;
#else
    // If we aren't using 64-bit pointers then pack these into less space.
    uint16_t length_;
    uint16_t lowater_;
    uint16_t max_length_;
    uint16_t length_overages_;
#endif

   public:
    void Init() {
      list_ = NULL;
      length_ = 0;
      lowater_ = 0;
      max_length_ = 1;
      length_overages_ = 0;
    }

    // Return current length of list
    size_t length() const {
      return length_;
    }

    // Return the maximum length of the list.
    size_t max_length() const {
      return max_length_;
    }

    // Set the maximum length of the list.  If 'new_max' < length(), the
    // client is responsible for removing objects from the list.
    void set_max_length(size_t new_max) {
      max_length_ = new_max;
    }

    // Return the number of times that length() has gone over max_length().
    size_t length_overages() const {
      return length_overages_;
    }

    void set_length_overages(size_t new_count) {
      length_overages_ = new_count;
    }

    // Is list empty?
    bool empty() const {
      return list_ == NULL;
    }

    // Low-water mark management
    int lowwatermark() const { return lowater_; }
    void clear_lowwatermark() { lowater_ = length_; }

    void Push(void* ptr) {
      SLL_Push(&list_, ptr);
      length_++;
    }

    void* Pop() {
      ASSERT(list_ != NULL);
      length_--;
      if (length_ < lowater_) lowater_ = length_;
      return SLL_Pop(&list_);
    }

    void* Next() {
      return SLL_Next(&list_);
    }

    void PushRange(int N, void *start, void *end) {
      SLL_PushRange(&list_, start, end);
      length_ += N;
    }

    void PopRange(int N, void **start, void **end) {
      SLL_PopRange(&list_, N, start, end);
      ASSERT(length_ >= N);
      length_ -= N;
      if (length_ < lowater_) lowater_ = length_;
    }
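
    // Illustrative use (a sketch): PopRange(num, &head, &tail) detaches the
    // first 'num' objects as a singly linked span [head .. tail], which the
    // owning ThreadCache can hand back to the central cache in one batch.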
  };

  // Gets and returns an object from the central cache, and, if possible,
  // also adds some objects of that size class to this thread cache.
  void* FetchFromCentralCache(size_t cl, size_t byte_size);

  // Releases some number of items from src.  Adjusts the list's max_length
  // to eventually converge on num_objects_to_move(cl).
  void ListTooLong(FreeList* src, size_t cl);

  // Releases N items from this thread cache.
  void ReleaseToCentralCache(FreeList* src, size_t cl, int N);

  // Increase max_size_ by reducing unclaimed_cache_space_ or by
  // reducing the max_size_ of some other thread.  In both cases,
  // the delta is kStealAmount.
  void IncreaseCacheLimit();
  // Same as above but requires Static::pageheap_lock() is held.
  void IncreaseCacheLimitLocked();

  // If TLS is available, we also store a copy of the per-thread object
  // in a __thread variable since __thread variables are faster to read
  // than pthread_getspecific().  We still need pthread_setspecific()
  // because __thread variables provide no way to run cleanup code when
  // a thread is destroyed.
  // We also give a hint to the compiler to use the "initial exec" TLS
  // model.  This is faster than the default TLS model, at the cost that
  // you cannot dlopen this library.  (To see the difference, look at
  // the CPU use of __tls_get_addr with and without this attribute.)
  // Since we don't really use dlopen in google code -- and using dlopen
  // on a malloc replacement is asking for trouble in any case -- that's
  // a good tradeoff for us.
#ifdef HAVE___ATTRIBUTE__
#define ATTR_INITIAL_EXEC __attribute__ ((tls_model ("initial-exec")))
#else
#define ATTR_INITIAL_EXEC
#endif

#ifdef HAVE_TLS
  struct ThreadLocalData {
    ThreadCache* heap;
    // min_size_for_slow_path is 0 if heap is NULL or kMaxSize + 1 otherwise.
    // The latter is the common case and allows allocation to be faster
    // than it would be otherwise: typically a single branch will
    // determine that the requested allocation is no more than kMaxSize
    // and we can then proceed, knowing that global and thread-local tcmalloc
    // state is initialized.
    size_t min_size_for_slow_path;
  };
  static __thread ThreadLocalData threadlocal_data_ ATTR_INITIAL_EXEC;
#endif
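  // Illustrative fast-path test (a sketch of how a caller can consult this
  // field through MinSizeForSlowPath(); the real check lives in the
  // allocator's fast path, not in this header):
  //   if (size < ThreadCache::MinSizeForSlowPath()) {
  //     // heap is initialized and size <= kMaxSize, so the thread cache
  //     // can be used directly.
  //   }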

  // Thread-specific key.  Initialization here is somewhat tricky
  // because some Linux startup code invokes malloc() before it
  // is in a good enough state to handle pthread_key_create().
  // Therefore, we use TSD keys only after tsd_inited_ is set to true.
  // Until then, we use a slow path to get the heap object.
  static bool tsd_inited_;
  static pthread_key_t heap_key_;

  // Linked list of heap objects.  Protected by Static::pageheap_lock.
  static ThreadCache* thread_heaps_;
  static int thread_heap_count_;

  // A pointer to one of the objects in thread_heaps_.  It names the next
  // ThreadCache from which a thread that has exceeded its max_size_ should
  // steal cache space.  We round-robin through all of the objects in
  // thread_heaps_.  Protected by Static::pageheap_lock.
  static ThreadCache* next_memory_steal_;

  // Overall thread cache size.  Protected by Static::pageheap_lock.
  static size_t overall_thread_cache_size_;

  // Global per-thread cache size.  Writes are protected by
  // Static::pageheap_lock.  Reads are done without any locking, which should
  // be fine as long as size_t can be written atomically and we don't place
  // invariants between this variable and other pieces of state.
  static volatile size_t per_thread_cache_size_;

  // Represents overall_thread_cache_size_ minus the sum of max_size_
  // across all ThreadCaches.  Protected by Static::pageheap_lock.
  static ssize_t unclaimed_cache_space_;

  // This class is laid out with the most frequently used fields
  // first so that hot elements are placed on the same cache line.

  size_t        size_;                  // Combined size of data
  size_t        max_size_;              // size_ > max_size_ --> Scavenge()

  // We sample allocations, biased by the size of the allocation
  Sampler       sampler_;               // A sampler

  FreeList      list_[kNumClasses];     // Array indexed by size-class

  pthread_t     tid_;                   // Which thread owns it
  bool          in_setspecific_;        // In call to pthread_setspecific?

  // Allocate a new heap.  REQUIRES: Static::pageheap_lock is held.
  static ThreadCache* NewHeap(pthread_t tid);

  // Use only as pthread thread-specific destructor function.
  static void DestroyThreadCache(void* ptr);

  static void DeleteCache(ThreadCache* heap);
  static void RecomputePerThreadCacheSize();

  // Ensure that this class is cacheline-aligned.  This is critical for
  // performance, as false sharing would negate many of the benefits
  // of a per-thread cache.
} CACHELINE_ALIGNED;

// Allocator for thread heaps
// This is logically part of the ThreadCache class, but MSVC, at
// least, does not like using ThreadCache as a template argument
// before the class is fully defined.  So we put it outside the class.
extern PageHeapAllocator<ThreadCache> threadcache_allocator;

inline int ThreadCache::HeapsInUse() {
  return threadcache_allocator.inuse();
}

inline bool ThreadCache::SampleAllocation(size_t k) {
  return sampler_.SampleAllocation(k);
}

inline void* ThreadCache::Allocate(size_t size, size_t cl) {
  ASSERT(size <= kMaxSize);
  ASSERT(size == Static::sizemap()->ByteSizeForClass(cl));

  FreeList* list = &list_[cl];
  if (UNLIKELY(list->empty())) {
    return FetchFromCentralCache(cl, size);
  }
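  // Handing one cached object to the caller: the cache's byte count shrinks
  // by the object's size before the object is popped off the freelist.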
  size_ -= size;
  return list->Pop();
}

inline void ThreadCache::Deallocate(void* ptr, size_t cl) {
  FreeList* list = &list_[cl];
  size_ += Static::sizemap()->ByteSizeForClass(cl);
  ssize_t size_headroom = max_size_ - size_ - 1;

  // This catches back-to-back frees of allocs in the same size
  // class.  A more comprehensive (and expensive) test would be to walk
  // the entire freelist.  But this might be enough to find some bugs.
  ASSERT(ptr != list->Next());

  list->Push(ptr);
  ssize_t list_headroom =
      static_cast<ssize_t>(list->max_length()) - list->length();

  // There are two relatively uncommon things that require further work.
  // In the common case we're done, and in that case we need a single branch
  // because of the bitwise-or trick that follows.
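  // Both headrooms are signed; if either is negative its sign bit is set, so
  // OR-ing them and testing the result for < 0 checks both "list too long"
  // and "cache too big" with a single comparison.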
  if (UNLIKELY((list_headroom | size_headroom) < 0)) {
    if (list_headroom < 0) {
      ListTooLong(list, cl);
    }
    if (size_ >= max_size_) Scavenge();
  }
}

inline ThreadCache* ThreadCache::GetThreadHeap() {
#ifdef HAVE_TLS
  return threadlocal_data_.heap;
#else
  return reinterpret_cast<ThreadCache *>(
      perftools_pthread_getspecific(heap_key_));
#endif
}

inline ThreadCache* ThreadCache::GetCacheWhichMustBePresent() {
#ifdef HAVE_TLS
  ASSERT(threadlocal_data_.heap);
  return threadlocal_data_.heap;
#else
  ASSERT(perftools_pthread_getspecific(heap_key_));
  return reinterpret_cast<ThreadCache *>(
      perftools_pthread_getspecific(heap_key_));
#endif
}

inline ThreadCache* ThreadCache::GetCache() {
  ThreadCache* ptr = NULL;
  if (!tsd_inited_) {
    InitModule();
  } else {
    ptr = GetThreadHeap();
  }
  if (ptr == NULL) ptr = CreateCacheIfNecessary();
  return ptr;
}

// In deletion paths, we do not try to create a thread-cache.  This is
// because we may be in the thread destruction code and may have
// already cleaned up the cache for this thread.
inline ThreadCache* ThreadCache::GetCacheIfPresent() {
  if (!tsd_inited_) return NULL;
  return GetThreadHeap();
}

inline size_t ThreadCache::MinSizeForSlowPath() {
#ifdef HAVE_TLS
  return threadlocal_data_.min_size_for_slow_path;
#else
  return 0;
#endif
}

inline void ThreadCache::SetMinSizeForSlowPath(size_t size) {
#ifdef HAVE_TLS
  threadlocal_data_.min_size_for_slow_path = size;
#endif
}

}  // namespace tcmalloc

#endif  // TCMALLOC_THREAD_CACHE_H_