|  | // Copyright (c) 2008, Google Inc. | 
|  | // All rights reserved. | 
|  | // | 
|  | // Redistribution and use in source and binary forms, with or without | 
|  | // modification, are permitted provided that the following conditions are | 
|  | // met: | 
|  | // | 
|  | //     * Redistributions of source code must retain the above copyright | 
|  | // notice, this list of conditions and the following disclaimer. | 
|  | //     * Redistributions in binary form must reproduce the above | 
|  | // copyright notice, this list of conditions and the following disclaimer | 
|  | // in the documentation and/or other materials provided with the | 
|  | // distribution. | 
|  | //     * Neither the name of Google Inc. nor the names of its | 
|  | // contributors may be used to endorse or promote products derived from | 
|  | // this software without specific prior written permission. | 
|  | // | 
|  | // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS | 
|  | // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT | 
|  | // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR | 
|  | // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT | 
|  | // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, | 
|  | // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT | 
|  | // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, | 
|  | // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY | 
|  | // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT | 
|  | // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE | 
|  | // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | 
|  |  | 
|  | // --- | 
|  | // Author: Sanjay Ghemawat <opensource@google.com> | 
|  |  | 
|  | #ifndef TCMALLOC_THREAD_CACHE_H_ | 
|  | #define TCMALLOC_THREAD_CACHE_H_ | 
|  |  | 
|  | #include <config.h> | 
|  | #ifdef HAVE_PTHREAD | 
|  | #include <pthread.h>                    // for pthread_t, pthread_key_t | 
|  | #endif | 
|  | #include <stddef.h>                     // for size_t, NULL | 
|  | #ifdef HAVE_STDINT_H | 
|  | #include <stdint.h>                     // for uint32_t, uint64_t | 
|  | #endif | 
|  | #include <sys/types.h>                  // for ssize_t | 
|  | #include "common.h"            // for SizeMap, kMaxSize, etc | 
|  | #include "free_list.h"  // for FL_Pop, FL_PopRange, etc | 
|  | #include "internal_logging.h"  // for ASSERT, etc | 
|  | #include "maybe_threads.h" | 
|  | #include "page_heap_allocator.h"  // for PageHeapAllocator | 
|  | #include "sampler.h"           // for Sampler | 
|  | #include "static_vars.h"       // for Static | 
|  |  | 
|  | namespace tcmalloc { | 
|  |  | 
|  | // Even if we have support for thread-local storage in the compiler | 
|  | // and linker, the OS may not support it.  We need to check that at | 
|  | // runtime.  Right now, we have to keep a manual set of "bad" OSes. | 
|  | #if defined(HAVE_TLS) | 
|  | extern bool kernel_supports_tls;   // defined in thread_cache.cc | 
|  | void CheckIfKernelSupportsTLS(); | 
|  | inline bool KernelSupportsTLS() { | 
|  | return kernel_supports_tls; | 
|  | } | 
|  | #endif    // HAVE_TLS | 
|  |  | 
|  | //------------------------------------------------------------------- | 
|  | // Data kept per thread | 
|  | //------------------------------------------------------------------- | 
|  |  | 
|  | class ThreadCache { | 
|  | public: | 
|  | // All ThreadCache objects are kept in a linked list (for stats collection) | 
|  | ThreadCache* next_; | 
|  | ThreadCache* prev_; | 
|  |  | 
|  | void Init(pthread_t tid); | 
|  | void Cleanup(); | 
|  |  | 
|  | // Accessors (mostly just for printing stats) | 
|  | int freelist_length(size_t cl) const { return list_[cl].length(); } | 
|  |  | 
|  | // Total byte size in cache | 
|  | size_t Size() const { return size_; } | 
|  |  | 
|  | // Allocate an object of the given size and class. The size given | 
|  | // must be the same as the size of the class in the size map. | 
|  | void* Allocate(size_t size, size_t cl); | 
|  | void Deallocate(void* ptr, size_t size_class); | 
|  |  | 
|  | void Scavenge(); | 
|  |  | 
|  | int GetSamplePeriod(); | 
|  |  | 
|  | // Record allocation of "k" bytes.  Return true iff allocation | 
|  | // should be sampled | 
|  | bool SampleAllocation(size_t k); | 
|  |  | 
|  | // Record additional bytes allocated. | 
|  | void AddToByteAllocatedTotal(size_t k) { total_bytes_allocated_ += k; } | 
|  |  | 
|  | // Return the total number of bytes allocated from this heap.  The value will | 
|  | // wrap when there is an overflow, and so only the differences between two | 
|  | // values should be relied on (and even then, modulo 2^32). | 
|  | uint32 GetTotalBytesAllocated() const; | 
|  |  | 
|  | // On the current thread, return GetTotalBytesAllocated(). | 
|  | static uint32 GetBytesAllocatedOnCurrentThread(); | 
|  |  | 
|  | static void         InitModule(); | 
|  | static void         InitTSD(); | 
|  | static ThreadCache* GetThreadHeap(); | 
|  | static ThreadCache* GetCache(); | 
|  | static ThreadCache* GetCacheIfPresent(); | 
|  | static ThreadCache* CreateCacheIfNecessary(); | 
|  | static void         BecomeIdle(); | 
|  |  | 
|  | // Return the number of thread heaps in use. | 
|  | static inline int HeapsInUse(); | 
|  |  | 
|  | // Writes to total_bytes the total number of bytes used by all thread heaps. | 
|  | // class_count must be an array of size kNumClasses.  Writes the number of | 
|  | // items on the corresponding freelist.  class_count may be NULL. | 
|  | // The storage of both parameters must be zero intialized. | 
|  | // REQUIRES: Static::pageheap_lock is held. | 
|  | static void GetThreadStats(uint64_t* total_bytes, uint64_t* class_count); | 
|  |  | 
|  | // Sets the total thread cache size to new_size, recomputing the | 
|  | // individual thread cache sizes as necessary. | 
|  | // REQUIRES: Static::pageheap lock is held. | 
|  | static void set_overall_thread_cache_size(size_t new_size); | 
|  | static size_t overall_thread_cache_size() { | 
|  | return overall_thread_cache_size_; | 
|  | } | 
|  |  | 
|  | private: | 
|  | class FreeList { | 
|  | private: | 
|  | void*    list_;       // Linked list of nodes | 
|  |  | 
|  | #ifdef _LP64 | 
|  | // On 64-bit hardware, manipulating 16-bit values may be slightly slow. | 
|  | uint32_t length_;      // Current length. | 
|  | uint32_t lowater_;     // Low water mark for list length. | 
|  | uint32_t max_length_;  // Dynamic max list length based on usage. | 
|  | // Tracks the number of times a deallocation has caused | 
|  | // length_ > max_length_.  After the kMaxOverages'th time, max_length_ | 
|  | // shrinks and length_overages_ is reset to zero. | 
|  | uint32_t length_overages_; | 
|  | #else | 
|  | // If we aren't using 64-bit pointers then pack these into less space. | 
|  | uint16_t length_; | 
|  | uint16_t lowater_; | 
|  | uint16_t max_length_; | 
|  | uint16_t length_overages_; | 
|  | #endif | 
|  |  | 
|  | public: | 
|  | void Init() { | 
|  | list_ = NULL; | 
|  | length_ = 0; | 
|  | lowater_ = 0; | 
|  | max_length_ = 1; | 
|  | length_overages_ = 0; | 
|  | } | 
|  |  | 
|  | // Return current length of list | 
|  | size_t length() const { | 
|  | return length_; | 
|  | } | 
|  |  | 
|  | // Return the maximum length of the list. | 
|  | size_t max_length() const { | 
|  | return max_length_; | 
|  | } | 
|  |  | 
|  | // Set the maximum length of the list.  If 'new_max' > length(), the | 
|  | // client is responsible for removing objects from the list. | 
|  | void set_max_length(size_t new_max) { | 
|  | max_length_ = new_max; | 
|  | } | 
|  |  | 
|  | // Return the number of times that length() has gone over max_length(). | 
|  | size_t length_overages() const { | 
|  | return length_overages_; | 
|  | } | 
|  |  | 
|  | void set_length_overages(size_t new_count) { | 
|  | length_overages_ = new_count; | 
|  | } | 
|  |  | 
|  | // Is list empty? | 
|  | bool empty() const { | 
|  | return list_ == NULL; | 
|  | } | 
|  |  | 
|  | // Low-water mark management | 
|  | int lowwatermark() const { return lowater_; } | 
|  | void clear_lowwatermark() { lowater_ = length_; } | 
|  |  | 
|  | void Push(void* ptr) { | 
|  | FL_Push(&list_, ptr); | 
|  | length_++; | 
|  | } | 
|  |  | 
|  | void* Pop() { | 
|  | ASSERT(list_ != NULL); | 
|  | length_--; | 
|  | if (length_ < lowater_) lowater_ = length_; | 
|  | return FL_Pop(&list_); | 
|  | } | 
|  |  | 
|  | void* Next() { | 
|  | if (list_ == NULL) return NULL; | 
|  | return FL_Next(list_); | 
|  | } | 
|  |  | 
|  | void PushRange(int N, void *start, void *end) { | 
|  | FL_PushRange(&list_, start, end); | 
|  | length_ += N; | 
|  | } | 
|  |  | 
|  | void PopRange(int N, void **start, void **end) { | 
|  | FL_PopRange(&list_, N, start, end); | 
|  | ASSERT(length_ >= N); | 
|  | length_ -= N; | 
|  | if (length_ < lowater_) lowater_ = length_; | 
|  | } | 
|  | }; | 
|  |  | 
|  | // Gets and returns an object from the central cache, and, if possible, | 
|  | // also adds some objects of that size class to this thread cache. | 
|  | void* FetchFromCentralCache(size_t cl, size_t byte_size); | 
|  |  | 
|  | // Releases some number of items from src.  Adjusts the list's max_length | 
|  | // to eventually converge on num_objects_to_move(cl). | 
|  | void ListTooLong(FreeList* src, size_t cl); | 
|  |  | 
|  | // Releases N items from this thread cache. | 
|  | void ReleaseToCentralCache(FreeList* src, size_t cl, int N); | 
|  |  | 
|  | // Increase max_size_ by reducing unclaimed_cache_space_ or by | 
|  | // reducing the max_size_ of some other thread.  In both cases, | 
|  | // the delta is kStealAmount. | 
|  | void IncreaseCacheLimit(); | 
|  | // Same as above but requires Static::pageheap_lock() is held. | 
|  | void IncreaseCacheLimitLocked(); | 
|  |  | 
|  | // If TLS is available, we also store a copy of the per-thread object | 
|  | // in a __thread variable since __thread variables are faster to read | 
|  | // than pthread_getspecific().  We still need pthread_setspecific() | 
|  | // because __thread variables provide no way to run cleanup code when | 
|  | // a thread is destroyed. | 
|  | // We also give a hint to the compiler to use the "initial exec" TLS | 
|  | // model.  This is faster than the default TLS model, at the cost that | 
|  | // you cannot dlopen this library.  (To see the difference, look at | 
|  | // the CPU use of __tls_get_addr with and without this attribute.) | 
|  | // Since we don't really use dlopen in google code -- and using dlopen | 
|  | // on a malloc replacement is asking for trouble in any case -- that's | 
|  | // a good tradeoff for us. | 
|  | #ifdef HAVE_TLS | 
|  | static __thread ThreadCache* threadlocal_heap_ | 
|  | // This code links against pyautolib.so, which causes dlopen() on that shared | 
|  | // object to fail when -fprofile-generate is used with it. Ideally | 
|  | // pyautolib.so should not link against this code. There is a bug filed for | 
|  | // that: | 
|  | // http://code.google.com/p/chromium/issues/detail?id=124489 | 
|  | // For now the workaround is to pass in -DPGO_GENERATE when building Chrome | 
|  | // for instrumentation (-fprofile-generate). | 
|  | // For all non-instrumentation builds, this define will not be set and the | 
|  | // performance benefit of "intial-exec" will be achieved. | 
|  | #if defined(HAVE___ATTRIBUTE__) && !defined(PGO_GENERATE) | 
|  | __attribute__ ((tls_model ("initial-exec"))) | 
|  | # endif | 
|  | ; | 
|  | #endif | 
|  |  | 
|  | // Thread-specific key.  Initialization here is somewhat tricky | 
|  | // because some Linux startup code invokes malloc() before it | 
|  | // is in a good enough state to handle pthread_keycreate(). | 
|  | // Therefore, we use TSD keys only after tsd_inited is set to true. | 
|  | // Until then, we use a slow path to get the heap object. | 
|  | static bool tsd_inited_; | 
|  | static pthread_key_t heap_key_; | 
|  |  | 
|  | // Linked list of heap objects.  Protected by Static::pageheap_lock. | 
|  | static ThreadCache* thread_heaps_; | 
|  | static int thread_heap_count_; | 
|  |  | 
|  | // A pointer to one of the objects in thread_heaps_.  Represents | 
|  | // the next ThreadCache from which a thread over its max_size_ should | 
|  | // steal memory limit.  Round-robin through all of the objects in | 
|  | // thread_heaps_.  Protected by Static::pageheap_lock. | 
|  | static ThreadCache* next_memory_steal_; | 
|  |  | 
|  | // Overall thread cache size.  Protected by Static::pageheap_lock. | 
|  | static size_t overall_thread_cache_size_; | 
|  |  | 
|  | // Global per-thread cache size.  Writes are protected by | 
|  | // Static::pageheap_lock.  Reads are done without any locking, which should be | 
|  | // fine as long as size_t can be written atomically and we don't place | 
|  | // invariants between this variable and other pieces of state. | 
|  | static volatile size_t per_thread_cache_size_; | 
|  |  | 
|  | // Represents overall_thread_cache_size_ minus the sum of max_size_ | 
|  | // across all ThreadCaches.  Protected by Static::pageheap_lock. | 
|  | static ssize_t unclaimed_cache_space_; | 
|  |  | 
|  | // This class is laid out with the most frequently used fields | 
|  | // first so that hot elements are placed on the same cache line. | 
|  |  | 
|  | size_t        size_;                  // Combined size of data | 
|  | size_t        max_size_;              // size_ > max_size_ --> Scavenge() | 
|  |  | 
|  | // The following is the tally of bytes allocated on a thread as a response to | 
|  | // any flavor of malloc() call.  The aggegated amount includes all padding to | 
|  | // the smallest class that can hold the request, or to the nearest whole page | 
|  | // when a large allocation is made without using a class.  This sum is | 
|  | // currently used for Chromium profiling, where tallies are kept of the amount | 
|  | // of memory allocated during the running of each task on each thread. | 
|  | uint32        total_bytes_allocated_;  // Total, modulo 2^32. | 
|  |  | 
|  | // We sample allocations, biased by the size of the allocation | 
|  | Sampler       sampler_;               // A sampler | 
|  |  | 
|  | FreeList      list_[kNumClasses];     // Array indexed by size-class | 
|  |  | 
|  | pthread_t     tid_;                   // Which thread owns it | 
|  | bool          in_setspecific_;        // In call to pthread_setspecific? | 
|  |  | 
|  | // Allocate a new heap. REQUIRES: Static::pageheap_lock is held. | 
|  | static ThreadCache* NewHeap(pthread_t tid); | 
|  |  | 
|  | // Use only as pthread thread-specific destructor function. | 
|  | static void DestroyThreadCache(void* ptr); | 
|  |  | 
|  | static void DeleteCache(ThreadCache* heap); | 
|  | static void RecomputePerThreadCacheSize(); | 
|  |  | 
|  | // Ensure that this class is cacheline-aligned. This is critical for | 
|  | // performance, as false sharing would negate many of the benefits | 
|  | // of a per-thread cache. | 
|  | } CACHELINE_ALIGNED; | 
|  |  | 
|  | // Allocator for thread heaps | 
|  | // This is logically part of the ThreadCache class, but MSVC, at | 
|  | // least, does not like using ThreadCache as a template argument | 
|  | // before the class is fully defined.  So we put it outside the class. | 
|  | extern PageHeapAllocator<ThreadCache> threadcache_allocator; | 
|  |  | 
|  | inline int ThreadCache::HeapsInUse() { | 
|  | return threadcache_allocator.inuse(); | 
|  | } | 
|  |  | 
|  | inline bool ThreadCache::SampleAllocation(size_t k) { | 
|  | return sampler_.SampleAllocation(k); | 
|  | } | 
|  |  | 
|  | inline uint32 ThreadCache::GetTotalBytesAllocated() const { | 
|  | return total_bytes_allocated_; | 
|  | } | 
|  |  | 
|  | inline void* ThreadCache::Allocate(size_t size, size_t cl) { | 
|  | ASSERT(size <= kMaxSize); | 
|  | ASSERT(size == Static::sizemap()->ByteSizeForClass(cl)); | 
|  |  | 
|  | FreeList* list = &list_[cl]; | 
|  | if (list->empty()) { | 
|  | return FetchFromCentralCache(cl, size); | 
|  | } | 
|  | size_ -= size; | 
|  | return list->Pop(); | 
|  | } | 
|  |  | 
|  | inline void ThreadCache::Deallocate(void* ptr, size_t cl) { | 
|  | FreeList* list = &list_[cl]; | 
|  | size_ += Static::sizemap()->ByteSizeForClass(cl); | 
|  | ssize_t size_headroom = max_size_ - size_ - 1; | 
|  |  | 
|  | // This catches back-to-back frees of allocs in the same size | 
|  | // class. A more comprehensive (and expensive) test would be to walk | 
|  | // the entire freelist. But this might be enough to find some bugs. | 
|  | ASSERT(ptr != list->Next()); | 
|  |  | 
|  | list->Push(ptr); | 
|  | ssize_t list_headroom = | 
|  | static_cast<ssize_t>(list->max_length()) - list->length(); | 
|  |  | 
|  | // There are two relatively uncommon things that require further work. | 
|  | // In the common case we're done, and in that case we need a single branch | 
|  | // because of the bitwise-or trick that follows. | 
|  | if ((list_headroom | size_headroom) < 0) { | 
|  | if (list_headroom < 0) { | 
|  | ListTooLong(list, cl); | 
|  | } | 
|  | if (size_ >= max_size_) Scavenge(); | 
|  | } | 
|  | } | 
|  |  | 
|  | inline ThreadCache* ThreadCache::GetThreadHeap() { | 
|  | #ifdef HAVE_TLS | 
|  | // __thread is faster, but only when the kernel supports it | 
|  | if (KernelSupportsTLS()) | 
|  | return threadlocal_heap_; | 
|  | #endif | 
|  | return reinterpret_cast<ThreadCache *>( | 
|  | perftools_pthread_getspecific(heap_key_)); | 
|  | } | 
|  |  | 
|  | inline ThreadCache* ThreadCache::GetCache() { | 
|  | ThreadCache* ptr = NULL; | 
|  | if (!tsd_inited_) { | 
|  | InitModule(); | 
|  | } else { | 
|  | ptr = GetThreadHeap(); | 
|  | } | 
|  | if (ptr == NULL) ptr = CreateCacheIfNecessary(); | 
|  | return ptr; | 
|  | } | 
|  |  | 
|  | // In deletion paths, we do not try to create a thread-cache.  This is | 
|  | // because we may be in the thread destruction code and may have | 
|  | // already cleaned up the cache for this thread. | 
|  | inline ThreadCache* ThreadCache::GetCacheIfPresent() { | 
|  | if (!tsd_inited_) return NULL; | 
|  | return GetThreadHeap(); | 
|  | } | 
|  |  | 
|  | }  // namespace tcmalloc | 
|  |  | 
|  | #endif  // TCMALLOC_THREAD_CACHE_H_ |