Squashed 'third_party/gperftools/' content from commit 54505f1

Change-Id: Id02e833828732b0efe7dac722b8485279e67c5fa
git-subtree-dir: third_party/gperftools
git-subtree-split: 54505f1d50c2d1f4676f5e87090b64a117fd980e
diff --git a/src/addressmap-inl.h b/src/addressmap-inl.h
new file mode 100644
index 0000000..fd1dc5b
--- /dev/null
+++ b/src/addressmap-inl.h
@@ -0,0 +1,422 @@
+// -*- Mode: C++; c-basic-offset: 2; indent-tabs-mode: nil -*-
+// Copyright (c) 2005, Google Inc.
+// All rights reserved.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+// 
+//     * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+//     * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+// 
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+// ---
+// Author: Sanjay Ghemawat
+//
+// A fast map from addresses to values.  Assumes that addresses are
+// clustered.  The intended main use is heap-profiling.  It may be
+// too memory-hungry for other uses.
+//
+// We use a user-defined allocator/de-allocator so that we can use
+// this data structure during heap-profiling.
+//
+// IMPLEMENTATION DETAIL:
+//
+// Some default definitions/parameters:
+//  * Block      -- aligned 128-byte region of the address space
+//  * Cluster    -- aligned 1-MB region of the address space
+//  * Block-ID   -- block-number within a cluster
+//  * Cluster-ID -- Starting address of cluster divided by cluster size
+//
+// We use a three-level map to represent the state:
+//  1. A hash-table maps from a cluster-ID to the data for that cluster.
+//  2. For each non-empty cluster we keep an array indexed by
+//     block-ID that points to the first entry in the linked-list
+//     for the block.
+//  3. At the bottom, we keep a singly-linked list of all
+//     entries in a block (for non-empty blocks).
+//
+//    hash table
+//  +-------------+
+//  | id->cluster |---> ...
+//  |     ...     |
+//  | id->cluster |--->  Cluster
+//  +-------------+     +-------+    Data for one block
+//                      |  nil  |   +------------------------------------+
+//                      |   ----+---|->[addr/value]-->[addr/value]-->... |
+//                      |  nil  |   +------------------------------------+
+//                      |   ----+--> ...
+//                      |  nil  |
+//                      |  ...  |
+//                      +-------+
+//
+// Note that we require zero bytes of overhead for completely empty
+// clusters.  The minimum space requirement for a cluster is the size
+// of the hash-table entry plus a pointer value for each block in
+// the cluster.  Empty blocks impose no extra space requirement.
+//
+// The cost of a lookup is:
+//      a. A hash-table lookup to find the cluster
+//      b. An array access in the cluster structure
+//      c. A traversal over the linked-list for a block
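+//
+// A minimal usage sketch (malloc/free stand in here for any
+// allocator/deallocator pair with malloc/free semantics):
+//
+//   AddressMap<int> map(&malloc, &free);
+//   int* obj = new int(7);
+//   map.Insert(obj, 42);               // associate 42 with obj's address
+//   const int* v = map.Find(obj);      // v != NULL and *v == 42
+//   int removed;
+//   map.FindAndRemove(obj, &removed);  // removed == 42; entry is gone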
+
+#ifndef BASE_ADDRESSMAP_INL_H_
+#define BASE_ADDRESSMAP_INL_H_
+
+#include "config.h"
+#include <stddef.h>
+#include <string.h>
+#if defined HAVE_STDINT_H
+#include <stdint.h>             // to get uint16_t (ISO naming madness)
+#elif defined HAVE_INTTYPES_H
+#include <inttypes.h>           // another place uint16_t might be defined
+#else
+#include <sys/types.h>          // our last best hope
+#endif
+
+// This class is thread-unsafe -- that is, instances of this class
+// cannot be accessed concurrently by multiple threads -- because the
+// callback function for Iterate() may mutate contained values. If the
+// callback functions you pass do not mutate their Value* argument,
+// AddressMap can be treated as thread-compatible -- that is, it's
+// safe for multiple threads to call "const" methods on this class,
+// but not safe for one thread to call const methods on this class
+// while another thread is calling non-const methods on the class.
+template <class Value>
+class AddressMap {
+ public:
+  typedef void* (*Allocator)(size_t size);
+  typedef void  (*DeAllocator)(void* ptr);
+  typedef const void* Key;
+
+  // Create an AddressMap that uses the specified allocator/deallocator.
+  // The allocator/deallocator should behave like malloc/free.
+  // For instance, the allocator does not need to return initialized memory.
+  AddressMap(Allocator alloc, DeAllocator dealloc);
+  ~AddressMap();
+
+  // If the map contains an entry for "key", return it. Else return NULL.
+  inline const Value* Find(Key key) const;
+  inline Value* FindMutable(Key key);
+
+  // Insert <key,value> into the map.  Any old value associated
+  // with key is forgotten.
+  void Insert(Key key, Value value);
+
+  // Remove any entry for key in the map.  If an entry was found
+  // and removed, stores the associated value in "*removed_value"
+  // and returns true.  Else returns false.
+  bool FindAndRemove(Key key, Value* removed_value);
+
+  // Similar to Find but we assume that keys are addresses of non-overlapping
+  // memory ranges whose sizes are given by size_func.
+  // If the map contains a range into which "key" points
+  // (at its start or inside of it, but not at the end),
+  // return the address of the associated value
+  // and store its key in "*res_key".
+  // Else return NULL.
+  // max_size specifies the largest range size that may currently exist.
+  typedef size_t (*ValueSizeFunc)(const Value& v);
+  const Value* FindInside(ValueSizeFunc size_func, size_t max_size,
+                          Key key, Key* res_key);
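+  //
+  // A minimal sketch of the intended use (AllocInfo, SizeOf and
+  // kMaxAllocSize are hypothetical names, not declared in this header):
+  //
+  //   static size_t SizeOf(const AllocInfo& v) { return v.bytes; }
+  //   ...
+  //   Key range_start;
+  //   const AllocInfo* info =
+  //       map.FindInside(&SizeOf, kMaxAllocSize, interior_ptr, &range_start);
+  //
+  // "info" is the value for the range containing interior_ptr (or NULL if
+  // no such range exists), and "range_start" holds that range's start address.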
+
+  // Iterate over the address map calling 'callback'
+  // for all stored key-value pairs and passing 'arg' to it.
+  // We don't use the full Closure/Callback machinery, to avoid adding
+  // unnecessary dependencies to this class, which has low-level uses.
+  template<class Type>
+  inline void Iterate(void (*callback)(Key, Value*, Type), Type arg) const;
+
+ private:
+  typedef uintptr_t Number;
+
+  // The implementation assumes that addresses inserted into the map
+  // will be clustered.  We take advantage of this fact by splitting
+  // up the address-space into blocks and using a linked-list entry
+  // for each block.
+
+  // Size of each block.  There is one linked-list for each block, so
+  // do not make the block-size too big.  Otherwise, a lot of time
+  // will be spent traversing linked lists.
+  static const int kBlockBits = 7;
+  static const int kBlockSize = 1 << kBlockBits;
+
+  // Entry kept in per-block linked-list
+  struct Entry {
+    Entry* next;
+    Key    key;
+    Value  value;
+  };
+
+  // We further group a sequence of consecutive blocks into a cluster.
+  // The data for a cluster is represented as a dense array of
+  // linked-lists, one list per contained block.
+  static const int kClusterBits = 13;
+  static const Number kClusterSize = 1 << (kBlockBits + kClusterBits);
+  static const int kClusterBlocks = 1 << kClusterBits;
+
+  // We use a simple chaining hash-table to represent the clusters.
+  struct Cluster {
+    Cluster* next;                      // Next cluster in hash table chain
+    Number   id;                        // Cluster ID
+    Entry*   blocks[kClusterBlocks];    // Per-block linked-lists
+  };
+
+  // Number of hash-table entries.  With the block-size/cluster-size
+  // defined above, each cluster covers 1 MB, so a 4K-entry
+  // hash-table will give an average hash-chain length of 1 for 4GB of
+  // in-use memory.
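+  // (Worked out: a cluster spans 1 << (kBlockBits + kClusterBits) bytes
+  // = 1 << 20 bytes = 1 MB, so 4096 hash entries at an average chain
+  // length of 1 cover 4096 * 1 MB = 4 GB of in-use memory.)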
+  static const int kHashBits = 12;
+  static const int kHashSize = 1 << kHashBits;
+
+  // Number of entry objects allocated at a time
+  static const int ALLOC_COUNT = 64;
+
+  Cluster**     hashtable_;              // The hash-table
+  Entry*        free_;                   // Free list of unused Entry objects
+
+  // Multiplicative hash function:
+  // The value "kHashMultiplier" is the bottom 32 bits of
+  //    int((sqrt(5)-1)/2 * 2^32)
+  // This is a good multiplier as suggested in CLR, Knuth.  The hash
+  // value is taken to be the top "k" bits of the bottom 32 bits
+  // of the multiplied value.
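+  // As a numeric check: (sqrt(5)-1)/2 ~= 0.6180339887, and
+  // 0.6180339887 * 2^32 ~= 2654435769.5, which truncates to 2654435769.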
+  static const uint32_t kHashMultiplier = 2654435769u;
+  static int HashInt(Number x) {
+    // Multiply by a constant and take the top bits of the result.
+    const uint32_t m = static_cast<uint32_t>(x) * kHashMultiplier;
+    return static_cast<int>(m >> (32 - kHashBits));
+  }
+
+  // Find cluster object for specified address.  If not found
+  // and "create" is true, create the object.  If not found
+  // and "create" is false, return NULL.
+  //
+  // This method is bitwise-const if create is false.
+  Cluster* FindCluster(Number address, bool create) {
+    // Look in hashtable
+    const Number cluster_id = address >> (kBlockBits + kClusterBits);
+    const int h = HashInt(cluster_id);
+    for (Cluster* c = hashtable_[h]; c != NULL; c = c->next) {
+      if (c->id == cluster_id) {
+        return c;
+      }
+    }
+
+    // Create cluster if necessary
+    if (create) {
+      Cluster* c = New<Cluster>(1);
+      c->id = cluster_id;
+      c->next = hashtable_[h];
+      hashtable_[h] = c;
+      return c;
+    }
+    return NULL;
+  }
+
+  // Return the block ID for an address within its cluster
+  static int BlockID(Number address) {
+    return (address >> kBlockBits) & (kClusterBlocks - 1);
+  }
+
+  //--------------------------------------------------------------
+  // Memory management -- we keep all objects we allocate linked
+  // together in a singly linked list so we can get rid of them
+  // when we are all done.  Furthermore, we allow the client to
+  // pass in custom memory allocator/deallocator routines.
+  //--------------------------------------------------------------
+  struct Object {
+    Object* next;
+    // The real data starts here
+  };
+
+  Allocator     alloc_;                 // The allocator
+  DeAllocator   dealloc_;               // The deallocator
+  Object*       allocated_;             // List of allocated objects
+
+  // Allocates a zeroed array of T with length "num".  Also inserts
+  // the allocated block into a linked list so it can be deallocated
+  // when we are all done.
+  template <class T> T* New(int num) {
+    void* ptr = (*alloc_)(sizeof(Object) + num*sizeof(T));
+    memset(ptr, 0, sizeof(Object) + num*sizeof(T));
+    Object* obj = reinterpret_cast<Object*>(ptr);
+    obj->next = allocated_;
+    allocated_ = obj;
+    return reinterpret_cast<T*>(reinterpret_cast<Object*>(ptr) + 1);
+  }
+};
+
+// More implementation details follow:
+
+template <class Value>
+AddressMap<Value>::AddressMap(Allocator alloc, DeAllocator dealloc)
+  : free_(NULL),
+    alloc_(alloc),
+    dealloc_(dealloc),
+    allocated_(NULL) {
+  hashtable_ = New<Cluster*>(kHashSize);
+}
+
+template <class Value>
+AddressMap<Value>::~AddressMap() {
+  // De-allocate all of the objects we allocated
+  for (Object* obj = allocated_; obj != NULL; /**/) {
+    Object* next = obj->next;
+    (*dealloc_)(obj);
+    obj = next;
+  }
+}
+
+template <class Value>
+inline const Value* AddressMap<Value>::Find(Key key) const {
+  return const_cast<AddressMap*>(this)->FindMutable(key);
+}
+
+template <class Value>
+inline Value* AddressMap<Value>::FindMutable(Key key) {
+  const Number num = reinterpret_cast<Number>(key);
+  const Cluster* const c = FindCluster(num, false/*do not create*/);
+  if (c != NULL) {
+    for (Entry* e = c->blocks[BlockID(num)]; e != NULL; e = e->next) {
+      if (e->key == key) {
+        return &e->value;
+      }
+    }
+  }
+  return NULL;
+}
+
+template <class Value>
+void AddressMap<Value>::Insert(Key key, Value value) {
+  const Number num = reinterpret_cast<Number>(key);
+  Cluster* const c = FindCluster(num, true/*create*/);
+
+  // Look in linked-list for this block
+  const int block = BlockID(num);
+  for (Entry* e = c->blocks[block]; e != NULL; e = e->next) {
+    if (e->key == key) {
+      e->value = value;
+      return;
+    }
+  }
+
+  // Create entry
+  if (free_ == NULL) {
+    // Allocate a new batch of entries and add to free-list
+    Entry* array = New<Entry>(ALLOC_COUNT);
+    for (int i = 0; i < ALLOC_COUNT-1; i++) {
+      array[i].next = &array[i+1];
+    }
+    array[ALLOC_COUNT-1].next = free_;
+    free_ = &array[0];
+  }
+  Entry* e = free_;
+  free_ = e->next;
+  e->key = key;
+  e->value = value;
+  e->next = c->blocks[block];
+  c->blocks[block] = e;
+}
+
+template <class Value>
+bool AddressMap<Value>::FindAndRemove(Key key, Value* removed_value) {
+  const Number num = reinterpret_cast<Number>(key);
+  Cluster* const c = FindCluster(num, false/*do not create*/);
+  if (c != NULL) {
+    for (Entry** p = &c->blocks[BlockID(num)]; *p != NULL; p = &(*p)->next) {
+      Entry* e = *p;
+      if (e->key == key) {
+        *removed_value = e->value;
+        *p = e->next;         // Remove e from linked-list
+        e->next = free_;      // Add e to free-list
+        free_ = e;
+        return true;
+      }
+    }
+  }
+  return false;
+}
+
+template <class Value>
+const Value* AddressMap<Value>::FindInside(ValueSizeFunc size_func,
+                                           size_t max_size,
+                                           Key key,
+                                           Key* res_key) {
+  const Number key_num = reinterpret_cast<Number>(key);
+  Number num = key_num;  // we'll decrement this as we walk back through blocks/clusters
+  while (1) {
+    const Cluster* c = FindCluster(num, false/*do not create*/);
+    if (c != NULL) {
+      while (1) {
+        const int block = BlockID(num);
+        bool had_smaller_key = false;
+        for (const Entry* e = c->blocks[block]; e != NULL; e = e->next) {
+          const Number e_num = reinterpret_cast<Number>(e->key);
+          if (e_num <= key_num) {
+            if (e_num == key_num  ||  // to handle 0-sized ranges
+                key_num < e_num + (*size_func)(e->value)) {
+              *res_key = e->key;
+              return &e->value;
+            }
+            had_smaller_key = true;
+          }
+        }
+        if (had_smaller_key) return NULL;  // got a range before 'key'
+                                           // and it did not contain 'key'
+        if (block == 0) break;
+        // try address-wise previous block
+        num |= kBlockSize - 1;  // start at the last addr of prev block
+        num -= kBlockSize;
+        if (key_num - num > max_size) return NULL;
+      }
+    }
+    if (num < kClusterSize) return NULL;  // first cluster
+    // go to address-wise previous cluster to try
+    num |= kClusterSize - 1;  // start at the last block of previous cluster
+    num -= kClusterSize;
+    if (key_num - num > max_size) return NULL;
+      // Having max_size to limit the search is crucial: otherwise
+      // we would have to traverse a lot of empty clusters (or blocks).
+      // We could avoid needing max_size by keeping the clusters in a
+      // search tree, but performance suffers considerably with that
+      // approach (e.g. using std::set).
+  }
+}
+
+template <class Value>
+template <class Type>
+inline void AddressMap<Value>::Iterate(void (*callback)(Key, Value*, Type),
+                                       Type arg) const {
+  // We could optimize this by traversing only non-empty clusters and/or blocks
+  // but it does not speed up heap-checker noticeably.
+  for (int h = 0; h < kHashSize; ++h) {
+    for (const Cluster* c = hashtable_[h]; c != NULL; c = c->next) {
+      for (int b = 0; b < kClusterBlocks; ++b) {
+        for (Entry* e = c->blocks[b]; e != NULL; e = e->next) {
+          callback(e->key, &e->value, arg);
+        }
+      }
+    }
+  }
+}
+
+#endif  // BASE_ADDRESSMAP_INL_H_
diff --git a/src/base/arm_instruction_set_select.h b/src/base/arm_instruction_set_select.h
new file mode 100644
index 0000000..6fde685
--- /dev/null
+++ b/src/base/arm_instruction_set_select.h
@@ -0,0 +1,84 @@
+// Copyright (c) 2011, Google Inc.
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+//     * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+//     * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Author: Alexander Levitskiy
+//
+// Generalizes the plethora of available ARM flavors into an easier-to-manage set.
+// Definitions reference: https://wiki.edubuntu.org/ARM/Thumb2PortingHowto
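+//
+// For example, building with GCC and -march=armv7-a typically defines
+// __ARM_ARCH_7A__ (the exact macros are compiler-dependent), so the
+// cascade below then defines ARMV7 down through ARMV2.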
+
+#ifndef ARM_INSTRUCTION_SET_SELECT_H_
+#define ARM_INSTRUCTION_SET_SELECT_H_
+
+#if defined(__ARM_ARCH_8A__)
+# define ARMV8 1
+#endif
+
+#if defined(ARMV8) || \
+    defined(__ARM_ARCH_7__) || \
+    defined(__ARM_ARCH_7R__) || \
+    defined(__ARM_ARCH_7A__)
+# define ARMV7 1
+#endif
+
+#if defined(ARMV7) || \
+    defined(__ARM_ARCH_6__) || \
+    defined(__ARM_ARCH_6J__) || \
+    defined(__ARM_ARCH_6K__) || \
+    defined(__ARM_ARCH_6Z__) || \
+    defined(__ARM_ARCH_6T2__) || \
+    defined(__ARM_ARCH_6ZK__)
+# define ARMV6 1
+#endif
+
+#if defined(ARMV6) || \
+    defined(__ARM_ARCH_5T__) || \
+    defined(__ARM_ARCH_5E__) || \
+    defined(__ARM_ARCH_5TE__) || \
+    defined(__ARM_ARCH_5TEJ__)
+# define ARMV5 1
+#endif
+
+#if defined(ARMV5) || \
+    defined(__ARM_ARCH_4__) || \
+    defined(__ARM_ARCH_4T__)
+# define ARMV4 1
+#endif
+
+#if defined(ARMV4) || \
+    defined(__ARM_ARCH_3__) || \
+    defined(__ARM_ARCH_3M__)
+# define ARMV3 1
+#endif
+
+#if defined(ARMV3) || \
+    defined(__ARM_ARCH_2__)
+# define ARMV2 1
+#endif
+
+#endif  // ARM_INSTRUCTION_SET_SELECT_H_
diff --git a/src/base/atomicops-internals-arm-generic.h b/src/base/atomicops-internals-arm-generic.h
new file mode 100644
index 0000000..d0f9413
--- /dev/null
+++ b/src/base/atomicops-internals-arm-generic.h
@@ -0,0 +1,228 @@
+// -*- Mode: C++; c-basic-offset: 2; indent-tabs-mode: nil -*-
+// Copyright (c) 2003, Google Inc.
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+//     * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+//     * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+// ---
+//
+// Author: Lei Zhang, Sasha Levitskiy
+//
+// This file is an internal atomic implementation, use base/atomicops.h instead.
+//
+// LinuxKernelCmpxchg is from Google Gears.
+
+#ifndef BASE_ATOMICOPS_INTERNALS_ARM_GENERIC_H_
+#define BASE_ATOMICOPS_INTERNALS_ARM_GENERIC_H_
+
+#include <stdio.h>
+#include <stdlib.h>
+#include "base/basictypes.h"
+
+typedef int32_t Atomic32;
+
+namespace base {
+namespace subtle {
+
+typedef int64_t Atomic64;
+
+// 0xffff0fc0 is the hard coded address of a function provided by
+// the kernel which implements an atomic compare-exchange. On older
+// ARM architecture revisions (pre-v6) this may be implemented using
+// a syscall. This address is stable, and in active use (hard coded)
+// by at least glibc-2.7 and the Android C library.
+// pLinuxKernelCmpxchg has both acquire and release barrier semantics.
+typedef Atomic32 (*LinuxKernelCmpxchgFunc)(Atomic32 old_value,
+                                           Atomic32 new_value,
+                                           volatile Atomic32* ptr);
+LinuxKernelCmpxchgFunc pLinuxKernelCmpxchg ATTRIBUTE_WEAK =
+    (LinuxKernelCmpxchgFunc) 0xffff0fc0;
+
+typedef void (*LinuxKernelMemoryBarrierFunc)(void);
+LinuxKernelMemoryBarrierFunc pLinuxKernelMemoryBarrier ATTRIBUTE_WEAK =
+    (LinuxKernelMemoryBarrierFunc) 0xffff0fa0;
+
+
+inline Atomic32 NoBarrier_CompareAndSwap(volatile Atomic32* ptr,
+                                         Atomic32 old_value,
+                                         Atomic32 new_value) {
+  Atomic32 prev_value = *ptr;
+  do {
+    if (!pLinuxKernelCmpxchg(old_value, new_value,
+                             const_cast<Atomic32*>(ptr))) {
+      return old_value;
+    }
+    prev_value = *ptr;
+  } while (prev_value == old_value);
+  return prev_value;
+}
+
+inline Atomic32 NoBarrier_AtomicExchange(volatile Atomic32* ptr,
+                                         Atomic32 new_value) {
+  Atomic32 old_value;
+  do {
+    old_value = *ptr;
+  } while (pLinuxKernelCmpxchg(old_value, new_value,
+                               const_cast<Atomic32*>(ptr)));
+  return old_value;
+}
+
+inline Atomic32 Acquire_AtomicExchange(volatile Atomic32* ptr,
+                                       Atomic32 new_value) {
+  // pLinuxKernelCmpxchg already has acquire and release barrier semantics.
+  return NoBarrier_AtomicExchange(ptr, new_value);
+}
+
+inline Atomic32 Release_AtomicExchange(volatile Atomic32* ptr,
+                                       Atomic32 new_value) {
+  // pLinuxKernelCmpxchg already has acquire and release barrier semantics.
+  return NoBarrier_AtomicExchange(ptr, new_value);
+}
+
+inline Atomic32 Acquire_CompareAndSwap(volatile Atomic32* ptr,
+                                       Atomic32 old_value,
+                                       Atomic32 new_value) {
+  return NoBarrier_CompareAndSwap(ptr, old_value, new_value);
+}
+
+inline Atomic32 Release_CompareAndSwap(volatile Atomic32* ptr,
+                                       Atomic32 old_value,
+                                       Atomic32 new_value) {
+  return NoBarrier_CompareAndSwap(ptr, old_value, new_value);
+}
+
+inline void NoBarrier_Store(volatile Atomic32* ptr, Atomic32 value) {
+  *ptr = value;
+}
+
+inline void MemoryBarrier() {
+  pLinuxKernelMemoryBarrier();
+}
+
+inline void Acquire_Store(volatile Atomic32* ptr, Atomic32 value) {
+  *ptr = value;
+  MemoryBarrier();
+}
+
+inline void Release_Store(volatile Atomic32* ptr, Atomic32 value) {
+  MemoryBarrier();
+  *ptr = value;
+}
+
+inline Atomic32 NoBarrier_Load(volatile const Atomic32* ptr) {
+  return *ptr;
+}
+
+inline Atomic32 Acquire_Load(volatile const Atomic32* ptr) {
+  Atomic32 value = *ptr;
+  MemoryBarrier();
+  return value;
+}
+
+inline Atomic32 Release_Load(volatile const Atomic32* ptr) {
+  MemoryBarrier();
+  return *ptr;
+}
+
+
+// 64-bit versions are not implemented yet.
+
+inline void NotImplementedFatalError(const char *function_name) {
+  fprintf(stderr, "64-bit %s() not implemented on this platform\n",
+          function_name);
+  abort();
+}
+
+inline Atomic64 NoBarrier_CompareAndSwap(volatile Atomic64* ptr,
+                                         Atomic64 old_value,
+                                         Atomic64 new_value) {
+  NotImplementedFatalError("NoBarrier_CompareAndSwap");
+  return 0;
+}
+
+inline Atomic64 NoBarrier_AtomicExchange(volatile Atomic64* ptr,
+                                         Atomic64 new_value) {
+  NotImplementedFatalError("NoBarrier_AtomicExchange");
+  return 0;
+}
+
+inline Atomic64 Acquire_AtomicExchange(volatile Atomic64* ptr,
+                                       Atomic64 new_value) {
+  // pLinuxKernelCmpxchg already has acquire and release barrier semantics.
+  return NoBarrier_AtomicExchange(ptr, new_value);
+}
+
+inline Atomic64 Release_AtomicExchange(volatile Atomic64* ptr,
+                                       Atomic64 new_value) {
+  // pLinuxKernelCmpxchg already has acquire and release barrier semantics.
+  return NoBarrier_AtomicExchange(ptr, new_value);
+}
+
+inline void NoBarrier_Store(volatile Atomic64* ptr, Atomic64 value) {
+  NotImplementedFatalError("NoBarrier_Store");
+}
+
+inline void Acquire_Store(volatile Atomic64* ptr, Atomic64 value) {
+  NotImplementedFatalError("Acquire_Store64");
+}
+
+inline void Release_Store(volatile Atomic64* ptr, Atomic64 value) {
+  NotImplementedFatalError("Release_Store");
+}
+
+inline Atomic64 NoBarrier_Load(volatile const Atomic64* ptr) {
+  NotImplementedFatalError("NoBarrier_Load");
+  return 0;
+}
+
+inline Atomic64 Acquire_Load(volatile const Atomic64* ptr) {
+  NotImplementedFatalError("Atomic64 Acquire_Load");
+  return 0;
+}
+
+inline Atomic64 Release_Load(volatile const Atomic64* ptr) {
+  NotImplementedFatalError("Atomic64 Release_Load");
+  return 0;
+}
+
+inline Atomic64 Acquire_CompareAndSwap(volatile Atomic64* ptr,
+                                       Atomic64 old_value,
+                                       Atomic64 new_value) {
+  NotImplementedFatalError("Atomic64 Acquire_CompareAndSwap");
+  return 0;
+}
+
+inline Atomic64 Release_CompareAndSwap(volatile Atomic64* ptr,
+                                       Atomic64 old_value,
+                                       Atomic64 new_value) {
+  NotImplementedFatalError("Atomic64 Release_CompareAndSwap");
+  return 0;
+}
+
+}  // namespace base::subtle
+}  // namespace base
+
+#endif  // BASE_ATOMICOPS_INTERNALS_ARM_GENERIC_H_
diff --git a/src/base/atomicops-internals-arm-v6plus.h b/src/base/atomicops-internals-arm-v6plus.h
new file mode 100644
index 0000000..35f1048
--- /dev/null
+++ b/src/base/atomicops-internals-arm-v6plus.h
@@ -0,0 +1,330 @@
+// -*- Mode: C++; c-basic-offset: 2; indent-tabs-mode: nil -*-
+// Copyright (c) 2011, Google Inc.
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+//     * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+//     * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+// ---
+//
+// Author: Sasha Levitskiy
+// based on atomicops-internals by Sanjay Ghemawat
+//
+// This file is an internal atomic implementation, use base/atomicops.h instead.
+//
+// This code implements ARM atomics for architectures V6 and newer.
+
+#ifndef BASE_ATOMICOPS_INTERNALS_ARM_V6PLUS_H_
+#define BASE_ATOMICOPS_INTERNALS_ARM_V6PLUS_H_
+
+#include <stdio.h>
+#include <stdlib.h>
+#include "base/basictypes.h"  // For COMPILE_ASSERT
+
+// The LDREXD and STREXD instructions are available in all ARM v7 variants and
+// above.  In v6, only some variants support them.  For simplicity, we only use
+// exclusive 64-bit load/store on v7 or above.
+#if defined(ARMV7)
+# define BASE_ATOMICOPS_HAS_LDREXD_AND_STREXD
+#endif
+
+typedef int32_t Atomic32;
+
+namespace base {
+namespace subtle {
+
+typedef int64_t Atomic64;
+
+// 32-bit low-level ops
+
+inline Atomic32 NoBarrier_CompareAndSwap(volatile Atomic32* ptr,
+                                         Atomic32 old_value,
+                                         Atomic32 new_value) {
+  Atomic32 oldval, res;
+  do {
+    __asm__ __volatile__(
+    "ldrex   %1, [%3]\n"
+    "mov     %0, #0\n"
+    "teq     %1, %4\n"
+    // The following IT (if-then) instruction is needed for the subsequent
+    // conditional instruction STREXEQ when compiling in THUMB mode.
+    // In ARM mode, the compiler/assembler will not generate any code for it.
+    "it      eq\n"
+    "strexeq %0, %5, [%3]\n"
+        : "=&r" (res), "=&r" (oldval), "+Qo" (*ptr)
+        : "r" (ptr), "Ir" (old_value), "r" (new_value)
+        : "cc");
+  } while (res);
+  return oldval;
+}
+
+inline Atomic32 NoBarrier_AtomicExchange(volatile Atomic32* ptr,
+                                         Atomic32 new_value) {
+  Atomic32 tmp, old;
+  __asm__ __volatile__(
+      "1:\n"
+      "ldrex  %1, [%2]\n"
+      "strex  %0, %3, [%2]\n"
+      "teq    %0, #0\n"
+      "bne    1b"
+      : "=&r" (tmp), "=&r" (old)
+      : "r" (ptr), "r" (new_value)
+      : "cc", "memory");
+  return old;
+}
+
+inline void MemoryBarrier() {
+#if !defined(ARMV7)
+  uint32_t dest = 0;
+  __asm__ __volatile__("mcr p15,0,%0,c7,c10,5" :"=&r"(dest) : : "memory");
+#else
+  __asm__ __volatile__("dmb" : : : "memory");
+#endif
+}
+
+inline Atomic32 Acquire_AtomicExchange(volatile Atomic32* ptr,
+                                       Atomic32 new_value) {
+  Atomic32 old_value = NoBarrier_AtomicExchange(ptr, new_value);
+  MemoryBarrier();
+  return old_value;
+}
+
+inline Atomic32 Release_AtomicExchange(volatile Atomic32* ptr,
+                                       Atomic32 new_value) {
+  MemoryBarrier();
+  return NoBarrier_AtomicExchange(ptr, new_value);
+}
+
+inline Atomic32 Acquire_CompareAndSwap(volatile Atomic32* ptr,
+                                       Atomic32 old_value,
+                                       Atomic32 new_value) {
+  Atomic32 value = NoBarrier_CompareAndSwap(ptr, old_value, new_value);
+  MemoryBarrier();
+  return value;
+}
+
+inline Atomic32 Release_CompareAndSwap(volatile Atomic32* ptr,
+                                       Atomic32 old_value,
+                                       Atomic32 new_value) {
+  MemoryBarrier();
+  return NoBarrier_CompareAndSwap(ptr, old_value, new_value);
+}
+
+inline void NoBarrier_Store(volatile Atomic32* ptr, Atomic32 value) {
+  *ptr = value;
+}
+
+inline void Acquire_Store(volatile Atomic32* ptr, Atomic32 value) {
+  *ptr = value;
+  MemoryBarrier();
+}
+
+inline void Release_Store(volatile Atomic32* ptr, Atomic32 value) {
+  MemoryBarrier();
+  *ptr = value;
+}
+
+inline Atomic32 NoBarrier_Load(volatile const Atomic32* ptr) {
+  return *ptr;
+}
+
+inline Atomic32 Acquire_Load(volatile const Atomic32* ptr) {
+  Atomic32 value = *ptr;
+  MemoryBarrier();
+  return value;
+}
+
+inline Atomic32 Release_Load(volatile const Atomic32* ptr) {
+  MemoryBarrier();
+  return *ptr;
+}
+
+// 64-bit versions are only available if LDREXD and STREXD instructions
+// are available.
+#ifdef BASE_ATOMICOPS_HAS_LDREXD_AND_STREXD
+
+#define BASE_HAS_ATOMIC64 1
+
+inline Atomic64 NoBarrier_CompareAndSwap(volatile Atomic64* ptr,
+                                         Atomic64 old_value,
+                                         Atomic64 new_value) {
+  Atomic64 oldval, res;
+  do {
+    __asm__ __volatile__(
+    "ldrexd   %1, [%3]\n"
+    "mov      %0, #0\n"
+    "teq      %Q1, %Q4\n"
+    // The following IT (if-then) instructions are needed for the subsequent
+    // conditional instructions when compiling in THUMB mode.
+    // In ARM mode, the compiler/assembler will not generate any code for them.
+    "it       eq\n"
+    "teqeq    %R1, %R4\n"
+    "it       eq\n"
+    "strexdeq %0, %5, [%3]\n"
+        : "=&r" (res), "=&r" (oldval), "+Q" (*ptr)
+        : "r" (ptr), "Ir" (old_value), "r" (new_value)
+        : "cc");
+  } while (res);
+  return oldval;
+}
+
+inline Atomic64 NoBarrier_AtomicExchange(volatile Atomic64* ptr,
+                                         Atomic64 new_value) {
+  int store_failed;
+  Atomic64 old;
+  __asm__ __volatile__(
+      "1:\n"
+      "ldrexd  %1, [%2]\n"
+      "strexd  %0, %3, [%2]\n"
+      "teq     %0, #0\n"
+      "bne     1b"
+      : "=&r" (store_failed), "=&r" (old)
+      : "r" (ptr), "r" (new_value)
+      : "cc", "memory");
+  return old;
+}
+
+inline Atomic64 Acquire_AtomicExchange(volatile Atomic64* ptr,
+                                       Atomic64 new_value) {
+  Atomic64 old_value = NoBarrier_AtomicExchange(ptr, new_value);
+  MemoryBarrier();
+  return old_value;
+}
+
+inline Atomic64 Release_AtomicExchange(volatile Atomic64* ptr,
+                                       Atomic64 new_value) {
+  MemoryBarrier();
+  return NoBarrier_AtomicExchange(ptr, new_value);
+}
+
+inline void NoBarrier_Store(volatile Atomic64* ptr, Atomic64 value) {
+  int store_failed;
+  Atomic64 dummy;
+  __asm__ __volatile__(
+      "1:\n"
+      // Dummy load to lock cache line.
+      "ldrexd  %1, [%3]\n"
+      "strexd  %0, %2, [%3]\n"
+      "teq     %0, #0\n"
+      "bne     1b"
+      : "=&r" (store_failed), "=&r"(dummy)
+      : "r"(value), "r" (ptr)
+      : "cc", "memory");
+}
+
+inline Atomic64 NoBarrier_Load(volatile const Atomic64* ptr) {
+  Atomic64 res;
+  __asm__ __volatile__(
+  "ldrexd   %0, [%1]\n"
+  "clrex\n"
+      : "=r" (res)
+      : "r"(ptr), "Q"(*ptr));
+  return res;
+}
+
+#else // BASE_ATOMICOPS_HAS_LDREXD_AND_STREXD
+
+inline void NotImplementedFatalError(const char *function_name) {
+  fprintf(stderr, "64-bit %s() not implemented on this platform\n",
+          function_name);
+  abort();
+}
+
+inline Atomic64 NoBarrier_CompareAndSwap(volatile Atomic64* ptr,
+                                         Atomic64 old_value,
+                                         Atomic64 new_value) {
+  NotImplementedFatalError("NoBarrier_CompareAndSwap");
+  return 0;
+}
+
+inline Atomic64 NoBarrier_AtomicExchange(volatile Atomic64* ptr,
+                                         Atomic64 new_value) {
+  NotImplementedFatalError("NoBarrier_AtomicExchange");
+  return 0;
+}
+
+inline Atomic64 Acquire_AtomicExchange(volatile Atomic64* ptr,
+                                       Atomic64 new_value) {
+  NotImplementedFatalError("Acquire_AtomicExchange");
+  return 0;
+}
+
+inline Atomic64 Release_AtomicExchange(volatile Atomic64* ptr,
+                                       Atomic64 new_value) {
+  NotImplementedFatalError("Release_AtomicExchange");
+  return 0;
+}
+
+inline void NoBarrier_Store(volatile Atomic64* ptr, Atomic64 value) {
+  NotImplementedFatalError("NoBarrier_Store");
+}
+
+inline Atomic64 NoBarrier_Load(volatile const Atomic64* ptr) {
+  NotImplementedFatalError("NoBarrier_Load");
+  return 0;
+}
+
+#endif // BASE_ATOMICOPS_HAS_LDREXD_AND_STREXD
+
+inline void Acquire_Store(volatile Atomic64* ptr, Atomic64 value) {
+  NoBarrier_Store(ptr, value);
+  MemoryBarrier();
+}
+
+inline void Release_Store(volatile Atomic64* ptr, Atomic64 value) {
+  MemoryBarrier();
+  NoBarrier_Store(ptr, value);
+}
+
+inline Atomic64 Acquire_Load(volatile const Atomic64* ptr) {
+  Atomic64 value = NoBarrier_Load(ptr);
+  MemoryBarrier();
+  return value;
+}
+
+inline Atomic64 Release_Load(volatile const Atomic64* ptr) {
+  MemoryBarrier();
+  return NoBarrier_Load(ptr);
+}
+
+inline Atomic64 Acquire_CompareAndSwap(volatile Atomic64* ptr,
+                                       Atomic64 old_value,
+                                       Atomic64 new_value) {
+  Atomic64 value = NoBarrier_CompareAndSwap(ptr, old_value, new_value);
+  MemoryBarrier();
+  return value;
+}
+
+inline Atomic64 Release_CompareAndSwap(volatile Atomic64* ptr,
+                                       Atomic64 old_value,
+                                       Atomic64 new_value) {
+  MemoryBarrier();
+  return NoBarrier_CompareAndSwap(ptr, old_value, new_value);
+}
+
+}  // namespace subtle ends
+}  // namespace base ends
+
+#endif  // BASE_ATOMICOPS_INTERNALS_ARM_V6PLUS_H_
diff --git a/src/base/atomicops-internals-gcc.h b/src/base/atomicops-internals-gcc.h
new file mode 100644
index 0000000..f8d2786
--- /dev/null
+++ b/src/base/atomicops-internals-gcc.h
@@ -0,0 +1,203 @@
+// -*- Mode: C++; c-basic-offset: 2; indent-tabs-mode: nil -*-
+// Copyright (c) 2014, Linaro
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+//     * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+//     * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+// ---
+//
+// Author: Riku Voipio, riku.voipio@linaro.org
+//
+// atomic primitives implemented with gcc atomic intrinsics:
+// http://gcc.gnu.org/onlinedocs/gcc/_005f_005fatomic-Builtins.html
+//
+
+#ifndef BASE_ATOMICOPS_INTERNALS_GCC_GENERIC_H_
+#define BASE_ATOMICOPS_INTERNALS_GCC_GENERIC_H_
+
+#include <stdio.h>
+#include <stdlib.h>
+#include "base/basictypes.h"
+
+typedef int32_t Atomic32;
+
+namespace base {
+namespace subtle {
+
+typedef int64_t Atomic64;
+
+inline void MemoryBarrier() {
+    __sync_synchronize();
+}
+
+inline Atomic32 NoBarrier_CompareAndSwap(volatile Atomic32* ptr,
+                                         Atomic32 old_value,
+                                         Atomic32 new_value) {
+  Atomic32 prev_value = old_value;
+  __atomic_compare_exchange_n(ptr, &prev_value, new_value, 
+          0, __ATOMIC_RELAXED, __ATOMIC_RELAXED);
+  return prev_value;
+}
+
+inline Atomic32 NoBarrier_AtomicExchange(volatile Atomic32* ptr,
+                                         Atomic32 new_value) {
+  return __atomic_exchange_n(const_cast<Atomic32*>(ptr), new_value, __ATOMIC_RELAXED);
+}
+
+inline Atomic32 Acquire_AtomicExchange(volatile Atomic32* ptr,
+                                       Atomic32 new_value) {
+  return __atomic_exchange_n(const_cast<Atomic32*>(ptr), new_value,  __ATOMIC_ACQUIRE);
+}
+
+inline Atomic32 Release_AtomicExchange(volatile Atomic32* ptr,
+                                       Atomic32 new_value) {
+  return __atomic_exchange_n(const_cast<Atomic32*>(ptr), new_value, __ATOMIC_RELEASE);
+}
+
+inline Atomic32 Acquire_CompareAndSwap(volatile Atomic32* ptr,
+                                       Atomic32 old_value,
+                                       Atomic32 new_value) {
+  Atomic32 prev_value = old_value;
+  __atomic_compare_exchange_n(ptr, &prev_value, new_value, 
+          0, __ATOMIC_ACQUIRE, __ATOMIC_RELAXED);
+  return prev_value;
+}
+
+inline Atomic32 Release_CompareAndSwap(volatile Atomic32* ptr,
+                                       Atomic32 old_value,
+                                       Atomic32 new_value) {
+  Atomic32 prev_value = old_value;
+  __atomic_compare_exchange_n(ptr, &prev_value, new_value, 
+          0, __ATOMIC_RELEASE, __ATOMIC_RELAXED);
+  return prev_value;
+}
+
+inline void NoBarrier_Store(volatile Atomic32* ptr, Atomic32 value) {
+  *ptr = value;
+}
+
+inline void Acquire_Store(volatile Atomic32* ptr, Atomic32 value) {
+  *ptr = value;
+  MemoryBarrier();
+}
+
+inline void Release_Store(volatile Atomic32* ptr, Atomic32 value) {
+  MemoryBarrier();
+  *ptr = value;
+}
+
+inline Atomic32 NoBarrier_Load(volatile const Atomic32* ptr) {
+  return *ptr;
+}
+
+inline Atomic32 Acquire_Load(volatile const Atomic32* ptr) {
+  Atomic32 value = *ptr;
+  MemoryBarrier();
+  return value;
+}
+
+inline Atomic32 Release_Load(volatile const Atomic32* ptr) {
+  MemoryBarrier();
+  return *ptr;
+}
+
+// 64-bit versions
+
+inline Atomic64 NoBarrier_CompareAndSwap(volatile Atomic64* ptr,
+                                         Atomic64 old_value,
+                                         Atomic64 new_value) {
+  Atomic64 prev_value = old_value;
+  __atomic_compare_exchange_n(ptr, &prev_value, new_value, 
+          0, __ATOMIC_RELAXED, __ATOMIC_RELAXED);
+  return prev_value;
+}
+
+inline Atomic64 NoBarrier_AtomicExchange(volatile Atomic64* ptr,
+                                         Atomic64 new_value) {
+  return __atomic_exchange_n(const_cast<Atomic64*>(ptr), new_value, __ATOMIC_RELAXED);
+}
+
+inline Atomic64 Acquire_AtomicExchange(volatile Atomic64* ptr,
+                                       Atomic64 new_value) {
+  return __atomic_exchange_n(const_cast<Atomic64*>(ptr), new_value,  __ATOMIC_ACQUIRE);
+}
+
+inline Atomic64 Release_AtomicExchange(volatile Atomic64* ptr,
+                                       Atomic64 new_value) {
+  return __atomic_exchange_n(const_cast<Atomic64*>(ptr), new_value, __ATOMIC_RELEASE);
+}
+
+inline Atomic64 Acquire_CompareAndSwap(volatile Atomic64* ptr,
+                                       Atomic64 old_value,
+                                       Atomic64 new_value) {
+  Atomic64 prev_value = old_value;
+  __atomic_compare_exchange_n(ptr, &prev_value, new_value, 
+          0, __ATOMIC_ACQUIRE, __ATOMIC_RELAXED);
+  return prev_value;
+}
+
+inline Atomic64 Release_CompareAndSwap(volatile Atomic64* ptr,
+                                       Atomic64 old_value,
+                                       Atomic64 new_value) {
+  Atomic64 prev_value = old_value;
+  __atomic_compare_exchange_n(ptr, &prev_value, new_value, 
+          0, __ATOMIC_RELEASE, __ATOMIC_RELAXED);
+  return prev_value;
+}
+
+inline void NoBarrier_Store(volatile Atomic64* ptr, Atomic64 value) {
+  *ptr = value;
+}
+
+inline void Acquire_Store(volatile Atomic64* ptr, Atomic64 value) {
+  *ptr = value;
+  MemoryBarrier();
+}
+
+inline void Release_Store(volatile Atomic64* ptr, Atomic64 value) {
+  MemoryBarrier();
+  *ptr = value;
+}
+
+inline Atomic64 NoBarrier_Load(volatile const Atomic64* ptr) {
+  return *ptr;
+}
+
+inline Atomic64 Acquire_Load(volatile const Atomic64* ptr) {
+  Atomic64 value = *ptr;
+  MemoryBarrier();
+  return value;
+}
+
+inline Atomic64 Release_Load(volatile const Atomic64* ptr) {
+  MemoryBarrier();
+  return *ptr;
+}
+
+}  // namespace base::subtle
+}  // namespace base
+
+#endif  // BASE_ATOMICOPS_INTERNALS_GCC_GENERIC_H_
diff --git a/src/base/atomicops-internals-linuxppc.h b/src/base/atomicops-internals-linuxppc.h
new file mode 100644
index 0000000..b52fdf0
--- /dev/null
+++ b/src/base/atomicops-internals-linuxppc.h
@@ -0,0 +1,437 @@
+// -*- Mode: C++; c-basic-offset: 2; indent-tabs-mode: nil -*-
+/* Copyright (c) 2008, Google Inc.
+ * All rights reserved.
+ * 
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are
+ * met:
+ * 
+ *     * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following disclaimer
+ * in the documentation and/or other materials provided with the
+ * distribution.
+ *     * Neither the name of Google Inc. nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ * 
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * ---
+ */
+
+// Implementation of atomic operations for ppc-linux.  This file should not
+// be included directly.  Clients should instead include
+// "base/atomicops.h".
+
+#ifndef BASE_ATOMICOPS_INTERNALS_LINUXPPC_H_
+#define BASE_ATOMICOPS_INTERNALS_LINUXPPC_H_
+
+typedef int32_t Atomic32;
+
+#ifdef __PPC64__
+#define BASE_HAS_ATOMIC64 1
+#endif
+
+namespace base {
+namespace subtle {
+
+static inline void _sync(void) {
+  __asm__ __volatile__("sync": : : "memory");
+}
+
+static inline void _lwsync(void) {
+  // gcc defines __NO_LWSYNC__ when appropriate; see
+  //    http://gcc.gnu.org/ml/gcc-patches/2006-11/msg01238.html
+#ifdef __NO_LWSYNC__
+  __asm__ __volatile__("msync": : : "memory");
+#else
+  __asm__ __volatile__("lwsync": : : "memory");
+#endif
+}
+
+static inline void _isync(void) {
+  __asm__ __volatile__("isync": : : "memory");
+}
+
+static inline Atomic32 OSAtomicAdd32(Atomic32 amount, Atomic32 *value) {
+  Atomic32 t;
+  __asm__ __volatile__(
+"1:		lwarx   %0,0,%3\n\
+		add     %0,%2,%0\n\
+		stwcx.  %0,0,%3 \n\
+		bne-    1b"
+		: "=&r" (t), "+m" (*value)
+		: "r" (amount), "r" (value)
+                : "cc");
+  return t;
+}
+
+static inline Atomic32 OSAtomicAdd32Barrier(Atomic32 amount, Atomic32 *value) {
+  Atomic32 t;
+  _lwsync();
+  t = OSAtomicAdd32(amount, value);
+  // This is based on the code snippet in the architecture manual (Vol
+  // 2, Appendix B).  It's a little tricky: correctness depends on the
+  // fact that the code right before this (in OSAtomicAdd32) has a
+  // conditional branch with a data dependency on the update.
+  // Otherwise, we'd have to use sync.
+  _isync();
+  return t;
+}
+
+static inline bool OSAtomicCompareAndSwap32(Atomic32 old_value,
+                                            Atomic32 new_value,
+                                            Atomic32 *value) {
+  Atomic32 prev;
+  __asm__ __volatile__(
+"1:		lwarx   %0,0,%2\n\
+		cmpw    0,%0,%3\n\
+		bne-    2f\n\
+		stwcx.  %4,0,%2\n\
+		bne-    1b\n\
+2:"
+                : "=&r" (prev), "+m" (*value)
+                : "r" (value), "r" (old_value), "r" (new_value)
+                : "cc");
+  return prev == old_value;
+}
+
+static inline Atomic32 OSAtomicCompareAndSwap32Acquire(Atomic32 old_value,
+                                                       Atomic32 new_value,
+                                                       Atomic32 *value) {
+  Atomic32 t;
+  t = OSAtomicCompareAndSwap32(old_value, new_value, value);
+  // This is based on the code snippet in the architecture manual (Vol
+  // 2, Appendix B).  It's a little tricky: correctness depends on the
+  // fact that the code right before this (in
+  // OSAtomicCompareAndSwap32) has a conditional branch with a data
+  // dependency on the update.  Otherwise, we'd have to use sync.
+  _isync();
+  return t;
+}
+
+static inline Atomic32 OSAtomicCompareAndSwap32Release(Atomic32 old_value,
+                                                       Atomic32 new_value,
+                                                       Atomic32 *value) {
+  _lwsync();
+  return OSAtomicCompareAndSwap32(old_value, new_value, value);
+}
+
+typedef int64_t Atomic64;
+
+inline void MemoryBarrier() {
+  // This can't be _lwsync(); we need to order the immediately
+  // preceding stores against any load that may follow, but lwsync
+  // doesn't guarantee that.
+  _sync();
+}
+
+// 32-bit Versions.
+
+inline Atomic32 NoBarrier_CompareAndSwap(volatile Atomic32 *ptr,
+                                         Atomic32 old_value,
+                                         Atomic32 new_value) {
+  Atomic32 prev_value;
+  do {
+    if (OSAtomicCompareAndSwap32(old_value, new_value,
+                                 const_cast<Atomic32*>(ptr))) {
+      return old_value;
+    }
+    prev_value = *ptr;
+  } while (prev_value == old_value);
+  return prev_value;
+}
+
+inline Atomic32 NoBarrier_AtomicExchange(volatile Atomic32 *ptr,
+                                         Atomic32 new_value) {
+  Atomic32 old_value;
+  do {
+    old_value = *ptr;
+  } while (!OSAtomicCompareAndSwap32(old_value, new_value,
+                                     const_cast<Atomic32*>(ptr)));
+  return old_value;
+}
+
+inline Atomic32 Acquire_AtomicExchange(volatile Atomic32 *ptr,
+                                       Atomic32 new_value) {
+  Atomic32 old_value;
+  do {
+    old_value = *ptr;
+  } while (!OSAtomicCompareAndSwap32Acquire(old_value, new_value,
+                                            const_cast<Atomic32*>(ptr)));
+  return old_value;
+}
+
+inline Atomic32 Release_AtomicExchange(volatile Atomic32 *ptr,
+                                       Atomic32 new_value) {
+  Atomic32 old_value;
+  do {
+    old_value = *ptr;
+  } while (!OSAtomicCompareAndSwap32Release(old_value, new_value,
+                                            const_cast<Atomic32*>(ptr)));
+  return old_value;
+}
+
+inline Atomic32 Acquire_CompareAndSwap(volatile Atomic32 *ptr,
+                                       Atomic32 old_value,
+                                       Atomic32 new_value) {
+  Atomic32 prev_value;
+  do {
+    if (OSAtomicCompareAndSwap32Acquire(old_value, new_value,
+                                        const_cast<Atomic32*>(ptr))) {
+      return old_value;
+    }
+    prev_value = *ptr;
+  } while (prev_value == old_value);
+  return prev_value;
+}
+
+inline Atomic32 Release_CompareAndSwap(volatile Atomic32 *ptr,
+                                       Atomic32 old_value,
+                                       Atomic32 new_value) {
+  Atomic32 prev_value;
+  do {
+    if (OSAtomicCompareAndSwap32Release(old_value, new_value,
+                                        const_cast<Atomic32*>(ptr))) {
+      return old_value;
+    }
+    prev_value = *ptr;
+  } while (prev_value == old_value);
+  return prev_value;
+}
+
+#ifdef __PPC64__
+
+// 64-bit Versions.
+
+static inline Atomic64 OSAtomicAdd64(Atomic64 amount, Atomic64 *value) {
+  Atomic64 t;
+  __asm__ __volatile__(
+"1:		ldarx   %0,0,%3\n\
+		add     %0,%2,%0\n\
+		stdcx.  %0,0,%3 \n\
+		bne-    1b"
+		: "=&r" (t), "+m" (*value)
+		: "r" (amount), "r" (value)
+                : "cc");
+  return t;
+}
+
+static inline Atomic64 OSAtomicAdd64Barrier(Atomic64 amount, Atomic64 *value) {
+  Atomic64 t;
+  _lwsync();
+  t = OSAtomicAdd64(amount, value);
+  // This is based on the code snippet in the architecture manual (Vol
+  // 2, Appendix B).  It's a little tricky: correctness depends on the
+  // fact that the code right before this (in OSAtomicAdd64) has a
+  // conditional branch with a data dependency on the update.
+  // Otherwise, we'd have to use sync.
+  _isync();
+  return t;
+}
+
+static inline bool OSAtomicCompareAndSwap64(Atomic64 old_value,
+                                            Atomic64 new_value,
+                                            Atomic64 *value) {
+  Atomic64 prev;
+  __asm__ __volatile__(
+"1:		ldarx   %0,0,%2\n\
+		cmpd    0,%0,%3\n\
+		bne-    2f\n\
+		stdcx.  %4,0,%2\n\
+		bne-    1b\n\
+2:"
+                : "=&r" (prev), "+m" (*value)
+                : "r" (value), "r" (old_value), "r" (new_value)
+                : "cc");
+  return prev == old_value;
+}
+
+static inline Atomic64 OSAtomicCompareAndSwap64Acquire(Atomic64 old_value,
+                                                       Atomic64 new_value,
+                                                       Atomic64 *value) {
+  Atomic64 t;
+  t = OSAtomicCompareAndSwap64(old_value, new_value, value);
+  // This is based on the code snippet in the architecture manual (Vol
+  // 2, Appendix B).  It's a little tricky: correctness depends on the
+  // fact that the code right before this (in
+  // OSAtomicCompareAndSwap64) has a conditional branch with a data
+  // dependency on the update.  Otherwise, we'd have to use sync.
+  _isync();
+  return t;
+}
+
+static inline Atomic64 OSAtomicCompareAndSwap64Release(Atomic64 old_value,
+                                                       Atomic64 new_value,
+                                                       Atomic64 *value) {
+  _lwsync();
+  return OSAtomicCompareAndSwap64(old_value, new_value, value);
+}
+
+
+inline Atomic64 NoBarrier_CompareAndSwap(volatile Atomic64 *ptr,
+                                         Atomic64 old_value,
+                                         Atomic64 new_value) {
+  Atomic64 prev_value;
+  do {
+    if (OSAtomicCompareAndSwap64(old_value, new_value,
+                                 const_cast<Atomic64*>(ptr))) {
+      return old_value;
+    }
+    prev_value = *ptr;
+  } while (prev_value == old_value);
+  return prev_value;
+}
+
+inline Atomic64 NoBarrier_AtomicExchange(volatile Atomic64 *ptr,
+                                         Atomic64 new_value) {
+  Atomic64 old_value;
+  do {
+    old_value = *ptr;
+  } while (!OSAtomicCompareAndSwap64(old_value, new_value,
+                                     const_cast<Atomic64*>(ptr)));
+  return old_value;
+}
+
+inline Atomic64 Acquire_AtomicExchange(volatile Atomic64 *ptr,
+                                       Atomic64 new_value) {
+  Atomic64 old_value;
+  do {
+    old_value = *ptr;
+  } while (!OSAtomicCompareAndSwap64Acquire(old_value, new_value,
+                                            const_cast<Atomic64*>(ptr)));
+  return old_value;
+}
+
+inline Atomic64 Release_AtomicExchange(volatile Atomic64 *ptr,
+                                       Atomic64 new_value) {
+  Atomic64 old_value;
+  do {
+    old_value = *ptr;
+  } while (!OSAtomicCompareAndSwap64Release(old_value, new_value,
+                                            const_cast<Atomic64*>(ptr)));
+  return old_value;
+}
+
+inline Atomic64 Acquire_CompareAndSwap(volatile Atomic64 *ptr,
+                                       Atomic64 old_value,
+                                       Atomic64 new_value) {
+  Atomic64 prev_value;
+  do {
+    if (OSAtomicCompareAndSwap64Acquire(old_value, new_value,
+                                        const_cast<Atomic64*>(ptr))) {
+      return old_value;
+    }
+    prev_value = *ptr;
+  } while (prev_value == old_value);
+  return prev_value;
+}
+
+inline Atomic64 Release_CompareAndSwap(volatile Atomic64 *ptr,
+                                       Atomic64 old_value,
+                                       Atomic64 new_value) {
+  Atomic64 prev_value;
+  do {
+    if (OSAtomicCompareAndSwap64Release(old_value, new_value,
+                                        const_cast<Atomic64*>(ptr))) {
+      return old_value;
+    }
+    prev_value = *ptr;
+  } while (prev_value == old_value);
+  return prev_value;
+}
+
+#endif
+
+inline void NoBarrier_Store(volatile Atomic32 *ptr, Atomic32 value) {
+  *ptr = value;
+}
+
+inline void Acquire_Store(volatile Atomic32 *ptr, Atomic32 value) {
+  *ptr = value;
+  // This can't be _lwsync(); we need to order the immediately
+  // preceding stores against any load that may follow, but lwsync
+  // doesn't guarantee that.
+  _sync();
+}
+
+inline void Release_Store(volatile Atomic32 *ptr, Atomic32 value) {
+  _lwsync();
+  *ptr = value;
+}
+
+inline Atomic32 NoBarrier_Load(volatile const Atomic32 *ptr) {
+  return *ptr;
+}
+
+inline Atomic32 Acquire_Load(volatile const Atomic32 *ptr) {
+  Atomic32 value = *ptr;
+  _lwsync();
+  return value;
+}
+
+inline Atomic32 Release_Load(volatile const Atomic32 *ptr) {
+  // This can't be _lwsync(); we need to order the immediately
+  // preceding stores against any load that may follow, but lwsync
+  // doesn't guarantee that.
+  _sync();
+  return *ptr;
+}
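+
+// Illustrative sketch (not part of the original gperftools source): the
+// store-load ordering provided by the full "sync" in Acquire_Store() and
+// Release_Load() above is exactly what a Dekker-style flag handshake needs.
+// The names below are hypothetical and the block is compiled out.
+#if 0
+static Atomic32 flag_a = 0;
+static Atomic32 flag_b = 0;
+
+// Thread A runs this; thread B runs the symmetric code with the flags
+// swapped.  The sync emitted after the store keeps the store to flag_a
+// from being reordered past the load of flag_b.
+bool ThreadAMayEnter() {
+  Acquire_Store(&flag_a, 1);
+  return Release_Load(&flag_b) == 0;
+}
+#endif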
+
+#ifdef __PPC64__
+
+// 64-bit Versions.
+
+inline void NoBarrier_Store(volatile Atomic64 *ptr, Atomic64 value) {
+  *ptr = value;
+}
+
+inline void Acquire_Store(volatile Atomic64 *ptr, Atomic64 value) {
+  *ptr = value;
+  // This can't be _lwsync(); we need to order the immediately
+  // preceding stores against any load that may follow, but lwsync
+  // doesn't guarantee that.
+  _sync();
+}
+
+inline void Release_Store(volatile Atomic64 *ptr, Atomic64 value) {
+  _lwsync();
+  *ptr = value;
+}
+
+inline Atomic64 NoBarrier_Load(volatile const Atomic64 *ptr) {
+  return *ptr;
+}
+
+inline Atomic64 Acquire_Load(volatile const Atomic64 *ptr) {
+  Atomic64 value = *ptr;
+  _lwsync();
+  return value;
+}
+
+inline Atomic64 Release_Load(volatile const Atomic64 *ptr) {
+  // This can't be _lwsync(); we need to order the immediately
+  // preceding stores against any load that may follow, but lwsync
+  // doesn't guarantee that.
+  _sync();
+  return *ptr;
+}
+
+#endif
+
+}   // namespace base::subtle
+}   // namespace base
+
+#endif  // BASE_ATOMICOPS_INTERNALS_LINUXPPC_H_
diff --git a/src/base/atomicops-internals-macosx.h b/src/base/atomicops-internals-macosx.h
new file mode 100644
index 0000000..b5130d4
--- /dev/null
+++ b/src/base/atomicops-internals-macosx.h
@@ -0,0 +1,370 @@
+// -*- Mode: C++; c-basic-offset: 2; indent-tabs-mode: nil -*-
+/* Copyright (c) 2006, Google Inc.
+ * All rights reserved.
+ * 
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are
+ * met:
+ * 
+ *     * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following disclaimer
+ * in the documentation and/or other materials provided with the
+ * distribution.
+ *     * Neither the name of Google Inc. nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ * 
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+// Implementation of atomic operations for Mac OS X.  This file should not
+// be included directly.  Clients should instead include
+// "base/atomicops.h".
+
+#ifndef BASE_ATOMICOPS_INTERNALS_MACOSX_H_
+#define BASE_ATOMICOPS_INTERNALS_MACOSX_H_
+
+typedef int32_t Atomic32;
+
+// MacOS uses long for intptr_t, so AtomicWord and Atomic32 are always
+// different on the Mac, even when they are the same size.  Similarly, on
+// __ppc64__, AtomicWord and Atomic64 are always different.  Thus, we need
+// explicit casting.
+#ifdef __LP64__
+#define AtomicWordCastType base::subtle::Atomic64
+#else
+#define AtomicWordCastType Atomic32
+#endif
+
+#if defined(__LP64__) || defined(__i386__)
+#define BASE_HAS_ATOMIC64 1  // Use only in tests and base/atomic*
+#endif
+
+#include <libkern/OSAtomic.h>
+
+namespace base {
+namespace subtle {
+
+#if !defined(__LP64__) && defined(__ppc__)
+
+// The Mac 64-bit OSAtomic implementations are not available for 32-bit
+// PowerPC, while the underlying assembly instructions are available on
+// only some implementations of PowerPC.
+
+// The following inline functions will fail with an error at compile time
+// ONLY IF they are called.  So it is safe to use this header if user
+// code only calls AtomicWord and Atomic32 operations.
+//
+// NOTE(vchen): Notes on implementing the atomic ops below may be found in
+// "PowerPC Virtual Environment Architecture, Book II, Version 2.02",
+// January 28, 2005, Appendix B, page 46.  Unfortunately, extra care must
+// be taken to ensure data are properly 8-byte aligned, and that data are
+// returned correctly according to Mac OS X ABI specs.
+
+inline int64_t OSAtomicCompareAndSwap64(
+    int64_t oldValue, int64_t newValue, int64_t *theValue) {
+  __asm__ __volatile__(
+      "_OSAtomicCompareAndSwap64_not_supported_for_32_bit_ppc\n\t");
+  return 0;
+}
+
+inline int64_t OSAtomicAdd64(int64_t theAmount, int64_t *theValue) {
+  __asm__ __volatile__(
+      "_OSAtomicAdd64_not_supported_for_32_bit_ppc\n\t");
+  return 0;
+}
+
+inline int64_t OSAtomicCompareAndSwap64Barrier(
+    int64_t oldValue, int64_t newValue, int64_t *theValue) {
+  int64_t prev = OSAtomicCompareAndSwap64(oldValue, newValue, theValue);
+  OSMemoryBarrier();
+  return prev;
+}
+
+inline int64_t OSAtomicAdd64Barrier(
+    int64_t theAmount, int64_t *theValue) {
+  int64_t new_val = OSAtomicAdd64(theAmount, theValue);
+  OSMemoryBarrier();
+  return new_val;
+}
+#endif
+
+typedef int64_t Atomic64;
+
+inline void MemoryBarrier() {
+  OSMemoryBarrier();
+}
+
+// 32-bit Versions.
+
+inline Atomic32 NoBarrier_CompareAndSwap(volatile Atomic32 *ptr,
+                                         Atomic32 old_value,
+                                         Atomic32 new_value) {
+  Atomic32 prev_value;
+  do {
+    if (OSAtomicCompareAndSwap32(old_value, new_value,
+                                 const_cast<Atomic32*>(ptr))) {
+      return old_value;
+    }
+    prev_value = *ptr;
+  } while (prev_value == old_value);
+  return prev_value;
+}
+
+inline Atomic32 NoBarrier_AtomicExchange(volatile Atomic32 *ptr,
+                                         Atomic32 new_value) {
+  Atomic32 old_value;
+  do {
+    old_value = *ptr;
+  } while (!OSAtomicCompareAndSwap32(old_value, new_value,
+                                     const_cast<Atomic32*>(ptr)));
+  return old_value;
+}
+
+inline Atomic32 Acquire_AtomicExchange(volatile Atomic32 *ptr,
+                                       Atomic32 new_value) {
+  Atomic32 old_value;
+  do {
+    old_value = *ptr;
+  } while (!OSAtomicCompareAndSwap32Barrier(old_value, new_value,
+                                            const_cast<Atomic32*>(ptr)));
+  return old_value;
+}
+
+inline Atomic32 Release_AtomicExchange(volatile Atomic32 *ptr,
+                                       Atomic32 new_value) {
+  return Acquire_AtomicExchange(ptr, new_value);
+}
+
+inline Atomic32 Acquire_CompareAndSwap(volatile Atomic32 *ptr,
+                                       Atomic32 old_value,
+                                       Atomic32 new_value) {
+  Atomic32 prev_value;
+  do {
+    if (OSAtomicCompareAndSwap32Barrier(old_value, new_value,
+                                        const_cast<Atomic32*>(ptr))) {
+      return old_value;
+    }
+    prev_value = *ptr;
+  } while (prev_value == old_value);
+  return prev_value;
+}
+
+inline Atomic32 Release_CompareAndSwap(volatile Atomic32 *ptr,
+                                       Atomic32 old_value,
+                                       Atomic32 new_value) {
+  return Acquire_CompareAndSwap(ptr, old_value, new_value);
+}
+
+inline void NoBarrier_Store(volatile Atomic32* ptr, Atomic32 value) {
+  *ptr = value;
+}
+
+inline void Acquire_Store(volatile Atomic32 *ptr, Atomic32 value) {
+  *ptr = value;
+  MemoryBarrier();
+}
+
+inline void Release_Store(volatile Atomic32 *ptr, Atomic32 value) {
+  MemoryBarrier();
+  *ptr = value;
+}
+
+inline Atomic32 NoBarrier_Load(volatile const Atomic32* ptr) {
+  return *ptr;
+}
+
+inline Atomic32 Acquire_Load(volatile const Atomic32 *ptr) {
+  Atomic32 value = *ptr;
+  MemoryBarrier();
+  return value;
+}
+
+inline Atomic32 Release_Load(volatile const Atomic32 *ptr) {
+  MemoryBarrier();
+  return *ptr;
+}
+
+// 64-bit Versions.
+
+inline Atomic64 NoBarrier_CompareAndSwap(volatile Atomic64 *ptr,
+                                         Atomic64 old_value,
+                                         Atomic64 new_value) {
+  Atomic64 prev_value;
+  do {
+    if (OSAtomicCompareAndSwap64(old_value, new_value,
+                                 const_cast<Atomic64*>(ptr))) {
+      return old_value;
+    }
+    prev_value = *ptr;
+  } while (prev_value == old_value);
+  return prev_value;
+}
+
+inline Atomic64 NoBarrier_AtomicExchange(volatile Atomic64 *ptr,
+                                         Atomic64 new_value) {
+  Atomic64 old_value;
+  do {
+    old_value = *ptr;
+  } while (!OSAtomicCompareAndSwap64(old_value, new_value,
+                                     const_cast<Atomic64*>(ptr)));
+  return old_value;
+}
+
+inline Atomic64 Acquire_AtomicExchange(volatile Atomic64 *ptr,
+                                       Atomic64 new_value) {
+  Atomic64 old_value;
+  do {
+    old_value = *ptr;
+  } while (!OSAtomicCompareAndSwap64Barrier(old_value, new_value,
+                                            const_cast<Atomic64*>(ptr)));
+  return old_value;
+}
+
+inline Atomic64 Release_AtomicExchange(volatile Atomic64 *ptr,
+                                       Atomic64 new_value) {
+  return Acquire_AtomicExchange(ptr, new_value);
+}
+
+inline Atomic64 Acquire_CompareAndSwap(volatile Atomic64 *ptr,
+                                       Atomic64 old_value,
+                                       Atomic64 new_value) {
+  Atomic64 prev_value;
+  do {
+    if (OSAtomicCompareAndSwap64Barrier(old_value, new_value,
+                                        const_cast<Atomic64*>(ptr))) {
+      return old_value;
+    }
+    prev_value = *ptr;
+  } while (prev_value == old_value);
+  return prev_value;
+}
+
+inline Atomic64 Release_CompareAndSwap(volatile Atomic64 *ptr,
+                                       Atomic64 old_value,
+                                       Atomic64 new_value) {
+  // The libkern interface does not distinguish between
+  // Acquire and Release memory barriers; they are equivalent.
+  return Acquire_CompareAndSwap(ptr, old_value, new_value);
+}
+
+#ifdef __LP64__
+
+// 64-bit implementation on 64-bit platform
+
+inline void NoBarrier_Store(volatile Atomic64* ptr, Atomic64 value) {
+  *ptr = value;
+}
+
+inline void Acquire_Store(volatile Atomic64 *ptr, Atomic64 value) {
+  *ptr = value;
+  MemoryBarrier();
+}
+
+inline void Release_Store(volatile Atomic64 *ptr, Atomic64 value) {
+  MemoryBarrier();
+  *ptr = value;
+}
+
+inline Atomic64 NoBarrier_Load(volatile const Atomic64* ptr) {
+  return *ptr;
+}
+
+inline Atomic64 Acquire_Load(volatile const Atomic64 *ptr) {
+  Atomic64 value = *ptr;
+  MemoryBarrier();
+  return value;
+}
+
+inline Atomic64 Release_Load(volatile const Atomic64 *ptr) {
+  MemoryBarrier();
+  return *ptr;
+}
+
+#else
+
+// 64-bit implementation on 32-bit platform
+
+#if defined(__ppc__)
+
+inline void NoBarrier_Store(volatile Atomic64* ptr, Atomic64 value) {
+   __asm__ __volatile__(
+       "_NoBarrier_Store_not_supported_for_32_bit_ppc\n\t");
+}
+
+inline Atomic64 NoBarrier_Load(volatile const Atomic64* ptr) {
+   __asm__ __volatile__(
+       "_NoBarrier_Load_not_supported_for_32_bit_ppc\n\t");
+   return 0;
+}
+
+#elif defined(__i386__)
+
+inline void NoBarrier_Store(volatile Atomic64* ptr, Atomic64 value) {
+  __asm__ __volatile__("movq %1, %%mm0\n\t"    // Use mmx reg for 64-bit atomic
+                       "movq %%mm0, %0\n\t"  // moves (ptr could be read-only)
+                       "emms\n\t"              // Reset FP registers
+                       : "=m" (*ptr)
+                       : "m" (value)
+                       : // mark the FP stack and mmx registers as clobbered
+                         "st", "st(1)", "st(2)", "st(3)", "st(4)",
+                         "st(5)", "st(6)", "st(7)", "mm0", "mm1",
+                         "mm2", "mm3", "mm4", "mm5", "mm6", "mm7");
+
+}
+
+inline Atomic64 NoBarrier_Load(volatile const Atomic64* ptr) {
+  Atomic64 value;
+  __asm__ __volatile__("movq %1, %%mm0\n\t"  // Use mmx reg for 64-bit atomic
+                       "movq %%mm0, %0\n\t"  // moves (ptr could be read-only)
+                       "emms\n\t"            // Reset FP registers
+                       : "=m" (value)
+                       : "m" (*ptr)
+                       : // mark the FP stack and mmx registers as clobbered
+                         "st", "st(1)", "st(2)", "st(3)", "st(4)",
+                         "st(5)", "st(6)", "st(7)", "mm0", "mm1",
+                         "mm2", "mm3", "mm4", "mm5", "mm6", "mm7");
+
+  return value;
+}
+#endif
+
+
+inline void Acquire_Store(volatile Atomic64 *ptr, Atomic64 value) {
+  NoBarrier_Store(ptr, value);
+  MemoryBarrier();
+}
+
+inline void Release_Store(volatile Atomic64 *ptr, Atomic64 value) {
+  MemoryBarrier();
+  NoBarrier_Store(ptr, value);
+}
+
+inline Atomic64 Acquire_Load(volatile const Atomic64 *ptr) {
+  Atomic64 value = NoBarrier_Load(ptr);
+  MemoryBarrier();
+  return value;
+}
+
+inline Atomic64 Release_Load(volatile const Atomic64 *ptr) {
+  MemoryBarrier();
+  return NoBarrier_Load(ptr);
+}
+#endif  // __LP64__
+
+}   // namespace base::subtle
+}   // namespace base
+
+#endif  // BASE_ATOMICOPS_INTERNALS_MACOSX_H_
diff --git a/src/base/atomicops-internals-mips.h b/src/base/atomicops-internals-mips.h
new file mode 100644
index 0000000..4bfd7f6
--- /dev/null
+++ b/src/base/atomicops-internals-mips.h
@@ -0,0 +1,323 @@
+// -*- Mode: C++; c-basic-offset: 2; indent-tabs-mode: nil -*-
+/* Copyright (c) 2013, Google Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are
+ * met:
+ *
+ *     * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following disclaimer
+ * in the documentation and/or other materials provided with the
+ * distribution.
+ *     * Neither the name of Google Inc. nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+// Author: Jovan Zelincevic <jovan.zelincevic@imgtec.com>
+// based on atomicops-internals by Sanjay Ghemawat
+
+// This file is an internal atomic implementation, use base/atomicops.h instead.
+//
+// This code implements MIPS atomics.
+
+#ifndef BASE_ATOMICOPS_INTERNALS_MIPS_H_
+#define BASE_ATOMICOPS_INTERNALS_MIPS_H_
+
+#if (_MIPS_ISA == _MIPS_ISA_MIPS64)
+#define BASE_HAS_ATOMIC64 1
+#endif
+
+typedef int32_t Atomic32;
+
+namespace base {
+namespace subtle {
+
+// Atomically execute:
+//   result = *ptr;
+//   if (*ptr == old_value)
+//     *ptr = new_value;
+//   return result;
+//
+// I.e., replace "*ptr" with "new_value" if "*ptr" used to be "old_value".
+// Always return the old value of "*ptr"
+//
+// This routine implies no memory barriers.
+inline Atomic32 NoBarrier_CompareAndSwap(volatile Atomic32* ptr,
+                                         Atomic32 old_value,
+                                         Atomic32 new_value)
+{
+    Atomic32 prev, tmp;
+    __asm__ volatile(
+        ".set   push                \n"
+        ".set   noreorder           \n"
+
+    "1:                             \n"
+        "ll     %0,     %5          \n" // prev = *ptr
+        "bne    %0,     %3,     2f  \n" // if (prev != old_value) goto 2
+        " move  %2,     %4          \n" // tmp = new_value
+        "sc     %2,     %1          \n" // *ptr = tmp (with atomic check)
+        "beqz   %2,     1b          \n" // start again on atomic error
+        " nop                       \n" // delay slot nop
+    "2:                             \n"
+
+        ".set   pop                 \n"
+        : "=&r" (prev), "=m" (*ptr),
+          "=&r" (tmp)
+        : "Ir" (old_value), "r" (new_value),
+          "m" (*ptr)
+        : "memory"
+    );
+    return prev;
+}
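+
+// Illustrative sketch (not part of the original file): the usual way this
+// primitive is consumed is a read-compute-CAS retry loop.  The helper name
+// is hypothetical and the block is compiled out.
+#if 0
+inline Atomic32 AtomicIncrementSketch(volatile Atomic32* ptr, Atomic32 delta) {
+  Atomic32 old_value;
+  do {
+    old_value = *ptr;
+    // The CAS returns the previous value of *ptr; if another thread changed
+    // it between the read and the CAS, retry with the fresh value.
+  } while (NoBarrier_CompareAndSwap(ptr, old_value, old_value + delta) !=
+           old_value);
+  return old_value + delta;
+}
+#endif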
+
+// Atomically store new_value into *ptr, returning the previous value held in
+// *ptr. This routine implies no memory barriers.
+inline Atomic32 NoBarrier_AtomicExchange(volatile Atomic32* ptr,
+                                         Atomic32 new_value)
+{
+    Atomic32 temp, old;
+    __asm__ volatile(
+        ".set   push                \n"
+        ".set   noreorder           \n"
+
+    "1:                             \n"
+        "ll     %1,     %2          \n" // old = *ptr
+        "move   %0,     %3          \n" // temp = new_value
+        "sc     %0,     %2          \n" // *ptr = temp (with atomic check)
+        "beqz   %0,     1b          \n" // start again on atomic error
+        " nop                       \n" // delay slot nop
+
+        ".set   pop                 \n"
+        : "=&r" (temp), "=&r" (old),
+          "=m" (*ptr)
+        : "r" (new_value), "m" (*ptr)
+        : "memory"
+    );
+    return old;
+}
+
+inline void MemoryBarrier()
+{
+    __asm__ volatile("sync" : : : "memory");
+}
+
+// "Acquire" operations
+// ensure that no later memory access can be reordered ahead of the operation.
+// "Release" operations ensure that no previous memory access can be reordered
+// after the operation. "Barrier" operations have both "Acquire" and "Release"
+// semantics. A MemoryBarrier() has "Barrier" semantics, but does no memory
+// access.
+inline Atomic32 Acquire_CompareAndSwap(volatile Atomic32* ptr,
+                                       Atomic32 old_value,
+                                       Atomic32 new_value)
+{
+    Atomic32 res = NoBarrier_CompareAndSwap(ptr, old_value, new_value);
+    MemoryBarrier();
+    return res;
+}
+
+inline Atomic32 Release_CompareAndSwap(volatile Atomic32* ptr,
+                                       Atomic32 old_value,
+                                       Atomic32 new_value)
+{
+    MemoryBarrier();
+    Atomic32 res = NoBarrier_CompareAndSwap(ptr, old_value, new_value);
+    return res;
+}
+
+inline void NoBarrier_Store(volatile Atomic32* ptr, Atomic32 value)
+{
+    *ptr = value;
+}
+
+inline Atomic32 Acquire_AtomicExchange(volatile Atomic32* ptr,
+                                       Atomic32 new_value)
+{
+    Atomic32 old_value = NoBarrier_AtomicExchange(ptr, new_value);
+    MemoryBarrier();
+    return old_value;
+}
+
+inline Atomic32 Release_AtomicExchange(volatile Atomic32* ptr,
+                                       Atomic32 new_value)
+{
+    MemoryBarrier();
+    return NoBarrier_AtomicExchange(ptr, new_value);
+}
+
+inline void Acquire_Store(volatile Atomic32* ptr, Atomic32 value)
+{
+    *ptr = value;
+    MemoryBarrier();
+}
+
+inline void Release_Store(volatile Atomic32* ptr, Atomic32 value)
+{
+    MemoryBarrier();
+    *ptr = value;
+}
+
+inline Atomic32 NoBarrier_Load(volatile const Atomic32* ptr)
+{
+    return *ptr;
+}
+
+inline Atomic32 Acquire_Load(volatile const Atomic32* ptr)
+{
+    Atomic32 value = *ptr;
+    MemoryBarrier();
+    return value;
+}
+
+inline Atomic32 Release_Load(volatile const Atomic32* ptr)
+{
+    MemoryBarrier();
+    return *ptr;
+}
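+
+// Illustrative sketch (not part of the original file): the acquire/release
+// pairing described above is typically used to publish data through a flag.
+// The names below are hypothetical and the block is compiled out.
+#if 0
+static int payload = 0;
+static Atomic32 ready = 0;
+
+void PublisherSketch() {
+  payload = 42;              // plain write
+  Release_Store(&ready, 1);  // barrier before the store: payload visible first
+}
+
+bool ConsumerSketch(int* out) {
+  if (Acquire_Load(&ready) == 0) return false;
+  *out = payload;            // barrier after the load orders this read
+  return true;
+}
+#endif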
+
+#if (_MIPS_ISA == _MIPS_ISA_MIPS64) || (_MIPS_SIM == _MIPS_SIM_ABI64)
+
+typedef int64_t Atomic64;
+
+inline Atomic64 NoBarrier_CompareAndSwap(volatile Atomic64* ptr,
+                                         Atomic64 old_value,
+                                         Atomic64 new_value)
+{
+    Atomic64 prev, tmp;
+    __asm__ volatile(
+        ".set   push                \n"
+        ".set   noreorder           \n"
+
+    "1:                             \n"
+        "lld    %0,     %5          \n" // prev = *ptr
+        "bne    %0,     %3,     2f  \n" // if (prev != old_value) goto 2
+        " move  %2,     %4          \n" // tmp = new_value
+        "scd    %2,     %1          \n" // *ptr = tmp (with atomic check)
+        "beqz   %2,     1b          \n" // start again on atomic error
+        " nop                       \n" // delay slot nop
+    "2:                             \n"
+
+        ".set   pop                 \n"
+        : "=&r" (prev), "=m" (*ptr),
+          "=&r" (tmp)
+        : "Ir" (old_value), "r" (new_value),
+          "m" (*ptr)
+        : "memory"
+    );
+    return prev;
+}
+
+inline Atomic64 NoBarrier_AtomicExchange(volatile Atomic64* ptr,
+                                         Atomic64 new_value)
+{
+    Atomic64 temp, old;
+    __asm__ volatile(
+        ".set   push                \n"
+        ".set   noreorder           \n"
+
+    "1:                             \n"
+        "lld    %1,     %2          \n" // old = *ptr
+        "move   %0,     %3          \n" // temp = new_value
+        "scd    %0,     %2          \n" // *ptr = temp (with atomic check)
+        "beqz   %0,     1b          \n" // start again on atomic error
+        " nop                       \n" // delay slot nop
+
+        ".set   pop                 \n"
+        : "=&r" (temp), "=&r" (old),
+          "=m" (*ptr)
+        : "r" (new_value), "m" (*ptr)
+        : "memory"
+    );
+    return old;
+}
+
+inline Atomic64 Acquire_AtomicExchange(volatile Atomic64* ptr,
+                                       Atomic64 new_value)
+{
+    Atomic64 old_value = NoBarrier_AtomicExchange(ptr, new_value);
+    MemoryBarrier();
+    return old_value;
+}
+
+inline Atomic64 Acquire_CompareAndSwap(volatile Atomic64* ptr,
+                                       Atomic64 old_value,
+                                       Atomic64 new_value)
+{
+    Atomic64 res = NoBarrier_CompareAndSwap(ptr, old_value, new_value);
+    MemoryBarrier();
+    return res;
+}
+
+inline Atomic64 Release_CompareAndSwap(volatile Atomic64* ptr,
+                                       Atomic64 old_value,
+                                       Atomic64 new_value)
+{
+    MemoryBarrier();
+    Atomic64 res = NoBarrier_CompareAndSwap(ptr, old_value, new_value);
+    return res;
+}
+
+inline void NoBarrier_Store(volatile Atomic64* ptr, Atomic64 value)
+{
+    *ptr = value;
+}
+
+inline Atomic64 Release_AtomicExchange(volatile Atomic64* ptr,
+                                       Atomic64 new_value)
+{
+    MemoryBarrier();
+    return NoBarrier_AtomicExchange(ptr, new_value);
+}
+
+inline void Acquire_Store(volatile Atomic64* ptr, Atomic64 value)
+{
+    *ptr = value;
+    MemoryBarrier();
+}
+
+inline void Release_Store(volatile Atomic64* ptr, Atomic64 value)
+{
+    MemoryBarrier();
+    *ptr = value;
+}
+
+inline Atomic64 NoBarrier_Load(volatile const Atomic64* ptr)
+{
+    return *ptr;
+}
+
+inline Atomic64 Acquire_Load(volatile const Atomic64* ptr)
+{
+    Atomic64 value = *ptr;
+    MemoryBarrier();
+    return value;
+}
+
+inline Atomic64 Release_Load(volatile const Atomic64* ptr)
+{
+    MemoryBarrier();
+    return *ptr;
+}
+
+#endif
+
+}   // namespace base::subtle
+}   // namespace base
+
+#endif  // BASE_ATOMICOPS_INTERNALS_MIPS_H_
diff --git a/src/base/atomicops-internals-windows.h b/src/base/atomicops-internals-windows.h
new file mode 100644
index 0000000..93ced87
--- /dev/null
+++ b/src/base/atomicops-internals-windows.h
@@ -0,0 +1,457 @@
+// -*- Mode: C++; c-basic-offset: 2; indent-tabs-mode: nil -*-
+/* Copyright (c) 2006, Google Inc.
+ * All rights reserved.
+ * 
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are
+ * met:
+ * 
+ *     * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following disclaimer
+ * in the documentation and/or other materials provided with the
+ * distribution.
+ *     * Neither the name of Google Inc. nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ * 
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * ---
+ * Author: Sanjay Ghemawat
+ */
+
+// Implementation of atomic operations using Windows API
+// functions.  This file should not be included directly.  Clients
+// should instead include "base/atomicops.h".
+
+#ifndef BASE_ATOMICOPS_INTERNALS_WINDOWS_H_
+#define BASE_ATOMICOPS_INTERNALS_WINDOWS_H_
+
+#include <stdio.h>
+#include <stdlib.h>
+#include "base/basictypes.h"  // For COMPILE_ASSERT
+
+typedef int32 Atomic32;
+
+#if defined(_WIN64)
+#define BASE_HAS_ATOMIC64 1  // Use only in tests and base/atomic*
+#endif
+
+namespace base {
+namespace subtle {
+
+typedef int64 Atomic64;
+
+// 32-bit low-level operations on any platform
+
+extern "C" {
+// We use windows intrinsics when we can (they seem to be supported
+// well on MSVC 8.0 and above).  Unfortunately, in some
+// environments, <windows.h> and <intrin.h> have conflicting
+// declarations of some other intrinsics, breaking compilation:
+//   http://connect.microsoft.com/VisualStudio/feedback/details/262047
+// Therefore, we simply declare the relevant intrinsics ourself.
+
+// MinGW has a bug in the header files where it doesn't indicate the
+// first argument is volatile -- they're not up to date.  See
+//   http://readlist.com/lists/lists.sourceforge.net/mingw-users/0/3861.html
+// We have to const_cast away the volatile to avoid compiler warnings.
+// TODO(csilvers): remove this once MinGW has updated MinGW/include/winbase.h
+#if defined(__MINGW32__)
+inline LONG FastInterlockedCompareExchange(volatile LONG* ptr,
+                                           LONG newval, LONG oldval) {
+  return ::InterlockedCompareExchange(const_cast<LONG*>(ptr), newval, oldval);
+}
+inline LONG FastInterlockedExchange(volatile LONG* ptr, LONG newval) {
+  return ::InterlockedExchange(const_cast<LONG*>(ptr), newval);
+}
+inline LONG FastInterlockedExchangeAdd(volatile LONG* ptr, LONG increment) {
+  return ::InterlockedExchangeAdd(const_cast<LONG*>(ptr), increment);
+}
+
+#elif _MSC_VER >= 1400   // intrinsics didn't work so well before MSVC 8.0
+// Unfortunately, in some environments, <windows.h> and <intrin.h>
+// have conflicting declarations of some intrinsics, breaking
+// compilation.  So we declare the intrinsics we need ourselves.  See
+//   http://connect.microsoft.com/VisualStudio/feedback/details/262047
+LONG _InterlockedCompareExchange(volatile LONG* ptr, LONG newval, LONG oldval);
+#pragma intrinsic(_InterlockedCompareExchange)
+inline LONG FastInterlockedCompareExchange(volatile LONG* ptr,
+                                           LONG newval, LONG oldval) {
+  return _InterlockedCompareExchange(ptr, newval, oldval);
+}
+
+LONG _InterlockedExchange(volatile LONG* ptr, LONG newval);
+#pragma intrinsic(_InterlockedExchange)
+inline LONG FastInterlockedExchange(volatile LONG* ptr, LONG newval) {
+  return _InterlockedExchange(ptr, newval);
+}
+
+LONG _InterlockedExchangeAdd(volatile LONG* ptr, LONG increment);
+#pragma intrinsic(_InterlockedExchangeAdd)
+inline LONG FastInterlockedExchangeAdd(volatile LONG* ptr, LONG increment) {
+  return _InterlockedExchangeAdd(ptr, increment);
+}
+
+#else
+inline LONG FastInterlockedCompareExchange(volatile LONG* ptr,
+                                           LONG newval, LONG oldval) {
+  return ::InterlockedCompareExchange(ptr, newval, oldval);
+}
+inline LONG FastInterlockedExchange(volatile LONG* ptr, LONG newval) {
+  return ::InterlockedExchange(ptr, newval);
+}
+inline LONG FastInterlockedExchangeAdd(volatile LONG* ptr, LONG increment) {
+  return ::InterlockedExchangeAdd(ptr, increment);
+}
+
+#endif  // ifdef __MINGW32__
+}  // extern "C"
+
+inline Atomic32 NoBarrier_CompareAndSwap(volatile Atomic32* ptr,
+                                         Atomic32 old_value,
+                                         Atomic32 new_value) {
+  LONG result = FastInterlockedCompareExchange(
+      reinterpret_cast<volatile LONG*>(ptr),
+      static_cast<LONG>(new_value),
+      static_cast<LONG>(old_value));
+  return static_cast<Atomic32>(result);
+}
+
+inline Atomic32 NoBarrier_AtomicExchange(volatile Atomic32* ptr,
+                                         Atomic32 new_value) {
+  LONG result = FastInterlockedExchange(
+      reinterpret_cast<volatile LONG*>(ptr),
+      static_cast<LONG>(new_value));
+  return static_cast<Atomic32>(result);
+}
+
+inline Atomic32 Acquire_AtomicExchange(volatile Atomic32* ptr,
+                                       Atomic32 new_value) {
+  // FastInterlockedExchange has both acquire and release memory barriers.
+  return NoBarrier_AtomicExchange(ptr, new_value);
+}
+
+inline Atomic32 Release_AtomicExchange(volatile Atomic32* ptr,
+                                       Atomic32 new_value) {
+  // FastInterlockedExchange has both acquire and release memory barriers.
+  return NoBarrier_AtomicExchange(ptr, new_value);
+}
+
+}  // namespace base::subtle
+}  // namespace base
+
+
+// In msvc8/vs2005, winnt.h already contains a definition for
+// MemoryBarrier in the global namespace.  Add it there for earlier
+// versions and forward to it from within the namespace.
+#if !(defined(_MSC_VER) && _MSC_VER >= 1400)
+inline void MemoryBarrier() {
+  Atomic32 value = 0;
+  base::subtle::NoBarrier_AtomicExchange(&value, 0);
+                        // actually acts as a barrier in this implementation
+}
+#endif
+
+namespace base {
+namespace subtle {
+
+inline void MemoryBarrier() {
+  ::MemoryBarrier();
+}
+
+inline Atomic32 Acquire_CompareAndSwap(volatile Atomic32* ptr,
+                                       Atomic32 old_value,
+                                       Atomic32 new_value) {
+  return NoBarrier_CompareAndSwap(ptr, old_value, new_value);
+}
+
+inline Atomic32 Release_CompareAndSwap(volatile Atomic32* ptr,
+                                       Atomic32 old_value,
+                                       Atomic32 new_value) {
+  return NoBarrier_CompareAndSwap(ptr, old_value, new_value);
+}
+
+inline void NoBarrier_Store(volatile Atomic32* ptr, Atomic32 value) {
+  *ptr = value;
+}
+
+inline void Acquire_Store(volatile Atomic32* ptr, Atomic32 value) {
+  Acquire_AtomicExchange(ptr, value);
+}
+
+inline void Release_Store(volatile Atomic32* ptr, Atomic32 value) {
+  *ptr = value; // works w/o barrier for current Intel chips as of June 2005
+  // See comments in Atomic64 version of Release_Store() below.
+}
+
+inline Atomic32 NoBarrier_Load(volatile const Atomic32* ptr) {
+  return *ptr;
+}
+
+inline Atomic32 Acquire_Load(volatile const Atomic32* ptr) {
+  Atomic32 value = *ptr;
+  return value;
+}
+
+inline Atomic32 Release_Load(volatile const Atomic32* ptr) {
+  MemoryBarrier();
+  return *ptr;
+}
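+
+// Illustrative sketch (not part of the original file): a minimal test-and-set
+// spinlock built on the exchange/store primitives above.  The underlying
+// FastInterlockedExchange already provides a full barrier, so taking the lock
+// also acquires.  The names are hypothetical and the block is compiled out.
+#if 0
+inline void SpinLockSketch(volatile Atomic32* lock_word) {
+  // Spin until we observe 0 and atomically replace it with 1.
+  while (Acquire_AtomicExchange(lock_word, 1) != 0) {
+    // busy-wait
+  }
+}
+
+inline void SpinUnlockSketch(volatile Atomic32* lock_word) {
+  Release_Store(lock_word, 0);  // plain store suffices here, see above
+}
+#endif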
+
+// 64-bit operations
+
+#if defined(_WIN64) || defined(__MINGW64__)
+
+// 64-bit low-level operations on 64-bit platform.
+
+COMPILE_ASSERT(sizeof(Atomic64) == sizeof(PVOID), atomic_word_is_atomic);
+
+// These are the intrinsics needed for 64-bit operations.  Similar to the
+// 32-bit case above.
+
+extern "C" {
+#if defined(__MINGW64__)
+inline PVOID FastInterlockedCompareExchangePointer(volatile PVOID* ptr,
+                                                   PVOID newval, PVOID oldval) {
+  return ::InterlockedCompareExchangePointer(const_cast<PVOID*>(ptr),
+                                             newval, oldval);
+}
+inline PVOID FastInterlockedExchangePointer(volatile PVOID* ptr, PVOID newval) {
+  return ::InterlockedExchangePointer(const_cast<PVOID*>(ptr), newval);
+}
+inline LONGLONG FastInterlockedExchangeAdd64(volatile LONGLONG* ptr,
+                                             LONGLONG increment) {
+  return ::InterlockedExchangeAdd64(const_cast<LONGLONG*>(ptr), increment);
+}
+
+#elif _MSC_VER >= 1400   // intrinsics didn't work so well before MSVC 8.0
+// Like above, we need to declare the intrinsics ourselves.
+PVOID _InterlockedCompareExchangePointer(volatile PVOID* ptr,
+                                         PVOID newval, PVOID oldval);
+#pragma intrinsic(_InterlockedCompareExchangePointer)
+inline PVOID FastInterlockedCompareExchangePointer(volatile PVOID* ptr,
+                                                   PVOID newval, PVOID oldval) {
+  return _InterlockedCompareExchangePointer(const_cast<PVOID*>(ptr),
+                                            newval, oldval);
+}
+
+PVOID _InterlockedExchangePointer(volatile PVOID* ptr, PVOID newval);
+#pragma intrinsic(_InterlockedExchangePointer)
+inline PVOID FastInterlockedExchangePointer(volatile PVOID* ptr, PVOID newval) {
+  return _InterlockedExchangePointer(const_cast<PVOID*>(ptr), newval);
+}
+
+LONGLONG _InterlockedExchangeAdd64(volatile LONGLONG* ptr, LONGLONG increment);
+#pragma intrinsic(_InterlockedExchangeAdd64)
+inline LONGLONG FastInterlockedExchangeAdd64(volatile LONGLONG* ptr,
+                                             LONGLONG increment) {
+  return _InterlockedExchangeAdd64(const_cast<LONGLONG*>(ptr), increment);
+}
+
+#else
+inline PVOID FastInterlockedCompareExchangePointer(volatile PVOID* ptr,
+                                                   PVOID newval, PVOID oldval) {
+  return ::InterlockedCompareExchangePointer(ptr, newval, oldval);
+}
+inline PVOID FastInterlockedExchangePointer(volatile PVOID* ptr, PVOID newval) {
+  return ::InterlockedExchangePointer(ptr, newval);
+}
+inline LONGLONG FastInterlockedExchangeAdd64(volatile LONGLONG* ptr,
+                                             LONGLONG increment) {
+  return ::InterlockedExchangeAdd64(ptr, increment);
+}
+
+#endif  // ifdef __MINGW64__
+}  // extern "C"
+
+inline Atomic64 NoBarrier_CompareAndSwap(volatile Atomic64* ptr,
+                                         Atomic64 old_value,
+                                         Atomic64 new_value) {
+  PVOID result = FastInterlockedCompareExchangePointer(
+    reinterpret_cast<volatile PVOID*>(ptr),
+    reinterpret_cast<PVOID>(new_value), reinterpret_cast<PVOID>(old_value));
+  return reinterpret_cast<Atomic64>(result);
+}
+
+inline Atomic64 NoBarrier_AtomicExchange(volatile Atomic64* ptr,
+                                         Atomic64 new_value) {
+  PVOID result = FastInterlockedExchangePointer(
+    reinterpret_cast<volatile PVOID*>(ptr),
+    reinterpret_cast<PVOID>(new_value));
+  return reinterpret_cast<Atomic64>(result);
+}
+
+inline void NoBarrier_Store(volatile Atomic64* ptr, Atomic64 value) {
+  *ptr = value;
+}
+
+inline void Acquire_Store(volatile Atomic64* ptr, Atomic64 value) {
+  NoBarrier_AtomicExchange(ptr, value);
+              // acts as a barrier in this implementation
+}
+
+inline void Release_Store(volatile Atomic64* ptr, Atomic64 value) {
+  *ptr = value; // works w/o barrier for current Intel chips as of June 2005
+
+  // When new chips come out, check:
+  //  IA-32 Intel Architecture Software Developer's Manual, Volume 3:
+  //  System Programming Guide, Chapter 7: Multiple-processor management,
+  //  Section 7.2, Memory Ordering.
+  // Last seen at:
+  //   http://developer.intel.com/design/pentium4/manuals/index_new.htm
+}
+
+inline Atomic64 NoBarrier_Load(volatile const Atomic64* ptr) {
+  return *ptr;
+}
+
+inline Atomic64 Acquire_Load(volatile const Atomic64* ptr) {
+  Atomic64 value = *ptr;
+  return value;
+}
+
+inline Atomic64 Release_Load(volatile const Atomic64* ptr) {
+  MemoryBarrier();
+  return *ptr;
+}
+
+#else  // defined(_WIN64) || defined(__MINGW64__)
+
+// 64-bit low-level operations on 32-bit platform
+
+// TODO(vchen): The GNU assembly below must be converted to MSVC inline
+// assembly.  Then the file should be renamed to ...-x86-msvc.h, probably.
+
+inline void NotImplementedFatalError(const char *function_name) {
+  fprintf(stderr, "64-bit %s() not implemented on this platform\n",
+          function_name);
+  abort();
+}
+
+inline Atomic64 NoBarrier_CompareAndSwap(volatile Atomic64* ptr,
+                                         Atomic64 old_value,
+                                         Atomic64 new_value) {
+#if 0 // Not implemented
+  Atomic64 prev;
+  __asm__ __volatile__("movl (%3), %%ebx\n\t"    // Move 64-bit new_value into
+                       "movl 4(%3), %%ecx\n\t"   // ecx:ebx
+                       "lock; cmpxchg8b %1\n\t"  // If edx:eax (old_value) same
+                       : "=A" (prev)             // as contents of ptr:
+                       : "m" (*ptr),             //   ecx:ebx => ptr
+                         "0" (old_value),        // else:
+                         "r" (&new_value)        //   old *ptr => edx:eax
+                       : "memory", "%ebx", "%ecx");
+  return prev;
+#else
+  NotImplementedFatalError("NoBarrier_CompareAndSwap");
+  return 0;
+#endif
+}
+
+inline Atomic64 NoBarrier_AtomicExchange(volatile Atomic64* ptr,
+                                         Atomic64 new_value) {
+#if 0 // Not implemented
+  __asm__ __volatile__(
+                       "movl (%2), %%ebx\n\t"    // Move 64-bit new_value into
+                       "movl 4(%2), %%ecx\n\t"   // ecx:ebx
+                       "0:\n\t"
+                       "movl %1, %%eax\n\t"      // Read contents of ptr into
+                       "movl 4%1, %%edx\n\t"     // edx:eax
+                       "lock; cmpxchg8b %1\n\t"  // Attempt cmpxchg; if *ptr
+                       "jnz 0b\n\t"              // is no longer edx:eax, loop
+                       : "=A" (new_value)
+                       : "m" (*ptr),
+                         "r" (&new_value)
+                       : "memory", "%ebx", "%ecx");
+  return new_value;  // Now it's the previous value.
+#else
+  NotImplementedFatalError("NoBarrier_AtomicExchange");
+  return 0;
+#endif
+}
+
+inline void NoBarrier_Store(volatile Atomic64* ptrValue, Atomic64 value)
+{
+  __asm {
+    movq mm0, value;   // Use mmx reg for 64-bit atomic moves
+    mov eax, ptrValue;
+    movq [eax], mm0;
+    emms;              // Empty mmx state to enable FP registers
+  }
+}
+
+inline void Acquire_Store(volatile Atomic64* ptr, Atomic64 value) {
+  NoBarrier_AtomicExchange(ptr, value);
+              // acts as a barrier in this implementation
+}
+
+inline void Release_Store(volatile Atomic64* ptr, Atomic64 value) {
+  NoBarrier_Store(ptr, value);
+}
+
+inline Atomic64 NoBarrier_Load(volatile const Atomic64* ptrValue)
+{
+  Atomic64 value;
+  __asm {
+    mov eax, ptrValue;
+    movq mm0, [eax];   // Use mmx reg for 64-bit atomic moves
+    movq value, mm0;
+    emms;              // Empty mmx state to enable FP registers
+  }
+  return value;
+}
+
+inline Atomic64 Acquire_Load(volatile const Atomic64* ptr) {
+  Atomic64 value = NoBarrier_Load(ptr);
+  return value;
+}
+
+inline Atomic64 Release_Load(volatile const Atomic64* ptr) {
+  MemoryBarrier();
+  return NoBarrier_Load(ptr);
+}
+
+#endif  // defined(_WIN64) || defined(__MINGW64__)
+
+
+inline Atomic64 Acquire_AtomicExchange(volatile Atomic64* ptr,
+                                       Atomic64 new_value) {
+  // FastInterlockedExchange has both acquire and release memory barriers.
+  return NoBarrier_AtomicExchange(ptr, new_value);
+}
+
+inline Atomic64 Release_AtomicExchange(volatile Atomic64* ptr,
+                                       Atomic64 new_value) {
+  // FastInterlockedExchange has both acquire and release memory barriers.
+  return NoBarrier_AtomicExchange(ptr, new_value);
+}
+
+inline Atomic64 Acquire_CompareAndSwap(volatile Atomic64* ptr,
+                                       Atomic64 old_value,
+                                       Atomic64 new_value) {
+  return NoBarrier_CompareAndSwap(ptr, old_value, new_value);
+}
+
+inline Atomic64 Release_CompareAndSwap(volatile Atomic64* ptr,
+                                       Atomic64 old_value,
+                                       Atomic64 new_value) {
+  return NoBarrier_CompareAndSwap(ptr, old_value, new_value);
+}
+
+}  // namespace base::subtle
+}  // namespace base
+
+#endif  // BASE_ATOMICOPS_INTERNALS_WINDOWS_H_
diff --git a/src/base/atomicops-internals-x86.cc b/src/base/atomicops-internals-x86.cc
new file mode 100644
index 0000000..c3391e7
--- /dev/null
+++ b/src/base/atomicops-internals-x86.cc
@@ -0,0 +1,112 @@
+// -*- Mode: C++; c-basic-offset: 2; indent-tabs-mode: nil -*-
+/* Copyright (c) 2007, Google Inc.
+ * All rights reserved.
+ * 
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are
+ * met:
+ * 
+ *     * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following disclaimer
+ * in the documentation and/or other materials provided with the
+ * distribution.
+ *     * Neither the name of Google Inc. nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ * 
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * ---
+ * This module gets enough CPU information to optimize the
+ * atomicops module on x86.
+ */
+
+#include "base/atomicops.h"
+#include "base/basictypes.h"
+#include "base/googleinit.h"
+#include "base/logging.h"
+#include <string.h>
+
+// This file only makes sense with atomicops-internals-x86.h -- it
+// depends on structs that are defined in that file.  If atomicops.h
+// doesn't sub-include that file, then we aren't needed, and shouldn't
+// try to do anything.
+#ifdef BASE_ATOMICOPS_INTERNALS_X86_H_
+
+// Inline cpuid instruction.  In PIC compilations, %ebx contains the address
+// of the global offset table.  To avoid breaking such executables, this code
+// must preserve that register's value across cpuid instructions.
+#if defined(__i386__)
+#define cpuid(a, b, c, d, inp) \
+  asm ("mov %%ebx, %%edi\n"    \
+       "cpuid\n"               \
+       "xchg %%edi, %%ebx\n"   \
+       : "=a" (a), "=D" (b), "=c" (c), "=d" (d) : "a" (inp))
+#elif defined (__x86_64__)
+#define cpuid(a, b, c, d, inp) \
+  asm ("mov %%rbx, %%rdi\n"    \
+       "cpuid\n"               \
+       "xchg %%rdi, %%rbx\n"   \
+       : "=a" (a), "=D" (b), "=c" (c), "=d" (d) : "a" (inp))
+#endif
+
+#if defined(cpuid)        // initialize the struct only on x86
+
+// Set the flags so that code will run correctly and conservatively
+// until InitGoogle() is called.
+struct AtomicOps_x86CPUFeatureStruct AtomicOps_Internalx86CPUFeatures = {
+  false,          // no SSE2
+  false           // no cmpxchg16b
+};
+
+// Initialize the AtomicOps_Internalx86CPUFeatures struct.
+static void AtomicOps_Internalx86CPUFeaturesInit() {
+  uint32 eax;
+  uint32 ebx;
+  uint32 ecx;
+  uint32 edx;
+
+  // Get vendor string (issue CPUID with eax = 0)
+  cpuid(eax, ebx, ecx, edx, 0);
+  char vendor[13];
+  memcpy(vendor, &ebx, 4);
+  memcpy(vendor + 4, &edx, 4);
+  memcpy(vendor + 8, &ecx, 4);
+  vendor[12] = 0;
+
+  // get feature flags in ecx/edx, and family/model in eax
+  cpuid(eax, ebx, ecx, edx, 1);
+
+  int family = (eax >> 8) & 0xf;        // family and model fields
+  int model = (eax >> 4) & 0xf;
+  if (family == 0xf) {                  // use extended family and model fields
+    family += (eax >> 20) & 0xff;
+    model += ((eax >> 16) & 0xf) << 4;
+  }
+
+  // edx bit 26 is SSE2, which tells us whether we can use mfence
+  AtomicOps_Internalx86CPUFeatures.has_sse2 = ((edx >> 26) & 1);
+
+  // ecx bit 13 indicates whether the cmpxchg16b instruction is supported
+  AtomicOps_Internalx86CPUFeatures.has_cmpxchg16b = ((ecx >> 13) & 1);
+}
+
+REGISTER_MODULE_INITIALIZER(atomicops_x86, {
+  AtomicOps_Internalx86CPUFeaturesInit();
+});
+
+#endif
+
+#endif  /* ifdef BASE_ATOMICOPS_INTERNALS_X86_H_ */
diff --git a/src/base/atomicops-internals-x86.h b/src/base/atomicops-internals-x86.h
new file mode 100644
index 0000000..e441ac7
--- /dev/null
+++ b/src/base/atomicops-internals-x86.h
@@ -0,0 +1,391 @@
+// -*- Mode: C++; c-basic-offset: 2; indent-tabs-mode: nil -*-
+/* Copyright (c) 2006, Google Inc.
+ * All rights reserved.
+ * 
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are
+ * met:
+ * 
+ *     * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following disclaimer
+ * in the documentation and/or other materials provided with the
+ * distribution.
+ *     * Neither the name of Google Inc. nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ * 
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * ---
+ * Author: Sanjay Ghemawat
+ */
+
+// Implementation of atomic operations for x86.  This file should not
+// be included directly.  Clients should instead include
+// "base/atomicops.h".
+
+#ifndef BASE_ATOMICOPS_INTERNALS_X86_H_
+#define BASE_ATOMICOPS_INTERNALS_X86_H_
+#include "base/basictypes.h"
+
+typedef int32_t Atomic32;
+#define BASE_HAS_ATOMIC64 1  // Use only in tests and base/atomic*
+
+
+// NOTE(vchen): x86 does not need to define AtomicWordCastType, because it
+// already matches Atomic32 or Atomic64, depending on the platform.
+
+
+// This struct is not part of the public API of this module; clients may not
+// use it.
+// Features of this x86 CPU.  Values may not be correct before main() is run,
+// but are set conservatively.
+struct AtomicOps_x86CPUFeatureStruct {
+  bool has_sse2;            // Processor has SSE2.
+  bool has_cmpxchg16b;      // Processor supports cmpxchg16b instruction.
+};
+
+ATTRIBUTE_VISIBILITY_HIDDEN
+extern struct AtomicOps_x86CPUFeatureStruct AtomicOps_Internalx86CPUFeatures;
+
+
+#define ATOMICOPS_COMPILER_BARRIER() __asm__ __volatile__("" : : : "memory")
+
+
+namespace base {
+namespace subtle {
+
+typedef int64_t Atomic64;
+
+// 32-bit low-level operations on any platform.
+
+inline Atomic32 NoBarrier_CompareAndSwap(volatile Atomic32* ptr,
+                                         Atomic32 old_value,
+                                         Atomic32 new_value) {
+  Atomic32 prev;
+  __asm__ __volatile__("lock; cmpxchgl %1,%2"
+                       : "=a" (prev)
+                       : "q" (new_value), "m" (*ptr), "0" (old_value)
+                       : "memory");
+  return prev;
+}
+
+inline Atomic32 NoBarrier_AtomicExchange(volatile Atomic32* ptr,
+                                         Atomic32 new_value) {
+  __asm__ __volatile__("xchgl %1,%0"  // The lock prefix is implicit for xchg.
+                       : "=r" (new_value)
+                       : "m" (*ptr), "0" (new_value)
+                       : "memory");
+  return new_value;  // Now it's the previous value.
+}
+
+inline Atomic32 Acquire_AtomicExchange(volatile Atomic32* ptr,
+                                       Atomic32 new_value) {
+  Atomic32 old_val = NoBarrier_AtomicExchange(ptr, new_value);
+  return old_val;
+}
+
+inline Atomic32 Release_AtomicExchange(volatile Atomic32* ptr,
+                                       Atomic32 new_value) {
+  // xchgl already has release memory barrier semantics.
+  return NoBarrier_AtomicExchange(ptr, new_value);
+}
+
+inline Atomic32 Acquire_CompareAndSwap(volatile Atomic32* ptr,
+                                       Atomic32 old_value,
+                                       Atomic32 new_value) {
+  Atomic32 x = NoBarrier_CompareAndSwap(ptr, old_value, new_value);
+  return x;
+}
+
+inline Atomic32 Release_CompareAndSwap(volatile Atomic32* ptr,
+                                       Atomic32 old_value,
+                                       Atomic32 new_value) {
+  return NoBarrier_CompareAndSwap(ptr, old_value, new_value);
+}
+
+inline void NoBarrier_Store(volatile Atomic32* ptr, Atomic32 value) {
+  *ptr = value;
+}
+
+#if defined(__x86_64__)
+
+// 64-bit implementations of memory barrier can be simpler, because
+// "mfence" is guaranteed to exist.
+inline void MemoryBarrier() {
+  __asm__ __volatile__("mfence" : : : "memory");
+}
+
+inline void Acquire_Store(volatile Atomic32* ptr, Atomic32 value) {
+  *ptr = value;
+  MemoryBarrier();
+}
+
+#else
+
+inline void MemoryBarrier() {
+  if (AtomicOps_Internalx86CPUFeatures.has_sse2) {
+    __asm__ __volatile__("mfence" : : : "memory");
+  } else { // mfence is faster but not present on PIII
+    Atomic32 x = 0;
+    Acquire_AtomicExchange(&x, 0);
+  }
+}
+
+inline void Acquire_Store(volatile Atomic32* ptr, Atomic32 value) {
+  if (AtomicOps_Internalx86CPUFeatures.has_sse2) {
+    *ptr = value;
+    __asm__ __volatile__("mfence" : : : "memory");
+  } else {
+    Acquire_AtomicExchange(ptr, value);
+  }
+}
+#endif
+
+inline void Release_Store(volatile Atomic32* ptr, Atomic32 value) {
+  ATOMICOPS_COMPILER_BARRIER();
+  *ptr = value; // An x86 store acts as a release barrier.
+  // See comments in Atomic64 version of Release_Store(), below.
+}
+
+inline Atomic32 NoBarrier_Load(volatile const Atomic32* ptr) {
+  return *ptr;
+}
+
+inline Atomic32 Acquire_Load(volatile const Atomic32* ptr) {
+  Atomic32 value = *ptr; // An x86 load acts as an acquire barrier.
+  // See comments in Atomic64 version of Release_Store(), below.
+  ATOMICOPS_COMPILER_BARRIER();
+  return value;
+}
+
+inline Atomic32 Release_Load(volatile const Atomic32* ptr) {
+  MemoryBarrier();
+  return *ptr;
+}
+
+#if defined(__x86_64__)
+
+// 64-bit low-level operations on 64-bit platform.
+
+inline Atomic64 NoBarrier_CompareAndSwap(volatile Atomic64* ptr,
+                                         Atomic64 old_value,
+                                         Atomic64 new_value) {
+  Atomic64 prev;
+  __asm__ __volatile__("lock; cmpxchgq %1,%2"
+                       : "=a" (prev)
+                       : "q" (new_value), "m" (*ptr), "0" (old_value)
+                       : "memory");
+  return prev;
+}
+
+inline Atomic64 NoBarrier_AtomicExchange(volatile Atomic64* ptr,
+                                         Atomic64 new_value) {
+  __asm__ __volatile__("xchgq %1,%0"  // The lock prefix is implicit for xchg.
+                       : "=r" (new_value)
+                       : "m" (*ptr), "0" (new_value)
+                       : "memory");
+  return new_value;  // Now it's the previous value.
+}
+
+inline Atomic64 Acquire_AtomicExchange(volatile Atomic64* ptr,
+                                       Atomic64 new_value) {
+  Atomic64 old_val = NoBarrier_AtomicExchange(ptr, new_value);
+  return old_val;
+}
+
+inline Atomic64 Release_AtomicExchange(volatile Atomic64* ptr,
+                                       Atomic64 new_value) {
+  // xchgq already has release memory barrier semantics.
+  return NoBarrier_AtomicExchange(ptr, new_value);
+}
+
+inline void NoBarrier_Store(volatile Atomic64* ptr, Atomic64 value) {
+  *ptr = value;
+}
+
+inline void Acquire_Store(volatile Atomic64* ptr, Atomic64 value) {
+  *ptr = value;
+  MemoryBarrier();
+}
+
+inline void Release_Store(volatile Atomic64* ptr, Atomic64 value) {
+  ATOMICOPS_COMPILER_BARRIER();
+
+  *ptr = value; // An x86 store acts as a release barrier
+                // for current AMD/Intel chips as of Jan 2008.
+                // See also Acquire_Load(), below.
+
+  // When new chips come out, check:
+  //  IA-32 Intel Architecture Software Developer's Manual, Volume 3:
+  //  System Programming Guide, Chapter 7: Multiple-processor management,
+  //  Section 7.2, Memory Ordering.
+  // Last seen at:
+  //   http://developer.intel.com/design/pentium4/manuals/index_new.htm
+  //
+  // x86 stores/loads fail to act as barriers for a few instructions (clflush
+  // maskmovdqu maskmovq movntdq movnti movntpd movntps movntq) but these are
+  // not generated by the compiler, and are rare.  Users of these instructions
+  // need to know about cache behaviour in any case since all of these involve
+  // either flushing cache lines or non-temporal cache hints.
+}
+
+inline Atomic64 NoBarrier_Load(volatile const Atomic64* ptr) {
+  return *ptr;
+}
+
+inline Atomic64 Acquire_Load(volatile const Atomic64* ptr) {
+  Atomic64 value = *ptr; // An x86 load acts as an acquire barrier,
+                         // for current AMD/Intel chips as of Jan 2008.
+                         // See also Release_Store(), above.
+  ATOMICOPS_COMPILER_BARRIER();
+  return value;
+}
+
+inline Atomic64 Release_Load(volatile const Atomic64* ptr) {
+  MemoryBarrier();
+  return *ptr;
+}
+
+#else // defined(__x86_64__)
+
+// 64-bit low-level operations on 32-bit platform.
+
+#if !((__GNUC__ > 4) || (__GNUC__ == 4 && __GNUC_MINOR__ >= 1))
+// For compilers older than gcc 4.1, we use inline asm.
+//
+// Potential pitfalls:
+//
+// 1. %ebx points to Global offset table (GOT) with -fPIC.
+//    We need to preserve this register.
+// 2. When explicit registers are used in inline asm, the
+//    compiler may not be aware of it and might try to reuse
+//    the same register for another argument which has constraints
+//    that allow it ("r" for example).
+
+inline Atomic64 __sync_val_compare_and_swap(volatile Atomic64* ptr,
+                                            Atomic64 old_value,
+                                            Atomic64 new_value) {
+  Atomic64 prev;
+  __asm__ __volatile__("push %%ebx\n\t"
+                       "movl (%3), %%ebx\n\t"    // Move 64-bit new_value into
+                       "movl 4(%3), %%ecx\n\t"   // ecx:ebx
+                       "lock; cmpxchg8b (%1)\n\t"// If edx:eax (old_value) same
+                       "pop %%ebx\n\t"
+                       : "=A" (prev)             // as contents of ptr:
+                       : "D" (ptr),              //   ecx:ebx => ptr
+                         "0" (old_value),        // else:
+                         "S" (&new_value)        //   old *ptr => edx:eax
+                       : "memory", "%ecx");
+  return prev;
+}
+#endif  // Compiler < gcc-4.1
+
+inline Atomic64 NoBarrier_CompareAndSwap(volatile Atomic64* ptr,
+                                         Atomic64 old_val,
+                                         Atomic64 new_val) {
+  return __sync_val_compare_and_swap(ptr, old_val, new_val);
+}
+
+inline Atomic64 NoBarrier_AtomicExchange(volatile Atomic64* ptr,
+                                         Atomic64 new_val) {
+  Atomic64 old_val;
+
+  do {
+    old_val = *ptr;
+  } while (__sync_val_compare_and_swap(ptr, old_val, new_val) != old_val);
+
+  return old_val;
+}
+
+inline Atomic64 Acquire_AtomicExchange(volatile Atomic64* ptr,
+                                       Atomic64 new_val) {
+  Atomic64 old_val = NoBarrier_AtomicExchange(ptr, new_val);
+  return old_val;
+}
+
+inline Atomic64 Release_AtomicExchange(volatile Atomic64* ptr,
+                                       Atomic64 new_val) {
+  return NoBarrier_AtomicExchange(ptr, new_val);
+}
+
+inline void NoBarrier_Store(volatile Atomic64* ptr, Atomic64 value) {
+  __asm__ __volatile__("movq %1, %%mm0\n\t"  // Use mmx reg for 64-bit atomic
+                       "movq %%mm0, %0\n\t"  // moves (ptr could be read-only)
+                       "emms\n\t"            // Empty mmx state/Reset FP regs
+                       : "=m" (*ptr)
+                       : "m" (value)
+                       : // mark the FP stack and mmx registers as clobbered
+                         "st", "st(1)", "st(2)", "st(3)", "st(4)",
+                         "st(5)", "st(6)", "st(7)", "mm0", "mm1",
+                         "mm2", "mm3", "mm4", "mm5", "mm6", "mm7");
+}
+
+inline void Acquire_Store(volatile Atomic64* ptr, Atomic64 value) {
+  NoBarrier_Store(ptr, value);
+  MemoryBarrier();
+}
+
+inline void Release_Store(volatile Atomic64* ptr, Atomic64 value) {
+  ATOMICOPS_COMPILER_BARRIER();
+  NoBarrier_Store(ptr, value);
+}
+
+inline Atomic64 NoBarrier_Load(volatile const Atomic64* ptr) {
+  Atomic64 value;
+  __asm__ __volatile__("movq %1, %%mm0\n\t"  // Use mmx reg for 64-bit atomic
+                       "movq %%mm0, %0\n\t"  // moves (ptr could be read-only)
+                       "emms\n\t"            // Empty mmx state/Reset FP regs
+                       : "=m" (value)
+                       : "m" (*ptr)
+                       : // mark the FP stack and mmx registers as clobbered
+                         "st", "st(1)", "st(2)", "st(3)", "st(4)",
+                         "st(5)", "st(6)", "st(7)", "mm0", "mm1",
+                         "mm2", "mm3", "mm4", "mm5", "mm6", "mm7");
+  return value;
+}
+
+inline Atomic64 Acquire_Load(volatile const Atomic64* ptr) {
+  Atomic64 value = NoBarrier_Load(ptr);
+  ATOMICOPS_COMPILER_BARRIER();
+  return value;
+}
+
+inline Atomic64 Release_Load(volatile const Atomic64* ptr) {
+  MemoryBarrier();
+  return NoBarrier_Load(ptr);
+}
+
+#endif // defined(__x86_64__)
+
+inline Atomic64 Acquire_CompareAndSwap(volatile Atomic64* ptr,
+                                       Atomic64 old_value,
+                                       Atomic64 new_value) {
+  Atomic64 x = NoBarrier_CompareAndSwap(ptr, old_value, new_value);
+  return x;
+}
+
+inline Atomic64 Release_CompareAndSwap(volatile Atomic64* ptr,
+                                       Atomic64 old_value,
+                                       Atomic64 new_value) {
+  return NoBarrier_CompareAndSwap(ptr, old_value, new_value);
+}
+
+} // namespace base::subtle
+} // namespace base
+
+#undef ATOMICOPS_COMPILER_BARRIER
+
+#endif  // BASE_ATOMICOPS_INTERNALS_X86_H_
diff --git a/src/base/atomicops.h b/src/base/atomicops.h
new file mode 100644
index 0000000..be038f3
--- /dev/null
+++ b/src/base/atomicops.h
@@ -0,0 +1,391 @@
+// -*- Mode: C++; c-basic-offset: 2; indent-tabs-mode: nil -*-
+/* Copyright (c) 2006, Google Inc.
+ * All rights reserved.
+ * 
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are
+ * met:
+ * 
+ *     * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following disclaimer
+ * in the documentation and/or other materials provided with the
+ * distribution.
+ *     * Neither the name of Google Inc. nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ * 
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * ---
+ * Author: Sanjay Ghemawat
+ */
+
+// For atomic operations on statistics counters, see atomic_stats_counter.h.
+// For atomic operations on sequence numbers, see atomic_sequence_num.h.
+// For atomic operations on reference counts, see atomic_refcount.h.
+
+// Some fast atomic operations -- typically with machine-dependent
+// implementations.  This file may need editing as Google code is
+// ported to different architectures.
+
+// The routines exported by this module are subtle.  If you use them, even if
+// you get the code right, it will depend on careful reasoning about atomicity
+// and memory ordering; it will be less readable, and harder to maintain.  If
+// you plan to use these routines, you should have a good reason, such as solid
+// evidence that performance would otherwise suffer, or there being no
+// alternative.  You should assume only properties explicitly guaranteed by the
+// specifications in this file.  You are almost certainly _not_ writing code
+// just for the x86; if you assume x86 semantics, x86 hardware bugs and
+// implementations on other architectures will cause your code to break.  If you
+// do not know what you are doing, avoid these routines, and use a Mutex.
+//
+// The following lower-level operations are typically useful only to people
+// implementing higher-level synchronization operations like spinlocks,
+// mutexes, and condition-variables.  They combine CompareAndSwap(), a load, or
+// a store with appropriate memory-ordering instructions.  "Acquire" operations
+// ensure that no later memory access can be reordered ahead of the operation.
+// "Release" operations ensure that no previous memory access can be reordered
+// after the operation.  "Barrier" operations have both "Acquire" and "Release"
+// semantics.   A MemoryBarrier() has "Barrier" semantics, but does no memory
+// access.
+//
+// It is incorrect to make direct assignments to/from an atomic variable.
+// You should use one of the Load or Store routines.  The NoBarrier
+// versions are provided when no barriers are needed:
+//   NoBarrier_Store()
+//   NoBarrier_Load()
+// Although there is currently no compiler enforcement, you are encouraged
+// to use these.  Moreover, if you choose to use the base::subtle::Atomic64
+// type, you MUST use one of the Load or Store routines to get correct
+// behavior on 32-bit platforms.
+//
+// The intent is eventually to put all of these routines in namespace
+// base::subtle
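+//
+// Illustrative sketch (not part of the original documentation): a typical
+// acquire/release pairing publishes data from one thread and consumes it in
+// another.  The names "payload" and "ready_flag" below are hypothetical.
+//
+//   // Writer:
+//   payload = ComputePayload();                      // ordinary writes
+//   base::subtle::Release_Store(&ready_flag, 1);     // publish
+//   // Reader:
+//   if (base::subtle::Acquire_Load(&ready_flag)) {   // observe publication
+//     Use(payload);   // writes preceding the Release_Store are visible here
+//   }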
+
+#ifndef THREAD_ATOMICOPS_H_
+#define THREAD_ATOMICOPS_H_
+
+#include <config.h>
+#ifdef HAVE_STDINT_H
+#include <stdint.h>
+#endif
+
+// ------------------------------------------------------------------------
+// Include the platform specific implementations of the types
+// and operations listed below.  Implementations are to provide Atomic32
+// and Atomic64 operations. If there is a mismatch between intptr_t and
+// the Atomic32 or Atomic64 types for a platform, the platform-specific header
+// should define the macro, AtomicWordCastType in a clause similar to the
+// following:
+// #if ...pointers are 64 bits...
+// # define AtomicWordCastType base::subtle::Atomic64
+// #else
+// # define AtomicWordCastType Atomic32
+// #endif
+// TODO(csilvers): figure out ARCH_PIII/ARCH_K8 (perhaps via ./configure?)
+// ------------------------------------------------------------------------
+
+#include "base/arm_instruction_set_select.h"
+#define GCC_VERSION (__GNUC__ * 10000                 \
+                     + __GNUC_MINOR__ * 100           \
+                     + __GNUC_PATCHLEVEL__)
+
+#if defined(TCMALLOC_PREFER_GCC_ATOMICS) && defined(__GNUC__) && GCC_VERSION >= 40700
+#include "base/atomicops-internals-gcc.h"
+#elif defined(__MACH__) && defined(__APPLE__)
+#include "base/atomicops-internals-macosx.h"
+#elif defined(__GNUC__) && defined(ARMV6)
+#include "base/atomicops-internals-arm-v6plus.h"
+#elif defined(ARMV3)
+#include "base/atomicops-internals-arm-generic.h"
+#elif defined(__GNUC__) && (defined(__i386) || defined(__x86_64__))
+#include "base/atomicops-internals-x86.h"
+#elif defined(_WIN32)
+#include "base/atomicops-internals-windows.h"
+#elif defined(__linux__) && defined(__PPC__)
+#include "base/atomicops-internals-linuxppc.h"
+#elif defined(__GNUC__) && defined(__mips__)
+#include "base/atomicops-internals-mips.h"
+#elif defined(__GNUC__) && GCC_VERSION >= 40700
+#include "base/atomicops-internals-gcc.h"
+#else
+#error You need to implement atomic operations for this architecture
+#endif
+
+// Signed type that can hold a pointer and supports the atomic ops below, as
+// well as atomic loads and stores.  Instances must be naturally-aligned.
+typedef intptr_t AtomicWord;
+
+#ifdef AtomicWordCastType
+// ------------------------------------------------------------------------
+// This section is needed only when explicit type casting is required to
+// cast AtomicWord to one of the basic atomic types (Atomic64 or Atomic32).
+// It also serves to document the AtomicWord interface.
+// ------------------------------------------------------------------------
+
+namespace base {
+namespace subtle {
+
+// Atomically execute:
+//      result = *ptr;
+//      if (*ptr == old_value)
+//        *ptr = new_value;
+//      return result;
+//
+// I.e., replace "*ptr" with "new_value" if "*ptr" used to be "old_value".
+// Always return the old value of "*ptr"
+//
+// This routine implies no memory barriers.
+inline AtomicWord NoBarrier_CompareAndSwap(volatile AtomicWord* ptr,
+                                           AtomicWord old_value,
+                                           AtomicWord new_value) {
+  return NoBarrier_CompareAndSwap(
+      reinterpret_cast<volatile AtomicWordCastType*>(ptr),
+      old_value, new_value);
+}
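+
+// Illustrative sketch (not part of the original header): a lock-free
+// increment built from the compare-and-swap above; "counter" is a
+// hypothetical AtomicWord.
+//
+//   AtomicWord old_val, new_val;
+//   do {
+//     old_val = NoBarrier_Load(&counter);
+//     new_val = old_val + 1;
+//   } while (NoBarrier_CompareAndSwap(&counter, old_val, new_val) != old_val);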
+
+// Atomically store new_value into *ptr, returning the previous value held in
+// *ptr.  This routine implies no memory barriers.
+inline AtomicWord NoBarrier_AtomicExchange(volatile AtomicWord* ptr,
+                                           AtomicWord new_value) {
+  return NoBarrier_AtomicExchange(
+      reinterpret_cast<volatile AtomicWordCastType*>(ptr), new_value);
+}
+
+inline AtomicWord Acquire_AtomicExchange(volatile AtomicWord* ptr,
+                                         AtomicWord new_value) {
+  return Acquire_AtomicExchange(
+      reinterpret_cast<volatile AtomicWordCastType*>(ptr), new_value);
+}
+
+inline AtomicWord Release_AtomicExchange(volatile AtomicWord* ptr,
+                                         AtomicWord new_value) {
+  return Release_AtomicExchange(
+      reinterpret_cast<volatile AtomicWordCastType*>(ptr), new_value);
+}
+
+inline AtomicWord Acquire_CompareAndSwap(volatile AtomicWord* ptr,
+                                         AtomicWord old_value,
+                                         AtomicWord new_value) {
+  return base::subtle::Acquire_CompareAndSwap(
+      reinterpret_cast<volatile AtomicWordCastType*>(ptr),
+      old_value, new_value);
+}
+
+inline AtomicWord Release_CompareAndSwap(volatile AtomicWord* ptr,
+                                         AtomicWord old_value,
+                                         AtomicWord new_value) {
+  return base::subtle::Release_CompareAndSwap(
+      reinterpret_cast<volatile AtomicWordCastType*>(ptr),
+      old_value, new_value);
+}
+
+inline void NoBarrier_Store(volatile AtomicWord *ptr, AtomicWord value) {
+  NoBarrier_Store(
+      reinterpret_cast<volatile AtomicWordCastType*>(ptr), value);
+}
+
+inline void Acquire_Store(volatile AtomicWord* ptr, AtomicWord value) {
+  return base::subtle::Acquire_Store(
+      reinterpret_cast<volatile AtomicWordCastType*>(ptr), value);
+}
+
+inline void Release_Store(volatile AtomicWord* ptr, AtomicWord value) {
+  return base::subtle::Release_Store(
+      reinterpret_cast<volatile AtomicWordCastType*>(ptr), value);
+}
+
+inline AtomicWord NoBarrier_Load(volatile const AtomicWord *ptr) {
+  return NoBarrier_Load(
+      reinterpret_cast<volatile const AtomicWordCastType*>(ptr));
+}
+
+inline AtomicWord Acquire_Load(volatile const AtomicWord* ptr) {
+  return base::subtle::Acquire_Load(
+      reinterpret_cast<volatile const AtomicWordCastType*>(ptr));
+}
+
+inline AtomicWord Release_Load(volatile const AtomicWord* ptr) {
+  return base::subtle::Release_Load(
+      reinterpret_cast<volatile const AtomicWordCastType*>(ptr));
+}
+
+}  // namespace base::subtle
+}  // namespace base
+#endif  // AtomicWordCastType
+
+// ------------------------------------------------------------------------
+// Commented out type definitions and method declarations for documentation
+// of the interface provided by this module.
+// ------------------------------------------------------------------------
+
+#if 0
+
+// Signed 32-bit type that supports the atomic ops below, as well as atomic
+// loads and stores.  Instances must be naturally aligned.  This type differs
+// from AtomicWord in 64-bit binaries where AtomicWord is 64-bits.
+typedef int32_t Atomic32;
+
+// Corresponding operations on Atomic32
+namespace base {
+namespace subtle {
+
+// Signed 64-bit type that supports the atomic ops below, as well as atomic
+// loads and stores.  Instances must be naturally aligned.  This type differs
+// from AtomicWord in 32-bit binaries where AtomicWord is 32-bits.
+typedef int64_t Atomic64;
+
+Atomic32 NoBarrier_CompareAndSwap(volatile Atomic32* ptr,
+                                  Atomic32 old_value,
+                                  Atomic32 new_value);
+Atomic32 NoBarrier_AtomicExchange(volatile Atomic32* ptr, Atomic32 new_value);
+Atomic32 Acquire_AtomicExchange(volatile Atomic32* ptr, Atomic32 new_value);
+Atomic32 Release_AtomicExchange(volatile Atomic32* ptr, Atomic32 new_value);
+Atomic32 Acquire_CompareAndSwap(volatile Atomic32* ptr,
+                                Atomic32 old_value,
+                                Atomic32 new_value);
+Atomic32 Release_CompareAndSwap(volatile Atomic32* ptr,
+                                Atomic32 old_value,
+                                Atomic32 new_value);
+void NoBarrier_Store(volatile Atomic32* ptr, Atomic32 value);
+void Acquire_Store(volatile Atomic32* ptr, Atomic32 value);
+void Release_Store(volatile Atomic32* ptr, Atomic32 value);
+Atomic32 NoBarrier_Load(volatile const Atomic32* ptr);
+Atomic32 Acquire_Load(volatile const Atomic32* ptr);
+Atomic32 Release_Load(volatile const Atomic32* ptr);
+
+// Corresponding operations on Atomic64
+Atomic64 NoBarrier_CompareAndSwap(volatile Atomic64* ptr,
+                                  Atomic64 old_value,
+                                  Atomic64 new_value);
+Atomic64 NoBarrier_AtomicExchange(volatile Atomic64* ptr, Atomic64 new_value);
+Atomic64 Acquire_AtomicExchange(volatile Atomic64* ptr, Atomic64 new_value);
+Atomic64 Release_AtomicExchange(volatile Atomic64* ptr, Atomic64 new_value);
+
+Atomic64 Acquire_CompareAndSwap(volatile Atomic64* ptr,
+                                Atomic64 old_value,
+                                Atomic64 new_value);
+Atomic64 Release_CompareAndSwap(volatile Atomic64* ptr,
+                                Atomic64 old_value,
+                                Atomic64 new_value);
+void NoBarrier_Store(volatile Atomic64* ptr, Atomic64 value);
+void Acquire_Store(volatile Atomic64* ptr, Atomic64 value);
+void Release_Store(volatile Atomic64* ptr, Atomic64 value);
+Atomic64 NoBarrier_Load(volatile const Atomic64* ptr);
+Atomic64 Acquire_Load(volatile const Atomic64* ptr);
+Atomic64 Release_Load(volatile const Atomic64* ptr);
+}  // namespace base::subtle
+}  // namespace base
+
+void MemoryBarrier();
+
+#endif  // 0
+
+
+// ------------------------------------------------------------------------
+// The following are to be deprecated when all uses have been changed to
+// use the base::subtle namespace.
+// ------------------------------------------------------------------------
+
+#ifdef AtomicWordCastType
+// AtomicWord versions to be deprecated
+inline AtomicWord Acquire_CompareAndSwap(volatile AtomicWord* ptr,
+                                         AtomicWord old_value,
+                                         AtomicWord new_value) {
+  return base::subtle::Acquire_CompareAndSwap(ptr, old_value, new_value);
+}
+
+inline AtomicWord Release_CompareAndSwap(volatile AtomicWord* ptr,
+                                         AtomicWord old_value,
+                                         AtomicWord new_value) {
+  return base::subtle::Release_CompareAndSwap(ptr, old_value, new_value);
+}
+
+inline void Acquire_Store(volatile AtomicWord* ptr, AtomicWord value) {
+  return base::subtle::Acquire_Store(ptr, value);
+}
+
+inline void Release_Store(volatile AtomicWord* ptr, AtomicWord value) {
+  return base::subtle::Release_Store(ptr, value);
+}
+
+inline AtomicWord Acquire_Load(volatile const AtomicWord* ptr) {
+  return base::subtle::Acquire_Load(ptr);
+}
+
+inline AtomicWord Release_Load(volatile const AtomicWord* ptr) {
+  return base::subtle::Release_Load(ptr);
+}
+#endif  // AtomicWordCastType
+
+// 32-bit Acquire/Release operations to be deprecated.
+
+inline Atomic32 Acquire_CompareAndSwap(volatile Atomic32* ptr,
+                                       Atomic32 old_value,
+                                       Atomic32 new_value) {
+  return base::subtle::Acquire_CompareAndSwap(ptr, old_value, new_value);
+}
+inline Atomic32 Release_CompareAndSwap(volatile Atomic32* ptr,
+                                       Atomic32 old_value,
+                                       Atomic32 new_value) {
+  return base::subtle::Release_CompareAndSwap(ptr, old_value, new_value);
+}
+inline void Acquire_Store(volatile Atomic32* ptr, Atomic32 value) {
+  base::subtle::Acquire_Store(ptr, value);
+}
+inline void Release_Store(volatile Atomic32* ptr, Atomic32 value) {
+  return base::subtle::Release_Store(ptr, value);
+}
+inline Atomic32 Acquire_Load(volatile const Atomic32* ptr) {
+  return base::subtle::Acquire_Load(ptr);
+}
+inline Atomic32 Release_Load(volatile const Atomic32* ptr) {
+  return base::subtle::Release_Load(ptr);
+}
+
+#ifdef BASE_HAS_ATOMIC64
+
+// 64-bit Acquire/Release operations to be deprecated.
+
+inline base::subtle::Atomic64 Acquire_CompareAndSwap(
+    volatile base::subtle::Atomic64* ptr,
+    base::subtle::Atomic64 old_value, base::subtle::Atomic64 new_value) {
+  return base::subtle::Acquire_CompareAndSwap(ptr, old_value, new_value);
+}
+inline base::subtle::Atomic64 Release_CompareAndSwap(
+    volatile base::subtle::Atomic64* ptr,
+    base::subtle::Atomic64 old_value, base::subtle::Atomic64 new_value) {
+  return base::subtle::Release_CompareAndSwap(ptr, old_value, new_value);
+}
+inline void Acquire_Store(
+    volatile base::subtle::Atomic64* ptr, base::subtle::Atomic64 value) {
+  base::subtle::Acquire_Store(ptr, value);
+}
+inline void Release_Store(
+    volatile base::subtle::Atomic64* ptr, base::subtle::Atomic64 value) {
+  return base::subtle::Release_Store(ptr, value);
+}
+inline base::subtle::Atomic64 Acquire_Load(
+    volatile const base::subtle::Atomic64* ptr) {
+  return base::subtle::Acquire_Load(ptr);
+}
+inline base::subtle::Atomic64 Release_Load(
+    volatile const base::subtle::Atomic64* ptr) {
+  return base::subtle::Release_Load(ptr);
+}
+
+#endif  // BASE_HAS_ATOMIC64
+
+#endif  // THREAD_ATOMICOPS_H_
diff --git a/src/base/basictypes.h b/src/base/basictypes.h
new file mode 100644
index 0000000..4779611
--- /dev/null
+++ b/src/base/basictypes.h
@@ -0,0 +1,384 @@
+// -*- Mode: C++; c-basic-offset: 2; indent-tabs-mode: nil -*-
+// Copyright (c) 2005, Google Inc.
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+//     * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+//     * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+#ifndef _BASICTYPES_H_
+#define _BASICTYPES_H_
+
+#include <config.h>
+#include <string.h>       // for memcpy()
+#ifdef HAVE_INTTYPES_H
+#include <inttypes.h>     // gets us PRId64, etc
+#endif
+
+// To use this in an autoconf setting, make sure you run the following
+// autoconf macros:
+//    AC_HEADER_STDC              /* for stdint_h and inttypes_h */
+//    AC_CHECK_TYPES([__int64])   /* defined in some windows platforms */
+
+#ifdef HAVE_INTTYPES_H
+#include <inttypes.h>           // uint16_t might be here; PRId64 too.
+#endif
+#ifdef HAVE_STDINT_H
+#include <stdint.h>             // to get uint16_t (ISO naming madness)
+#endif
+#include <sys/types.h>          // our last best hope for uint16_t
+
+// Standard typedefs
+// All Google code is compiled with -funsigned-char to make "char"
+// unsigned.  Google code therefore doesn't need a "uchar" type.
+// TODO(csilvers): how do we make sure unsigned-char works on non-gcc systems?
+typedef signed char         schar;
+typedef int8_t              int8;
+typedef int16_t             int16;
+typedef int32_t             int32;
+typedef int64_t             int64;
+
+// NOTE: unsigned types are DANGEROUS in loops and other arithmetical
+// places.  Use the signed types unless your variable represents a bit
+// pattern (eg a hash value) or you really need the extra bit.  Do NOT
+// use 'unsigned' to express "this value should always be positive";
+// use assertions for this.
+
+typedef uint8_t            uint8;
+typedef uint16_t           uint16;
+typedef uint32_t           uint32;
+typedef uint64_t           uint64;
+
+const uint16 kuint16max = (   (uint16) 0xFFFF);
+const uint32 kuint32max = (   (uint32) 0xFFFFFFFF);
+const uint64 kuint64max = ( (((uint64) kuint32max) << 32) | kuint32max );
+
+const  int8  kint8max   = (   (  int8) 0x7F);
+const  int16 kint16max  = (   ( int16) 0x7FFF);
+const  int32 kint32max  = (   ( int32) 0x7FFFFFFF);
+const  int64 kint64max =  ( ((( int64) kint32max) << 32) | kuint32max );
+
+const  int8  kint8min   = (   (  int8) 0x80);
+const  int16 kint16min  = (   ( int16) 0x8000);
+const  int32 kint32min  = (   ( int32) 0x80000000);
+const  int64 kint64min =  ( ((( int64) kint32min) << 32) | 0 );
+
+// Define the "portable" printf and scanf macros, if they're not
+// already there (via the inttypes.h we #included above, hopefully).
+// Mostly it's old systems that don't support inttypes.h, so we assume
+// they're 32 bit.
+#ifndef PRIx64
+#define PRIx64 "llx"
+#endif
+#ifndef SCNx64
+#define SCNx64 "llx"
+#endif
+#ifndef PRId64
+#define PRId64 "lld"
+#endif
+#ifndef SCNd64
+#define SCNd64 "lld"
+#endif
+#ifndef PRIu64
+#define PRIu64 "llu"
+#endif
+#ifndef PRIxPTR
+#define PRIxPTR "lx"
+#endif
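+
+// Illustrative sketch (not in the original header): the macros above are
+// meant to be spliced into printf/scanf format strings, e.g.
+//
+//   int64 big = kint64max;
+//   printf("big = %" PRId64 " (hex %" PRIx64 ")\n", big, big);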
+
+// Also allow for printing of a pthread_t.
+#define GPRIuPTHREAD "lu"
+#define GPRIxPTHREAD "lx"
+#if defined(__CYGWIN__) || defined(__CYGWIN32__) || defined(__APPLE__) || defined(__FreeBSD__)
+#define PRINTABLE_PTHREAD(pthreadt) reinterpret_cast<uintptr_t>(pthreadt)
+#else
+#define PRINTABLE_PTHREAD(pthreadt) pthreadt
+#endif
+
+// A macro to disallow the evil copy constructor and operator= functions
+// This should be used in the private: declarations for a class
+#define DISALLOW_EVIL_CONSTRUCTORS(TypeName)    \
+  TypeName(const TypeName&);                    \
+  void operator=(const TypeName&)
+
+// An alternate name that leaves out the moral judgment... :-)
+#define DISALLOW_COPY_AND_ASSIGN(TypeName) DISALLOW_EVIL_CONSTRUCTORS(TypeName)
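+
+// Illustrative sketch (not in the original header): typical use is in the
+// private section of a class; "Widget" is a hypothetical class name.
+//
+//   class Widget {
+//    public:
+//     Widget();
+//    private:
+//     DISALLOW_COPY_AND_ASSIGN(Widget);
+//   };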
+
+// The COMPILE_ASSERT macro can be used to verify that a compile time
+// expression is true. For example, you could use it to verify the
+// size of a static array:
+//
+//   COMPILE_ASSERT(sizeof(num_content_type_names) == sizeof(int),
+//                  content_type_names_incorrect_size);
+//
+// or to make sure a struct is smaller than a certain size:
+//
+//   COMPILE_ASSERT(sizeof(foo) < 128, foo_too_large);
+//
+// The second argument to the macro is the name of the variable. If
+// the expression is false, most compilers will issue a warning/error
+// containing the name of the variable.
+//
+// Implementation details of COMPILE_ASSERT:
+//
+// - COMPILE_ASSERT works by defining an array type that has -1
+//   elements (and thus is invalid) when the expression is false.
+//
+// - The simpler definition
+//
+//     #define COMPILE_ASSERT(expr, msg) typedef char msg[(expr) ? 1 : -1]
+//
+//   does not work, as gcc supports variable-length arrays whose sizes
+//   are determined at run-time (this is gcc's extension and not part
+//   of the C++ standard).  As a result, gcc fails to reject the
+//   following code with the simple definition:
+//
+//     int foo;
+//     COMPILE_ASSERT(foo, msg); // not supposed to compile as foo is
+//                               // not a compile-time constant.
+//
+// - By using the type CompileAssert<(bool(expr))>, we ensure that
+//   expr is a compile-time constant.  (Template arguments must be
+//   determined at compile-time.)
+//
+// - The outer parentheses in CompileAssert<(bool(expr))> are necessary
+//   to work around a bug in gcc 3.4.4 and 4.0.1.  If we had written
+//
+//     CompileAssert<bool(expr)>
+//
+//   instead, these compilers will refuse to compile
+//
+//     COMPILE_ASSERT(5 > 0, some_message);
+//
+//   (They seem to think the ">" in "5 > 0" marks the end of the
+//   template argument list.)
+//
+// - The array size is (bool(expr) ? 1 : -1), instead of simply
+//
+//     ((expr) ? 1 : -1).
+//
+//   This is to avoid running into a bug in MS VC 7.1, which
+//   causes ((0.0) ? 1 : -1) to incorrectly evaluate to 1.
+
+template <bool>
+struct CompileAssert {
+};
+
+#ifdef HAVE___ATTRIBUTE__
+# define ATTRIBUTE_UNUSED __attribute__((unused))
+#else
+# define ATTRIBUTE_UNUSED
+#endif
+
+#define COMPILE_ASSERT(expr, msg)                               \
+  typedef CompileAssert<(bool(expr))> msg[bool(expr) ? 1 : -1] ATTRIBUTE_UNUSED
+
+#define arraysize(a)  (sizeof(a) / sizeof(*(a)))
+
+#define OFFSETOF_MEMBER(strct, field)                                   \
+   (reinterpret_cast<char*>(&reinterpret_cast<strct*>(16)->field) -     \
+    reinterpret_cast<char*>(16))
+
+// bit_cast<Dest,Source> implements the equivalent of
+// "*reinterpret_cast<Dest*>(&source)".
+//
+// The reinterpret_cast method would produce undefined behavior
+// according to ISO C++ specification section 3.10, paragraph 15.
+// bit_cast<> calls memcpy() which is blessed by the standard,
+// especially by the example in section 3.9.
+//
+// Fortunately memcpy() is very fast.  In optimized mode, with a
+// constant size, gcc 2.95.3, gcc 4.0.1, and msvc 7.1 produce inline
+// code with the minimal amount of data movement.  On a 32-bit system,
+// memcpy(d,s,4) compiles to one load and one store, and memcpy(d,s,8)
+// compiles to two loads and two stores.
+
+template <class Dest, class Source>
+inline Dest bit_cast(const Source& source) {
+  COMPILE_ASSERT(sizeof(Dest) == sizeof(Source), bitcasting_unequal_sizes);
+  Dest dest;
+  memcpy(&dest, &source, sizeof(dest));
+  return dest;
+}
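+
+// Illustrative sketch (not in the original header): reinterpreting the bits
+// of a float as an int32 without the undefined behavior of reinterpret_cast.
+//
+//   float f = 1.0f;
+//   int32 bits = bit_cast<int32>(f);   // ok: sizeof(int32) == sizeof(float)
+//   // bit_cast<int64>(f) would fail the COMPILE_ASSERT on size equality.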
+
+#ifdef HAVE___ATTRIBUTE__
+# define ATTRIBUTE_WEAK      __attribute__((weak))
+# define ATTRIBUTE_NOINLINE  __attribute__((noinline))
+#else
+# define ATTRIBUTE_WEAK
+# define ATTRIBUTE_NOINLINE
+#endif
+
+#if defined(HAVE___ATTRIBUTE__) && defined(__ELF__)
+# define ATTRIBUTE_VISIBILITY_HIDDEN __attribute__((visibility("hidden")))
+#else
+# define ATTRIBUTE_VISIBILITY_HIDDEN
+#endif
+
+// Section attributes are supported for both ELF and Mach-O, but in
+// very different ways.  Here's the API we provide:
+// 1) ATTRIBUTE_SECTION: put this with the declaration of all functions
+//    you want to be in the same linker section
+// 2) DEFINE_ATTRIBUTE_SECTION_VARS: must be called once per unique
+//    name.  You want to make sure this is executed before any
+//    DECLARE_ATTRIBUTE_SECTION_VARS; the easiest way is to put them
+//    in the same .cc file.  Put this call at the global level.
+// 3) INIT_ATTRIBUTE_SECTION_VARS: you can scatter calls to this in
+//    multiple places to help ensure execution before any
+//    DECLARE_ATTRIBUTE_SECTION_VARS.  You must have at least one
+//    DEFINE, but you can have many INITs.  Put each in its own scope.
+// 4) DECLARE_ATTRIBUTE_SECTION_VARS: must be called before using
+//    ATTRIBUTE_SECTION_START or ATTRIBUTE_SECTION_STOP on a name.
+//    Put this call at the global level.
+// 5) ATTRIBUTE_SECTION_START/ATTRIBUTE_SECTION_STOP: call this to say
+//    where in memory a given section is.  All functions declared with
+//    ATTRIBUTE_SECTION are guaranteed to be between START and STOP.
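+//
+// Illustrative sketch (not in the original header) of steps 1-5 above; the
+// section name "my_section" and the function are hypothetical:
+//
+//   void MyFunc() ATTRIBUTE_SECTION(my_section);       // step 1
+//   DEFINE_ATTRIBUTE_SECTION_VARS(my_section);         // step 2 (once)
+//   DECLARE_ATTRIBUTE_SECTION_VARS(my_section);        // step 4
+//   ...
+//   void* start = ATTRIBUTE_SECTION_START(my_section); // step 5
+//   void* stop  = ATTRIBUTE_SECTION_STOP(my_section);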
+
+#if defined(HAVE___ATTRIBUTE__) && defined(__ELF__)
+# define ATTRIBUTE_SECTION(name) __attribute__ ((section (#name)))
+
+  // Weak section declaration to be used as a global declaration
+  // for ATTRIBUTE_SECTION_START|STOP(name) to compile and link
+  // even without functions with ATTRIBUTE_SECTION(name).
+# define DECLARE_ATTRIBUTE_SECTION_VARS(name) \
+    extern char __start_##name[] ATTRIBUTE_WEAK; \
+    extern char __stop_##name[] ATTRIBUTE_WEAK
+# define INIT_ATTRIBUTE_SECTION_VARS(name)     // no-op for ELF
+# define DEFINE_ATTRIBUTE_SECTION_VARS(name)   // no-op for ELF
+
+  // Return void* pointers to start/end of a section of code with functions
+  // having ATTRIBUTE_SECTION(name), or 0 if no such function exists.
+  // One must DECLARE_ATTRIBUTE_SECTION_VARS(name) for this to compile and link.
+# define ATTRIBUTE_SECTION_START(name) (reinterpret_cast<void*>(__start_##name))
+# define ATTRIBUTE_SECTION_STOP(name) (reinterpret_cast<void*>(__stop_##name))
+# define HAVE_ATTRIBUTE_SECTION_START 1
+
+#elif defined(HAVE___ATTRIBUTE__) && defined(__MACH__)
+# define ATTRIBUTE_SECTION(name) __attribute__ ((section ("__TEXT, " #name)))
+
+#include <mach-o/getsect.h>
+#include <mach-o/dyld.h>
+class AssignAttributeStartEnd {
+ public:
+  AssignAttributeStartEnd(const char* name, char** pstart, char** pend) {
+    // Find out which dynamic library the named section is defined in.
+    if (_dyld_present()) {
+      for (int i = _dyld_image_count() - 1; i >= 0; --i) {
+        const mach_header* hdr = _dyld_get_image_header(i);
+#ifdef MH_MAGIC_64
+        if (hdr->magic == MH_MAGIC_64) {
+          uint64_t len;
+          *pstart = getsectdatafromheader_64((mach_header_64*)hdr,
+                                             "__TEXT", name, &len);
+          if (*pstart) {   // NULL if not defined in this dynamic library
+            *pstart += _dyld_get_image_vmaddr_slide(i);   // correct for reloc
+            *pend = *pstart + len;
+            return;
+          }
+        }
+#endif
+        if (hdr->magic == MH_MAGIC) {
+          uint32_t len;
+          *pstart = getsectdatafromheader(hdr, "__TEXT", name, &len);
+          if (*pstart) {   // NULL if not defined in this dynamic library
+            *pstart += _dyld_get_image_vmaddr_slide(i);   // correct for reloc
+            *pend = *pstart + len;
+            return;
+          }
+        }
+      }
+    }
+    // If we get here, not defined in a dll at all.  See if defined statically.
+    unsigned long len;    // don't ask me why this type isn't uint32_t too...
+    *pstart = getsectdata("__TEXT", name, &len);
+    *pend = *pstart + len;
+  }
+};
+
+#define DECLARE_ATTRIBUTE_SECTION_VARS(name)    \
+  extern char* __start_##name;                  \
+  extern char* __stop_##name
+
+#define INIT_ATTRIBUTE_SECTION_VARS(name)               \
+  DECLARE_ATTRIBUTE_SECTION_VARS(name);                 \
+  static const AssignAttributeStartEnd __assign_##name( \
+    #name, &__start_##name, &__stop_##name)
+
+#define DEFINE_ATTRIBUTE_SECTION_VARS(name)     \
+  char* __start_##name, *__stop_##name;         \
+  INIT_ATTRIBUTE_SECTION_VARS(name)
+
+# define ATTRIBUTE_SECTION_START(name) (reinterpret_cast<void*>(__start_##name))
+# define ATTRIBUTE_SECTION_STOP(name) (reinterpret_cast<void*>(__stop_##name))
+# define HAVE_ATTRIBUTE_SECTION_START 1
+
+#else  // not HAVE___ATTRIBUTE__ && __ELF__, nor HAVE___ATTRIBUTE__ && __MACH__
+# define ATTRIBUTE_SECTION(name)
+# define DECLARE_ATTRIBUTE_SECTION_VARS(name)
+# define INIT_ATTRIBUTE_SECTION_VARS(name)
+# define DEFINE_ATTRIBUTE_SECTION_VARS(name)
+# define ATTRIBUTE_SECTION_START(name) (reinterpret_cast<void*>(0))
+# define ATTRIBUTE_SECTION_STOP(name) (reinterpret_cast<void*>(0))
+
+#endif  // HAVE___ATTRIBUTE__ and __ELF__ or __MACH__
+
+#if defined(HAVE___ATTRIBUTE__)
+# if (defined(__i386__) || defined(__x86_64__))
+#   define CACHELINE_ALIGNED __attribute__((aligned(64)))
+# elif (defined(__PPC__) || defined(__PPC64__))
+#   define CACHELINE_ALIGNED __attribute__((aligned(16)))
+# elif (defined(__arm__))
+#   define CACHELINE_ALIGNED __attribute__((aligned(64)))
+    // Some ARMs have shorter cache lines (e.g. the ARM1176JZF-S has 32
+    // bytes), but 64-byte alignment obviously implies 32-byte alignment.
+# elif (defined(__mips__))
+#   define CACHELINE_ALIGNED __attribute__((aligned(128)))
+# elif (defined(__aarch64__))
+#   define CACHELINE_ALIGNED __attribute__((aligned(64)))
+    // Implementation specific; Cortex-A53 and A57 should have 64-byte lines.
+# else
+#   error Could not determine cache line length - unknown architecture
+# endif
+#else
+# define CACHELINE_ALIGNED
+#endif  // defined(HAVE___ATTRIBUTE__)
+
+
+// The following enum should be used only as a constructor argument to indicate
+// that the variable has static storage class, and that the constructor should
+// do nothing to its state.  It indicates to the reader that it is legal to
+// declare a static instance of the class, provided the constructor is given
+// the base::LINKER_INITIALIZED argument.  Normally, it is unsafe to declare a
+// static variable that has a constructor or a destructor because invocation
+// order is undefined.  However, IF the type can be initialized by filling with
+// zeroes (which the loader does for static variables), AND the destructor also
+// does nothing to the storage, then a constructor declared as
+//       explicit MyClass(base::LinkerInitialized x) {}
+// and invoked as
+//       static MyClass my_variable_name(base::LINKER_INITIALIZED);
+namespace base {
+enum LinkerInitialized { LINKER_INITIALIZED };
+}
+
+#endif  // _BASICTYPES_H_
diff --git a/src/base/commandlineflags.h b/src/base/commandlineflags.h
new file mode 100644
index 0000000..f54776a
--- /dev/null
+++ b/src/base/commandlineflags.h
@@ -0,0 +1,166 @@
+// -*- Mode: C++; c-basic-offset: 2; indent-tabs-mode: nil -*-
+// Copyright (c) 2005, Google Inc.
+// All rights reserved.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+// 
+//     * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+//     * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+// 
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+// ---
+// This file is a compatibility layer that defines Google's version of
+// command line flags that are used for configuration.
+//
+// We put flags into their own namespace.  It is purposefully
+// named in an opaque way that people should have trouble typing
+// directly.  The idea is that DEFINE puts the flag in the weird
+// namespace, and DECLARE imports the flag from there into the
+// current namespace.  The net result is to force people to use
+// DECLARE to get access to a flag, rather than saying
+//   extern bool FLAGS_logtostderr;
+// or some such instead.  We want this so we can put extra
+// functionality (like sanity-checking) in DECLARE if we want,
+// and make sure it is picked up everywhere.
+//
+// We also put the type of the variable in the namespace, so that
+// people can't DECLARE_int32 something that they DEFINE_bool'd
+// elsewhere.
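+//
+// Illustrative sketch (not in the original header); the flag name
+// "my_feature" is hypothetical:
+//
+//   // In the .cc file that owns the flag:
+//   DEFINE_bool(my_feature, false, "Enables the hypothetical feature");
+//   // In any file that reads it:
+//   DECLARE_bool(my_feature);
+//   if (FLAGS_my_feature) { ... }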
+#ifndef BASE_COMMANDLINEFLAGS_H_
+#define BASE_COMMANDLINEFLAGS_H_
+
+#include <config.h>
+#include <string>
+#include <string.h>               // for memchr
+#include <stdlib.h>               // for getenv
+#include "base/basictypes.h"
+
+#define DECLARE_VARIABLE(type, name)                                          \
+  namespace FLAG__namespace_do_not_use_directly_use_DECLARE_##type##_instead {  \
+  extern PERFTOOLS_DLL_DECL type FLAGS_##name;                                \
+  }                                                                           \
+  using FLAG__namespace_do_not_use_directly_use_DECLARE_##type##_instead::FLAGS_##name
+
+#define DEFINE_VARIABLE(type, name, value, meaning) \
+  namespace FLAG__namespace_do_not_use_directly_use_DECLARE_##type##_instead {  \
+  PERFTOOLS_DLL_DECL type FLAGS_##name(value);                                \
+  char FLAGS_no##name;                                                        \
+  }                                                                           \
+  using FLAG__namespace_do_not_use_directly_use_DECLARE_##type##_instead::FLAGS_##name
+
+// bool specialization
+#define DECLARE_bool(name) \
+  DECLARE_VARIABLE(bool, name)
+#define DEFINE_bool(name, value, meaning) \
+  DEFINE_VARIABLE(bool, name, value, meaning)
+
+// int32 specialization
+#define DECLARE_int32(name) \
+  DECLARE_VARIABLE(int32, name)
+#define DEFINE_int32(name, value, meaning) \
+  DEFINE_VARIABLE(int32, name, value, meaning)
+
+// int64 specialization
+#define DECLARE_int64(name) \
+  DECLARE_VARIABLE(int64, name)
+#define DEFINE_int64(name, value, meaning) \
+  DEFINE_VARIABLE(int64, name, value, meaning)
+
+#define DECLARE_uint64(name) \
+  DECLARE_VARIABLE(uint64, name)
+#define DEFINE_uint64(name, value, meaning) \
+  DEFINE_VARIABLE(uint64, name, value, meaning)
+
+// double specialization
+#define DECLARE_double(name) \
+  DECLARE_VARIABLE(double, name)
+#define DEFINE_double(name, value, meaning) \
+  DEFINE_VARIABLE(double, name, value, meaning)
+
+// Special case for string, because we have to specify the namespace
+// std::string, which doesn't play nicely with our FLAG__namespace hackery.
+#define DECLARE_string(name)                                          \
+  namespace FLAG__namespace_do_not_use_directly_use_DECLARE_string_instead {  \
+  extern std::string FLAGS_##name;                                                   \
+  }                                                                           \
+  using FLAG__namespace_do_not_use_directly_use_DECLARE_string_instead::FLAGS_##name
+#define DEFINE_string(name, value, meaning) \
+  namespace FLAG__namespace_do_not_use_directly_use_DECLARE_string_instead {  \
+  std::string FLAGS_##name(value);                                                   \
+  char FLAGS_no##name;                                                        \
+  }                                                                           \
+  using FLAG__namespace_do_not_use_directly_use_DECLARE_string_instead::FLAGS_##name
+
+// implemented in sysinfo.cc
+namespace tcmalloc {
+  namespace commandlineflags {
+
+    inline bool StringToBool(const char *value, bool def) {
+      if (!value) {
+        return def;
+      }
+      return memchr("tTyY1\0", value[0], 6) != NULL;
+    }
+
+    inline int StringToInt(const char *value, int def) {
+      if (!value) {
+        return def;
+      }
+      return strtol(value, NULL, 10);
+    }
+
+    inline long long StringToLongLong(const char *value, long long def) {
+      if (!value) {
+        return def;
+      }
+      return strtoll(value, NULL, 10);
+    }
+
+    inline double StringToDouble(const char *value, double def) {
+      if (!value) {
+        return def;
+      }
+      return strtod(value, NULL);
+    }
+  }
+}
+
+// These macros (they could be functions, but I don't want to bother with a
+// .cc file) make it easier to initialize flags from the environment.
+
+#define EnvToString(envname, dflt)   \
+  (!getenv(envname) ? (dflt) : getenv(envname))
+
+#define EnvToBool(envname, dflt)   \
+  tcmalloc::commandlineflags::StringToBool(getenv(envname), dflt)
+
+#define EnvToInt(envname, dflt)  \
+  tcmalloc::commandlineflags::StringToInt(getenv(envname), dflt)
+
+#define EnvToInt64(envname, dflt)  \
+  tcmalloc::commandlineflags::StringToLongLong(getenv(envname), dflt)
+
+#define EnvToDouble(envname, dflt)  \
+  tcmalloc::commandlineflags::StringToDouble(getenv(envname), dflt)
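+
+// Illustrative sketch (not in the original header): combining DEFINE_* with
+// EnvTo* lets an environment variable override the compiled-in default.  The
+// flag and environment-variable names below are hypothetical.
+//
+//   DEFINE_int64(sample_interval,
+//                EnvToInt64("MY_SAMPLE_INTERVAL", 1 << 19),
+//                "Bytes between samples; overridable via the environment");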
+
+#endif  // BASE_COMMANDLINEFLAGS_H_
diff --git a/src/base/cycleclock.h b/src/base/cycleclock.h
new file mode 100644
index 0000000..dc2d569
--- /dev/null
+++ b/src/base/cycleclock.h
@@ -0,0 +1,173 @@
+// -*- Mode: C++; c-basic-offset: 2; indent-tabs-mode: nil -*-
+// Copyright (c) 2004, Google Inc.
+// All rights reserved.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+// 
+//     * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+//     * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+// 
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+// ----------------------------------------------------------------------
+// CycleClock
+//    A CycleClock tells you the current time in Cycles.  The "time"
+//    is actually time since power-on.  This is like time() but doesn't
+//    involve a system call and is much more precise.
+//
+// NOTE: Not all cpu/platform/kernel combinations guarantee that this
+// clock increments at a constant rate or is synchronized across all logical
+// cpus in a system.
+//
+// Also, in some out-of-order CPU implementations, the CycleClock is not
+// serializing.  So if you're trying to count at cycle granularity, your
+// data might be inaccurate due to out-of-order instruction execution.
+// ----------------------------------------------------------------------
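+//
+// Illustrative sketch (not in the original header): timing a code region in
+// cycles; converting to seconds would use CyclesPerSecond() from
+// base/sysinfo.h.  DoWork() is a hypothetical workload.
+//
+//   int64 start = CycleClock::Now();
+//   DoWork();
+//   int64 elapsed_cycles = CycleClock::Now() - start;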
+
+#ifndef GOOGLE_BASE_CYCLECLOCK_H_
+#define GOOGLE_BASE_CYCLECLOCK_H_
+
+#include "base/basictypes.h"   // make sure we get the def for int64
+#include "base/arm_instruction_set_select.h"
+// base/sysinfo.h is really big and we don't want to include it unless
+// it is necessary.
+#if defined(__arm__) || defined(__mips__) || defined(__aarch64__)
+# include "base/sysinfo.h"
+#endif
+#if defined(__MACH__) && defined(__APPLE__)
+# include <mach/mach_time.h>
+#endif
+// For MSVC, we want to use '_asm rdtsc' when possible (since it works
+// with even ancient MSVC compilers), and when not possible the
+// __rdtsc intrinsic, declared in <intrin.h>.  Unfortunately, in some
+// environments, <windows.h> and <intrin.h> have conflicting
+// declarations of some other intrinsics, breaking compilation.
+// Therefore, we simply declare __rdtsc ourselves. See also
+// http://connect.microsoft.com/VisualStudio/feedback/details/262047
+#if defined(_MSC_VER) && !defined(_M_IX86)
+extern "C" uint64 __rdtsc();
+#pragma intrinsic(__rdtsc)
+#endif
+#if defined(ARMV3) || defined(__mips__) || defined(__aarch64__)
+#include <sys/time.h>
+#endif
+
+// NOTE: only i386 and x86_64 have been well tested.
+// PPC, sparc, alpha, and ia64 are based on
+//    http://peter.kuscsik.com/wordpress/?p=14
+// with modifications by m3b.  See also
+//    https://setisvn.ssl.berkeley.edu/svn/lib/fftw-3.0.1/kernel/cycle.h
+struct CycleClock {
+  // This should return the number of cycles since power-on.  Thread-safe.
+  static inline int64 Now() {
+#if defined(__MACH__) && defined(__APPLE__)
+    // this goes at the top because we need ALL Macs, regardless of
+    // architecture, to return the number of "mach time units" that
+    // have passed since startup.  See sysinfo.cc where
+    // InitializeSystemInfo() sets the supposed cpu clock frequency of
+    // macs to the number of mach time units per second, not actual
+    // CPU clock frequency (which can change in the face of CPU
+    // frequency scaling).  Also note that when the Mac sleeps, this
+    // counter pauses; it does not continue counting, nor does it
+    // reset to zero.
+    return mach_absolute_time();
+#elif defined(__i386__)
+    int64 ret;
+    __asm__ volatile ("rdtsc" : "=A" (ret) );
+    return ret;
+#elif defined(__x86_64__) || defined(__amd64__)
+    uint64 low, high;
+    __asm__ volatile ("rdtsc" : "=a" (low), "=d" (high));
+    return (high << 32) | low;
+#elif defined(__powerpc64__) || defined(__ppc64__)
+    uint64 tb;
+    __asm__ volatile (\
+      "mfspr %0, 268"
+      : "=r" (tb));
+    return tb;
+#elif defined(__powerpc__) || defined(__ppc__)
+    // This returns a time-base, which is not always precisely a cycle-count.
+    uint32 tbu, tbl, tmp;
+    __asm__ volatile (\
+      "0:\n"
+      "mftbu %0\n"
+      "mftbl %1\n"
+      "mftbu %2\n"
+      "cmpw %0, %2\n"
+      "bne- 0b"
+      : "=r" (tbu), "=r" (tbl), "=r" (tmp));
+    return (((uint64) tbu << 32) | tbl);
+#elif defined(__sparc__)
+    int64 tick;
+    asm(".byte 0x83, 0x41, 0x00, 0x00");
+    asm("mov   %%g1, %0" : "=r" (tick));
+    return tick;
+#elif defined(__ia64__)
+    int64 itc;
+    asm("mov %0 = ar.itc" : "=r" (itc));
+    return itc;
+#elif defined(_MSC_VER) && defined(_M_IX86)
+    // Older MSVC compilers (like 7.x) don't seem to support the
+    // __rdtsc intrinsic properly, so I prefer to use _asm instead
+    // when I know it will work.  Otherwise, I'll use __rdtsc and hope
+    // the code is being compiled with a non-ancient compiler.
+    _asm rdtsc
+#elif defined(_MSC_VER)
+    return __rdtsc();
+#elif defined(ARMV3) || defined(__aarch64__)
+#if defined(ARMV7)  // V7 is the earliest arch that has a standard cyclecount
+    uint32 pmccntr;
+    uint32 pmuseren;
+    uint32 pmcntenset;
+    // Read the user mode perf monitor counter access permissions.
+    asm volatile ("mrc p15, 0, %0, c9, c14, 0" : "=r" (pmuseren));
+    if (pmuseren & 1) {  // Allows reading perfmon counters for user mode code.
+      asm volatile ("mrc p15, 0, %0, c9, c12, 1" : "=r" (pmcntenset));
+      if (pmcntenset & 0x80000000ul) {  // Is it counting?
+        asm volatile ("mrc p15, 0, %0, c9, c13, 0" : "=r" (pmccntr));
+        // The counter is set up to count every 64th cycle
+        return static_cast<int64>(pmccntr) * 64;  // Should optimize to << 6
+      }
+    }
+#endif
+    struct timeval tv;
+    gettimeofday(&tv, NULL);
+    return static_cast<int64>((tv.tv_sec + tv.tv_usec * 0.000001)
+                              * CyclesPerSecond());
+#elif defined(__mips__)
+    // mips apparently only allows rdtsc for superusers, so we fall
+    // back to gettimeofday.  It's possible clock_gettime would be better.
+    struct timeval tv;
+    gettimeofday(&tv, NULL);
+    return static_cast<int64>((tv.tv_sec + tv.tv_usec * 0.000001)
+                              * CyclesPerSecond());
+#else
+// The soft failover to a generic implementation is automatic only for ARM.
+// For other platforms the developer is expected to make an attempt to create
+// a fast implementation and use the generic version if nothing better is
+// available.
+#error You need to define CycleTimer for your O/S and CPU
+#endif
+  }
+};
+
+
+#endif  // GOOGLE_BASE_CYCLECLOCK_H_
diff --git a/src/base/dynamic_annotations.c b/src/base/dynamic_annotations.c
new file mode 100644
index 0000000..87bd2ec
--- /dev/null
+++ b/src/base/dynamic_annotations.c
@@ -0,0 +1,179 @@
+/* Copyright (c) 2008-2009, Google Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are
+ * met:
+ *
+ *     * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following disclaimer
+ * in the documentation and/or other materials provided with the
+ * distribution.
+ *     * Neither the name of Google Inc. nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * ---
+ * Author: Kostya Serebryany
+ */
+
+#ifdef __cplusplus
+# error "This file should be built as pure C to avoid name mangling"
+#endif
+
+#include "config.h"
+#include <stdlib.h>
+#include <string.h>
+
+#include "base/dynamic_annotations.h"
+#include "getenv_safe.h" // for TCMallocGetenvSafe
+
+#ifdef __GNUC__
+/* valgrind.h uses gcc extensions so it won't build with other compilers */
+# ifdef HAVE_VALGRIND_H    /* prefer the user's copy if they have it */
+#  include <valgrind.h>
+# else                     /* otherwise just use the copy that we have */
+#  include "third_party/valgrind.h"
+# endif
+#endif
+
+/* Compiler-based ThreadSanitizer defines
+   DYNAMIC_ANNOTATIONS_EXTERNAL_IMPL = 1
+   and provides its own definitions of the functions. */
+
+#ifndef DYNAMIC_ANNOTATIONS_EXTERNAL_IMPL
+# define DYNAMIC_ANNOTATIONS_EXTERNAL_IMPL 0
+#endif
+
+/* Each function is empty and called (via a macro) only in debug mode.
+   The arguments are captured by dynamic tools at runtime. */
+
+#if DYNAMIC_ANNOTATIONS_ENABLED == 1 \
+    && DYNAMIC_ANNOTATIONS_EXTERNAL_IMPL == 0
+
+void AnnotateRWLockCreate(const char *file, int line,
+                          const volatile void *lock){}
+void AnnotateRWLockDestroy(const char *file, int line,
+                           const volatile void *lock){}
+void AnnotateRWLockAcquired(const char *file, int line,
+                            const volatile void *lock, long is_w){}
+void AnnotateRWLockReleased(const char *file, int line,
+                            const volatile void *lock, long is_w){}
+void AnnotateBarrierInit(const char *file, int line,
+                         const volatile void *barrier, long count,
+                         long reinitialization_allowed) {}
+void AnnotateBarrierWaitBefore(const char *file, int line,
+                               const volatile void *barrier) {}
+void AnnotateBarrierWaitAfter(const char *file, int line,
+                              const volatile void *barrier) {}
+void AnnotateBarrierDestroy(const char *file, int line,
+                            const volatile void *barrier) {}
+
+void AnnotateCondVarWait(const char *file, int line,
+                         const volatile void *cv,
+                         const volatile void *lock){}
+void AnnotateCondVarSignal(const char *file, int line,
+                           const volatile void *cv){}
+void AnnotateCondVarSignalAll(const char *file, int line,
+                              const volatile void *cv){}
+void AnnotatePublishMemoryRange(const char *file, int line,
+                                const volatile void *address,
+                                long size){}
+void AnnotateUnpublishMemoryRange(const char *file, int line,
+                                  const volatile void *address,
+                                  long size){}
+void AnnotatePCQCreate(const char *file, int line,
+                       const volatile void *pcq){}
+void AnnotatePCQDestroy(const char *file, int line,
+                        const volatile void *pcq){}
+void AnnotatePCQPut(const char *file, int line,
+                    const volatile void *pcq){}
+void AnnotatePCQGet(const char *file, int line,
+                    const volatile void *pcq){}
+void AnnotateNewMemory(const char *file, int line,
+                       const volatile void *mem,
+                       long size){}
+void AnnotateExpectRace(const char *file, int line,
+                        const volatile void *mem,
+                        const char *description){}
+void AnnotateBenignRace(const char *file, int line,
+                        const volatile void *mem,
+                        const char *description){}
+void AnnotateBenignRaceSized(const char *file, int line,
+                             const volatile void *mem,
+                             long size,
+                             const char *description) {}
+void AnnotateMutexIsUsedAsCondVar(const char *file, int line,
+                                  const volatile void *mu){}
+void AnnotateTraceMemory(const char *file, int line,
+                         const volatile void *arg){}
+void AnnotateThreadName(const char *file, int line,
+                        const char *name){}
+void AnnotateIgnoreReadsBegin(const char *file, int line){}
+void AnnotateIgnoreReadsEnd(const char *file, int line){}
+void AnnotateIgnoreWritesBegin(const char *file, int line){}
+void AnnotateIgnoreWritesEnd(const char *file, int line){}
+void AnnotateEnableRaceDetection(const char *file, int line, int enable){}
+void AnnotateNoOp(const char *file, int line,
+                  const volatile void *arg){}
+void AnnotateFlushState(const char *file, int line){}
+
+#endif  /* DYNAMIC_ANNOTATIONS_ENABLED == 1
+    && DYNAMIC_ANNOTATIONS_EXTERNAL_IMPL == 0 */
+
+#if DYNAMIC_ANNOTATIONS_EXTERNAL_IMPL == 0
+
+static int GetRunningOnValgrind(void) {
+#ifdef RUNNING_ON_VALGRIND
+  if (RUNNING_ON_VALGRIND) return 1;
+#endif
+  const char *running_on_valgrind_str = TCMallocGetenvSafe("RUNNING_ON_VALGRIND");
+  if (running_on_valgrind_str) {
+    return strcmp(running_on_valgrind_str, "0") != 0;
+  }
+  return 0;
+}
+
+/* See the comments in dynamic_annotations.h */
+int RunningOnValgrind(void) {
+  static volatile int running_on_valgrind = -1;
+  int local_running_on_valgrind = running_on_valgrind;
+  /* C doesn't have thread-safe initialization of statics, and we
+     don't want to depend on pthread_once here, so hack it. */
+  ANNOTATE_BENIGN_RACE(&running_on_valgrind, "safe hack");
+  if (local_running_on_valgrind == -1)
+    running_on_valgrind = local_running_on_valgrind = GetRunningOnValgrind();
+  return local_running_on_valgrind;
+}
+
+#endif  /* DYNAMIC_ANNOTATIONS_EXTERNAL_IMPL == 0 */
+
+/* See the comments in dynamic_annotations.h */
+double ValgrindSlowdown(void) {
+  /* Same initialization hack as in RunningOnValgrind(). */
+  static volatile double slowdown = 0.0;
+  double local_slowdown = slowdown;
+  ANNOTATE_BENIGN_RACE(&slowdown, "safe hack");
+  if (RunningOnValgrind() == 0) {
+    return 1.0;
+  }
+  if (local_slowdown == 0.0) {
+    char *env = getenv("VALGRIND_SLOWDOWN");
+    slowdown = local_slowdown = env ? atof(env) : 50.0;
+  }
+  return local_slowdown;
+}
diff --git a/src/base/dynamic_annotations.h b/src/base/dynamic_annotations.h
new file mode 100644
index 0000000..4669315
--- /dev/null
+++ b/src/base/dynamic_annotations.h
@@ -0,0 +1,627 @@
+/* Copyright (c) 2008, Google Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are
+ * met:
+ *
+ *     * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following disclaimer
+ * in the documentation and/or other materials provided with the
+ * distribution.
+ *     * Neither the name of Google Inc. nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * ---
+ * Author: Kostya Serebryany
+ */
+
+/* This file defines dynamic annotations for use with dynamic analysis
+   tools such as valgrind, PIN, etc.
+
+   Dynamic annotation is a source code annotation that affects
+   the generated code (that is, the annotation is not a comment).
+   Each such annotation is attached to a particular
+   instruction and/or to a particular object (address) in the program.
+
+   The annotations that should be used by users are macros in all upper-case
+   (e.g., ANNOTATE_NEW_MEMORY).
+
+   Actual implementation of these macros may differ depending on the
+   dynamic analysis tool being used.
+
+   See http://code.google.com/p/data-race-test/  for more information.
+
+   This file supports the following dynamic analysis tools:
+   - None (DYNAMIC_ANNOTATIONS_ENABLED is not defined or zero).
+      Macros are defined empty.
+   - ThreadSanitizer, Helgrind, DRD (DYNAMIC_ANNOTATIONS_ENABLED is 1).
+      Macros are defined as calls to non-inlinable empty functions
+      that are intercepted by Valgrind. */
+
+#ifndef BASE_DYNAMIC_ANNOTATIONS_H_
+#define BASE_DYNAMIC_ANNOTATIONS_H_
+
+#ifndef DYNAMIC_ANNOTATIONS_ENABLED
+# define DYNAMIC_ANNOTATIONS_ENABLED 0
+#endif
+
+#if DYNAMIC_ANNOTATIONS_ENABLED != 0
+
+  /* -------------------------------------------------------------
+     Annotations useful when implementing condition variables such as CondVar,
+     using conditional critical sections (Await/LockWhen) and when constructing
+     user-defined synchronization mechanisms.
+
+     The annotations ANNOTATE_HAPPENS_BEFORE() and ANNOTATE_HAPPENS_AFTER() can
+     be used to define happens-before arcs in user-defined synchronization
+     mechanisms:  the race detector will infer an arc from the former to the
+     latter when they share the same argument pointer.
+
+     Example 1 (reference counting):
+
+     void Unref() {
+       ANNOTATE_HAPPENS_BEFORE(&refcount_);
+       if (AtomicDecrementByOne(&refcount_) == 0) {
+         ANNOTATE_HAPPENS_AFTER(&refcount_);
+         delete this;
+       }
+     }
+
+     Example 2 (message queue):
+
+     void MyQueue::Put(Type *e) {
+       MutexLock lock(&mu_);
+       ANNOTATE_HAPPENS_BEFORE(e);
+       PutElementIntoMyQueue(e);
+     }
+
+     Type *MyQueue::Get() {
+       MutexLock lock(&mu_);
+       Type *e = GetElementFromMyQueue();
+       ANNOTATE_HAPPENS_AFTER(e);
+       return e;
+     }
+
+     Note: when possible, please use the existing reference counting and message
+     queue implementations instead of inventing new ones. */
+
+  /* Report that wait on the condition variable at address "cv" has succeeded
+     and the lock at address "lock" is held. */
+  #define ANNOTATE_CONDVAR_LOCK_WAIT(cv, lock) \
+    AnnotateCondVarWait(__FILE__, __LINE__, cv, lock)
+
+  /* Report that wait on the condition variable at "cv" has succeeded.  Variant
+     w/o lock. */
+  #define ANNOTATE_CONDVAR_WAIT(cv) \
+    AnnotateCondVarWait(__FILE__, __LINE__, cv, NULL)
+
+  /* Report that we are about to signal on the condition variable at address
+     "cv". */
+  #define ANNOTATE_CONDVAR_SIGNAL(cv) \
+    AnnotateCondVarSignal(__FILE__, __LINE__, cv)
+
+  /* Report that we are about to signal_all on the condition variable at "cv". */
+  #define ANNOTATE_CONDVAR_SIGNAL_ALL(cv) \
+    AnnotateCondVarSignalAll(__FILE__, __LINE__, cv)
+
+  /* Annotations for user-defined synchronization mechanisms. */
+  #define ANNOTATE_HAPPENS_BEFORE(obj) ANNOTATE_CONDVAR_SIGNAL(obj)
+  #define ANNOTATE_HAPPENS_AFTER(obj)  ANNOTATE_CONDVAR_WAIT(obj)
+
+  /* Report that the bytes in the range [pointer, pointer+size) are about
+     to be published safely. The race checker will create a happens-before
+     arc from the call ANNOTATE_PUBLISH_MEMORY_RANGE(pointer, size) to
+     subsequent accesses to this memory.
+     Note: this annotation may not work properly if the race detector uses
+     sampling, i.e. does not observe all memory accesses.
+     */
+  #define ANNOTATE_PUBLISH_MEMORY_RANGE(pointer, size) \
+    AnnotatePublishMemoryRange(__FILE__, __LINE__, pointer, size)
+
+  /* DEPRECATED. Don't use it. */
+  #define ANNOTATE_UNPUBLISH_MEMORY_RANGE(pointer, size) \
+    AnnotateUnpublishMemoryRange(__FILE__, __LINE__, pointer, size)
+
+  /* DEPRECATED. Don't use it. */
+  #define ANNOTATE_SWAP_MEMORY_RANGE(pointer, size)   \
+    do {                                              \
+      ANNOTATE_UNPUBLISH_MEMORY_RANGE(pointer, size); \
+      ANNOTATE_PUBLISH_MEMORY_RANGE(pointer, size);   \
+    } while (0)
+
+  /* Instruct the tool to create a happens-before arc between mu->Unlock() and
+     mu->Lock(). This annotation may slow down the race detector and hide real
+     races. Normally it is used only when it would be difficult to annotate each
+     of the mutex's critical sections individually using the annotations above.
+     This annotation makes sense only for hybrid race detectors. For pure
+     happens-before detectors this is a no-op. For more details see
+     http://code.google.com/p/data-race-test/wiki/PureHappensBeforeVsHybrid . */
+  #define ANNOTATE_PURE_HAPPENS_BEFORE_MUTEX(mu) \
+    AnnotateMutexIsUsedAsCondVar(__FILE__, __LINE__, mu)
+
+  /* Deprecated. Use ANNOTATE_PURE_HAPPENS_BEFORE_MUTEX. */
+  #define ANNOTATE_MUTEX_IS_USED_AS_CONDVAR(mu) \
+    AnnotateMutexIsUsedAsCondVar(__FILE__, __LINE__, mu)
+
+  /* -------------------------------------------------------------
+     Annotations useful when defining memory allocators, or when memory that
+     was protected in one way starts to be protected in another. */
+
+  /* Report that new memory at "address" of size "size" has been allocated.
+     This might be used when the memory has been retrieved from a free list and
+     is about to be reused, or when the locking discipline for a variable
+     changes. */
+  #define ANNOTATE_NEW_MEMORY(address, size) \
+    AnnotateNewMemory(__FILE__, __LINE__, address, size)
+
+  /* -------------------------------------------------------------
+     Annotations useful when defining FIFO queues that transfer data between
+     threads. */
+
+  /* Report that the producer-consumer queue (such as ProducerConsumerQueue) at
+     address "pcq" has been created.  The ANNOTATE_PCQ_* annotations
+     should be used only for FIFO queues.  For non-FIFO queues use
+     ANNOTATE_HAPPENS_BEFORE (for put) and ANNOTATE_HAPPENS_AFTER (for get). */
+  #define ANNOTATE_PCQ_CREATE(pcq) \
+    AnnotatePCQCreate(__FILE__, __LINE__, pcq)
+
+  /* Report that the queue at address "pcq" is about to be destroyed. */
+  #define ANNOTATE_PCQ_DESTROY(pcq) \
+    AnnotatePCQDestroy(__FILE__, __LINE__, pcq)
+
+  /* Report that we are about to put an element into a FIFO queue at address
+     "pcq". */
+  #define ANNOTATE_PCQ_PUT(pcq) \
+    AnnotatePCQPut(__FILE__, __LINE__, pcq)
+
+  /* Report that we've just got an element from a FIFO queue at address "pcq". */
+  #define ANNOTATE_PCQ_GET(pcq) \
+    AnnotatePCQGet(__FILE__, __LINE__, pcq)
+
+  /* -------------------------------------------------------------
+     Annotations that suppress errors.  It is usually better to express the
+     program's synchronization using the other annotations, but these can
+     be used when all else fails. */
+
+  /* Report that we may have a benign race at "pointer", with size
+     "sizeof(*(pointer))". "pointer" must be a non-void* pointer.  Insert at the
+     point where "pointer" has been allocated, preferably close to the point
+     where the race happens.  See also ANNOTATE_BENIGN_RACE_STATIC. */
+  #define ANNOTATE_BENIGN_RACE(pointer, description) \
+    AnnotateBenignRaceSized(__FILE__, __LINE__, pointer, \
+                            sizeof(*(pointer)), description)
+
+  /* Same as ANNOTATE_BENIGN_RACE(address, description), but applies to
+     the memory range [address, address+size). */
+  #define ANNOTATE_BENIGN_RACE_SIZED(address, size, description) \
+    AnnotateBenignRaceSized(__FILE__, __LINE__, address, size, description)
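+
+  /* A hedged usage sketch (the variable "counter_" and its surrounding code
+     are hypothetical, not part of this header): mark an intentionally racy
+     statistics counter where it is initialized, close to where the race
+     actually happens:
+
+       int counter_ = 0;   // updated without a lock; only approximate
+       ANNOTATE_BENIGN_RACE(&counter_, "approximate stats counter");
+  */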
+
+  /* Request the analysis tool to ignore all reads in the current thread
+     until ANNOTATE_IGNORE_READS_END is called.
+     Useful to ignore intentional racey reads, while still checking
+     other reads and all writes.
+     See also ANNOTATE_UNPROTECTED_READ. */
+  #define ANNOTATE_IGNORE_READS_BEGIN() \
+    AnnotateIgnoreReadsBegin(__FILE__, __LINE__)
+
+  /* Stop ignoring reads. */
+  #define ANNOTATE_IGNORE_READS_END() \
+    AnnotateIgnoreReadsEnd(__FILE__, __LINE__)
+
+  /* Similar to ANNOTATE_IGNORE_READS_BEGIN, but ignore writes. */
+  #define ANNOTATE_IGNORE_WRITES_BEGIN() \
+    AnnotateIgnoreWritesBegin(__FILE__, __LINE__)
+
+  /* Stop ignoring writes. */
+  #define ANNOTATE_IGNORE_WRITES_END() \
+    AnnotateIgnoreWritesEnd(__FILE__, __LINE__)
+
+  /* Start ignoring all memory accesses (reads and writes). */
+  #define ANNOTATE_IGNORE_READS_AND_WRITES_BEGIN() \
+    do {\
+      ANNOTATE_IGNORE_READS_BEGIN();\
+      ANNOTATE_IGNORE_WRITES_BEGIN();\
+    }while(0)\
+
+  /* Stop ignoring all memory accesses. */
+  #define ANNOTATE_IGNORE_READS_AND_WRITES_END() \
+    do {\
+      ANNOTATE_IGNORE_WRITES_END();\
+      ANNOTATE_IGNORE_READS_END();\
+    }while(0)\
+
+  /* Enable (enable!=0) or disable (enable==0) race detection for all threads.
+     This annotation could be useful if you want to skip expensive race analysis
+     during some period of program execution, e.g. during initialization. */
+  #define ANNOTATE_ENABLE_RACE_DETECTION(enable) \
+    AnnotateEnableRaceDetection(__FILE__, __LINE__, enable)
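+
+  /* A hedged sketch of the intended pattern (InitializeSharedState() is a
+     hypothetical function, not something defined here):
+
+       ANNOTATE_ENABLE_RACE_DETECTION(0);   // skip analysis of start-up code
+       InitializeSharedState();
+       ANNOTATE_ENABLE_RACE_DETECTION(1);   // re-enable for steady state
+  */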
+
+  /* -------------------------------------------------------------
+     Annotations useful for debugging. */
+
+  /* Request to trace every access to "address". */
+  #define ANNOTATE_TRACE_MEMORY(address) \
+    AnnotateTraceMemory(__FILE__, __LINE__, address)
+
+  /* Report the current thread name to a race detector. */
+  #define ANNOTATE_THREAD_NAME(name) \
+    AnnotateThreadName(__FILE__, __LINE__, name)
+
+  /* -------------------------------------------------------------
+     Annotations useful when implementing locks.  They are not
+     normally needed by modules that merely use locks.
+     The "lock" argument is a pointer to the lock object. */
+
+  /* Report that a lock has been created at address "lock". */
+  #define ANNOTATE_RWLOCK_CREATE(lock) \
+    AnnotateRWLockCreate(__FILE__, __LINE__, lock)
+
+  /* Report that the lock at address "lock" is about to be destroyed. */
+  #define ANNOTATE_RWLOCK_DESTROY(lock) \
+    AnnotateRWLockDestroy(__FILE__, __LINE__, lock)
+
+  /* Report that the lock at address "lock" has been acquired.
+     is_w=1 for writer lock, is_w=0 for reader lock. */
+  #define ANNOTATE_RWLOCK_ACQUIRED(lock, is_w) \
+    AnnotateRWLockAcquired(__FILE__, __LINE__, lock, is_w)
+
+  /* Report that the lock at address "lock" is about to be released. */
+  #define ANNOTATE_RWLOCK_RELEASED(lock, is_w) \
+    AnnotateRWLockReleased(__FILE__, __LINE__, lock, is_w)
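+
+  /* A hedged sketch of how a user-defined lock might report itself (the
+     type "MySpinLock" and its Acquire()/Release() internals are
+     hypothetical, not part of this header):
+
+       MySpinLock::MySpinLock()  { ANNOTATE_RWLOCK_CREATE(this); }
+       MySpinLock::~MySpinLock() { ANNOTATE_RWLOCK_DESTROY(this); }
+       void MySpinLock::Lock()   { Acquire(); ANNOTATE_RWLOCK_ACQUIRED(this, 1); }
+       void MySpinLock::Unlock() { ANNOTATE_RWLOCK_RELEASED(this, 1); Release(); }
+  */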
+
+  /* -------------------------------------------------------------
+     Annotations useful when implementing barriers.  They are not
+     normally needed by modules that merely use barriers.
+     The "barrier" argument is a pointer to the barrier object. */
+
+  /* Report that the "barrier" has been initialized with initial "count".
+   If 'reinitialization_allowed' is true, initialization is allowed to happen
+   multiple times w/o calling barrier_destroy() */
+  #define ANNOTATE_BARRIER_INIT(barrier, count, reinitialization_allowed) \
+    AnnotateBarrierInit(__FILE__, __LINE__, barrier, count, \
+                        reinitialization_allowed)
+
+  /* Report that we are about to enter barrier_wait("barrier"). */
+  #define ANNOTATE_BARRIER_WAIT_BEFORE(barrier) \
+    AnnotateBarrierWaitBefore(__FILE__, __LINE__, barrier)
+
+  /* Report that we just exited barrier_wait("barrier"). */
+  #define ANNOTATE_BARRIER_WAIT_AFTER(barrier) \
+    AnnotateBarrierWaitAfter(__FILE__, __LINE__, barrier)
+
+  /* Report that the "barrier" has been destroyed. */
+  #define ANNOTATE_BARRIER_DESTROY(barrier) \
+    AnnotateBarrierDestroy(__FILE__, __LINE__, barrier)
+
+  /* -------------------------------------------------------------
+     Annotations useful for testing race detectors. */
+
+  /* Report that we expect a race on the variable at "address".
+     Use only in unit tests for a race detector. */
+  #define ANNOTATE_EXPECT_RACE(address, description) \
+    AnnotateExpectRace(__FILE__, __LINE__, address, description)
+
+  /* A no-op. Insert where you like to test the interceptors. */
+  #define ANNOTATE_NO_OP(arg) \
+    AnnotateNoOp(__FILE__, __LINE__, arg)
+
+  /* Force the race detector to flush its state. The actual effect depends on
+   * the implementation of the detector. */
+  #define ANNOTATE_FLUSH_STATE() \
+    AnnotateFlushState(__FILE__, __LINE__)
+
+
+#else  /* DYNAMIC_ANNOTATIONS_ENABLED == 0 */
+
+  #define ANNOTATE_RWLOCK_CREATE(lock) /* empty */
+  #define ANNOTATE_RWLOCK_DESTROY(lock) /* empty */
+  #define ANNOTATE_RWLOCK_ACQUIRED(lock, is_w) /* empty */
+  #define ANNOTATE_RWLOCK_RELEASED(lock, is_w) /* empty */
+  #define ANNOTATE_BARRIER_INIT(barrier, count, reinitialization_allowed) /* */
+  #define ANNOTATE_BARRIER_WAIT_BEFORE(barrier) /* empty */
+  #define ANNOTATE_BARRIER_WAIT_AFTER(barrier) /* empty */
+  #define ANNOTATE_BARRIER_DESTROY(barrier) /* empty */
+  #define ANNOTATE_CONDVAR_LOCK_WAIT(cv, lock) /* empty */
+  #define ANNOTATE_CONDVAR_WAIT(cv) /* empty */
+  #define ANNOTATE_CONDVAR_SIGNAL(cv) /* empty */
+  #define ANNOTATE_CONDVAR_SIGNAL_ALL(cv) /* empty */
+  #define ANNOTATE_HAPPENS_BEFORE(obj) /* empty */
+  #define ANNOTATE_HAPPENS_AFTER(obj) /* empty */
+  #define ANNOTATE_PUBLISH_MEMORY_RANGE(address, size) /* empty */
+  #define ANNOTATE_UNPUBLISH_MEMORY_RANGE(address, size)  /* empty */
+  #define ANNOTATE_SWAP_MEMORY_RANGE(address, size)  /* empty */
+  #define ANNOTATE_PCQ_CREATE(pcq) /* empty */
+  #define ANNOTATE_PCQ_DESTROY(pcq) /* empty */
+  #define ANNOTATE_PCQ_PUT(pcq) /* empty */
+  #define ANNOTATE_PCQ_GET(pcq) /* empty */
+  #define ANNOTATE_NEW_MEMORY(address, size) /* empty */
+  #define ANNOTATE_EXPECT_RACE(address, description) /* empty */
+  #define ANNOTATE_BENIGN_RACE(address, description) /* empty */
+  #define ANNOTATE_BENIGN_RACE_SIZED(address, size, description) /* empty */
+  #define ANNOTATE_PURE_HAPPENS_BEFORE_MUTEX(mu) /* empty */
+  #define ANNOTATE_MUTEX_IS_USED_AS_CONDVAR(mu) /* empty */
+  #define ANNOTATE_TRACE_MEMORY(arg) /* empty */
+  #define ANNOTATE_THREAD_NAME(name) /* empty */
+  #define ANNOTATE_IGNORE_READS_BEGIN() /* empty */
+  #define ANNOTATE_IGNORE_READS_END() /* empty */
+  #define ANNOTATE_IGNORE_WRITES_BEGIN() /* empty */
+  #define ANNOTATE_IGNORE_WRITES_END() /* empty */
+  #define ANNOTATE_IGNORE_READS_AND_WRITES_BEGIN() /* empty */
+  #define ANNOTATE_IGNORE_READS_AND_WRITES_END() /* empty */
+  #define ANNOTATE_ENABLE_RACE_DETECTION(enable) /* empty */
+  #define ANNOTATE_NO_OP(arg) /* empty */
+  #define ANNOTATE_FLUSH_STATE() /* empty */
+
+#endif  /* DYNAMIC_ANNOTATIONS_ENABLED */
+
+/* Macro definitions for GCC attributes that allow static thread safety
+   analysis to recognize and use some of the dynamic annotations as
+   escape hatches.
+   TODO(lcwu): remove the check for __SUPPORT_DYN_ANNOTATION__ once the
+   default crosstool/GCC supports these GCC attributes.  */
+
+#define ANNOTALYSIS_STATIC_INLINE
+#define ANNOTALYSIS_SEMICOLON_OR_EMPTY_BODY ;
+#define ANNOTALYSIS_IGNORE_READS_BEGIN
+#define ANNOTALYSIS_IGNORE_READS_END
+#define ANNOTALYSIS_IGNORE_WRITES_BEGIN
+#define ANNOTALYSIS_IGNORE_WRITES_END
+#define ANNOTALYSIS_UNPROTECTED_READ
+
+#if defined(__GNUC__) && (!defined(SWIG)) && (!defined(__clang__)) && \
+    defined(__SUPPORT_TS_ANNOTATION__) && defined(__SUPPORT_DYN_ANNOTATION__)
+
+#if DYNAMIC_ANNOTATIONS_ENABLED == 0
+#define ANNOTALYSIS_ONLY 1
+#undef ANNOTALYSIS_STATIC_INLINE
+#define ANNOTALYSIS_STATIC_INLINE static inline
+#undef ANNOTALYSIS_SEMICOLON_OR_EMPTY_BODY
+#define ANNOTALYSIS_SEMICOLON_OR_EMPTY_BODY { (void)file; (void)line; }
+#endif
+
+/* Only emit attributes when annotalysis is enabled. */
+#if defined(__SUPPORT_TS_ANNOTATION__) && defined(__SUPPORT_DYN_ANNOTATION__)
+#undef  ANNOTALYSIS_IGNORE_READS_BEGIN
+#define ANNOTALYSIS_IGNORE_READS_BEGIN  __attribute__ ((ignore_reads_begin))
+#undef  ANNOTALYSIS_IGNORE_READS_END
+#define ANNOTALYSIS_IGNORE_READS_END    __attribute__ ((ignore_reads_end))
+#undef  ANNOTALYSIS_IGNORE_WRITES_BEGIN
+#define ANNOTALYSIS_IGNORE_WRITES_BEGIN __attribute__ ((ignore_writes_begin))
+#undef  ANNOTALYSIS_IGNORE_WRITES_END
+#define ANNOTALYSIS_IGNORE_WRITES_END   __attribute__ ((ignore_writes_end))
+#undef  ANNOTALYSIS_UNPROTECTED_READ
+#define ANNOTALYSIS_UNPROTECTED_READ    __attribute__ ((unprotected_read))
+#endif
+
+#endif  /* defined(__GNUC__) && !defined(SWIG) && !defined(__clang__) &&
+           defined(__SUPPORT_TS_ANNOTATION__) && defined(__SUPPORT_DYN_ANNOTATION__) */
+
+/* Use the macros above rather than using these functions directly. */
+#ifdef __cplusplus
+extern "C" {
+#endif
+void AnnotateRWLockCreate(const char *file, int line,
+                          const volatile void *lock);
+void AnnotateRWLockDestroy(const char *file, int line,
+                           const volatile void *lock);
+void AnnotateRWLockAcquired(const char *file, int line,
+                            const volatile void *lock, long is_w);
+void AnnotateRWLockReleased(const char *file, int line,
+                            const volatile void *lock, long is_w);
+void AnnotateBarrierInit(const char *file, int line,
+                         const volatile void *barrier, long count,
+                         long reinitialization_allowed);
+void AnnotateBarrierWaitBefore(const char *file, int line,
+                               const volatile void *barrier);
+void AnnotateBarrierWaitAfter(const char *file, int line,
+                              const volatile void *barrier);
+void AnnotateBarrierDestroy(const char *file, int line,
+                            const volatile void *barrier);
+void AnnotateCondVarWait(const char *file, int line,
+                         const volatile void *cv,
+                         const volatile void *lock);
+void AnnotateCondVarSignal(const char *file, int line,
+                           const volatile void *cv);
+void AnnotateCondVarSignalAll(const char *file, int line,
+                              const volatile void *cv);
+void AnnotatePublishMemoryRange(const char *file, int line,
+                                const volatile void *address,
+                                long size);
+void AnnotateUnpublishMemoryRange(const char *file, int line,
+                                  const volatile void *address,
+                                  long size);
+void AnnotatePCQCreate(const char *file, int line,
+                       const volatile void *pcq);
+void AnnotatePCQDestroy(const char *file, int line,
+                        const volatile void *pcq);
+void AnnotatePCQPut(const char *file, int line,
+                    const volatile void *pcq);
+void AnnotatePCQGet(const char *file, int line,
+                    const volatile void *pcq);
+void AnnotateNewMemory(const char *file, int line,
+                       const volatile void *address,
+                       long size);
+void AnnotateExpectRace(const char *file, int line,
+                        const volatile void *address,
+                        const char *description);
+void AnnotateBenignRace(const char *file, int line,
+                        const volatile void *address,
+                        const char *description);
+void AnnotateBenignRaceSized(const char *file, int line,
+                        const volatile void *address,
+                        long size,
+                        const char *description);
+void AnnotateMutexIsUsedAsCondVar(const char *file, int line,
+                                  const volatile void *mu);
+void AnnotateTraceMemory(const char *file, int line,
+                         const volatile void *arg);
+void AnnotateThreadName(const char *file, int line,
+                        const char *name);
+ANNOTALYSIS_STATIC_INLINE
+void AnnotateIgnoreReadsBegin(const char *file, int line)
+    ANNOTALYSIS_IGNORE_READS_BEGIN ANNOTALYSIS_SEMICOLON_OR_EMPTY_BODY
+ANNOTALYSIS_STATIC_INLINE
+void AnnotateIgnoreReadsEnd(const char *file, int line)
+    ANNOTALYSIS_IGNORE_READS_END ANNOTALYSIS_SEMICOLON_OR_EMPTY_BODY
+ANNOTALYSIS_STATIC_INLINE
+void AnnotateIgnoreWritesBegin(const char *file, int line)
+    ANNOTALYSIS_IGNORE_WRITES_BEGIN ANNOTALYSIS_SEMICOLON_OR_EMPTY_BODY
+ANNOTALYSIS_STATIC_INLINE
+void AnnotateIgnoreWritesEnd(const char *file, int line)
+    ANNOTALYSIS_IGNORE_WRITES_END ANNOTALYSIS_SEMICOLON_OR_EMPTY_BODY
+void AnnotateEnableRaceDetection(const char *file, int line, int enable);
+void AnnotateNoOp(const char *file, int line,
+                  const volatile void *arg);
+void AnnotateFlushState(const char *file, int line);
+
+/* Return non-zero value if running under valgrind.
+
+  If "valgrind.h" is included into dynamic_annotations.c,
+  the regular valgrind mechanism will be used.
+  See http://valgrind.org/docs/manual/manual-core-adv.html about
+  RUNNING_ON_VALGRIND and other valgrind "client requests".
+  The file "valgrind.h" may be obtained by doing
+     svn co svn://svn.valgrind.org/valgrind/trunk/include
+
+  If for some reason you can't use "valgrind.h" or want to fake valgrind,
+  there are two ways to make this function return non-zero:
+    - Use environment variable: export RUNNING_ON_VALGRIND=1
+    - Make your tool intercept the function RunningOnValgrind() and
+      change its return value.
+ */
+int RunningOnValgrind(void);
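+
+/* A hedged usage sketch (kIterations is a hypothetical test constant):
+   heavy-weight tools make everything slower, so tests commonly shrink their
+   workload when one is detected:
+
+     int iterations = RunningOnValgrind() ? kIterations / 100 : kIterations;
+ */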
+
+/* ValgrindSlowdown returns:
+    * 1.0, if (RunningOnValgrind() == 0)
+    * 50.0, if (RunningOnValgrind() != 0 && getenv("VALGRIND_SLOWDOWN") == NULL)
+    * atof(getenv("VALGRIND_SLOWDOWN")) otherwise
+   This function can be used to scale timeout values:
+   EXAMPLE:
+   for (;;) {
+     DoExpensiveBackgroundTask();
+     SleepForSeconds(5 * ValgrindSlowdown());
+   }
+ */
+double ValgrindSlowdown(void);
+
+#ifdef __cplusplus
+}
+#endif
+
+#if DYNAMIC_ANNOTATIONS_ENABLED != 0 && defined(__cplusplus)
+
+  /* ANNOTATE_UNPROTECTED_READ is the preferred way to annotate racey reads.
+
+     Instead of doing
+        ANNOTATE_IGNORE_READS_BEGIN();
+        ... = x;
+        ANNOTATE_IGNORE_READS_END();
+     one can use
+        ... = ANNOTATE_UNPROTECTED_READ(x); */
+  template <class T>
+  inline T ANNOTATE_UNPROTECTED_READ(const volatile T &x)
+      ANNOTALYSIS_UNPROTECTED_READ {
+    ANNOTATE_IGNORE_READS_BEGIN();
+    T res = x;
+    ANNOTATE_IGNORE_READS_END();
+    return res;
+  }
+  /* Apply ANNOTATE_BENIGN_RACE_SIZED to a static variable. */
+  #define ANNOTATE_BENIGN_RACE_STATIC(static_var, description)        \
+    namespace {                                                       \
+      class static_var ## _annotator {                                \
+       public:                                                        \
+        static_var ## _annotator() {                                  \
+          ANNOTATE_BENIGN_RACE_SIZED(&static_var,                     \
+                                      sizeof(static_var),             \
+            # static_var ": " description);                           \
+        }                                                             \
+      };                                                              \
+      static static_var ## _annotator the ## static_var ## _annotator;\
+    }
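+
+  /* A hedged usage sketch for the static variant (the variable
+     "g_signal_count" is hypothetical):
+
+       static int g_signal_count = 0;
+       ANNOTATE_BENIGN_RACE_STATIC(g_signal_count, "incremented from handlers");
+
+     The macro expands to a file-local class whose constructor runs during
+     static initialization and registers the benign race on the variable. */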
+#else /* DYNAMIC_ANNOTATIONS_ENABLED == 0 */
+
+  #define ANNOTATE_UNPROTECTED_READ(x) (x)
+  #define ANNOTATE_BENIGN_RACE_STATIC(static_var, description)  /* empty */
+
+#endif /* DYNAMIC_ANNOTATIONS_ENABLED */
+
+/* Annotalysis, a GCC based static analyzer, is able to understand and use
+   some of the dynamic annotations defined in this file. However, dynamic
+   annotations are usually disabled in the opt mode (to avoid additional
+   runtime overheads) while Annotalysis only works in the opt mode.
+   In order for Annotalysis to use these dynamic annotations when they
+   are disabled, we re-define these annotations here. Note that unlike the
+   original macro definitions above, these macros are expanded to calls to
+   static inline functions so that the compiler will be able to remove the
+   calls after the analysis. */
+
+#ifdef ANNOTALYSIS_ONLY
+
+  #undef ANNOTALYSIS_ONLY
+
+  /* Undefine and re-define the macros that the static analyzer understands. */
+  #undef ANNOTATE_IGNORE_READS_BEGIN
+  #define ANNOTATE_IGNORE_READS_BEGIN()           \
+    AnnotateIgnoreReadsBegin(__FILE__, __LINE__)
+
+  #undef ANNOTATE_IGNORE_READS_END
+  #define ANNOTATE_IGNORE_READS_END()             \
+    AnnotateIgnoreReadsEnd(__FILE__, __LINE__)
+
+  #undef ANNOTATE_IGNORE_WRITES_BEGIN
+  #define ANNOTATE_IGNORE_WRITES_BEGIN()          \
+    AnnotateIgnoreWritesBegin(__FILE__, __LINE__)
+
+  #undef ANNOTATE_IGNORE_WRITES_END
+  #define ANNOTATE_IGNORE_WRITES_END()            \
+    AnnotateIgnoreWritesEnd(__FILE__, __LINE__)
+
+  #undef ANNOTATE_IGNORE_READS_AND_WRITES_BEGIN
+  #define ANNOTATE_IGNORE_READS_AND_WRITES_BEGIN()       \
+    do {                                                 \
+      ANNOTATE_IGNORE_READS_BEGIN();                     \
+      ANNOTATE_IGNORE_WRITES_BEGIN();                    \
+    }while(0)                                            \
+
+  #undef ANNOTATE_IGNORE_READS_AND_WRITES_END
+  #define ANNOTATE_IGNORE_READS_AND_WRITES_END()  \
+    do {                                          \
+      ANNOTATE_IGNORE_WRITES_END();               \
+      ANNOTATE_IGNORE_READS_END();                \
+    }while(0)                                     \
+
+  #if defined(__cplusplus)
+    #undef ANNOTATE_UNPROTECTED_READ
+    template <class T>
+    inline T ANNOTATE_UNPROTECTED_READ(const volatile T &x)
+         ANNOTALYSIS_UNPROTECTED_READ {
+      ANNOTATE_IGNORE_READS_BEGIN();
+      T res = x;
+      ANNOTATE_IGNORE_READS_END();
+      return res;
+    }
+  #endif /* __cplusplus */
+
+#endif /* ANNOTALYSIS_ONLY */
+
+/* Undefine the macros intended only in this file. */
+#undef ANNOTALYSIS_STATIC_INLINE
+#undef ANNOTALYSIS_SEMICOLON_OR_EMPTY_BODY
+
+#endif  /* BASE_DYNAMIC_ANNOTATIONS_H_ */
diff --git a/src/base/elf_mem_image.cc b/src/base/elf_mem_image.cc
new file mode 100644
index 0000000..d2ca1a5
--- /dev/null
+++ b/src/base/elf_mem_image.cc
@@ -0,0 +1,434 @@
+// -*- Mode: C++; c-basic-offset: 2; indent-tabs-mode: nil -*-
+// Copyright (c) 2008, Google Inc.
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+//     * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+//     * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+// ---
+// Author: Paul Pluzhnikov
+//
+// Allow dynamic symbol lookup in an in-memory Elf image.
+//
+
+#include "base/elf_mem_image.h"
+
+#ifdef HAVE_ELF_MEM_IMAGE  // defined in elf_mem_image.h
+
+#include <stddef.h>   // for size_t, ptrdiff_t
+#include "base/logging.h"
+
+// From binutils/include/elf/common.h (this doesn't appear to be documented
+// anywhere else).
+//
+//   /* This flag appears in a Versym structure.  It means that the symbol
+//      is hidden, and is only visible with an explicit version number.
+//      This is a GNU extension.  */
+//   #define VERSYM_HIDDEN           0x8000
+//
+//   /* This is the mask for the rest of the Versym information.  */
+//   #define VERSYM_VERSION          0x7fff
+
+#define VERSYM_VERSION 0x7fff
+
+namespace base {
+
+namespace {
+template <int N> class ElfClass {
+ public:
+  static const int kElfClass = -1;
+  static int ElfBind(const ElfW(Sym) *) {
+    CHECK(false); // << "Unexpected word size";
+    return 0;
+  }
+  static int ElfType(const ElfW(Sym) *) {
+    CHECK(false); // << "Unexpected word size";
+    return 0;
+  }
+};
+
+template <> class ElfClass<32> {
+ public:
+  static const int kElfClass = ELFCLASS32;
+  static int ElfBind(const ElfW(Sym) *symbol) {
+    return ELF32_ST_BIND(symbol->st_info);
+  }
+  static int ElfType(const ElfW(Sym) *symbol) {
+    return ELF32_ST_TYPE(symbol->st_info);
+  }
+};
+
+template <> class ElfClass<64> {
+ public:
+  static const int kElfClass = ELFCLASS64;
+  static int ElfBind(const ElfW(Sym) *symbol) {
+    return ELF64_ST_BIND(symbol->st_info);
+  }
+  static int ElfType(const ElfW(Sym) *symbol) {
+    return ELF64_ST_TYPE(symbol->st_info);
+  }
+};
+
+typedef ElfClass<__WORDSIZE> CurrentElfClass;
+
+// Extract an element from one of the ELF tables, cast it to desired type.
+// This is just simple arithmetic and a glorified cast.
+// Callers are responsible for bounds checking.
+template <class T>
+const T* GetTableElement(const ElfW(Ehdr) *ehdr,
+                         ElfW(Off) table_offset,
+                         ElfW(Word) element_size,
+                         size_t index) {
+  return reinterpret_cast<const T*>(reinterpret_cast<const char *>(ehdr)
+                                    + table_offset
+                                    + index * element_size);
+}
+}  // namespace
+
+const void *const ElfMemImage::kInvalidBase =
+    reinterpret_cast<const void *>(~0L);
+
+ElfMemImage::ElfMemImage(const void *base) {
+  CHECK(base != kInvalidBase);
+  Init(base);
+}
+
+int ElfMemImage::GetNumSymbols() const {
+  if (!hash_) {
+    return 0;
+  }
+  // See http://www.caldera.com/developers/gabi/latest/ch5.dynamic.html#hash
+  return hash_[1];
+}
+
+const ElfW(Sym) *ElfMemImage::GetDynsym(int index) const {
+  CHECK_LT(index, GetNumSymbols());
+  return dynsym_ + index;
+}
+
+const ElfW(Versym) *ElfMemImage::GetVersym(int index) const {
+  CHECK_LT(index, GetNumSymbols());
+  return versym_ + index;
+}
+
+const ElfW(Phdr) *ElfMemImage::GetPhdr(int index) const {
+  CHECK_LT(index, ehdr_->e_phnum);
+  return GetTableElement<ElfW(Phdr)>(ehdr_,
+                                     ehdr_->e_phoff,
+                                     ehdr_->e_phentsize,
+                                     index);
+}
+
+const char *ElfMemImage::GetDynstr(ElfW(Word) offset) const {
+  CHECK_LT(offset, strsize_);
+  return dynstr_ + offset;
+}
+
+const void *ElfMemImage::GetSymAddr(const ElfW(Sym) *sym) const {
+  if (sym->st_shndx == SHN_UNDEF || sym->st_shndx >= SHN_LORESERVE) {
+    // Symbol corresponds to a "special" (e.g. SHN_ABS) section.
+    return reinterpret_cast<const void *>(sym->st_value);
+  }
+  CHECK_LT(link_base_, sym->st_value);
+  return GetTableElement<char>(ehdr_, 0, 1, sym->st_value) - link_base_;
+}
+
+const ElfW(Verdef) *ElfMemImage::GetVerdef(int index) const {
+  CHECK_LE(index, verdefnum_);
+  const ElfW(Verdef) *version_definition = verdef_;
+  while (version_definition->vd_ndx < index && version_definition->vd_next) {
+    const char *const version_definition_as_char =
+        reinterpret_cast<const char *>(version_definition);
+    version_definition =
+        reinterpret_cast<const ElfW(Verdef) *>(version_definition_as_char +
+                                               version_definition->vd_next);
+  }
+  return version_definition->vd_ndx == index ? version_definition : NULL;
+}
+
+const ElfW(Verdaux) *ElfMemImage::GetVerdefAux(
+    const ElfW(Verdef) *verdef) const {
+  return reinterpret_cast<const ElfW(Verdaux) *>(verdef+1);
+}
+
+const char *ElfMemImage::GetVerstr(ElfW(Word) offset) const {
+  CHECK_LT(offset, strsize_);
+  return dynstr_ + offset;
+}
+
+void ElfMemImage::Init(const void *base) {
+  ehdr_      = NULL;
+  dynsym_    = NULL;
+  dynstr_    = NULL;
+  versym_    = NULL;
+  verdef_    = NULL;
+  hash_      = NULL;
+  strsize_   = 0;
+  verdefnum_ = 0;
+  link_base_ = ~0L;  // Sentinel: PT_LOAD .p_vaddr can't possibly be this.
+  if (!base) {
+    return;
+  }
+  const uintptr_t base_as_uintptr_t = reinterpret_cast<uintptr_t>(base);
+  // Fake VDSO has low bit set.
+  const bool fake_vdso = ((base_as_uintptr_t & 1) != 0);
+  base = reinterpret_cast<const void *>(base_as_uintptr_t & ~1);
+  const char *const base_as_char = reinterpret_cast<const char *>(base);
+  if (base_as_char[EI_MAG0] != ELFMAG0 || base_as_char[EI_MAG1] != ELFMAG1 ||
+      base_as_char[EI_MAG2] != ELFMAG2 || base_as_char[EI_MAG3] != ELFMAG3) {
+    RAW_DCHECK(false, "no ELF magic"); // at %p", base);
+    return;
+  }
+  int elf_class = base_as_char[EI_CLASS];
+  if (elf_class != CurrentElfClass::kElfClass) {
+    DCHECK_EQ(elf_class, CurrentElfClass::kElfClass);
+    return;
+  }
+  switch (base_as_char[EI_DATA]) {
+    case ELFDATA2LSB: {
+      if (__LITTLE_ENDIAN != __BYTE_ORDER) {
+        DCHECK_EQ(__LITTLE_ENDIAN, __BYTE_ORDER); // << ": wrong byte order";
+        return;
+      }
+      break;
+    }
+    case ELFDATA2MSB: {
+      if (__BIG_ENDIAN != __BYTE_ORDER) {
+        DCHECK_EQ(__BIG_ENDIAN, __BYTE_ORDER); // << ": wrong byte order";
+        return;
+      }
+      break;
+    }
+    default: {
+      RAW_DCHECK(false, "unexpected data encoding"); // << base_as_char[EI_DATA];
+      return;
+    }
+  }
+
+  ehdr_ = reinterpret_cast<const ElfW(Ehdr) *>(base);
+  const ElfW(Phdr) *dynamic_program_header = NULL;
+  for (int i = 0; i < ehdr_->e_phnum; ++i) {
+    const ElfW(Phdr) *const program_header = GetPhdr(i);
+    switch (program_header->p_type) {
+      case PT_LOAD:
+        if (link_base_ == ~0L) {
+          link_base_ = program_header->p_vaddr;
+        }
+        break;
+      case PT_DYNAMIC:
+        dynamic_program_header = program_header;
+        break;
+    }
+  }
+  if (link_base_ == ~0L || !dynamic_program_header) {
+    RAW_DCHECK(~0L != link_base_, "no PT_LOADs in VDSO");
+    RAW_DCHECK(dynamic_program_header, "no PT_DYNAMIC in VDSO");
+    // Mark this image as not present. Cannot recurse infinitely.
+    Init(0);
+    return;
+  }
+  ptrdiff_t relocation =
+      base_as_char - reinterpret_cast<const char *>(link_base_);
+  ElfW(Dyn) *dynamic_entry =
+      reinterpret_cast<ElfW(Dyn) *>(dynamic_program_header->p_vaddr +
+                                    relocation);
+  for (; dynamic_entry->d_tag != DT_NULL; ++dynamic_entry) {
+    ElfW(Xword) value = dynamic_entry->d_un.d_val;
+    if (fake_vdso) {
+      // A complication: in the real VDSO, dynamic entries are not relocated
+      // (it wasn't loaded by a dynamic loader). But when testing with a
+      // "fake" dlopen()ed vdso library, the loader relocates some (but
+      // not all!) of them before we get here.
+      if (dynamic_entry->d_tag == DT_VERDEF) {
+        // The only dynamic entry (of the ones we care about) that the
+        // libc-2.3.6 loader doesn't relocate.
+        value += relocation;
+      }
+    } else {
+      // Real VDSO. Everything needs to be relocated.
+      value += relocation;
+    }
+    switch (dynamic_entry->d_tag) {
+      case DT_HASH:
+        hash_ = reinterpret_cast<ElfW(Word) *>(value);
+        break;
+      case DT_SYMTAB:
+        dynsym_ = reinterpret_cast<ElfW(Sym) *>(value);
+        break;
+      case DT_STRTAB:
+        dynstr_ = reinterpret_cast<const char *>(value);
+        break;
+      case DT_VERSYM:
+        versym_ = reinterpret_cast<ElfW(Versym) *>(value);
+        break;
+      case DT_VERDEF:
+        verdef_ = reinterpret_cast<ElfW(Verdef) *>(value);
+        break;
+      case DT_VERDEFNUM:
+        verdefnum_ = dynamic_entry->d_un.d_val;
+        break;
+      case DT_STRSZ:
+        strsize_ = dynamic_entry->d_un.d_val;
+        break;
+      default:
+        // Unrecognized entries explicitly ignored.
+        break;
+    }
+  }
+  if (!hash_ || !dynsym_ || !dynstr_ || !versym_ ||
+      !verdef_ || !verdefnum_ || !strsize_) {
+    RAW_DCHECK(hash_, "invalid VDSO (no DT_HASH)");
+    RAW_DCHECK(dynsym_, "invalid VDSO (no DT_SYMTAB)");
+    RAW_DCHECK(dynstr_, "invalid VDSO (no DT_STRTAB)");
+    RAW_DCHECK(versym_, "invalid VDSO (no DT_VERSYM)");
+    RAW_DCHECK(verdef_, "invalid VDSO (no DT_VERDEF)");
+    RAW_DCHECK(verdefnum_, "invalid VDSO (no DT_VERDEFNUM)");
+    RAW_DCHECK(strsize_, "invalid VDSO (no DT_STRSZ)");
+    // Mark this image as not present. Cannot recurse infinitely.
+    Init(0);
+    return;
+  }
+}
+
+bool ElfMemImage::LookupSymbol(const char *name,
+                               const char *version,
+                               int type,
+                               SymbolInfo *info) const {
+  for (SymbolIterator it = begin(); it != end(); ++it) {
+    if (strcmp(it->name, name) == 0 && strcmp(it->version, version) == 0 &&
+        CurrentElfClass::ElfType(it->symbol) == type) {
+      if (info) {
+        *info = *it;
+      }
+      return true;
+    }
+  }
+  return false;
+}
+
+bool ElfMemImage::LookupSymbolByAddress(const void *address,
+                                        SymbolInfo *info_out) const {
+  for (SymbolIterator it = begin(); it != end(); ++it) {
+    const char *const symbol_start =
+        reinterpret_cast<const char *>(it->address);
+    const char *const symbol_end = symbol_start + it->symbol->st_size;
+    if (symbol_start <= address && address < symbol_end) {
+      if (info_out) {
+        // Client wants to know details for that symbol (the usual case).
+        if (CurrentElfClass::ElfBind(it->symbol) == STB_GLOBAL) {
+          // Strong symbol; just return it.
+          *info_out = *it;
+          return true;
+        } else {
+          // Weak or local. Record it, but keep looking for a strong one.
+          *info_out = *it;
+        }
+      } else {
+        // Client only cares if there is an overlapping symbol.
+        return true;
+      }
+    }
+  }
+  return false;
+}
+
+ElfMemImage::SymbolIterator::SymbolIterator(const void *const image, int index)
+    : index_(index), image_(image) {
+}
+
+const ElfMemImage::SymbolInfo *ElfMemImage::SymbolIterator::operator->() const {
+  return &info_;
+}
+
+const ElfMemImage::SymbolInfo& ElfMemImage::SymbolIterator::operator*() const {
+  return info_;
+}
+
+bool ElfMemImage::SymbolIterator::operator==(const SymbolIterator &rhs) const {
+  return this->image_ == rhs.image_ && this->index_ == rhs.index_;
+}
+
+bool ElfMemImage::SymbolIterator::operator!=(const SymbolIterator &rhs) const {
+  return !(*this == rhs);
+}
+
+ElfMemImage::SymbolIterator &ElfMemImage::SymbolIterator::operator++() {
+  this->Update(1);
+  return *this;
+}
+
+ElfMemImage::SymbolIterator ElfMemImage::begin() const {
+  SymbolIterator it(this, 0);
+  it.Update(0);
+  return it;
+}
+
+ElfMemImage::SymbolIterator ElfMemImage::end() const {
+  return SymbolIterator(this, GetNumSymbols());
+}
+
+void ElfMemImage::SymbolIterator::Update(int increment) {
+  const ElfMemImage *image = reinterpret_cast<const ElfMemImage *>(image_);
+  CHECK(image->IsPresent() || increment == 0);
+  if (!image->IsPresent()) {
+    return;
+  }
+  index_ += increment;
+  if (index_ >= image->GetNumSymbols()) {
+    index_ = image->GetNumSymbols();
+    return;
+  }
+  const ElfW(Sym)    *symbol = image->GetDynsym(index_);
+  const ElfW(Versym) *version_symbol = image->GetVersym(index_);
+  CHECK(symbol && version_symbol);
+  const char *const symbol_name = image->GetDynstr(symbol->st_name);
+  const ElfW(Versym) version_index = version_symbol[0] & VERSYM_VERSION;
+  const ElfW(Verdef) *version_definition = NULL;
+  const char *version_name = "";
+  if (symbol->st_shndx == SHN_UNDEF) {
+    // Undefined symbols reference DT_VERNEED, not DT_VERDEF, and
+    // version_index could well be greater than verdefnum_, so calling
+    // GetVerdef(version_index) may trigger an assertion.
+  } else {
+    version_definition = image->GetVerdef(version_index);
+  }
+  if (version_definition) {
+    // I am expecting 1 or 2 auxiliary entries: 1 for the version itself,
+    // an optional 2nd if the version has a parent.
+    CHECK_LE(1, version_definition->vd_cnt);
+    CHECK_LE(version_definition->vd_cnt, 2);
+    const ElfW(Verdaux) *version_aux = image->GetVerdefAux(version_definition);
+    version_name = image->GetVerstr(version_aux->vda_name);
+  }
+  info_.name    = symbol_name;
+  info_.version = version_name;
+  info_.address = image->GetSymAddr(symbol);
+  info_.symbol  = symbol;
+}
+
+}  // namespace base
+
+#endif  // HAVE_ELF_MEM_IMAGE
diff --git a/src/base/elf_mem_image.h b/src/base/elf_mem_image.h
new file mode 100644
index 0000000..5fb00ff
--- /dev/null
+++ b/src/base/elf_mem_image.h
@@ -0,0 +1,135 @@
+// -*- Mode: C++; c-basic-offset: 2; indent-tabs-mode: nil -*-
+// Copyright (c) 2008, Google Inc.
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+//     * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+//     * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+// ---
+// Author: Paul Pluzhnikov
+//
+// Allow dynamic symbol lookup for in-memory Elf images.
+
+#ifndef BASE_ELF_MEM_IMAGE_H_
+#define BASE_ELF_MEM_IMAGE_H_
+
+#include <config.h>
+#ifdef HAVE_FEATURES_H
+#include <features.h>   // for __GLIBC__
+#endif
+
+// Maybe one day we can rewrite this file not to require the elf
+// symbol extensions in glibc, but for right now we need them.
+#if defined(__ELF__) && defined(__GLIBC__) && !defined(__native_client__)
+
+#define HAVE_ELF_MEM_IMAGE 1
+
+#include <stdlib.h>
+#include <link.h>  // for ElfW
+
+namespace base {
+
+// An in-memory ELF image (may not exist on disk).
+class ElfMemImage {
+ public:
+  // Sentinel: there could never be an elf image at this address.
+  static const void *const kInvalidBase;
+
+  // Information about a single vdso symbol.
+  // All pointers are into .dynsym, .dynstr, or .text of the VDSO.
+  // Do not free() them or modify through them.
+  struct SymbolInfo {
+    const char      *name;      // E.g. "__vdso_getcpu"
+    const char      *version;   // E.g. "LINUX_2.6", could be ""
+                                // for an unversioned symbol.
+    const void      *address;   // Relocated symbol address.
+    const ElfW(Sym) *symbol;    // Symbol in the dynamic symbol table.
+  };
+
+  // Supports iteration over all dynamic symbols.
+  class SymbolIterator {
+   public:
+    friend class ElfMemImage;
+    const SymbolInfo *operator->() const;
+    const SymbolInfo &operator*() const;
+    SymbolIterator& operator++();
+    bool operator!=(const SymbolIterator &rhs) const;
+    bool operator==(const SymbolIterator &rhs) const;
+   private:
+    SymbolIterator(const void *const image, int index);
+    void Update(int incr);
+    SymbolInfo info_;
+    int index_;
+    const void *const image_;
+  };
+
+
+  explicit ElfMemImage(const void *base);
+  void                 Init(const void *base);
+  bool                 IsPresent() const { return ehdr_ != NULL; }
+  const ElfW(Phdr)*    GetPhdr(int index) const;
+  const ElfW(Sym)*     GetDynsym(int index) const;
+  const ElfW(Versym)*  GetVersym(int index) const;
+  const ElfW(Verdef)*  GetVerdef(int index) const;
+  const ElfW(Verdaux)* GetVerdefAux(const ElfW(Verdef) *verdef) const;
+  const char*          GetDynstr(ElfW(Word) offset) const;
+  const void*          GetSymAddr(const ElfW(Sym) *sym) const;
+  const char*          GetVerstr(ElfW(Word) offset) const;
+  int                  GetNumSymbols() const;
+
+  SymbolIterator begin() const;
+  SymbolIterator end() const;
+
+  // Look up versioned dynamic symbol in the image.
+  // Returns false if image is not present, or doesn't contain given
+  // symbol/version/type combination.
+  // If info_out != NULL, additional details are filled in.
+  bool LookupSymbol(const char *name, const char *version,
+                    int symbol_type, SymbolInfo *info_out) const;
+
+  // Find info about symbol (if any) which overlaps given address.
+  // Returns true if symbol was found; false if image isn't present
+  // or doesn't have a symbol overlapping given address.
+  // If info_out != NULL, additional details are filled in.
+  bool LookupSymbolByAddress(const void *address, SymbolInfo *info_out) const;
+
+ private:
+  const ElfW(Ehdr) *ehdr_;
+  const ElfW(Sym) *dynsym_;
+  const ElfW(Versym) *versym_;
+  const ElfW(Verdef) *verdef_;
+  const ElfW(Word) *hash_;
+  const char *dynstr_;
+  size_t strsize_;
+  size_t verdefnum_;
+  ElfW(Addr) link_base_;     // Link-time base (p_vaddr of first PT_LOAD).
+};
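+
+// A hedged usage sketch.  Obtaining the VDSO base via
+// getauxval(AT_SYSINFO_EHDR) (<sys/auxv.h>) is an assumption about the
+// caller's environment; this header itself does not provide it.
+//
+//   base::ElfMemImage vdso(
+//       reinterpret_cast<const void *>(getauxval(AT_SYSINFO_EHDR)));
+//   base::ElfMemImage::SymbolInfo info;
+//   if (vdso.IsPresent() &&
+//       vdso.LookupSymbol("__vdso_getcpu", "LINUX_2.6", STT_FUNC, &info)) {
+//     // info.address is the relocated address of the symbol.
+//   }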
+
+}  // namespace base
+
+#endif  // __ELF__ and __GLIBC__ and !__native_client__
+
+#endif  // BASE_ELF_MEM_IMAGE_H_
diff --git a/src/base/elfcore.h b/src/base/elfcore.h
new file mode 100644
index 0000000..d9599ed
--- /dev/null
+++ b/src/base/elfcore.h
@@ -0,0 +1,401 @@
+// -*- Mode: C++; c-basic-offset: 2; indent-tabs-mode: nil -*-
+/* Copyright (c) 2005-2008, Google Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are
+ * met:
+ *
+ *     * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following disclaimer
+ * in the documentation and/or other materials provided with the
+ * distribution.
+ *     * Neither the name of Google Inc. nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * ---
+ * Author: Markus Gutschke, Carl Crous
+ */
+
+#ifndef _ELFCORE_H
+#define _ELFCORE_H
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/* We currently only support x86-32, x86-64, ARM, MIPS, and PPC on Linux.
+ * Porting to other related platforms should not be difficult.
+ */
+#if (defined(__i386__) || defined(__x86_64__) || defined(__ARM_ARCH_3__) || \
+     defined(__mips__) || defined(__PPC__)) && defined(__linux)
+
+#include <stdarg.h>
+#include <stdint.h>
+#include <sys/types.h>
+#include <config.h>
+
+
+/* Define the DUMPER symbol to make sure that there is exactly one
+ * core dumper built into the library.
+ */
+#define DUMPER "ELF"
+
+/* By the time that we get a chance to read CPU registers in the
+ * calling thread, they are already in a not particularly useful
+ * state. Besides, there will be multiple frames on the stack that are
+ * just making the core file confusing. To fix this problem, we take a
+ * snapshot of the frame pointer, stack pointer, and instruction
+ * pointer at an earlier time, and then insert these values into the
+ * core file.
+ */
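+
+/* Rough illustrative flow (a sketch only; "regs" stands for a register
+ * structure obtained elsewhere, e.g. via ptrace(), and is not defined here):
+ *
+ *   FRAME(frame);            // caller snapshots its own registers first
+ *   ...                      // suspend/attach the threads to be dumped
+ *   SET_FRAME(frame, regs);  // patch the snapshot into the dumped registers
+ */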
+
+#if defined(__i386__) || defined(__x86_64__)
+  typedef struct i386_regs {    /* Normal (non-FPU) CPU registers            */
+  #ifdef __x86_64__
+    #define BP rbp
+    #define SP rsp
+    #define IP rip
+    uint64_t  r15,r14,r13,r12,rbp,rbx,r11,r10;
+    uint64_t  r9,r8,rax,rcx,rdx,rsi,rdi,orig_rax;
+    uint64_t  rip,cs,eflags;
+    uint64_t  rsp,ss;
+    uint64_t  fs_base, gs_base;
+    uint64_t  ds,es,fs,gs;
+  #else
+    #define BP ebp
+    #define SP esp
+    #define IP eip
+    uint32_t  ebx, ecx, edx, esi, edi, ebp, eax;
+    uint16_t  ds, __ds, es, __es;
+    uint16_t  fs, __fs, gs, __gs;
+    uint32_t  orig_eax, eip;
+    uint16_t  cs, __cs;
+    uint32_t  eflags, esp;
+    uint16_t  ss, __ss;
+  #endif
+  } i386_regs;
+#elif defined(__ARM_ARCH_3__)
+  typedef struct arm_regs {     /* General purpose registers                 */
+    #define BP uregs[11]        /* Frame pointer                             */
+    #define SP uregs[13]        /* Stack pointer                             */
+    #define IP uregs[15]        /* Program counter                           */
+    #define LR uregs[14]        /* Link register                             */
+    long uregs[18];
+  } arm_regs;
+#elif defined(__mips__)
+  typedef struct mips_regs {
+    unsigned long pad[6];       /* Unused padding to match kernel structures */
+    unsigned long uregs[32];    /* General purpose registers.                */
+    unsigned long hi;           /* Used for multiplication and division.     */
+    unsigned long lo;
+    unsigned long cp0_epc;      /* Program counter.                          */
+    unsigned long cp0_badvaddr;
+    unsigned long cp0_status;
+    unsigned long cp0_cause;
+    unsigned long unused;
+  } mips_regs;
+#elif defined (__PPC__)
+  typedef struct ppc_regs {
+    #define SP uregs[1]         /* Stack pointer                             */
+    #define IP rip              /* Program counter                           */
+    #define LR lr               /* Link register                             */
+    unsigned long uregs[32];    /* General Purpose Registers - r0-r31.       */
+    double        fpr[32];      /* Floating-Point Registers - f0-f31.        */
+    unsigned long rip;          /* Program counter.                          */
+    unsigned long msr;
+    unsigned long ccr;
+    unsigned long lr;
+    unsigned long ctr;
+    unsigned long xeq;
+    unsigned long mq;
+  } ppc_regs;
+#endif
+
+#if defined(__i386__) && defined(__GNUC__)
+  /* On x86 we provide an optimized version of the FRAME() macro, if the
+   * compiler supports a GCC-style asm() directive. This results in somewhat
+   * more accurate values for CPU registers.
+   */
+  typedef struct Frame {
+    struct i386_regs uregs;
+    int              errno_;
+    pid_t            tid;
+  } Frame;
+  #define FRAME(f) Frame f;                                           \
+                   do {                                               \
+                     f.errno_ = errno;                                \
+                     f.tid    = sys_gettid();                         \
+                     __asm__ volatile (                               \
+                       "push %%ebp\n"                                 \
+                       "push %%ebx\n"                                 \
+                       "mov  %%ebx,0(%%eax)\n"                        \
+                       "mov  %%ecx,4(%%eax)\n"                        \
+                       "mov  %%edx,8(%%eax)\n"                        \
+                       "mov  %%esi,12(%%eax)\n"                       \
+                       "mov  %%edi,16(%%eax)\n"                       \
+                       "mov  %%ebp,20(%%eax)\n"                       \
+                       "mov  %%eax,24(%%eax)\n"                       \
+                       "mov  %%ds,%%ebx\n"                            \
+                       "mov  %%ebx,28(%%eax)\n"                       \
+                       "mov  %%es,%%ebx\n"                            \
+                       "mov  %%ebx,32(%%eax)\n"                       \
+                       "mov  %%fs,%%ebx\n"                            \
+                       "mov  %%ebx,36(%%eax)\n"                       \
+                       "mov  %%gs,%%ebx\n"                            \
+                       "mov  %%ebx, 40(%%eax)\n"                      \
+                       "call 0f\n"                                    \
+                     "0:pop %%ebx\n"                                  \
+                       "add  $1f-0b,%%ebx\n"                          \
+                       "mov  %%ebx,48(%%eax)\n"                       \
+                       "mov  %%cs,%%ebx\n"                            \
+                       "mov  %%ebx,52(%%eax)\n"                       \
+                       "pushf\n"                                      \
+                       "pop  %%ebx\n"                                 \
+                       "mov  %%ebx,56(%%eax)\n"                       \
+                       "mov  %%esp,%%ebx\n"                           \
+                       "add  $8,%%ebx\n"                              \
+                       "mov  %%ebx,60(%%eax)\n"                       \
+                       "mov  %%ss,%%ebx\n"                            \
+                       "mov  %%ebx,64(%%eax)\n"                       \
+                       "pop  %%ebx\n"                                 \
+                       "pop  %%ebp\n"                                 \
+                     "1:"                                             \
+                       : : "a" (&f) : "memory");                      \
+                     } while (0)
+  #define SET_FRAME(f,r)                                              \
+                     do {                                             \
+                       errno = (f).errno_;                            \
+                       (r)   = (f).uregs;                             \
+                     } while (0)
+#elif defined(__x86_64__) && defined(__GNUC__)
+  /* The FRAME and SET_FRAME macros for x86_64.  */
+  typedef struct Frame {
+    struct i386_regs uregs;
+    int              errno_;
+    pid_t            tid;
+  } Frame;
+  #define FRAME(f) Frame f;                                           \
+                   do {                                               \
+                     f.errno_ = errno;                                \
+                     f.tid    = sys_gettid();                         \
+                     __asm__ volatile (                               \
+                       "push %%rbp\n"                                 \
+                       "push %%rbx\n"                                 \
+                       "mov  %%r15,0(%%rax)\n"                        \
+                       "mov  %%r14,8(%%rax)\n"                        \
+                       "mov  %%r13,16(%%rax)\n"                       \
+                       "mov  %%r12,24(%%rax)\n"                       \
+                       "mov  %%rbp,32(%%rax)\n"                       \
+                       "mov  %%rbx,40(%%rax)\n"                       \
+                       "mov  %%r11,48(%%rax)\n"                       \
+                       "mov  %%r10,56(%%rax)\n"                       \
+                       "mov  %%r9,64(%%rax)\n"                        \
+                       "mov  %%r8,72(%%rax)\n"                        \
+                       "mov  %%rax,80(%%rax)\n"                       \
+                       "mov  %%rcx,88(%%rax)\n"                       \
+                       "mov  %%rdx,96(%%rax)\n"                       \
+                       "mov  %%rsi,104(%%rax)\n"                      \
+                       "mov  %%rdi,112(%%rax)\n"                      \
+                       "mov  %%ds,%%rbx\n"                            \
+                       "mov  %%rbx,184(%%rax)\n"                      \
+                       "mov  %%es,%%rbx\n"                            \
+                       "mov  %%rbx,192(%%rax)\n"                      \
+                       "mov  %%fs,%%rbx\n"                            \
+                       "mov  %%rbx,200(%%rax)\n"                      \
+                       "mov  %%gs,%%rbx\n"                            \
+                       "mov  %%rbx,208(%%rax)\n"                      \
+                       "call 0f\n"                                    \
+                     "0:pop %%rbx\n"                                  \
+                       "add  $1f-0b,%%rbx\n"                          \
+                       "mov  %%rbx,128(%%rax)\n"                      \
+                       "mov  %%cs,%%rbx\n"                            \
+                       "mov  %%rbx,136(%%rax)\n"                      \
+                       "pushf\n"                                      \
+                       "pop  %%rbx\n"                                 \
+                       "mov  %%rbx,144(%%rax)\n"                      \
+                       "mov  %%rsp,%%rbx\n"                           \
+                       "add  $16,%%rbx\n"                             \
+                       "mov  %%rbx,152(%%rax)\n"                      \
+                       "mov  %%ss,%%rbx\n"                            \
+                       "mov  %%rbx,160(%%rax)\n"                      \
+                       "pop  %%rbx\n"                                 \
+                       "pop  %%rbp\n"                                 \
+                     "1:"                                             \
+                       : : "a" (&f) : "memory");                      \
+                     } while (0)
+  #define SET_FRAME(f,r)                                              \
+                     do {                                             \
+                       errno = (f).errno_;                            \
+                       (f).uregs.fs_base = (r).fs_base;               \
+                       (f).uregs.gs_base = (r).gs_base;               \
+                       (r)   = (f).uregs;                             \
+                     } while (0)
+#elif defined(__ARM_ARCH_3__) && defined(__GNUC__)
+  /* ARM calling conventions are a little more tricky. A little assembly
+   * helps in obtaining an accurate snapshot of all registers.
+   */
+  typedef struct Frame {
+    struct arm_regs arm;
+    int             errno_;
+    pid_t           tid;
+  } Frame;
+  #define FRAME(f) Frame f;                                           \
+                   do {                                               \
+                     long cpsr;                                       \
+                     f.errno_ = errno;                                \
+                     f.tid    = sys_gettid();                         \
+                     __asm__ volatile(                                \
+                       "stmia %0, {r0-r15}\n" /* All integer regs   */\
+                       : : "r"(&f.arm) : "memory");                   \
+                     f.arm.uregs[16] = 0;                             \
+                     __asm__ volatile(                                \
+                       "mrs %0, cpsr\n"       /* Condition code reg */\
+                       : "=r"(cpsr));                                 \
+                     f.arm.uregs[17] = cpsr;                          \
+                   } while (0)
+  #define SET_FRAME(f,r)                                              \
+                     do {                                             \
+                       /* Don't override the FPU status register.   */\
+                       /* Use the value obtained from ptrace(). This*/\
+                       /* works, because our code does not perform  */\
+                       /* any FPU operations, itself.               */\
+                       long fps      = (f).arm.uregs[16];             \
+                       errno         = (f).errno_;                    \
+                       (r)           = (f).arm;                       \
+                       (r).uregs[16] = fps;                           \
+                     } while (0)
+#elif defined(__mips__) && defined(__GNUC__)
+  typedef struct Frame {
+    struct mips_regs mips_regs;
+    int              errno_;
+    pid_t            tid;
+  } Frame;
+  #define MIPSREG(n) ({ register unsigned long r __asm__("$"#n); r; })
+  #define FRAME(f) Frame f = { 0 };                                   \
+                   do {                                               \
+                     unsigned long hi, lo;                            \
+                     register unsigned long pc __asm__("$31");        \
+                     f.mips_regs.uregs[ 0] = MIPSREG( 0);             \
+                     f.mips_regs.uregs[ 1] = MIPSREG( 1);             \
+                     f.mips_regs.uregs[ 2] = MIPSREG( 2);             \
+                     f.mips_regs.uregs[ 3] = MIPSREG( 3);             \
+                     f.mips_regs.uregs[ 4] = MIPSREG( 4);             \
+                     f.mips_regs.uregs[ 5] = MIPSREG( 5);             \
+                     f.mips_regs.uregs[ 6] = MIPSREG( 6);             \
+                     f.mips_regs.uregs[ 7] = MIPSREG( 7);             \
+                     f.mips_regs.uregs[ 8] = MIPSREG( 8);             \
+                     f.mips_regs.uregs[ 9] = MIPSREG( 9);             \
+                     f.mips_regs.uregs[10] = MIPSREG(10);             \
+                     f.mips_regs.uregs[11] = MIPSREG(11);             \
+                     f.mips_regs.uregs[12] = MIPSREG(12);             \
+                     f.mips_regs.uregs[13] = MIPSREG(13);             \
+                     f.mips_regs.uregs[14] = MIPSREG(14);             \
+                     f.mips_regs.uregs[15] = MIPSREG(15);             \
+                     f.mips_regs.uregs[16] = MIPSREG(16);             \
+                     f.mips_regs.uregs[17] = MIPSREG(17);             \
+                     f.mips_regs.uregs[18] = MIPSREG(18);             \
+                     f.mips_regs.uregs[19] = MIPSREG(19);             \
+                     f.mips_regs.uregs[20] = MIPSREG(20);             \
+                     f.mips_regs.uregs[21] = MIPSREG(21);             \
+                     f.mips_regs.uregs[22] = MIPSREG(22);             \
+                     f.mips_regs.uregs[23] = MIPSREG(23);             \
+                     f.mips_regs.uregs[24] = MIPSREG(24);             \
+                     f.mips_regs.uregs[25] = MIPSREG(25);             \
+                     f.mips_regs.uregs[26] = MIPSREG(26);             \
+                     f.mips_regs.uregs[27] = MIPSREG(27);             \
+                     f.mips_regs.uregs[28] = MIPSREG(28);             \
+                     f.mips_regs.uregs[29] = MIPSREG(29);             \
+                     f.mips_regs.uregs[30] = MIPSREG(30);             \
+                     f.mips_regs.uregs[31] = MIPSREG(31);             \
+                     __asm__ volatile ("mfhi %0" : "=r"(hi));         \
+                     __asm__ volatile ("mflo %0" : "=r"(lo));         \
+                     __asm__ volatile ("jal 1f; 1:nop" : "=r"(pc));   \
+                     f.mips_regs.hi       = hi;                       \
+                     f.mips_regs.lo       = lo;                       \
+                     f.mips_regs.cp0_epc  = pc;                       \
+                     f.errno_             = errno;                    \
+                     f.tid                = sys_gettid();             \
+                   } while (0)
+  #define SET_FRAME(f,r)                                              \
+                   do {                                               \
+                     errno       = (f).errno_;                        \
+                     memcpy((r).uregs, (f).mips_regs.uregs,           \
+                            32*sizeof(unsigned long));                \
+                     (r).hi      = (f).mips_regs.hi;                  \
+                     (r).lo      = (f).mips_regs.lo;                  \
+                     (r).cp0_epc = (f).mips_regs.cp0_epc;             \
+                   } while (0)
+#else
+  /* If we do not have a hand-optimized assembly version of the FRAME()
+   * macro, we cannot reliably unroll the stack. So, we show a few additional
+   * stack frames for the coredumper.
+   */
+  typedef struct Frame {
+    pid_t tid;
+  } Frame;
+  #define FRAME(f) Frame f; do { f.tid = sys_gettid(); } while (0)
+  #define SET_FRAME(f,r) do { } while (0)
+#endif
+
+
+/* Internal function for generating a core file. This API can change without
+ * notice and is only supposed to be used internally by the core dumper.
+ *
+ * This function works for both single- and multi-threaded core
+ * dumps. If called as
+ *
+ *   FRAME(frame);
+ *   InternalGetCoreDump(&frame, 0, NULL, ap);
+ *
+ * it creates a core file that only contains information about the
+ * calling thread.
+ *
+ * Optionally, the caller can provide information about other threads
+ * by passing their process ids in "thread_pids". The process id of
+ * the caller should not be included in this array. All of the threads
+ * must have been attached to with ptrace(), prior to calling this
+ * function. They will be detached when "InternalGetCoreDump()" returns.
+ *
+ * This function either returns a file handle that can be read for obtaining
+ * a core dump, or "-1" in case of an error. In the latter case, "errno"
+ * will be set appropriately.
+ *
+ * While "InternalGetCoreDump()" is not technically async signal safe, you
+ * might be tempted to invoke it from a signal handler. The code goes to
+ * great lengths to make a best effort that this will actually work. But in
+ * any case, you must make sure that you preserve the value of "errno"
+ * yourself. It is guaranteed to be clobbered otherwise.
+ *
+ * Also, "InternalGetCoreDump" is not strictly speaking re-entrant. Again,
+ * it makes a best effort to behave reasonably when called in a multi-
+ * threaded environment, but it is ultimately the caller's responsibility
+ * to provide locking.
+ */
+int InternalGetCoreDump(void *frame, int num_threads, pid_t *thread_pids,
+                        va_list ap
+                     /* const struct CoreDumpParameters *params,
+                        const char *file_name,
+                        const char *PATH
+                      */);
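+
+/* Illustrative caller sketch (an assumption about typical use, not a public
+ * API; how the va_list "ap" is assembled is intentionally left out):
+ *
+ *   FRAME(frame);                          // must run in the calling thread
+ *   ...                                    // ptrace()-attach thread_pids
+ *   int fd = InternalGetCoreDump(&frame, num_threads, thread_pids, ap);
+ *   if (fd >= 0)
+ *     ...                                  // read(fd, ...) yields the core
+ */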
+
+#endif
+
+#ifdef __cplusplus
+}
+#endif
+#endif /* _ELFCORE_H */
diff --git a/src/base/googleinit.h b/src/base/googleinit.h
new file mode 100644
index 0000000..3ea411a
--- /dev/null
+++ b/src/base/googleinit.h
@@ -0,0 +1,74 @@
+// -*- Mode: C++; c-basic-offset: 2; indent-tabs-mode: nil -*-
+// Copyright (c) 2005, Google Inc.
+// All rights reserved.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+// 
+//     * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+//     * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+// 
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+// ---
+// Author: Jacob Hoffman-Andrews
+
+#ifndef _GOOGLEINIT_H
+#define _GOOGLEINIT_H
+
+#include "base/logging.h"
+
+class GoogleInitializer {
+ public:
+  typedef void (*VoidFunction)(void);
+  GoogleInitializer(const char* name, VoidFunction ctor, VoidFunction dtor)
+      : name_(name), destructor_(dtor) {
+    RAW_VLOG(10, "<GoogleModuleObject> constructing: %s\n", name_);
+    if (ctor)
+      ctor();
+  }
+  ~GoogleInitializer() {
+    RAW_VLOG(10, "<GoogleModuleObject> destroying: %s\n", name_);
+    if (destructor_)
+      destructor_();
+  }
+
+ private:
+  const char* const name_;
+  const VoidFunction destructor_;
+};
+
+#define REGISTER_MODULE_INITIALIZER(name, body)                 \
+  namespace {                                                   \
+    static void google_init_module_##name () { body; }          \
+    GoogleInitializer google_initializer_module_##name(#name,   \
+            google_init_module_##name, NULL);                   \
+  }
+
+#define REGISTER_MODULE_DESTRUCTOR(name, body)                  \
+  namespace {                                                   \
+    static void google_destruct_module_##name () { body; }      \
+    GoogleInitializer google_destructor_module_##name(#name,    \
+            NULL, google_destruct_module_##name);               \
+  }
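+
+// Illustrative usage (a sketch; "InitFoo"/"TearDownFoo" are hypothetical
+// helpers, not part of this header):
+//
+//   REGISTER_MODULE_INITIALIZER(foo, {
+//     InitFoo();          // runs when the GoogleInitializer is constructed
+//   });
+//   REGISTER_MODULE_DESTRUCTOR(foo, {
+//     TearDownFoo();      // runs when the GoogleInitializer is destroyed
+//   });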
+
+
+#endif /* _GOOGLEINIT_H */
diff --git a/src/base/linux_syscall_support.h b/src/base/linux_syscall_support.h
new file mode 100644
index 0000000..56b8fac
--- /dev/null
+++ b/src/base/linux_syscall_support.h
@@ -0,0 +1,2484 @@
+// -*- Mode: C++; c-basic-offset: 2; indent-tabs-mode: nil -*-
+/* Copyright (c) 2005-2008, Google Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are
+ * met:
+ *
+ *     * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following disclaimer
+ * in the documentation and/or other materials provided with the
+ * distribution.
+ *     * Neither the name of Google Inc. nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * ---
+ * Author: Markus Gutschke
+ */
+
+/* This file includes Linux-specific support functions common to the
+ * coredumper and the thread lister; primarily, this is a collection
+ * of direct system calls, and a couple of symbols missing from
+ * standard header files.
+ * There are a few options that the including file can set to control
+ * the behavior of this file:
+ *
+ * SYS_CPLUSPLUS:
+ *   The entire header file will normally be wrapped in 'extern "C" { }',
+ *   making it suitable for compilation as both C and C++ source. If you
+ *   do not want to do this, you can set the SYS_CPLUSPLUS macro to inhibit
+ *   the wrapping. N.B. doing so will suppress inclusion of all prerequisite
+ *   system header files, too. It is the caller's responsibility to provide
+ *   the necessary definitions.
+ *
+ * SYS_ERRNO:
+ *   All system calls will update "errno" unless overridden by setting the
+ *   SYS_ERRNO macro prior to including this file. SYS_ERRNO should be
+ *   an l-value.
+ *
+ * SYS_INLINE:
+ *   New symbols will be defined "static inline", unless overridden by
+ *   the SYS_INLINE macro.
+ *
+ * SYS_LINUX_SYSCALL_SUPPORT_H:
+ *   This macro is used to avoid multiple inclusions of this header file.
+ *   If you need to include this file more than once, make sure to
+ *   unset SYS_LINUX_SYSCALL_SUPPORT_H before each inclusion.
+ *
+ * SYS_PREFIX:
+ *   New system calls will have a prefix of "sys_" unless overridden by
+ *   the SYS_PREFIX macro. Valid values for this macro are [0..9] which
+ *   results in prefixes "sys[0..9]_". It is also possible to set this
+ *   macro to -1, which avoids all prefixes.
+ *
+ * This file defines a few internal symbols that all start with "LSS_".
+ * Do not access these symbols from outside this file. They are not part
+ * of the supported API.
+ *
+ * NOTE: This is a stripped-down version of the official open-source
+ * version of linux_syscall_support.h, which lives at
+ *    http://code.google.com/p/linux-syscall-support/
+ * It includes only the syscalls that are used in perftools, plus a
+ * few extra.  Here's the breakdown:
+ * 1) Perftools uses these: grep -rho 'sys_[a-z0-9_A-Z]* *(' src | sort -u
+ *      sys__exit(
+ *      sys_clone(
+ *      sys_close(
+ *      sys_fcntl(
+ *      sys_fstat(
+ *      sys_futex(
+ *      sys_getcpu(
+ *      sys_getdents64(
+ *      sys_getppid(
+ *      sys_gettid(
+ *      sys_lseek(
+ *      sys_mmap(
+ *      sys_mremap(
+ *      sys_munmap(
+ *      sys_open(
+ *      sys_pipe(
+ *      sys_prctl(
+ *      sys_ptrace(
+ *      sys_ptrace_detach(
+ *      sys_read(
+ *      sys_sched_yield(
+ *      sys_sigaction(
+ *      sys_sigaltstack(
+ *      sys_sigdelset(
+ *      sys_sigfillset(
+ *      sys_sigprocmask(
+ *      sys_socket(
+ *      sys_stat(
+ *      sys_waitpid(
+ * 2) These are used as subroutines of the above:
+ *      sys_getpid       -- gettid
+ *      sys_kill         -- ptrace_detach
+ *      sys_restore      -- sigaction
+ *      sys_restore_rt   -- sigaction
+ *      sys_socketcall   -- socket
+ *      sys_wait4        -- waitpid
+ * 3) I left these in even though they're not used.  They either
+ * complement the above (write vs read) or are variants (rt_sigaction):
+ *      sys_fstat64
+ *      sys_llseek
+ *      sys_mmap2
+ *      sys_openat
+ *      sys_getdents
+ *      sys_rt_sigaction
+ *      sys_rt_sigprocmask
+ *      sys_sigaddset
+ *      sys_sigemptyset
+ *      sys_stat64
+ *      sys_write
+ */
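+
+/* Illustrative only (a sketch of the knobs described above; "my_errno" is a
+ * hypothetical l-value, not something defined in this project):
+ *
+ *   #define SYS_ERRNO  my_errno
+ *   #define SYS_INLINE inline
+ *   #define SYS_PREFIX 1     // new calls are then named sys1_open(), ...
+ *   #include "base/linux_syscall_support.h"
+ */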
+#ifndef SYS_LINUX_SYSCALL_SUPPORT_H
+#define SYS_LINUX_SYSCALL_SUPPORT_H
+
+/* We currently only support x86-32, x86-64, ARM, MIPS, PPC/PPC64 and Aarch64 on Linux.
+ * Porting to other related platforms should not be difficult.
+ */
+#if (defined(__i386__) || defined(__x86_64__) || defined(__arm__) || \
+     defined(__mips__) || defined(__PPC__) || defined(__aarch64__)) && defined(__linux)
+
+#ifndef SYS_CPLUSPLUS
+#ifdef __cplusplus
+/* Some system header files in older versions of gcc neglect to properly
+ * handle being included from C++. As it appears to be harmless to have
+ * multiple nested 'extern "C"' blocks, just add another one here.
+ */
+extern "C" {
+#endif
+
+#include <errno.h>
+#include <signal.h>
+#include <stdarg.h>
+#include <stddef.h>
+#include <stdint.h>
+#include <string.h>
+#include <sys/ptrace.h>
+#include <sys/resource.h>
+#include <sys/time.h>
+#include <sys/types.h>
+#include <syscall.h>
+#include <unistd.h>
+#include <linux/unistd.h>
+#include <endian.h>
+
+#ifdef __mips__
+/* Include definitions of the ABI currently in use.                          */
+#include <sgidefs.h>
+#endif
+
+#endif
+
+/* As glibc often provides subtly incompatible data structures (and implicit
+ * wrapper functions that convert them), we provide our own kernel data
+ * structures for use by the system calls.
+ * These structures have been developed by using Linux 2.6.23 headers for
+ * reference. Note though, we do not care about exact API compatibility
+ * with the kernel, and in fact the kernel often does not have a single
+ * API that works across architectures. Instead, we try to mimic the glibc
+ * API where reasonable, and only guarantee ABI compatibility with the
+ * kernel headers.
+ * Most notably, here are a few changes that were made to the structures
+ * defined by kernel headers:
+ *
+ * - we only define structures, but not symbolic names for kernel data
+ *   types. For the latter, we directly use the native C datatype
+ *   (i.e. "unsigned" instead of "mode_t").
+ * - in a few cases, it is possible to define identical structures for
+ *   both 32bit (e.g. i386) and 64bit (e.g. x86-64) platforms by
+ *   standardizing on the 64bit version of the data types. In particular,
+ *   this means that we use "unsigned" where the 32bit headers say
+ *   "unsigned long".
+ * - overall, we try to minimize the number of cases where we need to
+ *   conditionally define different structures.
+ * - the "struct kernel_sigaction" class of structures have been
+ *   modified to more closely mimic glibc's API by introducing an
+ *   anonymous union for the function pointer.
+ * - a small number of field names had to have an underscore appended to
+ *   them, because glibc defines a global macro by the same name.
+ */
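+
+/* For example (referring to the definitions below): "struct kernel_sigaction"
+ * exposes sa_handler_/sa_sigaction_ through an anonymous union, and the
+ * trailing underscores keep the fields from colliding with glibc macros of
+ * the same name.
+ */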
+
+/* include/linux/dirent.h                                                    */
+struct kernel_dirent64 {
+  unsigned long long d_ino;
+  long long          d_off;
+  unsigned short     d_reclen;
+  unsigned char      d_type;
+  char               d_name[256];
+};
+
+/* include/linux/dirent.h                                                    */
+struct kernel_dirent {
+  long               d_ino;
+  long               d_off;
+  unsigned short     d_reclen;
+  char               d_name[256];
+};
+
+/* include/linux/time.h                                                      */
+struct kernel_timespec {
+  long               tv_sec;
+  long               tv_nsec;
+};
+
+/* include/linux/time.h                                                      */
+struct kernel_timeval {
+  long               tv_sec;
+  long               tv_usec;
+};
+
+/* include/linux/resource.h                                                  */
+struct kernel_rusage {
+  struct kernel_timeval ru_utime;
+  struct kernel_timeval ru_stime;
+  long               ru_maxrss;
+  long               ru_ixrss;
+  long               ru_idrss;
+  long               ru_isrss;
+  long               ru_minflt;
+  long               ru_majflt;
+  long               ru_nswap;
+  long               ru_inblock;
+  long               ru_oublock;
+  long               ru_msgsnd;
+  long               ru_msgrcv;
+  long               ru_nsignals;
+  long               ru_nvcsw;
+  long               ru_nivcsw;
+};
+
+#if defined(__i386__) || defined(__arm__) || defined(__PPC__)
+
+/* include/asm-{arm,i386,mips,ppc}/signal.h                                  */
+struct kernel_old_sigaction {
+  union {
+    void             (*sa_handler_)(int);
+    void             (*sa_sigaction_)(int, siginfo_t *, void *);
+  };
+  unsigned long      sa_mask;
+  unsigned long      sa_flags;
+  void               (*sa_restorer)(void);
+} __attribute__((packed,aligned(4)));
+#elif (defined(__mips__) && _MIPS_SIM == _MIPS_SIM_ABI32)
+  #define kernel_old_sigaction kernel_sigaction
+#endif
+
+/* Some kernel functions (e.g. sigaction() in 2.6.23) require that the
+ * caller-supplied signal-set size exactly match the size of the kernel's
+ * signal set, even though the API was
+ * intended to be extensible. We define our own KERNEL_NSIG to deal with
+ * this.
+ * Please note that glibc provides signals [1.._NSIG-1], whereas the
+ * kernel (and this header) provides the range [1..KERNEL_NSIG]. The
+ * actual number of signals is obviously the same, but the constants
+ * differ by one.
+ */
+#ifdef __mips__
+#define KERNEL_NSIG 128
+#else
+#define KERNEL_NSIG  64
+#endif
+
+/* include/asm-{arm,i386,mips,x86_64}/signal.h                               */
+struct kernel_sigset_t {
+  unsigned long sig[(KERNEL_NSIG + 8*sizeof(unsigned long) - 1)/
+                    (8*sizeof(unsigned long))];
+};
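+
+/* For illustration: with KERNEL_NSIG == 64 and a 64-bit "unsigned long",
+ * (64 + 63) / 64 == 1 array element; with a 32-bit "unsigned long" it is
+ * (64 + 31) / 32 == 2.  On MIPS (KERNEL_NSIG == 128) the same formula
+ * yields 2 and 4 elements, respectively.
+ */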
+
+/* include/asm-{arm,generic,i386,mips,x86_64,ppc}/signal.h                   */
+struct kernel_sigaction {
+#ifdef __mips__
+  unsigned long      sa_flags;
+  union {
+    void             (*sa_handler_)(int);
+    void             (*sa_sigaction_)(int, siginfo_t *, void *);
+  };
+  struct kernel_sigset_t sa_mask;
+#else
+  union {
+    void             (*sa_handler_)(int);
+    void             (*sa_sigaction_)(int, siginfo_t *, void *);
+  };
+  unsigned long      sa_flags;
+  void               (*sa_restorer)(void);
+  struct kernel_sigset_t sa_mask;
+#endif
+};
+
+/* include/asm-{arm,i386,mips,ppc}/stat.h                                    */
+#ifdef __mips__
+#if _MIPS_SIM == _MIPS_SIM_ABI64
+struct kernel_stat {
+#else
+struct kernel_stat64 {
+#endif
+  unsigned           st_dev;
+  unsigned           __pad0[3];
+  unsigned long long st_ino;
+  unsigned           st_mode;
+  unsigned           st_nlink;
+  unsigned           st_uid;
+  unsigned           st_gid;
+  unsigned           st_rdev;
+  unsigned           __pad1[3];
+  long long          st_size;
+  unsigned           st_atime_;
+  unsigned           st_atime_nsec_;
+  unsigned           st_mtime_;
+  unsigned           st_mtime_nsec_;
+  unsigned           st_ctime_;
+  unsigned           st_ctime_nsec_;
+  unsigned           st_blksize;
+  unsigned           __pad2;
+  unsigned long long st_blocks;
+};
+#elif defined __PPC__
+struct kernel_stat64 {
+  unsigned long long st_dev;
+  unsigned long long st_ino;
+  unsigned           st_nlink;
+  unsigned           st_mode;
+  unsigned           st_uid;
+  unsigned           st_gid;
+  int                __pad2;
+  unsigned long long st_rdev;
+  long long          st_size;
+  long long          st_blksize;
+  long long          st_blocks;
+  struct kernel_timespec st_atim;
+  struct kernel_timespec st_mtim;
+  struct kernel_timespec st_ctim;
+  unsigned long      __unused4;
+  unsigned long      __unused5;
+  unsigned long      __unused6;
+};
+#else
+struct kernel_stat64 {
+  unsigned long long st_dev;
+  unsigned char      __pad0[4];
+  unsigned           __st_ino;
+  unsigned           st_mode;
+  unsigned           st_nlink;
+  unsigned           st_uid;
+  unsigned           st_gid;
+  unsigned long long st_rdev;
+  unsigned char      __pad3[4];
+  long long          st_size;
+  unsigned           st_blksize;
+  unsigned long long st_blocks;
+  unsigned           st_atime_;
+  unsigned           st_atime_nsec_;
+  unsigned           st_mtime_;
+  unsigned           st_mtime_nsec_;
+  unsigned           st_ctime_;
+  unsigned           st_ctime_nsec_;
+  unsigned long long st_ino;
+};
+#endif
+
+/* include/asm-{arm,generic,i386,mips,x86_64,ppc}/stat.h                     */
+#if defined(__i386__) || defined(__arm__)
+struct kernel_stat {
+  /* The kernel headers suggest that st_dev and st_rdev should be 32bit
+   * quantities encoding 12bit major and 20bit minor numbers in an interleaved
+   * format. In reality, we do not see useful data in the top bits. So,
+   * we'll leave the padding in here, until we find a better solution.
+   */
+  unsigned short     st_dev;
+  short              pad1;
+  unsigned           st_ino;
+  unsigned short     st_mode;
+  unsigned short     st_nlink;
+  unsigned short     st_uid;
+  unsigned short     st_gid;
+  unsigned short     st_rdev;
+  short              pad2;
+  unsigned           st_size;
+  unsigned           st_blksize;
+  unsigned           st_blocks;
+  unsigned           st_atime_;
+  unsigned           st_atime_nsec_;
+  unsigned           st_mtime_;
+  unsigned           st_mtime_nsec_;
+  unsigned           st_ctime_;
+  unsigned           st_ctime_nsec_;
+  unsigned           __unused4;
+  unsigned           __unused5;
+};
+#elif defined(__x86_64__)
+struct kernel_stat {
+  uint64_t           st_dev;
+  uint64_t           st_ino;
+  uint64_t           st_nlink;
+  unsigned           st_mode;
+  unsigned           st_uid;
+  unsigned           st_gid;
+  unsigned           __pad0;
+  uint64_t           st_rdev;
+  int64_t            st_size;
+  int64_t            st_blksize;
+  int64_t            st_blocks;
+  uint64_t           st_atime_;
+  uint64_t           st_atime_nsec_;
+  uint64_t           st_mtime_;
+  uint64_t           st_mtime_nsec_;
+  uint64_t           st_ctime_;
+  uint64_t           st_ctime_nsec_;
+  int64_t            __unused[3];
+};
+#elif defined(__PPC__)
+struct kernel_stat {
+  unsigned long long st_dev;
+  unsigned long      st_ino;
+  unsigned long      st_nlink;
+  unsigned long      st_mode;
+  unsigned           st_uid;
+  unsigned           st_gid;
+  int                __pad2;
+  unsigned long long st_rdev;
+  long               st_size;
+  unsigned long      st_blksize;
+  unsigned long      st_blocks;
+  struct kernel_timespec st_atim;
+  struct kernel_timespec st_mtim;
+  struct kernel_timespec st_ctim;
+  unsigned long      __unused4;
+  unsigned long      __unused5;
+  unsigned long      __unused6;
+};
+#elif (defined(__mips__) && _MIPS_SIM != _MIPS_SIM_ABI64)
+struct kernel_stat {
+  unsigned           st_dev;
+  int                st_pad1[3];
+  unsigned           st_ino;
+  unsigned           st_mode;
+  unsigned           st_nlink;
+  unsigned           st_uid;
+  unsigned           st_gid;
+  unsigned           st_rdev;
+  int                st_pad2[2];
+  long               st_size;
+  int                st_pad3;
+  long               st_atime_;
+  long               st_atime_nsec_;
+  long               st_mtime_;
+  long               st_mtime_nsec_;
+  long               st_ctime_;
+  long               st_ctime_nsec_;
+  int                st_blksize;
+  int                st_blocks;
+  int                st_pad4[14];
+};
+#elif defined(__aarch64__)
+struct kernel_stat {
+  unsigned long      st_dev;
+  unsigned long      st_ino;
+  unsigned int       st_mode;
+  unsigned int       st_nlink;
+  unsigned int       st_uid;
+  unsigned int       st_gid;
+  unsigned long      st_rdev;
+  unsigned long      __pad1;
+  long               st_size;
+  int                st_blksize;
+  int                __pad2;
+  long               st_blocks;
+  long               st_atime_;
+  unsigned long      st_atime_nsec_;
+  long               st_mtime_;
+  unsigned long      st_mtime_nsec_;
+  long               st_ctime_;
+  unsigned long      st_ctime_nsec_;
+  unsigned int       __unused4;
+  unsigned int       __unused5;
+};
+#endif
+
+
+/* Definitions missing from the standard header files                        */
+#ifndef O_DIRECTORY
+#if defined(__arm__)
+#define O_DIRECTORY             0040000
+#else
+#define O_DIRECTORY             0200000
+#endif
+#endif
+#ifndef PR_GET_DUMPABLE
+#define PR_GET_DUMPABLE         3
+#endif
+#ifndef PR_SET_DUMPABLE
+#define PR_SET_DUMPABLE         4
+#endif
+#ifndef AT_FDCWD
+#define AT_FDCWD                (-100)
+#endif
+#ifndef AT_SYMLINK_NOFOLLOW
+#define AT_SYMLINK_NOFOLLOW     0x100
+#endif
+#ifndef AT_REMOVEDIR
+#define AT_REMOVEDIR            0x200
+#endif
+#ifndef MREMAP_FIXED
+#define MREMAP_FIXED            2
+#endif
+#ifndef SA_RESTORER
+#define SA_RESTORER             0x04000000
+#endif
+
+#if defined(__i386__)
+#ifndef __NR_rt_sigaction
+#define __NR_rt_sigaction       174
+#define __NR_rt_sigprocmask     175
+#endif
+#ifndef __NR_stat64
+#define __NR_stat64             195
+#endif
+#ifndef __NR_fstat64
+#define __NR_fstat64            197
+#endif
+#ifndef __NR_getdents64
+#define __NR_getdents64         220
+#endif
+#ifndef __NR_gettid
+#define __NR_gettid             224
+#endif
+#ifndef __NR_futex
+#define __NR_futex              240
+#endif
+#ifndef __NR_openat
+#define __NR_openat             295
+#endif
+#ifndef __NR_getcpu
+#define __NR_getcpu             318
+#endif
+/* End of i386 definitions                                                   */
+#elif defined(__arm__)
+#ifndef __syscall
+#if defined(__thumb__) || defined(__ARM_EABI__)
+#define __SYS_REG(name) register long __sysreg __asm__("r6") = __NR_##name;
+#define __SYS_REG_LIST(regs...) [sysreg] "r" (__sysreg) , ##regs
+#define __syscall(name) "swi\t0"
+#define __syscall_safe(name)                     \
+  "push  {r7}\n"                                 \
+  "mov   r7,%[sysreg]\n"                         \
+  __syscall(name)"\n"                            \
+  "pop   {r7}"
+#else
+#define __SYS_REG(name)
+#define __SYS_REG_LIST(regs...) regs
+#define __syscall(name) "swi\t" __sys1(__NR_##name) ""
+#define __syscall_safe(name) __syscall(name)
+#endif
+#endif
+#ifndef __NR_rt_sigaction
+#define __NR_rt_sigaction       (__NR_SYSCALL_BASE + 174)
+#define __NR_rt_sigprocmask     (__NR_SYSCALL_BASE + 175)
+#endif
+#ifndef __NR_stat64
+#define __NR_stat64             (__NR_SYSCALL_BASE + 195)
+#endif
+#ifndef __NR_fstat64
+#define __NR_fstat64            (__NR_SYSCALL_BASE + 197)
+#endif
+#ifndef __NR_getdents64
+#define __NR_getdents64         (__NR_SYSCALL_BASE + 217)
+#endif
+#ifndef __NR_gettid
+#define __NR_gettid             (__NR_SYSCALL_BASE + 224)
+#endif
+#ifndef __NR_futex
+#define __NR_futex              (__NR_SYSCALL_BASE + 240)
+#endif
+/* End of ARM definitions                                                  */
+#elif defined(__x86_64__)
+#ifndef __NR_gettid
+#define __NR_gettid             186
+#endif
+#ifndef __NR_futex
+#define __NR_futex              202
+#endif
+#ifndef __NR_getdents64
+#define __NR_getdents64         217
+#endif
+#ifndef __NR_openat
+#define __NR_openat             257
+#endif
+/* End of x86-64 definitions                                                 */
+#elif defined(__mips__)
+#if _MIPS_SIM == _MIPS_SIM_ABI32
+#ifndef __NR_rt_sigaction
+#define __NR_rt_sigaction       (__NR_Linux + 194)
+#define __NR_rt_sigprocmask     (__NR_Linux + 195)
+#endif
+#ifndef __NR_stat64
+#define __NR_stat64             (__NR_Linux + 213)
+#endif
+#ifndef __NR_fstat64
+#define __NR_fstat64            (__NR_Linux + 215)
+#endif
+#ifndef __NR_getdents64
+#define __NR_getdents64         (__NR_Linux + 219)
+#endif
+#ifndef __NR_gettid
+#define __NR_gettid             (__NR_Linux + 222)
+#endif
+#ifndef __NR_futex
+#define __NR_futex              (__NR_Linux + 238)
+#endif
+#ifndef __NR_openat
+#define __NR_openat             (__NR_Linux + 288)
+#endif
+#ifndef __NR_fstatat
+#define __NR_fstatat            (__NR_Linux + 293)
+#endif
+#ifndef __NR_getcpu
+#define __NR_getcpu             (__NR_Linux + 312)
+#endif
+/* End of MIPS (old 32bit API) definitions */
+#elif  _MIPS_SIM == _MIPS_SIM_ABI64
+#ifndef __NR_gettid
+#define __NR_gettid             (__NR_Linux + 178)
+#endif
+#ifndef __NR_futex
+#define __NR_futex              (__NR_Linux + 194)
+#endif
+#ifndef __NR_openat
+#define __NR_openat             (__NR_Linux + 247)
+#endif
+#ifndef __NR_fstatat
+#define __NR_fstatat            (__NR_Linux + 252)
+#endif
+#ifndef __NR_getcpu
+#define __NR_getcpu             (__NR_Linux + 271)
+#endif
+/* End of MIPS (64bit API) definitions */
+#else
+#ifndef __NR_gettid
+#define __NR_gettid             (__NR_Linux + 178)
+#endif
+#ifndef __NR_futex
+#define __NR_futex              (__NR_Linux + 194)
+#endif
+#ifndef __NR_openat
+#define __NR_openat             (__NR_Linux + 251)
+#endif
+#ifndef __NR_fstatat
+#define __NR_fstatat            (__NR_Linux + 256)
+#endif
+#ifndef __NR_getcpu
+#define __NR_getcpu             (__NR_Linux + 275)
+#endif
+/* End of MIPS (new 32bit API) definitions                                   */
+#endif
+/* End of MIPS definitions                                                   */
+#elif defined(__PPC__)
+#ifndef __NR_rt_sigaction
+#define __NR_rt_sigaction       173
+#define __NR_rt_sigprocmask     174
+#endif
+#ifndef __NR_stat64
+#define __NR_stat64             195
+#endif
+#ifndef __NR_fstat64
+#define __NR_fstat64            197
+#endif
+#ifndef __NR_socket
+#define __NR_socket             198
+#endif
+#ifndef __NR_getdents64
+#define __NR_getdents64         202
+#endif
+#ifndef __NR_gettid
+#define __NR_gettid             207
+#endif
+#ifndef __NR_futex
+#define __NR_futex              221
+#endif
+#ifndef __NR_openat
+#define __NR_openat             286
+#endif
+#ifndef __NR_getcpu
+#define __NR_getcpu             302
+#endif
+/* End of powerpc definitions                                                 */
+#elif defined(__aarch64__)
+#ifndef __NR_fstatat
+#define __NR_fstatat             79
+#endif
+/* End of aarch64 definitions                                                 */
+#endif
+
+
+/* After forking, we must make sure to only call system calls.               */
+#if __BOUNDED_POINTERS__
+  #error "Need to port invocations of syscalls for bounded ptrs"
+#else
+  /* The core dumper and the thread lister get executed after threads
+   * have been suspended. As a consequence, we cannot call any functions
+   * that acquire locks. Unfortunately, libc wraps most system calls
+   * (e.g. in order to implement pthread_atfork, and to make calls
+   * cancellable), which means we cannot call these functions. Instead,
+   * we have to call syscall() directly.
+   */
+  #undef LSS_ERRNO
+  #ifdef SYS_ERRNO
+    /* Allow the including file to override the location of errno. This can
+     * be useful when using clone() with the CLONE_VM option.
+     */
+    #define LSS_ERRNO SYS_ERRNO
+  #else
+    #define LSS_ERRNO errno
+  #endif
+
+  #undef LSS_INLINE
+  #ifdef SYS_INLINE
+    #define LSS_INLINE SYS_INLINE
+  #else
+    #define LSS_INLINE static inline
+  #endif
+
+  /* Allow the including file to override the prefix used for all new
+   * system calls. By default, it will be set to "sys_".
+   */
+  #undef LSS_NAME
+  #ifndef SYS_PREFIX
+    #define LSS_NAME(name) sys_##name
+  #elif SYS_PREFIX < 0
+    #define LSS_NAME(name) name
+  #elif SYS_PREFIX == 0
+    #define LSS_NAME(name) sys0_##name
+  #elif SYS_PREFIX == 1
+    #define LSS_NAME(name) sys1_##name
+  #elif SYS_PREFIX == 2
+    #define LSS_NAME(name) sys2_##name
+  #elif SYS_PREFIX == 3
+    #define LSS_NAME(name) sys3_##name
+  #elif SYS_PREFIX == 4
+    #define LSS_NAME(name) sys4_##name
+  #elif SYS_PREFIX == 5
+    #define LSS_NAME(name) sys5_##name
+  #elif SYS_PREFIX == 6
+    #define LSS_NAME(name) sys6_##name
+  #elif SYS_PREFIX == 7
+    #define LSS_NAME(name) sys7_##name
+  #elif SYS_PREFIX == 8
+    #define LSS_NAME(name) sys8_##name
+  #elif SYS_PREFIX == 9
+    #define LSS_NAME(name) sys9_##name
+  #endif
+
+  #undef  LSS_RETURN
+  #if (defined(__i386__) || defined(__x86_64__) || defined(__arm__) ||        \
+       defined(__aarch64__))
+  /* Failing system calls return a negative result in the range of
+   * -1..-4095. These are "errno" values with the sign inverted.
+   */
+  #define LSS_RETURN(type, res)                                               \
+    do {                                                                      \
+      if ((unsigned long)(res) >= (unsigned long)(-4095)) {                   \
+        LSS_ERRNO = -(res);                                                   \
+        res = -1;                                                             \
+      }                                                                       \
+      return (type) (res);                                                    \
+    } while (0)
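+  /* Worked example (illustrative): if the kernel returns -2 from open(),
+   * (unsigned long)(-2) >= (unsigned long)(-4095), so LSS_ERRNO is set to 2
+   * (ENOENT) and -1 is returned.  A successful mmap() result is an address
+   * below (unsigned long)(-4095) and is passed through unchanged.
+   */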
+  #elif defined(__mips__)
+  /* On MIPS, failing system calls return -1, and set errno in a
+   * separate CPU register.
+   */
+  #define LSS_RETURN(type, res, err)                                          \
+    do {                                                                      \
+      if (err) {                                                              \
+        LSS_ERRNO = (res);                                                    \
+        res = -1;                                                             \
+      }                                                                       \
+      return (type) (res);                                                    \
+    } while (0)
+  #elif defined(__PPC__)
+  /* On PPC, failing system calls return -1, and set errno in a
+   * separate CPU register. See linux/unistd.h.
+   */
+  #define LSS_RETURN(type, res, err)                                          \
+   do {                                                                       \
+     if (err & 0x10000000 ) {                                                 \
+       LSS_ERRNO = (res);                                                     \
+       res = -1;                                                              \
+     }                                                                        \
+     return (type) (res);                                                     \
+   } while (0)
+  #endif
+  #if defined(__i386__)
+    #if defined(NO_FRAME_POINTER) && (100 * __GNUC__ + __GNUC_MINOR__ >= 404)
+      /* This only works for GCC-4.4 and above -- the first version to use
+         .cfi directives for dwarf unwind info.  */
+      #define CFI_ADJUST_CFA_OFFSET(adjust)                                   \
+                  ".cfi_adjust_cfa_offset " #adjust "\n"
+    #else
+      #define CFI_ADJUST_CFA_OFFSET(adjust) /**/
+    #endif
+
+    /* In PIC mode (e.g. when building shared libraries), gcc for i386
+     * reserves ebx. Unfortunately, most distributions ship with
+     * implementations of _syscallX() which clobber ebx.
+     * Also, most definitions of _syscallX() neglect to mark "memory" as being
+     * clobbered. This causes problems with compilers that do a better job
+     * of optimizing across __asm__ calls.
+     * So, we just have to redefine all of the _syscallX() macros.
+     */
+    #undef  LSS_BODY
+    #define LSS_BODY(type,args...)                                            \
+      long __res;                                                             \
+      __asm__ __volatile__("push %%ebx\n"                                     \
+                           CFI_ADJUST_CFA_OFFSET(4)                           \
+                           "movl %2,%%ebx\n"                                  \
+                           "int $0x80\n"                                      \
+                           "pop %%ebx\n"                                      \
+                           CFI_ADJUST_CFA_OFFSET(-4)                          \
+                           args                                               \
+                           : "esp", "memory");                                \
+      LSS_RETURN(type,__res)
+    #undef  _syscall0
+    #define _syscall0(type,name)                                              \
+      type LSS_NAME(name)(void) {                                             \
+        long __res;                                                           \
+        __asm__ volatile("int $0x80"                                          \
+                         : "=a" (__res)                                       \
+                         : "0" (__NR_##name)                                  \
+                         : "memory");                                         \
+        LSS_RETURN(type,__res);                                               \
+      }
+    #undef  _syscall1
+    #define _syscall1(type,name,type1,arg1)                                   \
+      type LSS_NAME(name)(type1 arg1) {                                       \
+        LSS_BODY(type,                                                        \
+             : "=a" (__res)                                                   \
+             : "0" (__NR_##name), "ri" ((long)(arg1)));                       \
+      }
+    #undef  _syscall2
+    #define _syscall2(type,name,type1,arg1,type2,arg2)                        \
+      type LSS_NAME(name)(type1 arg1,type2 arg2) {                            \
+        LSS_BODY(type,                                                        \
+             : "=a" (__res)                                                   \
+             : "0" (__NR_##name),"ri" ((long)(arg1)), "c" ((long)(arg2)));    \
+      }
+    #undef  _syscall3
+    #define _syscall3(type,name,type1,arg1,type2,arg2,type3,arg3)             \
+      type LSS_NAME(name)(type1 arg1,type2 arg2,type3 arg3) {                 \
+        LSS_BODY(type,                                                        \
+             : "=a" (__res)                                                   \
+             : "0" (__NR_##name), "ri" ((long)(arg1)), "c" ((long)(arg2)),    \
+               "d" ((long)(arg3)));                                           \
+      }
+    #undef  _syscall4
+    #define _syscall4(type,name,type1,arg1,type2,arg2,type3,arg3,type4,arg4)  \
+      type LSS_NAME(name)(type1 arg1, type2 arg2, type3 arg3, type4 arg4) {   \
+        LSS_BODY(type,                                                        \
+             : "=a" (__res)                                                   \
+             : "0" (__NR_##name), "ri" ((long)(arg1)), "c" ((long)(arg2)),    \
+               "d" ((long)(arg3)),"S" ((long)(arg4)));                        \
+      }
+    #undef  _syscall5
+    #define _syscall5(type,name,type1,arg1,type2,arg2,type3,arg3,type4,arg4,  \
+                      type5,arg5)                                             \
+      type LSS_NAME(name)(type1 arg1, type2 arg2, type3 arg3, type4 arg4,     \
+                          type5 arg5) {                                       \
+        long __res;                                                           \
+        __asm__ __volatile__("push %%ebx\n"                                   \
+                             "movl %2,%%ebx\n"                                \
+                             "movl %1,%%eax\n"                                \
+                             "int  $0x80\n"                                   \
+                             "pop  %%ebx"                                     \
+                             : "=a" (__res)                                   \
+                             : "i" (__NR_##name), "ri" ((long)(arg1)),        \
+                               "c" ((long)(arg2)), "d" ((long)(arg3)),        \
+                               "S" ((long)(arg4)), "D" ((long)(arg5))         \
+                             : "esp", "memory");                              \
+        LSS_RETURN(type,__res);                                               \
+      }
+    #undef  _syscall6
+    #define _syscall6(type,name,type1,arg1,type2,arg2,type3,arg3,type4,arg4,  \
+                      type5,arg5,type6,arg6)                                  \
+      type LSS_NAME(name)(type1 arg1, type2 arg2, type3 arg3, type4 arg4,     \
+                          type5 arg5, type6 arg6) {                           \
+        long __res;                                                           \
+        struct { long __a1; long __a6; } __s = { (long)arg1, (long) arg6 };   \
+        __asm__ __volatile__("push %%ebp\n"                                   \
+                             "push %%ebx\n"                                   \
+                             "movl 4(%2),%%ebp\n"                             \
+                             "movl 0(%2), %%ebx\n"                            \
+                             "movl %1,%%eax\n"                                \
+                             "int  $0x80\n"                                   \
+                             "pop  %%ebx\n"                                   \
+                             "pop  %%ebp"                                     \
+                             : "=a" (__res)                                   \
+                             : "i" (__NR_##name),  "0" ((long)(&__s)),        \
+                               "c" ((long)(arg2)), "d" ((long)(arg3)),        \
+                               "S" ((long)(arg4)), "D" ((long)(arg5))         \
+                             : "esp", "memory");                              \
+        LSS_RETURN(type,__res);                                               \
+      }
+    LSS_INLINE int LSS_NAME(clone)(int (*fn)(void *), void *child_stack,
+                                   int flags, void *arg, int *parent_tidptr,
+                                   void *newtls, int *child_tidptr) {
+      long __res;
+      __asm__ __volatile__(/* if (fn == NULL)
+                            *   return -EINVAL;
+                            */
+                           "movl   %3,%%ecx\n"
+                           "jecxz  1f\n"
+
+                           /* if (child_stack == NULL)
+                            *   return -EINVAL;
+                            */
+                           "movl   %4,%%ecx\n"
+                           "jecxz  1f\n"
+
+                           /* Set up alignment of the child stack:
+                            * child_stack = (child_stack & ~0xF) - 20;
+                            */
+                           "andl   $-16,%%ecx\n"
+                           "subl   $20,%%ecx\n"
+
+                           /* Push "arg" and "fn" onto the stack that will be
+                            * used by the child.
+                            */
+                           "movl   %6,%%eax\n"
+                           "movl   %%eax,4(%%ecx)\n"
+                           "movl   %3,%%eax\n"
+                           "movl   %%eax,(%%ecx)\n"
+
+                           /* %eax = syscall(%eax = __NR_clone,
+                            *                %ebx = flags,
+                            *                %ecx = child_stack,
+                            *                %edx = parent_tidptr,
+                            *                %esi = newtls,
+                            *                %edi = child_tidptr)
+                            * Also, make sure that %ebx gets preserved as it is
+                            * used in PIC mode.
+                            */
+                           "movl   %8,%%esi\n"
+                           "movl   %7,%%edx\n"
+                           "movl   %5,%%eax\n"
+                           "movl   %9,%%edi\n"
+                           "pushl  %%ebx\n"
+                           "movl   %%eax,%%ebx\n"
+                           "movl   %2,%%eax\n"
+                           "int    $0x80\n"
+
+                           /* In the parent: restore %ebx
+                            * In the child:  move "fn" into %ebx
+                            */
+                           "popl   %%ebx\n"
+
+                           /* if (%eax != 0)
+                            *   return %eax;
+                            */
+                           "test   %%eax,%%eax\n"
+                           "jnz    1f\n"
+
+                           /* In the child, now. Terminate frame pointer chain.
+                            */
+                           "movl   $0,%%ebp\n"
+
+                           /* Call "fn". "arg" is already on the stack.
+                            */
+                           "call   *%%ebx\n"
+
+                           /* Call _exit(%ebx). Unfortunately older versions
+                            * of gcc restrict the number of arguments that can
+                            * be passed to asm(). So, we need to hard-code the
+                            * system call number.
+                            */
+                           "movl   %%eax,%%ebx\n"
+                           "movl   $1,%%eax\n"
+                           "int    $0x80\n"
+
+                           /* Return to parent.
+                            */
+                         "1:\n"
+                           : "=a" (__res)
+                           : "0"(-EINVAL), "i"(__NR_clone),
+                             "m"(fn), "m"(child_stack), "m"(flags), "m"(arg),
+                             "m"(parent_tidptr), "m"(newtls), "m"(child_tidptr)
+                           : "esp", "memory", "ecx", "edx", "esi", "edi");
+      LSS_RETURN(int, __res);
+    }
+
+    LSS_INLINE void (*LSS_NAME(restore_rt)(void))(void) {
+      /* On i386, the kernel does not know how to return from a signal
+       * handler. Instead, it relies on user space to provide a
+       * restorer function that calls the {rt_,}sigreturn() system call.
+       * Unfortunately, we cannot just reference the glibc version of this
+       * function, as glibc goes out of its way to make it inaccessible.
+       */
+      void (*res)(void);
+      __asm__ __volatile__("call   2f\n"
+                         "0:.align 16\n"
+                         "1:movl   %1,%%eax\n"
+                           "int    $0x80\n"
+                         "2:popl   %0\n"
+                           "addl   $(1b-0b),%0\n"
+                           : "=a" (res)
+                           : "i"  (__NR_rt_sigreturn));
+      return res;
+    }
+    LSS_INLINE void (*LSS_NAME(restore)(void))(void) {
+      /* On i386, the kernel does not know how to return from a signal
+       * handler. Instead, it relies on user space to provide a
+       * restorer function that calls the {rt_,}sigreturn() system call.
+       * Unfortunately, we cannot just reference the glibc version of this
+       * function, as glibc goes out of its way to make it inaccessible.
+       */
+      void (*res)(void);
+      __asm__ __volatile__("call   2f\n"
+                         "0:.align 16\n"
+                         "1:pop    %%eax\n"
+                           "movl   %1,%%eax\n"
+                           "int    $0x80\n"
+                         "2:popl   %0\n"
+                           "addl   $(1b-0b),%0\n"
+                           : "=a" (res)
+                           : "i"  (__NR_sigreturn));
+      return res;
+    }
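+    /* Illustrative use only (a sketch; the exact kernel_sigaction field
+     * names are assumed from the declarations earlier in this header):
+     * such a restorer is installed together with a handler, roughly as
+     *
+     *   struct kernel_sigaction sa;   // handler and mask set up elsewhere
+     *   sa.sa_flags    |= SA_RESTORER;
+     *   sa.sa_restorer  = LSS_NAME(restore_rt)();
+     *
+     * before the structure is handed to the rt_sigaction() wrapper.
+     */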
+  #elif defined(__x86_64__)
+    /* There are no known problems with any of the _syscallX() macros
+     * currently shipping for x86_64, but we still need to be able to define
+     * our own version so that we can override the location of errno
+     * (e.g. when using the clone() system call with the CLONE_VM option).
+     */
+    #undef  LSS_ENTRYPOINT
+    #define LSS_ENTRYPOINT "syscall\n"
+
+    /* The x32 ABI has 32-bit longs, but the syscall interface is 64-bit.
+     * We need to explicitly cast to an unsigned 64-bit type to avoid implicit
+     * sign extension.  We can't cast pointers directly because those are
+     * 32 bits, and gcc will dump ugly warnings about casting from a pointer
+     * to an integer of a different size.
+     */
+    #undef  LSS_SYSCALL_ARG
+    #define LSS_SYSCALL_ARG(a) ((uint64_t)(uintptr_t)(a))
+    #undef  _LSS_RETURN
+    #define _LSS_RETURN(type, res, cast)                                      \
+      do {                                                                    \
+        if ((uint64_t)(res) >= (uint64_t)(-4095)) {                           \
+          LSS_ERRNO = -(res);                                                 \
+          res = -1;                                                           \
+        }                                                                     \
+        return (type)(cast)(res);                                             \
+      } while (0)
+    #undef  LSS_RETURN
+    #define LSS_RETURN(type, res) _LSS_RETURN(type, res, uintptr_t)
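+    /* For example, a raw syscall result of -2 (i.e. -ENOENT) falls into the
+     * [-4095, -1] range, so LSS_ERRNO is set to 2 and -1 is returned; any
+     * value outside that range is passed through unchanged.
+     */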
+
+    #undef  _LSS_BODY
+    #define _LSS_BODY(nr, type, name, cast, ...)                              \
+          long long __res;                                                    \
+          __asm__ __volatile__(LSS_BODY_ASM##nr LSS_ENTRYPOINT                \
+            : "=a" (__res)                                                    \
+            : "0" (__NR_##name) LSS_BODY_ARG##nr(__VA_ARGS__)                 \
+            : LSS_BODY_CLOBBER##nr "r11", "rcx", "memory");                   \
+          _LSS_RETURN(type, __res, cast)
+    #undef  LSS_BODY
+    #define LSS_BODY(nr, type, name, args...) \
+      _LSS_BODY(nr, type, name, uintptr_t, ## args)
+
+    #undef  LSS_BODY_ASM0
+    #undef  LSS_BODY_ASM1
+    #undef  LSS_BODY_ASM2
+    #undef  LSS_BODY_ASM3
+    #undef  LSS_BODY_ASM4
+    #undef  LSS_BODY_ASM5
+    #undef  LSS_BODY_ASM6
+    #define LSS_BODY_ASM0
+    #define LSS_BODY_ASM1 LSS_BODY_ASM0
+    #define LSS_BODY_ASM2 LSS_BODY_ASM1
+    #define LSS_BODY_ASM3 LSS_BODY_ASM2
+    #define LSS_BODY_ASM4 LSS_BODY_ASM3 "movq %5,%%r10;"
+    #define LSS_BODY_ASM5 LSS_BODY_ASM4 "movq %6,%%r8;"
+    #define LSS_BODY_ASM6 LSS_BODY_ASM5 "movq %7,%%r9;"
+
+    #undef  LSS_BODY_CLOBBER0
+    #undef  LSS_BODY_CLOBBER1
+    #undef  LSS_BODY_CLOBBER2
+    #undef  LSS_BODY_CLOBBER3
+    #undef  LSS_BODY_CLOBBER4
+    #undef  LSS_BODY_CLOBBER5
+    #undef  LSS_BODY_CLOBBER6
+    #define LSS_BODY_CLOBBER0
+    #define LSS_BODY_CLOBBER1 LSS_BODY_CLOBBER0
+    #define LSS_BODY_CLOBBER2 LSS_BODY_CLOBBER1
+    #define LSS_BODY_CLOBBER3 LSS_BODY_CLOBBER2
+    #define LSS_BODY_CLOBBER4 LSS_BODY_CLOBBER3 "r10",
+    #define LSS_BODY_CLOBBER5 LSS_BODY_CLOBBER4 "r8",
+    #define LSS_BODY_CLOBBER6 LSS_BODY_CLOBBER5 "r9",
+
+    #undef  LSS_BODY_ARG0
+    #undef  LSS_BODY_ARG1
+    #undef  LSS_BODY_ARG2
+    #undef  LSS_BODY_ARG3
+    #undef  LSS_BODY_ARG4
+    #undef  LSS_BODY_ARG5
+    #undef  LSS_BODY_ARG6
+    #define LSS_BODY_ARG0()
+    #define LSS_BODY_ARG1(arg1) \
+      LSS_BODY_ARG0(), "D" (arg1)
+    #define LSS_BODY_ARG2(arg1, arg2) \
+      LSS_BODY_ARG1(arg1), "S" (arg2)
+    #define LSS_BODY_ARG3(arg1, arg2, arg3) \
+      LSS_BODY_ARG2(arg1, arg2), "d" (arg3)
+    #define LSS_BODY_ARG4(arg1, arg2, arg3, arg4) \
+      LSS_BODY_ARG3(arg1, arg2, arg3), "r" (arg4)
+    #define LSS_BODY_ARG5(arg1, arg2, arg3, arg4, arg5) \
+      LSS_BODY_ARG4(arg1, arg2, arg3, arg4), "r" (arg5)
+    #define LSS_BODY_ARG6(arg1, arg2, arg3, arg4, arg5, arg6) \
+      LSS_BODY_ARG5(arg1, arg2, arg3, arg4, arg5), "r" (arg6)
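+    /* As an illustration (with "foo" being a hypothetical syscall), a
+     * definition such as
+     *   _syscall3(int, foo, int, a, long, b, void *, c)
+     * below produces a function LSS_NAME(foo)(a, b, c) whose asm loads
+     * __NR_foo into %rax (the "0"/"=a" tie in _LSS_BODY) and passes the
+     * three LSS_SYSCALL_ARG()-wrapped arguments in %rdi, %rsi and %rdx via
+     * the "D", "S" and "d" constraints above, matching the x86-64 syscall
+     * calling convention.
+     */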
+
+    #undef _syscall0
+    #define _syscall0(type,name)                                              \
+      type LSS_NAME(name)() {                                                 \
+        LSS_BODY(0, type, name);                                              \
+      }
+    #undef _syscall1
+    #define _syscall1(type,name,type1,arg1)                                   \
+      type LSS_NAME(name)(type1 arg1) {                                       \
+        LSS_BODY(1, type, name, LSS_SYSCALL_ARG(arg1));                       \
+      }
+    #undef _syscall2
+    #define _syscall2(type,name,type1,arg1,type2,arg2)                        \
+      type LSS_NAME(name)(type1 arg1, type2 arg2) {                           \
+        LSS_BODY(2, type, name, LSS_SYSCALL_ARG(arg1), LSS_SYSCALL_ARG(arg2));\
+      }
+    #undef _syscall3
+    #define _syscall3(type,name,type1,arg1,type2,arg2,type3,arg3)             \
+      type LSS_NAME(name)(type1 arg1, type2 arg2, type3 arg3) {               \
+        LSS_BODY(3, type, name, LSS_SYSCALL_ARG(arg1), LSS_SYSCALL_ARG(arg2), \
+                                LSS_SYSCALL_ARG(arg3));                       \
+      }
+    #undef _syscall4
+    #define _syscall4(type,name,type1,arg1,type2,arg2,type3,arg3,type4,arg4)  \
+      type LSS_NAME(name)(type1 arg1, type2 arg2, type3 arg3, type4 arg4) {   \
+        LSS_BODY(4, type, name, LSS_SYSCALL_ARG(arg1), LSS_SYSCALL_ARG(arg2), \
+                                LSS_SYSCALL_ARG(arg3), LSS_SYSCALL_ARG(arg4));\
+      }
+    #undef _syscall5
+    #define _syscall5(type,name,type1,arg1,type2,arg2,type3,arg3,type4,arg4,  \
+                      type5,arg5)                                             \
+      type LSS_NAME(name)(type1 arg1, type2 arg2, type3 arg3, type4 arg4,     \
+                          type5 arg5) {                                       \
+        LSS_BODY(5, type, name, LSS_SYSCALL_ARG(arg1), LSS_SYSCALL_ARG(arg2), \
+                                LSS_SYSCALL_ARG(arg3), LSS_SYSCALL_ARG(arg4), \
+                                LSS_SYSCALL_ARG(arg5));                       \
+      }
+    #undef _syscall6
+    #define _syscall6(type,name,type1,arg1,type2,arg2,type3,arg3,type4,arg4,  \
+                      type5,arg5,type6,arg6)                                  \
+      type LSS_NAME(name)(type1 arg1, type2 arg2, type3 arg3, type4 arg4,     \
+                          type5 arg5, type6 arg6) {                           \
+        LSS_BODY(6, type, name, LSS_SYSCALL_ARG(arg1), LSS_SYSCALL_ARG(arg2), \
+                                LSS_SYSCALL_ARG(arg3), LSS_SYSCALL_ARG(arg4), \
+                                LSS_SYSCALL_ARG(arg5), LSS_SYSCALL_ARG(arg6));\
+      }
+    LSS_INLINE int LSS_NAME(clone)(int (*fn)(void *), void *child_stack,
+                                   int flags, void *arg, int *parent_tidptr,
+                                   void *newtls, int *child_tidptr) {
+      long long __res;
+      {
+        __asm__ __volatile__(/* if (fn == NULL)
+                              *   return -EINVAL;
+                              */
+                             "testq  %4,%4\n"
+                             "jz     1f\n"
+
+                             /* if (child_stack == NULL)
+                              *   return -EINVAL;
+                              */
+                             "testq  %5,%5\n"
+                             "jz     1f\n"
+
+                             /* Set up alignment of the child stack:
+                              * child_stack = (child_stack & ~0xF) - 16;
+                              */
+                             "andq   $-16,%5\n"
+                             "subq   $16,%5\n"
+
+                             /* Push "arg" and "fn" onto the stack that will be
+                              * used by the child.
+                              */
+                             "movq   %7,8(%5)\n"
+                             "movq   %4,0(%5)\n"
+
+                             /* %rax = syscall(%rax = __NR_clone,
+                              *                %rdi = flags,
+                              *                %rsi = child_stack,
+                              *                %rdx = parent_tidptr,
+                              *                %r8  = new_tls,
+                              *                %r10 = child_tidptr)
+                              */
+                             "movq   %2,%%rax\n"
+                             "movq   %9,%%r8\n"
+                             "movq   %10,%%r10\n"
+                             "syscall\n"
+
+                             /* if (%rax != 0)
+                              *   return %rax;
+                              */
+                             "testq  %%rax,%%rax\n"
+                             "jnz    1f\n"
+
+                             /* In the child. Terminate frame pointer chain.
+                              */
+                             "xorq   %%rbp,%%rbp\n"
+
+                             /* Call "fn(arg)".
+                              */
+                             "popq   %%rax\n"
+                             "popq   %%rdi\n"
+                             "call   *%%rax\n"
+
+                             /* Call _exit(%rax): the return value of fn()
+                              * is moved from %rax into %rdi.
+                              */
+                             "movq   %%rax,%%rdi\n"
+                             "movq   %3,%%rax\n"
+                             "syscall\n"
+
+                             /* Return to parent.
+                              */
+                           "1:\n"
+                             : "=a" (__res)
+                             : "0"(-EINVAL), "i"(__NR_clone), "i"(__NR_exit),
+                               "r"(LSS_SYSCALL_ARG(fn)),
+                               "S"(LSS_SYSCALL_ARG(child_stack)),
+                               "D"(LSS_SYSCALL_ARG(flags)),
+                               "r"(LSS_SYSCALL_ARG(arg)),
+                               "d"(LSS_SYSCALL_ARG(parent_tidptr)),
+                               "r"(LSS_SYSCALL_ARG(newtls)),
+                               "r"(LSS_SYSCALL_ARG(child_tidptr))
+                             : "rsp", "memory", "r8", "r10", "r11", "rcx");
+      }
+      LSS_RETURN(int, __res);
+    }
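+    /* Illustrative only (a sketch; child_fn, child_arg and stack_top are
+     * hypothetical names): a caller that wants a child sharing the address
+     * space would invoke the wrapper above roughly as
+     *
+     *   int tid = LSS_NAME(clone)(child_fn, stack_top,
+     *                             CLONE_VM | CLONE_FS | CLONE_FILES | SIGCHLD,
+     *                             child_arg, NULL, NULL, NULL);
+     *
+     * where stack_top is the highest address of a freshly allocated stack,
+     * since the stack grows downwards on this architecture.
+     */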
+
+    LSS_INLINE void (*LSS_NAME(restore_rt)(void))(void) {
+      /* On x86-64, the kernel does not know how to return from
+       * a signal handler. Instead, it relies on user space to provide a
+       * restorer function that calls the rt_sigreturn() system call.
+       * Unfortunately, we cannot just reference the glibc version of this
+       * function, as glibc goes out of its way to make it inaccessible.
+       */
+      long long res;
+      __asm__ __volatile__("call   2f\n"
+                         "0:.align 16\n"
+                         "1:movq   %1,%%rax\n"
+                           "syscall\n"
+                         "2:popq   %0\n"
+                           "addq   $(1b-0b),%0\n"
+                           : "=a" (res)
+                           : "i"  (__NR_rt_sigreturn));
+      return (void (*)(void))(uintptr_t)res;
+    }
+  #elif defined(__arm__)
+    /* Most definitions of _syscallX() neglect to mark "memory" as being
+     * clobbered. This causes problems with compilers that do a better job
+     * at optimizing across __asm__ calls.
+     * So, we just have to redefine all of the _syscallX() macros.
+     */
+    #undef LSS_REG
+    #define LSS_REG(r,a) register long __r##r __asm__("r"#r) = (long)a
+
+    /* r0..r3 are scratch registers and not preserved across function
+     * calls.  We first need to evaluate the first 4 syscall arguments
+     * and store them on the stack.  They must be loaded into r0..r3 after
+     * all function calls to avoid r0..r3 being clobbered.
+     */
+    #undef LSS_SAVE_ARG
+    #define LSS_SAVE_ARG(r,a) long __tmp##r = (long)a
+    #undef LSS_LOAD_ARG
+    #define LSS_LOAD_ARG(r) register long __r##r __asm__("r"#r) = __tmp##r
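+    /* For instance (hypothetical example), in a wrapper call such as
+     * LSS_NAME(foo)(fd, strlen(path)) the strlen() call may clobber r0..r3,
+     * so the second argument must be evaluated into a temporary first and
+     * only loaded into r1 once no further function calls can occur.
+     */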
+
+    #undef  LSS_BODY
+    #define LSS_BODY(type, name, args...)                                     \
+          register long __res_r0 __asm__("r0");                               \
+          long __res;                                                         \
+          __SYS_REG(name)                                                     \
+          __asm__ __volatile__ (__syscall_safe(name)                          \
+                                : "=r"(__res_r0)                              \
+                                : __SYS_REG_LIST(args)                        \
+                                : "lr", "memory");                            \
+          __res = __res_r0;                                                   \
+          LSS_RETURN(type, __res)
+    #undef _syscall0
+    #define _syscall0(type, name)                                             \
+      type LSS_NAME(name)() {                                                 \
+        LSS_BODY(type, name);                                                 \
+      }
+    #undef _syscall1
+    #define _syscall1(type, name, type1, arg1)                                \
+      type LSS_NAME(name)(type1 arg1) {                                       \
+        /* There is no need to use a volatile temp.  */                       \
+        LSS_REG(0, arg1);                                                     \
+        LSS_BODY(type, name, "r"(__r0));                                      \
+      }
+    #undef _syscall2
+    #define _syscall2(type, name, type1, arg1, type2, arg2)                   \
+      type LSS_NAME(name)(type1 arg1, type2 arg2) {                           \
+        LSS_SAVE_ARG(0, arg1);                                                \
+        LSS_SAVE_ARG(1, arg2);                                                \
+        LSS_LOAD_ARG(0);                                                      \
+        LSS_LOAD_ARG(1);                                                      \
+        LSS_BODY(type, name, "r"(__r0), "r"(__r1));                           \
+      }
+    #undef _syscall3
+    #define _syscall3(type, name, type1, arg1, type2, arg2, type3, arg3)      \
+      type LSS_NAME(name)(type1 arg1, type2 arg2, type3 arg3) {               \
+        LSS_SAVE_ARG(0, arg1);                                                \
+        LSS_SAVE_ARG(1, arg2);                                                \
+        LSS_SAVE_ARG(2, arg3);                                                \
+        LSS_LOAD_ARG(0);                                                      \
+        LSS_LOAD_ARG(1);                                                      \
+        LSS_LOAD_ARG(2);                                                      \
+        LSS_BODY(type, name, "r"(__r0), "r"(__r1), "r"(__r2));                \
+      }
+    #undef _syscall4
+    #define _syscall4(type, name, type1, arg1, type2, arg2, type3, arg3,      \
+                      type4, arg4)                                            \
+      type LSS_NAME(name)(type1 arg1, type2 arg2, type3 arg3, type4 arg4) {   \
+        LSS_SAVE_ARG(0, arg1);                                                \
+        LSS_SAVE_ARG(1, arg2);                                                \
+        LSS_SAVE_ARG(2, arg3);                                                \
+        LSS_SAVE_ARG(3, arg4);                                                \
+        LSS_LOAD_ARG(0);                                                      \
+        LSS_LOAD_ARG(1);                                                      \
+        LSS_LOAD_ARG(2);                                                      \
+        LSS_LOAD_ARG(3);                                                      \
+        LSS_BODY(type, name, "r"(__r0), "r"(__r1), "r"(__r2), "r"(__r3));     \
+      }
+    #undef _syscall5
+    #define _syscall5(type, name, type1, arg1, type2, arg2, type3, arg3,      \
+                      type4, arg4, type5, arg5)                               \
+      type LSS_NAME(name)(type1 arg1, type2 arg2, type3 arg3, type4 arg4,     \
+                          type5 arg5) {                                       \
+        LSS_SAVE_ARG(0, arg1);                                                \
+        LSS_SAVE_ARG(1, arg2);                                                \
+        LSS_SAVE_ARG(2, arg3);                                                \
+        LSS_SAVE_ARG(3, arg4);                                                \
+        LSS_REG(4, arg5);                                                     \
+        LSS_LOAD_ARG(0);                                                      \
+        LSS_LOAD_ARG(1);                                                      \
+        LSS_LOAD_ARG(2);                                                      \
+        LSS_LOAD_ARG(3);                                                      \
+        LSS_BODY(type, name, "r"(__r0), "r"(__r1), "r"(__r2), "r"(__r3),      \
+                             "r"(__r4));                                      \
+      }
+    #undef _syscall6
+    #define _syscall6(type, name, type1, arg1, type2, arg2, type3, arg3,      \
+                      type4, arg4, type5, arg5, type6, arg6)                  \
+      type LSS_NAME(name)(type1 arg1, type2 arg2, type3 arg3, type4 arg4,     \
+                          type5 arg5, type6 arg6) {                           \
+        LSS_SAVE_ARG(0, arg1);                                                \
+        LSS_SAVE_ARG(1, arg2);                                                \
+        LSS_SAVE_ARG(2, arg3);                                                \
+        LSS_SAVE_ARG(3, arg4);                                                \
+        LSS_REG(4, arg5);                                                     \
+        LSS_REG(5, arg6);                                                     \
+        LSS_LOAD_ARG(0);                                                      \
+        LSS_LOAD_ARG(1);                                                      \
+        LSS_LOAD_ARG(2);                                                      \
+        LSS_LOAD_ARG(3);                                                      \
+        LSS_BODY(type, name, "r"(__r0), "r"(__r1), "r"(__r2), "r"(__r3),      \
+                             "r"(__r4), "r"(__r5));                           \
+      }
+    LSS_INLINE int LSS_NAME(clone)(int (*fn)(void *), void *child_stack,
+                                   int flags, void *arg, int *parent_tidptr,
+                                   void *newtls, int *child_tidptr) {
+      register long __res __asm__("r5");
+      {
+        if (fn == NULL || child_stack == NULL) {
+            __res = -EINVAL;
+            goto clone_exit;
+        }
+
+        /* Stash the first 4 arguments on the stack, because we can only load
+         * them into r0..r3 after all function calls.
+         */
+        int    tmp_flags = flags;
+        int  * tmp_stack = (int*) child_stack;
+        void * tmp_ptid  = parent_tidptr;
+        void * tmp_tls   = newtls;
+
+        register int  *__ctid  __asm__("r4") = child_tidptr;
+
+        /* Push "arg" and "fn" onto the stack that will be
+         * used by the child.
+         */
+        *(--tmp_stack) = (int) arg;
+        *(--tmp_stack) = (int) fn;
+
+        /* We must load r0..r3 last after all possible function calls.  */
+        register int   __flags __asm__("r0") = tmp_flags;
+        register void *__stack __asm__("r1") = tmp_stack;
+        register void *__ptid  __asm__("r2") = tmp_ptid;
+        register void *__tls   __asm__("r3") = tmp_tls;
+
+        /* %r0 = syscall(%r0 = flags,
+         *               %r1 = child_stack,
+         *               %r2 = parent_tidptr,
+         *               %r3 = newtls,
+         *               %r4 = child_tidptr)
+         */
+        __SYS_REG(clone)
+        __asm__ __volatile__(/* %r0 = syscall(%r0 = flags,
+                              *               %r1 = child_stack,
+                              *               %r2 = parent_tidptr,
+                              *               %r3 = newtls,
+                              *               %r4 = child_tidptr)
+                              */
+                             "push  {r7}\n"
+                             "mov   r7,%1\n"
+                             __syscall(clone)"\n"
+
+                             /* if (%r0 != 0)
+                              *   return %r0;
+                              */
+                             "movs  %0,r0\n"
+                             "bne   1f\n"
+
+                             /* In the child, now. Call "fn(arg)".
+                              */
+                             "ldr   r0,[sp, #4]\n"
+                             "mov   lr,pc\n"
+                             "ldr   pc,[sp]\n"
+
+                             /* Call _exit(%r0), which never returns.  We
+                              * only need to set r7 for the EABI syscall ABI,
+                              * but we always do this to simplify code sharing
+                              * between the old and new syscall ABIs.
+                              */
+                             "mov   r7,%2\n"
+                             __syscall(exit)"\n"
+
+                             /* Pop r7 from the stack only in the parent.
+                              */
+                           "1: pop {r7}\n"
+                             : "=r" (__res)
+                             : "r"(__sysreg),
+                               "i"(__NR_exit), "r"(__stack), "r"(__flags),
+                               "r"(__ptid), "r"(__tls), "r"(__ctid)
+                             : "cc", "lr", "memory");
+      }
+      clone_exit:
+      LSS_RETURN(int, __res);
+    }
+  #elif defined(__mips__)
+    #undef LSS_REG
+    #define LSS_REG(r,a) register unsigned long __r##r __asm__("$"#r) =       \
+                                 (unsigned long)(a)
+
+    #if _MIPS_SIM == _MIPS_SIM_ABI32
+    // See http://sources.redhat.com/ml/libc-alpha/2004-10/msg00050.html
+    // or http://www.linux-mips.org/archives/linux-mips/2004-10/msg00142.html
+    #define MIPS_SYSCALL_CLOBBERS "$1", "$3", "$8", "$9", "$10", "$11", "$12",\
+                                "$13", "$14", "$15", "$24", "$25", "memory"
+    #else
+    #define MIPS_SYSCALL_CLOBBERS "$1", "$3", "$10", "$11", "$12", "$13",     \
+                                "$14", "$15", "$24", "$25", "memory"
+    #endif
+
+    #undef  LSS_BODY
+    #define LSS_BODY(type,name,r7,...)                                        \
+          register unsigned long __v0 __asm__("$2") = __NR_##name;            \
+          __asm__ __volatile__ ("syscall\n"                                   \
+                                : "=&r"(__v0), r7 (__r7)                      \
+                                : "0"(__v0), ##__VA_ARGS__                    \
+                                : MIPS_SYSCALL_CLOBBERS);                     \
+          LSS_RETURN(type, __v0, __r7)
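+    /* Note on the MIPS convention assumed above: the result comes back in
+     * $2 (v0) and $7 (a3) acts as an error flag -- zero on success, non-zero
+     * on failure -- which is why the three-argument form of LSS_RETURN
+     * (defined earlier for MIPS) receives both __v0 and __r7.
+     */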
+    #undef _syscall0
+    #define _syscall0(type, name)                                             \
+      type LSS_NAME(name)() {                                                 \
+        register unsigned long __r7 __asm__("$7");                            \
+        LSS_BODY(type, name, "=r");                                           \
+      }
+    #undef _syscall1
+    #define _syscall1(type, name, type1, arg1)                                \
+      type LSS_NAME(name)(type1 arg1) {                                       \
+        register unsigned long __r7 __asm__("$7");                            \
+        LSS_REG(4, arg1); LSS_BODY(type, name, "=r", "r"(__r4));              \
+      }
+    #undef _syscall2
+    #define _syscall2(type, name, type1, arg1, type2, arg2)                   \
+      type LSS_NAME(name)(type1 arg1, type2 arg2) {                           \
+        register unsigned long __r7 __asm__("$7");                            \
+        LSS_REG(4, arg1); LSS_REG(5, arg2);                                   \
+        LSS_BODY(type, name, "=r", "r"(__r4), "r"(__r5));                     \
+      }
+    #undef _syscall3
+    #define _syscall3(type, name, type1, arg1, type2, arg2, type3, arg3)      \
+      type LSS_NAME(name)(type1 arg1, type2 arg2, type3 arg3) {               \
+        register unsigned long __r7 __asm__("$7");                            \
+        LSS_REG(4, arg1); LSS_REG(5, arg2); LSS_REG(6, arg3);                 \
+        LSS_BODY(type, name, "=r", "r"(__r4), "r"(__r5), "r"(__r6));          \
+      }
+    #undef _syscall4
+    #define _syscall4(type,name,type1,arg1,type2,arg2,type3,arg3,type4,arg4)  \
+      type LSS_NAME(name)(type1 arg1, type2 arg2, type3 arg3, type4 arg4) {   \
+        LSS_REG(4, arg1); LSS_REG(5, arg2); LSS_REG(6, arg3);                 \
+        LSS_REG(7, arg4);                                                     \
+        LSS_BODY(type, name, "+r", "r"(__r4), "r"(__r5), "r"(__r6));          \
+      }
+    #undef _syscall5
+    #if _MIPS_SIM == _MIPS_SIM_ABI32
+    /* The old 32-bit MIPS system call API passes the fifth and sixth
+     * arguments on the stack, whereas the new APIs use registers "r8" and "r9".
+     */
+    #define _syscall5(type,name,type1,arg1,type2,arg2,type3,arg3,type4,arg4,  \
+                      type5,arg5)                                             \
+      type LSS_NAME(name)(type1 arg1, type2 arg2, type3 arg3, type4 arg4,     \
+                          type5 arg5) {                                       \
+        LSS_REG(4, arg1); LSS_REG(5, arg2); LSS_REG(6, arg3);                 \
+        LSS_REG(7, arg4);                                                     \
+        register unsigned long __v0 __asm__("$2");                            \
+        __asm__ __volatile__ (".set noreorder\n"                              \
+                              "lw    $2, %6\n"                                \
+                              "subu  $29, 32\n"                               \
+                              "sw    $2, 16($29)\n"                           \
+                              "li    $2, %2\n"                                \
+                              "syscall\n"                                     \
+                              "addiu $29, 32\n"                               \
+                              ".set reorder\n"                                \
+                              : "=&r"(__v0), "+r" (__r7)                      \
+                              : "i" (__NR_##name), "r"(__r4), "r"(__r5),      \
+                                "r"(__r6), "m" ((unsigned long)arg5)          \
+                              : MIPS_SYSCALL_CLOBBERS);                       \
+        LSS_RETURN(type, __v0, __r7);                                         \
+      }
+    #else
+    #define _syscall5(type,name,type1,arg1,type2,arg2,type3,arg3,type4,arg4,  \
+                      type5,arg5)                                             \
+      type LSS_NAME(name)(type1 arg1, type2 arg2, type3 arg3, type4 arg4,     \
+                          type5 arg5) {                                       \
+        LSS_REG(4, arg1); LSS_REG(5, arg2); LSS_REG(6, arg3);                 \
+        LSS_REG(7, arg4); LSS_REG(8, arg5);                                   \
+        LSS_BODY(type, name, "+r", "r"(__r4), "r"(__r5), "r"(__r6),           \
+                 "r"(__r8));                                                  \
+      }
+    #endif
+    #undef _syscall6
+    #if _MIPS_SIM == _MIPS_SIM_ABI32
+    /* The old 32-bit MIPS system call API passes the fifth and sixth
+     * arguments on the stack, whereas the new APIs use registers "r8" and "r9".
+     */
+    #define _syscall6(type,name,type1,arg1,type2,arg2,type3,arg3,type4,arg4,  \
+                      type5,arg5,type6,arg6)                                  \
+      type LSS_NAME(name)(type1 arg1, type2 arg2, type3 arg3, type4 arg4,     \
+                          type5 arg5, type6 arg6) {                           \
+        LSS_REG(4, arg1); LSS_REG(5, arg2); LSS_REG(6, arg3);                 \
+        LSS_REG(7, arg4);                                                     \
+        register unsigned long __v0 __asm__("$2");                            \
+        __asm__ __volatile__ (".set noreorder\n"                              \
+                              "lw    $2, %6\n"                                \
+                              "lw    $8, %7\n"                                \
+                              "subu  $29, 32\n"                               \
+                              "sw    $2, 16($29)\n"                           \
+                              "sw    $8, 20($29)\n"                           \
+                              "li    $2, %2\n"                                \
+                              "syscall\n"                                     \
+                              "addiu $29, 32\n"                               \
+                              ".set reorder\n"                                \
+                              : "=&r"(__v0), "+r" (__r7)                      \
+                              : "i" (__NR_##name), "r"(__r4), "r"(__r5),      \
+                                "r"(__r6), "r" ((unsigned long)arg5),         \
+                                "r" ((unsigned long)arg6)                     \
+                              : MIPS_SYSCALL_CLOBBERS);                       \
+        LSS_RETURN(type, __v0, __r7);                                         \
+      }
+    #else
+    #define _syscall6(type,name,type1,arg1,type2,arg2,type3,arg3,type4,arg4,  \
+                      type5,arg5,type6,arg6)                                  \
+      type LSS_NAME(name)(type1 arg1, type2 arg2, type3 arg3, type4 arg4,     \
+                          type5 arg5,type6 arg6) {                            \
+        LSS_REG(4, arg1); LSS_REG(5, arg2); LSS_REG(6, arg3);                 \
+        LSS_REG(7, arg4); LSS_REG(8, arg5); LSS_REG(9, arg6);                 \
+        LSS_BODY(type, name, "+r", "r"(__r4), "r"(__r5), "r"(__r6),           \
+                 "r"(__r8), "r"(__r9));                                       \
+      }
+    #endif
+    LSS_INLINE int LSS_NAME(clone)(int (*fn)(void *), void *child_stack,
+                                   int flags, void *arg, int *parent_tidptr,
+                                   void *newtls, int *child_tidptr) {
+      register unsigned long __v0 __asm__("$2");
+      register unsigned long __r7 __asm__("$7") = (unsigned long)newtls;
+      {
+        register int   __flags __asm__("$4") = flags;
+        register void *__stack __asm__("$5") = child_stack;
+        register void *__ptid  __asm__("$6") = parent_tidptr;
+        register int  *__ctid  __asm__("$8") = child_tidptr;
+        __asm__ __volatile__(
+          #if _MIPS_SIM == _MIPS_SIM_ABI32 && _MIPS_SZPTR == 32
+                             "subu  $29,24\n"
+          #elif _MIPS_SIM == _MIPS_SIM_NABI32
+                             "sub   $29,16\n"
+          #else
+                             "dsubu $29,16\n"
+          #endif
+
+                             /* if (fn == NULL || child_stack == NULL)
+                              *   return -EINVAL;
+                              */
+                             "li    %0,%2\n"
+                             "beqz  %5,1f\n"
+                             "beqz  %6,1f\n"
+
+                             /* Push "arg" and "fn" onto the stack that will be
+                              * used by the child.
+                              */
+          #if _MIPS_SIM == _MIPS_SIM_ABI32 && _MIPS_SZPTR == 32
+                             "subu  %6,32\n"
+                             "sw    %5,0(%6)\n"
+                             "sw    %8,4(%6)\n"
+          #elif _MIPS_SIM == _MIPS_SIM_NABI32
+                             "sub   %6,32\n"
+                             "sw    %5,0(%6)\n"
+                             "sw    %8,8(%6)\n"
+          #else
+                             "dsubu %6,32\n"
+                             "sd    %5,0(%6)\n"
+                             "sd    %8,8(%6)\n"
+          #endif
+
+                             /* $7 = syscall($4 = flags,
+                              *              $5 = child_stack,
+                              *              $6 = parent_tidptr,
+                              *              $7 = newtls,
+                              *              $8 = child_tidptr)
+                              */
+                             "li    $2,%3\n"
+                             "syscall\n"
+
+                             /* if ($7 != 0 || $2 != 0)
+                              *   return $2;
+                              */
+                             "bnez  $7,1f\n"
+                             "bnez  $2,1f\n"
+
+                             /* In the child, now. Call "fn(arg)".
+                              */
+          #if _MIPS_SIM == _MIPS_SIM_ABI32 && _MIPS_SZPTR == 32
+                            "lw    $25,0($29)\n"
+                            "lw    $4,4($29)\n"
+          #elif _MIPS_SIM == _MIPS_SIM_NABI32
+                            "lw    $25,0($29)\n"
+                            "lw    $4,8($29)\n"
+          #else
+                            "ld    $25,0($29)\n"
+                            "ld    $4,8($29)\n"
+          #endif
+                            "jalr  $25\n"
+
+                             /* Call _exit($2)
+                              */
+                            "move  $4,$2\n"
+                            "li    $2,%4\n"
+                            "syscall\n"
+
+                           "1:\n"
+          #if _MIPS_SIM == _MIPS_SIM_ABI32 && _MIPS_SZPTR == 32
+                             "addu  $29, 24\n"
+          #elif _MIPS_SIM == _MIPS_SIM_NABI32
+                             "add   $29, 16\n"
+          #else
+                             "daddu $29,16\n"
+          #endif
+                             : "=&r" (__v0), "=r" (__r7)
+                             : "i"(-EINVAL), "i"(__NR_clone), "i"(__NR_exit),
+                               "r"(fn), "r"(__stack), "r"(__flags), "r"(arg),
+                               "r"(__ptid), "r"(__r7), "r"(__ctid)
+                             : "$9", "$10", "$11", "$12", "$13", "$14", "$15",
+                               "$24", "memory");
+      }
+      LSS_RETURN(int, __v0, __r7);
+    }
+  #elif defined (__PPC__)
+    #undef  LSS_LOADARGS_0
+    #define LSS_LOADARGS_0(name, dummy...)                                    \
+        __sc_0 = __NR_##name
+    #undef  LSS_LOADARGS_1
+    #define LSS_LOADARGS_1(name, arg1)                                        \
+            LSS_LOADARGS_0(name);                                             \
+            __sc_3 = (unsigned long) (arg1)
+    #undef  LSS_LOADARGS_2
+    #define LSS_LOADARGS_2(name, arg1, arg2)                                  \
+            LSS_LOADARGS_1(name, arg1);                                       \
+            __sc_4 = (unsigned long) (arg2)
+    #undef  LSS_LOADARGS_3
+    #define LSS_LOADARGS_3(name, arg1, arg2, arg3)                            \
+            LSS_LOADARGS_2(name, arg1, arg2);                                 \
+            __sc_5 = (unsigned long) (arg3)
+    #undef  LSS_LOADARGS_4
+    #define LSS_LOADARGS_4(name, arg1, arg2, arg3, arg4)                      \
+            LSS_LOADARGS_3(name, arg1, arg2, arg3);                           \
+            __sc_6 = (unsigned long) (arg4)
+    #undef  LSS_LOADARGS_5
+    #define LSS_LOADARGS_5(name, arg1, arg2, arg3, arg4, arg5)                \
+            LSS_LOADARGS_4(name, arg1, arg2, arg3, arg4);                     \
+            __sc_7 = (unsigned long) (arg5)
+    #undef  LSS_LOADARGS_6
+    #define LSS_LOADARGS_6(name, arg1, arg2, arg3, arg4, arg5, arg6)          \
+            LSS_LOADARGS_5(name, arg1, arg2, arg3, arg4, arg5);               \
+            __sc_8 = (unsigned long) (arg6)
+    #undef  LSS_ASMINPUT_0
+    #define LSS_ASMINPUT_0 "0" (__sc_0)
+    #undef  LSS_ASMINPUT_1
+    #define LSS_ASMINPUT_1 LSS_ASMINPUT_0, "1" (__sc_3)
+    #undef  LSS_ASMINPUT_2
+    #define LSS_ASMINPUT_2 LSS_ASMINPUT_1, "2" (__sc_4)
+    #undef  LSS_ASMINPUT_3
+    #define LSS_ASMINPUT_3 LSS_ASMINPUT_2, "3" (__sc_5)
+    #undef  LSS_ASMINPUT_4
+    #define LSS_ASMINPUT_4 LSS_ASMINPUT_3, "4" (__sc_6)
+    #undef  LSS_ASMINPUT_5
+    #define LSS_ASMINPUT_5 LSS_ASMINPUT_4, "5" (__sc_7)
+    #undef  LSS_ASMINPUT_6
+    #define LSS_ASMINPUT_6 LSS_ASMINPUT_5, "6" (__sc_8)
+    #undef  LSS_BODY
+    #define LSS_BODY(nr, type, name, args...)                                 \
+        long __sc_ret, __sc_err;                                              \
+        {                                                                     \
+            register unsigned long __sc_0 __asm__ ("r0");                     \
+            register unsigned long __sc_3 __asm__ ("r3");                     \
+            register unsigned long __sc_4 __asm__ ("r4");                     \
+            register unsigned long __sc_5 __asm__ ("r5");                     \
+            register unsigned long __sc_6 __asm__ ("r6");                     \
+            register unsigned long __sc_7 __asm__ ("r7");                     \
+            register unsigned long __sc_8 __asm__ ("r8");                     \
+                                                                              \
+            LSS_LOADARGS_##nr(name, args);                                    \
+            __asm__ __volatile__                                              \
+                ("sc\n\t"                                                     \
+                 "mfcr %0"                                                    \
+                 : "=&r" (__sc_0),                                            \
+                   "=&r" (__sc_3), "=&r" (__sc_4),                            \
+                   "=&r" (__sc_5), "=&r" (__sc_6),                            \
+                   "=&r" (__sc_7), "=&r" (__sc_8)                             \
+                 : LSS_ASMINPUT_##nr                                          \
+                 : "cr0", "ctr", "memory",                                    \
+                   "r9", "r10", "r11", "r12");                                \
+            __sc_ret = __sc_3;                                                \
+            __sc_err = __sc_0;                                                \
+        }                                                                     \
+        LSS_RETURN(type, __sc_ret, __sc_err)
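+    /* Note on the PowerPC convention relied on above: "sc" returns the
+     * result in r3 and signals failure through the summary-overflow (SO) bit
+     * of cr0; "mfcr" copies the condition register into __sc_0 so that the
+     * three-argument LSS_RETURN (defined earlier for PPC) can test that bit.
+     */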
+    #undef _syscall0
+    #define _syscall0(type, name)                                             \
+       type LSS_NAME(name)(void) {                                            \
+          LSS_BODY(0, type, name);                                            \
+       }
+    #undef _syscall1
+    #define _syscall1(type, name, type1, arg1)                                \
+       type LSS_NAME(name)(type1 arg1) {                                      \
+          LSS_BODY(1, type, name, arg1);                                      \
+       }
+    #undef _syscall2
+    #define _syscall2(type, name, type1, arg1, type2, arg2)                   \
+       type LSS_NAME(name)(type1 arg1, type2 arg2) {                          \
+          LSS_BODY(2, type, name, arg1, arg2);                                \
+       }
+    #undef _syscall3
+    #define _syscall3(type, name, type1, arg1, type2, arg2, type3, arg3)      \
+       type LSS_NAME(name)(type1 arg1, type2 arg2, type3 arg3) {              \
+          LSS_BODY(3, type, name, arg1, arg2, arg3);                          \
+       }
+    #undef _syscall4
+    #define _syscall4(type, name, type1, arg1, type2, arg2, type3, arg3,      \
+                                  type4, arg4)                                \
+       type LSS_NAME(name)(type1 arg1, type2 arg2, type3 arg3, type4 arg4) {  \
+          LSS_BODY(4, type, name, arg1, arg2, arg3, arg4);                    \
+       }
+    #undef _syscall5
+    #define _syscall5(type, name, type1, arg1, type2, arg2, type3, arg3,      \
+                                  type4, arg4, type5, arg5)                   \
+       type LSS_NAME(name)(type1 arg1, type2 arg2, type3 arg3, type4 arg4,    \
+                                               type5 arg5) {                  \
+          LSS_BODY(5, type, name, arg1, arg2, arg3, arg4, arg5);              \
+       }
+    #undef _syscall6
+    #define _syscall6(type, name, type1, arg1, type2, arg2, type3, arg3,      \
+                                  type4, arg4, type5, arg5, type6, arg6)      \
+       type LSS_NAME(name)(type1 arg1, type2 arg2, type3 arg3, type4 arg4,    \
+                                               type5 arg5, type6 arg6) {      \
+          LSS_BODY(6, type, name, arg1, arg2, arg3, arg4, arg5, arg6);        \
+       }
+    /* clone function adapted from glibc 2.18 clone.S                       */
+    LSS_INLINE int LSS_NAME(clone)(int (*fn)(void *), void *child_stack,
+                                   int flags, void *arg, int *parent_tidptr,
+                                   void *newtls, int *child_tidptr) {
+      long __ret, __err;
+      {
+#if defined(__PPC64__)
+
+/* Stack frame offsets.  */
+#if _CALL_ELF != 2
+#define FRAME_MIN_SIZE         112
+#define FRAME_TOC_SAVE         40
+#else
+#define FRAME_MIN_SIZE         32
+#define FRAME_TOC_SAVE         24
+#endif
+
+
+        register int (*__fn)(void *) __asm__ ("r3") = fn;
+        register void *__cstack      __asm__ ("r4") = child_stack;
+        register int __flags         __asm__ ("r5") = flags;
+        register void * __arg        __asm__ ("r6") = arg;
+        register int * __ptidptr     __asm__ ("r7") = parent_tidptr;
+        register void * __newtls     __asm__ ("r8") = newtls;
+        register int * __ctidptr     __asm__ ("r9") = child_tidptr;
+        __asm__ __volatile__(
+            /* check for fn == NULL
+             * and child_stack == NULL
+             */
+            "cmpdi cr0, %6, 0\n\t"
+            "cmpdi cr1, %7, 0\n\t"
+            "cror  cr0*4+eq, cr1*4+eq, cr0*4+eq\n\t"
+            "beq-  cr0, 1f\n\t"
+
+            /* set up stack frame for child                                  */
+            "clrrdi %7, %7, 4\n\t"
+            "li     0, 0\n\t"
+            "stdu   0, -%13(%7)\n\t"
+
+            /* fn, arg, child_stack are saved across the syscall              */
+            "mr 28, %6\n\t"
+            "mr 29, %7\n\t"
+            "mr 27, %9\n\t"
+
+            /* syscall
+               r3 == flags
+               r4 == child_stack
+               r5 == parent_tidptr
+               r6 == newtls
+               r7 == child_tidptr                                            */
+            "mr 3, %8\n\t"
+            "mr 5, %10\n\t"
+            "mr 6, %11\n\t"
+            "mr 7, %12\n\t"
+            "li     0, %4\n\t"
+            "sc\n\t"
+
+            /* Test if syscall was successful                                */
+            "cmpdi  cr1, 3, 0\n\t"
+            "crandc cr1*4+eq, cr1*4+eq, cr0*4+so\n\t"
+            "bne-   cr1, 1f\n\t"
+
+            /* Do the function call                                          */
+            "std   2, %14(1)\n\t"
+#if _CALL_ELF != 2
+            "ld    0, 0(28)\n\t"
+            "ld    2, 8(28)\n\t"
+            "mtctr 0\n\t"
+#else
+            "mr    12, 28\n\t"
+            "mtctr 12\n\t"
+#endif
+            "mr    3, 27\n\t"
+            "bctrl\n\t"
+            "ld    2, %14(1)\n\t"
+
+            /* Call _exit(r3)                                                */
+            "li 0, %5\n\t"
+            "sc\n\t"
+
+            /* Return to parent                                              */
+            "1:\n\t"
+            "mr %0, 3\n\t"
+              : "=r" (__ret), "=r" (__err)
+              : "0" (-1), "i" (EINVAL),
+                "i" (__NR_clone), "i" (__NR_exit),
+                "r" (__fn), "r" (__cstack), "r" (__flags),
+                "r" (__arg), "r" (__ptidptr), "r" (__newtls),
+                "r" (__ctidptr), "i" (FRAME_MIN_SIZE), "i" (FRAME_TOC_SAVE)
+              : "cr0", "cr1", "memory", "ctr",
+                "r0", "r29", "r27", "r28");
+#else
+        register int (*__fn)(void *)    __asm__ ("r8")  = fn;
+        register void *__cstack                 __asm__ ("r4")  = child_stack;
+        register int __flags                    __asm__ ("r3")  = flags;
+        register void * __arg                   __asm__ ("r9")  = arg;
+        register int * __ptidptr                __asm__ ("r5")  = parent_tidptr;
+        register void * __newtls                __asm__ ("r6")  = newtls;
+        register int * __ctidptr                __asm__ ("r7")  = child_tidptr;
+        __asm__ __volatile__(
+            /* check for fn == NULL
+             * and child_stack == NULL
+             */
+            "cmpwi cr0, %6, 0\n\t"
+            "cmpwi cr1, %7, 0\n\t"
+            "cror cr0*4+eq, cr1*4+eq, cr0*4+eq\n\t"
+            "beq- cr0, 1f\n\t"
+
+            /* set up stack frame for child                                  */
+            "clrrwi %7, %7, 4\n\t"
+            "li 0, 0\n\t"
+            "stwu 0, -16(%7)\n\t"
+
+            /* fn, arg, child_stack are saved across the syscall: r27-r29    */
+            "mr 28, %6\n\t"
+            "mr 29, %7\n\t"
+            "mr 27, %9\n\t"
+
+            /* syscall                                                       */
+            "li 0, %4\n\t"
+            /* flags already in r3
+             * child_stack already in r4
+             * ptidptr already in r5
+             * newtls already in r6
+             * ctidptr already in r7
+             */
+            "sc\n\t"
+
+            /* Test if syscall was successful                                */
+            "cmpwi cr1, 3, 0\n\t"
+            "crandc cr1*4+eq, cr1*4+eq, cr0*4+so\n\t"
+            "bne- cr1, 1f\n\t"
+
+            /* Do the function call                                          */
+            "mtctr 28\n\t"
+            "mr 3, 27\n\t"
+            "bctrl\n\t"
+
+            /* Call _exit(r3)                                                */
+            "li 0, %5\n\t"
+            "sc\n\t"
+
+            /* Return to parent                                              */
+            "1:\n"
+            "mfcr %1\n\t"
+            "mr %0, 3\n\t"
+              : "=r" (__ret), "=r" (__err)
+              : "0" (-1), "1" (EINVAL),
+                "i" (__NR_clone), "i" (__NR_exit),
+                "r" (__fn), "r" (__cstack), "r" (__flags),
+                "r" (__arg), "r" (__ptidptr), "r" (__newtls),
+                "r" (__ctidptr)
+              : "cr0", "cr1", "memory", "ctr",
+                "r0", "r29", "r27", "r28");
+
+#endif
+      }
+      LSS_RETURN(int, __ret, __err);
+    }
+  #elif defined(__aarch64__)
+    #undef LSS_REG
+    #define LSS_REG(r,a) register long __x##r __asm__("x"#r) = (long)a
+    #undef  LSS_BODY
+    #define LSS_BODY(type,name,args...)                                       \
+          register long __res_x0 __asm__("x0");                               \
+          long __res;                                                         \
+          __asm__ __volatile__ ("mov x8, %1\n"                                \
+                                "svc 0x0\n"                                   \
+                                : "=r"(__res_x0)                              \
+                                : "i"(__NR_##name) , ## args                  \
+                                : "memory");                                  \
+          __res = __res_x0;                                                   \
+          LSS_RETURN(type, __res)
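+    /* The AArch64 convention relied on above: the syscall number goes in x8,
+     * the arguments in x0..x5, and "svc 0" returns the result (or a negative
+     * errno value) in x0.
+     */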
+    #undef _syscall0
+    #define _syscall0(type, name)                                             \
+      type LSS_NAME(name)(void) {                                             \
+        LSS_BODY(type, name);                                                 \
+      }
+    #undef _syscall1
+    #define _syscall1(type, name, type1, arg1)                                \
+      type LSS_NAME(name)(type1 arg1) {                                       \
+        LSS_REG(0, arg1); LSS_BODY(type, name, "r"(__x0));                    \
+      }
+    #undef _syscall2
+    #define _syscall2(type, name, type1, arg1, type2, arg2)                   \
+      type LSS_NAME(name)(type1 arg1, type2 arg2) {                           \
+        LSS_REG(0, arg1); LSS_REG(1, arg2);                                   \
+        LSS_BODY(type, name, "r"(__x0), "r"(__x1));                           \
+      }
+    #undef _syscall3
+    #define _syscall3(type, name, type1, arg1, type2, arg2, type3, arg3)      \
+      type LSS_NAME(name)(type1 arg1, type2 arg2, type3 arg3) {               \
+        LSS_REG(0, arg1); LSS_REG(1, arg2); LSS_REG(2, arg3);                 \
+        LSS_BODY(type, name, "r"(__x0), "r"(__x1), "r"(__x2));                \
+      }
+    #undef _syscall4
+    #define _syscall4(type,name,type1,arg1,type2,arg2,type3,arg3,type4,arg4)  \
+      type LSS_NAME(name)(type1 arg1, type2 arg2, type3 arg3, type4 arg4) {   \
+        LSS_REG(0, arg1); LSS_REG(1, arg2); LSS_REG(2, arg3);                 \
+        LSS_REG(3, arg4);                                                     \
+        LSS_BODY(type, name, "r"(__x0), "r"(__x1), "r"(__x2), "r"(__x3));     \
+      }
+    #undef _syscall5
+    #define _syscall5(type,name,type1,arg1,type2,arg2,type3,arg3,type4,arg4,  \
+                      type5,arg5)                                             \
+      type LSS_NAME(name)(type1 arg1, type2 arg2, type3 arg3, type4 arg4,     \
+                          type5 arg5) {                                       \
+        LSS_REG(0, arg1); LSS_REG(1, arg2); LSS_REG(2, arg3);                 \
+        LSS_REG(3, arg4); LSS_REG(4, arg5);                                   \
+        LSS_BODY(type, name, "r"(__x0), "r"(__x1), "r"(__x2), "r"(__x3),      \
+                             "r"(__x4));                                      \
+      }
+    #undef _syscall6
+    #define _syscall6(type,name,type1,arg1,type2,arg2,type3,arg3,type4,arg4,  \
+                      type5,arg5,type6,arg6)                                  \
+      type LSS_NAME(name)(type1 arg1, type2 arg2, type3 arg3, type4 arg4,     \
+                          type5 arg5, type6 arg6) {                           \
+        LSS_REG(0, arg1); LSS_REG(1, arg2); LSS_REG(2, arg3);                 \
+        LSS_REG(3, arg4); LSS_REG(4, arg5); LSS_REG(5, arg6);                 \
+        LSS_BODY(type, name, "r"(__x0), "r"(__x1), "x"(__x2), "r"(__x3),      \
+                             "r"(__x4), "r"(__x5));                           \
+      }
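+    /* Illustrative sketch (not compiled): with the sys_ prefix used
+     * elsewhere in this tree, the declaration
+     *
+     *   LSS_INLINE _syscall1(int, close, int, f)
+     *
+     * expands to roughly the following function:
+     *
+     *   int sys_close(int f) {
+     *     register long __x0 __asm__("x0") = (long)f;   // LSS_REG(0, f)
+     *     register long __res_x0 __asm__("x0");         // LSS_BODY(...)
+     *     long __res;
+     *     __asm__ __volatile__("mov x8, %1\n"
+     *                          "svc 0x0\n"
+     *                          : "=r"(__res_x0)
+     *                          : "i"(__NR_close), "r"(__x0)
+     *                          : "memory");
+     *     __res = __res_x0;
+     *     LSS_RETURN(int, __res);
+     *   }
+     */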
+    /* clone function adapted from glibc 2.18 clone.S                       */
+    LSS_INLINE int LSS_NAME(clone)(int (*fn)(void *), void *child_stack,
+                                   int flags, void *arg, int *parent_tidptr,
+                                   void *newtls, int *child_tidptr) {
+      long __res;
+      {
+        register int (*__fn)(void *)  __asm__("x0") = fn;
+        register void *__stack __asm__("x1") = child_stack;
+        register int   __flags __asm__("x2") = flags;
+        register void *__arg   __asm__("x3") = arg;
+        register int  *__ptid  __asm__("x4") = parent_tidptr;
+        register void *__tls   __asm__("x5") = newtls;
+        register int  *__ctid  __asm__("x6") = child_tidptr;
+        __asm__ __volatile__(/* if (fn == NULL || child_stack == NULL)
+                              *   return -EINVAL;
+                              */
+                             "cbz     x0,1f\n"
+                             "cbz     x1,1f\n"
+
+                             /* Push "arg" and "fn" onto the stack that will be
+                              * used by the child.
+                              */
+                             "stp x0,x3, [x1, #-16]!\n"
+
+                             "mov x0,x2\n" /* flags  */
+                             "mov x2,x4\n" /* ptid  */
+                             "mov x3,x5\n" /* tls */
+                             "mov x4,x6\n" /* ctid */
+                             "mov x8,%9\n" /* clone */
+
+                             "svc 0x0\n"
+
+                             /* if (%r0 != 0)
+                              *   return %r0;
+                              */
+                             "cmp x0, #0\n"
+                             "bne 2f\n"
+
+                             /* In the child, now. Call "fn(arg)".
+                              */
+                             "ldp x1, x0, [sp], #16\n"
+                             "blr x1\n"
+
+                             /* Call _exit(%r0).
+                              */
+                             "mov x8, %10\n"
+                             "svc 0x0\n"
+                           "1:\n"
+                             "mov x8, %1\n"
+                           "2:\n"
+                             : "=r" (__res)
+                             : "i"(-EINVAL),
+                               "r"(__fn), "r"(__stack), "r"(__flags), "r"(__arg),
+                               "r"(__ptid), "r"(__tls), "r"(__ctid),
+                               "i"(__NR_clone), "i"(__NR_exit)
+                             : "x30", "memory");
+      }
+      LSS_RETURN(int, __res);
+    }
+  #endif
+  #define __NR__exit   __NR_exit
+  #define __NR__gettid __NR_gettid
+  #define __NR__mremap __NR_mremap
+  LSS_INLINE _syscall1(int,     close,           int,         f)
+  LSS_INLINE _syscall1(int,     _exit,           int,         e)
+  LSS_INLINE _syscall3(int,     fcntl,           int,         f,
+                       int,            c, long,   a)
+  LSS_INLINE _syscall2(int,     fstat,           int,         f,
+                      struct kernel_stat*,   b)
+  LSS_INLINE _syscall6(int,     futex,           int*,        a,
+                       int,            o, int,    v,
+                      struct kernel_timespec*, t,
+                       int*, a2,
+                       int, v3)
+#ifdef __NR_getdents64
+    LSS_INLINE _syscall3(int,     getdents64,      int,         f,
+                         struct kernel_dirent64*, d, int,    c)
+#define KERNEL_DIRENT kernel_dirent64
+#define GETDENTS sys_getdents64
+#else
+    LSS_INLINE _syscall3(int,     getdents,        int,         f,
+                         struct kernel_dirent*, d, int,    c)
+#define KERNEL_DIRENT kernel_dirent
+#define GETDENTS sys_getdents
+#endif
+  LSS_INLINE _syscall0(pid_t,   getpid)
+  LSS_INLINE _syscall0(pid_t,   getppid)
+  LSS_INLINE _syscall0(pid_t,   _gettid)
+  LSS_INLINE _syscall2(int,     kill,            pid_t,       p,
+                       int,            s)
+  #if defined(__x86_64__)
+    /* Need to make sure off_t isn't truncated to 32-bits under x32.  */
+    LSS_INLINE off_t LSS_NAME(lseek)(int f, off_t o, int w) {
+      _LSS_BODY(3, off_t, lseek, off_t, LSS_SYSCALL_ARG(f), (uint64_t)(o),
+                                        LSS_SYSCALL_ARG(w));
+    }
+  #else
+    LSS_INLINE _syscall3(off_t,   lseek,           int,         f,
+                         off_t,          o, int,    w)
+  #endif
+  LSS_INLINE _syscall2(int,     munmap,          void*,       s,
+                       size_t,         l)
+  LSS_INLINE _syscall5(void*,   _mremap,         void*,       o,
+                       size_t,         os,       size_t,      ns,
+                       unsigned long,  f, void *, a)
+  LSS_INLINE _syscall2(int,     prctl,           int,         o,
+                       long,           a)
+  LSS_INLINE _syscall4(long,    ptrace,          int,         r,
+                       pid_t,          p, void *, a, void *, d)
+  LSS_INLINE _syscall3(ssize_t, read,            int,         f,
+                       void *,         b, size_t, c)
+  LSS_INLINE _syscall4(int,     rt_sigaction,    int,         s,
+                       const struct kernel_sigaction*, a,
+                       struct kernel_sigaction*, o, size_t,   c)
+  LSS_INLINE _syscall4(int, rt_sigprocmask,      int,         h,
+                       const struct kernel_sigset_t*,  s,
+                       struct kernel_sigset_t*,        o, size_t, c);
+  LSS_INLINE _syscall0(int,     sched_yield)
+  LSS_INLINE _syscall2(int,     sigaltstack,     const stack_t*, s,
+                       const stack_t*, o)
+  #if defined(__NR_fstatat)
+    LSS_INLINE _syscall4(int, fstatat, int, d, const char *, p,
+                         struct kernel_stat*,   b, int, flags)
+    LSS_INLINE int LSS_NAME(stat)(const char* p, struct kernel_stat* b) {
+      return LSS_NAME(fstatat)(AT_FDCWD, p, b, 0);
+    }
+  #else
+    LSS_INLINE _syscall2(int,     stat,            const char*, f,
+                         struct kernel_stat*,   b)
+  #endif
+  LSS_INLINE _syscall3(ssize_t, write,            int,        f,
+                       const void *,   b, size_t, c)
+  #if defined(__NR_getcpu)
+    LSS_INLINE _syscall3(long, getcpu, unsigned *, cpu,
+                         unsigned *, node, void *, unused);
+  #endif
+  #if defined(__x86_64__) || defined(__aarch64__) || \
+     (defined(__mips__) && _MIPS_SIM != _MIPS_SIM_ABI32)
+    LSS_INLINE _syscall3(int, socket,             int,   d,
+                         int,                     t, int,       p)
+  #endif
+  #if defined(__x86_64__)
+    /* Need to make sure __off64_t isn't truncated to 32-bits under x32.  */
+    LSS_INLINE void* LSS_NAME(mmap)(void *s, size_t l, int p, int f, int d,
+                                    __off64_t o) {
+      LSS_BODY(6, void*, mmap, LSS_SYSCALL_ARG(s), LSS_SYSCALL_ARG(l),
+                               LSS_SYSCALL_ARG(p), LSS_SYSCALL_ARG(f),
+                               LSS_SYSCALL_ARG(d), (uint64_t)(o));
+    }
+
+    LSS_INLINE int LSS_NAME(sigaction)(int signum,
+                                       const struct kernel_sigaction *act,
+                                       struct kernel_sigaction *oldact) {
+      /* On x86_64, the kernel requires us to always set our own
+       * SA_RESTORER in order to be able to return from a signal handler.
+       * This function must have a "magic" signature that the "gdb"
+       * (and maybe the kernel?) can recognize.
+       */
+      if (act != NULL && !(act->sa_flags & SA_RESTORER)) {
+        struct kernel_sigaction a = *act;
+        a.sa_flags   |= SA_RESTORER;
+        a.sa_restorer = LSS_NAME(restore_rt)();
+        return LSS_NAME(rt_sigaction)(signum, &a, oldact,
+                                      (KERNEL_NSIG+7)/8);
+      } else {
+        return LSS_NAME(rt_sigaction)(signum, act, oldact,
+                                      (KERNEL_NSIG+7)/8);
+      }
+    }
+
+    LSS_INLINE int LSS_NAME(sigprocmask)(int how,
+                                         const struct kernel_sigset_t *set,
+                                         struct kernel_sigset_t *oldset) {
+      return LSS_NAME(rt_sigprocmask)(how, set, oldset, (KERNEL_NSIG+7)/8);
+    }
+  #endif
+  #if (defined(__aarch64__)) || \
+      (defined(__mips__) && (_MIPS_ISA == _MIPS_ISA_MIPS64))
+    LSS_INLINE _syscall6(void*, mmap,              void*, s,
+                         size_t,                   l, int,               p,
+                         int,                      f, int,               d,
+                         __off64_t,                o)
+    LSS_INLINE int LSS_NAME(sigaction)(int signum,
+                                       const struct kernel_sigaction *act,
+                                       struct kernel_sigaction *oldact) {
+        return LSS_NAME(rt_sigaction)(signum, act, oldact, (KERNEL_NSIG+7)/8);
+
+    }
+    LSS_INLINE int LSS_NAME(sigprocmask)(int how,
+                                         const struct kernel_sigset_t *set,
+                                         struct kernel_sigset_t *oldset) {
+      return LSS_NAME(rt_sigprocmask)(how, set, oldset, (KERNEL_NSIG+7)/8);
+    }
+  #endif
+  #ifdef __NR_wait4
+    LSS_INLINE _syscall4(pid_t, wait4,            pid_t, p,
+                         int*,                    s, int,       o,
+                         struct kernel_rusage*,   r)
+    LSS_INLINE pid_t LSS_NAME(waitpid)(pid_t pid, int *status, int options){
+      return LSS_NAME(wait4)(pid, status, options, 0);
+    }
+  #else
+    LSS_INLINE _syscall3(pid_t, waitpid,          pid_t, p,
+                         int*,              s,    int,   o)
+  #endif
+  #ifdef __NR_openat
+    LSS_INLINE _syscall4(int, openat, int, d, const char *, p, int, f, int, m)
+    LSS_INLINE int LSS_NAME(open)(const char* p, int f, int m) {
+      return LSS_NAME(openat)(AT_FDCWD, p, f, m);
+    }
+  #else
+  LSS_INLINE _syscall3(int,     open,            const char*, p,
+                       int,            f, int,    m)
+  #endif
+  LSS_INLINE int LSS_NAME(sigemptyset)(struct kernel_sigset_t *set) {
+    memset(&set->sig, 0, sizeof(set->sig));
+    return 0;
+  }
+
+  LSS_INLINE int LSS_NAME(sigfillset)(struct kernel_sigset_t *set) {
+    memset(&set->sig, -1, sizeof(set->sig));
+    return 0;
+  }
+
+  LSS_INLINE int LSS_NAME(sigaddset)(struct kernel_sigset_t *set,
+                                     int signum) {
+    if (signum < 1 || signum > (int)(8*sizeof(set->sig))) {
+      LSS_ERRNO = EINVAL;
+      return -1;
+    } else {
+      set->sig[(signum - 1)/(8*sizeof(set->sig[0]))]
+          |= 1UL << ((signum - 1) % (8*sizeof(set->sig[0])));
+      return 0;
+    }
+  }
+
+  LSS_INLINE int LSS_NAME(sigdelset)(struct kernel_sigset_t *set,
+                                        int signum) {
+    if (signum < 1 || signum > (int)(8*sizeof(set->sig))) {
+      LSS_ERRNO = EINVAL;
+      return -1;
+    } else {
+      set->sig[(signum - 1)/(8*sizeof(set->sig[0]))]
+          &= ~(1UL << ((signum - 1) % (8*sizeof(set->sig[0]))));
+      return 0;
+    }
+  }
+
+  #if defined(__i386__) || \
+      defined(__arm__) || \
+     (defined(__mips__) && _MIPS_SIM == _MIPS_SIM_ABI32) || defined(__PPC__)
+    #define __NR__sigaction   __NR_sigaction
+    #define __NR__sigprocmask __NR_sigprocmask
+    LSS_INLINE _syscall2(int, fstat64,             int, f,
+                         struct kernel_stat64 *, b)
+    LSS_INLINE _syscall5(int, _llseek,     uint, fd, ulong, hi, ulong, lo,
+                         loff_t *, res, uint, wh)
+#ifdef __PPC64__
+    LSS_INLINE _syscall6(void*, mmap,              void*, s,
+                         size_t,                   l, int,               p,
+                         int,                      f, int,               d,
+                         off_t,                    o)
+#else
+    #ifndef __ARM_EABI__
+    /* Not available on ARM EABI Linux.  */
+    LSS_INLINE _syscall1(void*, mmap,              void*, a)
+    #endif
+    LSS_INLINE _syscall6(void*, mmap2,             void*, s,
+                         size_t,                   l, int,               p,
+                         int,                      f, int,               d,
+                         off_t,                    o)
+#endif
+    LSS_INLINE _syscall3(int,   _sigaction,        int,   s,
+                         const struct kernel_old_sigaction*,  a,
+                         struct kernel_old_sigaction*,        o)
+    LSS_INLINE _syscall3(int,   _sigprocmask,      int,   h,
+                         const unsigned long*,     s,
+                         unsigned long*,           o)
+    LSS_INLINE _syscall2(int, stat64,              const char *, p,
+                         struct kernel_stat64 *, b)
+
+    LSS_INLINE int LSS_NAME(sigaction)(int signum,
+                                       const struct kernel_sigaction *act,
+                                       struct kernel_sigaction *oldact) {
+      int old_errno = LSS_ERRNO;
+      int rc;
+      struct kernel_sigaction a;
+      if (act != NULL) {
+        a             = *act;
+        #ifdef __i386__
+        /* On i386, the kernel requires us to always set our own
+         * SA_RESTORER when using realtime signals. Otherwise, it does not
+         * know how to return from a signal handler. This function must have
+         * a "magic" signature that the "gdb" (and maybe the kernel?) can
+         * recognize.
+         * Apparently, a SA_RESTORER is implicitly set by the kernel, when
+         * using non-realtime signals.
+         *
+         * TODO: Test whether ARM needs a restorer
+         */
+        if (!(a.sa_flags & SA_RESTORER)) {
+          a.sa_flags   |= SA_RESTORER;
+          a.sa_restorer = (a.sa_flags & SA_SIGINFO)
+                          ? LSS_NAME(restore_rt)() : LSS_NAME(restore)();
+        }
+        #endif
+      }
+      rc = LSS_NAME(rt_sigaction)(signum, act ? &a : act, oldact,
+                                  (KERNEL_NSIG+7)/8);
+      if (rc < 0 && LSS_ERRNO == ENOSYS) {
+        struct kernel_old_sigaction oa, ooa, *ptr_a = &oa, *ptr_oa = &ooa;
+        if (!act) {
+          ptr_a            = NULL;
+        } else {
+          oa.sa_handler_   = act->sa_handler_;
+          memcpy(&oa.sa_mask, &act->sa_mask, sizeof(oa.sa_mask));
+          #ifndef __mips__
+          oa.sa_restorer   = act->sa_restorer;
+          #endif
+          oa.sa_flags      = act->sa_flags;
+        }
+        if (!oldact) {
+          ptr_oa           = NULL;
+        }
+        LSS_ERRNO = old_errno;
+        rc = LSS_NAME(_sigaction)(signum, ptr_a, ptr_oa);
+        if (rc == 0 && oldact) {
+          if (act) {
+            memcpy(oldact, act, sizeof(*act));
+          } else {
+            memset(oldact, 0, sizeof(*oldact));
+          }
+          oldact->sa_handler_    = ptr_oa->sa_handler_;
+          oldact->sa_flags       = ptr_oa->sa_flags;
+          memcpy(&oldact->sa_mask, &ptr_oa->sa_mask, sizeof(ptr_oa->sa_mask));
+          #ifndef __mips__
+          oldact->sa_restorer    = ptr_oa->sa_restorer;
+          #endif
+        }
+      }
+      return rc;
+    }
+
+    LSS_INLINE int LSS_NAME(sigprocmask)(int how,
+                                         const struct kernel_sigset_t *set,
+                                         struct kernel_sigset_t *oldset) {
+      int olderrno = LSS_ERRNO;
+      int rc = LSS_NAME(rt_sigprocmask)(how, set, oldset, (KERNEL_NSIG+7)/8);
+      if (rc < 0 && LSS_ERRNO == ENOSYS) {
+        LSS_ERRNO = olderrno;
+        if (oldset) {
+          LSS_NAME(sigemptyset)(oldset);
+        }
+        rc = LSS_NAME(_sigprocmask)(how,
+                                    set ? &set->sig[0] : NULL,
+                                    oldset ? &oldset->sig[0] : NULL);
+      }
+      return rc;
+    }
+  #endif
+  #if defined(__i386__) || \
+      defined(__PPC__) || \
+      (defined(__arm__) && !defined(__ARM_EABI__)) || \
+      (defined(__mips__) && _MIPS_SIM == _MIPS_SIM_ABI32)
+
+    /* See sys_socketcall in net/socket.c in kernel source.
+     * It de-multiplexes on its first arg and unpacks the arglist
+     * array in its second arg.
+     */
+    LSS_INLINE _syscall2(long, socketcall, int, c, unsigned long*, a)
+
+    LSS_INLINE int LSS_NAME(socket)(int domain, int type, int protocol) {
+      unsigned long args[3] = {
+        (unsigned long) domain,
+        (unsigned long) type,
+        (unsigned long) protocol
+      };
+      return LSS_NAME(socketcall)(1, args);
+    }
+  #elif defined(__ARM_EABI__)
+    LSS_INLINE _syscall3(int, socket,             int,   d,
+                         int,                     t, int,       p)
+  #endif
+  #if defined(__mips__)
+    /* sys_pipe() on MIPS has non-standard calling conventions, as it returns
+     * both file handles through CPU registers.
+     */
+    LSS_INLINE int LSS_NAME(pipe)(int *p) {
+      register unsigned long __v0 __asm__("$2") = __NR_pipe;
+      register unsigned long __v1 __asm__("$3");
+      register unsigned long __r7 __asm__("$7");
+      __asm__ __volatile__ ("syscall\n"
+                            : "=&r"(__v0), "=&r"(__v1), "+r" (__r7)
+                            : "0"(__v0)
+                            : "$8", "$9", "$10", "$11", "$12",
+                              "$13", "$14", "$15", "$24", "memory");
+      if (__r7) {
+        LSS_ERRNO = __v0;
+        return -1;
+      } else {
+        p[0] = __v0;
+        p[1] = __v1;
+        return 0;
+      }
+    }
+  #elif defined(__NR_pipe2)
+    LSS_INLINE _syscall2(int,     pipe2,          int *, p,
+                         int,     f                        )
+    LSS_INLINE int LSS_NAME(pipe)(int *p) {
+      return LSS_NAME(pipe2)(p, 0);
+    }
+  #else
+    LSS_INLINE _syscall1(int,     pipe,           int *, p)
+  #endif
+
+  LSS_INLINE pid_t LSS_NAME(gettid)() {
+    pid_t tid = LSS_NAME(_gettid)();
+    if (tid != -1) {
+      return tid;
+    }
+    return LSS_NAME(getpid)();
+  }
+
+  LSS_INLINE void *LSS_NAME(mremap)(void *old_address, size_t old_size,
+                                    size_t new_size, int flags, ...) {
+    va_list ap;
+    void *new_address, *rc;
+    va_start(ap, flags);
+    new_address = va_arg(ap, void *);
+    rc = LSS_NAME(_mremap)(old_address, old_size, new_size,
+                           flags, new_address);
+    va_end(ap);
+    return rc;
+  }
+
+  LSS_INLINE int LSS_NAME(ptrace_detach)(pid_t pid) {
+    /* PTRACE_DETACH can sometimes forget to wake up the tracee and it
+     * then sends job control signals to the real parent, rather than to
+     * the tracer. We reduce the risk of this happening by starting a
+     * whole new time slice, and then quickly sending a SIGCONT signal
+     * right after detaching from the tracee.
+     */
+    int rc, err;
+    LSS_NAME(sched_yield)();
+    rc = LSS_NAME(ptrace)(PTRACE_DETACH, pid, (void *)0, (void *)0);
+    err = LSS_ERRNO;
+    LSS_NAME(kill)(pid, SIGCONT);
+    LSS_ERRNO = err;
+    return rc;
+  }
+#endif
+
+#if defined(__cplusplus) && !defined(SYS_CPLUSPLUS)
+}
+#endif
+
+#endif
+#endif
diff --git a/src/base/linuxthreads.cc b/src/base/linuxthreads.cc
new file mode 100644
index 0000000..891e70c
--- /dev/null
+++ b/src/base/linuxthreads.cc
@@ -0,0 +1,707 @@
+// -*- Mode: C++; c-basic-offset: 2; indent-tabs-mode: nil -*-
+/* Copyright (c) 2005-2007, Google Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are
+ * met:
+ *
+ *     * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following disclaimer
+ * in the documentation and/or other materials provided with the
+ * distribution.
+ *     * Neither the name of Google Inc. nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * ---
+ * Author: Markus Gutschke
+ */
+
+#include "base/linuxthreads.h"
+
+#ifdef THREADS
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include <sched.h>
+#include <signal.h>
+#include <stdlib.h>
+#include <string.h>
+#include <fcntl.h>
+#include <sys/socket.h>
+#include <sys/wait.h>
+#include <sys/prctl.h>
+#include <semaphore.h>
+
+#include "base/linux_syscall_support.h"
+#include "base/thread_lister.h"
+
+#ifndef CLONE_UNTRACED
+#define CLONE_UNTRACED 0x00800000
+#endif
+
+
+/* Synchronous signals that should not be blocked while in the lister thread.
+ */
+static const int sync_signals[]  = { SIGABRT, SIGILL, SIGFPE, SIGSEGV, SIGBUS,
+                                     SIGXCPU, SIGXFSZ };
+
+/* itoa() is not a standard function, and we cannot safely call printf()
+ * after suspending threads. So, we just implement our own copy. A
+ * recursive approach is the easiest here.
+ */
+static char *local_itoa(char *buf, int i) {
+  if (i < 0) {
+    *buf++ = '-';
+    return local_itoa(buf, -i);
+  } else {
+    if (i >= 10)
+      buf = local_itoa(buf, i/10);
+    *buf++ = (i%10) + '0';
+    *buf   = '\000';
+    return buf;
+  }
+}
+
+
+/* Wrapper around clone() that runs "fn" on the same stack as the
+ * caller! Unlike fork(), the cloned thread shares the same address space.
+ * The caller must be careful to use only minimal amounts of stack until
+ * the cloned thread has returned.
+ * There is a good chance that the cloned thread and the caller will share
+ * the same copy of errno!
+ */
+#ifdef __GNUC__
+#if __GNUC__ == 3 && __GNUC_MINOR__ >= 1 || __GNUC__ > 3
+/* Try to force this function into a separate stack frame, and make sure
+ * that arguments are passed on the stack.
+ */
+static int local_clone (int (*fn)(void *), void *arg, ...)
+  __attribute__ ((noinline));
+#endif
+#endif
+
+/* To keep the stack gap from crossing page boundaries, increase it to the
+ * large page size that PowerPC systems typically use.  */
+#ifdef __PPC64__
+#define CLONE_STACK_SIZE 65536
+#else
+#define CLONE_STACK_SIZE 4096
+#endif
+
+static int local_clone (int (*fn)(void *), void *arg, ...) {
+  /* Leave a gap of CLONE_STACK_SIZE bytes between the caller's stack and
+   * the new clone. This
+   * should be more than sufficient for the caller to call waitpid() until
+   * the cloned thread terminates.
+   *
+   * It is important that we set the CLONE_UNTRACED flag, because newer
+   * versions of "gdb" otherwise attempt to attach to our thread, and will
+   * attempt to reap its status codes. This subsequently results in the
+   * caller hanging indefinitely in waitpid(), waiting for a change in
+   * status that will never happen. By setting the CLONE_UNTRACED flag, we
+   * prevent "gdb" from stealing events, but we still expect the thread
+   * lister to fail, because it cannot PTRACE_ATTACH to the process that
+   * is being debugged. This is OK and the error code will be reported
+   * correctly.
+   */
+  return sys_clone(fn, (char *)&arg - CLONE_STACK_SIZE,
+                   CLONE_VM|CLONE_FS|CLONE_FILES|CLONE_UNTRACED, arg, 0, 0, 0);
+}
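+
+/* Usage sketch: this mirrors the call made in TCMalloc_ListAllProcessThreads()
+ * below; local_clone() runs ListerThread(&args) in a new thread sharing the
+ * caller's address space, and the caller then waits for it to terminate:
+ *
+ *   pid_t clone_pid = local_clone((int (*)(void *))ListerThread, &args);
+ *   while (sys0_waitpid(clone_pid, &status, __WALL) < 0 && ERRNO == EINTR) { }
+ */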
+
+
+/* Local substitute for the atoi() function, which is not necessarily safe
+ * to call once threads are suspended (depending on whether libc's atoi()
+ * looks up locale information).
+ */
+static int local_atoi(const char *s) {
+  int n   = 0;
+  int neg = *s == '-';
+  if (neg)
+    s++;
+  while (*s >= '0' && *s <= '9')
+    n = 10*n + (*s++ - '0');
+  return neg ? -n : n;
+}
+
+
+/* Re-runs fn until it doesn't cause EINTR
+ */
+#define NO_INTR(fn)   do {} while ((fn) < 0 && errno == EINTR)
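+
+/* Example: retry a syscall that may be interrupted by a signal (c_open()
+ * below uses the same pattern):
+ *
+ *   ssize_t rc;
+ *   NO_INTR(rc = sys_write(fd, buf, len));
+ */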
+
+
+/* Wrap a class around system calls, in order to give us access to
+ * a private copy of errno. This only works in C++, but it has the
+ * advantage of not needing nested functions, which are a non-standard
+ * language extension.
+ */
+#ifdef __cplusplus
+namespace {
+  class SysCalls {
+   public:
+    #define SYS_CPLUSPLUS
+    #define SYS_ERRNO     my_errno
+    #define SYS_INLINE    inline
+    #define SYS_PREFIX    -1
+    #undef  SYS_LINUX_SYSCALL_SUPPORT_H
+    #include "linux_syscall_support.h"
+    SysCalls() : my_errno(0) { }
+    int my_errno;
+  };
+}
+#define ERRNO sys.my_errno
+#else
+#define ERRNO my_errno
+#endif
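+
+/* Usage sketch (see TCMalloc_ListAllProcessThreads() below): the parent
+ * instantiates a SysCalls object and issues calls through it; the error
+ * code then lands in sys.my_errno (exposed as ERRNO here) instead of the
+ * errno instance shared with the cloned child:
+ *
+ *   SysCalls sys;
+ *   while (sys.waitpid(pid, &status, __WALL) < 0 && sys.my_errno == EINTR) { }
+ */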
+
+
+/* Wrapper for open() which is guaranteed to never return EINTR.
+ */
+static int c_open(const char *fname, int flags, int mode) {
+  ssize_t rc;
+  NO_INTR(rc = sys_open(fname, flags, mode));
+  return rc;
+}
+
+
+/* abort() is not safely reentrant, and changes its behavior each time
+ * it is called. This means that if the main application ever called abort(),
+ * we cannot safely call it again. This would happen if we were called
+ * from a SIGABRT signal handler in the main application. So, be aware
+ * that raising SIGABRT from the thread lister is not signal safe
+ * (and vice-versa).
+ * Also, since we share address space with the main application, we
+ * cannot call abort() from the callback and expect the main application
+ * to behave correctly afterwards. In fact, the only thing we can do, is
+ * to terminate the main application with extreme prejudice (aka
+ * PTRACE_KILL).
+ * We set up our own SIGABRT handler to do this.
+ * In order to find the main application from the signal handler, we
+ * need to store information about it in global variables. This is
+ * safe, because the main application should be suspended at this
+ * time. If the callback ever called TCMalloc_ResumeAllProcessThreads(), then
+ * we are running a higher risk, though. So, try to avoid calling
+ * abort() after calling TCMalloc_ResumeAllProcessThreads.
+ */
+static volatile int *sig_pids, sig_num_threads, sig_proc, sig_marker;
+
+
+/* Signal handler to help us recover from dying while we are attached to
+ * other threads.
+ */
+static void SignalHandler(int signum, siginfo_t *si, void *data) {
+  if (sig_pids != NULL) {
+    if (signum == SIGABRT) {
+      while (sig_num_threads-- > 0) {
+        /* Not sure if sched_yield is really necessary here, but it does not */
+        /* hurt, and it might be necessary for the same reasons that we have */
+        /* to do so in sys_ptrace_detach().                                  */
+        sys_sched_yield();
+        sys_ptrace(PTRACE_KILL, sig_pids[sig_num_threads], 0, 0);
+      }
+    } else if (sig_num_threads > 0) {
+      TCMalloc_ResumeAllProcessThreads(sig_num_threads, (int *)sig_pids);
+    }
+  }
+  sig_pids = NULL;
+  if (sig_marker >= 0)
+    NO_INTR(sys_close(sig_marker));
+  sig_marker = -1;
+  if (sig_proc >= 0)
+    NO_INTR(sys_close(sig_proc));
+  sig_proc = -1;
+
+  sys__exit(signum == SIGABRT ? 1 : 2);
+}
+
+
+/* Try to dirty the stack, and hope that the compiler is not smart enough
+ * to optimize this function away. Or worse, the compiler could inline the
+ * function and permanently allocate the data on the stack.
+ */
+static void DirtyStack(size_t amount) {
+  char buf[amount];
+  memset(buf, 0, amount);
+  sys_read(-1, buf, amount);
+}
+
+
+/* Data structure for passing arguments to the lister thread.
+ */
+#define ALT_STACKSIZE (MINSIGSTKSZ + 4096)
+
+struct ListerParams {
+  int         result, err;
+  char        *altstack_mem;
+  ListAllProcessThreadsCallBack callback;
+  void        *parameter;
+  va_list     ap;
+  sem_t       *lock;
+};
+
+
+static void ListerThread(struct ListerParams *args) {
+  int                found_parent = 0;
+  pid_t              clone_pid  = sys_gettid(), ppid = sys_getppid();
+  char               proc_self_task[80], marker_name[48], *marker_path;
+  const char         *proc_paths[3];
+  const char *const  *proc_path = proc_paths;
+  int                proc = -1, marker = -1, num_threads = 0;
+  int                max_threads = 0, sig;
+  struct kernel_stat marker_sb, proc_sb;
+  stack_t            altstack;
+
+  /* Wait for parent thread to set appropriate permissions
+   * to allow ptrace activity
+   */
+  if (sem_wait(args->lock) < 0) {
+    goto failure;
+  }
+
+  /* Create "marker" that we can use to detect threads sharing the same
+   * address space and the same file handles. By setting the FD_CLOEXEC flag
+   * we minimize the risk of misidentifying child processes as threads;
+   * and since there is still a race condition, we will filter those out
+   * later, anyway.
+   */
+  if ((marker = sys_socket(PF_LOCAL, SOCK_DGRAM, 0)) < 0 ||
+      sys_fcntl(marker, F_SETFD, FD_CLOEXEC) < 0) {
+  failure:
+    args->result = -1;
+    args->err    = errno;
+    if (marker >= 0)
+      NO_INTR(sys_close(marker));
+    sig_marker = marker = -1;
+    if (proc >= 0)
+      NO_INTR(sys_close(proc));
+    sig_proc = proc = -1;
+    sys__exit(1);
+  }
+
+  /* Compute search paths for finding thread directories in /proc            */
+  local_itoa(strrchr(strcpy(proc_self_task, "/proc/"), '\000'), ppid);
+  strcpy(marker_name, proc_self_task);
+  marker_path = marker_name + strlen(marker_name);
+  strcat(proc_self_task, "/task/");
+  proc_paths[0] = proc_self_task; /* /proc/$$/task/                          */
+  proc_paths[1] = "/proc/";       /* /proc/                                  */
+  proc_paths[2] = NULL;
+
+  /* Compute path for marker socket in /proc                                 */
+  local_itoa(strcpy(marker_path, "/fd/") + 4, marker);
+  if (sys_stat(marker_name, &marker_sb) < 0) {
+    goto failure;
+  }
+
+  /* Catch signals on an alternate pre-allocated stack. This way, we can
+   * safely execute the signal handler even if we ran out of memory.
+   */
+  memset(&altstack, 0, sizeof(altstack));
+  altstack.ss_sp    = args->altstack_mem;
+  altstack.ss_flags = 0;
+  altstack.ss_size  = ALT_STACKSIZE;
+  sys_sigaltstack(&altstack, (const stack_t *)NULL);
+
+  /* Some kernels forget to wake up traced processes, when the
+   * tracer dies.  So, intercept synchronous signals and make sure
+   * that we wake up our tracees before dying. It is the caller's
+   * responsibility to ensure that asynchronous signals do not
+   * interfere with this function.
+   */
+  sig_marker = marker;
+  sig_proc   = -1;
+  for (sig = 0; sig < sizeof(sync_signals)/sizeof(*sync_signals); sig++) {
+    struct kernel_sigaction sa;
+    memset(&sa, 0, sizeof(sa));
+    sa.sa_sigaction_ = SignalHandler;
+    sys_sigfillset(&sa.sa_mask);
+    sa.sa_flags      = SA_ONSTACK|SA_SIGINFO|SA_RESETHAND;
+    sys_sigaction(sync_signals[sig], &sa, (struct kernel_sigaction *)NULL);
+  }
+  
+  /* Read process directories in /proc/...                                   */
+  for (;;) {
+    /* Some kernels know about threads, and hide them in "/proc"
+     * (although they are still there, if you know the process
+     * id). Threads are moved into a separate "task" directory. We
+     * check there first, and then fall back on the older naming
+     * convention if necessary.
+     */
+    if ((sig_proc = proc = c_open(*proc_path, O_RDONLY|O_DIRECTORY, 0)) < 0) {
+      if (*++proc_path != NULL)
+        continue;
+      goto failure;
+    }
+    if (sys_fstat(proc, &proc_sb) < 0)
+      goto failure;
+    
+    /* Since we are suspending threads, we cannot call any libc
+     * functions that might acquire locks. Most notably, we cannot
+     * call malloc(). So, we have to allocate memory on the stack,
+     * instead. Since we do not know how much memory we need, we
+     * make a best guess. And if we guessed incorrectly we retry on
+     * a second iteration (by jumping to "detach_threads").
+     *
+     * Unless the number of threads is increasing very rapidly, we
+     * should never need to do so, though, as our guestimate is very
+     * conservative.
+     */
+    if (max_threads < proc_sb.st_nlink + 100)
+      max_threads = proc_sb.st_nlink + 100;
+    
+    /* scope */ {
+      pid_t pids[max_threads];
+      int   added_entries = 0;
+      sig_num_threads     = num_threads;
+      sig_pids            = pids;
+      for (;;) {
+        struct KERNEL_DIRENT *entry;
+        char buf[4096];
+        ssize_t nbytes = GETDENTS(proc, (struct KERNEL_DIRENT *)buf,
+                                         sizeof(buf));
+        if (nbytes < 0)
+          goto failure;
+        else if (nbytes == 0) {
+          if (added_entries) {
+            /* Need to keep iterating over "/proc" in multiple
+             * passes until we no longer find any more threads. This
+             * algorithm eventually completes, when all threads have
+             * been suspended.
+             */
+            added_entries = 0;
+            sys_lseek(proc, 0, SEEK_SET);
+            continue;
+          }
+          break;
+        }
+        for (entry = (struct KERNEL_DIRENT *)buf;
+             entry < (struct KERNEL_DIRENT *)&buf[nbytes];
+             entry = (struct KERNEL_DIRENT *)((char *)entry+entry->d_reclen)) {
+          if (entry->d_ino != 0) {
+            const char *ptr = entry->d_name;
+            pid_t pid;
+            
+            /* Some kernels hide threads by preceding the pid with a '.'     */
+            if (*ptr == '.')
+              ptr++;
+            
+            /* If the directory is not numeric, it cannot be a
+             * process/thread
+             */
+            if (*ptr < '0' || *ptr > '9')
+              continue;
+            pid = local_atoi(ptr);
+
+            /* Attach (and suspend) all threads                              */
+            if (pid && pid != clone_pid) {
+              struct kernel_stat tmp_sb;
+              char fname[entry->d_reclen + 48];
+              strcat(strcat(strcpy(fname, "/proc/"),
+                            entry->d_name), marker_path);
+              
+              /* Check if the marker is identical to the one we created      */
+              if (sys_stat(fname, &tmp_sb) >= 0 &&
+                  marker_sb.st_ino == tmp_sb.st_ino) {
+                long i, j;
+
+                /* Found one of our threads, make sure it is no duplicate    */
+                for (i = 0; i < num_threads; i++) {
+                  /* Linear search is slow, but should not matter much for
+                   * the typically small number of threads.
+                   */
+                  if (pids[i] == pid) {
+                    /* Found a duplicate; most likely on second pass         */
+                    goto next_entry;
+                  }
+                }
+                
+                /* Check whether data structure needs growing                */
+                if (num_threads >= max_threads) {
+                  /* Back to square one, this time with more memory          */
+                  NO_INTR(sys_close(proc));
+                  goto detach_threads;
+                }
+
+                /* Attaching to thread suspends it                           */
+                pids[num_threads++] = pid;
+                sig_num_threads     = num_threads;
+                if (sys_ptrace(PTRACE_ATTACH, pid, (void *)0,
+                               (void *)0) < 0) {
+                  /* If operation failed, ignore thread. Maybe it
+                   * just died?  There might also be a race
+                   * condition with a concurrent core dumper or
+                   * with a debugger. In that case, we will just
+                   * make a best effort, rather than failing
+                   * entirely.
+                   */
+                  num_threads--;
+                  sig_num_threads = num_threads;
+                  goto next_entry;
+                }
+                while (sys_waitpid(pid, (int *)0, __WALL) < 0) {
+                  if (errno != EINTR) {
+                    sys_ptrace_detach(pid);
+                    num_threads--;
+                    sig_num_threads = num_threads;
+                    goto next_entry;
+                  }
+                }
+
+                if (sys_ptrace(PTRACE_PEEKDATA, pid, &i, &j) || i++ != j ||
+                    sys_ptrace(PTRACE_PEEKDATA, pid, &i, &j) || i   != j) {
+                  /* Address spaces are distinct, even though both
+                   * processes show the "marker". This is probably
+                   * a forked child process rather than a thread.
+                   */
+                  sys_ptrace_detach(pid);
+                  num_threads--;
+                  sig_num_threads = num_threads;
+                } else {
+                  found_parent |= pid == ppid;
+                  added_entries++;
+                }
+              }
+            }
+          }
+        next_entry:;
+        }
+      }
+      NO_INTR(sys_close(proc));
+      sig_proc = proc = -1;
+
+      /* If we failed to find any threads, try looking somewhere else in
+       * /proc. Maybe, threads are reported differently on this system.
+       */
+      if (num_threads > 1 || !*++proc_path) {
+        NO_INTR(sys_close(marker));
+        sig_marker = marker = -1;
+
+        /* If we never found the parent process, something is very wrong.
+         * Most likely, we are running in a debugger. Any attempt to operate
+         * on the threads would be very incomplete. Let's just report an
+         * error to the caller.
+         */
+        if (!found_parent) {
+          TCMalloc_ResumeAllProcessThreads(num_threads, pids);
+          sys__exit(3);
+        }
+
+        /* Now we are ready to call the callback,
+         * which takes care of resuming the threads for us.
+         */
+        args->result = args->callback(args->parameter, num_threads,
+                                      pids, args->ap);
+        args->err = errno;
+
+        /* Callback should have resumed threads, but better safe than sorry  */
+        if (TCMalloc_ResumeAllProcessThreads(num_threads, pids)) {
+          /* Callback forgot to resume at least one thread, report error     */
+          args->err    = EINVAL;
+          args->result = -1;
+        }
+
+        sys__exit(0);
+      }
+    detach_threads:
+      /* Resume all threads prior to retrying the operation                  */
+      TCMalloc_ResumeAllProcessThreads(num_threads, pids);
+      sig_pids = NULL;
+      num_threads = 0;
+      sig_num_threads = num_threads;
+      max_threads += 100;
+    }
+  }
+}
+
+
+/* This function gets the list of all Linux threads of the current process
+ * and passes them to the 'callback' along with the 'parameter' pointer; by
+ * the time the callback is invoked, all of the threads have been paused via
+ * PTRACE_ATTACH.
+ * The callback is executed from a separate thread which shares only the
+ * address space, the filesystem, and the filehandles with the caller. Most
+ * notably, it does not share the same pid and ppid; and if it terminates,
+ * the rest of the application is still there. 'callback' is expected to call,
+ * or arrange a call to, TCMalloc_ResumeAllProcessThreads. This happens
+ * automatically if the thread raises a synchronous signal (e.g. SIGSEGV);
+ * asynchronous
+ * signals are blocked. If the 'callback' decides to unblock them, it must
+ * ensure that they cannot terminate the application, or that
+ * TCMalloc_ResumeAllProcessThreads will get called.
+ * It is an error for the 'callback' to make any library calls that could
+ * acquire locks. Most notably, this means that most system calls have to
+ * avoid going through libc. Also, this means that it is not legal to call
+ * exit() or abort().
+ * We return -1 on error and the return value of 'callback' on success.
+ */
+int TCMalloc_ListAllProcessThreads(void *parameter,
+                                   ListAllProcessThreadsCallBack callback, ...) {
+  char                   altstack_mem[ALT_STACKSIZE];
+  struct ListerParams    args;
+  pid_t                  clone_pid;
+  int                    dumpable = 1, sig;
+  struct kernel_sigset_t sig_blocked, sig_old;
+  sem_t                  lock;
+
+  va_start(args.ap, callback);
+
+  /* If we are short on virtual memory, initializing the alternate stack
+   * might trigger a SIGSEGV. Let's do this early, before it could get us
+   * into more trouble (i.e. before signal handlers try to use the alternate
+   * stack, and before we attach to other threads).
+   */
+  memset(altstack_mem, 0, sizeof(altstack_mem));
+
+  /* Some of our cleanup functions could conceivably use more stack space.
+   * Try to touch the stack right now. This could be defeated by the compiler
+   * being too smart for its own good, so try really hard.
+   */
+  DirtyStack(32768);
+
+  /* Make this process "dumpable". This is necessary in order to ptrace()
+   * after having called setuid().
+   */
+  dumpable = sys_prctl(PR_GET_DUMPABLE, 0);
+  if (!dumpable)
+    sys_prctl(PR_SET_DUMPABLE, 1);
+
+  /* Fill in argument block for dumper thread                                */
+  args.result       = -1;
+  args.err          = 0;
+  args.altstack_mem = altstack_mem;
+  args.parameter    = parameter;
+  args.callback     = callback;
+  args.lock         = &lock;
+
+  /* Before cloning the thread lister, block all asynchronous signals, as we */
+  /* are not prepared to handle them.                                        */
+  sys_sigfillset(&sig_blocked);
+  for (sig = 0; sig < sizeof(sync_signals)/sizeof(*sync_signals); sig++) {
+    sys_sigdelset(&sig_blocked, sync_signals[sig]);
+  }
+  if (sys_sigprocmask(SIG_BLOCK, &sig_blocked, &sig_old)) {
+    args.err = errno;
+    args.result = -1;
+    goto failed;
+  }
+
+  /* scope */ {
+    /* After cloning, both the parent and the child share the same instance
+     * of errno. We must make sure that at least one of these processes
+     * (in our case, the parent) uses modified syscall macros that update
+     * a local copy of errno, instead.
+     */
+    #ifdef __cplusplus
+      #define sys0_sigprocmask sys.sigprocmask
+      #define sys0_waitpid     sys.waitpid
+      SysCalls sys;
+    #else
+      int my_errno;
+      #define SYS_ERRNO        my_errno
+      #define SYS_INLINE       inline
+      #define SYS_PREFIX       0
+      #undef  SYS_LINUX_SYSCALL_SUPPORT_H
+      #include "linux_syscall_support.h"
+    #endif
+
+    /* Lock before clone so that parent can set
+     * ptrace permissions (if necessary) prior
+     * to ListerThread actually executing
+     */
+    if (sem_init(&lock, 0, 0) == 0) {
+
+      int clone_errno;
+      clone_pid = local_clone((int (*)(void *))ListerThread, &args);
+      clone_errno = errno;
+
+      sys_sigprocmask(SIG_SETMASK, &sig_old, &sig_old);
+
+      if (clone_pid >= 0) {
+#ifdef PR_SET_PTRACER
+        /* In newer versions of glibc permission must explicitly
+         * be given to allow for ptrace.
+         */
+        prctl(PR_SET_PTRACER, clone_pid, 0, 0, 0);
+#endif
+        /* Releasing the lock here allows the
+         * ListerThread to execute and ptrace us.
+         */
+        sem_post(&lock);
+        int status, rc;
+        while ((rc = sys0_waitpid(clone_pid, &status, __WALL)) < 0 &&
+               ERRNO == EINTR) {
+                /* Keep waiting                                                 */
+        }
+        if (rc < 0) {
+          args.err = ERRNO;
+          args.result = -1;
+        } else if (WIFEXITED(status)) {
+          switch (WEXITSTATUS(status)) {
+            case 0: break;             /* Normal process termination           */
+            case 2: args.err = EFAULT; /* Some fault (e.g. SIGSEGV) detected   */
+                    args.result = -1;
+                    break;
+            case 3: args.err = EPERM;  /* Process is already being traced      */
+                    args.result = -1;
+                    break;
+            default:args.err = ECHILD; /* Child died unexpectedly              */
+                    args.result = -1;
+                    break;
+          }
+        } else if (!WIFEXITED(status)) {
+          args.err    = EFAULT;        /* Terminated due to an unhandled signal*/
+          args.result = -1;
+        }
+        sem_destroy(&lock);
+      } else {
+        args.result = -1;
+        args.err    = clone_errno;
+      }
+    } else {
+      args.result = -1;
+      args.err    = errno;
+    }
+  }
+
+  /* Restore the "dumpable" state of the process                             */
+failed:
+  if (!dumpable)
+    sys_prctl(PR_SET_DUMPABLE, dumpable);
+
+  va_end(args.ap);
+
+  errno = args.err;
+  return args.result;
+}
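+
+/* Hedged usage sketch: a minimal 'callback' following the contract described
+ * above.  The exact ListAllProcessThreadsCallBack typedef lives in
+ * thread_lister.h; the shape below is inferred from the call site in
+ * ListerThread().
+ *
+ *   static int CountThreads(void *param, int num_threads,
+ *                           pid_t *pids, va_list ap) {
+ *     *(int *)param = num_threads;    // inspect threads while suspended
+ *     TCMalloc_ResumeAllProcessThreads(num_threads, pids);
+ *     return num_threads;
+ *   }
+ *
+ *   int n = 0;
+ *   int rc = TCMalloc_ListAllProcessThreads(&n, CountThreads);
+ *   // rc is the callback's return value, or -1 with errno set on error.
+ */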
+
+/* This function resumes the list of all linux threads that
+ * TCMalloc_ListAllProcessThreads pauses before giving to its callback.
+ * The function returns non-zero if at least one thread was
+ * suspended and has now been resumed.
+ */
+int TCMalloc_ResumeAllProcessThreads(int num_threads, pid_t *thread_pids) {
+  int detached_at_least_one = 0;
+  while (num_threads-- > 0) {
+    detached_at_least_one |= sys_ptrace_detach(thread_pids[num_threads]) >= 0;
+  }
+  return detached_at_least_one;
+}
+
+#ifdef __cplusplus
+}
+#endif
+#endif
diff --git a/src/base/linuxthreads.h b/src/base/linuxthreads.h
new file mode 100644
index 0000000..16bc8c6
--- /dev/null
+++ b/src/base/linuxthreads.h
@@ -0,0 +1,53 @@
+/* Copyright (c) 2005-2007, Google Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are
+ * met:
+ *
+ *     * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following disclaimer
+ * in the documentation and/or other materials provided with the
+ * distribution.
+ *     * Neither the name of Google Inc. nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * ---
+ * Author: Markus Gutschke
+ */
+
+#ifndef _LINUXTHREADS_H
+#define _LINUXTHREADS_H
+
+/* Include thread_lister.h to get the interface that we implement for linux.
+ */
+
+/* We currently support x86-32, x86-64, ARM, MIPS, PPC and aarch64 on Linux.
+ * Porting to other related platforms should not be difficult.
+ */
+#if (defined(__i386__) || defined(__x86_64__) || defined(__ARM_ARCH_3__) || \
+     defined(__mips__) || defined(__PPC__) || defined(__aarch64__)) && defined(__linux)
+
+/* Define the THREADS symbol to make sure that there is exactly one core dumper
+ * built into the library.
+ */
+#define THREADS "Linux /proc"
+
+#endif
+
+#endif  /* _LINUXTHREADS_H */
diff --git a/src/base/logging.cc b/src/base/logging.cc
new file mode 100644
index 0000000..761c2fd
--- /dev/null
+++ b/src/base/logging.cc
@@ -0,0 +1,108 @@
+// -*- Mode: C++; c-basic-offset: 2; indent-tabs-mode: nil -*-
+// Copyright (c) 2007, Google Inc.
+// All rights reserved.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+// 
+//     * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+//     * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+// 
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+// ---
+// This file just provides storage for FLAGS_verbose.
+
+#include <config.h>
+#include "base/logging.h"
+#include "base/commandlineflags.h"
+
+DEFINE_int32(verbose, EnvToInt("PERFTOOLS_VERBOSE", 0),
+             "Set to numbers >0 for more verbose output, or <0 for less.  "
+             "--verbose == -4 means we log fatal errors only.");
+
+
+#if defined(_WIN32) || defined(__CYGWIN__) || defined(__CYGWIN32__)
+
+// While windows does have a POSIX-compatible API
+// (_open/_write/_close), it allocates memory.  Using this lower-level
+// windows API is the closest we can get to being "raw".
+RawFD RawOpenForWriting(const char* filename) {
+  // CreateFile allocates memory if file_name isn't absolute, so if
+  // that ever becomes a problem then we ought to compute the absolute
+  // path on its behalf (perhaps the ntdll/kernel function isn't aware
+  // of the working directory?)
+  RawFD fd = CreateFileA(filename, GENERIC_WRITE, 0, NULL,
+                         CREATE_ALWAYS, 0, NULL);
+  if (fd != kIllegalRawFD && GetLastError() == ERROR_ALREADY_EXISTS)
+    SetEndOfFile(fd);    // truncate the existing file
+  return fd;
+}
+
+void RawWrite(RawFD handle, const char* buf, size_t len) {
+  while (len > 0) {
+    DWORD wrote;
+    BOOL ok = WriteFile(handle, buf, len, &wrote, NULL);
+    // We do not use an asynchronous file handle, so ok==false means an error
+    if (!ok) break;
+    buf += wrote;
+    len -= wrote;
+  }
+}
+
+void RawClose(RawFD handle) {
+  CloseHandle(handle);
+}
+
+#else  // _WIN32 || __CYGWIN__ || __CYGWIN32__
+
+#ifdef HAVE_SYS_TYPES_H
+#include <sys/types.h>
+#endif
+#ifdef HAVE_UNISTD_H
+#include <unistd.h>
+#endif
+#ifdef HAVE_FCNTL_H
+#include <fcntl.h>
+#endif
+
+// Re-run fn until it doesn't cause EINTR.
+#define NO_INTR(fn)  do {} while ((fn) < 0 && errno == EINTR)
+
+RawFD RawOpenForWriting(const char* filename) {
+  return open(filename, O_WRONLY|O_CREAT|O_TRUNC, 0664);
+}
+
+void RawWrite(RawFD fd, const char* buf, size_t len) {
+  while (len > 0) {
+    ssize_t r;
+    NO_INTR(r = write(fd, buf, len));
+    if (r <= 0) break;
+    buf += r;
+    len -= r;
+  }
+}
+
+void RawClose(RawFD fd) {
+  NO_INTR(close(fd));
+}
+
+#endif  // _WIN32 || __CYGWIN__ || __CYGWIN32__
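+
+// Hedged usage sketch for the raw-file helpers above (RawFD and
+// kIllegalRawFD are declared in base/logging.h); the file name is just an
+// example:
+//
+//   RawFD fd = RawOpenForWriting("/tmp/heap.prof");
+//   if (fd != kIllegalRawFD) {
+//     RawWrite(fd, "header\n", 7);
+//     RawClose(fd);
+//   }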
diff --git a/src/base/logging.h b/src/base/logging.h
new file mode 100644
index 0000000..a1afe4d
--- /dev/null
+++ b/src/base/logging.h
@@ -0,0 +1,259 @@
+// -*- Mode: C++; c-basic-offset: 2; indent-tabs-mode: nil -*-
+// Copyright (c) 2005, Google Inc.
+// All rights reserved.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+// 
+//     * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+//     * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+// 
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+// ---
+// This file contains #include information about logging-related stuff.
+// Pretty much everybody needs to #include this file so that they can
+// log various happenings.
+//
+#ifndef _LOGGING_H_
+#define _LOGGING_H_
+
+#include <config.h>
+#include <stdarg.h>
+#include <stdlib.h>
+#include <stdio.h>
+#ifdef HAVE_UNISTD_H
+#include <unistd.h>    // for write()
+#endif
+#include <string.h>    // for strlen(), strcmp()
+#include <assert.h>
+#include <errno.h>     // for errno
+#include "base/commandlineflags.h"
+
+// On some systems (like freebsd), we can't call write() at all in a
+// global constructor, perhaps because errno hasn't been set up.
+// (In windows, we can't call it because it might call malloc.)
+// Calling the write syscall is safer (it doesn't set errno), so we
+// prefer that.  Note we don't care about errno for logging: we just
+// do logging on a best-effort basis.
+#if defined(_MSC_VER)
+#define WRITE_TO_STDERR(buf, len) WriteToStderr(buf, len);  // in port.cc
+#elif defined(HAVE_SYS_SYSCALL_H)
+#include <sys/syscall.h>
+#define WRITE_TO_STDERR(buf, len) syscall(SYS_write, STDERR_FILENO, buf, len)
+#else
+#define WRITE_TO_STDERR(buf, len) write(STDERR_FILENO, buf, len)
+#endif
+
+// MSVC and mingw define their own, safe version of vsnprintf (the
+// windows one is broken) in port.cc.  Everyone else can use the
+// version here.  We had to give it a unique name for windows.
+#ifndef _WIN32
+# define perftools_vsnprintf vsnprintf
+#endif
+
+
+// We log all messages at this log-level and below.
+// INFO == -1, WARNING == -2, ERROR == -3, FATAL == -4
+DECLARE_int32(verbose);
+
+// CHECK dies with a fatal error if condition is not true.  It is *not*
+// controlled by NDEBUG, so the check will be executed regardless of
+// compilation mode.  Therefore, it is safe to do things like:
+//    CHECK(fp->Write(x) == 4)
+// Note we use write instead of printf/puts to avoid the risk we'll
+// call malloc().
+#define CHECK(condition)                                                \
+  do {                                                                  \
+    if (!(condition)) {                                                 \
+      WRITE_TO_STDERR("Check failed: " #condition "\n",                 \
+                      sizeof("Check failed: " #condition "\n")-1);      \
+      abort();                                                          \
+    }                                                                   \
+  } while (0)
+
+// This takes a message to print.  The name is historical.
+#define RAW_CHECK(condition, message)                                          \
+  do {                                                                         \
+    if (!(condition)) {                                                        \
+      WRITE_TO_STDERR("Check failed: " #condition ": " message "\n",           \
+                      sizeof("Check failed: " #condition ": " message "\n")-1);\
+      abort();                                                                 \
+    }                                                                          \
+  } while (0)
+
+// This is like RAW_CHECK, but only in debug-mode
+#ifdef NDEBUG
+enum { DEBUG_MODE = 0 };
+#define RAW_DCHECK(condition, message)
+#else
+enum { DEBUG_MODE = 1 };
+#define RAW_DCHECK(condition, message)  RAW_CHECK(condition, message)
+#endif
+
+// This prints errno as well.  Note we use write instead of printf/puts to
+// avoid the risk we'll call malloc().
+#define PCHECK(condition)                                               \
+  do {                                                                  \
+    if (!(condition)) {                                                 \
+      const int err_no = errno;                                         \
+      WRITE_TO_STDERR("Check failed: " #condition ": ",                 \
+                      sizeof("Check failed: " #condition ": ")-1);      \
+      WRITE_TO_STDERR(strerror(err_no), strlen(strerror(err_no)));      \
+      WRITE_TO_STDERR("\n", sizeof("\n")-1);                            \
+      abort();                                                          \
+    }                                                                   \
+  } while (0)
+
+// Helper macro for binary operators; prints the two values on error
+// Don't use this macro directly in your code, use CHECK_EQ et al below
+
+// WARNING: These don't compile correctly if one of the arguments is a pointer
+// and the other is NULL. To work around this, simply static_cast NULL to the
+// type of the desired pointer.
+
+// TODO(jandrews): Also print the values in case of failure.  Requires some
+// sort of type-sensitive ToString() function.
+#define CHECK_OP(op, val1, val2)                                        \
+  do {                                                                  \
+    if (!((val1) op (val2))) {                                          \
+      fprintf(stderr, "Check failed: %s %s %s\n", #val1, #op, #val2);   \
+      abort();                                                          \
+    }                                                                   \
+  } while (0)
+
+#define CHECK_EQ(val1, val2) CHECK_OP(==, val1, val2)
+#define CHECK_NE(val1, val2) CHECK_OP(!=, val1, val2)
+#define CHECK_LE(val1, val2) CHECK_OP(<=, val1, val2)
+#define CHECK_LT(val1, val2) CHECK_OP(< , val1, val2)
+#define CHECK_GE(val1, val2) CHECK_OP(>=, val1, val2)
+#define CHECK_GT(val1, val2) CHECK_OP(> , val1, val2)
+
+// Synonyms for CHECK_* that are used in some unittests.
+#define EXPECT_EQ(val1, val2) CHECK_EQ(val1, val2)
+#define EXPECT_NE(val1, val2) CHECK_NE(val1, val2)
+#define EXPECT_LE(val1, val2) CHECK_LE(val1, val2)
+#define EXPECT_LT(val1, val2) CHECK_LT(val1, val2)
+#define EXPECT_GE(val1, val2) CHECK_GE(val1, val2)
+#define EXPECT_GT(val1, val2) CHECK_GT(val1, val2)
+#define ASSERT_EQ(val1, val2) EXPECT_EQ(val1, val2)
+#define ASSERT_NE(val1, val2) EXPECT_NE(val1, val2)
+#define ASSERT_LE(val1, val2) EXPECT_LE(val1, val2)
+#define ASSERT_LT(val1, val2) EXPECT_LT(val1, val2)
+#define ASSERT_GE(val1, val2) EXPECT_GE(val1, val2)
+#define ASSERT_GT(val1, val2) EXPECT_GT(val1, val2)
+// As are these variants.
+#define EXPECT_TRUE(cond)     CHECK(cond)
+#define EXPECT_FALSE(cond)    CHECK(!(cond))
+#define EXPECT_STREQ(a, b)    CHECK(strcmp(a, b) == 0)
+#define ASSERT_TRUE(cond)     EXPECT_TRUE(cond)
+#define ASSERT_FALSE(cond)    EXPECT_FALSE(cond)
+#define ASSERT_STREQ(a, b)    EXPECT_STREQ(a, b)
+
+// Used for (libc) functions that return -1 and set errno
+#define CHECK_ERR(invocation)  PCHECK((invocation) != -1)
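+
+// Illustrative usage (a sketch; "fd" and "bytes_written" are placeholders):
+//   CHECK_ERR(close(fd));        // aborts, printing strerror(errno), on failure
+//   CHECK_GE(bytes_written, 0);  // aborts with "Check failed: ..." if false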
+
+// A few more checks that only happen in debug mode
+#ifdef NDEBUG
+#define DCHECK_EQ(val1, val2)
+#define DCHECK_NE(val1, val2)
+#define DCHECK_LE(val1, val2)
+#define DCHECK_LT(val1, val2)
+#define DCHECK_GE(val1, val2)
+#define DCHECK_GT(val1, val2)
+#else
+#define DCHECK_EQ(val1, val2)  CHECK_EQ(val1, val2)
+#define DCHECK_NE(val1, val2)  CHECK_NE(val1, val2)
+#define DCHECK_LE(val1, val2)  CHECK_LE(val1, val2)
+#define DCHECK_LT(val1, val2)  CHECK_LT(val1, val2)
+#define DCHECK_GE(val1, val2)  CHECK_GE(val1, val2)
+#define DCHECK_GT(val1, val2)  CHECK_GT(val1, val2)
+#endif
+
+
+#ifdef ERROR
+#undef ERROR      // may conflict with ERROR macro on windows
+#endif
+enum LogSeverity {INFO = -1, WARNING = -2, ERROR = -3, FATAL = -4};
+
+// NOTE: we add a newline to the end of the output if it's not there already
+inline void LogPrintf(int severity, const char* pat, va_list ap) {
+  // We write directly to the stderr file descriptor and avoid FILE
+  // buffering because that may invoke malloc()
+  char buf[600];
+  perftools_vsnprintf(buf, sizeof(buf)-1, pat, ap);
+  if (buf[0] != '\0' && buf[strlen(buf)-1] != '\n') {
+    assert(strlen(buf)+1 < sizeof(buf));
+    strcat(buf, "\n");
+  }
+  WRITE_TO_STDERR(buf, strlen(buf));
+  if ((severity) == FATAL)
+    abort(); // LOG(FATAL) indicates a big problem, so don't run atexit() calls
+}
+
+// Note that since the order of global constructors is unspecified,
+// global code that calls RAW_LOG may execute before FLAGS_verbose is set.
+// Such code will run with verbosity == 0 no matter what.
+#define VLOG_IS_ON(severity) (FLAGS_verbose >= severity)
+
+// In a better world, we'd use __VA_ARGS__, but VC++ 7 doesn't support it.
+#define LOG_PRINTF(severity, pat) do {          \
+  if (VLOG_IS_ON(severity)) {                   \
+    va_list ap;                                 \
+    va_start(ap, pat);                          \
+    LogPrintf(severity, pat, ap);               \
+    va_end(ap);                                 \
+  }                                             \
+} while (0)
+
+// RAW_LOG is the main function; some synonyms are used in unittests.
+inline void RAW_LOG(int lvl, const char* pat, ...)  { LOG_PRINTF(lvl, pat); }
+inline void RAW_VLOG(int lvl, const char* pat, ...) { LOG_PRINTF(lvl, pat); }
+inline void LOG(int lvl, const char* pat, ...)      { LOG_PRINTF(lvl, pat); }
+inline void VLOG(int lvl, const char* pat, ...)     { LOG_PRINTF(lvl, pat); }
+inline void LOG_IF(int lvl, bool cond, const char* pat, ...) {
+  if (cond)  LOG_PRINTF(lvl, pat);
+}
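+
+// Illustrative usage (a sketch; "num_regions" and "filename" are
+// placeholders).  The format string follows printf conventions, since the
+// message is formatted with perftools_vsnprintf:
+//   RAW_VLOG(1, "found %d heap regions", num_regions); // only if FLAGS_verbose >= 1
+//   RAW_LOG(ERROR, "could not open %s", filename);
+//   RAW_LOG(FATAL, "out of memory");                   // logs, then abort()s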
+
+// This isn't technically logging, but it's also IO and also is an
+// attempt to be "raw" -- that is, to not use any higher-level libc
+// routines that might allocate memory or (ideally) try to allocate
+// locks.  We use an opaque file handle (not necessarily an int)
+// to allow even more low-level stuff in the future.
+// Like other "raw" routines, these functions are best effort, and
+// thus don't return error codes (except RawOpenForWriting()).
+#if defined(_WIN32) || defined(__CYGWIN__) || defined(__CYGWIN32__)
+#ifndef NOMINMAX
+#define NOMINMAX     // @#!$& windows
+#endif
+#include <windows.h>
+typedef HANDLE RawFD;
+const RawFD kIllegalRawFD = INVALID_HANDLE_VALUE;
+#else
+typedef int RawFD;
+const RawFD kIllegalRawFD = -1;   // what open returns if it fails
+#endif  // defined(_WIN32) || defined(__CYGWIN__) || defined(__CYGWIN32__)
+
+RawFD RawOpenForWriting(const char* filename);   // uses default permissions
+void RawWrite(RawFD fd, const char* buf, size_t len);
+void RawClose(RawFD fd);
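+
+// Illustrative usage (a sketch; the path, "buf" and "len" are placeholders):
+//   RawFD fd = RawOpenForWriting("/tmp/heap.prof");
+//   if (fd != kIllegalRawFD) {
+//     RawWrite(fd, buf, len);
+//     RawClose(fd);
+//   }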
+
+#endif // _LOGGING_H_
diff --git a/src/base/low_level_alloc.cc b/src/base/low_level_alloc.cc
new file mode 100644
index 0000000..4d2ae8d
--- /dev/null
+++ b/src/base/low_level_alloc.cc
@@ -0,0 +1,523 @@
+// -*- Mode: C++; c-basic-offset: 2; indent-tabs-mode: nil -*-
+/* Copyright (c) 2006, Google Inc.
+ * All rights reserved.
+ * 
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are
+ * met:
+ * 
+ *     * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following disclaimer
+ * in the documentation and/or other materials provided with the
+ * distribution.
+ *     * Neither the name of Google Inc. nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ * 
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+// A low-level allocator that can be used by other low-level
+// modules without introducing dependency cycles.
+// This allocator is slow and wasteful of memory;
+// it should not be used when performance is key.
+
+#include "base/low_level_alloc.h"
+#include "base/dynamic_annotations.h"
+#include "base/spinlock.h"
+#include "base/logging.h"
+#include "malloc_hook-inl.h"
+#include <gperftools/malloc_hook.h>
+#include <errno.h>
+#ifdef HAVE_UNISTD_H
+#include <unistd.h>
+#endif
+#ifdef HAVE_MMAP
+#include <sys/mman.h>
+#endif
+#include <new>                   // for placement-new
+
+// On systems (like freebsd) that don't define MAP_ANONYMOUS, use the old
+// form of the name instead.
+#ifndef MAP_ANONYMOUS
+# define MAP_ANONYMOUS MAP_ANON
+#endif
+
+// A first-fit allocator with amortized logarithmic free() time.
+
+// ---------------------------------------------------------------------------
+static const int kMaxLevel = 30;
+
+// We put this class-only struct in a namespace to avoid polluting the
+// global namespace with this struct name (thus risking an ODR violation).
+namespace low_level_alloc_internal {
+  // This struct describes one allocated block, or one free block.
+  struct AllocList {
+    struct Header {
+      intptr_t size;  // size of entire region, including this field. Must be
+                      // first.  Valid in both allocated and unallocated blocks
+      intptr_t magic; // kMagicAllocated or kMagicUnallocated xor this
+      LowLevelAlloc::Arena *arena; // pointer to parent arena
+      void *dummy_for_alignment;   // aligns regions to 0 mod 2*sizeof(void*)
+    } header;
+
+    // Next two fields: in unallocated blocks: freelist skiplist data
+    //                  in allocated blocks: overlaps with client data
+    int levels;           // levels in skiplist used
+    AllocList *next[kMaxLevel];   // actually has levels elements.
+                                  // The AllocList node may not have room for
+                                  // all kMaxLevel entries.  See max_fit in
+                                  // LLA_SkiplistLevels()
+  };
+}
+using low_level_alloc_internal::AllocList;
+
+
+// ---------------------------------------------------------------------------
+// A trivial skiplist implementation.  This is used to keep the freelist
+// in address order while taking only logarithmic time per insert and delete.
+
+// An integer approximation of log2(size/base)
+// Requires size >= base.
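+// For example, IntLog2(4096, 16) == 8, since floor(4096 / 2**8) == 16.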
+static int IntLog2(size_t size, size_t base) {
+  int result = 0;
+  for (size_t i = size; i > base; i >>= 1) { // i == floor(size/2**result)
+    result++;
+  }
+  //    floor(size / 2**result) <= base < floor(size / 2**(result-1))
+  // =>     log2(size/(base+1)) <= result < 1+log2(size/base)
+  // => result ~= log2(size/base)
+  return result;
+}
+
+// Return a random integer n:  p(n)=1/(2**n) if 1 <= n; p(n)=0 if n < 1.
+static int Random() {
+  static int32 r = 1;         // no locking---it's not critical
+  ANNOTATE_BENIGN_RACE(&r, "benign race, not critical.");
+  int result = 1;
+  while ((((r = r*1103515245 + 12345) >> 30) & 1) == 0) {
+    result++;
+  }
+  return result;
+}
+
+// Return a number of skiplist levels for a node of size bytes, where
+// base is the minimum node size.  Compute level=log2(size / base)+n
+// where n is 1 if random is false and otherwise a random number generated with
+// the standard distribution for a skiplist:  See Random() above.
+// Bigger nodes tend to have more skiplist levels due to the log2(size / base)
+// term, so first-fit searches touch fewer nodes.  "level" is clipped so
+// level<kMaxLevel and next[level-1] will fit in the node.
+// 0 < LLA_SkiplistLevels(x,y,false) <= LLA_SkiplistLevels(x,y,true) < kMaxLevel
+static int LLA_SkiplistLevels(size_t size, size_t base, bool random) {
+  // max_fit is the maximum number of levels that will fit in a node for the
+  // given size.   We can't return more than max_fit, no matter what the
+  // random number generator says.
+  int max_fit = (size-OFFSETOF_MEMBER(AllocList, next)) / sizeof (AllocList *);
+  int level = IntLog2(size, base) + (random? Random() : 1);
+  if (level > max_fit)     level = max_fit;
+  if (level > kMaxLevel-1) level = kMaxLevel - 1;
+  RAW_CHECK(level >= 1, "block not big enough for even one level");
+  return level;
+}
+
+// Return "atleast", the first element of AllocList *head s.t. *atleast >= *e.
+// For 0 <= i < head->levels, set prev[i] to "no_greater", where no_greater
+// points to the last element at level i in the AllocList less than *e, or is
+// head if no such element exists.
+static AllocList *LLA_SkiplistSearch(AllocList *head,
+                                     AllocList *e, AllocList **prev) {
+  AllocList *p = head;
+  for (int level = head->levels - 1; level >= 0; level--) {
+    for (AllocList *n; (n = p->next[level]) != 0 && n < e; p = n) {
+    }
+    prev[level] = p;
+  }
+  return (head->levels == 0) ?  0 : prev[0]->next[0];
+}
+
+// Insert element *e into AllocList *head.  Set prev[] as LLA_SkiplistSearch.
+// Requires that e->levels be previously set by the caller (using
+// LLA_SkiplistLevels())
+static void LLA_SkiplistInsert(AllocList *head, AllocList *e,
+                               AllocList **prev) {
+  LLA_SkiplistSearch(head, e, prev);
+  for (; head->levels < e->levels; head->levels++) { // extend prev pointers
+    prev[head->levels] = head;                       // to all *e's levels
+  }
+  for (int i = 0; i != e->levels; i++) { // add element to list
+    e->next[i] = prev[i]->next[i];
+    prev[i]->next[i] = e;
+  }
+}
+
+// Remove element *e from AllocList *head.  Set prev[] as LLA_SkiplistSearch().
+// Requires that e->levels be previously set by the caller (using
+// LLA_SkiplistLevels())
+static void LLA_SkiplistDelete(AllocList *head, AllocList *e,
+                               AllocList **prev) {
+  AllocList *found = LLA_SkiplistSearch(head, e, prev);
+  RAW_CHECK(e == found, "element not in freelist");
+  for (int i = 0; i != e->levels && prev[i]->next[i] == e; i++) {
+    prev[i]->next[i] = e->next[i];
+  }
+  while (head->levels > 0 && head->next[head->levels - 1] == 0) {
+    head->levels--;   // reduce head->levels if level unused
+  }
+}
+
+// ---------------------------------------------------------------------------
+// Arena implementation
+
+struct LowLevelAlloc::Arena {
+  Arena() : mu(SpinLock::LINKER_INITIALIZED) {} // does nothing; for static init
+  explicit Arena(int) : pagesize(0) {}  // set pagesize to zero explicitly
+                                        // for non-static init
+
+  SpinLock mu;            // protects freelist, allocation_count,
+                          // pagesize, roundup, min_size
+  AllocList freelist;     // head of free list; sorted by addr (under mu)
+  int32 allocation_count; // count of allocated blocks (under mu)
+  int32 flags;            // flags passed to NewArena (ro after init)
+  size_t pagesize;        // ==getpagesize()  (init under mu, then ro)
+  size_t roundup;         // lowest power of 2 >= max(16,sizeof (AllocList))
+                          // (init under mu, then ro)
+  size_t min_size;        // smallest allocation block size
+                          // (init under mu, then ro)
+};
+
+// The default arena, which is used when 0 is passed instead of an Arena
+// pointer.
+static struct LowLevelAlloc::Arena default_arena;
+
+// Non-malloc-hooked arenas: used only to allocate metadata for arenas that
+// do not want malloc hook reporting, so that for them there's no malloc hook
+// reporting even during arena creation.
+static struct LowLevelAlloc::Arena unhooked_arena;
+static struct LowLevelAlloc::Arena unhooked_async_sig_safe_arena;
+
+// magic numbers to identify allocated and unallocated blocks
+static const intptr_t kMagicAllocated = 0x4c833e95;
+static const intptr_t kMagicUnallocated = ~kMagicAllocated;
+
+namespace {
+  class SCOPED_LOCKABLE ArenaLock {
+   public:
+    explicit ArenaLock(LowLevelAlloc::Arena *arena)
+        EXCLUSIVE_LOCK_FUNCTION(arena->mu)
+        : left_(false), mask_valid_(false), arena_(arena) {
+      if ((arena->flags & LowLevelAlloc::kAsyncSignalSafe) != 0) {
+      // We've decided not to support async-signal-safe arena use until
+      // there is a demonstrated need.  Here's how one could do it though
+      // (it would need to be made more portable).
+#if 0
+        sigset_t all;
+        sigfillset(&all);
+        this->mask_valid_ =
+            (pthread_sigmask(SIG_BLOCK, &all, &this->mask_) == 0);
+#else
+        RAW_CHECK(false, "We do not yet support async-signal-safe arena.");
+#endif
+      }
+      this->arena_->mu.Lock();
+    }
+    ~ArenaLock() { RAW_CHECK(this->left_, "haven't left Arena region"); }
+    void Leave() /*UNLOCK_FUNCTION()*/ {
+      this->arena_->mu.Unlock();
+#if 0
+      if (this->mask_valid_) {
+        pthread_sigmask(SIG_SETMASK, &this->mask_, 0);
+      }
+#endif
+      this->left_ = true;
+    }
+   private:
+    bool left_;       // whether left region
+    bool mask_valid_;
+#if 0
+    sigset_t mask_;   // old mask of blocked signals
+#endif
+    LowLevelAlloc::Arena *arena_;
+    DISALLOW_COPY_AND_ASSIGN(ArenaLock);
+  };
+} // anonymous namespace
+
+// create an appropriate magic number for an object at "ptr"
+// "magic" should be kMagicAllocated or kMagicUnallocated
+inline static intptr_t Magic(intptr_t magic, AllocList::Header *ptr) {
+  return magic ^ reinterpret_cast<intptr_t>(ptr);
+}
+
+// Initialize the fields of an Arena
+static void ArenaInit(LowLevelAlloc::Arena *arena) {
+  if (arena->pagesize == 0) {
+    arena->pagesize = getpagesize();
+    // Round up block sizes to a power of two close to the header size.
+    arena->roundup = 16;
+    while (arena->roundup < sizeof (arena->freelist.header)) {
+      arena->roundup += arena->roundup;
+    }
+    // Don't allocate blocks less than twice the roundup size to avoid tiny
+    // free blocks.
+    arena->min_size = 2 * arena->roundup;
+    arena->freelist.header.size = 0;
+    arena->freelist.header.magic =
+        Magic(kMagicUnallocated, &arena->freelist.header);
+    arena->freelist.header.arena = arena;
+    arena->freelist.levels = 0;
+    memset(arena->freelist.next, 0, sizeof (arena->freelist.next));
+    arena->allocation_count = 0;
+    if (arena == &default_arena) {
+      // Default arena should be hooked, e.g. for heap-checker to trace
+      // pointer chains through objects in the default arena.
+      arena->flags = LowLevelAlloc::kCallMallocHook;
+    } else if (arena == &unhooked_async_sig_safe_arena) {
+      arena->flags = LowLevelAlloc::kAsyncSignalSafe;
+    } else {
+      arena->flags = 0;   // other arenas' flags may be overridden by client,
+                          // but unhooked_arena will have 0 in 'flags'.
+    }
+  }
+}
+
+// L < meta_data_arena->mu
+LowLevelAlloc::Arena *LowLevelAlloc::NewArena(int32 flags,
+                                              Arena *meta_data_arena) {
+  RAW_CHECK(meta_data_arena != 0, "must pass a valid arena");
+  if (meta_data_arena == &default_arena) {
+    if ((flags & LowLevelAlloc::kAsyncSignalSafe) != 0) {
+      meta_data_arena = &unhooked_async_sig_safe_arena;
+    } else if ((flags & LowLevelAlloc::kCallMallocHook) == 0) {
+      meta_data_arena = &unhooked_arena;
+    }
+  }
+  // Arena(0) uses the constructor for non-static contexts
+  Arena *result =
+    new (AllocWithArena(sizeof (*result), meta_data_arena)) Arena(0);
+  ArenaInit(result);
+  result->flags = flags;
+  return result;
+}
+
+// L < arena->mu, L < arena->arena->mu
+bool LowLevelAlloc::DeleteArena(Arena *arena) {
+  RAW_CHECK(arena != 0 && arena != &default_arena && arena != &unhooked_arena,
+            "may not delete default arena");
+  ArenaLock section(arena);
+  bool empty = (arena->allocation_count == 0);
+  section.Leave();
+  if (empty) {
+    while (arena->freelist.next[0] != 0) {
+      AllocList *region = arena->freelist.next[0];
+      size_t size = region->header.size;
+      arena->freelist.next[0] = region->next[0];
+      RAW_CHECK(region->header.magic ==
+                Magic(kMagicUnallocated, &region->header),
+                "bad magic number in DeleteArena()");
+      RAW_CHECK(region->header.arena == arena,
+                "bad arena pointer in DeleteArena()");
+      RAW_CHECK(size % arena->pagesize == 0,
+                "empty arena has non-page-aligned block size");
+      RAW_CHECK(reinterpret_cast<intptr_t>(region) % arena->pagesize == 0,
+                "empty arena has non-page-aligned block");
+      int munmap_result;
+      if ((arena->flags & LowLevelAlloc::kAsyncSignalSafe) == 0) {
+        munmap_result = munmap(region, size);
+      } else {
+        munmap_result = MallocHook::UnhookedMUnmap(region, size);
+      }
+      RAW_CHECK(munmap_result == 0,
+                "LowLevelAlloc::DeleteArena:  munmap failed address");
+    }
+    Free(arena);
+  }
+  return empty;
+}
+
+// ---------------------------------------------------------------------------
+
+// Return value rounded up to next multiple of align.
+// align must be a power of two.
+static intptr_t RoundUp(intptr_t addr, intptr_t align) {
+  return (addr + align - 1) & ~(align - 1);
+}
+
+// Equivalent to "return prev->next[i]" but with sanity checking
+// that the freelist is in the correct order, that it
+// consists of regions marked "unallocated", and that no two regions
+// are adjacent in memory (they should have been coalesced).
+// L < arena->mu
+static AllocList *Next(int i, AllocList *prev, LowLevelAlloc::Arena *arena) {
+  RAW_CHECK(i < prev->levels, "too few levels in Next()");
+  AllocList *next = prev->next[i];
+  if (next != 0) {
+    RAW_CHECK(next->header.magic == Magic(kMagicUnallocated, &next->header),
+              "bad magic number in Next()");
+    RAW_CHECK(next->header.arena == arena,
+              "bad arena pointer in Next()");
+    if (prev != &arena->freelist) {
+      RAW_CHECK(prev < next, "unordered freelist");
+      RAW_CHECK(reinterpret_cast<char *>(prev) + prev->header.size <
+                reinterpret_cast<char *>(next), "malformed freelist");
+    }
+  }
+  return next;
+}
+
+// Coalesce list item "a" with its successor if they are adjacent.
+static void Coalesce(AllocList *a) {
+  AllocList *n = a->next[0];
+  if (n != 0 && reinterpret_cast<char *>(a) + a->header.size ==
+                    reinterpret_cast<char *>(n)) {
+    LowLevelAlloc::Arena *arena = a->header.arena;
+    a->header.size += n->header.size;
+    n->header.magic = 0;
+    n->header.arena = 0;
+    AllocList *prev[kMaxLevel];
+    LLA_SkiplistDelete(&arena->freelist, n, prev);
+    LLA_SkiplistDelete(&arena->freelist, a, prev);
+    a->levels = LLA_SkiplistLevels(a->header.size, arena->min_size, true);
+    LLA_SkiplistInsert(&arena->freelist, a, prev);
+  }
+}
+
+// Adds block at location "v" to the free list
+// L >= arena->mu
+static void AddToFreelist(void *v, LowLevelAlloc::Arena *arena) {
+  AllocList *f = reinterpret_cast<AllocList *>(
+                        reinterpret_cast<char *>(v) - sizeof (f->header));
+  RAW_CHECK(f->header.magic == Magic(kMagicAllocated, &f->header),
+            "bad magic number in AddToFreelist()");
+  RAW_CHECK(f->header.arena == arena,
+            "bad arena pointer in AddToFreelist()");
+  f->levels = LLA_SkiplistLevels(f->header.size, arena->min_size, true);
+  AllocList *prev[kMaxLevel];
+  LLA_SkiplistInsert(&arena->freelist, f, prev);
+  f->header.magic = Magic(kMagicUnallocated, &f->header);
+  Coalesce(f);                  // maybe coalesce with successor
+  Coalesce(prev[0]);            // maybe coalesce with predecessor
+}
+
+// Frees storage allocated by LowLevelAlloc::Alloc().
+// L < arena->mu
+void LowLevelAlloc::Free(void *v) {
+  if (v != 0) {
+    AllocList *f = reinterpret_cast<AllocList *>(
+                        reinterpret_cast<char *>(v) - sizeof (f->header));
+    RAW_CHECK(f->header.magic == Magic(kMagicAllocated, &f->header),
+              "bad magic number in Free()");
+    LowLevelAlloc::Arena *arena = f->header.arena;
+    if ((arena->flags & kCallMallocHook) != 0) {
+      MallocHook::InvokeDeleteHook(v);
+    }
+    ArenaLock section(arena);
+    AddToFreelist(v, arena);
+    RAW_CHECK(arena->allocation_count > 0, "nothing in arena to free");
+    arena->allocation_count--;
+    section.Leave();
+  }
+}
+
+// allocates and returns a block of at least "request" bytes, to be freed with Free()
+// L < arena->mu
+static void *DoAllocWithArena(size_t request, LowLevelAlloc::Arena *arena) {
+  void *result = 0;
+  if (request != 0) {
+    AllocList *s;       // will point to region that satisfies request
+    ArenaLock section(arena);
+    ArenaInit(arena);
+    // round up with header
+    size_t req_rnd = RoundUp(request + sizeof (s->header), arena->roundup);
+    for (;;) {      // loop until we find a suitable region
+      // find the minimum levels that a block of this size must have
+      int i = LLA_SkiplistLevels(req_rnd, arena->min_size, false) - 1;
+      if (i < arena->freelist.levels) {   // potential blocks exist
+        AllocList *before = &arena->freelist;  // predecessor of s
+        while ((s = Next(i, before, arena)) != 0 && s->header.size < req_rnd) {
+          before = s;
+        }
+        if (s != 0) {       // we found a region
+          break;
+        }
+      }
+      // we unlock before mmap() both because mmap() may call a callback hook,
+      // and because it may be slow.
+      arena->mu.Unlock();
+      // mmap generous 64K chunks to decrease
+      // the chances/impact of fragmentation:
+      size_t new_pages_size = RoundUp(req_rnd, arena->pagesize * 16);
+      void *new_pages;
+      if ((arena->flags & LowLevelAlloc::kAsyncSignalSafe) != 0) {
+        new_pages = MallocHook::UnhookedMMap(0, new_pages_size,
+            PROT_WRITE|PROT_READ, MAP_ANONYMOUS|MAP_PRIVATE, -1, 0);
+      } else {
+        new_pages = mmap(0, new_pages_size,
+            PROT_WRITE|PROT_READ, MAP_ANONYMOUS|MAP_PRIVATE, -1, 0);
+      }
+      RAW_CHECK(new_pages != MAP_FAILED, "mmap error");
+      arena->mu.Lock();
+      s = reinterpret_cast<AllocList *>(new_pages);
+      s->header.size = new_pages_size;
+      // Pretend the block is allocated; call AddToFreelist() to free it.
+      s->header.magic = Magic(kMagicAllocated, &s->header);
+      s->header.arena = arena;
+      AddToFreelist(&s->levels, arena);  // insert new region into free list
+    }
+    AllocList *prev[kMaxLevel];
+    LLA_SkiplistDelete(&arena->freelist, s, prev);    // remove from free list
+    // s points to the first free region that's big enough
+    if (req_rnd + arena->min_size <= s->header.size) {  // big enough to split
+      AllocList *n = reinterpret_cast<AllocList *>
+                        (req_rnd + reinterpret_cast<char *>(s));
+      n->header.size = s->header.size - req_rnd;
+      n->header.magic = Magic(kMagicAllocated, &n->header);
+      n->header.arena = arena;
+      s->header.size = req_rnd;
+      AddToFreelist(&n->levels, arena);
+    }
+    s->header.magic = Magic(kMagicAllocated, &s->header);
+    RAW_CHECK(s->header.arena == arena, "");
+    arena->allocation_count++;
+    section.Leave();
+    result = &s->levels;
+  }
+  ANNOTATE_NEW_MEMORY(result, request);
+  return result;
+}
+
+void *LowLevelAlloc::Alloc(size_t request) {
+  void *result = DoAllocWithArena(request, &default_arena);
+  if ((default_arena.flags & kCallMallocHook) != 0) {
+    // this call must be directly in the user-called allocator function
+    // for MallocHook::GetCallerStackTrace to work properly
+    MallocHook::InvokeNewHook(result, request);
+  }
+  return result;
+}
+
+void *LowLevelAlloc::AllocWithArena(size_t request, Arena *arena) {
+  RAW_CHECK(arena != 0, "must pass a valid arena");
+  void *result = DoAllocWithArena(request, arena);
+  if ((arena->flags & kCallMallocHook) != 0) {
+    // this call must be directly in the user-called allocator function
+    // for MallocHook::GetCallerStackTrace to work properly
+    MallocHook::InvokeNewHook(result, request);
+  }
+  return result;
+}
+
+LowLevelAlloc::Arena *LowLevelAlloc::DefaultArena() {
+  return &default_arena;
+}
diff --git a/src/base/low_level_alloc.h b/src/base/low_level_alloc.h
new file mode 100644
index 0000000..4081ff8
--- /dev/null
+++ b/src/base/low_level_alloc.h
@@ -0,0 +1,107 @@
+// -*- Mode: C++; c-basic-offset: 2; indent-tabs-mode: nil -*-
+/* Copyright (c) 2006, Google Inc.
+ * All rights reserved.
+ * 
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are
+ * met:
+ * 
+ *     * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following disclaimer
+ * in the documentation and/or other materials provided with the
+ * distribution.
+ *     * Neither the name of Google Inc. nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ * 
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#if !defined(_BASE_LOW_LEVEL_ALLOC_H_)
+#define _BASE_LOW_LEVEL_ALLOC_H_
+
+// A simple thread-safe memory allocator that does not depend on
+// mutexes or thread-specific data.  It is intended to be used
+// sparingly, and only when malloc() would introduce an unwanted
+// dependency, such as inside the heap-checker.
+
+#include <config.h>
+#include <stddef.h>             // for size_t
+#include "base/basictypes.h"
+
+class LowLevelAlloc {
+ public:
+  struct Arena;       // an arena from which memory may be allocated
+
+  // Returns a pointer to a block of at least "request" bytes
+  // that have been newly allocated from the specified arena;
+  // for an Alloc() call, the DefaultArena() is used.
+  // Returns 0 if passed request==0.
+  // Does not return 0 under other circumstances; it crashes if memory
+  // is not available.
+  static void *Alloc(size_t request)
+    ATTRIBUTE_SECTION(malloc_hook);
+  static void *AllocWithArena(size_t request, Arena *arena)
+    ATTRIBUTE_SECTION(malloc_hook);
+
+  // Deallocates a region of memory that was previously allocated with
+  // Alloc().   Does nothing if passed 0.   "s" must be either 0,
+  // or must have been returned from a call to Alloc() and not yet passed to
+  // Free() since that call to Alloc().  The space is returned to the arena
+  // from which it was allocated.
+  static void Free(void *s) ATTRIBUTE_SECTION(malloc_hook);
+
+    // The ATTRIBUTE_SECTION(malloc_hook) annotations on Alloc* and Free
+    // put all callers of MallocHook::Invoke* in this module
+    // into a special section,
+    // so that MallocHook::GetCallerStackTrace can function accurately.
+
+  // Create a new arena.
+  // The root metadata for the new arena is allocated in the
+  // meta_data_arena; the DefaultArena() can be passed for meta_data_arena.
+  // These values may be ORed into flags:
+  enum {
+    // Report calls to Alloc() and Free() via the MallocHook interface.
+    // Set in the DefaultArena.
+    kCallMallocHook = 0x0001,
+
+    // Make calls to Alloc(), Free() be async-signal-safe.  Not set in
+    // DefaultArena().
+    kAsyncSignalSafe = 0x0002,
+
+    // When used with DefaultArena(), the NewArena() and DeleteArena() calls
+    // obey the flags given explicitly in the NewArena() call, even if those
+    // flags differ from the settings in DefaultArena().  So the call
+    // NewArena(kAsyncSignalSafe, DefaultArena()) is itself async-signal-safe,
+    // as well as generating an arena that provides async-signal-safe
+    // Alloc/Free.
+  };
+  static Arena *NewArena(int32 flags, Arena *meta_data_arena);
+
+  // Destroys an arena allocated by NewArena and returns true,
+  // provided no allocated blocks remain in the arena.
+  // If allocated blocks remain in the arena, does nothing and
+  // returns false.
+  // It is illegal to attempt to destroy the DefaultArena().
+  static bool DeleteArena(Arena *arena);
+
+  // The default arena that always exists.
+  static Arena *DefaultArena();
+
+ private:
+  LowLevelAlloc();      // no instances
+};
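+
+// Illustrative usage (a sketch; the flags, size, and metadata arena chosen
+// here are placeholders, not recommendations):
+//   LowLevelAlloc::Arena* arena =
+//       LowLevelAlloc::NewArena(0, LowLevelAlloc::DefaultArena());
+//   void* p = LowLevelAlloc::AllocWithArena(1024, arena);
+//   ...
+//   LowLevelAlloc::Free(p);
+//   LowLevelAlloc::DeleteArena(arena);  // ok: no allocated blocks remain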
+
+#endif
diff --git a/src/base/simple_mutex.h b/src/base/simple_mutex.h
new file mode 100644
index 0000000..a1886e4
--- /dev/null
+++ b/src/base/simple_mutex.h
@@ -0,0 +1,332 @@
+// -*- Mode: C++; c-basic-offset: 2; indent-tabs-mode: nil -*-
+// Copyright (c) 2007, Google Inc.
+// All rights reserved.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+// 
+//     * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+//     * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+// 
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+// 
+// ---
+// Author: Craig Silverstein.
+//
+// A simple mutex wrapper, supporting locks and read-write locks.
+// You should assume the locks are *not* re-entrant.
+//
+// To use: you should define the following macros in your configure.ac:
+//   ACX_PTHREAD
+//   AC_RWLOCK
+// The latter is defined in ../autoconf.
+//
+// This class is meant to be internal-only and should be wrapped by an
+// internal namespace.  Before you use this module, please give the
+// name of your internal namespace for this module.  Or, if you want
+// to expose it, you'll want to move it to the Google namespace.  We
+// cannot put this class in global namespace because there can be some
+// problems when we have multiple versions of Mutex in each shared object.
+//
+// NOTE: TryLock() is broken for NO_THREADS mode, at least in NDEBUG
+//       mode.
+//
+// CYGWIN NOTE: Cygwin support for rwlock seems to be buggy:
+//    http://www.cygwin.com/ml/cygwin/2008-12/msg00017.html
+// Because of that, we might as well use windows locks for
+// cygwin.  They seem to be more reliable than the cygwin pthreads layer.
+//
+// TRICKY IMPLEMENTATION NOTE:
+// This class is designed to be safe to use during
+// dynamic-initialization -- that is, by global constructors that are
+// run before main() starts.  The issue in this case is that
+// dynamic-initialization happens in an unpredictable order, and it
+// could be that someone else's dynamic initializer could call a
+// function that tries to acquire this mutex -- but that all happens
+// before this mutex's constructor has run.  (This can happen even if
+// the mutex and the function that uses the mutex are in the same .cc
+// file.)  Basically, because Mutex does non-trivial work in its
+// constructor, it's not, in the naive implementation, safe to use
+// before dynamic initialization has run on it.
+//
+// The solution used here is to pair the actual mutex primitive with a
+// bool that is set to true when the mutex is dynamically initialized.
+// (Before that it's false.)  Then we modify all mutex routines to
+// look at the bool, and not try to lock/unlock until the bool makes
+// it to true (which happens after the Mutex constructor has run.)
+//
+// This works because before main() starts -- particularly, during
+// dynamic initialization -- there are no threads, so a) it's ok that
+// the mutex operations are a no-op, since we don't need locking then
+// anyway; and b) we can be quite confident our bool won't change
+// state between a call to Lock() and a call to Unlock() (that would
+// require a global constructor in one translation unit to call Lock()
+// and another global constructor in another translation unit to call
+// Unlock() later, which is pretty perverse).
+//
+// That said, it's tricky, and can conceivably fail; it's safest to
+// avoid trying to acquire a mutex in a global constructor, if you
+// can.  One way it can fail is that a really smart compiler might
+// initialize the bool to true at static-initialization time (too
+// early) rather than at dynamic-initialization time.  To discourage
+// that, we set is_safe_ to true in code (not the constructor
+// colon-initializer) and set it to true via a function that always
+// evaluates to true, but that the compiler can't know always
+// evaluates to true.  This should be good enough.
+//
+// A related issue is code that could try to access the mutex
+// after it's been destroyed in the global destructors (because
+// the Mutex global destructor runs before some other global
+// destructor, that tries to acquire the mutex).  The way we
+// deal with this is by taking a constructor arg that global
+// mutexes should pass in, that causes the destructor to do no
+// work.  We still depend on the compiler not doing anything
+// weird to a Mutex's memory after it is destroyed, but for a
+// static global variable, that's pretty safe.
+
+#ifndef GOOGLE_MUTEX_H_
+#define GOOGLE_MUTEX_H_
+
+#include <config.h>
+
+#if defined(NO_THREADS)
+  typedef int MutexType;      // to keep a lock-count
+#elif defined(_WIN32) || defined(__CYGWIN__) || defined(__CYGWIN32__)
+# ifndef WIN32_LEAN_AND_MEAN
+#   define WIN32_LEAN_AND_MEAN  // We only need minimal includes
+# endif
+  // We need Windows NT or later for TryEnterCriticalSection().  If you
+  // don't need that functionality, you can remove these _WIN32_WINNT
+  // lines, and change TryLock() to assert(0) or something.
+# ifndef _WIN32_WINNT
+#   define _WIN32_WINNT 0x0400
+# endif
+# include <windows.h>
+  typedef CRITICAL_SECTION MutexType;
+#elif defined(HAVE_PTHREAD) && defined(HAVE_RWLOCK)
+  // Needed for pthread_rwlock_*.  If it causes problems, you could take it
+  // out, but then you'd have to unset HAVE_RWLOCK (at least on linux -- it
+  // *does* cause problems for FreeBSD or MacOSX, but isn't needed
+  // for locking there.)
+# ifdef __linux__
+#   define _XOPEN_SOURCE 500  // may be needed to get the rwlock calls
+# endif
+# include <pthread.h>
+  typedef pthread_rwlock_t MutexType;
+#elif defined(HAVE_PTHREAD)
+# include <pthread.h>
+  typedef pthread_mutex_t MutexType;
+#else
+# error Need to implement mutex.h for your architecture, or #define NO_THREADS
+#endif
+
+#include <assert.h>
+#include <stdlib.h>      // for abort()
+
+#define MUTEX_NAMESPACE perftools_mutex_namespace
+
+namespace MUTEX_NAMESPACE {
+
+class Mutex {
+ public:
+  // This is used for the single-arg constructor
+  enum LinkerInitialized { LINKER_INITIALIZED };
+
+  // Create a Mutex that is not held by anybody.  This constructor is
+  // typically used for Mutexes allocated on the heap or the stack.
+  inline Mutex();
+  // This constructor should be used for global, static Mutex objects.
+  // It inhibits work being done by the destructor, which makes it
+  // safer for code that tries to acquire this mutex in its global
+  // destructor.
+  inline Mutex(LinkerInitialized);
+
+  // Destructor
+  inline ~Mutex();
+
+  inline void Lock();    // Block if needed until free then acquire exclusively
+  inline void Unlock();  // Release a lock acquired via Lock()
+  inline bool TryLock(); // If free, Lock() and return true, else return false
+  // Note that on systems that don't support read-write locks, these may
+  // be implemented as synonyms to Lock() and Unlock().  So you can use
+  // these for efficiency, but don't use them anyplace where being able
+  // to do shared reads is necessary to avoid deadlock.
+  inline void ReaderLock();   // Block until free or shared then acquire a share
+  inline void ReaderUnlock(); // Release a read share of this Mutex
+  inline void WriterLock() { Lock(); }     // Acquire an exclusive lock
+  inline void WriterUnlock() { Unlock(); } // Release a lock from WriterLock()
+
+ private:
+  MutexType mutex_;
+  // We want to make sure that the compiler sets is_safe_ to true only
+  // when we tell it to, and never assumes that is_safe_ is
+  // always true.  volatile is the most reliable way to do that.
+  volatile bool is_safe_;
+  // This indicates which constructor was called.
+  bool destroy_;
+
+  inline void SetIsSafe() { is_safe_ = true; }
+
+  // Catch the error of writing Mutex when intending MutexLock.
+  Mutex(Mutex* /*ignored*/) {}
+  // Disallow "evil" constructors
+  Mutex(const Mutex&);
+  void operator=(const Mutex&);
+};
+
+// Now the implementation of Mutex for various systems
+#if defined(NO_THREADS)
+
+// When we don't have threads, we can be either reading or writing,
+// but not both.  We can have lots of readers at once (in no-threads
+// mode, that's most likely to happen in recursive function calls),
+// but only one writer.  We represent this by having mutex_ be -1 when
+// writing and a number > 0 when reading (and 0 when no lock is held).
+//
+// In debug mode, we assert these invariants, while in non-debug mode
+// we do nothing, for efficiency.  That's why everything is in an
+// assert.
+
+Mutex::Mutex() : mutex_(0) { }
+Mutex::Mutex(Mutex::LinkerInitialized) : mutex_(0) { }
+Mutex::~Mutex()            { assert(mutex_ == 0); }
+void Mutex::Lock()         { assert(--mutex_ == -1); }
+void Mutex::Unlock()       { assert(mutex_++ == -1); }
+bool Mutex::TryLock()      { if (mutex_) return false; Lock(); return true; }
+void Mutex::ReaderLock()   { assert(++mutex_ > 0); }
+void Mutex::ReaderUnlock() { assert(mutex_-- > 0); }
+
+#elif defined(_WIN32) || defined(__CYGWIN__) || defined(__CYGWIN32__)
+
+Mutex::Mutex() : destroy_(true) {
+  InitializeCriticalSection(&mutex_);
+  SetIsSafe();
+}
+Mutex::Mutex(LinkerInitialized) : destroy_(false) {
+  InitializeCriticalSection(&mutex_);
+  SetIsSafe();
+}
+Mutex::~Mutex()            { if (destroy_) DeleteCriticalSection(&mutex_); }
+void Mutex::Lock()         { if (is_safe_) EnterCriticalSection(&mutex_); }
+void Mutex::Unlock()       { if (is_safe_) LeaveCriticalSection(&mutex_); }
+bool Mutex::TryLock()      { return is_safe_ ?
+                                 TryEnterCriticalSection(&mutex_) != 0 : true; }
+void Mutex::ReaderLock()   { Lock(); }      // we don't have read-write locks
+void Mutex::ReaderUnlock() { Unlock(); }
+
+#elif defined(HAVE_PTHREAD) && defined(HAVE_RWLOCK)
+
+#define SAFE_PTHREAD(fncall)  do {   /* run fncall if is_safe_ is true */  \
+  if (is_safe_ && fncall(&mutex_) != 0) abort();                           \
+} while (0)
+
+Mutex::Mutex() : destroy_(true) {
+  SetIsSafe();
+  if (is_safe_ && pthread_rwlock_init(&mutex_, NULL) != 0) abort();
+}
+Mutex::Mutex(Mutex::LinkerInitialized) : destroy_(false) {
+  SetIsSafe();
+  if (is_safe_ && pthread_rwlock_init(&mutex_, NULL) != 0) abort();
+}
+Mutex::~Mutex()       { if (destroy_) SAFE_PTHREAD(pthread_rwlock_destroy); }
+void Mutex::Lock()         { SAFE_PTHREAD(pthread_rwlock_wrlock); }
+void Mutex::Unlock()       { SAFE_PTHREAD(pthread_rwlock_unlock); }
+bool Mutex::TryLock()      { return is_safe_ ?
+                               pthread_rwlock_trywrlock(&mutex_) == 0 : true; }
+void Mutex::ReaderLock()   { SAFE_PTHREAD(pthread_rwlock_rdlock); }
+void Mutex::ReaderUnlock() { SAFE_PTHREAD(pthread_rwlock_unlock); }
+#undef SAFE_PTHREAD
+
+#elif defined(HAVE_PTHREAD)
+
+#define SAFE_PTHREAD(fncall)  do {   /* run fncall if is_safe_ is true */  \
+  if (is_safe_ && fncall(&mutex_) != 0) abort();                           \
+} while (0)
+
+Mutex::Mutex() : destroy_(true) {
+  SetIsSafe();
+  if (is_safe_ && pthread_mutex_init(&mutex_, NULL) != 0) abort();
+}
+Mutex::Mutex(Mutex::LinkerInitialized) : destroy_(false) {
+  SetIsSafe();
+  if (is_safe_ && pthread_mutex_init(&mutex_, NULL) != 0) abort();
+}
+Mutex::~Mutex()       { if (destroy_) SAFE_PTHREAD(pthread_mutex_destroy); }
+void Mutex::Lock()         { SAFE_PTHREAD(pthread_mutex_lock); }
+void Mutex::Unlock()       { SAFE_PTHREAD(pthread_mutex_unlock); }
+bool Mutex::TryLock()      { return is_safe_ ?
+                                 pthread_mutex_trylock(&mutex_) == 0 : true; }
+void Mutex::ReaderLock()   { Lock(); }
+void Mutex::ReaderUnlock() { Unlock(); }
+#undef SAFE_PTHREAD
+
+#endif
+
+// --------------------------------------------------------------------------
+// Some helper classes
+
+// MutexLock(mu) acquires mu when constructed and releases it when destroyed.
+class MutexLock {
+ public:
+  explicit MutexLock(Mutex *mu) : mu_(mu) { mu_->Lock(); }
+  ~MutexLock() { mu_->Unlock(); }
+ private:
+  Mutex * const mu_;
+  // Disallow "evil" constructors
+  MutexLock(const MutexLock&);
+  void operator=(const MutexLock&);
+};
+
+// ReaderMutexLock and WriterMutexLock do the same, for rwlocks
+class ReaderMutexLock {
+ public:
+  explicit ReaderMutexLock(Mutex *mu) : mu_(mu) { mu_->ReaderLock(); }
+  ~ReaderMutexLock() { mu_->ReaderUnlock(); }
+ private:
+  Mutex * const mu_;
+  // Disallow "evil" constructors
+  ReaderMutexLock(const ReaderMutexLock&);
+  void operator=(const ReaderMutexLock&);
+};
+
+class WriterMutexLock {
+ public:
+  explicit WriterMutexLock(Mutex *mu) : mu_(mu) { mu_->WriterLock(); }
+  ~WriterMutexLock() { mu_->WriterUnlock(); }
+ private:
+  Mutex * const mu_;
+  // Disallow "evil" constructors
+  WriterMutexLock(const WriterMutexLock&);
+  void operator=(const WriterMutexLock&);
+};
+
+// Catch bug where variable name is omitted, e.g. MutexLock (&mu);
+#define MutexLock(x) COMPILE_ASSERT(0, mutex_lock_decl_missing_var_name)
+#define ReaderMutexLock(x) COMPILE_ASSERT(0, rmutex_lock_decl_missing_var_name)
+#define WriterMutexLock(x) COMPILE_ASSERT(0, wmutex_lock_decl_missing_var_name)
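+
+// Illustrative usage (a sketch; "counter" is a placeholder):
+//   static Mutex mu(Mutex::LINKER_INITIALIZED);  // safe to use as a global
+//   void Increment() {
+//     MutexLock l(&mu);   // acquires mu; released when l goes out of scope
+//     ++counter;
+//   }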
+
+}  // namespace MUTEX_NAMESPACE
+
+using namespace MUTEX_NAMESPACE;
+
+#undef MUTEX_NAMESPACE
+
+#endif  /* GOOGLE_MUTEX_H_ */
diff --git a/src/base/spinlock.cc b/src/base/spinlock.cc
new file mode 100644
index 0000000..2021fec
--- /dev/null
+++ b/src/base/spinlock.cc
@@ -0,0 +1,183 @@
+// -*- Mode: C++; c-basic-offset: 2; indent-tabs-mode: nil -*-
+/* Copyright (c) 2006, Google Inc.
+ * All rights reserved.
+ * 
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are
+ * met:
+ * 
+ *     * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following disclaimer
+ * in the documentation and/or other materials provided with the
+ * distribution.
+ *     * Neither the name of Google Inc. nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ * 
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * ---
+ * Author: Sanjay Ghemawat
+ */
+
+#include <config.h>
+#include "base/spinlock.h"
+#include "base/synchronization_profiling.h"
+#include "base/spinlock_internal.h"
+#include "base/cycleclock.h"
+#include "base/sysinfo.h"   /* for NumCPUs() */
+
+// NOTE on the Lock-state values:
+//
+//   kSpinLockFree represents the unlocked state
+//   kSpinLockHeld represents the locked state with no waiters
+//
+// Values greater than kSpinLockHeld represent the locked state with waiters,
+// where the value is the time the current lock holder had to
+// wait before obtaining the lock.  The kSpinLockSleeper state is a special
+// "locked with waiters" state that indicates that a sleeper needs to
+// be woken, but the thread that just released the lock didn't wait.
+
+static int adaptive_spin_count = 0;
+
+const base::LinkerInitialized SpinLock::LINKER_INITIALIZED =
+    base::LINKER_INITIALIZED;
+
+namespace {
+struct SpinLock_InitHelper {
+  SpinLock_InitHelper() {
+    // On multi-cpu machines, spin for longer before yielding
+    // the processor or sleeping.  Reduces idle time significantly.
+    if (NumCPUs() > 1) {
+      adaptive_spin_count = 1000;
+    }
+  }
+};
+
+// Hook into global constructor execution:
+// We do not do adaptive spinning before that,
+// but nothing lock-intensive should be going on at that time.
+static SpinLock_InitHelper init_helper;
+
+}  // unnamed namespace
+
+// Monitor the lock to see if its value changes within some time period
+// (adaptive_spin_count loop iterations).  A timestamp indicating
+// when the thread initially started waiting for the lock is passed in via
+// the initial_wait_timestamp value.  The total wait time in cycles for the
+// lock is returned in the wait_cycles parameter.  The last value read
+// from the lock is returned from the method.
+Atomic32 SpinLock::SpinLoop(int64 initial_wait_timestamp,
+                            Atomic32* wait_cycles) {
+  int c = adaptive_spin_count;
+  while (base::subtle::NoBarrier_Load(&lockword_) != kSpinLockFree && --c > 0) {
+  }
+  Atomic32 spin_loop_wait_cycles = CalculateWaitCycles(initial_wait_timestamp);
+  Atomic32 lock_value =
+      base::subtle::Acquire_CompareAndSwap(&lockword_, kSpinLockFree,
+                                           spin_loop_wait_cycles);
+  *wait_cycles = spin_loop_wait_cycles;
+  return lock_value;
+}
+
+void SpinLock::SlowLock() {
+  // The lock was not obtained initially, so this thread needs to wait for
+  // it.  Record the current timestamp in the local variable wait_start_time
+  // so the total wait time can be stored in the lockword once this thread
+  // obtains the lock.
+  int64 wait_start_time = CycleClock::Now();
+  Atomic32 wait_cycles;
+  Atomic32 lock_value = SpinLoop(wait_start_time, &wait_cycles);
+
+  int lock_wait_call_count = 0;
+  while (lock_value != kSpinLockFree) {
+    // If the lock is currently held, but not marked as having a sleeper, mark
+    // it as having a sleeper.
+    if (lock_value == kSpinLockHeld) {
+      // Here, just "mark" that the thread is going to sleep.  Don't store the
+      // lock wait time in the lock as that will cause the current lock
+      // owner to think it experienced contention.
+      lock_value = base::subtle::Acquire_CompareAndSwap(&lockword_,
+                                                        kSpinLockHeld,
+                                                        kSpinLockSleeper);
+      if (lock_value == kSpinLockHeld) {
+        // Successfully transitioned to kSpinLockSleeper.  Pass
+        // kSpinLockSleeper to the SpinLockWait routine to properly indicate
+        // the last lock_value observed.
+        lock_value = kSpinLockSleeper;
+      } else if (lock_value == kSpinLockFree) {
+        // Lock is free again, so try and acquire it before sleeping.  The
+        // new lock state will be the number of cycles this thread waited if
+        // this thread obtains the lock.
+        lock_value = base::subtle::Acquire_CompareAndSwap(&lockword_,
+                                                          kSpinLockFree,
+                                                          wait_cycles);
+        continue;  // skip the delay at the end of the loop
+      }
+    }
+
+    // Wait for an OS specific delay.
+    base::internal::SpinLockDelay(&lockword_, lock_value,
+                                  ++lock_wait_call_count);
+    // Spin again after returning from the wait routine to give this thread
+    // some chance of obtaining the lock.
+    lock_value = SpinLoop(wait_start_time, &wait_cycles);
+  }
+}
+
+// The wait time for contentionz lock profiling must fit into 32 bits.
+// However, the lower 32-bits of the cycle counter wrap around too quickly
+// with high frequency processors, so a right-shift by 7 is performed to
+// quickly divide the cycles by 128.  Using these 32 bits reduces the
+// granularity of time measurement to 128 cycles and loses track
+// of wait time for waits greater than 109 seconds on a 5 GHz machine
+// [(2^32 cycles/5 GHz)*128 = 109.95 seconds]. Waits this long should be
+// very rare and the reduced granularity should not be an issue given
+// processors in the Google fleet operate at a minimum of one billion
+// cycles/sec.
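+//
+// For example (illustrative figures only): a wait of 1,000,000 cycles is
+// stored in the lockword as 1000000 >> 7 = 7812 and expanded back by
+// SlowUnlock() as 7812 << 7 = 999,936 cycles.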
+enum { PROFILE_TIMESTAMP_SHIFT = 7 };
+
+void SpinLock::SlowUnlock(uint64 wait_cycles) {
+  base::internal::SpinLockWake(&lockword_, false);  // wake waiter if necessary
+
+  // Collect contentionz profile info, expanding the wait_cycles back out to
+  // the full value.  If wait_cycles is <= kSpinLockSleeper, then no wait
+// was actually performed, so don't record the wait time.  Note that the
+// CalculateWaitCycles method adds in kSpinLockSleeper cycles
+// unconditionally to guarantee the wait time is not kSpinLockFree or
+// kSpinLockHeld.  Adding in this small number of cycles may
+// overestimate the contention by a slight amount 50% of the time.  However,
+  // if this code tried to correct for that addition by subtracting out the
+  // kSpinLockSleeper amount that would underestimate the contention slightly
+  // 50% of the time.  Both ways get the wrong answer, so the code
+  // overestimates to be more conservative. Overestimating also makes the code
+  // a little simpler.
+  //
+  if (wait_cycles > kSpinLockSleeper) {
+    base::SubmitSpinLockProfileData(this,
+                                    wait_cycles << PROFILE_TIMESTAMP_SHIFT);
+  }
+}
+
+inline int32 SpinLock::CalculateWaitCycles(int64 wait_start_time) {
+  int32 wait_cycles = ((CycleClock::Now() - wait_start_time) >>
+                       PROFILE_TIMESTAMP_SHIFT);
+  // The number of cycles waiting for the lock is used as both the
+  // wait_cycles and lock value, so it can't be kSpinLockFree or
+  // kSpinLockHeld.  Make sure the value returned is at least
+  // kSpinLockSleeper.
+  wait_cycles |= kSpinLockSleeper;
+  return wait_cycles;
+}
diff --git a/src/base/spinlock.h b/src/base/spinlock.h
new file mode 100644
index 0000000..033a75e
--- /dev/null
+++ b/src/base/spinlock.h
@@ -0,0 +1,146 @@
+// -*- Mode: C++; c-basic-offset: 2; indent-tabs-mode: nil -*-
+/* Copyright (c) 2006, Google Inc.
+ * All rights reserved.
+ * 
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are
+ * met:
+ * 
+ *     * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following disclaimer
+ * in the documentation and/or other materials provided with the
+ * distribution.
+ *     * Neither the name of Google Inc. nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ * 
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * ---
+ * Author: Sanjay Ghemawat
+ */
+
+// SpinLock is async signal safe.
+// If used within a signal handler, all lock holders
+// should block the signal even outside the signal handler.
+
+#ifndef BASE_SPINLOCK_H_
+#define BASE_SPINLOCK_H_
+
+#include <config.h>
+#include "base/atomicops.h"
+#include "base/basictypes.h"
+#include "base/dynamic_annotations.h"
+#include "base/thread_annotations.h"
+
+class LOCKABLE SpinLock {
+ public:
+  SpinLock() : lockword_(kSpinLockFree) { }
+
+  // Special constructor for use with static SpinLock objects.  E.g.,
+  //
+  //    static SpinLock lock(base::LINKER_INITIALIZED);
+  //
+// When initialized using this constructor, we depend on the fact
+  // that the linker has already initialized the memory appropriately.
+  // A SpinLock constructed like this can be freely used from global
+  // initializers without worrying about the order in which global
+  // initializers run.
+  explicit SpinLock(base::LinkerInitialized /*x*/) {
+    // Does nothing; lockword_ is already initialized
+  }
+
+  // Acquire this SpinLock.
+  // TODO(csilvers): uncomment the annotation when we figure out how to
+  //                 support this macro with 0 args (see thread_annotations.h)
+  inline void Lock() /*EXCLUSIVE_LOCK_FUNCTION()*/ {
+    if (base::subtle::Acquire_CompareAndSwap(&lockword_, kSpinLockFree,
+                                             kSpinLockHeld) != kSpinLockFree) {
+      SlowLock();
+    }
+    ANNOTATE_RWLOCK_ACQUIRED(this, 1);
+  }
+
+  // Try to acquire this SpinLock without blocking and return true if the
+  // acquisition was successful.  If the lock was not acquired, false is
+  // returned.  If this SpinLock is free at the time of the call, TryLock
+  // will return true with high probability.
+  inline bool TryLock() EXCLUSIVE_TRYLOCK_FUNCTION(true) {
+    bool res =
+        (base::subtle::Acquire_CompareAndSwap(&lockword_, kSpinLockFree,
+                                              kSpinLockHeld) == kSpinLockFree);
+    if (res) {
+      ANNOTATE_RWLOCK_ACQUIRED(this, 1);
+    }
+    return res;
+  }
+
+  // Release this SpinLock, which must be held by the calling thread.
+  // TODO(csilvers): uncomment the annotation when we figure out how to
+  //                 support this macro with 0 args (see thread_annotations.h)
+  inline void Unlock() /*UNLOCK_FUNCTION()*/ {
+    ANNOTATE_RWLOCK_RELEASED(this, 1);
+    uint64 wait_cycles = static_cast<uint64>(
+        base::subtle::Release_AtomicExchange(&lockword_, kSpinLockFree));
+    if (wait_cycles != kSpinLockHeld) {
+      // Collect contentionz profile info, and speed the wakeup of any waiter.
+      // The wait_cycles value indicates how long this thread spent waiting
+      // for the lock.
+      SlowUnlock(wait_cycles);
+    }
+  }
+
+  // Determine if the lock is held.  When the lock is held by the invoking
+  // thread, true will always be returned. Intended to be used as
+  // CHECK(lock.IsHeld()).
+  inline bool IsHeld() const {
+    return base::subtle::NoBarrier_Load(&lockword_) != kSpinLockFree;
+  }
+
+  static const base::LinkerInitialized LINKER_INITIALIZED;  // backwards compat
+ private:
+  enum { kSpinLockFree = 0 };
+  enum { kSpinLockHeld = 1 };
+  enum { kSpinLockSleeper = 2 };
+
+  volatile Atomic32 lockword_;
+
+  void SlowLock();
+  void SlowUnlock(uint64 wait_cycles);
+  Atomic32 SpinLoop(int64 initial_wait_timestamp, Atomic32* wait_cycles);
+  inline int32 CalculateWaitCycles(int64 wait_start_time);
+
+  DISALLOW_COPY_AND_ASSIGN(SpinLock);
+};
+
+// Corresponding locker object that arranges to acquire a spinlock for
+// the duration of a C++ scope.
+class SCOPED_LOCKABLE SpinLockHolder {
+ private:
+  SpinLock* lock_;
+ public:
+  inline explicit SpinLockHolder(SpinLock* l) EXCLUSIVE_LOCK_FUNCTION(l)
+      : lock_(l) {
+    l->Lock();
+  }
+  // TODO(csilvers): uncomment the annotation when we figure out how to
+  //                 support this macro with 0 args (see thread_annotations.h)
+  inline ~SpinLockHolder() /*UNLOCK_FUNCTION()*/ { lock_->Unlock(); }
+};
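+
+// Illustrative usage sketch (all names below are hypothetical examples, not
+// part of this library):
+//
+//   static SpinLock counter_lock(base::LINKER_INITIALIZED);
+//   static int counter = 0;
+//
+//   void Increment() {
+//     SpinLockHolder holder(&counter_lock);  // acquires counter_lock
+//     counter++;
+//   }                                        // releases at end of scope
+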
+// Catch bug where variable name is omitted, e.g. SpinLockHolder (&lock);
+#define SpinLockHolder(x) COMPILE_ASSERT(0, spin_lock_decl_missing_var_name)
+
+
+#endif  // BASE_SPINLOCK_H_
diff --git a/src/base/spinlock_internal.cc b/src/base/spinlock_internal.cc
new file mode 100644
index 0000000..e090f9b
--- /dev/null
+++ b/src/base/spinlock_internal.cc
@@ -0,0 +1,122 @@
+// -*- Mode: C++; c-basic-offset: 2; indent-tabs-mode: nil -*-
+/* Copyright (c) 2010, Google Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are
+ * met:
+ *
+ *     * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following disclaimer
+ * in the documentation and/or other materials provided with the
+ * distribution.
+ *     * Neither the name of Google Inc. nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+// The OS-specific header included below must provide two calls:
+// base::internal::SpinLockDelay() and base::internal::SpinLockWake().
+// See spinlock_internal.h for the spec of SpinLockWake().
+
+// void SpinLockDelay(volatile Atomic32 *w, int32 value, int loop)
+// SpinLockDelay() generates an appropriate spin delay on iteration "loop" of a
+// spin loop on location *w, whose previously observed value was "value".
+// SpinLockDelay() may do nothing, may yield the CPU, may sleep a clock tick,
+// or may wait for a delay that can be truncated by a call to SpinlockWake(w).
+// In all cases, it must return in bounded time even if SpinlockWake() is not
+// called.
+
+#include "base/spinlock_internal.h"
+
+// forward declaration for use by spinlock_*-inl.h
+namespace base { namespace internal { static int SuggestedDelayNS(int loop); }}
+
+#if defined(_WIN32)
+#include "base/spinlock_win32-inl.h"
+#elif defined(__linux__)
+#include "base/spinlock_linux-inl.h"
+#else
+#include "base/spinlock_posix-inl.h"
+#endif
+
+namespace base {
+namespace internal {
+
+// See spinlock_internal.h for spec.
+int32 SpinLockWait(volatile Atomic32 *w, int n,
+                   const SpinLockWaitTransition trans[]) {
+  int32 v;
+  bool done = false;
+  for (int loop = 0; !done; loop++) {
+    v = base::subtle::Acquire_Load(w);
+    int i;
+    for (i = 0; i != n && v != trans[i].from; i++) {
+    }
+    if (i == n) {
+      SpinLockDelay(w, v, loop);     // no matching transition
+    } else if (trans[i].to == v ||   // null transition
+               base::subtle::Acquire_CompareAndSwap(w, v, trans[i].to) == v) {
+      done = trans[i].done;
+    }
+  }
+  return v;
+}
+
+// Return a suggested delay in nanoseconds for iteration number "loop"
+static int SuggestedDelayNS(int loop) {
+  // Weak pseudo-random number generator to get some spread between threads
+  // when many are spinning.
+#ifdef BASE_HAS_ATOMIC64
+  static base::subtle::Atomic64 rand;
+  uint64 r = base::subtle::NoBarrier_Load(&rand);
+  r = 0x5deece66dLL * r + 0xb;   // numbers from nrand48()
+  base::subtle::NoBarrier_Store(&rand, r);
+
+  r <<= 16;   // 48-bit random number now in top 48-bits.
+  if (loop < 0 || loop > 32) {   // limit loop to 0..32
+    loop = 32;
+  }
+  // loop>>3 cannot exceed 4 because loop cannot exceed 32.
+  // Select top 20..24 bits of lower 48 bits,
+  // giving approximately 0ms to 16ms.
+  // Mean is exponential in loop for first 32 iterations, then 8ms.
+  // The futex path multiplies this by 16, since we expect explicit wakeups
+  // almost always on that path.
+  return r >> (44 - (loop >> 3));
+#else
+  static Atomic32 rand;
+  uint32 r = base::subtle::NoBarrier_Load(&rand);
+  r = 0x343fd * r + 0x269ec3;   // numbers from MSVC++
+  base::subtle::NoBarrier_Store(&rand, r);
+
+  r <<= 1;   // 31-bit random number now in top 31-bits.
+  if (loop < 0 || loop > 32) {   // limit loop to 0..32
+    loop = 32;
+  }
+  // loop>>3 cannot exceed 4 because loop cannot exceed 32.
+  // Select top 20..24 bits of lower 31 bits,
+  // giving approximately 0ms to 16ms.
+  // Mean is exponential in loop for first 32 iterations, then 8ms.
+  // The futex path multiplies this by 16, since we expect explicit wakeups
+  // almost always on that path.
+  return r >> (12 - (loop >> 3));
+#endif
+}
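+
+// Illustrative progression of the suggested delay (approximate upper bounds
+// implied by the arithmetic above): iterations 0..7 draw from roughly 0..1ms,
+// 8..15 from 0..2ms, 16..23 from 0..4ms, 24..31 from 0..8ms, and 32 or more
+// from 0..16ms.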
+
+} // namespace internal
+} // namespace base
diff --git a/src/base/spinlock_internal.h b/src/base/spinlock_internal.h
new file mode 100644
index 0000000..4d3c17f
--- /dev/null
+++ b/src/base/spinlock_internal.h
@@ -0,0 +1,65 @@
+// -*- Mode: C++; c-basic-offset: 2; indent-tabs-mode: nil -*-
+/* Copyright (c) 2010, Google Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are
+ * met:
+ *
+ *     * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following disclaimer
+ * in the documentation and/or other materials provided with the
+ * distribution.
+ *     * Neither the name of Google Inc. nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * ---
+ * This file is an internal part of spinlock.cc and once.cc.
+ * It may not be used directly by code outside of //base.
+ */
+
+#ifndef BASE_SPINLOCK_INTERNAL_H_
+#define BASE_SPINLOCK_INTERNAL_H_
+
+#include <config.h>
+#include "base/basictypes.h"
+#include "base/atomicops.h"
+
+namespace base {
+namespace internal {
+
+// SpinLockWait() waits until it can perform one of several transitions from
+// "from" to "to".  It returns when it performs a transition where done==true.
+struct SpinLockWaitTransition {
+  int32 from;
+  int32 to;
+  bool done;
+};
+
+// Wait until *w can transition from trans[i].from to trans[i].to for some i
+// satisfying 0<=i<n && trans[i].done, atomically make the transition,
+// then return the old value of *w.   Make any other atomic transitions
+// where !trans[i].done, but continue waiting.
+int32 SpinLockWait(volatile Atomic32 *w, int n,
+                   const SpinLockWaitTransition trans[]);
+void SpinLockWake(volatile Atomic32 *w, bool all);
+void SpinLockDelay(volatile Atomic32 *w, int32 value, int loop);
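+
+// Illustrative example (the state values below are hypothetical, not part of
+// this library): to wait until *w leaves BUSY and then claim it, a caller
+// might pass a table like
+//
+//   static const SpinLockWaitTransition trans[] = {
+//     { FREE, TAKEN, true },   // free -> taken: done, caller owns the word
+//     { BUSY, BUSY,  false },  // null transition: keep waiting
+//   };
+//   int32 old = SpinLockWait(&word, 2, trans);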
+
+} // namespace internal
+} // namespace base
+#endif
diff --git a/src/base/spinlock_linux-inl.h b/src/base/spinlock_linux-inl.h
new file mode 100644
index 0000000..aadf62a
--- /dev/null
+++ b/src/base/spinlock_linux-inl.h
@@ -0,0 +1,101 @@
+// -*- Mode: C++; c-basic-offset: 2; indent-tabs-mode: nil -*-
+/* Copyright (c) 2009, Google Inc.
+ * All rights reserved.
+ * 
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are
+ * met:
+ * 
+ *     * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following disclaimer
+ * in the documentation and/or other materials provided with the
+ * distribution.
+ *     * Neither the name of Google Inc. nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ * 
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * ---
+ * This file is a Linux-specific part of spinlock_internal.cc
+ */
+
+#include <errno.h>
+#include <sched.h>
+#include <time.h>
+#include <limits.h>
+#include "base/linux_syscall_support.h"
+
+#define FUTEX_WAIT 0
+#define FUTEX_WAKE 1
+#define FUTEX_PRIVATE_FLAG 128
+
+static bool have_futex;
+static int futex_private_flag = FUTEX_PRIVATE_FLAG;
+
+namespace {
+static struct InitModule {
+  InitModule() {
+    int x = 0;
+    // futexes are ints, so we can use them only when
+    // that's the same size as the lockword_ in SpinLock.
+    have_futex = (sizeof (Atomic32) == sizeof (int) &&
+                  sys_futex(&x, FUTEX_WAKE, 1, NULL, NULL, 0) >= 0);
+    if (have_futex &&
+        sys_futex(&x, FUTEX_WAKE | futex_private_flag, 1, NULL, NULL, 0) < 0) {
+      futex_private_flag = 0;
+    }
+  }
+} init_module;
+
+}  // anonymous namespace
+
+
+namespace base {
+namespace internal {
+
+void SpinLockDelay(volatile Atomic32 *w, int32 value, int loop) {
+  if (loop != 0) {
+    int save_errno = errno;
+    struct timespec tm;
+    tm.tv_sec = 0;
+    if (have_futex) {
+      tm.tv_nsec = base::internal::SuggestedDelayNS(loop);
+    } else {
+      tm.tv_nsec = 2000001;   // above 2ms so linux 2.4 doesn't spin
+    }
+    if (have_futex) {
+      tm.tv_nsec *= 16;  // increase the delay; we expect explicit wakeups
+      sys_futex(reinterpret_cast<int *>(const_cast<Atomic32 *>(w)),
+                FUTEX_WAIT | futex_private_flag,
+                value, reinterpret_cast<struct kernel_timespec *>(&tm),
+                NULL, 0);
+    } else {
+      nanosleep(&tm, NULL);
+    }
+    errno = save_errno;
+  }
+}
+
+void SpinLockWake(volatile Atomic32 *w, bool all) {
+  if (have_futex) {
+    sys_futex(reinterpret_cast<int *>(const_cast<Atomic32 *>(w)),
+              FUTEX_WAKE | futex_private_flag, all? INT_MAX : 1,
+              NULL, NULL, 0);
+  }
+}
+
+} // namespace internal
+} // namespace base
diff --git a/src/base/spinlock_posix-inl.h b/src/base/spinlock_posix-inl.h
new file mode 100644
index 0000000..e73a30f
--- /dev/null
+++ b/src/base/spinlock_posix-inl.h
@@ -0,0 +1,63 @@
+// -*- Mode: C++; c-basic-offset: 2; indent-tabs-mode: nil -*-
+/* Copyright (c) 2009, Google Inc.
+ * All rights reserved.
+ * 
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are
+ * met:
+ * 
+ *     * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following disclaimer
+ * in the documentation and/or other materials provided with the
+ * distribution.
+ *     * Neither the name of Google Inc. nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ * 
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * ---
+ * This file is a Posix-specific part of spinlock_internal.cc
+ */
+
+#include <config.h>
+#include <errno.h>
+#ifdef HAVE_SCHED_H
+#include <sched.h>      /* For sched_yield() */
+#endif
+#include <time.h>       /* For nanosleep() */
+
+namespace base {
+namespace internal {
+
+void SpinLockDelay(volatile Atomic32 *w, int32 value, int loop) {
+  int save_errno = errno;
+  if (loop == 0) {
+  } else if (loop == 1) {
+    sched_yield();
+  } else {
+    struct timespec tm;
+    tm.tv_sec = 0;
+    tm.tv_nsec = base::internal::SuggestedDelayNS(loop);
+    nanosleep(&tm, NULL);
+  }
+  errno = save_errno;
+}
+
+void SpinLockWake(volatile Atomic32 *w, bool all) {
+}
+
+} // namespace internal
+} // namespace base
diff --git a/src/base/spinlock_win32-inl.h b/src/base/spinlock_win32-inl.h
new file mode 100644
index 0000000..956b965
--- /dev/null
+++ b/src/base/spinlock_win32-inl.h
@@ -0,0 +1,54 @@
+// -*- Mode: C++; c-basic-offset: 2; indent-tabs-mode: nil -*-
+/* Copyright (c) 2009, Google Inc.
+ * All rights reserved.
+ * 
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are
+ * met:
+ * 
+ *     * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following disclaimer
+ * in the documentation and/or other materials provided with the
+ * distribution.
+ *     * Neither the name of Google Inc. nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ * 
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * ---
+ * This file is a Win32-specific part of spinlock_internal.cc
+ */
+
+
+#include <windows.h>
+
+namespace base {
+namespace internal {
+
+void SpinLockDelay(volatile Atomic32 *w, int32 value, int loop) {
+  if (loop == 0) {
+  } else if (loop == 1) {
+    Sleep(0);
+  } else {
+    Sleep(base::internal::SuggestedDelayNS(loop) / 1000000);
+  }
+}
+
+void SpinLockWake(volatile Atomic32 *w, bool all) {
+}
+
+} // namespace internal
+} // namespace base
diff --git a/src/base/stl_allocator.h b/src/base/stl_allocator.h
new file mode 100644
index 0000000..2345f46
--- /dev/null
+++ b/src/base/stl_allocator.h
@@ -0,0 +1,98 @@
+// -*- Mode: C++; c-basic-offset: 2; indent-tabs-mode: nil -*-
+/* Copyright (c) 2006, Google Inc.
+ * All rights reserved.
+ * 
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are
+ * met:
+ * 
+ *     * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following disclaimer
+ * in the documentation and/or other materials provided with the
+ * distribution.
+ *     * Neither the name of Google Inc. nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ * 
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * ---
+ * Author: Maxim Lifantsev
+ */
+
+
+#ifndef BASE_STL_ALLOCATOR_H_
+#define BASE_STL_ALLOCATOR_H_
+
+#include <config.h>
+
+#include <stddef.h>   // for ptrdiff_t
+#include <limits>
+
+#include "base/logging.h"
+
+// Generic allocator class for STL objects
+// that uses a given type-less allocator Alloc, which must provide:
+//   static void* Alloc::Allocate(size_t size);
+//   static void Alloc::Free(void* ptr, size_t size);
+//
+// STL_Allocator<T, MyAlloc> provides the same thread-safety
+// guarantees as MyAlloc.
+//
+// Usage example:
+//   set<T, less<T>, STL_Allocator<T, MyAlloc> > my_set;
+// CAVEAT: Parts of the code below are probably specific
+//         to the STL version(s) we are using.
+//         The code is simply lifted from what std::allocator<> provides.
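+//
+// A minimal Alloc sketch satisfying the interface above (illustrative only;
+// a real allocator intended for heap profiling must not itself go through
+// the hooked malloc):
+//   struct MyAlloc {
+//     static void* Allocate(size_t size) { return malloc(size); }
+//     static void Free(void* ptr, size_t size) { free(ptr); }
+//   };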
+template <typename T, class Alloc>
+class STL_Allocator {
+ public:
+  typedef size_t     size_type;
+  typedef ptrdiff_t  difference_type;
+  typedef T*         pointer;
+  typedef const T*   const_pointer;
+  typedef T&         reference;
+  typedef const T&   const_reference;
+  typedef T          value_type;
+
+  template <class T1> struct rebind {
+    typedef STL_Allocator<T1, Alloc> other;
+  };
+
+  STL_Allocator() { }
+  STL_Allocator(const STL_Allocator&) { }
+  template <class T1> STL_Allocator(const STL_Allocator<T1, Alloc>&) { }
+  ~STL_Allocator() { }
+
+  pointer address(reference x) const { return &x; }
+  const_pointer address(const_reference x) const { return &x; }
+
+  pointer allocate(size_type n, const void* = 0) {
+    RAW_DCHECK((n * sizeof(T)) / sizeof(T) == n, "n is too big to allocate");
+    return static_cast<T*>(Alloc::Allocate(n * sizeof(T)));
+  }
+  void deallocate(pointer p, size_type n) { Alloc::Free(p, n * sizeof(T)); }
+
+  size_type max_size() const { return size_t(-1) / sizeof(T); }
+
+  void construct(pointer p, const T& val) { ::new(p) T(val); }
+  void construct(pointer p) { ::new(p) T(); }
+  void destroy(pointer p) { p->~T(); }
+
+  // There's no state, so these allocators are always equal
+  bool operator==(const STL_Allocator&) const { return true; }
+};
+
+#endif  // BASE_STL_ALLOCATOR_H_
diff --git a/src/base/synchronization_profiling.h b/src/base/synchronization_profiling.h
new file mode 100644
index 0000000..b495034
--- /dev/null
+++ b/src/base/synchronization_profiling.h
@@ -0,0 +1,51 @@
+// -*- Mode: C++; c-basic-offset: 2; indent-tabs-mode: nil -*-
+/* Copyright (c) 2010, Google Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are
+ * met:
+ *
+ *     * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following disclaimer
+ * in the documentation and/or other materials provided with the
+ * distribution.
+ *     * Neither the name of Google Inc. nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * ---
+ * Author: Chris Ruemmler
+ */
+
+#ifndef BASE_AUXILIARY_SYNCHRONIZATION_PROFILING_H_
+#define BASE_AUXILIARY_SYNCHRONIZATION_PROFILING_H_
+
+#include "base/basictypes.h"
+
+namespace base {
+
+// We can do contention-profiling of SpinLocks, but the code is in
+// mutex.cc, which is not always linked in with spinlock.  Hence we
+// provide a weak definition, which is used if mutex.cc isn't linked in.
+
+// Submit the number of cycles the spinlock spent contending.
+ATTRIBUTE_WEAK extern void SubmitSpinLockProfileData(const void *, int64);
+extern void SubmitSpinLockProfileData(const void *contendedlock,
+                                      int64 wait_cycles) {}
+}
+#endif  // BASE_AUXILIARY_SYNCHRONIZATION_PROFILING_H_
diff --git a/src/base/sysinfo.cc b/src/base/sysinfo.cc
new file mode 100644
index 0000000..cad751b
--- /dev/null
+++ b/src/base/sysinfo.cc
@@ -0,0 +1,1153 @@
+// -*- Mode: C++; c-basic-offset: 2; indent-tabs-mode: nil -*-
+// Copyright (c) 2006, Google Inc.
+// All rights reserved.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+// 
+//     * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+//     * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+// 
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+#include <config.h>
+#if (defined(_WIN32) || defined(__MINGW32__)) && !defined(__CYGWIN__) && !defined(__CYGWIN32)
+# define PLATFORM_WINDOWS 1
+#endif
+
+#include <ctype.h>    // for isspace()
+#include <stdlib.h>   // for getenv()
+#include <stdio.h>    // for snprintf(), sscanf()
+#include <string.h>   // for memmove(), memchr(), etc.
+#include <fcntl.h>    // for open()
+#include <errno.h>    // for errno
+#ifdef HAVE_UNISTD_H
+#include <unistd.h>   // for read()
+#endif
+#if defined __MACH__          // Mac OS X, almost certainly
+#include <mach-o/dyld.h>      // for iterating over dll's in ProcMapsIter
+#include <mach-o/loader.h>    // for iterating over dll's in ProcMapsIter
+#include <sys/types.h>
+#include <sys/sysctl.h>       // how we figure out numcpu's on OS X
+#elif defined __FreeBSD__
+#include <sys/sysctl.h>
+#elif defined __sun__         // Solaris
+#include <procfs.h>           // for, e.g., prmap_t
+#elif defined(PLATFORM_WINDOWS)
+#include <process.h>          // for getpid() (actually, _getpid())
+#include <shlwapi.h>          // for SHGetValueA()
+#include <tlhelp32.h>         // for Module32First()
+#endif
+#include "base/sysinfo.h"
+#include "base/commandlineflags.h"
+#include "base/dynamic_annotations.h"   // for RunningOnValgrind
+#include "base/logging.h"
+#include "base/cycleclock.h"
+
+#ifdef PLATFORM_WINDOWS
+#ifdef MODULEENTRY32
+// In a change from the usual W-A pattern, there is no A variant of
+// MODULEENTRY32.  Tlhelp32.h #defines the W variant, but not the A.
+// In unicode mode, tlhelp32.h #defines MODULEENTRY32 to be
+// MODULEENTRY32W.  These #undefs are the only way I see to get back
+// access to the original, ascii struct (and related functions).
+#undef MODULEENTRY32
+#undef Module32First
+#undef Module32Next
+#undef PMODULEENTRY32
+#undef LPMODULEENTRY32
+#endif  /* MODULEENTRY32 */
+// MinGW doesn't seem to define this, perhaps some windowsen don't either.
+#ifndef TH32CS_SNAPMODULE32
+#define TH32CS_SNAPMODULE32  0
+#endif  /* TH32CS_SNAPMODULE32 */
+#endif  /* PLATFORM_WINDOWS */
+
+// Re-run fn until it doesn't cause EINTR.
+#define NO_INTR(fn)  do {} while ((fn) < 0 && errno == EINTR)
+
+// open/read/close can set errno, which may be illegal at this
+// time, so prefer making the syscalls directly if we can.
+#ifdef HAVE_SYS_SYSCALL_H
+# include <sys/syscall.h>
+#endif
+#ifdef SYS_open   // solaris 11, at least sometimes, only defines SYS_openat
+# define safeopen(filename, mode)  syscall(SYS_open, filename, mode)
+#else
+# define safeopen(filename, mode)  open(filename, mode)
+#endif
+#ifdef SYS_read
+# define saferead(fd, buffer, size)  syscall(SYS_read, fd, buffer, size)
+#else
+# define saferead(fd, buffer, size)  read(fd, buffer, size)
+#endif
+#ifdef SYS_close
+# define safeclose(fd)  syscall(SYS_close, fd)
+#else
+# define safeclose(fd)  close(fd)
+#endif
+
+// ----------------------------------------------------------------------
+// GetenvBeforeMain()
+// GetUniquePathFromEnv()
+//    Some non-trivial getenv-related functions.
+// ----------------------------------------------------------------------
+
+// It's not safe to call getenv() in the malloc hooks, because they
+// might be called extremely early, before libc is done setting up
+// correctly.  In particular, the thread library may not be done
+// setting up errno.  So instead, we use the built-in __environ array
+// if it exists, and otherwise read /proc/self/environ directly, using
+// system calls to read the file, and thus avoid setting errno.
+// /proc/self/environ has a limit on how much data it exports (around
+// 8K), so it's not an ideal solution.
+const char* GetenvBeforeMain(const char* name) {
+#if defined(HAVE___ENVIRON)   // if we have it, it's declared in unistd.h
+  if (__environ) {            // can exist but be NULL, if statically linked
+    const int namelen = strlen(name);
+    for (char** p = __environ; *p; p++) {
+      if (strlen(*p) < namelen) {
+        continue;
+      }
+      if (!memcmp(*p, name, namelen) && (*p)[namelen] == '=')  // it's a match
+        return *p + namelen+1;                                 // point after =
+    }
+    return NULL;
+  }
+#endif
+#if defined(PLATFORM_WINDOWS)
+  // TODO(mbelshe) - repeated calls to this function will overwrite the
+  // contents of the static buffer.
+  static char envvar_buf[1024];  // enough to hold any envvar we care about
+  if (!GetEnvironmentVariableA(name, envvar_buf, sizeof(envvar_buf)-1))
+    return NULL;
+  return envvar_buf;
+#endif
+  // static is ok because this function should only be called before
+  // main(), when we're single-threaded.
+  static char envbuf[16<<10];
+  if (*envbuf == '\0') {    // haven't read the environ yet
+    int fd = safeopen("/proc/self/environ", O_RDONLY);
+    // The -2 below guarantees the last two bytes of the buffer will be \0\0
+    if (fd == -1 ||           // unable to open the file, fall back onto libc
+        saferead(fd, envbuf, sizeof(envbuf) - 2) < 0) { // error reading file
+      RAW_VLOG(1, "Unable to open /proc/self/environ, falling back "
+               "on getenv(\"%s\"), which may not work", name);
+      if (fd != -1) safeclose(fd);
+      return getenv(name);
+    }
+    safeclose(fd);
+  }
+  const int namelen = strlen(name);
+  const char* p = envbuf;
+  while (*p != '\0') {    // will happen at the \0\0 that terminates the buffer
+    // proc file has the format NAME=value\0NAME=value\0NAME=value\0...
+    const char* endp = (char*)memchr(p, '\0', sizeof(envbuf) - (p - envbuf));
+    if (endp == NULL)            // this entry isn't NUL terminated
+      return NULL;
+    else if (!memcmp(p, name, namelen) && p[namelen] == '=')    // it's a match
+      return p + namelen+1;      // point after =
+    p = endp + 1;
+  }
+  return NULL;                   // env var never found
+}
+
+extern "C" {
+  const char* TCMallocGetenvSafe(const char* name) {
+    return GetenvBeforeMain(name);
+  }
+}
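+
+// Illustrative call (the variable name is just an example):
+//
+//   const char* profile_path = GetenvBeforeMain("CPUPROFILE");
+//   if (profile_path != NULL) { /* ... */ }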
+
+// This takes as an argument an environment-variable name (like
+// CPUPROFILE) whose value is supposed to be a file-path, and sets
+// path to that path, and returns true.  If the env var doesn't exist,
+// or is the empty string, it leaves path unchanged and returns false.
+// The reason this is non-trivial is that this function handles munged
+// pathnames.  Here's why:
+//
+// If we're a child process of the 'main' process, we can't just use
+// getenv("CPUPROFILE") -- the parent process will be using that path.
+// Instead we append our pid to the pathname.  How do we tell if we're a
+// child process?  Ideally we'd set an environment variable that all
+// our children would inherit.  But -- and this is seemingly a bug in
+// gcc -- if you do a setenv() in a shared library in a global
+// constructor, the environment setting is lost by the time main() is
+// called.  The only safe thing we can do in such a situation is to
+// modify the existing envvar.  So we do a hack: in the parent, we set
+// the high bit of the 1st char of CPUPROFILE.  In the child, we
+// notice the high bit is set and append the pid().  This works
+// assuming cpuprofile filenames don't normally have the high bit set
+// in their first character!  If that assumption is violated, we'll
+// still get a profile, but one with an unexpected name.
+// TODO(csilvers): set an envvar instead when we can do it reliably.
+bool GetUniquePathFromEnv(const char* env_name, char* path) {
+  char* envval = getenv(env_name);
+  if (envval == NULL || *envval == '\0')
+    return false;
+  if (envval[0] & 128) {                  // high bit is set
+    snprintf(path, PATH_MAX, "%c%s_%u",   // add pid and clear high bit
+             envval[0] & 127, envval+1, (unsigned int)(getpid()));
+  } else {
+    snprintf(path, PATH_MAX, "%s", envval);
+    envval[0] |= 128;                     // set high bit for kids to see
+  }
+  return true;
+}
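+
+// Illustrative caller sketch (buffer name is an example; the caller must
+// provide a buffer of at least PATH_MAX bytes):
+//
+//   char fname[PATH_MAX];
+//   if (GetUniquePathFromEnv("CPUPROFILE", fname)) {
+//     // fname holds the configured path, with "_<pid>" appended in children.
+//   }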
+
+// ----------------------------------------------------------------------
+// CyclesPerSecond()
+// NumCPUs()
+//    It's important these not call malloc! -- they may be called at
+//    global-construct time, before we've set up all our proper malloc
+//    hooks and such.
+// ----------------------------------------------------------------------
+
+static double cpuinfo_cycles_per_second = 1.0;  // 0.0 might be dangerous
+static int cpuinfo_num_cpus = 1;  // Conservative guess
+
+void SleepForMilliseconds(int milliseconds) {
+#ifdef PLATFORM_WINDOWS
+  _sleep(milliseconds);   // Windows's _sleep takes milliseconds argument
+#else
+  // Sleep for a few milliseconds
+  struct timespec sleep_time;
+  sleep_time.tv_sec = milliseconds / 1000;
+  sleep_time.tv_nsec = (milliseconds % 1000) * 1000000;
+  while (nanosleep(&sleep_time, &sleep_time) != 0 && errno == EINTR)
+    ;  // Ignore signals and wait for the full interval to elapse.
+#endif
+}
+
+// Helper function estimates cycles/sec by observing cycles elapsed during
+// sleep(). Using a small sleep time decreases accuracy significantly.
+static int64 EstimateCyclesPerSecond(const int estimate_time_ms) {
+  assert(estimate_time_ms > 0);
+  if (estimate_time_ms <= 0)
+    return 1;
+  double multiplier = 1000.0 / (double)estimate_time_ms;  // scale by this much
+
+  const int64 start_ticks = CycleClock::Now();
+  SleepForMilliseconds(estimate_time_ms);
+  const int64 guess = int64(multiplier * (CycleClock::Now() - start_ticks));
+  return guess;
+}
+
+// ReadIntFromFile is only called on linux and cygwin platforms.
+#if defined(__linux__) || defined(__CYGWIN__) || defined(__CYGWIN32__)
+// Helper function for reading an int from a file. Returns true if successful
+// and the memory location pointed to by value is set to the value read.
+static bool ReadIntFromFile(const char *file, int *value) {
+  bool ret = false;
+  int fd = open(file, O_RDONLY);
+  if (fd != -1) {
+    char line[1024];
+    char* err;
+    memset(line, '\0', sizeof(line));
+    read(fd, line, sizeof(line) - 1);
+    const int temp_value = strtol(line, &err, 10);
+    if (line[0] != '\0' && (*err == '\n' || *err == '\0')) {
+      *value = temp_value;
+      ret = true;
+    }
+    close(fd);
+  }
+  return ret;
+}
+#endif
+
+// WARNING: logging calls back to InitializeSystemInfo() so it must
+// not invoke any logging code.  Also, InitializeSystemInfo() can be
+// called before main() -- in fact it *must* be since already_called
+// isn't protected -- before malloc hooks are properly set up, so
+// we make an effort not to call any routines which might allocate
+// memory.
+
+static void InitializeSystemInfo() {
+  static bool already_called = false;   // safe if we run before threads
+  if (already_called)  return;
+  already_called = true;
+
+  bool saw_mhz = false;
+
+  if (RunningOnValgrind()) {
+    // Valgrind may slow the progress of time artificially (--scale-time=N
+    // option). We thus can't rely on the CPU MHz info stored in /sys or /proc
+    // files, so we actually measure the cycles per second.
+    cpuinfo_cycles_per_second = EstimateCyclesPerSecond(100);
+    saw_mhz = true;
+  }
+
+#if defined(__linux__) || defined(__CYGWIN__) || defined(__CYGWIN32__)
+  char line[1024];
+  char* err;
+  int freq;
+
+  // If the kernel is exporting the tsc frequency use that. There are issues
+  // where cpuinfo_max_freq cannot be relied on because the BIOS may be
+  // exporting an invalid p-state (on x86) or p-states may be used to put the
+  // processor in a new mode (turbo mode). Essentially, those frequencies
+  // cannot always be relied upon. The same reasons apply to /proc/cpuinfo as
+  // well.
+  if (!saw_mhz &&
+      ReadIntFromFile("/sys/devices/system/cpu/cpu0/tsc_freq_khz", &freq)) {
+      // The value is in kHz (as the file name suggests).  For example, on a
+      // 2GHz warpstation, the file contains the value "2000000".
+      cpuinfo_cycles_per_second = freq * 1000.0;
+      saw_mhz = true;
+  }
+
+  // If CPU scaling is in effect, we want to use the *maximum* frequency,
+  // not whatever CPU speed some random processor happens to be using now.
+  if (!saw_mhz &&
+      ReadIntFromFile("/sys/devices/system/cpu/cpu0/cpufreq/cpuinfo_max_freq",
+                      &freq)) {
+    // The value is in kHz.  For example, on a 2GHz machine, the file
+    // contains the value "2000000".
+    cpuinfo_cycles_per_second = freq * 1000.0;
+    saw_mhz = true;
+  }
+
+  // Read /proc/cpuinfo for other values, and for the cpu speed if we have not found it yet.
+  const char* pname = "/proc/cpuinfo";
+  int fd = open(pname, O_RDONLY);
+  if (fd == -1) {
+    perror(pname);
+    if (!saw_mhz) {
+      cpuinfo_cycles_per_second = EstimateCyclesPerSecond(1000);
+    }
+    return;          // TODO: use generic tester instead?
+  }
+
+  double bogo_clock = 1.0;
+  bool saw_bogo = false;
+  int num_cpus = 0;
+  line[0] = line[1] = '\0';
+  int chars_read = 0;
+  do {   // we'll exit when the last read didn't read anything
+    // Move the next line to the beginning of the buffer
+    const int oldlinelen = strlen(line);
+    if (sizeof(line) == oldlinelen + 1)    // oldlinelen took up entire line
+      line[0] = '\0';
+    else                                   // still other lines left to save
+      memmove(line, line + oldlinelen+1, sizeof(line) - (oldlinelen+1));
+    // Terminate the new line, reading more if we can't find the newline
+    char* newline = strchr(line, '\n');
+    if (newline == NULL) {
+      const int linelen = strlen(line);
+      const int bytes_to_read = sizeof(line)-1 - linelen;
+      assert(bytes_to_read > 0);  // because the memmove recovered >=1 bytes
+      chars_read = read(fd, line + linelen, bytes_to_read);
+      line[linelen + chars_read] = '\0';
+      newline = strchr(line, '\n');
+    }
+    if (newline != NULL)
+      *newline = '\0';
+
+#if defined(__powerpc__) || defined(__ppc__)
+    // PowerPC cpus report the frequency in "clock" line
+    if (strncasecmp(line, "clock", sizeof("clock")-1) == 0) {
+      const char* freqstr = strchr(line, ':');
+      if (freqstr) {
+	// PowerPC frequencies are only reported as MHz (check 'show_cpuinfo'
+	// function at arch/powerpc/kernel/setup-common.c)
+	char *endp = strstr(line, "MHz");
+	if (endp) {
+	  *endp = 0;
+	  cpuinfo_cycles_per_second = strtod(freqstr+1, &err) * 1000000.0;
+          if (freqstr[1] != '\0' && *err == '\0' && cpuinfo_cycles_per_second > 0)
+            saw_mhz = true;
+	}
+      }
+#else
+    // When parsing the "cpu MHz" and "bogomips" (fallback) entries, we only
+    // accept positive values. Some environments (virtual machines) report zero,
+    // which would cause infinite looping in WallTime_Init.
+    if (!saw_mhz && strncasecmp(line, "cpu MHz", sizeof("cpu MHz")-1) == 0) {
+      const char* freqstr = strchr(line, ':');
+      if (freqstr) {
+        cpuinfo_cycles_per_second = strtod(freqstr+1, &err) * 1000000.0;
+        if (freqstr[1] != '\0' && *err == '\0' && cpuinfo_cycles_per_second > 0)
+          saw_mhz = true;
+      }
+    } else if (strncasecmp(line, "bogomips", sizeof("bogomips")-1) == 0) {
+      const char* freqstr = strchr(line, ':');
+      if (freqstr) {
+        bogo_clock = strtod(freqstr+1, &err) * 1000000.0;
+        if (freqstr[1] != '\0' && *err == '\0' && bogo_clock > 0)
+          saw_bogo = true;
+      }
+#endif
+    } else if (strncasecmp(line, "processor", sizeof("processor")-1) == 0) {
+      num_cpus++;  // count up every time we see a "processor :" entry
+    }
+  } while (chars_read > 0);
+  close(fd);
+
+  if (!saw_mhz) {
+    if (saw_bogo) {
+      // If we didn't find anything better, we'll use bogomips, but
+      // we're not happy about it.
+      cpuinfo_cycles_per_second = bogo_clock;
+    } else {
+      // If we don't even have bogomips, we'll use the slow estimation.
+      cpuinfo_cycles_per_second = EstimateCyclesPerSecond(1000);
+    }
+  }
+  if (cpuinfo_cycles_per_second == 0.0) {
+    cpuinfo_cycles_per_second = 1.0;   // maybe unnecessary, but safe
+  }
+  if (num_cpus > 0) {
+    cpuinfo_num_cpus = num_cpus;
+  }
+
+#elif defined __FreeBSD__
+  // For this sysctl to work, the machine must be configured without
+  // SMP, APIC, or APM support.  hz should be 64-bit in freebsd 7.0
+  // and later.  Before that, it's a 32-bit quantity (and gives the
+  // wrong answer on machines faster than 2^32 Hz).  See
+  //  http://lists.freebsd.org/pipermail/freebsd-i386/2004-November/001846.html
+  // But also compare FreeBSD 7.0:
+  //  http://fxr.watson.org/fxr/source/i386/i386/tsc.c?v=RELENG70#L223
+  //  231         error = sysctl_handle_quad(oidp, &freq, 0, req);
+  // To FreeBSD 6.3 (it's the same in 6-STABLE):
+  //  http://fxr.watson.org/fxr/source/i386/i386/tsc.c?v=RELENG6#L131
+  //  139         error = sysctl_handle_int(oidp, &freq, sizeof(freq), req);
+#if __FreeBSD__ >= 7
+  uint64_t hz = 0;
+#else
+  unsigned int hz = 0;
+#endif
+  size_t sz = sizeof(hz);
+  const char *sysctl_path = "machdep.tsc_freq";
+  if ( sysctlbyname(sysctl_path, &hz, &sz, NULL, 0) != 0 ) {
+    fprintf(stderr, "Unable to determine clock rate from sysctl: %s: %s\n",
+            sysctl_path, strerror(errno));
+    cpuinfo_cycles_per_second = EstimateCyclesPerSecond(1000);
+  } else {
+    cpuinfo_cycles_per_second = hz;
+  }
+  // TODO(csilvers): also figure out cpuinfo_num_cpus
+
+#elif defined(PLATFORM_WINDOWS)
+# pragma comment(lib, "shlwapi.lib")  // for SHGetValue()
+  // In NT, read MHz from the registry. If we fail to do so or we're in win9x
+  // then make a crude estimate.
+  OSVERSIONINFO os;
+  os.dwOSVersionInfoSize = sizeof(os);
+  DWORD data, data_size = sizeof(data);
+  if (GetVersionEx(&os) &&
+      os.dwPlatformId == VER_PLATFORM_WIN32_NT &&
+      SUCCEEDED(SHGetValueA(HKEY_LOCAL_MACHINE,
+                         "HARDWARE\\DESCRIPTION\\System\\CentralProcessor\\0",
+                           "~MHz", NULL, &data, &data_size)))
+    cpuinfo_cycles_per_second = (int64)data * (int64)(1000 * 1000); // was mhz
+  else
+    cpuinfo_cycles_per_second = EstimateCyclesPerSecond(500); // TODO <500?
+
+  // Get the number of processors.
+  SYSTEM_INFO info;
+  GetSystemInfo(&info);
+  cpuinfo_num_cpus = info.dwNumberOfProcessors;
+
+#elif defined(__MACH__) && defined(__APPLE__)
+  // returning "mach time units" per second. the current number of elapsed
+  // mach time units can be found by calling uint64 mach_absolute_time();
+  // while not as precise as actual CPU cycles, it is accurate in the face
+  // of CPU frequency scaling and multi-cpu/core machines.
+  // Our mac users have these types of machines, and accuracy
+  // (i.e. correctness) trumps precision.
+  // See cycleclock.h: CycleClock::Now(), which returns number of mach time
+  // units on Mac OS X.
+  mach_timebase_info_data_t timebase_info;
+  mach_timebase_info(&timebase_info);
+  double mach_time_units_per_nanosecond =
+      static_cast<double>(timebase_info.denom) /
+      static_cast<double>(timebase_info.numer);
+  cpuinfo_cycles_per_second = mach_time_units_per_nanosecond * 1e9;
+
+  int num_cpus = 0;
+  size_t size = sizeof(num_cpus);
+  int numcpus_name[] = { CTL_HW, HW_NCPU };
+  if (::sysctl(numcpus_name, arraysize(numcpus_name), &num_cpus, &size, 0, 0)
+      == 0
+      && (size == sizeof(num_cpus)))
+    cpuinfo_num_cpus = num_cpus;
+
+#else
+  // Generic cycles per second counter
+  cpuinfo_cycles_per_second = EstimateCyclesPerSecond(1000);
+#endif
+}
+
+double CyclesPerSecond(void) {
+  InitializeSystemInfo();
+  return cpuinfo_cycles_per_second;
+}
+
+int NumCPUs(void) {
+  InitializeSystemInfo();
+  return cpuinfo_num_cpus;
+}
+
+// ----------------------------------------------------------------------
+// HasPosixThreads()
+//      Return true if we're running POSIX (e.g., NPTL on Linux)
+//      threads, as opposed to a non-POSIX thread library.  The thing
+//      that we care about is whether a thread's pid is the same as
+//      the thread that spawned it.  If so, this function returns
+//      true.
+// ----------------------------------------------------------------------
+bool HasPosixThreads() {
+#if defined(__linux__)
+#ifndef _CS_GNU_LIBPTHREAD_VERSION
+#define _CS_GNU_LIBPTHREAD_VERSION 3
+#endif
+  char buf[32];
+  //  We assume that, if confstr() doesn't know about this name, then
+  //  the same glibc is providing LinuxThreads.
+  if (confstr(_CS_GNU_LIBPTHREAD_VERSION, buf, sizeof(buf)) == 0)
+    return false;
+  return strncmp(buf, "NPTL", 4) == 0;
+#elif defined(PLATFORM_WINDOWS) || defined(__CYGWIN__) || defined(__CYGWIN32__)
+  return false;
+#else  // other OS
+  return true;      //  Assume that everything else has Posix
+#endif  // else OS_LINUX
+}
+
+// ----------------------------------------------------------------------
+
+#if defined __linux__ || defined __FreeBSD__ || defined __sun__ || defined __CYGWIN__ || defined __CYGWIN32__
+static void ConstructFilename(const char* spec, pid_t pid,
+                              char* buf, int buf_size) {
+  CHECK_LT(snprintf(buf, buf_size,
+                    spec,
+                    static_cast<int>(pid ? pid : getpid())), buf_size);
+}
+#endif
+
+// A templatized helper function instantiated for Mach (OS X) only.
+// It can handle finding info for both 32-bit and 64-bit binaries.
+// Returns true if it successfully handled the hdr, false otherwise.
+#ifdef __MACH__          // Mac OS X, almost certainly
+template<uint32_t kMagic, uint32_t kLCSegment,
+         typename MachHeader, typename SegmentCommand>
+static bool NextExtMachHelper(const mach_header* hdr,
+                              int current_image, int current_load_cmd,
+                              uint64 *start, uint64 *end, char **flags,
+                              uint64 *offset, int64 *inode, char **filename,
+                              uint64 *file_mapping, uint64 *file_pages,
+                              uint64 *anon_mapping, uint64 *anon_pages,
+                              dev_t *dev) {
+  static char kDefaultPerms[5] = "r-xp";
+  if (hdr->magic != kMagic)
+    return false;
+  const char* lc = (const char *)hdr + sizeof(MachHeader);
+  // TODO(csilvers): make this not-quadratic (increment and hold state)
+  for (int j = 0; j < current_load_cmd; j++)  // advance to *our* load_cmd
+    lc += ((const load_command *)lc)->cmdsize;
+  if (((const load_command *)lc)->cmd == kLCSegment) {
+    const intptr_t dlloff = _dyld_get_image_vmaddr_slide(current_image);
+    const SegmentCommand* sc = (const SegmentCommand *)lc;
+    if (start) *start = sc->vmaddr + dlloff;
+    if (end) *end = sc->vmaddr + sc->vmsize + dlloff;
+    if (flags) *flags = kDefaultPerms;  // can we do better?
+    if (offset) *offset = sc->fileoff;
+    if (inode) *inode = 0;
+    if (filename)
+      *filename = const_cast<char*>(_dyld_get_image_name(current_image));
+    if (file_mapping) *file_mapping = 0;
+    if (file_pages) *file_pages = 0;   // could we use sc->filesize?
+    if (anon_mapping) *anon_mapping = 0;
+    if (anon_pages) *anon_pages = 0;
+    if (dev) *dev = 0;
+    return true;
+  }
+
+  return false;
+}
+#endif
+
+// Finds |c| in |text|, and assigns '\0' at the found position.
+// The original character at the modified position should be |c|.
+// A pointer to the modified position is stored in |endptr|.
+// |endptr| should not be NULL.
+static bool ExtractUntilChar(char *text, int c, char **endptr) {
+  CHECK_NE(text, NULL);
+  CHECK_NE(endptr, NULL);
+  char *found;
+  found = strchr(text, c);
+  if (found == NULL) {
+    *endptr = NULL;
+    return false;
+  }
+
+  *endptr = found;
+  *found = '\0';
+  return true;
+}
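+
+// A minimal usage sketch (illustrative only):
+//   char buf[] = "03:01 3793678";
+//   char *p;
+//   if (ExtractUntilChar(buf, ':', &p)) {
+//     // buf now reads "03"; *p is '\0'; the caller can restore the
+//     // original character with *p = ':'.
+//   }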
+
+// Increments |*text_pointer| while it points at a whitespace character.
+// This mirrors sscanf's whitespace handling.
+static void SkipWhileWhitespace(char **text_pointer, int c) {
+  if (isspace(c)) {
+    while (isspace(**text_pointer) && isspace(*((*text_pointer) + 1))) {
+      ++(*text_pointer);
+    }
+  }
+}
+
+template<class T>
+static T StringToInteger(char *text, char **endptr, int base) {
+  assert(false);
+  return T();
+}
+
+template<>
+int StringToInteger<int>(char *text, char **endptr, int base) {
+  return strtol(text, endptr, base);
+}
+
+template<>
+int64 StringToInteger<int64>(char *text, char **endptr, int base) {
+  return strtoll(text, endptr, base);
+}
+
+template<>
+uint64 StringToInteger<uint64>(char *text, char **endptr, int base) {
+  return strtoull(text, endptr, base);
+}
+
+template<typename T>
+static T StringToIntegerUntilChar(
+    char *text, int base, int c, char **endptr_result) {
+  CHECK_NE(endptr_result, NULL);
+  *endptr_result = NULL;
+
+  char *endptr_extract;
+  if (!ExtractUntilChar(text, c, &endptr_extract))
+    return 0;
+
+  T result;
+  char *endptr_strto;
+  result = StringToInteger<T>(text, &endptr_strto, base);
+  *endptr_extract = c;
+
+  if (endptr_extract != endptr_strto)
+    return 0;
+
+  *endptr_result = endptr_extract;
+  SkipWhileWhitespace(endptr_result, c);
+
+  return result;
+}
+
+static char *CopyStringUntilChar(
+    char *text, unsigned out_len, int c, char *out) {
+  char *endptr;
+  if (!ExtractUntilChar(text, c, &endptr))
+    return NULL;
+
+  strncpy(out, text, out_len);
+  out[out_len-1] = '\0';
+  *endptr = c;
+
+  SkipWhileWhitespace(&endptr, c);
+  return endptr;
+}
+
+template<typename T>
+static bool StringToIntegerUntilCharWithCheck(
+    T *outptr, char *text, int base, int c, char **endptr) {
+  *outptr = StringToIntegerUntilChar<T>(*endptr, base, c, endptr);
+  if (*endptr == NULL || **endptr == '\0') return false;
+  ++(*endptr);
+  return true;
+}
+
+static bool ParseProcMapsLine(char *text, uint64 *start, uint64 *end,
+                              char *flags, uint64 *offset,
+                              int *major, int *minor, int64 *inode,
+                              unsigned *filename_offset) {
+#if defined(__linux__)
+  /*
+   * It's similar to:
+   * sscanf(text, "%"SCNx64"-%"SCNx64" %4s %"SCNx64" %x:%x %"SCNd64" %n",
+   *        start, end, flags, offset, major, minor, inode, filename_offset)
+   */
+  char *endptr = text;
+  if (endptr == NULL || *endptr == '\0')  return false;
+
+  if (!StringToIntegerUntilCharWithCheck(start, endptr, 16, '-', &endptr))
+    return false;
+
+  if (!StringToIntegerUntilCharWithCheck(end, endptr, 16, ' ', &endptr))
+    return false;
+
+  endptr = CopyStringUntilChar(endptr, 5, ' ', flags);
+  if (endptr == NULL || *endptr == '\0')  return false;
+  ++endptr;
+
+  if (!StringToIntegerUntilCharWithCheck(offset, endptr, 16, ' ', &endptr))
+    return false;
+
+  if (!StringToIntegerUntilCharWithCheck(major, endptr, 16, ':', &endptr))
+    return false;
+
+  if (!StringToIntegerUntilCharWithCheck(minor, endptr, 16, ' ', &endptr))
+    return false;
+
+  if (!StringToIntegerUntilCharWithCheck(inode, endptr, 10, ' ', &endptr))
+    return false;
+
+  *filename_offset = (endptr - text);
+  return true;
+#else
+  return false;
+#endif
+}
+
+ProcMapsIterator::ProcMapsIterator(pid_t pid) {
+  Init(pid, NULL, false);
+}
+
+ProcMapsIterator::ProcMapsIterator(pid_t pid, Buffer *buffer) {
+  Init(pid, buffer, false);
+}
+
+ProcMapsIterator::ProcMapsIterator(pid_t pid, Buffer *buffer,
+                                   bool use_maps_backing) {
+  Init(pid, buffer, use_maps_backing);
+}
+
+void ProcMapsIterator::Init(pid_t pid, Buffer *buffer,
+                            bool use_maps_backing) {
+  pid_ = pid;
+  using_maps_backing_ = use_maps_backing;
+  dynamic_buffer_ = NULL;
+  if (!buffer) {
+    // If the user didn't pass in any buffer storage, allocate it
+    // now. This is the normal case; the signal handler passes in a
+    // static buffer.
+    buffer = dynamic_buffer_ = new Buffer;
+  } else {
+    dynamic_buffer_ = NULL;
+  }
+
+  ibuf_ = buffer->buf_;
+
+  stext_ = etext_ = nextline_ = ibuf_;
+  ebuf_ = ibuf_ + Buffer::kBufSize - 1;
+  nextline_ = ibuf_;
+
+#if defined(__linux__) || defined(__CYGWIN__) || defined(__CYGWIN32__)
+  if (use_maps_backing) {  // don't bother with clever "self" stuff in this case
+    ConstructFilename("/proc/%d/maps_backing", pid, ibuf_, Buffer::kBufSize);
+  } else if (pid == 0) {
+    // We have to kludge a bit to deal with the args ConstructFilename
+    // expects.  The 1 is never used -- it's only important that it's not 0.
+    ConstructFilename("/proc/self/maps", 1, ibuf_, Buffer::kBufSize);
+  } else {
+    ConstructFilename("/proc/%d/maps", pid, ibuf_, Buffer::kBufSize);
+  }
+  // No error logging since this can be called from the crash dump
+  // handler at awkward moments. Users should call Valid() before
+  // using.
+  NO_INTR(fd_ = open(ibuf_, O_RDONLY));
+#elif defined(__FreeBSD__)
+  // We don't support maps_backing on freebsd
+  if (pid == 0) {
+    ConstructFilename("/proc/curproc/map", 1, ibuf_, Buffer::kBufSize);
+  } else {
+    ConstructFilename("/proc/%d/map", pid, ibuf_, Buffer::kBufSize);
+  }
+  NO_INTR(fd_ = open(ibuf_, O_RDONLY));
+#elif defined(__sun__)
+  if (pid == 0) {
+    ConstructFilename("/proc/self/map", 1, ibuf_, Buffer::kBufSize);
+  } else {
+    ConstructFilename("/proc/%d/map", pid, ibuf_, Buffer::kBufSize);
+  }
+  NO_INTR(fd_ = open(ibuf_, O_RDONLY));
+#elif defined(__MACH__)
+  current_image_ = _dyld_image_count();   // count down from the top
+  current_load_cmd_ = -1;
+#elif defined(PLATFORM_WINDOWS)
+  snapshot_ = CreateToolhelp32Snapshot(TH32CS_SNAPMODULE |
+                                       TH32CS_SNAPMODULE32,
+                                       GetCurrentProcessId());
+  memset(&module_, 0, sizeof(module_));
+#else
+  fd_ = -1;   // so Valid() is always false
+#endif
+
+}
+
+ProcMapsIterator::~ProcMapsIterator() {
+#if defined(PLATFORM_WINDOWS)
+  if (snapshot_ != INVALID_HANDLE_VALUE) CloseHandle(snapshot_);
+#elif defined(__MACH__)
+  // no cleanup necessary!
+#else
+  if (fd_ >= 0) NO_INTR(close(fd_));
+#endif
+  delete dynamic_buffer_;
+}
+
+bool ProcMapsIterator::Valid() const {
+#if defined(PLATFORM_WINDOWS)
+  return snapshot_ != INVALID_HANDLE_VALUE;
+#elif defined(__MACH__)
+  return 1;
+#else
+  return fd_ != -1;
+#endif
+}
+
+bool ProcMapsIterator::Next(uint64 *start, uint64 *end, char **flags,
+                            uint64 *offset, int64 *inode, char **filename) {
+  return NextExt(start, end, flags, offset, inode, filename, NULL, NULL,
+                 NULL, NULL, NULL);
+}
+
+// This has too many arguments.  It should really be building
+// a map object and returning it.  The problem is that this is called
+// when the memory allocator state is undefined, hence the arguments.
+bool ProcMapsIterator::NextExt(uint64 *start, uint64 *end, char **flags,
+                               uint64 *offset, int64 *inode, char **filename,
+                               uint64 *file_mapping, uint64 *file_pages,
+                               uint64 *anon_mapping, uint64 *anon_pages,
+                               dev_t *dev) {
+
+#if defined(__linux__) || defined(__FreeBSD__) || defined(__CYGWIN__) || defined(__CYGWIN32__)
+  do {
+    // Advance to the start of the next line
+    stext_ = nextline_;
+
+    // See if we have a complete line in the buffer already
+    nextline_ = static_cast<char *>(memchr (stext_, '\n', etext_ - stext_));
+    if (!nextline_) {
+      // Shift/fill the buffer so we do have a line
+      int count = etext_ - stext_;
+
+      // Move the current text to the start of the buffer
+      memmove(ibuf_, stext_, count);
+      stext_ = ibuf_;
+      etext_ = ibuf_ + count;
+
+      int nread = 0;            // fill up buffer with text
+      while (etext_ < ebuf_) {
+        NO_INTR(nread = read(fd_, etext_, ebuf_ - etext_));
+        if (nread > 0)
+          etext_ += nread;
+        else
+          break;
+      }
+
+      // Zero out remaining characters in buffer at EOF to avoid returning
+      // garbage from subsequent calls.
+      if (etext_ != ebuf_ && nread == 0) {
+        memset(etext_, 0, ebuf_ - etext_);
+      }
+      *etext_ = '\n';   // sentinel; safe because ibuf extends 1 char beyond ebuf
+      nextline_ = static_cast<char *>(memchr (stext_, '\n', etext_ + 1 - stext_));
+    }
+    *nextline_ = 0;                // turn newline into nul
+    nextline_ += ((nextline_ < etext_)? 1 : 0);  // skip nul if not end of text
+    // stext_ now points at a nul-terminated line
+    uint64 tmpstart, tmpend, tmpoffset;
+    int64 tmpinode;
+    int major, minor;
+    unsigned filename_offset = 0;
+#if defined(__linux__)
+    // for now, assume all linuxes have the same format
+    if (!ParseProcMapsLine(
+        stext_,
+        start ? start : &tmpstart,
+        end ? end : &tmpend,
+        flags_,
+        offset ? offset : &tmpoffset,
+        &major, &minor,
+        inode ? inode : &tmpinode, &filename_offset)) continue;
+#elif defined(__CYGWIN__) || defined(__CYGWIN32__)
+    // cygwin is like linux, except the third field is the "entry point"
+    // rather than the offset (see format_process_maps at
+    // http://cygwin.com/cgi-bin/cvsweb.cgi/src/winsup/cygwin/fhandler_process.cc?rev=1.89&content-type=text/x-cvsweb-markup&cvsroot=src
+    // Offset is always 0 on cygwin: cygwin implements an mmap
+    // by loading the whole file and then calling NtMapViewOfSection.
+    // Cygwin also seems to set its flags kinda randomly; use windows default.
+    char tmpflags[5];
+    if (offset)
+      *offset = 0;
+    strcpy(flags_, "r-xp");
+    if (sscanf(stext_, "%llx-%llx %4s %llx %x:%x %lld %n",
+               start ? start : &tmpstart,
+               end ? end : &tmpend,
+               tmpflags,
+               &tmpoffset,
+               &major, &minor,
+               inode ? inode : &tmpinode, &filename_offset) != 7) continue;
+#elif defined(__FreeBSD__)
+    // For the format, see http://www.freebsd.org/cgi/cvsweb.cgi/src/sys/fs/procfs/procfs_map.c?rev=1.31&content-type=text/x-cvsweb-markup
+    tmpstart = tmpend = tmpoffset = 0;
+    tmpinode = 0;
+    major = minor = 0;   // can't get this info in freebsd
+    if (inode)
+      *inode = 0;        // nor this
+    if (offset)
+      *offset = 0;       // seems like this should be in there, but maybe not
+    // start end resident privateresident obj(?) prot refcnt shadowcnt
+    // flags copy_on_write needs_copy type filename:
+    // 0x8048000 0x804a000 2 0 0xc104ce70 r-x 1 0 0x0 COW NC vnode /bin/cat
+    if (sscanf(stext_, "0x%" SCNx64 " 0x%" SCNx64 " %*d %*d %*p %3s %*d %*d 0x%*x %*s %*s %*s %n",
+               start ? start : &tmpstart,
+               end ? end : &tmpend,
+               flags_,
+               &filename_offset) != 3) continue;
+#endif
+
+    // Depending on the Linux kernel being used, there may or may not be a space
+    // after the inode if there is no filename.  sscanf will in such situations
+    // nondeterministically either fill in filename_offset or not (the results
+    // differ on multiple calls in the same run even with identical arguments).
+    // We don't want to wander off somewhere beyond the end of the string.
+    size_t stext_length = strlen(stext_);
+    if (filename_offset == 0 || filename_offset > stext_length)
+      filename_offset = stext_length;
+
+    // We found an entry
+    if (flags) *flags = flags_;
+    if (filename) *filename = stext_ + filename_offset;
+    if (dev) *dev = minor | (major << 8);
+
+    if (using_maps_backing_) {
+      // Extract and parse physical page backing info.
+      char *backing_ptr = stext_ + filename_offset +
+          strlen(stext_+filename_offset);
+
+      // find the second '('
+      int paren_count = 0;
+      while (--backing_ptr > stext_) {
+        if (*backing_ptr == '(') {
+          ++paren_count;
+          if (paren_count >= 2) {
+            uint64 tmp_file_mapping;
+            uint64 tmp_file_pages;
+            uint64 tmp_anon_mapping;
+            uint64 tmp_anon_pages;
+
+            sscanf(backing_ptr+1, "F %" SCNx64 " %" SCNd64 ") (A %" SCNx64 " %" SCNd64 ")",
+                   file_mapping ? file_mapping : &tmp_file_mapping,
+                   file_pages ? file_pages : &tmp_file_pages,
+                   anon_mapping ? anon_mapping : &tmp_anon_mapping,
+                   anon_pages ? anon_pages : &tmp_anon_pages);
+            // null terminate the file name (there is a space
+            // before the first (.
+            backing_ptr[-1] = 0;
+            break;
+          }
+        }
+      }
+    }
+
+    return true;
+  } while (etext_ > ibuf_);
+#elif defined(__sun__)
+  // This is based on MA_READ == 4, MA_WRITE == 2, MA_EXEC == 1
+  static char kPerms[8][4] = { "---", "--x", "-w-", "-wx",
+                               "r--", "r-x", "rw-", "rwx" };
+  COMPILE_ASSERT(MA_READ == 4, solaris_ma_read_must_equal_4);
+  COMPILE_ASSERT(MA_WRITE == 2, solaris_ma_write_must_equal_2);
+  COMPILE_ASSERT(MA_EXEC == 1, solaris_ma_exec_must_equal_1);
+  Buffer object_path;
+  int nread = 0;            // fill up buffer with text
+  NO_INTR(nread = read(fd_, ibuf_, sizeof(prmap_t)));
+  if (nread == sizeof(prmap_t)) {
+    long inode_from_mapname = 0;
+    prmap_t* mapinfo = reinterpret_cast<prmap_t*>(ibuf_);
+    // Best-effort attempt to get the inode from the filename.  I think the
+    // two middle ints are major and minor device numbers, but I'm not sure.
+    sscanf(mapinfo->pr_mapname, "ufs.%*d.%*d.%ld", &inode_from_mapname);
+
+    if (pid_ == 0) {
+      CHECK_LT(snprintf(object_path.buf_, Buffer::kBufSize,
+                        "/proc/self/path/%s", mapinfo->pr_mapname),
+               Buffer::kBufSize);
+    } else {
+      CHECK_LT(snprintf(object_path.buf_, Buffer::kBufSize,
+                        "/proc/%d/path/%s",
+                        static_cast<int>(pid_), mapinfo->pr_mapname),
+               Buffer::kBufSize);
+    }
+    ssize_t len = readlink(object_path.buf_, current_filename_, PATH_MAX);
+    CHECK_LT(len, PATH_MAX);
+    if (len < 0)
+      len = 0;
+    current_filename_[len] = '\0';
+
+    if (start) *start = mapinfo->pr_vaddr;
+    if (end) *end = mapinfo->pr_vaddr + mapinfo->pr_size;
+    if (flags) *flags = kPerms[mapinfo->pr_mflags & 7];
+    if (offset) *offset = mapinfo->pr_offset;
+    if (inode) *inode = inode_from_mapname;
+    if (filename) *filename = current_filename_;
+    if (file_mapping) *file_mapping = 0;
+    if (file_pages) *file_pages = 0;
+    if (anon_mapping) *anon_mapping = 0;
+    if (anon_pages) *anon_pages = 0;
+    if (dev) *dev = 0;
+    return true;
+  }
+#elif defined(__MACH__)
+  // We return a separate entry for each segment in the DLL. (TODO(csilvers):
+  // can we do better?)  A DLL ("image") has load-commands, some of which
+  // talk about segment boundaries.
+  // cf image_for_address from http://svn.digium.com/view/asterisk/team/oej/minivoicemail/dlfcn.c?revision=53912
+  for (; current_image_ >= 0; current_image_--) {
+    const mach_header* hdr = _dyld_get_image_header(current_image_);
+    if (!hdr) continue;
+    if (current_load_cmd_ < 0)   // set up for this image
+      current_load_cmd_ = hdr->ncmds;  // again, go from the top down
+
+    // We start with the next load command (we've already looked at this one).
+    for (current_load_cmd_--; current_load_cmd_ >= 0; current_load_cmd_--) {
+#ifdef MH_MAGIC_64
+      if (NextExtMachHelper<MH_MAGIC_64, LC_SEGMENT_64,
+                            struct mach_header_64, struct segment_command_64>(
+                                hdr, current_image_, current_load_cmd_,
+                                start, end, flags, offset, inode, filename,
+                                file_mapping, file_pages, anon_mapping,
+                                anon_pages, dev)) {
+        return true;
+      }
+#endif
+      if (NextExtMachHelper<MH_MAGIC, LC_SEGMENT,
+                            struct mach_header, struct segment_command>(
+                                hdr, current_image_, current_load_cmd_,
+                                start, end, flags, offset, inode, filename,
+                                file_mapping, file_pages, anon_mapping,
+                                anon_pages, dev)) {
+        return true;
+      }
+    }
+    // If we get here, no more load_cmd's in this image talk about
+    // segments.  Go on to the next image.
+  }
+#elif defined(PLATFORM_WINDOWS)
+  static char kDefaultPerms[5] = "r-xp";
+  BOOL ok;
+  if (module_.dwSize == 0) {  // only possible before first call
+    module_.dwSize = sizeof(module_);
+    ok = Module32First(snapshot_, &module_);
+  } else {
+    ok = Module32Next(snapshot_, &module_);
+  }
+  if (ok) {
+    uint64 base_addr = reinterpret_cast<DWORD_PTR>(module_.modBaseAddr);
+    if (start) *start = base_addr;
+    if (end) *end = base_addr + module_.modBaseSize;
+    if (flags) *flags = kDefaultPerms;
+    if (offset) *offset = 0;
+    if (inode) *inode = 0;
+    if (filename) *filename = module_.szExePath;
+    if (file_mapping) *file_mapping = 0;
+    if (file_pages) *file_pages = 0;
+    if (anon_mapping) *anon_mapping = 0;
+    if (anon_pages) *anon_pages = 0;
+    if (dev) *dev = 0;
+    return true;
+  }
+#endif
+
+  // We didn't find anything
+  return false;
+}
+
+int ProcMapsIterator::FormatLine(char* buffer, int bufsize,
+                                 uint64 start, uint64 end, const char *flags,
+                                 uint64 offset, int64 inode,
+                                 const char *filename, dev_t dev) {
+  // We assume 'flags' looks like 'rwxp' or 'rwx'.
+  char r = (flags && flags[0] == 'r') ? 'r' : '-';
+  char w = (flags && flags[0] && flags[1] == 'w') ? 'w' : '-';
+  char x = (flags && flags[0] && flags[1] && flags[2] == 'x') ? 'x' : '-';
+  // p always seems set on linux, so we set the default to 'p', not '-'
+  char p = (flags && flags[0] && flags[1] && flags[2] && flags[3] != 'p')
+      ? '-' : 'p';
+
+  const int rc = snprintf(buffer, bufsize,
+                          "%08" PRIx64 "-%08" PRIx64 " %c%c%c%c %08" PRIx64 " %02x:%02x %-11" PRId64 " %s\n",
+                          start, end, r,w,x,p, offset,
+                          static_cast<int>(dev/256), static_cast<int>(dev%256),
+                          inode, filename);
+  return (rc < 0 || rc >= bufsize) ? 0 : rc;
+}
+
+namespace tcmalloc {
+
+// Helper to add the list of mapped shared libraries to a profile.
+// Fills formatted "/proc/self/maps" contents into buffer 'buf' of size
+// 'size' and returns the actual size occupied in 'buf'.  Sets *wrote_all
+// to true if all proc lines were successfully written to 'buf', false
+// otherwise.  'buf' is not 0-terminated.
+int FillProcSelfMaps(char buf[], int size, bool* wrote_all) {
+  ProcMapsIterator::Buffer iterbuf;
+  ProcMapsIterator it(0, &iterbuf);   // 0 means "current pid"
+
+  uint64 start, end, offset;
+  int64 inode;
+  char *flags, *filename;
+  int bytes_written = 0;
+  *wrote_all = true;
+  while (it.Next(&start, &end, &flags, &offset, &inode, &filename)) {
+    const int line_length = it.FormatLine(buf + bytes_written,
+                                          size - bytes_written,
+                                          start, end, flags, offset,
+                                          inode, filename, 0);
+    if (line_length == 0)
+      *wrote_all = false;     // failed to write this line out
+    else
+      bytes_written += line_length;
+
+  }
+  return bytes_written;
+}
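+
+// A minimal usage sketch (illustrative only; the buffer size below is
+// arbitrary):
+//   char buf[1 << 16];
+//   bool wrote_all;
+//   int n = tcmalloc::FillProcSelfMaps(buf, sizeof(buf), &wrote_all);
+//   // buf[0..n) now holds formatted /proc/self/maps lines; if wrote_all
+//   // is false, at least one line did not fit and was skipped.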
+
+// Dump the same data as FillProcSelfMaps gathers, but write it to fd.
+// It seems easier to repeat parts of FillProcSelfMaps here than to
+// reuse it via a call.
+void DumpProcSelfMaps(RawFD fd) {
+  ProcMapsIterator::Buffer iterbuf;
+  ProcMapsIterator it(0, &iterbuf);   // 0 means "current pid"
+
+  uint64 start, end, offset;
+  int64 inode;
+  char *flags, *filename;
+  ProcMapsIterator::Buffer linebuf;
+  while (it.Next(&start, &end, &flags, &offset, &inode, &filename)) {
+    int written = it.FormatLine(linebuf.buf_, sizeof(linebuf.buf_),
+                                start, end, flags, offset, inode, filename,
+                                0);
+    RawWrite(fd, linebuf.buf_, written);
+  }
+}
+
+}  // namespace tcmalloc
diff --git a/src/base/sysinfo.h b/src/base/sysinfo.h
new file mode 100644
index 0000000..cc5cb74
--- /dev/null
+++ b/src/base/sysinfo.h
@@ -0,0 +1,236 @@
+// -*- Mode: C++; c-basic-offset: 2; indent-tabs-mode: nil -*-
+// Copyright (c) 2006, Google Inc.
+// All rights reserved.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+// 
+//     * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+//     * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+// 
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+// All functions here are thread-hostile due to file caching unless
+// commented otherwise.
+
+#ifndef _SYSINFO_H_
+#define _SYSINFO_H_
+
+#include <config.h>
+
+#include <time.h>
+#if (defined(_WIN32) || defined(__MINGW32__)) && (!defined(__CYGWIN__) && !defined(__CYGWIN32__))
+#include <windows.h>   // for DWORD
+#include <tlhelp32.h>  // for CreateToolhelp32Snapshot
+#endif
+#ifdef HAVE_UNISTD_H
+#include <unistd.h>    // for pid_t
+#endif
+#include <stddef.h>    // for size_t
+#include <limits.h>    // for PATH_MAX
+#include "base/basictypes.h"
+#include "base/logging.h"   // for RawFD
+
+// This getenv function is safe to call before the C runtime is initialized.
+// On Windows, it utilizes GetEnvironmentVariable() and on unix it uses
+// /proc/self/environ instead of calling getenv().  It's intended to be used in
+// routines that run before main(), when the state required for getenv() may
+// not be set up yet.  In particular, errno isn't set up until relatively late
+// (after the pthreads library has a chance to make it threadsafe), and
+// getenv() doesn't work until then. 
+// On some platforms, this call will utilize the same static buffer for
+// repeated GetenvBeforeMain() calls. Callers should not expect pointers from
+// this routine to be long lived.
+// Note that on unix, /proc only has the environment at the time the
+// application was started, so this routine ignores setenv() calls/etc.  Also
+// note it only reads the first 16K of the environment.
+extern const char* GetenvBeforeMain(const char* name);
+
+// This takes as an argument an environment-variable name (like
+// CPUPROFILE) whose value is supposed to be a file-path, sets path to
+// that path, and returns true.  Non-trivial for surprising reasons, as
+// documented in sysinfo.cc.  path must have space for PATH_MAX characters.
+extern bool GetUniquePathFromEnv(const char* env_name, char* path);
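+
+// Example usage (a minimal sketch; CPUPROFILE is just an illustration):
+//   char path[PATH_MAX];
+//   if (GetUniquePathFromEnv("CPUPROFILE", path)) {
+//     // path now holds the file-path derived from $CPUPROFILE.
+//   }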
+
+extern int NumCPUs();
+
+void SleepForMilliseconds(int milliseconds);
+
+// processor cycles per second of each processor.  Thread-safe.
+extern double CyclesPerSecond(void);
+
+
+//  Return true if we're running POSIX (e.g., NPTL on Linux) threads,
+//  as opposed to a non-POSIX thread library.  The thing that we care
+//  about is whether a thread's pid is the same as that of the thread
+//  that spawned it.  If so, this function returns true.
+//  Thread-safe.
+//  Note: We consider false negatives to be OK.
+bool HasPosixThreads();
+
+#ifndef SWIG  // SWIG doesn't like struct Buffer and variable arguments.
+
+// A ProcMapsIterator abstracts access to /proc/maps for a given
+// process. Needs to be stack-allocatable and avoid using stdio/malloc
+// so it can be used in the google stack dumper, heap-profiler, etc.
+//
+// On Windows and Mac OS X, this iterator iterates *only* over DLLs
+// mapped into this process space.  For Linux, FreeBSD, and Solaris,
+// it iterates over *all* mapped memory regions, including anonymous
+// mmaps.  For other O/Ss, it is unlikely to work at all, and Valid()
+// will always return false.  Also note: this routine only works on
+// FreeBSD if procfs is mounted: make sure this is in your /etc/fstab:
+//    proc            /proc   procfs  rw 0 0
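+//
+// Example usage (a minimal sketch; see Next() below for the full
+// contract of the out-parameters):
+//   ProcMapsIterator::Buffer buffer;
+//   ProcMapsIterator it(0, &buffer);    // 0 means "this process"
+//   if (it.Valid()) {
+//     uint64 start, end, offset;
+//     int64 inode;
+//     char *flags, *filename;
+//     while (it.Next(&start, &end, &flags, &offset, &inode, &filename)) {
+//       // [start, end) is one mapped region; flags looks like "r-xp".
+//     }
+//   }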
+class ProcMapsIterator {
+ public:
+  struct Buffer {
+#ifdef __FreeBSD__
+    // FreeBSD requires us to read all of the maps file at once, so
+    // we have to make a buffer that's "always" big enough
+    static const size_t kBufSize = 102400;
+#else   // a one-line buffer is good enough
+    static const size_t kBufSize = PATH_MAX + 1024;
+#endif
+    char buf_[kBufSize];
+  };
+
+
+  // Create a new iterator for the specified pid.  pid can be 0 for "self".
+  explicit ProcMapsIterator(pid_t pid);
+
+  // Create an iterator with specified storage (for use in signal
+  // handler). "buffer" should point to a ProcMapsIterator::Buffer
+  // buffer can be NULL in which case a bufer will be allocated.
+  ProcMapsIterator(pid_t pid, Buffer *buffer);
+
+  // Iterate through maps_backing instead of maps if use_maps_backing
+  // is true.  Otherwise the same as above.  buffer can be NULL and
+  // it will allocate a buffer itself.
+  ProcMapsIterator(pid_t pid, Buffer *buffer,
+                   bool use_maps_backing);
+
+  // Returns true if the iterator successfully initialized.
+  bool Valid() const;
+
+  // Returns a pointer to the most recently parsed line. Only valid
+  // after Next() returns true, and until the iterator is destroyed or
+  // Next() is called again.  This may give strange results on non-Linux
+  // systems.  Prefer FormatLine() if that may be a concern.
+  const char *CurrentLine() const { return stext_; }
+
+  // Writes the "canonical" form of the /proc/xxx/maps info for a single
+  // line to the passed-in buffer. Returns the number of bytes written,
+  // or 0 if it was not able to write the complete line.  (To guarantee
+  // success, buffer should have size at least Buffer::kBufSize.)
+  // Takes as arguments values set via a call to Next().  The
+  // "canonical" form of the line (taken from linux's /proc/xxx/maps):
+  //    <start_addr(hex)>-<end_addr(hex)> <perms(rwxp)> <offset(hex)>   +
+  //    <major_dev(hex)>:<minor_dev(hex)> <inode> <filename>
+  // e.g.
+  //    08048000-0804c000 r-xp 00000000 03:01 3793678    /bin/cat
+  // If you don't have the dev_t (dev), feel free to pass in 0.
+  // (Next() doesn't return a dev_t, though NextExt does.)
+  //
+  // Note: if filename and flags were obtained via a call to Next(),
+  // then the output of this function is only valid if Next() returned
+  // true, and only until the iterator is destroyed or Next() is
+  // called again.  (Since filename, at least, points into CurrentLine.)
+  static int FormatLine(char* buffer, int bufsize,
+                        uint64 start, uint64 end, const char *flags,
+                        uint64 offset, int64 inode, const char *filename,
+                        dev_t dev);
+
+  // Find the next entry in /proc/maps; return true if found or false
+  // if at the end of the file.
+  //
+  // Any of the result pointers can be NULL if you're not interested
+  // in those values.
+  //
+  // If "flags" and "filename" are passed, they end up pointing to
+  // storage within the ProcMapsIterator that is valid only until the
+  // iterator is destroyed or Next() is called again. The caller may
+  // modify the contents of these strings (up as far as the first NUL,
+  // and only until the subsequent call to Next()) if desired.
+
+  // The offsets are all uint64 in order to handle the case of a
+  // 32-bit process running on a 64-bit kernel
+  //
+  // IMPORTANT NOTE: see top-of-class notes for details about what
+  // mapped regions Next() iterates over, depending on O/S.
+  // TODO(csilvers): make flags and filename const.
+  bool Next(uint64 *start, uint64 *end, char **flags,
+            uint64 *offset, int64 *inode, char **filename);
+
+  bool NextExt(uint64 *start, uint64 *end, char **flags,
+               uint64 *offset, int64 *inode, char **filename,
+               uint64 *file_mapping, uint64 *file_pages,
+               uint64 *anon_mapping, uint64 *anon_pages,
+               dev_t *dev);
+
+  ~ProcMapsIterator();
+
+ private:
+  void Init(pid_t pid, Buffer *buffer, bool use_maps_backing);
+
+  char *ibuf_;        // input buffer
+  char *stext_;       // start of text
+  char *etext_;       // end of text
+  char *nextline_;    // start of next line
+  char *ebuf_;        // end of buffer (1 char for a nul)
+#if (defined(_WIN32) || defined(__MINGW32__)) && (!defined(__CYGWIN__) && !defined(__CYGWIN32__))
+  HANDLE snapshot_;   // filehandle on dll info
+  // In a change from the usual W-A pattern, there is no A variant of
+  // MODULEENTRY32.  Tlhelp32.h #defines the W variant, but not the A.
+  // We want the original A variants, and this #undef is the only
+  // way I see to get them.  Redefining it when we're done prevents us
+  // from affecting other .cc files.
+# ifdef MODULEENTRY32  // Alias of W
+#   undef MODULEENTRY32
+  MODULEENTRY32 module_;   // info about current dll (and dll iterator)
+#   define MODULEENTRY32 MODULEENTRY32W
+# else  // It's the ascii, the one we want.
+  MODULEENTRY32 module_;   // info about current dll (and dll iterator)
+# endif
+#elif defined(__MACH__)
+  int current_image_; // dll's are called "images" in macos parlance
+  int current_load_cmd_;   // the segment of this dll we're examining
+#elif defined(__sun__)     // Solaris
+  int fd_;
+  char current_filename_[PATH_MAX];
+#else
+  int fd_;            // filehandle on /proc/*/maps
+#endif
+  pid_t pid_;
+  char flags_[10];
+  Buffer* dynamic_buffer_;  // dynamically-allocated Buffer
+  bool using_maps_backing_; // true if we are looking at maps_backing instead of maps.
+};
+
+#endif  /* #ifndef SWIG */
+
+// Helper routines
+
+namespace tcmalloc {
+int FillProcSelfMaps(char buf[], int size, bool* wrote_all);
+void DumpProcSelfMaps(RawFD fd);
+}
+
+#endif   /* #ifndef _SYSINFO_H_ */
diff --git a/src/base/thread_annotations.h b/src/base/thread_annotations.h
new file mode 100644
index 0000000..f57b299
--- /dev/null
+++ b/src/base/thread_annotations.h
@@ -0,0 +1,134 @@
+// Copyright (c) 2008, Google Inc.
+// All rights reserved.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+// 
+//     * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+//     * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+// 
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+// ---
+// Author: Le-Chun Wu
+//
+// This header file contains the macro definitions for thread safety
+// annotations that allow the developers to document the locking policies
+// of their multi-threaded code. The annotations can also help program
+// analysis tools to identify potential thread safety issues.
+//
+// The annotations are implemented using GCC's "attributes" extension.
+// Using the macros defined here instead of the raw GCC attributes allows
+// for portability and future compatibility.
+//
+// This functionality is not yet fully implemented in perftools,
+// but may be one day.
+
+#ifndef BASE_THREAD_ANNOTATIONS_H_
+#define BASE_THREAD_ANNOTATIONS_H_
+
+
+#if defined(__GNUC__) \
+  && (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 4)) \
+  && defined(__SUPPORT_TS_ANNOTATION__) && (!defined(SWIG))
+#define THREAD_ANNOTATION_ATTRIBUTE__(x)   __attribute__((x))
+#else
+#define THREAD_ANNOTATION_ATTRIBUTE__(x)   // no-op
+#endif
+
+
+// Document if a shared variable/field needs to be protected by a lock.
+// GUARDED_BY allows the user to specify a particular lock that should be
+// held when accessing the annotated variable, while GUARDED_VAR only
+// indicates a shared variable should be guarded (by any lock). GUARDED_VAR
+// is primarily used when the client cannot express the name of the lock.
+#define GUARDED_BY(x)          THREAD_ANNOTATION_ATTRIBUTE__(guarded_by(x))
+#define GUARDED_VAR            THREAD_ANNOTATION_ATTRIBUTE__(guarded)
+
+// Document if the memory location pointed to by a pointer should be guarded
+// by a lock when dereferencing the pointer. Similar to GUARDED_VAR,
+// PT_GUARDED_VAR is primarily used when the client cannot express the name
+// of the lock. Note that a pointer variable to a shared memory location
+// could itself be a shared variable. For example, if a shared global pointer
+// q, which is guarded by mu1, points to a shared memory location that is
+// guarded by mu2, q should be annotated as follows:
+//     int *q GUARDED_BY(mu1) PT_GUARDED_BY(mu2);
+#define PT_GUARDED_BY(x) \
+  THREAD_ANNOTATION_ATTRIBUTE__(point_to_guarded_by(x))
+#define PT_GUARDED_VAR \
+  THREAD_ANNOTATION_ATTRIBUTE__(point_to_guarded)
+
+// Document the acquisition order between locks that can be held
+// simultaneously by a thread. For any two locks that need to be annotated
+// to establish an acquisition order, only one of them needs the annotation.
+// (i.e. You don't have to annotate both locks with both ACQUIRED_AFTER
+// and ACQUIRED_BEFORE.)
+#define ACQUIRED_AFTER(x) \
+  THREAD_ANNOTATION_ATTRIBUTE__(acquired_after(x))
+#define ACQUIRED_BEFORE(x) \
+  THREAD_ANNOTATION_ATTRIBUTE__(acquired_before(x))
+
+// The following three annotations document the lock requirements for
+// functions/methods.
+
+// Document if a function expects certain locks to be held before it is called
+#define EXCLUSIVE_LOCKS_REQUIRED(x) \
+  THREAD_ANNOTATION_ATTRIBUTE__(exclusive_locks_required(x))
+
+#define SHARED_LOCKS_REQUIRED(x) \
+  THREAD_ANNOTATION_ATTRIBUTE__(shared_locks_required(x))
+
+// Document the locks acquired in the body of the function. These locks
+// cannot be held when calling this function (as google3's Mutex locks are
+// non-reentrant).
+#define LOCKS_EXCLUDED(x) \
+  THREAD_ANNOTATION_ATTRIBUTE__(locks_excluded(x))
+
+// Document the lock the annotated function returns without acquiring it.
+#define LOCK_RETURNED(x)       THREAD_ANNOTATION_ATTRIBUTE__(lock_returned(x))
+
+// Document if a class/type is a lockable type (such as the Mutex class).
+#define LOCKABLE               THREAD_ANNOTATION_ATTRIBUTE__(lockable)
+
+// Document if a class is a scoped lockable type (such as the MutexLock class).
+#define SCOPED_LOCKABLE        THREAD_ANNOTATION_ATTRIBUTE__(scoped_lockable)
+
+// The following annotations specify lock and unlock primitives.
+#define EXCLUSIVE_LOCK_FUNCTION(x) \
+  THREAD_ANNOTATION_ATTRIBUTE__(exclusive_lock(x))
+
+#define SHARED_LOCK_FUNCTION(x) \
+  THREAD_ANNOTATION_ATTRIBUTE__(shared_lock(x))
+
+#define EXCLUSIVE_TRYLOCK_FUNCTION(x) \
+  THREAD_ANNOTATION_ATTRIBUTE__(exclusive_trylock(x))
+
+#define SHARED_TRYLOCK_FUNCTION(x) \
+  THREAD_ANNOTATION_ATTRIBUTE__(shared_trylock(x))
+
+#define UNLOCK_FUNCTION(x) \
+  THREAD_ANNOTATION_ATTRIBUTE__(unlock(x))
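+
+// Example usage (a minimal sketch; "Mutex" stands in for whatever
+// lockable type the surrounding code provides):
+//   Mutex mu;
+//   int accesses GUARDED_BY(mu);         // must hold mu to read or write
+//   void IncrementAccesses() EXCLUSIVE_LOCKS_REQUIRED(mu);
+//   void Tick() LOCKS_EXCLUDED(mu);      // acquires and releases mu itself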
+
+// An escape hatch for thread safety analysis to ignore the annotated function.
+#define NO_THREAD_SAFETY_ANALYSIS \
+  THREAD_ANNOTATION_ATTRIBUTE__(no_thread_safety_analysis)
+
+#endif  // BASE_THREAD_ANNOTATIONS_H_
diff --git a/src/base/thread_lister.c b/src/base/thread_lister.c
new file mode 100644
index 0000000..ca1b2de
--- /dev/null
+++ b/src/base/thread_lister.c
@@ -0,0 +1,77 @@
+/* Copyright (c) 2005-2007, Google Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are
+ * met:
+ *
+ *     * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following disclaimer
+ * in the documentation and/or other materials provided with the
+ * distribution.
+ *     * Neither the name of Google Inc. nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * ---
+ * Author: Markus Gutschke
+ */
+
+#include "config.h"
+#include <stdio.h>         /* needed for NULL on some powerpc platforms (?!) */
+#ifdef HAVE_SYS_PRCTL
+# include <sys/prctl.h>
+#endif
+#include "base/thread_lister.h"
+#include "base/linuxthreads.h"
+/* Include other thread listers here that define THREADS macro
+ * only when they can provide a good implementation.
+ */
+
+#ifndef THREADS
+
+/* Default trivial thread lister for single-threaded applications,
+ * or if the multi-threading code has not been ported yet.
+ */
+
+int TCMalloc_ListAllProcessThreads(void *parameter,
+				   ListAllProcessThreadsCallBack callback, ...) {
+  int rc;
+  va_list ap;
+  pid_t pid;
+
+#ifdef HAVE_SYS_PRCTL
+  int dumpable = prctl(PR_GET_DUMPABLE, 0);
+  if (!dumpable)
+    prctl(PR_SET_DUMPABLE, 1);
+#endif
+  va_start(ap, callback);
+  pid = getpid();
+  rc = callback(parameter, 1, &pid, ap);
+  va_end(ap);
+#ifdef HAVE_SYS_PRCTL
+  if (!dumpable)
+    prctl(PR_SET_DUMPABLE, 0);
+#endif
+  return rc;
+}
+
+int TCMalloc_ResumeAllProcessThreads(int num_threads, pid_t *thread_pids) {
+  return 1;
+}
+
+#endif   /* ifndef THREADS */
diff --git a/src/base/thread_lister.h b/src/base/thread_lister.h
new file mode 100644
index 0000000..6e70b89
--- /dev/null
+++ b/src/base/thread_lister.h
@@ -0,0 +1,83 @@
+/* -*- Mode: c; c-basic-offset: 2; indent-tabs-mode: nil -*- */
+/* Copyright (c) 2005-2007, Google Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are
+ * met:
+ *
+ *     * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following disclaimer
+ * in the documentation and/or other materials provided with the
+ * distribution.
+ *     * Neither the name of Google Inc. nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * ---
+ * Author: Markus Gutschke
+ */
+
+#ifndef _THREAD_LISTER_H
+#define _THREAD_LISTER_H
+
+#include <stdarg.h>
+#include <sys/types.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef int (*ListAllProcessThreadsCallBack)(void *parameter,
+                                             int num_threads,
+                                             pid_t *thread_pids,
+                                             va_list ap);
+
+/* This function gets the list of all linux threads of the current process
+ * and passes them to the 'callback' along with the 'parameter' pointer; by
+ * the time the callback is invoked, all the threads are paused via
+ * PTRACE_ATTACH.
+ * The callback is executed from a separate thread which shares only the
+ * address space, the filesystem, and the filehandles with the caller. Most
+ * notably, it does not share the same pid and ppid; and if it terminates,
+ * the rest of the application is still there. 'callback' is supposed to
+ * call, or arrange for a call to, TCMalloc_ResumeAllProcessThreads. This
+ * happens automatically if
+ * the thread raises a synchronous signal (e.g. SIGSEGV); asynchronous
+ * signals are blocked. If the 'callback' decides to unblock them, it must
+ * ensure that they cannot terminate the application, or that
+ * TCMalloc_ResumeAllProcessThreads will get called.
+ * It is an error for the 'callback' to make any library calls that could
+ * acquire locks. Most notably, this means that most system calls have to
+ * avoid going through libc. Also, this means that it is not legal to call
+ * exit() or abort().
+ * We return -1 on error and the return value of 'callback' on success.
+ */
+int TCMalloc_ListAllProcessThreads(void *parameter,
+                                   ListAllProcessThreadsCallBack callback, ...);
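+
+/* A minimal usage sketch (illustrative only).  The callback runs while
+ * all threads are ptrace-attached, so it must respect the restrictions
+ * described above:
+ *
+ *   static int CountThreads(void *parameter, int num_threads,
+ *                           pid_t *thread_pids, va_list ap) {
+ *     *(int *)parameter = num_threads;
+ *     return TCMalloc_ResumeAllProcessThreads(num_threads, thread_pids);
+ *   }
+ *
+ *   int n = 0;
+ *   TCMalloc_ListAllProcessThreads(&n, CountThreads);
+ */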
+
+/* This function resumes the list of all linux threads that
+ * TCMalloc_ListAllProcessThreads pauses before giving to its
+ * callback.  The function returns non-zero if at least one thread was
+ * suspended and has now been resumed.
+ */
+int TCMalloc_ResumeAllProcessThreads(int num_threads, pid_t *thread_pids);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif  /* _THREAD_LISTER_H */
diff --git a/src/base/vdso_support.cc b/src/base/vdso_support.cc
new file mode 100644
index 0000000..730df30
--- /dev/null
+++ b/src/base/vdso_support.cc
@@ -0,0 +1,143 @@
+// Copyright (c) 2008, Google Inc.
+// All rights reserved.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+// 
+//     * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+//     * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+// 
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+// ---
+// Author: Paul Pluzhnikov
+//
+// Allow dynamic symbol lookup in the kernel VDSO page.
+//
+// VDSOSupport -- a class representing kernel VDSO (if present).
+//
+
+#include "base/vdso_support.h"
+
+#ifdef HAVE_VDSO_SUPPORT     // defined in vdso_support.h
+
+#include <fcntl.h>
+#include <stddef.h>   // for ptrdiff_t
+
+#include "base/atomicops.h"  // for MemoryBarrier
+#include "base/linux_syscall_support.h"
+#include "base/logging.h"
+#include "base/dynamic_annotations.h"
+#include "base/basictypes.h"  // for COMPILE_ASSERT
+
+using base::subtle::MemoryBarrier;
+
+#ifndef AT_SYSINFO_EHDR
+#define AT_SYSINFO_EHDR 33
+#endif
+
+namespace base {
+
+const void *VDSOSupport::vdso_base_ = ElfMemImage::kInvalidBase;
+VDSOSupport::VDSOSupport()
+    // If vdso_base_ is still set to kInvalidBase, we got here
+    // before VDSOSupport::Init has been called. Call it now.
+    : image_(vdso_base_ == ElfMemImage::kInvalidBase ? Init() : vdso_base_) {
+}
+
+// NOTE: we can't use GoogleOnceInit() below, because we can be
+// called by tcmalloc, and none of the *once* stuff may be functional yet.
+//
+// In addition, we hope that the VDSOSupportHelper constructor
+// causes this code to run before there are any threads, and before
+// InitGoogle() has executed any chroot or setuid calls.
+//
+// Finally, even if there is a race here, it is harmless, because
+// the operation should be idempotent.
+const void *VDSOSupport::Init() {
+  if (vdso_base_ == ElfMemImage::kInvalidBase) {
+    // Valgrind zaps AT_SYSINFO_EHDR and friends from the auxv[]
+    // on stack, and so glibc works as if VDSO was not present.
+    // But going directly to kernel via /proc/self/auxv below bypasses
+    // Valgrind zapping. So we check for Valgrind separately.
+    if (RunningOnValgrind()) {
+      vdso_base_ = NULL;
+      return NULL;
+    }
+    int fd = open("/proc/self/auxv", O_RDONLY);
+    if (fd == -1) {
+      // Kernel too old to have a VDSO.
+      vdso_base_ = NULL;
+      return NULL;
+    }
+    ElfW(auxv_t) aux;
+    while (read(fd, &aux, sizeof(aux)) == sizeof(aux)) {
+      if (aux.a_type == AT_SYSINFO_EHDR) {
+        COMPILE_ASSERT(sizeof(vdso_base_) == sizeof(aux.a_un.a_val),
+                       unexpected_sizeof_pointer_NE_sizeof_a_val);
+        vdso_base_ = reinterpret_cast<void *>(aux.a_un.a_val);
+        break;
+      }
+    }
+    close(fd);
+    if (vdso_base_ == ElfMemImage::kInvalidBase) {
+      // Didn't find AT_SYSINFO_EHDR in auxv[].
+      vdso_base_ = NULL;
+    }
+  }
+  return vdso_base_;
+}
+
+const void *VDSOSupport::SetBase(const void *base) {
+  CHECK(base != ElfMemImage::kInvalidBase);
+  const void *old_base = vdso_base_;
+  vdso_base_ = base;
+  image_.Init(base);
+  return old_base;
+}
+
+bool VDSOSupport::LookupSymbol(const char *name,
+                               const char *version,
+                               int type,
+                               SymbolInfo *info) const {
+  return image_.LookupSymbol(name, version, type, info);
+}
+
+bool VDSOSupport::LookupSymbolByAddress(const void *address,
+                                        SymbolInfo *info_out) const {
+  return image_.LookupSymbolByAddress(address, info_out);
+}
+
+// We need to make sure VDSOSupport::Init() is called before
+// main() runs, since main() might do something like setuid or
+// chroot.  If VDSOSupport
+// is used in any global constructor, this will happen, since
+// VDSOSupport's constructor calls Init.  But if not, we need to
+// ensure it here, with a global constructor of our own.  This
+// is an allowed exception to the normal rule against non-trivial
+// global constructors.
+static class VDSOInitHelper {
+ public:
+  VDSOInitHelper() { VDSOSupport::Init(); }
+} vdso_init_helper;
+}
+
+#endif  // HAVE_VDSO_SUPPORT
diff --git a/src/base/vdso_support.h b/src/base/vdso_support.h
new file mode 100644
index 0000000..c1209a4
--- /dev/null
+++ b/src/base/vdso_support.h
@@ -0,0 +1,132 @@
+// Copyright (c) 2008, Google Inc.
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+//     * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+//     * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+// ---
+// Author: Paul Pluzhnikov
+//
+// Allow dynamic symbol lookup in the kernel VDSO page.
+//
+// VDSO stands for "Virtual Dynamic Shared Object" -- a page of
+// executable code, which looks like a shared library, but doesn't
+// necessarily exist anywhere on disk, and which gets mmap()ed into
+// every process by kernels which support VDSO, such as 2.6.x for 32-bit
+// executables, and 2.6.24 and above for 64-bit executables.
+//
+// More details could be found here:
+// http://www.trilithium.com/johan/2005/08/linux-gate/
+//
+// VDSOSupport -- a class representing kernel VDSO (if present).
+//
+// Example usage:
+//  VDSOSupport vdso;
+//  VDSOSupport::SymbolInfo info;
+//  typedef long (*FN)(unsigned *, void *, void *);
+//  FN fn = NULL;
+//  if (vdso.LookupSymbol("__vdso_getcpu", "LINUX_2.6", STT_FUNC, &info)) {
+//     fn = reinterpret_cast<FN>(info.address);
+//  }
+
+#ifndef BASE_VDSO_SUPPORT_H_
+#define BASE_VDSO_SUPPORT_H_
+
+#include <config.h>
+#include "base/basictypes.h"
+#include "base/elf_mem_image.h"
+
+#ifdef HAVE_ELF_MEM_IMAGE
+
+#define HAVE_VDSO_SUPPORT 1
+
+#include <stdlib.h>     // for NULL
+
+namespace base {
+
+// NOTE: this class may be used from within tcmalloc, and can not
+// use any memory allocation routines.
+class VDSOSupport {
+ public:
+  VDSOSupport();
+
+  typedef ElfMemImage::SymbolInfo SymbolInfo;
+  typedef ElfMemImage::SymbolIterator SymbolIterator;
+
+  // Answers whether we have a vdso at all.
+  bool IsPresent() const { return image_.IsPresent(); }
+
+  // Allows iteration over all VDSO symbols.
+  SymbolIterator begin() const { return image_.begin(); }
+  SymbolIterator end() const { return image_.end(); }
+
+  // Look up versioned dynamic symbol in the kernel VDSO.
+  // Returns false if VDSO is not present, or doesn't contain given
+  // symbol/version/type combination.
+  // If info_out != NULL, additional details are filled in.
+  bool LookupSymbol(const char *name, const char *version,
+                    int symbol_type, SymbolInfo *info_out) const;
+
+  // Find info about symbol (if any) which overlaps given address.
+  // Returns true if symbol was found; false if VDSO isn't present
+  // or doesn't have a symbol overlapping given address.
+  // If info_out != NULL, additional details are filled in.
+  bool LookupSymbolByAddress(const void *address, SymbolInfo *info_out) const;
+
+  // Used only for testing. Replace real VDSO base with a mock.
+  // Returns previous value of vdso_base_. After you are done testing,
+  // you are expected to call SetBase() with previous value, in order to
+  // reset state to the way it was.
+  const void *SetBase(const void *s);
+
+  // Computes vdso_base_ and returns it. Should be called as early as
+  // possible; before any thread creation, chroot or setuid.
+  static const void *Init();
+
+ private:
+  // image_ represents VDSO ELF image in memory.
+  // image_.ehdr_ == NULL implies there is no VDSO.
+  ElfMemImage image_;
+
+  // Cached value of auxv AT_SYSINFO_EHDR, computed once.
+  // This is a tri-state:
+  //   kInvalidBase   => value hasn't been determined yet.
+  //              0   => there is no VDSO.
+  //           else   => vma of VDSO Elf{32,64}_Ehdr.
+  //
+  // When testing with mock VDSO, low bit is set.
+  // The low bit is always available because vdso_base_ is
+  // page-aligned.
+  static const void *vdso_base_;
+
+  DISALLOW_COPY_AND_ASSIGN(VDSOSupport);
+};
+
+}  // namespace base
+
+#endif  // HAVE_ELF_MEM_IMAGE
+
+#endif  // BASE_VDSO_SUPPORT_H_
diff --git a/src/central_freelist.cc b/src/central_freelist.cc
new file mode 100644
index 0000000..11b190d
--- /dev/null
+++ b/src/central_freelist.cc
@@ -0,0 +1,387 @@
+// -*- Mode: C++; c-basic-offset: 2; indent-tabs-mode: nil -*-
+// Copyright (c) 2008, Google Inc.
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+//     * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+//     * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+// ---
+// Author: Sanjay Ghemawat <opensource@google.com>
+
+#include "config.h"
+#include <algorithm>
+#include "central_freelist.h"
+#include "internal_logging.h"  // for ASSERT, MESSAGE
+#include "linked_list.h"       // for SLL_Next, SLL_Push, etc
+#include "page_heap.h"         // for PageHeap
+#include "static_vars.h"       // for Static
+
+using std::min;
+using std::max;
+
+namespace tcmalloc {
+
+void CentralFreeList::Init(size_t cl) {
+  size_class_ = cl;
+  tcmalloc::DLL_Init(&empty_);
+  tcmalloc::DLL_Init(&nonempty_);
+  num_spans_ = 0;
+  counter_ = 0;
+
+  max_cache_size_ = kMaxNumTransferEntries;
+#ifdef TCMALLOC_SMALL_BUT_SLOW
+  // Disable the transfer cache for the small footprint case.
+  cache_size_ = 0;
+#else
+  cache_size_ = 16;
+#endif
+  if (cl > 0) {
+    // Limit the maximum size of the cache based on the size class.  If this
+    // is not done, large size class objects will consume a lot of memory if
+    // they just sit in the transfer cache.
+    int32_t bytes = Static::sizemap()->ByteSizeForClass(cl);
+    int32_t objs_to_move = Static::sizemap()->num_objects_to_move(cl);
+
+    ASSERT(objs_to_move > 0 && bytes > 0);
+    // Limit each size class cache to at most 1MB of objects or one entry,
+    // whichever is greater. Total transfer cache memory used across all
+    // size classes then can't be greater than approximately
+    // 1MB * kNumClasses.
+    // min and max are in parens to avoid macro-expansion on windows.
+    max_cache_size_ = (min)(max_cache_size_,
+                          (max)(1, (1024 * 1024) / (bytes * objs_to_move)));
+    cache_size_ = (min)(cache_size_, max_cache_size_);
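+    // Example (illustrative numbers): with 8 KiB objects moved 8 at a
+    // time, (1024 * 1024) / (8192 * 8) == 16, so that size class would be
+    // capped at 16 transfer-cache entries instead of kMaxNumTransferEntries.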
+  }
+  used_slots_ = 0;
+  ASSERT(cache_size_ <= max_cache_size_);
+}
+
+void CentralFreeList::ReleaseListToSpans(void* start) {
+  while (start) {
+    void *next = SLL_Next(start);
+    ReleaseToSpans(start);
+    start = next;
+  }
+}
+
+// MapObjectToSpan should logically be part of ReleaseToSpans.  But
+// this triggers an optimization bug in gcc 4.5.0.  Moving to a
+// separate function, and making sure that function isn't inlined,
+// seems to fix the problem.  It also should be fixed for gcc 4.5.1.
+static
+#if __GNUC__ == 4 && __GNUC_MINOR__ == 5 && __GNUC_PATCHLEVEL__ == 0
+__attribute__ ((noinline))
+#endif
+Span* MapObjectToSpan(void* object) {
+  const PageID p = reinterpret_cast<uintptr_t>(object) >> kPageShift;
+  Span* span = Static::pageheap()->GetDescriptor(p);
+  return span;
+}
+
+void CentralFreeList::ReleaseToSpans(void* object) {
+  Span* span = MapObjectToSpan(object);
+  ASSERT(span != NULL);
+  ASSERT(span->refcount > 0);
+
+  // If span is empty, move it to non-empty list
+  if (span->objects == NULL) {
+    tcmalloc::DLL_Remove(span);
+    tcmalloc::DLL_Prepend(&nonempty_, span);
+    Event(span, 'N', 0);
+  }
+
+  // The following check is expensive, so it is disabled by default
+  if (false) {
+    // Check that object does not occur in list
+    int got = 0;
+    for (void* p = span->objects; p != NULL; p = *((void**) p)) {
+      ASSERT(p != object);
+      got++;
+    }
+    ASSERT(got + span->refcount ==
+           (span->length<<kPageShift) /
+           Static::sizemap()->ByteSizeForClass(span->sizeclass));
+  }
+
+  counter_++;
+  span->refcount--;
+  if (span->refcount == 0) {
+    Event(span, '#', 0);
+    counter_ -= ((span->length<<kPageShift) /
+                 Static::sizemap()->ByteSizeForClass(span->sizeclass));
+    tcmalloc::DLL_Remove(span);
+    --num_spans_;
+
+    // Release central list lock while operating on pageheap
+    lock_.Unlock();
+    {
+      SpinLockHolder h(Static::pageheap_lock());
+      Static::pageheap()->Delete(span);
+    }
+    lock_.Lock();
+  } else {
+    *(reinterpret_cast<void**>(object)) = span->objects;
+    span->objects = object;
+  }
+}
+
+bool CentralFreeList::EvictRandomSizeClass(
+    int locked_size_class, bool force) {
+  static int race_counter = 0;
+  int t = race_counter++;  // Updated without a lock, but who cares.
+  if (t >= kNumClasses) {
+    while (t >= kNumClasses) {
+      t -= kNumClasses;
+    }
+    race_counter = t;
+  }
+  ASSERT(t >= 0);
+  ASSERT(t < kNumClasses);
+  if (t == locked_size_class) return false;
+  return Static::central_cache()[t].ShrinkCache(locked_size_class, force);
+}
+
+bool CentralFreeList::MakeCacheSpace() {
+  // Is there room in the cache?
+  if (used_slots_ < cache_size_) return true;
+  // Check whether we can expand this cache.
+  if (cache_size_ == max_cache_size_) return false;
+  // Ok, we'll try to grab an entry from some other size class.
+  if (EvictRandomSizeClass(size_class_, false) ||
+      EvictRandomSizeClass(size_class_, true)) {
+    // Succeeded in evicting, we're going to make our cache larger.
+    // However, we may have dropped and re-acquired the lock in
+    // EvictRandomSizeClass (via ShrinkCache and the LockInverter), so the
+    // cache_size may have changed.  Therefore, check and verify that it is
+    // still OK to increase the cache_size.
+    if (cache_size_ < max_cache_size_) {
+      cache_size_++;
+      return true;
+    }
+  }
+  return false;
+}
+
+
+namespace {
+class LockInverter {
+ private:
+  SpinLock *held_, *temp_;
+ public:
+  inline explicit LockInverter(SpinLock* held, SpinLock *temp)
+    : held_(held), temp_(temp) { held_->Unlock(); temp_->Lock(); }
+  inline ~LockInverter() { temp_->Unlock(); held_->Lock();  }
+};
+}
+
+// This function is marked as NO_THREAD_SAFETY_ANALYSIS because it uses
+// LockInverter to release one lock and acquire another in scoped-lock
+// style, which our current annotation/analysis does not support.
+bool CentralFreeList::ShrinkCache(int locked_size_class, bool force)
+    NO_THREAD_SAFETY_ANALYSIS {
+  // Start with a quick check without taking a lock.
+  if (cache_size_ == 0) return false;
+  // We don't evict from a full cache unless we are 'forcing'.
+  if (force == false && used_slots_ == cache_size_) return false;
+
+  // Grab lock, but first release the other lock held by this thread.  We use
+  // the lock inverter to ensure that we never hold two size class locks
+  // concurrently.  That can create a deadlock because there is no well
+  // defined nesting order.
+  LockInverter li(&Static::central_cache()[locked_size_class].lock_, &lock_);
+  ASSERT(used_slots_ <= cache_size_);
+  ASSERT(0 <= cache_size_);
+  if (cache_size_ == 0) return false;
+  if (used_slots_ == cache_size_) {
+    if (force == false) return false;
+    // ReleaseListToSpans releases the lock, so we have to make all the
+    // updates to the central list before calling it.
+    cache_size_--;
+    used_slots_--;
+    ReleaseListToSpans(tc_slots_[used_slots_].head);
+    return true;
+  }
+  cache_size_--;
+  return true;
+}
+
+void CentralFreeList::InsertRange(void *start, void *end, int N) {
+  SpinLockHolder h(&lock_);
+  if (N == Static::sizemap()->num_objects_to_move(size_class_) &&
+    MakeCacheSpace()) {
+    int slot = used_slots_++;
+    ASSERT(slot >= 0);
+    ASSERT(slot < max_cache_size_);
+    TCEntry *entry = &tc_slots_[slot];
+    entry->head = start;
+    entry->tail = end;
+    return;
+  }
+  ReleaseListToSpans(start);
+}
+
+int CentralFreeList::RemoveRange(void **start, void **end, int N) {
+  ASSERT(N > 0);
+  lock_.Lock();
+  if (N == Static::sizemap()->num_objects_to_move(size_class_) &&
+      used_slots_ > 0) {
+    int slot = --used_slots_;
+    ASSERT(slot >= 0);
+    TCEntry *entry = &tc_slots_[slot];
+    *start = entry->head;
+    *end = entry->tail;
+    lock_.Unlock();
+    return N;
+  }
+
+  int result = 0;
+  *start = NULL;
+  *end = NULL;
+  // TODO: Prefetch multiple TCEntries?
+  result = FetchFromOneSpansSafe(N, start, end);
+  if (result != 0) {
+    while (result < N) {
+      int n;
+      void* head = NULL;
+      void* tail = NULL;
+      n = FetchFromOneSpans(N - result, &head, &tail);
+      if (!n) break;
+      result += n;
+      SLL_PushRange(start, head, tail);
+    }
+  }
+  lock_.Unlock();
+  return result;
+}
+
+
+int CentralFreeList::FetchFromOneSpansSafe(int N, void **start, void **end) {
+  int result = FetchFromOneSpans(N, start, end);
+  if (!result) {
+    Populate();
+    result = FetchFromOneSpans(N, start, end);
+  }
+  return result;
+}
+
+int CentralFreeList::FetchFromOneSpans(int N, void **start, void **end) {
+  if (tcmalloc::DLL_IsEmpty(&nonempty_)) return 0;
+  Span* span = nonempty_.next;
+
+  ASSERT(span->objects != NULL);
+
+  int result = 0;
+  void *prev, *curr;
+  curr = span->objects;
+  do {
+    prev = curr;
+    curr = *(reinterpret_cast<void**>(curr));
+  } while (++result < N && curr != NULL);
+
+  if (curr == NULL) {
+    // Move to empty list
+    tcmalloc::DLL_Remove(span);
+    tcmalloc::DLL_Prepend(&empty_, span);
+    Event(span, 'E', 0);
+  }
+
+  *start = span->objects;
+  *end = prev;
+  span->objects = curr;
+  SLL_SetNext(*end, NULL);
+  span->refcount += result;
+  counter_ -= result;
+  return result;
+}
+
+// Fetch memory from the system and add to the central cache freelist.
+void CentralFreeList::Populate() {
+  // Release central list lock while operating on pageheap
+  lock_.Unlock();
+  const size_t npages = Static::sizemap()->class_to_pages(size_class_);
+
+  Span* span;
+  {
+    SpinLockHolder h(Static::pageheap_lock());
+    span = Static::pageheap()->New(npages);
+    if (span) Static::pageheap()->RegisterSizeClass(span, size_class_);
+  }
+  if (span == NULL) {
+    Log(kLog, __FILE__, __LINE__,
+        "tcmalloc: allocation failed", npages << kPageShift);
+    lock_.Lock();
+    return;
+  }
+  ASSERT(span->length == npages);
+  // Cache sizeclass info eagerly.  Locking is not necessary.
+  // (Instead of being eager, we could just replace any stale info
+  // about this span, but that seems to be no better in practice.)
+  for (int i = 0; i < npages; i++) {
+    Static::pageheap()->CacheSizeClass(span->start + i, size_class_);
+  }
+
+  // Split the block into pieces and add to the free-list
+  // TODO: coloring of objects to avoid cache conflicts?
+  void** tail = &span->objects;
+  char* ptr = reinterpret_cast<char*>(span->start << kPageShift);
+  char* limit = ptr + (npages << kPageShift);
+  const size_t size = Static::sizemap()->ByteSizeForClass(size_class_);
+  int num = 0;
+  while (ptr + size <= limit) {
+    *tail = ptr;
+    tail = reinterpret_cast<void**>(ptr);
+    ptr += size;
+    num++;
+  }
+  ASSERT(ptr <= limit);
+  *tail = NULL;
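+  // Example (illustrative): with the default 8 KiB pages, a two-page
+  // (16 KiB) span carved into 1024-byte objects links num == 16 objects
+  // onto the span's free list here.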
+  span->refcount = 0; // No sub-object in use yet
+
+  // Add span to list of non-empty spans
+  lock_.Lock();
+  tcmalloc::DLL_Prepend(&nonempty_, span);
+  ++num_spans_;
+  counter_ += num;
+}
+
+int CentralFreeList::tc_length() {
+  SpinLockHolder h(&lock_);
+  return used_slots_ * Static::sizemap()->num_objects_to_move(size_class_);
+}
+
+size_t CentralFreeList::OverheadBytes() {
+  SpinLockHolder h(&lock_);
+  if (size_class_ == 0) {  // 0 holds the 0-sized allocations
+    return 0;
+  }
+  const size_t pages_per_span = Static::sizemap()->class_to_pages(size_class_);
+  const size_t object_size = Static::sizemap()->class_to_size(size_class_);
+  ASSERT(object_size > 0);
+  const size_t overhead_per_span = (pages_per_span * kPageSize) % object_size;
+  return num_spans_ * overhead_per_span;
+}
+
+}  // namespace tcmalloc
diff --git a/src/central_freelist.h b/src/central_freelist.h
new file mode 100644
index 0000000..4148680
--- /dev/null
+++ b/src/central_freelist.h
@@ -0,0 +1,211 @@
+// -*- Mode: C++; c-basic-offset: 2; indent-tabs-mode: nil -*-
+// Copyright (c) 2008, Google Inc.
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+//     * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+//     * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+// ---
+// Author: Sanjay Ghemawat <opensource@google.com>
+
+#ifndef TCMALLOC_CENTRAL_FREELIST_H_
+#define TCMALLOC_CENTRAL_FREELIST_H_
+
+#include "config.h"
+#include <stddef.h>                     // for size_t
+#ifdef HAVE_STDINT_H
+#include <stdint.h>                     // for int32_t
+#endif
+#include "base/spinlock.h"
+#include "base/thread_annotations.h"
+#include "common.h"
+#include "span.h"
+
+namespace tcmalloc {
+
+// Data kept per size-class in central cache.
+class CentralFreeList {
+ public:
+  // A CentralFreeList may be used before its constructor runs.
+  // So we prevent lock_'s constructor from doing anything to the
+  // lock_ state.
+  CentralFreeList() : lock_(base::LINKER_INITIALIZED) { }
+
+  void Init(size_t cl);
+
+  // These methods all do internal locking.
+
+  // Insert the specified range into the central freelist.  N is the number of
+  // elements in the range.  RemoveRange() is the opposite operation.
+  void InsertRange(void *start, void *end, int N);
+
+  // Returns the actual number of fetched elements and sets *start and *end.
+  int RemoveRange(void **start, void **end, int N);
+
+  // Returns the number of free objects in cache.
+  int length() {
+    SpinLockHolder h(&lock_);
+    return counter_;
+  }
+
+  // Returns the number of free objects in the transfer cache.
+  int tc_length();
+
+  // Returns the memory overhead (internal fragmentation) attributable
+  // to the freelist.  This is memory lost when the size of elements
+  // in a freelist doesn't exactly divide the page-size (an 8192-byte
+  // page full of 5-byte objects would have 2 bytes memory overhead).
+  size_t OverheadBytes();
+
+  // Lock/Unlock the internal SpinLock. Used on the pthread_atfork call
+  // to set the lock in a consistent state before the fork.
+  void Lock() {
+    lock_.Lock();
+  }
+
+  void Unlock() {
+    lock_.Unlock();
+  }
+
+ private:
+  // TransferCache is used to cache transfers of
+  // sizemap.num_objects_to_move(size_class) back and forth between
+  // thread caches and the central cache for a given size class.
+  struct TCEntry {
+    void *head;  // Head of chain of objects.
+    void *tail;  // Tail of chain of objects.
+  };
+
+  // A central cache freelist can have anywhere from 0 to kMaxNumTransferEntries
+  // slots to put linked-list chains into.
+#ifdef TCMALLOC_SMALL_BUT_SLOW
+  // For the small memory model, the transfer cache is not used.
+  static const int kMaxNumTransferEntries = 0;
+#else
+  // Starting point for the maximum number of entries in the transfer cache.
+  // The actual maximum for a given size class may be lower than this
+  // maximum value.
+  static const int kMaxNumTransferEntries = 64;
+#endif
+
+  // REQUIRES: lock_ is held
+  // Removes up to N objects from a single non-empty span, setting *start
+  // and *end.  Returns the number of objects removed (0 if the cache has
+  // no free entries).
+  int FetchFromOneSpans(int N, void **start, void **end) EXCLUSIVE_LOCKS_REQUIRED(lock_);
+
+  // REQUIRES: lock_ is held
+  // Like FetchFromOneSpans, but populates the cache from the page heap
+  // if it is empty.  Only returns 0 on allocation failure.
+  int FetchFromOneSpansSafe(int N, void **start, void **end) EXCLUSIVE_LOCKS_REQUIRED(lock_);
+
+  // REQUIRES: lock_ is held
+  // Release a linked list of objects to spans.
+  // May temporarily release lock_.
+  void ReleaseListToSpans(void *start) EXCLUSIVE_LOCKS_REQUIRED(lock_);
+
+  // REQUIRES: lock_ is held
+  // Release an object to spans.
+  // May temporarily release lock_.
+  void ReleaseToSpans(void* object) EXCLUSIVE_LOCKS_REQUIRED(lock_);
+
+  // REQUIRES: lock_ is held
+  // Populate cache by fetching from the page heap.
+  // May temporarily release lock_.
+  void Populate() EXCLUSIVE_LOCKS_REQUIRED(lock_);
+
+  // REQUIRES: lock_ is held.
+  // Tries to make room for a TCEntry.  If the cache is full, it will try to
+  // expand it at the cost of some other size class's cache.  Returns false
+  // if there is no space.
+  bool MakeCacheSpace() EXCLUSIVE_LOCKS_REQUIRED(lock_);
+
+  // REQUIRES: lock_ for locked_size_class is held.
+  // Picks a "random" size class to steal a TCEntry slot from.  In reality it
+  // just iterates over the sizeclasses but does so without taking a lock.
+  // Returns true on success.
+  // May temporarily lock a "random" size class.
+  static bool EvictRandomSizeClass(int locked_size_class, bool force);
+
+  // REQUIRES: lock_ is *not* held.
+  // Tries to shrink the cache.  If force is true, it will release objects to
+  // spans if that allows it to shrink the cache.  Returns false if it failed
+  // to shrink the cache.  Decrements cache_size_ on success.
+  // May temporarily take lock_.  If it takes lock_, the locked_size_class
+  // lock is released to keep the thread from holding two size class locks
+  // concurrently which could lead to a deadlock.
+  bool ShrinkCache(int locked_size_class, bool force) LOCKS_EXCLUDED(lock_);
+
+  // This lock protects all the data members.  used_slots_ and cache_size_
+  // may be looked at without holding the lock.
+  SpinLock lock_;
+
+  // We keep linked lists of empty and non-empty spans.
+  size_t   size_class_;     // My size class
+  Span     empty_;          // Dummy header for list of empty spans
+  Span     nonempty_;       // Dummy header for list of non-empty spans
+  size_t   num_spans_;      // Number of spans in empty_ plus nonempty_
+  size_t   counter_;        // Number of free objects in cache entry
+
+  // Here we reserve space for TCEntry cache slots.  Space is preallocated
+  // for the largest possible number of entries that any one size class may
+  // accumulate.  Not all size classes are allowed to accumulate
+  // kMaxNumTransferEntries, so there is some wasted space for those size
+  // classes.
+  TCEntry tc_slots_[kMaxNumTransferEntries];
+
+  // Number of currently used cached entries in tc_slots_.  This variable is
+  // updated under a lock but can be read without one.
+  int32_t used_slots_;
+  // The current number of slots for this size class.  This is an
+  // adaptive value that is increased if there is lots of traffic
+  // on a given size class.
+  int32_t cache_size_;
+  // Maximum size of the cache for a given size class.
+  int32_t max_cache_size_;
+};
+
+// Pads each CentralFreeList object to a multiple of 64 bytes.  Since some
+// compilers (such as MSVC) don't like it when the padding is 0, I use
+// template specialization to remove the padding entirely when
+// sizeof(CentralFreeList) is a multiple of 64.
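+// For example (illustrative sizes): if sizeof(CentralFreeList) were 70,
+// kFreeListSizeMod64 would be 6 and pad_[58] would bring the padded object
+// up to 128 bytes, a multiple of 64; when the size is already a multiple of
+// 64, the specialization below adds no padding at all.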
+template<int kFreeListSizeMod64>
+class CentralFreeListPaddedTo : public CentralFreeList {
+ private:
+  char pad_[64 - kFreeListSizeMod64];
+};
+
+template<>
+class CentralFreeListPaddedTo<0> : public CentralFreeList {
+};
+
+class CentralFreeListPadded : public CentralFreeListPaddedTo<
+  sizeof(CentralFreeList) % 64> {
+};
+
+}  // namespace tcmalloc
+
+#endif  // TCMALLOC_CENTRAL_FREELIST_H_
diff --git a/src/common.cc b/src/common.cc
new file mode 100644
index 0000000..3b66afe
--- /dev/null
+++ b/src/common.cc
@@ -0,0 +1,276 @@
+// -*- Mode: C++; c-basic-offset: 2; indent-tabs-mode: nil -*-
+// Copyright (c) 2008, Google Inc.
+// All rights reserved.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+// 
+//     * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+//     * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+// 
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+// ---
+// Author: Sanjay Ghemawat <opensource@google.com>
+
+#include <stdlib.h> // for getenv and strtol
+#include "config.h"
+#include "common.h"
+#include "system-alloc.h"
+#include "base/spinlock.h"
+#include "getenv_safe.h" // TCMallocGetenvSafe
+
+namespace tcmalloc {
+
+// Defines the maximum number of objects per size class to transfer between
+// thread and central caches.
+static int32 FLAGS_tcmalloc_transfer_num_objects;
+
+static const int32 kDefaultTransferNumObjecs = 32768;
+
+// The init function is provided to explicitly initialize the variable value
+// from the environment variable, to avoid C++ global construction that might
+// defer its initialization until after a malloc/new call.
+static inline void InitTCMallocTransferNumObjects()
+{
+  if (UNLIKELY(FLAGS_tcmalloc_transfer_num_objects == 0)) {
+    const char *envval = TCMallocGetenvSafe("TCMALLOC_TRANSFER_NUM_OBJ");
+    FLAGS_tcmalloc_transfer_num_objects = !envval ? kDefaultTransferNumObjecs :
+      strtol(envval, NULL, 10);
+  }
+}
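+// For example (hypothetical invocation), a user could run
+//   TCMALLOC_TRANSFER_NUM_OBJ=1024 ./my_program
+// to lower the cap on objects moved per transfer; kDefaultTransferNumObjecs
+// (32768) is used when the variable is unset.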
+
+// Note: the following only works for "n"s that fit in 32-bits, but
+// that is fine since we only use it for small sizes.
+static inline int LgFloor(size_t n) {
+  int log = 0;
+  for (int i = 4; i >= 0; --i) {
+    int shift = (1 << i);
+    size_t x = n >> shift;
+    if (x != 0) {
+      n = x;
+      log += shift;
+    }
+  }
+  ASSERT(n == 1);
+  return log;
+}
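+// For example, LgFloor(100) == 6, since 2^6 = 64 <= 100 < 128 = 2^7.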
+
+int AlignmentForSize(size_t size) {
+  int alignment = kAlignment;
+  if (size > kMaxSize) {
+    // Cap alignment at kPageSize for large sizes.
+    alignment = kPageSize;
+  } else if (size >= 128) {
+    // Space wasted due to alignment is at most 1/8, i.e., 12.5%.
+    alignment = (1 << LgFloor(size)) / 8;
+  } else if (size >= kMinAlign) {
+    // We need an alignment of at least 16 bytes to satisfy
+    // requirements for some SSE types.
+    alignment = kMinAlign;
+  }
+  // Maximum alignment allowed is page size alignment.
+  if (alignment > kPageSize) {
+    alignment = kPageSize;
+  }
+  CHECK_CONDITION(size < kMinAlign || alignment >= kMinAlign);
+  CHECK_CONDITION((alignment & (alignment - 1)) == 0);
+  return alignment;
+}
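+// For example, AlignmentForSize(136) returns (1 << LgFloor(136)) / 8 ==
+// 128 / 8 == 16, so size classes in that range advance in 16-byte steps.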
+
+int SizeMap::NumMoveSize(size_t size) {
+  if (size == 0) return 0;
+  // Use approx 64k transfers between thread and central caches.
+  int num = static_cast<int>(64.0 * 1024.0 / size);
+  if (num < 2) num = 2;
+
+  // Avoid bringing too many objects into small object free lists.
+  // If this value is too large:
+  // - We waste memory with extra objects sitting in the thread caches.
+  // - The central freelist holds its lock for too long while
+  //   building a linked list of objects, slowing down the allocations
+  //   of other threads.
+  // If this value is too small:
+  // - We go to the central freelist too often and we have to acquire
+  //   its lock each time.
+  // This value strikes a balance between the constraints above.
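+  // Example (with the defaults above): for 256-byte objects num starts at
+  // 64*1024 / 256 == 256, which is well below the default
+  // FLAGS_tcmalloc_transfer_num_objects of 32768, so it is not capped.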
+  if (num > FLAGS_tcmalloc_transfer_num_objects)
+    num = FLAGS_tcmalloc_transfer_num_objects;
+
+  return num;
+}
+
+// Initialize the mapping arrays
+void SizeMap::Init() {
+  InitTCMallocTransferNumObjects();
+
+  // Do some sanity checking on add_amount[]/shift_amount[]/class_array[]
+  if (ClassIndex(0) != 0) {
+    Log(kCrash, __FILE__, __LINE__,
+        "Invalid class index for size 0", ClassIndex(0));
+  }
+  if (ClassIndex(kMaxSize) >= sizeof(class_array_)) {
+    Log(kCrash, __FILE__, __LINE__,
+        "Invalid class index for kMaxSize", ClassIndex(kMaxSize));
+  }
+
+  // Compute the size classes we want to use
+  int sc = 1;   // Next size class to assign
+  int alignment = kAlignment;
+  CHECK_CONDITION(kAlignment <= kMinAlign);
+  for (size_t size = kAlignment; size <= kMaxSize; size += alignment) {
+    alignment = AlignmentForSize(size);
+    CHECK_CONDITION((size % alignment) == 0);
+
+    int blocks_to_move = NumMoveSize(size) / 4;
+    size_t psize = 0;
+    do {
+      psize += kPageSize;
+      // Allocate enough pages so leftover is less than 1/8 of total.
+      // This bounds wasted space to at most 12.5%.
+      while ((psize % size) > (psize >> 3)) {
+        psize += kPageSize;
+      }
+      // Continue to add pages until there are at least as many objects in
+      // the span as are needed when moving objects from the central
+      // freelists and spans to the thread caches.
+    } while ((psize / size) < (blocks_to_move));
+    const size_t my_pages = psize >> kPageShift;
+
+    if (sc > 1 && my_pages == class_to_pages_[sc-1]) {
+      // See if we can merge this into the previous class without
+      // increasing the fragmentation of the previous class.
+      const size_t my_objects = (my_pages << kPageShift) / size;
+      const size_t prev_objects = (class_to_pages_[sc-1] << kPageShift)
+                                  / class_to_size_[sc-1];
+      if (my_objects == prev_objects) {
+        // Adjust last class to include this size
+        class_to_size_[sc-1] = size;
+        continue;
+      }
+    }
+
+    // Add new class
+    class_to_pages_[sc] = my_pages;
+    class_to_size_[sc] = size;
+    sc++;
+  }
+  if (sc != kNumClasses) {
+    Log(kCrash, __FILE__, __LINE__,
+        "wrong number of size classes: (found vs. expected )", sc, kNumClasses);
+  }
+
+  // Initialize the mapping arrays
+  int next_size = 0;
+  for (int c = 1; c < kNumClasses; c++) {
+    const int max_size_in_class = class_to_size_[c];
+    for (int s = next_size; s <= max_size_in_class; s += kAlignment) {
+      class_array_[ClassIndex(s)] = c;
+    }
+    next_size = max_size_in_class + kAlignment;
+  }
+
+  // Double-check sizes just to be safe
+  for (size_t size = 0; size <= kMaxSize;) {
+    const int sc = SizeClass(size);
+    if (sc <= 0 || sc >= kNumClasses) {
+      Log(kCrash, __FILE__, __LINE__,
+          "Bad size class (class, size)", sc, size);
+    }
+    if (sc > 1 && size <= class_to_size_[sc-1]) {
+      Log(kCrash, __FILE__, __LINE__,
+          "Allocating unnecessarily large class (class, size)", sc, size);
+    }
+    const size_t s = class_to_size_[sc];
+    if (size > s || s == 0) {
+      Log(kCrash, __FILE__, __LINE__,
+          "Bad (class, size, requested)", sc, s, size);
+    }
+    if (size <= kMaxSmallSize) {
+      size += 8;
+    } else {
+      size += 128;
+    }
+  }
+
+  // Initialize the num_objects_to_move array.
+  for (size_t cl = 1; cl < kNumClasses; ++cl) {
+    num_objects_to_move_[cl] = NumMoveSize(ByteSizeForClass(cl));
+  }
+}
+
+// Metadata allocator -- keeps stats about how many bytes are allocated.
+static uint64_t metadata_system_bytes_ = 0;
+static const size_t kMetadataAllocChunkSize = 8*1024*1024;
+static const size_t kMetadataBigAllocThreshold = kMetadataAllocChunkSize / 8;
+// Usually malloc uses larger alignments, but because metadata cannot
+// hold any fancy SIMD types, aligning on pointer size seems fine.
+static const size_t kMetadataAllignment = sizeof(void *);
+
+static char *metadata_chunk_alloc_;
+static size_t metadata_chunk_avail_;
+
+static SpinLock metadata_alloc_lock(SpinLock::LINKER_INITIALIZED);
+
+void* MetaDataAlloc(size_t bytes) {
+  if (bytes >= kMetadataAllocChunkSize) {
+    void *rv = TCMalloc_SystemAlloc(bytes,
+                                    NULL, kMetadataAllignment);
+    if (rv != NULL) {
+      metadata_system_bytes_ += bytes;
+    }
+    return rv;
+  }
+
+  SpinLockHolder h(&metadata_alloc_lock);
+
+  // The following works by taking the address modulo kMetadataAllignment
+  // and negating it: the negated value plus the original value is 0 modulo
+  // kMetadataAllignment, which is exactly the padding we need.  Note that we
+  // negate before masking off the higher bits; otherwise we would have to
+  // mask them off after the negation anyway.
+  intptr_t alignment = -reinterpret_cast<intptr_t>(metadata_chunk_alloc_) & (kMetadataAllignment-1);
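+  // Example: if metadata_chunk_alloc_ ends in 0x5 and kMetadataAllignment
+  // is 8, then -0x5 & 0x7 == 3, and advancing the pointer by 3 bytes makes
+  // the returned address 8-byte aligned.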
+
+  if (metadata_chunk_avail_ < bytes + alignment) {
+    size_t real_size;
+    void *ptr = TCMalloc_SystemAlloc(kMetadataAllocChunkSize,
+                                     &real_size, kMetadataAllignment);
+    if (ptr == NULL) {
+      return NULL;
+    }
+
+    metadata_chunk_alloc_ = static_cast<char *>(ptr);
+    metadata_chunk_avail_ = real_size;
+
+    alignment = 0;
+  }
+
+  void *rv = static_cast<void *>(metadata_chunk_alloc_ + alignment);
+  bytes += alignment;
+  metadata_chunk_alloc_ += bytes;
+  metadata_chunk_avail_ -= bytes;
+  metadata_system_bytes_ += bytes;
+  return rv;
+}
+
+uint64_t metadata_system_bytes() { return metadata_system_bytes_; }
+
+}  // namespace tcmalloc
diff --git a/src/common.h b/src/common.h
new file mode 100644
index 0000000..c3484d3
--- /dev/null
+++ b/src/common.h
@@ -0,0 +1,274 @@
+// -*- Mode: C++; c-basic-offset: 2; indent-tabs-mode: nil -*-
+// Copyright (c) 2008, Google Inc.
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+//     * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+//     * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+// ---
+// Author: Sanjay Ghemawat <opensource@google.com>
+//
+// Common definitions for tcmalloc code.
+
+#ifndef TCMALLOC_COMMON_H_
+#define TCMALLOC_COMMON_H_
+
+#include "config.h"
+#include <stddef.h>                     // for size_t
+#ifdef HAVE_STDINT_H
+#include <stdint.h>                     // for uintptr_t, uint64_t
+#endif
+#include "internal_logging.h"  // for ASSERT, etc
+#include "base/basictypes.h"   // for LIKELY, etc
+
+#ifdef HAVE_BUILTIN_EXPECT
+#define LIKELY(x) __builtin_expect(!!(x), 1)
+#define UNLIKELY(x) __builtin_expect(!!(x), 0)
+#else
+#define LIKELY(x) (x)
+#define UNLIKELY(x) (x)
+#endif
+
+// Type that can hold a page number
+typedef uintptr_t PageID;
+
+// Type that can hold the length of a run of pages
+typedef uintptr_t Length;
+
+//-------------------------------------------------------------------
+// Configuration
+//-------------------------------------------------------------------
+
+#if defined(TCMALLOC_ALIGN_8BYTES)
+// Unless we are forced to use 8-byte alignment, we use an alignment of
+// at least 16 bytes to satisfy requirements for some SSE types.
+// Keep in mind that with 16-byte alignment the space wasted due to
+// alignment can be up to 25% (e.g. a malloc of 24 bytes will get 32 bytes).
+static const size_t kMinAlign   = 8;
+// Number of size classes created for sizes up to 128 bytes.
+static const size_t kBaseClasses = 16;
+#else
+static const size_t kMinAlign   = 16;
+static const size_t kBaseClasses = 9;
+#endif
+
+// Using large pages speeds up the execution at a cost of larger memory use.
+// Deallocation may speed up by a factor as the page map gets 8x smaller, so
+// lookups in the page map result in fewer L2 cache misses, which translates to
+// speedup for application/platform combinations with high L2 cache pressure.
+// As the number of size classes increases with large pages, we increase
+// the thread cache allowance to avoid passing more free ranges to and from
+// central lists.  Also, larger pages are less likely to get freed.
+// These two factors cause a bounded increase in memory use.
+#if defined(TCMALLOC_32K_PAGES)
+static const size_t kPageShift  = 15;
+static const size_t kNumClasses = kBaseClasses + 69;
+#elif defined(TCMALLOC_64K_PAGES)
+static const size_t kPageShift  = 16;
+static const size_t kNumClasses = kBaseClasses + 73;
+#else
+static const size_t kPageShift  = 13;
+static const size_t kNumClasses = kBaseClasses + 79;
+#endif
+
+static const size_t kMaxThreadCacheSize = 4 << 20;
+
+static const size_t kPageSize   = 1 << kPageShift;
+static const size_t kMaxSize    = 256 * 1024;
+static const size_t kAlignment  = 8;
+static const size_t kLargeSizeClass = 0;
+// For all span-lengths < kMaxPages we keep an exact-size list.
+static const size_t kMaxPages = 1 << (20 - kPageShift);
+
+// Default bound on the total amount of thread caches.
+#ifdef TCMALLOC_SMALL_BUT_SLOW
+// Make the overall thread cache no bigger than that of a single thread
+// for the small memory footprint case.
+static const size_t kDefaultOverallThreadCacheSize = kMaxThreadCacheSize;
+#else
+static const size_t kDefaultOverallThreadCacheSize = 8u * kMaxThreadCacheSize;
+#endif
+
+// Lower bound on the per-thread cache sizes
+static const size_t kMinThreadCacheSize = kMaxSize * 2;
+
+// The number of bytes one ThreadCache will steal from another when
+// the first ThreadCache is forced to Scavenge(), delaying the
+// next call to Scavenge for this thread.
+static const size_t kStealAmount = 1 << 16;
+
+// The number of times that a deallocation can cause a freelist to
+// go over its max_length() before shrinking max_length().
+static const int kMaxOverages = 3;
+
+// Maximum length we allow a per-thread free-list to have before we
+// move objects from it into the corresponding central free-list.  We
+// want this big to avoid locking the central free-list too often.  It
+// should not hurt to make this list somewhat big because the
+// scavenging code will shrink it down when its contents are not in use.
+static const int kMaxDynamicFreeListLength = 8192;
+
+static const Length kMaxValidPages = (~static_cast<Length>(0)) >> kPageShift;
+
+#if defined __x86_64__
+// All current and planned x86_64 processors only look at the lower 48 bits
+// in virtual to physical address translation.  The top 16 are thus unused.
+// TODO(rus): Under what operating systems can we increase it safely to 17?
+// This lets us use smaller page maps.  On first allocation, a 36-bit page map
+// uses only 96 KB instead of the 4.5 MB used by a 52-bit page map.
+static const int kAddressBits = (sizeof(void*) < 8 ? (8 * sizeof(void*)) : 48);
+#else
+static const int kAddressBits = 8 * sizeof(void*);
+#endif
+
+namespace tcmalloc {
+
+// Convert byte size into pages.  This won't overflow, but may return
+// an unreasonably large value if bytes is huge enough.
+inline Length pages(size_t bytes) {
+  return (bytes >> kPageShift) +
+      ((bytes & (kPageSize - 1)) > 0 ? 1 : 0);
+}
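+// For example, with the default 8 KiB pages (kPageShift == 13),
+// pages(1) == 1, pages(8192) == 1 and pages(8193) == 2.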
+
+// For larger allocation sizes, we use larger memory alignments to
+// reduce the number of size classes.
+int AlignmentForSize(size_t size);
+
+// Size-class information + mapping
+class SizeMap {
+ private:
+  // Number of objects to move between a per-thread list and a central
+  // list in one shot.  We want this to be not too small so we can
+  // amortize the lock overhead for accessing the central list.  Making
+  // it too big may temporarily cause unnecessary memory wastage in the
+  // per-thread free list until the scavenger cleans up the list.
+  int num_objects_to_move_[kNumClasses];
+
+  //-------------------------------------------------------------------
+  // Mapping from size to size_class and vice versa
+  //-------------------------------------------------------------------
+
+  // Sizes <= 1024 have an alignment >= 8.  So for such sizes we have an
+  // array indexed by ceil(size/8).  Sizes > 1024 have an alignment >= 128.
+  // So for these larger sizes we have an array indexed by ceil(size/128).
+  //
+  // We flatten both logical arrays into one physical array and use
+  // arithmetic to compute an appropriate index.  The constants used by
+  // ClassIndex() were selected to make the flattening work.
+  //
+  // Examples:
+  //   Size       Expression                      Index
+  //   -------------------------------------------------------
+  //   0          (0 + 7) / 8                     0
+  //   1          (1 + 7) / 8                     1
+  //   ...
+  //   1024       (1024 + 7) / 8                  128
+  //   1025       (1025 + 127 + (120<<7)) / 128   129
+  //   ...
+  //   32768      (32768 + 127 + (120<<7)) / 128  376
+  static const int kMaxSmallSize = 1024;
+  static const size_t kClassArraySize =
+      ((kMaxSize + 127 + (120 << 7)) >> 7) + 1;
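+  // (With kMaxSize == 256 * 1024 this evaluates to
+  //  ((262144 + 127 + 15360) >> 7) + 1 == 2169 entries.)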
+  unsigned char class_array_[kClassArraySize];
+
+  // Compute index of the class_array[] entry for a given size
+  static inline size_t ClassIndex(int s) {
+    // Use unsigned arithmetic to avoid unnecessary sign extensions.
+    ASSERT(0 <= s);
+    ASSERT(s <= kMaxSize);
+    if (LIKELY(s <= kMaxSmallSize)) {
+      return (static_cast<uint32_t>(s) + 7) >> 3;
+    } else {
+      return (static_cast<uint32_t>(s) + 127 + (120 << 7)) >> 7;
+    }
+  }
+
+  int NumMoveSize(size_t size);
+
+  // Mapping from size class to max size storable in that class
+  size_t class_to_size_[kNumClasses];
+
+  // Mapping from size class to number of pages to allocate at a time
+  size_t class_to_pages_[kNumClasses];
+
+ public:
+  // Constructor should do nothing since we rely on explicit Init()
+  // call, which may or may not be called before the constructor runs.
+  SizeMap() { }
+
+  // Initialize the mapping arrays
+  void Init();
+
+  inline int SizeClass(int size) {
+    return class_array_[ClassIndex(size)];
+  }
+
+  // Get the byte-size for a specified class
+  inline size_t ByteSizeForClass(size_t cl) {
+    return class_to_size_[cl];
+  }
+
+  // Mapping from size class to max size storable in that class
+  inline size_t class_to_size(size_t cl) {
+    return class_to_size_[cl];
+  }
+
+  // Mapping from size class to number of pages to allocate at a time
+  inline size_t class_to_pages(size_t cl) {
+    return class_to_pages_[cl];
+  }
+
+  // Number of objects to move between a per-thread list and a central
+  // list in one shot.  We want this to be not too small so we can
+  // amortize the lock overhead for accessing the central list.  Making
+  // it too big may temporarily cause unnecessary memory wastage in the
+  // per-thread free list until the scavenger cleans up the list.
+  inline int num_objects_to_move(size_t cl) {
+    return num_objects_to_move_[cl];
+  }
+};
+
+// Allocates "bytes" worth of memory and returns it.  Increments
+// metadata_system_bytes appropriately.  May return NULL if allocation
+// fails.  Requires pageheap_lock is held.
+void* MetaDataAlloc(size_t bytes);
+
+// Returns the total number of bytes allocated from the system.
+// Requires pageheap_lock is held.
+uint64_t metadata_system_bytes();
+
+// size/depth are made the same size as a pointer so that some generic
+// code below can conveniently cast them back and forth to void*.
+static const int kMaxStackDepth = 31;
+struct StackTrace {
+  uintptr_t size;          // Size of object
+  uintptr_t depth;         // Number of PC values stored in array below
+  void*     stack[kMaxStackDepth];
+};
+
+}  // namespace tcmalloc
+
+#endif  // TCMALLOC_COMMON_H_
diff --git a/src/config_for_unittests.h b/src/config_for_unittests.h
new file mode 100644
index 0000000..66592a7
--- /dev/null
+++ b/src/config_for_unittests.h
@@ -0,0 +1,65 @@
+// -*- Mode: C++; c-basic-offset: 2; indent-tabs-mode: nil -*-
+// Copyright (c) 2007, Google Inc.
+// All rights reserved.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+// 
+//     * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+//     * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+// 
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+// ---
+// All Rights Reserved.
+//
+// Author: Craig Silverstein
+//
+// This file is needed for windows -- unittests are not part of the
+// perftools dll, but still want to include config.h just like the
+// dll does, so they can use internal tools and APIs for testing.
+//
+// The problem is that config.h declares PERFTOOLS_DLL_DECL to be
+// for exporting symbols, but the unittest needs to *import* symbols
+// (since it's not the dll).
+//
+// The solution is to have this file, which is just like config.h but
+// sets PERFTOOLS_DLL_DECL to do a dllimport instead of a dllexport.
+//
+// The reason we need this extra PERFTOOLS_DLL_DECL_FOR_UNITTESTS
+// variable is in case people want to set PERFTOOLS_DLL_DECL explicitly
+// to something other than __declspec(dllexport).  In that case, they
+// may want to use something other than __declspec(dllimport) for the
+// unittest case.  For that, we allow folks to define both
+// PERFTOOLS_DLL_DECL and PERFTOOLS_DLL_DECL_FOR_UNITTESTS explicitly.
+//
+// NOTE: This file is equivalent to config.h on non-windows systems,
+// which never define PERFTOOLS_DLL_DECL_FOR_UNITTESTS and always
+// define PERFTOOLS_DLL_DECL to the empty string.
+
+#include "config.h"
+
+#undef PERFTOOLS_DLL_DECL
+#ifdef PERFTOOLS_DLL_DECL_FOR_UNITTESTS
+# define PERFTOOLS_DLL_DECL  PERFTOOLS_DLL_DECL_FOR_UNITTESTS
+#else
+# define PERFTOOLS_DLL_DECL  // if DLL_DECL_FOR_UNITTESTS isn't defined, use ""
+#endif
diff --git a/src/debugallocation.cc b/src/debugallocation.cc
new file mode 100644
index 0000000..c170bc7
--- /dev/null
+++ b/src/debugallocation.cc
@@ -0,0 +1,1458 @@
+// -*- Mode: C++; c-basic-offset: 2; indent-tabs-mode: nil -*-
+// Copyright (c) 2000, Google Inc.
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+//     * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+//     * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+// ---
+// Author: Urs Holzle <opensource@google.com>
+
+#include "config.h"
+#include <errno.h>
+#ifdef HAVE_FCNTL_H
+#include <fcntl.h>
+#endif
+#ifdef HAVE_INTTYPES_H
+#include <inttypes.h>
+#endif
+// We only need malloc.h for struct mallinfo.
+#ifdef HAVE_STRUCT_MALLINFO
+// Malloc can be in several places on older versions of OS X.
+# if defined(HAVE_MALLOC_H)
+# include <malloc.h>
+# elif defined(HAVE_MALLOC_MALLOC_H)
+# include <malloc/malloc.h>
+# elif defined(HAVE_SYS_MALLOC_H)
+# include <sys/malloc.h>
+# endif
+#endif
+#ifdef HAVE_PTHREAD
+#include <pthread.h>
+#endif
+#include <stdarg.h>
+#include <stdio.h>
+#include <string.h>
+#ifdef HAVE_MMAP
+#include <sys/mman.h>
+#endif
+#include <sys/stat.h>
+#include <sys/types.h>
+#ifdef HAVE_UNISTD_H
+#include <unistd.h>
+#endif
+
+#include <gperftools/malloc_extension.h>
+#include <gperftools/malloc_hook.h>
+#include <gperftools/stacktrace.h>
+#include "addressmap-inl.h"
+#include "base/commandlineflags.h"
+#include "base/googleinit.h"
+#include "base/logging.h"
+#include "base/spinlock.h"
+#include "malloc_hook-inl.h"
+#include "symbolize.h"
+
+// NOTE: due to #define below, tcmalloc.cc will omit tc_XXX
+// definitions. So that debug implementations can be defined
+// instead. We're going to use do_malloc, do_free and other do_XXX
+// functions that are defined in tcmalloc.cc for actual memory
+// management
+#define TCMALLOC_USING_DEBUGALLOCATION
+#include "tcmalloc.cc"
+
+// __THROW is defined in glibc systems.  It means, counter-intuitively,
+// "This function will never throw an exception."  It's an optional
+// optimization tool, but we may need to use it to match glibc prototypes.
+#ifndef __THROW    // I guess we're not on a glibc system
+# define __THROW   // __THROW is just an optimization, so ok to make it ""
+#endif
+
+// On systems (like freebsd) that don't define MAP_ANONYMOUS, use the old
+// form of the name instead.
+#ifndef MAP_ANONYMOUS
+# define MAP_ANONYMOUS MAP_ANON
+#endif
+
+// ========================================================================= //
+
+DEFINE_bool(malloctrace,
+            EnvToBool("TCMALLOC_TRACE", false),
+            "Enables memory (de)allocation tracing to /tmp/google.alloc.");
+#ifdef HAVE_MMAP
+DEFINE_bool(malloc_page_fence,
+            EnvToBool("TCMALLOC_PAGE_FENCE", false),
+            "Enables putting of memory allocations at page boundaries "
+            "with a guard page following the allocation (to catch buffer "
+            "overruns right when they happen).");
+DEFINE_bool(malloc_page_fence_never_reclaim,
+            EnvToBool("TCMALLOC_PAGE_FENCE_NEVER_RECLAIM", false),
+            "Enables making the virtual address space inaccessible "
+            "upon a deallocation instead of returning it and reusing later.");
+#else
+DEFINE_bool(malloc_page_fence, false, "Not usable (requires mmap)");
+DEFINE_bool(malloc_page_fence_never_reclaim, false, "Not usable (requires mmap)");
+#endif
+DEFINE_bool(malloc_reclaim_memory,
+            EnvToBool("TCMALLOC_RECLAIM_MEMORY", true),
+            "If set to false, we never return memory to malloc "
+            "when an object is deallocated. This ensures that all "
+            "heap object addresses are unique.");
+DEFINE_int32(max_free_queue_size,
+             EnvToInt("TCMALLOC_MAX_FREE_QUEUE_SIZE", 10*1024*1024),
+             "If greater than 0, keep freed blocks in a queue instead of "
+             "releasing them to the allocator immediately.  Release them when "
+             "the total size of all blocks in the queue would otherwise exceed "
+             "this limit.");
+
+DEFINE_bool(symbolize_stacktrace,
+            EnvToBool("TCMALLOC_SYMBOLIZE_STACKTRACE", true),
+            "Symbolize the stack trace when provided (on some error exits)");
+
+// If we are LD_PRELOAD-ed against a non-pthreads app, then
+// pthread_once won't be defined.  We declare it here, for that
+// case (with weak linkage) which will cause the non-definition to
+// resolve to NULL.  We can then check for NULL or not in Instance.
+extern "C" int pthread_once(pthread_once_t *, void (*)(void))
+    ATTRIBUTE_WEAK;
+
+// ========================================================================= //
+
+// A safe version of printf() that does not do any allocation and
+// uses very little stack space.
+static void TracePrintf(int fd, const char *fmt, ...)
+  __attribute__ ((__format__ (__printf__, 2, 3)));
+
+// Round "value" up to next "alignment" boundary.
+// Requires that "alignment" be a power of two.
+static intptr_t RoundUp(intptr_t value, intptr_t alignment) {
+  return (value + alignment - 1) & ~(alignment - 1);
+}
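+// For example, RoundUp(13, 8) == 16 and RoundUp(16, 8) == 16.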
+
+// ========================================================================= //
+
+class MallocBlock;
+
+// A circular buffer to hold freed blocks of memory.  MallocBlock::Deallocate
+// (below) pushes blocks into this queue instead of returning them to the
+// underlying allocator immediately.  See MallocBlock::Deallocate for more
+// information.
+//
+// We can't use an STL class for this because we need to be careful not to
+// perform any heap de-allocations in any of the code in this class, since the
+// code in MallocBlock::Deallocate is not re-entrant.
+template <typename QueueEntry>
+class FreeQueue {
+ public:
+  FreeQueue() : q_front_(0), q_back_(0) {}
+
+  bool Full() {
+    return (q_front_ + 1) % kFreeQueueSize == q_back_;
+  }
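+  // Note that one slot is left unused: a full queue
+  // ((q_front_ + 1) % kFreeQueueSize == q_back_) is thereby distinguishable
+  // from an empty one (q_front_ == q_back_), so at most kFreeQueueSize - 1
+  // blocks are queued at a time.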
+
+  void Push(const QueueEntry& block) {
+    q_[q_front_] = block;
+    q_front_ = (q_front_ + 1) % kFreeQueueSize;
+  }
+
+  QueueEntry Pop() {
+    RAW_CHECK(q_back_ != q_front_, "Queue is empty");
+    const QueueEntry& ret = q_[q_back_];
+    q_back_ = (q_back_ + 1) % kFreeQueueSize;
+    return ret;
+  }
+
+  size_t size() const {
+    return (q_front_ - q_back_ + kFreeQueueSize) % kFreeQueueSize;
+  }
+
+ private:
+  // Maximum number of blocks kept in the free queue before being freed.
+  static const int kFreeQueueSize = 1024;
+
+  QueueEntry q_[kFreeQueueSize];
+  int q_front_;
+  int q_back_;
+};
+
+struct MallocBlockQueueEntry {
+  MallocBlockQueueEntry() : block(NULL), size(0),
+                            num_deleter_pcs(0), deleter_threadid(0) {}
+  MallocBlockQueueEntry(MallocBlock* b, size_t s) : block(b), size(s) {
+    if (FLAGS_max_free_queue_size != 0 && b != NULL) {
+      // Adjust the number of frames to skip (4) if you change the
+      // location of this call.
+      num_deleter_pcs =
+          GetStackTrace(deleter_pcs,
+                        sizeof(deleter_pcs) / sizeof(deleter_pcs[0]),
+                        4);
+      deleter_threadid = pthread_self();
+    } else {
+      num_deleter_pcs = 0;
+      // Zero is an illegal pthread id by my reading of the pthread
+      // implementation:
+      deleter_threadid = 0;
+    }
+  }
+
+  MallocBlock* block;
+  size_t size;
+
+  // When deleted and put in the free queue, we (flag-controlled)
+  // record the stack so that if corruption is later found, we can
+  // print the deleter's stack.  (These three vars add 144 bytes of
+  // overhead under the LP64 data model.)
+  void* deleter_pcs[16];
+  int num_deleter_pcs;
+  pthread_t deleter_threadid;
+};
+
+class MallocBlock {
+ public:  // allocation type constants
+
+  // Different allocation types we distinguish.
+  // Note: The lower 4 bits are not random: we index kAllocName array
+  // by these values masked with kAllocTypeMask;
+  // the rest are "random" magic bits to help catch memory corruption.
+  static const int kMallocType = 0xEFCDAB90;
+  static const int kNewType = 0xFEBADC81;
+  static const int kArrayNewType = 0xBCEADF72;
+
+ private:  // constants
+
+  // A mask used on alloc types above to get to 0, 1, 2
+  static const int kAllocTypeMask = 0x3;
+  // An additional bit to set in AllocType constants
+  // to mark now deallocated regions.
+  static const int kDeallocatedTypeBit = 0x4;
+
+  // For better memory debugging, we initialize all storage to known
+  // values, and overwrite the storage when it's deallocated:
+  // Byte that fills uninitialized storage.
+  static const int kMagicUninitializedByte = 0xAB;
+  // Byte that fills deallocated storage.
+  // NOTE: tcmalloc.cc depends on the value of kMagicDeletedByte
+  //       to work around a bug in the pthread library.
+  static const int kMagicDeletedByte = 0xCD;
+  // A size_t (type of alloc_type_ below) in a deallocated storage
+  // filled with kMagicDeletedByte.
+  static const size_t kMagicDeletedSizeT =
+      0xCDCDCDCD | (((size_t)0xCDCDCDCD << 16) << 16);
+    // Initializer works for 32 and 64 bit size_ts;
+    // "<< 16 << 16" is to keep gcc from issuing a warning
+    // when size_ts are 32 bits.
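+    // (On 64-bit size_ts this evaluates to 0xCDCDCDCDCDCDCDCD; on 32-bit
+    // size_ts the shifted term is 0 and the value is just 0xCDCDCDCD.)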
+
+  // NOTE: on Linux, you can enable malloc debugging support in libc by
+  // setting the environment variable MALLOC_CHECK_ to 1 before you
+  // start the program (see man malloc).
+
+  // We use either do_malloc or mmap to make the actual allocation. In
+  // order to remember which one of the two was used for any block, we store an
+  // appropriate magic word next to the block.
+  static const int kMagicMalloc = 0xDEADBEEF;
+  static const int kMagicMMap = 0xABCDEFAB;
+
+  // This array will be filled with 0xCD, for use with memcmp.
+  static unsigned char kMagicDeletedBuffer[1024];
+  static pthread_once_t deleted_buffer_initialized_;
+  static bool deleted_buffer_initialized_no_pthreads_;
+
+ private:  // data layout
+
+                    // The four fields size1_,offset_,magic1_,alloc_type_
+                    // should together occupy a multiple of 16 bytes. (At the
+                    // moment, sizeof(size_t) == 4 or 8 depending on piii vs
+                    // k8, and 4 of those sum to 16 or 32 bytes).
+                    // This, combined with do_malloc's alignment guarantees,
+                    // ensures that SSE types can be stored into the returned
+                    // block, at &size2_.
+  size_t size1_;
+  size_t offset_;   // normally 0 unless memaligned memory
+                    // see comments in memalign() and FromRawPointer().
+  size_t magic1_;
+  size_t alloc_type_;
+  // here comes the actual data (variable length)
+  // ...
+  // then come the size2_ and magic2_, or a full page of mprotect-ed memory
+  // if the malloc_page_fence feature is enabled.
+  size_t size2_;
+  int magic2_;
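+  // So the overall layout of a live, non-mmapped block is roughly:
+  //   [size1_][offset_][magic1_][alloc_type_][user data ...][size2][magic2]
+  // where data_addr() == &size2_ is the start of the user data, and the
+  // trailing size2/magic2 copies are written via size2_addr()/magic2_addr().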
+
+ private:  // static data and helpers
+
+  // Allocation map: stores the allocation type for each allocated object,
+  // or the type or'ed with kDeallocatedTypeBit
+  // for each formerly allocated object.
+  typedef AddressMap<int> AllocMap;
+  static AllocMap* alloc_map_;
+  // This protects alloc_map_ and consistent state of metadata
+  // for each still-allocated object in it.
+  // We use spin locks instead of pthread_mutex_t locks
+  // to prevent crashes via calls to pthread_mutex_(un)lock
+  // for the (de)allocations coming from pthreads initialization itself.
+  static SpinLock alloc_map_lock_;
+
+  // A queue of freed blocks.  Instead of releasing blocks to the allocator
+  // immediately, we put them in a queue, freeing them only when necessary
+  // to keep the total size of all the freed blocks below the limit set by
+  // FLAGS_max_free_queue_size.
+  static FreeQueue<MallocBlockQueueEntry>* free_queue_;
+
+  static size_t free_queue_size_;  // total size of blocks in free_queue_
+  // protects free_queue_ and free_queue_size_
+  static SpinLock free_queue_lock_;
+
+  // Names of allocation types (kMallocType, kNewType, kArrayNewType)
+  static const char* const kAllocName[];
+  // Names of corresponding deallocation types
+  static const char* const kDeallocName[];
+
+  static const char* AllocName(int type) {
+    return kAllocName[type & kAllocTypeMask];
+  }
+
+  static const char* DeallocName(int type) {
+    return kDeallocName[type & kAllocTypeMask];
+  }
+
+ private:  // helper accessors
+
+  bool IsMMapped() const { return kMagicMMap == magic1_; }
+
+  bool IsValidMagicValue(int value) const {
+    return kMagicMMap == value  ||  kMagicMalloc == value;
+  }
+
+  static size_t real_malloced_size(size_t size) {
+    return size + sizeof(MallocBlock);
+  }
+
+  /*
+   * Here we assume the page size is kMinAlign aligned, so if size is
+   * MALLOC_ALIGNMENT aligned too, then we can guarantee the returned
+   * address is also kMinAlign aligned, because mmap returns addresses
+   * at page boundaries on Linux.
+   */
+  static size_t real_mmapped_size(size_t size) {
+    size_t tmp = size + MallocBlock::data_offset();
+    tmp = RoundUp(tmp, kMinAlign);
+    return tmp;
+  }
+
+  size_t real_size() {
+    return IsMMapped() ? real_mmapped_size(size1_) : real_malloced_size(size1_);
+  }
+
+  // NOTE: if the block is mmapped (that is, we're using the
+  // malloc_page_fence option) then there's no size2 or magic2
+  // (instead, the guard page begins where size2 would be).
+
+  size_t* size2_addr() { return (size_t*)((char*)&size2_ + size1_); }
+  const size_t* size2_addr() const {
+    return (const size_t*)((char*)&size2_ + size1_);
+  }
+
+  int* magic2_addr() { return (int*)(size2_addr() + 1); }
+  const int* magic2_addr() const { return (const int*)(size2_addr() + 1); }
+
+ private:  // other helpers
+
+  void Initialize(size_t size, int type) {
+    RAW_CHECK(IsValidMagicValue(magic1_), "");
+    // record us as allocated in the map
+    alloc_map_lock_.Lock();
+    if (!alloc_map_) {
+      void* p = do_malloc(sizeof(AllocMap));
+      alloc_map_ = new(p) AllocMap(do_malloc, do_free);
+    }
+    alloc_map_->Insert(data_addr(), type);
+    // initialize us
+    size1_ = size;
+    offset_ = 0;
+    alloc_type_ = type;
+    if (!IsMMapped()) {
+      *magic2_addr() = magic1_;
+      *size2_addr() = size;
+    }
+    alloc_map_lock_.Unlock();
+    memset(data_addr(), kMagicUninitializedByte, size);
+    if (!IsMMapped()) {
+      RAW_CHECK(size1_ == *size2_addr(), "should hold");
+      RAW_CHECK(magic1_ == *magic2_addr(), "should hold");
+    }
+  }
+
+  size_t CheckAndClear(int type) {
+    alloc_map_lock_.Lock();
+    CheckLocked(type);
+    if (!IsMMapped()) {
+      RAW_CHECK(size1_ == *size2_addr(), "should hold");
+    }
+    // record us as deallocated in the map
+    alloc_map_->Insert(data_addr(), type | kDeallocatedTypeBit);
+    alloc_map_lock_.Unlock();
+    // clear us
+    const size_t size = real_size();
+    memset(this, kMagicDeletedByte, size);
+    return size;
+  }
+
+  void CheckLocked(int type) const {
+    int map_type = 0;
+    const int* found_type =
+      alloc_map_ != NULL ? alloc_map_->Find(data_addr()) : NULL;
+    if (found_type == NULL) {
+      RAW_LOG(FATAL, "memory allocation bug: object at %p "
+                     "has never been allocated", data_addr());
+    } else {
+      map_type = *found_type;
+    }
+    if ((map_type & kDeallocatedTypeBit) != 0) {
+      RAW_LOG(FATAL, "memory allocation bug: object at %p "
+                     "has been already deallocated (it was allocated with %s)",
+                     data_addr(), AllocName(map_type & ~kDeallocatedTypeBit));
+    }
+    if (alloc_type_ == kMagicDeletedSizeT) {
+      RAW_LOG(FATAL, "memory stomping bug: a word before object at %p "
+                     "has been corrupted; or else the object has been already "
+                     "deallocated and our memory map has been corrupted",
+                     data_addr());
+    }
+    if (!IsValidMagicValue(magic1_)) {
+      RAW_LOG(FATAL, "memory stomping bug: a word before object at %p "
+                     "has been corrupted; "
+                     "or else our memory map has been corrupted and this is a "
+                     "deallocation for not (currently) heap-allocated object",
+                     data_addr());
+    }
+    if (!IsMMapped()) {
+      if (size1_ != *size2_addr()) {
+        RAW_LOG(FATAL, "memory stomping bug: a word after object at %p "
+                       "has been corrupted", data_addr());
+      }
+      if (!IsValidMagicValue(*magic2_addr())) {
+        RAW_LOG(FATAL, "memory stomping bug: a word after object at %p "
+                "has been corrupted", data_addr());
+      }
+    }
+    if (alloc_type_ != type) {
+      if ((alloc_type_ != MallocBlock::kMallocType) &&
+          (alloc_type_ != MallocBlock::kNewType)    &&
+          (alloc_type_ != MallocBlock::kArrayNewType)) {
+        RAW_LOG(FATAL, "memory stomping bug: a word before object at %p "
+                       "has been corrupted", data_addr());
+      }
+      RAW_LOG(FATAL, "memory allocation/deallocation mismatch at %p: "
+                     "allocated with %s being deallocated with %s",
+                     data_addr(), AllocName(alloc_type_), DeallocName(type));
+    }
+    if (alloc_type_ != map_type) {
+      RAW_LOG(FATAL, "memory stomping bug: our memory map has been corrupted : "
+                     "allocation at %p made with %s "
+                     "is recorded in the map to be made with %s",
+                     data_addr(), AllocName(alloc_type_),  AllocName(map_type));
+    }
+  }
+
+ public:  // public accessors
+
+  void* data_addr() { return (void*)&size2_; }
+  const void* data_addr() const { return (const void*)&size2_; }
+
+  static size_t data_offset() { return OFFSETOF_MEMBER(MallocBlock, size2_); }
+
+  size_t data_size() const { return size1_; }
+
+  void set_offset(int offset) { this->offset_ = offset; }
+
+ public:  // our main interface
+
+  static MallocBlock* Allocate(size_t size, int type) {
+    // Prevent an integer overflow / crash with large allocation sizes.
+    // TODO - Note that for e.g. a 64-bit size_t, max_size_t may not actually
+    // be the maximum value, depending on how the compiler treats ~0. The worst
+    // practical effect is that allocations are limited to 4Gb or so, even if
+    // the address space could take more.
+    static size_t max_size_t = ~0;
+    if (size > max_size_t - sizeof(MallocBlock)) {
+      RAW_LOG(ERROR, "Massive size passed to malloc: %" PRIuS "", size);
+      return NULL;
+    }
+    MallocBlock* b = NULL;
+    const bool use_malloc_page_fence = FLAGS_malloc_page_fence;
+#ifdef HAVE_MMAP
+    if (use_malloc_page_fence) {
+      // Put the block towards the end of the page and make the next page
+      // inaccessible. This will catch buffer overrun right when it happens.
+      size_t sz = real_mmapped_size(size);
+      int pagesize = getpagesize();
+      int num_pages = (sz + pagesize - 1) / pagesize + 1;
+      char* p = (char*) mmap(NULL, num_pages * pagesize, PROT_READ|PROT_WRITE,
+                             MAP_PRIVATE|MAP_ANONYMOUS, -1, 0);
+      if (p == MAP_FAILED) {
+        // If the allocation fails, abort rather than returning NULL to
+        // malloc. This is because in most cases, the program will run out
+        // of memory in this mode due to tremendous amount of wastage. There
+        // is no point in propagating the error elsewhere.
+        RAW_LOG(FATAL, "Out of memory: possibly due to page fence overhead: %s",
+                strerror(errno));
+      }
+      // Mark the page after the block inaccessible
+      if (mprotect(p + (num_pages - 1) * pagesize, pagesize, PROT_NONE)) {
+        RAW_LOG(FATAL, "Guard page setup failed: %s", strerror(errno));
+      }
+      b = (MallocBlock*) (p + (num_pages - 1) * pagesize - sz);
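+      // Layout example with hypothetical numbers: for a 4096-byte page and
+      // sz == 200, num_pages == 2; the guard page starts at p + 4096 and the
+      // block is placed at p + 4096 - 200, so its end abuts the guard page.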
+    } else {
+      b = (MallocBlock*) do_malloc(real_malloced_size(size));
+    }
+#else
+    b = (MallocBlock*) do_malloc(real_malloced_size(size));
+#endif
+
+    // It would be nice to output a diagnostic on allocation failure
+    // here, but logging (other than FATAL) requires allocating
+    // memory, which could trigger a nasty recursion. Instead, preserve
+    // malloc semantics and return NULL on failure.
+    if (b != NULL) {
+      b->magic1_ = use_malloc_page_fence ? kMagicMMap : kMagicMalloc;
+      b->Initialize(size, type);
+    }
+    return b;
+  }
+
+  void Deallocate(int type) {
+    if (IsMMapped()) {  // have to do this before CheckAndClear
+#ifdef HAVE_MMAP
+      int size = CheckAndClear(type);
+      int pagesize = getpagesize();
+      int num_pages = (size + pagesize - 1) / pagesize + 1;
+      char* p = (char*) this;
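+      // p - (num_pages - 1) * pagesize + size is the start of the original
+      // mmap()ed region (see the placement in Allocate() above), so the
+      // calls below cover the whole mapping, guard page included.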
+      if (FLAGS_malloc_page_fence_never_reclaim  ||
+          !FLAGS_malloc_reclaim_memory) {
+        mprotect(p - (num_pages - 1) * pagesize + size,
+                 num_pages * pagesize, PROT_NONE);
+      } else {
+        munmap(p - (num_pages - 1) * pagesize + size, num_pages * pagesize);
+      }
+#endif
+    } else {
+      const size_t size = CheckAndClear(type);
+      if (FLAGS_malloc_reclaim_memory) {
+        // Instead of freeing the block immediately, push it onto a queue of
+        // recently freed blocks.  Free only enough blocks to keep from
+        // exceeding the capacity of the queue or causing the total amount of
+        // un-released memory in the queue from exceeding
+        // FLAGS_max_free_queue_size.
+        ProcessFreeQueue(this, size, FLAGS_max_free_queue_size);
+      }
+    }
+  }
+
+  static size_t FreeQueueSize() {
+    SpinLockHolder l(&free_queue_lock_);
+    return free_queue_size_;
+  }
+
+  static void ProcessFreeQueue(MallocBlock* b, size_t size,
+                               int max_free_queue_size) {
+    // MallocBlockQueueEntry is about 144 bytes in size, so we can only
+    // use a small array of them on the stack.
+    MallocBlockQueueEntry entries[4];
+    int num_entries = 0;
+    MallocBlockQueueEntry new_entry(b, size);
+    free_queue_lock_.Lock();
+    if (free_queue_ == NULL)
+      free_queue_ = new FreeQueue<MallocBlockQueueEntry>;
+    RAW_CHECK(!free_queue_->Full(), "Free queue mustn't be full!");
+
+    if (b != NULL) {
+      free_queue_size_ += size + sizeof(MallocBlockQueueEntry);
+      free_queue_->Push(new_entry);
+    }
+
+    // Free blocks until the total size of unfreed blocks no longer exceeds
+    // max_free_queue_size, and the free queue has at least one free
+    // space in it.
+    while (free_queue_size_ > max_free_queue_size || free_queue_->Full()) {
+      RAW_CHECK(num_entries < arraysize(entries), "entries array overflow");
+      entries[num_entries] = free_queue_->Pop();
+      free_queue_size_ -=
+          entries[num_entries].size + sizeof(MallocBlockQueueEntry);
+      num_entries++;
+      if (num_entries == arraysize(entries)) {
+        // The queue will not be full at this point, so it is ok to
+        // release the lock.  The queue may still contain more than
+        // max_free_queue_size, but this is not a strict invariant.
+        free_queue_lock_.Unlock();
+        for (int i = 0; i < num_entries; i++) {
+          CheckForDanglingWrites(entries[i]);
+          do_free(entries[i].block);
+        }
+        num_entries = 0;
+        free_queue_lock_.Lock();
+      }
+    }
+    RAW_CHECK(free_queue_size_ >= 0, "Free queue size went negative!");
+    free_queue_lock_.Unlock();
+    for (int i = 0; i < num_entries; i++) {
+      CheckForDanglingWrites(entries[i]);
+      do_free(entries[i].block);
+    }
+  }
+
+  static void InitDeletedBuffer() {
+    memset(kMagicDeletedBuffer, kMagicDeletedByte, sizeof(kMagicDeletedBuffer));
+    deleted_buffer_initialized_no_pthreads_ = true;
+  }
+
+  static void CheckForDanglingWrites(const MallocBlockQueueEntry& queue_entry) {
+    // Initialize the buffer if necessary.
+    if (pthread_once)
+      pthread_once(&deleted_buffer_initialized_, &InitDeletedBuffer);
+    if (!deleted_buffer_initialized_no_pthreads_) {
+      // This will be the case on systems that don't link in pthreads,
+      // including on FreeBSD where pthread_once has a non-zero address
+      // (but doesn't do anything) even when pthreads isn't linked in.
+      InitDeletedBuffer();
+    }
+
+    const unsigned char* p =
+        reinterpret_cast<unsigned char*>(queue_entry.block);
+
+    static const size_t size_of_buffer = sizeof(kMagicDeletedBuffer);
+    const size_t size = queue_entry.size;
+    const size_t buffers = size / size_of_buffer;
+    const size_t remainder = size % size_of_buffer;
+    size_t buffer_idx;
+    for (buffer_idx = 0; buffer_idx < buffers; ++buffer_idx) {
+      CheckForCorruptedBuffer(queue_entry, buffer_idx, p, size_of_buffer);
+      p += size_of_buffer;
+    }
+    CheckForCorruptedBuffer(queue_entry, buffer_idx, p, remainder);
+  }
+
+  static void CheckForCorruptedBuffer(const MallocBlockQueueEntry& queue_entry,
+                                      size_t buffer_idx,
+                                      const unsigned char* buffer,
+                                      size_t size_of_buffer) {
+    if (memcmp(buffer, kMagicDeletedBuffer, size_of_buffer) == 0) {
+      return;
+    }
+
+    RAW_LOG(ERROR,
+            "Found a corrupted memory buffer in MallocBlock (may be offset "
+            "from user ptr): buffer index: %zd, buffer ptr: %p, size of "
+            "buffer: %zd", buffer_idx, buffer, size_of_buffer);
+
+    // The magic deleted buffer should only be 1024 bytes, but in case
+    // this changes, let's put an upper limit on the number of debug
+    // lines we'll output:
+    if (size_of_buffer <= 1024) {
+      for (int i = 0; i < size_of_buffer; ++i) {
+        if (buffer[i] != kMagicDeletedByte) {
+          RAW_LOG(ERROR, "Buffer byte %d is 0x%02x (should be 0x%02x).",
+                  i, buffer[i], kMagicDeletedByte);
+        }
+      }
+    } else {
+      RAW_LOG(ERROR, "Buffer too large to print corruption.");
+    }
+
+    const MallocBlock* b = queue_entry.block;
+    const size_t size = queue_entry.size;
+    if (queue_entry.num_deleter_pcs > 0) {
+      TracePrintf(STDERR_FILENO, "Deleted by thread %p\n",
+                  reinterpret_cast<void*>(
+                      PRINTABLE_PTHREAD(queue_entry.deleter_threadid)));
+
+      // We don't want to allocate or deallocate memory here, so we use
+      // placement-new.  It's ok that we don't destroy this, since we're
+      // just going to error-exit below anyway.  Union is for alignment.
+      union { void* alignment; char buf[sizeof(SymbolTable)]; } tablebuf;
+      SymbolTable* symbolization_table = new (tablebuf.buf) SymbolTable;
+      for (int i = 0; i < queue_entry.num_deleter_pcs; i++) {
+        // Symbolizes the previous address of pc because pc may be in the
+        // next function.  This may happen when the function ends with
+        // a call to a function annotated noreturn (e.g. CHECK).
+        char *pc = reinterpret_cast<char*>(queue_entry.deleter_pcs[i]);
+        symbolization_table->Add(pc - 1);
+      }
+      if (FLAGS_symbolize_stacktrace)
+        symbolization_table->Symbolize();
+      for (int i = 0; i < queue_entry.num_deleter_pcs; i++) {
+        char *pc = reinterpret_cast<char*>(queue_entry.deleter_pcs[i]);
+        TracePrintf(STDERR_FILENO, "    @ %p %s\n",
+                    pc, symbolization_table->GetSymbol(pc - 1));
+      }
+    } else {
+      RAW_LOG(ERROR,
+              "Skipping the printing of the deleter's stack!  Its stack was "
+              "not found; either the corruption occurred too early in "
+              "execution to obtain a stack trace or --max_free_queue_size was "
+              "set to 0.");
+    }
+
+    RAW_LOG(FATAL,
+            "Memory was written to after being freed.  MallocBlock: %p, user "
+            "ptr: %p, size: %zd.  If you can't find the source of the error, "
+            "try using ASan (http://code.google.com/p/address-sanitizer/), "
+            "Valgrind, or Purify, or study the "
+            "output of the deleter's stack printed above.",
+            b, b->data_addr(), size);
+  }
+
+  static MallocBlock* FromRawPointer(void* p) {
+    const size_t data_offset = MallocBlock::data_offset();
+    // Find the header just before client's memory.
+    MallocBlock *mb = reinterpret_cast<MallocBlock *>(
+                reinterpret_cast<char *>(p) - data_offset);
+    // If mb->alloc_type_ is kMagicDeletedSizeT, we're not an ok pointer.
+    if (mb->alloc_type_ == kMagicDeletedSizeT) {
+      RAW_LOG(FATAL, "memory allocation bug: object at %p has been already"
+                     " deallocated; or else a word before the object has been"
+                     " corrupted (memory stomping bug)", p);
+    }
+    // If mb->offset_ is zero (common case), mb is the real header.
+    // If mb->offset_ is non-zero, this block was allocated by the debug
+    // memalign implementation, and mb->offset_ is the distance
+    // backwards to the real header from mb, which is a fake header.
+    if (mb->offset_ == 0) {
+      return mb;
+    }
+
+    MallocBlock *main_block = reinterpret_cast<MallocBlock *>(
+      reinterpret_cast<char *>(mb) - mb->offset_);
+
+    if (main_block->offset_ != 0) {
+      RAW_LOG(FATAL, "memory corruption bug: offset_ field is corrupted."
+              " Need 0 but got %x",
+              (unsigned)(main_block->offset_));
+    }
+    if (main_block >= p) {
+      RAW_LOG(FATAL, "memory corruption bug: offset_ field is corrupted."
+              " Detected main_block address overflow: %x",
+              (unsigned)(mb->offset_));
+    }
+    if (main_block->size2_addr() < p) {
+      RAW_LOG(FATAL, "memory corruption bug: offset_ field is corrupted."
+              " It points below its own main_block: %x",
+              (unsigned)(mb->offset_));
+    }
+
+    return main_block;
+  }
+
+  static const MallocBlock* FromRawPointer(const void* p) {
+    // const-safe version: we just cast about
+    return FromRawPointer(const_cast<void*>(p));
+  }
+
+  void Check(int type) const {
+    alloc_map_lock_.Lock();
+    CheckLocked(type);
+    alloc_map_lock_.Unlock();
+  }
+
+  static bool CheckEverything() {
+    alloc_map_lock_.Lock();
+    if (alloc_map_ != NULL)  alloc_map_->Iterate(CheckCallback, 0);
+    alloc_map_lock_.Unlock();
+    return true;  // if we get here, we're okay
+  }
+
+  static bool MemoryStats(int* blocks, size_t* total,
+                          int histogram[kMallocHistogramSize]) {
+    memset(histogram, 0, kMallocHistogramSize * sizeof(int));
+    alloc_map_lock_.Lock();
+    stats_blocks_ = 0;
+    stats_total_ = 0;
+    stats_histogram_ = histogram;
+    if (alloc_map_ != NULL) alloc_map_->Iterate(StatsCallback, 0);
+    *blocks = stats_blocks_;
+    *total = stats_total_;
+    alloc_map_lock_.Unlock();
+    return true;
+  }
+
+ private:  // helpers for CheckEverything and MemoryStats
+
+  static void CheckCallback(const void* ptr, int* type, int dummy) {
+    if ((*type & kDeallocatedTypeBit) == 0) {
+      FromRawPointer(ptr)->CheckLocked(*type);
+    }
+  }
+
+  // Accumulation variables for StatsCallback protected by alloc_map_lock_
+  static int stats_blocks_;
+  static size_t stats_total_;
+  static int* stats_histogram_;
+
+  static void StatsCallback(const void* ptr, int* type, int dummy) {
+    if ((*type & kDeallocatedTypeBit) == 0) {
+      const MallocBlock* b = FromRawPointer(ptr);
+      b->CheckLocked(*type);
+      ++stats_blocks_;
+      size_t mysize = b->size1_;
+      int entry = 0;
+      stats_total_ += mysize;
+      while (mysize) {
+        ++entry;
+        mysize >>= 1;
+      }
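+      // entry is now floor(log2(original size)) + 1 for non-zero sizes; e.g.
+      // a 1-byte block lands in bucket 1 and a 4096-byte one in bucket 13.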
+      RAW_CHECK(entry < kMallocHistogramSize,
+                "kMallocHistogramSize should be at least as large as log2 "
+                "of the maximum process memory size");
+      stats_histogram_[entry] += 1;
+    }
+  }
+};
+
+void DanglingWriteChecker() {
+  // Clear out the remaining free queue to check for dangling writes.
+  MallocBlock::ProcessFreeQueue(NULL, 0, 0);
+}
+
+// ========================================================================= //
+
+const int MallocBlock::kMagicMalloc;
+const int MallocBlock::kMagicMMap;
+
+MallocBlock::AllocMap* MallocBlock::alloc_map_ = NULL;
+SpinLock MallocBlock::alloc_map_lock_(SpinLock::LINKER_INITIALIZED);
+
+FreeQueue<MallocBlockQueueEntry>* MallocBlock::free_queue_ = NULL;
+size_t MallocBlock::free_queue_size_ = 0;
+SpinLock MallocBlock::free_queue_lock_(SpinLock::LINKER_INITIALIZED);
+
+unsigned char MallocBlock::kMagicDeletedBuffer[1024];
+pthread_once_t MallocBlock::deleted_buffer_initialized_ = PTHREAD_ONCE_INIT;
+bool MallocBlock::deleted_buffer_initialized_no_pthreads_ = false;
+
+const char* const MallocBlock::kAllocName[] = {
+  "malloc",
+  "new",
+  "new []",
+  NULL,
+};
+
+const char* const MallocBlock::kDeallocName[] = {
+  "free",
+  "delete",
+  "delete []",
+  NULL,
+};
+
+int MallocBlock::stats_blocks_;
+size_t MallocBlock::stats_total_;
+int* MallocBlock::stats_histogram_;
+
+// ========================================================================= //
+
+// The following cut-down version of printf() avoids
+// using stdio or ostreams.
+// This is to guarantee no recursive calls into
+// the allocator and to bound the stack space consumed.  (The pthread
+// manager thread in linuxthreads has a very small stack,
+// so fprintf can't be called.)
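+// Only these conversions are supported: %s, %d, %u, %ld, %lu, %zu and %p
+// (no field widths or other modifiers); anything else aborts.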
+static void TracePrintf(int fd, const char *fmt, ...) {
+  char buf[64];
+  int i = 0;
+  va_list ap;
+  va_start(ap, fmt);
+  const char *p = fmt;
+  char numbuf[25];
+  if (fd < 0) {
+    return;
+  }
+  numbuf[sizeof(numbuf)-1] = 0;
+  while (*p != '\0') {              // until end of format string
+    char *s = &numbuf[sizeof(numbuf)-1];
+    if (p[0] == '%' && p[1] != 0) {  // handle % formats
+      int64 l = 0;
+      unsigned long base = 0;
+      if (*++p == 's') {                            // %s
+        s = va_arg(ap, char *);
+      } else if (*p == 'l' && p[1] == 'd') {        // %ld
+        l = va_arg(ap, long);
+        base = 10;
+        p++;
+      } else if (*p == 'l' && p[1] == 'u') {        // %lu
+        l = va_arg(ap, unsigned long);
+        base = 10;
+        p++;
+      } else if (*p == 'z' && p[1] == 'u') {        // %zu
+        l = va_arg(ap, size_t);
+        base = 10;
+        p++;
+      } else if (*p == 'u') {                       // %u
+        l = va_arg(ap, unsigned int);
+        base = 10;
+      } else if (*p == 'd') {                       // %d
+        l = va_arg(ap, int);
+        base = 10;
+      } else if (*p == 'p') {                       // %p
+        l = va_arg(ap, intptr_t);
+        base = 16;
+      } else {
+        write(STDERR_FILENO, "Unimplemented TracePrintf format\n", 33);
+        write(STDERR_FILENO, p, 2);
+        write(STDERR_FILENO, "\n", 1);
+        abort();
+      }
+      p++;
+      if (base != 0) {
+        bool minus = (l < 0 && base == 10);
+        uint64 ul = minus? -l : l;
+        do {
+          *--s = "0123456789abcdef"[ul % base];
+          ul /= base;
+        } while (ul != 0);
+        if (base == 16) {
+          *--s = 'x';
+          *--s = '0';
+        } else if (minus) {
+          *--s = '-';
+        }
+      }
+    } else {                        // handle normal characters
+      *--s = *p++;
+    }
+    while (*s != 0) {
+      if (i == sizeof(buf)) {
+        write(fd, buf, i);
+        i = 0;
+      }
+      buf[i++] = *s++;
+    }
+  }
+  if (i != 0) {
+    write(fd, buf, i);
+  }
+  va_end(ap);
+}
+
+// Return the file descriptor we're writing a log to
+static int TraceFd() {
+  static int trace_fd = -1;
+  if (trace_fd == -1) {            // Open the trace file on the first call
+    const char *val = getenv("TCMALLOC_TRACE_FILE");
+    bool fallback_to_stderr = false;
+    if (!val) {
+      val = "/tmp/google.alloc";
+      fallback_to_stderr = true;
+    }
+    trace_fd = open(val, O_CREAT|O_TRUNC|O_WRONLY, 0666);
+    if (trace_fd == -1) {
+      if (fallback_to_stderr) {
+        trace_fd = 2;
+        TracePrintf(trace_fd, "Can't open %s.  Logging to stderr.\n", val);
+      } else {
+        TracePrintf(2, "Can't open %s.  Logging disabled.\n", val);
+      }
+    }
+    // Add a header to the log.
+    TracePrintf(trace_fd, "Trace started: %lu\n",
+                static_cast<unsigned long>(time(NULL)));
+    TracePrintf(trace_fd,
+                "func\tsize\tptr\tthread_id\tstack pcs for tools/symbolize\n");
+  }
+  return trace_fd;
+}
+
+// Print the hex stack dump on a single line.   PCs are separated by tabs.
+static void TraceStack(void) {
+  void *pcs[16];
+  int n = GetStackTrace(pcs, sizeof(pcs)/sizeof(pcs[0]), 0);
+  for (int i = 0; i != n; i++) {
+    TracePrintf(TraceFd(), "\t%p", pcs[i]);
+  }
+}
+
+// This protects MALLOC_TRACE, to make sure its info is atomically written.
+static SpinLock malloc_trace_lock(SpinLock::LINKER_INITIALIZED);
+
+#define MALLOC_TRACE(name, size, addr)                                  \
+  do {                                                                  \
+    if (FLAGS_malloctrace) {                                            \
+      SpinLockHolder l(&malloc_trace_lock);                             \
+      TracePrintf(TraceFd(), "%s\t%" PRIuS "\t%p\t%" GPRIuPTHREAD,      \
+                  name, size, addr, PRINTABLE_PTHREAD(pthread_self())); \
+      TraceStack();                                                     \
+      TracePrintf(TraceFd(), "\n");                                     \
+    }                                                                   \
+  } while (0)
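+// With FLAGS_malloctrace set, each trace record thus looks roughly like
+// (illustrative): "malloc\t32\t0x1234abcd\t<thread id>", followed by
+// tab-separated stack PCs and a newline.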
+
+// ========================================================================= //
+
+// Write the characters buf[0, ..., size-1] to
+// the malloc trace buffer.
+// This function is intended for debugging,
+// and is not declared in any header file.
+// You must insert a declaration of it by hand when you need
+// to use it.
+void __malloctrace_write(const char *buf, size_t size) {
+  if (FLAGS_malloctrace) {
+    write(TraceFd(), buf, size);
+  }
+}
+
+// ========================================================================= //
+
+// General debug allocation/deallocation
+
+static inline void* DebugAllocate(size_t size, int type) {
+  MallocBlock* ptr = MallocBlock::Allocate(size, type);
+  if (ptr == NULL)  return NULL;
+  MALLOC_TRACE("malloc", size, ptr->data_addr());
+  return ptr->data_addr();
+}
+
+static inline void DebugDeallocate(void* ptr, int type) {
+  MALLOC_TRACE("free",
+               (ptr != 0 ? MallocBlock::FromRawPointer(ptr)->data_size() : 0),
+               ptr);
+  if (ptr)  MallocBlock::FromRawPointer(ptr)->Deallocate(type);
+}
+
+// ========================================================================= //
+
+// The following functions may be called via MallocExtension::instance()
+// for memory verification and statistics.
+class DebugMallocImplementation : public TCMallocImplementation {
+ public:
+  virtual bool GetNumericProperty(const char* name, size_t* value) {
+    bool result = TCMallocImplementation::GetNumericProperty(name, value);
+    if (result && (strcmp(name, "generic.current_allocated_bytes") == 0)) {
+      // Subtract bytes kept in the free queue
+      size_t qsize = MallocBlock::FreeQueueSize();
+      if (*value >= qsize) {
+        *value -= qsize;
+      }
+    }
+    return result;
+  }
+
+  virtual bool VerifyNewMemory(const void* p) {
+    if (p)  MallocBlock::FromRawPointer(p)->Check(MallocBlock::kNewType);
+    return true;
+  }
+
+  virtual bool VerifyArrayNewMemory(const void* p) {
+    if (p)  MallocBlock::FromRawPointer(p)->Check(MallocBlock::kArrayNewType);
+    return true;
+  }
+
+  virtual bool VerifyMallocMemory(const void* p) {
+    if (p)  MallocBlock::FromRawPointer(p)->Check(MallocBlock::kMallocType);
+    return true;
+  }
+
+  virtual bool VerifyAllMemory() {
+    return MallocBlock::CheckEverything();
+  }
+
+  virtual bool MallocMemoryStats(int* blocks, size_t* total,
+                                 int histogram[kMallocHistogramSize]) {
+    return MallocBlock::MemoryStats(blocks, total, histogram);
+  }
+
+  virtual size_t GetEstimatedAllocatedSize(size_t size) {
+    return size;
+  }
+
+  virtual size_t GetAllocatedSize(const void* p) {
+    if (p) {
+      RAW_CHECK(GetOwnership(p) != MallocExtension::kNotOwned,
+                "ptr not allocated by tcmalloc");
+      return MallocBlock::FromRawPointer(p)->data_size();
+    }
+    return 0;
+  }
+
+  virtual MallocExtension::Ownership GetOwnership(const void* p) {
+    if (!p) {
+      // nobody owns NULL
+      return MallocExtension::kNotOwned;
+    }
+
+    // FIXME: note that a correct GetOwnership should not touch memory
+    // that is not owned by tcmalloc. The main implementation uses the
+    // pagemap to discover whether the page in question is owned by us
+    // or not. But the pagemap only has marks for the first and last
+    // page of spans.  Note that if p was returned out of our memalign
+    // with a big alignment, then it will point outside of the marked
+    // pages. Also note that the FromRawPointer call below requires
+    // touching memory before the pointer in order to handle
+    // memalign-ed chunks (offset_). This leaves us with two options:
+    //
+    // * do FromRawPointer first and risk crashing if we're given a
+    //   pointer we don't own
+    //
+    // * return incorrect ownership for those large memalign chunks
+    //
+    // I've decided to choose the latter, which appears to happen more
+    // rarely and is therefore arguably the lesser evil.
+
+    MallocExtension::Ownership rv = TCMallocImplementation::GetOwnership(p);
+    if (rv != MallocExtension::kOwned) {
+      return rv;
+    }
+
+    const MallocBlock* mb = MallocBlock::FromRawPointer(p);
+    return TCMallocImplementation::GetOwnership(mb);
+  }
+
+  virtual void GetFreeListSizes(vector<MallocExtension::FreeListInfo>* v) {
+    static const char* kDebugFreeQueue = "debug.free_queue";
+
+    TCMallocImplementation::GetFreeListSizes(v);
+
+    MallocExtension::FreeListInfo i;
+    i.type = kDebugFreeQueue;
+    i.min_object_size = 0;
+    i.max_object_size = numeric_limits<size_t>::max();
+    i.total_bytes_free = MallocBlock::FreeQueueSize();
+    v->push_back(i);
+  }
+
+ };
+
+static union {
+  char chars[sizeof(DebugMallocImplementation)];
+  void *ptr;
+} debug_malloc_implementation_space;
+
+REGISTER_MODULE_INITIALIZER(debugallocation, {
+#if (__cplusplus >= 201103L)
+    COMPILE_ASSERT(alignof(debug_malloc_implementation_space) >= alignof(DebugMallocImplementation),
+                   debug_malloc_implementation_space_is_not_properly_aligned);
+#endif
+  // Either we or valgrind will control memory management.  We
+  // register our extension if we're the winner. Otherwise let
+  // Valgrind use its own malloc (so don't register our extension).
+  if (!RunningOnValgrind()) {
+    DebugMallocImplementation *impl = new (debug_malloc_implementation_space.chars) DebugMallocImplementation();
+    MallocExtension::Register(impl);
+  }
+});
+
+REGISTER_MODULE_DESTRUCTOR(debugallocation, {
+  if (!RunningOnValgrind()) {
+    // When the program exits, check all blocks still in the free
+    // queue for corruption.
+    DanglingWriteChecker();
+  }
+});
+
+// ========================================================================= //
+
+struct debug_alloc_retry_data {
+  size_t size;
+  int new_type;
+};
+
+static void *retry_debug_allocate(void *arg) {
+  debug_alloc_retry_data *data = static_cast<debug_alloc_retry_data *>(arg);
+  return DebugAllocate(data->size, data->new_type);
+}
+
+// This is mostly the same as cpp_alloc in tcmalloc.cc.
+// TODO(csilvers): change Allocate() above to call cpp_alloc, so we
+// don't have to reproduce the logic here.  To make tc_new_mode work
+// properly, I think we'll need to separate out the logic of throwing
+// from the logic of calling the new-handler.
+inline void* debug_cpp_alloc(size_t size, int new_type, bool nothrow) {
+  void* p = DebugAllocate(size, new_type);
+  if (p != NULL) {
+    return p;
+  }
+  struct debug_alloc_retry_data data;
+  data.size = size;
+  data.new_type = new_type;
+  return handle_oom(retry_debug_allocate, &data,
+                    true, nothrow);
+}
+
+inline void* do_debug_malloc_or_debug_cpp_alloc(size_t size) {
+  void* p = DebugAllocate(size, MallocBlock::kMallocType);
+  if (p != NULL) {
+    return p;
+  }
+  struct debug_alloc_retry_data data;
+  data.size = size;
+  data.new_type = MallocBlock::kMallocType;
+  return handle_oom(retry_debug_allocate, &data,
+                    false, true);
+}
+
+// Exported routines
+
+extern "C" PERFTOOLS_DLL_DECL void* tc_malloc(size_t size) __THROW {
+  void* ptr = do_debug_malloc_or_debug_cpp_alloc(size);
+  MallocHook::InvokeNewHook(ptr, size);
+  return ptr;
+}
+
+extern "C" PERFTOOLS_DLL_DECL void tc_free(void* ptr) __THROW {
+  MallocHook::InvokeDeleteHook(ptr);
+  DebugDeallocate(ptr, MallocBlock::kMallocType);
+}
+
+extern "C" PERFTOOLS_DLL_DECL void* tc_calloc(size_t count, size_t size) __THROW {
+  // Overflow check
+  const size_t total_size = count * size;
+  if (size != 0 && total_size / size != count) return NULL;
+
+  void* block = do_debug_malloc_or_debug_cpp_alloc(total_size);
+  MallocHook::InvokeNewHook(block, total_size);
+  if (block)  memset(block, 0, total_size);
+  return block;
+}
+
+extern "C" PERFTOOLS_DLL_DECL void tc_cfree(void* ptr) __THROW {
+  MallocHook::InvokeDeleteHook(ptr);
+  DebugDeallocate(ptr, MallocBlock::kMallocType);
+}
+
+extern "C" PERFTOOLS_DLL_DECL void* tc_realloc(void* ptr, size_t size) __THROW {
+  if (ptr == NULL) {
+    ptr = do_debug_malloc_or_debug_cpp_alloc(size);
+    MallocHook::InvokeNewHook(ptr, size);
+    return ptr;
+  }
+  if (size == 0) {
+    MallocHook::InvokeDeleteHook(ptr);
+    DebugDeallocate(ptr, MallocBlock::kMallocType);
+    return NULL;
+  }
+  MallocBlock* old = MallocBlock::FromRawPointer(ptr);
+  old->Check(MallocBlock::kMallocType);
+  MallocBlock* p = MallocBlock::Allocate(size, MallocBlock::kMallocType);
+
+  // If realloc fails we are to leave the old block untouched and
+  // return null
+  if (p == NULL)  return NULL;
+
+  // if ptr was allocated via memalign, then old->data_addr() is not the
+  // start of the user data and old->data_size() overstates its size.
+  // So we must be careful to copy only the user data.
+  char *old_begin = (char *)old->data_addr();
+  char *old_end = old_begin + old->data_size();
+
+  ssize_t old_ssize = old_end - (char *)ptr;
+  CHECK_CONDITION(old_ssize >= 0);
+
+  size_t old_size = (size_t)old_ssize;
+  CHECK_CONDITION(old_size <= old->data_size());
+
+  memcpy(p->data_addr(), ptr, (old_size < size) ? old_size : size);
+  MallocHook::InvokeDeleteHook(ptr);
+  MallocHook::InvokeNewHook(p->data_addr(), size);
+  DebugDeallocate(ptr, MallocBlock::kMallocType);
+  MALLOC_TRACE("realloc", p->data_size(), p->data_addr());
+  return p->data_addr();
+}
+
+extern "C" PERFTOOLS_DLL_DECL void* tc_new(size_t size) {
+  void* ptr = debug_cpp_alloc(size, MallocBlock::kNewType, false);
+  MallocHook::InvokeNewHook(ptr, size);
+  if (ptr == NULL) {
+    RAW_LOG(FATAL, "Unable to allocate %" PRIuS " bytes: new failed.", size);
+  }
+  return ptr;
+}
+
+extern "C" PERFTOOLS_DLL_DECL void* tc_new_nothrow(size_t size, const std::nothrow_t&) __THROW {
+  void* ptr = debug_cpp_alloc(size, MallocBlock::kNewType, true);
+  MallocHook::InvokeNewHook(ptr, size);
+  return ptr;
+}
+
+extern "C" PERFTOOLS_DLL_DECL void tc_delete(void* p) __THROW {
+  MallocHook::InvokeDeleteHook(p);
+  DebugDeallocate(p, MallocBlock::kNewType);
+}
+
+// Some STL implementations explicitly invoke this.
+// It is completely equivalent to a normal delete (delete never throws).
+extern "C" PERFTOOLS_DLL_DECL void tc_delete_nothrow(void* p, const std::nothrow_t&) __THROW {
+  MallocHook::InvokeDeleteHook(p);
+  DebugDeallocate(p, MallocBlock::kNewType);
+}
+
+extern "C" PERFTOOLS_DLL_DECL void* tc_newarray(size_t size) {
+  void* ptr = debug_cpp_alloc(size, MallocBlock::kArrayNewType, false);
+  MallocHook::InvokeNewHook(ptr, size);
+  if (ptr == NULL) {
+    RAW_LOG(FATAL, "Unable to allocate %" PRIuS " bytes: new[] failed.", size);
+  }
+  return ptr;
+}
+
+extern "C" PERFTOOLS_DLL_DECL void* tc_newarray_nothrow(size_t size, const std::nothrow_t&)
+    __THROW {
+  void* ptr = debug_cpp_alloc(size, MallocBlock::kArrayNewType, true);
+  MallocHook::InvokeNewHook(ptr, size);
+  return ptr;
+}
+
+extern "C" PERFTOOLS_DLL_DECL void tc_deletearray(void* p) __THROW {
+  MallocHook::InvokeDeleteHook(p);
+  DebugDeallocate(p, MallocBlock::kArrayNewType);
+}
+
+// Some STL implementations explicitly invoke this.
+// It is completely equivalent to a normal delete (delete never throws).
+extern "C" PERFTOOLS_DLL_DECL void tc_deletearray_nothrow(void* p, const std::nothrow_t&) __THROW {
+  MallocHook::InvokeDeleteHook(p);
+  DebugDeallocate(p, MallocBlock::kArrayNewType);
+}
+
+// This is mostly the same as do_memalign in tcmalloc.cc.
+static void *do_debug_memalign(size_t alignment, size_t size) {
+  // Allocate >= size bytes aligned on "alignment" boundary
+  // "alignment" is a power of two.
+  void *p = 0;
+  RAW_CHECK((alignment & (alignment-1)) == 0, "must be power of two");
+  const size_t data_offset = MallocBlock::data_offset();
+  // Allocate "alignment-1" extra bytes to ensure alignment is possible, and
+  // a further data_offset bytes for an additional fake header.
+  size_t extra_bytes = data_offset + alignment - 1;
+  if (size + extra_bytes < size) return NULL;         // Overflow
+  p = DebugAllocate(size + extra_bytes, MallocBlock::kMallocType);
+  if (p != 0) {
+    intptr_t orig_p = reinterpret_cast<intptr_t>(p);
+    // Leave data_offset bytes for fake header, and round up to meet
+    // alignment.
+    p = reinterpret_cast<void *>(RoundUp(orig_p + data_offset, alignment));
+    // Create a fake header block with an offset_ that points back to the
+    // real header.  FromRawPointer uses this value.
+    MallocBlock *fake_hdr = reinterpret_cast<MallocBlock *>(
+                reinterpret_cast<char *>(p) - data_offset);
+    // offset_ is distance between real and fake headers.
+    // p is now end of fake header (beginning of client area),
+    // and orig_p is the end of the real header, so offset_
+    // is their difference.
+    //
+    // Note that other fields of fake_hdr are initialized with
+    // kMagicUninitializedByte
+    fake_hdr->set_offset(reinterpret_cast<intptr_t>(p) - orig_p);
+  }
+  return p;
+}
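+// A worked example of the fake-header arithmetic above, with hypothetical
+// numbers and assuming a 64-bit build where data_offset() == 32: if
+// alignment == 64 and DebugAllocate returns client memory at orig_p == 0x1010,
+// then p = RoundUp(0x1010 + 32, 64) == 0x1040, the fake header sits at
+// p - 32 == 0x1020, and fake_hdr->offset_ == p - orig_p == 0x30.  Later,
+// FromRawPointer(p) finds the fake header at p - 32 and steps back offset_
+// bytes to 0xFF0, which is exactly the real header at orig_p - 32.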
+
+struct memalign_retry_data {
+  size_t align;
+  size_t size;
+};
+
+static void *retry_debug_memalign(void *arg) {
+  memalign_retry_data *data = static_cast<memalign_retry_data *>(arg);
+  return do_debug_memalign(data->align, data->size);
+}
+
+inline void* do_debug_memalign_or_debug_cpp_memalign(size_t align,
+                                                     size_t size) {
+  void* p = do_debug_memalign(align, size);
+  if (p != NULL) {
+    return p;
+  }
+
+  struct memalign_retry_data data;
+  data.align = align;
+  data.size = size;
+  return handle_oom(retry_debug_memalign, &data,
+                    false, true);
+}
+
+extern "C" PERFTOOLS_DLL_DECL void* tc_memalign(size_t align, size_t size) __THROW {
+  void *p = do_debug_memalign_or_debug_cpp_memalign(align, size);
+  MallocHook::InvokeNewHook(p, size);
+  return p;
+}
+
+// Implementation taken from tcmalloc/tcmalloc.cc
+extern "C" PERFTOOLS_DLL_DECL int tc_posix_memalign(void** result_ptr, size_t align, size_t size)
+    __THROW {
+  if (((align % sizeof(void*)) != 0) ||
+      ((align & (align - 1)) != 0) ||
+      (align == 0)) {
+    return EINVAL;
+  }
+
+  void* result = do_debug_memalign_or_debug_cpp_memalign(align, size);
+  MallocHook::InvokeNewHook(result, size);
+  if (result == NULL) {
+    return ENOMEM;
+  } else {
+    *result_ptr = result;
+    return 0;
+  }
+}
+
+extern "C" PERFTOOLS_DLL_DECL void* tc_valloc(size_t size) __THROW {
+  // Allocate >= size bytes starting on a page boundary
+  void *p = do_debug_memalign_or_debug_cpp_memalign(getpagesize(), size);
+  MallocHook::InvokeNewHook(p, size);
+  return p;
+}
+
+extern "C" PERFTOOLS_DLL_DECL void* tc_pvalloc(size_t size) __THROW {
+  // Round size up to a multiple of pages
+  // then allocate memory on a page boundary
+  int pagesize = getpagesize();
+  size = RoundUp(size, pagesize);
+  if (size == 0) {     // pvalloc(0) should allocate one page, according to
+    size = pagesize;   // http://man.free4web.biz/man3/libmpatrol.3.html
+  }
+  void *p = do_debug_memalign_or_debug_cpp_memalign(pagesize, size);
+  MallocHook::InvokeNewHook(p, size);
+  return p;
+}
+
+// malloc_stats just falls through to the base implementation.
+extern "C" PERFTOOLS_DLL_DECL void tc_malloc_stats(void) __THROW {
+  do_malloc_stats();
+}
+
+extern "C" PERFTOOLS_DLL_DECL int tc_mallopt(int cmd, int value) __THROW {
+  return do_mallopt(cmd, value);
+}
+
+#ifdef HAVE_STRUCT_MALLINFO
+extern "C" PERFTOOLS_DLL_DECL struct mallinfo tc_mallinfo(void) __THROW {
+  return do_mallinfo();
+}
+#endif
+
+extern "C" PERFTOOLS_DLL_DECL size_t tc_malloc_size(void* ptr) __THROW {
+  return MallocExtension::instance()->GetAllocatedSize(ptr);
+}
+
+extern "C" PERFTOOLS_DLL_DECL void* tc_malloc_skip_new_handler(size_t size) __THROW {
+  void* result = DebugAllocate(size, MallocBlock::kMallocType);
+  MallocHook::InvokeNewHook(result, size);
+  return result;
+}
diff --git a/src/getenv_safe.h b/src/getenv_safe.h
new file mode 100644
index 0000000..3b9f4db
--- /dev/null
+++ b/src/getenv_safe.h
@@ -0,0 +1,63 @@
+/* -*- Mode: C++; c-basic-offset: 2; indent-tabs-mode: nil -*-
+ * Copyright (c) 2014, gperftools Contributors
+ * All rights reserved.
+ * 
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are
+ * met:
+ * 
+ *     * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following disclaimer
+ * in the documentation and/or other materials provided with the
+ * distribution.
+ *     * Neither the name of Google Inc. nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ * 
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef GETENV_SAFE_H
+#define GETENV_SAFE_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/* 
+ * This getenv function is safe to call before the C runtime is initialized.
+ * On Windows, it utilizes GetEnvironmentVariable() and on unix it uses
+ * /proc/self/environ instead of calling getenv().  It's intended to be used in
+ * routines that run before main(), when the state required for getenv() may
+ * not be set up yet.  In particular, errno isn't set up until relatively late
+ * (after the pthreads library has a chance to make it threadsafe), and
+ * getenv() doesn't work until then.
+ * On some platforms, this call will utilize the same, static buffer for
+ * repeated GetenvBeforeMain() calls. Callers should not expect pointers from
+ * this routine to be long lived.
+ * Note that on unix, /proc only has the environment at the time the
+ * application was started, so this routine ignores setenv() calls/etc.  Also
+ * note it only reads the first 16K of the environment.
+ * 
+ * NOTE: this is the version of GetenvBeforeMain that's usable from
+ * C.  The implementation is in sysinfo.cc.
+ */
+const char* TCMallocGetenvSafe(const char* name);
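+
+/*
+ * Illustrative usage (hypothetical variable name):
+ *   const char* v = TCMallocGetenvSafe("TCMALLOC_TRACE_FILE");
+ *   if (v != NULL) { ... }
+ */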
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
diff --git a/src/getpc.h b/src/getpc.h
new file mode 100644
index 0000000..25fee39
--- /dev/null
+++ b/src/getpc.h
@@ -0,0 +1,187 @@
+// -*- Mode: C++; c-basic-offset: 2; indent-tabs-mode: nil -*-
+// Copyright (c) 2005, Google Inc.
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+//     * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+//     * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+// ---
+// Author: Craig Silverstein
+//
+// This is an internal header file used by profiler.cc.  It defines
+// the single (inline) function GetPC.  GetPC is used in a signal
+// handler to figure out the instruction that was being executed when
+// the signal-handler was triggered.
+//
+// To get this, we use the ucontext_t argument to the signal-handler
+// callback, which holds the full context of what was going on when
+// the signal triggered.  How to get from a ucontext_t to a Program
+// Counter is OS-dependent.
+
+#ifndef BASE_GETPC_H_
+#define BASE_GETPC_H_
+
+#include "config.h"
+
+// On many linux systems, we may need _GNU_SOURCE to get access to
+// the defined constants that define the register we want to see (eg
+// REG_EIP).  Note this #define must come first!
+#define _GNU_SOURCE 1
+// If #define _GNU_SOURCE causes problems, this might work instead.
+// It will cause problems for FreeBSD, though, because it turns off
+// the needed __BSD_VISIBLE.
+//#define _XOPEN_SOURCE 500
+
+#include <string.h>         // for memcmp
+#if defined(HAVE_SYS_UCONTEXT_H)
+#include <sys/ucontext.h>
+#elif defined(HAVE_UCONTEXT_H)
+#include <ucontext.h>       // for ucontext_t (and also mcontext_t)
+#elif defined(HAVE_CYGWIN_SIGNAL_H)
+#include <cygwin/signal.h>
+typedef ucontext ucontext_t;
+#endif
+
+
+// Take the example where function Foo() calls function Bar().  For
+// many architectures, Bar() is responsible for setting up and tearing
+// down its own stack frame.  In that case, it's possible for the
+// interrupt to happen when execution is in Bar(), but the stack frame
+// is not properly set up (either before it's done being set up, or
+// after it's been torn down but before Bar() returns).  In those
+// cases, the stack trace cannot see the caller function anymore.
+//
+// GetPC can try to identify this situation, on architectures where it
+// might occur, and unwind the current function call in that case to
+// avoid false edges in the profile graph (that is, edges that appear
+// to show a call skipping over a function).  To do this, we hard-code
+// in the asm instructions we might see when setting up or tearing
+// down a stack frame.
+//
+// This is difficult to get right: the instructions depend on the
+// processor, the compiler ABI, and even the optimization level.  This
+// is a best effort patch -- if we fail to detect such a situation, or
+// mess up the PC, nothing happens; the returned PC is not used for
+// any further processing.
+struct CallUnrollInfo {
+  // Offset from (e)ip register where this instruction sequence
+  // should be matched. Interpreted as bytes. Offset 0 is the next
+  // instruction to execute. Be extra careful with negative offsets in
+  // architectures of variable instruction length (like x86) - it is
+  // not as easy as taking an offset to step one instruction back!
+  int pc_offset;
+  // The actual instruction bytes. Feel free to make it larger if you
+  // need a longer sequence.
+  unsigned char ins[16];
+  // How many bytes to match from ins array?
+  int ins_size;
+  // The offset from the stack pointer (e)sp where to look for the
+  // call return address. Interpreted as bytes.
+  int return_sp_offset;
+};
+
+
+// The dereferences needed to get the PC from a struct ucontext were
+// determined at configure time, and stored in the macro
+// PC_FROM_UCONTEXT in config.h.  The only thing we need to do here,
+// then, is to do the magic call-unrolling for systems that support it.
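+// (For example, on Linux/x86_64 PC_FROM_UCONTEXT typically ends up as
+// uc_mcontext.gregs[REG_RIP]; it is determined by configure, so consult
+// your generated config.h rather than relying on this example.)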
+
+// -- Special case 1: linux x86, for which we have CallUnrollInfo
+#if defined(__linux) && defined(__i386) && defined(__GNUC__)
+static const CallUnrollInfo callunrollinfo[] = {
+  // Entry to a function:  push %ebp;  mov  %esp,%ebp
+  // Top-of-stack contains the caller IP.
+  { 0,
+    {0x55, 0x89, 0xe5}, 3,
+    0
+  },
+  // Entry to a function, second instruction:  push %ebp;  mov  %esp,%ebp
+  // Top-of-stack contains the old frame, caller IP is +4.
+  { -1,
+    {0x55, 0x89, 0xe5}, 3,
+    4
+  },
+  // Return from a function: RET.
+  // Top-of-stack contains the caller IP.
+  { 0,
+    {0xc3}, 1,
+    0
+  }
+};
+
+inline void* GetPC(const ucontext_t& signal_ucontext) {
+  // See comment above struct CallUnrollInfo.  Only try instruction
+  // flow matching if both eip and esp look reasonable.
+  const int eip = signal_ucontext.uc_mcontext.gregs[REG_EIP];
+  const int esp = signal_ucontext.uc_mcontext.gregs[REG_ESP];
+  if ((eip & 0xffff0000) != 0 && (~eip & 0xffff0000) != 0 &&
+      (esp & 0xffff0000) != 0) {
+    char* eip_char = reinterpret_cast<char*>(eip);
+    for (int i = 0; i < sizeof(callunrollinfo)/sizeof(*callunrollinfo); ++i) {
+      if (!memcmp(eip_char + callunrollinfo[i].pc_offset,
+                  callunrollinfo[i].ins, callunrollinfo[i].ins_size)) {
+        // We have a match.
+        void **retaddr = (void**)(esp + callunrollinfo[i].return_sp_offset);
+        return *retaddr;
+      }
+    }
+  }
+  return (void*)eip;
+}
+
+// Special case #2: Windows, which has to do something totally different.
+#elif defined(_WIN32) || defined(__CYGWIN__) || defined(__CYGWIN32__) || defined(__MINGW32__)
+// If this is ever implemented, probably the way to do it is to have
+// profiler.cc use a high-precision timer via timeSetEvent:
+//    http://msdn2.microsoft.com/en-us/library/ms712713.aspx
+// We'd use it in mode TIME_CALLBACK_FUNCTION/TIME_PERIODIC.
+// The callback function would be something like prof_handler, but
+// alas the arguments are different: no ucontext_t!  I don't know
+// how we'd get the PC (using StackWalk64?)
+//    http://msdn2.microsoft.com/en-us/library/ms680650.aspx
+
+#include "base/logging.h"   // for RAW_LOG
+#ifndef HAVE_CYGWIN_SIGNAL_H
+typedef int ucontext_t;
+#endif
+
+inline void* GetPC(const struct ucontext_t& signal_ucontext) {
+  RAW_LOG(ERROR, "GetPC is not yet implemented on Windows\n");
+  return NULL;
+}
+
+// Normal cases.  If this doesn't compile, it's probably because
+// PC_FROM_UCONTEXT is the empty string.  You need to figure out
+// the right value for your system, and add it to the list in
+// configure.ac (or set it manually in your config.h).
+#else
+inline void* GetPC(const ucontext_t& signal_ucontext) {
+  return (void*)signal_ucontext.PC_FROM_UCONTEXT;   // defined in config.h
+}
+
+#endif
+
+#endif  // BASE_GETPC_H_
diff --git a/src/google/heap-checker.h b/src/google/heap-checker.h
new file mode 100644
index 0000000..7cacf1f
--- /dev/null
+++ b/src/google/heap-checker.h
@@ -0,0 +1,36 @@
+// Copyright (c) 2005, Google Inc.
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+//     * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+//     * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+/* The code has moved to gperftools/.  Use that include-directory for
+ * new code.
+ */
+#ifdef __GNUC__
+#warning "google/heap-checker.h is deprecated. Use gperftools/heap-checker.h instead"
+#endif
+#include <gperftools/heap-checker.h>
diff --git a/src/google/heap-profiler.h b/src/google/heap-profiler.h
new file mode 100644
index 0000000..3fc26cf
--- /dev/null
+++ b/src/google/heap-profiler.h
@@ -0,0 +1,37 @@
+/* Copyright (c) 2005, Google Inc.
+ * All rights reserved.
+ * 
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are
+ * met:
+ * 
+ *     * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following disclaimer
+ * in the documentation and/or other materials provided with the
+ * distribution.
+ *     * Neither the name of Google Inc. nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ * 
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/* The code has moved to gperftools/.  Use that include-directory for
+ * new code.
+ */
+#ifdef __GNUC__
+#warning "google/heap-profiler.h is deprecated. Use gperftools/heap-profiler.h instead"
+#endif
+#include <gperftools/heap-profiler.h>
diff --git a/src/google/malloc_extension.h b/src/google/malloc_extension.h
new file mode 100644
index 0000000..7cacc34
--- /dev/null
+++ b/src/google/malloc_extension.h
@@ -0,0 +1,36 @@
+// Copyright (c) 2005, Google Inc.
+// All rights reserved.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+// 
+//     * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+//     * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+// 
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+/* The code has moved to gperftools/.  Use that include-directory for
+ * new code.
+ */
+#ifdef __GNUC__
+#warning "google/malloc_extension.h is deprecated. Use gperftools/malloc_extension.h instead"
+#endif
+#include <gperftools/malloc_extension.h>
diff --git a/src/google/malloc_extension_c.h b/src/google/malloc_extension_c.h
new file mode 100644
index 0000000..f34a835
--- /dev/null
+++ b/src/google/malloc_extension_c.h
@@ -0,0 +1,37 @@
+/* Copyright (c) 2008, Google Inc.
+ * All rights reserved.
+ * 
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are
+ * met:
+ * 
+ *     * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following disclaimer
+ * in the documentation and/or other materials provided with the
+ * distribution.
+ *     * Neither the name of Google Inc. nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ * 
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/* The code has moved to gperftools/.  Use that include-directory for
+ * new code.
+ */
+#ifdef __GNUC__
+#warning "google/malloc_extension_c.h is deprecated. Use gperftools/malloc_extension_c.h instead"
+#endif
+#include <gperftools/malloc_extension_c.h>
diff --git a/src/google/malloc_hook.h b/src/google/malloc_hook.h
new file mode 100644
index 0000000..371aba4
--- /dev/null
+++ b/src/google/malloc_hook.h
@@ -0,0 +1,36 @@
+// Copyright (c) 2005, Google Inc.
+// All rights reserved.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+// 
+//     * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+//     * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+// 
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+/* The code has moved to gperftools/.  Use that include-directory for
+ * new code.
+ */
+#ifdef __GNUC__
+#warning "google/malloc_hook.h is deprecated. Use gperftools/malloc_hook.h instead"
+#endif
+#include <gperftools/malloc_hook.h>
diff --git a/src/google/malloc_hook_c.h b/src/google/malloc_hook_c.h
new file mode 100644
index 0000000..f882c16
--- /dev/null
+++ b/src/google/malloc_hook_c.h
@@ -0,0 +1,37 @@
+/* Copyright (c) 2008, Google Inc.
+ * All rights reserved.
+ * 
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are
+ * met:
+ * 
+ *     * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following disclaimer
+ * in the documentation and/or other materials provided with the
+ * distribution.
+ *     * Neither the name of Google Inc. nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ * 
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/* The code has moved to gperftools/.  Use that include-directory for
+ * new code.
+ */
+#ifdef __GNUC__
+#warning "google/malloc_hook_c.h is deprecated. Use gperftools/malloc_hook_c.h instead"
+#endif
+#include <gperftools/malloc_hook_c.h>
diff --git a/src/google/profiler.h b/src/google/profiler.h
new file mode 100644
index 0000000..3674c9e
--- /dev/null
+++ b/src/google/profiler.h
@@ -0,0 +1,37 @@
+/* Copyright (c) 2005, Google Inc.
+ * All rights reserved.
+ * 
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are
+ * met:
+ * 
+ *     * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following disclaimer
+ * in the documentation and/or other materials provided with the
+ * distribution.
+ *     * Neither the name of Google Inc. nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ * 
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/* The code has moved to gperftools/.  Use that include-directory for
+ * new code.
+ */
+#ifdef __GNUC__
+#warning "google/profiler.h is deprecated. Use gperftools/profiler.h instead"
+#endif
+#include <gperftools/profiler.h>
diff --git a/src/google/stacktrace.h b/src/google/stacktrace.h
new file mode 100644
index 0000000..53d2947
--- /dev/null
+++ b/src/google/stacktrace.h
@@ -0,0 +1,36 @@
+// Copyright (c) 2005, Google Inc.
+// All rights reserved.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+// 
+//     * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+//     * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+// 
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+/* The code has moved to gperftools/.  Use that include-directory for
+ * new code.
+ */
+#ifdef __GNUC__
+#warning "google/stacktrace.h is deprecated. Use gperftools/stacktrace.h instead"
+#endif
+#include <gperftools/stacktrace.h>
diff --git a/src/google/tcmalloc.h b/src/google/tcmalloc.h
new file mode 100644
index 0000000..a2db70e
--- /dev/null
+++ b/src/google/tcmalloc.h
@@ -0,0 +1,37 @@
+/* Copyright (c) 2003, Google Inc.
+ * All rights reserved.
+ * 
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are
+ * met:
+ * 
+ *     * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following disclaimer
+ * in the documentation and/or other materials provided with the
+ * distribution.
+ *     * Neither the name of Google Inc. nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ * 
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/* The code has moved to gperftools/.  Use that include-directory for
+ * new code.
+ */
+#ifdef __GNUC__
+#warning "google/tcmalloc.h is deprecated. Use gperftools/tcmalloc.h instead"
+#endif
+#include <gperftools/tcmalloc.h>
diff --git a/src/gperftools/heap-checker.h b/src/gperftools/heap-checker.h
new file mode 100644
index 0000000..5a87d8d
--- /dev/null
+++ b/src/gperftools/heap-checker.h
@@ -0,0 +1,422 @@
+// -*- Mode: C++; c-basic-offset: 2; indent-tabs-mode: nil -*-
+// Copyright (c) 2005, Google Inc.
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+//     * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+//     * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+// ---
+// Author: Maxim Lifantsev (with design ideas by Sanjay Ghemawat)
+//
+//
+// Module for detecting heap (memory) leaks.
+//
+// For full(er) information, see doc/heap_checker.html
+//
+// This module can be linked into programs with no slowdown
+// unless you activate the leak-checker:
+//
+//    1. Set the environment variable HEAPCHECK to _type_ before
+//       running the program.
+//
+// _type_ is usually "normal" but can also be "minimal", "strict", or
+// "draconian".  (See the html file for other options, like 'local'.)
+//
+// After that, just run your binary.  If the heap-checker detects
+// a memory leak at program-exit, it will print instructions on how
+// to track down the leak.
+
+#ifndef BASE_HEAP_CHECKER_H_
+#define BASE_HEAP_CHECKER_H_
+
+#include <sys/types.h>  // for size_t
+// I can't #include config.h in this public API file, but I should
+// really use configure (and make malloc_extension.h a .in file) to
+// figure out if the system has stdint.h or not.  But I'm lazy, so
+// for now I'm assuming it's a problem only with MSVC.
+#ifndef _MSC_VER
+#include <stdint.h>     // for uintptr_t
+#endif
+#include <stdarg.h>     // for va_list
+#include <vector>
+
+// Annoying stuff for windows -- makes sure clients can import these functions
+#ifndef PERFTOOLS_DLL_DECL
+# ifdef _WIN32
+#   define PERFTOOLS_DLL_DECL  __declspec(dllimport)
+# else
+#   define PERFTOOLS_DLL_DECL
+# endif
+#endif
+
+
+// The class is thread-safe with respect to all the provided static methods,
+// as well as HeapLeakChecker objects: they can be accessed by multiple threads.
+class PERFTOOLS_DLL_DECL HeapLeakChecker {
+ public:
+
+  // ----------------------------------------------------------------------- //
+  // Static functions for working with (whole-program) leak checking.
+
+  // If heap leak checking is currently active in some mode
+  // e.g. if leak checking was started (and is still active now)
+  // due to HEAPCHECK=... defined in the environment.
+  // The return value reflects whether HeapLeakChecker objects manually
+  // constructed right now will be doing leak checking or nothing.
+  // Note that we can go from active to inactive state during InitGoogle()
+  // if FLAGS_heap_check gets set to "" by some code before/during InitGoogle().
+  static bool IsActive();
+
+  // Return pointer to the whole-program checker if it has been created
+  // and NULL otherwise.
+  // Once GlobalChecker() returns non-NULL that object will not disappear and
+  // will be returned by all later GlobalChecker calls.
+  // This is mainly to access BytesLeaked() and ObjectsLeaked() (see below)
+  // for the whole-program checker after one calls NoGlobalLeaks()
+  // or similar and gets false.
+  static HeapLeakChecker* GlobalChecker();
+
+  // Do whole-program leak check now (if it was activated for this binary);
+  // return false only if it was activated and has failed.
+  // The mode of the check is controlled by the command-line flags.
+  // This method can be called repeatedly.
+  // Things like GlobalChecker()->SameHeap() can also be called explicitly
+  // to do the desired flavor of the check.
+  static bool NoGlobalLeaks();
+
+  // If the whole-program checker is active,
+  // cancel its automatic execution after main() exits.
+  // This requires that some leak check (e.g. NoGlobalLeaks())
+  // has been called at least once on the whole-program checker.
+  static void CancelGlobalCheck();
+
+  // ----------------------------------------------------------------------- //
+  // Non-static functions for starting and doing leak checking.
+
+  // Start checking and name the leak check performed.
+  // The name is used in naming dumped profiles
+  // and needs to be unique only within your binary.
+  // It must also be a string that can be a part of a file name,
+  // in particular not contain path expressions.
+  explicit HeapLeakChecker(const char *name);
+
+  // Destructor (verifies that some *NoLeaks or *SameHeap method
+  // has been called at least once).
+  ~HeapLeakChecker();
+
+  // These used to be different but are all the same now: they return
+  // true iff all memory allocated since this HeapLeakChecker object
+  // was constructed is still reachable from global state.
+  //
+  // Because we fork to convert addresses to symbol-names, and forking
+  // is not thread-safe, and we may be called in a threaded context,
+  // we do not try to symbolize addresses when called manually.
+  bool NoLeaks() { return DoNoLeaks(DO_NOT_SYMBOLIZE); }
+
+  // These forms are obsolete; use NoLeaks() instead.
+  // TODO(csilvers): mark as DEPRECATED.
+  bool QuickNoLeaks()  { return NoLeaks(); }
+  bool BriefNoLeaks()  { return NoLeaks(); }
+  bool SameHeap()      { return NoLeaks(); }
+  bool QuickSameHeap() { return NoLeaks(); }
+  bool BriefSameHeap() { return NoLeaks(); }
+
+  // Detailed information about the number of leaked bytes and objects
+  // (both of these can be negative as well).
+  // These are available only after a *SameHeap or *NoLeaks
+  // method has been called.
+  // Note that it's possible for both of these to be zero
+  // while SameHeap() or NoLeaks() returned false in case
+  // of a heap state change that is significant
+  // but preserves the byte and object counts.
+  ssize_t BytesLeaked() const;
+  ssize_t ObjectsLeaked() const;
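+
+  // A minimal usage sketch of explicit (partial-program) checking;
+  // "CodeUnderTest" is a hypothetical function standing in for the code
+  // whose allocations you want checked:
+  //
+  //   HeapLeakChecker heap_checker("code_under_test");
+  //   CodeUnderTest();
+  //   if (!heap_checker.NoLeaks()) assert(NULL == "heap memory leak");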
+
+  // ----------------------------------------------------------------------- //
+  // Static helpers to make us ignore certain leaks.
+
+  // Scoped helper class.  Should be allocated on the stack inside a
+  // block of code.  Any heap allocations done in the code block
+  // covered by the scoped object (including in nested function calls
+  // done by the code block) will not be reported as leaks.  This is
+  // the recommended replacement for the GetDisableChecksStart() and
+  // DisableChecksToHereFrom() routines below.
+  //
+  // Example:
+  //   void Foo() {
+  //     HeapLeakChecker::Disabler disabler;
+  //     ... code that allocates objects whose leaks should be ignored ...
+  //   }
+  //
+  // REQUIRES: Destructor runs in same thread as constructor
+  class Disabler {
+   public:
+    Disabler();
+    ~Disabler();
+   private:
+    Disabler(const Disabler&);        // disallow copy
+    void operator=(const Disabler&);  // and assign
+  };
+
+  // Ignore an object located at 'ptr' ('ptr' may point at the start of
+  // the object or anywhere into it)
+  // as well as all heap objects (transitively) referenced from it for the
+  // purposes of heap leak checking. Returns 'ptr' so that one can write
+  //   static T* obj = IgnoreObject(new T(...));
+  //
+  // If 'ptr' does not point to an active allocated object at the time of this
+  // call, it is ignored; but if it does, the object must not get deleted from
+  // the heap later on.
+  //
+  // See also HiddenPointer, below, if you need to prevent a pointer from
+  // being traversed by the heap checker but do not wish to transitively
+  // whitelist objects referenced through it.
+  template <typename T>
+  static T* IgnoreObject(T* ptr) {
+    DoIgnoreObject(static_cast<const void*>(const_cast<const T*>(ptr)));
+    return ptr;
+  }
+
+  // Undo what an earlier IgnoreObject() call promised and asked to do.
+  // At the time of this call 'ptr' must point at or inside of an active
+  // allocated object which was previously registered with IgnoreObject().
+  static void UnIgnoreObject(const void* ptr);
+
+  // ----------------------------------------------------------------------- //
+  // Internal types defined in .cc
+
+  class Allocator;
+  struct RangeValue;
+
+ private:
+
+  // ----------------------------------------------------------------------- //
+  // Various helpers
+
+  // Create the name of the heap profile file.
+  // Should be deleted via Allocator::Free().
+  char* MakeProfileNameLocked();
+
+  // Helper for constructors
+  void Create(const char *name, bool make_start_snapshot);
+
+  enum ShouldSymbolize { SYMBOLIZE, DO_NOT_SYMBOLIZE };
+
+  // Helper for *NoLeaks and *SameHeap
+  bool DoNoLeaks(ShouldSymbolize should_symbolize);
+
+  // Helper for NoGlobalLeaks, also called by the global destructor.
+  static bool NoGlobalLeaksMaybeSymbolize(ShouldSymbolize should_symbolize);
+
+  // These used to be public, but they are now deprecated.
+  // Will remove entirely when all internal uses are fixed.
+  // In the meantime, use friendship so the unittest can still test them.
+  static void* GetDisableChecksStart();
+  static void DisableChecksToHereFrom(const void* start_address);
+  static void DisableChecksIn(const char* pattern);
+  friend void RangeDisabledLeaks();
+  friend void NamedTwoDisabledLeaks();
+  friend void* RunNamedDisabledLeaks(void*);
+  friend void TestHeapLeakCheckerNamedDisabling();
+
+  // Actually implements IgnoreObject().
+  static void DoIgnoreObject(const void* ptr);
+
+  // Disable checks based on stack trace entry at a depth <=
+  // max_depth.  Used to hide allocations done inside some special
+  // libraries.
+  static void DisableChecksFromToLocked(const void* start_address,
+                                        const void* end_address,
+                                        int max_depth);
+
+  // Helper for DoNoLeaks to ignore all objects reachable from all live data
+  static void IgnoreAllLiveObjectsLocked(const void* self_stack_top);
+
+  // Callback we pass to TCMalloc_ListAllProcessThreads (see thread_lister.h)
+  // that is invoked when all threads of our process are found and stopped.
+  // The call back does the things needed to ignore live data reachable from
+  // thread stacks and registers for all our threads
+  // as well as do other global-live-data ignoring
+  // (via IgnoreNonThreadLiveObjectsLocked)
+  // during the quiet state of all threads being stopped.
+  // For the argument meaning see the comment by TCMalloc_ListAllProcessThreads.
+  // Here we only use num_threads and thread_pids, which
+  // TCMalloc_ListAllProcessThreads fills in for us with the number and pids
+  // of all the threads of our process that it found and attached to.
+  static int IgnoreLiveThreadsLocked(void* parameter,
+                                     int num_threads,
+                                     pid_t* thread_pids,
+                                     va_list ap);
+
+  // Helper for IgnoreAllLiveObjectsLocked and IgnoreLiveThreadsLocked
+  // that we prefer to execute from IgnoreLiveThreadsLocked
+  // while all threads are stopped.
+  // This helper does live object discovery and ignoring
+  // for all objects that are reachable from everything
+  // not related to thread stacks and registers.
+  static void IgnoreNonThreadLiveObjectsLocked();
+
+  // Helper for IgnoreNonThreadLiveObjectsLocked and IgnoreLiveThreadsLocked
+  // to discover and ignore all heap objects
+  // reachable from currently considered live objects
+  // (the live_objects static global variable in our .cc file).
+  // "name", "name2" are two strings that we print one after another
+  // in a debug message to describe what kind of live object sources
+  // are being used.
+  static void IgnoreLiveObjectsLocked(const char* name, const char* name2);
+
+  // Do the overall whole-program heap leak check if needed;
+  // returns true if it did the leak check.
+  static bool DoMainHeapCheck();
+
+  // Type of task for UseProcMapsLocked
+  enum ProcMapsTask {
+    RECORD_GLOBAL_DATA,
+    DISABLE_LIBRARY_ALLOCS
+  };
+
+  // Success/Error Return codes for UseProcMapsLocked.
+  enum ProcMapsResult {
+    PROC_MAPS_USED,
+    CANT_OPEN_PROC_MAPS,
+    NO_SHARED_LIBS_IN_PROC_MAPS
+  };
+
+  // Read /proc/self/maps, parse it, and do the 'proc_maps_task' for each line.
+  static ProcMapsResult UseProcMapsLocked(ProcMapsTask proc_maps_task);
+
+  // A ProcMapsTask to disable allocations from 'library'
+  // that is mapped to [start_address..end_address)
+  // (only if library is a certain system library).
+  static void DisableLibraryAllocsLocked(const char* library,
+                                         uintptr_t start_address,
+                                         uintptr_t end_address);
+
+  // Return true iff "*ptr" points to a heap object
+  // ("*ptr" can point at the start or inside of a heap object
+  //  so that this works e.g. for pointers to C++ arrays, C++ strings,
+  //  multiple-inherited objects, or pointers to members).
+  // If so, we also fill *object_size for this object
+  // and move "*ptr" to point to the very start of the heap object.
+  static inline bool HaveOnHeapLocked(const void** ptr, size_t* object_size);
+
+  // Helper to shutdown heap leak checker when it's not needed
+  // or can't function properly.
+  static void TurnItselfOffLocked();
+
+  // Internally-used c-tor to start whole-executable checking.
+  HeapLeakChecker();
+
+  // ----------------------------------------------------------------------- //
+  // Friends and externally accessed helpers.
+
+  // Helper for VerifyHeapProfileTableStackGet in the unittest
+  // to get the recorded allocation caller for ptr,
+  // which must be a heap object.
+  static const void* GetAllocCaller(void* ptr);
+  friend void VerifyHeapProfileTableStackGet();
+
+  // This gets to execute before constructors for all global objects
+  static void BeforeConstructorsLocked();
+  friend void HeapLeakChecker_BeforeConstructors();
+
+  // This gets to execute after destructors for all global objects
+  friend void HeapLeakChecker_AfterDestructors();
+
+  // Full starting of recommended whole-program checking.
+  friend void HeapLeakChecker_InternalInitStart();
+
+  // Runs REGISTER_HEAPCHECK_CLEANUP cleanups and potentially
+  // calls DoMainHeapCheck
+  friend void HeapLeakChecker_RunHeapCleanups();
+
+  // ----------------------------------------------------------------------- //
+  // Member data.
+
+  class SpinLock* lock_;  // to make HeapLeakChecker objects thread-safe
+  const char* name_;  // our remembered name (we own it)
+                      // NULL means this leak checker is a noop
+
+  // Snapshot taken when the checker was created.  May be NULL
+  // for the global heap checker object.  We use void* instead of
+  // HeapProfileTable::Snapshot* to avoid including heap-profile-table.h.
+  void* start_snapshot_;
+
+  bool has_checked_;  // if we have done the leak check, so these are ready:
+  ssize_t inuse_bytes_increase_;  // bytes-in-use increase for this checker
+  ssize_t inuse_allocs_increase_;  // allocations-in-use increase
+                                   // for this checker
+  bool keep_profiles_;  // iff we should keep the heap profiles we've made
+
+  // ----------------------------------------------------------------------- //
+
+  // Disallow "evil" constructors.
+  HeapLeakChecker(const HeapLeakChecker&);
+  void operator=(const HeapLeakChecker&);
+};
+
+
+// Holds a pointer that will not be traversed by the heap checker.
+// Contrast with HeapLeakChecker::IgnoreObject(o), in which o and
+// all objects reachable from o are ignored by the heap checker.
+template <class T>
+class HiddenPointer {
+ public:
+  explicit HiddenPointer(T* t)
+      : masked_t_(reinterpret_cast<uintptr_t>(t) ^ kHideMask) {
+  }
+  // Returns unhidden pointer.  Be careful where you save the result.
+  T* get() const { return reinterpret_cast<T*>(masked_t_ ^ kHideMask); }
+
+ private:
+  // Arbitrary value, but not such that xor'ing with it is likely
+  // to map one valid pointer to another valid pointer:
+  static const uintptr_t kHideMask =
+      static_cast<uintptr_t>(0xF03A5F7BF03A5F7Bll);
+  uintptr_t masked_t_;
+};
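+
+// A hypothetical use, for illustration ("Foo" is not part of this API);
+// because the pointer is masked, the checker does not traverse it:
+//
+//   static HiddenPointer<Foo> hidden_foo(new Foo);
+//   Foo* foo = hidden_foo.get();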
+
+// A class that exists solely to run its destructor.  This class should not be
+// used directly, but instead by the REGISTER_HEAPCHECK_CLEANUP macro below.
+class PERFTOOLS_DLL_DECL HeapCleaner {
+ public:
+  typedef void (*void_function)(void);
+  HeapCleaner(void_function f);
+  static void RunHeapCleanups();
+ private:
+  static std::vector<void_function>* heap_cleanups_;
+};
+
+// A macro to declare module heap check cleanup tasks
+// (they run only if we are doing heap leak checking).
+// 'body' should be the cleanup code to run.  'name' doesn't matter,
+// but must be unique amongst all REGISTER_HEAPCHECK_CLEANUP calls.
+#define REGISTER_HEAPCHECK_CLEANUP(name, body)  \
+  namespace { \
+  void heapcheck_cleanup_##name() { body; } \
+  static HeapCleaner heapcheck_cleaner_##name(&heapcheck_cleanup_##name); \
+  }
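+
+// A hypothetical registration, for illustration ("MyCache" is not part of
+// this API); the body runs before the final whole-program leak check:
+//   REGISTER_HEAPCHECK_CLEANUP(flush_my_cache, MyCache::Flush())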
+
+#endif  // BASE_HEAP_CHECKER_H_
diff --git a/src/gperftools/heap-profiler.h b/src/gperftools/heap-profiler.h
new file mode 100644
index 0000000..9b67364
--- /dev/null
+++ b/src/gperftools/heap-profiler.h
@@ -0,0 +1,105 @@
+// -*- Mode: C++; c-basic-offset: 2; indent-tabs-mode: nil -*-
+/* Copyright (c) 2005, Google Inc.
+ * All rights reserved.
+ * 
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are
+ * met:
+ * 
+ *     * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following disclaimer
+ * in the documentation and/or other materials provided with the
+ * distribution.
+ *     * Neither the name of Google Inc. nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ * 
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * ---
+ * Author: Sanjay Ghemawat
+ *
+ * Module for heap-profiling.
+ *
+ * For full(er) information, see doc/heapprofile.html
+ *
+ * This module can be linked into your program with no slowdown
+ * unless you activate the profiler using one of the following methods:
+ *
+ *    1. Before starting the program, set the environment variable
+ *       "HEAPPROFILE" to be the name of the file to which the profile
+ *       data should be written.
+ *
+ *    2. Programmatically, start and stop the profiler using the
+ *       routines "HeapProfilerStart(filename)" and "HeapProfilerStop()".
+ *
+ */
+
+#ifndef BASE_HEAP_PROFILER_H_
+#define BASE_HEAP_PROFILER_H_
+
+#include <stddef.h>
+
+/* Annoying stuff for windows; makes sure clients can import these functions */
+#ifndef PERFTOOLS_DLL_DECL
+# ifdef _WIN32
+#   define PERFTOOLS_DLL_DECL  __declspec(dllimport)
+# else
+#   define PERFTOOLS_DLL_DECL
+# endif
+#endif
+
+/* All this code should be usable from within C apps. */
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/* Start profiling and arrange to write profile data to file names
+ * of the form: "prefix.0000", "prefix.0001", ...
+ */
+PERFTOOLS_DLL_DECL void HeapProfilerStart(const char* prefix);
+
+/* Returns non-zero if we are currently profiling the heap.  (Returns
+ * an int rather than a bool so it's usable from C.)  This is true
+ * between calls to HeapProfilerStart() and HeapProfilerStop(), and
+ * also if the program has been run with HEAPPROFILE set, or some other
+ * way to turn on whole-program profiling.
+ */
+PERFTOOLS_DLL_DECL int IsHeapProfilerRunning();
+
+/* Stop heap profiling.  Can be restarted again with HeapProfilerStart(),
+ * but the currently accumulated profiling information will be cleared.
+ */
+PERFTOOLS_DLL_DECL void HeapProfilerStop();
+
+/* Dump a profile now - can be used for dumping at a hopefully
+ * quiescent state in your program, in order to more easily track down
+ * memory leaks. Will include the reason in the logged message
+ */
+PERFTOOLS_DLL_DECL void HeapProfilerDump(const char *reason);
+
+/* Generate current heap profiling information.
+ * Returns an empty string when heap profiling is not active.
+ * The returned pointer is a '\0'-terminated string allocated using malloc()
+ * and should be free()-ed as soon as the caller does not need it anymore.
+ */
+PERFTOOLS_DLL_DECL char* GetHeapProfile();
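+
+/* A minimal usage sketch (illustrative only; the prefix and the dump
+ * reason are arbitrary examples):
+ *
+ *   HeapProfilerStart("/tmp/myprog");
+ *   ... run the code of interest ...
+ *   HeapProfilerDump("end of phase one");
+ *   char* profile = GetHeapProfile();
+ *   ... inspect or save the profile ...
+ *   free(profile);
+ *   HeapProfilerStop();
+ */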
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  /* BASE_HEAP_PROFILER_H_ */
diff --git a/src/gperftools/malloc_extension.h b/src/gperftools/malloc_extension.h
new file mode 100644
index 0000000..95b35cb
--- /dev/null
+++ b/src/gperftools/malloc_extension.h
@@ -0,0 +1,421 @@
+// -*- Mode: C++; c-basic-offset: 2; indent-tabs-mode: nil -*-
+// Copyright (c) 2005, Google Inc.
+// All rights reserved.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+// 
+//     * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+//     * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+// 
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+// ---
+// Author: Sanjay Ghemawat <opensource@google.com>
+//
+// Extra extensions exported by some malloc implementations.  These
+// extensions are accessed through a virtual base class so an
+// application can link against a malloc that does not implement these
+// extensions, and it will get default versions that do nothing.
+//
+// NOTE FOR C USERS: If you wish to use this functionality from within
+// a C program, see malloc_extension_c.h.
+
+#ifndef BASE_MALLOC_EXTENSION_H_
+#define BASE_MALLOC_EXTENSION_H_
+
+#include <stddef.h>
+// I can't #include config.h in this public API file, but I should
+// really use configure (and make malloc_extension.h a .in file) to
+// figure out if the system has stdint.h or not.  But I'm lazy, so
+// for now I'm assuming it's a problem only with MSVC.
+#ifndef _MSC_VER
+#include <stdint.h>
+#endif
+#include <string>
+#include <vector>
+
+// Annoying stuff for windows -- makes sure clients can import these functions
+#ifndef PERFTOOLS_DLL_DECL
+# ifdef _WIN32
+#   define PERFTOOLS_DLL_DECL  __declspec(dllimport)
+# else
+#   define PERFTOOLS_DLL_DECL
+# endif
+#endif
+
+static const int kMallocHistogramSize = 64;
+
+// One day, we could support other types of writers (perhaps for C?)
+typedef std::string MallocExtensionWriter;
+
+namespace base {
+struct MallocRange;
+}
+
+// Interface to a pluggable system allocator.
+class PERFTOOLS_DLL_DECL SysAllocator {
+ public:
+  SysAllocator() {
+  }
+  virtual ~SysAllocator();
+
+  // Allocates "size"-byte of memory from system aligned with "alignment".
+  // Returns NULL if failed. Otherwise, the returned pointer p up to and
+  // including (p + actual_size -1) have been allocated.
+  virtual void* Alloc(size_t size, size_t *actual_size, size_t alignment) = 0;
+};
+
+// The default implementations of the following routines do nothing.
+// All implementations should be thread-safe; the current one
+// (TCMallocImplementation) is.
+class PERFTOOLS_DLL_DECL MallocExtension {
+ public:
+  virtual ~MallocExtension();
+
+  // Call this very early in the program execution -- say, in a global
+  // constructor -- to set up parameters and state needed by all
+  // instrumented malloc implementations.  One example: this routine
+  // sets environment variables to tell STL to use libc's malloc()
+  // instead of doing its own memory management.  This is safe to call
+  // multiple times, as long as each time is before threads start up.
+  static void Initialize();
+
+  // See "verify_memory.h" to see what these routines do
+  virtual bool VerifyAllMemory();
+  virtual bool VerifyNewMemory(const void* p);
+  virtual bool VerifyArrayNewMemory(const void* p);
+  virtual bool VerifyMallocMemory(const void* p);
+  virtual bool MallocMemoryStats(int* blocks, size_t* total,
+                                 int histogram[kMallocHistogramSize]);
+
+  // Get a human readable description of the current state of the malloc
+  // data structures.  The state is stored as a null-terminated string
+  // in a prefix of "buffer[0,buffer_length-1]".
+  // REQUIRES: buffer_length > 0.
+  virtual void GetStats(char* buffer, int buffer_length);
+
+  // Outputs to "writer" a sample of live objects and the stack traces
+  // that allocated these objects.  The format of the returned output
+  // is equivalent to the output of the heap profiler and can
+  // therefore be passed to "pprof". This function is equivalent to
+  // ReadStackTraces. The main difference is that this function returns
+  // serialized data appropriately formatted for use by the pprof tool.
+  // NOTE: by default, tcmalloc does not do any heap sampling, and this
+  //       function will always return an empty sample.  To get useful
+  //       data from GetHeapSample, you must also set the environment
+  //       variable TCMALLOC_SAMPLE_PARAMETER to a value such as 524288.
+  virtual void GetHeapSample(MallocExtensionWriter* writer);
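+
+  // Illustrative call (remember that useful data also requires setting
+  // TCMALLOC_SAMPLE_PARAMETER, as noted above):
+  //   std::string sample;
+  //   MallocExtension::instance()->GetHeapSample(&sample);
+  //   // "sample" can now be written to a file and fed to pprof.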
+
+  // Outputs to "writer" the stack traces that caused growth in the
+  // address space size.  The format of the returned output is
+  // equivalent to the output of the heap profiler and can therefore
+  // be passed to "pprof". This function is equivalent to
+  // ReadHeapGrowthStackTraces. The main difference is that this function
+  // returns serialized data appropriately formatted for use by the
+  // pprof tool.  (This does not depend on, or require,
+  // TCMALLOC_SAMPLE_PARAMETER.)
+  virtual void GetHeapGrowthStacks(MallocExtensionWriter* writer);
+
+  // Invokes func(arg, range) for every controlled memory
+  // range.  *range is filled in with information about the range.
+  //
+  // This is a best-effort interface useful only for performance
+  // analysis.  The implementation may not call func at all.
+  typedef void (RangeFunction)(void*, const base::MallocRange*);
+  virtual void Ranges(void* arg, RangeFunction func);
+
+  // -------------------------------------------------------------------
+  // Control operations for getting and setting malloc implementation
+  // specific parameters.  Some currently useful properties:
+  //
+  // generic
+  // -------
+  // "generic.current_allocated_bytes"
+  //      Number of bytes currently allocated by application
+  //      This property is not writable.
+  //
+  // "generic.heap_size"
+  //      Number of bytes in the heap ==
+  //            current_allocated_bytes +
+  //            fragmentation +
+  //            freed memory regions
+  //      This property is not writable.
+  //
+  // tcmalloc
+  // --------
+  // "tcmalloc.max_total_thread_cache_bytes"
+  //      Upper limit on total number of bytes stored across all
+  //      per-thread caches.  Default: 16MB.
+  //
+  // "tcmalloc.current_total_thread_cache_bytes"
+  //      Number of bytes used across all thread caches.
+  //      This property is not writable.
+  //
+  // "tcmalloc.central_cache_free_bytes"
+  //      Number of free bytes in the central cache that have been
+  //      assigned to size classes. They always count towards virtual
+  //      memory usage, and unless the underlying memory is swapped out
+  //      by the OS, they also count towards physical memory usage.
+  //      This property is not writable.
+  //
+  // "tcmalloc.transfer_cache_free_bytes"
+  //      Number of free bytes that are waiting to be transferred between
+  //      the central cache and a thread cache. They always count
+  //      towards virtual memory usage, and unless the underlying memory
+  //      is swapped out by the OS, they also count towards physical
+  //      memory usage. This property is not writable.
+  //
+  // "tcmalloc.thread_cache_free_bytes"
+  //      Number of free bytes in thread caches. They always count
+  //      towards virtual memory usage, and unless the underlying memory
+  //      is swapped out by the OS, they also count towards physical
+  //      memory usage. This property is not writable.
+  //
+  // "tcmalloc.pageheap_free_bytes"
+  //      Number of bytes in free, mapped pages in page heap.  These
+  //      bytes can be used to fulfill allocation requests.  They
+  //      always count towards virtual memory usage, and unless the
+  //      underlying memory is swapped out by the OS, they also count
+  //      towards physical memory usage.  This property is not writable.
+  //
+  // "tcmalloc.pageheap_unmapped_bytes"
+  //        Number of bytes in free, unmapped pages in page heap.
+  //        These are bytes that have been released back to the OS,
+  //        possibly by one of the MallocExtension "Release" calls.
+  //        They can be used to fulfill allocation requests, but
+  //        typically incur a page fault.  They always count towards
+  //        virtual memory usage, and depending on the OS, typically
+  //        do not count towards physical memory usage.  This property
+  //        is not writable.
+  // -------------------------------------------------------------------
+
+  // Get the named "property"'s value.  Returns true if the property
+  // is known.  Returns false if the property is not a valid property
+  // name for the current malloc implementation.
+  // REQUIRES: property != NULL; value != NULL
+  virtual bool GetNumericProperty(const char* property, size_t* value);
+
+  // Set the named "property"'s value.  Returns true if the property
+  // is known and writable.  Returns false if the property is not a
+  // valid property name for the current malloc implementation, or
+  // is not writable.
+  // REQUIRES: property != NULL
+  virtual bool SetNumericProperty(const char* property, size_t value);
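+
+  // Illustrative use of the properties above (the values are examples only):
+  //
+  //   size_t allocated = 0;
+  //   if (MallocExtension::instance()->GetNumericProperty(
+  //           "generic.current_allocated_bytes", &allocated)) {
+  //     // ... use "allocated" ...
+  //   }
+  //   MallocExtension::instance()->SetNumericProperty(
+  //       "tcmalloc.max_total_thread_cache_bytes", 32 << 20);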
+
+  // Mark the current thread as "idle".  This routine may optionally
+  // be called by threads as a hint to the malloc implementation that
+  // any thread-specific resources should be released.  Note: this may
+  // be an expensive routine, so it should not be called too often.
+  //
+  // Also, if the code that calls this routine will go to sleep for
+  // a while, it should take care to not allocate anything between
+  // the call to this routine and the beginning of the sleep.
+  //
+  // Most malloc implementations ignore this routine.
+  virtual void MarkThreadIdle();
+
+  // Mark the current thread as "busy".  This routine should be
+  // called after MarkThreadIdle() if the thread will now do more
+  // work.  If this method is not called, performance may suffer.
+  //
+  // Most malloc implementations ignore this routine.
+  virtual void MarkThreadBusy();
+
+  // Gets the system allocator used by the malloc extension instance. Returns
+  // NULL for malloc implementations that do not support pluggable system
+  // allocators.
+  virtual SysAllocator* GetSystemAllocator();
+
+  // Sets the system allocator to the specified one.
+  //
+  // Users can register their own system allocators for malloc implementations
+  // that support pluggable system allocators, such as TCMalloc, by doing:
+  //   alloc = new MyOwnSysAllocator();
+  //   MallocExtension::instance()->SetSystemAllocator(alloc);
+  // It's up to users whether to fall back (recommended) to the default
+  // system allocator (use GetSystemAllocator() above) or not. The caller is
+  // responsible for any necessary locking.
+  // See tcmalloc/system-alloc.h for the interface and
+  //     tcmalloc/memfs_malloc.cc for the examples.
+  //
+  // It's a no-op for malloc implementations that do not support pluggable
+  // system allocators.
+  virtual void SetSystemAllocator(SysAllocator *a);
+
+  // Try to release num_bytes of free memory back to the operating
+  // system for reuse.  Use this extension with caution -- to get this
+  // memory back may require faulting pages back in by the OS, and
+  // that may be slow.  (Currently only implemented in tcmalloc.)
+  virtual void ReleaseToSystem(size_t num_bytes);
+
+  // Same as ReleaseToSystem() but release as much memory as possible.
+  virtual void ReleaseFreeMemory();
+
+  // Sets the rate at which we release unused memory to the system.
+  // Zero means we never release memory back to the system.  Increase
+  // this flag to return memory faster; decrease it to return memory
+  // slower.  Reasonable rates are in the range [0,10].  (Currently
+  // only implemented in tcmalloc).
+  virtual void SetMemoryReleaseRate(double rate);
+
+  // Gets the release rate.  Returns a value < 0 if unknown.
+  virtual double GetMemoryReleaseRate();
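+
+  // Illustrative use (the rate and byte count are arbitrary examples):
+  //   MallocExtension::instance()->SetMemoryReleaseRate(5.0);
+  //   MallocExtension::instance()->ReleaseToSystem(64 << 20);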
+
+  // Returns the estimated number of bytes that will be allocated for
+  // a request of "size" bytes.  This is an estimate: an allocation of
+  // SIZE bytes may reserve more bytes, but will never reserve less.
+  // (Currently only implemented in tcmalloc, other implementations
+  // always return SIZE.)
+  // This is equivalent to malloc_good_size() in OS X.
+  virtual size_t GetEstimatedAllocatedSize(size_t size);
+
+  // Returns the actual number N of bytes reserved by tcmalloc for the
+  // pointer p.  The client is allowed to use the range of bytes
+  // [p, p+N) in any way it wishes (i.e. N is the "usable size" of this
+  // allocation).  This number may be equal to or greater than the number
+  // of bytes requested when p was allocated.
+  // p must have been allocated by this malloc implementation,
+  // must not be an interior pointer -- that is, must be exactly
+  // the pointer returned by malloc() et al., not some offset
+  // from that -- and should not have been freed yet.  p may be NULL.
+  // (Currently only implemented in tcmalloc; other implementations
+  // will return 0.)
+  // This is equivalent to malloc_size() in OS X, malloc_usable_size()
+  // in glibc, and _msize() for windows.
+  virtual size_t GetAllocatedSize(const void* p);
+
+  // Returns kOwned if this malloc implementation allocated the memory
+  // pointed to by p, or kNotOwned if some other malloc implementation
+  // allocated it or p is NULL.  May also return kUnknownOwnership if
+  // the malloc implementation does not keep track of ownership.
+  // REQUIRES: p must be a value returned from a previous call to
+  // malloc(), calloc(), realloc(), memalign(), posix_memalign(),
+  // valloc(), pvalloc(), new, or new[], and must refer to memory that
+  // is currently allocated (so, for instance, you should not pass in
+  // a pointer after having called free() on it).
+  enum Ownership {
+    // NOTE: Enum values MUST be kept in sync with the version in
+    // malloc_extension_c.h
+    kUnknownOwnership = 0,
+    kOwned,
+    kNotOwned
+  };
+  virtual Ownership GetOwnership(const void* p);
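+
+  // Illustrative check ("p" is any pointer previously returned by malloc()):
+  //   if (MallocExtension::instance()->GetOwnership(p) ==
+  //       MallocExtension::kOwned) {
+  //     // this malloc implementation allocated p
+  //   }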
+
+  // The current malloc implementation.  Always non-NULL.
+  static MallocExtension* instance();
+
+  // Change the malloc implementation.  Typically called by the
+  // malloc implementation during initialization.
+  static void Register(MallocExtension* implementation);
+
+  // Returns detailed information about malloc's freelists. For each list,
+  // return a FreeListInfo:
+  struct FreeListInfo {
+    size_t min_object_size;
+    size_t max_object_size;
+    size_t total_bytes_free;
+    const char* type;
+  };
+  // Each item in the vector refers to a different freelist. The lists
+  // are identified by the range of allocations that objects in the
+  // list can satisfy ([min_object_size, max_object_size]) and the
+  // type of freelist (see below). The current size of the list is
+  // returned in total_bytes_free (which counts against a process's
+  // resident and virtual size).
+  //
+  // Currently supported types are:
+  //
+  // "tcmalloc.page{_unmapped}" - tcmalloc's page heap. An entry for each size
+  //          class in the page heap is returned. Bytes in "page_unmapped"
+  //          are no longer backed by physical memory and do not count against
+  //          the resident size of a process.
+  //
+  // "tcmalloc.large{_unmapped}" - tcmalloc's list of objects larger
+  //          than the largest page heap size class. Only one "large"
+  //          entry is returned. There is no upper-bound on the size
+  //          of objects in the large free list; this call returns
+  //          kint64max for max_object_size.  Bytes in
+  //          "large_unmapped" are no longer backed by physical memory
+  //          and do not count against the resident size of a process.
+  //
+  // "tcmalloc.central" - tcmalloc's central free-list. One entry per
+  //          size-class is returned. Never unmapped.
+  //
+  // "debug.free_queue" - free objects queued by the debug allocator
+  //                      and not returned to tcmalloc.
+  //
+  // "tcmalloc.thread" - tcmalloc's per-thread caches. Never unmapped.
+  virtual void GetFreeListSizes(std::vector<FreeListInfo>* v);
+
+  // Get a list of stack traces of sampled allocation points.  Returns
+  // a pointer to a "new[]-ed" result array, and stores the sample
+  // period in "sample_period".
+  //
+  // The state is stored as a sequence of adjacent entries
+  // in the returned array.  Each entry has the following form:
+  //    uintptr_t count;        // Number of objects with following trace
+  //    uintptr_t size;         // Total size of objects with following trace
+  //    uintptr_t depth;        // Number of PC values in stack trace
+  //    void*     stack[depth]; // PC values that form the stack trace
+  //
+  // The list of entries is terminated by a "count" of 0.
+  //
+  // It is the responsibility of the caller to "delete[]" the returned array.
+  //
+  // May return NULL to indicate no results.
+  //
+  // This is an internal extension.  Callers should use the more
+  // convenient "GetHeapSample(string*)" method defined above.
+  virtual void** ReadStackTraces(int* sample_period);
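+
+  // Illustrative parsing loop for the entry format above (caller-side
+  // sketch; the variable names are not part of this API):
+  //
+  //   int period = 0;
+  //   void** entries = MallocExtension::instance()->ReadStackTraces(&period);
+  //   for (void** e = entries; e != NULL && (uintptr_t)e[0] != 0; ) {
+  //     uintptr_t count = (uintptr_t)e[0];
+  //     uintptr_t size  = (uintptr_t)e[1];
+  //     uintptr_t depth = (uintptr_t)e[2];
+  //     // e[3] .. e[3 + depth - 1] are the PC values for this trace
+  //     e += 3 + depth;
+  //   }
+  //   delete[] entries;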
+
+  // Like ReadStackTraces(), but returns stack traces that caused growth
+  // in the address space size.
+  virtual void** ReadHeapGrowthStackTraces();
+};
+
+namespace base {
+
+// Information passed per range.  More fields may be added later.
+struct MallocRange {
+  enum Type {
+    INUSE,                // Application is using this range
+    FREE,                 // Range is currently free
+    UNMAPPED,             // Backing physical memory has been returned to the OS
+    UNKNOWN
+    // More enum values may be added in the future
+  };
+
+  uintptr_t address;    // Address of range
+  size_t length;        // Byte length of range
+  Type type;            // Type of this range
+  double fraction;      // Fraction of range that is being used (0 if !INUSE)
+
+  // Perhaps add the following:
+  // - stack trace if this range was sampled
+  // - heap growth stack trace if applicable to this range
+  // - age when allocated (for inuse) or freed (if not in use)
+};
+
+} // namespace base
+
+#endif  // BASE_MALLOC_EXTENSION_H_
diff --git a/src/gperftools/malloc_extension_c.h b/src/gperftools/malloc_extension_c.h
new file mode 100644
index 0000000..baa013d
--- /dev/null
+++ b/src/gperftools/malloc_extension_c.h
@@ -0,0 +1,99 @@
+/* Copyright (c) 2008, Google Inc.
+ * All rights reserved.
+ * 
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are
+ * met:
+ * 
+ *     * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following disclaimer
+ * in the documentation and/or other materials provided with the
+ * distribution.
+ *     * Neither the name of Google Inc. nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ * 
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * --
+ * Author: Craig Silverstein
+ *
+ * C shims for the C++ malloc_extension.h.  See malloc_extension.h for
+ * details.  Note these C shims always work on
+ * MallocExtension::instance(); it is not possible to have more than
+ * one MallocExtension object in C applications.
+ */
+
+#ifndef _MALLOC_EXTENSION_C_H_
+#define _MALLOC_EXTENSION_C_H_
+
+#include <stddef.h>
+#include <sys/types.h>
+
+/* Annoying stuff for windows -- makes sure clients can import these fns */
+#ifndef PERFTOOLS_DLL_DECL
+# ifdef _WIN32
+#   define PERFTOOLS_DLL_DECL  __declspec(dllimport)
+# else
+#   define PERFTOOLS_DLL_DECL
+# endif
+#endif
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define kMallocExtensionHistogramSize 64
+
+PERFTOOLS_DLL_DECL int MallocExtension_VerifyAllMemory(void);
+PERFTOOLS_DLL_DECL int MallocExtension_VerifyNewMemory(const void* p);
+PERFTOOLS_DLL_DECL int MallocExtension_VerifyArrayNewMemory(const void* p);
+PERFTOOLS_DLL_DECL int MallocExtension_VerifyMallocMemory(const void* p);
+PERFTOOLS_DLL_DECL int MallocExtension_MallocMemoryStats(int* blocks, size_t* total,
+                                      int histogram[kMallocExtensionHistogramSize]);
+PERFTOOLS_DLL_DECL void MallocExtension_GetStats(char* buffer, int buffer_length);
+
+/* TODO(csilvers): write a C version of these routines, that perhaps
+ * takes a function ptr and a void *.
+ */
+/* void MallocExtension_GetHeapSample(string* result); */
+/* void MallocExtension_GetHeapGrowthStacks(string* result); */
+
+PERFTOOLS_DLL_DECL int MallocExtension_GetNumericProperty(const char* property, size_t* value);
+PERFTOOLS_DLL_DECL int MallocExtension_SetNumericProperty(const char* property, size_t value);
+PERFTOOLS_DLL_DECL void MallocExtension_MarkThreadIdle(void);
+PERFTOOLS_DLL_DECL void MallocExtension_MarkThreadBusy(void);
+PERFTOOLS_DLL_DECL void MallocExtension_ReleaseToSystem(size_t num_bytes);
+PERFTOOLS_DLL_DECL void MallocExtension_ReleaseFreeMemory(void);
+PERFTOOLS_DLL_DECL size_t MallocExtension_GetEstimatedAllocatedSize(size_t size);
+PERFTOOLS_DLL_DECL size_t MallocExtension_GetAllocatedSize(const void* p);
+
+/*
+ * NOTE: These enum values MUST be kept in sync with the version in
+ *       malloc_extension.h
+ */
+typedef enum {
+  MallocExtension_kUnknownOwnership = 0,
+  MallocExtension_kOwned,
+  MallocExtension_kNotOwned
+} MallocExtension_Ownership;
+
+PERFTOOLS_DLL_DECL MallocExtension_Ownership MallocExtension_GetOwnership(const void* p);
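+
+/* Example (illustrative sketch): only release a pointer that this malloc
+ * implementation actually owns.
+ *
+ *   if (MallocExtension_GetOwnership(p) == MallocExtension_kOwned) {
+ *     free(p);
+ *   }
+ */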
+
+#ifdef __cplusplus
+}   /* extern "C" */
+#endif
+
+#endif /* _MALLOC_EXTENSION_C_H_ */
diff --git a/src/gperftools/malloc_hook.h b/src/gperftools/malloc_hook.h
new file mode 100644
index 0000000..9d56fb1
--- /dev/null
+++ b/src/gperftools/malloc_hook.h
@@ -0,0 +1,359 @@
+// -*- Mode: C++; c-basic-offset: 2; indent-tabs-mode: nil -*-
+// Copyright (c) 2005, Google Inc.
+// All rights reserved.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+// 
+//     * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+//     * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+// 
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+// ---
+// Author: Sanjay Ghemawat
+//
+// Some of our malloc implementations can invoke the following hooks whenever
+// memory is allocated or deallocated.  MallocHook is thread-safe, and things
+// you do before calling AddFooHook(MyHook) are visible to any resulting calls
+// to MyHook.  Hooks must be thread-safe.  If you write:
+//
+//   CHECK(MallocHook::AddNewHook(&MyNewHook));
+//
+// MyNewHook will be invoked in subsequent calls in the current thread, but
+// there are no guarantees on when it might be invoked in other threads.
+//
+// There are a limited number of slots available for each hook type.  Add*Hook
+// will return false if there are no slots available.  Remove*Hook will return
+// false if the given hook was not already installed.
+//
+// The order in which individual hooks are called in Invoke*Hook is undefined.
+//
+// It is safe for a hook to remove itself within Invoke*Hook and add other
+// hooks.  Any hooks added inside a hook invocation (for the same hook type)
+// will not be invoked for the current invocation.
+//
+// One important user of these hooks is the heap profiler.
+//
+// CAVEAT: If you add new MallocHook::Invoke* calls then those calls must be
+// directly in the code of the (de)allocation function that is provided to the
+// user and that function must have an ATTRIBUTE_SECTION(malloc_hook) attribute.
+//
+// Note: the Invoke*Hook() functions are defined in malloc_hook-inl.h.  If you
+// need to invoke a hook (which you shouldn't unless you're part of tcmalloc),
+// be sure to #include malloc_hook-inl.h in addition to malloc_hook.h.
+//
+// NOTE FOR C USERS: If you want to use malloc_hook functionality from
+// a C program, #include malloc_hook_c.h instead of this file.
+
+#ifndef _MALLOC_HOOK_H_
+#define _MALLOC_HOOK_H_
+
+#include <stddef.h>
+#include <sys/types.h>
+extern "C" {
+#include <gperftools/malloc_hook_c.h>  // a C version of the malloc_hook interface
+}
+
+// Annoying stuff for windows -- makes sure clients can import these functions
+#ifndef PERFTOOLS_DLL_DECL
+# ifdef _WIN32
+#   define PERFTOOLS_DLL_DECL  __declspec(dllimport)
+# else
+#   define PERFTOOLS_DLL_DECL
+# endif
+#endif
+
+// The C++ methods below call the C version (MallocHook_*), and thus
+// convert between an int and a bool.  Windows complains about this
+// (a "performance warning") which we don't care about, so we suppress.
+#ifdef _MSC_VER
+#pragma warning(push)
+#pragma warning(disable:4800)
+#endif
+
+// Note: malloc_hook_c.h defines MallocHook_*Hook and
+// MallocHook_{Add,Remove}*Hook.  The version of these inside the MallocHook
+// class are defined in terms of the malloc_hook_c version.  See malloc_hook_c.h
+// for details of these types/functions.
+
+class PERFTOOLS_DLL_DECL MallocHook {
+ public:
+  // The NewHook is invoked whenever an object is allocated.
+  // It may be passed NULL if the allocator returned NULL.
+  typedef MallocHook_NewHook NewHook;
+  inline static bool AddNewHook(NewHook hook) {
+    return MallocHook_AddNewHook(hook);
+  }
+  inline static bool RemoveNewHook(NewHook hook) {
+    return MallocHook_RemoveNewHook(hook);
+  }
+  inline static void InvokeNewHook(const void* p, size_t s);
+
+  // The DeleteHook is invoked whenever an object is deallocated.
+  // It may be passed NULL if the caller is trying to delete NULL.
+  typedef MallocHook_DeleteHook DeleteHook;
+  inline static bool AddDeleteHook(DeleteHook hook) {
+    return MallocHook_AddDeleteHook(hook);
+  }
+  inline static bool RemoveDeleteHook(DeleteHook hook) {
+    return MallocHook_RemoveDeleteHook(hook);
+  }
+  inline static void InvokeDeleteHook(const void* p);
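+
+  // Example (illustrative sketch): a matched pair of hooks.  MyNewHook and
+  // MyDeleteHook are placeholder names; remember that hooks must be
+  // thread-safe.
+  //
+  //   void MyNewHook(const void* ptr, size_t size) {
+  //     // e.g. record (ptr, size) in a thread-safe table
+  //   }
+  //   void MyDeleteHook(const void* ptr) {
+  //     // e.g. erase ptr from that table
+  //   }
+  //   ...
+  //   CHECK(MallocHook::AddNewHook(&MyNewHook));
+  //   CHECK(MallocHook::AddDeleteHook(&MyDeleteHook));
+  //   ...
+  //   MallocHook::RemoveDeleteHook(&MyDeleteHook);
+  //   MallocHook::RemoveNewHook(&MyNewHook);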
+
+  // The PreMmapHook is invoked with mmap or mmap64 arguments just
+  // before the call is actually made.  Such a hook may be useful
+  // in memory-limited contexts, to catch allocations that will exceed
+  // a memory limit, and take outside actions to increase that limit.
+  typedef MallocHook_PreMmapHook PreMmapHook;
+  inline static bool AddPreMmapHook(PreMmapHook hook) {
+    return MallocHook_AddPreMmapHook(hook);
+  }
+  inline static bool RemovePreMmapHook(PreMmapHook hook) {
+    return MallocHook_RemovePreMmapHook(hook);
+  }
+  inline static void InvokePreMmapHook(const void* start,
+                                       size_t size,
+                                       int protection,
+                                       int flags,
+                                       int fd,
+                                       off_t offset);
+
+  // The MmapReplacement is invoked after the PreMmapHook but before
+  // the call is actually made. The MmapReplacement should return true
+  // if it handled the call, or false if it is still necessary to
+  // call mmap/mmap64.
+  // This should be used only by experts, and users must be
+  // extremely careful to avoid recursive calls to mmap. The replacement
+  // should be async signal safe.
+  // Only one MmapReplacement is supported. After setting an MmapReplacement
+  // you must call RemoveMmapReplacement before calling SetMmapReplacement
+  // again.
+  typedef MallocHook_MmapReplacement MmapReplacement;
+  inline static bool SetMmapReplacement(MmapReplacement hook) {
+    return MallocHook_SetMmapReplacement(hook);
+  }
+  inline static bool RemoveMmapReplacement(MmapReplacement hook) {
+    return MallocHook_RemoveMmapReplacement(hook);
+  }
+  inline static bool InvokeMmapReplacement(const void* start,
+                                           size_t size,
+                                           int protection,
+                                           int flags,
+                                           int fd,
+                                           off_t offset,
+                                           void** result);
+
+
+  // The MmapHook is invoked whenever a region of memory is mapped.
+  // It may be passed MAP_FAILED if the mmap failed.
+  typedef MallocHook_MmapHook MmapHook;
+  inline static bool AddMmapHook(MmapHook hook) {
+    return MallocHook_AddMmapHook(hook);
+  }
+  inline static bool RemoveMmapHook(MmapHook hook) {
+    return MallocHook_RemoveMmapHook(hook);
+  }
+  inline static void InvokeMmapHook(const void* result,
+                                    const void* start,
+                                    size_t size,
+                                    int protection,
+                                    int flags,
+                                    int fd,
+                                    off_t offset);
+
+  // The MunmapReplacement is invoked with munmap arguments just before
+  // the call is actually made. The MunmapReplacement should return true
+  // if it handled the call, or false if it is still necessary to
+  // call munmap.
+  // This should be used only by experts. The replacement should be
+  // async signal safe.
+  // Only one MunmapReplacement is supported. After setting an
+  // MunmapReplacement you must call RemoveMunmapReplacement before
+  // calling SetMunmapReplacement again.
+  typedef MallocHook_MunmapReplacement MunmapReplacement;
+  inline static bool SetMunmapReplacement(MunmapReplacement hook) {
+    return MallocHook_SetMunmapReplacement(hook);
+  }
+  inline static bool RemoveMunmapReplacement(MunmapReplacement hook) {
+    return MallocHook_RemoveMunmapReplacement(hook);
+  }
+  inline static bool InvokeMunmapReplacement(const void* p,
+                                             size_t size,
+                                             int* result);
+
+  // The MunmapHook is invoked whenever a region of memory is unmapped.
+  typedef MallocHook_MunmapHook MunmapHook;
+  inline static bool AddMunmapHook(MunmapHook hook) {
+    return MallocHook_AddMunmapHook(hook);
+  }
+  inline static bool RemoveMunmapHook(MunmapHook hook) {
+    return MallocHook_RemoveMunmapHook(hook);
+  }
+  inline static void InvokeMunmapHook(const void* p, size_t size);
+
+  // The MremapHook is invoked whenever a region of memory is remapped.
+  typedef MallocHook_MremapHook MremapHook;
+  inline static bool AddMremapHook(MremapHook hook) {
+    return MallocHook_AddMremapHook(hook);
+  }
+  inline static bool RemoveMremapHook(MremapHook hook) {
+    return MallocHook_RemoveMremapHook(hook);
+  }
+  inline static void InvokeMremapHook(const void* result,
+                                      const void* old_addr,
+                                      size_t old_size,
+                                      size_t new_size,
+                                      int flags,
+                                      const void* new_addr);
+
+  // The PreSbrkHook is invoked just before sbrk is called -- except when
+  // the increment is 0.  This is because sbrk(0) is often called
+  // to get the current top of the heap, and is not actually a
+  // memory-allocation call.  It may be useful in memory-limited contexts,
+  // to catch allocations that will exceed the limit and take outside
+  // actions to increase such a limit.
+  typedef MallocHook_PreSbrkHook PreSbrkHook;
+  inline static bool AddPreSbrkHook(PreSbrkHook hook) {
+    return MallocHook_AddPreSbrkHook(hook);
+  }
+  inline static bool RemovePreSbrkHook(PreSbrkHook hook) {
+    return MallocHook_RemovePreSbrkHook(hook);
+  }
+  inline static void InvokePreSbrkHook(ptrdiff_t increment);
+
+  // The SbrkHook is invoked whenever sbrk is called -- except when
+  // the increment is 0.  This is because sbrk(0) is often called
+  // to get the current top of the heap, and is not actually a
+  // memory-allocation call.
+  typedef MallocHook_SbrkHook SbrkHook;
+  inline static bool AddSbrkHook(SbrkHook hook) {
+    return MallocHook_AddSbrkHook(hook);
+  }
+  inline static bool RemoveSbrkHook(SbrkHook hook) {
+    return MallocHook_RemoveSbrkHook(hook);
+  }
+  inline static void InvokeSbrkHook(const void* result, ptrdiff_t increment);
+
+  // Get the current stack trace.  Try to skip all routines up to
+  // and including the caller of MallocHook::Invoke*.
+  // Use "skip_count" (similarly to GetStackTrace from stacktrace.h)
+  // as a hint about how many routines to skip if better information
+  // is not available.
+  inline static int GetCallerStackTrace(void** result, int max_depth,
+                                        int skip_count) {
+    return MallocHook_GetCallerStackTrace(result, max_depth, skip_count);
+  }
+
+  // Unhooked versions of mmap() and munmap().   These should be used
+  // only by experts, since they bypass heapchecking, etc.
+  // Note: These do not run hooks, but they still use the MmapReplacement
+  // and MunmapReplacement.
+  static void* UnhookedMMap(void *start, size_t length, int prot, int flags,
+                            int fd, off_t offset);
+  static int UnhookedMUnmap(void *start, size_t length);
+
+  // The following are DEPRECATED.
+  inline static NewHook GetNewHook();
+  inline static NewHook SetNewHook(NewHook hook) {
+    return MallocHook_SetNewHook(hook);
+  }
+
+  inline static DeleteHook GetDeleteHook();
+  inline static DeleteHook SetDeleteHook(DeleteHook hook) {
+    return MallocHook_SetDeleteHook(hook);
+  }
+
+  inline static PreMmapHook GetPreMmapHook();
+  inline static PreMmapHook SetPreMmapHook(PreMmapHook hook) {
+    return MallocHook_SetPreMmapHook(hook);
+  }
+
+  inline static MmapHook GetMmapHook();
+  inline static MmapHook SetMmapHook(MmapHook hook) {
+    return MallocHook_SetMmapHook(hook);
+  }
+
+  inline static MunmapHook GetMunmapHook();
+  inline static MunmapHook SetMunmapHook(MunmapHook hook) {
+    return MallocHook_SetMunmapHook(hook);
+  }
+
+  inline static MremapHook GetMremapHook();
+  inline static MremapHook SetMremapHook(MremapHook hook) {
+    return MallocHook_SetMremapHook(hook);
+  }
+
+  inline static PreSbrkHook GetPreSbrkHook();
+  inline static PreSbrkHook SetPreSbrkHook(PreSbrkHook hook) {
+    return MallocHook_SetPreSbrkHook(hook);
+  }
+
+  inline static SbrkHook GetSbrkHook();
+  inline static SbrkHook SetSbrkHook(SbrkHook hook) {
+    return MallocHook_SetSbrkHook(hook);
+  }
+  // End of DEPRECATED methods.
+
+ private:
+  // Slow path versions of Invoke*Hook.
+  static void InvokeNewHookSlow(const void* p, size_t s);
+  static void InvokeDeleteHookSlow(const void* p);
+  static void InvokePreMmapHookSlow(const void* start,
+                                    size_t size,
+                                    int protection,
+                                    int flags,
+                                    int fd,
+                                    off_t offset);
+  static void InvokeMmapHookSlow(const void* result,
+                                 const void* start,
+                                 size_t size,
+                                 int protection,
+                                 int flags,
+                                 int fd,
+                                 off_t offset);
+  static bool InvokeMmapReplacementSlow(const void* start,
+                                        size_t size,
+                                        int protection,
+                                        int flags,
+                                        int fd,
+                                        off_t offset,
+                                        void** result);
+  static void InvokeMunmapHookSlow(const void* p, size_t size);
+  static bool InvokeMunmapReplacementSlow(const void* p,
+                                          size_t size,
+                                          int* result);
+  static void InvokeMremapHookSlow(const void* result,
+                                   const void* old_addr,
+                                   size_t old_size,
+                                   size_t new_size,
+                                   int flags,
+                                   const void* new_addr);
+  static void InvokePreSbrkHookSlow(ptrdiff_t increment);
+  static void InvokeSbrkHookSlow(const void* result, ptrdiff_t increment);
+};
+
+#ifdef _MSC_VER
+#pragma warning(pop)
+#endif
+
+
+#endif /* _MALLOC_HOOK_H_ */
diff --git a/src/gperftools/malloc_hook_c.h b/src/gperftools/malloc_hook_c.h
new file mode 100644
index 0000000..56337e1
--- /dev/null
+++ b/src/gperftools/malloc_hook_c.h
@@ -0,0 +1,173 @@
+/* Copyright (c) 2008, Google Inc.
+ * All rights reserved.
+ * 
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are
+ * met:
+ * 
+ *     * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following disclaimer
+ * in the documentation and/or other materials provided with the
+ * distribution.
+ *     * Neither the name of Google Inc. nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ * 
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * --
+ * Author: Craig Silverstein
+ *
+ * C shims for the C++ malloc_hook.h.  See malloc_hook.h for details
+ * on how to use these.
+ */
+
+#ifndef _MALLOC_HOOK_C_H_
+#define _MALLOC_HOOK_C_H_
+
+#include <stddef.h>
+#include <sys/types.h>
+
+/* Annoying stuff for windows; makes sure clients can import these functions */
+#ifndef PERFTOOLS_DLL_DECL
+# ifdef _WIN32
+#   define PERFTOOLS_DLL_DECL  __declspec(dllimport)
+# else
+#   define PERFTOOLS_DLL_DECL
+# endif
+#endif
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/* Get the current stack trace.  Try to skip all routines up to
+ * and including the caller of MallocHook::Invoke*.
+ * Use "skip_count" (similarly to GetStackTrace from stacktrace.h)
+ * as a hint about how many routines to skip if better information
+ * is not available.
+ */
+PERFTOOLS_DLL_DECL
+int MallocHook_GetCallerStackTrace(void** result, int max_depth,
+                                   int skip_count);
+
+/* The MallocHook_{Add,Remove}*Hook functions return 1 on success and 0 on
+ * failure.
+ */
+
+typedef void (*MallocHook_NewHook)(const void* ptr, size_t size);
+PERFTOOLS_DLL_DECL
+int MallocHook_AddNewHook(MallocHook_NewHook hook);
+PERFTOOLS_DLL_DECL
+int MallocHook_RemoveNewHook(MallocHook_NewHook hook);
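+
+/* Example (illustrative sketch, usable from plain C): LogNew is a
+ * placeholder name; hooks must be thread-safe.
+ *
+ *   static void LogNew(const void* ptr, size_t size) { ... }
+ *   ...
+ *   MallocHook_AddNewHook(&LogNew);
+ *   ...
+ *   MallocHook_RemoveNewHook(&LogNew);
+ */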
+
+typedef void (*MallocHook_DeleteHook)(const void* ptr);
+PERFTOOLS_DLL_DECL
+int MallocHook_AddDeleteHook(MallocHook_DeleteHook hook);
+PERFTOOLS_DLL_DECL
+int MallocHook_RemoveDeleteHook(MallocHook_DeleteHook hook);
+
+typedef void (*MallocHook_PreMmapHook)(const void *start,
+                                       size_t size,
+                                       int protection,
+                                       int flags,
+                                       int fd,
+                                       off_t offset);
+PERFTOOLS_DLL_DECL
+int MallocHook_AddPreMmapHook(MallocHook_PreMmapHook hook);
+PERFTOOLS_DLL_DECL
+int MallocHook_RemovePreMmapHook(MallocHook_PreMmapHook hook);
+
+typedef void (*MallocHook_MmapHook)(const void* result,
+                                    const void* start,
+                                    size_t size,
+                                    int protection,
+                                    int flags,
+                                    int fd,
+                                    off_t offset);
+PERFTOOLS_DLL_DECL
+int MallocHook_AddMmapHook(MallocHook_MmapHook hook);
+PERFTOOLS_DLL_DECL
+int MallocHook_RemoveMmapHook(MallocHook_MmapHook hook);
+
+typedef int (*MallocHook_MmapReplacement)(const void* start,
+                                          size_t size,
+                                          int protection,
+                                          int flags,
+                                          int fd,
+                                          off_t offset,
+                                          void** result);
+int MallocHook_SetMmapReplacement(MallocHook_MmapReplacement hook);
+int MallocHook_RemoveMmapReplacement(MallocHook_MmapReplacement hook);
+
+typedef void (*MallocHook_MunmapHook)(const void* ptr, size_t size);
+PERFTOOLS_DLL_DECL
+int MallocHook_AddMunmapHook(MallocHook_MunmapHook hook);
+PERFTOOLS_DLL_DECL
+int MallocHook_RemoveMunmapHook(MallocHook_MunmapHook hook);
+
+typedef int (*MallocHook_MunmapReplacement)(const void* ptr,
+                                            size_t size,
+                                            int* result);
+int MallocHook_SetMunmapReplacement(MallocHook_MunmapReplacement hook);
+int MallocHook_RemoveMunmapReplacement(MallocHook_MunmapReplacement hook);
+
+typedef void (*MallocHook_MremapHook)(const void* result,
+                                      const void* old_addr,
+                                      size_t old_size,
+                                      size_t new_size,
+                                      int flags,
+                                      const void* new_addr);
+PERFTOOLS_DLL_DECL
+int MallocHook_AddMremapHook(MallocHook_MremapHook hook);
+PERFTOOLS_DLL_DECL
+int MallocHook_RemoveMremapHook(MallocHook_MremapHook hook);
+
+typedef void (*MallocHook_PreSbrkHook)(ptrdiff_t increment);
+PERFTOOLS_DLL_DECL
+int MallocHook_AddPreSbrkHook(MallocHook_PreSbrkHook hook);
+PERFTOOLS_DLL_DECL
+int MallocHook_RemovePreSbrkHook(MallocHook_PreSbrkHook hook);
+
+typedef void (*MallocHook_SbrkHook)(const void* result, ptrdiff_t increment);
+PERFTOOLS_DLL_DECL
+int MallocHook_AddSbrkHook(MallocHook_SbrkHook hook);
+PERFTOOLS_DLL_DECL
+int MallocHook_RemoveSbrkHook(MallocHook_SbrkHook hook);
+
+/* The following are DEPRECATED. */
+PERFTOOLS_DLL_DECL
+MallocHook_NewHook MallocHook_SetNewHook(MallocHook_NewHook hook);
+PERFTOOLS_DLL_DECL
+MallocHook_DeleteHook MallocHook_SetDeleteHook(MallocHook_DeleteHook hook);
+PERFTOOLS_DLL_DECL
+MallocHook_PreMmapHook MallocHook_SetPreMmapHook(MallocHook_PreMmapHook hook);
+PERFTOOLS_DLL_DECL
+MallocHook_MmapHook MallocHook_SetMmapHook(MallocHook_MmapHook hook);
+PERFTOOLS_DLL_DECL
+MallocHook_MunmapHook MallocHook_SetMunmapHook(MallocHook_MunmapHook hook);
+PERFTOOLS_DLL_DECL
+MallocHook_MremapHook MallocHook_SetMremapHook(MallocHook_MremapHook hook);
+PERFTOOLS_DLL_DECL
+MallocHook_PreSbrkHook MallocHook_SetPreSbrkHook(MallocHook_PreSbrkHook hook);
+PERFTOOLS_DLL_DECL
+MallocHook_SbrkHook MallocHook_SetSbrkHook(MallocHook_SbrkHook hook);
+/* End of DEPRECATED functions. */
+
+#ifdef __cplusplus
+}   // extern "C"
+#endif
+
+#endif /* _MALLOC_HOOK_C_H_ */
diff --git a/src/gperftools/profiler.h b/src/gperftools/profiler.h
new file mode 100644
index 0000000..2d272d6
--- /dev/null
+++ b/src/gperftools/profiler.h
@@ -0,0 +1,169 @@
+// -*- Mode: C++; c-basic-offset: 2; indent-tabs-mode: nil -*-
+/* Copyright (c) 2005, Google Inc.
+ * All rights reserved.
+ * 
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are
+ * met:
+ * 
+ *     * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following disclaimer
+ * in the documentation and/or other materials provided with the
+ * distribution.
+ *     * Neither the name of Google Inc. nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ * 
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * ---
+ * Author: Sanjay Ghemawat
+ *
+ * Module for CPU profiling based on periodic pc-sampling.
+ *
+ * For full(er) information, see doc/cpuprofile.html
+ *
+ * This module is linked into your program and causes no slowdown
+ * unless you activate the profiler using one of the following methods:
+ *
+ *    1. Before starting the program, set the environment variable
+ *       "CPUPROFILE" to be the name of the file to which the profile
+ *       data should be written.
+ *
+ *    2. Programmatically, start and stop the profiler using the
+ *       routines "ProfilerStart(filename)" and "ProfilerStop()".
+ *
+ *
+ * (Note: if using linux 2.4 or earlier, only the main thread may be
+ * profiled.)
+ *
+ * Use pprof to view the resulting profile output.
+ *    % pprof <path_to_executable> <profile_file_name>
+ *    % pprof --gv  <path_to_executable> <profile_file_name>
+ *
+ * These functions are thread-safe.
+ */
+
+#ifndef BASE_PROFILER_H_
+#define BASE_PROFILER_H_
+
+#include <time.h>       /* For time_t */
+
+/* Annoying stuff for windows; makes sure clients can import these functions */
+#ifndef PERFTOOLS_DLL_DECL
+# ifdef _WIN32
+#   define PERFTOOLS_DLL_DECL  __declspec(dllimport)
+# else
+#   define PERFTOOLS_DLL_DECL
+# endif
+#endif
+
+/* All this code should be usable from within C apps. */
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/* Profiler options, for use with ProfilerStartWithOptions.  To use:
+ *
+ *   struct ProfilerOptions options;
+ *   memset(&options, 0, sizeof options);
+ *
+ * then fill in fields as needed.
+ *
+ * This structure is intended to be usable from C code, so no constructor
+ * is provided to initialize it.  (Use memset as described above).
+ */
+struct ProfilerOptions {
+  /* Filter function and argument.
+   *
+   * If filter_in_thread is not NULL, when a profiling tick is delivered
+   * the profiler will call:
+   *
+   *   (*filter_in_thread)(filter_in_thread_arg)
+   *
+   * If it returns nonzero, the sample will be included in the profile.
+   * Note that filter_in_thread runs in a signal handler, so must be
+   * async-signal-safe.
+   *
+   * A typical use would be to set up filter results for each thread
+   * in the system before starting the profiler, then to make
+   * filter_in_thread be a very simple function which retrieves those
+   * results in an async-signal-safe way.  Retrieval could be done
+   * using thread-specific data, or using a shared data structure that
+   * supports async-signal-safe lookups.
+   */
+  int (*filter_in_thread)(void *arg);
+  void *filter_in_thread_arg;
+};
+
+/* Start profiling and write profile info into fname, discarding any
+ * existing profiling data in that file.
+ *
+ * This is equivalent to calling ProfilerStartWithOptions(fname, NULL).
+ */
+PERFTOOLS_DLL_DECL int ProfilerStart(const char* fname);
+
+/* Start profiling and write profile into fname, discarding any
+ * existing profiling data in that file.
+ *
+ * The profiler is configured using the options given by 'options'.
+ * Options which are not specified are given default values.
+ *
+ * 'options' may be NULL, in which case all options are given default values.
+ *
+ * Returns nonzero if profiling was started successfully, or zero otherwise.
+ */
+PERFTOOLS_DLL_DECL int ProfilerStartWithOptions(
+    const char *fname, const struct ProfilerOptions *options);
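+
+/* Example (illustrative sketch): profile only threads that opt in via a
+ * thread-local flag.  The names profile_this_thread and ShouldProfile are
+ * placeholders, __thread assumes a gcc-compatible compiler, and memset
+ * needs <string.h>.
+ *
+ *   static __thread int profile_this_thread;
+ *   static int ShouldProfile(void* arg) { return profile_this_thread; }
+ *   ...
+ *   struct ProfilerOptions options;
+ *   memset(&options, 0, sizeof options);
+ *   options.filter_in_thread = &ShouldProfile;
+ *   ProfilerStartWithOptions("myprog.prof", &options);
+ *   ... run the workload ...
+ *   ProfilerStop();
+ */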
+
+/* Stop profiling. Can be started again with ProfilerStart(), but
+ * the currently accumulated profiling data will be cleared.
+ */
+PERFTOOLS_DLL_DECL void ProfilerStop(void);
+
+/* Flush any currently buffered profiling state to the profile file.
+ * Has no effect if the profiler has not been started.
+ */
+PERFTOOLS_DLL_DECL void ProfilerFlush(void);
+
+
+/* DEPRECATED: these functions were used to enable/disable profiling
+ * in the current thread, but no longer do anything.
+ */
+PERFTOOLS_DLL_DECL void ProfilerEnable(void);
+PERFTOOLS_DLL_DECL void ProfilerDisable(void);
+
+/* Returns nonzero if profiling is currently enabled, zero if it's not. */
+PERFTOOLS_DLL_DECL int ProfilingIsEnabledForAllThreads(void);
+
+/* Routine for registering new threads with the profiler.
+ */
+PERFTOOLS_DLL_DECL void ProfilerRegisterThread(void);
+
+/* Stores state about profiler's current status into "*state". */
+struct ProfilerState {
+  int    enabled;             /* Is profiling currently enabled? */
+  time_t start_time;          /* If enabled, when was profiling started? */
+  char   profile_name[1024];  /* Name of profile file being written, or '\0' */
+  int    samples_gathered;    /* Number of samples gathered so far (or 0) */
+};
+PERFTOOLS_DLL_DECL void ProfilerGetCurrentState(struct ProfilerState* state);
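+
+/* Example (illustrative sketch; printf needs <stdio.h>):
+ *
+ *   struct ProfilerState state;
+ *   ProfilerGetCurrentState(&state);
+ *   if (state.enabled) {
+ *     printf("profiling to %s, %d samples so far\n",
+ *            state.profile_name, state.samples_gathered);
+ *   }
+ */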
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  /* BASE_PROFILER_H_ */
diff --git a/src/gperftools/stacktrace.h b/src/gperftools/stacktrace.h
new file mode 100644
index 0000000..2b9c5a1
--- /dev/null
+++ b/src/gperftools/stacktrace.h
@@ -0,0 +1,117 @@
+// -*- Mode: C++; c-basic-offset: 2; indent-tabs-mode: nil -*-
+// Copyright (c) 2005, Google Inc.
+// All rights reserved.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+// 
+//     * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+//     * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+// 
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+// ---
+// Author: Sanjay Ghemawat
+//
+// Routines to extract the current stack trace.  These functions are
+// thread-safe.
+
+#ifndef GOOGLE_STACKTRACE_H_
+#define GOOGLE_STACKTRACE_H_
+
+// Annoying stuff for windows -- makes sure clients can import these functions
+#ifndef PERFTOOLS_DLL_DECL
+# ifdef _WIN32
+#   define PERFTOOLS_DLL_DECL  __declspec(dllimport)
+# else
+#   define PERFTOOLS_DLL_DECL
+# endif
+#endif
+
+
+// Skips the most recent "skip_count" stack frames (also skips the
+// frame generated for the "GetStackFrames" routine itself), and then
+// records the pc values for up to the next "max_depth" frames in
+// "result", and the corresponding stack frame sizes in "sizes".
+// Returns the number of values recorded in "result"/"sizes".
+//
+// Example:
+//      main() { foo(); }
+//      foo() { bar(); }
+//      bar() {
+//        void* result[10];
+//        int sizes[10];
+//        int depth = GetStackFrames(result, sizes, 10, 1);
+//      }
+//
+// The GetStackFrames call will skip the frame for "bar".  It will
+// return 2 and will produce pc values that map to the following
+// procedures:
+//      result[0]       foo
+//      result[1]       main
+// (Actually, there may be a few more entries after "main" to account for
+// startup procedures.)
+// And corresponding stack frame sizes will also be recorded:
+//    sizes[0]       16
+//    sizes[1]       16
+// (Stack frame sizes of 16 above are just for illustration purposes.)
+// Stack frame sizes of 0 or less indicate that those frame sizes couldn't
+// be identified.
+//
+// This routine may return fewer stack frame entries than are
+// available. Also note that "result" and "sizes" must both be non-NULL.
+extern PERFTOOLS_DLL_DECL int GetStackFrames(void** result, int* sizes, int max_depth,
+                          int skip_count);
+
+// Same as above, but to be used from a signal handler. The "uc" parameter
+// should be a pointer to the ucontext_t that was passed as the 3rd parameter
+// to an sa_sigaction signal handler. It may help the unwinder to get a
+// better stack trace under certain conditions. The "uc" may safely be NULL.
+extern PERFTOOLS_DLL_DECL int GetStackFramesWithContext(void** result, int* sizes, int max_depth,
+                                     int skip_count, const void *uc);
+
+// This is similar to the GetStackFrames routine, except that it returns
+// the stack trace only, and not the stack frame sizes as well.
+// Example:
+//      main() { foo(); }
+//      foo() { bar(); }
+//      bar() {
+//        void* result[10];
+//        int depth = GetStackTrace(result, 10, 1);
+//      }
+//
+// This produces:
+//      result[0]       foo
+//      result[1]       main
+//           ....       ...
+//
+// "result" must not be NULL.
+extern PERFTOOLS_DLL_DECL int GetStackTrace(void** result, int max_depth,
+                                            int skip_count);
+
+// Same as above, but to be used from a signal handler. The "uc" parameter
+// should be a pointer to the ucontext_t that was passed as the 3rd parameter
+// to an sa_sigaction signal handler. It may help the unwinder to get a
+// better stack trace under certain conditions. The "uc" may safely be NULL.
+extern PERFTOOLS_DLL_DECL int GetStackTraceWithContext(void** result, int max_depth,
+                                    int skip_count, const void *uc);
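+
+// Example (illustrative sketch): capturing a trace from inside a
+// sa_sigaction handler.  MySignalHandler is a placeholder name and
+// siginfo_t needs <signal.h>.
+//
+//   static void MySignalHandler(int signum, siginfo_t* info, void* uc) {
+//     void* trace[32];
+//     int depth = GetStackTraceWithContext(trace, 32, 0, uc);
+//     // trace[0 .. depth-1] now hold the PCs active when the signal hit.
+//   }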
+
+#endif /* GOOGLE_STACKTRACE_H_ */
diff --git a/src/gperftools/tcmalloc.h.in b/src/gperftools/tcmalloc.h.in
new file mode 100644
index 0000000..d43184d
--- /dev/null
+++ b/src/gperftools/tcmalloc.h.in
@@ -0,0 +1,135 @@
+// -*- Mode: C++; c-basic-offset: 2; indent-tabs-mode: nil -*-
+/* Copyright (c) 2003, Google Inc.
+ * All rights reserved.
+ * 
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are
+ * met:
+ * 
+ *     * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following disclaimer
+ * in the documentation and/or other materials provided with the
+ * distribution.
+ *     * Neither the name of Google Inc. nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ * 
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * ---
+ * Author: Sanjay Ghemawat <opensource@google.com>
+ *         .h file by Craig Silverstein <opensource@google.com>
+ */
+
+#ifndef TCMALLOC_TCMALLOC_H_
+#define TCMALLOC_TCMALLOC_H_
+
+#include <stddef.h>                     // for size_t
+#ifdef HAVE_SYS_CDEFS_H
+#include <sys/cdefs.h>   // where glibc defines __THROW
+#endif
+
+// __THROW is defined in glibc systems.  It means, counter-intuitively,
+// "This function will never throw an exception."  It's an optional
+// optimization tool, but we may need to use it to match glibc prototypes.
+#ifndef __THROW    /* I guess we're not on a glibc system */
+# define __THROW   /* __THROW is just an optimization, so ok to make it "" */
+#endif
+
+// Define the version number so folks can check against it
+#define TC_VERSION_MAJOR  @TC_VERSION_MAJOR@
+#define TC_VERSION_MINOR  @TC_VERSION_MINOR@
+#define TC_VERSION_PATCH  "@TC_VERSION_PATCH@"
+#define TC_VERSION_STRING "gperftools @TC_VERSION_MAJOR@.@TC_VERSION_MINOR@@TC_VERSION_PATCH@"
+
+// For struct mallinfo, if it's defined.
+#ifdef HAVE_STRUCT_MALLINFO
+// Malloc can be in several places on older versions of OS X.
+# if defined(HAVE_MALLOC_H)
+# include <malloc.h>
+# elif defined(HAVE_SYS_MALLOC_H)
+# include <sys/malloc.h>
+# elif defined(HAVE_MALLOC_MALLOC_H)
+# include <malloc/malloc.h>
+# endif
+#endif
+
+// Annoying stuff for windows -- makes sure clients can import these functions
+#ifndef PERFTOOLS_DLL_DECL
+# ifdef _WIN32
+#   define PERFTOOLS_DLL_DECL  __declspec(dllimport)
+# else
+#   define PERFTOOLS_DLL_DECL
+# endif
+#endif
+
+#ifdef __cplusplus
+namespace std {
+struct nothrow_t;
+}
+
+extern "C" {
+#endif
+  // Returns a human-readable version string.  If major, minor,
+  // and/or patch are not NULL, they are set to the major version,
+  // minor version, and patch-code (a string, usually "").
+  PERFTOOLS_DLL_DECL const char* tc_version(int* major, int* minor,
+                                            const char** patch) __THROW;
+
+  PERFTOOLS_DLL_DECL void* tc_malloc(size_t size) __THROW;
+  PERFTOOLS_DLL_DECL void* tc_malloc_skip_new_handler(size_t size) __THROW;
+  PERFTOOLS_DLL_DECL void tc_free(void* ptr) __THROW;
+  PERFTOOLS_DLL_DECL void* tc_realloc(void* ptr, size_t size) __THROW;
+  PERFTOOLS_DLL_DECL void* tc_calloc(size_t nmemb, size_t size) __THROW;
+  PERFTOOLS_DLL_DECL void tc_cfree(void* ptr) __THROW;
+
+  PERFTOOLS_DLL_DECL void* tc_memalign(size_t __alignment,
+                                       size_t __size) __THROW;
+  PERFTOOLS_DLL_DECL int tc_posix_memalign(void** ptr,
+                                           size_t align, size_t size) __THROW;
+  PERFTOOLS_DLL_DECL void* tc_valloc(size_t __size) __THROW;
+  PERFTOOLS_DLL_DECL void* tc_pvalloc(size_t __size) __THROW;
+
+  PERFTOOLS_DLL_DECL void tc_malloc_stats(void) __THROW;
+  PERFTOOLS_DLL_DECL int tc_mallopt(int cmd, int value) __THROW;
+#if @ac_cv_have_struct_mallinfo@
+  PERFTOOLS_DLL_DECL struct mallinfo tc_mallinfo(void) __THROW;
+#endif
+
+  // This is an alias for MallocExtension::instance()->GetAllocatedSize().
+  // It is equivalent to
+  //    OS X: malloc_size()
+  //    glibc: malloc_usable_size()
+  //    Windows: _msize()
+  PERFTOOLS_DLL_DECL size_t tc_malloc_size(void* ptr) __THROW;
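+
+  // Example (illustrative sketch):
+  //
+  //   void* p = tc_malloc(100);
+  //   size_t usable = tc_malloc_size(p);  // at least 100
+  //   tc_free(p);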
+
+#ifdef __cplusplus
+  PERFTOOLS_DLL_DECL int tc_set_new_mode(int flag) __THROW;
+  PERFTOOLS_DLL_DECL void* tc_new(size_t size);
+  PERFTOOLS_DLL_DECL void* tc_new_nothrow(size_t size,
+                                          const std::nothrow_t&) __THROW;
+  PERFTOOLS_DLL_DECL void tc_delete(void* p) __THROW;
+  PERFTOOLS_DLL_DECL void tc_delete_nothrow(void* p,
+                                            const std::nothrow_t&) __THROW;
+  PERFTOOLS_DLL_DECL void* tc_newarray(size_t size);
+  PERFTOOLS_DLL_DECL void* tc_newarray_nothrow(size_t size,
+                                               const std::nothrow_t&) __THROW;
+  PERFTOOLS_DLL_DECL void tc_deletearray(void* p) __THROW;
+  PERFTOOLS_DLL_DECL void tc_deletearray_nothrow(void* p,
+                                                 const std::nothrow_t&) __THROW;
+}
+#endif
+
+#endif  // #ifndef TCMALLOC_TCMALLOC_H_
diff --git a/src/heap-checker-bcad.cc b/src/heap-checker-bcad.cc
new file mode 100644
index 0000000..00efdb7
--- /dev/null
+++ b/src/heap-checker-bcad.cc
@@ -0,0 +1,93 @@
+// -*- Mode: C++; c-basic-offset: 2; indent-tabs-mode: nil -*-
+// Copyright (c) 2005, Google Inc.
+// All rights reserved.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+// 
+//     * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+//     * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+// 
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+// ---
+// All Rights Reserved.
+//
+// Author: Maxim Lifantsev
+//
+// A file to ensure that components of heap leak checker run before
+// all global object constructors and after all global object
+// destructors.
+//
+// This file must be the last library any binary links against.
+// Otherwise, the heap checker may not be able to run early enough to
+// catalog all the global objects in your program.  If this happens,
+// and later in the program you allocate memory and have one of these
+// "uncataloged" global objects point to it, the heap checker will
+// consider that allocation to be a leak, even though it's not (since
+// the allocated object is reachable from global data and hence "live").
+
+#include <stdlib.h>      // for abort()
+#include <gperftools/malloc_extension.h>
+
+// A dummy variable to refer from heap-checker.cc.  This is to make
+// sure this file is not optimized out by the linker.
+bool heap_leak_checker_bcad_variable;
+
+extern void HeapLeakChecker_AfterDestructors();  // in heap-checker.cc
+
+// A helper class to ensure that some components of heap leak checking
+// can happen before construction and after destruction
+// of all global/static objects.
+class HeapLeakCheckerGlobalPrePost {
+ public:
+  HeapLeakCheckerGlobalPrePost() {
+    if (count_ == 0) {
+      // The 'new int' will ensure that we have run an initial malloc
+      // hook, which will set up the heap checker via
+      // MallocHook_InitAtFirstAllocation_HeapLeakChecker.  See malloc_hook.cc.
+      // This is done in this roundabout fashion in order to avoid self-deadlock
+      // if we directly called HeapLeakChecker_BeforeConstructors here.
+      delete new int;
+      // This needs to be called before the first allocation of an STL
+      // object, but after libc is done setting up threads (because it
+      // calls setenv, which requires a thread-aware errno).  By
+      // putting it here, we hope it's the first bit of code executed
+      // after the libc global-constructor code.
+      MallocExtension::Initialize();
+    }
+    ++count_;
+  }
+  ~HeapLeakCheckerGlobalPrePost() {
+    if (count_ <= 0)  abort();
+    --count_;
+    if (count_ == 0)  HeapLeakChecker_AfterDestructors();
+  }
+ private:
+  // Counter of constructions/destructions of objects of this class
+  // (just in case there is more than one of them).
+  static int count_;
+};
+
+int HeapLeakCheckerGlobalPrePost::count_ = 0;
+
+// The early-construction/late-destruction global object.
+static const HeapLeakCheckerGlobalPrePost heap_leak_checker_global_pre_post;
diff --git a/src/heap-checker.cc b/src/heap-checker.cc
new file mode 100755
index 0000000..9c82dea
--- /dev/null
+++ b/src/heap-checker.cc
@@ -0,0 +1,2388 @@
+// -*- Mode: C++; c-basic-offset: 2; indent-tabs-mode: nil -*-
+// Copyright (c) 2005, Google Inc.
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+//     * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+//     * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+// ---
+// All Rights Reserved.
+//
+// Author: Maxim Lifantsev
+//
+
+#include "config.h"
+
+#include <fcntl.h>    // for O_RDONLY (we use syscall to do actual reads)
+#include <string.h>
+#include <errno.h>
+#ifdef HAVE_UNISTD_H
+#include <unistd.h>
+#endif
+#ifdef HAVE_MMAP
+#include <sys/mman.h>
+#endif
+#ifdef HAVE_PTHREAD
+#include <pthread.h>
+#endif
+#include <sys/stat.h>
+#include <sys/types.h>
+#include <time.h>
+#include <assert.h>
+
+#if defined(HAVE_LINUX_PTRACE_H)
+#include <linux/ptrace.h>
+#endif
+#ifdef HAVE_SYS_SYSCALL_H
+#include <sys/syscall.h>
+#endif
+#if defined(_WIN32) || defined(__CYGWIN__) || defined(__CYGWIN32__) || defined(__MINGW32__)
+#include <wtypes.h>
+#include <winbase.h>
+#undef ERROR     // windows defines these as macros, which can cause trouble
+#undef max
+#undef min
+#endif
+
+#include <string>
+#include <vector>
+#include <map>
+#include <set>
+#include <algorithm>
+#include <functional>
+
+#include <gperftools/heap-checker.h>
+
+#include "base/basictypes.h"
+#include "base/googleinit.h"
+#include "base/logging.h"
+#include <gperftools/stacktrace.h>
+#include "base/commandlineflags.h"
+#include "base/elfcore.h"              // for i386_regs
+#include "base/thread_lister.h"
+#include "heap-profile-table.h"
+#include "base/low_level_alloc.h"
+#include "malloc_hook-inl.h"
+#include <gperftools/malloc_hook.h>
+#include <gperftools/malloc_extension.h>
+#include "maybe_threads.h"
+#include "memory_region_map.h"
+#include "base/spinlock.h"
+#include "base/sysinfo.h"
+#include "base/stl_allocator.h"
+
+using std::string;
+using std::basic_string;
+using std::pair;
+using std::map;
+using std::set;
+using std::vector;
+using std::swap;
+using std::make_pair;
+using std::min;
+using std::max;
+using std::less;
+using std::char_traits;
+
+// If current process is being ptrace()d, 'TracerPid' in /proc/self/status
+// will be non-zero.
+static bool IsDebuggerAttached(void) {    // only works under linux, probably
+  char buf[256];   // TracerPid comes relatively earlier in status output
+  int fd = open("/proc/self/status", O_RDONLY);
+  if (fd == -1) {
+    return false;  // Can't tell for sure.
+  }
+  const int len = read(fd, buf, sizeof(buf));
+  bool rc = false;
+  if (len > 0) {
+    const char *const kTracerPid = "TracerPid:\t";
+    buf[len - 1] = '\0';
+    const char *p = strstr(buf, kTracerPid);
+    if (p != NULL) {
+      rc = (strncmp(p + strlen(kTracerPid), "0\n", 2) != 0);
+    }
+  }
+  close(fd);
+  return rc;
+}
+
+// This is the default if you don't link in -lprofiler
+extern "C" {
+ATTRIBUTE_WEAK PERFTOOLS_DLL_DECL bool ProfilingIsEnabledForAllThreads();
+bool ProfilingIsEnabledForAllThreads() { return false; }
+}
+
+//----------------------------------------------------------------------
+// Flags that control heap-checking
+//----------------------------------------------------------------------
+
+DEFINE_string(heap_check,
+              EnvToString("HEAPCHECK", ""),
+              "The heap leak checking to be done over the whole executable: "
+              "\"minimal\", \"normal\", \"strict\", "
+              "\"draconian\", \"as-is\", \"local\", "
+              "or the empty string are the supported choices. "
+              "(See HeapLeakChecker_InternalInitStart for details.)");
+
+DEFINE_bool(heap_check_report, true, "Obsolete");
+
+DEFINE_bool(heap_check_before_constructors,
+            true,
+            "deprecated; pretty much always true now");
+
+DEFINE_bool(heap_check_after_destructors,
+            EnvToBool("HEAP_CHECK_AFTER_DESTRUCTORS", false),
+            "If overall heap check is to end after global destructors "
+            "or right after all REGISTER_HEAPCHECK_CLEANUP's");
+
+DEFINE_bool(heap_check_strict_check, true, "Obsolete");
+
+DEFINE_bool(heap_check_ignore_global_live,
+            EnvToBool("HEAP_CHECK_IGNORE_GLOBAL_LIVE", true),
+            "If overall heap check is to ignore heap objects reachable "
+            "from the global data");
+
+DEFINE_bool(heap_check_identify_leaks,
+            EnvToBool("HEAP_CHECK_IDENTIFY_LEAKS", false),
+            "If heap check should generate the addresses of the leaked "
+            "objects in the memory leak profiles.  This may be useful "
+            "in tracking down leaks where only a small fraction of "
+            "objects allocated at the same stack trace are leaked.");
+
+DEFINE_bool(heap_check_ignore_thread_live,
+            EnvToBool("HEAP_CHECK_IGNORE_THREAD_LIVE", true),
+            "If set to true, objects reachable from thread stacks "
+            "and registers are not reported as leaks");
+
+DEFINE_bool(heap_check_test_pointer_alignment,
+            EnvToBool("HEAP_CHECK_TEST_POINTER_ALIGNMENT", false),
+            "Set to true to check if the found leak can be due to "
+            "use of unaligned pointers");
+
+// Alignment at which all pointers in memory are supposed to be located;
+// use 1 if any alignment is ok.
+// The heap_check_test_pointer_alignment flag controls whether we also try
+// the value of 1. The larger the alignment, the smaller the chance of
+// missing real leaks.
+static const size_t kPointerSourceAlignment = sizeof(void*);
+DEFINE_int32(heap_check_pointer_source_alignment,
+	     EnvToInt("HEAP_CHECK_POINTER_SOURCE_ALIGNMENT",
+                      kPointerSourceAlignment),
+             "Alignment at which all pointers in memory are supposed to be "
+             "located.  Use 1 if any alignment is ok.");
+
+// A reasonable default to handle pointers inside of typical class objects:
+// Too low and we won't be able to traverse pointers to normally-used
+// nested objects and base parts of multiple-inherited objects.
+// Too high and it will both slow down leak checking (FindInsideAlloc
+// in HaveOnHeapLocked will get slower when there are large on-heap objects)
+// and make it probabilistically more likely to miss leaks
+// of large-sized objects.
+static const int64 kHeapCheckMaxPointerOffset = 1024;
+DEFINE_int64(heap_check_max_pointer_offset,
+	     EnvToInt("HEAP_CHECK_MAX_POINTER_OFFSET",
+                      kHeapCheckMaxPointerOffset),
+             "Largest pointer offset for which we traverse "
+             "pointers going inside of heap allocated objects. "
+             "Set to -1 to use the actual largest heap object size.");
+
+DEFINE_bool(heap_check_run_under_gdb,
+            EnvToBool("HEAP_CHECK_RUN_UNDER_GDB", false),
+            "If false, turns off heap-checking library when running under gdb "
+            "(normally, set to 'true' only when debugging the heap-checker)");
+
+DEFINE_int32(heap_check_delay_seconds, 0,
+             "Number of seconds to delay on-exit heap checking."
+             " If you set this flag,"
+             " you may also want to set exit_timeout_seconds in order to"
+             " avoid exit timeouts.\n"
+             "NOTE: This flag is to be used only to help diagnose issues"
+             " where it is suspected that the heap checker is reporting"
+             " false leaks that will disappear if the heap checker delays"
+             " its checks. Report any such issues to the heap-checker"
+             " maintainer(s).");
+
+//----------------------------------------------------------------------
+
+DEFINE_string(heap_profile_pprof,
+              EnvToString("PPROF_PATH", "pprof"),
+              "OBSOLETE; not used");
+
+DEFINE_string(heap_check_dump_directory,
+              EnvToString("HEAP_CHECK_DUMP_DIRECTORY", "/tmp"),
+              "Directory to put heap-checker leak dump information");
+
+
+//----------------------------------------------------------------------
+// HeapLeakChecker global data
+//----------------------------------------------------------------------
+
+// Global lock for all the global data of this module.
+static SpinLock heap_checker_lock(SpinLock::LINKER_INITIALIZED);
+
+//----------------------------------------------------------------------
+
+// Heap profile prefix for leak checking profiles.
+// Gets assigned once when leak checking is turned on, then never modified.
+static const string* profile_name_prefix = NULL;
+
+// Whole-program heap leak checker.
+// Gets assigned once when leak checking is turned on,
+// then main_heap_checker is never deleted.
+static HeapLeakChecker* main_heap_checker = NULL;
+
+// Whether we will use main_heap_checker to do a check at program exit
+// automatically. In any case user can ask for more checks on main_heap_checker
+// via GlobalChecker().
+static bool do_main_heap_check = false;
+
+// The heap profile we use to collect info about the heap.
+// This is created in HeapLeakChecker::BeforeConstructorsLocked
+// together with setting heap_checker_on (below) to true
+// and registering our new/delete malloc hooks;
+// similarly all are unset in HeapLeakChecker::TurnItselfOffLocked.
+static HeapProfileTable* heap_profile = NULL;
+
+// If we are doing (or going to do) any kind of heap-checking.
+static bool heap_checker_on = false;
+
+// pid of the process that does whole-program heap leak checking
+static pid_t heap_checker_pid = 0;
+
+// If we did heap profiling during global constructors execution
+static bool constructor_heap_profiling = false;
+
+// RAW_VLOG level we dump key INFO messages at.  If you want to turn
+// off these messages, set the environment variable PERFTOOLS_VERBOSE=-1.
+static const int heap_checker_info_level = 0;
+
+//----------------------------------------------------------------------
+// HeapLeakChecker's own memory allocator that is
+// independent of the normal program allocator.
+//----------------------------------------------------------------------
+
+// Wrapper of LowLevelAlloc for STL_Allocator and direct use.
+// We always access this class while holding heap_checker_lock;
+// in particular, this lets heap_checker_lock protect the period when threads
+// are stopped at random spots with TCMalloc_ListAllProcessThreads,
+// w/o worrying about the lock in LowLevelAlloc::Arena.
+// We rely on the fact that we use our own arena with its own lock here.
+class HeapLeakChecker::Allocator {
+ public:
+  static void Init() {
+    RAW_DCHECK(heap_checker_lock.IsHeld(), "");
+    RAW_DCHECK(arena_ == NULL, "");
+    arena_ = LowLevelAlloc::NewArena(0, LowLevelAlloc::DefaultArena());
+  }
+  static void Shutdown() {
+    RAW_DCHECK(heap_checker_lock.IsHeld(), "");
+    if (!LowLevelAlloc::DeleteArena(arena_)  ||  alloc_count_ != 0) {
+      RAW_LOG(FATAL, "Internal heap checker leak of %d objects", alloc_count_);
+    }
+  }
+  static int alloc_count() {
+    RAW_DCHECK(heap_checker_lock.IsHeld(), "");
+    return alloc_count_;
+  }
+  static void* Allocate(size_t n) {
+    RAW_DCHECK(arena_  &&  heap_checker_lock.IsHeld(), "");
+    void* p = LowLevelAlloc::AllocWithArena(n, arena_);
+    if (p) alloc_count_ += 1;
+    return p;
+  }
+  static void Free(void* p) {
+    RAW_DCHECK(heap_checker_lock.IsHeld(), "");
+    if (p) alloc_count_ -= 1;
+    LowLevelAlloc::Free(p);
+  }
+  static void Free(void* p, size_t /* n */) {
+    Free(p);
+  }
+  // destruct, free, and set *p to NULL
+  template<typename T> static void DeleteAndNull(T** p) {
+    (*p)->~T();
+    Free(*p);
+    *p = NULL;
+  }
+  template<typename T> static void DeleteAndNullIfNot(T** p) {
+    if (*p != NULL) DeleteAndNull(p);
+  }
+ private:
+  static LowLevelAlloc::Arena* arena_;
+  static int alloc_count_;
+};
+
+LowLevelAlloc::Arena* HeapLeakChecker::Allocator::arena_ = NULL;
+int HeapLeakChecker::Allocator::alloc_count_ = 0;
+
+//----------------------------------------------------------------------
+// HeapLeakChecker live object tracking components
+//----------------------------------------------------------------------
+
+// Cases of live object placement we distinguish
+enum ObjectPlacement {
+  MUST_BE_ON_HEAP,   // Must point to a live object of the matching size in the
+                     // heap_profile map of the heap when we get to it
+  IGNORED_ON_HEAP,   // Is a live (ignored) object on heap
+  MAYBE_LIVE,        // Is a piece of writable memory from /proc/self/maps
+  IN_GLOBAL_DATA,    // Is part of global data region of the executable
+  THREAD_DATA,       // Part of a thread stack and a thread descriptor with TLS
+  THREAD_REGISTERS,  // Values in registers of some thread
+};
+
+// Information about an allocated object
+struct AllocObject {
+  const void* ptr;        // the object
+  uintptr_t size;         // its size
+  ObjectPlacement place;  // where ptr points to
+
+  AllocObject(const void* p, size_t s, ObjectPlacement l)
+    : ptr(p), size(s), place(l) { }
+};
+
+// All objects (memory ranges) ignored via HeapLeakChecker::IgnoreObject
+// Key is the object's address; value is its size.
+typedef map<uintptr_t, size_t, less<uintptr_t>,
+            STL_Allocator<pair<const uintptr_t, size_t>,
+                          HeapLeakChecker::Allocator>
+           > IgnoredObjectsMap;
+static IgnoredObjectsMap* ignored_objects = NULL;
+
+// All objects (memory ranges) that we consider to be the sources of pointers
+// to live (not leaked) objects.
+// At different times this holds (what can be reached from) global data regions
+// and the objects we've been told to ignore.
+// For any AllocObject::ptr "live_objects" is supposed to contain at most one
+// record at any time. We maintain this by checking with the heap_profile map
+// of the heap and removing the live heap objects we've handled from it.
+// This vector is maintained as a stack holding the frontier of reachable
+// live heap objects in our flood traversal of them.
+typedef vector<AllocObject,
+               STL_Allocator<AllocObject, HeapLeakChecker::Allocator>
+              > LiveObjectsStack;
+static LiveObjectsStack* live_objects = NULL;
+
+// A special string type that uses my allocator
+typedef basic_string<char, char_traits<char>,
+                     STL_Allocator<char, HeapLeakChecker::Allocator>
+                    > HCL_string;
+
+// A placeholder to fill in the starting values for live_objects
+// for each library so we can keep the library-name association for logging.
+typedef map<HCL_string, LiveObjectsStack, less<HCL_string>,
+            STL_Allocator<pair<const HCL_string, LiveObjectsStack>,
+                          HeapLeakChecker::Allocator>
+           > LibraryLiveObjectsStacks;
+static LibraryLiveObjectsStacks* library_live_objects = NULL;
+
+// Value stored in the map of disabled address ranges;
+// its key is the end of the address range.
+// We'll ignore allocations with a return address in a disabled range
+// if the address occurs at 'max_depth' or less in the stack trace.
+struct HeapLeakChecker::RangeValue {
+  uintptr_t start_address;  // the start of the range
+  int       max_depth;      // the maximal stack depth to disable at
+};
+typedef map<uintptr_t, HeapLeakChecker::RangeValue, less<uintptr_t>,
+            STL_Allocator<pair<const uintptr_t, HeapLeakChecker::RangeValue>,
+                          HeapLeakChecker::Allocator>
+           > DisabledRangeMap;
+// The disabled program counter address ranges for profile dumping
+// that are registered with HeapLeakChecker::DisableChecksFromToLocked.
+static DisabledRangeMap* disabled_ranges = NULL;
+
+// Set of stack tops.
+// These are used to consider live only appropriate chunks of the memory areas
+// that are used for stacks (and maybe thread-specific data as well)
+// so that we do not treat pointers from outdated stack frames as live.
+typedef set<uintptr_t, less<uintptr_t>,
+            STL_Allocator<uintptr_t, HeapLeakChecker::Allocator>
+           > StackTopSet;
+static StackTopSet* stack_tops = NULL;
+
+// A map of ranges of code addresses for the system libraries
+// that can mmap/mremap/sbrk-allocate memory regions for stacks
+// and thread-local storage that we want to consider as live global data.
+// Maps from the end address to the start address.
+typedef map<uintptr_t, uintptr_t, less<uintptr_t>,
+            STL_Allocator<pair<const uintptr_t, uintptr_t>,
+                          HeapLeakChecker::Allocator>
+           > GlobalRegionCallerRangeMap;
+static GlobalRegionCallerRangeMap* global_region_caller_ranges = NULL;
+
+// TODO(maxim): make our big data structs into own modules
+
+// Disabler is implemented by keeping track of a per-thread count
+// of active Disabler objects.  Any objects allocated while the
+// count > 0 are not reported.
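+//
+// Typical scoped use (as documented in heap-checker.h), sketched:
+//   {
+//     HeapLeakChecker::Disabler disabler;
+//     <allocations made here are not reported as leaks>
+//   }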
+
+#ifdef HAVE_TLS
+
+static __thread int thread_disable_counter
+// The "initial-exec" model is faster than the default TLS model, at
+// the cost that you can't dlopen this library.  But dlopen on heap-checker
+// doesn't work anyway -- it must run before main -- so this is a good
+// trade-off.
+# ifdef HAVE___ATTRIBUTE__
+   __attribute__ ((tls_model ("initial-exec")))
+# endif
+    ;
+inline int get_thread_disable_counter() {
+  return thread_disable_counter;
+}
+inline void set_thread_disable_counter(int value) {
+  thread_disable_counter = value;
+}
+
+#else  // #ifdef HAVE_TLS
+
+static pthread_key_t thread_disable_counter_key;
+static int main_thread_counter;   // storage for use before main()
+static bool use_main_thread_counter = true;
+
+// TODO(csilvers): this is called from NewHook, in the middle of malloc().
+// If perftools_pthread_getspecific calls malloc, that will lead to an
+// infinite loop.  I don't know how to fix that, so I hope it never happens!
+inline int get_thread_disable_counter() {
+  if (use_main_thread_counter)  // means we're running really early
+    return main_thread_counter;
+  void* p = perftools_pthread_getspecific(thread_disable_counter_key);
+  return (intptr_t)p;   // kinda evil: store the counter directly in the void*
+}
+
+inline void set_thread_disable_counter(int value) {
+  if (use_main_thread_counter) {   // means we're running really early
+    main_thread_counter = value;
+    return;
+  }
+  intptr_t pointer_sized_value = value;
+  // kinda evil: store the counter directly in the void*
+  void* p = (void*)pointer_sized_value;
+  // NOTE: this may call malloc, which will call NewHook which will call
+  // get_thread_disable_counter() which will call pthread_getspecific().  I
+  // don't know if anything bad can happen if we call getspecific() in the
+  // middle of a setspecific() call.  It seems to work ok in practice...
+  perftools_pthread_setspecific(thread_disable_counter_key, p);
+}
+
+// The idea here is that this initializer will run pretty late: after
+// pthreads have been totally set up.  At this point we can call
+// pthreads routines, so we set those up.
+class InitThreadDisableCounter {
+ public:
+  InitThreadDisableCounter() {
+    perftools_pthread_key_create(&thread_disable_counter_key, NULL);
+    // Set up the main thread's value, which we have a special variable for.
+    void* p = (void*)(intptr_t)main_thread_counter;  // store the counter directly
+    perftools_pthread_setspecific(thread_disable_counter_key, p);
+    use_main_thread_counter = false;
+  }
+};
+InitThreadDisableCounter init_thread_disable_counter;
+
+#endif  // #ifdef HAVE_TLS
+
+HeapLeakChecker::Disabler::Disabler() {
+  // It is faster to unconditionally increment the thread-local
+  // counter than to check whether or not heap-checking is on
+  // in a thread-safe manner.
+  int counter = get_thread_disable_counter();
+  set_thread_disable_counter(counter + 1);
+  RAW_VLOG(10, "Increasing thread disable counter to %d", counter + 1);
+}
+
+HeapLeakChecker::Disabler::~Disabler() {
+  int counter = get_thread_disable_counter();
+  RAW_DCHECK(counter > 0, "");
+  if (counter > 0) {
+    set_thread_disable_counter(counter - 1);
+    RAW_VLOG(10, "Decreasing thread disable counter to %d", counter);
+  } else {
+    RAW_VLOG(0, "Thread disable counter underflow : %d", counter);
+  }
+}
+
+//----------------------------------------------------------------------
+
+// The size of the largest heap object allocated so far.
+static size_t max_heap_object_size = 0;
+// The possible range of addresses that can point
+// into one of the heap objects we track.
+static uintptr_t min_heap_address = uintptr_t(-1LL);
+static uintptr_t max_heap_address = 0;
+
+//----------------------------------------------------------------------
+
+// Simple casting helpers for uintptr_t and void*:
+template<typename T>
+inline static const void* AsPtr(T addr) {
+  return reinterpret_cast<void*>(addr);
+}
+inline static uintptr_t AsInt(const void* ptr) {
+  return reinterpret_cast<uintptr_t>(ptr);
+}
+
+//----------------------------------------------------------------------
+
+// We've seen reports that strstr causes heap-checker crashes in some
+// libc's (?):
+//    http://code.google.com/p/gperftools/issues/detail?id=263
+// It's simple enough to use our own.  This is not in time-critical code.
+static const char* hc_strstr(const char* s1, const char* s2) {
+  const size_t len = strlen(s2);
+  RAW_CHECK(len > 0, "Unexpected empty string passed to strstr()");
+  for (const char* p = strchr(s1, *s2); p != NULL; p = strchr(p+1, *s2)) {
+    if (strncmp(p, s2, len) == 0) {
+      return p;
+    }
+  }
+  return NULL;
+}
+
+//----------------------------------------------------------------------
+
+// Our hooks for MallocHook
+static void NewHook(const void* ptr, size_t size) {
+  if (ptr != NULL) {
+    const int counter = get_thread_disable_counter();
+    const bool ignore = (counter > 0);
+    RAW_VLOG(16, "Recording Alloc: %p of %" PRIuS "; %d", ptr, size,
+             int(counter));
+
+    // Fetch the caller's stack trace before acquiring heap_checker_lock.
+    void* stack[HeapProfileTable::kMaxStackDepth];
+    int depth = HeapProfileTable::GetCallerStackTrace(0, stack);
+
+    { SpinLockHolder l(&heap_checker_lock);
+      if (size > max_heap_object_size) max_heap_object_size = size;
+      uintptr_t addr = AsInt(ptr);
+      if (addr < min_heap_address) min_heap_address = addr;
+      addr += size;
+      if (addr > max_heap_address) max_heap_address = addr;
+      if (heap_checker_on) {
+        heap_profile->RecordAlloc(ptr, size, depth, stack);
+        if (ignore) {
+          heap_profile->MarkAsIgnored(ptr);
+        }
+      }
+    }
+    RAW_VLOG(17, "Alloc Recorded: %p of %" PRIuS "", ptr, size);
+  }
+}
+
+static void DeleteHook(const void* ptr) {
+  if (ptr != NULL) {
+    RAW_VLOG(16, "Recording Free %p", ptr);
+    { SpinLockHolder l(&heap_checker_lock);
+      if (heap_checker_on) heap_profile->RecordFree(ptr);
+    }
+    RAW_VLOG(17, "Free Recorded: %p", ptr);
+  }
+}
+
+//----------------------------------------------------------------------
+
+enum StackDirection {
+  GROWS_TOWARDS_HIGH_ADDRESSES,
+  GROWS_TOWARDS_LOW_ADDRESSES,
+  UNKNOWN_DIRECTION
+};
+
+// Determine which way the stack grows:
+
+static StackDirection ATTRIBUTE_NOINLINE GetStackDirection(
+    const uintptr_t *const ptr) {
+  uintptr_t x;
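+  // "x" lives in a frame deeper than the caller's local that "ptr" points to
+  // (this function is deliberately not inlined), so comparing the two
+  // addresses tells us which way new frames are added.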
+  if (&x < ptr)
+    return GROWS_TOWARDS_LOW_ADDRESSES;
+  if (ptr < &x)
+    return GROWS_TOWARDS_HIGH_ADDRESSES;
+
+  RAW_CHECK(0, "");  // Couldn't determine the stack direction.
+
+  return UNKNOWN_DIRECTION;
+}
+
+// Direction of stack growth (will initialize via GetStackDirection())
+static StackDirection stack_direction = UNKNOWN_DIRECTION;
+
+// This routine is called for every thread stack we know about to register it.
+static void RegisterStackLocked(const void* top_ptr) {
+  RAW_DCHECK(heap_checker_lock.IsHeld(), "");
+  RAW_DCHECK(MemoryRegionMap::LockIsHeld(), "");
+  RAW_VLOG(10, "Thread stack at %p", top_ptr);
+  uintptr_t top = AsInt(top_ptr);
+  stack_tops->insert(top);  // add for later use
+
+  // make sure stack_direction is initialized
+  if (stack_direction == UNKNOWN_DIRECTION) {
+    stack_direction = GetStackDirection(&top);
+  }
+
+  // Find memory region with this stack
+  MemoryRegionMap::Region region;
+  if (MemoryRegionMap::FindAndMarkStackRegion(top, &region)) {
+    // Make the proper portion of the stack live:
+    if (stack_direction == GROWS_TOWARDS_LOW_ADDRESSES) {
+      RAW_VLOG(11, "Live stack at %p of %" PRIuPTR " bytes",
+                  top_ptr, region.end_addr - top);
+      live_objects->push_back(AllocObject(top_ptr, region.end_addr - top,
+                                          THREAD_DATA));
+    } else {  // GROWS_TOWARDS_HIGH_ADDRESSES
+      RAW_VLOG(11, "Live stack at %p of %" PRIuPTR " bytes",
+                  AsPtr(region.start_addr),
+                  top - region.start_addr);
+      live_objects->push_back(AllocObject(AsPtr(region.start_addr),
+                                          top - region.start_addr,
+                                          THREAD_DATA));
+    }
+  // not in MemoryRegionMap, look in library_live_objects:
+  } else if (FLAGS_heap_check_ignore_global_live) {
+    for (LibraryLiveObjectsStacks::iterator lib = library_live_objects->begin();
+         lib != library_live_objects->end(); ++lib) {
+      for (LiveObjectsStack::iterator span = lib->second.begin();
+           span != lib->second.end(); ++span) {
+        uintptr_t start = AsInt(span->ptr);
+        uintptr_t end = start + span->size;
+        if (start <= top  &&  top < end) {
+          RAW_VLOG(11, "Stack at %p is inside /proc/self/maps chunk %p..%p",
+                      top_ptr, AsPtr(start), AsPtr(end));
+          // Shrink start..end region by chopping away the memory regions in
+          // MemoryRegionMap that land in it to undo merging of regions
+          // in /proc/self/maps, so that we correctly identify what portion
+          // of start..end is actually the stack region.
+          uintptr_t stack_start = start;
+          uintptr_t stack_end = end;
+          // We could optimize away this loop, but it does not run often.
+          RAW_DCHECK(MemoryRegionMap::LockIsHeld(), "");
+          for (MemoryRegionMap::RegionIterator r =
+                 MemoryRegionMap::BeginRegionLocked();
+               r != MemoryRegionMap::EndRegionLocked(); ++r) {
+            if (top < r->start_addr  &&  r->start_addr < stack_end) {
+              stack_end = r->start_addr;
+            }
+            if (stack_start < r->end_addr  &&  r->end_addr <= top) {
+              stack_start = r->end_addr;
+            }
+          }
+          if (stack_start != start  ||  stack_end != end) {
+            RAW_VLOG(11, "Stack at %p is actually inside memory chunk %p..%p",
+                        top_ptr, AsPtr(stack_start), AsPtr(stack_end));
+          }
+          // Make the proper portion of the stack live:
+          if (stack_direction == GROWS_TOWARDS_LOW_ADDRESSES) {
+            RAW_VLOG(11, "Live stack at %p of %" PRIuPTR " bytes",
+                        top_ptr, stack_end - top);
+            live_objects->push_back(
+              AllocObject(top_ptr, stack_end - top, THREAD_DATA));
+          } else {  // GROWS_TOWARDS_HIGH_ADDRESSES
+            RAW_VLOG(11, "Live stack at %p of %" PRIuPTR " bytes",
+                        AsPtr(stack_start), top - stack_start);
+            live_objects->push_back(
+              AllocObject(AsPtr(stack_start), top - stack_start, THREAD_DATA));
+          }
+          lib->second.erase(span);  // kill the rest of the region
+          // Put the non-stack part(s) of the region back:
+          if (stack_start != start) {
+            lib->second.push_back(AllocObject(AsPtr(start), stack_start - start,
+                                  MAYBE_LIVE));
+          }
+          if (stack_end != end) {
+            lib->second.push_back(AllocObject(AsPtr(stack_end), end - stack_end,
+                                  MAYBE_LIVE));
+          }
+          return;
+        }
+      }
+    }
+    RAW_LOG(ERROR, "Memory region for stack at %p not found. "
+                   "Will likely report false leak positives.", top_ptr);
+  }
+}
+
+// Iterator for heap allocation map data to make ignored objects "live"
+// (i.e., treated as roots for the mark-and-sweep phase)
+static void MakeIgnoredObjectsLiveCallbackLocked(
+    const void* ptr, const HeapProfileTable::AllocInfo& info) {
+  RAW_DCHECK(heap_checker_lock.IsHeld(), "");
+  if (info.ignored) {
+    live_objects->push_back(AllocObject(ptr, info.object_size,
+                                        MUST_BE_ON_HEAP));
+  }
+}
+
+// Iterator for heap allocation map data to make objects allocated from
+// disabled regions of code be live.
+static void MakeDisabledLiveCallbackLocked(
+    const void* ptr, const HeapProfileTable::AllocInfo& info) {
+  RAW_DCHECK(heap_checker_lock.IsHeld(), "");
+  bool stack_disable = false;
+  bool range_disable = false;
+  for (int depth = 0; depth < info.stack_depth; depth++) {
+    uintptr_t addr = AsInt(info.call_stack[depth]);
+    if (disabled_ranges) {
+      DisabledRangeMap::const_iterator iter
+        = disabled_ranges->upper_bound(addr);
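+      // Ranges are keyed by their end address, so "iter" is the range with
+      // the smallest end address above "addr"; "addr" falls inside it
+      // only if that range's start address lies below "addr".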
+      if (iter != disabled_ranges->end()) {
+        RAW_DCHECK(iter->first > addr, "");
+        if (iter->second.start_address < addr  &&
+            iter->second.max_depth > depth) {
+          range_disable = true;  // in range; dropping
+          break;
+        }
+      }
+    }
+  }
+  if (stack_disable || range_disable) {
+    uintptr_t start_address = AsInt(ptr);
+    uintptr_t end_address = start_address + info.object_size;
+    StackTopSet::const_iterator iter
+      = stack_tops->lower_bound(start_address);
+    if (iter != stack_tops->end()) {
+      RAW_DCHECK(*iter >= start_address, "");
+      if (*iter < end_address) {
+        // We do not disable (treat as live) whole allocated regions
+        // if they are used to hold thread call stacks
+        // (i.e. when we find a stack inside).
+        // The reason is that we'll treat as live the currently used
+        // stack portions anyway (see RegisterStackLocked),
+        // and the rest of the region where the stack lives can well
+        // contain outdated stack variables which are not live anymore,
+        // hence should not be treated as such.
+        RAW_VLOG(11, "Not %s-disabling %" PRIuS " bytes at %p"
+                    ": have stack inside: %p",
+                    (stack_disable ? "stack" : "range"),
+                    info.object_size, ptr, AsPtr(*iter));
+        return;
+      }
+    }
+    RAW_VLOG(11, "%s-disabling %" PRIuS " bytes at %p",
+                (stack_disable ? "Stack" : "Range"), info.object_size, ptr);
+    live_objects->push_back(AllocObject(ptr, info.object_size,
+                                        MUST_BE_ON_HEAP));
+  }
+}
+
+static const char kUnnamedProcSelfMapEntry[] = "UNNAMED";
+
+// This function takes some fields from a /proc/self/maps line:
+//
+//   start_address  start address of a memory region.
+//   end_address    end address of a memory region
+//   permissions    rwx + private/shared bit
+//   filename       filename of the mapped file
+//
+// If the region is not writeable, then it cannot have any heap
+// pointers in it, otherwise we record it as a candidate live region
+// to get filtered later.
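+//
+// For reference, a typical /proc/self/maps line looks like
+// (the offset/device/inode fields are not passed to us here):
+//   00400000-0040b000 r-xp 00000000 08:01 1234567   /bin/cat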
+static void RecordGlobalDataLocked(uintptr_t start_address,
+                                   uintptr_t end_address,
+                                   const char* permissions,
+                                   const char* filename) {
+  RAW_DCHECK(heap_checker_lock.IsHeld(), "");
+  // Ignore non-writeable regions.
+  if (strchr(permissions, 'w') == NULL) return;
+  if (filename == NULL  ||  *filename == '\0') {
+    filename = kUnnamedProcSelfMapEntry;
+  }
+  RAW_VLOG(11, "Looking into %s: 0x%" PRIxPTR "..0x%" PRIxPTR,
+              filename, start_address, end_address);
+  (*library_live_objects)[filename].
+    push_back(AllocObject(AsPtr(start_address),
+                          end_address - start_address,
+                          MAYBE_LIVE));
+}
+
+// See if 'library' from /proc/self/maps has base name 'library_base'
+// i.e. contains it and has '.' or '-' after it.
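+// E.g. "/lib/libpthread-2.19.so" matches the base name "/libpthread"
+// (the version number here is only an illustration).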
+static bool IsLibraryNamed(const char* library, const char* library_base) {
+  const char* p = hc_strstr(library, library_base);
+  size_t sz = strlen(library_base);
+  return p != NULL  &&  (p[sz] == '.'  ||  p[sz] == '-');
+}
+
+// static
+void HeapLeakChecker::DisableLibraryAllocsLocked(const char* library,
+                                                 uintptr_t start_address,
+                                                 uintptr_t end_address) {
+  RAW_DCHECK(heap_checker_lock.IsHeld(), "");
+  int depth = 0;
+  // TODO(maxim): maybe this should be extended to also use objdump
+  //              and pick the text portion of the library more precisely.
+  if (IsLibraryNamed(library, "/libpthread")  ||
+        // libpthread has a lot of small "system" leaks we don't care about.
+        // In particular it allocates memory to store data supplied via
+        // pthread_setspecific (which can be the only pointer to a heap object).
+      IsLibraryNamed(library, "/libdl")  ||
+        // library loaders leak some "system" heap that we don't care about
+      IsLibraryNamed(library, "/libcrypto")  ||
+        // Sometimes libcrypto of OpenSSH is compiled with -fomit-frame-pointer
+        // (any library can be, of course, but this one often is because speed
+        // is so important for making crypto usable).  We ignore all its
+        // allocations because we can't see the call stacks.  We'd prefer
+        // to ignore allocations done in files/symbols that match
+        // "default_malloc_ex|default_realloc_ex"
+        // but that doesn't work when the end-result binary is stripped.
+      IsLibraryNamed(library, "/libjvm")  ||
+        // JVM has a lot of leaks we don't care about.
+      IsLibraryNamed(library, "/libzip")
+        // The JVM leaks java.util.zip.Inflater after loading classes.
+     ) {
+    depth = 1;  // only disable allocation calls directly from the library code
+  } else if (IsLibraryNamed(library, "/ld")
+               // library loader leaks some "system" heap
+               // (e.g. thread-local storage) that we don't care about
+            ) {
+    depth = 2;  // disable allocation calls directly from the library code
+                // and at depth 2 from it.
+    // We need depth 2 here solely because of a libc bug that
+    // forces us to jump through __memalign_hook and MemalignOverride hoops
+    // in tcmalloc.cc.
+    // Those buggy __libc_memalign() calls are in ld-linux.so and happen for
+    // thread-local storage allocations that we want to ignore here.
+    // We go with the depth-2 hack as a workaround for this libc bug:
+    // otherwise we'd need to extend MallocHook interface
+    // so that correct stack depth adjustment can be propagated from
+    // the exceptional case of MemalignOverride.
+    // Using depth 2 here should not mask real leaks because ld-linux.so
+    // does not call user code.
+  }
+  if (depth) {
+    RAW_VLOG(10, "Disabling allocations from %s at depth %d:", library, depth);
+    DisableChecksFromToLocked(AsPtr(start_address), AsPtr(end_address), depth);
+    if (IsLibraryNamed(library, "/libpthread")  ||
+        IsLibraryNamed(library, "/libdl")  ||
+        IsLibraryNamed(library, "/ld")) {
+      RAW_VLOG(10, "Global memory regions made by %s will be live data",
+                  library);
+      if (global_region_caller_ranges == NULL) {
+        global_region_caller_ranges =
+          new(Allocator::Allocate(sizeof(GlobalRegionCallerRangeMap)))
+            GlobalRegionCallerRangeMap;
+      }
+      global_region_caller_ranges
+        ->insert(make_pair(end_address, start_address));
+    }
+  }
+}
+
+// static
+HeapLeakChecker::ProcMapsResult HeapLeakChecker::UseProcMapsLocked(
+                                  ProcMapsTask proc_maps_task) {
+  RAW_DCHECK(heap_checker_lock.IsHeld(), "");
+  // Need to provide our own scratch memory to ProcMapsIterator:
+  ProcMapsIterator::Buffer buffer;
+  ProcMapsIterator it(0, &buffer);
+  if (!it.Valid()) {
+    int errsv = errno;
+    RAW_LOG(ERROR, "Could not open /proc/self/maps: errno=%d. "
+                   "Libraries will not be handled correctly.", errsv);
+    return CANT_OPEN_PROC_MAPS;
+  }
+  uint64 start_address, end_address, file_offset;
+  int64 inode;
+  char *permissions, *filename;
+  bool saw_shared_lib = false;
+  bool saw_nonzero_inode = false;
+  bool saw_shared_lib_with_nonzero_inode = false;
+  while (it.Next(&start_address, &end_address, &permissions,
+                 &file_offset, &inode, &filename)) {
+    if (start_address >= end_address) {
+      // Warn if a line we could be interested in is ill-formed:
+      if (inode != 0) {
+        RAW_LOG(ERROR, "Errors reading /proc/self/maps. "
+                       "Some global memory regions will not "
+                       "be handled correctly.");
+      }
+      // Silently skip other ill-formed lines: some are possible,
+      // probably due to the interplay between how /proc/self/maps is updated
+      // while we read it in chunks in ProcMapsIterator and what
+      // we do in this loop.
+      continue;
+    }
+    // Determine if any shared libraries are present (this is the same
+    // list of extensions as is found in pprof).  We want to ignore
+    // 'fake' libraries with inode 0 when determining this.  However, some
+    // systems don't report inodes via /proc, so we turn off this check
+    // if we don't see any evidence that we're getting inode info.
+    if (inode != 0) {
+      saw_nonzero_inode = true;
+    }
+    if ((hc_strstr(filename, "lib") && hc_strstr(filename, ".so")) ||
+        hc_strstr(filename, ".dll") ||
+        // not all .dylib filenames start with lib. .dylib is big enough
+        // that we are unlikely to get false matches just checking that.
+        hc_strstr(filename, ".dylib") || hc_strstr(filename, ".bundle")) {
+      saw_shared_lib = true;
+      if (inode != 0) {
+        saw_shared_lib_with_nonzero_inode = true;
+      }
+    }
+
+    switch (proc_maps_task) {
+      case DISABLE_LIBRARY_ALLOCS:
+        // All lines starting like
+        // "401dc000-4030f000 r??p 00132000 03:01 13991972  lib/bin"
+        // identify the data and code sections of a shared library or our binary
+        if (inode != 0 && strncmp(permissions, "r-xp", 4) == 0) {
+          DisableLibraryAllocsLocked(filename, start_address, end_address);
+        }
+        break;
+      case RECORD_GLOBAL_DATA:
+        RecordGlobalDataLocked(start_address, end_address,
+                               permissions, filename);
+        break;
+      default:
+        RAW_CHECK(0, "");
+    }
+  }
+  // If /proc/self/maps is reporting inodes properly (we saw a
+  // non-zero inode), then we only say we saw a shared lib if we saw a
+  // 'real' one, with a non-zero inode.
+  if (saw_nonzero_inode) {
+    saw_shared_lib = saw_shared_lib_with_nonzero_inode;
+  }
+  if (!saw_shared_lib) {
+    RAW_LOG(ERROR, "No shared libs detected. Will likely report false leak "
+                   "positives for statically linked executables.");
+    return NO_SHARED_LIBS_IN_PROC_MAPS;
+  }
+  return PROC_MAPS_USED;
+}
+
+// Total number and size of live objects dropped from the profile;
+// (re)initialized in IgnoreAllLiveObjectsLocked.
+static int64 live_objects_total;
+static int64 live_bytes_total;
+
+// pid of the thread that is doing the current leak check
+// (protected by our lock; IgnoreAllLiveObjectsLocked sets it)
+static pid_t self_thread_pid = 0;
+
+// Status of our thread listing callback execution
+// (protected by our lock; used from within IgnoreAllLiveObjectsLocked)
+static enum {
+  CALLBACK_NOT_STARTED,
+  CALLBACK_STARTED,
+  CALLBACK_COMPLETED,
+} thread_listing_status = CALLBACK_NOT_STARTED;
+
+// Ideally to avoid deadlocks this function should not result in any libc
+// or other function calls that might need to lock a mutex:
+// It is called when all threads of a process are stopped
+// at arbitrary points thus potentially holding those locks.
+//
+// In practice we are calling some simple i/o and sprintf-type library functions
+// for logging messages, but use only our own LowLevelAlloc::Arena allocator.
+//
+// This is known to be buggy: the library i/o function calls are able to cause
+// deadlocks when they request a lock that a stopped thread happens to hold.
+// As far as we know, this issue has so far not resulted in any deadlocks
+// in practice, so for now we are taking our chances that such deadlocks
+// occur with insignificant frequency.
+//
+// If such deadlocks become a problem we should make the i/o calls
+// into appropriately direct system calls (or eliminate them),
+// in particular write() is not safe and vsnprintf() is potentially dangerous
+// due to reliance on locale functions (these are called through RAW_LOG
+// and in other ways).
+//
+
+#if defined(HAVE_LINUX_PTRACE_H) && defined(HAVE_SYS_SYSCALL_H) && defined(DUMPER)
+# if (defined(__i386__) || defined(__x86_64))
+#  define THREAD_REGS i386_regs
+# elif defined(__PPC__)
+#  define THREAD_REGS ppc_regs
+# endif
+#endif
+
+/*static*/ int HeapLeakChecker::IgnoreLiveThreadsLocked(void* parameter,
+                                                        int num_threads,
+                                                        pid_t* thread_pids,
+                                                        va_list /*ap*/) {
+  RAW_DCHECK(heap_checker_lock.IsHeld(), "");
+  thread_listing_status = CALLBACK_STARTED;
+  RAW_VLOG(11, "Found %d threads (from pid %d)", num_threads, getpid());
+
+  if (FLAGS_heap_check_ignore_global_live) {
+    UseProcMapsLocked(RECORD_GLOBAL_DATA);
+  }
+
+  // We put the registers from other threads here
+  // to make pointers stored in them live.
+  vector<void*, STL_Allocator<void*, Allocator> > thread_registers;
+
+  int failures = 0;
+  for (int i = 0; i < num_threads; ++i) {
+    // the leak checking thread itself is handled
+    // specially via self_thread_stack, not here:
+    if (thread_pids[i] == self_thread_pid) continue;
+    RAW_VLOG(11, "Handling thread with pid %d", thread_pids[i]);
+#ifdef THREAD_REGS
+    THREAD_REGS thread_regs;
+#define sys_ptrace(r, p, a, d)  syscall(SYS_ptrace, (r), (p), (a), (d))
+    // We use sys_ptrace to avoid thread locking
+    // because this is called from TCMalloc_ListAllProcessThreads
+    // when all but this thread are suspended.
+    if (sys_ptrace(PTRACE_GETREGS, thread_pids[i], NULL, &thread_regs) == 0) {
+      // Need to use SP to get all the data from the very last stack frame:
+      COMPILE_ASSERT(sizeof(thread_regs.SP) == sizeof(void*),
+                     SP_register_does_not_look_like_a_pointer);
+      RegisterStackLocked(reinterpret_cast<void*>(thread_regs.SP));
+      // Make registers live (just in case PTRACE_ATTACH resulted in some
+      // register pointers still being in the registers and not on the stack):
+      for (void** p = reinterpret_cast<void**>(&thread_regs);
+           p < reinterpret_cast<void**>(&thread_regs + 1); ++p) {
+        RAW_VLOG(12, "Thread register %p", *p);
+        thread_registers.push_back(*p);
+      }
+    } else {
+      failures += 1;
+    }
+#else
+    failures += 1;
+#endif
+  }
+  // Use all the collected thread (stack) liveness sources:
+  IgnoreLiveObjectsLocked("threads stack data", "");
+  if (thread_registers.size()) {
+    // Make thread registers be live heap data sources.
+    // we rely here on the fact that vector is in one memory chunk:
+    RAW_VLOG(11, "Live registers at %p of %" PRIuS " bytes",
+                &thread_registers[0], thread_registers.size() * sizeof(void*));
+    live_objects->push_back(AllocObject(&thread_registers[0],
+                                        thread_registers.size() * sizeof(void*),
+                                        THREAD_REGISTERS));
+    IgnoreLiveObjectsLocked("threads register data", "");
+  }
+  // Do all other liveness walking while all threads are stopped:
+  IgnoreNonThreadLiveObjectsLocked();
+  // Can now resume the threads:
+  TCMalloc_ResumeAllProcessThreads(num_threads, thread_pids);
+  thread_listing_status = CALLBACK_COMPLETED;
+  return failures;
+}
+
+// Stack top of the thread that is doing the current leak check
+// (protected by our lock; IgnoreAllLiveObjectsLocked sets it)
+static const void* self_thread_stack_top;
+
+// static
+void HeapLeakChecker::IgnoreNonThreadLiveObjectsLocked() {
+  RAW_DCHECK(heap_checker_lock.IsHeld(), "");
+  RAW_DCHECK(MemoryRegionMap::LockIsHeld(), "");
+  RAW_VLOG(11, "Handling self thread with pid %d", self_thread_pid);
+  // Register our own stack:
+
+  // Important that all stack ranges (including the one here)
+  // are known before we start looking at them
+  // in MakeDisabledLiveCallbackLocked:
+  RegisterStackLocked(self_thread_stack_top);
+  IgnoreLiveObjectsLocked("stack data", "");
+
+  // Make objects we were told to ignore live:
+  if (ignored_objects) {
+    for (IgnoredObjectsMap::const_iterator object = ignored_objects->begin();
+         object != ignored_objects->end(); ++object) {
+      const void* ptr = AsPtr(object->first);
+      RAW_VLOG(11, "Ignored live object at %p of %" PRIuS " bytes",
+                  ptr, object->second);
+      live_objects->
+        push_back(AllocObject(ptr, object->second, MUST_BE_ON_HEAP));
+      // we do this liveness check for ignored_objects before doing any
+      // live heap walking to make sure it does not fail needlessly:
+      size_t object_size;
+      if (!(heap_profile->FindAlloc(ptr, &object_size)  &&
+            object->second == object_size)) {
+        RAW_LOG(FATAL, "Object at %p of %" PRIuS " bytes from an"
+                       " IgnoreObject() has disappeared", ptr, object->second);
+      }
+    }
+    IgnoreLiveObjectsLocked("ignored objects", "");
+  }
+
+  // Treat objects that were allocated when a Disabler was live as
+  // roots.  I.e., if X was allocated while a Disabler was active,
+  // and Y is reachable from X, arrange that neither X nor Y are
+  // treated as leaks.
+  heap_profile->IterateAllocs(MakeIgnoredObjectsLiveCallbackLocked);
+  IgnoreLiveObjectsLocked("disabled objects", "");
+
+  // Make code-address-disabled objects live and ignored:
+  // This in particular makes all thread-specific data live
+  // because the basic data structure to hold pointers to thread-specific data
+  // is allocated from libpthreads and we have range-disabled that
+  // library code with UseProcMapsLocked(DISABLE_LIBRARY_ALLOCS);
+  // so now we declare all thread-specific data reachable from there as live.
+  heap_profile->IterateAllocs(MakeDisabledLiveCallbackLocked);
+  IgnoreLiveObjectsLocked("disabled code", "");
+
+  // Actually make global data live:
+  if (FLAGS_heap_check_ignore_global_live) {
+    bool have_null_region_callers = false;
+    for (LibraryLiveObjectsStacks::iterator l = library_live_objects->begin();
+         l != library_live_objects->end(); ++l) {
+      RAW_CHECK(live_objects->empty(), "");
+      // Process library_live_objects in l->second
+      // filtering them by MemoryRegionMap:
+      // It's safe to iterate over MemoryRegionMap
+      // w/o locks here as we are inside MemoryRegionMap::Lock():
+      RAW_DCHECK(MemoryRegionMap::LockIsHeld(), "");
+      // The only change to MemoryRegionMap possible in this loop
+      // is region addition as a result of allocating more memory
+      // for live_objects. This won't invalidate the RegionIterator
+      // or the intent of the loop.
+      // --see the comment by MemoryRegionMap::BeginRegionLocked().
+      for (MemoryRegionMap::RegionIterator region =
+             MemoryRegionMap::BeginRegionLocked();
+           region != MemoryRegionMap::EndRegionLocked(); ++region) {
+        // "region" from MemoryRegionMap is to be subtracted from
+        // (tentatively live) regions in l->second
+        // if it has a stack inside or it was allocated by
+        // a non-special caller (not one covered by a range
+        // in global_region_caller_ranges).
+        // This will in particular exclude all memory chunks used
+        // by the heap itself as well as what's been allocated with
+        // any allocator on top of mmap.
+        bool subtract = true;
+        if (!region->is_stack  &&  global_region_caller_ranges) {
+          if (region->caller() == static_cast<uintptr_t>(NULL)) {
+            have_null_region_callers = true;
+          } else {
+            GlobalRegionCallerRangeMap::const_iterator iter
+              = global_region_caller_ranges->upper_bound(region->caller());
+            if (iter != global_region_caller_ranges->end()) {
+              RAW_DCHECK(iter->first > region->caller(), "");
+              if (iter->second < region->caller()) {  // in special region
+                subtract = false;
+              }
+            }
+          }
+        }
+        if (subtract) {
+          // The loop puts the result of filtering l->second into live_objects:
+          for (LiveObjectsStack::const_iterator i = l->second.begin();
+               i != l->second.end(); ++i) {
+            // subtract *region from *i
+            uintptr_t start = AsInt(i->ptr);
+            uintptr_t end = start + i->size;
+            if (region->start_addr <= start  &&  end <= region->end_addr) {
+              // full deletion due to subsumption
+            } else if (start < region->start_addr  &&
+                       region->end_addr < end) {  // cutting-out split
+              live_objects->push_back(AllocObject(i->ptr,
+                                                  region->start_addr - start,
+                                                  IN_GLOBAL_DATA));
+              live_objects->push_back(AllocObject(AsPtr(region->end_addr),
+                                                  end - region->end_addr,
+                                                  IN_GLOBAL_DATA));
+            } else if (region->end_addr > start  &&
+                       region->start_addr <= start) {  // cut from start
+              live_objects->push_back(AllocObject(AsPtr(region->end_addr),
+                                                  end - region->end_addr,
+                                                  IN_GLOBAL_DATA));
+            } else if (region->start_addr > start  &&
+                       region->start_addr < end) {  // cut from end
+              live_objects->push_back(AllocObject(i->ptr,
+                                                  region->start_addr - start,
+                                                  IN_GLOBAL_DATA));
+            } else {  // pass: no intersection
+              live_objects->push_back(AllocObject(i->ptr, i->size,
+                                                  IN_GLOBAL_DATA));
+            }
+          }
+          // Move live_objects back into l->second
+          // for filtering by the next region.
+          live_objects->swap(l->second);
+          live_objects->clear();
+        }
+      }
+      // Now get and use live_objects from the final version of l->second:
+      if (VLOG_IS_ON(11)) {
+        for (LiveObjectsStack::const_iterator i = l->second.begin();
+             i != l->second.end(); ++i) {
+          RAW_VLOG(11, "Library live region at %p of %" PRIuPTR " bytes",
+                      i->ptr, i->size);
+        }
+      }
+      live_objects->swap(l->second);
+      IgnoreLiveObjectsLocked("in globals of\n  ", l->first.c_str());
+    }
+    if (have_null_region_callers) {
+      RAW_LOG(ERROR, "Have memory regions w/o callers: "
+                     "might report false leaks");
+    }
+    Allocator::DeleteAndNull(&library_live_objects);
+  }
+}
+
+// Callback for TCMalloc_ListAllProcessThreads in IgnoreAllLiveObjectsLocked below
+// to test/verify that we have just the one main thread, in which case
+// we can do everything in that main thread,
+// so that CPU profiler can collect all its samples.
+// Returns the number of threads in the process.
+static int IsOneThread(void* parameter, int num_threads,
+                       pid_t* thread_pids, va_list ap) {
+  if (num_threads != 1) {
+    RAW_LOG(WARNING, "Have threads: Won't CPU-profile the bulk of leak "
+                     "checking work happening in IgnoreLiveThreadsLocked!");
+  }
+  TCMalloc_ResumeAllProcessThreads(num_threads, thread_pids);
+  return num_threads;
+}
+
+// Dummy for IgnoreAllLiveObjectsLocked below.
+// Making it global helps with compiler warnings.
+static va_list dummy_ap;
+
+// static
+void HeapLeakChecker::IgnoreAllLiveObjectsLocked(const void* self_stack_top) {
+  RAW_DCHECK(heap_checker_lock.IsHeld(), "");
+  RAW_CHECK(live_objects == NULL, "");
+  live_objects = new(Allocator::Allocate(sizeof(LiveObjectsStack)))
+                   LiveObjectsStack;
+  stack_tops = new(Allocator::Allocate(sizeof(StackTopSet))) StackTopSet;
+  // reset the counts
+  live_objects_total = 0;
+  live_bytes_total = 0;
+  // Reduce max_heap_object_size to FLAGS_heap_check_max_pointer_offset
+  // for the time of leak check.
+  // FLAGS_heap_check_max_pointer_offset caps max_heap_object_size
+  // to keep reasonably low the chance of random bytes
+  // appearing to point into large, actually leaked heap objects.
+  const size_t old_max_heap_object_size = max_heap_object_size;
+  max_heap_object_size = (
+    FLAGS_heap_check_max_pointer_offset != -1
+    ? min(size_t(FLAGS_heap_check_max_pointer_offset), max_heap_object_size)
+    : max_heap_object_size);
+  // Record global data as live:
+  if (FLAGS_heap_check_ignore_global_live) {
+    library_live_objects =
+      new(Allocator::Allocate(sizeof(LibraryLiveObjectsStacks)))
+        LibraryLiveObjectsStacks;
+  }
+  // Ignore all thread stacks:
+  thread_listing_status = CALLBACK_NOT_STARTED;
+  bool need_to_ignore_non_thread_objects = true;
+  self_thread_pid = getpid();
+  self_thread_stack_top = self_stack_top;
+  if (FLAGS_heap_check_ignore_thread_live) {
+    // In case we are doing CPU profiling we'd like to do all the work
+    // in the main thread, not in the special thread created by
+    // TCMalloc_ListAllProcessThreads, so that CPU profiler can
+    // collect all its samples.  The machinery of
+    // TCMalloc_ListAllProcessThreads conflicts with the CPU profiler
+    // by also relying on signals and ::sigaction.  We can do this
+    // (run everything in the main thread) safely only if there's just
+    // the main thread itself in our process.  This variable reflects
+    // these two conditions:
+    bool want_and_can_run_in_main_thread =
+      ProfilingIsEnabledForAllThreads()  &&
+      TCMalloc_ListAllProcessThreads(NULL, IsOneThread) == 1;
+    // When the normal path of TCMalloc_ListAllProcessThreads below is taken,
+    // we fully suspend the threads right here before any liveness checking
+    // and keep them suspended for the whole time of liveness checking
+    // inside of the IgnoreLiveThreadsLocked callback.
+    // (The threads can't (de)allocate due to the lock on the delete hook, but
+    //  if not suspended they could still mess with the pointer
+    //  graph while we walk it.)
+    int r = want_and_can_run_in_main_thread
+            ? IgnoreLiveThreadsLocked(NULL, 1, &self_thread_pid, dummy_ap)
+            : TCMalloc_ListAllProcessThreads(NULL, IgnoreLiveThreadsLocked);
+    need_to_ignore_non_thread_objects = r < 0;
+    if (r < 0) {
+      RAW_LOG(WARNING, "Thread finding failed with %d errno=%d", r, errno);
+      if (thread_listing_status == CALLBACK_COMPLETED) {
+        RAW_LOG(INFO, "Thread finding callback "
+                      "finished ok; hopefully everything is fine");
+        need_to_ignore_non_thread_objects = false;
+      } else if (thread_listing_status == CALLBACK_STARTED) {
+        RAW_LOG(FATAL, "Thread finding callback was "
+                       "interrupted or crashed; can't fix this");
+      } else {  // CALLBACK_NOT_STARTED
+        RAW_LOG(ERROR, "Could not find thread stacks. "
+                       "Will likely report false leak positives.");
+      }
+    } else if (r != 0) {
+      RAW_LOG(ERROR, "Thread stacks not found for %d threads. "
+                     "Will likely report false leak positives.", r);
+    } else {
+      RAW_VLOG(11, "Thread stacks appear to be found for all threads");
+    }
+  } else {
+    RAW_LOG(WARNING, "Not looking for thread stacks; "
+                     "objects reachable only from there "
+                     "will be reported as leaks");
+  }
+  // Do all other live data ignoring here if we did not do it
+  // within thread listing callback with all threads stopped.
+  if (need_to_ignore_non_thread_objects) {
+    if (FLAGS_heap_check_ignore_global_live) {
+      UseProcMapsLocked(RECORD_GLOBAL_DATA);
+    }
+    IgnoreNonThreadLiveObjectsLocked();
+  }
+  if (live_objects_total) {
+    RAW_VLOG(10, "Ignoring %" PRId64 " reachable objects of %" PRId64 " bytes",
+                live_objects_total, live_bytes_total);
+  }
+  // Free these: we made them here and heap_profile never saw them
+  Allocator::DeleteAndNull(&live_objects);
+  Allocator::DeleteAndNull(&stack_tops);
+  max_heap_object_size = old_max_heap_object_size;  // reset this var
+}
+
+// Alignment at which we should consider pointer positions
+// in IgnoreLiveObjectsLocked. Will normally use the value of
+// FLAGS_heap_check_pointer_source_alignment.
+static size_t pointer_source_alignment = kPointerSourceAlignment;
+// Global lock for HeapLeakChecker::DoNoLeaks
+// to protect pointer_source_alignment.
+static SpinLock alignment_checker_lock(SpinLock::LINKER_INITIALIZED);
+
+// This function changes the live bits in the heap_profile-table's state:
+// we only record the live objects to be skipped.
+//
+// When checking if a byte sequence points to a heap object we use
+// HeapProfileTable::FindInsideAlloc to handle both pointers to
+// the start and inside of heap-allocated objects.
+// The "inside" case needs to be checked to support
+// at least the following relatively common cases:
+// - C++ arrays allocated with new FooClass[size] for classes
+//   with destructors have their size recorded in a sizeof(int) field
+//   before the place normal pointers point to.
+// - basic_string<>-s for e.g. the C++ library of gcc 3.4
+//   have the meta-info in basic_string<...>::_Rep recorded
+//   before the place normal pointers point to.
+// - Multiple-inherited objects have their pointers when cast to
+//   different base classes pointing inside of the actually
+//   allocated object.
+// - Sometimes reachability pointers point to member objects of heap objects,
+//   and then those member objects point to the full heap object.
+// - Third party UnicodeString: it stores a 32-bit refcount
+//   (in both 32-bit and 64-bit binaries) as the first uint32
+//   in the allocated memory and a normal pointer points at
+//   the second uint32 behind the refcount.
+// By finding these additional objects here
+// we slightly increase the chance of mistaking random memory bytes
+// for a pointer and missing a leak in a particular run of a binary.
+//
+/*static*/ void HeapLeakChecker::IgnoreLiveObjectsLocked(const char* name,
+                                                         const char* name2) {
+  RAW_DCHECK(heap_checker_lock.IsHeld(), "");
+  int64 live_object_count = 0;
+  int64 live_byte_count = 0;
+  while (!live_objects->empty()) {
+    const char* object =
+      reinterpret_cast<const char*>(live_objects->back().ptr);
+    size_t size = live_objects->back().size;
+    const ObjectPlacement place = live_objects->back().place;
+    live_objects->pop_back();
+    if (place == MUST_BE_ON_HEAP  &&  heap_profile->MarkAsLive(object)) {
+      live_object_count += 1;
+      live_byte_count += size;
+    }
+    RAW_VLOG(13, "Looking for heap pointers in %p of %" PRIuS " bytes",
+                object, size);
+    const char* const whole_object = object;
+    size_t const whole_size = size;
+    // Try interpreting any byte sequence in object,size as a heap pointer:
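+    // First bump "object" up to the next pointer_source_alignment boundary
+    // and shrink "size" by the number of bytes skipped (clamping at zero):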
+    const size_t remainder = AsInt(object) % pointer_source_alignment;
+    if (remainder) {
+      object += pointer_source_alignment - remainder;
+      if (size >= pointer_source_alignment - remainder) {
+        size -= pointer_source_alignment - remainder;
+      } else {
+        size = 0;
+      }
+    }
+    if (size < sizeof(void*)) continue;
+
+#ifdef NO_FRAME_POINTER
+    // Frame pointer omission requires us to use libunwind, which uses direct
+    // mmap and munmap system calls, and that needs special handling.
+    if (name2 == kUnnamedProcSelfMapEntry) {
+      static const uintptr_t page_mask = ~(getpagesize() - 1);
+      const uintptr_t addr = reinterpret_cast<uintptr_t>(object);
+      if ((addr & page_mask) == addr &&     // "object" is page-aligned
+          (size & page_mask) == size) {     // and spans a whole number of pages
+        // This is an object we slurped from /proc/self/maps.
+        // It may or may not be readable at this point.
+        //
+        // In case all the above conditions made a mistake, and the object is
+        // not related to libunwind, we also verify that it's not readable
+        // before ignoring it.
+        if (msync(const_cast<char*>(object), size, MS_ASYNC) != 0) {
+          // Skip unreadable object, so we don't crash trying to sweep it.
+          RAW_VLOG(0, "Ignoring inaccessible object [%p, %p) "
+                   "(msync error %d (%s))",
+                   object, object + size, errno, strerror(errno));
+          continue;
+        }
+      }
+    }
+#endif
+
+    const char* const max_object = object + size - sizeof(void*);
+    while (object <= max_object) {
+      // potentially unaligned load:
+      const uintptr_t addr = *reinterpret_cast<const uintptr_t*>(object);
+      // Do fast check before the more expensive HaveOnHeapLocked lookup:
+      // this code runs for all memory words that are potentially pointers:
+      const bool can_be_on_heap =
+        // Order tests by the likelihood of the test failing in 64/32 bit modes.
+        // Yes, this matters: we either lose 5..6% speed in 32 bit mode
+        // (which is already slower) or slow down by a factor of 1.5..1.91
+        // in 64 bit mode.
+        // After the alignment test got dropped the above performance figures
+        // must have changed; might need to revisit this.
+#if defined(__x86_64__)
+        addr <= max_heap_address  &&  // <= is for 0-sized object with max addr
+        min_heap_address <= addr;
+#else
+        min_heap_address <= addr  &&
+        addr <= max_heap_address;  // <= is for 0-sized object with max addr
+#endif
+      if (can_be_on_heap) {
+        const void* ptr = reinterpret_cast<const void*>(addr);
+        // Too expensive (inner loop): manually uncomment when debugging:
+        // RAW_VLOG(17, "Trying pointer to %p at %p", ptr, object);
+        size_t object_size;
+        if (HaveOnHeapLocked(&ptr, &object_size)  &&
+            heap_profile->MarkAsLive(ptr)) {
+          // We take the (hopefully low) risk here of encountering by accident
+          // a byte sequence in memory that matches an address of
+          // a heap object which is in fact leaked.
+          // I.e. in very rare and probably not repeatable/lasting cases
+          // we might miss some real heap memory leaks.
+          RAW_VLOG(14, "Found pointer to %p of %" PRIuS " bytes at %p "
+                      "inside %p of size %" PRIuS "",
+                      ptr, object_size, object, whole_object, whole_size);
+          if (VLOG_IS_ON(15)) {
+            // log call stacks to help debug how come something is not a leak
+            HeapProfileTable::AllocInfo alloc;
+            if (!heap_profile->FindAllocDetails(ptr, &alloc)) {
+              RAW_LOG(FATAL, "FindAllocDetails failed on ptr %p", ptr);
+            }
+            RAW_LOG(INFO, "New live %p object's alloc stack:", ptr);
+            for (int i = 0; i < alloc.stack_depth; ++i) {
+              RAW_LOG(INFO, "  @ %p", alloc.call_stack[i]);
+            }
+          }
+          live_object_count += 1;
+          live_byte_count += object_size;
+          live_objects->push_back(AllocObject(ptr, object_size,
+                                              IGNORED_ON_HEAP));
+        }
+      }
+      object += pointer_source_alignment;
+    }
+  }
+  live_objects_total += live_object_count;
+  live_bytes_total += live_byte_count;
+  if (live_object_count) {
+    RAW_VLOG(10, "Removed %" PRId64 " live heap objects of %" PRId64 " bytes: %s%s",
+                live_object_count, live_byte_count, name, name2);
+  }
+}
+
+//----------------------------------------------------------------------
+// HeapLeakChecker leak check disabling components
+//----------------------------------------------------------------------
+
+// static
+void HeapLeakChecker::DisableChecksIn(const char* pattern) {
+  RAW_LOG(WARNING, "DisableChecksIn(%s) is ignored", pattern);
+}
+
+// static
+void HeapLeakChecker::DoIgnoreObject(const void* ptr) {
+  SpinLockHolder l(&heap_checker_lock);
+  if (!heap_checker_on) return;
+  size_t object_size;
+  if (!HaveOnHeapLocked(&ptr, &object_size)) {
+    RAW_LOG(ERROR, "No live heap object at %p to ignore", ptr);
+  } else {
+    RAW_VLOG(10, "Going to ignore live object at %p of %" PRIuS " bytes",
+                ptr, object_size);
+    if (ignored_objects == NULL)  {
+      ignored_objects = new(Allocator::Allocate(sizeof(IgnoredObjectsMap)))
+                          IgnoredObjectsMap;
+    }
+    if (!ignored_objects->insert(make_pair(AsInt(ptr), object_size)).second) {
+      RAW_LOG(WARNING, "Object at %p is already being ignored", ptr);
+    }
+  }
+}
+
+// static
+void HeapLeakChecker::UnIgnoreObject(const void* ptr) {
+  SpinLockHolder l(&heap_checker_lock);
+  if (!heap_checker_on) return;
+  size_t object_size;
+  if (!HaveOnHeapLocked(&ptr, &object_size)) {
+    RAW_LOG(FATAL, "No live heap object at %p to un-ignore", ptr);
+  } else {
+    bool found = false;
+    if (ignored_objects) {
+      IgnoredObjectsMap::iterator object = ignored_objects->find(AsInt(ptr));
+      if (object != ignored_objects->end()  &&  object_size == object->second) {
+        ignored_objects->erase(object);
+        found = true;
+        RAW_VLOG(10, "Now not going to ignore live object "
+                    "at %p of %" PRIuS " bytes", ptr, object_size);
+      }
+    }
+    if (!found)  RAW_LOG(FATAL, "Object at %p has not been ignored", ptr);
+  }
+}
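+
+// Illustrative usage sketch (MyCache is a hypothetical type): callers
+// typically use the public IgnoreObject/UnIgnoreObject wrappers to exclude
+// an intentionally never-freed object from leak reports:
+//
+//   MyCache* cache = new MyCache;            // deliberately long-lived
+//   HeapLeakChecker::IgnoreObject(cache);    // don't report it as a leak
+//   ...
+//   HeapLeakChecker::UnIgnoreObject(cache);  // if it is freed after all
+//   delete cache;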
+
+//----------------------------------------------------------------------
+// HeapLeakChecker non-static functions
+//----------------------------------------------------------------------
+
+char* HeapLeakChecker::MakeProfileNameLocked() {
+  RAW_DCHECK(lock_->IsHeld(), "");
+  RAW_DCHECK(heap_checker_lock.IsHeld(), "");
+  const int len = profile_name_prefix->size() + strlen(name_) + 5 +
+                  strlen(HeapProfileTable::kFileExt) + 1;
+  char* file_name = reinterpret_cast<char*>(Allocator::Allocate(len));
+  snprintf(file_name, len, "%s.%s-end%s",
+           profile_name_prefix->c_str(), name_,
+           HeapProfileTable::kFileExt);
+  return file_name;
+}
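+// (For instance, with a profile_name_prefix of "/tmp/myprog.1234" and a
+// checker named "_main_", the function above produces
+// "/tmp/myprog.1234._main_-end.heap" -- illustrative values only.)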
+
+void HeapLeakChecker::Create(const char *name, bool make_start_snapshot) {
+  SpinLockHolder l(lock_);
+  name_ = NULL;  // checker is inactive
+  start_snapshot_ = NULL;
+  has_checked_ = false;
+  inuse_bytes_increase_ = 0;
+  inuse_allocs_increase_ = 0;
+  keep_profiles_ = false;
+  char* n = new char[strlen(name) + 1];   // do this before we lock
+  IgnoreObject(n);  // otherwise it might be treated as live due to our stack
+  { // Heap activity in other threads is paused for this whole scope.
+    SpinLockHolder al(&alignment_checker_lock);
+    SpinLockHolder hl(&heap_checker_lock);
+    MemoryRegionMap::LockHolder ml;
+    if (heap_checker_on  &&  profile_name_prefix != NULL) {
+      RAW_DCHECK(strchr(name, '/') == NULL, "must be a simple name");
+      memcpy(n, name, strlen(name) + 1);
+      name_ = n;  // checker is active
+      if (make_start_snapshot) {
+        start_snapshot_ = heap_profile->TakeSnapshot();
+      }
+
+      const HeapProfileTable::Stats& t = heap_profile->total();
+      const size_t start_inuse_bytes = t.alloc_size - t.free_size;
+      const size_t start_inuse_allocs = t.allocs - t.frees;
+      RAW_VLOG(10, "Start check \"%s\" profile: %" PRIuS " bytes "
+               "in %" PRIuS " objects",
+               name_, start_inuse_bytes, start_inuse_allocs);
+    } else {
+      RAW_LOG(WARNING, "Heap checker is not active, "
+                       "hence checker \"%s\" will do nothing!", name);
+      RAW_LOG(WARNING, "To activate, set the HEAPCHECK environment variable.\n");
+    }
+  }
+  if (name_ == NULL) {
+    UnIgnoreObject(n);
+    delete[] n;  // must be done after we unlock
+  }
+}
+
+HeapLeakChecker::HeapLeakChecker(const char *name) : lock_(new SpinLock) {
+  RAW_DCHECK(strcmp(name, "_main_") != 0, "_main_ is reserved");
+  Create(name, true/*create start_snapshot_*/);
+}
+
+HeapLeakChecker::HeapLeakChecker() : lock_(new SpinLock) {
+  if (FLAGS_heap_check_before_constructors) {
+    // We want to check for leaks of objects allocated during global
+    // constructors (i.e., objects allocated already).  So we do not
+    // create a baseline snapshot and hence check for leaks of objects
+    // that may have already been created.
+    Create("_main_", false);
+  } else {
+    // We want to ignore leaks of objects allocated during global
+    // constructors (i.e., objects allocated already).  So we snapshot
+    // the current heap contents and use them as a baseline that is
+    // not reported by the leak checker.
+    Create("_main_", true);
+  }
+}
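+
+// Illustrative usage sketch (the function and checker names are
+// hypothetical, not from this file): a local checker brackets a region of
+// code and then verifies that the region did not leak, using the accessors
+// defined below for diagnostics:
+//
+//   HeapLeakChecker checker("search_init");
+//   RunSearchInitialization();                 // code under test
+//   if (!checker.NoLeaks()) {
+//     RAW_LOG(FATAL, "leaked %zd bytes in %zd objects",
+//             checker.BytesLeaked(), checker.ObjectsLeaked());
+//   }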
+
+ssize_t HeapLeakChecker::BytesLeaked() const {
+  SpinLockHolder l(lock_);
+  if (!has_checked_) {
+    RAW_LOG(FATAL, "*NoLeaks|SameHeap must execute before this call");
+  }
+  return inuse_bytes_increase_;
+}
+
+ssize_t HeapLeakChecker::ObjectsLeaked() const {
+  SpinLockHolder l(lock_);
+  if (!has_checked_) {
+    RAW_LOG(FATAL, "*NoLeaks|SameHeap must execute before this call");
+  }
+  return inuse_allocs_increase_;
+}
+
+// Save pid of main thread for using in naming dump files
+static int32 main_thread_pid = getpid();
+#ifdef HAVE_PROGRAM_INVOCATION_NAME
+#ifdef __UCLIBC__
+extern const char* program_invocation_name;
+extern const char* program_invocation_short_name;
+#else
+extern char* program_invocation_name;
+extern char* program_invocation_short_name;
+#endif
+static const char* invocation_name() { return program_invocation_short_name; }
+static string invocation_path() { return program_invocation_name; }
+#else
+static const char* invocation_name() { return "<your binary>"; }
+static string invocation_path() { return "<your binary>"; }
+#endif
+
+// Prints commands that users can run to get more information
+// about the reported leaks.
+static void SuggestPprofCommand(const char* pprof_file_arg) {
+  // Extra help information to print for the user when the test is
+  // being run in a way where the straightforward pprof command will
+  // not suffice.
+  string extra_help;
+
+  // Common header info to print for remote runs
+  const string remote_header =
+      "This program is being executed remotely and therefore the pprof\n"
+      "command printed above will not work.  Either run this program\n"
+      "locally, or adjust the pprof command as follows to allow it to\n"
+      "work on your local machine:\n";
+
+  // Extra command for fetching remote data
+  string fetch_cmd;
+
+  RAW_LOG(WARNING,
+          "\n\n"
+          "If the preceding stack traces are not enough to find "
+          "the leaks, try running THIS shell command:\n\n"
+          "%s%s %s \"%s\" --inuse_objects --lines --heapcheck "
+          " --edgefraction=1e-10 --nodefraction=1e-10 --gv\n"
+          "\n"
+          "%s"
+          "If you are still puzzled about why the leaks are "
+          "there, try rerunning this program with "
+          "HEAP_CHECK_TEST_POINTER_ALIGNMENT=1 and/or with "
+          "HEAP_CHECK_MAX_POINTER_OFFSET=-1\n"
+          "If the leak report occurs in a small fraction of runs, "
+          "try running with TCMALLOC_MAX_FREE_QUEUE_SIZE of few hundred MB "
+          "or with TCMALLOC_RECLAIM_MEMORY=false, "  // only works for debugalloc
+          "it might help find leaks more repeatably\n",
+          fetch_cmd.c_str(),
+          "pprof",           // works as long as pprof is on your path
+          invocation_path().c_str(),
+          pprof_file_arg,
+          extra_help.c_str()
+          );
+}
+
+bool HeapLeakChecker::DoNoLeaks(ShouldSymbolize should_symbolize) {
+  SpinLockHolder l(lock_);
+  // The locking also helps us keep the messages
+  // for the two checks close together.
+  SpinLockHolder al(&alignment_checker_lock);
+
+  // thread-safe: protected by alignment_checker_lock
+  static bool have_disabled_hooks_for_symbolize = false;
+  // Once we've checked for leaks and symbolized the results once, it's
+  // not safe to do it again.  This is because in order to symbolize
+  // safely, we had to disable all the malloc hooks here, so we no
+  // longer can be confident we've collected all the data we need.
+  if (have_disabled_hooks_for_symbolize) {
+    RAW_LOG(FATAL, "Must not call heap leak checker manually after "
+            "program-exit's automatic check.");
+  }
+
+  HeapProfileTable::Snapshot* leaks = NULL;
+  char* pprof_file = NULL;
+
+  {
+    // Heap activity in other threads is paused during this function
+    // (i.e. until we got all profile difference info).
+    SpinLockHolder hl(&heap_checker_lock);
+    if (heap_checker_on == false) {
+      if (name_ != NULL) {  // leak checking enabled when created the checker
+        RAW_LOG(WARNING, "Heap leak checker got turned off after checker "
+                "\"%s\" has been created, no leak check is being done for it!",
+                name_);
+      }
+      return true;
+    }
+
+    // Update global_region_caller_ranges. They may need to change since
+    // e.g. initialization time, because shared libraries might have been
+    // loaded or unloaded in the meantime.
+    Allocator::DeleteAndNullIfNot(&global_region_caller_ranges);
+    ProcMapsResult pm_result = UseProcMapsLocked(DISABLE_LIBRARY_ALLOCS);
+    RAW_CHECK(pm_result == PROC_MAPS_USED, "");
+
+    // Keep track of number of internally allocated objects so we
+    // can detect leaks in the heap-leak-checker itself
+    const int initial_allocs = Allocator::alloc_count();
+
+    if (name_ == NULL) {
+      RAW_LOG(FATAL, "Heap leak checker must not be turned on "
+              "after construction of a HeapLeakChecker");
+    }
+
+    MemoryRegionMap::LockHolder ml;
+    int a_local_var;  // Use our stack ptr to make stack data live:
+
+    // Make the heap profile, other threads are locked out.
+    HeapProfileTable::Snapshot* base =
+        reinterpret_cast<HeapProfileTable::Snapshot*>(start_snapshot_);
+    RAW_DCHECK(FLAGS_heap_check_pointer_source_alignment > 0, "");
+    pointer_source_alignment = FLAGS_heap_check_pointer_source_alignment;
+    IgnoreAllLiveObjectsLocked(&a_local_var);
+    leaks = heap_profile->NonLiveSnapshot(base);
+
+    inuse_bytes_increase_ = static_cast<ssize_t>(leaks->total().alloc_size);
+    inuse_allocs_increase_ = static_cast<ssize_t>(leaks->total().allocs);
+    if (leaks->Empty()) {
+      heap_profile->ReleaseSnapshot(leaks);
+      leaks = NULL;
+
+      // We can only check for internal leaks along the no-user-leak
+      // path since in the leak path we temporarily release
+      // heap_checker_lock and another thread can come in and disturb
+      // allocation counts.
+      if (Allocator::alloc_count() != initial_allocs) {
+        RAW_LOG(FATAL, "Internal HeapChecker leak of %d objects ; %d -> %d",
+                Allocator::alloc_count() - initial_allocs,
+                initial_allocs, Allocator::alloc_count());
+      }
+    } else if (FLAGS_heap_check_test_pointer_alignment) {
+      if (pointer_source_alignment == 1) {
+        RAW_LOG(WARNING, "--heap_check_test_pointer_alignment has no effect: "
+                "--heap_check_pointer_source_alignment was already set to 1");
+      } else {
+        // Try with reduced pointer alignment
+        pointer_source_alignment = 1;
+        IgnoreAllLiveObjectsLocked(&a_local_var);
+        HeapProfileTable::Snapshot* leaks_wo_align =
+            heap_profile->NonLiveSnapshot(base);
+        pointer_source_alignment = FLAGS_heap_check_pointer_source_alignment;
+        if (leaks_wo_align->Empty()) {
+          RAW_LOG(WARNING, "Found no leaks without pointer alignment: "
+                  "something might be placing pointers at "
+                  "unaligned addresses! This needs to be fixed.");
+        } else {
+          RAW_LOG(INFO, "Found leaks without pointer alignment as well: "
+                  "unaligned pointers are apparently not the cause of the leaks.");
+          RAW_LOG(INFO, "--heap_check_test_pointer_alignment did not help "
+                  "to diagnose the leaks.");
+        }
+        heap_profile->ReleaseSnapshot(leaks_wo_align);
+      }
+    }
+
+    if (leaks != NULL) {
+      pprof_file = MakeProfileNameLocked();
+    }
+  }
+
+  has_checked_ = true;
+  if (leaks == NULL) {
+    if (FLAGS_heap_check_max_pointer_offset == -1) {
+      RAW_LOG(WARNING,
+              "Found no leaks without max_pointer_offset restriction: "
+              "it's possible that the default value of "
+              "heap_check_max_pointer_offset flag is too low. "
+              "Do you use pointers with offsets larger than that, "
+              "pointing into the middle of heap-allocated objects?");
+    }
+    const HeapProfileTable::Stats& stats = heap_profile->total();
+    RAW_VLOG(heap_checker_info_level,
+             "No leaks found for check \"%s\" "
+             "(but no 100%% guarantee that there aren't any): "
+             "found %" PRId64 " reachable heap objects of %" PRId64 " bytes",
+             name_,
+             int64(stats.allocs - stats.frees),
+             int64(stats.alloc_size - stats.free_size));
+  } else {
+    if (should_symbolize == SYMBOLIZE) {
+      // To turn addresses into symbols, we need to fork, which is a
+      // problem if both parent and child end up trying to call the
+      // same malloc-hooks we've set up, at the same time.  To avoid
+      // trouble, we turn off the hooks before symbolizing.  Note that
+      // this makes it unsafe to ever leak-report again!  Luckily, we
+      // typically only want to report once in a program's run, at the
+      // very end.
+      if (MallocHook::GetNewHook() == NewHook)
+        MallocHook::SetNewHook(NULL);
+      if (MallocHook::GetDeleteHook() == DeleteHook)
+        MallocHook::SetDeleteHook(NULL);
+      MemoryRegionMap::Shutdown();
+      // Make sure all the hooks really got unset:
+      RAW_CHECK(MallocHook::GetNewHook() == NULL, "");
+      RAW_CHECK(MallocHook::GetDeleteHook() == NULL, "");
+      RAW_CHECK(MallocHook::GetMmapHook() == NULL, "");
+      RAW_CHECK(MallocHook::GetSbrkHook() == NULL, "");
+      have_disabled_hooks_for_symbolize = true;
+      leaks->ReportLeaks(name_, pprof_file, true);  // true = should_symbolize
+    } else {
+      leaks->ReportLeaks(name_, pprof_file, false);
+    }
+    if (FLAGS_heap_check_identify_leaks) {
+      leaks->ReportIndividualObjects();
+    }
+
+    SuggestPprofCommand(pprof_file);
+
+    {
+      SpinLockHolder hl(&heap_checker_lock);
+      heap_profile->ReleaseSnapshot(leaks);
+      Allocator::Free(pprof_file);
+    }
+  }
+
+  return (leaks == NULL);
+}
+
+HeapLeakChecker::~HeapLeakChecker() {
+  if (name_ != NULL) {  // had leak checking enabled when created the checker
+    if (!has_checked_) {
+      RAW_LOG(FATAL, "Some *NoLeaks|SameHeap method"
+                     " must be called on any created HeapLeakChecker");
+    }
+
+    // Deallocate any snapshot taken at start
+    if (start_snapshot_ != NULL) {
+      SpinLockHolder l(&heap_checker_lock);
+      heap_profile->ReleaseSnapshot(
+          reinterpret_cast<HeapProfileTable::Snapshot*>(start_snapshot_));
+    }
+
+    UnIgnoreObject(name_);
+    delete[] name_;
+    name_ = NULL;
+  }
+  delete lock_;
+}
+
+//----------------------------------------------------------------------
+// HeapLeakChecker overall heap check components
+//----------------------------------------------------------------------
+
+// static
+bool HeapLeakChecker::IsActive() {
+  SpinLockHolder l(&heap_checker_lock);
+  return heap_checker_on;
+}
+
+vector<HeapCleaner::void_function>* HeapCleaner::heap_cleanups_ = NULL;
+
+// When a HeapCleaner object is initialized, add its function to the static list
+// of cleaners to be run before leak checking.
+HeapCleaner::HeapCleaner(void_function f) {
+  if (heap_cleanups_ == NULL)
+    heap_cleanups_ = new vector<HeapCleaner::void_function>;
+  heap_cleanups_->push_back(f);
+}
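+
+// Illustrative registration sketch (FlushMyGlobalCaches is hypothetical):
+// a module can free its long-lived caches just before the final leak check
+// by constructing a static HeapCleaner with a cleanup callback:
+//
+//   static void FlushMyGlobalCaches() { /* delete would-be "leaks" */ }
+//   static HeapCleaner flush_caches_cleaner(&FlushMyGlobalCaches);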
+
+// Run all of the cleanup functions and delete the vector.
+void HeapCleaner::RunHeapCleanups() {
+  if (!heap_cleanups_)
+    return;
+  for (int i = 0; i < heap_cleanups_->size(); i++) {
+    void (*f)(void) = (*heap_cleanups_)[i];
+    f();
+  }
+  delete heap_cleanups_;
+  heap_cleanups_ = NULL;
+}
+
+// Program exit heap cleanup registered as a module object destructor.
+// Will not get executed when we crash on a signal.
+//
+void HeapLeakChecker_RunHeapCleanups() {
+  if (FLAGS_heap_check == "local")   // don't check heap in this mode
+    return;
+  { SpinLockHolder l(&heap_checker_lock);
+    // can get here (via forks?) with other pids
+    if (heap_checker_pid != getpid()) return;
+  }
+  HeapCleaner::RunHeapCleanups();
+  if (!FLAGS_heap_check_after_destructors) HeapLeakChecker::DoMainHeapCheck();
+}
+
+static bool internal_init_start_has_run = false;
+
+// Called exactly once, before main() (but hopefully just before).
+// This picks a good unique name for the dumped leak checking heap profiles.
+//
+// Because we crash when InternalInitStart is called more than once,
+// it's fine that we hold heap_checker_lock only around pieces of
+// this function: this is still enough for thread-safety w.r.t. other functions
+// of this module.
+// We can't hold heap_checker_lock throughout because it would deadlock
+// on a memory allocation since our new/delete hooks can be on.
+//
+void HeapLeakChecker_InternalInitStart() {
+  { SpinLockHolder l(&heap_checker_lock);
+    RAW_CHECK(!internal_init_start_has_run,
+              "Heap-check constructor called twice.  Perhaps you both linked"
+              " in the heap checker, and also used LD_PRELOAD to load it?");
+    internal_init_start_has_run = true;
+
+#ifdef ADDRESS_SANITIZER
+    // AddressSanitizer's custom malloc conflicts with HeapChecker.
+    FLAGS_heap_check = "";
+#endif
+
+    if (FLAGS_heap_check.empty()) {
+      // turns out we do not need checking in the end; can stop profiling
+      HeapLeakChecker::TurnItselfOffLocked();
+      return;
+    } else if (RunningOnValgrind()) {
+      // There is no point in trying -- we'll just fail.
+      RAW_LOG(WARNING, "Can't run under Valgrind; will turn itself off");
+      HeapLeakChecker::TurnItselfOffLocked();
+      return;
+    }
+  }
+
+  // Changing this to false can be useful when debugging heap-checker itself:
+  if (!FLAGS_heap_check_run_under_gdb && IsDebuggerAttached()) {
+    RAW_LOG(WARNING, "Someone is ptrace()ing us; will turn itself off");
+    SpinLockHolder l(&heap_checker_lock);
+    HeapLeakChecker::TurnItselfOffLocked();
+    return;
+  }
+
+  { SpinLockHolder l(&heap_checker_lock);
+    if (!constructor_heap_profiling) {
+      RAW_LOG(FATAL, "Cannot start so late. You have to enable heap checking "
+                     "with HEAPCHECK=<mode>.");
+    }
+  }
+
+  // Set all flags
+  RAW_DCHECK(FLAGS_heap_check_pointer_source_alignment > 0, "");
+  if (FLAGS_heap_check == "minimal") {
+    // The least we can check.
+    FLAGS_heap_check_before_constructors = false;  // from after main
+                                                   // (ignore more)
+    FLAGS_heap_check_after_destructors = false;  // to after cleanup
+                                                 // (most data is live)
+    FLAGS_heap_check_ignore_thread_live = true;  // ignore all live
+    FLAGS_heap_check_ignore_global_live = true;  // ignore all live
+  } else if (FLAGS_heap_check == "normal") {
+    // Faster than 'minimal' and not much stricter.
+    FLAGS_heap_check_before_constructors = true;  // from no profile (fast)
+    FLAGS_heap_check_after_destructors = false;  // to after cleanup
+                                                 // (most data is live)
+    FLAGS_heap_check_ignore_thread_live = true;  // ignore all live
+    FLAGS_heap_check_ignore_global_live = true;  // ignore all live
+  } else if (FLAGS_heap_check == "strict") {
+    // A bit stricter than 'normal': global destructors must fully clean up
+    // after themselves if they are present.
+    FLAGS_heap_check_before_constructors = true;  // from no profile (fast)
+    FLAGS_heap_check_after_destructors = true;  // to after destructors
+                                                // (less data live)
+    FLAGS_heap_check_ignore_thread_live = true;  // ignore all live
+    FLAGS_heap_check_ignore_global_live = true;  // ignore all live
+  } else if (FLAGS_heap_check == "draconian") {
+    // Drop not very portable and not very exact live heap flooding.
+    FLAGS_heap_check_before_constructors = true;  // from no profile (fast)
+    FLAGS_heap_check_after_destructors = true;  // to after destructors
+                                                // (need them)
+    FLAGS_heap_check_ignore_thread_live = false;  // no live flood (stricter)
+    FLAGS_heap_check_ignore_global_live = false;  // no live flood (stricter)
+  } else if (FLAGS_heap_check == "as-is") {
+    // do nothing: use other flags as is
+  } else if (FLAGS_heap_check == "local") {
+    // do nothing
+  } else {
+    RAW_LOG(FATAL, "Unsupported heap_check flag: %s",
+                   FLAGS_heap_check.c_str());
+  }
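+  // For reference, the heap_check mode handled above is normally selected
+  // via the environment when launching the binary (illustrative commands,
+  // not part of this file):
+  //   HEAPCHECK=normal    ./my_binary   # whole-program leak check at exit
+  //   HEAPCHECK=draconian ./my_binary   # no live-object flooding: anything
+  //                                     # not freed is reported as a leak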
+  // FreeBSD doesn't seem to honor atexit execution order:
+  //    http://code.google.com/p/gperftools/issues/detail?id=375
+  // Since heap-checking before destructors depends on atexit running
+  // at the right time, on FreeBSD we always check after, even in the
+  // less strict modes.  This just means FreeBSD is always a bit
+  // stricter in its checking than other OSes.
+  // This now appears to be the case in other OSes as well,
+  // so we always check afterwards.
+  FLAGS_heap_check_after_destructors = true;
+
+  { SpinLockHolder l(&heap_checker_lock);
+    RAW_DCHECK(heap_checker_pid == getpid(), "");
+    heap_checker_on = true;
+    RAW_DCHECK(heap_profile, "");
+    HeapLeakChecker::ProcMapsResult pm_result =
+        HeapLeakChecker::UseProcMapsLocked(
+            HeapLeakChecker::DISABLE_LIBRARY_ALLOCS);
+    // Might need to do this more than once
+    // if one later dynamically loads libraries that we want disabled.
+    if (pm_result != HeapLeakChecker::PROC_MAPS_USED) {  // can't function
+      HeapLeakChecker::TurnItselfOffLocked();
+      return;
+    }
+  }
+
+  // make a good place and name for heap profile leak dumps
+  string* profile_prefix =
+    new string(FLAGS_heap_check_dump_directory + "/" + invocation_name());
+
+  // Finalize prefix for dumping leak checking profiles.
+  const int32 our_pid = getpid();   // safest to call getpid() outside lock
+  { SpinLockHolder l(&heap_checker_lock);
+    // main_thread_pid might still be 0 if this function is being called before
+    // global constructors.  In that case, our pid *is* the main pid.
+    if (main_thread_pid == 0)
+      main_thread_pid = our_pid;
+  }
+  char pid_buf[15];
+  snprintf(pid_buf, sizeof(pid_buf), ".%d", main_thread_pid);
+  *profile_prefix += pid_buf;
+  { SpinLockHolder l(&heap_checker_lock);
+    RAW_DCHECK(profile_name_prefix == NULL, "");
+    profile_name_prefix = profile_prefix;
+  }
+
+  // Make sure new/delete hooks are installed properly
+  // and heap profiler is indeed able to keep track
+  // of the objects being allocated.
+  // We test this to make sure we are indeed checking for leaks.
+  char* test_str = new char[5];
+  size_t size;
+  { SpinLockHolder l(&heap_checker_lock);
+    RAW_CHECK(heap_profile->FindAlloc(test_str, &size),
+              "our own new/delete not linked?");
+  }
+  delete[] test_str;
+  { SpinLockHolder l(&heap_checker_lock);
+    // This check can fail when it should not if another thread allocates
+    // into this same spot right this moment,
+    // which is unlikely since this code runs in InitGoogle.
+    RAW_CHECK(!heap_profile->FindAlloc(test_str, &size),
+              "our own new/delete not linked?");
+  }
+  // If we crash in the above code, it probably means that
+  // "nm <this_binary> | grep new" will show that tcmalloc's new/delete
+  // implementation did not get linked-in into this binary
+  // (i.e. nm will list __builtin_new and __builtin_vec_new as undefined).
+  // If this happens, it is a BUILD bug to be fixed.
+
+  RAW_VLOG(heap_checker_info_level,
+           "WARNING: Perftools heap leak checker is active "
+           "-- Performance may suffer");
+
+  if (FLAGS_heap_check != "local") {
+    HeapLeakChecker* main_hc = new HeapLeakChecker();
+    SpinLockHolder l(&heap_checker_lock);
+    RAW_DCHECK(main_heap_checker == NULL,
+               "Repeated creation of main_heap_checker");
+    main_heap_checker = main_hc;
+    do_main_heap_check = true;
+  }
+
+  { SpinLockHolder l(&heap_checker_lock);
+    RAW_CHECK(heap_checker_on  &&  constructor_heap_profiling,
+              "Leak checking is expected to be fully turned on now");
+  }
+
+  // For binaries built in debug mode, this will set release queue of
+  // debugallocation.cc to 100M to make it less likely for real leaks to
+  // be hidden due to reuse of heap memory object addresses.
+  // Running a test with --malloc_reclaim_memory=0 would help find leaks even
+  // better, but the test might run out of memory as a result.
+  // The scenario is that a heap object at address X is allocated and freed,
+  // but some other data-structure still retains a pointer to X.
+  // Then the same heap memory is used for another object, which is leaked,
+  // but the leak is not noticed due to the pointer to the original object at X.
+  // TODO(csilvers): support this in some manner.
+#if 0
+  SetCommandLineOptionWithMode("max_free_queue_size", "104857600",  // 100M
+                               SET_FLAG_IF_DEFAULT);
+#endif
+}
+
+// We want this to run early as well, but not so early as
+// ::BeforeConstructors (we want flag assignments to have already
+// happened, for instance).  Initializer-registration does the trick.
+REGISTER_MODULE_INITIALIZER(init_start, HeapLeakChecker_InternalInitStart());
+REGISTER_MODULE_DESTRUCTOR(init_start, HeapLeakChecker_RunHeapCleanups());
+
+// static
+bool HeapLeakChecker::NoGlobalLeaksMaybeSymbolize(
+    ShouldSymbolize should_symbolize) {
+  // we never delete or change main_heap_checker once it's set:
+  HeapLeakChecker* main_hc = GlobalChecker();
+  if (main_hc) {
+    RAW_VLOG(10, "Checking for whole-program memory leaks");
+    return main_hc->DoNoLeaks(should_symbolize);
+  }
+  return true;
+}
+
+// static
+bool HeapLeakChecker::DoMainHeapCheck() {
+  if (FLAGS_heap_check_delay_seconds > 0) {
+    sleep(FLAGS_heap_check_delay_seconds);
+  }
+  { SpinLockHolder l(&heap_checker_lock);
+    if (!do_main_heap_check) return false;
+    RAW_DCHECK(heap_checker_pid == getpid(), "");
+    do_main_heap_check = false;  // will do it now; no need to do it more
+  }
+
+  // The program is over, so it's safe to symbolize addresses (which
+  // requires a fork) because no serious work is expected to be done
+  // after this.  Symbolizing is really useful -- knowing what
+  // function has a leak is better than knowing just an address --
+  // and while we can only safely symbolize once in a program run,
+  // now is the time (after all, there's no "later" that would be better).
+  if (!NoGlobalLeaksMaybeSymbolize(SYMBOLIZE)) {
+    if (FLAGS_heap_check_identify_leaks) {
+      RAW_LOG(FATAL, "Whole-program memory leaks found.");
+    }
+    RAW_LOG(ERROR, "Exiting with error code (instead of crashing) "
+                   "because of whole-program memory leaks");
+    _exit(1);    // we don't want to call atexit() routines!
+  }
+  return true;
+}
+
+// static
+HeapLeakChecker* HeapLeakChecker::GlobalChecker() {
+  SpinLockHolder l(&heap_checker_lock);
+  return main_heap_checker;
+}
+
+// static
+bool HeapLeakChecker::NoGlobalLeaks() {
+  // symbolizing requires a fork, which isn't safe to do in general.
+  return NoGlobalLeaksMaybeSymbolize(DO_NOT_SYMBOLIZE);
+}
+
+// static
+void HeapLeakChecker::CancelGlobalCheck() {
+  SpinLockHolder l(&heap_checker_lock);
+  if (do_main_heap_check) {
+    RAW_VLOG(heap_checker_info_level,
+             "Canceling the automatic at-exit whole-program memory leak check");
+    do_main_heap_check = false;
+  }
+}
+
+// static
+void HeapLeakChecker::BeforeConstructorsLocked() {
+  RAW_DCHECK(heap_checker_lock.IsHeld(), "");
+  RAW_CHECK(!constructor_heap_profiling,
+            "BeforeConstructorsLocked called multiple times");
+#ifdef ADDRESS_SANITIZER
+  // AddressSanitizer's custom malloc conflicts with HeapChecker.
+  return;
+#endif
+  // Set hooks early to crash if 'new' gets called before we make heap_profile,
+  // and make sure no other hooks existed:
+  RAW_CHECK(MallocHook::AddNewHook(&NewHook), "");
+  RAW_CHECK(MallocHook::AddDeleteHook(&DeleteHook), "");
+  constructor_heap_profiling = true;
+  MemoryRegionMap::Init(1, /* use_buckets */ false);
+    // Set up MemoryRegionMap with (at least) one caller stack frame to record
+    // (important that it's done before HeapProfileTable creation below).
+  Allocator::Init();
+  RAW_CHECK(heap_profile == NULL, "");
+  heap_profile = new(Allocator::Allocate(sizeof(HeapProfileTable)))
+      HeapProfileTable(&Allocator::Allocate, &Allocator::Free,
+                       /* profile_mmap */ false);
+  RAW_VLOG(10, "Starting tracking the heap");
+  heap_checker_on = true;
+}
+
+// static
+void HeapLeakChecker::TurnItselfOffLocked() {
+  RAW_DCHECK(heap_checker_lock.IsHeld(), "");
+  // Set FLAGS_heap_check to "", for users who test for it
+  if (!FLAGS_heap_check.empty())  // be a noop in the common case
+    FLAGS_heap_check.clear();     // because clear() could allocate memory
+  if (constructor_heap_profiling) {
+    RAW_CHECK(heap_checker_on, "");
+    RAW_VLOG(heap_checker_info_level, "Turning perftools heap leak checking off");
+    heap_checker_on = false;
+    // Unset our hooks checking they were set:
+    RAW_CHECK(MallocHook::RemoveNewHook(&NewHook), "");
+    RAW_CHECK(MallocHook::RemoveDeleteHook(&DeleteHook), "");
+    Allocator::DeleteAndNull(&heap_profile);
+    // free our optional global data:
+    Allocator::DeleteAndNullIfNot(&ignored_objects);
+    Allocator::DeleteAndNullIfNot(&disabled_ranges);
+    Allocator::DeleteAndNullIfNot(&global_region_caller_ranges);
+    Allocator::Shutdown();
+    MemoryRegionMap::Shutdown();
+  }
+  RAW_CHECK(!heap_checker_on, "");
+}
+
+extern bool heap_leak_checker_bcad_variable;  // in heap-checker-bcad.cc
+
+static bool has_called_before_constructors = false;
+
+// TODO(maxim): inline this function with
+// MallocHook_InitAtFirstAllocation_HeapLeakChecker, and also rename
+// HeapLeakChecker::BeforeConstructorsLocked.
+void HeapLeakChecker_BeforeConstructors() {
+  SpinLockHolder l(&heap_checker_lock);
+  // We can be called from several places: the first mmap/sbrk/alloc call
+  // or the first global c-tor from heap-checker-bcad.cc:
+  // Do not re-execute initialization:
+  if (has_called_before_constructors) return;
+  has_called_before_constructors = true;
+
+  heap_checker_pid = getpid();  // set it always
+  heap_leak_checker_bcad_variable = true;
+  // just to reference it, so that heap-checker-bcad.o is linked in
+
+  // This function can be called *very* early, before the normal
+  // global-constructor that sets FLAGS_verbose.  Set it manually now,
+  // so the RAW_LOG messages here are controllable.
+  const char* verbose_str = GetenvBeforeMain("PERFTOOLS_VERBOSE");
+  if (verbose_str && atoi(verbose_str)) {  // different than the default of 0?
+    FLAGS_verbose = atoi(verbose_str);
+  }
+
+  bool need_heap_check = true;
+  // The user indicates a desire for heap-checking via the HEAPCHECK
+  // environment variable.  If it's not set, there's no way to do
+  // heap-checking.
+  if (!GetenvBeforeMain("HEAPCHECK")) {
+    need_heap_check = false;
+  }
+#ifdef HAVE_GETEUID
+  if (need_heap_check && getuid() != geteuid()) {
+    // heap-checker writes out files.  Thus, for security reasons, we don't
+    // recognize the env. var. to turn on heap-checking if we're setuid.
+    RAW_LOG(WARNING, ("HeapChecker: ignoring HEAPCHECK because "
+                      "program seems to be setuid\n"));
+    need_heap_check = false;
+  }
+#endif
+  if (need_heap_check) {
+    HeapLeakChecker::BeforeConstructorsLocked();
+  }
+}
+
+// This function overrides the weak function defined in malloc_hook.cc and
+// called by one of the initial malloc hooks (malloc_hook.cc) when the very
+// first memory allocation or an mmap/sbrk happens.  This ensures that
+// HeapLeakChecker is initialized and installs all its hooks early enough to
+// track absolutely all memory allocations and all memory region acquisitions
+// via mmap and sbrk.
+extern "C" void MallocHook_InitAtFirstAllocation_HeapLeakChecker() {
+  HeapLeakChecker_BeforeConstructors();
+}
+
+// This function is executed after all global object destructors run.
+void HeapLeakChecker_AfterDestructors() {
+  { SpinLockHolder l(&heap_checker_lock);
+    // can get here (via forks?) with other pids
+    if (heap_checker_pid != getpid()) return;
+  }
+  if (FLAGS_heap_check_after_destructors) {
+    if (HeapLeakChecker::DoMainHeapCheck()) {
+      const struct timespec sleep_time = { 0, 500000000 };  // 500 ms
+      nanosleep(&sleep_time, NULL);
+        // Need this hack to wait for other pthreads to exit.
+        // Otherwise tcmalloc finds errors
+        // on a free() call from pthreads.
+    }
+  }
+  SpinLockHolder l(&heap_checker_lock);
+  RAW_CHECK(!do_main_heap_check, "should have done it");
+}
+
+//----------------------------------------------------------------------
+// HeapLeakChecker disabling helpers
+//----------------------------------------------------------------------
+
+// These functions are at the end of the file to prevent their inlining:
+
+// static
+void HeapLeakChecker::DisableChecksFromToLocked(const void* start_address,
+                                                const void* end_address,
+                                                int max_depth) {
+  RAW_DCHECK(heap_checker_lock.IsHeld(), "");
+  RAW_DCHECK(start_address < end_address, "");
+  if (disabled_ranges == NULL) {
+    disabled_ranges = new(Allocator::Allocate(sizeof(DisabledRangeMap)))
+                        DisabledRangeMap;
+  }
+  RangeValue value;
+  value.start_address = AsInt(start_address);
+  value.max_depth = max_depth;
+  if (disabled_ranges->insert(make_pair(AsInt(end_address), value)).second) {
+    RAW_VLOG(10, "Disabling leak checking in stack traces "
+                "under frame addresses between %p..%p",
+                start_address, end_address);
+  } else {  // check that this is just a verbatim repetition
+    RangeValue const& val = disabled_ranges->find(AsInt(end_address))->second;
+    if (val.max_depth != value.max_depth  ||
+        val.start_address != value.start_address) {
+      RAW_LOG(FATAL, "Two DisableChecksToHereFrom calls conflict: "
+                     "(%p, %p, %d) vs. (%p, %p, %d)",
+                     AsPtr(val.start_address), end_address, val.max_depth,
+                     start_address, end_address, max_depth);
+    }
+  }
+}
+
+// static
+inline bool HeapLeakChecker::HaveOnHeapLocked(const void** ptr,
+                                              size_t* object_size) {
+  // Commented-out because HaveOnHeapLocked is very performance-critical:
+  // RAW_DCHECK(heap_checker_lock.IsHeld(), "");
+  const uintptr_t addr = AsInt(*ptr);
+  if (heap_profile->FindInsideAlloc(
+        *ptr, max_heap_object_size, ptr, object_size)) {
+    RAW_VLOG(16, "Got pointer into %p at +%" PRIuPTR " offset",
+             *ptr, addr - AsInt(*ptr));
+    return true;
+  }
+  return false;
+}
+
+// static
+const void* HeapLeakChecker::GetAllocCaller(void* ptr) {
+  // this is used only in the unittest, so the heavy checks are fine
+  HeapProfileTable::AllocInfo info;
+  { SpinLockHolder l(&heap_checker_lock);
+    RAW_CHECK(heap_profile->FindAllocDetails(ptr, &info), "");
+  }
+  RAW_CHECK(info.stack_depth >= 1, "");
+  return info.call_stack[0];
+}
diff --git a/src/heap-profile-stats.h b/src/heap-profile-stats.h
new file mode 100644
index 0000000..ae45d58
--- /dev/null
+++ b/src/heap-profile-stats.h
@@ -0,0 +1,78 @@
+// -*- Mode: C++; c-basic-offset: 2; indent-tabs-mode: nil -*-
+// Copyright (c) 2013, Google Inc.
+// All rights reserved.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+// 
+//     * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+//     * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+// 
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+// This file defines structs to accumulate memory allocation and deallocation
+// counts.  These structs are commonly used for malloc (in HeapProfileTable)
+// and mmap (in MemoryRegionMap).
+
+// A bucket is data structure for heap profiling to store a pair of a stack
+// trace and counts of (de)allocation.  Buckets are stored in a hash table
+// which is declared as "HeapProfileBucket**".
+//
+// A hash value is computed from a stack trace.  Collision in the hash table
+// is resolved by separate chaining with linked lists.  The links in the list
+// are implemented with the member "HeapProfileBucket* next".
+//
+// A structure of a hash table HeapProfileBucket** bucket_table would be like:
+// bucket_table[0] => NULL
+// bucket_table[1] => HeapProfileBucket() => HeapProfileBucket() => NULL
+// ...
+// bucket_table[i] => HeapProfileBucket() => NULL
+// ...
+// bucket_table[n] => HeapProfileBucket() => NULL
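+//
+// A lookup for a given stack-trace hash therefore walks a single chain
+// (illustrative sketch only; the real lookup lives in
+// HeapProfileTable::GetBucket, and StacksEqual is a hypothetical helper):
+//
+//   for (HeapProfileBucket* b = bucket_table[hash % table_size];
+//        b != NULL; b = b->next) {
+//     if (b->hash == hash && StacksEqual(*b, stack, depth)) return b;
+//   }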
+
+#ifndef HEAP_PROFILE_STATS_H_
+#define HEAP_PROFILE_STATS_H_
+
+struct HeapProfileStats {
+  // Returns true if the two HeapProfileStats are semantically equal.
+  bool Equivalent(const HeapProfileStats& other) const {
+    return allocs - frees == other.allocs - other.frees &&
+        alloc_size - free_size == other.alloc_size - other.free_size;
+  }
+
+  int32 allocs;      // Number of allocation calls.
+  int32 frees;       // Number of free calls.
+  int64 alloc_size;  // Total size of all allocated objects so far.
+  int64 free_size;   // Total size of all freed objects so far.
+};
+
+// Allocation and deallocation statistics per each stack trace.
+struct HeapProfileBucket : public HeapProfileStats {
+  // Longest stack trace we record.
+  static const int kMaxStackDepth = 32;
+
+  uintptr_t hash;           // Hash value of the stack trace.
+  int depth;                // Depth of stack trace.
+  const void** stack;       // Stack trace.
+  HeapProfileBucket* next;  // Next entry in hash-table.
+};
+
+#endif  // HEAP_PROFILE_STATS_H_
diff --git a/src/heap-profile-table.cc b/src/heap-profile-table.cc
new file mode 100644
index 0000000..7486468
--- /dev/null
+++ b/src/heap-profile-table.cc
@@ -0,0 +1,631 @@
+// -*- Mode: C++; c-basic-offset: 2; indent-tabs-mode: nil -*-
+// Copyright (c) 2006, Google Inc.
+// All rights reserved.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+// 
+//     * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+//     * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+// 
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+// ---
+// Author: Sanjay Ghemawat
+//         Maxim Lifantsev (refactoring)
+//
+
+#include <config.h>
+
+#ifdef HAVE_UNISTD_H
+#include <unistd.h>   // for write()
+#endif
+#include <fcntl.h>    // for open()
+#ifdef HAVE_GLOB_H
+#include <glob.h>
+#ifndef GLOB_NOMATCH  // true on some old cygwins
+# define GLOB_NOMATCH 0
+#endif
+#endif
+#ifdef HAVE_INTTYPES_H
+#include <inttypes.h> // for PRIxPTR
+#endif
+#ifdef HAVE_POLL_H
+#include <poll.h>
+#endif
+#include <errno.h>
+#include <stdarg.h>
+#include <string>
+#include <map>
+#include <algorithm>  // for sort(), equal(), and copy()
+
+#include "heap-profile-table.h"
+
+#include "base/logging.h"
+#include "raw_printer.h"
+#include "symbolize.h"
+#include <gperftools/stacktrace.h>
+#include <gperftools/malloc_hook.h>
+#include "memory_region_map.h"
+#include "base/commandlineflags.h"
+#include "base/logging.h"    // for the RawFD I/O commands
+#include "base/sysinfo.h"
+
+using std::sort;
+using std::equal;
+using std::copy;
+using std::string;
+using std::map;
+
+using tcmalloc::FillProcSelfMaps;   // from sysinfo.h
+using tcmalloc::DumpProcSelfMaps;   // from sysinfo.h
+
+//----------------------------------------------------------------------
+
+DEFINE_bool(cleanup_old_heap_profiles,
+            EnvToBool("HEAP_PROFILE_CLEANUP", true),
+            "At initialization time, delete old heap profiles.");
+
+DEFINE_int32(heap_check_max_leaks,
+             EnvToInt("HEAP_CHECK_MAX_LEAKS", 20),
+             "The maximum number of leak reports to print.");
+
+//----------------------------------------------------------------------
+
+// header of the dumped heap profile
+static const char kProfileHeader[] = "heap profile: ";
+static const char kProcSelfMapsHeader[] = "\nMAPPED_LIBRARIES:\n";
+
+//----------------------------------------------------------------------
+
+const char HeapProfileTable::kFileExt[] = ".heap";
+
+//----------------------------------------------------------------------
+
+static const int kHashTableSize = 179999;   // Size for bucket_table_.
+/*static*/ const int HeapProfileTable::kMaxStackDepth;
+
+//----------------------------------------------------------------------
+
+// We strip out a different number of stack frames in debug mode
+// because less inlining happens in that case
+#ifdef NDEBUG
+static const int kStripFrames = 2;
+#else
+static const int kStripFrames = 3;
+#endif
+
+// For sorting Stats or Buckets by in-use space
+static bool ByAllocatedSpace(HeapProfileTable::Stats* a,
+                             HeapProfileTable::Stats* b) {
+  // Return true iff "a" has more allocated space than "b"
+  return (a->alloc_size - a->free_size) > (b->alloc_size - b->free_size);
+}
+
+//----------------------------------------------------------------------
+
+HeapProfileTable::HeapProfileTable(Allocator alloc,
+                                   DeAllocator dealloc,
+                                   bool profile_mmap)
+    : alloc_(alloc),
+      dealloc_(dealloc),
+      profile_mmap_(profile_mmap),
+      bucket_table_(NULL),
+      num_buckets_(0),
+      address_map_(NULL) {
+  // Make a hash table for buckets.
+  const int table_bytes = kHashTableSize * sizeof(*bucket_table_);
+  bucket_table_ = static_cast<Bucket**>(alloc_(table_bytes));
+  memset(bucket_table_, 0, table_bytes);
+
+  // Make an allocation map.
+  address_map_ =
+      new(alloc_(sizeof(AllocationMap))) AllocationMap(alloc_, dealloc_);
+
+  // Initialize.
+  memset(&total_, 0, sizeof(total_));
+  num_buckets_ = 0;
+}
+
+HeapProfileTable::~HeapProfileTable() {
+  // Free the allocation map.
+  address_map_->~AllocationMap();
+  dealloc_(address_map_);
+  address_map_ = NULL;
+
+  // Free the hash table.
+  for (int i = 0; i < kHashTableSize; i++) {
+    for (Bucket* curr = bucket_table_[i]; curr != 0; /**/) {
+      Bucket* bucket = curr;
+      curr = curr->next;
+      dealloc_(bucket->stack);
+      dealloc_(bucket);
+    }
+  }
+  dealloc_(bucket_table_);
+  bucket_table_ = NULL;
+}
+
+HeapProfileTable::Bucket* HeapProfileTable::GetBucket(int depth,
+                                                      const void* const key[]) {
+  // Make hash-value
+  uintptr_t h = 0;
+  for (int i = 0; i < depth; i++) {
+    h += reinterpret_cast<uintptr_t>(key[i]);
+    h += h << 10;
+    h ^= h >> 6;
+  }
+  h += h << 3;
+  h ^= h >> 11;
+
+  // Lookup stack trace in table
+  unsigned int buck = ((unsigned int) h) % kHashTableSize;
+  for (Bucket* b = bucket_table_[buck]; b != 0; b = b->next) {
+    if ((b->hash == h) &&
+        (b->depth == depth) &&
+        equal(key, key + depth, b->stack)) {
+      return b;
+    }
+  }
+
+  // Create new bucket
+  const size_t key_size = sizeof(key[0]) * depth;
+  const void** kcopy = reinterpret_cast<const void**>(alloc_(key_size));
+  copy(key, key + depth, kcopy);
+  Bucket* b = reinterpret_cast<Bucket*>(alloc_(sizeof(Bucket)));
+  memset(b, 0, sizeof(*b));
+  b->hash  = h;
+  b->depth = depth;
+  b->stack = kcopy;
+  b->next  = bucket_table_[buck];
+  bucket_table_[buck] = b;
+  num_buckets_++;
+  return b;
+}
+
+int HeapProfileTable::GetCallerStackTrace(
+    int skip_count, void* stack[kMaxStackDepth]) {
+  return MallocHook::GetCallerStackTrace(
+      stack, kMaxStackDepth, kStripFrames + skip_count + 1);
+}
+
+void HeapProfileTable::RecordAlloc(
+    const void* ptr, size_t bytes, int stack_depth,
+    const void* const call_stack[]) {
+  Bucket* b = GetBucket(stack_depth, call_stack);
+  b->allocs++;
+  b->alloc_size += bytes;
+  total_.allocs++;
+  total_.alloc_size += bytes;
+
+  AllocValue v;
+  v.set_bucket(b);  // also did set_live(false); set_ignore(false)
+  v.bytes = bytes;
+  address_map_->Insert(ptr, v);
+}
+
+void HeapProfileTable::RecordFree(const void* ptr) {
+  AllocValue v;
+  if (address_map_->FindAndRemove(ptr, &v)) {
+    Bucket* b = v.bucket();
+    b->frees++;
+    b->free_size += v.bytes;
+    total_.frees++;
+    total_.free_size += v.bytes;
+  }
+}
+
+bool HeapProfileTable::FindAlloc(const void* ptr, size_t* object_size) const {
+  const AllocValue* alloc_value = address_map_->Find(ptr);
+  if (alloc_value != NULL) *object_size = alloc_value->bytes;
+  return alloc_value != NULL;
+}
+
+bool HeapProfileTable::FindAllocDetails(const void* ptr,
+                                        AllocInfo* info) const {
+  const AllocValue* alloc_value = address_map_->Find(ptr);
+  if (alloc_value != NULL) {
+    info->object_size = alloc_value->bytes;
+    info->call_stack = alloc_value->bucket()->stack;
+    info->stack_depth = alloc_value->bucket()->depth;
+  }
+  return alloc_value != NULL;
+}
+
+bool HeapProfileTable::FindInsideAlloc(const void* ptr,
+                                       size_t max_size,
+                                       const void** object_ptr,
+                                       size_t* object_size) const {
+  const AllocValue* alloc_value =
+    address_map_->FindInside(&AllocValueSize, max_size, ptr, object_ptr);
+  if (alloc_value != NULL) *object_size = alloc_value->bytes;
+  return alloc_value != NULL;
+}
+
+bool HeapProfileTable::MarkAsLive(const void* ptr) {
+  AllocValue* alloc = address_map_->FindMutable(ptr);
+  if (alloc && !alloc->live()) {
+    alloc->set_live(true);
+    return true;
+  }
+  return false;
+}
+
+void HeapProfileTable::MarkAsIgnored(const void* ptr) {
+  AllocValue* alloc = address_map_->FindMutable(ptr);
+  if (alloc) {
+    alloc->set_ignore(true);
+  }
+}
+
+// We'd be happier using snprintfer, but we don't, to reduce dependencies.
+int HeapProfileTable::UnparseBucket(const Bucket& b,
+                                    char* buf, int buflen, int bufsize,
+                                    const char* extra,
+                                    Stats* profile_stats) {
+  if (profile_stats != NULL) {
+    profile_stats->allocs += b.allocs;
+    profile_stats->alloc_size += b.alloc_size;
+    profile_stats->frees += b.frees;
+    profile_stats->free_size += b.free_size;
+  }
+  int printed =
+    snprintf(buf + buflen, bufsize - buflen, "%6d: %8" PRId64 " [%6d: %8" PRId64 "] @%s",
+             b.allocs - b.frees,
+             b.alloc_size - b.free_size,
+             b.allocs,
+             b.alloc_size,
+             extra);
+  // If it looks like the snprintf failed, ignore the fact we printed anything
+  if (printed < 0 || printed >= bufsize - buflen) return buflen;
+  buflen += printed;
+  for (int d = 0; d < b.depth; d++) {
+    printed = snprintf(buf + buflen, bufsize - buflen, " 0x%08" PRIxPTR,
+                       reinterpret_cast<uintptr_t>(b.stack[d]));
+    if (printed < 0 || printed >= bufsize - buflen) return buflen;
+    buflen += printed;
+  }
+  printed = snprintf(buf + buflen, bufsize - buflen, "\n");
+  if (printed < 0 || printed >= bufsize - buflen) return buflen;
+  buflen += printed;
+  return buflen;
+}
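+
+// For orientation, one bucket formatted by UnparseBucket above looks
+// roughly like this (illustrative values; the columns are live objects,
+// live bytes, total allocations, total allocated bytes, then the stack):
+//
+//      3:     4096 [     7:    10240] @ 0x0040a1b2 0x0040c3d4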
+
+HeapProfileTable::Bucket**
+HeapProfileTable::MakeSortedBucketList() const {
+  Bucket** list = static_cast<Bucket**>(alloc_(sizeof(Bucket) * num_buckets_));
+
+  int bucket_count = 0;
+  for (int i = 0; i < kHashTableSize; i++) {
+    for (Bucket* curr = bucket_table_[i]; curr != 0; curr = curr->next) {
+      list[bucket_count++] = curr;
+    }
+  }
+  RAW_DCHECK(bucket_count == num_buckets_, "");
+
+  sort(list, list + num_buckets_, ByAllocatedSpace);
+
+  return list;
+}
+
+void HeapProfileTable::IterateOrderedAllocContexts(
+    AllocContextIterator callback) const {
+  Bucket** list = MakeSortedBucketList();
+  AllocContextInfo info;
+  for (int i = 0; i < num_buckets_; ++i) {
+    *static_cast<Stats*>(&info) = *static_cast<Stats*>(list[i]);
+    info.stack_depth = list[i]->depth;
+    info.call_stack = list[i]->stack;
+    callback(info);
+  }
+  dealloc_(list);
+}
+
+int HeapProfileTable::FillOrderedProfile(char buf[], int size) const {
+  Bucket** list = MakeSortedBucketList();
+
+  // Our file format is "bucket, bucket, ..., bucket, proc_self_maps_info".
+  // In the cases buf is too small, we'd rather leave out the last
+  // buckets than leave out the /proc/self/maps info.  To ensure that,
+  // we actually print the /proc/self/maps info first, then move it to
+  // the end of the buffer, then write the bucket info into whatever
+  // is remaining, and then move the maps info one last time to close
+  // any gaps.  Whew!
+  int map_length = snprintf(buf, size, "%s", kProcSelfMapsHeader);
+  if (map_length < 0 || map_length >= size) {
+      dealloc_(list);
+      return 0;
+  }
+  bool dummy;   // "wrote_all" -- did /proc/self/maps fit in its entirety?
+  map_length += FillProcSelfMaps(buf + map_length, size - map_length, &dummy);
+  RAW_DCHECK(map_length <= size, "");
+  char* const map_start = buf + size - map_length;      // move to end
+  memmove(map_start, buf, map_length);
+  size -= map_length;
+
+  Stats stats;
+  memset(&stats, 0, sizeof(stats));
+  int bucket_length = snprintf(buf, size, "%s", kProfileHeader);
+  if (bucket_length < 0 || bucket_length >= size) {
+      dealloc_(list);
+      return 0;
+  }
+  bucket_length = UnparseBucket(total_, buf, bucket_length, size,
+                                " heapprofile", &stats);
+
+  // Dump the mmap list first.
+  if (profile_mmap_) {
+    BufferArgs buffer(buf, bucket_length, size);
+    MemoryRegionMap::IterateBuckets<BufferArgs*>(DumpBucketIterator, &buffer);
+    bucket_length = buffer.buflen;
+  }
+
+  for (int i = 0; i < num_buckets_; i++) {
+    bucket_length = UnparseBucket(*list[i], buf, bucket_length, size, "",
+                                  &stats);
+  }
+  RAW_DCHECK(bucket_length < size, "");
+
+  dealloc_(list);
+
+  RAW_DCHECK(buf + bucket_length <= map_start, "");
+  memmove(buf + bucket_length, map_start, map_length);  // close the gap
+
+  return bucket_length + map_length;
+}
+
+// static
+void HeapProfileTable::DumpBucketIterator(const Bucket* bucket,
+                                          BufferArgs* args) {
+  args->buflen = UnparseBucket(*bucket, args->buf, args->buflen, args->bufsize,
+                               "", NULL);
+}
+
+inline
+void HeapProfileTable::DumpNonLiveIterator(const void* ptr, AllocValue* v,
+                                           const DumpArgs& args) {
+  if (v->live()) {
+    v->set_live(false);
+    return;
+  }
+  if (v->ignore()) {
+    return;
+  }
+  Bucket b;
+  memset(&b, 0, sizeof(b));
+  b.allocs = 1;
+  b.alloc_size = v->bytes;
+  b.depth = v->bucket()->depth;
+  b.stack = v->bucket()->stack;
+  char buf[1024];
+  int len = UnparseBucket(b, buf, 0, sizeof(buf), "", args.profile_stats);
+  RawWrite(args.fd, buf, len);
+}
+
+// Callback from NonLiveSnapshot; adds the entry to arg->dest
+// if the entry is not live and is not present in arg->base.
+void HeapProfileTable::AddIfNonLive(const void* ptr, AllocValue* v,
+                                    AddNonLiveArgs* arg) {
+  if (v->live()) {
+    v->set_live(false);
+  } else {
+    if (arg->base != NULL && arg->base->map_.Find(ptr) != NULL) {
+      // Present in arg->base, so do not save
+    } else {
+      arg->dest->Add(ptr, *v);
+    }
+  }
+}
+
+bool HeapProfileTable::WriteProfile(const char* file_name,
+                                    const Bucket& total,
+                                    AllocationMap* allocations) {
+  RAW_VLOG(1, "Dumping non-live heap profile to %s", file_name);
+  RawFD fd = RawOpenForWriting(file_name);
+  if (fd != kIllegalRawFD) {
+    RawWrite(fd, kProfileHeader, strlen(kProfileHeader));
+    char buf[512];
+    int len = UnparseBucket(total, buf, 0, sizeof(buf), " heapprofile",
+                            NULL);
+    RawWrite(fd, buf, len);
+    const DumpArgs args(fd, NULL);
+    allocations->Iterate<const DumpArgs&>(DumpNonLiveIterator, args);
+    RawWrite(fd, kProcSelfMapsHeader, strlen(kProcSelfMapsHeader));
+    DumpProcSelfMaps(fd);
+    RawClose(fd);
+    return true;
+  } else {
+    RAW_LOG(ERROR, "Failed dumping filtered heap profile to %s", file_name);
+    return false;
+  }
+}
+
+void HeapProfileTable::CleanupOldProfiles(const char* prefix) {
+  if (!FLAGS_cleanup_old_heap_profiles)
+    return;
+  string pattern = string(prefix) + ".*" + kFileExt;
+#if defined(HAVE_GLOB_H)
+  glob_t g;
+  const int r = glob(pattern.c_str(), GLOB_ERR, NULL, &g);
+  if (r == 0 || r == GLOB_NOMATCH) {
+    const int prefix_length = strlen(prefix);
+    for (int i = 0; i < g.gl_pathc; i++) {
+      const char* fname = g.gl_pathv[i];
+      if ((strlen(fname) >= prefix_length) &&
+          (memcmp(fname, prefix, prefix_length) == 0)) {
+        RAW_VLOG(1, "Removing old heap profile %s", fname);
+        unlink(fname);
+      }
+    }
+  }
+  globfree(&g);
+#else   /* HAVE_GLOB_H */
+  RAW_LOG(WARNING, "Unable to remove old heap profiles (can't run glob())");
+#endif
+}
+
+HeapProfileTable::Snapshot* HeapProfileTable::TakeSnapshot() {
+  Snapshot* s = new (alloc_(sizeof(Snapshot))) Snapshot(alloc_, dealloc_);
+  address_map_->Iterate(AddToSnapshot, s);
+  return s;
+}
+
+void HeapProfileTable::ReleaseSnapshot(Snapshot* s) {
+  s->~Snapshot();
+  dealloc_(s);
+}
+
+// Callback from TakeSnapshot; adds a single entry to snapshot
+void HeapProfileTable::AddToSnapshot(const void* ptr, AllocValue* v,
+                                     Snapshot* snapshot) {
+  snapshot->Add(ptr, *v);
+}
+
+HeapProfileTable::Snapshot* HeapProfileTable::NonLiveSnapshot(
+    Snapshot* base) {
+  RAW_VLOG(2, "NonLiveSnapshot input: %d %d\n",
+           int(total_.allocs - total_.frees),
+           int(total_.alloc_size - total_.free_size));
+
+  Snapshot* s = new (alloc_(sizeof(Snapshot))) Snapshot(alloc_, dealloc_);
+  AddNonLiveArgs args;
+  args.dest = s;
+  args.base = base;
+  address_map_->Iterate<AddNonLiveArgs*>(AddIfNonLive, &args);
+  RAW_VLOG(2, "NonLiveSnapshot output: %d %d\n",
+           int(s->total_.allocs - s->total_.frees),
+           int(s->total_.alloc_size - s->total_.free_size));
+  return s;
+}
+
+// Information kept per unique bucket seen
+struct HeapProfileTable::Snapshot::Entry {
+  int count;
+  int bytes;
+  Bucket* bucket;
+  Entry() : count(0), bytes(0) { }
+
+  // Order by decreasing bytes
+  bool operator<(const Entry& x) const {
+    return this->bytes > x.bytes;
+  }
+};
+
+// State used to generate leak report.  We keep a mapping from Bucket pointer
+// to the collected stats for that bucket.
+struct HeapProfileTable::Snapshot::ReportState {
+  map<Bucket*, Entry> buckets_;
+};
+
+// Callback from ReportLeaks; updates ReportState.
+void HeapProfileTable::Snapshot::ReportCallback(const void* ptr,
+                                                AllocValue* v,
+                                                ReportState* state) {
+  Entry* e = &state->buckets_[v->bucket()]; // Creates empty Entry first time
+  e->bucket = v->bucket();
+  e->count++;
+  e->bytes += v->bytes;
+}
+
+void HeapProfileTable::Snapshot::ReportLeaks(const char* checker_name,
+                                             const char* filename,
+                                             bool should_symbolize) {
+  // This is only used by the heap leak checker, but is intimately
+  // tied to the allocation map that belongs in this module and is
+  // therefore placed here.
+  RAW_LOG(ERROR, "Leak check %s detected leaks of %" PRIuS " bytes "
+          "in %" PRIuS " objects",
+          checker_name,
+          size_t(total_.alloc_size),
+          size_t(total_.allocs));
+
+  // Group objects by Bucket
+  ReportState state;
+  map_.Iterate(&ReportCallback, &state);
+
+  // Sort buckets by decreasing leaked size
+  const int n = state.buckets_.size();
+  Entry* entries = new Entry[n];
+  int dst = 0;
+  for (map<Bucket*,Entry>::const_iterator iter = state.buckets_.begin();
+       iter != state.buckets_.end();
+       ++iter) {
+    entries[dst++] = iter->second;
+  }
+  sort(entries, entries + n);
+
+  // Report a bounded number of leaks to keep the leak report from
+  // growing too long.
+  const int to_report =
+      (FLAGS_heap_check_max_leaks > 0 &&
+       n > FLAGS_heap_check_max_leaks) ? FLAGS_heap_check_max_leaks : n;
+  RAW_LOG(ERROR, "The %d largest leaks:", to_report);
+
+  // Print
+  SymbolTable symbolization_table;
+  for (int i = 0; i < to_report; i++) {
+    const Entry& e = entries[i];
+    for (int j = 0; j < e.bucket->depth; j++) {
+      symbolization_table.Add(e.bucket->stack[j]);
+    }
+  }
+  static const int kBufSize = 2<<10;
+  char buffer[kBufSize];
+  if (should_symbolize)
+    symbolization_table.Symbolize();
+  for (int i = 0; i < to_report; i++) {
+    const Entry& e = entries[i];
+    base::RawPrinter printer(buffer, kBufSize);
+    printer.Printf("Leak of %d bytes in %d objects allocated from:\n",
+                   e.bytes, e.count);
+    for (int j = 0; j < e.bucket->depth; j++) {
+      const void* pc = e.bucket->stack[j];
+      printer.Printf("\t@ %" PRIxPTR " %s\n",
+          reinterpret_cast<uintptr_t>(pc), symbolization_table.GetSymbol(pc));
+    }
+    RAW_LOG(ERROR, "%s", buffer);
+  }
+
+  if (to_report < n) {
+    RAW_LOG(ERROR, "Skipping leaks numbered %d..%d",
+            to_report, n-1);
+  }
+  delete[] entries;
+
+  // TODO: Dump the sorted Entry list instead of dumping raw data?
+  // (should be much shorter)
+  if (!HeapProfileTable::WriteProfile(filename, total_, &map_)) {
+    RAW_LOG(ERROR, "Could not write pprof profile to %s", filename);
+  }
+}
+
+void HeapProfileTable::Snapshot::ReportObject(const void* ptr,
+                                              AllocValue* v,
+                                              char* unused) {
+  // Perhaps also log the allocation stack trace (unsymbolized)
+  // on this line in case somebody finds it useful.
+  RAW_LOG(ERROR, "leaked %" PRIuS " byte object %p", v->bytes, ptr);
+}
+
+void HeapProfileTable::Snapshot::ReportIndividualObjects() {
+  char unused;
+  map_.Iterate(ReportObject, &unused);
+}
diff --git a/src/heap-profile-table.h b/src/heap-profile-table.h
new file mode 100644
index 0000000..3c62847
--- /dev/null
+++ b/src/heap-profile-table.h
@@ -0,0 +1,399 @@
+// -*- Mode: C++; c-basic-offset: 2; indent-tabs-mode: nil -*-
+// Copyright (c) 2006, Google Inc.
+// All rights reserved.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+// 
+//     * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+//     * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+// 
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+// ---
+// Author: Sanjay Ghemawat
+//         Maxim Lifantsev (refactoring)
+//
+
+#ifndef BASE_HEAP_PROFILE_TABLE_H_
+#define BASE_HEAP_PROFILE_TABLE_H_
+
+#include "addressmap-inl.h"
+#include "base/basictypes.h"
+#include "base/logging.h"   // for RawFD
+#include "heap-profile-stats.h"
+
+// Table that maintains heap profile data,
+// i.e. the set of currently active heap memory allocations.
+// The code is thread-unsafe and non-reentrant:
+// each instance must be used by one thread
+// at a time, without self-recursion.
+//
+// TODO(maxim): add a unittest for this class.
+class HeapProfileTable {
+ public:
+
+  // Extension to be used for heap profile files.
+  static const char kFileExt[];
+
+  // Longest stack trace we record.
+  static const int kMaxStackDepth = 32;
+
+  // data types ----------------------------
+
+  // Profile stats.
+  typedef HeapProfileStats Stats;
+
+  // Info we can return about an allocation.
+  struct AllocInfo {
+    size_t object_size;  // size of the allocation
+    const void* const* call_stack;  // call stack that made the allocation call
+    int stack_depth;  // depth of call_stack
+    bool live;
+    bool ignored;
+  };
+
+  // Info we return about an allocation context.
+  // An allocation context is a unique caller stack trace
+  // of an allocation operation.
+  struct AllocContextInfo : public Stats {
+    int stack_depth;                // Depth of stack trace
+    const void* const* call_stack;  // Stack trace
+  };
+
+  // Memory (de)allocator interface we'll use.
+  typedef void* (*Allocator)(size_t size);
+  typedef void  (*DeAllocator)(void* ptr);
+
+  // interface ---------------------------
+
+  HeapProfileTable(Allocator alloc, DeAllocator dealloc, bool profile_mmap);
+  ~HeapProfileTable();
+
+  // Collect the stack trace for the function that asked to do the
+  // allocation for passing to RecordAlloc() below.
+  //
+  // The stack trace is stored in 'stack'. The stack depth is returned.
+  //
+  // 'skip_count' gives the number of stack frames between this call
+  // and the memory allocation function.
+  static int GetCallerStackTrace(int skip_count, void* stack[kMaxStackDepth]);
+
+  // Record an allocation at 'ptr' of 'bytes' bytes.  'stack_depth'
+  // and 'call_stack' identify the function that requested the
+  // allocation. They can be generated using GetCallerStackTrace() above.
+  void RecordAlloc(const void* ptr, size_t bytes,
+                   int stack_depth, const void* const call_stack[]);
+
+  // Record the deallocation of memory at 'ptr'.
+  void RecordFree(const void* ptr);
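+
+  // Illustrative sketch of how the three calls above fit together in an
+  // allocation hook (mirrors the hooks in heap-profiler.cc; 'table', 'ptr'
+  // and 'bytes' are hypothetical names):
+  //
+  //   void* stack[HeapProfileTable::kMaxStackDepth];
+  //   int depth = HeapProfileTable::GetCallerStackTrace(/*skip_count=*/0, stack);
+  //   table->RecordAlloc(ptr, bytes, depth, stack);
+  //   ...
+  //   table->RecordFree(ptr);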
+
+  // Return true iff we have recorded an allocation at 'ptr'.
+  // If yes, fill *object_size with the allocation byte size.
+  bool FindAlloc(const void* ptr, size_t* object_size) const;
+  // Same as FindAlloc, but fills all of *info.
+  bool FindAllocDetails(const void* ptr, AllocInfo* info) const;
+
+  // Return true iff "ptr" points into a recorded allocation
+  // If yes, fill *object_ptr with the actual allocation address
+  // and *object_size with the allocation byte size.
+  // max_size specifies largest currently possible allocation size.
+  bool FindInsideAlloc(const void* ptr, size_t max_size,
+                       const void** object_ptr, size_t* object_size) const;
+
+  // If "ptr" points to a recorded allocation and it's not marked as live
+  // mark it as live and return true. Else return false.
+  // All allocations start as non-live.
+  bool MarkAsLive(const void* ptr);
+
+  // If "ptr" points to a recorded allocation, mark it as "ignored".
+  // Ignored objects are treated like other objects, except that they
+  // are skipped in heap checking reports.
+  void MarkAsIgnored(const void* ptr);
+
+  // Return current total (de)allocation statistics.  It doesn't contain
+  // mmap'ed regions.
+  const Stats& total() const { return total_; }
+
+  // Allocation data iteration callback: gets passed object pointer and
+  // fully-filled AllocInfo.
+  typedef void (*AllocIterator)(const void* ptr, const AllocInfo& info);
+
+  // Iterate over the allocation profile data calling "callback"
+  // for every allocation.
+  void IterateAllocs(AllocIterator callback) const {
+    address_map_->Iterate(MapArgsAllocIterator, callback);
+  }
+
+  // Allocation context profile data iteration callback
+  typedef void (*AllocContextIterator)(const AllocContextInfo& info);
+
+  // Iterate over the allocation context profile data calling "callback"
+  // for every allocation context. Allocation contexts are ordered by the
+  // size of allocated space.
+  void IterateOrderedAllocContexts(AllocContextIterator callback) const;
+
+  // Fill profile data into buffer 'buf' of size 'size'
+  // and return the actual size occupied by the dump in 'buf'.
+  // The profile buckets are dumped in the decreasing order
+  // of currently allocated bytes.
+  // We do not provision for 0-terminating 'buf'.
+  int FillOrderedProfile(char buf[], int size) const;
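+
+  // Illustrative sketch (assuming 'table' is a HeapProfileTable* and 'buf'
+  // is a caller-owned buffer of 'buflen' bytes, as in heap-profiler.cc):
+  //
+  //   int used = table->FillOrderedProfile(buf, buflen - 1);
+  //   buf[used] = '\0';   // caller adds the terminator if it wants a C string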
+
+  // Cleanup any old profile files matching prefix + ".*" + kFileExt.
+  static void CleanupOldProfiles(const char* prefix);
+
+  // Return a snapshot of the current contents of *this.
+  // Caller must call ReleaseSnapshot() on result when no longer needed.
+  // The result is only valid while this exists and until
+  // the snapshot is discarded by calling ReleaseSnapshot().
+  class Snapshot;
+  Snapshot* TakeSnapshot();
+
+  // Release a previously taken snapshot.  snapshot must not
+  // be used after this call.
+  void ReleaseSnapshot(Snapshot* snapshot);
+
+  // Return a snapshot of every non-live, non-ignored object in *this.
+  // If "base" is non-NULL, skip any objects present in "base".
+  // As a side-effect, clears the "live" bit on every live object in *this.
+  // Caller must call ReleaseSnapshot() on result when no longer needed.
+  Snapshot* NonLiveSnapshot(Snapshot* base);
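+
+  // Illustrative sketch of the leak-checking flow these calls support;
+  // the real driver is the heap leak checker, and 'table', the checker
+  // name and the output path below are hypothetical:
+  //
+  //   Snapshot* base = table->TakeSnapshot();      // objects known at start
+  //   ... call table->MarkAsLive(p) for every reachable object p ...
+  //   Snapshot* leaks = table->NonLiveSnapshot(base);
+  //   if (!leaks->Empty())
+  //     leaks->ReportLeaks("checker", "/tmp/leaks.heap", false);
+  //   table->ReleaseSnapshot(leaks);
+  //   table->ReleaseSnapshot(base);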
+
+ private:
+
+  // data types ----------------------------
+
+  // Hash table bucket to hold (de)allocation stats
+  // for a given allocation call stack trace.
+  typedef HeapProfileBucket Bucket;
+
+  // Info stored in the address map
+  struct AllocValue {
+    // Access to the stack-trace bucket
+    Bucket* bucket() const {
+      return reinterpret_cast<Bucket*>(bucket_rep & ~uintptr_t(kMask));
+    }
+    // This also does set_live(false).
+    void set_bucket(Bucket* b) { bucket_rep = reinterpret_cast<uintptr_t>(b); }
+    size_t  bytes;   // Number of bytes in this allocation
+
+    // Access to the allocation liveness flag (for leak checking)
+    bool live() const { return bucket_rep & kLive; }
+    void set_live(bool l) {
+      bucket_rep = (bucket_rep & ~uintptr_t(kLive)) | (l ? kLive : 0);
+    }
+
+    // Should this allocation be ignored if it looks like a leak?
+    bool ignore() const { return bucket_rep & kIgnore; }
+    void set_ignore(bool r) {
+      bucket_rep = (bucket_rep & ~uintptr_t(kIgnore)) | (r ? kIgnore : 0);
+    }
+
+   private:
+    // We store a few bits in the bottom bits of bucket_rep.
+    // (Alignment is at least four, so we have at least two bits.)
+    static const int kLive = 1;
+    static const int kIgnore = 2;
+    static const int kMask = kLive | kIgnore;
+
+    uintptr_t bucket_rep;
+  };
+
+  // helper for FindInsideAlloc
+  static size_t AllocValueSize(const AllocValue& v) { return v.bytes; }
+
+  typedef AddressMap<AllocValue> AllocationMap;
+
+  // Arguments that need to be passed to the DumpBucketIterator callback below.
+  struct BufferArgs {
+    BufferArgs(char* buf_arg, int buflen_arg, int bufsize_arg)
+        : buf(buf_arg),
+          buflen(buflen_arg),
+          bufsize(bufsize_arg) {
+    }
+
+    char* buf;
+    int buflen;
+    int bufsize;
+
+    DISALLOW_COPY_AND_ASSIGN(BufferArgs);
+  };
+
+  // Arguments that need to be passed to the DumpNonLiveIterator callback below.
+  struct DumpArgs {
+    DumpArgs(RawFD fd_arg, Stats* profile_stats_arg)
+        : fd(fd_arg),
+          profile_stats(profile_stats_arg) {
+    }
+
+    RawFD fd;  // file to write to
+    Stats* profile_stats;  // stats to update (may be NULL)
+  };
+
+  // helpers ----------------------------
+
+  // Unparse bucket b and print its portion of profile dump into buf.
+  // We return the amount of space in buf that we use.  We start printing
+  // at buf + buflen, and promise not to go beyond buf + bufsize.
+  // We do not provision for 0-terminating 'buf'.
+  //
+  // If profile_stats is non-NULL, we update *profile_stats by
+  // counting bucket b.
+  //
+  // "extra" is appended to the unparsed bucket.  Typically it is empty,
+  // but may be set to something like " heapprofile" for the total
+  // bucket to indicate the type of the profile.
+  static int UnparseBucket(const Bucket& b,
+                           char* buf, int buflen, int bufsize,
+                           const char* extra,
+                           Stats* profile_stats);
+
+  // Get the bucket for the caller stack trace 'key' of depth 'depth'
+  // creating the bucket if needed.
+  Bucket* GetBucket(int depth, const void* const key[]);
+
+  // Helper for IterateAllocs to do callback signature conversion
+  // from AllocationMap::Iterate to AllocIterator.
+  static void MapArgsAllocIterator(const void* ptr, AllocValue* v,
+                                   AllocIterator callback) {
+    AllocInfo info;
+    info.object_size = v->bytes;
+    info.call_stack = v->bucket()->stack;
+    info.stack_depth = v->bucket()->depth;
+    info.live = v->live();
+    info.ignored = v->ignore();
+    callback(ptr, info);
+  }
+
+  // Helper to dump a bucket.
+  inline static void DumpBucketIterator(const Bucket* bucket,
+                                        BufferArgs* args);
+
+  // Helper for DumpNonLiveProfile to do object-granularity
+  // heap profile dumping. It gets passed to AllocationMap::Iterate.
+  inline static void DumpNonLiveIterator(const void* ptr, AllocValue* v,
+                                         const DumpArgs& args);
+
+  // Helper for IterateOrderedAllocContexts and FillOrderedProfile.
+  // Creates a sorted list of Buckets whose length is num_buckets_.
+  // The caller is responsible for deallocating the returned list.
+  Bucket** MakeSortedBucketList() const;
+
+  // Helper for TakeSnapshot.  Saves object to snapshot.
+  static void AddToSnapshot(const void* ptr, AllocValue* v, Snapshot* s);
+
+  // Arguments passed to AddIfNonLive
+  struct AddNonLiveArgs {
+    Snapshot* dest;
+    Snapshot* base;
+  };
+
+  // Helper for NonLiveSnapshot.  Adds the object to the destination
+  // snapshot if it is non-live.
+  static void AddIfNonLive(const void* ptr, AllocValue* v,
+                           AddNonLiveArgs* arg);
+
+  // Write contents of "*allocations" as a heap profile to
+  // "file_name".  "total" must contain the total of all entries in
+  // "*allocations".
+  static bool WriteProfile(const char* file_name,
+                           const Bucket& total,
+                           AllocationMap* allocations);
+
+  // data ----------------------------
+
+  // Memory (de)allocator that we use.
+  Allocator alloc_;
+  DeAllocator dealloc_;
+
+  // Overall profile stats; we use only the Stats part,
+  // but make it a Bucket to pass to UnparseBucket.
+  Bucket total_;
+
+  bool profile_mmap_;
+
+  // Bucket hash table for malloc.
+  // We hand-craft one instead of using one of the pre-written
+  // ones because we do not want to use malloc when operating on the table.
+  // It is only a few lines of code, so no big deal.
+  Bucket** bucket_table_;
+  int num_buckets_;
+
+  // Map of all currently allocated objects and mapped regions we know about.
+  AllocationMap* address_map_;
+
+  DISALLOW_COPY_AND_ASSIGN(HeapProfileTable);
+};
+
+class HeapProfileTable::Snapshot {
+ public:
+  const Stats& total() const { return total_; }
+
+  // Report anything in this snapshot as a leak.
+  // May use new/delete for temporary storage.
+  // If should_symbolize is true, will fork (which is not threadsafe)
+  // to turn addresses into symbol names.  Set to false for maximum safety.
+  // Also writes a heap profile to "filename" that contains
+  // all of the objects in this snapshot.
+  void ReportLeaks(const char* checker_name, const char* filename,
+                   bool should_symbolize);
+
+  // Report the addresses of all leaked objects.
+  // May use new/delete for temporary storage.
+  void ReportIndividualObjects();
+
+  bool Empty() const {
+    return (total_.allocs == 0) && (total_.alloc_size == 0);
+  }
+
+ private:
+  friend class HeapProfileTable;
+
+  // Total count/size are stored in a Bucket so we can reuse UnparseBucket
+  Bucket total_;
+
+  // We share the Buckets managed by the parent table, but have our
+  // own object->bucket map.
+  AllocationMap map_;
+
+  Snapshot(Allocator alloc, DeAllocator dealloc) : map_(alloc, dealloc) {
+    memset(&total_, 0, sizeof(total_));
+  }
+
+  // Callback used to populate a Snapshot object with entries found
+  // in another allocation map.
+  inline void Add(const void* ptr, const AllocValue& v) {
+    map_.Insert(ptr, v);
+    total_.allocs++;
+    total_.alloc_size += v.bytes;
+  }
+
+  // Helpers for sorting and generating leak reports
+  struct Entry;
+  struct ReportState;
+  static void ReportCallback(const void* ptr, AllocValue* v, ReportState*);
+  static void ReportObject(const void* ptr, AllocValue* v, char*);
+
+  DISALLOW_COPY_AND_ASSIGN(Snapshot);
+};
+
+#endif  // BASE_HEAP_PROFILE_TABLE_H_
diff --git a/src/heap-profiler.cc b/src/heap-profiler.cc
new file mode 100755
index 0000000..17d8697
--- /dev/null
+++ b/src/heap-profiler.cc
@@ -0,0 +1,620 @@
+// -*- Mode: C++; c-basic-offset: 2; indent-tabs-mode: nil -*-
+// Copyright (c) 2005, Google Inc.
+// All rights reserved.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+// 
+//     * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+//     * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+// 
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+// ---
+// Author: Sanjay Ghemawat
+//
+// TODO: Log large allocations
+
+#include <config.h>
+#include <stddef.h>
+#include <stdio.h>
+#include <stdlib.h>
+#ifdef HAVE_UNISTD_H
+#include <unistd.h>
+#endif
+#ifdef HAVE_INTTYPES_H
+#include <inttypes.h>
+#endif
+#ifdef HAVE_FCNTL_H
+#include <fcntl.h>    // for open()
+#endif
+#ifdef HAVE_MMAP
+#include <sys/mman.h>
+#endif
+#include <errno.h>
+#include <assert.h>
+#include <sys/types.h>
+#include <signal.h>
+
+#include <algorithm>
+#include <string>
+
+#include <gperftools/heap-profiler.h>
+
+#include "base/logging.h"
+#include "base/basictypes.h"   // for PRId64, among other things
+#include "base/googleinit.h"
+#include "base/commandlineflags.h"
+#include "malloc_hook-inl.h"
+#include "tcmalloc_guard.h"
+#include <gperftools/malloc_hook.h>
+#include <gperftools/malloc_extension.h>
+#include "base/spinlock.h"
+#include "base/low_level_alloc.h"
+#include "base/sysinfo.h"      // for GetUniquePathFromEnv()
+#include "heap-profile-table.h"
+#include "memory_region_map.h"
+
+
+#ifndef	PATH_MAX
+#ifdef MAXPATHLEN
+#define	PATH_MAX	MAXPATHLEN
+#else
+#define	PATH_MAX	4096         // seems conservative for max filename len!
+#endif
+#endif
+
+using STL_NAMESPACE::string;
+using STL_NAMESPACE::sort;
+
+//----------------------------------------------------------------------
+// Flags that control heap-profiling
+//
+// The thread-safety of the profiler depends on these being immutable
+// after main starts, so don't change them.
+//----------------------------------------------------------------------
+
+DEFINE_int64(heap_profile_allocation_interval,
+             EnvToInt64("HEAP_PROFILE_ALLOCATION_INTERVAL", 1 << 30 /*1GB*/),
+             "If non-zero, dump heap profiling information once every "
+             "specified number of bytes allocated by the program since "
+             "the last dump.");
+DEFINE_int64(heap_profile_deallocation_interval,
+             EnvToInt64("HEAP_PROFILE_DEALLOCATION_INTERVAL", 0),
+             "If non-zero, dump heap profiling information once every "
+             "specified number of bytes deallocated by the program "
+             "since the last dump.");
+// We could also add flags that report whenever inuse_bytes changes by
+// X or -X, but there hasn't been a need for that yet, so we haven't.
+DEFINE_int64(heap_profile_inuse_interval,
+             EnvToInt64("HEAP_PROFILE_INUSE_INTERVAL", 100 << 20 /*100MB*/),
+             "If non-zero, dump heap profiling information whenever "
+             "the high-water memory usage mark increases by the specified "
+             "number of bytes.");
+DEFINE_int64(heap_profile_time_interval,
+             EnvToInt64("HEAP_PROFILE_TIME_INTERVAL", 0),
+             "If non-zero, dump heap profiling information once every "
+             "specified number of seconds since the last dump.");
+DEFINE_bool(mmap_log,
+            EnvToBool("HEAP_PROFILE_MMAP_LOG", false),
+            "Should mmap/munmap calls be logged?");
+DEFINE_bool(mmap_profile,
+            EnvToBool("HEAP_PROFILE_MMAP", false),
+            "If heap-profiling is on, also profile mmap, mremap, and sbrk");
+DEFINE_bool(only_mmap_profile,
+            EnvToBool("HEAP_PROFILE_ONLY_MMAP", false),
+            "If heap-profiling is on, only profile mmap, mremap, and sbrk; "
+            "do not profile malloc/new/etc");
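+
+// Example (illustrative): the flags above are normally set through the
+// corresponding environment variables before the program starts, e.g.
+//
+//   HEAPPROFILE=/tmp/myprog.hprof HEAP_PROFILE_ALLOCATION_INTERVAL=536870912 ./myprog
+//
+// (path, value, and binary name here are hypothetical; HEAPPROFILE itself
+// is read in HeapProfilerInit() below.)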
+
+
+//----------------------------------------------------------------------
+// Locking
+//----------------------------------------------------------------------
+
+// A pthread_mutex has way too much lock contention to be used here.
+//
+// I would like to use Mutex, but it can call malloc(),
+// which can cause us to fall into an infinite recursion.
+//
+// So we use a simple spinlock.
+static SpinLock heap_lock(SpinLock::LINKER_INITIALIZED);
+
+//----------------------------------------------------------------------
+// Simple allocator for heap profiler's internal memory
+//----------------------------------------------------------------------
+
+static LowLevelAlloc::Arena *heap_profiler_memory;
+
+static void* ProfilerMalloc(size_t bytes) {
+  return LowLevelAlloc::AllocWithArena(bytes, heap_profiler_memory);
+}
+static void ProfilerFree(void* p) {
+  LowLevelAlloc::Free(p);
+}
+
+// We use buffers of this size in DoGetHeapProfileLocked.
+static const int kProfileBufferSize = 1 << 20;
+
+// This is a last-ditch buffer we use in DumpProfileLocked in case we
+// can't allocate more memory from ProfilerMalloc.  We expect this
+// will be used by HeapProfileEndWriter when the application has to
+// exit due to out-of-memory.  This buffer is allocated in
+// HeapProfilerStart.  Access to this must be protected by heap_lock.
+static char* global_profiler_buffer = NULL;
+
+
+//----------------------------------------------------------------------
+// Profiling control/state data
+//----------------------------------------------------------------------
+
+// Access to all of these is protected by heap_lock.
+static bool  is_on = false;           // True if we are on as a subsystem.
+static bool  dumping = false;         // Dumping status to prevent recursion
+static char* filename_prefix = NULL;  // Prefix used for profile file names
+                                      // (NULL if no need for dumping yet)
+static int   dump_count = 0;          // How many dumps so far
+static int64 last_dump_alloc = 0;     // alloc_size when we last dumped
+static int64 last_dump_free = 0;      // free_size when we last dumped
+static int64 high_water_mark = 0;     // In-use-bytes at last high-water dump
+static int64 last_dump_time = 0;      // The time of the last dump
+
+static HeapProfileTable* heap_profile = NULL;  // the heap profile table
+
+//----------------------------------------------------------------------
+// Profile generation
+//----------------------------------------------------------------------
+
+// Input must be a buffer of size at least 1MB.
+static char* DoGetHeapProfileLocked(char* buf, int buflen) {
+  // We used to be smarter about estimating the required memory and
+  // then capping it to 1MB and generating the profile into that.
+  if (buf == NULL || buflen < 1)
+    return NULL;
+
+  RAW_DCHECK(heap_lock.IsHeld(), "");
+  int bytes_written = 0;
+  if (is_on) {
+    HeapProfileTable::Stats const stats = heap_profile->total();
+    (void)stats;   // avoid an unused-variable warning in non-debug mode.
+    bytes_written = heap_profile->FillOrderedProfile(buf, buflen - 1);
+    // FillOrderedProfile should not reduce the set of active mmap-ed regions,
+    // hence MemoryRegionMap will let us remove everything we've added above:
+    RAW_DCHECK(stats.Equivalent(heap_profile->total()), "");
+    // If this check fails, FillOrderedProfile somehow removed
+    // more than it had added.
+  }
+  buf[bytes_written] = '\0';
+  RAW_DCHECK(bytes_written == strlen(buf), "");
+
+  return buf;
+}
+
+extern "C" char* GetHeapProfile() {
+  // Use normal malloc: we return the profile to the user to free it:
+  char* buffer = reinterpret_cast<char*>(malloc(kProfileBufferSize));
+  SpinLockHolder l(&heap_lock);
+  return DoGetHeapProfileLocked(buffer, kProfileBufferSize);
+}
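+
+// Illustrative caller of GetHeapProfile(): the returned buffer comes from
+// ordinary malloc, so the caller owns it and must free() it (assuming
+// <stdio.h>/<stdlib.h> are available to the caller):
+//
+//   char* profile = GetHeapProfile();
+//   if (profile != NULL) {
+//     fputs(profile, stderr);   // or save it wherever you like
+//     free(profile);
+//   }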
+
+// defined below
+static void NewHook(const void* ptr, size_t size);
+static void DeleteHook(const void* ptr);
+
+// Helper for HeapProfilerDump.
+static void DumpProfileLocked(const char* reason) {
+  RAW_DCHECK(heap_lock.IsHeld(), "");
+  RAW_DCHECK(is_on, "");
+  RAW_DCHECK(!dumping, "");
+
+  if (filename_prefix == NULL) return;  // we do not yet need dumping
+
+  dumping = true;
+
+  // Make file name
+  char file_name[1000];
+  dump_count++;
+  snprintf(file_name, sizeof(file_name), "%s.%04d%s",
+           filename_prefix, dump_count, HeapProfileTable::kFileExt);
+
+  // Dump the profile
+  RAW_VLOG(0, "Dumping heap profile to %s (%s)", file_name, reason);
+  // We must use file routines that don't access memory, since we hold
+  // a memory lock now.
+  RawFD fd = RawOpenForWriting(file_name);
+  if (fd == kIllegalRawFD) {
+    RAW_LOG(ERROR, "Failed dumping heap profile to %s", file_name);
+    dumping = false;
+    return;
+  }
+
+  // This case may be impossible, but it's best to be safe.
+  // It's safe to use the global buffer: we're protected by heap_lock.
+  if (global_profiler_buffer == NULL) {
+    global_profiler_buffer =
+        reinterpret_cast<char*>(ProfilerMalloc(kProfileBufferSize));
+  }
+
+  char* profile = DoGetHeapProfileLocked(global_profiler_buffer,
+                                         kProfileBufferSize);
+  RawWrite(fd, profile, strlen(profile));
+  RawClose(fd);
+
+  dumping = false;
+}
+
+//----------------------------------------------------------------------
+// Profile collection
+//----------------------------------------------------------------------
+
+// Dump a profile after either an allocation or deallocation, if
+// the memory use has changed enough since the last dump.
+static void MaybeDumpProfileLocked() {
+  if (!dumping) {
+    const HeapProfileTable::Stats& total = heap_profile->total();
+    const int64 inuse_bytes = total.alloc_size - total.free_size;
+    bool need_to_dump = false;
+    char buf[128];
+    int64 current_time = time(NULL);
+    if (FLAGS_heap_profile_allocation_interval > 0 &&
+        total.alloc_size >=
+        last_dump_alloc + FLAGS_heap_profile_allocation_interval) {
+      snprintf(buf, sizeof(buf), ("%" PRId64 " MB allocated cumulatively, "
+                                  "%" PRId64 " MB currently in use"),
+               total.alloc_size >> 20, inuse_bytes >> 20);
+      need_to_dump = true;
+    } else if (FLAGS_heap_profile_deallocation_interval > 0 &&
+               total.free_size >=
+               last_dump_free + FLAGS_heap_profile_deallocation_interval) {
+      snprintf(buf, sizeof(buf), ("%" PRId64 " MB freed cumulatively, "
+                                  "%" PRId64 " MB currently in use"),
+               total.free_size >> 20, inuse_bytes >> 20);
+      need_to_dump = true;
+    } else if (FLAGS_heap_profile_inuse_interval > 0 &&
+               inuse_bytes >
+               high_water_mark + FLAGS_heap_profile_inuse_interval) {
+      snprintf(buf, sizeof(buf), "%" PRId64 " MB currently in use",
+               inuse_bytes >> 20);
+      need_to_dump = true;
+    } else if (FLAGS_heap_profile_time_interval > 0 &&
+               current_time - last_dump_time >=
+               FLAGS_heap_profile_time_interval) {
+      snprintf(buf, sizeof(buf), "%" PRId64 " sec since the last dump",
+               current_time - last_dump_time);
+      need_to_dump = true;
+      last_dump_time = current_time;
+    }
+    if (need_to_dump) {
+      DumpProfileLocked(buf);
+
+      last_dump_alloc = total.alloc_size;
+      last_dump_free = total.free_size;
+      if (inuse_bytes > high_water_mark)
+        high_water_mark = inuse_bytes;
+    }
+  }
+}
+
+// Record an allocation in the profile.
+static void RecordAlloc(const void* ptr, size_t bytes, int skip_count) {
+  // Take the stack trace outside the critical section.
+  void* stack[HeapProfileTable::kMaxStackDepth];
+  int depth = HeapProfileTable::GetCallerStackTrace(skip_count + 1, stack);
+  SpinLockHolder l(&heap_lock);
+  if (is_on) {
+    heap_profile->RecordAlloc(ptr, bytes, depth, stack);
+    MaybeDumpProfileLocked();
+  }
+}
+
+// Record a deallocation in the profile.
+static void RecordFree(const void* ptr) {
+  SpinLockHolder l(&heap_lock);
+  if (is_on) {
+    heap_profile->RecordFree(ptr);
+    MaybeDumpProfileLocked();
+  }
+}
+
+//----------------------------------------------------------------------
+// Allocation/deallocation hooks for MallocHook
+//----------------------------------------------------------------------
+
+// static
+void NewHook(const void* ptr, size_t size) {
+  if (ptr != NULL) RecordAlloc(ptr, size, 0);
+}
+
+// static
+void DeleteHook(const void* ptr) {
+  if (ptr != NULL) RecordFree(ptr);
+}
+
+// TODO(jandrews): Re-enable stack tracing
+#ifdef TODO_REENABLE_STACK_TRACING
+static void RawInfoStackDumper(const char* message, void*) {
+  RAW_LOG(INFO, "%.*s", static_cast<int>(strlen(message) - 1), message);
+  // -1 is to chop the \n which will be added by RAW_LOG
+}
+#endif
+
+static void MmapHook(const void* result, const void* start, size_t size,
+                     int prot, int flags, int fd, off_t offset) {
+  if (FLAGS_mmap_log) {  // log it
+    // We use PRIxPTR rather than '%p' to avoid deadlocks
+    // in pretty-printing of NULL as "nil".
+    // TODO(maxim): instead should use a safe snprintf reimplementation
+    RAW_LOG(INFO,
+            "mmap(start=0x%" PRIxPTR ", len=%" PRIuS ", prot=0x%x, flags=0x%x, "
+            "fd=%d, offset=0x%x) = 0x%" PRIxPTR "",
+            (uintptr_t) start, size, prot, flags, fd, (unsigned int) offset,
+            (uintptr_t) result);
+#ifdef TODO_REENABLE_STACK_TRACING
+    DumpStackTrace(1, RawInfoStackDumper, NULL);
+#endif
+  }
+}
+
+static void MremapHook(const void* result, const void* old_addr,
+                       size_t old_size, size_t new_size,
+                       int flags, const void* new_addr) {
+  if (FLAGS_mmap_log) {  // log it
+    // We use PRIxPTR rather than '%p' to avoid deadlocks
+    // in pretty-printing of NULL as "nil".
+    // TODO(maxim): instead should use a safe snprintf reimplementation
+    RAW_LOG(INFO,
+            "mremap(old_addr=0x%" PRIxPTR ", old_size=%" PRIuS ", "
+            "new_size=%" PRIuS ", flags=0x%x, new_addr=0x%" PRIxPTR ") = "
+            "0x%" PRIxPTR "",
+            (uintptr_t) old_addr, old_size, new_size, flags,
+            (uintptr_t) new_addr, (uintptr_t) result);
+#ifdef TODO_REENABLE_STACK_TRACING
+    DumpStackTrace(1, RawInfoStackDumper, NULL);
+#endif
+  }
+}
+
+static void MunmapHook(const void* ptr, size_t size) {
+  if (FLAGS_mmap_log) {  // log it
+    // We use PRIxPTR rather than '%p' to avoid deadlocks
+    // in pretty-printing of NULL as "nil".
+    // TODO(maxim): instead should use a safe snprintf reimplementation
+    RAW_LOG(INFO, "munmap(start=0x%" PRIxPTR ", len=%" PRIuS ")",
+                  (uintptr_t) ptr, size);
+#ifdef TODO_REENABLE_STACK_TRACING
+    DumpStackTrace(1, RawInfoStackDumper, NULL);
+#endif
+  }
+}
+
+static void SbrkHook(const void* result, ptrdiff_t increment) {
+  if (FLAGS_mmap_log) {  // log it
+    RAW_LOG(INFO, "sbrk(inc=%" PRIdS ") = 0x%" PRIxPTR "",
+                  increment, (uintptr_t) result);
+#ifdef TODO_REENABLE_STACK_TRACING
+    DumpStackTrace(1, RawInfoStackDumper, NULL);
+#endif
+  }
+}
+
+//----------------------------------------------------------------------
+// Starting/stopping/dumping
+//----------------------------------------------------------------------
+
+extern "C" void HeapProfilerStart(const char* prefix) {
+  SpinLockHolder l(&heap_lock);
+
+  if (is_on) return;
+
+  is_on = true;
+
+  RAW_VLOG(0, "Starting to track the heap");
+
+  // This should be done before the hooks are set up, since it should
+  // call new, and we want that to be accounted for correctly.
+  MallocExtension::Initialize();
+
+  if (FLAGS_only_mmap_profile) {
+    FLAGS_mmap_profile = true;
+  }
+
+  if (FLAGS_mmap_profile) {
+    // Ask MemoryRegionMap to record all mmap, mremap, and sbrk
+    // call stack traces of at least size kMaxStackDepth:
+    MemoryRegionMap::Init(HeapProfileTable::kMaxStackDepth,
+                          /* use_buckets */ true);
+  }
+
+  if (FLAGS_mmap_log) {
+    // Install our hooks to do the logging:
+    RAW_CHECK(MallocHook::AddMmapHook(&MmapHook), "");
+    RAW_CHECK(MallocHook::AddMremapHook(&MremapHook), "");
+    RAW_CHECK(MallocHook::AddMunmapHook(&MunmapHook), "");
+    RAW_CHECK(MallocHook::AddSbrkHook(&SbrkHook), "");
+  }
+
+  heap_profiler_memory =
+    LowLevelAlloc::NewArena(0, LowLevelAlloc::DefaultArena());
+
+  // Reserve space now for the heap profiler, so we can still write a
+  // heap profile even if the application runs out of memory.
+  global_profiler_buffer =
+      reinterpret_cast<char*>(ProfilerMalloc(kProfileBufferSize));
+
+  heap_profile = new(ProfilerMalloc(sizeof(HeapProfileTable)))
+      HeapProfileTable(ProfilerMalloc, ProfilerFree, FLAGS_mmap_profile);
+
+  last_dump_alloc = 0;
+  last_dump_free = 0;
+  high_water_mark = 0;
+  last_dump_time = 0;
+
+  // We do not reset dump_count so if the user does a sequence of
+  // HeapProfilerStart/HeapProfileStop, we will get a continuous
+  // sequence of profiles.
+
+  if (FLAGS_only_mmap_profile == false) {
+    // Now set the hooks that capture new/delete and malloc/free.
+    RAW_CHECK(MallocHook::AddNewHook(&NewHook), "");
+    RAW_CHECK(MallocHook::AddDeleteHook(&DeleteHook), "");
+  }
+
+  // Copy filename prefix
+  RAW_DCHECK(filename_prefix == NULL, "");
+  const int prefix_length = strlen(prefix);
+  filename_prefix = reinterpret_cast<char*>(ProfilerMalloc(prefix_length + 1));
+  memcpy(filename_prefix, prefix, prefix_length);
+  filename_prefix[prefix_length] = '\0';
+}
+
+extern "C" int IsHeapProfilerRunning() {
+  SpinLockHolder l(&heap_lock);
+  return is_on ? 1 : 0;   // return an int, because C code doesn't have bool
+}
+
+extern "C" void HeapProfilerStop() {
+  SpinLockHolder l(&heap_lock);
+
+  if (!is_on) return;
+
+  if (FLAGS_only_mmap_profile == false) {
+    // Unset our new/delete hooks, checking they were set:
+    RAW_CHECK(MallocHook::RemoveNewHook(&NewHook), "");
+    RAW_CHECK(MallocHook::RemoveDeleteHook(&DeleteHook), "");
+  }
+  if (FLAGS_mmap_log) {
+    // Restore mmap/sbrk hooks, checking that our hooks were set:
+    RAW_CHECK(MallocHook::RemoveMmapHook(&MmapHook), "");
+    RAW_CHECK(MallocHook::RemoveMremapHook(&MremapHook), "");
+    RAW_CHECK(MallocHook::RemoveSbrkHook(&SbrkHook), "");
+    RAW_CHECK(MallocHook::RemoveMunmapHook(&MunmapHook), "");
+  }
+
+  // free profile
+  heap_profile->~HeapProfileTable();
+  ProfilerFree(heap_profile);
+  heap_profile = NULL;
+
+  // free output-buffer memory
+  ProfilerFree(global_profiler_buffer);
+
+  // free prefix
+  ProfilerFree(filename_prefix);
+  filename_prefix = NULL;
+
+  if (!LowLevelAlloc::DeleteArena(heap_profiler_memory)) {
+    RAW_LOG(FATAL, "Memory leak in HeapProfiler:");
+  }
+
+  if (FLAGS_mmap_profile) {
+    MemoryRegionMap::Shutdown();
+  }
+
+  is_on = false;
+}
+
+extern "C" void HeapProfilerDump(const char *reason) {
+  SpinLockHolder l(&heap_lock);
+  if (is_on && !dumping) {
+    DumpProfileLocked(reason);
+  }
+}
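+
+// Illustrative programmatic use of the three entry points above, as an
+// alternative to the HEAPPROFILE environment variable (the prefix path is
+// hypothetical):
+//
+//   HeapProfilerStart("/tmp/myprog");     // prefix for the dump files
+//   ... run the interesting phase of the program ...
+//   HeapProfilerDump("end of phase 1");
+//   HeapProfilerStop();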
+
+// Signal handler that is registered when a user selectable signal
+// number is defined in the environment variable HEAPPROFILESIGNAL.
+static void HeapProfilerDumpSignal(int signal_number) {
+  (void)signal_number;
+  if (!heap_lock.TryLock()) {
+    return;
+  }
+  if (is_on && !dumping) {
+    DumpProfileLocked("signal");
+  }
+  heap_lock.Unlock();
+}
+
+
+//----------------------------------------------------------------------
+// Initialization/finalization code
+//----------------------------------------------------------------------
+
+// Initialization code
+static void HeapProfilerInit() {
+  // Everything after this point is for setting up the profiler based on envvar
+  char fname[PATH_MAX];
+  if (!GetUniquePathFromEnv("HEAPPROFILE", fname)) {
+    return;
+  }
+  // We do a uid check so we don't write out files in a setuid executable.
+#ifdef HAVE_GETEUID
+  if (getuid() != geteuid()) {
+    RAW_LOG(WARNING, ("HeapProfiler: ignoring HEAPPROFILE because "
+                      "program seems to be setuid\n"));
+    return;
+  }
+#endif
+
+  char *signal_number_str = getenv("HEAPPROFILESIGNAL");
+  if (signal_number_str != NULL) {
+    long int signal_number = strtol(signal_number_str, NULL, 10);
+    intptr_t old_signal_handler = reinterpret_cast<intptr_t>(
+        signal(signal_number, HeapProfilerDumpSignal));
+    if (old_signal_handler == reinterpret_cast<intptr_t>(SIG_ERR)) {
+      RAW_LOG(FATAL, "Failed to set signal. Perhaps signal number %s is invalid\n",
+              signal_number_str);
+    } else if (old_signal_handler == 0) {
+      RAW_LOG(INFO, "Using signal %d as heap profiling switch",
+              static_cast<int>(signal_number));
+    } else {
+      RAW_LOG(FATAL, "Signal %d already in use\n",
+              static_cast<int>(signal_number));
+    }
+  }
+
+  HeapProfileTable::CleanupOldProfiles(fname);
+
+  HeapProfilerStart(fname);
+}
+
+// class used for finalization -- dumps the heap-profile at program exit
+struct HeapProfileEndWriter {
+  ~HeapProfileEndWriter() {
+    char buf[128];
+    if (heap_profile) {
+      const HeapProfileTable::Stats& total = heap_profile->total();
+      const int64 inuse_bytes = total.alloc_size - total.free_size;
+
+      if ((inuse_bytes >> 20) > 0) {
+        snprintf(buf, sizeof(buf), ("Exiting, %" PRId64 " MB in use"),
+                 inuse_bytes >> 20);
+      } else if ((inuse_bytes >> 10) > 0) {
+        snprintf(buf, sizeof(buf), ("Exiting, %" PRId64 " kB in use"),
+                 inuse_bytes >> 10);
+      } else {
+        snprintf(buf, sizeof(buf), ("Exiting, %" PRId64 " bytes in use"),
+                 inuse_bytes);
+      }
+    } else {
+      snprintf(buf, sizeof(buf), ("Exiting"));
+    }
+    HeapProfilerDump(buf);
+  }
+};
+
+// We want to make sure tcmalloc is up and running before starting the profiler
+static const TCMallocGuard tcmalloc_initializer;
+REGISTER_MODULE_INITIALIZER(heapprofiler, HeapProfilerInit());
+static HeapProfileEndWriter heap_profile_end_writer;
diff --git a/src/internal_logging.cc b/src/internal_logging.cc
new file mode 100644
index 0000000..4e7fc87
--- /dev/null
+++ b/src/internal_logging.cc
@@ -0,0 +1,194 @@
+// -*- Mode: C++; c-basic-offset: 2; indent-tabs-mode: nil -*-
+// Copyright (c) 2005, Google Inc.
+// All rights reserved.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+// 
+//     * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+//     * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+// 
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+// ---
+// Sanjay Ghemawat <opensource@google.com>
+
+#include <config.h>
+#include "internal_logging.h"
+#include <stdarg.h>                     // for va_end, va_start
+#include <stdio.h>                      // for vsnprintf, va_list, etc
+#include <stdlib.h>                     // for abort
+#include <string.h>                     // for strlen, memcpy
+#ifdef HAVE_UNISTD_H
+#include <unistd.h>    // for write()
+#endif
+
+#include <gperftools/malloc_extension.h>
+#include "base/logging.h"   // for perftools_vsnprintf
+#include "base/spinlock.h"              // for SpinLockHolder, SpinLock
+
+static const int kLogBufSize = 800;
+
+// Variables for storing crash output.  Allocated statically since we
+// may not be able to heap-allocate while crashing.
+static SpinLock crash_lock(base::LINKER_INITIALIZED);
+static bool crashed = false;
+static const int kStatsBufferSize = 16 << 10;
+static char stats_buffer[kStatsBufferSize] = { 0 };
+
+namespace tcmalloc {
+
+static void WriteMessage(const char* msg, int length) {
+  write(STDERR_FILENO, msg, length);
+}
+
+void (*log_message_writer)(const char* msg, int length) = WriteMessage;
+
+
+class Logger {
+ public:
+  bool Add(const LogItem& item);
+  bool AddStr(const char* str, int n);
+  bool AddNum(uint64_t num, int base);  // base must be 10 or 16.
+
+  static const int kBufSize = 200;
+  char* p_;
+  char* end_;
+  char buf_[kBufSize];
+};
+
+void Log(LogMode mode, const char* filename, int line,
+         LogItem a, LogItem b, LogItem c, LogItem d) {
+  Logger state;
+  state.p_ = state.buf_;
+  state.end_ = state.buf_ + sizeof(state.buf_);
+  state.AddStr(filename, strlen(filename))
+      && state.AddStr(":", 1)
+      && state.AddNum(line, 10)
+      && state.AddStr("]", 1)
+      && state.Add(a)
+      && state.Add(b)
+      && state.Add(c)
+      && state.Add(d);
+
+  // Terminate with newline
+  if (state.p_ >= state.end_) {
+    state.p_ = state.end_ - 1;
+  }
+  *state.p_ = '\n';
+  state.p_++;
+
+  int msglen = state.p_ - state.buf_;
+  if (mode == kLog) {
+    (*log_message_writer)(state.buf_, msglen);
+    return;
+  }
+
+  bool first_crash = false;
+  {
+    SpinLockHolder l(&crash_lock);
+    if (!crashed) {
+      crashed = true;
+      first_crash = true;
+    }
+  }
+
+  (*log_message_writer)(state.buf_, msglen);
+  if (first_crash && mode == kCrashWithStats) {
+    MallocExtension::instance()->GetStats(stats_buffer, kStatsBufferSize);
+    (*log_message_writer)(stats_buffer, strlen(stats_buffer));
+  }
+
+  abort();
+}
+
+bool Logger::Add(const LogItem& item) {
+  // Separate items with spaces
+  if (p_ < end_) {
+    *p_ = ' ';
+    p_++;
+  }
+
+  switch (item.tag_) {
+    case LogItem::kStr:
+      return AddStr(item.u_.str, strlen(item.u_.str));
+    case LogItem::kUnsigned:
+      return AddNum(item.u_.unum, 10);
+    case LogItem::kSigned:
+      if (item.u_.snum < 0) {
+        // The cast to uint64_t is intentionally before the negation
+        // so that we do not attempt to negate -2^63.
+        return AddStr("-", 1)
+            && AddNum(- static_cast<uint64_t>(item.u_.snum), 10);
+      } else {
+        return AddNum(static_cast<uint64_t>(item.u_.snum), 10);
+      }
+    case LogItem::kPtr:
+      return AddStr("0x", 2)
+          && AddNum(reinterpret_cast<uintptr_t>(item.u_.ptr), 16);
+    default:
+      return false;
+  }
+}
+
+bool Logger::AddStr(const char* str, int n) {
+  if (end_ - p_ < n) {
+    return false;
+  } else {
+    memcpy(p_, str, n);
+    p_ += n;
+    return true;
+  }
+}
+
+bool Logger::AddNum(uint64_t num, int base) {
+  static const char kDigits[] = "0123456789abcdef";
+  char space[22];  // more than enough for 2^64 in smallest supported base (10)
+  char* end = space + sizeof(space);
+  char* pos = end;
+  do {
+    pos--;
+    *pos = kDigits[num % base];
+    num /= base;
+  } while (num > 0 && pos > space);
+  return AddStr(pos, end - pos);
+}
+
+}  // end tcmalloc namespace
+
+void TCMalloc_Printer::printf(const char* format, ...) {
+  if (left_ > 0) {
+    va_list ap;
+    va_start(ap, format);
+    const int r = perftools_vsnprintf(buf_, left_, format, ap);
+    va_end(ap);
+    if (r < 0) {
+      // Perhaps an old glibc that returns -1 on truncation?
+      left_ = 0;
+    } else if (r > left_) {
+      // Truncation
+      left_ = 0;
+    } else {
+      left_ -= r;
+      buf_ += r;
+    }
+  }
+}
diff --git a/src/internal_logging.h b/src/internal_logging.h
new file mode 100644
index 0000000..0c300c3
--- /dev/null
+++ b/src/internal_logging.h
@@ -0,0 +1,144 @@
+// -*- Mode: C++; c-basic-offset: 2; indent-tabs-mode: nil -*-
+// Copyright (c) 2005, Google Inc.
+// All rights reserved.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+// 
+//     * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+//     * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+// 
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+// ---
+// Author: Sanjay Ghemawat <opensource@google.com>
+//
+// Internal logging and related utility routines.
+
+#ifndef TCMALLOC_INTERNAL_LOGGING_H_
+#define TCMALLOC_INTERNAL_LOGGING_H_
+
+#include <config.h>
+#include <stddef.h>                     // for size_t
+#if defined HAVE_STDINT_H
+#include <stdint.h>
+#elif defined HAVE_INTTYPES_H
+#include <inttypes.h>
+#else
+#include <sys/types.h>
+#endif
+
+//-------------------------------------------------------------------
+// Utility routines
+//-------------------------------------------------------------------
+
+// Safe logging helper: we write directly to the stderr file
+// descriptor and avoid FILE buffering because that may invoke
+// malloc().
+//
+// Example:
+//   Log(kLog, __FILE__, __LINE__, "error", bytes);
+
+namespace tcmalloc {
+enum LogMode {
+  kLog,                       // Just print the message
+  kCrash,                     // Print the message and crash
+  kCrashWithStats             // Print the message, some stats, and crash
+};
+
+class Logger;
+
+// A LogItem holds any of the argument types that can be passed to Log()
+class LogItem {
+ public:
+  LogItem()                     : tag_(kEnd)      { }
+  LogItem(const char* v)        : tag_(kStr)      { u_.str = v; }
+  LogItem(int v)                : tag_(kSigned)   { u_.snum = v; }
+  LogItem(long v)               : tag_(kSigned)   { u_.snum = v; }
+  LogItem(long long v)          : tag_(kSigned)   { u_.snum = v; }
+  LogItem(unsigned int v)       : tag_(kUnsigned) { u_.unum = v; }
+  LogItem(unsigned long v)      : tag_(kUnsigned) { u_.unum = v; }
+  LogItem(unsigned long long v) : tag_(kUnsigned) { u_.unum = v; }
+  LogItem(const void* v)        : tag_(kPtr)      { u_.ptr = v; }
+ private:
+  friend class Logger;
+  enum Tag {
+    kStr,
+    kSigned,
+    kUnsigned,
+    kPtr,
+    kEnd
+  };
+  Tag tag_;
+  union {
+    const char* str;
+    const void* ptr;
+    int64_t snum;
+    uint64_t unum;
+  } u_;
+};
+
+extern PERFTOOLS_DLL_DECL void Log(LogMode mode, const char* filename, int line,
+                LogItem a, LogItem b = LogItem(),
+                LogItem c = LogItem(), LogItem d = LogItem());
+
+// Tests can override this function pointer to collect logging messages.
+extern PERFTOOLS_DLL_DECL void (*log_message_writer)(const char* msg, int length);
+
+}  // end tcmalloc namespace
+
+// Like assert(), but executed even in NDEBUG mode
+#undef CHECK_CONDITION
+#define CHECK_CONDITION(cond)                                            \
+do {                                                                     \
+  if (!(cond)) {                                                         \
+    ::tcmalloc::Log(::tcmalloc::kCrash, __FILE__, __LINE__, #cond);      \
+  }                                                                      \
+} while (0)
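+
+// For example (an illustrative sketch; "bytes_allocated" and "bytes_freed"
+// are hypothetical counters):
+//   CHECK_CONDITION(bytes_allocated >= bytes_freed);
+// If the condition is false, this logs the stringified condition together
+// with the file name and line number and then crashes, even in NDEBUG mode.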
+
+// Our own version of assert() so we can avoid hanging by trying to do
+// all kinds of goofy printing while holding the malloc lock.
+#ifndef NDEBUG
+#define ASSERT(cond) CHECK_CONDITION(cond)
+#else
+#define ASSERT(cond) ((void) 0)
+#endif
+
+// Print into buffer
+class TCMalloc_Printer {
+ private:
+  char* buf_;           // Where should we write next
+  int   left_;          // Space left in buffer (including space for \0)
+
+ public:
+  // REQUIRES: "length > 0"
+  TCMalloc_Printer(char* buf, int length) : buf_(buf), left_(length) {
+    buf[0] = '\0';
+  }
+
+  void printf(const char* format, ...)
+#ifdef HAVE___ATTRIBUTE__
+    __attribute__ ((__format__ (__printf__, 2, 3)))
+#endif
+;
+};
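+
+// Illustrative sketch (not part of this interface): format a message into a
+// fixed-size stack buffer without going through malloc, then hand it to
+// Log().  "span_count" is a hypothetical value.
+//
+//   char buf[256];
+//   TCMalloc_Printer printer(buf, sizeof(buf));
+//   printer.printf("freelist holds %d spans", span_count);
+//   tcmalloc::Log(tcmalloc::kLog, __FILE__, __LINE__, buf);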
+
+#endif  // TCMALLOC_INTERNAL_LOGGING_H_
diff --git a/src/libc_override.h b/src/libc_override.h
new file mode 100644
index 0000000..c01a97c
--- /dev/null
+++ b/src/libc_override.h
@@ -0,0 +1,91 @@
+// -*- Mode: C++; c-basic-offset: 2; indent-tabs-mode: nil -*-
+// Copyright (c) 2011, Google Inc.
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+//     * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+//     * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+// ---
+// Author: Craig Silverstein <opensource@google.com>
+//
+// This .h file imports the code that causes tcmalloc to override libc
+// versions of malloc/free/new/delete/etc.  That is, it provides the
+// logic that makes it so calls to malloc(10) go through tcmalloc,
+// rather than the default (libc) malloc.
+//
+// This file also provides a method: ReplaceSystemAlloc(), that every
+// libc_override_*.h file it #includes is required to provide.  This
+// is called when first setting up tcmalloc -- that is, when a global
+// constructor in tcmalloc.cc is executed -- to do any initialization
+// work that may be required for this OS.  (Note we cannot entirely
+// control when tcmalloc is initialized, and the system may do some
+// mallocs and frees before this routine is called.)  It may be a
+// no-op.
+//
+// Every libc has its own way of doing this, and sometimes the compiler
+// matters too, so we have a different file for each libc, and often
+// for different compilers and OS's.
+
+#ifndef TCMALLOC_LIBC_OVERRIDE_INL_H_
+#define TCMALLOC_LIBC_OVERRIDE_INL_H_
+
+#include <config.h>
+#ifdef HAVE_FEATURES_H
+#include <features.h>   // for __GLIBC__
+#endif
+#include <gperftools/tcmalloc.h>
+
+static void ReplaceSystemAlloc();  // defined in the .h files below
+
+// For windows, there are two ways to get tcmalloc.  If we're
+// patching, then src/windows/patch_function.cc will do the necessary
+// overriding here.  Otherwise, we do the 'redefine' trick, where
+// we remove malloc/new/etc from msvcrt.dll, and just need to define
+// them now.
+#if defined(_WIN32) && defined(WIN32_DO_PATCHING)
+void PatchWindowsFunctions();   // in src/windows/patch_function.cc
+static void ReplaceSystemAlloc() { PatchWindowsFunctions(); }
+
+#elif defined(_WIN32) && !defined(WIN32_DO_PATCHING)
+#include "libc_override_redefine.h"
+
+#elif defined(__APPLE__)
+#include "libc_override_osx.h"
+
+#elif defined(__GLIBC__)
+#include "libc_override_glibc.h"
+
+// Not all gcc systems necessarily support weak symbols, but all the
+// ones I know of do, so for now just assume they all do.
+#elif defined(__GNUC__)
+#include "libc_override_gcc_and_weak.h"
+
+#else
+#error Need to add support for your libc/OS here
+
+#endif
+
+#endif  // TCMALLOC_LIBC_OVERRIDE_INL_H_
diff --git a/src/libc_override_gcc_and_weak.h b/src/libc_override_gcc_and_weak.h
new file mode 100644
index 0000000..818e43d
--- /dev/null
+++ b/src/libc_override_gcc_and_weak.h
@@ -0,0 +1,107 @@
+// -*- Mode: C++; c-basic-offset: 2; indent-tabs-mode: nil -*-
+// Copyright (c) 2011, Google Inc.
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+//     * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+//     * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+// ---
+// Author: Craig Silverstein <opensource@google.com>
+//
+// Used to override malloc routines on systems that define the
+// memory allocation routines to be weak symbols in their libc
+// (almost all unix-based systems are like this), on gcc, which
+// supports the 'alias' attribute.
+
+#ifndef TCMALLOC_LIBC_OVERRIDE_GCC_AND_WEAK_INL_H_
+#define TCMALLOC_LIBC_OVERRIDE_GCC_AND_WEAK_INL_H_
+
+#ifdef HAVE_SYS_CDEFS_H
+#include <sys/cdefs.h>    // for __THROW
+#endif
+#include <gperftools/tcmalloc.h>
+
+#ifndef __THROW    // I guess we're not on a glibc-like system
+# define __THROW   // __THROW is just an optimization, so ok to make it ""
+#endif
+
+#ifndef __GNUC__
+# error libc_override_gcc_and_weak.h is for gcc distributions only.
+#endif
+
+#define ALIAS(tc_fn)   __attribute__ ((alias (#tc_fn)))
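+// For example, applying ALIAS(tc_malloc) to the malloc() declaration in the
+// extern "C" block below expands (roughly) to
+//   void* malloc(size_t size) __THROW __attribute__ ((alias ("tc_malloc")));
+// i.e. the libc symbol becomes just another name for tcmalloc's entry point.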
+
+void* operator new(size_t size) throw (std::bad_alloc)
+    ALIAS(tc_new);
+void operator delete(void* p) __THROW
+    ALIAS(tc_delete);
+void* operator new[](size_t size) throw (std::bad_alloc)
+    ALIAS(tc_newarray);
+void operator delete[](void* p) __THROW
+    ALIAS(tc_deletearray);
+void* operator new(size_t size, const std::nothrow_t& nt) __THROW
+    ALIAS(tc_new_nothrow);
+void* operator new[](size_t size, const std::nothrow_t& nt) __THROW
+    ALIAS(tc_newarray_nothrow);
+void operator delete(void* p, const std::nothrow_t& nt) __THROW
+    ALIAS(tc_delete_nothrow);
+void operator delete[](void* p, const std::nothrow_t& nt) __THROW
+    ALIAS(tc_deletearray_nothrow);
+
+extern "C" {
+  void* malloc(size_t size) __THROW               ALIAS(tc_malloc);
+  void free(void* ptr) __THROW                    ALIAS(tc_free);
+  void* realloc(void* ptr, size_t size) __THROW   ALIAS(tc_realloc);
+  void* calloc(size_t n, size_t size) __THROW     ALIAS(tc_calloc);
+  void cfree(void* ptr) __THROW                   ALIAS(tc_cfree);
+  void* memalign(size_t align, size_t s) __THROW  ALIAS(tc_memalign);
+  void* valloc(size_t size) __THROW               ALIAS(tc_valloc);
+  void* pvalloc(size_t size) __THROW              ALIAS(tc_pvalloc);
+  int posix_memalign(void** r, size_t a, size_t s) __THROW
+      ALIAS(tc_posix_memalign);
+#ifndef __UCLIBC__
+  void malloc_stats(void) __THROW                 ALIAS(tc_malloc_stats);
+#endif
+  int mallopt(int cmd, int value) __THROW         ALIAS(tc_mallopt);
+#ifdef HAVE_STRUCT_MALLINFO
+  struct mallinfo mallinfo(void) __THROW          ALIAS(tc_mallinfo);
+#endif
+  size_t malloc_size(void* p) __THROW             ALIAS(tc_malloc_size);
+#if defined(__ANDROID__)
+  size_t malloc_usable_size(const void* p) __THROW
+         ALIAS(tc_malloc_size);
+#else
+  size_t malloc_usable_size(void* p) __THROW      ALIAS(tc_malloc_size);
+#endif
+}   // extern "C"
+
+#undef ALIAS
+
+// No need to do anything at tcmalloc-registration time: we do it all
+// via overriding weak symbols (at link time).
+static void ReplaceSystemAlloc() { }
+
+#endif  // TCMALLOC_LIBC_OVERRIDE_GCC_AND_WEAK_INL_H_
diff --git a/src/libc_override_glibc.h b/src/libc_override_glibc.h
new file mode 100644
index 0000000..b6843e1
--- /dev/null
+++ b/src/libc_override_glibc.h
@@ -0,0 +1,149 @@
+// -*- Mode: C++; c-basic-offset: 2; indent-tabs-mode: nil -*-
+// Copyright (c) 2011, Google Inc.
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+//     * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+//     * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+// ---
+// Author: Craig Silverstein <opensource@google.com>
+//
+// Used to override malloc routines on systems that are using glibc.
+
+#ifndef TCMALLOC_LIBC_OVERRIDE_GLIBC_INL_H_
+#define TCMALLOC_LIBC_OVERRIDE_GLIBC_INL_H_
+
+#include <config.h>
+#include <features.h>     // for __GLIBC__
+#ifdef HAVE_SYS_CDEFS_H
+#include <sys/cdefs.h>    // for __THROW
+#endif
+#include <gperftools/tcmalloc.h>
+
+#ifndef __GLIBC__
+# error libc_override_glibc.h is for glibc distributions only.
+#endif
+
+// In glibc, the memory-allocation methods are weak symbols, so we can
+// just override them with our own.  If we're using gcc, we can use
+// __attribute__((alias)) to do the overriding easily (exception:
+// Mach-O, which doesn't support aliases).  Otherwise we have to use a
+// function call.
+#if !defined(__GNUC__) || defined(__MACH__)
+
+// This also defines ReplaceSystemAlloc().
+# include "libc_override_redefine.h"  // defines functions malloc()/etc
+
+#else  // #if !defined(__GNUC__) || defined(__MACH__)
+
+// If we get here, we're a gcc system, so do all the overriding we do
+// with gcc.  This does the overriding of all the 'normal' memory
+// allocation.  This also defines ReplaceSystemAlloc().
+# include "libc_override_gcc_and_weak.h"
+
+// We also have to do some glibc-specific overriding.  Some library
+// routines on RedHat 9 allocate memory using malloc() and free it
+// using __libc_free() (or vice-versa).  Since we provide our own
+// implementations of malloc/free, we need to make sure that the
+// __libc_XXX variants (defined as part of glibc) also point to the
+// same implementations.  Since it only matters for redhat, we
+// do it inside the gcc #ifdef, since redhat uses gcc.
+// TODO(csilvers): only do this if we detect we're an old enough glibc?
+
+#define ALIAS(tc_fn)   __attribute__ ((alias (#tc_fn)))
+extern "C" {
+  void* __libc_malloc(size_t size)                ALIAS(tc_malloc);
+  void __libc_free(void* ptr)                     ALIAS(tc_free);
+  void* __libc_realloc(void* ptr, size_t size)    ALIAS(tc_realloc);
+  void* __libc_calloc(size_t n, size_t size)      ALIAS(tc_calloc);
+  void __libc_cfree(void* ptr)                    ALIAS(tc_cfree);
+  void* __libc_memalign(size_t align, size_t s)   ALIAS(tc_memalign);
+  void* __libc_valloc(size_t size)                ALIAS(tc_valloc);
+  void* __libc_pvalloc(size_t size)               ALIAS(tc_pvalloc);
+  int __posix_memalign(void** r, size_t a, size_t s)  ALIAS(tc_posix_memalign);
+}   // extern "C"
+#undef ALIAS
+
+#endif  // #if defined(__GNUC__) && !defined(__MACH__)
+
+
+// We also have to hook libc malloc.  While our work with weak symbols
+// should make sure libc malloc is never called in most situations, it
+// can be worked around by shared libraries with the DEEPBIND
+// environment variable set.  The below hooks libc to call our malloc
+// routines even in that situation.  In other situations, this hook
+// should never be called.
+extern "C" {
+static void* glibc_override_malloc(size_t size, const void *caller) {
+  return tc_malloc(size);
+}
+static void* glibc_override_realloc(void *ptr, size_t size,
+                                    const void *caller) {
+  return tc_realloc(ptr, size);
+}
+static void glibc_override_free(void *ptr, const void *caller) {
+  tc_free(ptr);
+}
+static void* glibc_override_memalign(size_t align, size_t size,
+                                     const void *caller) {
+  return tc_memalign(align, size);
+}
+
+// We should be using __malloc_initialize_hook here, like the #if 0
+// code below.  (See http://swoolley.org/man.cgi/3/malloc_hook.)
+// However, this causes weird linker errors with programs that link
+// with -static, so instead we just assign the vars directly at
+// static-constructor time.  That should have the same effect of
+// making sure the hooks are set before the first malloc call the
+// program makes.
+#if 0
+#include <malloc.h>  // for __malloc_hook, etc.
+void glibc_override_malloc_init_hook(void) {
+  __malloc_hook = glibc_override_malloc;
+  __realloc_hook = glibc_override_realloc;
+  __free_hook = glibc_override_free;
+  __memalign_hook = glibc_override_memalign;
+}
+
+void (* MALLOC_HOOK_MAYBE_VOLATILE __malloc_initialize_hook)(void)
+    = &glibc_override_malloc_init_hook;
+#endif
+
+void* (* MALLOC_HOOK_MAYBE_VOLATILE __malloc_hook)(size_t, const void*)
+    = &glibc_override_malloc;
+void* (* MALLOC_HOOK_MAYBE_VOLATILE __realloc_hook)(void*, size_t, const void*)
+    = &glibc_override_realloc;
+void (* MALLOC_HOOK_MAYBE_VOLATILE __free_hook)(void*, const void*)
+    = &glibc_override_free;
+void* (* MALLOC_HOOK_MAYBE_VOLATILE __memalign_hook)(size_t,size_t, const void*)
+    = &glibc_override_memalign;
+
+}   // extern "C"
+
+// No need to write ReplaceSystemAlloc(); one of the #includes above
+// did it for us.
+
+#endif  // TCMALLOC_LIBC_OVERRIDE_GLIBC_INL_H_
diff --git a/src/libc_override_osx.h b/src/libc_override_osx.h
new file mode 100644
index 0000000..b801f22
--- /dev/null
+++ b/src/libc_override_osx.h
@@ -0,0 +1,281 @@
+// -*- Mode: C++; c-basic-offset: 2; indent-tabs-mode: nil -*-
+// Copyright (c) 2011, Google Inc.
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+//     * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+//     * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+// ---
+// Author: Craig Silverstein <opensource@google.com>
+//
+// Used to override malloc routines on OS X systems.  We use the
+// malloc-zone functionality built into OS X to register our malloc
+// routine.
+//
+// 1) We used to use the normal 'override weak libc malloc/etc'
+// technique for OS X.  This is not optimal because mach does not
+// support the 'alias' attribute, so we had to have forwarding
+// functions.  It also does not work very well with OS X shared
+// libraries (dylibs) -- in general, the shared libs don't use
+// tcmalloc unless run with the DYLD_FORCE_FLAT_NAMESPACE envvar.
+//
+// 2) Another approach would be to use an interposition array:
+//      static const interpose_t interposers[] __attribute__((section("__DATA, __interpose"))) = {
+//        { (void *)tc_malloc, (void *)malloc },
+//        { (void *)tc_free, (void *)free },
+//      };
+// This requires the user to set the DYLD_INSERT_LIBRARIES envvar, so
+// is not much better.
+//
+// 3) Registering a new malloc zone avoids all these issues:
+//  http://www.opensource.apple.com/source/Libc/Libc-583/include/malloc/malloc.h
+//  http://www.opensource.apple.com/source/Libc/Libc-583/gen/malloc.c
+// If we make tcmalloc the default malloc zone (undocumented but
+// possible) then all new allocs use it, even those in shared
+// libraries.  Allocs done before tcmalloc was installed, or in libs
+// that aren't using tcmalloc for some reason, will correctly go
+// through the malloc-zone interface when freeing, and will pick up
+// the libc free rather than tcmalloc free.  So it should "never"
+// cause a crash (famous last words).
+//
+// 4) The routines one must define for one's own malloc have changed
+// between OS X versions.  This requires some hoops on our part, but
+// is only really annoying when it comes to posix_memalign.  The right
+// behavior there depends on what OS version tcmalloc was compiled on,
+// but also what OS version the program is running on.  For now, we
+// punt and don't implement our own posix_memalign.  Apps that really
+// care can use tc_posix_memalign directly.
+
+#ifndef TCMALLOC_LIBC_OVERRIDE_OSX_INL_H_
+#define TCMALLOC_LIBC_OVERRIDE_OSX_INL_H_
+
+#include <config.h>
+#ifdef HAVE_FEATURES_H
+#include <features.h>
+#endif
+#include <gperftools/tcmalloc.h>
+
+#if !defined(__APPLE__)
+# error libc_override_osx.h is for OS X distributions only.
+#endif
+
+#include <AvailabilityMacros.h>
+#include <malloc/malloc.h>
+
+namespace tcmalloc {
+  void CentralCacheLockAll();
+  void CentralCacheUnlockAll();
+}
+
+// from AvailabilityMacros.h
+#if defined(MAC_OS_X_VERSION_10_6) && \
+    MAC_OS_X_VERSION_MAX_ALLOWED >= MAC_OS_X_VERSION_10_6
+extern "C" {
+  // This function is only available on 10.6 (and later) but the
+  // LibSystem headers do not use AvailabilityMacros.h to handle weak
+  // importing automatically.  This prototype is a copy of the one in
+  // <malloc/malloc.h> with the WEAK_IMPORT_ATTRIBUTE added.
+  extern malloc_zone_t *malloc_default_purgeable_zone(void)
+      WEAK_IMPORT_ATTRIBUTE;
+}
+#endif
+
+// We need to provide wrappers around all the libc functions.
+namespace {
+size_t mz_size(malloc_zone_t* zone, const void* ptr) {
+  if (MallocExtension::instance()->GetOwnership(ptr) != MallocExtension::kOwned)
+    return 0;  // malloc_zone semantics: return 0 if we don't own the memory
+
+  // TODO(csilvers): change this method to take a const void*, one day.
+  return MallocExtension::instance()->GetAllocatedSize(const_cast<void*>(ptr));
+}
+
+void* mz_malloc(malloc_zone_t* zone, size_t size) {
+  return tc_malloc(size);
+}
+
+void* mz_calloc(malloc_zone_t* zone, size_t num_items, size_t size) {
+  return tc_calloc(num_items, size);
+}
+
+void* mz_valloc(malloc_zone_t* zone, size_t size) {
+  return tc_valloc(size);
+}
+
+void mz_free(malloc_zone_t* zone, void* ptr) {
+  return tc_free(ptr);
+}
+
+void* mz_realloc(malloc_zone_t* zone, void* ptr, size_t size) {
+  return tc_realloc(ptr, size);
+}
+
+void* mz_memalign(malloc_zone_t* zone, size_t align, size_t size) {
+  return tc_memalign(align, size);
+}
+
+void mz_destroy(malloc_zone_t* zone) {
+  // A no-op -- we will not be destroyed!
+}
+
+// malloc_introspection callbacks.  I'm not clear on what all of these do.
+kern_return_t mi_enumerator(task_t task, void *,
+                            unsigned type_mask, vm_address_t zone_address,
+                            memory_reader_t reader,
+                            vm_range_recorder_t recorder) {
+  // Should enumerate all the pointers we have.  Seems like a lot of work.
+  return KERN_FAILURE;
+}
+
+size_t mi_good_size(malloc_zone_t *zone, size_t size) {
+  // I think it's always safe to return size, but maybe we could do better.
+  return size;
+}
+
+boolean_t mi_check(malloc_zone_t *zone) {
+  return MallocExtension::instance()->VerifyAllMemory();
+}
+
+void mi_print(malloc_zone_t *zone, boolean_t verbose) {
+  int bufsize = 8192;
+  if (verbose)
+    bufsize = 102400;   // I picked this size arbitrarily
+  char* buffer = new char[bufsize];
+  MallocExtension::instance()->GetStats(buffer, bufsize);
+  fprintf(stdout, "%s", buffer);
+  delete[] buffer;
+}
+
+void mi_log(malloc_zone_t *zone, void *address) {
+  // I don't think we support anything like this
+}
+
+void mi_force_lock(malloc_zone_t *zone) {
+  tcmalloc::CentralCacheLockAll();
+}
+
+void mi_force_unlock(malloc_zone_t *zone) {
+  tcmalloc::CentralCacheUnlockAll();
+}
+
+void mi_statistics(malloc_zone_t *zone, malloc_statistics_t *stats) {
+  // TODO(csilvers): figure out how to fill these out
+  stats->blocks_in_use = 0;
+  stats->size_in_use = 0;
+  stats->max_size_in_use = 0;
+  stats->size_allocated = 0;
+}
+
+boolean_t mi_zone_locked(malloc_zone_t *zone) {
+  return false;  // Hopefully unneeded by us!
+}
+
+}  // unnamed namespace
+
+// OS X doesn't have pvalloc, cfree, malloc_stats, etc, so we can just
+// define our own. :-)  OS X supplies posix_memalign in some versions
+// but not others, either strongly or weakly linked, in a way that's
+// difficult enough to code correctly that I just don't try to
+// support either memalign() or posix_memalign().  If you need them
+// and are willing to code to tcmalloc, you can use tc_posix_memalign().
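+// For example (an illustrative sketch):
+//   void* ptr = NULL;
+//   if (tc_posix_memalign(&ptr, 64, 1024) == 0) {
+//     // ... use the 64-byte-aligned, 1024-byte block ...
+//     tc_free(ptr);
+//   }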
+extern "C" {
+  void  cfree(void* p)                   { tc_cfree(p);               }
+  void* pvalloc(size_t s)                { return tc_pvalloc(s);      }
+  void malloc_stats(void)                { tc_malloc_stats();         }
+  int mallopt(int cmd, int v)            { return tc_mallopt(cmd, v); }
+  // No struct mallinfo on OS X, so don't define mallinfo().
+  // An alias for malloc_size(), which OS X defines.
+  size_t malloc_usable_size(void* p)     { return tc_malloc_size(p); }
+}  // extern "C"
+
+static void ReplaceSystemAlloc() {
+  static malloc_introspection_t tcmalloc_introspection;
+  memset(&tcmalloc_introspection, 0, sizeof(tcmalloc_introspection));
+
+  tcmalloc_introspection.enumerator = &mi_enumerator;
+  tcmalloc_introspection.good_size = &mi_good_size;
+  tcmalloc_introspection.check = &mi_check;
+  tcmalloc_introspection.print = &mi_print;
+  tcmalloc_introspection.log = &mi_log;
+  tcmalloc_introspection.force_lock = &mi_force_lock;
+  tcmalloc_introspection.force_unlock = &mi_force_unlock;
+
+  static malloc_zone_t tcmalloc_zone;
+  memset(&tcmalloc_zone, 0, sizeof(malloc_zone_t));
+
+  // Start with a version 4 zone which is used for OS X 10.4 and 10.5.
+  tcmalloc_zone.version = 4;
+  tcmalloc_zone.zone_name = "tcmalloc";
+  tcmalloc_zone.size = &mz_size;
+  tcmalloc_zone.malloc = &mz_malloc;
+  tcmalloc_zone.calloc = &mz_calloc;
+  tcmalloc_zone.valloc = &mz_valloc;
+  tcmalloc_zone.free = &mz_free;
+  tcmalloc_zone.realloc = &mz_realloc;
+  tcmalloc_zone.destroy = &mz_destroy;
+  tcmalloc_zone.batch_malloc = NULL;
+  tcmalloc_zone.batch_free = NULL;
+  tcmalloc_zone.introspect = &tcmalloc_introspection;
+
+  // from AvailabilityMacros.h
+#if defined(MAC_OS_X_VERSION_10_6) && \
+    MAC_OS_X_VERSION_MAX_ALLOWED >= MAC_OS_X_VERSION_10_6
+  // Switch to version 6 on OSX 10.6 to support memalign.
+  tcmalloc_zone.version = 6;
+  tcmalloc_zone.free_definite_size = NULL;
+  tcmalloc_zone.memalign = &mz_memalign;
+  tcmalloc_introspection.zone_locked = &mi_zone_locked;
+
+  // Request the default purgeable zone to force its creation.  The
+  // current default zone is registered with the purgeable zone for
+  // doing tiny and small allocs.  Sadly, it assumes that the default
+  // zone is the szone implementation from OS X and will crash if it
+  // isn't.  By creating the zone now, this will be true and changing
+  // the default zone won't cause a problem.  This only needs to
+  // happen when actually running on OS X 10.6 and higher (note the
+  // ifdef above only checks if we were *compiled* with 10.6 or
+  // higher; at runtime we have to check if this symbol is defined.)
+  if (malloc_default_purgeable_zone) {
+    malloc_default_purgeable_zone();
+  }
+#endif
+
+  // Register the tcmalloc zone. At this point, it will not be the
+  // default zone.
+  malloc_zone_register(&tcmalloc_zone);
+
+  // Unregister and reregister the default zone.  Unregistering swaps
+  // the specified zone with the last one registered which for the
+  // default zone makes the more recently registered zone the default
+  // zone.  The default zone is then re-registered to ensure that
+  // allocations made from it earlier will be handled correctly.
+  // Things are not guaranteed to work that way, but it's how they work now.
+  malloc_zone_t *default_zone = malloc_default_zone();
+  malloc_zone_unregister(default_zone);
+  malloc_zone_register(default_zone);
+}
+
+#endif  // TCMALLOC_LIBC_OVERRIDE_OSX_INL_H_
diff --git a/src/libc_override_redefine.h b/src/libc_override_redefine.h
new file mode 100644
index 0000000..a1e50f8
--- /dev/null
+++ b/src/libc_override_redefine.h
@@ -0,0 +1,94 @@
+// -*- Mode: C++; c-basic-offset: 2; indent-tabs-mode: nil -*-
+// Copyright (c) 2011, Google Inc.
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+//     * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+//     * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+// ---
+// Author: Craig Silverstein <opensource@google.com>
+//
+// Used on systems that don't have their own definition of
+// malloc/new/etc.  (Typically this will be a windows msvcrt.dll that
+// has been edited to remove the definitions.)  We can just define our
+// own as normal functions.
+//
+// This should also work on systems where all the malloc routines are
+// defined as weak symbols, and there's no support for aliasing.
+
+#ifndef TCMALLOC_LIBC_OVERRIDE_REDEFINE_H_
+#define TCMALLOC_LIBC_OVERRIDE_REDEFINE_H_
+
+#ifdef HAVE_SYS_CDEFS_H
+#include <sys/cdefs.h>    // for __THROW
+#endif
+
+#ifndef __THROW    // I guess we're not on a glibc-like system
+# define __THROW   // __THROW is just an optimization, so ok to make it ""
+#endif
+
+void* operator new(size_t size)                  { return tc_new(size);       }
+void operator delete(void* p) __THROW            { tc_delete(p);              }
+void* operator new[](size_t size)                { return tc_newarray(size);  }
+void operator delete[](void* p) __THROW          { tc_deletearray(p);         }
+void* operator new(size_t size, const std::nothrow_t& nt) __THROW {
+  return tc_new_nothrow(size, nt);
+}
+void* operator new[](size_t size, const std::nothrow_t& nt) __THROW {
+  return tc_newarray_nothrow(size, nt);
+}
+void operator delete(void* ptr, const std::nothrow_t& nt) __THROW {
+  return tc_delete_nothrow(ptr, nt);
+}
+void operator delete[](void* ptr, const std::nothrow_t& nt) __THROW {
+  return tc_deletearray_nothrow(ptr, nt);
+}
+extern "C" {
+  void* malloc(size_t s) __THROW                 { return tc_malloc(s);       }
+  void  free(void* p) __THROW                    { tc_free(p);                }
+  void* realloc(void* p, size_t s) __THROW       { return tc_realloc(p, s);   }
+  void* calloc(size_t n, size_t s) __THROW       { return tc_calloc(n, s);    }
+  void  cfree(void* p) __THROW                   { tc_cfree(p);               }
+  void* memalign(size_t a, size_t s) __THROW     { return tc_memalign(a, s);  }
+  void* valloc(size_t s) __THROW                 { return tc_valloc(s);       }
+  void* pvalloc(size_t s) __THROW                { return tc_pvalloc(s);      }
+  int posix_memalign(void** r, size_t a, size_t s) __THROW {
+    return tc_posix_memalign(r, a, s);
+  }
+  void malloc_stats(void) __THROW                { tc_malloc_stats();         }
+  int mallopt(int cmd, int v) __THROW            { return tc_mallopt(cmd, v); }
+#ifdef HAVE_STRUCT_MALLINFO
+  struct mallinfo mallinfo(void) __THROW         { return tc_mallinfo();      }
+#endif
+  size_t malloc_size(void* p) __THROW            { return tc_malloc_size(p); }
+  size_t malloc_usable_size(void* p) __THROW     { return tc_malloc_size(p); }
+}  // extern "C"
+
+// No need to do anything at tcmalloc-registration time: we do it all
+// via overriding weak symbols (at link time).
+static void ReplaceSystemAlloc() { }
+
+#endif  // TCMALLOC_LIBC_OVERRIDE_REDEFINE_H_
diff --git a/src/linked_list.h b/src/linked_list.h
new file mode 100644
index 0000000..66a0741
--- /dev/null
+++ b/src/linked_list.h
@@ -0,0 +1,103 @@
+// -*- Mode: C++; c-basic-offset: 2; indent-tabs-mode: nil -*-
+// Copyright (c) 2008, Google Inc.
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+//     * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+//     * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+// ---
+// Author: Sanjay Ghemawat <opensource@google.com>
+//
+// Some very basic linked list functions for dealing with using void * as
+// storage.
+
+#ifndef TCMALLOC_LINKED_LIST_H_
+#define TCMALLOC_LINKED_LIST_H_
+
+#include <stddef.h>
+
+namespace tcmalloc {
+
+inline void *SLL_Next(void *t) {
+  return *(reinterpret_cast<void**>(t));
+}
+
+inline void SLL_SetNext(void *t, void *n) {
+  *(reinterpret_cast<void**>(t)) = n;
+}
+
+inline void SLL_Push(void **list, void *element) {
+  SLL_SetNext(element, *list);
+  *list = element;
+}
+
+inline void *SLL_Pop(void **list) {
+  void *result = *list;
+  *list = SLL_Next(*list);
+  return result;
+}
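+
+// Illustrative sketch: every element threaded onto one of these lists must be
+// at least one pointer wide, because its first word is reused as the "next"
+// link.  "chunk" here stands for some hypothetical free block of memory.
+//
+//   void* list = NULL;
+//   SLL_Push(&list, chunk);         // chunk's first word now holds NULL
+//   void* popped = SLL_Pop(&list);  // popped == chunk; list is empty again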
+
+// Remove N elements from a linked list to which head points.  head will be
+// modified to point to the new head.  start and end will point to the first
+// and last nodes of the range.  Note that end will point to NULL after this
+// function is called.
+inline void SLL_PopRange(void **head, int N, void **start, void **end) {
+  if (N == 0) {
+    *start = NULL;
+    *end = NULL;
+    return;
+  }
+
+  void *tmp = *head;
+  for (int i = 1; i < N; ++i) {
+    tmp = SLL_Next(tmp);
+  }
+
+  *start = *head;
+  *end = tmp;
+  *head = SLL_Next(tmp);
+  // Unlink range from list.
+  SLL_SetNext(tmp, NULL);
+}
+
+inline void SLL_PushRange(void **head, void *start, void *end) {
+  if (!start) return;
+  SLL_SetNext(end, *head);
+  *head = start;
+}
+
+inline size_t SLL_Size(void *head) {
+  size_t count = 0;
+  while (head) {
+    count++;
+    head = SLL_Next(head);
+  }
+  return count;
+}
+
+}  // namespace tcmalloc
+
+#endif  // TCMALLOC_LINKED_LIST_H_
diff --git a/src/malloc_extension.cc b/src/malloc_extension.cc
new file mode 100644
index 0000000..4ff719c
--- /dev/null
+++ b/src/malloc_extension.cc
@@ -0,0 +1,378 @@
+// -*- Mode: C++; c-basic-offset: 2; indent-tabs-mode: nil -*-
+// Copyright (c) 2005, Google Inc.
+// All rights reserved.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+// 
+//     * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+//     * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+// 
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+// ---
+// Author: Sanjay Ghemawat <opensource@google.com>
+
+#include <config.h>
+#include <assert.h>
+#include <string.h>
+#include <stdio.h>
+#if defined HAVE_STDINT_H
+#include <stdint.h>
+#elif defined HAVE_INTTYPES_H
+#include <inttypes.h>
+#else
+#include <sys/types.h>
+#endif
+#include <string>
+#include "base/dynamic_annotations.h"
+#include "base/sysinfo.h"    // for FillProcSelfMaps
+#ifndef NO_HEAP_CHECK
+#include "gperftools/heap-checker.h"
+#endif
+#include "gperftools/malloc_extension.h"
+#include "gperftools/malloc_extension_c.h"
+#include "maybe_threads.h"
+#include "base/googleinit.h"
+
+using STL_NAMESPACE::string;
+using STL_NAMESPACE::vector;
+
+static void DumpAddressMap(string* result) {
+  *result += "\nMAPPED_LIBRARIES:\n";
+  // We keep doubling until we get a fit
+  const size_t old_resultlen = result->size();
+  for (int amap_size = 10240; amap_size < 10000000; amap_size *= 2) {
+    result->resize(old_resultlen + amap_size);
+    bool wrote_all = false;
+    const int bytes_written =
+        tcmalloc::FillProcSelfMaps(&((*result)[old_resultlen]), amap_size,
+                                   &wrote_all);
+    if (wrote_all) {   // we fit!
+      (*result)[old_resultlen + bytes_written] = '\0';
+      result->resize(old_resultlen + bytes_written);
+      return;
+    }
+  }
+  result->reserve(old_resultlen);   // just don't print anything
+}
+
+// Note: this routine is meant to be called before threads are spawned.
+void MallocExtension::Initialize() {
+  static bool initialize_called = false;
+
+  if (initialize_called) return;
+  initialize_called = true;
+
+#ifdef __GLIBC__
+  // GNU libstdc++ versions 3.3 and 3.4 obey the environment variables
+  // GLIBCPP_FORCE_NEW and GLIBCXX_FORCE_NEW respectively.  Setting
+  // one of these variables forces the STL default allocator to call
+  // new() or delete() for each allocation or deletion.  Otherwise
+  // the STL allocator tries to avoid the high cost of doing
+  // allocations by pooling memory internally.  However, tcmalloc
+  // does allocations really fast, especially for the types of small
+  // items one sees in STL, so it's better off just using us.
+  // TODO: control whether we do this via an environment variable?
+  setenv("GLIBCPP_FORCE_NEW", "1", false /* no overwrite*/);
+  setenv("GLIBCXX_FORCE_NEW", "1", false /* no overwrite*/);
+
+  // Now we need to make the setenv 'stick', which it may not do since
+  // the env is flakey before main() is called.  But luckily stl only
+  // looks at this env var the first time it tries to do an alloc, and
+  // caches what it finds.  So we just cause an stl alloc here.
+  string dummy("I need to be allocated");
+  dummy += "!";         // so the definition of dummy isn't optimized out
+#endif  /* __GLIBC__ */
+}
+
+// SysAllocator implementation
+SysAllocator::~SysAllocator() {}
+
+// Default implementation -- does nothing
+MallocExtension::~MallocExtension() { }
+bool MallocExtension::VerifyAllMemory() { return true; }
+bool MallocExtension::VerifyNewMemory(const void* p) { return true; }
+bool MallocExtension::VerifyArrayNewMemory(const void* p) { return true; }
+bool MallocExtension::VerifyMallocMemory(const void* p) { return true; }
+
+bool MallocExtension::GetNumericProperty(const char* property, size_t* value) {
+  return false;
+}
+
+bool MallocExtension::SetNumericProperty(const char* property, size_t value) {
+  return false;
+}
+
+void MallocExtension::GetStats(char* buffer, int length) {
+  assert(length > 0);
+  buffer[0] = '\0';
+}
+
+bool MallocExtension::MallocMemoryStats(int* blocks, size_t* total,
+                                       int histogram[kMallocHistogramSize]) {
+  *blocks = 0;
+  *total = 0;
+  memset(histogram, 0, sizeof(*histogram) * kMallocHistogramSize);
+  return true;
+}
+
+void** MallocExtension::ReadStackTraces(int* sample_period) {
+  return NULL;
+}
+
+void** MallocExtension::ReadHeapGrowthStackTraces() {
+  return NULL;
+}
+
+void MallocExtension::MarkThreadIdle() {
+  // Default implementation does nothing
+}
+
+void MallocExtension::MarkThreadBusy() {
+  // Default implementation does nothing
+}
+
+SysAllocator* MallocExtension::GetSystemAllocator() {
+  return NULL;
+}
+
+void MallocExtension::SetSystemAllocator(SysAllocator *a) {
+  // Default implementation does nothing
+}
+
+void MallocExtension::ReleaseToSystem(size_t num_bytes) {
+  // Default implementation does nothing
+}
+
+void MallocExtension::ReleaseFreeMemory() {
+  ReleaseToSystem(static_cast<size_t>(-1));   // SIZE_T_MAX
+}
+
+void MallocExtension::SetMemoryReleaseRate(double rate) {
+  // Default implementation does nothing
+}
+
+double MallocExtension::GetMemoryReleaseRate() {
+  return -1.0;
+}
+
+size_t MallocExtension::GetEstimatedAllocatedSize(size_t size) {
+  return size;
+}
+
+size_t MallocExtension::GetAllocatedSize(const void* p) {
+  assert(GetOwnership(p) != kNotOwned);
+  return 0;
+}
+
+MallocExtension::Ownership MallocExtension::GetOwnership(const void* p) {
+  return kUnknownOwnership;
+}
+
+void MallocExtension::GetFreeListSizes(
+    vector<MallocExtension::FreeListInfo>* v) {
+  v->clear();
+}
+
+// The current malloc extension object.
+
+static MallocExtension* current_instance;
+
+static void InitModule() {
+  if (current_instance != NULL) {
+    return;
+  }
+  current_instance = new MallocExtension;
+#ifndef NO_HEAP_CHECK
+  HeapLeakChecker::IgnoreObject(current_instance);
+#endif
+}
+
+REGISTER_MODULE_INITIALIZER(malloc_extension_init, InitModule())
+
+MallocExtension* MallocExtension::instance() {
+  InitModule();
+  return current_instance;
+}
+
+void MallocExtension::Register(MallocExtension* implementation) {
+  InitModule();
+  // When running under valgrind, our custom malloc is replaced with
+  // valgrind's one and malloc extensions will not work.  (Note:
+  // callers should be responsible for checking that they are the
+  // malloc that is really being run, before calling Register.  This
+  // is just here as an extra sanity check.)
+  if (!RunningOnValgrind()) {
+    current_instance = implementation;
+  }
+}
+
+// -----------------------------------------------------------------------
+// Heap sampling support
+// -----------------------------------------------------------------------
+
+namespace {
+
+// Accessors
+uintptr_t Count(void** entry) {
+  return reinterpret_cast<uintptr_t>(entry[0]);
+}
+uintptr_t Size(void** entry) {
+  return reinterpret_cast<uintptr_t>(entry[1]);
+}
+uintptr_t Depth(void** entry) {
+  return reinterpret_cast<uintptr_t>(entry[2]);
+}
+void* PC(void** entry, int i) {
+  return entry[3+i];
+}
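+
+// Layout of each entry, as implied by the accessors above:
+//   entry[0] = count, entry[1] = total size, entry[2] = stack depth d,
+//   entry[3] .. entry[3+d-1] = the program counters of the stack trace.
+// The array is terminated by an entry whose count is 0, which is why the
+// loops below advance by 3 + Depth(entry) and stop when Count(entry) == 0.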
+
+void PrintCountAndSize(MallocExtensionWriter* writer,
+                       uintptr_t count, uintptr_t size) {
+  char buf[100];
+  snprintf(buf, sizeof(buf),
+           "%6" PRIu64 ": %8" PRIu64 " [%6" PRIu64 ": %8" PRIu64 "] @",
+           static_cast<uint64>(count),
+           static_cast<uint64>(size),
+           static_cast<uint64>(count),
+           static_cast<uint64>(size));
+  writer->append(buf, strlen(buf));
+}
+
+void PrintHeader(MallocExtensionWriter* writer,
+                 const char* label, void** entries) {
+  // Compute the total count and total size
+  uintptr_t total_count = 0;
+  uintptr_t total_size = 0;
+  for (void** entry = entries; Count(entry) != 0; entry += 3 + Depth(entry)) {
+    total_count += Count(entry);
+    total_size += Size(entry);
+  }
+
+  const char* const kTitle = "heap profile: ";
+  writer->append(kTitle, strlen(kTitle));
+  PrintCountAndSize(writer, total_count, total_size);
+  writer->append(" ", 1);
+  writer->append(label, strlen(label));
+  writer->append("\n", 1);
+}
+
+void PrintStackEntry(MallocExtensionWriter* writer, void** entry) {
+  PrintCountAndSize(writer, Count(entry), Size(entry));
+
+  for (int i = 0; i < Depth(entry); i++) {
+    char buf[32];
+    snprintf(buf, sizeof(buf), " %p", PC(entry, i));
+    writer->append(buf, strlen(buf));
+  }
+  writer->append("\n", 1);
+}
+
+}
+
+void MallocExtension::GetHeapSample(MallocExtensionWriter* writer) {
+  int sample_period = 0;
+  void** entries = ReadStackTraces(&sample_period);
+  if (entries == NULL) {
+    const char* const kErrorMsg =
+        "This malloc implementation does not support sampling.\n"
+        "As of 2005/01/26, only tcmalloc supports sampling, and\n"
+        "you are probably running a binary that does not use\n"
+        "tcmalloc.\n";
+    writer->append(kErrorMsg, strlen(kErrorMsg));
+    return;
+  }
+
+  char label[32];
+  sprintf(label, "heap_v2/%d", sample_period);
+  PrintHeader(writer, label, entries);
+  for (void** entry = entries; Count(entry) != 0; entry += 3 + Depth(entry)) {
+    PrintStackEntry(writer, entry);
+  }
+  delete[] entries;
+
+  DumpAddressMap(writer);
+}
+
+void MallocExtension::GetHeapGrowthStacks(MallocExtensionWriter* writer) {
+  void** entries = ReadHeapGrowthStackTraces();
+  if (entries == NULL) {
+    const char* const kErrorMsg =
+        "This malloc implementation does not support "
+        "ReadHeapGrowthStackTraces().\n"
+        "As of 2005/09/27, only tcmalloc supports this, and you\n"
+        "are probably running a binary that does not use tcmalloc.\n";
+    writer->append(kErrorMsg, strlen(kErrorMsg));
+    return;
+  }
+
+  // Do not canonicalize the stack entries, so that we get a
+  // time-ordered list of stack traces, which may be useful if the
+  // client wants to focus on the latest stack traces.
+  PrintHeader(writer, "growth", entries);
+  for (void** entry = entries; Count(entry) != 0; entry += 3 + Depth(entry)) {
+    PrintStackEntry(writer, entry);
+  }
+  delete[] entries;
+
+  DumpAddressMap(writer);
+}
+
+void MallocExtension::Ranges(void* arg, RangeFunction func) {
+  // No callbacks by default
+}
+
+// These are C shims that work on the current instance.
+
+#define C_SHIM(fn, retval, paramlist, arglist)          \
+  extern "C" PERFTOOLS_DLL_DECL retval MallocExtension_##fn paramlist {    \
+    return MallocExtension::instance()->fn arglist;     \
+  }
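+
+// For example, C_SHIM(ReleaseFreeMemory, void, (void), ()) below expands
+// (roughly) to:
+//   extern "C" PERFTOOLS_DLL_DECL void MallocExtension_ReleaseFreeMemory(void) {
+//     return MallocExtension::instance()->ReleaseFreeMemory();
+//   }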
+
+C_SHIM(VerifyAllMemory, int, (void), ());
+C_SHIM(VerifyNewMemory, int, (const void* p), (p));
+C_SHIM(VerifyArrayNewMemory, int, (const void* p), (p));
+C_SHIM(VerifyMallocMemory, int, (const void* p), (p));
+C_SHIM(MallocMemoryStats, int,
+       (int* blocks, size_t* total, int histogram[kMallocHistogramSize]),
+       (blocks, total, histogram));
+
+C_SHIM(GetStats, void,
+       (char* buffer, int buffer_length), (buffer, buffer_length));
+C_SHIM(GetNumericProperty, int,
+       (const char* property, size_t* value), (property, value));
+C_SHIM(SetNumericProperty, int,
+       (const char* property, size_t value), (property, value));
+
+C_SHIM(MarkThreadIdle, void, (void), ());
+C_SHIM(MarkThreadBusy, void, (void), ());
+C_SHIM(ReleaseFreeMemory, void, (void), ());
+C_SHIM(ReleaseToSystem, void, (size_t num_bytes), (num_bytes));
+C_SHIM(GetEstimatedAllocatedSize, size_t, (size_t size), (size));
+C_SHIM(GetAllocatedSize, size_t, (const void* p), (p));
+
+// Can't use the shim here because of the need to translate the enums.
+extern "C"
+MallocExtension_Ownership MallocExtension_GetOwnership(const void* p) {
+  return static_cast<MallocExtension_Ownership>(
+      MallocExtension::instance()->GetOwnership(p));
+}
diff --git a/src/malloc_hook-inl.h b/src/malloc_hook-inl.h
new file mode 100644
index 0000000..9e74ec8
--- /dev/null
+++ b/src/malloc_hook-inl.h
@@ -0,0 +1,247 @@
+// -*- Mode: C++; c-basic-offset: 2; indent-tabs-mode: nil -*-
+// Copyright (c) 2005, Google Inc.
+// All rights reserved.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+// 
+//     * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+//     * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+// 
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+// ---
+// Author: Sanjay Ghemawat
+//
+// This has the implementation details of malloc_hook that are needed
+// to use malloc-hook inside the tcmalloc system.  It does not hold
+// any of the client-facing calls that are used to add new hooks.
+
+#ifndef _MALLOC_HOOK_INL_H_
+#define _MALLOC_HOOK_INL_H_
+
+#include <stddef.h>
+#include <sys/types.h>
+#include "base/atomicops.h"
+#include "base/basictypes.h"
+#include <gperftools/malloc_hook.h>
+
+namespace base { namespace internal {
+
+// Capacity of 8 means that HookList is 9 words.
+static const int kHookListCapacity = 8;
+// The last entry is reserved for deprecated "singular" hooks, so we have
+// 7 "normal" hooks per list.
+static const int kHookListMaxValues = 7;
+static const int kHookListSingularIdx = 7;
+
+// HookList: a class that provides synchronized insertions and removals and
+// lockless traversal.  Most of the implementation is in malloc_hook.cc.
+template <typename T>
+struct PERFTOOLS_DLL_DECL HookList {
+  COMPILE_ASSERT(sizeof(T) <= sizeof(AtomicWord), T_should_fit_in_AtomicWord);
+
+  // Adds value to the list.  Note that duplicates are allowed.  Thread-safe and
+  // blocking (acquires hooklist_spinlock).  Returns true on success; false
+  // otherwise (failures include invalid value and no space left).
+  bool Add(T value);
+
+  void FixupPrivEndLocked();
+
+  // Removes the first entry matching value from the list.  Thread-safe and
+  // blocking (acquires hooklist_spinlock).  Returns true on success; false
+  // otherwise (failures include invalid value and no value found).
+  bool Remove(T value);
+
+  // Store up to n values of the list in output_array, and return the number of
+  // elements stored.  Thread-safe and non-blocking.  This is fast (one memory
+  // access) if the list is empty.
+  int Traverse(T* output_array, int n) const;
+
+  // Fast inline implementation for fast path of Invoke*Hook.
+  bool empty() const {
+    return base::subtle::NoBarrier_Load(&priv_end) == 0;
+  }
+
+  // Used purely to handle deprecated singular hooks
+  T GetSingular() const {
+    const AtomicWord *place = &priv_data[kHookListSingularIdx];
+    return bit_cast<T>(base::subtle::NoBarrier_Load(place));
+  }
+
+  T ExchangeSingular(T new_val);
+
+  // This internal data is not private so that the class is an aggregate and can
+  // be initialized by the linker.  Don't access this directly.  Use the
+  // INIT_HOOK_LIST macro in malloc_hook.cc.
+
+  // One more than the index of the last valid element in priv_data.  During
+  // 'Remove' this may be past the last valid element in priv_data, but
+  // subsequent values will be 0.
+  //
+  // Index kHookListCapacity-1 is reserved as 'deprecated' single hook pointer
+  AtomicWord priv_end;
+  AtomicWord priv_data[kHookListCapacity];
+};
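+
+// Illustrative sketch (names are for exposition only) of how a consumer of
+// one of the lists declared below can snapshot and invoke the registered
+// hooks; the inline fast path only pays for the single atomic load in
+// empty():
+//   MallocHook::NewHook hooks[kHookListMaxValues];
+//   int n = base::internal::new_hooks_.Traverse(hooks, kHookListMaxValues);
+//   for (int i = 0; i < n; i++) hooks[i](ptr, size);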
+
+ATTRIBUTE_VISIBILITY_HIDDEN extern HookList<MallocHook::NewHook> new_hooks_;
+ATTRIBUTE_VISIBILITY_HIDDEN extern HookList<MallocHook::DeleteHook> delete_hooks_;
+ATTRIBUTE_VISIBILITY_HIDDEN extern HookList<MallocHook::PreMmapHook> premmap_hooks_;
+ATTRIBUTE_VISIBILITY_HIDDEN extern HookList<MallocHook::MmapHook> mmap_hooks_;
+ATTRIBUTE_VISIBILITY_HIDDEN extern HookList<MallocHook::MmapReplacement> mmap_replacement_;
+ATTRIBUTE_VISIBILITY_HIDDEN extern HookList<MallocHook::MunmapHook> munmap_hooks_;
+ATTRIBUTE_VISIBILITY_HIDDEN extern HookList<MallocHook::MunmapReplacement> munmap_replacement_;
+ATTRIBUTE_VISIBILITY_HIDDEN extern HookList<MallocHook::MremapHook> mremap_hooks_;
+ATTRIBUTE_VISIBILITY_HIDDEN extern HookList<MallocHook::PreSbrkHook> presbrk_hooks_;
+ATTRIBUTE_VISIBILITY_HIDDEN extern HookList<MallocHook::SbrkHook> sbrk_hooks_;
+
+} }  // namespace base::internal
+
+// The following method is DEPRECATED
+inline MallocHook::NewHook MallocHook::GetNewHook() {
+  return base::internal::new_hooks_.GetSingular();
+}
+
+inline void MallocHook::InvokeNewHook(const void* p, size_t s) {
+  if (!base::internal::new_hooks_.empty()) {
+    InvokeNewHookSlow(p, s);
+  }
+}
+
+// The following method is DEPRECATED
+inline MallocHook::DeleteHook MallocHook::GetDeleteHook() {
+  return base::internal::delete_hooks_.GetSingular();
+}
+
+inline void MallocHook::InvokeDeleteHook(const void* p) {
+  if (!base::internal::delete_hooks_.empty()) {
+    InvokeDeleteHookSlow(p);
+  }
+}
+
+// The following method is DEPRECATED
+inline MallocHook::PreMmapHook MallocHook::GetPreMmapHook() {
+  return base::internal::premmap_hooks_.GetSingular();
+}
+
+inline void MallocHook::InvokePreMmapHook(const void* start,
+                                          size_t size,
+                                          int protection,
+                                          int flags,
+                                          int fd,
+                                          off_t offset) {
+  if (!base::internal::premmap_hooks_.empty()) {
+    InvokePreMmapHookSlow(start, size, protection, flags, fd, offset);
+  }
+}
+
+// The following method is DEPRECATED
+inline MallocHook::MmapHook MallocHook::GetMmapHook() {
+  return base::internal::mmap_hooks_.GetSingular();
+}
+
+inline void MallocHook::InvokeMmapHook(const void* result,
+                                       const void* start,
+                                       size_t size,
+                                       int protection,
+                                       int flags,
+                                       int fd,
+                                       off_t offset) {
+  if (!base::internal::mmap_hooks_.empty()) {
+    InvokeMmapHookSlow(result, start, size, protection, flags, fd, offset);
+  }
+}
+
+inline bool MallocHook::InvokeMmapReplacement(const void* start,
+                                              size_t size,
+                                              int protection,
+                                              int flags,
+                                              int fd,
+                                              off_t offset,
+                                              void** result) {
+  if (!base::internal::mmap_replacement_.empty()) {
+    return InvokeMmapReplacementSlow(start, size,
+                                     protection, flags,
+                                     fd, offset,
+                                     result);
+  }
+  return false;
+}
+
+// The following method is DEPRECATED
+inline MallocHook::MunmapHook MallocHook::GetMunmapHook() {
+  return base::internal::munmap_hooks_.GetSingular();
+}
+
+inline void MallocHook::InvokeMunmapHook(const void* p, size_t size) {
+  if (!base::internal::munmap_hooks_.empty()) {
+    InvokeMunmapHookSlow(p, size);
+  }
+}
+
+inline bool MallocHook::InvokeMunmapReplacement(
+    const void* p, size_t size, int* result) {
+  if (!base::internal::mmap_replacement_.empty()) {
+    return InvokeMunmapReplacementSlow(p, size, result);
+  }
+  return false;
+}
+
+// The following method is DEPRECATED
+inline MallocHook::MremapHook MallocHook::GetMremapHook() {
+  return base::internal::mremap_hooks_.GetSingular();
+}
+
+inline void MallocHook::InvokeMremapHook(const void* result,
+                                         const void* old_addr,
+                                         size_t old_size,
+                                         size_t new_size,
+                                         int flags,
+                                         const void* new_addr) {
+  if (!base::internal::mremap_hooks_.empty()) {
+    InvokeMremapHookSlow(result, old_addr, old_size, new_size, flags, new_addr);
+  }
+}
+
+// The following method is DEPRECATED
+inline MallocHook::PreSbrkHook MallocHook::GetPreSbrkHook() {
+  return base::internal::presbrk_hooks_.GetSingular();
+}
+
+inline void MallocHook::InvokePreSbrkHook(ptrdiff_t increment) {
+  if (!base::internal::presbrk_hooks_.empty() && increment != 0) {
+    InvokePreSbrkHookSlow(increment);
+  }
+}
+
+// The following method is DEPRECATED
+inline MallocHook::SbrkHook MallocHook::GetSbrkHook() {
+  return base::internal::sbrk_hooks_.GetSingular();
+}
+
+inline void MallocHook::InvokeSbrkHook(const void* result,
+                                       ptrdiff_t increment) {
+  if (!base::internal::sbrk_hooks_.empty() && increment != 0) {
+    InvokeSbrkHookSlow(result, increment);
+  }
+}
+
+#endif /* _MALLOC_HOOK_INL_H_ */
diff --git a/src/malloc_hook.cc b/src/malloc_hook.cc
new file mode 100644
index 0000000..681d8a2
--- /dev/null
+++ b/src/malloc_hook.cc
@@ -0,0 +1,692 @@
+// -*- Mode: C++; c-basic-offset: 2; indent-tabs-mode: nil -*-
+// Copyright (c) 2005, Google Inc.
+// All rights reserved.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+// 
+//     * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+//     * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+// 
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+// ---
+// Author: Sanjay Ghemawat <opensource@google.com>
+
+#include <config.h>
+
+// Disable the glibc prototype of mremap(), as older versions of the
+// system headers define this function with only four arguments,
+// whereas newer versions allow an optional fifth argument:
+#ifdef HAVE_MMAP
+# define mremap glibc_mremap
+# include <sys/mman.h>
+# undef mremap
+#endif
+
+#include <stddef.h>
+#ifdef HAVE_STDINT_H
+#include <stdint.h>
+#endif
+#include <algorithm>
+#include "base/logging.h"
+#include "base/spinlock.h"
+#include "maybe_threads.h"
+#include "malloc_hook-inl.h"
+#include <gperftools/malloc_hook.h>
+
+// This #ifdef should almost never be set.  Set NO_TCMALLOC_SAMPLES if
+// you're porting to a system where you really can't get a stacktrace.
+#ifdef NO_TCMALLOC_SAMPLES
+  // We use #define so code compiles even if you #include stacktrace.h somehow.
+# define GetStackTrace(stack, depth, skip)  (0)
+#else
+# include <gperftools/stacktrace.h>
+#endif
+
+// __THROW is defined in glibc systems.  It means, counter-intuitively,
+// "This function will never throw an exception."  It's an optional
+// optimization tool, but we may need to use it to match glibc prototypes.
+#ifndef __THROW    // I guess we're not on a glibc system
+# define __THROW   // __THROW is just an optimization, so ok to make it ""
+#endif
+
+using std::copy;
+
+
+// Declaration of default weak initialization function, that can be overridden
+// by linking-in a strong definition (as heap-checker.cc does).  This is
+// extern "C" so that it doesn't trigger gold's --detect-odr-violations warning,
+// which only looks at C++ symbols.
+//
+// This function is declared here as weak, and defined later, rather than using
+// a more straightforward simple weak definition, as a workaround for an icc
+// compiler issue (Intel reference 290819).  This issue causes icc to resolve
+// weak symbols too early, at compile rather than link time.  By declaring it
+// (weak) here, then defining it below after its use, we can avoid the problem.
+extern "C" {
+ATTRIBUTE_WEAK void MallocHook_InitAtFirstAllocation_HeapLeakChecker();
+}
+
+namespace {
+
+void RemoveInitialHooksAndCallInitializers();  // below.
+
+pthread_once_t once = PTHREAD_ONCE_INIT;
+
+// These hooks are installed in MallocHook as the only initial hooks.  The first
+// hook that is called will run RemoveInitialHooksAndCallInitializers (see the
+// definition below) and then redispatch to any malloc hooks installed by
+// RemoveInitialHooksAndCallInitializers.
+//
+// Note(llib): there is a possibility of a race in the event that there are
+// multiple threads running before the first allocation.  This is pretty
+// difficult to achieve, but if it happens then multiple threads may do
+// allocations concurrently.  The first caller will call
+// RemoveInitialHooksAndCallInitializers via one of the initial hooks.  A
+// concurrent allocation may, depending on timing, either:
+// * still have its initial malloc hook installed, run that, block waiting
+//   for the first caller to finish its call to
+//   RemoveInitialHooksAndCallInitializers, and then proceed normally.
+// * occur some time during the RemoveInitialHooksAndCallInitializers call, at
+//   which point there could be no initial hooks and the subsequent hooks that
+//   are about to be set up by RemoveInitialHooksAndCallInitializers haven't
+//   been installed yet.  I think the worst we can get is that some allocations
+//   will not get reported to some hooks set by the initializers called from
+//   RemoveInitialHooksAndCallInitializers.
+
+void InitialNewHook(const void* ptr, size_t size) {
+  perftools_pthread_once(&once, &RemoveInitialHooksAndCallInitializers);
+  MallocHook::InvokeNewHook(ptr, size);
+}
+
+void InitialPreMMapHook(const void* start,
+                        size_t size,
+                        int protection,
+                        int flags,
+                        int fd,
+                        off_t offset) {
+  perftools_pthread_once(&once, &RemoveInitialHooksAndCallInitializers);
+  MallocHook::InvokePreMmapHook(start, size, protection, flags, fd, offset);
+}
+
+void InitialPreSbrkHook(ptrdiff_t increment) {
+  perftools_pthread_once(&once, &RemoveInitialHooksAndCallInitializers);
+  MallocHook::InvokePreSbrkHook(increment);
+}
+
+// This function is called at most once by one of the above initial malloc
+// hooks.  It removes all initial hooks and initializes all other clients that
+// want to get control at the very first memory allocation.  The initializers
+// may assume that the initial malloc hooks have been removed.  The initializers
+// may set up malloc hooks and allocate memory.
+void RemoveInitialHooksAndCallInitializers() {
+  RAW_CHECK(MallocHook::RemoveNewHook(&InitialNewHook), "");
+  RAW_CHECK(MallocHook::RemovePreMmapHook(&InitialPreMMapHook), "");
+  RAW_CHECK(MallocHook::RemovePreSbrkHook(&InitialPreSbrkHook), "");
+
+  // HeapLeakChecker is currently the only module that needs to get control on
+  // the first memory allocation, but one can add other modules by following the
+  // same weak/strong function pattern.
+  MallocHook_InitAtFirstAllocation_HeapLeakChecker();
+}
+
+}  // namespace
+
+// Weak default initialization function that must go after its use.
+extern "C" void MallocHook_InitAtFirstAllocation_HeapLeakChecker() {
+  // Do nothing.
+}
+
+namespace base { namespace internal {
+
+// This lock is shared between all implementations of HookList::Add & Remove.
+// The potential for contention is very small.  This needs to be a SpinLock and
+// not a Mutex since it's possible for Mutex locking to allocate memory (e.g.,
+// per-thread allocation in debug builds), which could cause infinite recursion.
+static SpinLock hooklist_spinlock(base::LINKER_INITIALIZED);
+
+template <typename T>
+bool HookList<T>::Add(T value_as_t) {
+  AtomicWord value = bit_cast<AtomicWord>(value_as_t);
+  if (value == 0) {
+    return false;
+  }
+  SpinLockHolder l(&hooklist_spinlock);
+  // Find the first slot in data that is 0.
+  int index = 0;
+  while ((index < kHookListMaxValues) &&
+         (base::subtle::NoBarrier_Load(&priv_data[index]) != 0)) {
+    ++index;
+  }
+  if (index == kHookListMaxValues) {
+    return false;
+  }
+  AtomicWord prev_num_hooks = base::subtle::Acquire_Load(&priv_end);
+  base::subtle::NoBarrier_Store(&priv_data[index], value);
+  if (prev_num_hooks <= index) {
+    base::subtle::NoBarrier_Store(&priv_end, index + 1);
+  }
+  return true;
+}
+
+template <typename T>
+void HookList<T>::FixupPrivEndLocked() {
+  AtomicWord hooks_end = base::subtle::NoBarrier_Load(&priv_end);
+  while ((hooks_end > 0) &&
+         (base::subtle::NoBarrier_Load(&priv_data[hooks_end - 1]) == 0)) {
+    --hooks_end;
+  }
+  base::subtle::NoBarrier_Store(&priv_end, hooks_end);
+}
+
+template <typename T>
+bool HookList<T>::Remove(T value_as_t) {
+  if (value_as_t == 0) {
+    return false;
+  }
+  SpinLockHolder l(&hooklist_spinlock);
+  AtomicWord hooks_end = base::subtle::NoBarrier_Load(&priv_end);
+  int index = 0;
+  while (index < hooks_end && value_as_t != bit_cast<T>(
+             base::subtle::NoBarrier_Load(&priv_data[index]))) {
+    ++index;
+  }
+  if (index == hooks_end) {
+    return false;
+  }
+  base::subtle::NoBarrier_Store(&priv_data[index], 0);
+  FixupPrivEndLocked();
+  return true;
+}
+
+template <typename T>
+int HookList<T>::Traverse(T* output_array, int n) const {
+  AtomicWord hooks_end = base::subtle::Acquire_Load(&priv_end);
+  int actual_hooks_end = 0;
+  for (int i = 0; i < hooks_end && n > 0; ++i) {
+    AtomicWord data = base::subtle::Acquire_Load(&priv_data[i]);
+    if (data != 0) {
+      *output_array++ = bit_cast<T>(data);
+      ++actual_hooks_end;
+      --n;
+    }
+  }
+  return actual_hooks_end;
+}
+
+template <typename T>
+T HookList<T>::ExchangeSingular(T value_as_t) {
+  AtomicWord value = bit_cast<AtomicWord>(value_as_t);
+  AtomicWord old_value;
+  SpinLockHolder l(&hooklist_spinlock);
+  old_value = base::subtle::NoBarrier_Load(&priv_data[kHookListSingularIdx]);
+  base::subtle::NoBarrier_Store(&priv_data[kHookListSingularIdx], value);
+  if (value != 0) {
+    base::subtle::NoBarrier_Store(&priv_end, kHookListSingularIdx + 1);
+  } else {
+    FixupPrivEndLocked();
+  }
+  return bit_cast<T>(old_value);
+}
+
+// Initialize a HookList (optionally with the given initial_value in index 0).
+#define INIT_HOOK_LIST { 0 }
+#define INIT_HOOK_LIST_WITH_VALUE(initial_value)                \
+  { 1, { reinterpret_cast<AtomicWord>(initial_value) } }
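+// For example, INIT_HOOK_LIST_WITH_VALUE(&InitialNewHook) below expands to
+//   { 1, { reinterpret_cast<AtomicWord>(&InitialNewHook) } }
+// i.e. priv_end == 1 and priv_data[0] holds the initial hook.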
+
+// Explicit instantiation for malloc_hook_test.cc.  This ensures all the methods
+// are instantiated.
+template struct HookList<MallocHook::NewHook>;
+
+HookList<MallocHook::NewHook> new_hooks_ =
+    INIT_HOOK_LIST_WITH_VALUE(&InitialNewHook);
+HookList<MallocHook::DeleteHook> delete_hooks_ = INIT_HOOK_LIST;
+HookList<MallocHook::PreMmapHook> premmap_hooks_ =
+    INIT_HOOK_LIST_WITH_VALUE(&InitialPreMMapHook);
+HookList<MallocHook::MmapHook> mmap_hooks_ = INIT_HOOK_LIST;
+HookList<MallocHook::MunmapHook> munmap_hooks_ = INIT_HOOK_LIST;
+HookList<MallocHook::MremapHook> mremap_hooks_ = INIT_HOOK_LIST;
+HookList<MallocHook::PreSbrkHook> presbrk_hooks_ =
+    INIT_HOOK_LIST_WITH_VALUE(InitialPreSbrkHook);
+HookList<MallocHook::SbrkHook> sbrk_hooks_ = INIT_HOOK_LIST;
+
+// These lists contain either 0 or 1 hooks.
+HookList<MallocHook::MmapReplacement> mmap_replacement_ = { 0 };
+HookList<MallocHook::MunmapReplacement> munmap_replacement_ = { 0 };
+
+#undef INIT_HOOK_LIST_WITH_VALUE
+#undef INIT_HOOK_LIST
+
+} }  // namespace base::internal
+
+using base::internal::kHookListMaxValues;
+using base::internal::new_hooks_;
+using base::internal::delete_hooks_;
+using base::internal::premmap_hooks_;
+using base::internal::mmap_hooks_;
+using base::internal::mmap_replacement_;
+using base::internal::munmap_hooks_;
+using base::internal::munmap_replacement_;
+using base::internal::mremap_hooks_;
+using base::internal::presbrk_hooks_;
+using base::internal::sbrk_hooks_;
+
+// These are available as C bindings as well as C++, hence their
+// definition outside the MallocHook class.
+extern "C"
+int MallocHook_AddNewHook(MallocHook_NewHook hook) {
+  RAW_VLOG(10, "AddNewHook(%p)", hook);
+  return new_hooks_.Add(hook);
+}
+
+extern "C"
+int MallocHook_RemoveNewHook(MallocHook_NewHook hook) {
+  RAW_VLOG(10, "RemoveNewHook(%p)", hook);
+  return new_hooks_.Remove(hook);
+}
+
+extern "C"
+int MallocHook_AddDeleteHook(MallocHook_DeleteHook hook) {
+  RAW_VLOG(10, "AddDeleteHook(%p)", hook);
+  return delete_hooks_.Add(hook);
+}
+
+extern "C"
+int MallocHook_RemoveDeleteHook(MallocHook_DeleteHook hook) {
+  RAW_VLOG(10, "RemoveDeleteHook(%p)", hook);
+  return delete_hooks_.Remove(hook);
+}
+
+extern "C"
+int MallocHook_AddPreMmapHook(MallocHook_PreMmapHook hook) {
+  RAW_VLOG(10, "AddPreMmapHook(%p)", hook);
+  return premmap_hooks_.Add(hook);
+}
+
+extern "C"
+int MallocHook_RemovePreMmapHook(MallocHook_PreMmapHook hook) {
+  RAW_VLOG(10, "RemovePreMmapHook(%p)", hook);
+  return premmap_hooks_.Remove(hook);
+}
+
+extern "C"
+int MallocHook_SetMmapReplacement(MallocHook_MmapReplacement hook) {
+  RAW_VLOG(10, "SetMmapReplacement(%p)", hook);
+  // NOTE: this is a best-effort CHECK.  Concurrent sets could succeed since
+  // this test is outside of the Add spin lock.
+  RAW_CHECK(mmap_replacement_.empty(), "Only one MMapReplacement is allowed.");
+  return mmap_replacement_.Add(hook);
+}
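+
+// Illustrative sketch (MyMmapReplacement is a hypothetical client function):
+// a replacement hook takes the same parameters as mmap plus an out-parameter,
+// and returns true only when it handled the mapping itself, storing the
+// mapped address in *result (see InvokeMmapReplacementSlow below):
+//
+//   bool MyMmapReplacement(const void* start, size_t size, int protection,
+//                          int flags, int fd, off_t offset, void** result);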
+
+extern "C"
+int MallocHook_RemoveMmapReplacement(MallocHook_MmapReplacement hook) {
+  RAW_VLOG(10, "RemoveMmapReplacement(%p)", hook);
+  return mmap_replacement_.Remove(hook);
+}
+
+extern "C"
+int MallocHook_AddMmapHook(MallocHook_MmapHook hook) {
+  RAW_VLOG(10, "AddMmapHook(%p)", hook);
+  return mmap_hooks_.Add(hook);
+}
+
+extern "C"
+int MallocHook_RemoveMmapHook(MallocHook_MmapHook hook) {
+  RAW_VLOG(10, "RemoveMmapHook(%p)", hook);
+  return mmap_hooks_.Remove(hook);
+}
+
+extern "C"
+int MallocHook_AddMunmapHook(MallocHook_MunmapHook hook) {
+  RAW_VLOG(10, "AddMunmapHook(%p)", hook);
+  return munmap_hooks_.Add(hook);
+}
+
+extern "C"
+int MallocHook_RemoveMunmapHook(MallocHook_MunmapHook hook) {
+  RAW_VLOG(10, "RemoveMunmapHook(%p)", hook);
+  return munmap_hooks_.Remove(hook);
+}
+
+extern "C"
+int MallocHook_SetMunmapReplacement(MallocHook_MunmapReplacement hook) {
+  RAW_VLOG(10, "SetMunmapReplacement(%p)", hook);
+  // NOTE: this is a best-effort CHECK.  Concurrent sets could succeed since
+  // this test is outside of the Add spin lock.
+  RAW_CHECK(munmap_replacement_.empty(),
+            "Only one MunmapReplacement is allowed.");
+  return munmap_replacement_.Add(hook);
+}
+
+extern "C"
+int MallocHook_RemoveMunmapReplacement(MallocHook_MunmapReplacement hook) {
+  RAW_VLOG(10, "RemoveMunmapReplacement(%p)", hook);
+  return munmap_replacement_.Remove(hook);
+}
+
+extern "C"
+int MallocHook_AddMremapHook(MallocHook_MremapHook hook) {
+  RAW_VLOG(10, "AddMremapHook(%p)", hook);
+  return mremap_hooks_.Add(hook);
+}
+
+extern "C"
+int MallocHook_RemoveMremapHook(MallocHook_MremapHook hook) {
+  RAW_VLOG(10, "RemoveMremapHook(%p)", hook);
+  return mremap_hooks_.Remove(hook);
+}
+
+extern "C"
+int MallocHook_AddPreSbrkHook(MallocHook_PreSbrkHook hook) {
+  RAW_VLOG(10, "AddPreSbrkHook(%p)", hook);
+  return presbrk_hooks_.Add(hook);
+}
+
+extern "C"
+int MallocHook_RemovePreSbrkHook(MallocHook_PreSbrkHook hook) {
+  RAW_VLOG(10, "RemovePreSbrkHook(%p)", hook);
+  return presbrk_hooks_.Remove(hook);
+}
+
+extern "C"
+int MallocHook_AddSbrkHook(MallocHook_SbrkHook hook) {
+  RAW_VLOG(10, "AddSbrkHook(%p)", hook);
+  return sbrk_hooks_.Add(hook);
+}
+
+extern "C"
+int MallocHook_RemoveSbrkHook(MallocHook_SbrkHook hook) {
+  RAW_VLOG(10, "RemoveSbrkHook(%p)", hook);
+  return sbrk_hooks_.Remove(hook);
+}
+
+// The code below is DEPRECATED.
+extern "C"
+MallocHook_NewHook MallocHook_SetNewHook(MallocHook_NewHook hook) {
+  RAW_VLOG(10, "SetNewHook(%p)", hook);
+  return new_hooks_.ExchangeSingular(hook);
+}
+
+extern "C"
+MallocHook_DeleteHook MallocHook_SetDeleteHook(MallocHook_DeleteHook hook) {
+  RAW_VLOG(10, "SetDeleteHook(%p)", hook);
+  return delete_hooks_.ExchangeSingular(hook);
+}
+
+extern "C"
+MallocHook_PreMmapHook MallocHook_SetPreMmapHook(MallocHook_PreMmapHook hook) {
+  RAW_VLOG(10, "SetPreMmapHook(%p)", hook);
+  return premmap_hooks_.ExchangeSingular(hook);
+}
+
+extern "C"
+MallocHook_MmapHook MallocHook_SetMmapHook(MallocHook_MmapHook hook) {
+  RAW_VLOG(10, "SetMmapHook(%p)", hook);
+  return mmap_hooks_.ExchangeSingular(hook);
+}
+
+extern "C"
+MallocHook_MunmapHook MallocHook_SetMunmapHook(MallocHook_MunmapHook hook) {
+  RAW_VLOG(10, "SetMunmapHook(%p)", hook);
+  return munmap_hooks_.ExchangeSingular(hook);
+}
+
+extern "C"
+MallocHook_MremapHook MallocHook_SetMremapHook(MallocHook_MremapHook hook) {
+  RAW_VLOG(10, "SetMremapHook(%p)", hook);
+  return mremap_hooks_.ExchangeSingular(hook);
+}
+
+extern "C"
+MallocHook_PreSbrkHook MallocHook_SetPreSbrkHook(MallocHook_PreSbrkHook hook) {
+  RAW_VLOG(10, "SetPreSbrkHook(%p)", hook);
+  return presbrk_hooks_.ExchangeSingular(hook);
+}
+
+extern "C"
+MallocHook_SbrkHook MallocHook_SetSbrkHook(MallocHook_SbrkHook hook) {
+  RAW_VLOG(10, "SetSbrkHook(%p)", hook);
+  return sbrk_hooks_.ExchangeSingular(hook);
+}
+// End of DEPRECATED code section.
+
+// Note: embedding the function calls inside the traversal of HookList would be
+// very confusing, as it is legal for a hook to remove itself and add other
+// hooks.  Doing the traversal first and then calling the hooks ensures that we
+// only call the hooks that were registered at the start.
+#define INVOKE_HOOKS(HookType, hook_list, args) do {                    \
+    HookType hooks[kHookListMaxValues];                                 \
+    int num_hooks = hook_list.Traverse(hooks, kHookListMaxValues);      \
+    for (int i = 0; i < num_hooks; ++i) {                               \
+      (*hooks[i])args;                                                  \
+    }                                                                   \
+  } while (0)
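+
+// For instance, INVOKE_HOOKS(NewHook, new_hooks_, (p, s)) inside
+// MallocHook::InvokeNewHookSlow below expands (roughly) to:
+//
+//   NewHook hooks[kHookListMaxValues];
+//   int num_hooks = new_hooks_.Traverse(hooks, kHookListMaxValues);
+//   for (int i = 0; i < num_hooks; ++i) {
+//     (*hooks[i])(p, s);
+//   }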
+
+// There should only be one replacement. Return the result of the first
+// one, or false if there is none.
+#define INVOKE_REPLACEMENT(HookType, hook_list, args) do {              \
+    HookType hooks[kHookListMaxValues];                                 \
+    int num_hooks = hook_list.Traverse(hooks, kHookListMaxValues);      \
+    return (num_hooks > 0 && (*hooks[0])args);                          \
+  } while (0)
+
+
+void MallocHook::InvokeNewHookSlow(const void* p, size_t s) {
+  INVOKE_HOOKS(NewHook, new_hooks_, (p, s));
+}
+
+void MallocHook::InvokeDeleteHookSlow(const void* p) {
+  INVOKE_HOOKS(DeleteHook, delete_hooks_, (p));
+}
+
+void MallocHook::InvokePreMmapHookSlow(const void* start,
+                                       size_t size,
+                                       int protection,
+                                       int flags,
+                                       int fd,
+                                       off_t offset) {
+  INVOKE_HOOKS(PreMmapHook, premmap_hooks_, (start, size, protection, flags, fd,
+                                            offset));
+}
+
+void MallocHook::InvokeMmapHookSlow(const void* result,
+                                    const void* start,
+                                    size_t size,
+                                    int protection,
+                                    int flags,
+                                    int fd,
+                                    off_t offset) {
+  INVOKE_HOOKS(MmapHook, mmap_hooks_, (result, start, size, protection, flags,
+                                       fd, offset));
+}
+
+bool MallocHook::InvokeMmapReplacementSlow(const void* start,
+                                           size_t size,
+                                           int protection,
+                                           int flags,
+                                           int fd,
+                                           off_t offset,
+                                           void** result) {
+  INVOKE_REPLACEMENT(MmapReplacement, mmap_replacement_,
+                      (start, size, protection, flags, fd, offset, result));
+}
+
+void MallocHook::InvokeMunmapHookSlow(const void* p, size_t s) {
+  INVOKE_HOOKS(MunmapHook, munmap_hooks_, (p, s));
+}
+
+bool MallocHook::InvokeMunmapReplacementSlow(const void* p,
+                                             size_t s,
+                                             int* result) {
+  INVOKE_REPLACEMENT(MunmapReplacement, munmap_replacement_, (p, s, result));
+}
+
+void MallocHook::InvokeMremapHookSlow(const void* result,
+                                      const void* old_addr,
+                                      size_t old_size,
+                                      size_t new_size,
+                                      int flags,
+                                      const void* new_addr) {
+  INVOKE_HOOKS(MremapHook, mremap_hooks_, (result, old_addr, old_size, new_size,
+                                           flags, new_addr));
+}
+
+void MallocHook::InvokePreSbrkHookSlow(ptrdiff_t increment) {
+  INVOKE_HOOKS(PreSbrkHook, presbrk_hooks_, (increment));
+}
+
+void MallocHook::InvokeSbrkHookSlow(const void* result, ptrdiff_t increment) {
+  INVOKE_HOOKS(SbrkHook, sbrk_hooks_, (result, increment));
+}
+
+#undef INVOKE_HOOKS
+
+DEFINE_ATTRIBUTE_SECTION_VARS(google_malloc);
+DECLARE_ATTRIBUTE_SECTION_VARS(google_malloc);
+  // actual functions are in debugallocation.cc or tcmalloc.cc
+DEFINE_ATTRIBUTE_SECTION_VARS(malloc_hook);
+DECLARE_ATTRIBUTE_SECTION_VARS(malloc_hook);
+  // actual functions are in this file, malloc_hook.cc, and low_level_alloc.cc
+
+#define ADDR_IN_ATTRIBUTE_SECTION(addr, name) \
+  (reinterpret_cast<uintptr_t>(ATTRIBUTE_SECTION_START(name)) <= \
+     reinterpret_cast<uintptr_t>(addr) && \
+   reinterpret_cast<uintptr_t>(addr) < \
+     reinterpret_cast<uintptr_t>(ATTRIBUTE_SECTION_STOP(name)))
+
+// Return true iff 'caller' is a return address within a function
+// that calls one of our hooks via MallocHook::Invoke*.
+// A helper for GetCallerStackTrace.
+static inline bool InHookCaller(const void* caller) {
+  return ADDR_IN_ATTRIBUTE_SECTION(caller, google_malloc) ||
+         ADDR_IN_ATTRIBUTE_SECTION(caller, malloc_hook);
+  // We can use one section for everything except tcmalloc_or_debug
+  // due to its special linkage mode, which prevents merging of the sections.
+}
+
+#undef ADDR_IN_ATTRIBUTE_SECTION
+
+static bool checked_sections = false;
+
+static inline void CheckInHookCaller() {
+  if (!checked_sections) {
+    INIT_ATTRIBUTE_SECTION_VARS(google_malloc);
+    if (ATTRIBUTE_SECTION_START(google_malloc) ==
+        ATTRIBUTE_SECTION_STOP(google_malloc)) {
+      RAW_LOG(ERROR, "google_malloc section is missing, "
+                     "thus InHookCaller is broken!");
+    }
+    INIT_ATTRIBUTE_SECTION_VARS(malloc_hook);
+    if (ATTRIBUTE_SECTION_START(malloc_hook) ==
+        ATTRIBUTE_SECTION_STOP(malloc_hook)) {
+      RAW_LOG(ERROR, "malloc_hook section is missing, "
+                     "thus InHookCaller is broken!");
+    }
+    checked_sections = true;
+  }
+}
+
+// We could improve the behavior/compactness of this function
+// if we passed a generic test function (with a generic arg)
+// into the implementations of GetStackTrace instead of the skip_count.
+extern "C" int MallocHook_GetCallerStackTrace(void** result, int max_depth,
+                                              int skip_count) {
+#if defined(NO_TCMALLOC_SAMPLES)
+  return 0;
+#elif !defined(HAVE_ATTRIBUTE_SECTION_START)
+  // Fall back to GetStackTrace and good old but fragile frame skip counts.
+  // Note: this path is inaccurate when a hook is not called directly by an
+  // allocation function but is daisy-chained through another hook,
+  // search for MallocHook::(Get|Set|Invoke)* to find such cases.
+  return GetStackTrace(result, max_depth, skip_count + int(DEBUG_MODE));
+  // Due to -foptimize-sibling-calls in opt mode,
+  // there is no need for an extra frame skip here.
+#else
+  CheckInHookCaller();
+  // MallocHook caller determination via InHookCaller works, use it:
+  static const int kMaxSkip = 32 + 6 + 3;
+    // Constant tuned so that in practice we do just one GetStackTrace call
+    // below and do not get many frames that we don't actually need:
+    // currently the max passed max_depth is 32,
+    // the max passed/needed skip_count is 6,
+    // and 3 accounts for some hook daisy-chaining.
+  static const int kStackSize = kMaxSkip + 1;
+  void* stack[kStackSize];
+  int depth = GetStackTrace(stack, kStackSize, 1);  // skip this function frame
+  if (depth == 0)   // silently propagate cases when GetStackTrace does not work
+    return 0;
+  for (int i = 0; i < depth; ++i) {  // stack[0] is our immediate caller
+    if (InHookCaller(stack[i])) {
+      RAW_VLOG(10, "Found hooked allocator at %d: %p <- %p",
+                   i, stack[i], stack[i+1]);
+      i += 1;  // skip hook caller frame
+      depth -= i;  // correct depth
+      if (depth > max_depth) depth = max_depth;
+      copy(stack + i, stack + i + depth, result);
+      if (depth < max_depth  &&  depth + i == kStackSize) {
+        // get frames for the missing depth
+        depth +=
+          GetStackTrace(result + depth, max_depth - depth, 1 + kStackSize);
+      }
+      return depth;
+    }
+  }
+  RAW_LOG(WARNING, "Hooked allocator frame not found, returning empty trace");
+    // If this happens, try increasing kMaxSkip;
+    // otherwise something must be wrong with InHookCaller,
+    // e.g. for every section used in InHookCaller,
+    // all functions in that section must be inside the same library.
+  return 0;
+#endif
+}
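+
+// Illustrative sketch (MyNewHook is a hypothetical client hook): a hook can
+// recover the stack of the code that called the allocator, with the allocator
+// and hook frames already stripped, by calling the function above.  The
+// skip_count argument only matters on the frame-skip fallback path; 0 is used
+// here purely for illustration:
+//
+//   void MyNewHook(const void* ptr, size_t size) {
+//     void* frames[32];
+//     int depth = MallocHook_GetCallerStackTrace(frames, 32, 0);
+//     // frames[0..depth-1] now start at the allocator's caller.
+//   }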
+
+// On systems where we know how, we override mmap/munmap/mremap/sbrk
+// to provide support for calling the related hooks (in addition,
+// of course, to doing what these functions normally do).
+
+#if defined(__linux)
+# include "malloc_hook_mmap_linux.h"
+
+#elif defined(__FreeBSD__)
+# include "malloc_hook_mmap_freebsd.h"
+
+#else
+
+/*static*/void* MallocHook::UnhookedMMap(void *start, size_t length, int prot,
+                                         int flags, int fd, off_t offset) {
+  void* result;
+  if (!MallocHook::InvokeMmapReplacement(
+          start, length, prot, flags, fd, offset, &result)) {
+    result = mmap(start, length, prot, flags, fd, offset);
+  }
+  return result;
+}
+
+/*static*/int MallocHook::UnhookedMUnmap(void *start, size_t length) {
+  int result;
+  if (!MallocHook::InvokeMunmapReplacement(start, length, &result)) {
+    result = munmap(start, length);
+  }
+  return result;
+}
+
+#endif
diff --git a/src/malloc_hook_mmap_freebsd.h b/src/malloc_hook_mmap_freebsd.h
new file mode 100644
index 0000000..8575dcc
--- /dev/null
+++ b/src/malloc_hook_mmap_freebsd.h
@@ -0,0 +1,135 @@
+// -*- Mode: C++; c-basic-offset: 2; indent-tabs-mode: nil -*-
+// Copyright (c) 2011, Google Inc.
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+//     * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+//     * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+// Override mmap/munmap/mremap/sbrk to provide support for calling the
+// related hooks (in addition, of course, to doing what these
+// functions normally do).
+
+#ifndef __FreeBSD__
+# error Should only be including malloc_hook_mmap_freebsd.h on FreeBSD systems.
+#endif
+
+#include <unistd.h>
+#include <sys/syscall.h>
+#include <sys/mman.h>
+#include <errno.h>
+#include <dlfcn.h>
+
+// Make sure mmap doesn't get #define'd away by <sys/mman.h>
+#undef mmap
+
+// According to the FreeBSD documentation, use syscall if you do not
+// need 64-bit alignment; otherwise use __syscall.  Indeed, syscall
+// doesn't work correctly in most situations on 64-bit.  Its return
+// type is 'int', so for things like SYS_mmap it actually truncates
+// the returned address to 32 bits.
+#if defined(__amd64__) || defined(__x86_64__)
+# define MALLOC_HOOK_SYSCALL __syscall
+#else
+# define MALLOC_HOOK_SYSCALL syscall
+#endif
+
+
+extern "C" {
+  void* mmap(void *start, size_t length, int prot, int flags,
+             int fd, off_t offset) __THROW
+    ATTRIBUTE_SECTION(malloc_hook);
+  int munmap(void* start, size_t length) __THROW
+    ATTRIBUTE_SECTION(malloc_hook);
+  void* sbrk(intptr_t increment) __THROW
+    ATTRIBUTE_SECTION(malloc_hook);
+}
+
+static inline void* do_mmap(void *start, size_t length,
+                            int prot, int flags,
+                            int fd, off_t offset) __THROW {
+  return (void *)MALLOC_HOOK_SYSCALL(SYS_mmap,
+                                     start, length, prot, flags, fd, offset);
+}
+
+static inline void* do_sbrk(intptr_t increment) {
+  static void *(*libc_sbrk)(intptr_t);
+  if (libc_sbrk == NULL)
+    libc_sbrk = (void *(*)(intptr_t))dlsym(RTLD_NEXT, "sbrk");
+
+  return libc_sbrk(increment);
+}
+
+
+extern "C" void* mmap(void *start, size_t length, int prot, int flags,
+                      int fd, off_t offset) __THROW {
+  MallocHook::InvokePreMmapHook(start, length, prot, flags, fd, offset);
+  void *result;
+  if (!MallocHook::InvokeMmapReplacement(
+          start, length, prot, flags, fd, offset, &result)) {
+    result = do_mmap(start, length, prot, flags, fd,
+                       static_cast<size_t>(offset)); // avoid sign extension
+  }
+  MallocHook::InvokeMmapHook(result, start, length, prot, flags, fd, offset);
+  return result;
+}
+
+extern "C" int munmap(void* start, size_t length) __THROW {
+  MallocHook::InvokeMunmapHook(start, length);
+  int result;
+  if (!MallocHook::InvokeMunmapReplacement(start, length, &result)) {
+    result = MALLOC_HOOK_SYSCALL(SYS_munmap, start, length);
+  }
+
+  return result;
+}
+
+extern "C" void* sbrk(intptr_t increment) __THROW {
+  MallocHook::InvokePreSbrkHook(increment);
+  void *result = do_sbrk(increment);
+  MallocHook::InvokeSbrkHook(result, increment);
+  return result;
+}
+
+/*static*/void* MallocHook::UnhookedMMap(void *start, size_t length, int prot,
+                                         int flags, int fd, off_t offset) {
+  void* result;
+  if (!MallocHook::InvokeMmapReplacement(
+          start, length, prot, flags, fd, offset, &result)) {
+    result = do_mmap(start, length, prot, flags, fd, offset);
+  }
+
+  return result;
+}
+
+/*static*/int MallocHook::UnhookedMUnmap(void *start, size_t length) {
+  int result;
+  if (!MallocHook::InvokeMunmapReplacement(start, length, &result)) {
+    result = MALLOC_HOOK_SYSCALL(SYS_munmap, start, length);
+  }
+  return result;
+}
+
+#undef MALLOC_HOOK_SYSCALL
diff --git a/src/malloc_hook_mmap_linux.h b/src/malloc_hook_mmap_linux.h
new file mode 100755
index 0000000..0f531db
--- /dev/null
+++ b/src/malloc_hook_mmap_linux.h
@@ -0,0 +1,238 @@
+// -*- Mode: C++; c-basic-offset: 2; indent-tabs-mode: nil -*-
+// Copyright (c) 2005, Google Inc.
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+//     * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+//     * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+// ---
+// Author: Sanjay Ghemawat <opensource@google.com>
+
+// We define mmap() and mmap64(), which somewhat reimplement libc's mmap
+// syscall stubs.  Unfortunately libc only exports the stubs via weak symbols
+// (which we're overriding with our mmap64() and mmap() wrappers), so we can't
+// just call through to them.
+
+#ifndef __linux
+# error Should only be including malloc_hook_mmap_linux.h on linux systems.
+#endif
+
+#include <unistd.h>
+#include <syscall.h>
+#include <sys/mman.h>
+#include <errno.h>
+#include "base/linux_syscall_support.h"
+
+// The x86-32 case and the x86-64 case differ:
+// 32b has a mmap2() syscall, 64b does not.
+// 64b and 32b have different calling conventions for mmap().
+
+// I test for 64-bit first so I don't have to do things like
+// '#if (defined(__mips__) && !defined(__MIPS64__))' as a mips32 check.
+#if defined(__x86_64__) || defined(__PPC64__) || defined(__aarch64__) || (defined(_MIPS_SIM) && _MIPS_SIM == _ABI64)
+
+static inline void* do_mmap64(void *start, size_t length,
+                              int prot, int flags,
+                              int fd, __off64_t offset) __THROW {
+  return sys_mmap(start, length, prot, flags, fd, offset);
+}
+
+#define MALLOC_HOOK_HAVE_DO_MMAP64 1
+
+#elif defined(__i386__) || defined(__PPC__) || defined(__mips__) || \
+      defined(__arm__)
+
+static inline void* do_mmap64(void *start, size_t length,
+                              int prot, int flags,
+                              int fd, __off64_t offset) __THROW {
+  void *result;
+
+  // Try mmap2() unless we already know it is not supported
+  static bool have_mmap2 = true;
+  if (have_mmap2) {
+    static int pagesize = 0;
+    if (!pagesize) pagesize = getpagesize();
+
+    // Check that the offset is page aligned
+    if (offset & (pagesize - 1)) {
+      result = MAP_FAILED;
+      errno = EINVAL;
+      goto out;
+    }
+
+    result = (void *)syscall(SYS_mmap2,
+                             start, length, prot, flags, fd,
+                             (off_t) (offset / pagesize));
+    if (result != MAP_FAILED || errno != ENOSYS)  goto out;
+
+    // We don't have mmap2() after all - don't bother trying it in future
+    have_mmap2 = false;
+  }
+
+  if (((off_t)offset) != offset) {
+    // If we're trying to map a 64-bit offset, fail now since we don't
+    // have 64-bit mmap() support.
+    result = MAP_FAILED;
+    errno = EINVAL;
+    goto out;
+  }
+
+#ifdef __NR_mmap
+  {
+    // Fall back to old 32-bit offset mmap() call
+    // Old syscall interface cannot handle six args, so pass in an array
+    int32 args[6] = { (int32) start, (int32) length, prot, flags, fd,
+                      (int32)(off_t) offset };
+    result = (void *)syscall(SYS_mmap, args);
+  }
+#else
+  // Some Linux ports, like ARM EABI Linux, have no mmap, just mmap2.
+  result = MAP_FAILED;
+#endif
+
+ out:
+  return result;
+}
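+
+// Worked example for the mmap2() path above (illustrative): with a 4096-byte
+// page size, a 6 GiB file offset (0x180000000) does not fit in a 32-bit off_t,
+// but mmap2 takes the offset in pages, so we pass 0x180000000 / 4096 =
+// 0x180000, which does fit in 32 bits.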
+
+#define MALLOC_HOOK_HAVE_DO_MMAP64 1
+
+#endif  // #if defined(__x86_64__)
+
+
+#ifdef MALLOC_HOOK_HAVE_DO_MMAP64
+
+// We use the do_mmap64 abstraction to put the MallocHook::InvokeMmapHook
+// calls right into mmap and mmap64, so that the stack frames in the caller's
+// stack are at the same offsets for all the calls of memory-allocating
+// functions.
+
+// Put all callers of MallocHook::Invoke* in this module into the
+// malloc_hook section,
+// so that MallocHook::GetCallerStackTrace can function accurately:
+
+// Make sure mmap doesn't get #define'd away by <sys/mman.h>
+# undef mmap
+
+extern "C" {
+  void* mmap64(void *start, size_t length, int prot, int flags,
+               int fd, __off64_t offset) __THROW
+    ATTRIBUTE_SECTION(malloc_hook);
+  void* mmap(void *start, size_t length, int prot, int flags,
+             int fd, off_t offset) __THROW
+    ATTRIBUTE_SECTION(malloc_hook);
+  int munmap(void* start, size_t length) __THROW
+    ATTRIBUTE_SECTION(malloc_hook);
+  void* mremap(void* old_addr, size_t old_size, size_t new_size,
+               int flags, ...) __THROW
+    ATTRIBUTE_SECTION(malloc_hook);
+  void* sbrk(ptrdiff_t increment) __THROW
+    ATTRIBUTE_SECTION(malloc_hook);
+}
+
+extern "C" void* mmap64(void *start, size_t length, int prot, int flags,
+                        int fd, __off64_t offset) __THROW {
+  MallocHook::InvokePreMmapHook(start, length, prot, flags, fd, offset);
+  void *result;
+  if (!MallocHook::InvokeMmapReplacement(
+          start, length, prot, flags, fd, offset, &result)) {
+    result = do_mmap64(start, length, prot, flags, fd, offset);
+  }
+  MallocHook::InvokeMmapHook(result, start, length, prot, flags, fd, offset);
+  return result;
+}
+
+# if !defined(__USE_FILE_OFFSET64) || !defined(__REDIRECT_NTH)
+
+extern "C" void* mmap(void *start, size_t length, int prot, int flags,
+                      int fd, off_t offset) __THROW {
+  MallocHook::InvokePreMmapHook(start, length, prot, flags, fd, offset);
+  void *result;
+  if (!MallocHook::InvokeMmapReplacement(
+          start, length, prot, flags, fd, offset, &result)) {
+    result = do_mmap64(start, length, prot, flags, fd,
+                       static_cast<size_t>(offset)); // avoid sign extension
+  }
+  MallocHook::InvokeMmapHook(result, start, length, prot, flags, fd, offset);
+  return result;
+}
+
+# endif  // !defined(__USE_FILE_OFFSET64) || !defined(__REDIRECT_NTH)
+
+extern "C" int munmap(void* start, size_t length) __THROW {
+  MallocHook::InvokeMunmapHook(start, length);
+  int result;
+  if (!MallocHook::InvokeMunmapReplacement(start, length, &result)) {
+    result = sys_munmap(start, length);
+  }
+  return result;
+}
+
+extern "C" void* mremap(void* old_addr, size_t old_size, size_t new_size,
+                        int flags, ...) __THROW {
+  va_list ap;
+  va_start(ap, flags);
+  void *new_address = va_arg(ap, void *);
+  va_end(ap);
+  void* result = sys_mremap(old_addr, old_size, new_size, flags, new_address);
+  MallocHook::InvokeMremapHook(result, old_addr, old_size, new_size, flags,
+                               new_address);
+  return result;
+}
+
+#ifndef __UCLIBC__
+// libc's version:
+extern "C" void* __sbrk(ptrdiff_t increment);
+
+extern "C" void* sbrk(ptrdiff_t increment) __THROW {
+  MallocHook::InvokePreSbrkHook(increment);
+  void *result = __sbrk(increment);
+  MallocHook::InvokeSbrkHook(result, increment);
+  return result;
+}
+
+#endif
+
+/*static*/void* MallocHook::UnhookedMMap(void *start, size_t length, int prot,
+                                         int flags, int fd, off_t offset) {
+  void* result;
+  if (!MallocHook::InvokeMmapReplacement(
+          start, length, prot, flags, fd, offset, &result)) {
+    result = do_mmap64(start, length, prot, flags, fd, offset);
+  }
+  return result;
+}
+
+/*static*/int MallocHook::UnhookedMUnmap(void *start, size_t length) {
+  int result;
+  if (!MallocHook::InvokeMunmapReplacement(start, length, &result)) {
+    result = syscall(SYS_munmap, start, length);
+  }
+  return result;
+}
+
+#undef MALLOC_HOOK_HAVE_DO_MMAP64
+
+#endif  // #ifdef MALLOC_HOOK_HAVE_DO_MMAP64
diff --git a/src/maybe_threads.cc b/src/maybe_threads.cc
new file mode 100644
index 0000000..6dd0d8d
--- /dev/null
+++ b/src/maybe_threads.cc
@@ -0,0 +1,157 @@
+// -*- Mode: C++; c-basic-offset: 2; indent-tabs-mode: nil -*-
+// Copyright (c) 2005, Google Inc.
+// All rights reserved.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+// 
+//     * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+//     * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+// 
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+// ---
+// Author: Paul Menage <opensource@google.com>
+//
+// Some wrappers for pthread functions so that we can be LD_PRELOADed
+// against non-pthreads apps.
+//
+// This module will behave very strangely if some pthreads functions
+// exist and others don't.
+
+#include "config.h"
+#include <assert.h>
+#include <string.h>    // for memcmp
+#include <stdio.h>     // for __isthreaded on FreeBSD
+// We don't actually need strings.  But including this header seems to
+// stop the compiler from trying to short-circuit our pthreads existence
+// tests and claiming that the address of a function is always
+// non-zero.  I have no idea why ...
+#include <string>
+#include "maybe_threads.h"
+#include "base/basictypes.h"
+
+// __THROW is defined in glibc systems.  It means, counter-intuitively,
+// "This function will never throw an exception."  It's an optional
+// optimization tool, but we may need to use it to match glibc prototypes.
+#ifndef __THROW    // I guess we're not on a glibc system
+# define __THROW   // __THROW is just an optimization, so ok to make it ""
+#endif
+
+// These are the methods we're going to conditionally include.
+extern "C" {
+  int pthread_key_create (pthread_key_t*, void (*)(void*))
+      __THROW ATTRIBUTE_WEAK;
+  int pthread_key_delete (pthread_key_t)
+      __THROW ATTRIBUTE_WEAK;
+  void *pthread_getspecific(pthread_key_t)
+      __THROW ATTRIBUTE_WEAK;
+  int pthread_setspecific(pthread_key_t, const void*)
+      __THROW ATTRIBUTE_WEAK;
+  int pthread_once(pthread_once_t *, void (*)(void))
+      ATTRIBUTE_WEAK;
+}
+
+#define MAX_PERTHREAD_VALS 16
+static void *perftools_pthread_specific_vals[MAX_PERTHREAD_VALS];
+static int next_key;
+
+// NOTE: this is similar to bit_cast defined in base/basictypes.h, with the
+// exception that it ignores size mismatches.
+template <typename T1, typename T2>
+static T2 memcpy_cast(const T1 &input) {
+  T2 output;
+  size_t s = sizeof(input);
+  if (sizeof(output) < s) {
+    s = sizeof(output);
+  }
+  memcpy(&output, &input, s);
+  return output;
+}
+
+int perftools_pthread_key_create(pthread_key_t *key,
+                                 void (*destr_function) (void *)) {
+  if (pthread_key_create) {
+    return pthread_key_create(key, destr_function);
+  } else {
+    assert(next_key < MAX_PERTHREAD_VALS);
+    *key = memcpy_cast<int, pthread_key_t>(next_key++);
+    return 0;
+  }
+}
+
+int perftools_pthread_key_delete(pthread_key_t key) {
+  if (pthread_key_delete) {
+    return pthread_key_delete(key);
+  } else {
+    return 0;
+  }
+}
+
+void *perftools_pthread_getspecific(pthread_key_t key) {
+  if (pthread_getspecific) {
+    return pthread_getspecific(key);
+  } else {
+    return perftools_pthread_specific_vals[memcpy_cast<pthread_key_t, int>(key)];
+  }
+}
+
+int perftools_pthread_setspecific(pthread_key_t key, void *val) {
+  if (pthread_setspecific) {
+    return pthread_setspecific(key, val);
+  } else {
+    perftools_pthread_specific_vals[memcpy_cast<pthread_key_t, int>(key)] = val;
+    return 0;
+  }
+}
+
+
+static pthread_once_t pthread_once_init = PTHREAD_ONCE_INIT;
+int perftools_pthread_once(pthread_once_t *ctl,
+                           void  (*init_routine) (void)) {
+#ifdef __FreeBSD__
+  // On __FreeBSD__, calling pthread_once on a system that is not
+  // linked with -pthread is silently a noop. :-( Luckily, we have a
+  // workaround: FreeBSD exposes __isthreaded in <stdio.h>, which is
+  // set to 1 when the first thread is spawned.  So on those systems,
+  // we can use our own separate pthreads-once mechanism, which is
+  // used until __isthreaded is 1 (which will never be true if the app
+  // is not linked with -pthread).
+  static bool pthread_once_ran_before_threads = false;
+  if (pthread_once_ran_before_threads) {
+    return 0;
+  }
+  if (!__isthreaded) {
+    init_routine();
+    pthread_once_ran_before_threads = true;
+    return 0;
+  }
+#endif
+  if (pthread_once) {
+    return pthread_once(ctl, init_routine);
+  } else {
+    if (memcmp(ctl, &pthread_once_init, sizeof(*ctl)) == 0) {
+      init_routine();
+      ++*(char*)(ctl);        // make it so it's no longer equal to init
+    }
+    return 0;
+  }
+}
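+
+// Example of intended use (as in malloc_hook.cc in this change): callers treat
+// this exactly like pthread_once, and it degrades to a plain run-once check
+// when the app is not linked against pthreads:
+//
+//   pthread_once_t once = PTHREAD_ONCE_INIT;
+//   perftools_pthread_once(&once, &RemoveInitialHooksAndCallInitializers);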
diff --git a/src/maybe_threads.h b/src/maybe_threads.h
new file mode 100644
index 0000000..b60f4ef
--- /dev/null
+++ b/src/maybe_threads.h
@@ -0,0 +1,54 @@
+// -*- Mode: C++; c-basic-offset: 2; indent-tabs-mode: nil -*-
+// Copyright (c) 2005, Google Inc.
+// All rights reserved.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+// 
+//     * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+//     * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+// 
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+// ---
+// Author: Paul Menage <opensource@google.com>
+
+//-------------------------------------------------------------------
+// Some wrappers for pthread functions so that we can be LD_PRELOADed
+// against non-pthreads apps.
+//-------------------------------------------------------------------
+
+#ifndef GOOGLE_MAYBE_THREADS_H_
+#define GOOGLE_MAYBE_THREADS_H_
+
+#ifdef HAVE_PTHREAD
+#include <pthread.h>
+#endif
+
+int perftools_pthread_key_create(pthread_key_t *key,
+                                 void (*destr_function) (void *));
+int perftools_pthread_key_delete(pthread_key_t key);
+void *perftools_pthread_getspecific(pthread_key_t key);
+int perftools_pthread_setspecific(pthread_key_t key, void *val);
+int perftools_pthread_once(pthread_once_t *ctl,
+                           void  (*init_routine) (void));
+
+#endif  /* GOOGLE_MAYBE_THREADS_H_ */
diff --git a/src/memfs_malloc.cc b/src/memfs_malloc.cc
new file mode 100644
index 0000000..ce20891
--- /dev/null
+++ b/src/memfs_malloc.cc
@@ -0,0 +1,268 @@
+// -*- Mode: C++; c-basic-offset: 2; indent-tabs-mode: nil -*-
+// Copyright (c) 2007, Google Inc.
+// All rights reserved.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+// 
+//     * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+//     * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+// 
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+// ---
+// Author: Arun Sharma
+//
+// A tcmalloc system allocator that uses a memory-based filesystem such as
+// tmpfs or hugetlbfs.
+//
+// Since these only exist on Linux, we only register this allocator there.
+
+#ifdef __linux
+
+#include <config.h>
+#include <errno.h>                      // for errno, EINVAL
+#include <inttypes.h>                   // for PRId64
+#include <limits.h>                     // for PATH_MAX
+#include <stddef.h>                     // for size_t, NULL
+#ifdef HAVE_STDINT_H
+#include <stdint.h>                     // for int64_t, uintptr_t
+#endif
+#include <stdio.h>                      // for snprintf
+#include <stdlib.h>                     // for mkstemp
+#include <string.h>                     // for strerror
+#include <sys/mman.h>                   // for mmap, MAP_FAILED, etc
+#include <sys/statfs.h>                 // for fstatfs, statfs
+#include <unistd.h>                     // for ftruncate, off_t, unlink
+#include <new>                          // for operator new
+#include <string>
+
+#include <gperftools/malloc_extension.h>
+#include "base/basictypes.h"
+#include "base/googleinit.h"
+#include "base/sysinfo.h"
+#include "internal_logging.h"
+
+// TODO(sanjay): Move the code below into the tcmalloc namespace
+using tcmalloc::kLog;
+using tcmalloc::kCrash;
+using tcmalloc::Log;
+using std::string;
+
+DEFINE_string(memfs_malloc_path, EnvToString("TCMALLOC_MEMFS_MALLOC_PATH", ""),
+              "Path where hugetlbfs or tmpfs is mounted. The caller is "
+              "responsible for ensuring that the path is unique and does "
+              "not conflict with another process");
+DEFINE_int64(memfs_malloc_limit_mb,
+             EnvToInt("TCMALLOC_MEMFS_LIMIT_MB", 0),
+             "Limit total allocation size to the "
+             "specified number of MiB.  0 == no limit.");
+DEFINE_bool(memfs_malloc_abort_on_fail,
+            EnvToBool("TCMALLOC_MEMFS_ABORT_ON_FAIL", false),
+            "abort() whenever memfs_malloc fails to satisfy an allocation "
+            "for any reason.");
+DEFINE_bool(memfs_malloc_ignore_mmap_fail,
+            EnvToBool("TCMALLOC_MEMFS_IGNORE_MMAP_FAIL", false),
+            "Ignore failures from mmap");
+DEFINE_bool(memfs_malloc_map_private,
+            EnvToBool("TCMALLOC_MEMFS_MAP_PRIVATE", false),
+            "Use MAP_PRIVATE with mmap");
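+
+// The flags above are normally driven through their TCMALLOC_* environment
+// variables (see the EnvTo* defaults).  As a sketch, pointing tcmalloc's
+// system allocations at a hugetlbfs mount could look like this (mount point
+// and binary name are illustrative placeholders):
+//
+//   TCMALLOC_MEMFS_MALLOC_PATH=/mnt/hugetlbfs/tcmalloc \
+//   TCMALLOC_MEMFS_LIMIT_MB=4096 \
+//   LD_PRELOAD=/usr/lib/libtcmalloc.so ./my_binary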
+
+// Hugetlbfs based allocator for tcmalloc
+class HugetlbSysAllocator: public SysAllocator {
+public:
+  explicit HugetlbSysAllocator(SysAllocator* fallback)
+    : failed_(true),  // To disable allocator until Initialize() is called.
+      big_page_size_(0),
+      hugetlb_fd_(-1),
+      hugetlb_base_(0),
+      fallback_(fallback) {
+  }
+
+  void* Alloc(size_t size, size_t *actual_size, size_t alignment);
+  bool Initialize();
+
+  bool failed_;          // Whether failed to allocate memory.
+
+private:
+  void* AllocInternal(size_t size, size_t *actual_size, size_t alignment);
+
+  int64 big_page_size_;
+  int hugetlb_fd_;       // file descriptor for hugetlb
+  off_t hugetlb_base_;
+
+  SysAllocator* fallback_;  // Default system allocator to fall back to.
+};
+static char hugetlb_space[sizeof(HugetlbSysAllocator)];
+
+// No locking needed here since we assume that tcmalloc calls
+// us with an internal lock held (see tcmalloc/system-alloc.cc).
+void* HugetlbSysAllocator::Alloc(size_t size, size_t *actual_size,
+                                 size_t alignment) {
+  if (failed_) {
+    return fallback_->Alloc(size, actual_size, alignment);
+  }
+
+  // We don't respond to allocation requests smaller than big_page_size_ unless
+  // the caller is willing to accept more than it asked for (i.e. actual_size
+  // is non-NULL). Used by MetaDataAlloc.
+  if (actual_size == NULL && size < big_page_size_) {
+    return fallback_->Alloc(size, actual_size, alignment);
+  }
+
+  // Enforce huge page alignment.  Be careful to deal with overflow.
+  size_t new_alignment = alignment;
+  if (new_alignment < big_page_size_) new_alignment = big_page_size_;
+  size_t aligned_size = ((size + new_alignment - 1) /
+                         new_alignment) * new_alignment;
+  if (aligned_size < size) {
+    return fallback_->Alloc(size, actual_size, alignment);
+  }
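+  // For example (with 2 MiB huge pages): size = 3 MiB and alignment = 4 KiB
+  // give new_alignment = 2 MiB and aligned_size = 4 MiB; the overflow check
+  // above already fell back to the default allocator if the round-up wrapped.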
+
+  void* result = AllocInternal(aligned_size, actual_size, new_alignment);
+  if (result != NULL) {
+    return result;
+  }
+  Log(kLog, __FILE__, __LINE__,
+      "HugetlbSysAllocator: (failed, allocated)", failed_, hugetlb_base_);
+  if (FLAGS_memfs_malloc_abort_on_fail) {
+    Log(kCrash, __FILE__, __LINE__,
+        "memfs_malloc_abort_on_fail is set");
+  }
+  return fallback_->Alloc(size, actual_size, alignment);
+}
+
+void* HugetlbSysAllocator::AllocInternal(size_t size, size_t* actual_size,
+                                         size_t alignment) {
+  // Ask for extra memory if alignment > pagesize
+  size_t extra = 0;
+  if (alignment > big_page_size_) {
+    extra = alignment - big_page_size_;
+  }
+
+  // Test if this allocation would put us over the limit.
+  off_t limit = FLAGS_memfs_malloc_limit_mb*1024*1024;
+  if (limit > 0 && hugetlb_base_ + size + extra > limit) {
+    // Disable the allocator when there's less than one page left.
+    if (limit - hugetlb_base_ < big_page_size_) {
+      Log(kLog, __FILE__, __LINE__, "reached memfs_malloc_limit_mb");
+      failed_ = true;
+    }
+    else {
+      Log(kLog, __FILE__, __LINE__,
+          "alloc too large (size, bytes left)", size, limit-hugetlb_base_);
+    }
+    return NULL;
+  }
+
+  // This is not needed for hugetlbfs, but needed for tmpfs.  Annoyingly
+  // hugetlbfs returns EINVAL for ftruncate.
+  int ret = ftruncate(hugetlb_fd_, hugetlb_base_ + size + extra);
+  if (ret != 0 && errno != EINVAL) {
+    Log(kLog, __FILE__, __LINE__,
+        "ftruncate failed", strerror(errno));
+    failed_ = true;
+    return NULL;
+  }
+
+  // Note: size + extra does not overflow since:
+  //            size + alignment < (1<<NBITS).
+  // and        extra <= alignment
+  // therefore  size + extra < (1<<NBITS)
+  void *result;
+  result = mmap(0, size + extra, PROT_WRITE|PROT_READ,
+                FLAGS_memfs_malloc_map_private ? MAP_PRIVATE : MAP_SHARED,
+                hugetlb_fd_, hugetlb_base_);
+  if (result == reinterpret_cast<void*>(MAP_FAILED)) {
+    if (!FLAGS_memfs_malloc_ignore_mmap_fail) {
+      Log(kLog, __FILE__, __LINE__,
+          "mmap failed (size, error)", size + extra, strerror(errno));
+      failed_ = true;
+    }
+    return NULL;
+  }
+  uintptr_t ptr = reinterpret_cast<uintptr_t>(result);
+
+  // Adjust the return memory so it is aligned
+  size_t adjust = 0;
+  if ((ptr & (alignment - 1)) != 0) {
+    adjust = alignment - (ptr & (alignment - 1));
+  }
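+  // For example, with alignment = 0x1000 and ptr = 0x12342800,
+  // ptr & (alignment - 1) == 0x800, so adjust = 0x800 and the caller will
+  // get back 0x12343000; the 'extra' bytes requested above cover this slack.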
+  ptr += adjust;
+  hugetlb_base_ += (size + extra);
+
+  if (actual_size) {
+    *actual_size = size + extra - adjust;
+  }
+
+  return reinterpret_cast<void*>(ptr);
+}
+
+bool HugetlbSysAllocator::Initialize() {
+  char path[PATH_MAX];
+  const int pathlen = FLAGS_memfs_malloc_path.size();
+  if (pathlen + 8 > sizeof(path)) {
+    Log(kCrash, __FILE__, __LINE__, "XX fatal: memfs_malloc_path too long");
+    return false;
+  }
+  memcpy(path, FLAGS_memfs_malloc_path.data(), pathlen);
+  memcpy(path + pathlen, ".XXXXXX", 8);  // Also copies terminating \0
+
+  int hugetlb_fd = mkstemp(path);
+  if (hugetlb_fd == -1) {
+    Log(kLog, __FILE__, __LINE__,
+        "warning: unable to create memfs_malloc_path",
+        path, strerror(errno));
+    return false;
+  }
+
+  // Cleanup memory on process exit
+  if (unlink(path) == -1) {
+    Log(kCrash, __FILE__, __LINE__,
+        "fatal: error unlinking memfs_malloc_path", path, strerror(errno));
+    return false;
+  }
+
+  // Use fstatfs to figure out the default page size for memfs
+  struct statfs sfs;
+  if (fstatfs(hugetlb_fd, &sfs) == -1) {
+    Log(kCrash, __FILE__, __LINE__,
+        "fatal: error fstatfs of memfs_malloc_path", strerror(errno));
+    return false;
+  }
+  int64 page_size = sfs.f_bsize;
+
+  hugetlb_fd_ = hugetlb_fd;
+  big_page_size_ = page_size;
+  failed_ = false;
+  return true;
+}
+
+REGISTER_MODULE_INITIALIZER(memfs_malloc, {
+  if (FLAGS_memfs_malloc_path.length()) {
+    SysAllocator* alloc = MallocExtension::instance()->GetSystemAllocator();
+    HugetlbSysAllocator* hp = new (hugetlb_space) HugetlbSysAllocator(alloc);
+    if (hp->Initialize()) {
+      MallocExtension::instance()->SetSystemAllocator(hp);
+    }
+  }
+});
+
+#endif   /* ifdef __linux */
diff --git a/src/memory_region_map.cc b/src/memory_region_map.cc
new file mode 100755
index 0000000..e885859
--- /dev/null
+++ b/src/memory_region_map.cc
@@ -0,0 +1,829 @@
+// -*- Mode: C++; c-basic-offset: 2; indent-tabs-mode: nil -*-
+/* Copyright (c) 2006, Google Inc.
+ * All rights reserved.
+ * 
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are
+ * met:
+ * 
+ *     * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following disclaimer
+ * in the documentation and/or other materials provided with the
+ * distribution.
+ *     * Neither the name of Google Inc. nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ * 
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * ---
+ * Author: Maxim Lifantsev
+ */
+
+//
+// Background and key design points of MemoryRegionMap.
+//
+// MemoryRegionMap is a low-level module with quite atypical requirements that
+// result in some degree of non-triviality of the implementation and design.
+//
+// MemoryRegionMap collects info about *all* memory regions created with
+// mmap, munmap, mremap, sbrk.
+// The key word above is 'all': every region created during the process's
+// lifetime, frequently starting even before global object
+// constructor execution.
+//
+// This is needed by the primary client of MemoryRegionMap:
+// HeapLeakChecker uses the regions and the associated stack traces
+// to figure out what part of the memory is the heap:
+// if MemoryRegionMap were to miss some (early) regions, leak checking would
+// stop working correctly.
+//
+// To accomplish the goal of functioning before/during global object
+// constructor execution, MemoryRegionMap is implemented as a singleton service
+// that relies on its own on-demand-initialized, constructor-less static data,
+// and only relies on other low-level modules that can also function properly
+// even before global object constructors run.
+//
+// Accomplishing the goal of collecting data about all mmap, munmap, mremap,
+// sbrk occurrences is more involved: conceptually, one needs to
+// record some bits of data about every mmap or sbrk call,
+// but to do that one needs to allocate memory for that data at some point,
+// and all memory allocations ultimately come from an mmap
+// or sbrk call themselves (that's how the address space of the process grows).
+//
+// Also note that we need to do all the above recording from
+// within an mmap/sbrk hook, which is sometimes/frequently invoked by a memory
+// allocator, including the allocator MemoryRegionMap itself must rely on.
+// In the case of heap-checker usage this includes even the very first
+// mmap/sbrk call happening in the program: heap-checker gets activated by
+// a link-time installed mmap/sbrk hook, and it initializes MemoryRegionMap
+// and asks it to record info about this very first call right from that
+// very first hook invocation.
+//
+// MemoryRegionMap does its memory allocations via LowLevelAlloc:
+// unlike a more complex standard memory allocator, LowLevelAlloc cooperates
+// with MemoryRegionMap by not holding any of its own locks while it calls mmap
+// to get memory; thus we are able to call LowLevelAlloc from
+// our mmap/sbrk hooks without causing a deadlock in it.
+// For the same reason of deadlock prevention, the locking in MemoryRegionMap
+// itself is write-recursive, which is an exception to Google's mutex usage rules.
+//
+// We still need to break the infinite cycle of mmap calling our hook,
+// which asks LowLevelAlloc for memory to record this mmap,
+// which (sometimes) causes mmap, which calls our hook, and so on.
+// We do this as follows: on a recursive call of MemoryRegionMap's
+// mmap/sbrk/mremap hook we record the data about the allocation in a
+// static fixed-sized stack (saved_regions and saved_buckets), when the
+// recursion unwinds but before returning from the outer hook call we unwind
+// this stack and move the data from saved_regions and saved_buckets to its
+// permanent place in the RegionSet and "bucket_table" respectively,
+// which can cause more allocations and mmap-s and recursion and unwinding,
+// but the whole process ends eventually due to the fact that for the small
+// allocations we are doing LowLevelAlloc reuses one mmap call and parcels out
+// the memory it created to satisfy several of our allocation requests.
+//
+
+// ========================================================================= //
+
+#include <config.h>
+
+#ifdef HAVE_UNISTD_H
+#include <unistd.h>
+#endif
+#ifdef HAVE_INTTYPES_H
+#include <inttypes.h>
+#endif
+#ifdef HAVE_MMAP
+#include <sys/mman.h>
+#elif !defined(MAP_FAILED)
+#define MAP_FAILED -1  // the only thing we need from mman.h
+#endif
+#ifdef HAVE_PTHREAD
+#include <pthread.h>   // for pthread_t, pthread_self()
+#endif
+#include <stddef.h>
+
+#include <algorithm>
+#include <set>
+
+#include "memory_region_map.h"
+
+#include "base/logging.h"
+#include "base/low_level_alloc.h"
+#include "malloc_hook-inl.h"
+
+#include <gperftools/stacktrace.h>
+#include <gperftools/malloc_hook.h>
+
+// MREMAP_FIXED is a Linux extension.  The way it's used in this file,
+// setting it to 0 is equivalent to saying "this feature isn't
+// supported", which is right.
+#ifndef MREMAP_FIXED
+# define MREMAP_FIXED  0
+#endif
+
+using std::max;
+
+// ========================================================================= //
+
+int MemoryRegionMap::client_count_ = 0;
+int MemoryRegionMap::max_stack_depth_ = 0;
+MemoryRegionMap::RegionSet* MemoryRegionMap::regions_ = NULL;
+LowLevelAlloc::Arena* MemoryRegionMap::arena_ = NULL;
+SpinLock MemoryRegionMap::lock_(SpinLock::LINKER_INITIALIZED);
+SpinLock MemoryRegionMap::owner_lock_(  // ACQUIRED_AFTER(lock_)
+    SpinLock::LINKER_INITIALIZED);
+int MemoryRegionMap::recursion_count_ = 0;  // GUARDED_BY(owner_lock_)
+pthread_t MemoryRegionMap::lock_owner_tid_;  // GUARDED_BY(owner_lock_)
+int64 MemoryRegionMap::map_size_ = 0;
+int64 MemoryRegionMap::unmap_size_ = 0;
+HeapProfileBucket** MemoryRegionMap::bucket_table_ = NULL;  // GUARDED_BY(lock_)
+int MemoryRegionMap::num_buckets_ = 0;  // GUARDED_BY(lock_)
+int MemoryRegionMap::saved_buckets_count_ = 0;  // GUARDED_BY(lock_)
+HeapProfileBucket MemoryRegionMap::saved_buckets_[20];  // GUARDED_BY(lock_)
+
+// GUARDED_BY(lock_)
+const void* MemoryRegionMap::saved_buckets_keys_[20][kMaxStackDepth];
+
+// ========================================================================= //
+
+// Simple hook into execution of global object constructors,
+// so that we do not call pthread_self() when it does not yet work.
+static bool libpthread_initialized = false;
+static bool initializer = (libpthread_initialized = true, true);
+
+static inline bool current_thread_is(pthread_t should_be) {
+  // Before main() runs, there's only one thread, so we're always that thread
+  if (!libpthread_initialized) return true;
+  // this starts working only sometime well into global constructor execution:
+  return pthread_equal(pthread_self(), should_be);
+}
+
+// ========================================================================= //
+
+// Constructor-less place-holder to store a RegionSet in.
+union MemoryRegionMap::RegionSetRep {
+  char rep[sizeof(RegionSet)];
+  void* align_it;  // do not need a better alignment for 'rep' than this
+  RegionSet* region_set() { return reinterpret_cast<RegionSet*>(rep); }
+};
+
+// The bytes where MemoryRegionMap::regions_ will point to.
+// We use RegionSetRep with noop c-tor so that global construction
+// does not interfere.
+static MemoryRegionMap::RegionSetRep regions_rep;
+
+// ========================================================================= //
+
+// Has InsertRegionLocked been called recursively
+// (or rather should we *not* use regions_ to record a hooked mmap).
+static bool recursive_insert = false;
+
+void MemoryRegionMap::Init(int max_stack_depth, bool use_buckets) {
+  RAW_VLOG(10, "MemoryRegionMap Init");
+  RAW_CHECK(max_stack_depth >= 0, "");
+  // Make sure we don't overflow the memory in region stacks:
+  RAW_CHECK(max_stack_depth <= kMaxStackDepth,
+            "need to increase kMaxStackDepth?");
+  Lock();
+  client_count_ += 1;
+  max_stack_depth_ = max(max_stack_depth_, max_stack_depth);
+  if (client_count_ > 1) {
+    // not first client: already did initialization-proper
+    Unlock();
+    RAW_VLOG(10, "MemoryRegionMap Init increment done");
+    return;
+  }
+  // Set our hooks and make sure they were installed:
+  RAW_CHECK(MallocHook::AddMmapHook(&MmapHook), "");
+  RAW_CHECK(MallocHook::AddMremapHook(&MremapHook), "");
+  RAW_CHECK(MallocHook::AddSbrkHook(&SbrkHook), "");
+  RAW_CHECK(MallocHook::AddMunmapHook(&MunmapHook), "");
+  // We need to set recursive_insert since the NewArena call itself
+  // will already do some allocations with mmap which our hooks will catch;
+  // recursive_insert allows us to buffer info about these mmap calls.
+  // Note that Init() can be (and is) sometimes called
+  // already from within an mmap/sbrk hook.
+  recursive_insert = true;
+  arena_ = LowLevelAlloc::NewArena(0, LowLevelAlloc::DefaultArena());
+  recursive_insert = false;
+  HandleSavedRegionsLocked(&InsertRegionLocked);  // flush the buffered ones
+    // Can't instead use HandleSavedRegionsLocked(&DoInsertRegionLocked) before
+    // recursive_insert = false; as InsertRegionLocked will also construct
+    // regions_ on demand for us.
+  if (use_buckets) {
+    const int table_bytes = kHashTableSize * sizeof(*bucket_table_);
+    recursive_insert = true;
+    bucket_table_ = static_cast<HeapProfileBucket**>(
+        MyAllocator::Allocate(table_bytes));
+    recursive_insert = false;
+    memset(bucket_table_, 0, table_bytes);
+    num_buckets_ = 0;
+  }
+  Unlock();
+  RAW_VLOG(10, "MemoryRegionMap Init done");
+}
+
+bool MemoryRegionMap::Shutdown() {
+  RAW_VLOG(10, "MemoryRegionMap Shutdown");
+  Lock();
+  RAW_CHECK(client_count_ > 0, "");
+  client_count_ -= 1;
+  if (client_count_ != 0) {  // not last client; need not really shutdown
+    Unlock();
+    RAW_VLOG(10, "MemoryRegionMap Shutdown decrement done");
+    return true;
+  }
+  if (bucket_table_ != NULL) {
+    for (int i = 0; i < kHashTableSize; i++) {
+      for (HeapProfileBucket* curr = bucket_table_[i]; curr != 0; /**/) {
+        HeapProfileBucket* bucket = curr;
+        curr = curr->next;
+        MyAllocator::Free(bucket->stack, 0);
+        MyAllocator::Free(bucket, 0);
+      }
+    }
+    MyAllocator::Free(bucket_table_, 0);
+    num_buckets_ = 0;
+    bucket_table_ = NULL;
+  }
+  RAW_CHECK(MallocHook::RemoveMmapHook(&MmapHook), "");
+  RAW_CHECK(MallocHook::RemoveMremapHook(&MremapHook), "");
+  RAW_CHECK(MallocHook::RemoveSbrkHook(&SbrkHook), "");
+  RAW_CHECK(MallocHook::RemoveMunmapHook(&MunmapHook), "");
+  if (regions_) regions_->~RegionSet();
+  regions_ = NULL;
+  bool deleted_arena = LowLevelAlloc::DeleteArena(arena_);
+  if (deleted_arena) {
+    arena_ = 0;
+  } else {
+    RAW_LOG(WARNING, "Can't delete LowLevelAlloc arena: it's being used");
+  }
+  Unlock();
+  RAW_VLOG(10, "MemoryRegionMap Shutdown done");
+  return deleted_arena;
+}
+
+bool MemoryRegionMap::IsRecordingLocked() {
+  RAW_CHECK(LockIsHeld(), "should be held (by this thread)");
+  return client_count_ > 0;
+}
+
+// Invariants (once libpthread_initialized is true):
+//   * While lock_ is not held, recursion_count_ is 0 (and
+//     lock_owner_tid_ is the previous owner, but we don't rely on
+//     that).
+//   * recursion_count_ and lock_owner_tid_ are only written while
+//     both lock_ and owner_lock_ are held. They may be read under
+//     just owner_lock_.
+//   * At entry and exit of Lock() and Unlock(), the current thread
+//     owns lock_ iff pthread_equal(lock_owner_tid_, pthread_self())
+//     && recursion_count_ > 0.
+void MemoryRegionMap::Lock() {
+  {
+    SpinLockHolder l(&owner_lock_);
+    if (recursion_count_ > 0 && current_thread_is(lock_owner_tid_)) {
+      RAW_CHECK(lock_.IsHeld(), "Invariants violated");
+      recursion_count_++;
+      RAW_CHECK(recursion_count_ <= 5,
+                "recursive lock nesting unexpectedly deep");
+      return;
+    }
+  }
+  lock_.Lock();
+  {
+    SpinLockHolder l(&owner_lock_);
+    RAW_CHECK(recursion_count_ == 0,
+              "Last Unlock didn't reset recursion_count_");
+    if (libpthread_initialized)
+      lock_owner_tid_ = pthread_self();
+    recursion_count_ = 1;
+  }
+}
+
+void MemoryRegionMap::Unlock() {
+  SpinLockHolder l(&owner_lock_);
+  RAW_CHECK(recursion_count_ >  0, "unlock when not held");
+  RAW_CHECK(lock_.IsHeld(),
+            "unlock when not held, and recursion_count_ is wrong");
+  RAW_CHECK(current_thread_is(lock_owner_tid_), "unlock by non-holder");
+  recursion_count_--;
+  if (recursion_count_ == 0) {
+    lock_.Unlock();
+  }
+}
+
+bool MemoryRegionMap::LockIsHeld() {
+  SpinLockHolder l(&owner_lock_);
+  return lock_.IsHeld()  &&  current_thread_is(lock_owner_tid_);
+}
+
+const MemoryRegionMap::Region*
+MemoryRegionMap::DoFindRegionLocked(uintptr_t addr) {
+  RAW_CHECK(LockIsHeld(), "should be held (by this thread)");
+  if (regions_ != NULL) {
+    Region sample;
+    sample.SetRegionSetKey(addr);
+    RegionSet::iterator region = regions_->lower_bound(sample);
+    if (region != regions_->end()) {
+      RAW_CHECK(addr <= region->end_addr, "");
+      if (region->start_addr <= addr  &&  addr < region->end_addr) {
+        return &(*region);
+      }
+    }
+  }
+  return NULL;
+}
+
+bool MemoryRegionMap::FindRegion(uintptr_t addr, Region* result) {
+  Lock();
+  const Region* region = DoFindRegionLocked(addr);
+  if (region != NULL) *result = *region;  // create it as an independent copy
+  Unlock();
+  return region != NULL;
+}
+
+bool MemoryRegionMap::FindAndMarkStackRegion(uintptr_t stack_top,
+                                             Region* result) {
+  Lock();
+  const Region* region = DoFindRegionLocked(stack_top);
+  if (region != NULL) {
+    RAW_VLOG(10, "Stack at %p is inside region %p..%p",
+                reinterpret_cast<void*>(stack_top),
+                reinterpret_cast<void*>(region->start_addr),
+                reinterpret_cast<void*>(region->end_addr));
+    const_cast<Region*>(region)->set_is_stack();  // now we know
+      // cast is safe (set_is_stack does not change the set ordering key)
+    *result = *region;  // create *result as an independent copy
+  }
+  Unlock();
+  return region != NULL;
+}
+
+HeapProfileBucket* MemoryRegionMap::GetBucket(int depth,
+                                              const void* const key[]) {
+  RAW_CHECK(LockIsHeld(), "should be held (by this thread)");
+  // Make hash-value
+  uintptr_t hash = 0;
+  for (int i = 0; i < depth; i++) {
+    hash += reinterpret_cast<uintptr_t>(key[i]);
+    hash += hash << 10;
+    hash ^= hash >> 6;
+  }
+  hash += hash << 3;
+  hash ^= hash >> 11;
+
+  // Lookup stack trace in table
+  unsigned int hash_index = (static_cast<unsigned int>(hash)) % kHashTableSize;
+  for (HeapProfileBucket* bucket = bucket_table_[hash_index];
+       bucket != 0;
+       bucket = bucket->next) {
+    if ((bucket->hash == hash) && (bucket->depth == depth) &&
+        std::equal(key, key + depth, bucket->stack)) {
+      return bucket;
+    }
+  }
+
+  // Create new bucket
+  const size_t key_size = sizeof(key[0]) * depth;
+  HeapProfileBucket* bucket;
+  if (recursive_insert) {  // recursion: save in saved_buckets_
+    const void** key_copy = saved_buckets_keys_[saved_buckets_count_];
+    std::copy(key, key + depth, key_copy);
+    bucket = &saved_buckets_[saved_buckets_count_];
+    memset(bucket, 0, sizeof(*bucket));
+    ++saved_buckets_count_;
+    bucket->stack = key_copy;
+    bucket->next  = NULL;
+  } else {
+    recursive_insert = true;
+    const void** key_copy = static_cast<const void**>(
+        MyAllocator::Allocate(key_size));
+    recursive_insert = false;
+    std::copy(key, key + depth, key_copy);
+    recursive_insert = true;
+    bucket = static_cast<HeapProfileBucket*>(
+        MyAllocator::Allocate(sizeof(HeapProfileBucket)));
+    recursive_insert = false;
+    memset(bucket, 0, sizeof(*bucket));
+    bucket->stack = key_copy;
+    bucket->next  = bucket_table_[hash_index];
+  }
+  bucket->hash = hash;
+  bucket->depth = depth;
+  bucket_table_[hash_index] = bucket;
+  ++num_buckets_;
+  return bucket;
+}
+
+MemoryRegionMap::RegionIterator MemoryRegionMap::BeginRegionLocked() {
+  RAW_CHECK(LockIsHeld(), "should be held (by this thread)");
+  RAW_CHECK(regions_ != NULL, "");
+  return regions_->begin();
+}
+
+MemoryRegionMap::RegionIterator MemoryRegionMap::EndRegionLocked() {
+  RAW_CHECK(LockIsHeld(), "should be held (by this thread)");
+  RAW_CHECK(regions_ != NULL, "");
+  return regions_->end();
+}
+
+inline void MemoryRegionMap::DoInsertRegionLocked(const Region& region) {
+  RAW_VLOG(12, "Inserting region %p..%p from %p",
+              reinterpret_cast<void*>(region.start_addr),
+              reinterpret_cast<void*>(region.end_addr),
+              reinterpret_cast<void*>(region.caller()));
+  RegionSet::const_iterator i = regions_->lower_bound(region);
+  if (i != regions_->end() && i->start_addr <= region.start_addr) {
+    RAW_DCHECK(region.end_addr <= i->end_addr, "");  // lower_bound ensures this
+    return;  // 'region' is a subset of an already recorded region; do nothing
+    // We can be stricter and allow this only when *i has been created via
+    // an mmap with MAP_NORESERVE flag set.
+  }
+  if (DEBUG_MODE) {
+    RAW_CHECK(i == regions_->end()  ||  !region.Overlaps(*i),
+              "Wow, overlapping memory regions");
+    Region sample;
+    sample.SetRegionSetKey(region.start_addr);
+    i = regions_->lower_bound(sample);
+    RAW_CHECK(i == regions_->end()  ||  !region.Overlaps(*i),
+              "Wow, overlapping memory regions");
+  }
+  region.AssertIsConsistent();  // just making sure
+  // This inserts and allocates permanent storage for region
+  // and its call stack data: it's safe to do it now:
+  regions_->insert(region);
+  RAW_VLOG(12, "Inserted region %p..%p :",
+              reinterpret_cast<void*>(region.start_addr),
+              reinterpret_cast<void*>(region.end_addr));
+  if (VLOG_IS_ON(12))  LogAllLocked();
+}
+
+// These variables are local to MemoryRegionMap::InsertRegionLocked()
+// and MemoryRegionMap::HandleSavedRegionsLocked()
+// and are file-level to ensure that they are initialized at load time.
+
+// Number of unprocessed region inserts.
+static int saved_regions_count = 0;
+
+// Unprocessed inserts (must be big enough to hold all allocations that can
+// be caused by an InsertRegionLocked call).
+// Region has no constructor, so that c-tor execution does not interfere
+// with using the static memory behind saved_regions at any time.
+static MemoryRegionMap::Region saved_regions[20];
+
+inline void MemoryRegionMap::HandleSavedRegionsLocked(
+              void (*insert_func)(const Region& region)) {
+  while (saved_regions_count > 0) {
+    // Making a local-var copy of the region argument to insert_func
+    // including its stack (w/o doing any memory allocations) is important:
+    // in many cases the memory in saved_regions
+    // will get written-to during the (*insert_func)(r) call below.
+    Region r = saved_regions[--saved_regions_count];
+    (*insert_func)(r);
+  }
+}
+
+void MemoryRegionMap::RestoreSavedBucketsLocked() {
+  RAW_CHECK(LockIsHeld(), "should be held (by this thread)");
+  while (saved_buckets_count_ > 0) {
+    HeapProfileBucket bucket = saved_buckets_[--saved_buckets_count_];
+    unsigned int hash_index =
+        static_cast<unsigned int>(bucket.hash) % kHashTableSize;
+    bool is_found = false;
+    for (HeapProfileBucket* curr = bucket_table_[hash_index];
+         curr != 0;
+         curr = curr->next) {
+      if ((curr->hash == bucket.hash) && (curr->depth == bucket.depth) &&
+          std::equal(bucket.stack, bucket.stack + bucket.depth, curr->stack)) {
+        curr->allocs += bucket.allocs;
+        curr->alloc_size += bucket.alloc_size;
+        curr->frees += bucket.frees;
+        curr->free_size += bucket.free_size;
+        is_found = true;
+        break;
+      }
+    }
+    if (is_found) continue;
+
+    const size_t key_size = sizeof(bucket.stack[0]) * bucket.depth;
+    const void** key_copy = static_cast<const void**>(
+        MyAllocator::Allocate(key_size));
+    std::copy(bucket.stack, bucket.stack + bucket.depth, key_copy);
+    HeapProfileBucket* new_bucket = static_cast<HeapProfileBucket*>(
+        MyAllocator::Allocate(sizeof(HeapProfileBucket)));
+    memset(new_bucket, 0, sizeof(*new_bucket));
+    new_bucket->hash = bucket.hash;
+    new_bucket->depth = bucket.depth;
+    new_bucket->stack = key_copy;
+    new_bucket->next = bucket_table_[hash_index];
+    bucket_table_[hash_index] = new_bucket;
+    ++num_buckets_;
+  }
+}
+
+inline void MemoryRegionMap::InsertRegionLocked(const Region& region) {
+  RAW_CHECK(LockIsHeld(), "should be held (by this thread)");
+  // We can be called recursively, because RegionSet constructor
+  // and DoInsertRegionLocked() (called below) can call the allocator.
+  // recursive_insert tells us if that's the case. When this happens,
+  // region insertion information is recorded in saved_regions[],
+  // and taken into account when the recursion unwinds.
+  // Do the insert:
+  if (recursive_insert) {  // recursion: save in saved_regions
+    RAW_VLOG(12, "Saving recursive insert of region %p..%p from %p",
+                reinterpret_cast<void*>(region.start_addr),
+                reinterpret_cast<void*>(region.end_addr),
+                reinterpret_cast<void*>(region.caller()));
+    RAW_CHECK(saved_regions_count < arraysize(saved_regions), "");
+    // Copy 'region' to saved_regions[saved_regions_count]
+    // together with the contents of its call_stack,
+    // then increment saved_regions_count.
+    saved_regions[saved_regions_count++] = region;
+  } else {  // not a recursive call
+    if (regions_ == NULL) {  // init regions_
+      RAW_VLOG(12, "Initializing region set");
+      regions_ = regions_rep.region_set();
+      recursive_insert = true;
+      new(regions_) RegionSet();
+      HandleSavedRegionsLocked(&DoInsertRegionLocked);
+      recursive_insert = false;
+    }
+    recursive_insert = true;
+    // Do the actual insertion work to put new regions into regions_:
+    DoInsertRegionLocked(region);
+    HandleSavedRegionsLocked(&DoInsertRegionLocked);
+    recursive_insert = false;
+  }
+}
+
+// We strip out a different number of stack frames in debug mode
+// because less inlining happens in that case.
+#ifdef NDEBUG
+static const int kStripFrames = 1;
+#else
+static const int kStripFrames = 3;
+#endif
+
+void MemoryRegionMap::RecordRegionAddition(const void* start, size_t size) {
+  // Record start/end info about this memory acquisition call in a new region:
+  Region region;
+  region.Create(start, size);
+  // First get the call stack info into the local variable 'region':
+  int depth = 0;
+  // NOTE: libunwind also does mmap, very likely while holding
+  // its own lock(s). So some threads may first take the libunwind lock,
+  // and then take the region map lock (necessary to record an mmap done
+  // from inside libunwind). On the other hand, other thread(s) may do a
+  // normal mmap, which calls this method to record it, which
+  // then proceeds to install that record into the region map
+  // while holding the region map lock. That may cause an mmap from our own
+  // internal allocators, so an attempt to unwind in this case may take the
+  // libunwind and region map locks in the reverse order, which is
+  // an obvious deadlock.
+  //
+  // Thankfully, we can easily detect whether we're holding the region map
+  // lock and avoid recording a backtrace in this (rare and largely
+  // irrelevant) case. By doing this we "declare" that a thread needing
+  // both locks must take the region map lock last. In other words we do
+  // not allow taking the libunwind lock when we already have the region map
+  // lock. Note, this is generally impossible to guarantee when somebody
+  // tries to mix cpu profiling and heap checking/profiling, because the cpu
+  // profiler grabs backtraces at arbitrary places. But at least such a
+  // combination is rarer and less relevant.
+  if (max_stack_depth_ > 0 && !LockIsHeld()) {
+    depth = MallocHook::GetCallerStackTrace(const_cast<void**>(region.call_stack),
+                                            max_stack_depth_, kStripFrames + 1);
+  }
+  region.set_call_stack_depth(depth);  // record stack info fully
+  RAW_VLOG(10, "New global region %p..%p from %p",
+              reinterpret_cast<void*>(region.start_addr),
+              reinterpret_cast<void*>(region.end_addr),
+              reinterpret_cast<void*>(region.caller()));
+  // Note: none of the above allocates memory.
+  Lock();  // recursively lock
+  map_size_ += size;
+  InsertRegionLocked(region);
+    // This will (eventually) allocate storage for and copy over the stack data
+    // from region.call_stack_data_ that is pointed by region.call_stack().
+  if (bucket_table_ != NULL) {
+    HeapProfileBucket* b = GetBucket(depth, region.call_stack);
+    ++b->allocs;
+    b->alloc_size += size;
+    if (!recursive_insert) {
+      recursive_insert = true;
+      RestoreSavedBucketsLocked();
+      recursive_insert = false;
+    }
+  }
+  Unlock();
+}
+
+void MemoryRegionMap::RecordRegionRemoval(const void* start, size_t size) {
+  Lock();
+  if (recursive_insert) {
+    // First remove the removed region from saved_regions, if it's
+    // there, to prevent overrunning saved_regions in recursive
+    // map/unmap call sequences, and also from later inserting regions
+    // which have already been unmapped.
+    uintptr_t start_addr = reinterpret_cast<uintptr_t>(start);
+    uintptr_t end_addr = start_addr + size;
+    int put_pos = 0;
+    int old_count = saved_regions_count;
+    for (int i = 0; i < old_count; ++i, ++put_pos) {
+      Region& r = saved_regions[i];
+      if (r.start_addr == start_addr && r.end_addr == end_addr) {
+        // An exact match, so it's safe to remove.
+        RecordRegionRemovalInBucket(r.call_stack_depth, r.call_stack, size);
+        --saved_regions_count;
+        --put_pos;
+        RAW_VLOG(10, ("Insta-Removing saved region %p..%p; "
+                     "now have %d saved regions"),
+                 reinterpret_cast<void*>(start_addr),
+                 reinterpret_cast<void*>(end_addr),
+                 saved_regions_count);
+      } else {
+        if (put_pos < i) {
+          saved_regions[put_pos] = saved_regions[i];
+        }
+      }
+    }
+  }
+  if (regions_ == NULL) {  // We must have just unset the hooks,
+                           // but this thread was already inside the hook.
+    Unlock();
+    return;
+  }
+  if (!recursive_insert) {
+    HandleSavedRegionsLocked(&InsertRegionLocked);
+  }
+    // first handle adding saved regions if any
+  uintptr_t start_addr = reinterpret_cast<uintptr_t>(start);
+  uintptr_t end_addr = start_addr + size;
+  // subtract start_addr, end_addr from all the regions
+  RAW_VLOG(10, "Removing global region %p..%p; have %" PRIuS " regions",
+              reinterpret_cast<void*>(start_addr),
+              reinterpret_cast<void*>(end_addr),
+              regions_->size());
+  Region sample;
+  sample.SetRegionSetKey(start_addr);
+  // Only iterate over the regions that might overlap start_addr..end_addr:
+  for (RegionSet::iterator region = regions_->lower_bound(sample);
+       region != regions_->end()  &&  region->start_addr < end_addr;
+       /*noop*/) {
+    RAW_VLOG(13, "Looking at region %p..%p",
+                reinterpret_cast<void*>(region->start_addr),
+                reinterpret_cast<void*>(region->end_addr));
+    if (start_addr <= region->start_addr  &&
+        region->end_addr <= end_addr) {  // full deletion
+      RAW_VLOG(12, "Deleting region %p..%p",
+                  reinterpret_cast<void*>(region->start_addr),
+                  reinterpret_cast<void*>(region->end_addr));
+      RecordRegionRemovalInBucket(region->call_stack_depth, region->call_stack,
+                                  region->end_addr - region->start_addr);
+      RegionSet::iterator d = region;
+      ++region;
+      regions_->erase(d);
+      continue;
+    } else if (region->start_addr < start_addr  &&
+               end_addr < region->end_addr) {  // cutting-out split
+      RAW_VLOG(12, "Splitting region %p..%p in two",
+                  reinterpret_cast<void*>(region->start_addr),
+                  reinterpret_cast<void*>(region->end_addr));
+      RecordRegionRemovalInBucket(region->call_stack_depth, region->call_stack,
+                                  end_addr - start_addr);
+      // Make another region for the start portion:
+      // The new region has to be the start portion because we can't
+      // just modify region->end_addr as it's the sorting key.
+      Region r = *region;
+      r.set_end_addr(start_addr);
+      InsertRegionLocked(r);
+      // cut *region from start:
+      const_cast<Region&>(*region).set_start_addr(end_addr);
+    } else if (end_addr > region->start_addr  &&
+               start_addr <= region->start_addr) {  // cut from start
+      RAW_VLOG(12, "Start-chopping region %p..%p",
+                  reinterpret_cast<void*>(region->start_addr),
+                  reinterpret_cast<void*>(region->end_addr));
+      RecordRegionRemovalInBucket(region->call_stack_depth, region->call_stack,
+                                  end_addr - region->start_addr);
+      const_cast<Region&>(*region).set_start_addr(end_addr);
+    } else if (start_addr > region->start_addr  &&
+               start_addr < region->end_addr) {  // cut from end
+      RAW_VLOG(12, "End-chopping region %p..%p",
+                  reinterpret_cast<void*>(region->start_addr),
+                  reinterpret_cast<void*>(region->end_addr));
+      RecordRegionRemovalInBucket(region->call_stack_depth, region->call_stack,
+                                  region->end_addr - start_addr);
+      // Can't just modify region->end_addr (it's the sorting key):
+      Region r = *region;
+      r.set_end_addr(start_addr);
+      RegionSet::iterator d = region;
+      ++region;
+      // It's safe to erase before inserting since r is independent of *d:
+      // r contains an own copy of the call stack:
+      regions_->erase(d);
+      InsertRegionLocked(r);
+      continue;
+    }
+    ++region;
+  }
+  RAW_VLOG(12, "Removed region %p..%p; have %" PRIuS " regions",
+              reinterpret_cast<void*>(start_addr),
+              reinterpret_cast<void*>(end_addr),
+              regions_->size());
+  if (VLOG_IS_ON(12))  LogAllLocked();
+  unmap_size_ += size;
+  Unlock();
+}
+
+void MemoryRegionMap::RecordRegionRemovalInBucket(int depth,
+                                                  const void* const stack[],
+                                                  size_t size) {
+  RAW_CHECK(LockIsHeld(), "should be held (by this thread)");
+  if (bucket_table_ == NULL) return;
+  HeapProfileBucket* b = GetBucket(depth, stack);
+  ++b->frees;
+  b->free_size += size;
+}
+
+void MemoryRegionMap::MmapHook(const void* result,
+                               const void* start, size_t size,
+                               int prot, int flags,
+                               int fd, off_t offset) {
+  // TODO(maxim): replace all 0x%" PRIxS " by %p when RAW_VLOG uses a safe
+  // snprintf reimplementation that does not malloc to pretty-print NULL
+  RAW_VLOG(10, "MMap = 0x%" PRIxPTR " of %" PRIuS " at %" PRIu64 " "
+              "prot %d flags %d fd %d offs %" PRId64,
+              reinterpret_cast<uintptr_t>(result), size,
+              reinterpret_cast<uint64>(start), prot, flags, fd,
+              static_cast<int64>(offset));
+  if (result != reinterpret_cast<void*>(MAP_FAILED)  &&  size != 0) {
+    RecordRegionAddition(result, size);
+  }
+}
+
+void MemoryRegionMap::MunmapHook(const void* ptr, size_t size) {
+  RAW_VLOG(10, "MUnmap of %p %" PRIuS "", ptr, size);
+  if (size != 0) {
+    RecordRegionRemoval(ptr, size);
+  }
+}
+
+void MemoryRegionMap::MremapHook(const void* result,
+                                 const void* old_addr, size_t old_size,
+                                 size_t new_size, int flags,
+                                 const void* new_addr) {
+  RAW_VLOG(10, "MRemap = 0x%" PRIxPTR " of 0x%" PRIxPTR " %" PRIuS " "
+              "to %" PRIuS " flags %d new_addr=0x%" PRIxPTR,
+              (uintptr_t)result, (uintptr_t)old_addr,
+               old_size, new_size, flags,
+               flags & MREMAP_FIXED ? (uintptr_t)new_addr : 0);
+  if (result != reinterpret_cast<void*>(-1)) {
+    RecordRegionRemoval(old_addr, old_size);
+    RecordRegionAddition(result, new_size);
+  }
+}
+
+void MemoryRegionMap::SbrkHook(const void* result, ptrdiff_t increment) {
+  RAW_VLOG(10, "Sbrk = 0x%" PRIxPTR " of %" PRIdS "", (uintptr_t)result, increment);
+  if (result != reinterpret_cast<void*>(-1)) {
+    if (increment > 0) {
+      void* new_end = sbrk(0);
+      RecordRegionAddition(result, reinterpret_cast<uintptr_t>(new_end) -
+                                   reinterpret_cast<uintptr_t>(result));
+    } else if (increment < 0) {
+      void* new_end = sbrk(0);
+      RecordRegionRemoval(new_end, reinterpret_cast<uintptr_t>(result) -
+                                   reinterpret_cast<uintptr_t>(new_end));
+    }
+  }
+}
+
+void MemoryRegionMap::LogAllLocked() {
+  RAW_CHECK(LockIsHeld(), "should be held (by this thread)");
+  RAW_LOG(INFO, "List of regions:");
+  uintptr_t previous = 0;
+  for (RegionSet::const_iterator r = regions_->begin();
+       r != regions_->end(); ++r) {
+    RAW_LOG(INFO, "Memory region 0x%" PRIxPTR "..0x%" PRIxPTR " "
+                  "from 0x%" PRIxPTR " stack=%d",
+                  r->start_addr, r->end_addr, r->caller(), r->is_stack);
+    RAW_CHECK(previous < r->end_addr, "wow, we messed up the set order");
+      // this must be caused by uncontrolled recursive operations on regions_
+    previous = r->end_addr;
+  }
+  RAW_LOG(INFO, "End of regions list");
+}
diff --git a/src/memory_region_map.h b/src/memory_region_map.h
new file mode 100644
index 0000000..ec388e1
--- /dev/null
+++ b/src/memory_region_map.h
@@ -0,0 +1,413 @@
+// -*- Mode: C++; c-basic-offset: 2; indent-tabs-mode: nil -*-
+/* Copyright (c) 2006, Google Inc.
+ * All rights reserved.
+ * 
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are
+ * met:
+ * 
+ *     * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following disclaimer
+ * in the documentation and/or other materials provided with the
+ * distribution.
+ *     * Neither the name of Google Inc. nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ * 
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * ---
+ * Author: Maxim Lifantsev
+ */
+
+#ifndef BASE_MEMORY_REGION_MAP_H_
+#define BASE_MEMORY_REGION_MAP_H_
+
+#include <config.h>
+
+#ifdef HAVE_PTHREAD
+#include <pthread.h>
+#endif
+#include <stddef.h>
+#include <set>
+#include "base/stl_allocator.h"
+#include "base/spinlock.h"
+#include "base/thread_annotations.h"
+#include "base/low_level_alloc.h"
+#include "heap-profile-stats.h"
+
+// TODO(maxim): add a unittest:
+//  execute a bunch of mmaps and compare the resulting memory map with what
+//  strace logs; execute a bunch of mmap/munmap calls and compare the memory
+//  map with our own accounting of what those mmaps generated
+
+// Thread-safe class to collect and query the map of all memory regions
+// in a process that have been created with mmap, munmap, mremap, sbrk.
+// For each memory region, we keep track of (and provide to users)
+// the stack trace that allocated that memory region.
+// The recorded stack trace depth is bounded by
+// a user-supplied max_stack_depth parameter of Init().
+// After initialization with Init()
+// (which can happen even before global object constructor execution)
+// we collect the map by installing and monitoring MallocHook-s
+// to mmap, munmap, mremap, sbrk.
+// At any time one can query this map via provided interface.
+// For more details on the design of MemoryRegionMap
+// see the comment at the top of our .cc file.
+class MemoryRegionMap {
+ private:
+  // Max call stack recording depth supported by Init().  Set it to be
+  // high enough for all our clients.  Note: we do not define storage
+  // for this (doing that requires special handling in windows), so
+  // don't take the address of it!
+  static const int kMaxStackDepth = 32;
+
+  // Size of the hash table of buckets.  A structure of the bucket table is
+  // described in heap-profile-stats.h.
+  static const int kHashTableSize = 179999;
+
+ public:
+  // interface ================================================================
+
+  // Every client of MemoryRegionMap must call Init() before first use,
+  // and Shutdown() after last use.  This allows us to reference count
+  // this (singleton) class properly.  MemoryRegionMap assumes it's the
+  // only client of MallocHooks, so a client can only register other
+  // MallocHooks after calling Init() and must unregister them before
+  // calling Shutdown().
+
+  // Initialize this module to record memory allocation stack traces.
+  // Stack traces that have more than "max_stack_depth" frames
+  // are automatically shrunk to "max_stack_depth" when they are recorded.
+  // Init() can be called more than once w/o harm; the largest max_stack_depth
+  // will be the effective one.
+  // When "use_buckets" is true, counts of mmap and munmap sizes will be
+  // recorded with each stack trace.  If Init() is called more than once,
+  // counting is in effect once any call has passed "use_buckets" as true.
+  // It will install mmap, munmap, mremap, sbrk hooks
+  // and initialize arena_, our hooks, and the locks, hence one can use
+  // MemoryRegionMap::Lock()/Unlock() to manage the locks.
+  // Uses Lock/Unlock inside.
+  static void Init(int max_stack_depth, bool use_buckets);
+
+  // Try to shutdown this module undoing what Init() did.
+  // Returns true iff it could do a full shutdown (or a full shutdown was
+  // not attempted).
+  // Full shutdown is attempted when the number of Shutdown() calls equals
+  // the number of Init() calls.
+  static bool Shutdown();
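+
+  // A sketch of typical client usage (the depth value is arbitrary here):
+  //
+  //   MemoryRegionMap::Init(/* max_stack_depth= */ 20,
+  //                         /* use_buckets= */ false);
+  //   ...  // query regions while recording is active
+  //   MemoryRegionMap::Shutdown();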
+
+  // Return true if MemoryRegionMap is initialized and recording, i.e. when
+  // the number of Init() calls exceeds the number of Shutdown() calls.
+  static bool IsRecordingLocked();
+
+  // Locks to protect our internal data structures.
+  // These also protect use of arena_ if our Init() has been done.
+  // The lock is recursive.
+  static void Lock() EXCLUSIVE_LOCK_FUNCTION(lock_);
+  static void Unlock() UNLOCK_FUNCTION(lock_);
+
+  // Returns true when the lock is held by this thread (for use in RAW_CHECK-s).
+  static bool LockIsHeld();
+
+  // Locker object that acquires the MemoryRegionMap::Lock
+  // for the duration of its lifetime (a C++ scope).
+  class LockHolder {
+   public:
+    LockHolder() { Lock(); }
+    ~LockHolder() { Unlock(); }
+   private:
+    DISALLOW_COPY_AND_ASSIGN(LockHolder);
+  };
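+
+  // For example (sketch), a scoped examination of the map might look like
+  // this, some_addr being a placeholder for whatever address is of interest:
+  //
+  //   {
+  //     MemoryRegionMap::LockHolder holder;
+  //     MemoryRegionMap::Region r;
+  //     if (MemoryRegionMap::FindRegion(some_addr, &r)) {
+  //       // inspect r.start_addr, r.end_addr, r.call_stack ...
+  //     }
+  //   }
+  //
+  // FindRegion() locks internally, but since Lock() is recursive, holding
+  // LockHolder around it is safe and keeps the map stable across calls.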
+
+  // A memory region that we know about through malloc_hook-s.
+  // This is essentially an interface through which MemoryRegionMap
+  // exports the collected data to its clients.  Thread-compatible.
+  struct Region {
+    uintptr_t start_addr;  // region start address
+    uintptr_t end_addr;  // region end address
+    int call_stack_depth;  // number of caller stack frames that we saved
+    const void* call_stack[kMaxStackDepth];  // caller address stack array
+                                             // filled to call_stack_depth size
+    bool is_stack;  // does this region contain a thread's stack:
+                    // a user of MemoryRegionMap supplies this info
+
+    // Convenience accessor for call_stack[0],
+    // i.e. (the program counter of) the immediate caller
+    // of this region's allocation function,
+    // but it also returns NULL when call_stack_depth is 0,
+    // i.e. when we weren't able to get the call stack.
+    // This usually happens in recursive calls, when the stack-unwinder
+    // calls mmap() which in turn calls the stack-unwinder.
+    uintptr_t caller() const {
+      return reinterpret_cast<uintptr_t>(call_stack_depth >= 1
+                                         ? call_stack[0] : NULL);
+    }
+
+    // Return true iff this region overlaps region x.
+    bool Overlaps(const Region& x) const {
+      return start_addr < x.end_addr  &&  end_addr > x.start_addr;
+    }
+
+   private:  // helpers for MemoryRegionMap
+    friend class MemoryRegionMap;
+
+    // The ways we create Region-s:
+    void Create(const void* start, size_t size) {
+      start_addr = reinterpret_cast<uintptr_t>(start);
+      end_addr = start_addr + size;
+      is_stack = false;  // not a stack till marked such
+      call_stack_depth = 0;
+      AssertIsConsistent();
+    }
+    void set_call_stack_depth(int depth) {
+      RAW_DCHECK(call_stack_depth == 0, "");  // only one such set is allowed
+      call_stack_depth = depth;
+      AssertIsConsistent();
+    }
+
+    // The ways we modify Region-s:
+    void set_is_stack() { is_stack = true; }
+    void set_start_addr(uintptr_t addr) {
+      start_addr = addr;
+      AssertIsConsistent();
+    }
+    void set_end_addr(uintptr_t addr) {
+      end_addr = addr;
+      AssertIsConsistent();
+    }
+
+    // Verifies that *this contains consistent data, crashes if not the case.
+    void AssertIsConsistent() const {
+      RAW_DCHECK(start_addr < end_addr, "");
+      RAW_DCHECK(call_stack_depth >= 0  &&
+                 call_stack_depth <= kMaxStackDepth, "");
+    }
+
+    // Post-default construction helper to make a Region suitable
+    // for searching in RegionSet regions_.
+    void SetRegionSetKey(uintptr_t addr) {
+      // make sure *this has no usable data:
+      if (DEBUG_MODE) memset(this, 0xFF, sizeof(*this));
+      end_addr = addr;
+    }
+
+    // Note: call_stack[kMaxStackDepth] as a member lets us make Region
+    // a simple self-contained struct with correctly behaving bit-wise copying.
+    // This simplifies the code of this module but wastes some memory:
+    // in the most common use case of this module (leak checking)
+    // only one call_stack element out of kMaxStackDepth is actually needed.
+    // Making the storage for call_stack variable-sized would
+    // substantially complicate memory management for the Region-s:
+    // they need to be created and manipulated for some time
+    // w/o any memory allocations, yet are also given out to the users.
+  };
+
+  // Find the region that covers addr and write its data into *result if found,
+  // in which case *result gets filled so that it stays fully functional
+  // even when the underlying region gets removed from MemoryRegionMap.
+  // Returns success. Uses Lock/Unlock inside.
+  static bool FindRegion(uintptr_t addr, Region* result);
+
+  // Find the region that contains stack_top, mark that region as
+  // a stack region, and write its data into *result if found,
+  // in which case *result gets filled so that it stays fully functional
+  // even when the underlying region gets removed from MemoryRegionMap.
+  // Returns success. Uses Lock/Unlock inside.
+  static bool FindAndMarkStackRegion(uintptr_t stack_top, Region* result);
+
+  // Iterate over the buckets which store mmap and munmap counts per stack
+  // trace.  It calls "callback" for each bucket, and passes "arg" to it.
+  template<class Type>
+  static void IterateBuckets(void (*callback)(const HeapProfileBucket*, Type),
+                             Type arg);
+
+  // Get the bucket whose caller stack trace is "key".  The stack trace is
+  // used to a depth of "depth" at most.  The requested bucket is created if
+  // needed.
+  // The bucket table is described in heap-profile-stats.h.
+  static HeapProfileBucket* GetBucket(int depth, const void* const key[]);
+
+ private:  // our internal types ==============================================
+
+  // Region comparator for sorting with STL
+  struct RegionCmp {
+    bool operator()(const Region& x, const Region& y) const {
+      return x.end_addr < y.end_addr;
+    }
+  };
+
+  // We allocate STL objects in our own arena.
+  struct MyAllocator {
+    static void *Allocate(size_t n) {
+      return LowLevelAlloc::AllocWithArena(n, arena_);
+    }
+    static void Free(const void *p, size_t /* n */) {
+      LowLevelAlloc::Free(const_cast<void*>(p));
+    }
+  };
+
+  // Set of the memory regions
+  typedef std::set<Region, RegionCmp,
+              STL_Allocator<Region, MyAllocator> > RegionSet;
+
+ public:  // more in-depth interface ==========================================
+
+  // STL iterator with values of Region
+  typedef RegionSet::const_iterator RegionIterator;
+
+  // Return the begin/end iterators to all the regions.
+  // These need Lock/Unlock protection around their whole usage (loop).
+  // Even when the same thread causes modifications during such a loop
+  // (which are permitted due to recursive locking)
+  // the loop iterator will still be valid as long as its region
+  // has not been deleted, but EndRegionLocked should be
+  // re-evaluated whenever the set of regions has changed.
+  static RegionIterator BeginRegionLocked();
+  static RegionIterator EndRegionLocked();
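+
+  // A sketch of the intended iteration pattern:
+  //
+  //   MemoryRegionMap::Lock();
+  //   for (MemoryRegionMap::RegionIterator it =
+  //            MemoryRegionMap::BeginRegionLocked();
+  //        it != MemoryRegionMap::EndRegionLocked(); ++it) {
+  //     // examine it->start_addr, it->end_addr, it->call_stack ...
+  //   }
+  //   MemoryRegionMap::Unlock();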
+
+  // Return the accumulated sizes of mapped and unmapped regions.
+  static int64 MapSize() { return map_size_; }
+  static int64 UnmapSize() { return unmap_size_; }
+
+  // Effectively private type from our .cc =================================
+  // public to let us declare global objects:
+  union RegionSetRep;
+
+ private:
+  // representation ===========================================================
+
+  // Counter of clients of this module that have called Init().
+  static int client_count_;
+
+  // Maximal number of caller stack frames to save (>= 0).
+  static int max_stack_depth_;
+
+  // Arena used for our allocations in regions_.
+  static LowLevelAlloc::Arena* arena_;
+
+  // Set of the mmap/sbrk/mremap-ed memory regions
+  // To be accessed *only* when Lock() is held.
+  // Hence we protect the non-recursive lock used inside of arena_
+  // with our recursive Lock(). This lets a user prevent deadlocks
+  // when threads are stopped by TCMalloc_ListAllProcessThreads at random spots
+  // simply by acquiring our recursive Lock() before that.
+  static RegionSet* regions_;
+
+  // Lock to protect regions_ and buckets_ variables and the data behind.
+  static SpinLock lock_;
+  // Lock to protect the recursive lock itself.
+  static SpinLock owner_lock_;
+
+  // Recursion count for the recursive lock.
+  static int recursion_count_;
+  // The thread id of the thread that's inside the recursive lock.
+  static pthread_t lock_owner_tid_;
+
+  // Total size of all mapped pages so far
+  static int64 map_size_;
+  // Total size of all unmapped pages so far
+  static int64 unmap_size_;
+
+  // Bucket hash table which is described in heap-profile-stats.h.
+  static HeapProfileBucket** bucket_table_ GUARDED_BY(lock_);
+  static int num_buckets_ GUARDED_BY(lock_);
+
+  // The following members are local to MemoryRegionMap::GetBucket()
+  // and MemoryRegionMap::RestoreSavedBucketsLocked()
+  // and are file-level to ensure that they are initialized at load time.
+  //
+  // These are used as temporary storage to break the infinite cycle of mmap
+  // calling our hook which (sometimes) causes mmap.  It must be a static
+  // fixed-size array.  The size 20 is just an expected value for safety.
+  // The details are described in memory_region_map.cc.
+
+  // Number of unprocessed bucket inserts.
+  static int saved_buckets_count_ GUARDED_BY(lock_);
+
+  // Unprocessed inserts (must be big enough to hold all mmaps that can be
+  // caused by a GetBucket call).
+  // Bucket has no constructor, so that c-tor execution does not interfere
+  // with using the static memory behind saved_buckets at any time.
+  static HeapProfileBucket saved_buckets_[20] GUARDED_BY(lock_);
+
+  static const void* saved_buckets_keys_[20][kMaxStackDepth] GUARDED_BY(lock_);
+
+  // helpers ==================================================================
+
+  // Helper for FindRegion and FindAndMarkStackRegion:
+  // returns the region covering 'addr' or NULL; assumes our lock_ is held.
+  static const Region* DoFindRegionLocked(uintptr_t addr);
+
+  // Verifying wrapper around regions_->insert(region)
+  // To be called to do InsertRegionLocked's work only!
+  inline static void DoInsertRegionLocked(const Region& region);
+  // Handle regions saved by InsertRegionLocked into a tmp static array
+  // by calling insert_func on them.
+  inline static void HandleSavedRegionsLocked(
+                       void (*insert_func)(const Region& region));
+
+  // Restore buckets saved in a tmp static array by GetBucket to the bucket
+  // table where all buckets eventually should be.
+  static void RestoreSavedBucketsLocked();
+
+  // Wrapper around DoInsertRegionLocked
+  // that handles the case of recursive allocator calls.
+  inline static void InsertRegionLocked(const Region& region);
+
+  // Record addition of a memory region at address "start" of size "size"
+  // (called from our mmap/mremap/sbrk hooks).
+  static void RecordRegionAddition(const void* start, size_t size);
+  // Record deletion of a memory region at address "start" of size "size"
+  // (called from our munmap/mremap/sbrk hooks).
+  static void RecordRegionRemoval(const void* start, size_t size);
+
+  // Record deletion of a memory region of size "size" in a bucket whose
+  // caller stack trace is "key".  The stack trace is used to a depth of
+  // "depth" at most.
+  static void RecordRegionRemovalInBucket(int depth,
+                                          const void* const key[],
+                                          size_t size);
+
+  // Hooks for MallocHook
+  static void MmapHook(const void* result,
+                       const void* start, size_t size,
+                       int prot, int flags,
+                       int fd, off_t offset);
+  static void MunmapHook(const void* ptr, size_t size);
+  static void MremapHook(const void* result, const void* old_addr,
+                         size_t old_size, size_t new_size, int flags,
+                         const void* new_addr);
+  static void SbrkHook(const void* result, ptrdiff_t increment);
+
+  // Log all memory regions; useful for debugging only.
+  // Assumes Lock() is held.
+  static void LogAllLocked();
+
+  DISALLOW_COPY_AND_ASSIGN(MemoryRegionMap);
+};
+
+template <class Type>
+void MemoryRegionMap::IterateBuckets(
+    void (*callback)(const HeapProfileBucket*, Type), Type callback_arg) {
+  for (int index = 0; index < kHashTableSize; index++) {
+    for (HeapProfileBucket* bucket = bucket_table_[index];
+         bucket != NULL;
+         bucket = bucket->next) {
+      callback(bucket, callback_arg);
+    }
+  }
+}
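+
+// An illustrative use of IterateBuckets (a sketch only; it assumes the
+// int64 alloc_size field that HeapProfileBucket picks up from
+// heap-profile-stats.h, and it must run while Lock() is held):
+//
+//   static void AddAllocSize(const HeapProfileBucket* b, int64* total) {
+//     *total += b->alloc_size;
+//   }
+//   ...
+//   int64 total = 0;
+//   MemoryRegionMap::IterateBuckets<int64*>(AddAllocSize, &total);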
+
+#endif  // BASE_MEMORY_REGION_MAP_H_
diff --git a/src/packed-cache-inl.h b/src/packed-cache-inl.h
new file mode 100644
index 0000000..0946260
--- /dev/null
+++ b/src/packed-cache-inl.h
@@ -0,0 +1,239 @@
+// -*- Mode: C++; c-basic-offset: 2; indent-tabs-mode: nil -*-
+// Copyright (c) 2007, Google Inc.
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+//     * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+//     * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+// ---
+// Author: Geoff Pike
+//
+// This file provides a minimal cache that can hold a <key, value> pair
+// with little if any wasted space.  The types of the key and value
+// must be unsigned integral types or at least have unsigned semantics
+// for >>, casting, and similar operations.
+//
+// Synchronization is not provided.  However, the cache is implemented
+// as an array of cache entries whose type is chosen at compile time.
+// If a[i] is atomic on your hardware for the chosen array type then
+// raciness will not necessarily lead to bugginess.  The cache entries
+// must be large enough to hold a partial key and a value packed
+// together.  The partial keys are bit strings of length
+// kKeybits - kHashbits, and the values are bit strings of length kValuebits.
+//
+// In an effort to use minimal space, every cache entry represents
+// some <key, value> pair; the class provides no way to mark a cache
+// entry as empty or uninitialized.  In practice, you may want to have
+// reserved keys or values to get around this limitation.  For example, in
+// tcmalloc's PageID-to-sizeclass cache, a value of 0 is used as
+// "unknown sizeclass."
+//
+// Usage Considerations
+// --------------------
+//
+// kHashbits controls the size of the cache.  The best value for
+// kHashbits will of course depend on the application.  Perhaps try
+// tuning the value of kHashbits by measuring different values on your
+// favorite benchmark.  Also remember not to be a pig; other
+// programs that need resources may suffer if you are.
+//
+// The main uses for this class will be when performance is
+// critical and there's a convenient type to hold the cache's
+// entries.  As described above, the number of bits required
+// for a cache entry is (kKeybits - kHashbits) + kValuebits.  Suppose
+// kKeybits + kValuebits is 43.  Then it probably makes sense to
+// choose kHashbits >= 11 so that cache entries fit in a uint32.
+//
+// On the other hand, suppose kKeybits = kValuebits = 64.  Then
+// using this class may be less worthwhile.  You'll probably
+// be using 128 bits for each entry anyway, so maybe just pick
+// a hash function, H, and use an array indexed by H(key):
+//    void Put(K key, V value) { a_[H(key)] = pair<K, V>(key, value); }
+//    V GetOrDefault(K key, V default) { const pair<K, V> &p = a_[H(key)]; ... }
+//    etc.
+//
+// Further Details
+// ---------------
+//
+// For caches used only by one thread, the following is true:
+// 1. For a cache c,
+//      (c.Put(key, value), c.GetOrDefault(key, 0)) == value
+//    and
+//      (c.Put(key, value), <...>, c.GetOrDefault(key, 0)) == value
+//    if the elided code contains no c.Put calls.
+//
+// 2. Has(key) will return false if no <key, value> pair with that key
+//    has ever been Put.  However, a newly initialized cache will have
+//    some <key, value> pairs already present.  When you create a new
+//    cache, you must specify an "initial value."  The initialization
+//    procedure is equivalent to Clear(initial_value), which is
+//    equivalent to Put(k, initial_value) for all keys k from 0 to
+//    2^kHashbits - 1.
+//
+// 3. If key and key' differ then the only way Put(key, value) may
+//    cause Has(key') to change is that Has(key') may change from true to
+//    false. Furthermore, a Put() call that doesn't change Has(key')
+//    doesn't change GetOrDefault(key', ...) either.
+//
+// Implementation details:
+//
+// This is a direct-mapped cache with 2^kHashbits entries; the hash
+// function simply takes the low bits of the key.  We store whole keys
+// if a whole key plus a whole value fits in an entry.  Otherwise, an
+// entry is the high bits of a key and a value, packed together.
+// E.g., a 20 bit key and a 7 bit value only require a uint16 for each
+// entry if kHashbits >= 11.
+//
+// Alternatives to this scheme will be added as needed.
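+//
+// For illustration, a sketch of the intended use (the 20-bit key width and
+// the reserved value 0 are example choices mirroring the PageID-to-sizeclass
+// cache mentioned above):
+//
+//   PackedCache<20, uint16_t> cache(0);  // every entry starts as "unknown"
+//   cache.Put(0x12345, 3);               // remember sizeclass 3 for this key
+//   size_t hit  = cache.GetOrDefault(0x12345, 0);  // == 3
+//   size_t miss = cache.GetOrDefault(0x54321, 0);  // == 0, never Put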
+
+#ifndef TCMALLOC_PACKED_CACHE_INL_H_
+#define TCMALLOC_PACKED_CACHE_INL_H_
+
+#include "config.h"
+#include <stddef.h>                     // for size_t
+#ifdef HAVE_STDINT_H
+#include <stdint.h>                     // for uintptr_t
+#endif
+#include "base/basictypes.h"
+#include "internal_logging.h"
+
+// A safe way of doing "(1 << n) - 1" -- without worrying about overflow
+// Note this will all be resolved to a constant expression at compile-time
+#define N_ONES_(IntType, N)                                     \
+  ( (N) == 0 ? 0 : ((static_cast<IntType>(1) << ((N)-1))-1 +    \
+                    (static_cast<IntType>(1) << ((N)-1))) )
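+// For example, N_ONES_(size_t, 16) evaluates to 0xFFFF.  When N equals the
+// full width of IntType, the two shifts by N-1 above still yield the all-ones
+// value, whereas a literal "(1 << N) - 1" would shift by the type's width,
+// which is undefined behavior.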
+
+// The types K and V provide upper bounds on the number of valid keys
+// and values, but we explicitly require the keys to be less than
+// 2^kKeybits and the values to be less than 2^kValuebits.  The size of
+// the table is controlled by kHashbits, and the type of each entry in
+// the cache is T.  See also the big comment at the top of the file.
+template <int kKeybits, typename T>
+class PackedCache {
+ public:
+  typedef uintptr_t K;
+  typedef size_t V;
+#ifdef TCMALLOC_SMALL_BUT_SLOW
+  // Decrease the size-map cache size if running in the small-memory mode.
+  static const int kHashbits = 12;
+#else
+  static const int kHashbits = 16;
+#endif
+  static const int kValuebits = 7;
+  static const bool kUseWholeKeys = kKeybits + kValuebits <= 8 * sizeof(T);
+
+  explicit PackedCache(V initial_value) {
+    COMPILE_ASSERT(kKeybits <= sizeof(K) * 8, key_size);
+    COMPILE_ASSERT(kValuebits <= sizeof(V) * 8, value_size);
+    COMPILE_ASSERT(kHashbits <= kKeybits, hash_function);
+    COMPILE_ASSERT(kKeybits - kHashbits + kValuebits <= kTbits,
+                   entry_size_must_be_big_enough);
+    Clear(initial_value);
+  }
+
+  void Put(K key, V value) {
+    ASSERT(key == (key & kKeyMask));
+    ASSERT(value == (value & kValueMask));
+    array_[Hash(key)] = KeyToUpper(key) | value;
+  }
+
+  bool Has(K key) const {
+    ASSERT(key == (key & kKeyMask));
+    return KeyMatch(array_[Hash(key)], key);
+  }
+
+  V GetOrDefault(K key, V default_value) const {
+    // As with other code in this class, we touch array_ as few times
+    // as we can.  Assuming entries are read atomically (e.g., their
+    // type is uintptr_t on most hardware) then certain races are
+    // harmless.
+    ASSERT(key == (key & kKeyMask));
+    T entry = array_[Hash(key)];
+    return KeyMatch(entry, key) ? EntryToValue(entry) : default_value;
+  }
+
+  void Clear(V value) {
+    ASSERT(value == (value & kValueMask));
+    for (int i = 0; i < 1 << kHashbits; i++) {
+      ASSERT(kUseWholeKeys || KeyToUpper(i) == 0);
+      array_[i] = kUseWholeKeys ? (value | KeyToUpper(i)) : value;
+    }
+  }
+
+ private:
+  // We are going to pack a value and the upper part of a key (or a
+  // whole key) into an entry of type T.  The UPPER type is for the
+  // upper part of a key, after the key has been masked and shifted
+  // for inclusion in an entry.
+  typedef T UPPER;
+
+  static V EntryToValue(T t) { return t & kValueMask; }
+
+  // If we have space for a whole key, we just shift it left.
+  // Otherwise kHashbits determines where in a K to find the upper
+  // part of the key, and kValuebits determines where in the entry to
+  // put it.
+  static UPPER KeyToUpper(K k) {
+    if (kUseWholeKeys) {
+      return static_cast<T>(k) << kValuebits;
+    } else {
+      const int shift = kHashbits - kValuebits;
+      // Assume kHashbits >= kValuebits.  It'd be easy to lift this assumption.
+      return static_cast<T>(k >> shift) & kUpperMask;
+    }
+  }
+
+  static size_t Hash(K key) {
+    return static_cast<size_t>(key) & N_ONES_(size_t, kHashbits);
+  }
+
+  // Does the entry match the relevant part of the given key?
+  static bool KeyMatch(T entry, K key) {
+    return kUseWholeKeys ?
+        (entry >> kValuebits == key) :
+        ((KeyToUpper(key) ^ entry) & kUpperMask) == 0;
+  }
+
+  static const int kTbits = 8 * sizeof(T);
+  static const int kUpperbits = kUseWholeKeys ? kKeybits : kKeybits - kHashbits;
+
+  // For masking a K.
+  static const K kKeyMask = N_ONES_(K, kKeybits);
+
+  // For masking a T.
+  static const T kUpperMask = N_ONES_(T, kUpperbits) << kValuebits;
+
+  // For masking a V or a T.
+  static const V kValueMask = N_ONES_(V, kValuebits);
+
+  // array_ is the cache.  Its elements are volatile because any
+  // thread can write any array element at any time.
+  volatile T array_[1 << kHashbits];
+};
+
+#undef N_ONES_
+
+#endif  // TCMALLOC_PACKED_CACHE_INL_H_
diff --git a/src/page_heap.cc b/src/page_heap.cc
new file mode 100644
index 0000000..f52ae2a
--- /dev/null
+++ b/src/page_heap.cc
@@ -0,0 +1,682 @@
+// -*- Mode: C++; c-basic-offset: 2; indent-tabs-mode: nil -*-
+// Copyright (c) 2008, Google Inc.
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+//     * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+//     * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+// ---
+// Author: Sanjay Ghemawat <opensource@google.com>
+
+#include <config.h>
+#ifdef HAVE_INTTYPES_H
+#include <inttypes.h>                   // for PRIuPTR
+#endif
+#include <errno.h>                      // for ENOMEM, errno
+#include <gperftools/malloc_extension.h>      // for MallocRange, etc
+#include "base/basictypes.h"
+#include "base/commandlineflags.h"
+#include "internal_logging.h"  // for ASSERT, TCMalloc_Printer, etc
+#include "page_heap_allocator.h"  // for PageHeapAllocator
+#include "static_vars.h"       // for Static
+#include "system-alloc.h"      // for TCMalloc_SystemAlloc, etc
+
+DEFINE_double(tcmalloc_release_rate,
+              EnvToDouble("TCMALLOC_RELEASE_RATE", 1.0),
+              "Rate at which we release unused memory to the system.  "
+              "Zero means we never release memory back to the system.  "
+              "Increase this flag to return memory faster; decrease it "
+              "to return memory slower.  Reasonable rates are in the "
+              "range [0,10]");
+
+DEFINE_int64(tcmalloc_heap_limit_mb,
+              EnvToInt("TCMALLOC_HEAP_LIMIT_MB", 0),
+              "Limit total size of the process heap to the "
+              "specified number of MiB. "
+              "When we approach the limit the memory is released "
+              "to the system more aggressively (more minor page faults). "
+              "Zero means to allocate as long as system allows.");
+
+namespace tcmalloc {
+
+PageHeap::PageHeap()
+    : pagemap_(MetaDataAlloc),
+      pagemap_cache_(0),
+      scavenge_counter_(0),
+      // Start scavenging at kMaxPages list
+      release_index_(kMaxPages),
+      aggressive_decommit_(false) {
+  COMPILE_ASSERT(kNumClasses <= (1 << PageMapCache::kValuebits), valuebits);
+  DLL_Init(&large_.normal);
+  DLL_Init(&large_.returned);
+  for (int i = 0; i < kMaxPages; i++) {
+    DLL_Init(&free_[i].normal);
+    DLL_Init(&free_[i].returned);
+  }
+}
+
+Span* PageHeap::SearchFreeAndLargeLists(Length n) {
+  ASSERT(Check());
+  ASSERT(n > 0);
+
+  // Find first size >= n that has a non-empty list
+  for (Length s = n; s < kMaxPages; s++) {
+    Span* ll = &free_[s].normal;
+    // If we're lucky, ll is non-empty, meaning it has a suitable span.
+    if (!DLL_IsEmpty(ll)) {
+      ASSERT(ll->next->location == Span::ON_NORMAL_FREELIST);
+      return Carve(ll->next, n);
+    }
+    // Alternatively, maybe there's a usable returned span.
+    ll = &free_[s].returned;
+    if (!DLL_IsEmpty(ll)) {
+      // We did not call EnsureLimit before, to avoid releasing a span
+      // that would be taken back immediately.
+      // Calling EnsureLimit here is not very expensive, as it fails only if
+      // there are no more normal spans (and it fails efficiently)
+      // or SystemRelease does not work (in which case there are probably no
+      // returned spans).
+      if (EnsureLimit(n)) {
+        // ll may have become empty due to coalescing
+        if (!DLL_IsEmpty(ll)) {
+          ASSERT(ll->next->location == Span::ON_RETURNED_FREELIST);
+          return Carve(ll->next, n);
+        }
+      }
+    }
+  }
+  // No luck in free lists, our last chance is in a larger class.
+  return AllocLarge(n);  // May be NULL
+}
+
+static const size_t kForcedCoalesceInterval = 128*1024*1024;
+
+Span* PageHeap::New(Length n) {
+  ASSERT(Check());
+  ASSERT(n > 0);
+
+  Span* result = SearchFreeAndLargeLists(n);
+  if (result != NULL)
+    return result;
+
+  if (stats_.free_bytes != 0 && stats_.unmapped_bytes != 0
+      && stats_.free_bytes + stats_.unmapped_bytes >= stats_.system_bytes / 4
+      && (stats_.system_bytes / kForcedCoalesceInterval
+          != (stats_.system_bytes + (n << kPageShift)) / kForcedCoalesceInterval)) {
+    // We're about to grow the heap, but there are lots of free pages.
+    // tcmalloc's design decision to keep unmapped and free spans
+    // separate and never coalesce them means that sometimes there
+    // can be a free run of pages of sufficient size, but it consists of
+    // "segments" of different types, so the page heap search cannot find
+    // it.  In order to prevent growing the heap and wasting memory in such
+    // a case, we unmap all free pages, so that all free spans are
+    // maximally coalesced.
+    //
+    // We also limit the 'rate' of taking this path to at most once per
+    // 128 megs of heap growth.  Otherwise programs that grow the heap
+    // frequently (which means by small amounts) could be penalized with a
+    // higher count of minor page faults.
+    //
+    // See also large_heap_fragmentation_unittest.cc and
+    // https://code.google.com/p/gperftools/issues/detail?id=368
+    ReleaseAtLeastNPages(static_cast<Length>(0x7fffffff));
+
+    // then try again.  If we are forced to grow the heap because of
+    // fragmentation of large spans and not because of the problem described
+    // above, then at the very least we've just unmapped free but
+    // insufficiently big large spans back to the OS.  So in case of really
+    // unlucky memory fragmentation we'll be consuming virtual address
+    // space, but not real memory.
+    result = SearchFreeAndLargeLists(n);
+    if (result != NULL) return result;
+  }
+
+  // Grow the heap and try again.
+  if (!GrowHeap(n)) {
+    ASSERT(stats_.unmapped_bytes+ stats_.committed_bytes==stats_.system_bytes);
+    ASSERT(Check());
+    // underlying SysAllocator likely set ENOMEM but we can get here
+    // due to EnsureLimit so we set it here too.
+    //
+    // Setting errno to ENOMEM here allows us to avoid dealing with it
+    // in fast-path.
+    errno = ENOMEM;
+    return NULL;
+  }
+  return SearchFreeAndLargeLists(n);
+}
+
+Span* PageHeap::AllocLarge(Length n) {
+  // Find the best span (closest to n in size).
+  // The following loops implement address-ordered best-fit.
+  Span *best = NULL;
+
+  // Search through normal list
+  for (Span* span = large_.normal.next;
+       span != &large_.normal;
+       span = span->next) {
+    if (span->length >= n) {
+      if ((best == NULL)
+          || (span->length < best->length)
+          || ((span->length == best->length) && (span->start < best->start))) {
+        best = span;
+        ASSERT(best->location == Span::ON_NORMAL_FREELIST);
+      }
+    }
+  }
+
+  Span *bestNormal = best;
+
+  // Search through released list in case it has a better fit
+  for (Span* span = large_.returned.next;
+       span != &large_.returned;
+       span = span->next) {
+    if (span->length >= n) {
+      if ((best == NULL)
+          || (span->length < best->length)
+          || ((span->length == best->length) && (span->start < best->start))) {
+        best = span;
+        ASSERT(best->location == Span::ON_RETURNED_FREELIST);
+      }
+    }
+  }
+
+  if (best == bestNormal) {
+    return best == NULL ? NULL : Carve(best, n);
+  }
+
+  // best comes from returned list.
+
+  if (EnsureLimit(n, false)) {
+    return Carve(best, n);
+  }
+
+  if (EnsureLimit(n, true)) {
+    // best could have been destroyed by coalescing.
+    // bestNormal is not a best-fit, and it could be destroyed as well.
+    // We retry, the limit is already ensured:
+    return AllocLarge(n);
+  }
+
+  // If bestNormal existed, EnsureLimit would have succeeded:
+  ASSERT(bestNormal == NULL);
+  // We are not allowed to take best from returned list.
+  return NULL;
+}
+
+Span* PageHeap::Split(Span* span, Length n) {
+  ASSERT(0 < n);
+  ASSERT(n < span->length);
+  ASSERT(span->location == Span::IN_USE);
+  ASSERT(span->sizeclass == 0);
+  Event(span, 'T', n);
+
+  const int extra = span->length - n;
+  Span* leftover = NewSpan(span->start + n, extra);
+  ASSERT(leftover->location == Span::IN_USE);
+  Event(leftover, 'U', extra);
+  RecordSpan(leftover);
+  pagemap_.set(span->start + n - 1, span); // Update map from pageid to span
+  span->length = n;
+
+  return leftover;
+}
+
+void PageHeap::CommitSpan(Span* span) {
+  TCMalloc_SystemCommit(reinterpret_cast<void*>(span->start << kPageShift),
+                        static_cast<size_t>(span->length << kPageShift));
+  stats_.committed_bytes += span->length << kPageShift;
+}
+
+bool PageHeap::DecommitSpan(Span* span) {
+  bool rv = TCMalloc_SystemRelease(reinterpret_cast<void*>(span->start << kPageShift),
+                                   static_cast<size_t>(span->length << kPageShift));
+  if (rv) {
+    stats_.committed_bytes -= span->length << kPageShift;
+  }
+
+  return rv;
+}
+
+Span* PageHeap::Carve(Span* span, Length n) {
+  ASSERT(n > 0);
+  ASSERT(span->location != Span::IN_USE);
+  const int old_location = span->location;
+  RemoveFromFreeList(span);
+  span->location = Span::IN_USE;
+  Event(span, 'A', n);
+
+  const int extra = span->length - n;
+  ASSERT(extra >= 0);
+  if (extra > 0) {
+    Span* leftover = NewSpan(span->start + n, extra);
+    leftover->location = old_location;
+    Event(leftover, 'S', extra);
+    RecordSpan(leftover);
+
+    // The previous span of |leftover| was just split off -- no need to
+    // coalesce them.  The next span of |leftover| was not previously coalesced
+    // with |span|, i.e. it is NULL or has a location other than |old_location|.
+#ifndef NDEBUG
+    const PageID p = leftover->start;
+    const Length len = leftover->length;
+    Span* next = GetDescriptor(p+len);
+    ASSERT (next == NULL ||
+            next->location == Span::IN_USE ||
+            next->location != leftover->location);
+#endif
+
+    PrependToFreeList(leftover);  // Skip coalescing - no candidates possible
+    span->length = n;
+    pagemap_.set(span->start + n - 1, span);
+  }
+  ASSERT(Check());
+  if (old_location == Span::ON_RETURNED_FREELIST) {
+    // We need to recommit this address space.
+    CommitSpan(span);
+  }
+  ASSERT(span->location == Span::IN_USE);
+  ASSERT(span->length == n);
+  ASSERT(stats_.unmapped_bytes+ stats_.committed_bytes==stats_.system_bytes);
+  return span;
+}
+
+void PageHeap::Delete(Span* span) {
+  ASSERT(Check());
+  ASSERT(span->location == Span::IN_USE);
+  ASSERT(span->length > 0);
+  ASSERT(GetDescriptor(span->start) == span);
+  ASSERT(GetDescriptor(span->start + span->length - 1) == span);
+  const Length n = span->length;
+  span->sizeclass = 0;
+  span->sample = 0;
+  span->location = Span::ON_NORMAL_FREELIST;
+  Event(span, 'D', span->length);
+  MergeIntoFreeList(span);  // Coalesces if possible
+  IncrementalScavenge(n);
+  ASSERT(stats_.unmapped_bytes+ stats_.committed_bytes==stats_.system_bytes);
+  ASSERT(Check());
+}
+
+bool PageHeap::MayMergeSpans(Span *span, Span *other) {
+  if (aggressive_decommit_) {
+    return other->location != Span::IN_USE;
+  }
+  return span->location == other->location;
+}
+
+void PageHeap::MergeIntoFreeList(Span* span) {
+  ASSERT(span->location != Span::IN_USE);
+
+  // Coalesce -- we guarantee that "p" != 0, so no bounds checking
+  // necessary.  We do not bother resetting the stale pagemap
+  // entries for the pieces we are merging together because we only
+  // care about the pagemap entries for the boundaries.
+  //
+  // Note: depending on aggressive_decommit_ mode we allow only
+  // similar spans to be coalesced.
+  //
+  // The following applies if aggressive_decommit_ is enabled:
+  //
+  // Note that the adjacent spans we merge into "span" may come out of a
+  // "normal" (committed) list, and cleanly merge with our IN_USE span, which
+  // is implicitly committed.  If the adjacent spans are on the "returned"
+  // (decommitted) list, then we must get both spans into the same state before
+  // or after we coalesce them.  The current code always decommits.  This is
+  // achieved by blindly decommitting the entire coalesced region, which may
+  // include any combination of committed and decommitted spans, at the end of
+  // the method.
+
+  // TODO(jar): "Always decommit" causes some extra calls to commit when we are
+  // called in GrowHeap() during an allocation :-/.  We need to evaluate the
+  // cost of that oscillation, and possibly do something to reduce it.
+
+  // TODO(jar): We need a better strategy for deciding to commit, or decommit,
+  // based on memory usage and free heap sizes.
+
+  uint64_t temp_committed = 0;
+
+  const PageID p = span->start;
+  const Length n = span->length;
+  Span* prev = GetDescriptor(p-1);
+  if (prev != NULL && MayMergeSpans(span, prev)) {
+    // Merge preceding span into this span
+    ASSERT(prev->start + prev->length == p);
+    const Length len = prev->length;
+    if (aggressive_decommit_ && prev->location == Span::ON_RETURNED_FREELIST) {
+      // We're about to put the merged span on the returned freelist and call
+      // DecommitSpan() on it, which will mark the entire span including this
+      // one as released and decrease stats_.committed_bytes by the size of the
+      // merged span.  To make the math work out we temporarily increase the
+      // stats_.committed_bytes amount.
+      temp_committed = prev->length << kPageShift;
+    }
+    RemoveFromFreeList(prev);
+    DeleteSpan(prev);
+    span->start -= len;
+    span->length += len;
+    pagemap_.set(span->start, span);
+    Event(span, 'L', len);
+  }
+  Span* next = GetDescriptor(p+n);
+  if (next != NULL && MayMergeSpans(span, next)) {
+    // Merge next span into this span
+    ASSERT(next->start == p+n);
+    const Length len = next->length;
+    if (aggressive_decommit_ && next->location == Span::ON_RETURNED_FREELIST) {
+      // See the comment in the 'prev' merge branch above for explanation.
+      temp_committed += next->length << kPageShift;
+    }
+    RemoveFromFreeList(next);
+    DeleteSpan(next);
+    span->length += len;
+    pagemap_.set(span->start + span->length - 1, span);
+    Event(span, 'R', len);
+  }
+
+  if (aggressive_decommit_) {
+    if (DecommitSpan(span)) {
+      span->location = Span::ON_RETURNED_FREELIST;
+      stats_.committed_bytes += temp_committed;
+    } else {
+      ASSERT(temp_committed == 0);
+    }
+  }
+  PrependToFreeList(span);
+}
+
+void PageHeap::PrependToFreeList(Span* span) {
+  ASSERT(span->location != Span::IN_USE);
+  SpanList* list = (span->length < kMaxPages) ? &free_[span->length] : &large_;
+  if (span->location == Span::ON_NORMAL_FREELIST) {
+    stats_.free_bytes += (span->length << kPageShift);
+    DLL_Prepend(&list->normal, span);
+  } else {
+    stats_.unmapped_bytes += (span->length << kPageShift);
+    DLL_Prepend(&list->returned, span);
+  }
+}
+
+void PageHeap::RemoveFromFreeList(Span* span) {
+  ASSERT(span->location != Span::IN_USE);
+  if (span->location == Span::ON_NORMAL_FREELIST) {
+    stats_.free_bytes -= (span->length << kPageShift);
+  } else {
+    stats_.unmapped_bytes -= (span->length << kPageShift);
+  }
+  DLL_Remove(span);
+}
+
+void PageHeap::IncrementalScavenge(Length n) {
+  // Fast path; not yet time to release memory
+  scavenge_counter_ -= n;
+  if (scavenge_counter_ >= 0) return;  // Not yet time to scavenge
+
+  const double rate = FLAGS_tcmalloc_release_rate;
+  if (rate <= 1e-6) {
+    // Tiny release rate means that releasing is disabled.
+    scavenge_counter_ = kDefaultReleaseDelay;
+    return;
+  }
+
+  Length released_pages = ReleaseAtLeastNPages(1);
+
+  if (released_pages == 0) {
+    // Nothing to scavenge, delay for a while.
+    scavenge_counter_ = kDefaultReleaseDelay;
+  } else {
+    // Compute how long to wait until we return memory.
+    // FLAGS_tcmalloc_release_rate==1 means wait for 1000 pages
+    // after releasing one page.
+    const double mult = 1000.0 / rate;
+    double wait = mult * static_cast<double>(released_pages);
+    if (wait > kMaxReleaseDelay) {
+      // Avoid overflow and bound to reasonable range.
+      wait = kMaxReleaseDelay;
+    }
+    scavenge_counter_ = static_cast<int64_t>(wait);
+  }
+}
+
+Length PageHeap::ReleaseLastNormalSpan(SpanList* slist) {
+  Span* s = slist->normal.prev;
+  ASSERT(s->location == Span::ON_NORMAL_FREELIST);
+
+  if (DecommitSpan(s)) {
+    RemoveFromFreeList(s);
+    const Length n = s->length;
+    s->location = Span::ON_RETURNED_FREELIST;
+    MergeIntoFreeList(s);  // Coalesces if possible.
+    return n;
+  }
+
+  return 0;
+}
+
+Length PageHeap::ReleaseAtLeastNPages(Length num_pages) {
+  Length released_pages = 0;
+
+  // Round robin through the lists of free spans, releasing the last
+  // span in each list.  Stop after releasing at least num_pages
+  // or when there is nothing more to release.
+  while (released_pages < num_pages && stats_.free_bytes > 0) {
+    for (int i = 0; i < kMaxPages+1 && released_pages < num_pages;
+         i++, release_index_++) {
+      if (release_index_ > kMaxPages) release_index_ = 0;
+      SpanList* slist = (release_index_ == kMaxPages) ?
+          &large_ : &free_[release_index_];
+      if (!DLL_IsEmpty(&slist->normal)) {
+        Length released_len = ReleaseLastNormalSpan(slist);
+        // Some systems do not support release
+        if (released_len == 0) return released_pages;
+        released_pages += released_len;
+      }
+    }
+  }
+  return released_pages;
+}
+
+bool PageHeap::EnsureLimit(Length n, bool withRelease)
+{
+  Length limit = (FLAGS_tcmalloc_heap_limit_mb*1024*1024) >> kPageShift;
+  if (limit == 0) return true; //there is no limit
+
+  // We do not use stats_.system_bytes because it does not take
+  // MetaDataAllocs into account.
+  Length takenPages = TCMalloc_SystemTaken >> kPageShift;
+  // XXX: takenPages may be slightly bigger than limit for two reasons:
+  //  * MetaDataAllocs ignore the limit (it is not easy to handle
+  //    out of memory there)
+  //  * sys_alloc may round the allocation up to the huge page size,
+  //    even though a smaller limit was ensured
+
+  ASSERT(takenPages >= stats_.unmapped_bytes >> kPageShift);
+  takenPages -= stats_.unmapped_bytes >> kPageShift;
+
+  if (takenPages + n > limit && withRelease) {
+    takenPages -= ReleaseAtLeastNPages(takenPages + n - limit);
+  }
+
+  return takenPages + n <= limit;
+}
+
+void PageHeap::RegisterSizeClass(Span* span, size_t sc) {
+  // Associate span object with all interior pages as well
+  ASSERT(span->location == Span::IN_USE);
+  ASSERT(GetDescriptor(span->start) == span);
+  ASSERT(GetDescriptor(span->start+span->length-1) == span);
+  Event(span, 'C', sc);
+  span->sizeclass = sc;
+  for (Length i = 1; i < span->length-1; i++) {
+    pagemap_.set(span->start+i, span);
+  }
+}
+
+void PageHeap::GetSmallSpanStats(SmallSpanStats* result) {
+  for (int s = 0; s < kMaxPages; s++) {
+    result->normal_length[s] = DLL_Length(&free_[s].normal);
+    result->returned_length[s] = DLL_Length(&free_[s].returned);
+  }
+}
+
+void PageHeap::GetLargeSpanStats(LargeSpanStats* result) {
+  result->spans = 0;
+  result->normal_pages = 0;
+  result->returned_pages = 0;
+  for (Span* s = large_.normal.next; s != &large_.normal; s = s->next) {
+    result->normal_pages += s->length;
+    result->spans++;
+  }
+  for (Span* s = large_.returned.next; s != &large_.returned; s = s->next) {
+    result->returned_pages += s->length;
+    result->spans++;
+  }
+}
+
+bool PageHeap::GetNextRange(PageID start, base::MallocRange* r) {
+  Span* span = reinterpret_cast<Span*>(pagemap_.Next(start));
+  if (span == NULL) {
+    return false;
+  }
+  r->address = span->start << kPageShift;
+  r->length = span->length << kPageShift;
+  r->fraction = 0;
+  switch (span->location) {
+    case Span::IN_USE:
+      r->type = base::MallocRange::INUSE;
+      r->fraction = 1;
+      if (span->sizeclass > 0) {
+        // Only some of the objects in this span may be in use.
+        const size_t osize = Static::sizemap()->class_to_size(span->sizeclass);
+        r->fraction = (1.0 * osize * span->refcount) / r->length;
+      }
+      break;
+    case Span::ON_NORMAL_FREELIST:
+      r->type = base::MallocRange::FREE;
+      break;
+    case Span::ON_RETURNED_FREELIST:
+      r->type = base::MallocRange::UNMAPPED;
+      break;
+    default:
+      r->type = base::MallocRange::UNKNOWN;
+      break;
+  }
+  return true;
+}
+
+static void RecordGrowth(size_t growth) {
+  StackTrace* t = Static::stacktrace_allocator()->New();
+  t->depth = GetStackTrace(t->stack, kMaxStackDepth-1, 3);
+  t->size = growth;
+  t->stack[kMaxStackDepth-1] = reinterpret_cast<void*>(Static::growth_stacks());
+  Static::set_growth_stacks(t);
+}
+
+bool PageHeap::GrowHeap(Length n) {
+  ASSERT(kMaxPages >= kMinSystemAlloc);
+  if (n > kMaxValidPages) return false;
+  Length ask = (n>kMinSystemAlloc) ? n : static_cast<Length>(kMinSystemAlloc);
+  size_t actual_size;
+  void* ptr = NULL;
+  if (EnsureLimit(ask)) {
+      ptr = TCMalloc_SystemAlloc(ask << kPageShift, &actual_size, kPageSize);
+  }
+  if (ptr == NULL) {
+    if (n < ask) {
+      // Try growing just "n" pages
+      ask = n;
+      if (EnsureLimit(ask)) {
+        ptr = TCMalloc_SystemAlloc(ask << kPageShift, &actual_size, kPageSize);
+      }
+    }
+    if (ptr == NULL) return false;
+  }
+  ask = actual_size >> kPageShift;
+  RecordGrowth(ask << kPageShift);
+
+  uint64_t old_system_bytes = stats_.system_bytes;
+  stats_.system_bytes += (ask << kPageShift);
+  stats_.committed_bytes += (ask << kPageShift);
+  const PageID p = reinterpret_cast<uintptr_t>(ptr) >> kPageShift;
+  ASSERT(p > 0);
+
+  // If we already have a lot of pages allocated, just preallocate a bunch of
+  // memory for the page map.  This prevents fragmentation by pagemap metadata
+  // when a program keeps allocating and freeing large blocks.
+
+  if (old_system_bytes < kPageMapBigAllocationThreshold
+      && stats_.system_bytes >= kPageMapBigAllocationThreshold) {
+    pagemap_.PreallocateMoreMemory();
+  }
+
+  // Make sure pagemap_ has entries for all of the new pages.
+  // Plus ensure one before and one after so coalescing code
+  // does not need bounds-checking.
+  if (pagemap_.Ensure(p-1, ask+2)) {
+    // Pretend the new area is allocated and then Delete() it to cause
+    // any necessary coalescing to occur.
+    Span* span = NewSpan(p, ask);
+    RecordSpan(span);
+    Delete(span);
+    ASSERT(stats_.unmapped_bytes+ stats_.committed_bytes==stats_.system_bytes);
+    ASSERT(Check());
+    return true;
+  } else {
+    // We could not allocate memory within "pagemap_"
+    // TODO: Once we can return memory to the system, return the new span
+    return false;
+  }
+}
+
+bool PageHeap::Check() {
+  ASSERT(free_[0].normal.next == &free_[0].normal);
+  ASSERT(free_[0].returned.next == &free_[0].returned);
+  return true;
+}
+
+bool PageHeap::CheckExpensive() {
+  bool result = Check();
+  CheckList(&large_.normal, kMaxPages, 1000000000, Span::ON_NORMAL_FREELIST);
+  CheckList(&large_.returned, kMaxPages, 1000000000, Span::ON_RETURNED_FREELIST);
+  for (Length s = 1; s < kMaxPages; s++) {
+    CheckList(&free_[s].normal, s, s, Span::ON_NORMAL_FREELIST);
+    CheckList(&free_[s].returned, s, s, Span::ON_RETURNED_FREELIST);
+  }
+  return result;
+}
+
+bool PageHeap::CheckList(Span* list, Length min_pages, Length max_pages,
+                         int freelist) {
+  for (Span* s = list->next; s != list; s = s->next) {
+    CHECK_CONDITION(s->location == freelist);  // NORMAL or RETURNED
+    CHECK_CONDITION(s->length >= min_pages);
+    CHECK_CONDITION(s->length <= max_pages);
+    CHECK_CONDITION(GetDescriptor(s->start) == s);
+    CHECK_CONDITION(GetDescriptor(s->start+s->length-1) == s);
+  }
+  return true;
+}
+
+}  // namespace tcmalloc
diff --git a/src/page_heap.h b/src/page_heap.h
new file mode 100644
index 0000000..18abed1
--- /dev/null
+++ b/src/page_heap.h
@@ -0,0 +1,316 @@
+// -*- Mode: C++; c-basic-offset: 2; indent-tabs-mode: nil -*-
+// Copyright (c) 2008, Google Inc.
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+//     * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+//     * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+// ---
+// Author: Sanjay Ghemawat <opensource@google.com>
+
+#ifndef TCMALLOC_PAGE_HEAP_H_
+#define TCMALLOC_PAGE_HEAP_H_
+
+#include <config.h>
+#include <stddef.h>                     // for size_t
+#ifdef HAVE_STDINT_H
+#include <stdint.h>                     // for uint64_t, int64_t, uint16_t
+#endif
+#include <gperftools/malloc_extension.h>
+#include "base/basictypes.h"
+#include "common.h"
+#include "packed-cache-inl.h"
+#include "pagemap.h"
+#include "span.h"
+
+// We need to dllexport PageHeap just for the unittest.  MSVC complains
+// that we don't dllexport the PageHeap members, but we don't need to
+// test those, so I just suppress this warning.
+#ifdef _MSC_VER
+#pragma warning(push)
+#pragma warning(disable:4251)
+#endif
+
+// This #ifdef should almost never be set.  Set NO_TCMALLOC_SAMPLES if
+// you're porting to a system where you really can't get a stacktrace.
+// Because we control the definition of GetStackTrace, all clients of
+// GetStackTrace should #include us rather than stacktrace.h.
+#ifdef NO_TCMALLOC_SAMPLES
+  // We use #define so code compiles even if you #include stacktrace.h somehow.
+# define GetStackTrace(stack, depth, skip)  (0)
+#else
+# include <gperftools/stacktrace.h>
+#endif
+
+namespace base {
+struct MallocRange;
+}
+
+namespace tcmalloc {
+
+// -------------------------------------------------------------------------
+// Map from page-id to per-page data
+// -------------------------------------------------------------------------
+
+// We use PageMap2<> for 32-bit and PageMap3<> for 64-bit machines.
+// We also use a simple one-level cache for hot PageID-to-sizeclass mappings,
+// because sometimes the sizeclass is all the information we need.
+
+// Selector class -- general selector uses 3-level map
+template <int BITS> class MapSelector {
+ public:
+  typedef TCMalloc_PageMap3<BITS-kPageShift> Type;
+  typedef PackedCache<BITS-kPageShift, uint64_t> CacheType;
+};
+
+// A two-level map for 32-bit machines
+template <> class MapSelector<32> {
+ public:
+  typedef TCMalloc_PageMap2<32-kPageShift> Type;
+  typedef PackedCache<32-kPageShift, uint16_t> CacheType;
+};
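+
+// For example, on a 64-bit build with kAddressBits == 48 and 8K pages
+// (kPageShift == 13), MapSelector<48>::Type is TCMalloc_PageMap3<35> and the
+// cache packs 35-bit page numbers into uint64_t entries; these constants
+// come from common.h and vary with the build configuration.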
+
+// -------------------------------------------------------------------------
+// Page-level allocator
+//  * Eager coalescing
+//
+// Heap for page-level allocation.  We allow allocating and freeing
+// contiguous runs of pages (called a "span").
+// -------------------------------------------------------------------------
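+// A sketch of the typical call sequence (illustrative only; it assumes the
+// Static::pageheap() accessor from static_vars.h and a size class "cl"):
+//
+//   Span* span = Static::pageheap()->New(8);        // 8 contiguous pages
+//   if (span != NULL) {
+//     Static::pageheap()->RegisterSizeClass(span, cl);  // small-object span
+//     ...
+//     Static::pageheap()->Delete(span);   // free; may coalesce or decommit
+//   }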
+
+class PERFTOOLS_DLL_DECL PageHeap {
+ public:
+  PageHeap();
+
+  // Allocate a run of "n" pages.  Returns NULL if out of memory.
+  // Caller should not pass "n == 0" -- instead, n should have
+  // been rounded up already.
+  Span* New(Length n);
+
+  // Delete the span "[p, p+n-1]".
+  // REQUIRES: span was returned by earlier call to New() and
+  //           has not yet been deleted.
+  void Delete(Span* span);
+
+  // Mark an allocated span as being used for small objects of the
+  // specified size-class.
+  // REQUIRES: span was returned by an earlier call to New()
+  //           and has not yet been deleted.
+  void RegisterSizeClass(Span* span, size_t sc);
+
+  // Split an allocated span into two spans: one of length "n" pages
+  // followed by another span of length "span->length - n" pages.
+  // Modifies "*span" to point to the first span of length "n" pages.
+  // Returns a pointer to the second span.
+  //
+  // REQUIRES: "0 < n < span->length"
+  // REQUIRES: span->location == IN_USE
+  // REQUIRES: span->sizeclass == 0
+  Span* Split(Span* span, Length n);
+
+  // Return the descriptor for the specified page.  Returns NULL if
+  // this PageID was not allocated previously.
+  inline Span* GetDescriptor(PageID p) const {
+    return reinterpret_cast<Span*>(pagemap_.get(p));
+  }
+
+  // If this page heap is managing a range with starting page # >= start,
+  // store info about the range in *r and return true.  Else return false.
+  bool GetNextRange(PageID start, base::MallocRange* r);
+
+  // Page heap statistics
+  struct Stats {
+    Stats() : system_bytes(0), free_bytes(0), unmapped_bytes(0), committed_bytes(0) {}
+    uint64_t system_bytes;    // Total bytes allocated from system
+    uint64_t free_bytes;      // Total bytes on normal freelists
+    uint64_t unmapped_bytes;  // Total bytes on returned freelists
+    uint64_t committed_bytes;  // Bytes committed, always <= system_bytes.
+  };
+  inline Stats stats() const { return stats_; }
+
+  struct SmallSpanStats {
+    // For each free list of small spans, the length (in spans) of the
+    // normal and returned free lists for that size.
+    int64 normal_length[kMaxPages];
+    int64 returned_length[kMaxPages];
+  };
+  void GetSmallSpanStats(SmallSpanStats* result);
+
+  // Stats for free large spans (i.e., spans with more than kMaxPages pages).
+  struct LargeSpanStats {
+    int64 spans;           // Number of such spans
+    int64 normal_pages;    // Combined page length of normal large spans
+    int64 returned_pages;  // Combined page length of unmapped spans
+  };
+  void GetLargeSpanStats(LargeSpanStats* result);
+
+  bool Check();
+  // Like Check() but does some more comprehensive checking.
+  bool CheckExpensive();
+  bool CheckList(Span* list, Length min_pages, Length max_pages,
+                 int freelist);  // ON_NORMAL_FREELIST or ON_RETURNED_FREELIST
+
+  // Try to release at least num_pages for reuse by the OS.  Returns
+  // the actual number of pages released, which may be less than
+  // num_pages if there weren't enough pages to release. The result
+  // may also be larger than num_pages since page_heap might decide to
+  // release one large range instead of fragmenting it into two
+  // smaller released and unreleased ranges.
+  Length ReleaseAtLeastNPages(Length num_pages);
+
+  // Return 0 if we have no information, or else the correct sizeclass for p.
+  // Reads and writes to pagemap_cache_ do not require locking.
+  // The entries are 64 bits on 64-bit hardware and 16 bits on
+  // 32-bit hardware, and we don't mind raciness as long as each read of
+  // an entry yields a valid entry, not a partially updated entry.
+  size_t GetSizeClassIfCached(PageID p) const {
+    return pagemap_cache_.GetOrDefault(p, 0);
+  }
+  void CacheSizeClass(PageID p, size_t cl) const { pagemap_cache_.Put(p, cl); }
+
+  bool GetAggressiveDecommit(void) { return aggressive_decommit_; }
+  void SetAggressiveDecommit(bool aggressive_decommit) {
+    aggressive_decommit_ = aggressive_decommit;
+  }
+
+ private:
+  // Allocates a big block of memory for the pagemap once we reach more than
+  // 128MB
+  static const size_t kPageMapBigAllocationThreshold = 128 << 20;
+
+  // Minimum number of pages to fetch from system at a time.  Must be
+  // significantly bigger than kBlockSize to amortize system-call
+  // overhead, and also to reduce external fragmentation.  Also, we
+  // should keep this value big because various incarnations of Linux
+  // have small limits on the number of mmap() regions per
+  // address-space.
+  // REQUIRED: kMinSystemAlloc <= kMaxPages;
+  static const int kMinSystemAlloc = kMaxPages;
+
+  // Never delay scavenging for more than the following number of
+  // deallocated pages.  With 4K pages, this comes to 4GB of
+  // deallocation.
+  static const int kMaxReleaseDelay = 1 << 20;
+
+  // If there is nothing to release, wait for so many pages before
+  // scavenging again.  With 4K pages, this comes to 1GB of memory.
+  static const int kDefaultReleaseDelay = 1 << 18;
+
+  // Pick the appropriate map and cache types based on pointer size
+  typedef MapSelector<kAddressBits>::Type PageMap;
+  typedef MapSelector<kAddressBits>::CacheType PageMapCache;
+  PageMap pagemap_;
+  mutable PageMapCache pagemap_cache_;
+
+  // We segregate spans of a given size into two circular linked
+  // lists: one for normal spans, and one for spans whose memory
+  // has been returned to the system.
+  struct SpanList {
+    Span        normal;
+    Span        returned;
+  };
+
+  // List of free spans of length >= kMaxPages
+  SpanList large_;
+
+  // Array mapping from span length to a doubly linked list of free spans
+  SpanList free_[kMaxPages];
+
+  // Statistics on system, free, and unmapped bytes
+  Stats stats_;
+
+  Span* SearchFreeAndLargeLists(Length n);
+
+  bool GrowHeap(Length n);
+
+  // REQUIRES: span->length >= n
+  // REQUIRES: span->location != IN_USE
+  // Remove span from its free list, and move any leftover part of
+  // span into appropriate free lists.  Also update "span" to have
+  // length exactly "n" and mark it as non-free so it can be returned
+  // to the client.  After all that, adjust the free/unmapped byte stats and
+  // return span.
+  Span* Carve(Span* span, Length n);
+
+  void RecordSpan(Span* span) {
+    pagemap_.set(span->start, span);
+    if (span->length > 1) {
+      pagemap_.set(span->start + span->length - 1, span);
+    }
+  }
+
+  // Allocate a large span of length == n.  If successful, returns a
+  // span of exactly the specified length.  Else, returns NULL.
+  Span* AllocLarge(Length n);
+
+  // Coalesce span with neighboring spans if possible, prepend to
+  // appropriate free list, and adjust stats.
+  void MergeIntoFreeList(Span* span);
+
+  // Commit the span.
+  void CommitSpan(Span* span);
+
+  // Decommit the span.
+  bool DecommitSpan(Span* span);
+
+  // Prepends span to appropriate free list, and adjusts stats.
+  void PrependToFreeList(Span* span);
+
+  // Removes span from its free list, and adjust stats.
+  void RemoveFromFreeList(Span* span);
+
+  // Incrementally release some memory to the system.
+  // IncrementalScavenge(n) is called whenever n pages are freed.
+  void IncrementalScavenge(Length n);
+
+  // Release the last span on the normal portion of this list.
+  // Return the length of that span or zero if release failed.
+  Length ReleaseLastNormalSpan(SpanList* slist);
+
+  // Checks if we are allowed to take more memory from the system.
+  // If limit is reached and allowRelease is true, tries to release
+  // some unused spans.
+  bool EnsureLimit(Length n, bool allowRelease = true);
+
+  bool MayMergeSpans(Span *span, Span *other);
+
+  // Number of pages to deallocate before doing more scavenging
+  int64_t scavenge_counter_;
+
+  // Index of last free list where we released memory to the OS.
+  int release_index_;
+
+  bool aggressive_decommit_;
+};
+
+}  // namespace tcmalloc
+
+#ifdef _MSC_VER
+#pragma warning(pop)
+#endif
+
+#endif  // TCMALLOC_PAGE_HEAP_H_
diff --git a/src/page_heap_allocator.h b/src/page_heap_allocator.h
new file mode 100644
index 0000000..892d1c1
--- /dev/null
+++ b/src/page_heap_allocator.h
@@ -0,0 +1,114 @@
+// -*- Mode: C++; c-basic-offset: 2; indent-tabs-mode: nil -*-
+// Copyright (c) 2008, Google Inc.
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+//     * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+//     * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+// ---
+// Author: Sanjay Ghemawat <opensource@google.com>
+
+#ifndef TCMALLOC_PAGE_HEAP_ALLOCATOR_H_
+#define TCMALLOC_PAGE_HEAP_ALLOCATOR_H_
+
+#include <stddef.h>                     // for NULL, size_t
+
+#include "common.h"            // for MetaDataAlloc
+#include "internal_logging.h"  // for ASSERT
+
+namespace tcmalloc {
+
+// Simple allocator for objects of a specified type.  External locking
+// is required before accessing one of these objects.
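+//
+// A minimal usage sketch (illustrative; the real instances -- e.g. the Span
+// and StackTrace allocators -- are owned by Static in static_vars.*):
+//
+//   static PageHeapAllocator<Span> allocator;
+//   allocator.Init();            // must run before the first New()
+//   Span* s = allocator.New();   // from the free list or a MetaDataAlloc area
+//   ...
+//   allocator.Delete(s);         // pushes s back onto the free list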
+template <class T>
+class PageHeapAllocator {
+ public:
+  // We use an explicit Init function because these variables are statically
+  // allocated and their constructors might not have run by the time some
+  // other static variable tries to allocate memory.
+  void Init() {
+    ASSERT(sizeof(T) <= kAllocIncrement);
+    inuse_ = 0;
+    free_area_ = NULL;
+    free_avail_ = 0;
+    free_list_ = NULL;
+    // Reserve some space at the beginning to avoid fragmentation.
+    Delete(New());
+  }
+
+  T* New() {
+    // Consult free list
+    void* result;
+    if (free_list_ != NULL) {
+      result = free_list_;
+      free_list_ = *(reinterpret_cast<void**>(result));
+    } else {
+      if (free_avail_ < sizeof(T)) {
+        // Need more room. We assume that MetaDataAlloc returns
+        // suitably aligned memory.
+        free_area_ = reinterpret_cast<char*>(MetaDataAlloc(kAllocIncrement));
+        if (free_area_ == NULL) {
+          Log(kCrash, __FILE__, __LINE__,
+              "FATAL ERROR: Out of memory trying to allocate internal "
+              "tcmalloc data (bytes, object-size)",
+              kAllocIncrement, sizeof(T));
+        }
+        free_avail_ = kAllocIncrement;
+      }
+      result = free_area_;
+      free_area_ += sizeof(T);
+      free_avail_ -= sizeof(T);
+    }
+    inuse_++;
+    return reinterpret_cast<T*>(result);
+  }
+
+  void Delete(T* p) {
+    *(reinterpret_cast<void**>(p)) = free_list_;
+    free_list_ = p;
+    inuse_--;
+  }
+
+  int inuse() const { return inuse_; }
+
+ private:
+  // How much to allocate from system at a time
+  static const int kAllocIncrement = 128 << 10;
+
+  // Free area from which to carve new objects
+  char* free_area_;
+  size_t free_avail_;
+
+  // Free list of already carved objects
+  void* free_list_;
+
+  // Number of allocated but unfreed objects
+  int inuse_;
+};
+
+}  // namespace tcmalloc
+
+#endif  // TCMALLOC_PAGE_HEAP_ALLOCATOR_H_
diff --git a/src/pagemap.h b/src/pagemap.h
new file mode 100644
index 0000000..dd94423
--- /dev/null
+++ b/src/pagemap.h
@@ -0,0 +1,324 @@
+// -*- Mode: C++; c-basic-offset: 2; indent-tabs-mode: nil -*-
+// Copyright (c) 2005, Google Inc.
+// All rights reserved.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+// 
+//     * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+//     * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+// 
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+// ---
+// Author: Sanjay Ghemawat <opensource@google.com>
+//
+// A data structure used by the caching malloc.  It maps from page# to
+// a pointer that contains info about that page.  We use two
+// representations: one for 32-bit addresses, and another for 64-bit
+// addresses.  Both representations provide the same interface.  The
+// first representation is implemented as a flat array, the second as
+// a three-level radix tree that strips away approximately 1/3rd of
+// the bits every time.
+//
+// The BITS parameter should be the number of bits required to hold
+// a page number.  E.g., with 32 bit pointers and 4K pages (i.e.,
+// page offset fits in lower 12 bits), BITS == 20.
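+//
+// A further worked example (illustrative) for the three-level map defined
+// below: with 48-bit addresses and 8K pages (page offset in the lower 13
+// bits), BITS == 35, so TCMalloc_PageMap3 uses INTERIOR_BITS == (35+2)/3
+// == 12 at each of its two interior levels and LEAF_BITS == 35 - 2*12 == 11
+// at the leaves.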
+
+#ifndef TCMALLOC_PAGEMAP_H_
+#define TCMALLOC_PAGEMAP_H_
+
+#include "config.h"
+
+#include <stddef.h>                     // for NULL, size_t
+#include <string.h>                     // for memset
+#if defined HAVE_STDINT_H
+#include <stdint.h>
+#elif defined HAVE_INTTYPES_H
+#include <inttypes.h>
+#else
+#include <sys/types.h>
+#endif
+#include "internal_logging.h"  // for ASSERT
+
+// Single-level array
+template <int BITS>
+class TCMalloc_PageMap1 {
+ private:
+  static const int LENGTH = 1 << BITS;
+
+  void** array_;
+
+ public:
+  typedef uintptr_t Number;
+
+  explicit TCMalloc_PageMap1(void* (*allocator)(size_t)) {
+    array_ = reinterpret_cast<void**>((*allocator)(sizeof(void*) << BITS));
+    memset(array_, 0, sizeof(void*) << BITS);
+  }
+
+  // Ensure that the map contains initialized entries "x .. x+n-1".
+  // Returns true if successful, false if we could not allocate memory.
+  bool Ensure(Number x, size_t n) {
+    // Nothing to do since flat array was allocated at start.  All
+    // that's left is to check for overflow (that is, we don't want to
+    // ensure a number y where array_[y] would be an out-of-bounds
+    // access).
+    return n <= LENGTH - x;   // an overflow-free way to do "x + n <= LENGTH"
+  }
+
+  void PreallocateMoreMemory() {}
+
+  // Return the current value for KEY.  Returns NULL if not yet set,
+  // or if k is out of range.
+  void* get(Number k) const {
+    if ((k >> BITS) > 0) {
+      return NULL;
+    }
+    return array_[k];
+  }
+
+  // REQUIRES "k" is in range "[0,2^BITS-1]".
+  // REQUIRES "k" has been ensured before.
+  //
+  // Sets the value 'v' for key 'k'.
+  void set(Number k, void* v) {
+    array_[k] = v;
+  }
+
+  // Return the first non-NULL pointer found in this map for
+  // a page number >= k.  Returns NULL if no such number is found.
+  void* Next(Number k) const {
+    while (k < (Number(1) << BITS)) {
+      if (array_[k] != NULL) return array_[k];
+      k++;
+    }
+    return NULL;
+  }
+};
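+
+// Illustrative usage sketch (not part of tcmalloc): "MyAlloc" stands in for
+// whatever allocator the caller supplies, and the key/value are arbitrary.
+//
+//   static void* MyAlloc(size_t n) { return malloc(n); }   // illustration only
+//   TCMalloc_PageMap1<20> pagemap(&MyAlloc);
+//   if (pagemap.Ensure(0x1234, 1)) {
+//     pagemap.set(0x1234, some_value);        // some_value: any void*
+//     void* v = pagemap.get(0x1234);          // returns some_value
+//   }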
+
+// Two-level radix tree
+template <int BITS>
+class TCMalloc_PageMap2 {
+ private:
+  // Put 32 entries in the root and (2^BITS)/32 entries in each leaf.
+  static const int ROOT_BITS = 5;
+  static const int ROOT_LENGTH = 1 << ROOT_BITS;
+
+  static const int LEAF_BITS = BITS - ROOT_BITS;
+  static const int LEAF_LENGTH = 1 << LEAF_BITS;
+
+  // Leaf node
+  struct Leaf {
+    void* values[LEAF_LENGTH];
+  };
+
+  Leaf* root_[ROOT_LENGTH];             // Pointers to 32 child nodes
+  void* (*allocator_)(size_t);          // Memory allocator
+
+ public:
+  typedef uintptr_t Number;
+
+  explicit TCMalloc_PageMap2(void* (*allocator)(size_t)) {
+    allocator_ = allocator;
+    memset(root_, 0, sizeof(root_));
+  }
+
+  void* get(Number k) const {
+    const Number i1 = k >> LEAF_BITS;
+    const Number i2 = k & (LEAF_LENGTH-1);
+    if ((k >> BITS) > 0 || root_[i1] == NULL) {
+      return NULL;
+    }
+    return root_[i1]->values[i2];
+  }
+
+  void set(Number k, void* v) {
+    const Number i1 = k >> LEAF_BITS;
+    const Number i2 = k & (LEAF_LENGTH-1);
+    ASSERT(i1 < ROOT_LENGTH);
+    root_[i1]->values[i2] = v;
+  }
+
+  bool Ensure(Number start, size_t n) {
+    for (Number key = start; key <= start + n - 1; ) {
+      const Number i1 = key >> LEAF_BITS;
+
+      // Check for overflow
+      if (i1 >= ROOT_LENGTH)
+        return false;
+
+      // Make 2nd level node if necessary
+      if (root_[i1] == NULL) {
+        Leaf* leaf = reinterpret_cast<Leaf*>((*allocator_)(sizeof(Leaf)));
+        if (leaf == NULL) return false;
+        memset(leaf, 0, sizeof(*leaf));
+        root_[i1] = leaf;
+      }
+
+      // Advance key past whatever is covered by this leaf node
+      key = ((key >> LEAF_BITS) + 1) << LEAF_BITS;
+    }
+    return true;
+  }
+
+  void PreallocateMoreMemory() {
+    // Allocate enough to keep track of all possible pages
+    Ensure(0, Number(1) << BITS);
+  }
+
+  void* Next(Number k) const {
+    while (k < (Number(1) << BITS)) {
+      const Number i1 = k >> LEAF_BITS;
+      Leaf* leaf = root_[i1];
+      if (leaf != NULL) {
+        // Scan forward in leaf
+        for (Number i2 = k & (LEAF_LENGTH - 1); i2 < LEAF_LENGTH; i2++) {
+          if (leaf->values[i2] != NULL) {
+            return leaf->values[i2];
+          }
+        }
+      }
+      // Skip to next top-level entry
+      k = (i1 + 1) << LEAF_BITS;
+    }
+    return NULL;
+  }
+};
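+
+// Worked example (illustrative) of the two-level decomposition above: with
+// BITS == 20 we get LEAF_BITS == 15, so for k == 0x12345 the root index is
+// i1 == k >> 15 == 0x2 and the leaf index is i2 == k & 0x7fff == 0x2345.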
+
+// Three-level radix tree
+template <int BITS>
+class TCMalloc_PageMap3 {
+ private:
+  // How many bits should we consume at each interior level
+  static const int INTERIOR_BITS = (BITS + 2) / 3; // Round-up
+  static const int INTERIOR_LENGTH = 1 << INTERIOR_BITS;
+
+  // How many bits should we consume at leaf level
+  static const int LEAF_BITS = BITS - 2*INTERIOR_BITS;
+  static const int LEAF_LENGTH = 1 << LEAF_BITS;
+
+  // Interior node
+  struct Node {
+    Node* ptrs[INTERIOR_LENGTH];
+  };
+
+  // Leaf node
+  struct Leaf {
+    void* values[LEAF_LENGTH];
+  };
+
+  Node* root_;                          // Root of radix tree
+  void* (*allocator_)(size_t);          // Memory allocator
+
+  Node* NewNode() {
+    Node* result = reinterpret_cast<Node*>((*allocator_)(sizeof(Node)));
+    if (result != NULL) {
+      memset(result, 0, sizeof(*result));
+    }
+    return result;
+  }
+
+ public:
+  typedef uintptr_t Number;
+
+  explicit TCMalloc_PageMap3(void* (*allocator)(size_t)) {
+    allocator_ = allocator;
+    root_ = NewNode();
+  }
+
+  void* get(Number k) const {
+    const Number i1 = k >> (LEAF_BITS + INTERIOR_BITS);
+    const Number i2 = (k >> LEAF_BITS) & (INTERIOR_LENGTH-1);
+    const Number i3 = k & (LEAF_LENGTH-1);
+    if ((k >> BITS) > 0 ||
+        root_->ptrs[i1] == NULL || root_->ptrs[i1]->ptrs[i2] == NULL) {
+      return NULL;
+    }
+    return reinterpret_cast<Leaf*>(root_->ptrs[i1]->ptrs[i2])->values[i3];
+  }
+
+  void set(Number k, void* v) {
+    ASSERT(k >> BITS == 0);
+    const Number i1 = k >> (LEAF_BITS + INTERIOR_BITS);
+    const Number i2 = (k >> LEAF_BITS) & (INTERIOR_LENGTH-1);
+    const Number i3 = k & (LEAF_LENGTH-1);
+    reinterpret_cast<Leaf*>(root_->ptrs[i1]->ptrs[i2])->values[i3] = v;
+  }
+
+  bool Ensure(Number start, size_t n) {
+    for (Number key = start; key <= start + n - 1; ) {
+      const Number i1 = key >> (LEAF_BITS + INTERIOR_BITS);
+      const Number i2 = (key >> LEAF_BITS) & (INTERIOR_LENGTH-1);
+
+      // Check for overflow
+      if (i1 >= INTERIOR_LENGTH || i2 >= INTERIOR_LENGTH)
+        return false;
+
+      // Make 2nd level node if necessary
+      if (root_->ptrs[i1] == NULL) {
+        Node* n = NewNode();
+        if (n == NULL) return false;
+        root_->ptrs[i1] = n;
+      }
+
+      // Make leaf node if necessary
+      if (root_->ptrs[i1]->ptrs[i2] == NULL) {
+        Leaf* leaf = reinterpret_cast<Leaf*>((*allocator_)(sizeof(Leaf)));
+        if (leaf == NULL) return false;
+        memset(leaf, 0, sizeof(*leaf));
+        root_->ptrs[i1]->ptrs[i2] = reinterpret_cast<Node*>(leaf);
+      }
+
+      // Advance key past whatever is covered by this leaf node
+      key = ((key >> LEAF_BITS) + 1) << LEAF_BITS;
+    }
+    return true;
+  }
+
+  void PreallocateMoreMemory() {
+  }
+
+  void* Next(Number k) const {
+    while (k < (Number(1) << BITS)) {
+      const Number i1 = k >> (LEAF_BITS + INTERIOR_BITS);
+      const Number i2 = (k >> LEAF_BITS) & (INTERIOR_LENGTH-1);
+      if (root_->ptrs[i1] == NULL) {
+        // Advance to next top-level entry
+        k = (i1 + 1) << (LEAF_BITS + INTERIOR_BITS);
+      } else {
+        Leaf* leaf = reinterpret_cast<Leaf*>(root_->ptrs[i1]->ptrs[i2]);
+        if (leaf != NULL) {
+          for (Number i3 = (k & (LEAF_LENGTH-1)); i3 < LEAF_LENGTH; i3++) {
+            if (leaf->values[i3] != NULL) {
+              return leaf->values[i3];
+            }
+          }
+        }
+        // Advance to next interior entry
+        k = ((k >> LEAF_BITS) + 1) << LEAF_BITS;
+      }
+    }
+    return NULL;
+  }
+};
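+
+// Illustrative sketch (not part of tcmalloc): unlike the flat map, interior
+// and leaf nodes here are created lazily, so a range must be Ensure()d
+// before set() may be used on keys in it ("page" and "value" are
+// placeholders; MyAlloc is as in the sketch after TCMalloc_PageMap1):
+//
+//   TCMalloc_PageMap3<35> pagemap(&MyAlloc);
+//   if (pagemap.Ensure(page, 1)) {    // allocates interior/leaf nodes as needed
+//     pagemap.set(page, value);       // now safe: all three levels exist
+//   }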
+
+#endif  // TCMALLOC_PAGEMAP_H_
diff --git a/src/pprof b/src/pprof
new file mode 100755
index 0000000..c0c64bc
--- /dev/null
+++ b/src/pprof
@@ -0,0 +1,5590 @@
+#! /usr/bin/env perl
+
+# Copyright (c) 1998-2007, Google Inc.
+# All rights reserved.
+# 
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are
+# met:
+# 
+#     * Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+#     * Redistributions in binary form must reproduce the above
+# copyright notice, this list of conditions and the following disclaimer
+# in the documentation and/or other materials provided with the
+# distribution.
+#     * Neither the name of Google Inc. nor the names of its
+# contributors may be used to endorse or promote products derived from
+# this software without specific prior written permission.
+# 
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+# ---
+# Program for printing the profile generated by common/profiler.cc,
+# or by the heap profiler (common/debugallocation.cc)
+#
+# The profile contains a sequence of entries of the form:
+#       <count> <stack trace>
+# This program parses the profile, and generates user-readable
+# output.
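+#
+# Conceptually (values purely illustrative), each entry pairs a sample count
+# with a stack of program-counter addresses, e.g.
+#       25   0x00000000004010a3 0x0000000000400f2e 0x00007f3a4c2a1b45
+# although the exact on-disk encoding differs between profile types (the CPU
+# profile, for instance, is a packed binary format; see PrintProfileData below).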
+#
+# Examples:
+#
+# % tools/pprof "program" "profile"
+#   Enters "interactive" mode
+#
+# % tools/pprof --text "program" "profile"
+#   Generates one line per procedure
+#
+# % tools/pprof --gv "program" "profile"
+#   Generates annotated call-graph and displays via "gv"
+#
+# % tools/pprof --gv --focus=Mutex "program" "profile"
+#   Restrict to code paths that involve an entry that matches "Mutex"
+#
+# % tools/pprof --gv --focus=Mutex --ignore=string "program" "profile"
+#   Restrict to code paths that involve an entry that matches "Mutex"
+#   and does not match "string"
+#
+# % tools/pprof --list=IBF_CheckDocid "program" "profile"
+#   Generates disassembly listing of all routines with at least one
+#   sample that match the --list=<regexp> pattern.  The listing is
+#   annotated with the flat and cumulative sample counts at each line.
+#
+# % tools/pprof --disasm=IBF_CheckDocid "program" "profile"
+#   Generates disassembly listing of all routines with at least one
+#   sample that match the --disasm=<regexp> pattern.  The listing is
+#   annotated with the flat and cumulative sample counts at each PC value.
+#
+# TODO: Use color to indicate files?
+
+use strict;
+use warnings;
+use Getopt::Long;
+use Cwd;
+use POSIX;
+
+my $PPROF_VERSION = "2.0";
+
+# These are the object tools we use, which can come from a
+# user-specified location via --tools, from the PPROF_TOOLS
+# environment variable, or, failing that, from the environment ($PATH).
+my %obj_tool_map = (
+  "objdump" => "objdump",
+  "nm" => "nm",
+  "addr2line" => "addr2line",
+  "c++filt" => "c++filt",
+  ## ConfigureObjTools may add architecture-specific entries:
+  #"nm_pdb" => "nm-pdb",       # for reading windows (PDB-format) executables
+  #"addr2line_pdb" => "addr2line-pdb",                                # ditto
+  #"otool" => "otool",         # equivalent of objdump on OS X
+);
+# NOTE: these are lists, so you can put in commandline flags if you want.
+my @DOT = ("dot");          # leave non-absolute, since it may be in /usr/local
+my @GV = ("gv");
+my @EVINCE = ("evince");    # could also be xpdf or perhaps acroread
+my @KCACHEGRIND = ("kcachegrind");
+my @PS2PDF = ("ps2pdf");
+# These are used for dynamic profiles
+my @URL_FETCHER = ("curl", "-s");
+
+# These are the web pages that servers need to support for dynamic profiles
+my $HEAP_PAGE = "/pprof/heap";
+my $PROFILE_PAGE = "/pprof/profile";   # must support cgi-param "?seconds=#"
+my $PMUPROFILE_PAGE = "/pprof/pmuprofile(?:\\?.*)?"; # must support cgi-param
+                                                # ?seconds=#&event=x&period=n
+my $GROWTH_PAGE = "/pprof/growth";
+my $CONTENTION_PAGE = "/pprof/contention";
+my $WALL_PAGE = "/pprof/wall(?:\\?.*)?";  # accepts options like namefilter
+my $FILTEREDPROFILE_PAGE = "/pprof/filteredprofile(?:\\?.*)?";
+my $CENSUSPROFILE_PAGE = "/pprof/censusprofile(?:\\?.*)?"; # must support cgi-param
+                                                       # "?seconds=#",
+                                                       # "?tags_regexp=#" and
+                                                       # "?type=#".
+my $SYMBOL_PAGE = "/pprof/symbol";     # must support symbol lookup via POST
+my $PROGRAM_NAME_PAGE = "/pprof/cmdline";
+
+# These are the web pages that can be named on the command line.
+# All the alternatives must begin with /.
+my $PROFILES = "($HEAP_PAGE|$PROFILE_PAGE|$PMUPROFILE_PAGE|" .
+               "$GROWTH_PAGE|$CONTENTION_PAGE|$WALL_PAGE|" .
+               "$FILTEREDPROFILE_PAGE|$CENSUSPROFILE_PAGE)";
+
+# default binary name
+my $UNKNOWN_BINARY = "(unknown)";
+
+# There is a pervasive dependency on the length (in hex characters,
+# i.e., nibbles) of an address, distinguishing between 32-bit and
+# 64-bit profiles.  To err on the safe side, default to 64-bit here:
+my $address_length = 16;
+
+my $dev_null = "/dev/null";
+if (! -e $dev_null && $^O =~ /MSWin/) {    # $^O is the OS perl was built for
+  $dev_null = "nul";
+}
+
+# A list of paths to search for shared object files
+my @prefix_list = ();
+
+# Special routine name that should not have any symbols.
+# Used as separator to parse "addr2line -i" output.
+my $sep_symbol = '_fini';
+my $sep_address = undef;
+
+my @stackTraces;
+
+##### Argument parsing #####
+
+sub usage_string {
+  return <<EOF;
+Usage:
+pprof [options] <program> <profiles>
+   <profiles> is a space separated list of profile names.
+pprof [options] <symbolized-profiles>
+   <symbolized-profiles> is a list of profile files where each file contains
+   the necessary symbol mappings as well as profile data (likely generated
+   with --raw).
+pprof [options] <profile>
+   <profile> is a remote form.  Symbols are obtained from host:port$SYMBOL_PAGE
+
+   Each name can be:
+   /path/to/profile        - a path to a profile file
+   host:port[/<service>]   - a location of a service to get profile from
+
+   The /<service> can be $HEAP_PAGE, $PROFILE_PAGE, /pprof/pmuprofile,
+                         $GROWTH_PAGE, $CONTENTION_PAGE, /pprof/wall,
+                         $CENSUSPROFILE_PAGE, or /pprof/filteredprofile.
+   For instance:
+     pprof http://myserver.com:80$HEAP_PAGE
+   If /<service> is omitted, the service defaults to $PROFILE_PAGE (cpu profiling).
+pprof --symbols <program>
+   Maps addresses to symbol names.  In this mode, stdin should be a
+   list of library mappings, in the same format as is found in the heap-
+   and cpu-profile files (this loosely matches that of /proc/self/maps
+   on Linux), followed by a list of hex addresses to map, one per line.
+
+   For more help with querying remote servers, including how to add the
+   necessary server-side support code, see this filename (or one like it):
+
+   /usr/doc/gperftools-$PPROF_VERSION/pprof_remote_servers.html
+
+Options:
+   --cum               Sort by cumulative data
+   --base=<base>       Subtract <base> from <profile> before display
+   --interactive       Run in interactive mode (interactive "help" gives help) [default]
+   --seconds=<n>       Length of time for dynamic profiles [default=30 secs]
+   --add_lib=<file>    Read additional symbols and line info from the given library
+   --lib_prefix=<dir>  Comma separated list of library path prefixes
+   --no_strip_temp     Do not strip template arguments from function names
+
+Reporting Granularity:
+   --addresses         Report at address level
+   --lines             Report at source line level
+   --functions         Report at function level [default]
+   --files             Report at source file level
+
+Output type:
+   --text              Generate text report
+   --stacks            Generate stack traces similar to the heap profiler (requires --text)
+   --callgrind         Generate callgrind format to stdout
+   --gv                Generate Postscript and display
+   --evince            Generate PDF and display
+   --web               Generate SVG and display
+   --list=<regexp>     Generate source listing of matching routines
+   --disasm=<regexp>   Generate disassembly of matching routines
+   --symbols           Print demangled symbol names found at given addresses
+   --dot               Generate DOT file to stdout
+   --ps                Generate Postscript to stdout
+   --pdf               Generate PDF to stdout
+   --svg               Generate SVG to stdout
+   --gif               Generate GIF to stdout
+   --raw               Generate symbolized pprof data (useful with remote fetch)
+   --collapsed         Generate collapsed stacks for building flame graphs
+                       (see http://www.brendangregg.com/flamegraphs.html)
+
+Heap-Profile Options:
+   --inuse_space       Display in-use (mega)bytes [default]
+   --inuse_objects     Display in-use objects
+   --alloc_space       Display allocated (mega)bytes
+   --alloc_objects     Display allocated objects
+   --show_bytes        Display space in bytes
+   --drop_negative     Ignore negative differences
+
+Contention-profile options:
+   --total_delay       Display total delay at each region [default]
+   --contentions       Display number of delays at each region
+   --mean_delay        Display mean delay at each region
+
+Call-graph Options:
+   --nodecount=<n>     Show at most so many nodes [default=80]
+   --nodefraction=<f>  Hide nodes below <f>*total [default=.005]
+   --edgefraction=<f>  Hide edges below <f>*total [default=.001]
+   --maxdegree=<n>     Max incoming/outgoing edges per node [default=8]
+   --focus=<regexp>    Focus on nodes matching <regexp>
+   --ignore=<regexp>   Ignore nodes matching <regexp>
+   --scale=<n>         Set GV scaling [default=0]
+   --heapcheck         Make nodes with non-0 object counts
+                       (i.e. direct leak generators) more visible
+
+Miscellaneous:
+   --no-auto-signal-frm Disable automatic dropping of a 2nd frame that is the
+                       same in every sample (cpu-only).  By default such a
+                       frame is dropped, on the assumption that it is an
+                       artifact of bad stack captures which include signal
+                       handler frames.
+   --show_addresses    Always show addresses when applicable
+   --tools=<prefix or binary:fullpath>[,...]   \$PATH for object tool pathnames
+   --test              Run unit tests
+   --help              This message
+   --version           Version information
+
+Environment Variables:
+   PPROF_TMPDIR        Profiles directory. Defaults to \$HOME/pprof
+   PPROF_TOOLS         Prefix for object tools pathnames
+
+Examples:
+
+pprof /bin/ls ls.prof
+                       Enters "interactive" mode
+pprof --text /bin/ls ls.prof
+                       Outputs one line per procedure
+pprof --web /bin/ls ls.prof
+                       Displays annotated call-graph in web browser
+pprof --gv /bin/ls ls.prof
+                       Displays annotated call-graph via 'gv'
+pprof --gv --focus=Mutex /bin/ls ls.prof
+                       Restricts to code paths including a .*Mutex.* entry
+pprof --gv --focus=Mutex --ignore=string /bin/ls ls.prof
+                       Code paths including Mutex but not string
+pprof --list=getdir /bin/ls ls.prof
+                       (Per-line) annotated source listing for getdir()
+pprof --disasm=getdir /bin/ls ls.prof
+                       (Per-PC) annotated disassembly for getdir()
+
+pprof http://localhost:1234/
+                       Enters "interactive" mode
+pprof --text localhost:1234
+                       Outputs one line per procedure for localhost:1234
+pprof --raw localhost:1234 > ./local.raw
+pprof --text ./local.raw
+                       Fetches a remote profile for later analysis and then
+                       analyzes it in text mode.
+EOF
+}
+
+sub version_string {
+  return <<EOF
+pprof (part of gperftools $PPROF_VERSION)
+
+Copyright 1998-2007 Google Inc.
+
+This is BSD licensed software; see the source for copying conditions
+and license information.
+There is NO warranty; not even for MERCHANTABILITY or FITNESS FOR A
+PARTICULAR PURPOSE.
+EOF
+}
+
+sub usage {
+  my $msg = shift;
+  print STDERR "$msg\n\n";
+  print STDERR usage_string();
+  exit(1);
+}
+
+sub Init() {
+  # Set up the tmp-file name and a handler to clean it up.
+  # We do this at the very beginning so that we can use
+  # error() and cleanup() at any point after this.
+  $main::tmpfile_sym = "/tmp/pprof$$.sym";
+  $main::tmpfile_ps = "/tmp/pprof$$";
+  $main::next_tmpfile = 0;
+  $SIG{'INT'} = \&sighandler;
+
+  # Cache from filename/linenumber to source code
+  $main::source_cache = ();
+
+  $main::opt_help = 0;
+  $main::opt_version = 0;
+  $main::opt_show_addresses = 0;
+  $main::opt_no_auto_signal_frames = 0;
+
+  $main::opt_cum = 0;
+  $main::opt_base = '';
+  $main::opt_addresses = 0;
+  $main::opt_lines = 0;
+  $main::opt_functions = 0;
+  $main::opt_files = 0;
+  $main::opt_lib_prefix = "";
+
+  $main::opt_text = 0;
+  $main::opt_stacks = 0;
+  $main::opt_callgrind = 0;
+  $main::opt_list = "";
+  $main::opt_disasm = "";
+  $main::opt_symbols = 0;
+  $main::opt_gv = 0;
+  $main::opt_evince = 0;
+  $main::opt_web = 0;
+  $main::opt_dot = 0;
+  $main::opt_ps = 0;
+  $main::opt_pdf = 0;
+  $main::opt_gif = 0;
+  $main::opt_svg = 0;
+  $main::opt_raw = 0;
+  $main::opt_collapsed = 0;
+
+  $main::opt_nodecount = 80;
+  $main::opt_nodefraction = 0.005;
+  $main::opt_edgefraction = 0.001;
+  $main::opt_maxdegree = 8;
+  $main::opt_focus = '';
+  $main::opt_ignore = '';
+  $main::opt_scale = 0;
+  $main::opt_heapcheck = 0;
+  $main::opt_seconds = 30;
+  $main::opt_lib = "";
+
+  $main::opt_inuse_space   = 0;
+  $main::opt_inuse_objects = 0;
+  $main::opt_alloc_space   = 0;
+  $main::opt_alloc_objects = 0;
+  $main::opt_show_bytes    = 0;
+  $main::opt_drop_negative = 0;
+  $main::opt_interactive   = 0;
+
+  $main::opt_total_delay = 0;
+  $main::opt_contentions = 0;
+  $main::opt_mean_delay = 0;
+
+  $main::opt_tools   = "";
+  $main::opt_debug   = 0;
+  $main::opt_test    = 0;
+
+  # Do not strip template arguments from function names
+  $main::opt_no_strip_temp = 0;
+
+  # These are undocumented flags used only by unittests.
+  $main::opt_test_stride = 0;
+
+  # Are we using $SYMBOL_PAGE?
+  $main::use_symbol_page = 0;
+
+  # Files returned by TempName.
+  %main::tempnames = ();
+
+  # Type of profile we are dealing with
+  # Supported types:
+  #     cpu
+  #     heap
+  #     growth
+  #     contention
+  $main::profile_type = '';     # Empty type means "unknown"
+
+  GetOptions("help!"          => \$main::opt_help,
+             "version!"       => \$main::opt_version,
+             "show_addresses!"=> \$main::opt_show_addresses,
+             "no-auto-signal-frm!"=> \$main::opt_no_auto_signal_frames,
+             "cum!"           => \$main::opt_cum,
+             "base=s"         => \$main::opt_base,
+             "seconds=i"      => \$main::opt_seconds,
+             "add_lib=s"      => \$main::opt_lib,
+             "lib_prefix=s"   => \$main::opt_lib_prefix,
+             "functions!"     => \$main::opt_functions,
+             "lines!"         => \$main::opt_lines,
+             "addresses!"     => \$main::opt_addresses,
+             "files!"         => \$main::opt_files,
+             "text!"          => \$main::opt_text,
+             "stacks!"        => \$main::opt_stacks,
+             "callgrind!"     => \$main::opt_callgrind,
+             "list=s"         => \$main::opt_list,
+             "disasm=s"       => \$main::opt_disasm,
+             "symbols!"       => \$main::opt_symbols,
+             "gv!"            => \$main::opt_gv,
+             "evince!"        => \$main::opt_evince,
+             "web!"           => \$main::opt_web,
+             "dot!"           => \$main::opt_dot,
+             "ps!"            => \$main::opt_ps,
+             "pdf!"           => \$main::opt_pdf,
+             "svg!"           => \$main::opt_svg,
+             "gif!"           => \$main::opt_gif,
+             "raw!"           => \$main::opt_raw,
+             "collapsed!"     => \$main::opt_collapsed,
+             "interactive!"   => \$main::opt_interactive,
+             "nodecount=i"    => \$main::opt_nodecount,
+             "nodefraction=f" => \$main::opt_nodefraction,
+             "edgefraction=f" => \$main::opt_edgefraction,
+             "maxdegree=i"    => \$main::opt_maxdegree,
+             "focus=s"        => \$main::opt_focus,
+             "ignore=s"       => \$main::opt_ignore,
+             "scale=i"        => \$main::opt_scale,
+             "heapcheck"      => \$main::opt_heapcheck,
+             "inuse_space!"   => \$main::opt_inuse_space,
+             "inuse_objects!" => \$main::opt_inuse_objects,
+             "alloc_space!"   => \$main::opt_alloc_space,
+             "alloc_objects!" => \$main::opt_alloc_objects,
+             "show_bytes!"    => \$main::opt_show_bytes,
+             "drop_negative!" => \$main::opt_drop_negative,
+             "total_delay!"   => \$main::opt_total_delay,
+             "contentions!"   => \$main::opt_contentions,
+             "mean_delay!"    => \$main::opt_mean_delay,
+             "tools=s"        => \$main::opt_tools,
+             "no_strip_temp!" => \$main::opt_no_strip_temp,
+             "test!"          => \$main::opt_test,
+             "debug!"         => \$main::opt_debug,
+             # Undocumented flags used only by unittests:
+             "test_stride=i"  => \$main::opt_test_stride,
+      ) || usage("Invalid option(s)");
+
+  # Deal with the standard --help and --version
+  if ($main::opt_help) {
+    print usage_string();
+    exit(0);
+  }
+
+  if ($main::opt_version) {
+    print version_string();
+    exit(0);
+  }
+
+  # Disassembly/listing/symbols mode requires address-level info
+  if ($main::opt_disasm || $main::opt_list || $main::opt_symbols) {
+    $main::opt_functions = 0;
+    $main::opt_lines = 0;
+    $main::opt_addresses = 1;
+    $main::opt_files = 0;
+  }
+
+  # Check heap-profiling flags
+  if ($main::opt_inuse_space +
+      $main::opt_inuse_objects +
+      $main::opt_alloc_space +
+      $main::opt_alloc_objects > 1) {
+    usage("Specify at most one of the --inuse/--alloc options");
+  }
+
+  # Check output granularities
+  my $grains =
+      $main::opt_functions +
+      $main::opt_lines +
+      $main::opt_addresses +
+      $main::opt_files +
+      0;
+  if ($grains > 1) {
+    usage("Only specify one output granularity option");
+  }
+  if ($grains == 0) {
+    $main::opt_functions = 1;
+  }
+
+  # Check output modes
+  my $modes =
+      $main::opt_text +
+      $main::opt_callgrind +
+      ($main::opt_list eq '' ? 0 : 1) +
+      ($main::opt_disasm eq '' ? 0 : 1) +
+      ($main::opt_symbols == 0 ? 0 : 1) +
+      $main::opt_gv +
+      $main::opt_evince +
+      $main::opt_web +
+      $main::opt_dot +
+      $main::opt_ps +
+      $main::opt_pdf +
+      $main::opt_svg +
+      $main::opt_gif +
+      $main::opt_raw +
+      $main::opt_collapsed +
+      $main::opt_interactive +
+      0;
+  if ($modes > 1) {
+    usage("Only specify one output mode");
+  }
+  if ($modes == 0) {
+    if (-t STDOUT) {  # If STDOUT is a tty, activate interactive mode
+      $main::opt_interactive = 1;
+    } else {
+      $main::opt_text = 1;
+    }
+  }
+
+  if ($main::opt_test) {
+    RunUnitTests();
+    # Should not return
+    exit(1);
+  }
+
+  # Binary name and profile arguments list
+  $main::prog = "";
+  @main::pfile_args = ();
+
+  # Remote profiling without a binary (using $SYMBOL_PAGE instead)
+  if (@ARGV > 0) {
+    if (IsProfileURL($ARGV[0])) {
+      printf STDERR "Using remote profile at $ARGV[0].\n";
+      $main::use_symbol_page = 1;
+    } elsif (IsSymbolizedProfileFile($ARGV[0])) {
+      $main::use_symbolized_profile = 1;
+      $main::prog = $UNKNOWN_BINARY;  # will be set later from the profile file
+    }
+  }
+
+  if ($main::use_symbol_page || $main::use_symbolized_profile) {
+    # We don't need a binary!
+    my %disabled = ('--lines' => $main::opt_lines,
+                    '--disasm' => $main::opt_disasm);
+    for my $option (keys %disabled) {
+      usage("$option cannot be used without a binary") if $disabled{$option};
+    }
+    # Set $main::prog later...
+    scalar(@ARGV) || usage("Did not specify profile file");
+  } elsif ($main::opt_symbols) {
+    # --symbols needs a binary-name (to run nm on, etc) but not profiles
+    $main::prog = shift(@ARGV) || usage("Did not specify program");
+  } else {
+    $main::prog = shift(@ARGV) || usage("Did not specify program");
+    scalar(@ARGV) || usage("Did not specify profile file");
+  }
+
+  # Parse profile file/location arguments
+  foreach my $farg (@ARGV) {
+    if ($farg =~ m/(.*)\@([0-9]+)(|\/.*)$/ ) {
+      my $machine = $1;
+      my $num_machines = $2;
+      my $path = $3;
+      for (my $i = 0; $i < $num_machines; $i++) {
+        unshift(@main::pfile_args, "$i.$machine$path");
+      }
+    } else {
+      unshift(@main::pfile_args, $farg);
+    }
+  }
+
+  if ($main::use_symbol_page) {
+    unless (IsProfileURL($main::pfile_args[0])) {
+      error("The first profile should be a remote form to use $SYMBOL_PAGE\n");
+    }
+    CheckSymbolPage();
+    $main::prog = FetchProgramName();
+  } elsif (!$main::use_symbolized_profile) {  # may not need objtools!
+    ConfigureObjTools($main::prog)
+  }
+
+  # Break the opt_lib_prefix into the prefix_list array
+  @prefix_list = split (',', $main::opt_lib_prefix);
+
+  # Remove trailing / from the prefixes in the list, to prevent
+  # searching things like /my/path//lib/mylib.so
+  foreach (@prefix_list) {
+    s|/+$||;
+  }
+}
+
+sub Main() {
+  Init();
+  $main::collected_profile = undef;
+  @main::profile_files = ();
+  $main::op_time = time();
+
+  # Printing symbols is special and requires a lot less info than most modes.
+  if ($main::opt_symbols) {
+    PrintSymbols(*STDIN);   # Get /proc/maps and symbols output from stdin
+    return;
+  }
+
+  # Fetch all profile data
+  FetchDynamicProfiles();
+
+  # this will hold symbols that we read from the profile files
+  my $symbol_map = {};
+
+  # Read one profile, pick the last item on the list
+  my $data = ReadProfile($main::prog, pop(@main::profile_files));
+  my $profile = $data->{profile};
+  my $pcs = $data->{pcs};
+  my $libs = $data->{libs};   # Info about main program and shared libraries
+  $symbol_map = MergeSymbols($symbol_map, $data->{symbols});
+
+  # Add additional profiles, if available.
+  if (scalar(@main::profile_files) > 0) {
+    foreach my $pname (@main::profile_files) {
+      my $data2 = ReadProfile($main::prog, $pname);
+      $profile = AddProfile($profile, $data2->{profile});
+      $pcs = AddPcs($pcs, $data2->{pcs});
+      $symbol_map = MergeSymbols($symbol_map, $data2->{symbols});
+    }
+  }
+
+  # Subtract base from profile, if specified
+  if ($main::opt_base ne '') {
+    my $base = ReadProfile($main::prog, $main::opt_base);
+    $profile = SubtractProfile($profile, $base->{profile});
+    $pcs = AddPcs($pcs, $base->{pcs});
+    $symbol_map = MergeSymbols($symbol_map, $base->{symbols});
+  }
+
+  # Get total data in profile
+  my $total = TotalProfile($profile);
+
+  # Collect symbols
+  my $symbols;
+  if ($main::use_symbolized_profile) {
+    $symbols = FetchSymbols($pcs, $symbol_map);
+  } elsif ($main::use_symbol_page) {
+    $symbols = FetchSymbols($pcs);
+  } else {
+    # TODO(csilvers): $libs uses the /proc/self/maps data from profile1,
+    # which may differ from the data from subsequent profiles, especially
+    # if they were run on different machines.  Use appropriate libs for
+    # each pc somehow.
+    $symbols = ExtractSymbols($libs, $pcs);
+  }
+
+  # Remove uninteresting stack items
+  $profile = RemoveUninterestingFrames($symbols, $profile);
+
+  # Focus?
+  if ($main::opt_focus ne '') {
+    $profile = FocusProfile($symbols, $profile, $main::opt_focus);
+  }
+
+  # Ignore?
+  if ($main::opt_ignore ne '') {
+    $profile = IgnoreProfile($symbols, $profile, $main::opt_ignore);
+  }
+
+  my $calls = ExtractCalls($symbols, $profile);
+
+  # Reduce profiles to required output granularity, and also clean
+  # each stack trace so a given entry exists at most once.
+  my $reduced = ReduceProfile($symbols, $profile);
+
+  # Get derived profiles
+  my $flat = FlatProfile($reduced);
+  my $cumulative = CumulativeProfile($reduced);
+
+  # Print
+  if (!$main::opt_interactive) {
+    if ($main::opt_disasm) {
+      PrintDisassembly($libs, $flat, $cumulative, $main::opt_disasm);
+    } elsif ($main::opt_list) {
+      PrintListing($total, $libs, $flat, $cumulative, $main::opt_list, 0);
+    } elsif ($main::opt_text) {
+      # Make sure the output is empty when we have nothing to report
+      # (this only matters when --heapcheck is given, but we must stay
+      # compatible with old branches that did not always pass --heapcheck):
+      if ($total != 0) {
+        printf("Total: %s %s\n", Unparse($total), Units());
+      }
+      if ($main::opt_stacks) {
+        printf("Stacks:\n\n");
+        PrintStacksForText($symbols, $profile);
+      }
+      PrintText($symbols, $flat, $cumulative, -1);
+    } elsif ($main::opt_raw) {
+      PrintSymbolizedProfile($symbols, $profile, $main::prog);
+    } elsif ($main::opt_collapsed) {
+      PrintCollapsedStacks($symbols, $profile);
+    } elsif ($main::opt_callgrind) {
+      PrintCallgrind($calls);
+    } else {
+      if (PrintDot($main::prog, $symbols, $profile, $flat, $cumulative, $total)) {
+        if ($main::opt_gv) {
+          RunGV(TempName($main::next_tmpfile, "ps"), "");
+        } elsif ($main::opt_evince) {
+          RunEvince(TempName($main::next_tmpfile, "pdf"), "");
+        } elsif ($main::opt_web) {
+          my $tmp = TempName($main::next_tmpfile, "svg");
+          RunWeb($tmp);
+          # The command we run might hand the file name off
+          # to an already running browser instance and then exit.
+          # Normally, we'd remove $tmp on exit (right now),
+          # but fork a child to remove $tmp a little later, so that the
+          # browser has time to load it first.
+          delete $main::tempnames{$tmp};
+          if (fork() == 0) {
+            sleep 5;
+            unlink($tmp);
+            exit(0);
+          }
+        }
+      } else {
+        cleanup();
+        exit(1);
+      }
+    }
+  } else {
+    InteractiveMode($profile, $symbols, $libs, $total);
+  }
+
+  cleanup();
+  exit(0);
+}
+
+##### Entry Point #####
+
+Main();
+
+# Temporary code to detect if we're running on a Goobuntu system.
+# These systems don't have the right stuff installed for the special
+# Readline libraries to work, so as a temporary workaround, we default
+# to using the normal stdio code, rather than the fancier readline-based
+# code
+sub ReadlineMightFail {
+  if (-e '/lib/libtermcap.so.2') {
+    return 0;  # libtermcap exists, so readline should be okay
+  } else {
+    return 1;
+  }
+}
+
+sub RunGV {
+  my $fname = shift;
+  my $bg = shift;       # "" or " &" if we should run in background
+  if (!system(ShellEscape(@GV, "--version") . " >$dev_null 2>&1")) {
+    # Options using double dash are supported by this gv version.
+    # Also, turn on noantialias to better handle bug in gv for
+    # postscript files with large dimensions.
+    # TODO: Maybe we should not pass the --noantialias flag
+    # if the gv version is known to work properly without the flag.
+    system(ShellEscape(@GV, "--scale=$main::opt_scale", "--noantialias", $fname)
+           . $bg);
+  } else {
+    # Old gv version - only supports options that use single dash.
+    print STDERR ShellEscape(@GV, "-scale", $main::opt_scale) . "\n";
+    system(ShellEscape(@GV, "-scale", "$main::opt_scale", $fname) . $bg);
+  }
+}
+
+sub RunEvince {
+  my $fname = shift;
+  my $bg = shift;       # "" or " &" if we should run in background
+  system(ShellEscape(@EVINCE, $fname) . $bg);
+}
+
+sub RunWeb {
+  my $fname = shift;
+  print STDERR "Loading web page file:///$fname\n";
+
+  if (`uname` =~ /Darwin/) {
+    # OS X: open will use standard preference for SVG files.
+    system("/usr/bin/open", $fname);
+    return;
+  }
+
+  if (`uname` =~ /MINGW/) {
+    # Windows(MinGW): open will use standard preference for SVG files.
+    system("cmd", "/c", "start", $fname);
+    return;
+  }
+
+  # Some kind of Unix; try generic symlinks, then specific browsers.
+  # (Stop once we find one.)
+  # Works best if the browser is already running.
+  my @alt = (
+    "/etc/alternatives/gnome-www-browser",
+    "/etc/alternatives/x-www-browser",
+    "google-chrome",
+    "firefox",
+  );
+  foreach my $b (@alt) {
+    if (system($b, $fname) == 0) {
+      return;
+    }
+  }
+
+  print STDERR "Could not load web browser.\n";
+}
+
+sub RunKcachegrind {
+  my $fname = shift;
+  my $bg = shift;       # "" or " &" if we should run in background
+  print STDERR "Starting '@KCACHEGRIND " . $fname . $bg . "'\n";
+  system(ShellEscape(@KCACHEGRIND, $fname) . $bg);
+}
+
+
+##### Interactive helper routines #####
+
+sub InteractiveMode {
+  $| = 1;  # Make output unbuffered for interactive mode
+  my ($orig_profile, $symbols, $libs, $total) = @_;
+
+  print STDERR "Welcome to pprof!  For help, type 'help'.\n";
+
+  # Use ReadLine if it's installed and input comes from a console.
+  if ( -t STDIN &&
+       !ReadlineMightFail() &&
+       defined(eval {require Term::ReadLine}) ) {
+    my $term = new Term::ReadLine 'pprof';
+    while ( defined ($_ = $term->readline('(pprof) '))) {
+      $term->addhistory($_) if /\S/;
+      if (!InteractiveCommand($orig_profile, $symbols, $libs, $total, $_)) {
+        last;    # exit when we get an interactive command to quit
+      }
+    }
+  } else {       # don't have readline
+    while (1) {
+      print STDERR "(pprof) ";
+      $_ = <STDIN>;
+      last if ! defined $_ ;
+      s/\r//g;         # turn windows-looking lines into unix-looking lines
+
+      # Save some flags that might be reset by InteractiveCommand()
+      my $save_opt_lines = $main::opt_lines;
+
+      if (!InteractiveCommand($orig_profile, $symbols, $libs, $total, $_)) {
+        last;    # exit when we get an interactive command to quit
+      }
+
+      # Restore flags
+      $main::opt_lines = $save_opt_lines;
+    }
+  }
+}
+
+# Takes the original profile, the symbol and library tables, the profile
+# total, and the command to run.
+# Returns 1 if we should keep going, or 0 if we were asked to quit.
+sub InteractiveCommand {
+  my($orig_profile, $symbols, $libs, $total, $command) = @_;
+  $_ = $command;                # just to make future m//'s easier
+  if (!defined($_)) {
+    print STDERR "\n";
+    return 0;
+  }
+  if (m/^\s*quit/) {
+    return 0;
+  }
+  if (m/^\s*help/) {
+    InteractiveHelpMessage();
+    return 1;
+  }
+  # Clear all the mode options -- mode is controlled by "$command"
+  $main::opt_text = 0;
+  $main::opt_callgrind = 0;
+  $main::opt_disasm = 0;
+  $main::opt_list = 0;
+  $main::opt_gv = 0;
+  $main::opt_evince = 0;
+  $main::opt_cum = 0;
+
+  if (m/^\s*(text|top)(\d*)\s*(.*)/) {
+    $main::opt_text = 1;
+
+    my $line_limit = ($2 ne "") ? int($2) : 10;
+
+    my $routine;
+    my $ignore;
+    ($routine, $ignore) = ParseInteractiveArgs($3);
+
+    my $profile = ProcessProfile($total, $orig_profile, $symbols, "", $ignore);
+    my $reduced = ReduceProfile($symbols, $profile);
+
+    # Get derived profiles
+    my $flat = FlatProfile($reduced);
+    my $cumulative = CumulativeProfile($reduced);
+
+    PrintText($symbols, $flat, $cumulative, $line_limit);
+    return 1;
+  }
+  if (m/^\s*callgrind\s*([^ \n]*)/) {
+    $main::opt_callgrind = 1;
+
+    # Get derived profiles
+    my $calls = ExtractCalls($symbols, $orig_profile);
+    my $filename = $1;
+    if ( $1 eq '' ) {
+      $filename = TempName($main::next_tmpfile, "callgrind");
+    }
+    PrintCallgrind($calls, $filename);
+    if ( $1 eq '' ) {
+      RunKcachegrind($filename, " & ");
+      $main::next_tmpfile++;
+    }
+
+    return 1;
+  }
+  if (m/^\s*(web)?list\s*(.+)/) {
+    my $html = (defined($1) && ($1 eq "web"));
+    $main::opt_list = 1;
+
+    my $routine;
+    my $ignore;
+    ($routine, $ignore) = ParseInteractiveArgs($2);
+
+    my $profile = ProcessProfile($total, $orig_profile, $symbols, "", $ignore);
+    my $reduced = ReduceProfile($symbols, $profile);
+
+    # Get derived profiles
+    my $flat = FlatProfile($reduced);
+    my $cumulative = CumulativeProfile($reduced);
+
+    PrintListing($total, $libs, $flat, $cumulative, $routine, $html);
+    return 1;
+  }
+  if (m/^\s*disasm\s*(.+)/) {
+    $main::opt_disasm = 1;
+
+    my $routine;
+    my $ignore;
+    ($routine, $ignore) = ParseInteractiveArgs($1);
+
+    # Process current profile to account for various settings
+    my $profile = ProcessProfile($total, $orig_profile, $symbols, "", $ignore);
+    my $reduced = ReduceProfile($symbols, $profile);
+
+    # Get derived profiles
+    my $flat = FlatProfile($reduced);
+    my $cumulative = CumulativeProfile($reduced);
+
+    PrintDisassembly($libs, $flat, $cumulative, $routine);
+    return 1;
+  }
+  if (m/^\s*(gv|web|evince)\s*(.*)/) {
+    $main::opt_gv = 0;
+    $main::opt_evince = 0;
+    $main::opt_web = 0;
+    if ($1 eq "gv") {
+      $main::opt_gv = 1;
+    } elsif ($1 eq "evince") {
+      $main::opt_evince = 1;
+    } elsif ($1 eq "web") {
+      $main::opt_web = 1;
+    }
+
+    my $focus;
+    my $ignore;
+    ($focus, $ignore) = ParseInteractiveArgs($2);
+
+    # Process current profile to account for various settings
+    my $profile = ProcessProfile($total, $orig_profile, $symbols,
+                                 $focus, $ignore);
+    my $reduced = ReduceProfile($symbols, $profile);
+
+    # Get derived profiles
+    my $flat = FlatProfile($reduced);
+    my $cumulative = CumulativeProfile($reduced);
+
+    if (PrintDot($main::prog, $symbols, $profile, $flat, $cumulative, $total)) {
+      if ($main::opt_gv) {
+        RunGV(TempName($main::next_tmpfile, "ps"), " &");
+      } elsif ($main::opt_evince) {
+        RunEvince(TempName($main::next_tmpfile, "pdf"), " &");
+      } elsif ($main::opt_web) {
+        RunWeb(TempName($main::next_tmpfile, "svg"));
+      }
+      $main::next_tmpfile++;
+    }
+    return 1;
+  }
+  if (m/^\s*$/) {
+    return 1;
+  }
+  print STDERR "Unknown command: try 'help'.\n";
+  return 1;
+}
+
+
+sub ProcessProfile {
+  my $total_count = shift;
+  my $orig_profile = shift;
+  my $symbols = shift;
+  my $focus = shift;
+  my $ignore = shift;
+
+  # Process current profile to account for various settings
+  my $profile = $orig_profile;
+  printf("Total: %s %s\n", Unparse($total_count), Units());
+  if ($focus ne '') {
+    $profile = FocusProfile($symbols, $profile, $focus);
+    my $focus_count = TotalProfile($profile);
+    printf("After focusing on '%s': %s %s of %s (%0.1f%%)\n",
+           $focus,
+           Unparse($focus_count), Units(),
+           Unparse($total_count), ($focus_count*100.0) / $total_count);
+  }
+  if ($ignore ne '') {
+    $profile = IgnoreProfile($symbols, $profile, $ignore);
+    my $ignore_count = TotalProfile($profile);
+    printf("After ignoring '%s': %s %s of %s (%0.1f%%)\n",
+           $ignore,
+           Unparse($ignore_count), Units(),
+           Unparse($total_count),
+           ($ignore_count*100.0) / $total_count);
+  }
+
+  return $profile;
+}
+
+sub InteractiveHelpMessage {
+  print STDERR <<ENDOFHELP;
+Interactive pprof mode
+
+Commands:
+  gv
+  gv [focus] [-ignore1] [-ignore2]
+      Show graphical hierarchical display of current profile.  Without
+      any arguments, shows all samples in the profile.  With the optional
+      "focus" argument, restricts the samples shown to just those where
+      the "focus" regular expression matches a routine name on the stack
+      trace.
+
+  web
+  web [focus] [-ignore1] [-ignore2]
+      Like GV, but displays profile in your web browser instead of using
+      Ghostview. Works best if your web browser is already running.
+      To change the browser that gets used:
+      On Linux, set the /etc/alternatives/gnome-www-browser symlink.
+      On OS X, change the Finder association for SVG files.
+
+  list [routine_regexp] [-ignore1] [-ignore2]
+      Show source listing of routines whose names match "routine_regexp"
+
+  weblist [routine_regexp] [-ignore1] [-ignore2]
+     Displays a source listing of routines whose names match "routine_regexp"
+     in a web browser.  You can click on source lines to view the
+     corresponding disassembly.
+
+  top [--cum] [-ignore1] [-ignore2]
+  top20 [--cum] [-ignore1] [-ignore2]
+  top37 [--cum] [-ignore1] [-ignore2]
+      Show top lines ordered by flat profile count, or cumulative count
+      if --cum is specified.  If a number is present after 'top', the
+      top K routines will be shown (defaults to showing the top 10)
+
+  disasm [routine_regexp] [-ignore1] [-ignore2]
+      Show disassembly of routines whose names match "routine_regexp",
+      annotated with sample counts.
+
+  callgrind
+  callgrind [filename]
+      Generates callgrind file. If no filename is given, kcachegrind is called.
+
+  help - This listing
+  quit or ^D - End pprof
+
+For commands that accept optional -ignore tags, samples where any routine in
+the stack trace matches the regular expression in any of the -ignore
+parameters will be ignored.
+
+Further pprof details are available at this location (or one similar):
+
+ /usr/doc/gperftools-$PPROF_VERSION/cpu_profiler.html
+ /usr/doc/gperftools-$PPROF_VERSION/heap_profiler.html
+
+ENDOFHELP
+}
+
+sub ParseInteractiveArgs {
+  my $args = shift;
+  my $focus = "";
+  my $ignore = "";
+  my @x = split(/ +/, $args);
+  foreach $a (@x) {
+    if ($a =~ m/^(--|-)lines$/) {
+      $main::opt_lines = 1;
+    } elsif ($a =~ m/^(--|-)cum$/) {
+      $main::opt_cum = 1;
+    } elsif ($a =~ m/^-(.*)/) {
+      $ignore .= (($ignore ne "") ? "|" : "" ) . $1;
+    } else {
+      $focus .= (($focus ne "") ? "|" : "" ) . $a;
+    }
+  }
+  if ($ignore ne "") {
+    print STDERR "Ignoring samples in call stacks that match '$ignore'\n";
+  }
+  return ($focus, $ignore);
+}
+
+##### Output code #####
+
+sub TempName {
+  my $fnum = shift;
+  my $ext = shift;
+  my $file = "$main::tmpfile_ps.$fnum.$ext";
+  $main::tempnames{$file} = 1;
+  return $file;
+}
+
+# Print profile data in packed binary format (64-bit) to standard out
+sub PrintProfileData {
+  my $profile = shift;
+  my $big_endian = pack("L", 1) eq pack("N", 1);
+  # print header (64-bit style)
+  # (zero) (header-size) (version) (sample-period) (zero)
+  if ($big_endian) {
+    print pack('L*', 0, 0, 0, 3, 0, 0, 0, 1, 0, 0);
+  }
+  else {
+    print pack('L*', 0, 0, 3, 0, 0, 0, 1, 0, 0, 0);
+  }
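+  # Read as 64-bit words, the header just printed is 0, 3, 0, 1, 0: that is,
+  # header-size 3, version 0, and a sample period of 1.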
+
+  foreach my $k (keys(%{$profile})) {
+    my $count = $profile->{$k};
+    my @addrs = split(/\n/, $k);
+    if ($#addrs >= 0) {
+      my $depth = $#addrs + 1;
+      # int(foo / 2**32) is the only reliable way to get rid of bottom
+      # 32 bits on both 32- and 64-bit systems.
+      if ($big_endian) {
+        print pack('L*', int($count / 2**32), $count & 0xFFFFFFFF);
+        print pack('L*', int($depth / 2**32), $depth & 0xFFFFFFFF);
+      }
+      else {
+        print pack('L*', $count & 0xFFFFFFFF, int($count / 2**32));
+        print pack('L*', $depth & 0xFFFFFFFF, int($depth / 2**32));
+      }
+
+      foreach my $full_addr (@addrs) {
+        my $addr = $full_addr;
+        $addr =~ s/0x0*//;  # strip off leading 0x, zeroes
+        if (length($addr) > 16) {
+          print STDERR "Invalid address in profile: $full_addr\n";
+          next;
+        }
+        my $low_addr = substr($addr, -8);       # get last 8 hex chars
+        my $high_addr = substr($addr, -16, 8);  # get up to 8 more hex chars
+        if ($big_endian) {
+          print pack('L*', hex('0x' . $high_addr), hex('0x' . $low_addr));
+        }
+        else {
+          print pack('L*', hex('0x' . $low_addr), hex('0x' . $high_addr));
+        }
+      }
+    }
+  }
+}
+
+# Print symbols and profile data
+sub PrintSymbolizedProfile {
+  my $symbols = shift;
+  my $profile = shift;
+  my $prog = shift;
+
+  $SYMBOL_PAGE =~ m,[^/]+$,;    # matches everything after the last slash
+  my $symbol_marker = $&;
+
+  print '--- ', $symbol_marker, "\n";
+  if (defined($prog)) {
+    print 'binary=', $prog, "\n";
+  }
+  while (my ($pc, $name) = each(%{$symbols})) {
+    my $sep = ' ';
+    print '0x', $pc;
+    # We have a list of function names, which include the inlined
+    # calls.  They are separated (and terminated) by --, which is
+    # illegal in function names.
+    for (my $j = 2; $j <= $#{$name}; $j += 3) {
+      print $sep, $name->[$j];
+      $sep = '--';
+    }
+    print "\n";
+  }
+  print '---', "\n";
+
+  $PROFILE_PAGE =~ m,[^/]+$,;    # matches everything after the last slash
+  my $profile_marker = $&;
+  print '--- ', $profile_marker, "\n";
+  if (defined($main::collected_profile)) {
+    # if used with remote fetch, simply dump the collected profile to output.
+    open(SRC, "<$main::collected_profile");
+    while (<SRC>) {
+      print $_;
+    }
+    close(SRC);
+  } else {
+    # dump a cpu-format profile to standard out
+    PrintProfileData($profile);
+  }
+}
+
+# Print text output
+sub PrintText {
+  my $symbols = shift;
+  my $flat = shift;
+  my $cumulative = shift;
+  my $line_limit = shift;
+
+  if ($main::opt_stacks && @stackTraces) {
+      foreach (sort { (split " ", $b)[1] <=> (split " ", $a)[1]; } @stackTraces) {
+	  print "$_\n" if $main::opt_debug;
+	  my ($n1, $s1, $n2, $s2, @addrs) = split;
+	  print "Leak of $s1 bytes in $n1 objects allocated from:\n";
+	  foreach my $pcstr (@addrs) {
+	      $pcstr =~ s/^0x//;
+	      my $sym;
+	      if (! defined $symbols->{$pcstr}) {
+		  $sym = "unknown";
+	      } else {
+		  $sym = "$symbols->{$pcstr}[0] $symbols->{$pcstr}[1]";
+	      }
+	      print "\t@ $pcstr $sym\n";
+	  }
+      }
+      print "\n";
+  }
+
+  my $total = TotalProfile($flat);
+
+  # Which profile to sort by?
+  my $s = $main::opt_cum ? $cumulative : $flat;
+
+  my $running_sum = 0;
+  my $lines = 0;
+  foreach my $k (sort { GetEntry($s, $b) <=> GetEntry($s, $a) || $a cmp $b }
+                 keys(%{$cumulative})) {
+    my $f = GetEntry($flat, $k);
+    my $c = GetEntry($cumulative, $k);
+    $running_sum += $f;
+
+    my $sym = $k;
+    if (exists($symbols->{$k})) {
+      $sym = $symbols->{$k}->[0] . " " . $symbols->{$k}->[1];
+      if ($main::opt_addresses) {
+        $sym = $k . " " . $sym;
+      }
+    }
+
+    if ($f != 0 || $c != 0) {
+      printf("%8s %6s %6s %8s %6s %s\n",
+             Unparse($f),
+             Percent($f, $total),
+             Percent($running_sum, $total),
+             Unparse($c),
+             Percent($c, $total),
+             $sym);
+    }
+    $lines++;
+    last if ($line_limit >= 0 && $lines >= $line_limit);
+  }
+}
+
+# Callgrind format has a compression for repeated function and file
+# names.  You show the name the first time, and just use its number
+# subsequently.  This can cut down the file to about a third or a
+# quarter of its uncompressed size.  $key and $val are the key/value
+# pair that would normally be printed by callgrind; $map is a map from
+# value to number.
+sub CompressedCGName {
+  my($key, $val, $map) = @_;
+  my $idx = $map->{$val};
+  # For very short keys, providing an index hurts rather than helps.
+  if (length($val) <= 3) {
+    return "$key=$val\n";
+  } elsif (defined($idx)) {
+    return "$key=($idx)\n";
+  } else {
+    # scalar(keys(%{$map})) gives the number of items in the map.
+    $idx = scalar(keys(%{$map})) + 1;
+    $map->{$val} = $idx;
+    return "$key=($idx) $val\n";
+  }
+}
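+
+# For example (name illustrative), two successive calls
+#   CompressedCGName("fl", "/long/path/foo.cc", \%map);
+#   CompressedCGName("fl", "/long/path/foo.cc", \%map);
+# on an initially empty %map emit "fl=(1) /long/path/foo.cc\n" and then just
+# "fl=(1)\n".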
+
+# Print the call graph in a way that's suitable for callgrind.
+sub PrintCallgrind {
+  my $calls = shift;
+  my $filename;
+  my %filename_to_index_map;
+  my %fnname_to_index_map;
+
+  if ($main::opt_interactive) {
+    $filename = shift;
+    print STDERR "Writing callgrind file to '$filename'.\n"
+  } else {
+    $filename = "&STDOUT";
+  }
+  open(CG, ">$filename");
+  printf CG ("events: Hits\n\n");
+  foreach my $call ( map { $_->[0] }
+                     sort { $a->[1] cmp $b ->[1] ||
+                            $a->[2] <=> $b->[2] }
+                     map { /([^:]+):(\d+):([^ ]+)( -> ([^:]+):(\d+):(.+))?/;
+                           [$_, $1, $2] }
+                     keys %$calls ) {
+    my $count = int($calls->{$call});
+    $call =~ /([^:]+):(\d+):([^ ]+)( -> ([^:]+):(\d+):(.+))?/;
+    my ( $caller_file, $caller_line, $caller_function,
+         $callee_file, $callee_line, $callee_function ) =
+       ( $1, $2, $3, $5, $6, $7 );
+
+    # TODO(csilvers): for better compression, collect all the
+    # caller/callee_files and functions first, before printing
+    # anything, and only compress those referenced more than once.
+    printf CG CompressedCGName("fl", $caller_file, \%filename_to_index_map);
+    printf CG CompressedCGName("fn", $caller_function, \%fnname_to_index_map);
+    if (defined $6) {
+      printf CG CompressedCGName("cfl", $callee_file, \%filename_to_index_map);
+      printf CG CompressedCGName("cfn", $callee_function, \%fnname_to_index_map);
+      printf CG ("calls=$count $callee_line\n");
+    }
+    printf CG ("$caller_line $count\n\n");
+  }
+}
+
+# Print disassembly for all routines that match $main::opt_disasm
+sub PrintDisassembly {
+  my $libs = shift;
+  my $flat = shift;
+  my $cumulative = shift;
+  my $disasm_opts = shift;
+
+  my $total = TotalProfile($flat);
+
+  foreach my $lib (@{$libs}) {
+    my $symbol_table = GetProcedureBoundaries($lib->[0], $disasm_opts);
+    my $offset = AddressSub($lib->[1], $lib->[3]);
+    foreach my $routine (sort ByName keys(%{$symbol_table})) {
+      my $start_addr = $symbol_table->{$routine}->[0];
+      my $end_addr = $symbol_table->{$routine}->[1];
+      # See if there are any samples in this routine
+      my $length = hex(AddressSub($end_addr, $start_addr));
+      my $addr = AddressAdd($start_addr, $offset);
+      for (my $i = 0; $i < $length; $i++) {
+        if (defined($cumulative->{$addr})) {
+          PrintDisassembledFunction($lib->[0], $offset,
+                                    $routine, $flat, $cumulative,
+                                    $start_addr, $end_addr, $total);
+          last;
+        }
+        $addr = AddressInc($addr);
+      }
+    }
+  }
+}
+
+# Return reference to array of tuples of the form:
+#       [start_address, filename, linenumber, instruction, limit_address]
+# E.g.,
+#       ["0x806c43d", "/foo/bar.cc", 131, "ret", "0x806c440"]
+sub Disassemble {
+  my $prog = shift;
+  my $offset = shift;
+  my $start_addr = shift;
+  my $end_addr = shift;
+
+  my $objdump = $obj_tool_map{"objdump"};
+  my $cmd = ShellEscape($objdump, "-C", "-d", "-l", "--no-show-raw-insn",
+                        "--start-address=0x$start_addr",
+                        "--stop-address=0x$end_addr", $prog);
+  open(OBJDUMP, "$cmd |") || error("$cmd: $!\n");
+  my @result = ();
+  my $filename = "";
+  my $linenumber = -1;
+  my $last = ["", "", "", ""];
+  while (<OBJDUMP>) {
+    s/\r//g;         # turn windows-looking lines into unix-looking lines
+    chop;
+    if (m|\s*([^:\s]+):(\d+)\s*$|) {
+      # Location line of the form:
+      #   <filename>:<linenumber>
+      $filename = $1;
+      $linenumber = $2;
+    } elsif (m/^ +([0-9a-f]+):\s*(.*)/) {
+      # Disassembly line -- zero-extend address to full length
+      my $addr = HexExtend($1);
+      my $k = AddressAdd($addr, $offset);
+      $last->[4] = $k;   # Store ending address for previous instruction
+      $last = [$k, $filename, $linenumber, $2, $end_addr];
+      push(@result, $last);
+    }
+  }
+  close(OBJDUMP);
+  return @result;
+}
+
+# The input file should contain lines of the form /proc/maps-like
+# output (same format as expected from the profiles) or that looks
+# like hex addresses (like "0xDEADBEEF").  We will parse all
+# /proc/maps output, and for all the hex addresses, we will output
+# "short" symbol names, one per line, in the same order as the input.
+sub PrintSymbols {
+  my $maps_and_symbols_file = shift;
+
+  # ParseLibraries expects pcs to be in a set.  Fine by us...
+  my @pclist = ();   # pcs in input order
+  my $pcs = {};
+  my $map = "";
+  foreach my $line (<$maps_and_symbols_file>) {
+    $line =~ s/\r//g;    # turn windows-looking lines into unix-looking lines
+    if ($line =~ /\b(0x[0-9a-f]+)\b/i) {
+      push(@pclist, HexExtend($1));
+      $pcs->{$pclist[-1]} = 1;
+    } else {
+      $map .= $line;
+    }
+  }
+
+  my $libs = ParseLibraries($main::prog, $map, $pcs);
+  my $symbols = ExtractSymbols($libs, $pcs);
+
+  foreach my $pc (@pclist) {
+    # ->[0] is the shortname, ->[2] is the full name
+    print(($symbols->{$pc}->[0] || "??") . "\n");
+  }
+}
+
+
+# For sorting functions by name
+sub ByName {
+  return ShortFunctionName($a) cmp ShortFunctionName($b);
+}
+
+# Print source-listing for all routines that match $list_opts
+sub PrintListing {
+  my $total = shift;
+  my $libs = shift;
+  my $flat = shift;
+  my $cumulative = shift;
+  my $list_opts = shift;
+  my $html = shift;
+
+  my $output = \*STDOUT;
+  my $fname = "";
+
+  if ($html) {
+    # Arrange to write the output to a temporary file
+    $fname = TempName($main::next_tmpfile, "html");
+    $main::next_tmpfile++;
+    if (!open(TEMP, ">$fname")) {
+      print STDERR "$fname: $!\n";
+      return;
+    }
+    $output = \*TEMP;
+    print $output HtmlListingHeader();
+    printf $output ("<div class=\"legend\">%s<br>Total: %s %s</div>\n",
+                    $main::prog, Unparse($total), Units());
+  }
+
+  my $listed = 0;
+  foreach my $lib (@{$libs}) {
+    my $symbol_table = GetProcedureBoundaries($lib->[0], $list_opts);
+    my $offset = AddressSub($lib->[1], $lib->[3]);
+    foreach my $routine (sort ByName keys(%{$symbol_table})) {
+      # Print if there are any samples in this routine
+      my $start_addr = $symbol_table->{$routine}->[0];
+      my $end_addr = $symbol_table->{$routine}->[1];
+      my $length = hex(AddressSub($end_addr, $start_addr));
+      my $addr = AddressAdd($start_addr, $offset);
+      for (my $i = 0; $i < $length; $i++) {
+        if (defined($cumulative->{$addr})) {
+          $listed += PrintSource(
+            $lib->[0], $offset,
+            $routine, $flat, $cumulative,
+            $start_addr, $end_addr,
+            $html,
+            $output);
+          last;
+        }
+        $addr = AddressInc($addr);
+      }
+    }
+  }
+
+  if ($html) {
+    if ($listed > 0) {
+      print $output HtmlListingFooter();
+      close($output);
+      RunWeb($fname);
+    } else {
+      close($output);
+      unlink($fname);
+    }
+  }
+}
+
+sub HtmlListingHeader {
+  return <<'EOF';
+<!DOCTYPE html>
+<html>
+<head>
+<title>Pprof listing</title>
+<style type="text/css">
+body {
+  font-family: sans-serif;
+}
+h1 {
+  font-size: 1.5em;
+  margin-bottom: 4px;
+}
+.legend {
+  font-size: 1.25em;
+}
+.line {
+  color: #aaaaaa;
+}
+.nop {
+  color: #aaaaaa;
+}
+.unimportant {
+  color: #cccccc;
+}
+.disasmloc {
+  color: #000000;
+}
+.deadsrc {
+  cursor: pointer;
+}
+.deadsrc:hover {
+  background-color: #eeeeee;
+}
+.livesrc {
+  color: #0000ff;
+  cursor: pointer;
+}
+.livesrc:hover {
+  background-color: #eeeeee;
+}
+.asm {
+  color: #008800;
+  display: none;
+}
+</style>
+<script type="text/javascript">
+function pprof_toggle_asm(e) {
+  var target;
+  if (!e) e = window.event;
+  if (e.target) target = e.target;
+  else if (e.srcElement) target = e.srcElement;
+
+  if (target) {
+    var asm = target.nextSibling;
+    if (asm && asm.className == "asm") {
+      asm.style.display = (asm.style.display == "block" ? "" : "block");
+      e.preventDefault();
+      return false;
+    }
+  }
+}
+</script>
+</head>
+<body>
+EOF
+}
+
+sub HtmlListingFooter {
+  return <<'EOF';
+</body>
+</html>
+EOF
+}
+
+sub HtmlEscape {
+  my $text = shift;
+  $text =~ s/&/&amp;/g;
+  $text =~ s/</&lt;/g;
+  $text =~ s/>/&gt;/g;
+  return $text;
+}
+
+# Returns the indentation of the line, if it has any non-whitespace
+# characters.  Otherwise, returns -1.
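+# For example, Indentation("    x = 1;") returns 4, while
+# Indentation("   ") returns -1 because the line is all whitespace.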
+sub Indentation {
+  my $line = shift;
+  if ($line =~ m/^(\s*)\S/) {
+    return length($1);
+  } else {
+    return -1;
+  }
+}
+
+# If the symbol table contains inlining info, Disassemble() may tag an
+# instruction with a location inside an inlined function.  But for
+# source listings, we prefer to use the location in the function we
+# are listing.  So use MapToSymbols() to fetch full location
+# information for each instruction and then pick out the first
+# location from a location list (location list contains callers before
+# callees in case of inlining).
+#
+# After this routine has run, each entry in $instructions contains:
+#   [0] start address
+#   [1] filename for function we are listing
+#   [2] line number for function we are listing
+#   [3] disassembly
+#   [4] limit address
+#   [5] most specific filename (may be different from [1] due to inlining)
+#   [6] most specific line number (may be different from [2] due to inlining)
+sub GetTopLevelLineNumbers {
+  my ($lib, $offset, $instructions) = @_;
+  my $pcs = [];
+  for (my $i = 0; $i <= $#{$instructions}; $i++) {
+    push(@{$pcs}, $instructions->[$i]->[0]);
+  }
+  my $symbols = {};
+  MapToSymbols($lib, $offset, $pcs, $symbols);
+  for (my $i = 0; $i <= $#{$instructions}; $i++) {
+    my $e = $instructions->[$i];
+    push(@{$e}, $e->[1]);
+    push(@{$e}, $e->[2]);
+    my $addr = $e->[0];
+    my $sym = $symbols->{$addr};
+    if (defined($sym)) {
+      if ($#{$sym} >= 2 && $sym->[1] =~ m/^(.*):(\d+)$/) {
+        $e->[1] = $1;  # File name
+        $e->[2] = $2;  # Line number
+      }
+    }
+  }
+}
+
+# Print source-listing for one routine
+sub PrintSource {
+  my $prog = shift;
+  my $offset = shift;
+  my $routine = shift;
+  my $flat = shift;
+  my $cumulative = shift;
+  my $start_addr = shift;
+  my $end_addr = shift;
+  my $html = shift;
+  my $output = shift;
+
+  # Disassemble all instructions (just to get line numbers)
+  my @instructions = Disassemble($prog, $offset, $start_addr, $end_addr);
+  GetTopLevelLineNumbers($prog, $offset, \@instructions);
+
+  # Hack 1: assume that the first source file encountered in the
+  # disassembly contains the routine
+  my $filename = undef;
+  for (my $i = 0; $i <= $#instructions; $i++) {
+    if ($instructions[$i]->[2] >= 0) {
+      $filename = $instructions[$i]->[1];
+      last;
+    }
+  }
+  if (!defined($filename)) {
+    print STDERR "no filename found in $routine\n";
+    return 0;
+  }
+
+  # Hack 2: assume that the largest line number from $filename is the
+  # end of the procedure.  This is typically safe since if P1 contains
+  # an inlined call to P2, then P2 usually occurs earlier in the
+  # source file.  If this does not work, we might have to compute a
+  # density profile or just print all regions we find.
+  my $lastline = 0;
+  for (my $i = 0; $i <= $#instructions; $i++) {
+    my $f = $instructions[$i]->[1];
+    my $l = $instructions[$i]->[2];
+    if (($f eq $filename) && ($l > $lastline)) {
+      $lastline = $l;
+    }
+  }
+
+  # Hack 3: assume the first source location from "filename" is the start of
+  # the source code.
+  my $firstline = 1;
+  for (my $i = 0; $i <= $#instructions; $i++) {
+    if ($instructions[$i]->[1] eq $filename) {
+      $firstline = $instructions[$i]->[2];
+      last;
+    }
+  }
+
+  # Hack 4: Extend last line forward until its indentation is less than
+  # the indentation we saw on $firstline
+  my $oldlastline = $lastline;
+  {
+    if (!open(FILE, "<$filename")) {
+      print STDERR "$filename: $!\n";
+      return 0;
+    }
+    my $l = 0;
+    my $first_indentation = -1;
+    while (<FILE>) {
+      s/\r//g;         # turn windows-looking lines into unix-looking lines
+      $l++;
+      my $indent = Indentation($_);
+      if ($l >= $firstline) {
+        if ($first_indentation < 0 && $indent >= 0) {
+          $first_indentation = $indent;
+          last if ($first_indentation == 0);
+        }
+      }
+      if ($l >= $lastline && $indent >= 0) {
+        if ($indent >= $first_indentation) {
+          $lastline = $l+1;
+        } else {
+          last;
+        }
+      }
+    }
+    close(FILE);
+  }
+
+  # Assign all samples to the range $firstline,$lastline.
+  # Hack 5: If an instruction does not occur in the range, its samples
+  # are moved to the next instruction that occurs in the range.
+  my $samples1 = {};        # Map from line number to flat count
+  my $samples2 = {};        # Map from line number to cumulative count
+  my $running1 = 0;         # Unassigned flat counts
+  my $running2 = 0;         # Unassigned cumulative counts
+  my $total1 = 0;           # Total flat counts
+  my $total2 = 0;           # Total cumulative counts
+  my %disasm = ();          # Map from line number to disassembly
+  my $running_disasm = "";  # Unassigned disassembly
+  my $skip_marker = "---\n";
+  if ($html) {
+    $skip_marker = "";
+    for (my $l = $firstline; $l <= $lastline; $l++) {
+      $disasm{$l} = "";
+    }
+  }
+  my $last_dis_filename = '';
+  my $last_dis_linenum = -1;
+  my $last_touched_line = -1;  # To detect gaps in disassembly for a line
+  foreach my $e (@instructions) {
+    # Add up counts for all addresses that fall inside this instruction
+    my $c1 = 0;
+    my $c2 = 0;
+    for (my $a = $e->[0]; $a lt $e->[4]; $a = AddressInc($a)) {
+      $c1 += GetEntry($flat, $a);
+      $c2 += GetEntry($cumulative, $a);
+    }
+
+    if ($html) {
+      my $dis = sprintf("      %6s %6s \t\t%8s: %s ",
+                        HtmlPrintNumber($c1),
+                        HtmlPrintNumber($c2),
+                        UnparseAddress($offset, $e->[0]),
+                        CleanDisassembly($e->[3]));
+      
+      # Append the most specific source line associated with this instruction
+      if (length($dis) < 80) { $dis .= (' ' x (80 - length($dis))) };
+      $dis = HtmlEscape($dis);
+      my $f = $e->[5];
+      my $l = $e->[6];
+      if ($f ne $last_dis_filename) {
+        $dis .= sprintf("<span class=disasmloc>%s:%d</span>", 
+                        HtmlEscape(CleanFileName($f)), $l);
+      } elsif ($l ne $last_dis_linenum) {
+        # De-emphasize the unchanged file name portion
+        $dis .= sprintf("<span class=unimportant>%s</span>" .
+                        "<span class=disasmloc>:%d</span>", 
+                        HtmlEscape(CleanFileName($f)), $l);
+      } else {
+        # De-emphasize the entire location
+        $dis .= sprintf("<span class=unimportant>%s:%d</span>", 
+                        HtmlEscape(CleanFileName($f)), $l);
+      }
+      $last_dis_filename = $f;
+      $last_dis_linenum = $l;
+      $running_disasm .= $dis;
+      $running_disasm .= "\n";
+    }
+
+    $running1 += $c1;
+    $running2 += $c2;
+    $total1 += $c1;
+    $total2 += $c2;
+    my $file = $e->[1];
+    my $line = $e->[2];
+    if (($file eq $filename) &&
+        ($line >= $firstline) &&
+        ($line <= $lastline)) {
+      # Assign all accumulated samples to this line
+      AddEntry($samples1, $line, $running1);
+      AddEntry($samples2, $line, $running2);
+      $running1 = 0;
+      $running2 = 0;
+      if ($html) {
+        if ($line != $last_touched_line && $disasm{$line} ne '') {
+          $disasm{$line} .= "\n";
+        }
+        $disasm{$line} .= $running_disasm;
+        $running_disasm = '';
+        $last_touched_line = $line;
+      }
+    }
+  }
+
+  # Assign any leftover samples to $lastline
+  AddEntry($samples1, $lastline, $running1);
+  AddEntry($samples2, $lastline, $running2);
+  if ($html) {
+    if ($lastline != $last_touched_line && $disasm{$lastline} ne '') {
+      $disasm{$lastline} .= "\n";
+    }
+    $disasm{$lastline} .= $running_disasm;
+  }
+
+  if ($html) {
+    printf $output (
+      "<h1>%s</h1>%s\n<pre onClick=\"pprof_toggle_asm()\">\n" .
+      "Total:%6s %6s (flat / cumulative %s)\n",
+      HtmlEscape(ShortFunctionName($routine)),
+      HtmlEscape(CleanFileName($filename)),
+      Unparse($total1),
+      Unparse($total2),
+      Units());
+  } else {
+    printf $output (
+      "ROUTINE ====================== %s in %s\n" .
+      "%6s %6s Total %s (flat / cumulative)\n",
+      ShortFunctionName($routine),
+      CleanFileName($filename),
+      Unparse($total1),
+      Unparse($total2),
+      Units());
+  }
+  if (!open(FILE, "<$filename")) {
+    print STDERR "$filename: $!\n";
+    return 0;
+  }
+  my $l = 0;
+  while (<FILE>) {
+    s/\r//g;         # turn windows-looking lines into unix-looking lines
+    $l++;
+    if ($l >= $firstline - 5 &&
+        (($l <= $oldlastline + 5) || ($l <= $lastline))) {
+      chop;
+      my $text = $_;
+      if ($l == $firstline) { print $output $skip_marker; }
+      my $n1 = GetEntry($samples1, $l);
+      my $n2 = GetEntry($samples2, $l);
+      if ($html) {
+        # Emit a span that has one of the following classes:
+        #    livesrc -- has samples
+        #    deadsrc -- has disassembly, but with no samples
+        #    nop     -- has no matching disassembly
+        # Also emit an optional span containing disassembly.
+        my $dis = $disasm{$l};
+        my $asm = "";
+        if (defined($dis) && $dis ne '') {
+          $asm = "<span class=\"asm\">" . $dis . "</span>";
+        }
+        my $source_class = (($n1 + $n2 > 0) 
+                            ? "livesrc" 
+                            : (($asm ne "") ? "deadsrc" : "nop"));
+        printf $output (
+          "<span class=\"line\">%5d</span> " .
+          "<span class=\"%s\">%6s %6s %s</span>%s\n",
+          $l, $source_class,
+          HtmlPrintNumber($n1),
+          HtmlPrintNumber($n2),
+          HtmlEscape($text),
+          $asm);
+      } else {
+        printf $output (
+          "%6s %6s %4d: %s\n",
+          UnparseAlt($n1),
+          UnparseAlt($n2),
+          $l,
+          $text);
+      }
+      if ($l == $lastline)  { print $output $skip_marker; }
+    };
+  }
+  close(FILE);
+  if ($html) {
+    print $output "</pre>\n";
+  }
+  return 1;
+}
+
+# Return the source line for the specified file/linenumber.
+# Returns undef if not found.
+sub SourceLine {
+  my $file = shift;
+  my $line = shift;
+
+  # Look in cache
+  if (!defined($main::source_cache{$file})) {
+    if (100 < scalar keys(%main::source_cache)) {
+      # Clear the cache when it gets too big
+      %main::source_cache = ();
+    }
+
+    # Read all lines from the file
+    if (!open(FILE, "<$file")) {
+      print STDERR "$file: $!\n";
+      $main::source_cache{$file} = [];  # Cache the negative result
+      return undef;
+    }
+    my $lines = [];
+    push(@{$lines}, "");        # So we can use 1-based line numbers as indices
+    while (<FILE>) {
+      push(@{$lines}, $_);
+    }
+    close(FILE);
+
+    # Save the lines in the cache
+    $main::source_cache{$file} = $lines;
+  }
+
+  my $lines = $main::source_cache{$file};
+  if (($line < 0) || ($line > $#{$lines})) {
+    return undef;
+  } else {
+    return $lines->[$line];
+  }
+}
+
+# Print disassembly for one routine with interspersed source if available
+sub PrintDisassembledFunction {
+  my $prog = shift;
+  my $offset = shift;
+  my $routine = shift;
+  my $flat = shift;
+  my $cumulative = shift;
+  my $start_addr = shift;
+  my $end_addr = shift;
+  my $total = shift;
+
+  # Disassemble all instructions
+  my @instructions = Disassemble($prog, $offset, $start_addr, $end_addr);
+
+  # Make array of counts per instruction
+  my @flat_count = ();
+  my @cum_count = ();
+  my $flat_total = 0;
+  my $cum_total = 0;
+  foreach my $e (@instructions) {
+    # Add up counts for all addresses that fall inside this instruction
+    my $c1 = 0;
+    my $c2 = 0;
+    for (my $a = $e->[0]; $a lt $e->[4]; $a = AddressInc($a)) {
+      $c1 += GetEntry($flat, $a);
+      $c2 += GetEntry($cumulative, $a);
+    }
+    push(@flat_count, $c1);
+    push(@cum_count, $c2);
+    $flat_total += $c1;
+    $cum_total += $c2;
+  }
+
+  # Print header with total counts
+  printf("ROUTINE ====================== %s\n" .
+         "%6s %6s %s (flat, cumulative) %.1f%% of total\n",
+         ShortFunctionName($routine),
+         Unparse($flat_total),
+         Unparse($cum_total),
+         Units(),
+         ($cum_total * 100.0) / $total);
+
+  # Process instructions in order
+  my $current_file = "";
+  for (my $i = 0; $i <= $#instructions; ) {
+    my $e = $instructions[$i];
+
+    # Print the new file name whenever we switch files
+    if ($e->[1] ne $current_file) {
+      $current_file = $e->[1];
+      my $fname = $current_file;
+      $fname =~ s|^\./||;   # Trim leading "./"
+
+      # Shorten long file names
+      if (length($fname) >= 58) {
+        $fname = "..." . substr($fname, -55);
+      }
+      printf("-------------------- %s\n", $fname);
+    }
+
+    # TODO: Compute range of lines to print together to deal with
+    # small reorderings.
+    my $first_line = $e->[2];
+    my $last_line = $first_line;
+    my %flat_sum = ();
+    my %cum_sum = ();
+    for (my $l = $first_line; $l <= $last_line; $l++) {
+      $flat_sum{$l} = 0;
+      $cum_sum{$l} = 0;
+    }
+
+    # Find run of instructions for this range of source lines
+    my $first_inst = $i;
+    while (($i <= $#instructions) &&
+           ($instructions[$i]->[2] >= $first_line) &&
+           ($instructions[$i]->[2] <= $last_line)) {
+      $e = $instructions[$i];
+      $flat_sum{$e->[2]} += $flat_count[$i];
+      $cum_sum{$e->[2]} += $cum_count[$i];
+      $i++;
+    }
+    my $last_inst = $i - 1;
+
+    # Print source lines
+    for (my $l = $first_line; $l <= $last_line; $l++) {
+      my $line = SourceLine($current_file, $l);
+      if (!defined($line)) {
+        $line = "?\n";
+        next;
+      } else {
+        $line =~ s/^\s+//;
+      }
+      printf("%6s %6s %5d: %s",
+             UnparseAlt($flat_sum{$l}),
+             UnparseAlt($cum_sum{$l}),
+             $l,
+             $line);
+    }
+
+    # Print disassembly
+    for (my $x = $first_inst; $x <= $last_inst; $x++) {
+      my $e = $instructions[$x];
+      printf("%6s %6s    %8s: %6s\n",
+             UnparseAlt($flat_count[$x]),
+             UnparseAlt($cum_count[$x]),
+             UnparseAddress($offset, $e->[0]),
+             CleanDisassembly($e->[3]));
+    }
+  }
+}
+
+# Print DOT graph
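+#
+# The emitted graph looks roughly like this (names and numbers are
+# illustrative only; they depend on the profile being rendered):
+#
+#   digraph "myprog; 200 samples" {
+#   node [width=0.375,height=0.25];
+#   Legend [shape=box,fontsize=24,shape=plaintext,label="..."];
+#   N1 [label="main\n5 (2.5%)\rof 200 (100.0%)\r",shape=box,fontsize=15.9];
+#   N1 -> N2 [label=15, weight=6, style="setlinewidth(0.450000)"];
+#   }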
+sub PrintDot {
+  my $prog = shift;
+  my $symbols = shift;
+  my $raw = shift;
+  my $flat = shift;
+  my $cumulative = shift;
+  my $overall_total = shift;
+
+  # Get total
+  my $local_total = TotalProfile($flat);
+  my $nodelimit = int($main::opt_nodefraction * $local_total);
+  my $edgelimit = int($main::opt_edgefraction * $local_total);
+  my $nodecount = $main::opt_nodecount;
+
+  # Find nodes to include
+  my @list = (sort { abs(GetEntry($cumulative, $b)) <=>
+                     abs(GetEntry($cumulative, $a))
+                     || $a cmp $b }
+              keys(%{$cumulative}));
+  my $last = $nodecount - 1;
+  if ($last > $#list) {
+    $last = $#list;
+  }
+  while (($last >= 0) &&
+         (abs(GetEntry($cumulative, $list[$last])) <= $nodelimit)) {
+    $last--;
+  }
+  if ($last < 0) {
+    print STDERR "No nodes to print\n";
+    return 0;
+  }
+
+  if ($nodelimit > 0 || $edgelimit > 0) {
+    printf STDERR ("Dropping nodes with <= %s %s; edges with <= %s abs(%s)\n",
+                   Unparse($nodelimit), Units(),
+                   Unparse($edgelimit), Units());
+  }
+
+  # Open DOT output file
+  my $output;
+  my $escaped_dot = ShellEscape(@DOT);
+  my $escaped_ps2pdf = ShellEscape(@PS2PDF);
+  if ($main::opt_gv) {
+    my $escaped_outfile = ShellEscape(TempName($main::next_tmpfile, "ps"));
+    $output = "| $escaped_dot -Tps2 >$escaped_outfile";
+  } elsif ($main::opt_evince) {
+    my $escaped_outfile = ShellEscape(TempName($main::next_tmpfile, "pdf"));
+    $output = "| $escaped_dot -Tps2 | $escaped_ps2pdf - $escaped_outfile";
+  } elsif ($main::opt_ps) {
+    $output = "| $escaped_dot -Tps2";
+  } elsif ($main::opt_pdf) {
+    $output = "| $escaped_dot -Tps2 | $escaped_ps2pdf - -";
+  } elsif ($main::opt_web || $main::opt_svg) {
+    # We need to post-process the SVG, so write to a temporary file always.
+    my $escaped_outfile = ShellEscape(TempName($main::next_tmpfile, "svg"));
+    $output = "| $escaped_dot -Tsvg >$escaped_outfile";
+  } elsif ($main::opt_gif) {
+    $output = "| $escaped_dot -Tgif";
+  } else {
+    $output = ">&STDOUT";
+  }
+  open(DOT, $output) || error("$output: $!\n");
+
+  # Title
+  printf DOT ("digraph \"%s; %s %s\" {\n",
+              $prog,
+              Unparse($overall_total),
+              Units());
+  if ($main::opt_pdf) {
+    # The output is more printable if we set the page size for dot.
+    printf DOT ("size=\"8,11\"\n");
+  }
+  printf DOT ("node [width=0.375,height=0.25];\n");
+
+  # Print legend
+  printf DOT ("Legend [shape=box,fontsize=24,shape=plaintext," .
+              "label=\"%s\\l%s\\l%s\\l%s\\l%s\\l\"];\n",
+              $prog,
+              sprintf("Total %s: %s", Units(), Unparse($overall_total)),
+              sprintf("Focusing on: %s", Unparse($local_total)),
+              sprintf("Dropped nodes with <= %s abs(%s)",
+                      Unparse($nodelimit), Units()),
+              sprintf("Dropped edges with <= %s %s",
+                      Unparse($edgelimit), Units())
+              );
+
+  # Print nodes
+  my %node = ();
+  my $nextnode = 1;
+  foreach my $a (@list[0..$last]) {
+    # Pick font size
+    my $f = GetEntry($flat, $a);
+    my $c = GetEntry($cumulative, $a);
+
+    my $fs = 8;
+    if ($local_total > 0) {
+      $fs = 8 + (50.0 * sqrt(abs($f * 1.0 / $local_total)));
+    }
+
+    $node{$a} = $nextnode++;
+    my $sym = $a;
+    $sym =~ s/\s+/\\n/g;
+    $sym =~ s/::/\\n/g;
+
+    # Extra cumulative info to print for non-leaves
+    my $extra = "";
+    if ($f != $c) {
+      $extra = sprintf("\\rof %s (%s)",
+                       Unparse($c),
+                       Percent($c, $local_total));
+    }
+    my $style = "";
+    if ($main::opt_heapcheck) {
+      if ($f > 0) {
+        # make leak-causing nodes more visible (add a background)
+        $style = ",style=filled,fillcolor=gray"
+      } elsif ($f < 0) {
+        # make anti-leak-causing nodes (which almost never occur)
+        # stand out as well (triple border)
+        $style = ",peripheries=3"
+      }
+    }
+
+    printf DOT ("N%d [label=\"%s\\n%s (%s)%s\\r" .
+                "\",shape=box,fontsize=%.1f%s];\n",
+                $node{$a},
+                $sym,
+                Unparse($f),
+                Percent($f, $local_total),
+                $extra,
+                $fs,
+                $style,
+               );
+  }
+
+  # Get edges and counts per edge
+  my %edge = ();
+  my $n;
+  my $fullname_to_shortname_map = {};
+  FillFullnameToShortnameMap($symbols, $fullname_to_shortname_map);
+  foreach my $k (keys(%{$raw})) {
+    # TODO: omit low %age edges
+    $n = $raw->{$k};
+    my @translated = TranslateStack($symbols, $fullname_to_shortname_map, $k);
+    for (my $i = 1; $i <= $#translated; $i++) {
+      my $src = $translated[$i];
+      my $dst = $translated[$i-1];
+      #next if ($src eq $dst);  # Avoid self-edges?
+      if (exists($node{$src}) && exists($node{$dst})) {
+        my $edge_label = "$src\001$dst";
+        if (!exists($edge{$edge_label})) {
+          $edge{$edge_label} = 0;
+        }
+        $edge{$edge_label} += $n;
+      }
+    }
+  }
+
+  # Print edges (process in order of decreasing counts)
+  my %indegree = ();   # Number of incoming edges added per node so far
+  my %outdegree = ();  # Number of outgoing edges added per node so far
+  foreach my $e (sort { $edge{$b} <=> $edge{$a} } keys(%edge)) {
+    my @x = split(/\001/, $e);
+    $n = $edge{$e};
+
+    # Initialize degree of kept incoming and outgoing edges if necessary
+    my $src = $x[0];
+    my $dst = $x[1];
+    if (!exists($outdegree{$src})) { $outdegree{$src} = 0; }
+    if (!exists($indegree{$dst})) { $indegree{$dst} = 0; }
+
+    my $keep;
+    if ($indegree{$dst} == 0) {
+      # Keep edge if needed for reachability
+      $keep = 1;
+    } elsif (abs($n) <= $edgelimit) {
+      # Drop if we are below --edgefraction
+      $keep = 0;
+    } elsif ($outdegree{$src} >= $main::opt_maxdegree ||
+             $indegree{$dst} >= $main::opt_maxdegree) {
+      # Keep limited number of in/out edges per node
+      $keep = 0;
+    } else {
+      $keep = 1;
+    }
+
+    if ($keep) {
+      $outdegree{$src}++;
+      $indegree{$dst}++;
+
+      # Compute line width based on edge count
+      my $fraction = abs($local_total ? (3 * ($n / $local_total)) : 0);
+      if ($fraction > 1) { $fraction = 1; }
+      my $w = $fraction * 2;
+      if ($w < 1 && ($main::opt_web || $main::opt_svg)) {
+        # SVG output treats line widths < 1 poorly.
+        $w = 1;
+      }
+
+      # Dot sometimes segfaults if given edge weights that are too large, so
+      # we cap the weights at a large value
+      my $edgeweight = abs($n) ** 0.7;
+      if ($edgeweight > 100000) { $edgeweight = 100000; }
+      $edgeweight = int($edgeweight);
+
+      my $style = sprintf("setlinewidth(%f)", $w);
+      if ($x[1] =~ m/\(inline\)/) {
+        $style .= ",dashed";
+      }
+
+      # Use a slightly squashed function of the edge count as the weight
+      printf DOT ("N%s -> N%s [label=%s, weight=%d, style=\"%s\"];\n",
+                  $node{$x[0]},
+                  $node{$x[1]},
+                  Unparse($n),
+                  $edgeweight,
+                  $style);
+    }
+  }
+
+  print DOT ("}\n");
+  close(DOT);
+
+  if ($main::opt_web || $main::opt_svg) {
+    # Rewrite SVG to be more usable inside web browser.
+    RewriteSvg(TempName($main::next_tmpfile, "svg"));
+  }
+
+  return 1;
+}
+
+sub RewriteSvg {
+  my $svgfile = shift;
+
+  open(SVG, $svgfile) || die "open temp svg: $!";
+  my @svg = <SVG>;
+  close(SVG);
+  unlink $svgfile;
+  my $svg = join('', @svg);
+
+  # Dot's SVG output is
+  #
+  #    <svg width="___" height="___"
+  #     viewBox="___" xmlns=...>
+  #    <g id="graph0" transform="...">
+  #    ...
+  #    </g>
+  #    </svg>
+  #
+  # Change it to
+  #
+  #    <svg width="100%" height="100%"
+  #     xmlns=...>
+  #    $svg_javascript
+  #    <g id="viewport" transform="translate(0,0)">
+  #    <g id="graph0" transform="...">
+  #    ...
+  #    </g>
+  #    </g>
+  #    </svg>
+
+  # Fix width, height; drop viewBox.
+  $svg =~ s/(?s)<svg width="[^"]+" height="[^"]+"(.*?)viewBox="[^"]+"/<svg width="100%" height="100%"$1/;
+
+  # Insert script, viewport <g> above first <g>
+  my $svg_javascript = SvgJavascript();
+  my $viewport = "<g id=\"viewport\" transform=\"translate(0,0)\">\n";
+  $svg =~ s/<g id="graph\d"/$svg_javascript$viewport$&/;
+
+  # Insert final </g> above </svg>.
+  $svg =~ s/(.*)(<\/svg>)/$1<\/g>$2/;
+  $svg =~ s/<g id="graph\d"(.*?)/<g id="viewport"$1/;
+
+  if ($main::opt_svg) {
+    # --svg: write to standard output.
+    print $svg;
+  } else {
+    # Write back to temporary file.
+    open(SVG, ">$svgfile") || die "open $svgfile: $!";
+    print SVG $svg;
+    close(SVG);
+  }
+}
+
+sub SvgJavascript {
+  return <<'EOF';
+<script type="text/ecmascript"><![CDATA[
+// SVGPan
+// http://www.cyberz.org/blog/2009/12/08/svgpan-a-javascript-svg-panzoomdrag-library/
+// Local modification: if(true || ...) below to force panning, never moving.
+
+/**
+ *  SVGPan library 1.2
+ * ====================
+ *
+ * Given a unique existing element with id "viewport", including the
+ * library into any SVG adds the following capabilities:
+ *
+ *  - Mouse panning
+ *  - Mouse zooming (using the wheel)
+ *  - Object dragging
+ *
+ * Known issues:
+ *
+ *  - Zooming (while panning) on Safari still has some issues
+ *
+ * Releases:
+ *
+ * 1.2, Sat Mar 20 08:42:50 GMT 2010, Zeng Xiaohui
+ *	Fixed a bug with browser mouse handler interaction
+ *
+ * 1.1, Wed Feb  3 17:39:33 GMT 2010, Zeng Xiaohui
+ *	Updated the zoom code to support the mouse wheel on Safari/Chrome
+ *
+ * 1.0, Andrea Leofreddi
+ *	First release
+ *
+ * This code is licensed under the following BSD license:
+ *
+ * Copyright 2009-2010 Andrea Leofreddi <a.leofreddi@itcharm.com>. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without modification, are
+ * permitted provided that the following conditions are met:
+ *
+ *    1. Redistributions of source code must retain the above copyright notice, this list of
+ *       conditions and the following disclaimer.
+ *
+ *    2. Redistributions in binary form must reproduce the above copyright notice, this list
+ *       of conditions and the following disclaimer in the documentation and/or other materials
+ *       provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY Andrea Leofreddi ``AS IS'' AND ANY EXPRESS OR IMPLIED
+ * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
+ * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL Andrea Leofreddi OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
+ * ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
+ * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * The views and conclusions contained in the software and documentation are those of the
+ * authors and should not be interpreted as representing official policies, either expressed
+ * or implied, of Andrea Leofreddi.
+ */
+
+var root = document.documentElement;
+
+var state = 'none', stateTarget, stateOrigin, stateTf;
+
+setupHandlers(root);
+
+/**
+ * Register handlers
+ */
+function setupHandlers(root){
+	setAttributes(root, {
+		"onmouseup" : "add(evt)",
+		"onmousedown" : "handleMouseDown(evt)",
+		"onmousemove" : "handleMouseMove(evt)",
+		"onmouseup" : "handleMouseUp(evt)",
+		//"onmouseout" : "handleMouseUp(evt)", // Decomment this to stop the pan functionality when dragging out of the SVG element
+	});
+
+	if(navigator.userAgent.toLowerCase().indexOf('webkit') >= 0)
+		window.addEventListener('mousewheel', handleMouseWheel, false); // Chrome/Safari
+	else
+		window.addEventListener('DOMMouseScroll', handleMouseWheel, false); // Others
+
+	var g = svgDoc.getElementById("svg");
+	g.width = "100%";
+	g.height = "100%";
+}
+
+/**
+ * Instantiate an SVGPoint object with the given event coordinates.
+ */
+function getEventPoint(evt) {
+	var p = root.createSVGPoint();
+
+	p.x = evt.clientX;
+	p.y = evt.clientY;
+
+	return p;
+}
+
+/**
+ * Sets the current transform matrix of an element.
+ */
+function setCTM(element, matrix) {
+	var s = "matrix(" + matrix.a + "," + matrix.b + "," + matrix.c + "," + matrix.d + "," + matrix.e + "," + matrix.f + ")";
+
+	element.setAttribute("transform", s);
+}
+
+/**
+ * Dumps a matrix to a string (useful for debug).
+ */
+function dumpMatrix(matrix) {
+	var s = "[ " + matrix.a + ", " + matrix.c + ", " + matrix.e + "\n  " + matrix.b + ", " + matrix.d + ", " + matrix.f + "\n  0, 0, 1 ]";
+
+	return s;
+}
+
+/**
+ * Sets attributes of an element.
+ */
+function setAttributes(element, attributes){
+	for (i in attributes)
+		element.setAttributeNS(null, i, attributes[i]);
+}
+
+/**
+ * Handle mouse wheel event.
+ */
+function handleMouseWheel(evt) {
+	if(evt.preventDefault)
+		evt.preventDefault();
+
+	evt.returnValue = false;
+
+	var svgDoc = evt.target.ownerDocument;
+
+	var delta;
+
+	if(evt.wheelDelta)
+		delta = evt.wheelDelta / 3600; // Chrome/Safari
+	else
+		delta = evt.detail / -90; // Mozilla
+
+	var z = 1 + delta; // Zoom factor: 0.9/1.1
+
+	var g = svgDoc.getElementById("viewport");
+
+	var p = getEventPoint(evt);
+
+	p = p.matrixTransform(g.getCTM().inverse());
+
+	// Compute new scale matrix in current mouse position
+	var k = root.createSVGMatrix().translate(p.x, p.y).scale(z).translate(-p.x, -p.y);
+
+	setCTM(g, g.getCTM().multiply(k));
+
+	stateTf = stateTf.multiply(k.inverse());
+}
+
+/**
+ * Handle mouse move event.
+ */
+function handleMouseMove(evt) {
+	if(evt.preventDefault)
+		evt.preventDefault();
+
+	evt.returnValue = false;
+
+	var svgDoc = evt.target.ownerDocument;
+
+	var g = svgDoc.getElementById("viewport");
+
+	if(state == 'pan') {
+		// Pan mode
+		var p = getEventPoint(evt).matrixTransform(stateTf);
+
+		setCTM(g, stateTf.inverse().translate(p.x - stateOrigin.x, p.y - stateOrigin.y));
+	} else if(state == 'move') {
+		// Move mode
+		var p = getEventPoint(evt).matrixTransform(g.getCTM().inverse());
+
+		setCTM(stateTarget, root.createSVGMatrix().translate(p.x - stateOrigin.x, p.y - stateOrigin.y).multiply(g.getCTM().inverse()).multiply(stateTarget.getCTM()));
+
+		stateOrigin = p;
+	}
+}
+
+/**
+ * Handle mouse button press event.
+ */
+function handleMouseDown(evt) {
+	if(evt.preventDefault)
+		evt.preventDefault();
+
+	evt.returnValue = false;
+
+	var svgDoc = evt.target.ownerDocument;
+
+	var g = svgDoc.getElementById("viewport");
+
+	if(true || evt.target.tagName == "svg") {
+		// Pan mode
+		state = 'pan';
+
+		stateTf = g.getCTM().inverse();
+
+		stateOrigin = getEventPoint(evt).matrixTransform(stateTf);
+	} else {
+		// Move mode
+		state = 'move';
+
+		stateTarget = evt.target;
+
+		stateTf = g.getCTM().inverse();
+
+		stateOrigin = getEventPoint(evt).matrixTransform(stateTf);
+	}
+}
+
+/**
+ * Handle mouse button release event.
+ */
+function handleMouseUp(evt) {
+	if(evt.preventDefault)
+		evt.preventDefault();
+
+	evt.returnValue = false;
+
+	var svgDoc = evt.target.ownerDocument;
+
+	if(state == 'pan' || state == 'move') {
+		// Quit pan mode
+		state = '';
+	}
+}
+
+]]></script>
+EOF
+}
+
+# Provides a map from fullname to shortname for cases where the
+# shortname is ambiguous.  The symlist has both the fullname and
+# shortname for all symbols, which is usually fine, but sometimes --
+# such as overloaded functions -- two different fullnames can map to
+# the same shortname.  In that case, we use the address of the
+# function to disambiguate the two.  This function fills in a map that
+# maps fullnames to modified shortnames in such cases.  If a fullname
+# is not present in the map, the 'normal' shortname provided by the
+# symlist is the appropriate one to use.
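+#
+# For example (hypothetical symbols): if "Foo(int)" and "Foo(double)"
+# both have shortname "Foo" but fullnames ending in different addresses,
+# say "Foo(int)<00400100>" and "Foo(double)<00400200>", the map gets
+# entries like "Foo(int)<00400100>" => "Foo@400100".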
+sub FillFullnameToShortnameMap {
+  my $symbols = shift;
+  my $fullname_to_shortname_map = shift;
+  my $shortnames_seen_once = {};
+  my $shortnames_seen_more_than_once = {};
+
+  foreach my $symlist (values(%{$symbols})) {
+    # TODO(csilvers): deal with inlined symbols too.
+    my $shortname = $symlist->[0];
+    my $fullname = $symlist->[2];
+    if ($fullname !~ /<[0-9a-fA-F]+>$/) {  # fullname doesn't end in an address
+      next;       # the only collisions we care about are when addresses differ
+    }
+    if (defined($shortnames_seen_once->{$shortname}) &&
+        $shortnames_seen_once->{$shortname} ne $fullname) {
+      $shortnames_seen_more_than_once->{$shortname} = 1;
+    } else {
+      $shortnames_seen_once->{$shortname} = $fullname;
+    }
+  }
+
+  foreach my $symlist (values(%{$symbols})) {
+    my $shortname = $symlist->[0];
+    my $fullname = $symlist->[2];
+    # TODO(csilvers): take in a list of addresses we care about, and only
+    # store in the map if $symlist->[1] is in that list.  Saves space.
+    next if defined($fullname_to_shortname_map->{$fullname});
+    if (defined($shortnames_seen_more_than_once->{$shortname})) {
+      if ($fullname =~ /<0*([^>]*)>$/) {   # fullname has address at end of it
+        $fullname_to_shortname_map->{$fullname} = "$shortname\@$1";
+      }
+    }
+  }
+}
+
+# Return a small number that identifies the argument.
+# Multiple calls with the same argument will return the same number.
+# Calls with different arguments will return different numbers.
+sub ShortIdFor {
+  my $key = shift;
+  my $id = $main::uniqueid{$key};
+  if (!defined($id)) {
+    $id = keys(%main::uniqueid) + 1;
+    $main::uniqueid{$key} = $id;
+  }
+  return $id;
+}
+
+# Translate a stack of addresses into a stack of symbols
+sub TranslateStack {
+  my $symbols = shift;
+  my $fullname_to_shortname_map = shift;
+  my $k = shift;
+
+  my @addrs = split(/\n/, $k);
+  my @result = ();
+  for (my $i = 0; $i <= $#addrs; $i++) {
+    my $a = $addrs[$i];
+
+    # Skip large addresses since they sometimes show up as fake entries on RH9
+    if (length($a) > 8 && $a gt "7fffffffffffffff") {
+      next;
+    }
+
+    if ($main::opt_disasm || $main::opt_list) {
+      # We want just the address for the key
+      push(@result, $a);
+      next;
+    }
+
+    my $symlist = $symbols->{$a};
+    if (!defined($symlist)) {
+      $symlist = [$a, "", $a];
+    }
+
+    # We can have a sequence of symbols for a particular entry
+    # (more than one symbol in the case of inlining).  Callers
+    # come before callees in symlist, so walk backwards since
+    # the translated stack should contain callees before callers.
+    for (my $j = $#{$symlist}; $j >= 2; $j -= 3) {
+      my $func = $symlist->[$j-2];
+      my $fileline = $symlist->[$j-1];
+      my $fullfunc = $symlist->[$j];
+      if (defined($fullname_to_shortname_map->{$fullfunc})) {
+        $func = $fullname_to_shortname_map->{$fullfunc};
+      }
+      if ($j > 2) {
+        $func = "$func (inline)";
+      }
+
+      # Do not merge nodes corresponding to Callback::Run since that
+      # causes confusing cycles in dot display.  Instead, we synthesize
+      # a unique name for this frame per caller.
+      if ($func =~ m/Callback.*::Run$/) {
+        my $caller = ($i > 0) ? $addrs[$i-1] : 0;
+        $func = "Run#" . ShortIdFor($caller);
+      }
+
+      if ($main::opt_addresses) {
+        push(@result, "$a $func $fileline");
+      } elsif ($main::opt_lines) {
+        if ($func eq '??' && $fileline eq '??:0') {
+          push(@result, "$a");
+        } elsif (!$main::opt_show_addresses) {
+          push(@result, "$func $fileline");
+        } else {
+          push(@result, "$func $fileline ($a)");
+        }
+      } elsif ($main::opt_functions) {
+        if ($func eq '??') {
+          push(@result, "$a");
+        } elsif (!$main::opt_show_addresses) {
+          push(@result, $func);
+        } else {
+          push(@result, "$func ($a)");
+        }
+      } elsif ($main::opt_files) {
+        if ($fileline eq '??:0' || $fileline eq '') {
+          push(@result, "$a");
+        } else {
+          my $f = $fileline;
+          $f =~ s/:\d+$//;
+          push(@result, $f);
+        }
+      } else {
+        push(@result, $a);
+        last;  # Do not print inlined info
+      }
+    }
+  }
+
+  # print join(",", @addrs), " => ", join(",", @result), "\n";
+  return @result;
+}
+
+# Generate percent string for a number and a total
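+# e.g. Percent(5, 200) returns "2.5%", and Percent(1, 0) returns "+inf".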
+sub Percent {
+  my $num = shift;
+  my $tot = shift;
+  if ($tot != 0) {
+    return sprintf("%.1f%%", $num * 100.0 / $tot);
+  } else {
+    return ($num == 0) ? "nan" : (($num > 0) ? "+inf" : "-inf");
+  }
+}
+
+# Generate pretty-printed form of number
+sub Unparse {
+  my $num = shift;
+  if ($main::profile_type eq 'heap' || $main::profile_type eq 'growth') {
+    if ($main::opt_inuse_objects || $main::opt_alloc_objects) {
+      return sprintf("%d", $num);
+    } else {
+      if ($main::opt_show_bytes) {
+        return sprintf("%d", $num);
+      } else {
+        return sprintf("%.1f", $num / 1048576.0);
+      }
+    }
+  } elsif ($main::profile_type eq 'contention' && !$main::opt_contentions) {
+    return sprintf("%.3f", $num / 1e9); # Convert nanoseconds to seconds
+  } else {
+    return sprintf("%d", $num);
+  }
+}
+
+# Alternate pretty-printed form: 0 maps to "."
+sub UnparseAlt {
+  my $num = shift;
+  if ($num == 0) {
+    return ".";
+  } else {
+    return Unparse($num);
+  }
+}
+
+# Alternate pretty-printed form: 0 maps to ""
+sub HtmlPrintNumber {
+  my $num = shift;
+  if ($num == 0) {
+    return "";
+  } else {
+    return Unparse($num);
+  }
+}
+
+# Return output units
+sub Units {
+  if ($main::profile_type eq 'heap' || $main::profile_type eq 'growth') {
+    if ($main::opt_inuse_objects || $main::opt_alloc_objects) {
+      return "objects";
+    } else {
+      if ($main::opt_show_bytes) {
+        return "B";
+      } else {
+        return "MB";
+      }
+    }
+  } elsif ($main::profile_type eq 'contention' && !$main::opt_contentions) {
+    return "seconds";
+  } else {
+    return "samples";
+  }
+}
+
+##### Profile manipulation code #####
+
+# Generate flattened profile:
+# If count is charged to stack [a,b,c,d], in generated profile,
+# it will be charged to [a]
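+# e.g. a profile entry {"a\nb\nc" => 7} contributes 7 to key "a" only,
+# whereas CumulativeProfile() below charges 7 to each of "a", "b", "c".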
+sub FlatProfile {
+  my $profile = shift;
+  my $result = {};
+  foreach my $k (keys(%{$profile})) {
+    my $count = $profile->{$k};
+    my @addrs = split(/\n/, $k);
+    if ($#addrs >= 0) {
+      AddEntry($result, $addrs[0], $count);
+    }
+  }
+  return $result;
+}
+
+# Generate cumulative profile:
+# If count is charged to stack [a,b,c,d], in generated profile,
+# it will be charged to [a], [b], [c], [d]
+sub CumulativeProfile {
+  my $profile = shift;
+  my $result = {};
+  foreach my $k (keys(%{$profile})) {
+    my $count = $profile->{$k};
+    my @addrs = split(/\n/, $k);
+    foreach my $a (@addrs) {
+      AddEntry($result, $a, $count);
+    }
+  }
+  return $result;
+}
+
+# If the second-youngest PC on the stack is always the same, returns
+# that pc.  Otherwise, returns undef.
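+# (Used by RemoveUninterestingFrames below to detect the extra frame
+# pushed by the CPU profiler's signal handler and strip it from every
+# stack trace.)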
+sub IsSecondPcAlwaysTheSame {
+  my $profile = shift;
+
+  my $second_pc = undef;
+  foreach my $k (keys(%{$profile})) {
+    my @addrs = split(/\n/, $k);
+    if ($#addrs < 1) {
+      return undef;
+    }
+    if (not defined $second_pc) {
+      $second_pc = $addrs[1];
+    } else {
+      if ($second_pc ne $addrs[1]) {
+        return undef;
+      }
+    }
+  }
+  return $second_pc;
+}
+
+sub ExtractSymbolLocationInlineStack {
+  my $symbols = shift;
+  my $address = shift;
+  my $stack = shift;
+  # 'addr2line' outputs "??:0" for unknown locations; we do the
+  # same to be consistent.
+  if (exists $symbols->{$address}) {
+    my @localinlinestack = @{$symbols->{$address}};
+    for (my $i = $#localinlinestack; $i > 0; $i-=3) {
+      my $file = $localinlinestack[$i-1];
+      my $fn = $localinlinestack[$i-2];
+      if ($file eq "?" || $file eq ":0") {
+        $file = "??:0";
+      }
+      my $suffix = "[inline]";
+      if ($i == 2) {
+        $suffix = "";
+      }
+      push (@$stack, $file.":".$fn.$suffix);
+    }
+  }
+  else {
+      push (@$stack, "??:0:unknown");
+  }
+}
+
+sub ExtractSymbolNameInlineStack {
+  my $symbols = shift;
+  my $address = shift;
+
+  my @stack = ();
+
+  if (exists $symbols->{$address}) {
+    my @localinlinestack = @{$symbols->{$address}};
+    for (my $i = $#localinlinestack; $i > 0; $i-=3) {
+      my $file = $localinlinestack[$i-1];
+      my $fn = $localinlinestack[$i-0];
+
+      if ($file eq "?" || $file eq ":0") {
+        $file = "??:0";
+      }
+      if ($fn eq '??') {
+        # If we can't get the symbol name, at least use the file information.
+        $fn = $file;
+      }
+      my $suffix = "[inline]";
+      if ($i == 2) {
+        $suffix = "";
+      }
+      push (@stack, $fn.$suffix);
+    }
+  }
+  else {
+    # If we can't get a symbol name, at least fill in the address.
+    push (@stack, $address);
+  }
+
+  return @stack;
+}
+
+sub ExtractSymbolLocation {
+  my $symbols = shift;
+  my $address = shift;
+  # 'addr2line' outputs "??:0" for unknown locations; we do the
+  # same to be consistent.
+  my $location = "??:0:unknown";
+  if (exists $symbols->{$address}) {
+    my $file = $symbols->{$address}->[1];
+    if ($file eq "?" || $file eq ":0") {
+      $file = "??:0"
+    }
+    $location = $file . ":" . $symbols->{$address}->[0];
+  }
+  return $location;
+}
+
+# Extracts a graph of calls.
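+# Keys of the returned map are either a single leaf location of the
+# form "file:line:function" or a caller -> callee pair such as
+# "a.cc:10:Caller -> b.cc:20:Callee" (names illustrative); values are
+# sample counts.  Inlined frames carry an "[inline]" suffix on the
+# function name.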
+sub ExtractCalls {
+  my $symbols = shift;
+  my $profile = shift;
+  my $calls = {};
+  while( my ($stack_trace, $count) = each %$profile ) {
+    my @address = split(/\n/, $stack_trace);
+    my @stack = ();
+    ExtractSymbolLocationInlineStack($symbols, $address[0], \@stack);
+    for (my $i = 1; $i <= $#address; $i++) {
+      ExtractSymbolLocationInlineStack($symbols, $address[$i], \@stack);
+    }
+    AddEntry($calls, $stack[0], $count);
+    for (my $i = 1; $i < $#address; $i++) {
+      AddEntry($calls, "$stack[$i] -> $stack[$i-1]", $count);
+    }
+  }
+  return $calls;
+}
+
+sub PrintStacksForText {
+  my $symbols = shift;
+  my $profile = shift;
+
+  while (my ($stack_trace, $count) = each %$profile) {
+    my @address = split(/\n/, $stack_trace);
+    for (my $i = 0; $i <= $#address; $i++) {
+      $address[$i] = sprintf("(%s) %s", $address[$i], ExtractSymbolLocation($symbols, $address[$i]));
+    }
+    printf("%-8d %s\n\n", $count, join("\n         ", @address));
+  }
+}
+
+sub PrintCollapsedStacks {
+  my $symbols = shift;
+  my $profile = shift;
+
+  while (my ($stack_trace, $count) = each %$profile) {
+    my @address = split(/\n/, $stack_trace);
+    my @names = reverse ( map { ExtractSymbolNameInlineStack($symbols, $_) } @address );
+    printf("%s %d\n", join(";", @names), $count);
+  }
+}
+
+sub RemoveUninterestingFrames {
+  my $symbols = shift;
+  my $profile = shift;
+
+  # List of function names to skip
+  my %skip = ();
+  my $skip_regexp = 'NOMATCH';
+  if ($main::profile_type eq 'heap' || $main::profile_type eq 'growth') {
+    foreach my $name ('calloc',
+                      'cfree',
+                      'malloc',
+                      'free',
+                      'memalign',
+                      'posix_memalign',
+                      'pvalloc',
+                      'valloc',
+                      'realloc',
+                      'tc_calloc',
+                      'tc_cfree',
+                      'tc_malloc',
+                      'tc_free',
+                      'tc_memalign',
+                      'tc_posix_memalign',
+                      'tc_pvalloc',
+                      'tc_valloc',
+                      'tc_realloc',
+                      'tc_new',
+                      'tc_delete',
+                      'tc_newarray',
+                      'tc_deletearray',
+                      'tc_new_nothrow',
+                      'tc_newarray_nothrow',
+                      'do_malloc',
+                      '::do_malloc',   # new name -- got moved to an unnamed ns
+                      '::do_malloc_or_cpp_alloc',
+                      'DoSampledAllocation',
+                      'simple_alloc::allocate',
+                      '__malloc_alloc_template::allocate',
+                      '__builtin_delete',
+                      '__builtin_new',
+                      '__builtin_vec_delete',
+                      '__builtin_vec_new',
+                      'operator new',
+                      'operator new[]',
+                      # The entry to our memory-allocation routines on OS X
+                      'malloc_zone_malloc',
+                      'malloc_zone_calloc',
+                      'malloc_zone_valloc',
+                      'malloc_zone_realloc',
+                      'malloc_zone_memalign',
+                      'malloc_zone_free',
+                      # These mark the beginning/end of our custom sections
+                      '__start_google_malloc',
+                      '__stop_google_malloc',
+                      '__start_malloc_hook',
+                      '__stop_malloc_hook') {
+      $skip{$name} = 1;
+      $skip{"_" . $name} = 1;   # Mach (OS X) adds a _ prefix to everything
+    }
+    # TODO: Remove TCMalloc once everything has been
+    # moved into the tcmalloc:: namespace and we have flushed
+    # old code out of the system.
+    $skip_regexp = "TCMalloc|^tcmalloc::";
+  } elsif ($main::profile_type eq 'contention') {
+    foreach my $vname ('base::RecordLockProfileData',
+                       'base::SubmitMutexProfileData',
+                       'base::SubmitSpinLockProfileData',
+                       'Mutex::Unlock',
+                       'Mutex::UnlockSlow',
+                       'Mutex::ReaderUnlock',
+                       'MutexLock::~MutexLock',
+                       'SpinLock::Unlock',
+                       'SpinLock::SlowUnlock',
+                       'SpinLockHolder::~SpinLockHolder') {
+      $skip{$vname} = 1;
+    }
+  } elsif ($main::profile_type eq 'cpu' && !$main::opt_no_auto_signal_frames) {
+    # Drop signal handlers used for CPU profile collection
+    # TODO(dpeng): this should not be necessary; it's taken
+    # care of by the general 2nd-pc mechanism below.
+    foreach my $name ('ProfileData::Add',           # historical
+                      'ProfileData::prof_handler',  # historical
+                      'CpuProfiler::prof_handler',
+                      '__FRAME_END__',
+                      '__pthread_sighandler',
+                      '__restore') {
+      $skip{$name} = 1;
+    }
+  } else {
+    # Nothing skipped for unknown types
+  }
+
+  if ($main::profile_type eq 'cpu') {
+    # If all the second-youngest program counters are the same,
+    # this STRONGLY suggests that it is an artifact of measurement,
+    # i.e., stack frames pushed by the CPU profiler signal handler.
+    # Hence, we delete them.
+    # (The topmost PC is read from the signal structure, not from
+    # the stack, so it does not get involved.)
+    while (my $second_pc = IsSecondPcAlwaysTheSame($profile)) {
+      my $result = {};
+      my $func = '';
+      if (exists($symbols->{$second_pc})) {
+        $second_pc = $symbols->{$second_pc}->[0];
+      }
+      if ($main::opt_no_auto_signal_frames) {
+        print STDERR "All second stack frames are same: `$second_pc'.\nMight be stack trace capturing bug.\n";
+        last;
+      }
+      print STDERR "Removing $second_pc from all stack traces.\n";
+      foreach my $k (keys(%{$profile})) {
+        my $count = $profile->{$k};
+        my @addrs = split(/\n/, $k);
+        my $topaddr = POSIX::strtoul($addrs[0], 16);
+        splice @addrs, 1, 1;
+        if ($#addrs > 1) {
+          my $subtopaddr = POSIX::strtoul($addrs[1], 16);
+          if ($subtopaddr + 1 == $topaddr) {
+            splice @addrs, 1, 1;
+          }
+        }
+        my $reduced_path = join("\n", @addrs);
+        AddEntry($result, $reduced_path, $count);
+      }
+      $profile = $result;
+    }
+  }
+
+  my $result = {};
+  foreach my $k (keys(%{$profile})) {
+    my $count = $profile->{$k};
+    my @addrs = split(/\n/, $k);
+    my @path = ();
+    foreach my $a (@addrs) {
+      if (exists($symbols->{$a})) {
+        my $func = $symbols->{$a}->[0];
+        if ($skip{$func} || ($func =~ m/$skip_regexp/)) {
+          next;
+        }
+      }
+      push(@path, $a);
+    }
+    my $reduced_path = join("\n", @path);
+    AddEntry($result, $reduced_path, $count);
+  }
+  return $result;
+}
+
+# Reduce profile to granularity given by user
+sub ReduceProfile {
+  my $symbols = shift;
+  my $profile = shift;
+  my $result = {};
+  my $fullname_to_shortname_map = {};
+  FillFullnameToShortnameMap($symbols, $fullname_to_shortname_map);
+  foreach my $k (keys(%{$profile})) {
+    my $count = $profile->{$k};
+    my @translated = TranslateStack($symbols, $fullname_to_shortname_map, $k);
+    my @path = ();
+    my %seen = ();
+    $seen{''} = 1;      # So that empty keys are skipped
+    foreach my $e (@translated) {
+      # To avoid double-counting due to recursion, skip a stack-trace
+      # entry if it has already been seen
+      if (!$seen{$e}) {
+        $seen{$e} = 1;
+        push(@path, $e);
+      }
+    }
+    my $reduced_path = join("\n", @path);
+    AddEntry($result, $reduced_path, $count);
+  }
+  return $result;
+}
+
+# Does the specified symbol array match the regexp?
+sub SymbolMatches {
+  my $sym = shift;
+  my $re = shift;
+  if (defined($sym)) {
+    for (my $i = 0; $i < $#{$sym}; $i += 3) {
+      if ($sym->[$i] =~ m/$re/ || $sym->[$i+1] =~ m/$re/) {
+        return 1;
+      }
+    }
+  }
+  return 0;
+}
+
+# Focus only on paths involving specified regexps
+sub FocusProfile {
+  my $symbols = shift;
+  my $profile = shift;
+  my $focus = shift;
+  my $result = {};
+  foreach my $k (keys(%{$profile})) {
+    my $count = $profile->{$k};
+    my @addrs = split(/\n/, $k);
+    foreach my $a (@addrs) {
+      # Keep the whole stack if this frame matches by address, shortname, or fileline
+      if (($a =~ m/$focus/) || SymbolMatches($symbols->{$a}, $focus)) {
+        AddEntry($result, $k, $count);
+        last;
+      }
+    }
+  }
+  return $result;
+}
+
+# Focus only on paths not involving specified regexps
+sub IgnoreProfile {
+  my $symbols = shift;
+  my $profile = shift;
+  my $ignore = shift;
+  my $result = {};
+  foreach my $k (keys(%{$profile})) {
+    my $count = $profile->{$k};
+    my @addrs = split(/\n/, $k);
+    my $matched = 0;
+    foreach my $a (@addrs) {
+      # Flag the stack if this frame matches by address, shortname, or fileline
+      if (($a =~ m/$ignore/) || SymbolMatches($symbols->{$a}, $ignore)) {
+        $matched = 1;
+        last;
+      }
+    }
+    if (!$matched) {
+      AddEntry($result, $k, $count);
+    }
+  }
+  return $result;
+}
+
+# Get total count in profile
+sub TotalProfile {
+  my $profile = shift;
+  my $result = 0;
+  foreach my $k (keys(%{$profile})) {
+    $result += $profile->{$k};
+  }
+  return $result;
+}
+
+# Add A to B
+sub AddProfile {
+  my $A = shift;
+  my $B = shift;
+
+  my $R = {};
+  # add all keys in A
+  foreach my $k (keys(%{$A})) {
+    my $v = $A->{$k};
+    AddEntry($R, $k, $v);
+  }
+  # add all keys in B
+  foreach my $k (keys(%{$B})) {
+    my $v = $B->{$k};
+    AddEntry($R, $k, $v);
+  }
+  return $R;
+}
+
+# Merges symbol maps
+sub MergeSymbols {
+  my $A = shift;
+  my $B = shift;
+
+  my $R = {};
+  foreach my $k (keys(%{$A})) {
+    $R->{$k} = $A->{$k};
+  }
+  if (defined($B)) {
+    foreach my $k (keys(%{$B})) {
+      $R->{$k} = $B->{$k};
+    }
+  }
+  return $R;
+}
+
+
+# Add A to B
+sub AddPcs {
+  my $A = shift;
+  my $B = shift;
+
+  my $R = {};
+  # add all keys in A
+  foreach my $k (keys(%{$A})) {
+    $R->{$k} = 1
+  }
+  # add all keys in B
+  foreach my $k (keys(%{$B})) {
+    $R->{$k} = 1
+  }
+  return $R;
+}
+
+# Subtract B from A
+sub SubtractProfile {
+  my $A = shift;
+  my $B = shift;
+
+  my $R = {};
+  foreach my $k (keys(%{$A})) {
+    my $v = $A->{$k} - GetEntry($B, $k);
+    if ($v < 0 && $main::opt_drop_negative) {
+      $v = 0;
+    }
+    AddEntry($R, $k, $v);
+  }
+  if (!$main::opt_drop_negative) {
+    # Take care of when subtracted profile has more entries
+    foreach my $k (keys(%{$B})) {
+      if (!exists($A->{$k})) {
+        AddEntry($R, $k, 0 - $B->{$k});
+      }
+    }
+  }
+  return $R;
+}
+
+# Get entry from profile; zero if not present
+sub GetEntry {
+  my $profile = shift;
+  my $k = shift;
+  if (exists($profile->{$k})) {
+    return $profile->{$k};
+  } else {
+    return 0;
+  }
+}
+
+# Add entry to specified profile
+sub AddEntry {
+  my $profile = shift;
+  my $k = shift;
+  my $n = shift;
+  if (!exists($profile->{$k})) {
+    $profile->{$k} = 0;
+  }
+  $profile->{$k} += $n;
+}
+
+# Add a stack of entries to specified profile, and add them to the $pcs
+# list.
+sub AddEntries {
+  my $profile = shift;
+  my $pcs = shift;
+  my $stack = shift;
+  my $count = shift;
+  my @k = ();
+
+  foreach my $e (split(/\s+/, $stack)) {
+    my $pc = HexExtend($e);
+    $pcs->{$pc} = 1;
+    push @k, $pc;
+  }
+  AddEntry($profile, (join "\n", @k), $count);
+}
+
+##### Code to profile a server dynamically #####
+
+sub CheckSymbolPage {
+  my $url = SymbolPageURL();
+  my $command = ShellEscape(@URL_FETCHER, $url);
+  open(SYMBOL, "$command |") or error($command);
+  my $line = <SYMBOL>;
+  close(SYMBOL);
+  unless (defined($line)) {
+    error("$url doesn't exist\n");
+  }
+  $line =~ s/\r//g;         # turn windows-looking lines into unix-looking lines
+
+  if ($line =~ /^num_symbols:\s+(\d+)$/) {
+    if ($1 == 0) {
+      error("Stripped binary. No symbols available.\n");
+    }
+  } else {
+    error("Failed to get the number of symbols from $url\n");
+  }
+}
+
+sub IsProfileURL {
+  my $profile_name = shift;
+  if (-f $profile_name) {
+    printf STDERR "Using local file $profile_name.\n";
+    return 0;
+  }
+  return 1;
+}
+
+sub ParseProfileURL {
+  my $profile_name = shift;
+
+  if (!defined($profile_name) || $profile_name eq "") {
+    return ();
+  }
+
+  # Split profile URL - matches all non-empty strings, so no test.
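+  # For example (illustrative, assuming $PROFILES matches "/pprof/profile"),
+  # "http://myhost:8000/pprof/profile" splits into proto "http://",
+  # hostport "myhost:8000", prefix "" and profile "/pprof/profile".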
+  $profile_name =~ m,^(https?://)?([^/]+)(.*?)(/|$PROFILES)?$,;
+
+  my $proto = $1 || "http://";
+  my $hostport = $2;
+  my $prefix = $3;
+  my $profile = $4 || "/";
+
+  my $host = $hostport;
+  $host =~ s/:.*//;
+
+  my $baseurl = "$proto$hostport$prefix";
+  return ($host, $baseurl, $profile);
+}
+
+# We fetch symbols from the first profile argument.
+sub SymbolPageURL {
+  my ($host, $baseURL, $path) = ParseProfileURL($main::pfile_args[0]);
+  return "$baseURL$SYMBOL_PAGE";
+}
+
+sub FetchProgramName() {
+  my ($host, $baseURL, $path) = ParseProfileURL($main::pfile_args[0]);
+  my $url = "$baseURL$PROGRAM_NAME_PAGE";
+  my $command_line = ShellEscape(@URL_FETCHER, $url);
+  open(CMDLINE, "$command_line |") or error($command_line);
+  my $cmdline = <CMDLINE>;
+  $cmdline =~ s/\r//g;   # turn windows-looking lines into unix-looking lines
+  close(CMDLINE);
+  error("Failed to get program name from $url\n") unless defined($cmdline);
+  $cmdline =~ s/\x00.+//;  # Remove argv[1] and later arguments.
+  $cmdline =~ s!\n!!g;  # Remove LFs.
+  return $cmdline;
+}
+
+# Gee, curl's -L (--location) option isn't reliable at least
+# with its 7.12.3 version.  Curl will forget to post data if
+# there is a redirection.  This function is a workaround for
+# curl.  Redirection happens on borg hosts.
+sub ResolveRedirectionForCurl {
+  my $url = shift;
+  my $command_line = ShellEscape(@URL_FETCHER, "--head", $url);
+  open(CMDLINE, "$command_line |") or error($command_line);
+  while (<CMDLINE>) {
+    s/\r//g;         # turn windows-looking lines into unix-looking lines
+    if (/^Location: (.*)/) {
+      $url = $1;
+    }
+  }
+  close(CMDLINE);
+  return $url;
+}
+
+# Add a timeout flag to URL_FETCHER.  Returns a new list.
+sub AddFetchTimeout {
+  my $timeout = shift;
+  my @fetcher = @_;   # remaining arguments are the fetcher command and its flags
+  if (defined($timeout)) {
+    if (join(" ", @fetcher) =~ m/\bcurl -s/) {
+      push(@fetcher, "--max-time", sprintf("%d", $timeout));
+    } elsif (join(" ", @fetcher) =~ m/\brpcget\b/) {
+      push(@fetcher, sprintf("--deadline=%d", $timeout));
+    }
+  }
+  return @fetcher;
+}
+
+# Reads a symbol map from the file handle given as the first argument, returning
+# the resulting symbol map.  Also processes variables relating to symbols.
+# Currently, the only variable processed is 'binary=<value>' which updates
+# $main::prog to have the correct program name.
+sub ReadSymbols {
+  my $in = shift;
+  my $map = {};
+  while (<$in>) {
+    s/\r//g;         # turn windows-looking lines into unix-looking lines
+    # Removes all the leading zeroes from the symbols, see comment below.
+    if (m/^0x0*([0-9a-f]+)\s+(.+)/) {
+      $map->{$1} = $2;
+    } elsif (m/^---/) {
+      last;
+    } elsif (m/^([a-z][^=]*)=(.*)$/ ) {
+      my ($variable, $value) = ($1, $2);
+      for ($variable, $value) {
+        s/^\s+//;
+        s/\s+$//;
+      }
+      if ($variable eq "binary") {
+        if ($main::prog ne $UNKNOWN_BINARY && $main::prog ne $value) {
+          printf STDERR ("Warning: Mismatched binary name '%s', using '%s'.\n",
+                         $main::prog, $value);
+        }
+        $main::prog = $value;
+      } else {
+        printf STDERR ("Ignoring unknown variable in symbols list: " .
+            "'%s' = '%s'\n", $variable, $value);
+      }
+    }
+  }
+  return $map;
+}
+
+# Fetches and processes symbols to prepare them for use in the profile output
+# code.  If the optional 'symbol_map' arg is not given, fetches symbols from
+# $SYMBOL_PAGE for all PC values found in profile.  Otherwise, the raw symbols
+# are assumed to have already been fetched into 'symbol_map' and are simply
+# extracted and processed.
+sub FetchSymbols {
+  my $pcset = shift;
+  my $symbol_map = shift;
+
+  my %seen = ();
+  my @pcs = grep { !$seen{$_}++ } keys(%$pcset);  # uniq
+
+  if (!defined($symbol_map)) {
+    my $post_data = join("+", sort((map {"0x" . "$_"} @pcs)));
+
+    open(POSTFILE, ">$main::tmpfile_sym");
+    print POSTFILE $post_data;
+    close(POSTFILE);
+
+    my $url = SymbolPageURL();
+
+    my $command_line;
+    if (join(" ", @URL_FETCHER) =~ m/\bcurl -s/) {
+      $url = ResolveRedirectionForCurl($url);
+      $command_line = ShellEscape(@URL_FETCHER, "-d", "\@$main::tmpfile_sym",
+                                  $url);
+    } else {
+      $command_line = (ShellEscape(@URL_FETCHER, "--post", $url)
+                       . " < " . ShellEscape($main::tmpfile_sym));
+    }
+    # We use c++filt in case $SYMBOL_PAGE gives us mangled symbols.
+    my $escaped_cppfilt = ShellEscape($obj_tool_map{"c++filt"});
+    open(SYMBOL, "$command_line | $escaped_cppfilt |") or error($command_line);
+    $symbol_map = ReadSymbols(*SYMBOL{IO});
+    close(SYMBOL);
+  }
+
+  my $symbols = {};
+  foreach my $pc (@pcs) {
+    my $fullname;
+    # For 64-bit binaries, symbols are extracted with 8 leading zeroes.
+    # Then /symbol reads the long symbols in as uint64, and outputs
+    # the result with a "0x%08llx" format which gets rid of the zeroes.
+    # By removing all the leading zeroes in both $pc and the symbols from
+    # /symbol, the symbols match and are retrievable from the map.
+    my $shortpc = $pc;
+    $shortpc =~ s/^0*//;
+    # Each line may have a list of names, which includes the function
+    # and also other functions it has inlined.  They are separated (in
+    # PrintSymbolizedProfile), by --, which is illegal in function names.
+    my $fullnames;
+    if (defined($symbol_map->{$shortpc})) {
+      $fullnames = $symbol_map->{$shortpc};
+    } else {
+      $fullnames = "0x" . $pc;  # Just use addresses
+    }
+    my $sym = [];
+    $symbols->{$pc} = $sym;
+    foreach my $fullname (split("--", $fullnames)) {
+      my $name = ShortFunctionName($fullname);
+      push(@{$sym}, $name, "?", $fullname);
+    }
+  }
+  return $symbols;
+}
+
+sub BaseName {
+  my $file_name = shift;
+  $file_name =~ s!^.*/!!;  # Remove directory name
+  return $file_name;
+}
+
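+# Build the base name used for locally saved copies of fetched profiles:
+# the binary's basename, $main::op_time and the profile's host, joined
+# with dots (e.g. "myprog.<op_time>.myhost").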
+sub MakeProfileBaseName {
+  my ($binary_name, $profile_name) = @_;
+  my ($host, $baseURL, $path) = ParseProfileURL($profile_name);
+  my $binary_shortname = BaseName($binary_name);
+  return sprintf("%s.%s.%s",
+                 $binary_shortname, $main::op_time, $host);
+}
+
+sub FetchDynamicProfile {
+  my $binary_name = shift;
+  my $profile_name = shift;
+  my $fetch_name_only = shift;
+  my $encourage_patience = shift;
+
+  if (!IsProfileURL($profile_name)) {
+    return $profile_name;
+  } else {
+    my ($host, $baseURL, $path) = ParseProfileURL($profile_name);
+    if ($path eq "" || $path eq "/") {
+      # Missing type specifier defaults to cpu-profile
+      $path = $PROFILE_PAGE;
+    }
+
+    my $profile_file = MakeProfileBaseName($binary_name, $profile_name);
+
+    my $url = "$baseURL$path";
+    my $fetch_timeout = undef;
+    if ($path =~ m/$PROFILE_PAGE|$PMUPROFILE_PAGE/) {
+      if ($path =~ m/[?]/) {
+        $url .= "&";
+      } else {
+        $url .= "?";
+      }
+      $url .= sprintf("seconds=%d", $main::opt_seconds);
+      $fetch_timeout = $main::opt_seconds * 1.01 + 60;
+    } else {
+      # For non-CPU profiles, we add a type-extension to
+      # the target profile file name.
+      my $suffix = $path;
+      $suffix =~ s,/,.,g;
+      $profile_file .= $suffix;
+    }
+
+    my $profile_dir = $ENV{"PPROF_TMPDIR"} || ($ENV{HOME} . "/pprof");
+    if (! -d $profile_dir) {
+      mkdir($profile_dir)
+          || die("Unable to create profile directory $profile_dir: $!\n");
+    }
+    my $tmp_profile = "$profile_dir/.tmp.$profile_file";
+    my $real_profile = "$profile_dir/$profile_file";
+
+    if ($fetch_name_only > 0) {
+      return $real_profile;
+    }
+
+    my @fetcher = AddFetchTimeout($fetch_timeout, @URL_FETCHER);
+    my $cmd = ShellEscape(@fetcher, $url) . " > " . ShellEscape($tmp_profile);
+    if ($path =~ m/$PROFILE_PAGE|$PMUPROFILE_PAGE|$CENSUSPROFILE_PAGE/){
+      print STDERR "Gathering CPU profile from $url for $main::opt_seconds seconds to\n  ${real_profile}\n";
+      if ($encourage_patience) {
+        print STDERR "Be patient...\n";
+      }
+    } else {
+      print STDERR "Fetching $path profile from $url to\n  ${real_profile}\n";
+    }
+
+    (system($cmd) == 0) || error("Failed to get profile: $cmd: $!\n");
+    (system("mv", $tmp_profile, $real_profile) == 0) || error("Unable to rename profile\n");
+    print STDERR "Wrote profile to $real_profile\n";
+    $main::collected_profile = $real_profile;
+    return $main::collected_profile;
+  }
+}
+
+# Collect profiles in parallel
+sub FetchDynamicProfiles {
+  my $items = scalar(@main::pfile_args);
+  my $levels = log($items) / log(2);
+
+  if ($items == 1) {
+    $main::profile_files[0] = FetchDynamicProfile($main::prog, $main::pfile_args[0], 0, 1);
+  } else {
+    # compensate for floating-point rounding in the log()/log(2) division
+    if ((2 ** $levels) < $items) {
+     $levels++;
+    }
+    my $count = scalar(@main::pfile_args);
+    for (my $i = 0; $i < $count; $i++) {
+      $main::profile_files[$i] = FetchDynamicProfile($main::prog, $main::pfile_args[$i], 1, 0);
+    }
+    print STDERR "Fetching $count profiles, Be patient...\n";
+    FetchDynamicProfilesRecurse($levels, 0, 0);
+    $main::collected_profile = join(" \\\n    ", @main::profile_files);
+  }
+}
+
+# Recursively fork a process to get enough processes
+# collecting profiles
+sub FetchDynamicProfilesRecurse {
+  my $maxlevel = shift;
+  my $level = shift;
+  my $position = shift;
+
+  if (my $pid = fork()) {
+    $position = 0 | ($position << 1);
+    TryCollectProfile($maxlevel, $level, $position);
+    wait;
+  } else {
+    $position = 1 | ($position << 1);
+    TryCollectProfile($maxlevel, $level, $position);
+    cleanup();
+    exit(0);
+  }
+}
+
+# Collect a single profile
+sub TryCollectProfile {
+  my $maxlevel = shift;
+  my $level = shift;
+  my $position = shift;
+
+  if ($level >= ($maxlevel - 1)) {
+    if ($position < scalar(@main::pfile_args)) {
+      FetchDynamicProfile($main::prog, $main::pfile_args[$position], 0, 0);
+    }
+  } else {
+    FetchDynamicProfilesRecurse($maxlevel, $level+1, $position);
+  }
+}
+
+##### Parsing code #####
+
+# Provide a small streaming-read module to handle very large
+# cpu-profile files.  Stream in chunks along a sliding window.
+# Provides an interface to get one 'slot', correctly handling
+# endian-ness differences.  A slot is one 32-bit or 64-bit word
+# (depending on the input profile).  We tell endianness and bit-size
+# for the profile by looking at the first 8 bytes: in cpu profiles,
+# the second slot is always 3 (we'll accept anything that's not 0).
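+# For example, a little-endian 32-bit CPU profile starts with the bytes
+# 00 00 00 00 03 00 00 00 (header count 0, then 3), while a 64-bit profile
+# starts with eight zero bytes (its first slot alone spans them).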
+BEGIN {
+  package CpuProfileStream;
+
+  sub new {
+    my ($class, $file, $fname) = @_;
+    my $self = { file        => $file,
+                 base        => 0,
+                 stride      => 512 * 1024,   # must be a multiple of bitsize/8
+                 slots       => [],
+                 unpack_code => "",           # N for big-endian, V for little
+                 perl_is_64bit => 1,          # matters if profile is 64-bit
+    };
+    bless $self, $class;
+    # Let unittests adjust the stride
+    if ($main::opt_test_stride > 0) {
+      $self->{stride} = $main::opt_test_stride;
+    }
+    # Read the first two slots to figure out bitsize and endianness.
+    my $slots = $self->{slots};
+    my $str;
+    read($self->{file}, $str, 8);
+    # Set the global $address_length based on what we see here.
+    # 8 is 32-bit (8 hexadecimal chars); 16 is 64-bit (16 hexadecimal chars).
+    $address_length = ($str eq (chr(0)x8)) ? 16 : 8;
+    if ($address_length == 8) {
+      if (substr($str, 6, 2) eq chr(0)x2) {
+        $self->{unpack_code} = 'V';  # Little-endian.
+      } elsif (substr($str, 4, 2) eq chr(0)x2) {
+        $self->{unpack_code} = 'N';  # Big-endian
+      } else {
+        ::error("$fname: header size >= 2**16\n");
+      }
+      @$slots = unpack($self->{unpack_code} . "*", $str);
+    } else {
+      # If we're a 64-bit profile, check if we're a 64-bit-capable
+      # perl.  Otherwise, each slot will be represented as a float
+      # instead of an int64, losing precision and making all the
+      # 64-bit addresses wrong.  We won't complain yet, but will
+      # later if we ever see a value that doesn't fit in 32 bits.
+      my $has_q = 0;
+      eval { $has_q = pack("Q", "1") ? 1 : 1; };  # pack("Q") dies without 64-bit support
+      if (!$has_q) {
+        $self->{perl_is_64bit} = 0;
+      }
+      read($self->{file}, $str, 8);
+      if (substr($str, 4, 4) eq chr(0)x4) {
+        # We'd love to use 'Q', but it's a) not universal, b) not endian-proof.
+        $self->{unpack_code} = 'V';  # Little-endian.
+      } elsif (substr($str, 0, 4) eq chr(0)x4) {
+        $self->{unpack_code} = 'N';  # Big-endian
+      } else {
+        ::error("$fname: header size >= 2**32\n");
+      }
+      my @pair = unpack($self->{unpack_code} . "*", $str);
+      # Since we know one of the pair is 0, it's fine to just add them.
+      @$slots = (0, $pair[0] + $pair[1]);
+    }
+    return $self;
+  }
+
+  # Load more data when we access slots->get(X) which is not yet in memory.
+  sub overflow {
+    my ($self) = @_;
+    my $slots = $self->{slots};
+    $self->{base} += $#$slots + 1;   # skip over data we're replacing
+    my $str;
+    read($self->{file}, $str, $self->{stride});
+    if ($address_length == 8) {      # the 32-bit case
+      # This is the easy case: unpack provides 32-bit unpacking primitives.
+      @$slots = unpack($self->{unpack_code} . "*", $str);
+    } else {
+      # We need to unpack 32 bits at a time and combine.
+      my @b32_values = unpack($self->{unpack_code} . "*", $str);
+      my @b64_values = ();
+      for (my $i = 0; $i < $#b32_values; $i += 2) {
+        # TODO(csilvers): if this is a 32-bit perl, the math below
+        #    could end up in a too-large int, which perl will promote
+        #    to a double, losing necessary precision.  Deal with that.
+        #    Right now, we just die.
+        my ($lo, $hi) = ($b32_values[$i], $b32_values[$i+1]);
+        if ($self->{unpack_code} eq 'N') {    # big-endian
+          ($lo, $hi) = ($hi, $lo);
+        }
+        my $value = $lo + $hi * (2**32);
+        if (!$self->{perl_is_64bit} &&   # check value is exactly represented
+            (($value % (2**32)) != $lo || int($value / (2**32)) != $hi)) {
+          ::error("Need a 64-bit perl to process this 64-bit profile.\n");
+        }
+        push(@b64_values, $value);
+      }
+      @$slots = @b64_values;
+    }
+  }
+
+  # Access the i-th long in the file (logically), or -1 at EOF.
+  sub get {
+    my ($self, $idx) = @_;
+    my $slots = $self->{slots};
+    while ($#$slots >= 0) {
+      if ($idx < $self->{base}) {
+        # The only time we expect a reference to $slots[$i - something]
+        # after referencing $slots[$i] is reading the very first header.
+        # Since $stride > |header|, that shouldn't cause any lookback
+        # errors.  And everything after the header is sequential.
+        print STDERR "Unexpected look-back reading CPU profile";
+        return -1;   # shrug, don't know what better to return
+      } elsif ($idx > $self->{base} + $#$slots) {
+        $self->overflow();
+      } else {
+        return $slots->[$idx - $self->{base}];
+      }
+    }
+    # If we get here, $slots is [], which means we've reached EOF
+    return -1;  # unique since slots is supposed to hold unsigned numbers
+  }
+}
+
+# Reads the top, 'header' section of a profile, and returns the last
+# line of the header, commonly called a 'header line'.  The header
+# section of a profile consists of zero or more 'command' lines that
+# are instructions to pprof, which pprof executes when reading the
+# header.  All 'command' lines start with a %.  After the command
+# lines is the 'header line', which is a profile-specific line that
+# indicates what type of profile it is, and perhaps other global
+# information about the profile.  For instance, here's a header line
+# for a heap profile:
+#   heap profile:     53:    38236 [  5525:  1284029] @ heapprofile
+# For historical reasons, the CPU profile does not contain a text-
+# readable header line.  If the profile looks like a CPU profile,
+# this function returns "".  If no header line could be found, this
+# function returns undef.
+#
+# The following commands are recognized:
+#   %warn -- emit the rest of this line to stderr, prefixed by 'WARNING:'
+#
+# The input file should be in binmode.
+sub ReadProfileHeader {
+  local *PROFILE = shift;
+  my $firstchar = "";
+  my $line = "";
+  read(PROFILE, $firstchar, 1);
+  seek(PROFILE, -1, 1);                    # unread the firstchar
+  if ($firstchar !~ /[[:print:]]/) {       # is not a text character
+    return "";
+  }
+  while (defined($line = <PROFILE>)) {
+    $line =~ s/\r//g;   # turn windows-looking lines into unix-looking lines
+    if ($line =~ /^%warn\s+(.*)/) {        # 'warn' command
+      # Note this matches both '%warn blah\n' and '%warn\n'.
+      print STDERR "WARNING: $1\n";        # print the rest of the line
+    } elsif ($line =~ /^%/) {
+      print STDERR "Ignoring unknown command from profile header: $line";
+    } else {
+      # End of commands, must be the header line.
+      return $line;
+    }
+  }
+  return undef;     # got to EOF without seeing a header line
+}
+
+sub IsSymbolizedProfileFile {
+  my $file_name = shift;
+  if (!(-e $file_name) || !(-r $file_name)) {
+    return 0;
+  }
+  # Check if the file contains a symbol-section marker.
+  open(TFILE, "<$file_name");
+  binmode TFILE;
+  my $firstline = ReadProfileHeader(*TFILE);
+  close(TFILE);
+  if (!$firstline) {
+    return 0;
+  }
+  $SYMBOL_PAGE =~ m,[^/]+$,;    # matches everything after the last slash
+  my $symbol_marker = $&;
+  return $firstline =~ /^--- *$symbol_marker/;
+}
+
+# Parse profile generated by common/profiler.cc and return a reference
+# to a map:
+#      $result->{version}     Version number of profile file
+#      $result->{period}      Sampling period (in microseconds)
+#      $result->{profile}     Profile object
+#      $result->{map}         Memory map info from profile
+#      $result->{pcs}         Hash of all PC values seen, key is hex address
+sub ReadProfile {
+  my $prog = shift;
+  my $fname = shift;
+  my $result;            # return value
+
+  $CONTENTION_PAGE =~ m,[^/]+$,;    # matches everything after the last slash
+  my $contention_marker = $&;
+  $GROWTH_PAGE  =~ m,[^/]+$,;    # matches everything after the last slash
+  my $growth_marker = $&;
+  $SYMBOL_PAGE =~ m,[^/]+$,;    # matches everything after the last slash
+  my $symbol_marker = $&;
+  $PROFILE_PAGE =~ m,[^/]+$,;    # matches everything after the last slash
+  my $profile_marker = $&;
+
+  # Look at first line to see if it is a heap or a CPU profile.
+  # CPU profile may start with no header at all, and just binary data
+  # (starting with \0\0\0\0) -- in that case, don't try to read the
+  # whole firstline, since it may be gigabytes(!) of data.
+  open(PROFILE, "<$fname") || error("$fname: $!\n");
+  binmode PROFILE;      # New perls do UTF-8 processing
+  my $header = ReadProfileHeader(*PROFILE);
+  if (!defined($header)) {   # means "at EOF"
+    error("Profile is empty.\n");
+  }
+
+  my $symbols;
+  if ($header =~ m/^--- *$symbol_marker/o) {
+    # Verify that the user asked for a symbolized profile
+    if (!$main::use_symbolized_profile) {
+      # we have both a binary and a symbolized profile; abort
+      error("FATAL ERROR: Symbolized profile\n   $fname\ncannot be used with " .
+            "a binary arg. Try again without passing\n   $prog\n");
+    }
+    # Read the symbol section of the symbolized profile file.
+    $symbols = ReadSymbols(*PROFILE{IO});
+    # Read the next line to get the header for the remaining profile.
+    $header = ReadProfileHeader(*PROFILE) || "";
+  }
+
+  $main::profile_type = '';
+  if ($header =~ m/^heap profile:.*$growth_marker/o) {
+    $main::profile_type = 'growth';
+    $result =  ReadHeapProfile($prog, *PROFILE, $header);
+  } elsif ($header =~ m/^heap profile:/) {
+    $main::profile_type = 'heap';
+    $result =  ReadHeapProfile($prog, *PROFILE, $header);
+  } elsif ($header =~ m/^--- *$contention_marker/o) {
+    $main::profile_type = 'contention';
+    $result = ReadSynchProfile($prog, *PROFILE);
+  } elsif ($header =~ m/^--- *Stacks:/) {
+    print STDERR
+      "Old format contention profile: mistakenly reports " .
+      "condition variable signals as lock contentions.\n";
+    $main::profile_type = 'contention';
+    $result = ReadSynchProfile($prog, *PROFILE);
+  } elsif ($header =~ m/^--- *$profile_marker/) {
+    # the binary cpu profile data starts immediately after this line
+    $main::profile_type = 'cpu';
+    $result = ReadCPUProfile($prog, $fname, *PROFILE);
+  } else {
+    if (defined($symbols)) {
+      # a symbolized profile contains a format we don't recognize, bail out
+      error("$fname: Cannot recognize profile section after symbols.\n");
+    }
+    # no ascii header present -- must be a CPU profile
+    $main::profile_type = 'cpu';
+    $result = ReadCPUProfile($prog, $fname, *PROFILE);
+  }
+
+  close(PROFILE);
+
+  # if we got symbols along with the profile, return those as well
+  if (defined($symbols)) {
+    $result->{symbols} = $symbols;
+  }
+
+  return $result;
+}
+
+# Subtract one from caller pc so we map back to call instr.
+# However, don't do this if we're reading a symbolized profile
+# file, in which case the subtract-one was done when the file
+# was written.
+#
+# We apply the same logic to all readers, though ReadCPUProfile uses an
+# independent implementation.
+sub FixCallerAddresses {
+  my $stack = shift;
+  if ($main::use_symbolized_profile) {
+    return $stack;
+  } else {
+    $stack =~ /(\s)/;
+    my $delimiter = $1;
+    my @addrs = split(' ', $stack);
+    my @fixedaddrs;
+    $#fixedaddrs = $#addrs;
+    if ($#addrs >= 0) {
+      $fixedaddrs[0] = $addrs[0];
+    }
+    for (my $i = 1; $i <= $#addrs; $i++) {
+      $fixedaddrs[$i] = AddressSub($addrs[$i], "0x1");
+    }
+    return join $delimiter, @fixedaddrs;
+  }
+}
+
+# CPU profile reader
+sub ReadCPUProfile {
+  my $prog = shift;
+  my $fname = shift;       # just used for logging
+  local *PROFILE = shift;
+  my $version;
+  my $period;
+  my $i;
+  my $profile = {};
+  my $pcs = {};
+
+  # Parse string into array of slots.
+  my $slots = CpuProfileStream->new(*PROFILE, $fname);
+
+  # Read header.  The current header version is a 5-element structure
+  # containing:
+  #   0: header count (always 0)
+  #   1: header "words" (after this one: 3)
+  #   2: format version (0)
+  #   3: sampling period (usec)
+  #   4: unused padding (always 0)
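+  # A typical header is therefore the five slots (0, 3, 0, <period>, 0),
+  # so the first sample record starts at slot index 2 + 3 = 5.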
+  if ($slots->get(0) != 0 ) {
+    error("$fname: not a profile file, or old format profile file\n");
+  }
+  $i = 2 + $slots->get(1);
+  $version = $slots->get(2);
+  $period = $slots->get(3);
+  # Do some sanity checking on these header values.
+  if ($version > (2**32) || $period > (2**32) || $i > (2**32) || $i < 5) {
+    error("$fname: not a profile file, or corrupted profile file\n");
+  }
+
+  # Parse profile
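+  # Each sample record is: a count, a stack depth d, then d PC slots; a
+  # record whose first PC slot is 0 marks the end of the samples and is
+  # followed by the memory-map text.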
+  while ($slots->get($i) != -1) {
+    my $n = $slots->get($i++);
+    my $d = $slots->get($i++);
+    if ($d > (2**16)) {  # TODO(csilvers): what's a reasonable max-stack-depth?
+      my $addr = sprintf("0%o", $i * ($address_length == 8 ? 4 : 8));
+      print STDERR "At index $i (address $addr):\n";
+      error("$fname: stack trace depth >= 2**32\n");
+    }
+    if ($slots->get($i) == 0) {
+      # End of profile data marker
+      $i += $d;
+      last;
+    }
+
+    # Make key out of the stack entries
+    my @k = ();
+    for (my $j = 0; $j < $d; $j++) {
+      my $pc = $slots->get($i+$j);
+      # Subtract one from caller pc so we map back to call instr.
+      # However, don't do this if we're reading a symbolized profile
+      # file, in which case the subtract-one was done when the file
+      # was written.
+      if ($j > 0 && !$main::use_symbolized_profile) {
+        $pc--;
+      }
+      $pc = sprintf("%0*x", $address_length, $pc);
+      $pcs->{$pc} = 1;
+      push @k, $pc;
+    }
+
+    AddEntry($profile, (join "\n", @k), $n);
+    $i += $d;
+  }
+
+  # Parse map
+  my $map = '';
+  seek(PROFILE, $i * ($address_length / 2), 0);
+  read(PROFILE, $map, (stat PROFILE)[7]);
+
+  my $r = {};
+  $r->{version} = $version;
+  $r->{period} = $period;
+  $r->{profile} = $profile;
+  $r->{libs} = ParseLibraries($prog, $map, $pcs);
+  $r->{pcs} = $pcs;
+
+  return $r;
+}
+
+sub ReadHeapProfile {
+  my $prog = shift;
+  local *PROFILE = shift;
+  my $header = shift;
+
+  my $index = 1;
+  if ($main::opt_inuse_space) {
+    $index = 1;
+  } elsif ($main::opt_inuse_objects) {
+    $index = 0;
+  } elsif ($main::opt_alloc_space) {
+    $index = 3;
+  } elsif ($main::opt_alloc_objects) {
+    $index = 2;
+  }
+
+  # Find the type of this profile.  The header line looks like:
+  #    heap profile:   1246:  8800744 [  1246:  8800744] @ <heap-url>/266053
+  # There are two pairs <count: size>, the first inuse objects/space, and the
+  # second allocated objects/space.  This is followed optionally by a profile
+  # type, and if that is present, optionally by a sampling frequency.
+  # For remote heap profiles (v1):
+  # The interpretation of the sampling frequency is that the profiler, for
+  # each sample, calculates a uniformly distributed random integer less than
+  # the given value, and records the next sample after that many bytes have
+  # been allocated.  Therefore, the expected sample interval is half of the
+  # given frequency.  By default, if not specified, the expected sample
+  # interval is 128KB.  Only remote-heap-page profiles are adjusted for
+  # sample size.
+  # For remote heap profiles (v2):
+  # The sampling frequency is the rate of a Poisson process. This means that
+  # the probability of sampling an allocation of size X with sampling rate Y
+  # is 1 - exp(-X/Y)
+  # For version 2, a typical header line might look like this:
+  # heap profile:   1922: 127792360 [  1922: 127792360] @ <heap-url>_v2/524288
+  # the trailing number (524288) is the sampling rate. (Version 1 showed
+  # double the 'rate' here)
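+  # For instance (illustrative numbers): with a v2 sampling rate of 524288
+  # bytes and an average sampled allocation of 65536 bytes, each sample is
+  # scaled up below by 1/(1 - exp(-65536/524288)), roughly 8.5x.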
+  my $sampling_algorithm = 0;
+  my $sample_adjustment = 0;
+  chomp($header);
+  my $type = "unknown";
+  if ($header =~ m"^heap profile:\s*(\d+):\s+(\d+)\s+\[\s*(\d+):\s+(\d+)\](\s*@\s*([^/]*)(/(\d+))?)?") {
+    if (defined($6) && ($6 ne '')) {
+      $type = $6;
+      my $sample_period = $8;
+      # $type is "heapprofile" for profiles generated by the
+      # heap-profiler, and either "heap" or "heap_v2" for profiles
+      # generated by sampling directly within tcmalloc.  It can also
+      # be "growth" for heap-growth profiles.  The first is typically
+      # found for profiles generated locally, and the others for
+      # remote profiles.
+      if (($type eq "heapprofile") || ($type !~ /heap/) ) {
+        # No need to adjust for the sampling rate with heap-profiler-derived data
+        $sampling_algorithm = 0;
+      } elsif ($type =~ /_v2/) {
+        $sampling_algorithm = 2;     # version 2 sampling
+        if (defined($sample_period) && ($sample_period ne '')) {
+          $sample_adjustment = int($sample_period);
+        }
+      } else {
+        $sampling_algorithm = 1;     # version 1 sampling
+        if (defined($sample_period) && ($sample_period ne '')) {
+          $sample_adjustment = int($sample_period)/2;
+        }
+      }
+    } else {
+      # We detect whether or not this is a remote-heap profile by checking
+      # that the total-allocated stats ($n2,$s2) are exactly the
+      # same as the in-use stats ($n1,$s1).  It is remotely conceivable
+      # that a non-remote-heap profile may pass this check, but it is hard
+      # to imagine how that could happen.
+      # In this case it's so old it's guaranteed to be remote-heap version 1.
+      my ($n1, $s1, $n2, $s2) = ($1, $2, $3, $4);
+      if (($n1 == $n2) && ($s1 == $s2)) {
+        # This is likely to be a remote-heap based sample profile
+        $sampling_algorithm = 1;
+      }
+    }
+  }
+
+  if ($sampling_algorithm > 0) {
+    # For remote-heap generated profiles, adjust the counts and sizes to
+    # account for the sample rate (we sample once every 128KB by default).
+    if ($sample_adjustment == 0) {
+      # Turn on profile adjustment.
+      $sample_adjustment = 128*1024;
+      print STDERR "Adjusting heap profiles for 1-in-128KB sampling rate\n";
+    } else {
+      printf STDERR ("Adjusting heap profiles for 1-in-%d sampling rate\n",
+                     $sample_adjustment);
+    }
+    if ($sampling_algorithm > 1) {
+      # We don't bother printing anything for the original version (version 1)
+      printf STDERR "Heap version $sampling_algorithm\n";
+    }
+  }
+
+  my $profile = {};
+  my $pcs = {};
+  my $map = "";
+
+  while (<PROFILE>) {
+    s/\r//g;         # turn windows-looking lines into unix-looking lines
+    if (/^MAPPED_LIBRARIES:/) {
+      # Read the /proc/self/maps data
+      while (<PROFILE>) {
+        s/\r//g;         # turn windows-looking lines into unix-looking lines
+        $map .= $_;
+      }
+      last;
+    }
+
+    if (/^--- Memory map:/) {
+      # Read /proc/self/maps data as formatted by DumpAddressMap()
+      my $buildvar = "";
+      while (<PROFILE>) {
+        s/\r//g;         # turn windows-looking lines into unix-looking lines
+        # Parse "build=<dir>" specification if supplied
+        if (m/^\s*build=(.*)\n/) {
+          $buildvar = $1;
+        }
+
+        # Expand "$build" variable if available
+        $_ =~ s/\$build\b/$buildvar/g;
+
+        $map .= $_;
+      }
+      last;
+    }
+
+    # Read entry of the form:
+    #  <count1>: <bytes1> [<count2>: <bytes2>] @ a1 a2 a3 ... an
+    s/^\s*//;
+    s/\s*$//;
+    if (m/^\s*(\d+):\s+(\d+)\s+\[\s*(\d+):\s+(\d+)\]\s+@\s+(.*)$/) {
+      my $stack = $5;
+      my ($n1, $s1, $n2, $s2) = ($1, $2, $3, $4);
+
+      if ($sample_adjustment) {
+        if ($sampling_algorithm == 2) {
+          # Remote-heap version 2
+          # The sampling frequency is the rate of a Poisson process.
+          # This means that the probability of sampling an allocation of
+          # size X with sampling rate Y is 1 - exp(-X/Y)
+          if ($n1 != 0) {
+            my $ratio = (($s1*1.0)/$n1)/($sample_adjustment);
+            my $scale_factor = 1/(1 - exp(-$ratio));
+            $n1 *= $scale_factor;
+            $s1 *= $scale_factor;
+          }
+          if ($n2 != 0) {
+            my $ratio = (($s2*1.0)/$n2)/($sample_adjustment);
+            my $scale_factor = 1/(1 - exp(-$ratio));
+            $n2 *= $scale_factor;
+            $s2 *= $scale_factor;
+          }
+        } else {
+          # Remote-heap version 1
+          my $ratio;
+          $ratio = (($s1*1.0)/$n1)/($sample_adjustment);
+          if ($ratio < 1) {
+            $n1 /= $ratio;
+            $s1 /= $ratio;
+          }
+          $ratio = (($s2*1.0)/$n2)/($sample_adjustment);
+          if ($ratio < 1) {
+            $n2 /= $ratio;
+            $s2 /= $ratio;
+          }
+        }
+      }
+
+      my @counts = ($n1, $s1, $n2, $s2);
+      $stack = FixCallerAddresses($stack);
+      push @stackTraces, "$n1 $s1 $n2 $s2 $stack";
+      AddEntries($profile, $pcs, $stack, $counts[$index]);
+    }
+  }
+
+  my $r = {};
+  $r->{version} = "heap";
+  $r->{period} = 1;
+  $r->{profile} = $profile;
+  $r->{libs} = ParseLibraries($prog, $map, $pcs);
+  $r->{pcs} = $pcs;
+  return $r;
+}
+
+sub ReadSynchProfile {
+  my $prog = shift;
+  local *PROFILE = shift;
+  my $header = shift;
+
+  my $map = '';
+  my $profile = {};
+  my $pcs = {};
+  my $sampling_period = 1;
+  my $cyclespernanosec = 2.8;   # Default assumption (2.8 GHz) for old binaries
+  my $seen_clockrate = 0;
+  my $line;
+
+  my $index = 0;
+  if ($main::opt_total_delay) {
+    $index = 0;
+  } elsif ($main::opt_contentions) {
+    $index = 1;
+  } elsif ($main::opt_mean_delay) {
+    $index = 2;
+  }
+
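+  # Sample lines look like "<cycles> <count> @ <pc1> <pc2> ...", or just
+  # "<cycles> @ <stack>" in older outputs; "name=value" lines set profile
+  # variables, and any other line is accumulated as memory-map text.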
+  while ( $line = <PROFILE> ) {
+    $line =~ s/\r//g;      # turn windows-looking lines into unix-looking lines
+    if ( $line =~ /^\s*(\d+)\s+(\d+) \@\s*(.*?)\s*$/ ) {
+      my ($cycles, $count, $stack) = ($1, $2, $3);
+
+      # Convert cycles to nanoseconds
+      $cycles /= $cyclespernanosec;
+
+      # Adjust for sampling done by application
+      $cycles *= $sampling_period;
+      $count *= $sampling_period;
+
+      my @values = ($cycles, $count, $cycles / $count);
+      AddEntries($profile, $pcs, FixCallerAddresses($stack), $values[$index]);
+
+    } elsif ( $line =~ /^(slow release).*thread \d+  \@\s*(.*?)\s*$/ ||
+              $line =~ /^\s*(\d+) \@\s*(.*?)\s*$/ ) {
+      my ($cycles, $stack) = ($1, $2);
+      if ($cycles !~ /^\d+$/) {
+        next;
+      }
+
+      # Convert cycles to nanoseconds
+      $cycles /= $cyclespernanosec;
+
+      # Adjust for sampling done by application
+      $cycles *= $sampling_period;
+
+      AddEntries($profile, $pcs, FixCallerAddresses($stack), $cycles);
+
+    } elsif ( $line =~ m/^([a-z][^=]*)=(.*)$/ ) {
+      my ($variable, $value) = ($1,$2);
+      for ($variable, $value) {
+        s/^\s+//;
+        s/\s+$//;
+      }
+      if ($variable eq "cycles/second") {
+        $cyclespernanosec = $value / 1e9;
+        $seen_clockrate = 1;
+      } elsif ($variable eq "sampling period") {
+        $sampling_period = $value;
+      } elsif ($variable eq "ms since reset") {
+        # Currently nothing is done with this value in pprof
+        # So we just silently ignore it for now
+      } elsif ($variable eq "discarded samples") {
+        # Currently nothing is done with this value in pprof
+        # So we just silently ignore it for now
+      } else {
+        printf STDERR ("Ignoring unnknown variable in /contention output: " .
+                       "'%s' = '%s'\n",$variable,$value);
+      }
+    } else {
+      # Memory map entry
+      $map .= $line;
+    }
+  }
+
+  if (!$seen_clockrate) {
+    printf STDERR ("No cycles/second entry in profile; Guessing %.1f GHz\n",
+                   $cyclespernanosec);
+  }
+
+  my $r = {};
+  $r->{version} = 0;
+  $r->{period} = $sampling_period;
+  $r->{profile} = $profile;
+  $r->{libs} = ParseLibraries($prog, $map, $pcs);
+  $r->{pcs} = $pcs;
+  return $r;
+}
+
+# Given a hex value in the form "0x1abcd" or "1abcd", return either
+# "0001abcd" or "000000000001abcd", depending on the current (global)
+# address length.
+sub HexExtend {
+  my $addr = shift;
+
+  $addr =~ s/^(0x)?0*//;
+  my $zeros_needed = $address_length - length($addr);
+  if ($zeros_needed < 0) {
+    printf STDERR "Warning: address $addr is longer than address length $address_length\n";
+    return $addr;
+  }
+  return ("0" x $zeros_needed) . $addr;
+}
+
+##### Symbol extraction #####
+
+# Aggressively search the lib_prefix values for the given library
+# If all else fails, just return the name of the library unmodified.
+# If the lib_prefix is "/my/path,/other/path" and $file is "/lib/dir/mylib.so"
+# it will search the following locations in this order, until it finds a file:
+#   /my/path/lib/dir/mylib.so
+#   /other/path/lib/dir/mylib.so
+#   /my/path/dir/mylib.so
+#   /other/path/dir/mylib.so
+#   /my/path/mylib.so
+#   /other/path/mylib.so
+#   /lib/dir/mylib.so              (returned as last resort)
+sub FindLibrary {
+  my $file = shift;
+  my $suffix = $file;
+
+  # Search for the library as described above
+  do {
+    foreach my $prefix (@prefix_list) {
+      my $fullpath = $prefix . $suffix;
+      if (-e $fullpath) {
+        return $fullpath;
+      }
+    }
+  } while ($suffix =~ s|^/[^/]+/|/|);
+  return $file;
+}
+
+# Return path to library with debugging symbols.
+# For libc libraries, the copy in /usr/lib/debug contains debugging symbols
+sub DebuggingLibrary {
+  my $file = shift;
+  if ($file =~ m|^/| && -f "/usr/lib/debug$file") {
+    return "/usr/lib/debug$file";
+  }
+  if ($file =~ m|^/| && -f "/usr/lib/debug$file.debug") {
+    return "/usr/lib/debug$file.debug";
+  }
+  return undef;
+}
+
+# Parse text section header of a library using objdump
+sub ParseTextSectionHeaderFromObjdump {
+  my $lib = shift;
+
+  my $size = undef;
+  my $vma;
+  my $file_offset;
+  # Get objdump output from the library file to figure out how to
+  # map between mapped addresses and addresses in the library.
+  my $cmd = ShellEscape($obj_tool_map{"objdump"}, "-h", $lib);
+  open(OBJDUMP, "$cmd |") || error("$cmd: $!\n");
+  while (<OBJDUMP>) {
+    s/\r//g;         # turn windows-looking lines into unix-looking lines
+    # Idx Name          Size      VMA       LMA       File off  Algn
+    #  10 .text         00104b2c  420156f0  420156f0  000156f0  2**4
+    # For 64-bit objects, VMA and LMA will be 16 hex digits, size and file
+    # offset may still be 8.  But AddressSub below will still handle that.
+    my @x = split;
+    if (($#x >= 6) && ($x[1] eq '.text')) {
+      $size = $x[2];
+      $vma = $x[3];
+      $file_offset = $x[5];
+      last;
+    }
+  }
+  close(OBJDUMP);
+
+  if (!defined($size)) {
+    return undef;
+  }
+
+  my $r = {};
+  $r->{size} = $size;
+  $r->{vma} = $vma;
+  $r->{file_offset} = $file_offset;
+
+  return $r;
+}
+
+# Parse text section header of a library using otool (on OS X)
+sub ParseTextSectionHeaderFromOtool {
+  my $lib = shift;
+
+  my $size = undef;
+  my $vma = undef;
+  my $file_offset = undef;
+  # Get otool output from the library file to figure out how to
+  # map between mapped addresses and addresses in the library.
+  my $command = ShellEscape($obj_tool_map{"otool"}, "-l", $lib);
+  open(OTOOL, "$command |") || error("$command: $!\n");
+  my $cmd = "";
+  my $sectname = "";
+  my $segname = "";
+  foreach my $line (<OTOOL>) {
+    $line =~ s/\r//g;      # turn windows-looking lines into unix-looking lines
+    # Load command <#>
+    #       cmd LC_SEGMENT
+    # [...]
+    # Section
+    #   sectname __text
+    #    segname __TEXT
+    #       addr 0x000009f8
+    #       size 0x00018b9e
+    #     offset 2552
+    #      align 2^2 (4)
+    # We will need to strip off the leading 0x from the hex addresses,
+    # and convert the offset into hex.
+    if ($line =~ /Load command/) {
+      $cmd = "";
+      $sectname = "";
+      $segname = "";
+    } elsif ($line =~ /Section/) {
+      $sectname = "";
+      $segname = "";
+    } elsif ($line =~ /cmd (\w+)/) {
+      $cmd = $1;
+    } elsif ($line =~ /sectname (\w+)/) {
+      $sectname = $1;
+    } elsif ($line =~ /segname (\w+)/) {
+      $segname = $1;
+    } elsif (!(($cmd eq "LC_SEGMENT" || $cmd eq "LC_SEGMENT_64") &&
+               $sectname eq "__text" &&
+               $segname eq "__TEXT")) {
+      next;
+    } elsif ($line =~ /\baddr 0x([0-9a-fA-F]+)/) {
+      $vma = $1;
+    } elsif ($line =~ /\bsize 0x([0-9a-fA-F]+)/) {
+      $size = $1;
+    } elsif ($line =~ /\boffset ([0-9]+)/) {
+      $file_offset = sprintf("%016x", $1);
+    }
+    if (defined($vma) && defined($size) && defined($file_offset)) {
+      last;
+    }
+  }
+  close(OTOOL);
+
+  if (!defined($vma) || !defined($size) || !defined($file_offset)) {
+     return undef;
+  }
+
+  my $r = {};
+  $r->{size} = $size;
+  $r->{vma} = $vma;
+  $r->{file_offset} = $file_offset;
+
+  return $r;
+}
+
+sub ParseTextSectionHeader {
+  # obj_tool_map("otool") is only defined if we're in a Mach-O environment
+  if (defined($obj_tool_map{"otool"})) {
+    my $r = ParseTextSectionHeaderFromOtool(@_);
+    if (defined($r)){
+      return $r;
+    }
+  }
+  # If otool doesn't work, or we don't have it, fall back to objdump
+  return ParseTextSectionHeaderFromObjdump(@_);
+}
+
+# Split /proc/pid/maps dump into a list of libraries
+sub ParseLibraries {
+  return if $main::use_symbol_page;  # We don't need libraries info.
+  my $prog = Cwd::abs_path(shift);
+  my $map = shift;
+  my $pcs = shift;
+
+  my $result = [];
+  my $h = "[a-f0-9]+";
+  my $zero_offset = HexExtend("0");
+
+  my $buildvar = "";
+  my $priorlib = "";
+  foreach my $l (split("\n", $map)) {
+    if ($l =~ m/^\s*build=(.*)$/) {
+      $buildvar = $1;
+    }
+
+    my $start;
+    my $finish;
+    my $offset;
+    my $lib;
+    if ($l =~ /^($h)-($h)\s+..x.\s+($h)\s+\S+:\S+\s+\d+\s+(.+\.(so|dll|dylib|bundle)((\.\d+)+\w*(\.\d+){0,3})?)$/i) {
+      # Full line from /proc/self/maps.  Example:
+      #   40000000-40015000 r-xp 00000000 03:01 12845071   /lib/ld-2.3.2.so
+      $start = HexExtend($1);
+      $finish = HexExtend($2);
+      $offset = HexExtend($3);
+      $lib = $4;
+      $lib =~ s|\\|/|g;     # turn windows-style paths into unix-style paths
+    } elsif ($l =~ /^\s*($h)-($h):\s*(\S+\.so(\.\d+)*)/) {
+      # Cooked line from DumpAddressMap.  Example:
+      #   40000000-40015000: /lib/ld-2.3.2.so
+      $start = HexExtend($1);
+      $finish = HexExtend($2);
+      $offset = $zero_offset;
+      $lib = $3;
+    } elsif (($l =~ /^($h)-($h)\s+..x.\s+($h)\s+\S+:\S+\s+\d+\s+(\S+)$/i) && ($4 eq $prog)) {
+      # PIEs and address space randomization do not play well with our
+      # default assumption that the main executable is at the lowest
+      # addresses.  So we also detect the main executable in
+      # /proc/self/maps.
+      $start = HexExtend($1);
+      $finish = HexExtend($2);
+      $offset = HexExtend($3);
+      $lib = $4;
+      $lib =~ s|\\|/|g;     # turn windows-style paths into unix-style paths
+    } else {
+      next;
+    }
+
+    # Expand "$build" variable if available
+    $lib =~ s/\$build\b/$buildvar/g;
+
+    $lib = FindLibrary($lib);
+
+    # Check for pre-relocated libraries, which use pre-relocated symbol tables
+    # and thus require adjusting the offset that we'll use to translate
+    # VM addresses into symbol table addresses.
+    # Only do this if we're not going to fetch the symbol table from a
+    # debugging copy of the library.
+    if (!DebuggingLibrary($lib)) {
+      my $text = ParseTextSectionHeader($lib);
+      if (defined($text)) {
+         my $vma_offset = AddressSub($text->{vma}, $text->{file_offset});
+         $offset = AddressAdd($offset, $vma_offset);
+      }
+    }
+
+    # If we find multiple executable segments for a single library, merge them
+    # into a single entry that spans the complete address range.
+    if ($lib eq $priorlib) {
+      my $prior = pop(@{$result});
+      $start = @$prior[1];
+      # TODO $offset may be wrong if .text is not in the final segment.
+    }
+
+    push(@{$result}, [$lib, $start, $finish, $offset]);
+    $priorlib = $lib;
+  }
+
+  # Append special entry for additional library (not relocated)
+  if ($main::opt_lib ne "") {
+    my $text = ParseTextSectionHeader($main::opt_lib);
+    if (defined($text)) {
+       my $start = $text->{vma};
+       my $finish = AddressAdd($start, $text->{size});
+
+       push(@{$result}, [$main::opt_lib, $start, $finish, $start]);
+    }
+  }
+
+  # Append special entry for the main program.  This covers
+  # 0..max_pc_value_seen, so that pc values not found in any library
+  # range are treated as coming from the main program binary.
+  my $min_pc = HexExtend("0");
+  my $max_pc = $min_pc;          # find the maximal PC value in any sample
+  foreach my $pc (keys(%{$pcs})) {
+    if (HexExtend($pc) gt $max_pc) { $max_pc = HexExtend($pc); }
+  }
+  push(@{$result}, [$prog, $min_pc, $max_pc, $zero_offset]);
+
+  return $result;
+}
+
+# Add two hex addresses of length $address_length.
+# Run pprof --test for unit test if this is changed.
+sub AddressAdd {
+  my $addr1 = shift;
+  my $addr2 = shift;
+  my $sum;
+
+  if ($address_length == 8) {
+    # Perl doesn't cope with wraparound arithmetic, so do it explicitly:
+    $sum = (hex($addr1)+hex($addr2)) % (0x10000000 * 16);
+    return sprintf("%08x", $sum);
+
+  } else {
+    # Do the addition in 7-nibble chunks to trivialize carry handling.
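+    # With 16-digit addresses this splits each operand into pieces of
+    # 2 + 7 + 7 hex digits and adds them low chunk first, carrying
+    # explicitly between chunks.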
+
+    if ($main::opt_debug and $main::opt_test) {
+      print STDERR "AddressAdd $addr1 + $addr2 = ";
+    }
+
+    my $a1 = substr($addr1,-7);
+    $addr1 = substr($addr1,0,-7);
+    my $a2 = substr($addr2,-7);
+    $addr2 = substr($addr2,0,-7);
+    $sum = hex($a1) + hex($a2);
+    my $c = 0;
+    if ($sum > 0xfffffff) {
+      $c = 1;
+      $sum -= 0x10000000;
+    }
+    my $r = sprintf("%07x", $sum);
+
+    $a1 = substr($addr1,-7);
+    $addr1 = substr($addr1,0,-7);
+    $a2 = substr($addr2,-7);
+    $addr2 = substr($addr2,0,-7);
+    $sum = hex($a1) + hex($a2) + $c;
+    $c = 0;
+    if ($sum > 0xfffffff) {
+      $c = 1;
+      $sum -= 0x10000000;
+    }
+    $r = sprintf("%07x", $sum) . $r;
+
+    $sum = hex($addr1) + hex($addr2) + $c;
+    if ($sum > 0xff) { $sum -= 0x100; }
+    $r = sprintf("%02x", $sum) . $r;
+
+    if ($main::opt_debug and $main::opt_test) { print STDERR "$r\n"; }
+
+    return $r;
+  }
+}
+
+
+# Subtract two hex addresses of length $address_length.
+# Run pprof --test for unit test if this is changed.
+sub AddressSub {
+  my $addr1 = shift;
+  my $addr2 = shift;
+  my $diff;
+
+  if ($address_length == 8) {
+    # Perl doesn't cope with wraparound arithmetic, so do it explicitly:
+    $diff = (hex($addr1)-hex($addr2)) % (0x10000000 * 16);
+    return sprintf("%08x", $diff);
+
+  } else {
+    # Do the addition in 7-nibble chunks to trivialize borrow handling.
+    # if ($main::opt_debug) { print STDERR "AddressSub $addr1 - $addr2 = "; }
+
+    my $a1 = hex(substr($addr1,-7));
+    $addr1 = substr($addr1,0,-7);
+    my $a2 = hex(substr($addr2,-7));
+    $addr2 = substr($addr2,0,-7);
+    my $b = 0;
+    if ($a2 > $a1) {
+      $b = 1;
+      $a1 += 0x10000000;
+    }
+    $diff = $a1 - $a2;
+    my $r = sprintf("%07x", $diff);
+
+    $a1 = hex(substr($addr1,-7));
+    $addr1 = substr($addr1,0,-7);
+    $a2 = hex(substr($addr2,-7)) + $b;
+    $addr2 = substr($addr2,0,-7);
+    $b = 0;
+    if ($a2 > $a1) {
+      $b = 1;
+      $a1 += 0x10000000;
+    }
+    $diff = $a1 - $a2;
+    $r = sprintf("%07x", $diff) . $r;
+
+    $a1 = hex($addr1);
+    $a2 = hex($addr2) + $b;
+    if ($a2 > $a1) { $a1 += 0x100; }
+    $diff = $a1 - $a2;
+    $r = sprintf("%02x", $diff) . $r;
+
+    # if ($main::opt_debug) { print STDERR "$r\n"; }
+
+    return $r;
+  }
+}
+
+# Increment a hex address of length $address_length.
+# Run pprof --test for unit test if this is changed.
+sub AddressInc {
+  my $addr = shift;
+  my $sum;
+
+  if ($address_length == 8) {
+    # Perl doesn't cope with wraparound arithmetic, so do it explicitly:
+    $sum = (hex($addr)+1) % (0x10000000 * 16);
+    return sprintf("%08x", $sum);
+
+  } else {
+    # Do the addition in 7-nibble chunks to trivialize carry handling.
+    # We are always doing this to step through the addresses in a function,
+    # and will almost never overflow the first chunk, so we check for this
+    # case and exit early.
+
+    # if ($main::opt_debug) { print STDERR "AddressInc $addr1 = "; }
+
+    my $a1 = substr($addr,-7);
+    $addr = substr($addr,0,-7);
+    $sum = hex($a1) + 1;
+    my $r = sprintf("%07x", $sum);
+    if ($sum <= 0xfffffff) {
+      $r = $addr . $r;
+      # if ($main::opt_debug) { print STDERR "$r\n"; }
+      return HexExtend($r);
+    } else {
+      $r = "0000000";
+    }
+
+    $a1 = substr($addr,-7);
+    $addr = substr($addr,0,-7);
+    $sum = hex($a1) + 1;
+    $r = sprintf("%07x", $sum) . $r;
+    if ($sum <= 0xfffffff) {
+      $r = $addr . $r;
+      # if ($main::opt_debug) { print STDERR "$r\n"; }
+      return HexExtend($r);
+    } else {
+      $r = "00000000000000";
+    }
+
+    $sum = hex($addr) + 1;
+    if ($sum > 0xff) { $sum -= 0x100; }
+    $r = sprintf("%02x", $sum) . $r;
+
+    # if ($main::opt_debug) { print STDERR "$r\n"; }
+    return $r;
+  }
+}
+
+# Extract symbols for all PC values found in profile
+sub ExtractSymbols {
+  my $libs = shift;
+  my $pcset = shift;
+
+  my $symbols = {};
+
+  # Map each PC value to the containing library.  To make this faster,
+  # we sort libraries by their starting pc value (highest first), and
+  # advance through the libraries as we advance the pc.  Sometimes the
+  # addresses of libraries may overlap with the addresses of the main
+  # binary, so to make sure the libraries 'win', we iterate over the
+  # libraries in reverse order (which assumes the binary doesn't start
+  # in the middle of a library, which seems a fair assumption).
+  my @pcs = (sort { $a cmp $b } keys(%{$pcset}));  # pcset is 0-extended strings
+  foreach my $lib (sort {$b->[1] cmp $a->[1]} @{$libs}) {
+    my $libname = $lib->[0];
+    my $start = $lib->[1];
+    my $finish = $lib->[2];
+    my $offset = $lib->[3];
+
+    # Get list of pcs that belong in this library.
+    my $contained = [];
+    my ($start_pc_index, $finish_pc_index);
+    # Find smallest finish_pc_index such that $finish < $pc[$finish_pc_index].
+    for ($finish_pc_index = $#pcs + 1; $finish_pc_index > 0;
+         $finish_pc_index--) {
+      last if $pcs[$finish_pc_index - 1] le $finish;
+    }
+    # Find smallest start_pc_index such that $start <= $pc[$start_pc_index].
+    for ($start_pc_index = $finish_pc_index; $start_pc_index > 0;
+         $start_pc_index--) {
+      last if $pcs[$start_pc_index - 1] lt $start;
+    }
+    # This keeps PC values higher than $pc[$finish_pc_index] in @pcs,
+    # in case there are overlaps in libraries and the main binary.
+    @{$contained} = splice(@pcs, $start_pc_index,
+                           $finish_pc_index - $start_pc_index);
+    # Map to symbols
+    MapToSymbols($libname, AddressSub($start, $offset), $contained, $symbols);
+  }
+
+  return $symbols;
+}
+
+# Map list of PC values to symbols for a given image
+sub MapToSymbols {
+  my $image = shift;
+  my $offset = shift;
+  my $pclist = shift;
+  my $symbols = shift;
+
+  my $debug = 0;
+
+  # For libc (and other) libraries, the copy in /usr/lib/debug contains debugging symbols
+  my $debugging = DebuggingLibrary($image);
+  if ($debugging) {
+    $image = $debugging;
+  }
+
+  # Ignore empty binaries
+  if ($#{$pclist} < 0) { return; }
+
+  # Figure out the addr2line command to use
+  my $addr2line = $obj_tool_map{"addr2line"};
+  my $cmd = ShellEscape($addr2line, "-f", "-C", "-e", $image);
+  if (exists $obj_tool_map{"addr2line_pdb"}) {
+    $addr2line = $obj_tool_map{"addr2line_pdb"};
+    $cmd = ShellEscape($addr2line, "--demangle", "-f", "-C", "-e", $image);
+  }
+
+  # If "addr2line" isn't installed on the system at all, just use
+  # nm to get what info we can (function names, but not line numbers).
+  if (system(ShellEscape($addr2line, "--help") . " >$dev_null 2>&1") != 0) {
+    MapSymbolsWithNM($image, $offset, $pclist, $symbols);
+    return;
+  }
+
+  # "addr2line -i" can produce a variable number of lines per input
+  # address, with no separator that allows us to tell when data for
+  # the next address starts.  So we find the address for a special
+  # symbol (_fini) and interleave this address between all real
+  # addresses passed to addr2line.  The name of this special symbol
+  # can then be used as a separator.
+  $sep_address = undef;  # May be filled in by MapSymbolsWithNM()
+  my $nm_symbols = {};
+  MapSymbolsWithNM($image, $offset, $pclist, $nm_symbols);
+  if (defined($sep_address)) {
+    # Only add " -i" to addr2line if the binary supports it.
+    # addr2line --help returns 0, but not if it sees an unknown flag first.
+    if (system("$cmd -i --help >$dev_null 2>&1") == 0) {
+      $cmd .= " -i";
+    } else {
+      $sep_address = undef;   # no need for sep_address if we don't support -i
+    }
+  }
+
+  # Make a file with all PC values, with intervening 'sep_address' entries,
+  # so that we can reliably detect the end of each inlined function list
+  open(ADDRESSES, ">$main::tmpfile_sym") || error("$main::tmpfile_sym: $!\n");
+  if ($debug) { print("---- $image ---\n"); }
+  for (my $i = 0; $i <= $#{$pclist}; $i++) {
+    # addr2line always reads hex addresses, and does not need '0x' prefix.
+    if ($debug) { printf STDERR ("%s\n", $pclist->[$i]); }
+    printf ADDRESSES ("%s\n", AddressSub($pclist->[$i], $offset));
+    if (defined($sep_address)) {
+      printf ADDRESSES ("%s\n", $sep_address);
+    }
+  }
+  close(ADDRESSES);
+  if ($debug) {
+    print("----\n");
+    system("cat", $main::tmpfile_sym);
+    print("---- $cmd ---\n");
+    system("$cmd < " . ShellEscape($main::tmpfile_sym));
+    print("----\n");
+  }
+
+  open(SYMBOLS, "$cmd <" . ShellEscape($main::tmpfile_sym) . " |")
+      || error("$cmd: $!\n");
+  my $count = 0;   # Index in pclist
+  while (<SYMBOLS>) {
+    # Read fullfunction and filelineinfo from next pair of lines
+    s/\r?\n$//g;
+    my $fullfunction = $_;
+    $_ = <SYMBOLS>;
+    s/\r?\n$//g;
+    my $filelinenum = $_;
+
+    if (defined($sep_address) && $fullfunction eq $sep_symbol) {
+      # Terminating marker for data for this address
+      $count++;
+      next;
+    }
+
+    $filelinenum =~ s|\\|/|g; # turn windows-style paths into unix-style paths
+
+    # Remove discriminator markers as this comes after the line number and
+    # confuses the rest of this script.
+    $filelinenum =~ s/ \(discriminator \d+\)$//;
+    # Convert unknown line numbers into line 0.
+    $filelinenum =~ s/:\?$/:0/;
+
+    my $pcstr = $pclist->[$count];
+    my $function = ShortFunctionName($fullfunction);
+    my $nms = $nm_symbols->{$pcstr};
+    if (defined($nms)) {
+      if ($fullfunction eq '??') {
+        # nm found a symbol for us.
+        $function = $nms->[0];
+        $fullfunction = $nms->[2];
+      } else {
+        # MapSymbolsWithNM tags each routine with its starting address,
+        # useful in case the image has multiple occurrences of this
+        # routine.  (It uses a syntax that resembles template parameters,
+        # which are automatically stripped out by ShortFunctionName().)
+        # addr2line does not provide the same information.  So we check
+        # if nm disambiguated our symbol, and if so take the annotated
+        # (nm) version of the routine-name.  TODO(csilvers): this won't
+        # catch overloaded, inlined symbols, which nm doesn't see.
+        # Better would be to do a check similar to nm's, in this fn.
+        if ($nms->[2] =~ m/^\Q$function\E/) {  # sanity check it's the right fn
+          $function = $nms->[0];
+          $fullfunction = $nms->[2];
+        }
+      }
+    }
+
+    # Prepend to accumulated symbols for pcstr
+    # (so that caller comes before callee)
+    my $sym = $symbols->{$pcstr};
+    if (!defined($sym)) {
+      $sym = [];
+      $symbols->{$pcstr} = $sym;
+    }
+    unshift(@{$sym}, $function, $filelinenum, $fullfunction);
+    if ($debug) { printf STDERR ("%s => [%s]\n", $pcstr, join(" ", @{$sym})); }
+    if (!defined($sep_address)) {
+      # Inlining is off, so this entry ends immediately
+      $count++;
+    }
+  }
+  close(SYMBOLS);
+}
+
+# Use nm to map the list of referenced PCs to symbols.  Return true iff we
+# are able to read procedure information via nm.
+sub MapSymbolsWithNM {
+  my $image = shift;
+  my $offset = shift;
+  my $pclist = shift;
+  my $symbols = shift;
+
+  # Get nm output sorted by increasing address
+  my $symbol_table = GetProcedureBoundaries($image, ".");
+  if (!%{$symbol_table}) {
+    return 0;
+  }
+  # Start addresses are already the right length (8 or 16 hex digits).
+  my @names = sort { $symbol_table->{$a}->[0] cmp $symbol_table->{$b}->[0] }
+    keys(%{$symbol_table});
+
+  if ($#names < 0) {
+    # No symbols: just use addresses
+    foreach my $pc (@{$pclist}) {
+      my $pcstr = "0x" . $pc;
+      $symbols->{$pc} = [$pcstr, "?", $pcstr];
+    }
+    return 0;
+  }
+
+  # Sort addresses so we can do a join against nm output
+  my $index = 0;
+  my $fullname = $names[0];
+  my $name = ShortFunctionName($fullname);
+  foreach my $pc (sort { $a cmp $b } @{$pclist}) {
+    # Adjust for mapped offset
+    my $mpc = AddressSub($pc, $offset);
+    while (($index < $#names) && ($mpc ge $symbol_table->{$fullname}->[1])){
+      $index++;
+      $fullname = $names[$index];
+      $name = ShortFunctionName($fullname);
+    }
+    if ($mpc lt $symbol_table->{$fullname}->[1]) {
+      $symbols->{$pc} = [$name, "?", $fullname];
+    } else {
+      my $pcstr = "0x" . $pc;
+      $symbols->{$pc} = [$pcstr, "?", $pcstr];
+    }
+  }
+  return 1;
+}
+
+sub ShortFunctionName {
+  my $function = shift;
+  while ($function =~ s/\([^()]*\)(\s*const)?//g) { }   # Argument types
+  $function =~ s/<[0-9a-f]*>$//g;                # Remove Address
+  if (!$main::opt_no_strip_temp) {
+      while ($function =~ s/<[^<>]*>//g)  { }   # Remove template arguments
+  }
+  $function =~ s/^.*\s+(\w+::)/$1/;          # Remove leading type
+  return $function;
+}
+
+# Trim overly long symbols found in disassembler output
+sub CleanDisassembly {
+  my $d = shift;
+  while ($d =~ s/\([^()%]*\)(\s*const)?//g) { } # Argument types, not (%rax)
+  while ($d =~ s/(\w+)<[^<>]*>/$1/g)  { }       # Remove template arguments
+  return $d;
+}
+
+# Clean file name for display
+sub CleanFileName {
+  my ($f) = @_;
+  $f =~ s|^/proc/self/cwd/||;
+  $f =~ s|^\./||;
+  return $f;
+}
+
+# Make address relative to section and clean up for display
+sub UnparseAddress {
+  my ($offset, $address) = @_;
+  $address = AddressSub($address, $offset);
+  $address =~ s/^0x//;
+  $address =~ s/^0*//;
+  return $address;
+}
+
+##### Miscellaneous #####
+
+# Find the right versions of the above object tools to use.  The
+# argument is the program file being analyzed, and should be an ELF
+# 32-bit or ELF 64-bit executable file.  The location of the tools
+# is determined by considering the following options in this order:
+#   1) --tools option, if set
+#   2) PPROF_TOOLS environment variable, if set
+#   3) the directory containing pprof, then the default PATH
+sub ConfigureObjTools {
+  my $prog_file = shift;
+
+  # Check for the existence of $prog_file because /usr/bin/file does not
+  # predictably return error status in prod.
+  (-e $prog_file)  || error("$prog_file does not exist.\n");
+
+  my $file_type = undef;
+  if (-e "/usr/bin/file") {
+    # Follow symlinks (at least for systems where "file" supports that).
+    my $escaped_prog_file = ShellEscape($prog_file);
+    $file_type = `/usr/bin/file -L $escaped_prog_file 2>$dev_null ||
+                  /usr/bin/file $escaped_prog_file`;
+  } elsif ($^O eq "MSWin32") {
+    $file_type = "MS Windows";
+  } else {
+    print STDERR "WARNING: Can't determine the file type of $prog_file";
+  }
+
+  if ($file_type =~ /64-bit/) {
+    # Change $address_length to 16 if the program file is ELF 64-bit.
+    # We can't detect this from many (most?) heap or lock contention
+    # profiles, since the actual addresses referenced are generally in low
+    # memory even for 64-bit programs.
+    $address_length = 16;
+  }
+
+  if ($file_type =~ /MS Windows/) {
+    # For Windows, we provide a version of nm and addr2line as part of
+    # the open-source release, which is capable of parsing
+    # Windows-style PDB executables.  It should live in the path, or
+    # in the same directory as pprof.
+    $obj_tool_map{"nm_pdb"} = "nm-pdb";
+    $obj_tool_map{"addr2line_pdb"} = "addr2line-pdb";
+  }
+
+  if ($file_type =~ /Mach-O/) {
+    # OS X uses otool to examine Mach-O files, rather than objdump.
+    $obj_tool_map{"otool"} = "otool";
+    $obj_tool_map{"addr2line"} = "false";  # no addr2line
+    $obj_tool_map{"objdump"} = "false";  # no objdump
+  }
+
+  # Go fill in %obj_tool_map with the pathnames to use:
+  foreach my $tool (keys %obj_tool_map) {
+    $obj_tool_map{$tool} = ConfigureTool($obj_tool_map{$tool});
+  }
+}
+
+# Returns the path of a caller-specified object tool.  If --tools or
+# PPROF_TOOLS are specified, then returns the full path to the tool
+# with that prefix.  Otherwise, returns the path unmodified (which
+# means we will look for it on PATH).
+sub ConfigureTool {
+  my $tool = shift;
+  my $path;
+
+  # --tools (or $PPROF_TOOLS) is a comma separated list, where each
+  # item is either a) a pathname prefix, or b) a map of the form
+  # <tool>:<path>.  First we look for an entry of type (b) for our
+  # tool.  If one is found, we use it.  Otherwise, we consider all the
+  # pathname prefixes in turn, until one yields an existing file.  If
+  # none does, we use a default path.
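+  # For example (illustrative, hypothetical paths), one might pass
+  #   --tools=/alt/cross/bin/
+  # to use a prefix, or
+  #   --tools=nm:/alt/bin/gnm,addr2line:/alt/bin/gaddr2line
+  # to point individual tools at specific binaries; the two forms can be
+  # mixed in one comma-separated list.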
+  my $tools = $main::opt_tools || $ENV{"PPROF_TOOLS"} || "";
+  if ($tools =~ m/(,|^)\Q$tool\E:([^,]*)/) {
+    $path = $2;
+    # TODO(csilvers): sanity-check that $path exists?  Hard if it's relative.
+  } elsif ($tools ne '') {
+    foreach my $prefix (split(',', $tools)) {
+      next if ($prefix =~ /:/);    # ignore "tool:fullpath" entries in the list
+      if (-x $prefix . $tool) {
+        $path = $prefix . $tool;
+        last;
+      }
+    }
+    if (!$path) {
+      error("No '$tool' found with prefix specified by " .
+            "--tools (or \$PPROF_TOOLS) '$tools'\n");
+    }
+  } else {
+    # ... otherwise use the version that exists in the same directory as
+    # pprof.  If there's nothing there, use $PATH.
+    $0 =~ m,[^/]*$,;     # this is everything after the last slash
+    my $dirname = $`;    # this is everything up to and including the last slash
+    if (-x "$dirname$tool") {
+      $path = "$dirname$tool";
+    } else { 
+      $path = $tool;
+    }
+  }
+  if ($main::opt_debug) { print STDERR "Using '$path' for '$tool'.\n"; }
+  return $path;
+}
+
+sub ShellEscape {
+  my @escaped_words = ();
+  foreach my $word (@_) {
+    my $escaped_word = $word;
+    if ($word =~ m![^a-zA-Z0-9/.,_=-]!) {  # check for anything not in whitelist
+      $escaped_word =~ s/'/'\\''/g;
+      $escaped_word = "'$escaped_word'";
+    }
+    push(@escaped_words, $escaped_word);
+  }
+  return join(" ", @escaped_words);
+}
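+# Examples (illustrative) for ShellEscape above:
+#   ShellEscape("nm", "-n", "/tmp/my prog")   yields:  nm -n '/tmp/my prog'
+#   a word with an embedded quote, "it's",    yields:  'it'\''s'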
+
+sub cleanup {
+  unlink($main::tmpfile_sym);
+  unlink(keys %main::tempnames);
+
+  # We leave any collected profiles in $HOME/pprof in case the user wants
+  # to look at them later.  We print a message informing them of this.
+  if ((scalar(@main::profile_files) > 0) &&
+      defined($main::collected_profile)) {
+    if (scalar(@main::profile_files) == 1) {
+      print STDERR "Dynamically gathered profile is in $main::collected_profile\n";
+    }
+    print STDERR "If you want to investigate this profile further, you can do:\n";
+    print STDERR "\n";
+    print STDERR "  pprof \\\n";
+    print STDERR "    $main::prog \\\n";
+    print STDERR "    $main::collected_profile\n";
+    print STDERR "\n";
+  }
+}
+
+sub sighandler {
+  cleanup();
+  exit(1);
+}
+
+sub error {
+  my $msg = shift;
+  print STDERR $msg;
+  cleanup();
+  exit(1);
+}
+
+
+# Run $nm_command and get all the resulting procedure boundaries whose
+# names match "$regexp" and return them in a hashtable mapping from
+# procedure name to a two-element vector of [start address, end address]
+sub GetProcedureBoundariesViaNm {
+  my $escaped_nm_command = shift;    # shell-escaped
+  my $regexp = shift;
+  my $image = shift;
+
+  my $symbol_table = {};
+  open(NM, "$escaped_nm_command |") || error("$escaped_nm_command: $!\n");
+  my $last_start = "0";
+  my $routine = "";
+  while (<NM>) {
+    s/\r//g;         # turn windows-looking lines into unix-looking lines
+    if (m/^\s*([0-9a-f]+) (.) (..*)/) {
+      my $start_val = $1;
+      my $type = $2;
+      my $this_routine = $3;
+
+      # It's possible for two symbols to share the same address, if
+      # one is a zero-length variable (like __start_google_malloc) or
+      # one symbol is a weak alias to another (like __libc_malloc).
+      # In such cases, we want to ignore all values except for the
+      # actual symbol, which in nm-speak has type "T".  The logic
+      # below does this, though it's a bit tricky: what happens when
+      # we have a series of lines with the same address, is the first
+      # one gets queued up to be processed.  However, it won't
+      # *actually* be processed until later, when we read a line with
+      # a different address.  That means that as long as we're reading
+      # lines with the same address, we have a chance to replace that
+      # item in the queue, which we do whenever we see a 'T' entry --
+      # that is, a line with type 'T'.  If we never see a 'T' entry,
+      # we'll just go ahead and process the first entry (which never
+      # got touched in the queue), and ignore the others.
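+      # For example (illustrative), given nm output such as
+      #   00401000 W __libc_malloc
+      #   00401000 T malloc
+      #   00402000 T free
+      # the first line is queued, the second (same address, type 'T')
+      # replaces it, and once the third line arrives at a new address we
+      # record "malloc" with boundaries [00401000, 00402000).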
+      if ($start_val eq $last_start && $type =~ /t/i) {
+        # We are the 'T' symbol at this address, replace previous symbol.
+        $routine = $this_routine;
+        next;
+      } elsif ($start_val eq $last_start) {
+        # We're not the 'T' symbol at this address, so ignore us.
+        next;
+      }
+
+      if ($this_routine eq $sep_symbol) {
+        $sep_address = HexExtend($start_val);
+      }
+
+      # Tag this routine with the starting address in case the image
+      # has multiple occurrences of this routine.  We use a syntax
+      # that resembles template parameters, which are automatically
+      # stripped out by ShortFunctionName().
+      $this_routine .= "<$start_val>";
+
+      if (defined($routine) && $routine =~ m/$regexp/) {
+        $symbol_table->{$routine} = [HexExtend($last_start),
+                                     HexExtend($start_val)];
+      }
+      $last_start = $start_val;
+      $routine = $this_routine;
+    } elsif (m/^Loaded image name: (.+)/) {
+      # The win32 nm workalike emits information about the binary it is using.
+      if ($main::opt_debug) { print STDERR "Using Image $1\n"; }
+    } elsif (m/^PDB file name: (.+)/) {
+      # The win32 nm workalike emits information about the pdb it is using.
+      if ($main::opt_debug) { print STDERR "Using PDB $1\n"; }
+    }
+  }
+  close(NM);
+  # Handle the last line in the nm output.  Unfortunately, we don't know
+  # how big this last symbol is, because we don't know how big the file
+  # is.  For now, we just give it a size of 0.
+  # TODO(csilvers): do better here.
+  if (defined($routine) && $routine =~ m/$regexp/) {
+    $symbol_table->{$routine} = [HexExtend($last_start),
+                                 HexExtend($last_start)];
+  }
+
+  # Verify that addr2line can find the $sep_symbol.  If not, use objdump
+  # to find an address for the $sep_symbol in the code (text) section that
+  # addr2line can resolve.
+  if (defined($sep_address)){
+    my $start_val = $sep_address;
+    my $addr2line = $obj_tool_map{"addr2line"};
+    my $cmd = ShellEscape($addr2line, "-f", "-C", "-e", $image, "-i");
+    open(FINI, "echo $start_val | $cmd  |")
+         || error("echo $start_val | $cmd: $!\n");
+    $_ = <FINI>;
+    s/\r?\n$//g;
+    my $fini = $_;
+    close(FINI);
+    if ($fini ne $sep_symbol){
+      my $objdump =  $obj_tool_map{"objdump"};
+      $cmd = ShellEscape($objdump, "-d", $image);
+      my $grep = ShellEscape("grep", $sep_symbol);
+      my $tail = ShellEscape("tail", "-n", "1");
+      open(FINI, "$cmd | $grep | $tail |")
+           || error("$cmd | $grep | $tail: $!\n");
+      my $data = <FINI>;
+      if (defined($data)){
+        $data =~ s/\r//g; # turn windows-looking lines into unix-looking lines
+        ($start_val, $fini) = split(/ </,$data);
+      }
+      close(FINI);
+    }
+    $sep_address = HexExtend($start_val);
+  }
+
+  return $symbol_table;
+}
+
+# Gets the procedure boundaries for all routines in "$image" whose names
+# match "$regexp" and returns them in a hashtable mapping from procedure
+# name to a two-element vector of [start address, end address].
+# Will return an empty map if nm is not installed or not working properly.
+sub GetProcedureBoundaries {
+  my $image = shift;
+  my $regexp = shift;
+
+  # If $image doesn't start with /, then put ./ in front of it.  This works
+  # around an obnoxious bug in our probing of nm -f behavior.
+  # "nm -f $image" is supposed to fail on GNU nm, but if:
+  #
+  # a. $image starts with [BbSsPp] (for example, bin/foo/bar), AND
+  # b. you have a.out in your current directory (a not uncommon occurrence)
+  #
+  # then "nm -f $image" succeeds because -f only looks at the first letter of
+  # the argument, which looks valid because it's [BbSsPp], and then since
+  # there's no image provided, it looks for a.out and finds it.
+  #
+  # This regex makes sure that $image starts with . or /, forcing the -f
+  # parsing to fail since . and / are not valid formats.
+  $image =~ s#^[^/]#./$&#;
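+  # For example, "bin/foo" becomes "./bin/foo", while "/usr/bin/foo" is
+  # left unchanged.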
+
+  # For libc libraries, the copy in /usr/lib/debug contains debugging symbols
+  my $debugging = DebuggingLibrary($image);
+  if ($debugging) {
+    $image = $debugging;
+  }
+
+  my $nm = $obj_tool_map{"nm"};
+  my $cppfilt = $obj_tool_map{"c++filt"};
+
+  # nm can fail for two reasons: 1) $image isn't a debug library; 2) nm
+  # binary doesn't support --demangle.  In addition, for OS X we need
+  # to use the -f flag to get 'flat' nm output (otherwise we don't sort
+  # properly and get incorrect results).  Unfortunately, GNU nm uses -f
+  # in an incompatible way.  So first we test whether our nm supports
+  # --demangle and -f.
+  my $demangle_flag = "";
+  my $cppfilt_flag = "";
+  my $to_devnull = ">$dev_null 2>&1";
+  if (system(ShellEscape($nm, "--demangle", $image) . $to_devnull) == 0) {
+    # In this mode, we do "nm --demangle <foo>"
+    $demangle_flag = "--demangle";
+    $cppfilt_flag = "";
+  } elsif (system(ShellEscape($cppfilt, $image) . $to_devnull) == 0) {
+    # In this mode, we do "nm <foo> | c++filt"
+    $cppfilt_flag = " | " . ShellEscape($cppfilt);
+  }
+  my $flatten_flag = "";
+  if (system(ShellEscape($nm, "-f", $image) . $to_devnull) == 0) {
+    $flatten_flag = "-f";
+  }
+
+  # Finally, in case $image isn't a debug library, we try again with
+  # -D to at least get *exported* symbols.  If we can't use --demangle,
+  # we use c++filt instead, if it exists on this system.
+  my @nm_commands = (ShellEscape($nm, "-n", $flatten_flag, $demangle_flag,
+                                 $image) . " 2>$dev_null $cppfilt_flag",
+                     ShellEscape($nm, "-D", "-n", $flatten_flag, $demangle_flag,
+                                 $image) . " 2>$dev_null $cppfilt_flag",
+                     # 6nm is for Go binaries
+                     ShellEscape("6nm", "$image") . " 2>$dev_null | sort",
+                     );
+
+  # If the executable is an MS Windows PDB-format executable, we'll
+  # have set up obj_tool_map("nm_pdb").  In this case, we actually
+  # want to use both unix nm and windows-specific nm_pdb, since
+  # PDB-format executables can apparently include dwarf .o files.
+  if (exists $obj_tool_map{"nm_pdb"}) {
+    push(@nm_commands,
+         ShellEscape($obj_tool_map{"nm_pdb"}, "--demangle", $image)
+         . " 2>$dev_null");
+  }
+
+  foreach my $nm_command (@nm_commands) {
+    my $symbol_table = GetProcedureBoundariesViaNm($nm_command, $regexp, $image);
+    return $symbol_table if (%{$symbol_table});
+  }
+  my $symbol_table = {};
+  return $symbol_table;
+}
+
+
+# The test vectors for AddressAdd/Sub/Inc are 8-16-nibble hex strings.
+# To make them more readable, we add underscores at interesting places.
+# This routine removes the underscores, producing the canonical representation
+# used by pprof to represent addresses, particularly in the tested routines.
+sub CanonicalHex {
+  my $arg = shift;
+  return join '', (split '_',$arg);
+}
+
+
+# Unit test for AddressAdd:
+sub AddressAddUnitTest {
+  my $test_data_8 = shift;
+  my $test_data_16 = shift;
+  my $error_count = 0;
+  my $fail_count = 0;
+  my $pass_count = 0;
+  # print STDERR "AddressAddUnitTest: ", 1+$#{$test_data_8}, " tests\n";
+
+  # First a few 8-nibble addresses.  Note that this implementation uses
+  # plain old arithmetic, so this is a quick sanity check, along with
+  # verifying what happens on overflow (we want it to wrap):
+  $address_length = 8;
+  foreach my $row (@{$test_data_8}) {
+    if ($main::opt_debug and $main::opt_test) { print STDERR "@{$row}\n"; }
+    my $sum = AddressAdd ($row->[0], $row->[1]);
+    if ($sum ne $row->[2]) {
+      printf STDERR "ERROR: %s != %s + %s = %s\n", $sum,
+             $row->[0], $row->[1], $row->[2];
+      ++$fail_count;
+    } else {
+      ++$pass_count;
+    }
+  }
+  printf STDERR "AddressAdd 32-bit tests: %d passes, %d failures\n",
+         $pass_count, $fail_count;
+  $error_count = $fail_count;
+  $fail_count = 0;
+  $pass_count = 0;
+
+  # Now 16-nibble addresses.
+  $address_length = 16;
+  foreach my $row (@{$test_data_16}) {
+    if ($main::opt_debug and $main::opt_test) { print STDERR "@{$row}\n"; }
+    my $sum = AddressAdd (CanonicalHex($row->[0]), CanonicalHex($row->[1]));
+    my $expected = CanonicalHex($row->[2]);
+    if ($sum ne $expected) {
+      printf STDERR "ERROR: %s != %s + %s = %s\n", $sum,
+             $row->[0], $row->[1], $row->[2];
+      ++$fail_count;
+    } else {
+      ++$pass_count;
+    }
+  }
+  printf STDERR "AddressAdd 64-bit tests: %d passes, %d failures\n",
+         $pass_count, $fail_count;
+  $error_count += $fail_count;
+
+  return $error_count;
+}
+
+
+# Unit test for AddressSub:
+sub AddressSubUnitTest {
+  my $test_data_8 = shift;
+  my $test_data_16 = shift;
+  my $error_count = 0;
+  my $fail_count = 0;
+  my $pass_count = 0;
+  # print STDERR "AddressSubUnitTest: ", 1+$#{$test_data_8}, " tests\n";
+
+  # First a few 8-nibble addresses.  Note that this implementation uses
+  # plain old arithmetic, so this is a quick sanity check, along with
+  # verifying what happens on overflow (we want it to wrap):
+  $address_length = 8;
+  foreach my $row (@{$test_data_8}) {
+    if ($main::opt_debug and $main::opt_test) { print STDERR "@{$row}\n"; }
+    my $sum = AddressSub ($row->[0], $row->[1]);
+    if ($sum ne $row->[3]) {
+      printf STDERR "ERROR: %s != %s - %s = %s\n", $sum,
+             $row->[0], $row->[1], $row->[3];
+      ++$fail_count;
+    } else {
+      ++$pass_count;
+    }
+  }
+  printf STDERR "AddressSub 32-bit tests: %d passes, %d failures\n",
+         $pass_count, $fail_count;
+  $error_count = $fail_count;
+  $fail_count = 0;
+  $pass_count = 0;
+
+  # Now 16-nibble addresses.
+  $address_length = 16;
+  foreach my $row (@{$test_data_16}) {
+    if ($main::opt_debug and $main::opt_test) { print STDERR "@{$row}\n"; }
+    my $sum = AddressSub (CanonicalHex($row->[0]), CanonicalHex($row->[1]));
+    if ($sum ne CanonicalHex($row->[3])) {
+      printf STDERR "ERROR: %s != %s - %s = %s\n", $sum,
+             $row->[0], $row->[1], $row->[3];
+      ++$fail_count;
+    } else {
+      ++$pass_count;
+    }
+  }
+  printf STDERR "AddressSub 64-bit tests: %d passes, %d failures\n",
+         $pass_count, $fail_count;
+  $error_count += $fail_count;
+
+  return $error_count;
+}
+
+
+# Unit test for AddressInc:
+sub AddressIncUnitTest {
+  my $test_data_8 = shift;
+  my $test_data_16 = shift;
+  my $error_count = 0;
+  my $fail_count = 0;
+  my $pass_count = 0;
+  # print STDERR "AddressIncUnitTest: ", 1+$#{$test_data_8}, " tests\n";
+
+  # First a few 8-nibble addresses.  Note that this implementation uses
+  # plain old arithmetic, so this is a quick sanity check, along with
+  # verifying what happens on overflow (we want it to wrap):
+  $address_length = 8;
+  foreach my $row (@{$test_data_8}) {
+    if ($main::opt_debug and $main::opt_test) { print STDERR "@{$row}\n"; }
+    my $sum = AddressInc ($row->[0]);
+    if ($sum ne $row->[4]) {
+      printf STDERR "ERROR: %s != %s + 1 = %s\n", $sum,
+             $row->[0], $row->[4];
+      ++$fail_count;
+    } else {
+      ++$pass_count;
+    }
+  }
+  printf STDERR "AddressInc 32-bit tests: %d passes, %d failures\n",
+         $pass_count, $fail_count;
+  $error_count = $fail_count;
+  $fail_count = 0;
+  $pass_count = 0;
+
+  # Now 16-nibble addresses.
+  $address_length = 16;
+  foreach my $row (@{$test_data_16}) {
+    if ($main::opt_debug and $main::opt_test) { print STDERR "@{$row}\n"; }
+    my $sum = AddressInc (CanonicalHex($row->[0]));
+    if ($sum ne CanonicalHex($row->[4])) {
+      printf STDERR "ERROR: %s != %s + 1 = %s\n", $sum,
+             $row->[0], $row->[4];
+      ++$fail_count;
+    } else {
+      ++$pass_count;
+    }
+  }
+  printf STDERR "AddressInc 64-bit tests: %d passes, %d failures\n",
+         $pass_count, $fail_count;
+  $error_count += $fail_count;
+
+  return $error_count;
+}
+
+
+# Driver for unit tests.
+# Currently just the address add/subtract/increment routines for 64-bit.
+sub RunUnitTests {
+  my $error_count = 0;
+
+  # This is a list of tuples [a, b, a+b, a-b, a+1]
+  my $unit_test_data_8 = [
+    [qw(aaaaaaaa 50505050 fafafafa 5a5a5a5a aaaaaaab)],
+    [qw(50505050 aaaaaaaa fafafafa a5a5a5a6 50505051)],
+    [qw(ffffffff aaaaaaaa aaaaaaa9 55555555 00000000)],
+    [qw(00000001 ffffffff 00000000 00000002 00000002)],
+    [qw(00000001 fffffff0 fffffff1 00000011 00000002)],
+  ];
+  my $unit_test_data_16 = [
+    # The implementation handles data in 7-nibble chunks, so those are the
+    # interesting boundaries.
+    [qw(aaaaaaaa 50505050
+        00_000000f_afafafa 00_0000005_a5a5a5a 00_000000a_aaaaaab)],
+    [qw(50505050 aaaaaaaa
+        00_000000f_afafafa ff_ffffffa_5a5a5a6 00_0000005_0505051)],
+    [qw(ffffffff aaaaaaaa
+        00_000001a_aaaaaa9 00_0000005_5555555 00_0000010_0000000)],
+    [qw(00000001 ffffffff
+        00_0000010_0000000 ff_ffffff0_0000002 00_0000000_0000002)],
+    [qw(00000001 fffffff0
+        00_000000f_ffffff1 ff_ffffff0_0000011 00_0000000_0000002)],
+
+    [qw(00_a00000a_aaaaaaa 50505050
+        00_a00000f_afafafa 00_a000005_a5a5a5a 00_a00000a_aaaaaab)],
+    [qw(0f_fff0005_0505050 aaaaaaaa
+        0f_fff000f_afafafa 0f_ffefffa_5a5a5a6 0f_fff0005_0505051)],
+    [qw(00_000000f_fffffff 01_800000a_aaaaaaa
+        01_800001a_aaaaaa9 fe_8000005_5555555 00_0000010_0000000)],
+    [qw(00_0000000_0000001 ff_fffffff_fffffff
+        00_0000000_0000000 00_0000000_0000002 00_0000000_0000002)],
+    [qw(00_0000000_0000001 ff_fffffff_ffffff0
+        ff_fffffff_ffffff1 00_0000000_0000011 00_0000000_0000002)],
+  ];
+
+  $error_count += AddressAddUnitTest($unit_test_data_8, $unit_test_data_16);
+  $error_count += AddressSubUnitTest($unit_test_data_8, $unit_test_data_16);
+  $error_count += AddressIncUnitTest($unit_test_data_8, $unit_test_data_16);
+  if ($error_count > 0) {
+    print STDERR $error_count, " errors: FAILED\n";
+  } else {
+    print STDERR "PASS\n";
+  }
+  exit ($error_count);
+}
diff --git a/src/profile-handler.cc b/src/profile-handler.cc
new file mode 100644
index 0000000..66c9d74
--- /dev/null
+++ b/src/profile-handler.cc
@@ -0,0 +1,697 @@
+// -*- Mode: C++; c-basic-offset: 2; indent-tabs-mode: nil -*-
+// Copyright (c) 2009, Google Inc.
+// All rights reserved.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+// 
+//     * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+//     * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+// 
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+// ---
+// Author: Sanjay Ghemawat
+//         Nabeel Mian
+//
+// Implements management of profile timers and the corresponding signal handler.
+
+#include "config.h"
+#include "profile-handler.h"
+
+#if !(defined(__CYGWIN__) || defined(__CYGWIN32__))
+
+#include <stdio.h>
+#include <errno.h>
+#include <sys/time.h>
+
+#include <list>
+#include <string>
+
+#if HAVE_LINUX_SIGEV_THREAD_ID
+// for timer_{create,settime} and associated typedefs & constants
+#include <time.h>
+// for sys_gettid
+#include "base/linux_syscall_support.h"
+// for perftools_pthread_key_create
+#include "maybe_threads.h"
+#endif
+
+#include "base/dynamic_annotations.h"
+#include "base/googleinit.h"
+#include "base/logging.h"
+#include "base/spinlock.h"
+#include "maybe_threads.h"
+
+using std::list;
+using std::string;
+
+// This structure is used by ProfileHandlerRegisterCallback and
+// ProfileHandlerUnregisterCallback as a handle to a registered callback.
+struct ProfileHandlerToken {
+  // Sets the callback and associated arg.
+  ProfileHandlerToken(ProfileHandlerCallback cb, void* cb_arg)
+      : callback(cb),
+        callback_arg(cb_arg) {
+  }
+
+  // Callback function to be invoked on receiving a profile timer interrupt.
+  ProfileHandlerCallback callback;
+  // Argument for the callback function.
+  void* callback_arg;
+};
+
+// This class manages profile timers and the associated signal handler.
+// It is a singleton.
+class ProfileHandler {
+ public:
+  // Registers the current thread with the profile handler. On systems which
+  // have a separate interval timer for each thread, this function starts the
+  // timer for the current thread.
+  //
+  // The function also attempts to determine whether or not timers are shared by
+  // all threads in the process.  (With LinuxThreads, and with NPTL on some
+  // Linux kernel versions, each thread has separate timers.)
+  //
+  // Prior to determining whether timers are shared, this function will
+  // unconditionally start the timer.  However, if this function determines
+  // that timers are shared, then it will stop the timer if no callbacks are
+  // currently registered.
+  void RegisterThread();
+
+  // Registers a callback routine to receive profile timer ticks. The returned
+  // token is to be used when unregistering this callback and must not be
+  // deleted by the caller. Registration of the first callback enables the
+  // SIGPROF handler (or SIGALRM if using ITIMER_REAL).
+  ProfileHandlerToken* RegisterCallback(ProfileHandlerCallback callback,
+                                        void* callback_arg);
+
+  // Unregisters a previously registered callback. Expects the token returned
+  // by the corresponding RegisterCallback routine. Unregistering the last
+  // callback disables the SIGPROF handler (or SIGALRM if using ITIMER_REAL).
+  void UnregisterCallback(ProfileHandlerToken* token)
+      NO_THREAD_SAFETY_ANALYSIS;
+
+  // Unregisters all the callbacks, stops the timer if shared, disables the
+  // SIGPROF (or SIGALRM) handler and clears the timer_sharing_ state.
+  void Reset();
+
+  // Gets the current state of profile handler.
+  void GetState(ProfileHandlerState* state);
+
+  // Initializes and returns the ProfileHandler singleton.
+  static ProfileHandler* Instance();
+
+ private:
+  ProfileHandler();
+  ~ProfileHandler();
+
+  // Largest allowed frequency.
+  static const int32 kMaxFrequency = 4000;
+  // Default frequency.
+  static const int32 kDefaultFrequency = 100;
+
+  // ProfileHandler singleton.
+  static ProfileHandler* instance_;
+
+  // pthread_once_t for one time initialization of ProfileHandler singleton.
+  static pthread_once_t once_;
+
+  // Initializes the ProfileHandler singleton via GoogleOnceInit.
+  static void Init();
+
+  // The number of SIGPROF (or SIGALRM for ITIMER_REAL) interrupts received.
+  int64 interrupts_ GUARDED_BY(signal_lock_);
+
+  // SIGPROF/SIGALRM interrupt frequency, read-only after construction.
+  int32 frequency_;
+
+  // ITIMER_PROF (which uses SIGPROF), or ITIMER_REAL (which uses SIGALRM)
+  int timer_type_;
+
+  // Signal number for timer signal.
+  int signal_number_;
+
+  // Counts the number of callbacks registered.
+  int32 callback_count_ GUARDED_BY(control_lock_);
+
+  // Is profiling allowed at all?
+  bool allowed_;
+
+  bool per_thread_timer_enabled_;
+
+#ifdef HAVE_LINUX_SIGEV_THREAD_ID
+  // this is used to destroy per-thread profiling timers on thread
+  // termination
+  pthread_key_t thread_timer_key;
+#endif
+
+  // Whether or not the threading system provides interval timers that are
+  // shared by all threads in a process.
+  enum {
+    // No timer initialization attempted yet.
+    TIMERS_UNTOUCHED,
+    // First thread has registered and set timer.
+    TIMERS_ONE_SET,
+    // Timers are shared by all threads.
+    TIMERS_SHARED,
+    // Timers are separate in each thread.
+    TIMERS_SEPARATE
+  } timer_sharing_ GUARDED_BY(control_lock_);
+
+  // This lock serializes the registration of threads and protects the
+  // callbacks_ list below.
+  // Locking order:
+  // In the context of a signal handler, acquire signal_lock_ to walk the
+  // callback list. Otherwise, acquire control_lock_, disable the signal
+  // handler and then acquire signal_lock_.
+  SpinLock control_lock_ ACQUIRED_BEFORE(signal_lock_);
+  SpinLock signal_lock_;
+
+  // Holds the list of registered callbacks. We expect the list to be pretty
+  // small. Currently, the cpu profiler (base/profiler) and thread module
+  // (base/thread.h) are the only two components registering callbacks.
+  // Following are the locking requirements for callbacks_:
+  // For read-write access outside the SIGPROF handler:
+  //  - Acquire control_lock_
+  //  - Disable SIGPROF handler.
+  //  - Acquire signal_lock_
+  // For read-only access in the context of SIGPROF handler
+  // (Read-write access is *not allowed* in the SIGPROF handler)
+  //  - Acquire signal_lock_
+  // For read-only access outside SIGPROF handler:
+  //  - Acquire control_lock_
+  typedef list<ProfileHandlerToken*> CallbackList;
+  typedef CallbackList::iterator CallbackIterator;
+  CallbackList callbacks_ GUARDED_BY(signal_lock_);
+
+  // Starts the interval timer.  If the thread library shares timers between
+  // threads, this function starts the shared timer. Otherwise, this will start
+  // the timer in the current thread.
+  void StartTimer() EXCLUSIVE_LOCKS_REQUIRED(control_lock_);
+
+  // Stops the interval timer. If the thread library shares timers between
+  // threads, this function stops the shared timer. Otherwise, this will stop
+  // the timer in the current thread.
+  void StopTimer() EXCLUSIVE_LOCKS_REQUIRED(control_lock_);
+
+  // Returns true if the profile interval timer is enabled in the current
+  // thread.  This actually checks the kernel's interval timer setting.  (It is
+  // used to detect whether timers are shared or separate.)
+  bool IsTimerRunning() EXCLUSIVE_LOCKS_REQUIRED(control_lock_);
+
+  // Sets the timer interrupt signal handler.
+  void EnableHandler() EXCLUSIVE_LOCKS_REQUIRED(control_lock_);
+
+  // Disables (ignores) the timer interrupt signal.
+  void DisableHandler() EXCLUSIVE_LOCKS_REQUIRED(control_lock_);
+
+  // Returns true if the handler is not being used by something else.
+  // This checks the kernel's signal handler table.
+  bool IsSignalHandlerAvailable();
+
+  // SIGPROF/SIGALRM handler. Iterate over and call all the registered callbacks.
+  static void SignalHandler(int sig, siginfo_t* sinfo, void* ucontext);
+
+  DISALLOW_COPY_AND_ASSIGN(ProfileHandler);
+};
+
+ProfileHandler* ProfileHandler::instance_ = NULL;
+pthread_once_t ProfileHandler::once_ = PTHREAD_ONCE_INIT;
+
+const int32 ProfileHandler::kMaxFrequency;
+const int32 ProfileHandler::kDefaultFrequency;
+
+// If we are LD_PRELOAD-ed against a non-pthreads app, then
+// pthread_once won't be defined.  We declare it here, for that
+// case (with weak linkage) which will cause the non-definition to
+// resolve to NULL.  We can then check for NULL or not in Instance.
+extern "C" int pthread_once(pthread_once_t *, void (*)(void))
+    ATTRIBUTE_WEAK;
+
+#if HAVE_LINUX_SIGEV_THREAD_ID
+
+// We declare timer_create and friends as weak symbols to avoid a hard
+// runtime dependency on -lrt and, in turn, -lpthread.
+//
+// At runtime we detect whether timer_create is available, and if so we
+// can enable the linux-sigev-thread mode of profiling.
+extern "C" {
+  int timer_create(clockid_t clockid, struct sigevent *evp,
+                            timer_t *timerid)
+    ATTRIBUTE_WEAK;
+  int timer_delete(timer_t timerid)
+    ATTRIBUTE_WEAK;
+  int timer_settime(timer_t timerid, int flags,
+                    const struct itimerspec *value,
+                    struct itimerspec *ovalue)
+    ATTRIBUTE_WEAK;
+}
+
+struct timer_id_holder {
+  timer_t timerid;
+  timer_id_holder(timer_t _timerid) : timerid(_timerid) {}
+};
+
+extern "C" {
+  static void ThreadTimerDestructor(void *arg) {
+    if (!arg) {
+      return;
+    }
+    timer_id_holder *holder = static_cast<timer_id_holder *>(arg);
+    timer_delete(holder->timerid);
+    delete holder;
+  }
+}
+
+static void CreateThreadTimerKey(pthread_key_t *pkey) {
+  int rv = perftools_pthread_key_create(pkey, ThreadTimerDestructor);
+  if (rv) {
+    RAW_LOG(FATAL, "aborting due to pthread_key_create error: %s", strerror(rv));
+  }
+}
+
+static void StartLinuxThreadTimer(int timer_type, int signal_number,
+                                  int32 frequency, pthread_key_t timer_key) {
+  int rv;
+  struct sigevent sevp;
+  timer_t timerid;
+  struct itimerspec its;
+  memset(&sevp, 0, sizeof(sevp));
+  sevp.sigev_notify = SIGEV_THREAD_ID;
+  sevp._sigev_un._tid = sys_gettid();
+  sevp.sigev_signo = signal_number;
+  clockid_t clock = CLOCK_THREAD_CPUTIME_ID;
+  if (timer_type == ITIMER_REAL) {
+    clock = CLOCK_MONOTONIC;
+  }
+  rv = timer_create(clock, &sevp, &timerid);
+  if (rv) {
+    RAW_LOG(FATAL, "aborting due to timer_create error: %s", strerror(errno));
+  }
+
+  timer_id_holder *holder = new timer_id_holder(timerid);
+  rv = perftools_pthread_setspecific(timer_key, holder);
+  if (rv) {
+    RAW_LOG(FATAL, "aborting due to pthread_setspecific error: %s", strerror(rv));
+  }
+
+  its.it_interval.tv_sec = 0;
+  its.it_interval.tv_nsec = 1000000000 / frequency;
+  its.it_value = its.it_interval;
+  rv = timer_settime(timerid, 0, &its, 0);
+  if (rv) {
+    RAW_LOG(FATAL, "aborting due to timer_settime error: %s", strerror(errno));
+  }
+}
+#endif
+
+void ProfileHandler::Init() {
+  instance_ = new ProfileHandler();
+}
+
+ProfileHandler* ProfileHandler::Instance() {
+  if (pthread_once) {
+    pthread_once(&once_, Init);
+  }
+  if (instance_ == NULL) {
+    // This will be true on systems that don't link in pthreads,
+    // including on FreeBSD where pthread_once has a non-zero address
+    // (but doesn't do anything) even when pthreads isn't linked in.
+    Init();
+    assert(instance_ != NULL);
+  }
+  return instance_;
+}
+
+ProfileHandler::ProfileHandler()
+    : interrupts_(0),
+      callback_count_(0),
+      allowed_(true),
+      per_thread_timer_enabled_(false),
+      timer_sharing_(TIMERS_UNTOUCHED) {
+  SpinLockHolder cl(&control_lock_);
+
+  timer_type_ = (getenv("CPUPROFILE_REALTIME") ? ITIMER_REAL : ITIMER_PROF);
+  signal_number_ = (timer_type_ == ITIMER_PROF ? SIGPROF : SIGALRM);
+
+  // Get frequency of interrupts (if specified)
+  char junk;
+  const char* fr = getenv("CPUPROFILE_FREQUENCY");
+  if (fr != NULL && (sscanf(fr, "%u%c", &frequency_, &junk) == 1) &&
+      (frequency_ > 0)) {
+    // Limit to kMaxFrequency
+    frequency_ = (frequency_ > kMaxFrequency) ? kMaxFrequency : frequency_;
+  } else {
+    frequency_ = kDefaultFrequency;
+  }
+
+  if (!allowed_) {
+    return;
+  }
+
+#if HAVE_LINUX_SIGEV_THREAD_ID
+  // Do this early because we might be overriding signal number.
+
+  const char *per_thread = getenv("CPUPROFILE_PER_THREAD_TIMERS");
+  const char *signal_number = getenv("CPUPROFILE_TIMER_SIGNAL");
+
+  if (per_thread || signal_number) {
+    if (timer_create && pthread_once) {
+      timer_sharing_ = TIMERS_SEPARATE;
+      CreateThreadTimerKey(&thread_timer_key);
+      per_thread_timer_enabled_ = true;
+      // Override signal number if requested.
+      if (signal_number) {
+        signal_number_ = strtol(signal_number, NULL, 0);
+      }
+    } else {
+      RAW_LOG(INFO,
+              "Ignoring CPUPROFILE_PER_THREAD_TIMERS and\n"
+              " CPUPROFILE_TIMER_SIGNAL due to lack of timer_create().\n"
+              " Preload or link to librt.so for this to work");
+    }
+  }
+#endif
+
+  // If something else is using the signal handler,
+  // assume it has priority over us and stop.
+  if (!IsSignalHandlerAvailable()) {
+    RAW_LOG(INFO, "Disabling profiler because signal %d handler is already in use.",
+            signal_number_);
+    allowed_ = false;
+    return;
+  }
+
+  // Ignore signals until we decide to turn profiling on.  (Paranoia;
+  // should already be ignored.)
+  DisableHandler();
+
+}
+
+ProfileHandler::~ProfileHandler() {
+  Reset();
+#ifdef HAVE_LINUX_SIGEV_THREAD_ID
+  if (per_thread_timer_enabled_) {
+    perftools_pthread_key_delete(thread_timer_key);
+  }
+#endif
+}
+
+void ProfileHandler::RegisterThread() {
+  SpinLockHolder cl(&control_lock_);
+
+  if (!allowed_) {
+    return;
+  }
+
+  // We try to detect whether timers are being shared by setting a
+  // timer in the first call to this function, then checking whether
+  // it's set in the second call.
+  //
+  // Note that this detection method requires that the first two calls
+  // to RegisterThread be made from different threads.  (Subsequent
+  // calls will see timer_sharing_ set to either TIMERS_SEPARATE or
+  // TIMERS_SHARED, and won't try to detect the timer sharing type.)
+  //
+  // Also note that if timer settings were inherited across new thread
+  // creation but *not* shared, this approach wouldn't work.  That's
+  // not an issue for any Linux threading implementation, and should
+  // not be a problem for a POSIX-compliant threads implementation.
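+  //
+  // For example (illustrative): on a system where timers are shared, the
+  // first registering thread takes the TIMERS_UNTOUCHED case below and
+  // starts the timer; when a second thread registers, IsTimerRunning()
+  // sees that shared timer and we settle on TIMERS_SHARED.  With
+  // per-thread timers the second thread sees no running timer, so we
+  // settle on TIMERS_SEPARATE and each thread starts its own timer.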
+  switch (timer_sharing_) {
+    case TIMERS_UNTOUCHED:
+      StartTimer();
+      timer_sharing_ = TIMERS_ONE_SET;
+      break;
+    case TIMERS_ONE_SET:
+      // If the timer is running, that means that the main thread's
+      // timer setup is seen in this (second) thread -- and therefore
+      // that timers are shared.
+      if (IsTimerRunning()) {
+        timer_sharing_ = TIMERS_SHARED;
+        // If callback is already registered, we have to keep the timer
+        // running.  If not, we disable the timer here.
+        if (callback_count_ == 0) {
+          StopTimer();
+        }
+      } else {
+        timer_sharing_ = TIMERS_SEPARATE;
+        StartTimer();
+      }
+      break;
+    case TIMERS_SHARED:
+      // Nothing needed.
+      break;
+    case TIMERS_SEPARATE:
+      StartTimer();
+      break;
+  }
+}
+
+ProfileHandlerToken* ProfileHandler::RegisterCallback(
+    ProfileHandlerCallback callback, void* callback_arg) {
+
+  ProfileHandlerToken* token = new ProfileHandlerToken(callback, callback_arg);
+
+  SpinLockHolder cl(&control_lock_);
+  DisableHandler();
+  {
+    SpinLockHolder sl(&signal_lock_);
+    callbacks_.push_back(token);
+  }
+  // Start the timer if the timer is shared and this is the first callback.
+  if ((callback_count_ == 0) && (timer_sharing_ == TIMERS_SHARED)) {
+    StartTimer();
+  }
+  ++callback_count_;
+  EnableHandler();
+  return token;
+}
+
+void ProfileHandler::UnregisterCallback(ProfileHandlerToken* token) {
+  SpinLockHolder cl(&control_lock_);
+  for (CallbackIterator it = callbacks_.begin(); it != callbacks_.end();
+       ++it) {
+    if ((*it) == token) {
+      RAW_CHECK(callback_count_ > 0, "Invalid callback count");
+      DisableHandler();
+      {
+        SpinLockHolder sl(&signal_lock_);
+        delete *it;
+        callbacks_.erase(it);
+      }
+      --callback_count_;
+      if (callback_count_ > 0) {
+        EnableHandler();
+      } else if (timer_sharing_ == TIMERS_SHARED) {
+        StopTimer();
+      }
+      return;
+    }
+  }
+  // Unknown token.
+  RAW_LOG(FATAL, "Invalid token");
+}
+
+void ProfileHandler::Reset() {
+  SpinLockHolder cl(&control_lock_);
+  DisableHandler();
+  {
+    SpinLockHolder sl(&signal_lock_);
+    CallbackIterator it = callbacks_.begin();
+    while (it != callbacks_.end()) {
+      CallbackIterator tmp = it;
+      ++it;
+      delete *tmp;
+      callbacks_.erase(tmp);
+    }
+  }
+  callback_count_ = 0;
+  if (timer_sharing_ == TIMERS_SHARED) {
+    StopTimer();
+  }
+  timer_sharing_ = TIMERS_UNTOUCHED;
+}
+
+void ProfileHandler::GetState(ProfileHandlerState* state) {
+  SpinLockHolder cl(&control_lock_);
+  DisableHandler();
+  {
+    SpinLockHolder sl(&signal_lock_);  // Protects interrupts_.
+    state->interrupts = interrupts_;
+  }
+  if (callback_count_ > 0) {
+    EnableHandler();
+  }
+  state->frequency = frequency_;
+  state->callback_count = callback_count_;
+  state->allowed = allowed_;
+}
+
+void ProfileHandler::StartTimer() {
+  if (!allowed_) {
+    return;
+  }
+
+#if HAVE_LINUX_SIGEV_THREAD_ID
+  if (per_thread_timer_enabled_) {
+    StartLinuxThreadTimer(timer_type_, signal_number_, frequency_, thread_timer_key);
+    return;
+  }
+#endif
+
+  struct itimerval timer;
+  timer.it_interval.tv_sec = 0;
+  timer.it_interval.tv_usec = 1000000 / frequency_;
+  timer.it_value = timer.it_interval;
+  setitimer(timer_type_, &timer, 0);
+}
+
+void ProfileHandler::StopTimer() {
+  if (!allowed_) {
+    return;
+  }
+  if (per_thread_timer_enabled_) {
+    RAW_LOG(FATAL, "StopTimer cannot be called in linux-per-thread-timers mode");
+  }
+
+  struct itimerval timer;
+  memset(&timer, 0, sizeof timer);
+  setitimer(timer_type_, &timer, 0);
+}
+
+bool ProfileHandler::IsTimerRunning() {
+  if (!allowed_) {
+    return false;
+  }
+  if (per_thread_timer_enabled_) {
+    return false;
+  }
+  struct itimerval current_timer;
+  RAW_CHECK(0 == getitimer(timer_type_, &current_timer), "getitimer");
+  return (current_timer.it_value.tv_sec != 0 ||
+          current_timer.it_value.tv_usec != 0);
+}
+
+void ProfileHandler::EnableHandler() {
+  if (!allowed_) {
+    return;
+  }
+  struct sigaction sa;
+  sa.sa_sigaction = SignalHandler;
+  sa.sa_flags = SA_RESTART | SA_SIGINFO;
+  sigemptyset(&sa.sa_mask);
+  RAW_CHECK(sigaction(signal_number_, &sa, NULL) == 0, "sigprof (enable)");
+}
+
+void ProfileHandler::DisableHandler() {
+  if (!allowed_) {
+    return;
+  }
+  struct sigaction sa;
+  sa.sa_handler = SIG_IGN;
+  sa.sa_flags = SA_RESTART;
+  sigemptyset(&sa.sa_mask);
+  RAW_CHECK(sigaction(signal_number_, &sa, NULL) == 0, "sigprof (disable)");
+}
+
+bool ProfileHandler::IsSignalHandlerAvailable() {
+  struct sigaction sa;
+  RAW_CHECK(sigaction(signal_number_, NULL, &sa) == 0, "is-signal-handler avail");
+
+  // We only take over the handler if the current one is unset.
+  // It must be SIG_IGN or SIG_DFL, not some other function.
+  // SIG_IGN must be allowed because when profiling is allowed but
+  // not actively in use, this code keeps the handler set to SIG_IGN.
+  // That setting will be inherited across fork+exec.  In order for
+  // any child to be able to use profiling, SIG_IGN must be treated
+  // as available.
+  return sa.sa_handler == SIG_IGN || sa.sa_handler == SIG_DFL;
+}
+
+void ProfileHandler::SignalHandler(int sig, siginfo_t* sinfo, void* ucontext) {
+  int saved_errno = errno;
+  // At this moment, instance_ must be initialized because the handler is
+  // enabled in RegisterThread or RegisterCallback only after
+  // ProfileHandler::Instance runs.
+  ProfileHandler* instance = ANNOTATE_UNPROTECTED_READ(instance_);
+  RAW_CHECK(instance != NULL, "ProfileHandler is not initialized");
+  {
+    SpinLockHolder sl(&instance->signal_lock_);
+    ++instance->interrupts_;
+    for (CallbackIterator it = instance->callbacks_.begin();
+         it != instance->callbacks_.end();
+         ++it) {
+      (*it)->callback(sig, sinfo, ucontext, (*it)->callback_arg);
+    }
+  }
+  errno = saved_errno;
+}
+
+// This module initializer registers the main thread, so it must be
+// executed in the context of the main thread.
+REGISTER_MODULE_INITIALIZER(profile_main, ProfileHandlerRegisterThread());
+
+extern "C" void ProfileHandlerRegisterThread() {
+  ProfileHandler::Instance()->RegisterThread();
+}
+
+extern "C" ProfileHandlerToken* ProfileHandlerRegisterCallback(
+    ProfileHandlerCallback callback, void* callback_arg) {
+  return ProfileHandler::Instance()->RegisterCallback(callback, callback_arg);
+}
+
+extern "C" void ProfileHandlerUnregisterCallback(ProfileHandlerToken* token) {
+  ProfileHandler::Instance()->UnregisterCallback(token);
+}
+
+extern "C" void ProfileHandlerReset() {
+  return ProfileHandler::Instance()->Reset();
+}
+
+extern "C" void ProfileHandlerGetState(ProfileHandlerState* state) {
+  ProfileHandler::Instance()->GetState(state);
+}
+
+#else  // OS_CYGWIN
+
+// ITIMER_PROF doesn't work under cygwin.  ITIMER_REAL is available, but doesn't
+// work as well for profiling, and also interferes with alarm().  Because of
+// these issues, unless a specific need is identified, profiler support is
+// disabled under Cygwin.
+extern "C" void ProfileHandlerRegisterThread() {
+}
+
+extern "C" ProfileHandlerToken* ProfileHandlerRegisterCallback(
+    ProfileHandlerCallback callback, void* callback_arg) {
+  return NULL;
+}
+
+extern "C" void ProfileHandlerUnregisterCallback(ProfileHandlerToken* token) {
+}
+
+extern "C" void ProfileHandlerReset() {
+}
+
+extern "C" void ProfileHandlerGetState(ProfileHandlerState* state) {
+}
+
+#endif  // OS_CYGWIN
diff --git a/src/profile-handler.h b/src/profile-handler.h
new file mode 100644
index 0000000..4f96a18
--- /dev/null
+++ b/src/profile-handler.h
@@ -0,0 +1,149 @@
+// -*- Mode: C++; c-basic-offset: 2; indent-tabs-mode: nil -*-
+/* Copyright (c) 2009, Google Inc.
+ * All rights reserved.
+ * 
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are
+ * met:
+ * 
+ *     * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following disclaimer
+ * in the documentation and/or other materials provided with the
+ * distribution.
+ *     * Neither the name of Google Inc. nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ * 
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * ---
+ * Author: Nabeel Mian
+ *
+ * This module manages the cpu profile timers and the associated interrupt
+ * handler. When enabled, all registered threads in the program are profiled.
+ * (Note: if using linux 2.4 or earlier, you must use the Thread class, in
+ * google3/thread, to ensure all threads are profiled.)
+ *
+ * Any component interested in receiving a profile timer interrupt can do so by
+ * registering a callback. All registered callbacks must be async-signal-safe.
+ *
+ * Note: This module requires the sole ownership of ITIMER_PROF timer and the
+ * SIGPROF signal.
+ */
+
+#ifndef BASE_PROFILE_HANDLER_H_
+#define BASE_PROFILE_HANDLER_H_
+
+#include "config.h"
+#include <signal.h>
+#ifdef COMPILER_MSVC
+#include "conflict-signal.h"
+#endif
+#include "base/basictypes.h"
+
+/* All this code should be usable from within C apps. */
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/* Forward declaration. */
+struct ProfileHandlerToken;
+
+/*
+ * Callback function to be used with ProfileHandlerRegisterCallback. This
+ * function will be called in the context of SIGPROF signal handler and must
+ * be async-signal-safe. The first three arguments are the values provided by
+ * the SIGPROF signal handler. We use void* to avoid using ucontext_t on
+ * non-POSIX systems.
+ *
+ * Requirements:
+ * - Callback must be async-signal-safe.
+ * - None of the functions in ProfileHandler are async-signal-safe. Therefore,
+ *   callback function *must* not call any of the ProfileHandler functions.
+ * - Callback is not required to be re-entrant. At most one instance of
+ *   callback can run at a time.
+ *
+ * Notes:
+ * - The SIGPROF signal handler saves and restores errno, so the callback
+ *   doesn't need to.
+ * - Callback code *must* not acquire lock(s) to serialize access to data shared
+ *   with the code outside the signal handler (callback must be
+ *   async-signal-safe). If such a serialization is needed, follow the model
+ *   used by profiler.cc:
+ *
+ *   When code other than the signal handler modifies the shared data it must:
+ *   - Acquire lock.
+ *   - Unregister the callback with the ProfileHandler.
+ *   - Modify shared data.
+ *   - Re-register the callback.
+ *   - Release lock.
+ *   and the callback code gets a lockless, read-write access to the data.
+ */
+typedef void (*ProfileHandlerCallback)(int sig, siginfo_t* sig_info,
+                                       void* ucontext, void* callback_arg);
+
+/*
+ * Registers a new thread with profile handler and should be called only once
+ * per thread. The main thread is registered at program startup. This routine
+ * is called by the Thread module in google3/thread whenever a new thread is
+ * created. This function is not async-signal-safe.
+ */
+void ProfileHandlerRegisterThread();
+
+/*
+ * Registers a callback routine. This callback function will be called in the
+ * context of SIGPROF handler, so must be async-signal-safe. The returned token
+ * is to be used when unregistering this callback via
+ * ProfileHandlerUnregisterCallback. Registering the first callback enables
+ * the SIGPROF signal handler. Caller must not free the returned token. This
+ * function is not async-signal-safe.
+ */
+ProfileHandlerToken* ProfileHandlerRegisterCallback(
+    ProfileHandlerCallback callback, void* callback_arg);
+
+/*
+ * Unregisters a previously registered callback. Expects the token returned
+ * by the corresponding ProfileHandlerRegisterCallback and asserts that the
+ * passed token is valid. Unregistering the last callback disables the SIGPROF
+ * signal handler. It waits for the currently running callback to
+ * complete before returning. This function is not async-signal-safe.
+ */
+void ProfileHandlerUnregisterCallback(ProfileHandlerToken* token);
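+
+/*
+ * A minimal usage sketch (editor-added illustration, not part of the
+ * original API): a hypothetical client that counts ticks and follows the
+ * unregister/modify/re-register model described above.  Wrapped in "#if 0"
+ * so it is never compiled.
+ */
+#if 0
+static int64 g_ticks;                  /* written only from the callback */
+static ProfileHandlerToken* g_token;
+
+static void CountTick(int sig, siginfo_t* sig_info, void* ucontext,
+                      void* callback_arg) {
+  /* Async-signal-safe: no locks, no allocation, just a plain increment. */
+  g_ticks++;
+}
+
+static void StartCounting() {
+  g_token = ProfileHandlerRegisterCallback(CountTick, NULL);
+}
+
+static void ResetTicks() {
+  /* Unregister first so the callback cannot run while we modify g_ticks,
+   * then re-register; the callback itself never takes a lock. */
+  ProfileHandlerUnregisterCallback(g_token);
+  g_ticks = 0;
+  g_token = ProfileHandlerRegisterCallback(CountTick, NULL);
+}
+#endif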
+
+/*
+ * FOR TESTING ONLY
+ * Unregisters all the callbacks, stops the timers (if shared) and disables the
+ * SIGPROF handler. All the threads, including the main thread, need to be
+ * re-registered after this call. This function is not async-signal-safe.
+ */
+void ProfileHandlerReset();
+
+/*
+ * Stores profile handler's current state. This function is not
+ * async-signal-safe.
+ */
+struct ProfileHandlerState {
+  int32 frequency;  /* Profiling frequency */
+  int32 callback_count;  /* Number of callbacks registered */
+  int64 interrupts;  /* Number of interrupts received */
+  bool allowed; /* Profiling is allowed */
+};
+void ProfileHandlerGetState(struct ProfileHandlerState* state);
+
+#ifdef __cplusplus
+}  /* extern "C" */
+#endif
+
+#endif  /* BASE_PROFILE_HANDLER_H_ */
diff --git a/src/profiledata.cc b/src/profiledata.cc
new file mode 100644
index 0000000..8b05d3a
--- /dev/null
+++ b/src/profiledata.cc
@@ -0,0 +1,332 @@
+// -*- Mode: C++; c-basic-offset: 2; indent-tabs-mode: nil -*-
+// Copyright (c) 2007, Google Inc.
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+//     * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+//     * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// ---
+// Author: Sanjay Ghemawat
+//         Chris Demetriou (refactoring)
+//
+// Collect profiling data.
+
+#include <config.h>
+#include <assert.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <errno.h>
+#ifdef HAVE_UNISTD_H
+#include <unistd.h>
+#endif
+#include <sys/time.h>
+#include <string.h>
+#include <fcntl.h>
+
+#include "profiledata.h"
+
+#include "base/logging.h"
+#include "base/sysinfo.h"
+
+// All of these are initialized in profiledata.h.
+const int ProfileData::kMaxStackDepth;
+const int ProfileData::kAssociativity;
+const int ProfileData::kBuckets;
+const int ProfileData::kBufferLength;
+
+ProfileData::Options::Options()
+    : frequency_(1) {
+}
+
+// This function is safe to call from asynchronous signals (but is not
+// re-entrant).  However, that's not part of its public interface.
+void ProfileData::Evict(const Entry& entry) {
+  const int d = entry.depth;
+  const int nslots = d + 2;     // Number of slots needed in eviction buffer
+  if (num_evicted_ + nslots > kBufferLength) {
+    FlushEvicted();
+    assert(num_evicted_ == 0);
+    assert(nslots <= kBufferLength);
+  }
+  evict_[num_evicted_++] = entry.count;
+  evict_[num_evicted_++] = d;
+  memcpy(&evict_[num_evicted_], entry.stack, d * sizeof(Slot));
+  num_evicted_ += d;
+}
+
+ProfileData::ProfileData()
+    : hash_(0),
+      evict_(0),
+      num_evicted_(0),
+      out_(-1),
+      count_(0),
+      evictions_(0),
+      total_bytes_(0),
+      fname_(0),
+      start_time_(0) {
+}
+
+bool ProfileData::Start(const char* fname,
+                        const ProfileData::Options& options) {
+  if (enabled()) {
+    return false;
+  }
+
+  // Open output file and initialize various data structures
+  int fd = open(fname, O_CREAT | O_WRONLY | O_TRUNC, 0666);
+  if (fd < 0) {
+    // Can't open outfile for write
+    return false;
+  }
+
+  start_time_ = time(NULL);
+  fname_ = strdup(fname);
+
+  // Reset counters
+  num_evicted_ = 0;
+  count_       = 0;
+  evictions_   = 0;
+  total_bytes_ = 0;
+
+  hash_ = new Bucket[kBuckets];
+  evict_ = new Slot[kBufferLength];
+  memset(hash_, 0, sizeof(hash_[0]) * kBuckets);
+
+  // Record special entries
+  evict_[num_evicted_++] = 0;                     // count for header
+  evict_[num_evicted_++] = 3;                     // depth for header
+  evict_[num_evicted_++] = 0;                     // Version number
+  CHECK_NE(0, options.frequency());
+  int period = 1000000 / options.frequency();
+  evict_[num_evicted_++] = period;                // Period (microseconds)
+  evict_[num_evicted_++] = 0;                     // Padding
+
+  out_ = fd;
+
+  return true;
+}
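+
+// Note (editor-added): after Start(), the eviction buffer holds the profile
+// header as five slots: [0, 3, 0, period, 0] -- a zero count, a "depth" of
+// three words, the format version, the sampling period in microseconds, and
+// padding.  Each sample later evicted by Evict() is written as
+// [count, depth, pc_1 ... pc_depth], and Stop() appends the trailer
+// [0, 1, 0] before dumping /proc/self/maps.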
+
+ProfileData::~ProfileData() {
+  Stop();
+}
+
+// Dump /proc/maps data to fd.  Copied from heap-profile-table.cc.
+#define NO_INTR(fn)  do {} while ((fn) < 0 && errno == EINTR)
+
+static void FDWrite(int fd, const char* buf, size_t len) {
+  while (len > 0) {
+    ssize_t r;
+    NO_INTR(r = write(fd, buf, len));
+    RAW_CHECK(r >= 0, "write failed");
+    buf += r;
+    len -= r;
+  }
+}
+
+static void DumpProcSelfMaps(int fd) {
+  ProcMapsIterator::Buffer iterbuf;
+  ProcMapsIterator it(0, &iterbuf);   // 0 means "current pid"
+
+  uint64 start, end, offset;
+  int64 inode;
+  char *flags, *filename;
+  ProcMapsIterator::Buffer linebuf;
+  while (it.Next(&start, &end, &flags, &offset, &inode, &filename)) {
+    int written = it.FormatLine(linebuf.buf_, sizeof(linebuf.buf_),
+                                start, end, flags, offset, inode, filename,
+                                0);
+    FDWrite(fd, linebuf.buf_, written);
+  }
+}
+
+void ProfileData::Stop() {
+  if (!enabled()) {
+    return;
+  }
+
+  // Move data from hash table to eviction buffer
+  for (int b = 0; b < kBuckets; b++) {
+    Bucket* bucket = &hash_[b];
+    for (int a = 0; a < kAssociativity; a++) {
+      if (bucket->entry[a].count > 0) {
+        Evict(bucket->entry[a]);
+      }
+    }
+  }
+
+  if (num_evicted_ + 3 > kBufferLength) {
+    // Ensure there is enough room for end of data marker
+    FlushEvicted();
+  }
+
+  // Write end of data marker
+  evict_[num_evicted_++] = 0;         // count
+  evict_[num_evicted_++] = 1;         // depth
+  evict_[num_evicted_++] = 0;         // end of data marker
+  FlushEvicted();
+
+  // Dump "/proc/self/maps" so we get list of mapped shared libraries
+  DumpProcSelfMaps(out_);
+
+  Reset();
+  fprintf(stderr, "PROFILE: interrupts/evictions/bytes = %d/%d/%" PRIuS "\n",
+          count_, evictions_, total_bytes_);
+}
+
+void ProfileData::Reset() {
+  if (!enabled()) {
+    return;
+  }
+
+  // Don't reset count_, evictions_, or total_bytes_ here.  They're used
+  // by Stop to print information about the profile after reset, and are
+  // cleared by Start when starting a new profile.
+  close(out_);
+  delete[] hash_;
+  hash_ = 0;
+  delete[] evict_;
+  evict_ = 0;
+  num_evicted_ = 0;
+  free(fname_);
+  fname_ = 0;
+  start_time_ = 0;
+
+  out_ = -1;
+}
+
+// This function is safe to call from asynchronous signals (but is not
+// re-entrant).  However, that's not part of its public interface.
+void ProfileData::GetCurrentState(State* state) const {
+  if (enabled()) {
+    state->enabled = true;
+    state->start_time = start_time_;
+    state->samples_gathered = count_;
+    int buf_size = sizeof(state->profile_name);
+    strncpy(state->profile_name, fname_, buf_size);
+    state->profile_name[buf_size-1] = '\0';
+  } else {
+    state->enabled = false;
+    state->start_time = 0;
+    state->samples_gathered = 0;
+    state->profile_name[0] = '\0';
+  }
+}
+
+// This function is safe to call from asynchronous signals (but is not
+// re-entrant).  However, that's not part of its public interface.
+void ProfileData::FlushTable() {
+  if (!enabled()) {
+    return;
+  }
+
+  // Move data from hash table to eviction buffer
+  for (int b = 0; b < kBuckets; b++) {
+    Bucket* bucket = &hash_[b];
+    for (int a = 0; a < kAssociativity; a++) {
+      if (bucket->entry[a].count > 0) {
+        Evict(bucket->entry[a]);
+        bucket->entry[a].depth = 0;
+        bucket->entry[a].count = 0;
+      }
+    }
+  }
+
+  // Write out all pending data
+  FlushEvicted();
+}
+
+void ProfileData::Add(int depth, const void* const* stack) {
+  if (!enabled()) {
+    return;
+  }
+
+  if (depth > kMaxStackDepth) depth = kMaxStackDepth;
+  RAW_CHECK(depth > 0, "ProfileData::Add depth <= 0");
+
+  // Make hash-value
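+  // (Each iteration rotates h left by 8 bits, then adds slot * 41.)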
+  Slot h = 0;
+  for (int i = 0; i < depth; i++) {
+    Slot slot = reinterpret_cast<Slot>(stack[i]);
+    h = (h << 8) | (h >> (8*(sizeof(h)-1)));
+    h += (slot * 31) + (slot * 7) + (slot * 3);
+  }
+
+  count_++;
+
+  // See if table already has an entry for this trace
+  bool done = false;
+  Bucket* bucket = &hash_[h % kBuckets];
+  for (int a = 0; a < kAssociativity; a++) {
+    Entry* e = &bucket->entry[a];
+    if (e->depth == depth) {
+      bool match = true;
+      for (int i = 0; i < depth; i++) {
+        if (e->stack[i] != reinterpret_cast<Slot>(stack[i])) {
+          match = false;
+          break;
+        }
+      }
+      if (match) {
+        e->count++;
+        done = true;
+        break;
+      }
+    }
+  }
+
+  if (!done) {
+    // Evict entry with smallest count
+    Entry* e = &bucket->entry[0];
+    for (int a = 1; a < kAssociativity; a++) {
+      if (bucket->entry[a].count < e->count) {
+        e = &bucket->entry[a];
+      }
+    }
+    if (e->count > 0) {
+      evictions_++;
+      Evict(*e);
+    }
+
+    // Use the newly evicted entry
+    e->depth = depth;
+    e->count = 1;
+    for (int i = 0; i < depth; i++) {
+      e->stack[i] = reinterpret_cast<Slot>(stack[i]);
+    }
+  }
+}
+
+// This function is safe to call from asynchronous signals (but is not
+// re-entrant).  However, that's not part of its public interface.
+void ProfileData::FlushEvicted() {
+  if (num_evicted_ > 0) {
+    const char* buf = reinterpret_cast<char*>(evict_);
+    size_t bytes = sizeof(evict_[0]) * num_evicted_;
+    total_bytes_ += bytes;
+    FDWrite(out_, buf, bytes);
+  }
+  num_evicted_ = 0;
+}
diff --git a/src/profiledata.h b/src/profiledata.h
new file mode 100644
index 0000000..44033f0
--- /dev/null
+++ b/src/profiledata.h
@@ -0,0 +1,184 @@
+// -*- Mode: C++; c-basic-offset: 2; indent-tabs-mode: nil -*-
+// Copyright (c) 2007, Google Inc.
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+//     * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+//     * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// ---
+// Author: Sanjay Ghemawat
+//         Chris Demetriou (refactoring)
+//
+// Collect profiling data.
+//
+// The profile data file format is documented in
+// doc/cpuprofile-fileformat.html
+
+
+#ifndef BASE_PROFILEDATA_H_
+#define BASE_PROFILEDATA_H_
+
+#include <config.h>
+#include <time.h>   // for time_t
+#include <stdint.h>
+#include "base/basictypes.h"
+
+// A class that accumulates profile samples and writes them to a file.
+//
+// Each sample contains a stack trace and a count.  Memory usage is
+// reduced by combining profile samples that have the same stack trace
+// by adding up the associated counts.
+//
+// Profile data is accumulated in a bounded amount of memory, and will be
+// flushed to a file as necessary to stay within the memory limit.
+//
+// Use of this class assumes external synchronization.  The exact
+// requirements of that synchronization are that:
+//
+//  - 'Add' may be called from asynchronous signals, but is not
+//    re-entrant.
+//
+//  - None of 'Start', 'Stop', 'Reset', 'Flush', and 'Add' may be
+//    called at the same time.
+//
+//  - 'Start', 'Stop', or 'Reset' should not be called while 'enabled'
+//    or 'GetCurrentState' are running, and vice versa.
+//
+// A profiler which uses asynchronous signals to add samples will
+// typically use two locks to protect this data structure:
+//
+//  - A SpinLock which is held over all calls except for the 'Add'
+//    call made from the signal handler.
+//
+//  - A SpinLock which is held over calls to 'Start', 'Stop', 'Reset',
+//    'Flush', and 'Add'.  (This SpinLock should be acquired after
+//    the first SpinLock in all cases where both are needed.)
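+//
+// An illustrative usage sketch (not taken from existing callers), assuming
+// the caller provides the synchronization described above:
+//
+//   ProfileData data;
+//   ProfileData::Options options;
+//   options.set_frequency(100);          // samples per second
+//   if (data.Start("cpu.prof", options)) {
+//     ...
+//     data.Add(depth, stack);            // e.g. from a SIGPROF handler
+//     ...
+//     data.Stop();                       // flush remaining samples and close
+//   }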
+class ProfileData {
+ public:
+  struct State {
+    bool     enabled;             // Is profiling currently enabled?
+    time_t   start_time;          // If enabled, when was profiling started?
+    char     profile_name[1024];  // Name of file being written, or '\0'
+    int      samples_gathered;    // Number of samples gathered so far (or 0)
+  };
+
+  class Options {
+   public:
+    Options();
+
+    // Get and set the sample frequency.
+    int frequency() const {
+      return frequency_;
+    }
+    void set_frequency(int frequency) {
+      frequency_ = frequency;
+    }
+
+   private:
+    int      frequency_;                  // Sample frequency.
+  };
+
+  static const int kMaxStackDepth = 64;  // Max stack depth stored in profile
+
+  ProfileData();
+  ~ProfileData();
+
+  // If data collection is not already enabled start to collect data
+  // into fname.  Parameters related to this profiling run are specified
+  // by 'options'.
+  //
+  // Returns true if data collection could be started, otherwise (if an
+  // error occurred or if data collection was already enabled) returns
+  // false.
+  bool Start(const char *fname, const Options& options);
+
+  // If data collection is enabled, stop data collection and write the
+  // data to disk.
+  void Stop();
+
+  // Stop data collection without writing anything else to disk, and
+  // discard any collected data.
+  void Reset();
+
+  // If data collection is enabled, record a sample with 'depth'
+  // entries from 'stack'.  (depth must be > 0.)  At most
+  // kMaxStackDepth stack entries will be recorded, starting with
+  // stack[0].
+  //
+  // This function is safe to call from asynchronous signals (but is
+  // not re-entrant).
+  void Add(int depth, const void* const* stack);
+
+  // If data collection is enabled, write the data to disk (and leave
+  // the collector enabled).
+  void FlushTable();
+
+  // Is data collection currently enabled?
+  bool enabled() const { return out_ >= 0; }
+
+  // Get the current state of the data collector.
+  void GetCurrentState(State* state) const;
+
+ private:
+  static const int kAssociativity = 4;          // For hashtable
+  static const int kBuckets = 1 << 10;          // For hashtable
+  static const int kBufferLength = 1 << 18;     // For eviction buffer
+
+  // Type of slots: each slot can be either a count, or a PC value
+  typedef uintptr_t Slot;
+
+  // Hash-table/eviction-buffer entry (a.k.a. a sample)
+  struct Entry {
+    Slot count;                  // Number of hits
+    Slot depth;                  // Stack depth
+    Slot stack[kMaxStackDepth];  // Stack contents
+  };
+
+  // Hash table bucket
+  struct Bucket {
+    Entry entry[kAssociativity];
+  };
+
+  Bucket*       hash_;          // hash table
+  Slot*         evict_;         // evicted entries
+  int           num_evicted_;   // how many evicted entries?
+  int           out_;           // fd for output file.
+  int           count_;         // How many samples recorded
+  int           evictions_;     // How many evictions
+  size_t        total_bytes_;   // How much output
+  char*         fname_;         // Profile file name
+  time_t        start_time_;    // Start time, or 0
+
+  // Move 'entry' to the eviction buffer.
+  void Evict(const Entry& entry);
+
+  // Write contents of eviction buffer to disk.
+  void FlushEvicted();
+
+  DISALLOW_COPY_AND_ASSIGN(ProfileData);
+};
+
+#endif  // BASE_PROFILEDATA_H_
diff --git a/src/profiler.cc b/src/profiler.cc
new file mode 100644
index 0000000..b862ae6
--- /dev/null
+++ b/src/profiler.cc
@@ -0,0 +1,431 @@
+// -*- Mode: C++; c-basic-offset: 2; indent-tabs-mode: nil -*-
+// Copyright (c) 2005, Google Inc.
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+//     * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+//     * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+// ---
+// Author: Sanjay Ghemawat
+//         Chris Demetriou (refactoring)
+//
+// Profile current program by sampling stack-trace every so often
+
+#include "config.h"
+#include "getpc.h"      // should be first to get the _GNU_SOURCE dfn
+#include <signal.h>
+#include <assert.h>
+#include <stdio.h>
+#include <errno.h>
+#include <string.h>
+#ifdef HAVE_UNISTD_H
+#include <unistd.h>  // for getpid()
+#endif
+#if defined(HAVE_SYS_UCONTEXT_H)
+#include <sys/ucontext.h>
+#elif defined(HAVE_UCONTEXT_H)
+#include <ucontext.h>
+#elif defined(HAVE_CYGWIN_SIGNAL_H)
+#include <cygwin/signal.h>
+typedef ucontext ucontext_t;
+#else
+typedef int ucontext_t;   // just to quiet the compiler, mostly
+#endif
+#include <sys/time.h>
+#include <string>
+#include <gperftools/profiler.h>
+#include <gperftools/stacktrace.h>
+#include "base/commandlineflags.h"
+#include "base/logging.h"
+#include "base/googleinit.h"
+#include "base/spinlock.h"
+#include "base/sysinfo.h"             /* for GetUniquePathFromEnv, etc */
+#include "profiledata.h"
+#include "profile-handler.h"
+#ifdef HAVE_CONFLICT_SIGNAL_H
+#include "conflict-signal.h"          /* used on msvc machines */
+#endif
+
+using std::string;
+
+DEFINE_bool(cpu_profiler_unittest,
+            EnvToBool("PERFTOOLS_UNITTEST", true),
+            "Determines whether or not we are running under the "
+            "control of a unit test. This allows us to include or "
+            "exclude certain behaviours.");
+
+// Collects up all profile data. This is a singleton, which is
+// initialized by a constructor at startup. If no cpu profiler
+// signal is specified then the profiler lifecycle is either
+// manually controlled via the API or attached to the scope of
+// the singleton (program scope). Otherwise the cpu toggle is
+// used to allow for user-selectable control via signal generation.
+// This is very useful for profiling a daemon process without
+// having to start and stop the daemon or having to modify the
+// source code to use the cpu profiler API.
+class CpuProfiler {
+ public:
+  CpuProfiler();
+  ~CpuProfiler();
+
+  // Start profiler to write profile info into fname
+  bool Start(const char* fname, const ProfilerOptions* options);
+
+  // Stop profiling and write the data to disk.
+  void Stop();
+
+  // Write the data to disk (and continue profiling).
+  void FlushTable();
+
+  bool Enabled();
+
+  void GetCurrentState(ProfilerState* state);
+
+  static CpuProfiler instance_;
+
+ private:
+  // This lock implements the locking requirements described in the ProfileData
+  // documentation, specifically:
+  //
+  // lock_ is held over all collector_ method calls except for the 'Add'
+  // call made from the signal handler, to protect against concurrent use of
+  // collector_'s control routines. Code other than the signal handler must
+  // unregister the signal handler before calling any collector_ method.
+  // The 'Add' method in the collector is protected by a guarantee from
+  // ProfileHandler that only one instance of prof_handler can run at a time.
+  SpinLock      lock_;
+  ProfileData   collector_;
+
+  // Filter function and its argument, if any.  (NULL means include all
+  // samples).  Set at start, read-only while running.  Written while holding
+  // lock_, read and executed in the context of SIGPROF interrupt.
+  int           (*filter_)(void*);
+  void*         filter_arg_;
+
+  // Opaque token returned by the profile handler. To be used when calling
+  // ProfileHandlerUnregisterCallback.
+  ProfileHandlerToken* prof_handler_token_;
+
+  // Sets up a callback to receive SIGPROF interrupt.
+  void EnableHandler();
+
+  // Disables receiving SIGPROF interrupt.
+  void DisableHandler();
+
+  // Signal handler that records the interrupted pc in the profile data.
+  static void prof_handler(int sig, siginfo_t*, void* signal_ucontext,
+                           void* cpu_profiler);
+};
+
+// Signal handler that is registered when a user selectable signal
+// number is defined in the environment variable CPUPROFILESIGNAL.
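+//
+// For example (illustrative; the signal number and paths are arbitrary):
+//   CPUPROFILE=/tmp/prof CPUPROFILESIGNAL=12 ./myprogram &
+//   kill -12 <pid>     # first signal starts writing /tmp/prof.0
+//   kill -12 <pid>     # second signal stops profiling and closes the file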
+static void CpuProfilerSwitch(int signal_number) {
+  static bool started = false;
+  static unsigned profile_count = 0;
+  static char base_profile_name[1024] = "\0";
+
+  if (base_profile_name[0] == '\0') {
+    if (!GetUniquePathFromEnv("CPUPROFILE", base_profile_name)) {
+      RAW_LOG(FATAL, "Cpu profiler switch is registered but no CPUPROFILE is defined");
+      return;
+    }
+  }
+  if (!started) {
+    char full_profile_name[1024];
+
+    snprintf(full_profile_name, sizeof(full_profile_name), "%s.%u",
+             base_profile_name, profile_count++);
+
+    if (!ProfilerStart(full_profile_name)) {
+      RAW_LOG(FATAL, "Can't turn on cpu profiling for '%s': %s\n",
+              full_profile_name, strerror(errno));
+    }
+  } else {
+    ProfilerStop();
+  }
+  started = !started;
+}
+
+// Profile data structure singleton: Constructor will check to see if
+// profiling should be enabled.  Destructor will write profile data
+// out to disk.
+CpuProfiler CpuProfiler::instance_;
+
+// Initialize profiling: activated if getenv("CPUPROFILE") exists.
+CpuProfiler::CpuProfiler()
+    : prof_handler_token_(NULL) {
+  // TODO(cgd) Move this code *out* of the CpuProfiler constructor into a
+  // separate object responsible for initialization. With ProfileHandler there
+  // is no need to limit the number of profilers.
+  if (getenv("CPUPROFILE") == NULL) {
+    if (!FLAGS_cpu_profiler_unittest) {
+      RAW_LOG(WARNING, "CPU profiler linked but no valid CPUPROFILE environment variable found\n");
+    }
+    return;
+  }
+
+  // We don't enable profiling if setuid -- it's a security risk
+#ifdef HAVE_GETEUID
+  if (getuid() != geteuid()) {
+    if (!FLAGS_cpu_profiler_unittest) {
+      RAW_LOG(WARNING, "Cannot perform CPU profiling when running with setuid\n");
+    }
+    return;
+  }
+#endif
+
+  char *signal_number_str = getenv("CPUPROFILESIGNAL");
+  if (signal_number_str != NULL) {
+    long int signal_number = strtol(signal_number_str, NULL, 10);
+    if (signal_number >= 1 && signal_number <= 64) {
+      intptr_t old_signal_handler = reinterpret_cast<intptr_t>(signal(signal_number, CpuProfilerSwitch));
+      if (old_signal_handler == 0) {
+        RAW_LOG(INFO, "Using signal %ld as cpu profiling switch", signal_number);
+      } else {
+        RAW_LOG(FATAL, "Signal %d already in use\n", signal_number);
+      }
+    } else {
+      RAW_LOG(FATAL, "Signal number %s is invalid\n", signal_number_str);
+    }
+  } else {
+    char fname[PATH_MAX];
+    if (!GetUniquePathFromEnv("CPUPROFILE", fname)) {
+      if (!FLAGS_cpu_profiler_unittest) {
+        RAW_LOG(WARNING, "CPU profiler linked but no valid CPUPROFILE environment variable found\n");
+      }
+      return;
+    }
+
+    if (!Start(fname, NULL)) {
+      RAW_LOG(FATAL, "Can't turn on cpu profiling for '%s': %s\n",
+              fname, strerror(errno));
+    }
+  }
+}
+
+bool CpuProfiler::Start(const char* fname, const ProfilerOptions* options) {
+  SpinLockHolder cl(&lock_);
+
+  if (collector_.enabled()) {
+    return false;
+  }
+
+  ProfileHandlerState prof_handler_state;
+  ProfileHandlerGetState(&prof_handler_state);
+
+  ProfileData::Options collector_options;
+  collector_options.set_frequency(prof_handler_state.frequency);
+  if (!collector_.Start(fname, collector_options)) {
+    return false;
+  }
+
+  filter_ = NULL;
+  if (options != NULL && options->filter_in_thread != NULL) {
+    filter_ = options->filter_in_thread;
+    filter_arg_ = options->filter_in_thread_arg;
+  }
+
+  // Setup handler for SIGPROF interrupts
+  EnableHandler();
+
+  return true;
+}
+
+CpuProfiler::~CpuProfiler() {
+  Stop();
+}
+
+// Stop profiling and write out any collected profile data
+void CpuProfiler::Stop() {
+  SpinLockHolder cl(&lock_);
+
+  if (!collector_.enabled()) {
+    return;
+  }
+
+  // Unregister prof_handler to stop receiving SIGPROF interrupts before
+  // stopping the collector.
+  DisableHandler();
+
+  // DisableHandler waits for the currently running callback to complete and
+  // guarantees no future invocations. It is safe to stop the collector.
+  collector_.Stop();
+}
+
+void CpuProfiler::FlushTable() {
+  SpinLockHolder cl(&lock_);
+
+  if (!collector_.enabled()) {
+    return;
+  }
+
+  // Unregister prof_handler to stop receiving SIGPROF interrupts before
+  // flushing the profile data.
+  DisableHandler();
+
+  // DisableHandler waits for the currently running callback to complete and
+  // guarantees no future invocations. It is safe to flush the profile data.
+  collector_.FlushTable();
+
+  EnableHandler();
+}
+
+bool CpuProfiler::Enabled() {
+  SpinLockHolder cl(&lock_);
+  return collector_.enabled();
+}
+
+void CpuProfiler::GetCurrentState(ProfilerState* state) {
+  ProfileData::State collector_state;
+  {
+    SpinLockHolder cl(&lock_);
+    collector_.GetCurrentState(&collector_state);
+  }
+
+  state->enabled = collector_state.enabled;
+  state->start_time = static_cast<time_t>(collector_state.start_time);
+  state->samples_gathered = collector_state.samples_gathered;
+  int buf_size = sizeof(state->profile_name);
+  strncpy(state->profile_name, collector_state.profile_name, buf_size);
+  state->profile_name[buf_size-1] = '\0';
+}
+
+void CpuProfiler::EnableHandler() {
+  RAW_CHECK(prof_handler_token_ == NULL, "SIGPROF handler already registered");
+  prof_handler_token_ = ProfileHandlerRegisterCallback(prof_handler, this);
+  RAW_CHECK(prof_handler_token_ != NULL, "Failed to set up SIGPROF handler");
+}
+
+void CpuProfiler::DisableHandler() {
+  RAW_CHECK(prof_handler_token_ != NULL, "SIGPROF handler is not registered");
+  ProfileHandlerUnregisterCallback(prof_handler_token_);
+  prof_handler_token_ = NULL;
+}
+
+// Signal handler that records the pc in the profile-data structure. We do no
+// synchronization here.  profile-handler.cc guarantees that at most one
+// instance of prof_handler() will run at a time. All other routines that
+// access the data touched by prof_handler() disable this signal handler before
+// accessing the data and therefore cannot execute concurrently with
+// prof_handler().
+void CpuProfiler::prof_handler(int sig, siginfo_t*, void* signal_ucontext,
+                               void* cpu_profiler) {
+  CpuProfiler* instance = static_cast<CpuProfiler*>(cpu_profiler);
+
+  if (instance->filter_ == NULL ||
+      (*instance->filter_)(instance->filter_arg_)) {
+    void* stack[ProfileData::kMaxStackDepth];
+
+    // Under frame-pointer-based unwinding at least on x86, the
+    // top-most active routine doesn't show up as a normal frame, but
+    // as the "pc" value in the signal handler context.
+    stack[0] = GetPC(*reinterpret_cast<ucontext_t*>(signal_ucontext));
+
+    // We skip the top three stack trace entries (this function,
+    // SignalHandler::SignalHandler and one signal handler frame)
+    // since they are artifacts of profiling and should not be
+    // measured.  Other profiling related frames may be removed by
+    // "pprof" at analysis time.  Instead of skipping the top frames,
+    // we could skip nothing, but that would increase the profile size
+    // unnecessarily.
+    int depth = GetStackTraceWithContext(stack + 1, arraysize(stack) - 1,
+                                         3, signal_ucontext);
+
+    void **used_stack;
+    if (stack[1] == stack[0]) {
+      // in case of non-frame-pointer-based unwinding we will get
+      // duplicate of PC in stack[1], which we don't want
+      used_stack = stack + 1;
+    } else {
+      used_stack = stack;
+      depth++;  // To account for pc value in stack[0];
+    }
+
+    instance->collector_.Add(depth, used_stack);
+  }
+}
+
+#if !(defined(__CYGWIN__) || defined(__CYGWIN32__))
+
+extern "C" PERFTOOLS_DLL_DECL void ProfilerRegisterThread() {
+  ProfileHandlerRegisterThread();
+}
+
+extern "C" PERFTOOLS_DLL_DECL void ProfilerFlush() {
+  CpuProfiler::instance_.FlushTable();
+}
+
+extern "C" PERFTOOLS_DLL_DECL int ProfilingIsEnabledForAllThreads() {
+  return CpuProfiler::instance_.Enabled();
+}
+
+extern "C" PERFTOOLS_DLL_DECL int ProfilerStart(const char* fname) {
+  return CpuProfiler::instance_.Start(fname, NULL);
+}
+
+extern "C" PERFTOOLS_DLL_DECL int ProfilerStartWithOptions(
+    const char *fname, const ProfilerOptions *options) {
+  return CpuProfiler::instance_.Start(fname, options);
+}
+
+extern "C" PERFTOOLS_DLL_DECL void ProfilerStop() {
+  CpuProfiler::instance_.Stop();
+}
+
+extern "C" PERFTOOLS_DLL_DECL void ProfilerGetCurrentState(
+    ProfilerState* state) {
+  CpuProfiler::instance_.GetCurrentState(state);
+}
+
+#else  // OS_CYGWIN
+
+// ITIMER_PROF doesn't work under cygwin.  ITIMER_REAL is available, but doesn't
+// work as well for profiling, and also interferes with alarm().  Because of
+// these issues, unless a specific need is identified, profiler support is
+// disabled under Cygwin.
+extern "C" void ProfilerRegisterThread() { }
+extern "C" void ProfilerFlush() { }
+extern "C" int ProfilingIsEnabledForAllThreads() { return 0; }
+extern "C" int ProfilerStart(const char* fname) { return 0; }
+extern "C" int ProfilerStartWithOptions(const char *fname,
+                                        const ProfilerOptions *options) {
+  return 0;
+}
+extern "C" void ProfilerStop() { }
+extern "C" void ProfilerGetCurrentState(ProfilerState* state) {
+  memset(state, 0, sizeof(*state));
+}
+
+#endif  // OS_CYGWIN
+
+// DEPRECATED routines
+extern "C" PERFTOOLS_DLL_DECL void ProfilerEnable() { }
+extern "C" PERFTOOLS_DLL_DECL void ProfilerDisable() { }
diff --git a/src/raw_printer.cc b/src/raw_printer.cc
new file mode 100644
index 0000000..3cf028e
--- /dev/null
+++ b/src/raw_printer.cc
@@ -0,0 +1,72 @@
+// -*- Mode: C++; c-basic-offset: 2; indent-tabs-mode: nil -*-
+// Copyright (c) 2008, Google Inc.
+// All rights reserved.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+// 
+//     * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+//     * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+// 
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+// ---
+// Author: sanjay@google.com (Sanjay Ghemawat)
+
+#include <config.h>
+#include <stdarg.h>
+#include <stdio.h>
+#include "raw_printer.h"
+#include "base/logging.h"
+
+namespace base {
+
+RawPrinter::RawPrinter(char* buf, int length)
+    : base_(buf),
+      ptr_(buf),
+      limit_(buf + length - 1) {
+  RAW_DCHECK(length > 0, "");
+  *ptr_ = '\0';
+  *limit_ = '\0';
+}
+
+void RawPrinter::Printf(const char* format, ...) {
+  if (limit_ > ptr_) {
+    va_list ap;
+    va_start(ap, format);
+    int avail = limit_ - ptr_;
+    // We pass avail+1 to vsnprintf() since that routine needs room
+    // to store the trailing \0.
+    const int r = perftools_vsnprintf(ptr_, avail+1, format, ap);
+    va_end(ap);
+    if (r < 0) {
+      // Perhaps an old glibc that returns -1 on truncation?
+      ptr_ = limit_;
+    } else if (r > avail) {
+      // Truncation
+      ptr_ = limit_;
+    } else {
+      ptr_ += r;
+    }
+  }
+}
+
+}  // namespace base
diff --git a/src/raw_printer.h b/src/raw_printer.h
new file mode 100644
index 0000000..9288bb5
--- /dev/null
+++ b/src/raw_printer.h
@@ -0,0 +1,90 @@
+// -*- Mode: C++; c-basic-offset: 2; indent-tabs-mode: nil -*-
+// Copyright (c) 2008, Google Inc.
+// All rights reserved.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+// 
+//     * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+//     * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+// 
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+// ---
+// Author: Sanjay Ghemawat
+//
+// A printf() wrapper that writes into a fixed length buffer.
+// Useful in low-level code that does not want to use allocating
+// routines like StringPrintf().
+//
+// The implementation currently uses vsnprintf().  This seems to
+// be fine for use in many low-level contexts, but we may need to
+// rethink this decision if we hit a problem with it calling
+// down into malloc() etc.
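+//
+// Illustrative usage (a sketch, not lifted from an existing caller; the
+// buffer size and message are arbitrary):
+//
+//   char buf[100];
+//   base::RawPrinter printer(buf, sizeof(buf));
+//   printer.Printf("%d of %d scanned\n", done, total);
+//   // buf now holds the formatted text, truncated if it did not fit;
+//   // printer.length() gives the number of bytes written.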
+
+#ifndef BASE_RAW_PRINTER_H_
+#define BASE_RAW_PRINTER_H_
+
+#include <config.h>
+#include "base/basictypes.h"
+
+namespace base {
+
+class RawPrinter {
+ public:
+  // REQUIRES: "length > 0"
+  // Will printf any data added to this into "buf[0,length-1]" and
+  // will arrange to always keep buf[] null-terminated.
+  RawPrinter(char* buf, int length);
+
+  // Return the number of bytes that have been appended to the string
+  // so far.  Does not count any bytes that were dropped due to overflow.
+  int length() const { return (ptr_ - base_); }
+
+  // Return the number of bytes that can be added to this.
+  int space_left() const { return (limit_ - ptr_); }
+
+  // Format the supplied arguments according to the "format" string
+  // and append to this.  Will silently truncate the output if it does
+  // not fit.
+  void Printf(const char* format, ...)
+#ifdef HAVE___ATTRIBUTE__
+  __attribute__ ((__format__ (__printf__, 2, 3)))
+#endif
+;
+
+ private:
+  // We can write into [ptr_ .. limit_-1].
+  // *limit_ is also writable, but reserved for a terminating \0
+  // in case we overflow.
+  //
+  // Invariants: *ptr_ == \0
+  // Invariants: *limit_ == \0
+  char* base_;          // Initial pointer
+  char* ptr_;           // Where should we write next
+  char* limit_;         // One past last non-\0 char we can write
+
+  DISALLOW_COPY_AND_ASSIGN(RawPrinter);
+};
+
+}  // namespace base
+
+#endif  // BASE_RAW_PRINTER_H_
diff --git a/src/sampler.cc b/src/sampler.cc
new file mode 100755
index 0000000..cc71112
--- /dev/null
+++ b/src/sampler.cc
@@ -0,0 +1,131 @@
+// -*- Mode: C++; c-basic-offset: 2; indent-tabs-mode: nil -*-
+// Copyright (c) 2008, Google Inc.
+// All rights reserved.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+// 
+//     * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+//     * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+// 
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+// ---
+// All Rights Reserved.
+//
+// Author: Daniel Ford
+
+#include "sampler.h"
+
+#include <algorithm>  // For min()
+#include <math.h>
+#include "base/commandlineflags.h"
+
+using std::min;
+
+// The approximate gap in bytes between sampling actions.
+// That is, we take one sample approximately once every
+// tcmalloc_sample_parameter bytes of allocation,
+// i.e. about once every 512KB if the value is 1<<19.
+#ifdef NO_TCMALLOC_SAMPLES
+DEFINE_int64(tcmalloc_sample_parameter, 0,
+             "Unused: code is compiled with NO_TCMALLOC_SAMPLES");
+#else
+DEFINE_int64(tcmalloc_sample_parameter,
+             EnvToInt64("TCMALLOC_SAMPLE_PARAMETER", 0),
+             "The approximate gap in bytes between sampling actions. "
+             "This must be between 1 and 2^58.");
+#endif
+
+namespace tcmalloc {
+
+// Statics for Sampler
+double Sampler::log_table_[1<<kFastlogNumBits];
+
+// Populate the lookup table for FastLog2.
+// This approximates the log2 curve with a step function.
+// Steps have height equal to log2 of the mid-point of the step.
+void Sampler::PopulateFastLog2Table() {
+  for (int i = 0; i < (1<<kFastlogNumBits); i++) {
+    log_table_[i] = (log(1.0 + static_cast<double>(i+0.5)/(1<<kFastlogNumBits))
+                     / log(2.0));
+  }
+}
+
+int Sampler::GetSamplePeriod() {
+  return FLAGS_tcmalloc_sample_parameter;
+}
+
+// Run this before using your sampler
+void Sampler::Init(uint32_t seed) {
+  // Initialize PRNG
+  if (seed != 0) {
+    rnd_ = seed;
+  } else {
+    rnd_ = static_cast<uint32_t>(reinterpret_cast<uintptr_t>(this));
+    if (rnd_ == 0) {
+      rnd_ = 1;
+    }
+  }
+  // Step it forward 20 times for good measure
+  for (int i = 0; i < 20; i++) {
+    rnd_ = NextRandom(rnd_);
+  }
+  // Initialize counter
+  bytes_until_sample_ = PickNextSamplingPoint();
+}
+
+// Initialize the Statics for the Sampler class
+void Sampler::InitStatics() {
+  PopulateFastLog2Table();
+}
+
+// Generates a geometric variable with the specified mean (512K by default).
+// This is done by generating a random number between 0 and 1 and applying
+// the inverse cumulative distribution function for an exponential.
+// Specifically: Let m be the inverse of the sample period, then
+// the probability distribution function is m*exp(-mx) so the CDF is
+// p = 1 - exp(-mx), so
+// q = 1 - p = exp(-mx)
+// log_e(q) = -mx
+// -log_e(q)/m = x
+// log_2(q) * (-log_e(2) * 1/m) = x
+// In the code, q is actually in the range 1 to 2**26, hence the -26 below
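+//
+// As a worked example (numbers are illustrative only): with a 512K sample
+// parameter, 1/m = 524288; a draw of q = 2**25 (i.e. q/2**26 = 0.5) gives
+// x = -log_e(0.5) * 524288, roughly 363,409 bytes until the next sample.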
+size_t Sampler::PickNextSamplingPoint() {
+  rnd_ = NextRandom(rnd_);
+  // Take the top 26 bits as the random number
+  // (This plus the 1<<58 sampling bound give a max possible step of
+  // 5194297183973780480 bytes.)
+  const uint64_t prng_mod_power = 48;  // Number of bits in prng
+  // The uint32_t cast is to prevent a (hard-to-reproduce) NAN
+  // under piii debug for some binaries.
+  double q = static_cast<uint32_t>(rnd_ >> (prng_mod_power - 26)) + 1.0;
+  // Put the computed p-value through the CDF of a geometric.
+  // For faster performance (save ~1/20th exec time), replace
+// min(0.0, FastLog2(q) - 26)  by  (FastLog2(q) - 26.000705)
+  // The value 26.000705 is used rather than 26 to compensate
+  // for inaccuracies in FastLog2 which otherwise result in a
+  // negative answer.
+  return static_cast<size_t>(min(0.0, (FastLog2(q) - 26)) * (-log(2.0)
+                             * FLAGS_tcmalloc_sample_parameter) + 1);
+}
+
+}  // namespace tcmalloc
diff --git a/src/sampler.h b/src/sampler.h
new file mode 100755
index 0000000..eb316d7
--- /dev/null
+++ b/src/sampler.h
@@ -0,0 +1,180 @@
+// -*- Mode: C++; c-basic-offset: 2; indent-tabs-mode: nil -*-
+// Copyright (c) 2008, Google Inc.
+// All rights reserved.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+// 
+//     * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+//     * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+// 
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+// ---
+// All Rights Reserved.
+//
+// Author: Daniel Ford
+
+#ifndef TCMALLOC_SAMPLER_H_
+#define TCMALLOC_SAMPLER_H_
+
+#include "config.h"
+#include <stddef.h>                     // for size_t
+#ifdef HAVE_STDINT_H
+#include <stdint.h>                     // for uint64_t, uint32_t, int32_t
+#endif
+#include <string.h>                     // for memcpy
+#include "base/basictypes.h"  // for ASSERT
+#include "internal_logging.h"  // for ASSERT
+
+namespace tcmalloc {
+
+//-------------------------------------------------------------------
+// Sampler to decide when to create a sample trace for an allocation
+// Not thread safe: Each thread should have its own sampler object.
+// Caller must use external synchronization if used
+// from multiple threads.
+//
+// With 512K average sample step (the default):
+//  the probability of sampling a 4K allocation is about 0.00778
+//  the probability of sampling a 1MB allocation is about 0.865
+//  the probability of sampling a 1GB allocation is about 1.00000
+// In general, the probability of sampling an allocation of size X
+// given a flag value of Y (default 512K) is:
+//  1 - e^(-X/Y)
+//
+// With 128K average sample step:
+//  the probability of sampling a 1MB allocation is about 0.99966
+//  the probability of sampling a 1GB allocation is about 1.0
+//  (about 1 - 2**(-26))
+// With 1M average sample step:
+//  the probability of sampling a 4K allocation is about 0.00390
+//  the probability of sampling a 1MB allocation is about 0.632
+//  the probability of sampling a 1GB allocation is about 1.0
+//
+// The sampler works by representing memory as a long stream from
+// which allocations are taken. Some of the bytes in this stream are
+// marked and if an allocation includes a marked byte then it is
+// sampled. Bytes are marked according to a Poisson point process
+// with each byte being marked independently with probability
+// p = 1/tcmalloc_sample_parameter.  This makes the probability
+// of sampling an allocation of X bytes equal to the CDF of
+// a geometric with mean tcmalloc_sample_parameter. (ie. the
+// probability that at least one byte in the range is marked). This
+// is accurately given by the CDF of the corresponding exponential
+// distribution: 1 - e^(-X/tcmalloc_sample_parameter_)
+// Independence of the byte marking ensures independence of
+// the sampling of each allocation.
+//
+// This scheme is implemented by noting that, starting from any
+// fixed place, the number of bytes until the next marked byte
+// is geometrically distributed. This number is recorded as
+// bytes_until_sample_.  Every allocation subtracts from this
+// number until it is less than 0. When this happens the current
+// allocation is sampled.
+//
+// When a sample is taken, bytes_until_sample_ is reset to
+// a new independently sampled geometric number of bytes. The
+// memoryless property of the point process means that this may
+// be taken as the number of bytes after the end of the current
+// allocation until the next marked byte. This ensures that
+// very large allocations which would intersect many marked bytes
+// only result in a single call to PickNextSamplingPoint.
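+//
+// An illustrative sketch (not a copy of tcmalloc's actual call sites) of
+// driving a per-thread sampler from an allocation path:
+//
+//   Sampler sampler;                    // one per thread, per the note above
+//   sampler.Init(seed);                 // seed of 0 derives one from 'this'
+//   ...
+//   if (sampler.SampleAllocation(size)) {
+//     // record a stack trace for this allocation
+//   }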
+//-------------------------------------------------------------------
+
+class PERFTOOLS_DLL_DECL Sampler {
+ public:
+  // Initialize this sampler.
+  // Passing a seed of 0 gives a non-deterministic
+  // seed value given by casting the object ("this")
+  void Init(uint32_t seed);
+  void Cleanup();
+
+  // Record allocation of "k" bytes.  Return true iff allocation
+  // should be sampled
+  bool SampleAllocation(size_t k);
+
+  // Generate a geometric with mean 512K (or FLAGS_tcmalloc_sample_parameter)
+  size_t PickNextSamplingPoint();
+
+  // Initialize the statics for the Sampler class
+  static void InitStatics();
+
+  // Returns the current sample period
+  int GetSamplePeriod();
+
+  // The following are public for the purposes of testing
+  static uint64_t NextRandom(uint64_t rnd_);  // Returns the next prng value
+  static double FastLog2(const double & d);  // Computes Log2(x) quickly
+  static void PopulateFastLog2Table();  // Populate the lookup table
+
+ private:
+  size_t        bytes_until_sample_;    // Bytes until we sample next
+  uint64_t      rnd_;                   // Cheap random number generator
+
+  // Statics for the fast log
+  // Note that this code may not depend on anything in //util
+  // hence the duplication of functionality here
+  static const int kFastlogNumBits = 10;
+  static const int kFastlogMask = (1 << kFastlogNumBits) - 1;
+  static double log_table_[1<<kFastlogNumBits];  // Constant
+};
+
+inline bool Sampler::SampleAllocation(size_t k) {
+  if (bytes_until_sample_ < k) {
+    bytes_until_sample_ = PickNextSamplingPoint();
+    return true;
+  } else {
+    bytes_until_sample_ -= k;
+    return false;
+  }
+}
+
+// Inline functions which are public for testing purposes
+
+// Returns the next prng value.
+// pRNG is: aX+b mod c with a = 0x5DEECE66D, b =  0xB, c = 1<<48
+// This is the lrand48 generator.
+inline uint64_t Sampler::NextRandom(uint64_t rnd) {
+  const uint64_t prng_mult = 0x5DEECE66DLL;
+  const uint64_t prng_add = 0xB;
+  const uint64_t prng_mod_power = 48;
+  const uint64_t prng_mod_mask =
+                ~((~static_cast<uint64_t>(0)) << prng_mod_power);
+  return (prng_mult * rnd + prng_add) & prng_mod_mask;
+}
+
+// Adapted from //util/math/fastmath.[h|cc] by Noam Shazeer
+// This mimics the VeryFastLog2 code in those files
+inline double Sampler::FastLog2(const double & d) {
+  ASSERT(d>0);
+  COMPILE_ASSERT(sizeof(d) == sizeof(uint64_t), DoubleMustBe64Bits);
+  uint64_t x;
+  memcpy(&x, &d, sizeof(x));   // we depend on the compiler inlining this
+  const uint32_t x_high = x >> 32;
+  const uint32_t y = x_high >> (20 - kFastlogNumBits) & kFastlogMask;
+  const int32_t exponent = ((x_high >> 20) & 0x7FF) - 1023;
+  return exponent + log_table_[y];
+}
+
+}  // namespace tcmalloc
+
+#endif  // TCMALLOC_SAMPLER_H_
diff --git a/src/solaris/libstdc++.la b/src/solaris/libstdc++.la
new file mode 100644
index 0000000..3edf425
--- /dev/null
+++ b/src/solaris/libstdc++.la
@@ -0,0 +1,51 @@
+# libstdc++.la - a libtool library file
+# Generated by ltmain.sh - GNU libtool 1.4a-GCC3.0 (1.641.2.256 2001/05/28 20:09:07 with GCC-local changes)
+#
+# Please DO NOT delete this file!
+# It is necessary for linking the library.
+
+# ---
+# NOTE: This file lives in /usr/sfw/lib on Solaris 10.  Unfortunately,
+# due to an apparent bug in the Solaris 10 6/06 release,
+# /usr/sfw/lib/libstdc++.la is empty.  Below is the correct content,
+# according to
+#    http://forum.java.sun.com/thread.jspa?threadID=5073150
+# By passing LDFLAGS='-Lsrc/solaris' to configure, make will pick up
+# this copy of the file rather than the empty copy in /usr/sfw/lib.
+#
+# Also see
+#   http://www.technicalarticles.org/index.php/Compiling_MySQL_5.0_on_Solaris_10
+#
+# Note: this is for 32-bit systems.  If you have a 64-bit system,
+# uncomment the appropriate dependency_libs line below.
+# ----
+
+# The name that we can dlopen(3).
+dlname='libstdc++.so.6'
+
+# Names of this library.
+library_names='libstdc++.so.6.0.3 libstdc++.so.6 libstdc++.so'
+
+# The name of the static archive.
+old_library='libstdc++.a'
+
+# Libraries that this one depends upon.
+# 32-bit version:
+dependency_libs='-lc -lm -L/usr/sfw/lib -lgcc_s'
+# 64-bit version:
+#dependency_libs='-L/lib/64 -lc -lm -L/usr/sfw/lib/64 -lgcc_s'
+
+# Version information for libstdc++.
+current=6
+age=0
+revision=3
+
+# Is this an already installed library?
+installed=yes
+
+# Files to dlopen/dlpreopen
+dlopen=''
+dlpreopen=''
+
+# Directory that this library needs to be installed in:
+libdir='/usr/sfw/lib'
diff --git a/src/span.cc b/src/span.cc
new file mode 100644
index 0000000..4d08964
--- /dev/null
+++ b/src/span.cc
@@ -0,0 +1,102 @@
+// -*- Mode: C++; c-basic-offset: 2; indent-tabs-mode: nil -*-
+// Copyright (c) 2008, Google Inc.
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+//     * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+//     * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+// ---
+// Author: Sanjay Ghemawat <opensource@google.com>
+
+#include <config.h>
+#include "span.h"
+
+#include <string.h>                     // for NULL, memset
+
+#include "internal_logging.h"  // for ASSERT
+#include "page_heap_allocator.h"  // for PageHeapAllocator
+#include "static_vars.h"       // for Static
+
+namespace tcmalloc {
+
+#ifdef SPAN_HISTORY
+void Event(Span* span, char op, int v = 0) {
+  span->history[span->nexthistory] = op;
+  span->value[span->nexthistory] = v;
+  span->nexthistory++;
+  if (span->nexthistory == sizeof(span->history)) span->nexthistory = 0;
+}
+#endif
+
+Span* NewSpan(PageID p, Length len) {
+  Span* result = Static::span_allocator()->New();
+  memset(result, 0, sizeof(*result));
+  result->start = p;
+  result->length = len;
+#ifdef SPAN_HISTORY
+  result->nexthistory = 0;
+#endif
+  return result;
+}
+
+void DeleteSpan(Span* span) {
+#ifndef NDEBUG
+  // In debug mode, trash the contents of deleted Spans
+  memset(span, 0x3f, sizeof(*span));
+#endif
+  Static::span_allocator()->Delete(span);
+}
+
+void DLL_Init(Span* list) {
+  list->next = list;
+  list->prev = list;
+}
+
+void DLL_Remove(Span* span) {
+  span->prev->next = span->next;
+  span->next->prev = span->prev;
+  span->prev = NULL;
+  span->next = NULL;
+}
+
+int DLL_Length(const Span* list) {
+  int result = 0;
+  for (Span* s = list->next; s != list; s = s->next) {
+    result++;
+  }
+  return result;
+}
+
+void DLL_Prepend(Span* list, Span* span) {
+  ASSERT(span->next == NULL);
+  ASSERT(span->prev == NULL);
+  span->next = list->next;
+  span->prev = list;
+  list->next->prev = span;
+  list->next = span;
+}
+
+}  // namespace tcmalloc
diff --git a/src/span.h b/src/span.h
new file mode 100644
index 0000000..83feda1
--- /dev/null
+++ b/src/span.h
@@ -0,0 +1,102 @@
+// -*- Mode: C++; c-basic-offset: 2; indent-tabs-mode: nil -*-
+// Copyright (c) 2008, Google Inc.
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+//     * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+//     * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+// ---
+// Author: Sanjay Ghemawat <opensource@google.com>
+//
+// A Span is a contiguous run of pages.
+
+#ifndef TCMALLOC_SPAN_H_
+#define TCMALLOC_SPAN_H_
+
+#include <config.h>
+#include "common.h"
+
+namespace tcmalloc {
+
+// Information kept for a span (a contiguous run of pages).
+struct Span {
+  PageID        start;          // Starting page number
+  Length        length;         // Number of pages in span
+  Span*         next;           // Used when in linked list
+  Span*         prev;           // Used when in linked list
+  void*         objects;        // Linked list of free objects
+  unsigned int  refcount : 16;  // Number of non-free objects
+  unsigned int  sizeclass : 8;  // Size-class for small objects (or 0)
+  unsigned int  location : 2;   // Is the span on a freelist, and if so, which?
+  unsigned int  sample : 1;     // Sampled object?
+
+#undef SPAN_HISTORY
+#ifdef SPAN_HISTORY
+  // For debugging, we can keep a log of events per span
+  int nexthistory;
+  char history[64];
+  int value[64];
+#endif
+
+  // What freelist the span is on: IN_USE if on none, or normal or returned
+  enum { IN_USE, ON_NORMAL_FREELIST, ON_RETURNED_FREELIST };
+};
+
+#ifdef SPAN_HISTORY
+void Event(Span* span, char op, int v = 0);
+#else
+#define Event(s,o,v) ((void) 0)
+#endif
+
+// Allocator/deallocator for spans
+Span* NewSpan(PageID p, Length len);
+void DeleteSpan(Span* span);
+
+// -------------------------------------------------------------------------
+// Doubly linked list of spans.
+// -------------------------------------------------------------------------
+
+// Initialize *list to an empty list.
+void DLL_Init(Span* list);
+
+// Remove 'span' from the linked list in which it resides, updating the
+// pointers of adjacent Spans and setting span's next and prev to NULL.
+void DLL_Remove(Span* span);
+
+// Return true iff "list" is empty.
+inline bool DLL_IsEmpty(const Span* list) {
+  return list->next == list;
+}
+
+// Add span to the front of list.
+void DLL_Prepend(Span* list, Span* span);
+
+// Return the length of the linked list. O(n)
+int DLL_Length(const Span* list);
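+
+// A minimal sketch (illustrative only) of how these helpers compose; the
+// list head acts as a sentinel node:
+//
+//   Span list;
+//   DLL_Init(&list);
+//   DLL_Prepend(&list, span);     // span->next/prev must be NULL
+//   if (!DLL_IsEmpty(&list)) {
+//     DLL_Remove(span);
+//   }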
+
+}  // namespace tcmalloc
+
+#endif  // TCMALLOC_SPAN_H_
diff --git a/src/stack_trace_table.cc b/src/stack_trace_table.cc
new file mode 100644
index 0000000..1862124
--- /dev/null
+++ b/src/stack_trace_table.cc
@@ -0,0 +1,160 @@
+// -*- Mode: C++; c-basic-offset: 2; indent-tabs-mode: nil -*-
+// Copyright (c) 2009, Google Inc.
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+//     * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+//     * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+// ---
+// Author: Andrew Fikes
+
+#include <config.h>
+#include "stack_trace_table.h"
+#include <string.h>                     // for NULL, memset
+#include "base/spinlock.h"              // for SpinLockHolder
+#include "common.h"            // for StackTrace
+#include "internal_logging.h"  // for ASSERT, Log
+#include "page_heap_allocator.h"  // for PageHeapAllocator
+#include "static_vars.h"       // for Static
+
+namespace tcmalloc {
+
+bool StackTraceTable::Bucket::KeyEqual(uintptr_t h,
+                                       const StackTrace& t) const {
+  const bool eq = (this->hash == h && this->trace.depth == t.depth);
+  for (int i = 0; eq && i < t.depth; ++i) {
+    if (this->trace.stack[i] != t.stack[i]) {
+      return false;
+    }
+  }
+  return eq;
+}
+
+StackTraceTable::StackTraceTable()
+    : error_(false),
+      depth_total_(0),
+      bucket_total_(0),
+      table_(new Bucket*[kHashTableSize]()) {
+  memset(table_, 0, kHashTableSize * sizeof(Bucket*));
+}
+
+StackTraceTable::~StackTraceTable() {
+  delete[] table_;
+}
+
+void StackTraceTable::AddTrace(const StackTrace& t) {
+  if (error_) {
+    return;
+  }
+
+  // Hash function borrowed from base/heap-profile-table.cc
+  uintptr_t h = 0;
+  for (int i = 0; i < t.depth; ++i) {
+    h += reinterpret_cast<uintptr_t>(t.stack[i]);
+    h += h << 10;
+    h ^= h >> 6;
+  }
+  h += h << 3;
+  h ^= h >> 11;
+
+  const int idx = h % kHashTableSize;
+
+  Bucket* b = table_[idx];
+  while (b != NULL && !b->KeyEqual(h, t)) {
+    b = b->next;
+  }
+  if (b != NULL) {
+    b->count++;
+    b->trace.size += t.size;  // keep cumulative size
+  } else {
+    depth_total_ += t.depth;
+    bucket_total_++;
+    b = Static::bucket_allocator()->New();
+    if (b == NULL) {
+      Log(kLog, __FILE__, __LINE__,
+          "tcmalloc: could not allocate bucket", sizeof(*b));
+      error_ = true;
+    } else {
+      b->hash = h;
+      b->trace = t;
+      b->count = 1;
+      b->next = table_[idx];
+      table_[idx] = b;
+    }
+  }
+}
+
+void** StackTraceTable::ReadStackTracesAndClear() {
+  if (error_) {
+    return NULL;
+  }
+
+  // Allocate output array
+  const int out_len = bucket_total_ * 3 + depth_total_ + 1;
+  void** out = new void*[out_len];
+  if (out == NULL) {
+    Log(kLog, __FILE__, __LINE__,
+        "tcmalloc: allocation failed for stack traces",
+        out_len * sizeof(*out));
+    return NULL;
+  }
+
+  // Fill output array
+  int idx = 0;
+  for (int i = 0; i < kHashTableSize; ++i) {
+    Bucket* b = table_[i];
+    while (b != NULL) {
+      out[idx++] = reinterpret_cast<void*>(static_cast<uintptr_t>(b->count));
+      out[idx++] = reinterpret_cast<void*>(b->trace.size);  // cumulative size
+      out[idx++] = reinterpret_cast<void*>(b->trace.depth);
+      for (int d = 0; d < b->trace.depth; ++d) {
+        out[idx++] = b->trace.stack[d];
+      }
+      b = b->next;
+    }
+  }
+  out[idx++] = NULL;
+  ASSERT(idx == out_len);
+
+  // Clear state
+  error_ = false;
+  depth_total_ = 0;
+  bucket_total_ = 0;
+  SpinLockHolder h(Static::pageheap_lock());
+  for (int i = 0; i < kHashTableSize; ++i) {
+    Bucket* b = table_[i];
+    while (b != NULL) {
+      Bucket* next = b->next;
+      Static::bucket_allocator()->Delete(b);
+      b = next;
+    }
+    table_[i] = NULL;
+  }
+
+  return out;
+}
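+
+// A hedged sketch of how a caller might decode the array returned above,
+// given the layout produced by the fill loop (for each bucket: a count, a
+// cumulative size, and a depth, followed by `depth` PCs, with a final NULL
+// entry terminating the whole array).  Here `table` is a hypothetical
+// StackTraceTable*:
+//
+//   void** entry = table->ReadStackTracesAndClear();
+//   while (entry != NULL && *entry != NULL) {
+//     uintptr_t count = reinterpret_cast<uintptr_t>(entry[0]);
+//     uintptr_t size  = reinterpret_cast<uintptr_t>(entry[1]);
+//     uintptr_t depth = reinterpret_cast<uintptr_t>(entry[2]);
+//     // entry[3] .. entry[3 + depth - 1] are the PCs of this trace.
+//     entry += 3 + depth;
+//   }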
+
+}  // namespace tcmalloc
diff --git a/src/stack_trace_table.h b/src/stack_trace_table.h
new file mode 100644
index 0000000..e289771
--- /dev/null
+++ b/src/stack_trace_table.h
@@ -0,0 +1,92 @@
+// -*- Mode: C++; c-basic-offset: 2; indent-tabs-mode: nil -*-
+// Copyright (c) 2009, Google Inc.
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+//     * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+//     * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+// ---
+// Author: Andrew Fikes
+//
+// Utility class for coalescing sampled stack traces.  Not thread-safe.
+
+#ifndef TCMALLOC_STACK_TRACE_TABLE_H_
+#define TCMALLOC_STACK_TRACE_TABLE_H_
+
+#include <config.h>
+#ifdef HAVE_STDINT_H
+#include <stdint.h>                     // for uintptr_t
+#endif
+#include "common.h"
+
+namespace tcmalloc {
+
+class PERFTOOLS_DLL_DECL StackTraceTable {
+ public:
+  // REQUIRES: L < pageheap_lock
+  StackTraceTable();
+  ~StackTraceTable();
+
+  // Adds stack trace "t" to table.
+  //
+  // REQUIRES: L >= pageheap_lock
+  void AddTrace(const StackTrace& t);
+
+  // Returns stack traces formatted per MallocExtension guidelines.
+  // May return NULL on error.  Clears state before returning.
+  //
+  // REQUIRES: L < pageheap_lock
+  void** ReadStackTracesAndClear();
+
+  // Exposed for PageHeapAllocator
+  struct Bucket {
+    // Key
+    uintptr_t hash;
+    StackTrace trace;
+
+    // Payload
+    int count;
+    Bucket* next;
+
+    bool KeyEqual(uintptr_t h, const StackTrace& t) const;
+  };
+
+  // For testing
+  int depth_total() const { return depth_total_; }
+  int bucket_total() const { return bucket_total_; }
+
+ private:
+  static const int kHashTableSize = 1 << 14; // => table_ is 128k (with 8-byte pointers)
+
+  bool error_;
+  int depth_total_;
+  int bucket_total_;
+  Bucket** table_;
+};
+
+}  // namespace tcmalloc
+
+#endif  // TCMALLOC_STACK_TRACE_TABLE_H_
diff --git a/src/stacktrace.cc b/src/stacktrace.cc
new file mode 100644
index 0000000..999863c
--- /dev/null
+++ b/src/stacktrace.cc
@@ -0,0 +1,270 @@
+// -*- Mode: C++; c-basic-offset: 2; indent-tabs-mode: nil -*-
+// Copyright (c) 2005, Google Inc.
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+//     * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+//     * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+// ---
+// Author: Sanjay Ghemawat
+//
+// Produce stack trace.
+//
+// There are three different ways we can try to get the stack trace:
+//
+// 1) Our hand-coded stack-unwinder.  This depends on a certain stack
+//    layout, which is used by gcc (and those systems using a
+//    gcc-compatible ABI) on x86 systems, at least since gcc 2.95.
+//    It uses the frame pointer to do its work.
+//
+// 2) The libunwind library.  This is still in development, and as a
+//    separate library adds a new dependency, but doesn't need a frame
+//    pointer.  It also doesn't call malloc.
+//
+// 3) The gdb unwinder -- also the one used by the c++ exception code.
+//    It's obviously well-tested, but has a fatal flaw: it can call
+//    malloc() from the unwinder.  This is a problem because we're
+//    trying to use the unwinder to instrument malloc().
+//
+// Note: if you add a new implementation here, make sure it works
+// correctly when GetStackTrace() is called with max_depth == 0.
+// Some code may do that.
+
+#include <config.h>
+#include <stdlib.h> // for getenv
+#include <string.h> // for strcmp
+#include <stdio.h> // for fprintf
+#include "gperftools/stacktrace.h"
+#include "base/commandlineflags.h"
+#include "base/googleinit.h"
+
+
+// We're using a plain struct rather than a class to avoid any possible
+// issues during initialization.  A struct of pointers is easy to
+// initialize at link time.
+struct GetStackImplementation {
+  int (*GetStackFramesPtr)(void** result, int* sizes, int max_depth,
+                           int skip_count);
+
+  int (*GetStackFramesWithContextPtr)(void** result, int* sizes, int max_depth,
+                                      int skip_count, const void *uc);
+
+  int (*GetStackTracePtr)(void** result, int max_depth,
+                          int skip_count);
+
+  int (*GetStackTraceWithContextPtr)(void** result, int max_depth,
+                                  int skip_count, const void *uc);
+
+  const char *name;
+};
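+
+// Each stacktrace_*-inl.h pulled in below (via stacktrace_impl_setup-inl.h)
+// produces one impl__<suffix> instance of this struct; the preferred
+// instance is picked at the bottom of this file and can be overridden at
+// run time through the TCMALLOC_STACKTRACE_METHOD environment variable.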
+
+#if HAVE_DECL_BACKTRACE
+#define STACKTRACE_INL_HEADER "stacktrace_generic-inl.h"
+#define GST_SUFFIX generic
+#include "stacktrace_impl_setup-inl.h"
+#undef GST_SUFFIX
+#undef STACKTRACE_INL_HEADER
+#define HAVE_GST_generic
+#endif
+
+// libunwind uses __thread so we check for both libunwind.h and
+// __thread support
+#if defined(HAVE_LIBUNWIND_H) && defined(HAVE_TLS)
+#define STACKTRACE_INL_HEADER "stacktrace_libunwind-inl.h"
+#define GST_SUFFIX libunwind
+#include "stacktrace_impl_setup-inl.h"
+#undef GST_SUFFIX
+#undef STACKTRACE_INL_HEADER
+#define HAVE_GST_libunwind
+#endif // HAVE_LIBUNWIND_H
+
+#if defined(__i386__) || defined(__x86_64__)
+#define STACKTRACE_INL_HEADER "stacktrace_x86-inl.h"
+#define GST_SUFFIX x86
+#include "stacktrace_impl_setup-inl.h"
+#undef GST_SUFFIX
+#undef STACKTRACE_INL_HEADER
+#define HAVE_GST_x86
+#endif // i386 || x86_64
+
+#if defined(__ppc__) || defined(__PPC__)
+#if defined(__linux__)
+#define STACKTRACE_INL_HEADER "stacktrace_powerpc-linux-inl.h"
+#else
+#define STACKTRACE_INL_HEADER "stacktrace_powerpc-darwin-inl.h"
+#endif
+#define GST_SUFFIX ppc
+#include "stacktrace_impl_setup-inl.h"
+#undef GST_SUFFIX
+#undef STACKTRACE_INL_HEADER
+#define HAVE_GST_ppc
+#endif
+
+#if defined(__arm__)
+#define STACKTRACE_INL_HEADER "stacktrace_arm-inl.h"
+#define GST_SUFFIX arm
+#include "stacktrace_impl_setup-inl.h"
+#undef GST_SUFFIX
+#undef STACKTRACE_INL_HEADER
+#define HAVE_GST_arm
+#endif
+
+#ifdef TCMALLOC_ENABLE_INSTRUMENT_STACKTRACE
+#define STACKTRACE_INL_HEADER "stacktrace_instrument-inl.h"
+#define GST_SUFFIX instrument
+#include "stacktrace_impl_setup-inl.h"
+#undef GST_SUFFIX
+#undef STACKTRACE_INL_HEADER
+#define HAVE_GST_instrument
+#endif
+
+// The Windows case -- probably cygwin and mingw will use one of the
+// x86-includes above, but if not, we can fall back to windows intrinsics.
+#if defined(_WIN32) || defined(__CYGWIN__) || defined(__CYGWIN32__) || defined(__MINGW32__)
+#define STACKTRACE_INL_HEADER "stacktrace_win32-inl.h"
+#define GST_SUFFIX win32
+#include "stacktrace_impl_setup-inl.h"
+#undef GST_SUFFIX
+#undef STACKTRACE_INL_HEADER
+#define HAVE_GST_win32
+#endif
+
+static GetStackImplementation *all_impls[] = {
+#ifdef HAVE_GST_generic
+  &impl__generic,
+#endif
+#ifdef HAVE_GST_libunwind
+  &impl__libunwind,
+#endif
+#ifdef HAVE_GST_x86
+  &impl__x86,
+#endif
+#ifdef HAVE_GST_arm
+  &impl__arm,
+#endif
+#ifdef HAVE_GST_ppc
+  &impl__ppc,
+#endif
+#ifdef HAVE_GST_instrument
+  &impl__instrument,
+#endif
+#ifdef HAVE_GST_win32
+  &impl__win32,
+#endif
+  NULL
+};
+
+// ppc and i386 implementations prefer arch-specific asm implementations.
+// arm's asm implementation is broken
+#if defined(__i386__) || defined(__x86_64__) || defined(__ppc__) || defined(__PPC__)
+#if !defined(NO_FRAME_POINTER)
+#define TCMALLOC_DONT_PREFER_LIBUNWIND
+#endif
+#endif
+
+#if defined(HAVE_GST_instrument)
+static GetStackImplementation *get_stack_impl = &impl__instrument;
+#elif defined(HAVE_GST_win32)
+static GetStackImplementation *get_stack_impl = &impl__win32;
+#elif defined(HAVE_GST_x86) && defined(TCMALLOC_DONT_PREFER_LIBUNWIND)
+static GetStackImplementation *get_stack_impl = &impl__x86;
+#elif defined(HAVE_GST_ppc) && defined(TCMALLOC_DONT_PREFER_LIBUNWIND)
+static GetStackImplementation *get_stack_impl = &impl__ppc;
+#elif defined(HAVE_GST_libunwind)
+static GetStackImplementation *get_stack_impl = &impl__libunwind;
+#elif defined(HAVE_GST_arm)
+static GetStackImplementation *get_stack_impl = &impl__arm;
+#elif defined(HAVE_GST_generic)
+static GetStackImplementation *get_stack_impl = &impl__generic;
+#elif 0
+// This is for the benefit of code analysis tools that may have
+// trouble with the computed #include above.
+# include "stacktrace_x86-inl.h"
+# include "stacktrace_libunwind-inl.h"
+# include "stacktrace_generic-inl.h"
+# include "stacktrace_powerpc-inl.h"
+# include "stacktrace_win32-inl.h"
+# include "stacktrace_arm-inl.h"
+# include "stacktrace_instrument-inl.h"
+#else
+#error Cannot calculate stack trace: will need to write for your environment
+#endif
+
+static int ATTRIBUTE_NOINLINE frame_forcer(int rv) {
+  return rv;
+}
+
+PERFTOOLS_DLL_DECL int GetStackFrames(void** result, int* sizes, int max_depth,
+                                      int skip_count) {
+  return frame_forcer(get_stack_impl->GetStackFramesPtr(result, sizes, max_depth, skip_count));
+}
+
+PERFTOOLS_DLL_DECL int GetStackFramesWithContext(void** result, int* sizes, int max_depth,
+                                                 int skip_count, const void *uc) {
+  return frame_forcer(get_stack_impl->GetStackFramesWithContextPtr(
+                        result, sizes, max_depth,
+                        skip_count, uc));
+}
+
+PERFTOOLS_DLL_DECL int GetStackTrace(void** result, int max_depth,
+                                     int skip_count) {
+  return frame_forcer(get_stack_impl->GetStackTracePtr(result, max_depth, skip_count));
+}
+
+PERFTOOLS_DLL_DECL int GetStackTraceWithContext(void** result, int max_depth,
+                                                int skip_count, const void *uc) {
+  return frame_forcer(get_stack_impl->GetStackTraceWithContextPtr(
+                        result, max_depth, skip_count, uc));
+}
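+
+// A minimal, hypothetical caller of the wrappers above might look like
+// this (the 32-entry buffer and skip_count of 0 are arbitrary choices):
+//
+//   void* stack[32];
+//   int depth = GetStackTrace(stack, 32, 0);
+//   for (int i = 0; i < depth; ++i) {
+//     // stack[i] holds the program counter of the i-th caller frame.
+//   }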
+
+static void init_default_stack_impl_inner(void) {
+  char *val = getenv("TCMALLOC_STACKTRACE_METHOD");
+  if (!val || !*val) {
+    return;
+  }
+  for (GetStackImplementation **p = all_impls; *p; p++) {
+    GetStackImplementation *c = *p;
+    if (strcmp(c->name, val) == 0) {
+      get_stack_impl = c;
+      return;
+    }
+  }
+  fprintf(stderr, "Unknown or unsupported stacktrace method requested: %s. Ignoring it\n", val);
+}
+
+static void init_default_stack_impl(void) {
+  init_default_stack_impl_inner();
+  if (EnvToBool("TCMALLOC_STACKTRACE_METHOD_VERBOSE", false)) {
+    fprintf(stderr, "Chosen stacktrace method is %s\nSupported methods:\n", get_stack_impl->name);
+    for (GetStackImplementation **p = all_impls; *p; p++) {
+      GetStackImplementation *c = *p;
+      fprintf(stderr, "* %s\n", c->name);
+    }
+    fputs("\n", stderr);
+  }
+}
+
+REGISTER_MODULE_INITIALIZER(stacktrace_init_default_stack_impl, init_default_stack_impl());
diff --git a/src/stacktrace_arm-inl.h b/src/stacktrace_arm-inl.h
new file mode 100644
index 0000000..1586b8f
--- /dev/null
+++ b/src/stacktrace_arm-inl.h
@@ -0,0 +1,148 @@
+// -*- Mode: C++; c-basic-offset: 2; indent-tabs-mode: nil -*-
+// Copyright (c) 2011, Google Inc.
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+//     * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+//     * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+// ---
+// Author: Doug Kwan
+// This is inspired by Craig Silverstein's PowerPC stacktrace code.
+//
+
+#ifndef BASE_STACKTRACE_ARM_INL_H_
+#define BASE_STACKTRACE_ARM_INL_H_
+// Note: this file is included into stacktrace.cc more than once.
+// Anything that should only be defined once should be here:
+
+#include <stdint.h>   // for uintptr_t
+#include "base/basictypes.h"  // for NULL
+#include <gperftools/stacktrace.h>
+
+// WARNING:
+// This only works if all your code is in either ARM or THUMB mode.  With
+// interworking, the frame pointer of the caller can either be in r11 (ARM
+// mode) or r7 (THUMB mode).  A callee only saves the frame pointer of its
+// mode in a fixed location on its stack frame.  If the caller is a different
+// mode, there is no easy way to find the frame pointer.  It can either be
+// still in the designated register or saved on stack along with other callee
+// saved registers.
+
+// Given a pointer to a stack frame, locate and return the calling
+// stackframe, or return NULL if no stackframe can be found. Perform sanity
+// checks (the strictness of which is controlled by the boolean parameter
+// "STRICT_UNWINDING") to reduce the chance that a bad pointer is returned.
+template<bool STRICT_UNWINDING>
+static void **NextStackFrame(void **old_sp) {
+  void **new_sp = (void**) old_sp[-1];
+
+  // Check that the transition from frame pointer old_sp to frame
+  // pointer new_sp isn't clearly bogus
+  if (STRICT_UNWINDING) {
+    // With the stack growing downwards, an older stack frame must be
+    // at a greater address than the current one.
+    if (new_sp <= old_sp) return NULL;
+    // Assume stack frames larger than 100,000 bytes are bogus.
+    if ((uintptr_t)new_sp - (uintptr_t)old_sp > 100000) return NULL;
+  } else {
+    // In the non-strict mode, allow discontiguous stack frames.
+    // (alternate-signal-stacks for example).
+    if (new_sp == old_sp) return NULL;
+    // And allow frames up to about 1MB.
+    if ((new_sp > old_sp)
+        && ((uintptr_t)new_sp - (uintptr_t)old_sp > 1000000)) return NULL;
+  }
+  if ((uintptr_t)new_sp & (sizeof(void *) - 1)) return NULL;
+  return new_sp;
+}
+
+// This ensures that GetStackTrace sets up the Link Register properly.
+#ifdef __GNUC__
+void StacktraceArmDummyFunction() __attribute__((noinline));
+void StacktraceArmDummyFunction() { __asm__ volatile(""); }
+#else
+# error StacktraceArmDummyFunction() needs to be ported to this platform.
+#endif
+#endif  // BASE_STACKTRACE_ARM_INL_H_
+
+// Note: this part of the file is included several times.
+// Do not put globals below.
+
+// The following 4 functions are generated from the code below:
+//   GetStack{Trace,Frames}()
+//   GetStack{Trace,Frames}WithContext()
+//
+// These functions take the following args:
+//   void** result: the stack-trace, as an array
+//   int* sizes: the size of each stack frame, as an array
+//               (GetStackFrames* only)
+//   int max_depth: the size of the result (and sizes) array(s)
+//   int skip_count: how many stack pointers to skip before storing in result
+//   void* ucp: a ucontext_t* (GetStack{Trace,Frames}WithContext only)
+static int GET_STACK_TRACE_OR_FRAMES {
+#ifdef __GNUC__
+  void **sp = reinterpret_cast<void**>(__builtin_frame_address(0));
+#else
+# error reading the stack pointer is not yet supported on this platform.
+#endif
+
+  // On ARM, the return address is stored in the link register (r14).
+  // This is not saved on the stack frame of a leaf function.  To
+  // simplify code that reads return addresses, we call a dummy
+  // function so that the return address of this function is also
+  // stored in the stack frame.  This works at least for gcc.
+  StacktraceArmDummyFunction();
+
+  skip_count++; // skip parent frame due to indirection in stacktrace.cc
+
+  int n = 0;
+  while (sp && n < max_depth) {
+    // The GetStackFrames routine is called when we are in some
+    // informational context (the failure signal handler for example).
+    // Use the non-strict unwinding rules to produce a stack trace
+    // that is as complete as possible (even if it contains a few bogus
+    // entries in some rare cases).
+    void **next_sp = NextStackFrame<IS_STACK_FRAMES == 0>(sp);
+
+    if (skip_count > 0) {
+      skip_count--;
+    } else {
+      result[n] = *sp;
+
+#if IS_STACK_FRAMES
+      if (next_sp > sp) {
+        sizes[n] = (uintptr_t)next_sp - (uintptr_t)sp;
+      } else {
+        // A frame-size of 0 is used to indicate unknown frame size.
+        sizes[n] = 0;
+      }
+#endif
+      n++;
+    }
+    sp = next_sp;
+  }
+  return n;
+}
diff --git a/src/stacktrace_generic-inl.h b/src/stacktrace_generic-inl.h
new file mode 100644
index 0000000..7d7c22d
--- /dev/null
+++ b/src/stacktrace_generic-inl.h
@@ -0,0 +1,84 @@
+// -*- Mode: C++; c-basic-offset: 2; indent-tabs-mode: nil -*-
+// Copyright (c) 2005, Google Inc.
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+//     * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+//     * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+// ---
+// Author: Sanjay Ghemawat
+//
+// Portable implementation - just use glibc
+//
+// Note:  The glibc implementation may cause a call to malloc.
+// This can cause a deadlock in HeapProfiler.
+
+#ifndef BASE_STACKTRACE_GENERIC_INL_H_
+#define BASE_STACKTRACE_GENERIC_INL_H_
+// Note: this file is included into stacktrace.cc more than once.
+// Anything that should only be defined once should be here:
+
+#include <execinfo.h>
+#include <string.h>
+#include "gperftools/stacktrace.h"
+#endif  // BASE_STACKTRACE_GENERIC_INL_H_
+
+// Note: this part of the file is included several times.
+// Do not put globals below.
+
+// The following 4 functions are generated from the code below:
+//   GetStack{Trace,Frames}()
+//   GetStack{Trace,Frames}WithContext()
+//
+// These functions take the following args:
+//   void** result: the stack-trace, as an array
+//   int* sizes: the size of each stack frame, as an array
+//               (GetStackFrames* only)
+//   int max_depth: the size of the result (and sizes) array(s)
+//   int skip_count: how many stack pointers to skip before storing in result
+//   void* ucp: a ucontext_t* (GetStack{Trace,Frames}WithContext only)
+static int GET_STACK_TRACE_OR_FRAMES {
+  static const int kStackLength = 64;
+  void * stack[kStackLength];
+  int size;
+
+  size = backtrace(stack, kStackLength);
+  skip_count += 2;  // we want to skip the current frame and its parent frame as well
+  int result_count = size - skip_count;
+  if (result_count < 0)
+    result_count = 0;
+  if (result_count > max_depth)
+    result_count = max_depth;
+  for (int i = 0; i < result_count; i++)
+    result[i] = stack[i + skip_count];
+
+#if IS_STACK_FRAMES
+  // No implementation for finding out the stack frame sizes yet.
+  memset(sizes, 0, sizeof(*sizes) * result_count);
+#endif
+
+  return result_count;
+}
diff --git a/src/stacktrace_impl_setup-inl.h b/src/stacktrace_impl_setup-inl.h
new file mode 100644
index 0000000..698c5b3
--- /dev/null
+++ b/src/stacktrace_impl_setup-inl.h
@@ -0,0 +1,94 @@
+// -*- Mode: C++; c-basic-offset: 2; indent-tabs-mode: nil -*-
+// NOTE: this is NOT to be #include-d normally. It's internal
+// implementation detail of stacktrace.cc
+//
+
+// Copyright (c) 2014, gperftools Contributors.
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+//     * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+//     * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+// ---
+// Author: Aliaksey Kandratsenka <alk@tut.by>
+//
+//  based on stacktrace.cc and stacktrace_config.h by Sanjay Ghemawat
+//  and Paul Pluzhnikov from Google Inc
+
+#define SIS_CONCAT2(a, b) a##b
+#define SIS_CONCAT(a, b) SIS_CONCAT2(a,b)
+
+#define SIS_STRINGIFY(a) SIS_STRINGIFY2(a)
+#define SIS_STRINGIFY2(a) #a
+
+#define IS_STACK_FRAMES 0
+#define IS_WITH_CONTEXT 0
+#define GET_STACK_TRACE_OR_FRAMES \
+  SIS_CONCAT(GetStackTrace_, GST_SUFFIX)(void **result, int max_depth, int skip_count)
+#include STACKTRACE_INL_HEADER
+#undef IS_STACK_FRAMES
+#undef IS_WITH_CONTEXT
+#undef GET_STACK_TRACE_OR_FRAMES
+
+#define IS_STACK_FRAMES 1
+#define IS_WITH_CONTEXT 0
+#define GET_STACK_TRACE_OR_FRAMES \
+  SIS_CONCAT(GetStackFrames_, GST_SUFFIX)(void **result, int *sizes, int max_depth, int skip_count)
+#include STACKTRACE_INL_HEADER
+#undef IS_STACK_FRAMES
+#undef IS_WITH_CONTEXT
+#undef GET_STACK_TRACE_OR_FRAMES
+
+#define IS_STACK_FRAMES 0
+#define IS_WITH_CONTEXT 1
+#define GET_STACK_TRACE_OR_FRAMES \
+  SIS_CONCAT(GetStackTraceWithContext_, GST_SUFFIX)(void **result, int max_depth, \
+                                                   int skip_count, const void *ucp)
+#include STACKTRACE_INL_HEADER
+#undef IS_STACK_FRAMES
+#undef IS_WITH_CONTEXT
+#undef GET_STACK_TRACE_OR_FRAMES
+
+#define IS_STACK_FRAMES 1
+#define IS_WITH_CONTEXT 1
+#define GET_STACK_TRACE_OR_FRAMES \
+  SIS_CONCAT(GetStackFramesWithContext_, GST_SUFFIX)(void **result, int *sizes, int max_depth, \
+                                                    int skip_count, const void *ucp)
+#include STACKTRACE_INL_HEADER
+#undef IS_STACK_FRAMES
+#undef IS_WITH_CONTEXT
+#undef GET_STACK_TRACE_OR_FRAMES
+
+static GetStackImplementation SIS_CONCAT(impl__,GST_SUFFIX) = {
+  SIS_CONCAT(GetStackFrames_, GST_SUFFIX),
+  SIS_CONCAT(GetStackFramesWithContext_, GST_SUFFIX),
+  SIS_CONCAT(GetStackTrace_, GST_SUFFIX),
+  SIS_CONCAT(GetStackTraceWithContext_, GST_SUFFIX),
+  SIS_STRINGIFY(GST_SUFFIX)
+};
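+
+// Illustrative expansion (assuming GST_SUFFIX had been defined as x86
+// before including this file): the blocks above generate
+// GetStackTrace_x86, GetStackFrames_x86, GetStackTraceWithContext_x86 and
+// GetStackFramesWithContext_x86, and then bundle them as
+//
+//   static GetStackImplementation impl__x86 = {
+//     GetStackFrames_x86,
+//     GetStackFramesWithContext_x86,
+//     GetStackTrace_x86,
+//     GetStackTraceWithContext_x86,
+//     "x86"
+//   };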
+
+#undef SIS_CONCAT2
+#undef SIS_CONCAT
diff --git a/src/stacktrace_instrument-inl.h b/src/stacktrace_instrument-inl.h
new file mode 100755
index 0000000..c631765
--- /dev/null
+++ b/src/stacktrace_instrument-inl.h
@@ -0,0 +1,155 @@
+// -*- Mode: C++; c-basic-offset: 2; indent-tabs-mode: nil -*-
+// Copyright (c) 2013, Google Inc.
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+//     * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+//     * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+// ---
+// Author: Jean Lee <xiaoyur347@gmail.com>
+// based on the gcc code-generation option "-finstrument-functions" described at
+// http://gcc.gnu.org/onlinedocs/gcc/Code-Gen-Options.html .
+// Configure should be run with CXXFLAGS="-finstrument-functions".
+
+// This file is a backtrace implementation for systems where:
+// * The glibc implementation of backtrace() may cause a call to malloc,
+//   and cause a deadlock in HeapProfiler.
+// * The libunwind implementation prints no backtrace.
+
+// The backtrace arrays are stored in the "thread_back_trace" variable.
+// Using thread-local storage instead might be better and would save memory.
+
+#ifndef BASE_STACKTRACE_INSTRUMENT_INL_H_
+#define BASE_STACKTRACE_INSTRUMENT_INL_H_
+// Note: this file is included into stacktrace.cc more than once.
+// Anything that should only be defined once should be here:
+
+#include <execinfo.h>
+#include <string.h>
+#include <unistd.h>
+#include <sys/syscall.h>
+#include "gperftools/stacktrace.h"
+
+#define gettid() syscall(__NR_gettid)
+#ifndef __x86_64__
+#define MAX_THREAD (32768)
+#else
+#define MAX_THREAD (65536)
+#endif
+#define MAX_DEPTH  (30)
+#define ATTRIBUTE_NOINSTRUMENT __attribute__ ((no_instrument_function))
+
+typedef struct {
+  int   stack_depth;
+  void* frame[MAX_DEPTH];
+} BACK_TRACE;
+
+static BACK_TRACE thread_back_trace[MAX_THREAD];
+extern "C" {
+void __cyg_profile_func_enter(void *func_address,
+                              void *call_site) ATTRIBUTE_NOINSTRUMENT;
+void __cyg_profile_func_enter(void *func_address, void *call_site) {
+  (void)func_address;
+
+  BACK_TRACE* backtrace = thread_back_trace + gettid();
+  int stack_depth = backtrace->stack_depth;
+  backtrace->stack_depth = stack_depth + 1;
+  if ( stack_depth >= MAX_DEPTH ) {
+    return;
+  }
+  backtrace->frame[stack_depth] = call_site;
+}
+
+void __cyg_profile_func_exit(void *func_address,
+                             void *call_site) ATTRIBUTE_NOINSTRUMENT;
+void __cyg_profile_func_exit(void *func_address, void *call_site) {
+  (void)func_address;
+  (void)call_site;
+
+  BACK_TRACE* backtrace = thread_back_trace + gettid();
+  int stack_depth = backtrace->stack_depth;
+  backtrace->stack_depth = stack_depth - 1;
+  if ( stack_depth >= MAX_DEPTH ) {
+    return;
+  }
+  backtrace->frame[stack_depth] = 0;
+}
+}  // extern "C"
+
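+// Copies the per-thread call-site log recorded by the hooks above into
+// buffer, reversed so that (when the buffer can hold the whole log)
+// buffer[0] is the most recently recorded call site.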
+static int cyg_backtrace(void **buffer, int size) {
+  BACK_TRACE* backtrace = thread_back_trace + gettid();
+  int stack_depth = backtrace->stack_depth;
+  if ( stack_depth >= MAX_DEPTH ) {
+    stack_depth = MAX_DEPTH;
+  }
+  int nSize = (size > stack_depth) ? stack_depth : size;
+  for (int i = 0; i < nSize; i++) {
+    buffer[i] = backtrace->frame[nSize - i - 1];
+  }
+
+  return nSize;
+}
+
+#endif  // BASE_STACKTRACE_INSTRUMENT_INL_H_
+
+
+// Note: this part of the file is included several times.
+// Do not put globals below.
+
+// The following 4 functions are generated from the code below:
+//   GetStack{Trace,Frames}()
+//   GetStack{Trace,Frames}WithContext()
+//
+// These functions take the following args:
+//   void** result: the stack-trace, as an array
+//   int* sizes: the size of each stack frame, as an array
+//               (GetStackFrames* only)
+//   int max_depth: the size of the result (and sizes) array(s)
+//   int skip_count: how many stack pointers to skip before storing in result
+//   void* ucp: a ucontext_t* (GetStack{Trace,Frames}WithContext only)
+static int GET_STACK_TRACE_OR_FRAMES {
+  static const int kStackLength = 64;
+  void * stack[kStackLength];
+  int size;
+  memset(stack, 0, sizeof(stack));
+
+  size = cyg_backtrace(stack, kStackLength);
+  skip_count += 2;  // we want to skip the current and parent frame as well
+  int result_count = size - skip_count;
+  if (result_count < 0)
+    result_count = 0;
+  if (result_count > max_depth)
+    result_count = max_depth;
+  for (int i = 0; i < result_count; i++)
+    result[i] = stack[i + skip_count];
+
+#if IS_STACK_FRAMES
+  // No implementation for finding out the stack frame sizes yet.
+  memset(sizes, 0, sizeof(*sizes) * result_count);
+#endif
+
+  return result_count;
+}
diff --git a/src/stacktrace_libunwind-inl.h b/src/stacktrace_libunwind-inl.h
new file mode 100644
index 0000000..8a4a731
--- /dev/null
+++ b/src/stacktrace_libunwind-inl.h
@@ -0,0 +1,150 @@
+// -*- Mode: C++; c-basic-offset: 2; indent-tabs-mode: nil -*-
+// Copyright (c) 2005, Google Inc.
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+//     * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+//     * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+// ---
+// Author: Arun Sharma
+//
+// Produce stack trace using libunwind
+
+#ifndef BASE_STACKTRACE_LIBINWIND_INL_H_
+#define BASE_STACKTRACE_LIBINWIND_INL_H_
+// Note: this file is included into stacktrace.cc more than once.
+// Anything that should only be defined once should be here:
+
+// We only need local unwinder.
+#define UNW_LOCAL_ONLY
+
+extern "C" {
+#include <assert.h>
+#include <string.h>   // for memset()
+#include <libunwind.h>
+}
+#include "gperftools/stacktrace.h"
+#include "base/logging.h"
+
+// Sometimes, we can try to get a stack trace from within a stack
+// trace, because libunwind can call mmap (maybe indirectly via an
+// internal mmap based memory allocator), and that mmap gets trapped
+// and causes a stack-trace request.  If we were to try to honor that
+// recursive request, we'd end up with infinite recursion or deadlock.
+// Luckily, it's safe to ignore those subsequent traces.  In such
+// cases, we return 0 to indicate the situation.
+static __thread int recursive;
+
+#if defined(TCMALLOC_ENABLE_UNWIND_FROM_UCONTEXT) && (defined(__i386__) || defined(__x86_64__)) && defined(__GNU_LIBRARY__)
+#define BASE_STACKTRACE_UNW_CONTEXT_IS_UCONTEXT 1
+#endif
+
+#endif  // BASE_STACKTRACE_LIBINWIND_INL_H_
+
+// Note: this part of the file is included several times.
+// Do not put globals below.
+
+// The following 4 functions are generated from the code below:
+//   GetStack{Trace,Frames}()
+//   GetStack{Trace,Frames}WithContext()
+//
+// These functions take the following args:
+//   void** result: the stack-trace, as an array
+//   int* sizes: the size of each stack frame, as an array
+//               (GetStackFrames* only)
+//   int max_depth: the size of the result (and sizes) array(s)
+//   int skip_count: how many stack pointers to skip before storing in result
+//   void* ucp: a ucontext_t* (GetStack{Trace,Frames}WithContext only)
+static int GET_STACK_TRACE_OR_FRAMES {
+  void *ip;
+  int n = 0;
+  unw_cursor_t cursor;
+  unw_context_t uc;
+#if IS_STACK_FRAMES
+  unw_word_t sp = 0, next_sp = 0;
+#endif
+
+  if (recursive) {
+    return 0;
+  }
+  ++recursive;
+
+#if (IS_WITH_CONTEXT && defined(BASE_STACKTRACE_UNW_CONTEXT_IS_UCONTEXT))
+  if (ucp) {
+    uc = *(static_cast<unw_context_t *>(const_cast<void *>(ucp)));
+    /* This is a bit weird: profiler.cc calls us with the signal's
+     * ucontext, yet passes 2 as skip_count, essentially assuming we
+     * won't use the ucontext. */
+    /* To cope with that, we assume that if ucp is non-null we are
+     * being asked to ignore skip_count, since we are able to unwind
+     * directly from ucp. */
+    skip_count = 0;
+  } else {
+    unw_getcontext(&uc);
+    skip_count += 2;         // Do not include current and parent frame
+  }
+#else
+  unw_getcontext(&uc);
+  skip_count += 2;         // Do not include current and parent frame
+#endif
+
+  int ret = unw_init_local(&cursor, &uc);
+  assert(ret >= 0);
+
+  while (skip_count--) {
+    if (unw_step(&cursor) <= 0) {
+      goto out;
+    }
+#if IS_STACK_FRAMES
+    if (unw_get_reg(&cursor, UNW_REG_SP, &next_sp)) {
+      goto out;
+    }
+#endif
+  }
+
+  while (n < max_depth) {
+    if (unw_get_reg(&cursor, UNW_REG_IP, (unw_word_t *) &ip) < 0) {
+      break;
+    }
+#if IS_STACK_FRAMES
+    sizes[n] = 0;
+#endif
+    result[n++] = ip;
+    if (unw_step(&cursor) <= 0) {
+      break;
+    }
+#if IS_STACK_FRAMES
+    sp = next_sp;
+    if (unw_get_reg(&cursor, UNW_REG_SP, &next_sp) < 0) {
+      break;
+    }
+    sizes[n - 1] = next_sp - sp;
+#endif
+  }
+out:
+  --recursive;
+  return n;
+}
diff --git a/src/stacktrace_powerpc-darwin-inl.h b/src/stacktrace_powerpc-darwin-inl.h
new file mode 100644
index 0000000..c4c2edb
--- /dev/null
+++ b/src/stacktrace_powerpc-darwin-inl.h
@@ -0,0 +1,158 @@
+// Copyright (c) 2007, Google Inc.
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+//     * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+//     * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+// ---
+// Produce stack trace.  ABI documentation reference can be found at:
+// * PowerPC32 ABI: https://www.power.org/documentation/
+// power-architecture-32-bit-abi-supplement-1-0-embeddedlinuxunified/
+// * PowerPC64 ABI:
+// http://www.linux-foundation.org/spec/ELF/ppc64/PPC-elf64abi-1.9.html#STACK
+
+#ifndef BASE_STACKTRACE_POWERPC_INL_H_
+#define BASE_STACKTRACE_POWERPC_INL_H_
+// Note: this file is included into stacktrace.cc more than once.
+// Anything that should only be defined once should be here:
+
+#include <stdint.h>   // for uintptr_t
+#include <stdlib.h>   // for NULL
+#include <gperftools/stacktrace.h>
+
+// Given a pointer to a stack frame, locate and return the calling
+// stackframe, or return NULL if no stackframe can be found. Perform sanity
+// checks (the strictness of which is controlled by the boolean parameter
+// "STRICT_UNWINDING") to reduce the chance that a bad pointer is returned.
+template<bool STRICT_UNWINDING>
+static void **NextStackFrame(void **old_sp) {
+  void **new_sp = (void **) *old_sp;
+
+  // Check that the transition from frame pointer old_sp to frame
+  // pointer new_sp isn't clearly bogus
+  if (STRICT_UNWINDING) {
+    // With the stack growing downwards, an older stack frame must be
+    // at a greater address than the current one.
+    if (new_sp <= old_sp) return NULL;
+    // Assume stack frames larger than 100,000 bytes are bogus.
+    if ((uintptr_t)new_sp - (uintptr_t)old_sp > 100000) return NULL;
+  } else {
+    // In the non-strict mode, allow discontiguous stack frames.
+    // (alternate-signal-stacks for example).
+    if (new_sp == old_sp) return NULL;
+    // And allow frames up to about 1MB.
+    if ((new_sp > old_sp)
+        && ((uintptr_t)new_sp - (uintptr_t)old_sp > 1000000)) return NULL;
+  }
+  if ((uintptr_t)new_sp & (sizeof(void *) - 1)) return NULL;
+  return new_sp;
+}
+
+// This ensures that GetStackTrace sets up the Link Register properly.
+void StacktracePowerPCDummyFunction() __attribute__((noinline));
+void StacktracePowerPCDummyFunction() { __asm__ volatile(""); }
+#endif  // BASE_STACKTRACE_POWERPC_INL_H_
+
+// Note: this part of the file is included several times.
+// Do not put globals below.
+
+// The following 4 functions are generated from the code below:
+//   GetStack{Trace,Frames}()
+//   GetStack{Trace,Frames}WithContext()
+//
+// These functions take the following args:
+//   void** result: the stack-trace, as an array
+//   int* sizes: the size of each stack frame, as an array
+//               (GetStackFrames* only)
+//   int max_depth: the size of the result (and sizes) array(s)
+//   int skip_count: how many stack pointers to skip before storing in result
+//   void* ucp: a ucontext_t* (GetStack{Trace,Frames}WithContext only)
+int GET_STACK_TRACE_OR_FRAMES {
+  void **sp;
+  // Apple OS X uses an old version of gnu as -- both Darwin 7.9.0 (Panther)
+  // and Darwin 8.8.1 (Tiger) use as 1.38.  This means we have to use a
+  // different asm syntax.  I don't know quite the best way to discriminate
+  // systems using the old as from the new one; I've gone with __APPLE__.
+  // TODO(csilvers): use autoconf instead, to look for 'as --version' == 1 or 2
+  __asm__ volatile ("mr %0,r1" : "=r" (sp));
+
+  // On PowerPC, the "Link Register" or "Link Record" (LR), is a stack
+  // entry that holds the return address of the subroutine call (what
+  // instruction we run after our function finishes).  This is the
+  // same as the stack-pointer of our parent routine, which is what we
+  // want here.  While the compiler will always(?) set up LR for
+  // subroutine calls, it may not for leaf functions (such as this one).
+  // This routine forces the compiler (at least gcc) to push it anyway.
+  StacktracePowerPCDummyFunction();
+
+#if IS_STACK_FRAMES
+  // Note we do *not* increment skip_count here for the SYSV ABI.  If
+  // we did, the list of stack frames wouldn't properly match up with
+  // the list of return addresses.  Note this means the top pc entry
+  // is probably bogus for linux/ppc (and other SYSV-ABI systems).
+#else
+  // The LR save area is used by the callee, so the top entry is bogus.
+  skip_count++;
+#endif
+
+  int n = 0;
+  while (sp && n < max_depth) {
+    // The GetStackFrames routine is called when we are in some
+    // informational context (the failure signal handler for example).
+    // Use the non-strict unwinding rules to produce a stack trace
+    // that is as complete as possible (even if it contains a few
+    // bogus entries in some rare cases).
+    void **next_sp = NextStackFrame<!IS_STACK_FRAMES>(sp);
+
+    if (skip_count > 0) {
+      skip_count--;
+    } else {
+      // PowerPC has 3 main ABIs, which say where in the stack the
+      // Link Register is.  For DARWIN and AIX (used by apple and
+      // linux ppc64), it's in sp[2].  For SYSV (used by linux ppc),
+      // it's in sp[1].
+#if defined(__PPC64__)
+      // This check is in case the compiler doesn't define _CALL_AIX/etc.
+      result[n] = *(sp+2);
+#elif defined(__linux)
+      // This check is in case the compiler doesn't define _CALL_SYSV.
+      result[n] = *(sp+1);
+#endif
+
+#if IS_STACK_FRAMES
+      if (next_sp > sp) {
+        sizes[n] = (uintptr_t)next_sp - (uintptr_t)sp;
+      } else {
+        // A frame-size of 0 is used to indicate unknown frame size.
+        sizes[n] = 0;
+      }
+#endif
+      n++;
+    }
+    sp = next_sp;
+  }
+  return n;
+}
diff --git a/src/stacktrace_powerpc-inl.h b/src/stacktrace_powerpc-inl.h
new file mode 100644
index 0000000..811d6cc
--- /dev/null
+++ b/src/stacktrace_powerpc-inl.h
@@ -0,0 +1,176 @@
+// -*- Mode: C++; c-basic-offset: 2; indent-tabs-mode: nil -*-
+// Copyright (c) 2007, Google Inc.
+// All rights reserved.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+// 
+//     * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+//     * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+// 
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+// ---
+// Author: Craig Silverstein
+//
+// Produce stack trace.  I'm guessing (hoping!) the code is much like
+// for x86.  For apple machines, at least, it seems to be; see
+//    http://developer.apple.com/documentation/mac/runtimehtml/RTArch-59.html
+//    http://www.linux-foundation.org/spec/ELF/ppc64/PPC-elf64abi-1.9.html#STACK
+// Linux has similar code: http://patchwork.ozlabs.org/linuxppc/patch?id=8882
+
+#ifndef BASE_STACKTRACE_POWERPC_INL_H_
+#define BASE_STACKTRACE_POWERPC_INL_H_
+// Note: this file is included into stacktrace.cc more than once.
+// Anything that should only be defined once should be here:
+
+#include <stdint.h>   // for uintptr_t
+#include <stdlib.h>   // for NULL
+#include <gperftools/stacktrace.h>
+
+struct layout_ppc {
+  struct layout_ppc *next;
+#if defined(__APPLE__) || (defined(__linux) && defined(__PPC64__))
+  long condition_register;
+#endif
+  void *return_addr;
+};
+
+// Given a pointer to a stack frame, locate and return the calling
+// stackframe, or return NULL if no stackframe can be found. Perform sanity
+// checks (the strictness of which is controlled by the boolean parameter
+// "STRICT_UNWINDING") to reduce the chance that a bad pointer is returned.
+template<bool STRICT_UNWINDING>
+static layout_ppc *NextStackFrame(layout_ppc *current) {
+  uintptr_t old_sp = (uintptr_t)(current);
+  uintptr_t new_sp = (uintptr_t)(current->next);
+
+  // Check that the transition from frame pointer old_sp to frame
+  // pointer new_sp isn't clearly bogus
+  if (STRICT_UNWINDING) {
+    // With the stack growing downwards, an older stack frame must be
+    // at a greater address than the current one.
+    if (new_sp <= old_sp)
+      return NULL;
+    // Assume stack frames larger than 100,000 bytes are bogus.
+    if (new_sp - old_sp > 100000)
+      return NULL;
+  } else {
+    // In the non-strict mode, allow discontiguous stack frames.
+    // (alternate-signal-stacks for example).
+    if (new_sp == old_sp)
+      return NULL;
+    // And allow frames up to about 1MB.
+    if ((new_sp > old_sp) && (new_sp - old_sp > 1000000))
+      return NULL;
+  }
+  if (new_sp & (sizeof(void *) - 1))
+    return NULL;
+  return current->next;
+}
+
+// This ensures that GetStackTrace sets up the Link Register properly.
+void StacktracePowerPCDummyFunction() __attribute__((noinline));
+void StacktracePowerPCDummyFunction() { __asm__ volatile(""); }
+#endif  // BASE_STACKTRACE_POWERPC_INL_H_
+
+// Note: this part of the file is included several times.
+// Do not put globals below.
+
+// Load instruction used to fetch the top of the stack.
+#if defined(__PPC64__) || defined(__LP64__)
+# define LOAD "ld"
+#else
+# define LOAD "lwz"
+#endif
+
+#if defined(__linux__) && defined(__PPC__)
+# define TOP_STACK "%0,0(1)"
+#elif defined(__MACH__) && defined(__APPLE__)
+// Apple OS X uses an old version of gnu as -- both Darwin 7.9.0 (Panther)
+// and Darwin 8.8.1 (Tiger) use as 1.38.  This means we have to use a
+// different asm syntax.  I don't know quite the best way to discriminate
+// systems using the old as from the new one; I've gone with __APPLE__.
+// TODO(csilvers): use autoconf instead, to look for 'as --version' == 1 or 2
+# define TOP_STACK "%0,0(r1)"
+#endif
+
+
+
+// The following 4 functions are generated from the code below:
+//   GetStack{Trace,Frames}()
+//   GetStack{Trace,Frames}WithContext()
+//
+// These functions take the following args:
+//   void** result: the stack-trace, as an array
+//   int* sizes: the size of each stack frame, as an array
+//               (GetStackFrames* only)
+//   int max_depth: the size of the result (and sizes) array(s)
+//   int skip_count: how many stack pointers to skip before storing in result
+//   void* ucp: a ucontext_t* (GetStack{Trace,Frames}WithContext only)
+static int GET_STACK_TRACE_OR_FRAMES {
+  layout_ppc *current;
+  int n;
+
+  // Force GCC to spill LR.
+  asm volatile ("" : "=l"(current));
+
+  // Get the address on top-of-stack
+  asm volatile (LOAD " " TOP_STACK : "=r"(current));
+
+  StacktracePowerPCDummyFunction();
+
+  n = 0;
+  skip_count++; // skip parent's frame due to indirection in
+                // stacktrace.cc
+  while (current && n < max_depth) {
+
+    // The GetStackFrames routine is called when we are in some
+    // informational context (the failure signal handler for example).
+    // Use the non-strict unwinding rules to produce a stack trace
+    // that is as complete as possible (even if it contains a few
+    // bogus entries in some rare cases).
+    layout_ppc *next = NextStackFrame<!IS_STACK_FRAMES>(current);
+    if (skip_count > 0) {
+      skip_count--;
+    } else {
+      result[n] = current->return_addr;
+#if IS_STACK_FRAMES
+      if (next > current) {
+        sizes[n] = (uintptr_t)next - (uintptr_t)current;
+      } else {
+        // A frame-size of 0 is used to indicate unknown frame size.
+        sizes[n] = 0;
+      }
+#endif
+      n++;
+    }
+    current = next;
+  }
+
+  // It's possible the second-last stack frame can't return
+  // (that is, it's __libc_start_main), in which case
+  // the CRT startup code will have set its LR to 'NULL'.
+  if (n > 0 && result[n-1] == NULL)
+    n--;
+
+  return n;
+}
diff --git a/src/stacktrace_powerpc-linux-inl.h b/src/stacktrace_powerpc-linux-inl.h
new file mode 100644
index 0000000..5d16fa1
--- /dev/null
+++ b/src/stacktrace_powerpc-linux-inl.h
@@ -0,0 +1,231 @@
+// -*- Mode: C++; c-basic-offset: 2; indent-tabs-mode: nil -*-
+// Copyright (c) 2007, Google Inc.
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+//     * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+//     * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+// ---
+// Author: Craig Silverstein
+//
+// Produce stack trace.  ABI documentation reference can be found at:
+// * PowerPC32 ABI: https://www.power.org/documentation/
+// power-architecture-32-bit-abi-supplement-1-0-embeddedlinuxunified/
+// * PowerPC64 ABI:
+// http://www.linux-foundation.org/spec/ELF/ppc64/PPC-elf64abi-1.9.html#STACK
+
+#ifndef BASE_STACKTRACE_POWERPC_INL_H_
+#define BASE_STACKTRACE_POWERPC_INL_H_
+// Note: this file is included into stacktrace.cc more than once.
+// Anything that should only be defined once should be here:
+
+#include <stdint.h>   // for uintptr_t
+#include <stdlib.h>   // for NULL
+#include <gperftools/stacktrace.h>
+#include <base/vdso_support.h>
+
+#if defined(HAVE_SYS_UCONTEXT_H)
+#include <sys/ucontext.h>
+#elif defined(HAVE_UCONTEXT_H)
+#include <ucontext.h>  // for ucontext_t
+#endif
+typedef ucontext ucontext_t;
+
+// PowerPC64 Little Endian follows BE wrt. backchain, condition register,
+// and LR save area, so no need to adjust the reading struct.
+struct layout_ppc {
+  struct layout_ppc *next;
+#ifdef __PPC64__
+  long condition_register;
+#endif
+  void *return_addr;
+};
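+
+// Orientation sketch, derived from the struct above (not from the ABI docs):
+// the fields line up with the frame header at the stack pointer as
+//
+//   SP + 0   back chain ("next", points at the caller's frame)
+//   SP + 8   condition-register save word        (PPC64 only)
+//   SP + 16  saved link register ("return_addr") (PPC64; SP + 4 on 32-bit)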
+
+// Signal callbacks are handled by the vDSO symbol:
+//
+// * PowerPC64 Linux (arch/powerpc/kernel/vdso64/sigtramp.S):
+//   __kernel_sigtramp_rt64
+// * PowerPC32 Linux (arch/powerpc/kernel/vdso32/sigtramp.S):
+//   __kernel_sigtramp32
+//   __kernel_sigtramp_rt32
+//
+// So a backtrace may need special handling if the symbol read is
+// the signal trampoline.
+
+// Given a pointer to a stack frame, locate and return the calling
+// stackframe, or return NULL if no stackframe can be found. Perform sanity
+// checks (the strictness of which is controlled by the boolean parameter
+// "STRICT_UNWINDING") to reduce the chance that a bad pointer is returned.
+template<bool STRICT_UNWINDING>
+static layout_ppc *NextStackFrame(layout_ppc *current) {
+  uintptr_t old_sp = (uintptr_t)(current);
+  uintptr_t new_sp = (uintptr_t)(current->next);
+
+  // Check that the transition from frame pointer old_sp to frame
+  // pointer new_sp isn't clearly bogus
+  if (STRICT_UNWINDING) {
+    // With the stack growing downwards, an older stack frame must be
+    // at a greater address than the current one.
+    if (new_sp <= old_sp)
+      return NULL;
+    // Assume stack frames larger than 100,000 bytes are bogus.
+    if (new_sp - old_sp > 100000)
+      return NULL;
+  } else {
+    // In the non-strict mode, allow discontiguous stack frames.
+    // (alternate-signal-stacks for example).
+    if (new_sp == old_sp)
+      return NULL;
+    // And allow frames up to about 1MB.
+    if ((new_sp > old_sp) && (new_sp - old_sp > 1000000))
+      return NULL;
+  }
+  if (new_sp & (sizeof(void *) - 1))
+    return NULL;
+  return current->next;
+}
+
+// This ensures that GetStackTrace sets up the Link Register properly.
+void StacktracePowerPCDummyFunction() __attribute__((noinline));
+void StacktracePowerPCDummyFunction() { __asm__ volatile(""); }
+#endif  // BASE_STACKTRACE_POWERPC_INL_H_
+
+// Note: this part of the file is included several times.
+// Do not put globals below.
+
+// Load instruction used to fetch the word at the top of the stack.
+#if defined(__PPC64__) || defined(__LP64__)
+# define LOAD "ld"
+#else
+# define LOAD "lwz"
+#endif
+
+// The following 4 functions are generated from the code below:
+//   GetStack{Trace,Frames}()
+//   GetStack{Trace,Frames}WithContext()
+//
+// These functions take the following args:
+//   void** result: the stack-trace, as an array
+//   int* sizes: the size of each stack frame, as an array
+//               (GetStackFrames* only)
+//   int max_depth: the size of the result (and sizes) array(s)
+//   int skip_count: how many stack pointers to skip before storing in result
+//   void* ucp: a ucontext_t* (GetStack{Trace,Frames}WithContext only)
+static int GET_STACK_TRACE_OR_FRAMES {
+  layout_ppc *current;
+  int n;
+
+  // Get the current frame address
+  current = reinterpret_cast<layout_ppc*> (__builtin_frame_address (0));
+  // And skip the current frame
+  current = current->next;
+
+  StacktracePowerPCDummyFunction();
+
+  n = 0;
+  skip_count++; // skip parent's frame due to indirection in
+                // stacktrace.cc
+
+  base::VDSOSupport vdso;
+  base::ElfMemImage::SymbolInfo rt_sigreturn_symbol_info;
+#ifdef __PPC64__
+  const void *sigtramp64_vdso = 0;
+  if (vdso.LookupSymbol("__kernel_sigtramp_rt64", "LINUX_2.6.15", STT_NOTYPE,
+                        &rt_sigreturn_symbol_info))
+    sigtramp64_vdso = rt_sigreturn_symbol_info.address;
+#else
+  const void *sigtramp32_vdso = 0;
+  if (vdso.LookupSymbol("__kernel_sigtramp32", "LINUX_2.6.15", STT_NOTYPE,
+                        &rt_sigreturn_symbol_info))
+    sigtramp32_vdso = rt_sigreturn_symbol_info.address;
+  const void *sigtramp32_rt_vdso = 0;
+  if (vdso.LookupSymbol("__kernel_sigtramp_rt32", "LINUX_2.6.15", STT_NOTYPE,
+                        &rt_sigreturn_symbol_info))
+    sigtramp32_rt_vdso = rt_sigreturn_symbol_info.address;
+#endif
+
+  while (current && n < max_depth) {
+
+    // The GetStackFrames routine is called when we are in some
+    // informational context (the failure signal handler for example).
+    // Use the non-strict unwinding rules to produce a stack trace
+    // that is as complete as possible (even if it contains a few
+    // bogus entries in some rare cases).
+    layout_ppc *next = NextStackFrame<!IS_STACK_FRAMES>(current);
+    if (skip_count > 0) {
+      skip_count--;
+    } else {
+      result[n] = current->return_addr;
+#ifdef __PPC64__
+      if (sigtramp64_vdso && (sigtramp64_vdso == current->return_addr)) {
+        struct signal_frame_64 {
+          char dummy[128];
+          ucontext_t uc;
+        // We don't care about the rest, since the IP value is at 'uc' field.
+        } *sigframe = reinterpret_cast<signal_frame_64*>(current);
+        result[n] = (void*) sigframe->uc.uc_mcontext.gp_regs[PT_NIP];
+      }
+#else
+      if (sigtramp32_vdso && (sigtramp32_vdso == current->return_addr)) {
+        struct signal_frame_32 {
+          char dummy[64];
+          struct sigcontext sctx;
+          mcontext_t mctx;
+          // We don't care about the rest, since IP value is at 'mctx' field.
+        } *sigframe = reinterpret_cast<signal_frame_32*>(current);
+        result[n] = (void*) sigframe->mctx.gregs[PT_NIP];
+      } else if (sigtramp32_rt_vdso && (sigtramp32_rt_vdso == current->return_addr)) {
+        struct rt_signal_frame_32 {
+          char dummy[64 + 16];
+          siginfo_t info;
+          struct ucontext uc;
+          // We don't care about the rest, since the IP value is in the 'uc' field.
+        } *sigframe = reinterpret_cast<rt_signal_frame_32*>(current);
+        result[n] = (void*) sigframe->uc.uc_mcontext.uc_regs->gregs[PT_NIP];
+      }
+#endif
+
+#if IS_STACK_FRAMES
+      if (next > current) {
+        sizes[n] = (uintptr_t)next - (uintptr_t)current;
+      } else {
+        // A frame-size of 0 is used to indicate unknown frame size.
+        sizes[n] = 0;
+      }
+#endif
+      n++;
+    }
+    current = next;
+  }
+
+  // It's possible the second-last stack frame can't return
+  // (that is, it's __libc_start_main), in which case
+  // the CRT startup code will have set its LR to 'NULL'.
+  if (n > 0 && result[n-1] == NULL)
+    n--;
+
+  return n;
+}
diff --git a/src/stacktrace_win32-inl.h b/src/stacktrace_win32-inl.h
new file mode 100644
index 0000000..663e9a5
--- /dev/null
+++ b/src/stacktrace_win32-inl.h
@@ -0,0 +1,107 @@
+// -*- Mode: C++; c-basic-offset: 2; indent-tabs-mode: nil -*-
+// Copyright (c) 2008, Google Inc.
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+//     * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+//     * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// ---
+// Produces a stack trace for Windows.  Normally, one could use
+// stacktrace_x86-inl.h or stacktrace_x86_64-inl.h -- and indeed, that
+// should work for binaries compiled using MSVC in "debug" mode.
+// However, in "release" mode, Windows uses frame-pointer
+// optimization, which makes getting a stack trace very difficult.
+//
+// There are several approaches one can take.  One is to use Windows
+// intrinsics like StackWalk64.  These can work, but have restrictions
+// on how successful they can be.  Another attempt is to write a
+// version of stacktrace_x86-inl.h that has heuristic support for
+// dealing with FPO, similar to what WinDbg does (see
+// http://www.nynaeve.net/?p=97).
+//
+// The solution we've ended up doing is to call the undocumented
+// windows function RtlCaptureStackBackTrace, which probably doesn't
+// work with FPO but at least is fast, and doesn't require a symbol
+// server.
+//
+// This code is inspired by a patch from David Vitek:
+//   http://code.google.com/p/gperftools/issues/detail?id=83
+
+#ifndef BASE_STACKTRACE_WIN32_INL_H_
+#define BASE_STACKTRACE_WIN32_INL_H_
+// Note: this file is included into stacktrace.cc more than once.
+// Anything that should only be defined once should be here:
+
+#include "config.h"
+#include <windows.h>    // for GetProcAddress and GetModuleHandle
+#include <assert.h>
+
+typedef USHORT NTAPI RtlCaptureStackBackTrace_Function(
+    IN ULONG frames_to_skip,
+    IN ULONG frames_to_capture,
+    OUT PVOID *backtrace,
+    OUT PULONG backtrace_hash);
+
+// Load the function we need at static init time, where we don't have
+// to worry about someone else holding the loader's lock.
+static RtlCaptureStackBackTrace_Function* const RtlCaptureStackBackTrace_fn =
+   (RtlCaptureStackBackTrace_Function*)
+   GetProcAddress(GetModuleHandleA("ntdll.dll"), "RtlCaptureStackBackTrace");
+
+static int GetStackTrace_win32(void** result, int max_depth,
+                               int skip_count) {
+  if (!RtlCaptureStackBackTrace_fn) {
+    // TODO(csilvers): should we log an error here?
+    return 0;     // can't find a stacktrace with no function to call
+  }
+  return (int)RtlCaptureStackBackTrace_fn(skip_count + 3, max_depth,
+                                          result, 0);
+}
+
+static int not_implemented(void) {
+  assert(0 == "Not yet implemented");
+  return 0;
+}
+
+static int GetStackFrames_win32(void** /* pcs */,
+                                int* /* sizes */,
+                                int /* max_depth */,
+                                int /* skip_count */) {
+  return not_implemented();
+}
+
+static int GetStackFramesWithContext_win32(void** result, int* sizes, int max_depth,
+                                           int skip_count, const void *uc) {
+  return not_implemented();
+}
+
+static int GetStackTraceWithContext_win32(void** result, int max_depth,
+                                          int skip_count, const void *uc) {
+  return not_implemented();
+}
+
+
+#endif  // BASE_STACKTRACE_WIN32_INL_H_
diff --git a/src/stacktrace_x86-inl.h b/src/stacktrace_x86-inl.h
new file mode 100644
index 0000000..46eb5d8
--- /dev/null
+++ b/src/stacktrace_x86-inl.h
@@ -0,0 +1,354 @@
+// -*- Mode: C++; c-basic-offset: 2; indent-tabs-mode: nil -*-
+// Copyright (c) 2005, Google Inc.
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+//     * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+//     * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+// ---
+// Author: Sanjay Ghemawat
+//
+// Produce stack trace
+
+#ifndef BASE_STACKTRACE_X86_INL_H_
+#define BASE_STACKTRACE_X86_INL_H_
+// Note: this file is included into stacktrace.cc more than once.
+// Anything that should only be defined once should be here:
+
+#include "config.h"
+#include <stdlib.h>   // for NULL
+#include <assert.h>
+#if defined(HAVE_SYS_UCONTEXT_H)
+#include <sys/ucontext.h>
+#elif defined(HAVE_UCONTEXT_H)
+#include <ucontext.h>  // for ucontext_t
+#elif defined(HAVE_CYGWIN_SIGNAL_H)
+// cygwin/signal.h has a buglet where it uses pthread_attr_t without
+// #including <pthread.h> itself.  So we have to do it.
+# ifdef HAVE_PTHREAD
+# include <pthread.h>
+# endif
+#include <cygwin/signal.h>
+typedef ucontext ucontext_t;
+#endif
+#ifdef HAVE_STDINT_H
+#include <stdint.h>   // for uintptr_t
+#endif
+#ifdef HAVE_UNISTD_H
+#include <unistd.h>
+#endif
+#ifdef HAVE_MMAP
+#include <sys/mman.h> // for msync
+#include "base/vdso_support.h"
+#endif
+
+#include "gperftools/stacktrace.h"
+
+#if defined(__linux__) && defined(__i386__) && defined(__ELF__) && defined(HAVE_MMAP)
+// Count "push %reg" instructions in VDSO __kernel_vsyscall(),
+// preceding "syscall" or "sysenter".
+// If __kernel_vsyscall uses frame pointer, answer 0.
+//
+// kMaxBytes tells how many instruction bytes of __kernel_vsyscall
+// to analyze before giving up. Up to kMaxBytes+1 bytes of
+// instructions could be accessed.
+//
+// Here are known __kernel_vsyscall instruction sequences:
+//
+// SYSENTER (linux-2.6.26/arch/x86/vdso/vdso32/sysenter.S).
+// Used on Intel.
+//  0xffffe400 <__kernel_vsyscall+0>:       push   %ecx
+//  0xffffe401 <__kernel_vsyscall+1>:       push   %edx
+//  0xffffe402 <__kernel_vsyscall+2>:       push   %ebp
+//  0xffffe403 <__kernel_vsyscall+3>:       mov    %esp,%ebp
+//  0xffffe405 <__kernel_vsyscall+5>:       sysenter
+//
+// SYSCALL (see linux-2.6.26/arch/x86/vdso/vdso32/syscall.S).
+// Used on AMD.
+//  0xffffe400 <__kernel_vsyscall+0>:       push   %ebp
+//  0xffffe401 <__kernel_vsyscall+1>:       mov    %ecx,%ebp
+//  0xffffe403 <__kernel_vsyscall+3>:       syscall
+//
+// i386 (see linux-2.6.26/arch/x86/vdso/vdso32/int80.S)
+//  0xffffe400 <__kernel_vsyscall+0>:       int $0x80
+//  0xffffe401 <__kernel_vsyscall+1>:       ret
+//
+static const int kMaxBytes = 10;
+
+// We use assert()s instead of DCHECK()s -- this is too low level
+// for DCHECK().
+
+static int CountPushInstructions(const unsigned char *const addr) {
+  int result = 0;
+  for (int i = 0; i < kMaxBytes; ++i) {
+    if (addr[i] == 0x89) {
+      // "mov reg,reg"
+      if (addr[i + 1] == 0xE5) {
+        // Found "mov %esp,%ebp".
+        return 0;
+      }
+      ++i;  // Skip register encoding byte.
+    } else if (addr[i] == 0x0F &&
+               (addr[i + 1] == 0x34 || addr[i + 1] == 0x05)) {
+      // Found "sysenter" or "syscall".
+      return result;
+    } else if ((addr[i] & 0xF0) == 0x50) {
+      // Found "push %reg".
+      ++result;
+    } else if (addr[i] == 0xCD && addr[i + 1] == 0x80) {
+      // Found "int $0x80"
+      assert(result == 0);
+      return 0;
+    } else {
+      // Unexpected instruction.
+      assert(0 == "unexpected instruction in __kernel_vsyscall");
+      return 0;
+    }
+  }
+  // Unexpected: didn't find SYSENTER or SYSCALL in
+  // [__kernel_vsyscall, __kernel_vsyscall + kMaxBytes) interval.
+  assert(0 == "did not find SYSENTER or SYSCALL in __kernel_vsyscall");
+  return 0;
+}
+#endif
+
+// Given a pointer to a stack frame, locate and return the calling
+// stackframe, or return NULL if no stackframe can be found. Perform sanity
+// checks (the strictness of which is controlled by the boolean parameter
+// "STRICT_UNWINDING") to reduce the chance that a bad pointer is returned.
+template<bool STRICT_UNWINDING, bool WITH_CONTEXT>
+static void **NextStackFrame(void **old_sp, const void *uc) {
+  void **new_sp = (void **) *old_sp;
+
+#if defined(__linux__) && defined(__i386__) && defined(HAVE_VDSO_SUPPORT)
+  if (WITH_CONTEXT && uc != NULL) {
+    // How many "push %reg" instructions are there at __kernel_vsyscall?
+    // This is constant for a given kernel and processor, so compute
+    // it only once.
+    static int num_push_instructions = -1;  // Sentinel: not computed yet.
+    // Initialize with sentinel value: __kernel_rt_sigreturn can not possibly
+    // be there.
+    static const unsigned char *kernel_rt_sigreturn_address = NULL;
+    static const unsigned char *kernel_vsyscall_address = NULL;
+    if (num_push_instructions == -1) {
+      base::VDSOSupport vdso;
+      if (vdso.IsPresent()) {
+        base::VDSOSupport::SymbolInfo rt_sigreturn_symbol_info;
+        base::VDSOSupport::SymbolInfo vsyscall_symbol_info;
+        if (!vdso.LookupSymbol("__kernel_rt_sigreturn", "LINUX_2.5",
+                               STT_FUNC, &rt_sigreturn_symbol_info) ||
+            !vdso.LookupSymbol("__kernel_vsyscall", "LINUX_2.5",
+                               STT_FUNC, &vsyscall_symbol_info) ||
+            rt_sigreturn_symbol_info.address == NULL ||
+            vsyscall_symbol_info.address == NULL) {
+          // Unexpected: 32-bit VDSO is present, yet one of the expected
+          // symbols is missing or NULL.
+          assert(0 == "VDSO is present, but doesn't have expected symbols");
+          num_push_instructions = 0;
+        } else {
+          kernel_rt_sigreturn_address =
+              reinterpret_cast<const unsigned char *>(
+                  rt_sigreturn_symbol_info.address);
+          kernel_vsyscall_address =
+              reinterpret_cast<const unsigned char *>(
+                  vsyscall_symbol_info.address);
+          num_push_instructions =
+              CountPushInstructions(kernel_vsyscall_address);
+        }
+      } else {
+        num_push_instructions = 0;
+      }
+    }
+    if (num_push_instructions != 0 && kernel_rt_sigreturn_address != NULL &&
+        old_sp[1] == kernel_rt_sigreturn_address) {
+      const ucontext_t *ucv = static_cast<const ucontext_t *>(uc);
+      // This kernel does not use frame pointer in its VDSO code,
+      // and so %ebp is not suitable for unwinding.
+      void **const reg_ebp =
+          reinterpret_cast<void **>(ucv->uc_mcontext.gregs[REG_EBP]);
+      const unsigned char *const reg_eip =
+          reinterpret_cast<unsigned char *>(ucv->uc_mcontext.gregs[REG_EIP]);
+      if (new_sp == reg_ebp &&
+          kernel_vsyscall_address <= reg_eip &&
+          reg_eip - kernel_vsyscall_address < kMaxBytes) {
+        // We "stepped up" to __kernel_vsyscall, but %ebp is not usable.
+        // Restore from 'ucv' instead.
+        void **const reg_esp =
+            reinterpret_cast<void **>(ucv->uc_mcontext.gregs[REG_ESP]);
+        // Check that alleged %esp is not NULL and is reasonably aligned.
+        if (reg_esp &&
+            ((uintptr_t)reg_esp & (sizeof(reg_esp) - 1)) == 0) {
+          // Check that alleged %esp is actually readable. This is to prevent
+          // "double fault" in case we hit the first fault due to e.g. stack
+          // corruption.
+          //
+          // page_size is linker-initialized to avoid async-unsafe locking
+          // that GCC would otherwise insert (__cxa_guard_acquire etc).
+          static int page_size;
+          if (page_size == 0) {
+            // First time through.
+            page_size = getpagesize();
+          }
+          void *const reg_esp_aligned =
+              reinterpret_cast<void *>(
+                  (uintptr_t)(reg_esp + num_push_instructions - 1) &
+                  ~(page_size - 1));
+          if (msync(reg_esp_aligned, page_size, MS_ASYNC) == 0) {
+            // Alleged %esp is readable, use it for further unwinding.
+            new_sp = reinterpret_cast<void **>(
+                reg_esp[num_push_instructions - 1]);
+          }
+        }
+      }
+    }
+  }
+#endif
+
+  // Check that the transition from frame pointer old_sp to frame
+  // pointer new_sp isn't clearly bogus
+  if (STRICT_UNWINDING) {
+    // With the stack growing downwards, an older stack frame must be
+    // at a greater address than the current one.
+    if (new_sp <= old_sp) return NULL;
+    // Assume stack frames larger than 100,000 bytes are bogus.
+    if ((uintptr_t)new_sp - (uintptr_t)old_sp > 100000) return NULL;
+  } else {
+    // In the non-strict mode, allow discontiguous stack frames.
+    // (alternate-signal-stacks for example).
+    if (new_sp == old_sp) return NULL;
+    if (new_sp > old_sp) {
+      // And allow frames up to about 1MB.
+      const uintptr_t delta = (uintptr_t)new_sp - (uintptr_t)old_sp;
+      const uintptr_t acceptable_delta = 1000000;
+      if (delta > acceptable_delta) {
+        return NULL;
+      }
+    }
+  }
+  if ((uintptr_t)new_sp & (sizeof(void *) - 1)) return NULL;
+#ifdef __i386__
+  // In 32-bit code (possibly running on a 64-bit machine), the stack
+  // pointer can be very close to 0xffffffff, so we explicitly check for a
+  // pointer into the last two pages of the address space.
+  if ((uintptr_t)new_sp >= 0xffffe000) return NULL;
+#endif
+#ifdef HAVE_MMAP
+  if (!STRICT_UNWINDING) {
+    // Lax sanity checks cause a crash on AMD-based machines with
+    // VDSO-enabled kernels.
+    // Make an extra sanity check to ensure new_sp is readable.
+    // Note: NextStackFrame<false>() is only called while the program
+    //       is already on its last leg, so it's ok to be slow here.
+    static int page_size = getpagesize();
+    void *new_sp_aligned = (void *)((uintptr_t)new_sp & ~(page_size - 1));
+    if (msync(new_sp_aligned, page_size, MS_ASYNC) == -1)
+      return NULL;
+  }
+#endif
+  return new_sp;
+}
+
+#endif  // BASE_STACKTRACE_X86_INL_H_
+
+// Note: this part of the file is included several times.
+// Do not put globals below.
+
+// The following 4 functions are generated from the code below:
+//   GetStack{Trace,Frames}()
+//   GetStack{Trace,Frames}WithContext()
+//
+// These functions take the following args:
+//   void** result: the stack-trace, as an array
+//   int* sizes: the size of each stack frame, as an array
+//               (GetStackFrames* only)
+//   int max_depth: the size of the result (and sizes) array(s)
+//   int skip_count: how many stack pointers to skip before storing in result
+//   void* ucp: a ucontext_t* (GetStack{Trace,Frames}WithContext only)
+
+static int GET_STACK_TRACE_OR_FRAMES {
+  void **sp;
+#if (__GNUC__ > 4) || (__GNUC__ == 4 && __GNUC_MINOR__ >= 2) || __llvm__
+  // __builtin_frame_address(0) can return the wrong address on gcc-4.1.0-k8.
+  // It's always correct on llvm, and the techniques below aren't (in
+  // particular, llvm-gcc will make a copy of pcs, so it's not in sp[2]),
+  // so we also prefer __builtin_frame_address when running under llvm.
+  sp = reinterpret_cast<void**>(__builtin_frame_address(0));
+#elif defined(__i386__)
+  // Stack frame format:
+  //    sp[0]   pointer to previous frame
+  //    sp[1]   caller address
+  //    sp[2]   first argument
+  //    ...
+  // NOTE: This will break under llvm, since result is a copy and not in sp[2]
+  sp = (void **)&result - 2;
+#elif defined(__x86_64__)
+  unsigned long rbp;
+  // Move the value of the register %rbp into the local variable rbp.
+  // We need 'volatile' to prevent this instruction from getting moved
+  // around during optimization to before function prologue is done.
+  // An alternative way to achieve this
+  // would be (before this __asm__ instruction) to call Noop() defined as
+  //   static void Noop() __attribute__ ((noinline));  // prevent inlining
+  //   static void Noop() { asm(""); }  // prevent optimizing-away
+  __asm__ volatile ("mov %%rbp, %0" : "=r" (rbp));
+  // Arguments are passed in registers on x86-64, so we can't just
+  // offset from &result
+  sp = (void **) rbp;
+#else
+# error Using stacktrace_x86-inl.h on a non x86 architecture!
+#endif
+
+  skip_count++; // skip parent's frame due to indirection in stacktrace.cc
+
+  int n = 0;
+  while (sp && n < max_depth) {
+    if (*(sp+1) == reinterpret_cast<void *>(0)) {
+      // In 64-bit code, we often see a frame that
+      // points to itself and has a return address of 0.
+      break;
+    }
+#if !IS_WITH_CONTEXT
+    const void *const ucp = NULL;
+#endif
+    void **next_sp = NextStackFrame<!IS_STACK_FRAMES, IS_WITH_CONTEXT>(sp, ucp);
+    if (skip_count > 0) {
+      skip_count--;
+    } else {
+      result[n] = *(sp+1);
+#if IS_STACK_FRAMES
+      if (next_sp > sp) {
+        sizes[n] = (uintptr_t)next_sp - (uintptr_t)sp;
+      } else {
+        // A frame-size of 0 is used to indicate unknown frame size.
+        sizes[n] = 0;
+      }
+#endif
+      n++;
+    }
+    sp = next_sp;
+  }
+  return n;
+}
diff --git a/src/static_vars.cc b/src/static_vars.cc
new file mode 100644
index 0000000..09d2b59
--- /dev/null
+++ b/src/static_vars.cc
@@ -0,0 +1,125 @@
+// -*- Mode: C++; c-basic-offset: 2; indent-tabs-mode: nil -*-
+// Copyright (c) 2008, Google Inc.
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+//     * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+//     * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+// ---
+// Author: Ken Ashcraft <opensource@google.com>
+
+#include <config.h>
+#include "static_vars.h"
+#include <stddef.h>                     // for NULL
+#include <new>                          // for operator new
+#ifdef HAVE_PTHREAD
+#include <pthread.h>                    // for pthread_atfork
+#endif
+#include "internal_logging.h"  // for CHECK_CONDITION
+#include "common.h"
+#include "sampler.h"           // for Sampler
+#include "getenv_safe.h"       // TCMallocGetenvSafe
+#include "base/googleinit.h"
+
+namespace tcmalloc {
+
+#if defined(HAVE_FORK) && defined(HAVE_PTHREAD)
+// The following two functions are registered via pthread_atfork to make
+// sure the central_cache locks remain in a consistent state in the child
+// process after a fork.
+
+void CentralCacheLockAll()
+{
+  Static::pageheap_lock()->Lock();
+  for (int i = 0; i < kNumClasses; ++i)
+    Static::central_cache()[i].Lock();
+}
+
+void CentralCacheUnlockAll()
+{
+  for (int i = 0; i < kNumClasses; ++i)
+    Static::central_cache()[i].Unlock();
+  Static::pageheap_lock()->Unlock();
+}
+#endif
+
+SpinLock Static::pageheap_lock_(SpinLock::LINKER_INITIALIZED);
+SizeMap Static::sizemap_;
+CentralFreeListPadded Static::central_cache_[kNumClasses];
+PageHeapAllocator<Span> Static::span_allocator_;
+PageHeapAllocator<StackTrace> Static::stacktrace_allocator_;
+Span Static::sampled_objects_;
+PageHeapAllocator<StackTraceTable::Bucket> Static::bucket_allocator_;
+StackTrace* Static::growth_stacks_ = NULL;
+PageHeap* Static::pageheap_ = NULL;
+
+
+void Static::InitStaticVars() {
+  sizemap_.Init();
+  span_allocator_.Init();
+  span_allocator_.New(); // Reduce cache conflicts
+  span_allocator_.New(); // Reduce cache conflicts
+  stacktrace_allocator_.Init();
+  bucket_allocator_.Init();
+  // Do a bit of sanitizing: make sure central_cache is aligned properly
+  CHECK_CONDITION((sizeof(central_cache_[0]) % 64) == 0);
+  for (int i = 0; i < kNumClasses; ++i) {
+    central_cache_[i].Init(i);
+  }
+
+  // It's important to have the PageHeap heap-allocated, not in static
+  // storage, so that HeapLeakChecker does not consider all the byte
+  // patterns stored in its caches as pointers that are sources of heap
+  // object liveness, which would lead it to miss some memory leaks.
+  pageheap_ = new (MetaDataAlloc(sizeof(PageHeap))) PageHeap;
+
+  bool aggressive_decommit =
+    tcmalloc::commandlineflags::StringToBool(
+      TCMallocGetenvSafe("TCMALLOC_AGGRESSIVE_DECOMMIT"), true);
+
+  pageheap_->SetAggressiveDecommit(aggressive_decommit);
+
+  DLL_Init(&sampled_objects_);
+  Sampler::InitStatics();
+}
+
+
+#if defined(HAVE_FORK) && defined(HAVE_PTHREAD)
+
+static inline
+void SetupAtForkLocksHandler()
+{
+#if !defined(__APPLE__)
+  pthread_atfork(CentralCacheLockAll,    // parent calls before fork
+                 CentralCacheUnlockAll,  // parent calls after fork
+                 CentralCacheUnlockAll); // child calls after fork
+#endif
+}
+REGISTER_MODULE_INITIALIZER(tcmalloc_fork_handler, SetupAtForkLocksHandler());
+
+#endif
+
+}  // namespace tcmalloc
diff --git a/src/static_vars.h b/src/static_vars.h
new file mode 100644
index 0000000..c662e40
--- /dev/null
+++ b/src/static_vars.h
@@ -0,0 +1,115 @@
+// -*- Mode: C++; c-basic-offset: 2; indent-tabs-mode: nil -*-
+// Copyright (c) 2008, Google Inc.
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+//     * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+//     * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+// ---
+// Author: Ken Ashcraft <opensource@google.com>
+//
+// Static variables shared by multiple classes.
+
+#ifndef TCMALLOC_STATIC_VARS_H_
+#define TCMALLOC_STATIC_VARS_H_
+
+#include <config.h>
+#include "base/spinlock.h"
+#include "central_freelist.h"
+#include "common.h"
+#include "page_heap.h"
+#include "page_heap_allocator.h"
+#include "span.h"
+#include "stack_trace_table.h"
+
+namespace tcmalloc {
+
+class Static {
+ public:
+  // Linker initialized, so this lock can be accessed at any time.
+  static SpinLock* pageheap_lock() { return &pageheap_lock_; }
+
+  // Must be called before calling any of the accessors below.
+  static void InitStaticVars();
+
+  // Central cache -- an array of free-lists, one per size-class.
+  // We have a separate lock per free-list to reduce contention.
+  static CentralFreeListPadded* central_cache() { return central_cache_; }
+
+  static SizeMap* sizemap() { return &sizemap_; }
+
+  //////////////////////////////////////////////////////////////////////
+  // In addition to requiring explicit initialization (see the comment in
+  // the private section below), the variables below must be protected by
+  // pageheap_lock.
+
+  // Page-level allocator.
+  static PageHeap* pageheap() { return pageheap_; }
+
+  static PageHeapAllocator<Span>* span_allocator() { return &span_allocator_; }
+
+  static PageHeapAllocator<StackTrace>* stacktrace_allocator() {
+    return &stacktrace_allocator_;
+  }
+
+  static StackTrace* growth_stacks() { return growth_stacks_; }
+  static void set_growth_stacks(StackTrace* s) { growth_stacks_ = s; }
+
+  // State kept for sampled allocations (/pprof/heap support)
+  static Span* sampled_objects() { return &sampled_objects_; }
+  static PageHeapAllocator<StackTraceTable::Bucket>* bucket_allocator() {
+    return &bucket_allocator_;
+  }
+
+  // Check if InitStaticVars() has been run.
+  static bool IsInited() { return pageheap() != NULL; }
+
+ private:
+  static SpinLock pageheap_lock_;
+
+  // These static variables require explicit initialization.  We cannot
+  // count on their constructors to do any initialization because other
+  // static variables may try to allocate memory before these variables
+  // can run their constructors.
+
+  static SizeMap sizemap_;
+  static CentralFreeListPadded central_cache_[kNumClasses];
+  static PageHeapAllocator<Span> span_allocator_;
+  static PageHeapAllocator<StackTrace> stacktrace_allocator_;
+  static Span sampled_objects_;
+  static PageHeapAllocator<StackTraceTable::Bucket> bucket_allocator_;
+
+  // Linked list of stack traces recorded every time we allocated memory
+  // from the system.  Useful for finding allocation sites that cause an
+  // increase in the footprint of the system.  The linked list pointer
+  // is stored in trace->stack[kMaxStackDepth-1].
+  static StackTrace* growth_stacks_;
+
+  static PageHeap* pageheap_;
+};
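+
+// Hedged usage sketch (illustration only; SpinLockHolder is the RAII guard
+// from base/spinlock.h, and PageHeap::New() is shown here only as a
+// plausible call shape): code touching the page-level state normally holds
+// pageheap_lock(), e.g.
+//
+//   {
+//     SpinLockHolder h(Static::pageheap_lock());
+//     Span* span = Static::pageheap()->New(num_pages);
+//   }
+//
+// The central_cache() free-lists have their own per-size-class locks and
+// are not covered by pageheap_lock().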
+
+}  // namespace tcmalloc
+
+#endif  // TCMALLOC_STATIC_VARS_H_
diff --git a/src/symbolize.cc b/src/symbolize.cc
new file mode 100755
index 0000000..a27106e
--- /dev/null
+++ b/src/symbolize.cc
@@ -0,0 +1,285 @@
+// -*- Mode: C++; c-basic-offset: 2; indent-tabs-mode: nil -*-
+// Copyright (c) 2009, Google Inc.
+// All rights reserved.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+// 
+//     * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+//     * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+// 
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+// ---
+// Author: Craig Silverstein
+//
+// This forks out to pprof to do the actual symbolizing.  We might
+// be better off writing our own in C++.
+
+#include "config.h"
+#include "symbolize.h"
+#include <stdlib.h>
+#ifdef HAVE_UNISTD_H
+#include <unistd.h>   // for write()
+#endif
+#ifdef HAVE_SYS_SOCKET_H
+#include <sys/socket.h>   // for socketpair() -- needed by Symbolize
+#endif
+#ifdef HAVE_SYS_WAIT_H
+#include <sys/wait.h>   // for wait() -- needed by Symbolize
+#endif
+#ifdef HAVE_POLL_H
+#include <poll.h>
+#endif
+#ifdef __MACH__
+#include <mach-o/dyld.h>   // for GetProgramInvocationName()
+#include <limits.h>        // for PATH_MAX
+#endif
+#if defined(__CYGWIN__) || defined(__CYGWIN32__)
+#include <io.h>            // for get_osfhandle()
+#endif
+#include <string>
+#include "base/commandlineflags.h"
+#include "base/logging.h"
+#include "base/sysinfo.h"
+
+using std::string;
+using tcmalloc::DumpProcSelfMaps;   // from sysinfo.h
+
+
+DEFINE_string(symbolize_pprof,
+              EnvToString("PPROF_PATH", "pprof"),
+              "Path to pprof to call for reporting function names.");
+
+// heap_profile_table_pprof may be referenced after destructors are
+// called (since that's when leak-checking is done), so we make
+// a more-permanent copy that won't ever get destroyed.
+static string* g_pprof_path = new string(FLAGS_symbolize_pprof);
+
+// Returns NULL if we're on an OS where we can't get the invocation name.
+// Using a static var is ok because we're not called concurrently from
+// multiple threads.
+static const char* GetProgramInvocationName() {
+#if defined(HAVE_PROGRAM_INVOCATION_NAME)
+#ifdef __UCLIBC__
+  extern const char* program_invocation_name; // uclibc provides this
+#else
+  extern char* program_invocation_name;  // gcc provides this
+#endif
+  return program_invocation_name;
+#elif defined(__MACH__)
+  // We don't want to allocate memory for this since we may be
+  // calculating it when memory is corrupted.
+  static char program_invocation_name[PATH_MAX];
+  if (program_invocation_name[0] == '\0') {  // first time calculating
+    uint32_t length = sizeof(program_invocation_name);
+    if (_NSGetExecutablePath(program_invocation_name, &length))
+      return NULL;
+  }
+  return program_invocation_name;
+#else
+  return NULL;   // figure out a way to get argv[0]
+#endif
+}
+
+// Prints an error message when you can't run Symbolize().
+static void PrintError(const char* reason) {
+  RAW_LOG(ERROR,
+          "*** WARNING: Cannot convert addresses to symbols in output below.\n"
+          "*** Reason: %s\n"
+          "*** If you cannot fix this, try running pprof directly.\n",
+          reason);
+}
+
+void SymbolTable::Add(const void* addr) {
+  symbolization_table_[addr] = "";
+}
+
+const char* SymbolTable::GetSymbol(const void* addr) {
+  return symbolization_table_[addr];
+}
+
+// Updates symbolization_table_ with pointers to the symbol names
+// corresponding to its keys.  The symbol names are stored in symbol_buffer_,
+// which is owned by this SymbolTable.
+// Note that the forking/etc is not thread-safe or re-entrant.  That's
+// ok for the purpose we need -- reporting leaks detected by heap-checker
+// -- but be careful if you decide to use this routine for other purposes.
+// Returns the number of symbols read on success.  If we can't symbolize,
+// returns 0 and emits an error message explaining why.
+int SymbolTable::Symbolize() {
+#if !defined(HAVE_UNISTD_H)  || !defined(HAVE_SYS_SOCKET_H) || !defined(HAVE_SYS_WAIT_H)
+  PrintError("Perftools does not know how to call a sub-process on this O/S");
+  return 0;
+#else
+  const char* argv0 = GetProgramInvocationName();
+  if (argv0 == NULL) {  // can't call symbolize if we can't figure out our name
+    PrintError("Cannot figure out the name of this executable (argv0)");
+    return 0;
+  }
+  if (access(g_pprof_path->c_str(), R_OK) != 0) {
+    PrintError("Cannot find 'pprof' (is PPROF_PATH set correctly?)");
+    return 0;
+  }
+
+  // All this work is to do two-way communication.  ugh.
+  int *child_in = NULL;   // file descriptors
+  int *child_out = NULL;  // for now, we don't worry about child_err
+  int child_fds[5][2];    // socketpair may be called up to five times below
+
+  // The client program may close its stdin and/or stdout and/or stderr
+  // thus allowing socketpair to reuse file descriptors 0, 1 or 2.
+  // In this case the communication between the forked processes may be broken
+  // if either the parent or the child tries to close or duplicate these
+  // descriptors. The loop below produces two pairs of file descriptors, each
+  // greater than 2 (stderr).
+  for (int i = 0; i < 5; i++) {
+    if (socketpair(AF_UNIX, SOCK_STREAM, 0, child_fds[i]) == -1) {
+      // Close any pairs we already created, then report the error once.
+      for (int j = 0; j < i; j++) {
+        close(child_fds[j][0]);
+        close(child_fds[j][1]);
+      }
+      PrintError("Cannot create a socket pair");
+      return 0;
+    } else {
+      if ((child_fds[i][0] > 2) && (child_fds[i][1] > 2)) {
+        if (child_in == NULL) {
+          child_in = child_fds[i];
+        } else {
+          child_out = child_fds[i];
+          for (int j = 0; j < i; j++) {
+            if (child_fds[j] == child_in) continue;
+            close(child_fds[j][0]);
+            close(child_fds[j][1]);
+          }
+          break;
+        }
+      }
+    }
+  }
+
+  switch (fork()) {
+    case -1: {  // error
+      close(child_in[0]);
+      close(child_in[1]);
+      close(child_out[0]);
+      close(child_out[1]);
+      PrintError("Unknown error calling fork()");
+      return 0;
+    }
+    case 0: {  // child
+      close(child_in[1]);   // child uses the 0's, parent uses the 1's
+      close(child_out[1]);  // child uses the 0's, parent uses the 1's
+      close(0);
+      close(1);
+      if (dup2(child_in[0], 0) == -1) _exit(1);
+      if (dup2(child_out[0], 1) == -1) _exit(2);
+      // Unset vars that might cause trouble when we fork
+      unsetenv("CPUPROFILE");
+      unsetenv("HEAPPROFILE");
+      unsetenv("HEAPCHECK");
+      unsetenv("PERFTOOLS_VERBOSE");
+      execlp(g_pprof_path->c_str(), g_pprof_path->c_str(),
+             "--symbols", argv0, NULL);
+      _exit(3);  // if execvp fails, it's bad news for us
+    }
+    default: {  // parent
+      close(child_in[0]);   // child uses the 0's, parent uses the 1's
+      close(child_out[0]);  // child uses the 0's, parent uses the 1's
+#ifdef HAVE_POLL_H
+      // Waiting for 1ms seems to give the OS time to notice any errors.
+      poll(0, 0, 1);
+      // For maximum safety, we check to make sure the execlp
+      // succeeded before trying to write.  (Otherwise we'll get a
+      // SIGPIPE.)  For systems without poll.h, we'll just skip this
+      // check, and trust that the user set PPROF_PATH correctly!
+      struct pollfd pfd = { child_in[1], POLLOUT, 0 };
+      if (!poll(&pfd, 1, 0) || !(pfd.revents & POLLOUT) ||
+          (pfd.revents & (POLLHUP|POLLERR))) {
+        PrintError("Cannot run 'pprof' (is PPROF_PATH set correctly?)");
+        return 0;
+      }
+#endif
+#if defined(__CYGWIN__) || defined(__CYGWIN32__)
+      // On cygwin, DumpProcSelfMaps() takes a HANDLE, not an fd.  Convert.
+      const HANDLE symbols_handle = (HANDLE) get_osfhandle(child_in[1]);
+      DumpProcSelfMaps(symbols_handle);
+#else
+      DumpProcSelfMaps(child_in[1]);  // what pprof expects on stdin
+#endif
+
+      // Allocate 24 bytes = ("0x" + 8 bytes + "\n" + overhead) for each
+      // address to feed to pprof.
+      const int kOutBufSize = 24 * symbolization_table_.size();
+      char *pprof_buffer = new char[kOutBufSize];
+      int written = 0;
+      for (SymbolMap::const_iterator iter = symbolization_table_.begin();
+           iter != symbolization_table_.end(); ++iter) {
+        written += snprintf(pprof_buffer + written, kOutBufSize - written,
+                 // pprof expects format to be 0xXXXXXX
+                 "0x%" PRIxPTR "\n", reinterpret_cast<uintptr_t>(iter->first));
+      }
+      write(child_in[1], pprof_buffer, strlen(pprof_buffer));
+      close(child_in[1]);             // that's all we need to write
+
+      const int kSymbolBufferSize = kSymbolSize * symbolization_table_.size();
+      int total_bytes_read = 0;
+      delete[] symbol_buffer_;
+      symbol_buffer_ = new char[kSymbolBufferSize];
+      memset(symbol_buffer_, '\0', kSymbolBufferSize);
+      while (1) {
+        int bytes_read = read(child_out[1], symbol_buffer_ + total_bytes_read,
+                              kSymbolBufferSize - total_bytes_read);
+        if (bytes_read < 0) {
+          close(child_out[1]);
+          PrintError("Cannot read data from pprof");
+          return 0;
+        } else if (bytes_read == 0) {
+          close(child_out[1]);
+          wait(NULL);
+          break;
+        } else {
+          total_bytes_read += bytes_read;
+        }
+      }
+      // We have successfully read the output of pprof into symbol_buffer_.
+      // Make sure the last symbol is complete (we can tell because it ends
+      // with a \n).
+      if (total_bytes_read == 0 || symbol_buffer_[total_bytes_read - 1] != '\n')
+        return 0;
+      // make the symbolization_table_ values point to the output vector
+      SymbolMap::iterator fill = symbolization_table_.begin();
+      int num_symbols = 0;
+      const char *current_name = symbol_buffer_;
+      for (int i = 0; i < total_bytes_read; i++) {
+        if (symbol_buffer_[i] == '\n') {
+          fill->second = current_name;
+          symbol_buffer_[i] = '\0';
+          current_name = symbol_buffer_ + i + 1;
+          fill++;
+          num_symbols++;
+        }
+      }
+      return num_symbols;
+    }
+  }
+  PrintError("Unkown error (should never occur!)");
+  return 0;  // shouldn't be reachable
+#endif
+}
diff --git a/src/symbolize.h b/src/symbolize.h
new file mode 100644
index 0000000..728d073
--- /dev/null
+++ b/src/symbolize.h
@@ -0,0 +1,84 @@
+// -*- Mode: C++; c-basic-offset: 2; indent-tabs-mode: nil -*-
+// Copyright (c) 2009, Google Inc.
+// All rights reserved.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+// 
+//     * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+//     * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+// 
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+// ---
+// Author: Craig Silverstein
+
+#ifndef TCMALLOC_SYMBOLIZE_H_
+#define TCMALLOC_SYMBOLIZE_H_
+
+#include "config.h"
+#ifdef HAVE_STDINT_H
+#include <stdint.h>  // for uintptr_t
+#endif
+#include <stddef.h>  // for NULL
+#include <map>
+
+using std::map;
+
+// SymbolTable encapsulates the address operations necessary for stack trace
+// symbolization. A common use-case is to Add() the addresses from one or
+// several stack traces to a table, call Symbolize() once and use GetSymbol()
+// to get the symbol names for pretty-printing the stack traces.
+class SymbolTable {
+ public:
+  SymbolTable()
+    : symbol_buffer_(NULL) {}
+  ~SymbolTable() {
+    delete[] symbol_buffer_;
+  }
+
+  // Adds an address to the table. This may overwrite a currently known symbol
+  // name, so Add() should not generally be called after Symbolize().
+  void Add(const void* addr);
+
+  // Returns the symbol name for addr, if the given address was added before
+  // the last successful call to Symbolize(). Otherwise may return an empty
+  // c-string.
+  const char* GetSymbol(const void* addr);
+
+  // Obtains the symbol names for the addresses stored in the table and returns
+  // the number of addresses actually symbolized.
+  int Symbolize();
+
+ private:
+  typedef map<const void*, const char*> SymbolMap;
+
+  // An average size of memory allocated for a stack trace symbol.
+  static const int kSymbolSize = 1024;
+
+  // Map from addresses to symbol names.
+  SymbolMap symbolization_table_;
+
+  // Pointer to the buffer that stores the symbol names.
+  char *symbol_buffer_;
+};
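+
+// Hedged usage sketch (illustration only, following the class comment above;
+// addr1/addr2 stand for void* PC values taken from a captured stack trace):
+//
+//   SymbolTable table;
+//   table.Add(addr1);
+//   table.Add(addr2);
+//   if (table.Symbolize() > 0) {
+//     const char* name = table.GetSymbol(addr1);   // "" if not symbolized
+//   }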
+
+#endif  // TCMALLOC_SYMBOLIZE_H_
diff --git a/src/system-alloc.cc b/src/system-alloc.cc
new file mode 100755
index 0000000..e61c087
--- /dev/null
+++ b/src/system-alloc.cc
@@ -0,0 +1,552 @@
+// -*- Mode: C++; c-basic-offset: 2; indent-tabs-mode: nil -*-
+// Copyright (c) 2005, Google Inc.
+// All rights reserved.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+// 
+//     * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+//     * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+// 
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+// ---
+// Author: Sanjay Ghemawat
+
+#include <config.h>
+#include <errno.h>                      // for EAGAIN, errno
+#include <fcntl.h>                      // for open, O_RDWR
+#include <stddef.h>                     // for size_t, NULL, ptrdiff_t
+#if defined HAVE_STDINT_H
+#include <stdint.h>                     // for uintptr_t, intptr_t
+#elif defined HAVE_INTTYPES_H
+#include <inttypes.h>
+#else
+#include <sys/types.h>
+#endif
+#ifdef HAVE_MMAP
+#include <sys/mman.h>                   // for munmap, mmap, MADV_DONTNEED, etc
+#endif
+#ifdef HAVE_UNISTD_H
+#include <unistd.h>                     // for sbrk, getpagesize, off_t
+#endif
+#include <new>                          // for operator new
+#include <gperftools/malloc_extension.h>
+#include "base/basictypes.h"
+#include "base/commandlineflags.h"
+#include "base/spinlock.h"              // for SpinLockHolder, SpinLock, etc
+#include "common.h"
+#include "internal_logging.h"
+
+// On systems (like freebsd) that don't define MAP_ANONYMOUS, use the old
+// form of the name instead.
+#ifndef MAP_ANONYMOUS
+# define MAP_ANONYMOUS MAP_ANON
+#endif
+
+// MADV_FREE is specifically designed for use by malloc(), but only
+// FreeBSD supports it; on Linux we fall back to the somewhat inferior
+// MADV_DONTNEED.
+#if !defined(MADV_FREE) && defined(MADV_DONTNEED)
+# define MADV_FREE  MADV_DONTNEED
+#endif
+
+// Solaris has a bug where it doesn't declare madvise() for C++.
+//    http://www.opensolaris.org/jive/thread.jspa?threadID=21035&tstart=0
+#if defined(__sun) && defined(__SVR4)
+# include <sys/types.h>    // for caddr_t
+  extern "C" { extern int madvise(caddr_t, size_t, int); }
+#endif
+
+// Define kDebugMode so that we can use C++ conditionals
+// instead of preprocessor conditionals.
+#ifdef NDEBUG
+static const bool kDebugMode = false;
+#else
+static const bool kDebugMode = true;
+#endif
+
+// TODO(sanjay): Move the code below into the tcmalloc namespace
+using tcmalloc::kLog;
+using tcmalloc::Log;
+
+// Anonymous namespace to avoid name conflicts on "CheckAddressBits".
+namespace {
+
+// Check that no bit is set at position ADDRESS_BITS or higher.
+template <int ADDRESS_BITS> bool CheckAddressBits(uintptr_t ptr) {
+  return (ptr >> ADDRESS_BITS) == 0;
+}
+
+// Specialize for the bit width of a pointer to avoid undefined shift.
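+// (For example, on a 32-bit target kAddressBits can equal
+// 8 * sizeof(void*) == 32, and shifting a 32-bit uintptr_t right by 32
+// bits would be undefined behavior in the generic template above.)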
+template <> bool CheckAddressBits<8 * sizeof(void*)>(uintptr_t ptr) {
+  return true;
+}
+
+}  // Anonymous namespace to avoid name conflicts on "CheckAddressBits".
+
+COMPILE_ASSERT(kAddressBits <= 8 * sizeof(void*),
+               address_bits_larger_than_pointer_size);
+
+// Structure for discovering alignment
+union MemoryAligner {
+  void*  p;
+  double d;
+  size_t s;
+} CACHELINE_ALIGNED;
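+// sizeof(MemoryAligner) is used below as the minimum alignment passed to the
+// system allocators: it is always big enough to align a void*, double, or
+// size_t, and when CACHELINE_ALIGNED expands to an alignment attribute it
+// grows to a full cache line, which is what provides the cacheline-alignment
+// guarantee documented in system-alloc.h.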
+
+static SpinLock spinlock(SpinLock::LINKER_INITIALIZED);
+
+#if defined(HAVE_MMAP) || defined(MADV_FREE)
+// Page size is initialized on demand (only needed for mmap-based allocators)
+static size_t pagesize = 0;
+#endif
+
+// The current system allocator
+SysAllocator* sys_alloc = NULL;
+
+// Number of bytes taken from system.
+size_t TCMalloc_SystemTaken = 0;
+
+// Configuration parameters.
+DEFINE_int32(malloc_devmem_start,
+             EnvToInt("TCMALLOC_DEVMEM_START", 0),
+             "Physical memory starting location in MB for /dev/mem allocation."
+             "  Setting this to 0 disables /dev/mem allocation");
+DEFINE_int32(malloc_devmem_limit,
+             EnvToInt("TCMALLOC_DEVMEM_LIMIT", 0),
+             "Physical memory limit location in MB for /dev/mem allocation."
+             "  Setting this to 0 means no limit.");
+DEFINE_bool(malloc_skip_sbrk,
+            EnvToBool("TCMALLOC_SKIP_SBRK", false),
+            "Whether sbrk can be used to obtain memory.");
+DEFINE_bool(malloc_skip_mmap,
+            EnvToBool("TCMALLOC_SKIP_MMAP", false),
+            "Whether mmap can be used to obtain memory.");
+DEFINE_bool(malloc_disable_memory_release,
+            EnvToBool("TCMALLOC_DISABLE_MEMORY_RELEASE", false),
+            "Whether MADV_FREE/MADV_DONTNEED should be used"
+            " to return unused memory to the system.");
+
+// static allocators
+class SbrkSysAllocator : public SysAllocator {
+public:
+  SbrkSysAllocator() : SysAllocator() {
+  }
+  void* Alloc(size_t size, size_t *actual_size, size_t alignment);
+};
+static char sbrk_space[sizeof(SbrkSysAllocator)];
+
+class MmapSysAllocator : public SysAllocator {
+public:
+  MmapSysAllocator() : SysAllocator() {
+  }
+  void* Alloc(size_t size, size_t *actual_size, size_t alignment);
+};
+static char mmap_space[sizeof(MmapSysAllocator)];
+
+class DevMemSysAllocator : public SysAllocator {
+public:
+  DevMemSysAllocator() : SysAllocator() {
+  }
+  void* Alloc(size_t size, size_t *actual_size, size_t alignment);
+};
+
+class DefaultSysAllocator : public SysAllocator {
+ public:
+  DefaultSysAllocator() : SysAllocator() {
+    for (int i = 0; i < kMaxAllocators; i++) {
+      failed_[i] = true;
+      allocs_[i] = NULL;
+      names_[i] = NULL;
+    }
+  }
+  void SetChildAllocator(SysAllocator* alloc, unsigned int index,
+                         const char* name) {
+    if (index < kMaxAllocators && alloc != NULL) {
+      allocs_[index] = alloc;
+      failed_[index] = false;
+      names_[index] = name;
+    }
+  }
+  void* Alloc(size_t size, size_t *actual_size, size_t alignment);
+
+ private:
+  static const int kMaxAllocators = 2;
+  bool failed_[kMaxAllocators];
+  SysAllocator* allocs_[kMaxAllocators];
+  const char* names_[kMaxAllocators];
+};
+static char default_space[sizeof(DefaultSysAllocator)];
+static const char sbrk_name[] = "SbrkSysAllocator";
+static const char mmap_name[] = "MmapSysAllocator";
+
+
+void* SbrkSysAllocator::Alloc(size_t size, size_t *actual_size,
+                              size_t alignment) {
+#if !defined(HAVE_SBRK) || defined(__UCLIBC__)
+  return NULL;
+#else
+  // Check if we should use sbrk allocation.
+  // FLAGS_malloc_skip_sbrk starts out as false (its uninitialized
+  // state) and eventually gets initialized to the specified value.  Note
+  // that this code runs for a while before the flags are initialized.
+  // That means that even if this flag is set to true, some (initial)
+  // memory will be allocated with sbrk before the flag takes effect.
+  if (FLAGS_malloc_skip_sbrk) {
+    return NULL;
+  }
+
+  // sbrk will release memory if passed a negative number, so we do
+  // a strict check here
+  if (static_cast<ptrdiff_t>(size + alignment) < 0) return NULL;
+
+  // This doesn't overflow because TCMalloc_SystemAlloc has already
+  // tested for overflow at the alignment boundary.
+  size = ((size + alignment - 1) / alignment) * alignment;
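+  // (For example, size == 100000 with alignment == 4096 rounds up to
+  // 102400, the smallest multiple of the alignment covering the request.)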
+
+  // "actual_size" indicates that the bytes from the returned pointer
+  // p up to and including (p + actual_size - 1) have been allocated.
+  if (actual_size) {
+    *actual_size = size;
+  }
+
+  // Check that we're not asking for so much more memory that we'd
+  // wrap around the end of the virtual address space.  (This seems
+  // like something sbrk() should check for us, and indeed opensolaris
+  // does, but glibc does not:
+  //    http://src.opensolaris.org/source/xref/onnv/onnv-gate/usr/src/lib/libc/port/sys/sbrk.c?a=true
+  //    http://sourceware.org/cgi-bin/cvsweb.cgi/~checkout~/libc/misc/sbrk.c?rev=1.1.2.1&content-type=text/plain&cvsroot=glibc
+  // Without this check, sbrk may succeed when it ought to fail.)
+  if (reinterpret_cast<intptr_t>(sbrk(0)) + size < size) {
+    return NULL;
+  }
+
+  void* result = sbrk(size);
+  if (result == reinterpret_cast<void*>(-1)) {
+    return NULL;
+  }
+
+  // Is it aligned?
+  uintptr_t ptr = reinterpret_cast<uintptr_t>(result);
+  if ((ptr & (alignment-1)) == 0)  return result;
+
+  // Try to get more memory for alignment
+  size_t extra = alignment - (ptr & (alignment-1));
+  void* r2 = sbrk(extra);
+  if (reinterpret_cast<uintptr_t>(r2) == (ptr + size)) {
+    // Contiguous with previous result
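+    // (The combined region is now [ptr, ptr + size + extra), so ptr + extra
+    // is the first aligned address in it and still leaves "size" usable
+    // bytes.)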
+    return reinterpret_cast<void*>(ptr + extra);
+  }
+
+  // Give up and ask for "size + alignment - 1" bytes so
+  // that we can find an aligned region within it.
+  result = sbrk(size + alignment - 1);
+  if (result == reinterpret_cast<void*>(-1)) {
+    return NULL;
+  }
+  ptr = reinterpret_cast<uintptr_t>(result);
+  if ((ptr & (alignment-1)) != 0) {
+    ptr += alignment - (ptr & (alignment-1));
+  }
+  return reinterpret_cast<void*>(ptr);
+#endif  // HAVE_SBRK
+}
+
+void* MmapSysAllocator::Alloc(size_t size, size_t *actual_size,
+                              size_t alignment) {
+#ifndef HAVE_MMAP
+  return NULL;
+#else
+  // Check if we should use mmap allocation.
+  // FLAGS_malloc_skip_mmap starts out as false (its uninitialized
+  // state) and eventually gets initialized to the specified value.  Note
+  // that this code runs for a while before the flags are initialized.
+  // Chances are we never get here before the flags are initialized since
+  // sbrk is used until the heap is exhausted (before mmap is used).
+  if (FLAGS_malloc_skip_mmap) {
+    return NULL;
+  }
+
+  // Enforce page alignment
+  if (pagesize == 0) pagesize = getpagesize();
+  if (alignment < pagesize) alignment = pagesize;
+  size_t aligned_size = ((size + alignment - 1) / alignment) * alignment;
+  if (aligned_size < size) {
+    return NULL;
+  }
+  size = aligned_size;
+
+  // "actual_size" indicates that the bytes from the returned pointer
+  // p up to and including (p + actual_size - 1) have been allocated.
+  if (actual_size) {
+    *actual_size = size;
+  }
+
+  // Ask for extra memory if alignment > pagesize
+  size_t extra = 0;
+  if (alignment > pagesize) {
+    extra = alignment - pagesize;
+  }
+
+  // Note: size + extra does not overflow since:
+  //            size + alignment < (1<<NBITS).
+  // and        extra <= alignment
+  // therefore  size + extra < (1<<NBITS)
+  void* result = mmap(NULL, size + extra,
+                      PROT_READ|PROT_WRITE,
+                      MAP_PRIVATE|MAP_ANONYMOUS,
+                      -1, 0);
+  if (result == reinterpret_cast<void*>(MAP_FAILED)) {
+    return NULL;
+  }
+
+  // Adjust the return memory so it is aligned
+  uintptr_t ptr = reinterpret_cast<uintptr_t>(result);
+  size_t adjust = 0;
+  if ((ptr & (alignment - 1)) != 0) {
+    adjust = alignment - (ptr & (alignment - 1));
+  }
+
+  // Return the unused memory to the system
+  if (adjust > 0) {
+    munmap(reinterpret_cast<void*>(ptr), adjust);
+  }
+  if (adjust < extra) {
+    munmap(reinterpret_cast<void*>(ptr + adjust + size), extra - adjust);
+  }
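+  // (For example, with a 4 KiB pagesize and a 64 KiB alignment request,
+  // extra is 60 KiB; if mmap returns an address 16 KiB past a 64 KiB
+  // boundary, adjust is 48 KiB, so the leading 48 KiB and the trailing
+  // 12 KiB are unmapped, leaving an aligned "size"-byte region.)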
+
+  ptr += adjust;
+  return reinterpret_cast<void*>(ptr);
+#endif  // HAVE_MMAP
+}
+
+void* DevMemSysAllocator::Alloc(size_t size, size_t *actual_size,
+                                size_t alignment) {
+#ifndef HAVE_MMAP
+  return NULL;
+#else
+  static bool initialized = false;
+  static off_t physmem_base;  // next physical memory address to allocate
+  static off_t physmem_limit; // maximum physical address allowed
+  static int physmem_fd;      // file descriptor for /dev/mem
+
+  // Check if we should use /dev/mem allocation.  Note that it may take
+  // a while to get this flag initialized, so meanwhile we fall back to
+  // the next allocator.  (It looks like 7MB gets allocated before
+  // this flag gets initialized -khr.)
+  if (FLAGS_malloc_devmem_start == 0) {
+    // NOTE: not a devmem_failure - we'd like TCMalloc_SystemAlloc to
+    // try us again next time.
+    return NULL;
+  }
+
+  if (!initialized) {
+    physmem_fd = open("/dev/mem", O_RDWR);
+    if (physmem_fd < 0) {
+      return NULL;
+    }
+    physmem_base = FLAGS_malloc_devmem_start*1024LL*1024LL;
+    physmem_limit = FLAGS_malloc_devmem_limit*1024LL*1024LL;
+    initialized = true;
+  }
+
+  // Enforce page alignment
+  if (pagesize == 0) pagesize = getpagesize();
+  if (alignment < pagesize) alignment = pagesize;
+  size_t aligned_size = ((size + alignment - 1) / alignment) * alignment;
+  if (aligned_size < size) {
+    return NULL;
+  }
+  size = aligned_size;
+
+  // "actual_size" indicates that the bytes from the returned pointer
+  // p up to and including (p + actual_size - 1) have been allocated.
+  if (actual_size) {
+    *actual_size = size;
+  }
+
+  // Ask for extra memory if alignment > pagesize
+  size_t extra = 0;
+  if (alignment > pagesize) {
+    extra = alignment - pagesize;
+  }
+
+  // check to see if we have any memory left
+  if (physmem_limit != 0 &&
+      ((size + extra) > (physmem_limit - physmem_base))) {
+    return NULL;
+  }
+
+  // Note: size + extra does not overflow since:
+  //            size + alignment < (1<<NBITS).
+  // and        extra <= alignment
+  // therefore  size + extra < (1<<NBITS)
+  void *result = mmap(0, size + extra, PROT_WRITE|PROT_READ,
+                      MAP_SHARED, physmem_fd, physmem_base);
+  if (result == reinterpret_cast<void*>(MAP_FAILED)) {
+    return NULL;
+  }
+  uintptr_t ptr = reinterpret_cast<uintptr_t>(result);
+
+  // Adjust the return memory so it is aligned
+  size_t adjust = 0;
+  if ((ptr & (alignment - 1)) != 0) {
+    adjust = alignment - (ptr & (alignment - 1));
+  }
+
+  // Return the unused virtual memory to the system
+  if (adjust > 0) {
+    munmap(reinterpret_cast<void*>(ptr), adjust);
+  }
+  if (adjust < extra) {
+    munmap(reinterpret_cast<void*>(ptr + adjust + size), extra - adjust);
+  }
+
+  ptr += adjust;
+  physmem_base += adjust + size;
+
+  return reinterpret_cast<void*>(ptr);
+#endif  // HAVE_MMAP
+}
+
+void* DefaultSysAllocator::Alloc(size_t size, size_t *actual_size,
+                                 size_t alignment) {
+  for (int i = 0; i < kMaxAllocators; i++) {
+    if (!failed_[i] && allocs_[i] != NULL) {
+      void* result = allocs_[i]->Alloc(size, actual_size, alignment);
+      if (result != NULL) {
+        return result;
+      }
+      failed_[i] = true;
+    }
+  }
+  // After all the child allocators have failed, reset "failed_" to false so
+  // that a single failed allocation doesn't disable them permanently.
+  for (int i = 0; i < kMaxAllocators; i++) {
+    failed_[i] = false;
+  }
+  return NULL;
+}
+
+ATTRIBUTE_WEAK ATTRIBUTE_NOINLINE
+SysAllocator *tc_get_sysalloc_override(SysAllocator *def)
+{
+  return def;
+}
+
+static bool system_alloc_inited = false;
+void InitSystemAllocators(void) {
+  MmapSysAllocator *mmap = new (mmap_space) MmapSysAllocator();
+  SbrkSysAllocator *sbrk = new (sbrk_space) SbrkSysAllocator();
+
+  // In 64-bit debug mode, place the mmap allocator first since it
+  // allocates pointers that do not fit in 32 bits and therefore gives
+  // us better testing of code's 64-bit correctness.  It also leads to
+  // fewer false negatives in heap-checking code.  (Numbers are less
+  // likely to look like pointers and therefore the conservative gc in
+  // the heap-checker is less likely to misinterpret a number as a
+  // pointer).
+  DefaultSysAllocator *sdef = new (default_space) DefaultSysAllocator();
+  if (kDebugMode && sizeof(void*) > 4) {
+    sdef->SetChildAllocator(mmap, 0, mmap_name);
+    sdef->SetChildAllocator(sbrk, 1, sbrk_name);
+  } else {
+    sdef->SetChildAllocator(sbrk, 0, sbrk_name);
+    sdef->SetChildAllocator(mmap, 1, mmap_name);
+  }
+
+  sys_alloc = tc_get_sysalloc_override(sdef);
+}
+
+void* TCMalloc_SystemAlloc(size_t size, size_t *actual_size,
+                           size_t alignment) {
+  // Discard requests that overflow
+  if (size + alignment < size) return NULL;
+
+  SpinLockHolder lock_holder(&spinlock);
+
+  if (!system_alloc_inited) {
+    InitSystemAllocators();
+    system_alloc_inited = true;
+  }
+
+  // Enforce minimum alignment
+  if (alignment < sizeof(MemoryAligner)) alignment = sizeof(MemoryAligner);
+
+  size_t actual_size_storage;
+  if (actual_size == NULL) {
+    actual_size = &actual_size_storage;
+  }
+
+  void* result = sys_alloc->Alloc(size, actual_size, alignment);
+  if (result != NULL) {
+    CHECK_CONDITION(
+      CheckAddressBits<kAddressBits>(
+        reinterpret_cast<uintptr_t>(result) + *actual_size - 1));
+    TCMalloc_SystemTaken += *actual_size;
+  }
+  return result;
+}
+
+bool TCMalloc_SystemRelease(void* start, size_t length) {
+#ifdef MADV_FREE
+  if (FLAGS_malloc_devmem_start) {
+    // It's not safe to use MADV_FREE/MADV_DONTNEED if we've been
+    // mapping /dev/mem for heap memory.
+    return false;
+  }
+  if (FLAGS_malloc_disable_memory_release) return false;
+  if (pagesize == 0) pagesize = getpagesize();
+  const size_t pagemask = pagesize - 1;
+
+  size_t new_start = reinterpret_cast<size_t>(start);
+  size_t end = new_start + length;
+  size_t new_end = end;
+
+  // Round up the starting address and round down the ending address
+  // to be page aligned:
+  new_start = (new_start + pagesize - 1) & ~pagemask;
+  new_end = new_end & ~pagemask;
+
+  ASSERT((new_start & pagemask) == 0);
+  ASSERT((new_end & pagemask) == 0);
+  ASSERT(new_start >= reinterpret_cast<size_t>(start));
+  ASSERT(new_end <= end);
+
+  if (new_end > new_start) {
+    int result;
+    do {
+      result = madvise(reinterpret_cast<char*>(new_start),
+          new_end - new_start, MADV_FREE);
+    } while (result == -1 && errno == EAGAIN);
+
+    return result != -1;
+  }
+#endif
+  return false;
+}
+
+void TCMalloc_SystemCommit(void* start, size_t length) {
+  // Nothing to do here.  TCMalloc_SystemRelease does not alter pages
+  // such that they need to be re-committed before they can be used by the
+  // application.
+}
diff --git a/src/system-alloc.h b/src/system-alloc.h
new file mode 100644
index 0000000..8233f96
--- /dev/null
+++ b/src/system-alloc.h
@@ -0,0 +1,92 @@
+// -*- Mode: C++; c-basic-offset: 2; indent-tabs-mode: nil -*-
+// Copyright (c) 2005, Google Inc.
+// All rights reserved.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+// 
+//     * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+//     * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+// 
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+// ---
+// Author: Sanjay Ghemawat
+//
+// Routine that uses sbrk/mmap to allocate memory from the system.
+// Useful for implementing malloc.
+
+#ifndef TCMALLOC_SYSTEM_ALLOC_H_
+#define TCMALLOC_SYSTEM_ALLOC_H_
+
+#include <config.h>
+#include <stddef.h>                     // for size_t
+
+class SysAllocator;
+
+// REQUIRES: "alignment" is a power of two or "0" to indicate default alignment
+//
+// Allocate and return "N" bytes of zeroed memory.
+//
+// If actual_bytes is NULL then the returned memory is exactly the
+// requested size.  If actual_bytes is non-NULL then the allocator
+// may optionally return more bytes than asked for (i.e. return an
+// entire "huge" page if a huge page allocator is in use).
+//
+// The returned pointer is a multiple of "alignment" if non-zero. The
+// returned pointer will always be aligned suitably for holding a
+// void*, double, or size_t. In addition, if this platform defines
+// CACHELINE_ALIGNED, the returned pointer will always be cacheline
+// aligned.
+//
+// Returns NULL when out of memory.
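+//
+// A minimal usage sketch (the variable names here are illustrative only):
+//   size_t actual = 0;
+//   void* p = TCMalloc_SystemAlloc(1 << 20, &actual);  // ask for >= 1 MiB
+//   // On success, all of [p, p + actual) is usable; "actual" may exceed
+//   // the request.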
+extern PERFTOOLS_DLL_DECL
+void* TCMalloc_SystemAlloc(size_t bytes, size_t *actual_bytes,
+			   size_t alignment = 0);
+
+// This call is a hint to the operating system that the pages
+// contained in the specified range of memory will not be used for a
+// while, and can be released for use by other processes or the OS.
+// Pages which are released in this way may be destroyed (zeroed) by
+// the OS.  The benefit of this function is that it frees memory for
+// use by the system, the cost is that the pages are faulted back into
+// the address space next time they are touched, which can impact
+// performance.  (Only pages fully covered by the memory region will
+// be released, partial pages will not.)
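+//
+// For example, with 4 KiB pages, a request to release the byte range
+// [0x1100, 0x3F00) would release at most the single fully-covered page
+// [0x2000, 0x3000).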
+//
+// Returns false if release failed or not supported.
+extern PERFTOOLS_DLL_DECL
+bool TCMalloc_SystemRelease(void* start, size_t length);
+
+// Called to resurrect memory which has been previously released
+// to the system via TCMalloc_SystemRelease.  An attempt to
+// commit a page that is already committed does not cause this
+// function to fail.
+extern PERFTOOLS_DLL_DECL
+void TCMalloc_SystemCommit(void* start, size_t length);
+
+// The current system allocator.
+extern PERFTOOLS_DLL_DECL SysAllocator* sys_alloc;
+
+// Number of bytes taken from system.
+extern PERFTOOLS_DLL_DECL size_t TCMalloc_SystemTaken;
+
+#endif /* TCMALLOC_SYSTEM_ALLOC_H_ */
diff --git a/src/tcmalloc.cc b/src/tcmalloc.cc
new file mode 100644
index 0000000..b7d1913
--- /dev/null
+++ b/src/tcmalloc.cc
@@ -0,0 +1,1736 @@
+// -*- Mode: C++; c-basic-offset: 2; indent-tabs-mode: nil -*-
+// Copyright (c) 2005, Google Inc.
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+//     * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+//     * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+// ---
+// Author: Sanjay Ghemawat <opensource@google.com>
+//
+// A malloc that uses a per-thread cache to satisfy small malloc requests.
+// (The time for malloc/free of a small object drops from 300 ns to 50 ns.)
+//
+// See doc/tcmalloc.html for a high-level
+// description of how this malloc works.
+//
+// SYNCHRONIZATION
+//  1. The thread-specific lists are accessed without acquiring any locks.
+//     This is safe because each such list is only accessed by one thread.
+//  2. We have a lock per central free-list, and hold it while manipulating
+//     the central free list for a particular size.
+//  3. The central page allocator is protected by "pageheap_lock".
+//  4. The pagemap (which maps from page-number to descriptor),
+//     can be read without holding any locks, and written while holding
+//     the "pageheap_lock".
+//  5. To improve performance, a subset of the information one can get
+//     from the pagemap is cached in a data structure, pagemap_cache_,
+//     that atomically reads and writes its entries.  This cache can be
+//     read and written without locking.
+//
+//     This multi-threaded access to the pagemap is safe for fairly
+//     subtle reasons.  We basically assume that when an object X is
+//     allocated by thread A and deallocated by thread B, there must
+//     have been appropriate synchronization in the handoff of object
+//     X from thread A to thread B.  The same logic applies to pagemap_cache_.
+//
+// THE PAGEID-TO-SIZECLASS CACHE
+// Hot PageID-to-sizeclass mappings are held by pagemap_cache_.  If this cache
+// returns 0 for a particular PageID then that means "no information," not that
+// the sizeclass is 0.  The cache may have stale information for pages that do
+// not hold the beginning of any free()'able object.  Staleness is eliminated
+// in Populate() for pages with sizeclass > 0 objects, and in do_malloc() and
+// do_memalign() for all other relevant pages.
+//
+// PAGEMAP
+// -------
+// Page map contains a mapping from page id to Span.
+//
+// If Span s occupies pages [p..q],
+//      pagemap[p] == s
+//      pagemap[q] == s
+//      pagemap[p+1..q-1] are undefined
+//      pagemap[p-1] and pagemap[q+1] are defined:
+//         NULL if the corresponding page is not yet in the address space.
+//         Otherwise it points to a Span.  This span may be free
+//         or allocated.  If free, it is in one of pageheap's freelists.
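+//
+// For example, if a 3-page Span occupies pages [17..19], then pagemap[17]
+// and pagemap[19] both point at that Span, pagemap[18] is undefined, and
+// pagemap[16] / pagemap[20] are either NULL or point at neighboring Spans.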
+//
+// TODO: Bias reclamation to larger addresses
+// TODO: implement mallinfo/mallopt
+// TODO: Better testing
+//
+// 9/28/2003 (new page-level allocator replaces ptmalloc2):
+// * malloc/free of small objects goes from ~300 ns to ~50 ns.
+// * allocation of a reasonably complicated struct
+//   goes from about 1100 ns to about 300 ns.
+
+#include "config.h"
+#include <gperftools/tcmalloc.h>
+
+#include <errno.h>                      // for ENOMEM, EINVAL, errno
+#ifdef HAVE_SYS_CDEFS_H
+#include <sys/cdefs.h>                  // for __THROW
+#endif
+#if defined HAVE_STDINT_H
+#include <stdint.h>
+#elif defined HAVE_INTTYPES_H
+#include <inttypes.h>
+#else
+#include <sys/types.h>
+#endif
+#include <stddef.h>                     // for size_t, NULL
+#include <stdlib.h>                     // for getenv
+#include <string.h>                     // for strcmp, memset, strlen, etc
+#ifdef HAVE_UNISTD_H
+#include <unistd.h>                     // for getpagesize, write, etc
+#endif
+#include <algorithm>                    // for max, min
+#include <limits>                       // for numeric_limits
+#include <new>                          // for nothrow_t (ptr only), etc
+#include <vector>                       // for vector
+
+#include <gperftools/malloc_extension.h>
+#include <gperftools/malloc_hook.h>         // for MallocHook
+#include "base/basictypes.h"            // for int64
+#include "base/commandlineflags.h"      // for RegisterFlagValidator, etc
+#include "base/dynamic_annotations.h"   // for RunningOnValgrind
+#include "base/spinlock.h"              // for SpinLockHolder
+#include "central_freelist.h"  // for CentralFreeListPadded
+#include "common.h"            // for StackTrace, kPageShift, etc
+#include "internal_logging.h"  // for ASSERT, TCMalloc_Printer, etc
+#include "linked_list.h"       // for SLL_SetNext
+#include "malloc_hook-inl.h"       // for MallocHook::InvokeNewHook, etc
+#include "page_heap.h"         // for PageHeap, PageHeap::Stats
+#include "page_heap_allocator.h"  // for PageHeapAllocator
+#include "span.h"              // for Span, DLL_Prepend, etc
+#include "stack_trace_table.h"  // for StackTraceTable
+#include "static_vars.h"       // for Static
+#include "system-alloc.h"      // for DumpSystemAllocatorStats, etc
+#include "tcmalloc_guard.h"    // for TCMallocGuard
+#include "thread_cache.h"      // for ThreadCache
+
+#ifdef __clang__
+// clang's apparent focus on code size somehow causes it to ignore
+// normal inline directives, even for the few functions for which
+// inlining is key to performance.  To get the performance of clang's
+// generated code closer to normal, we force inlining via an attribute.
+#define ALWAYS_INLINE inline __attribute__((always_inline))
+#else
+#define ALWAYS_INLINE inline
+#endif
+
+#if (defined(_WIN32) && !defined(__CYGWIN__) && !defined(__CYGWIN32__)) && !defined(WIN32_OVERRIDE_ALLOCATORS)
+# define WIN32_DO_PATCHING 1
+#endif
+
+// Some windows file somewhere (at least on cygwin) #define's small (!)
+#undef small
+
+using STL_NAMESPACE::max;
+using STL_NAMESPACE::numeric_limits;
+using STL_NAMESPACE::vector;
+
+#include "libc_override.h"
+
+// __THROW is defined in glibc (via <sys/cdefs.h>).  It means,
+// counter-intuitively, "This function will never throw an exception."
+// It's an optional optimization tool, but we may need to use it to
+// match glibc prototypes.
+#ifndef __THROW    // I guess we're not on a glibc system
+# define __THROW   // __THROW is just an optimization, so ok to make it ""
+#endif
+
+using tcmalloc::AlignmentForSize;
+using tcmalloc::kLog;
+using tcmalloc::kCrash;
+using tcmalloc::kCrashWithStats;
+using tcmalloc::Log;
+using tcmalloc::PageHeap;
+using tcmalloc::PageHeapAllocator;
+using tcmalloc::SizeMap;
+using tcmalloc::Span;
+using tcmalloc::StackTrace;
+using tcmalloc::Static;
+using tcmalloc::ThreadCache;
+
+DECLARE_int64(tcmalloc_sample_parameter);
+DECLARE_double(tcmalloc_release_rate);
+
+// For windows, the printf we use to report large allocs is
+// potentially dangerous: it could cause a malloc that would cause an
+// infinite loop.  So by default we set the threshold to a huge number
+// on windows, so this bad situation will never trigger.  You can
+// always set TCMALLOC_LARGE_ALLOC_REPORT_THRESHOLD manually if you
+// want this functionality.
+#ifdef _WIN32
+const int64 kDefaultLargeAllocReportThreshold = static_cast<int64>(1) << 62;
+#else
+const int64 kDefaultLargeAllocReportThreshold = static_cast<int64>(1) << 30;
+#endif
+DEFINE_int64(tcmalloc_large_alloc_report_threshold,
+             EnvToInt64("TCMALLOC_LARGE_ALLOC_REPORT_THRESHOLD",
+                        kDefaultLargeAllocReportThreshold),
+             "Allocations larger than this value cause a stack "
+             "trace to be dumped to stderr.  The threshold for "
+             "dumping stack traces is increased by a factor of 1.125 "
+             "every time we print a message so that the threshold "
+             "automatically goes up by a factor of ~1000 every 60 "
+             "messages.  This bounds the amount of extra logging "
+             "generated by this flag.  Default value of this flag "
+             "is very large and therefore you should see no extra "
+             "logging unless the flag is overridden.  Set to 0 to "
+             "disable reporting entirely.");
+
+
+// We already declared these functions in tcmalloc.h, but we have to
+// declare them again to give them an ATTRIBUTE_SECTION: we want to
+// put all callers of MallocHook::Invoke* in this module into
+// ATTRIBUTE_SECTION(google_malloc) section, so that
+// MallocHook::GetCallerStackTrace can function accurately.
+#ifndef _WIN32   // windows doesn't have attribute_section, so don't bother
+extern "C" {
+  void* tc_malloc(size_t size) __THROW
+      ATTRIBUTE_SECTION(google_malloc);
+  void tc_free(void* ptr) __THROW
+      ATTRIBUTE_SECTION(google_malloc);
+  void* tc_realloc(void* ptr, size_t size) __THROW
+      ATTRIBUTE_SECTION(google_malloc);
+  void* tc_calloc(size_t nmemb, size_t size) __THROW
+      ATTRIBUTE_SECTION(google_malloc);
+  void tc_cfree(void* ptr) __THROW
+      ATTRIBUTE_SECTION(google_malloc);
+
+  void* tc_memalign(size_t __alignment, size_t __size) __THROW
+      ATTRIBUTE_SECTION(google_malloc);
+  int tc_posix_memalign(void** ptr, size_t align, size_t size) __THROW
+      ATTRIBUTE_SECTION(google_malloc);
+  void* tc_valloc(size_t __size) __THROW
+      ATTRIBUTE_SECTION(google_malloc);
+  void* tc_pvalloc(size_t __size) __THROW
+      ATTRIBUTE_SECTION(google_malloc);
+
+  void tc_malloc_stats(void) __THROW
+      ATTRIBUTE_SECTION(google_malloc);
+  int tc_mallopt(int cmd, int value) __THROW
+      ATTRIBUTE_SECTION(google_malloc);
+#ifdef HAVE_STRUCT_MALLINFO
+  struct mallinfo tc_mallinfo(void) __THROW
+      ATTRIBUTE_SECTION(google_malloc);
+#endif
+
+  void* tc_new(size_t size)
+      ATTRIBUTE_SECTION(google_malloc);
+  void tc_delete(void* p) __THROW
+      ATTRIBUTE_SECTION(google_malloc);
+  void* tc_newarray(size_t size)
+      ATTRIBUTE_SECTION(google_malloc);
+  void tc_deletearray(void* p) __THROW
+      ATTRIBUTE_SECTION(google_malloc);
+
+  // And the nothrow variants of these:
+  void* tc_new_nothrow(size_t size, const std::nothrow_t&) __THROW
+      ATTRIBUTE_SECTION(google_malloc);
+  void* tc_newarray_nothrow(size_t size, const std::nothrow_t&) __THROW
+      ATTRIBUTE_SECTION(google_malloc);
+  // Surprisingly, standard C++ library implementations use a
+  // nothrow-delete internally.  See, eg:
+  // http://www.dinkumware.com/manuals/?manual=compleat&page=new.html
+  void tc_delete_nothrow(void* ptr, const std::nothrow_t&) __THROW
+      ATTRIBUTE_SECTION(google_malloc);
+  void tc_deletearray_nothrow(void* ptr, const std::nothrow_t&) __THROW
+      ATTRIBUTE_SECTION(google_malloc);
+
+  // Some non-standard extensions that we support.
+
+  // This is equivalent to
+  //    OS X: malloc_size()
+  //    glibc: malloc_usable_size()
+  //    Windows: _msize()
+  size_t tc_malloc_size(void* p) __THROW
+      ATTRIBUTE_SECTION(google_malloc);
+}  // extern "C"
+#endif  // #ifndef _WIN32
+
+// ----------------------- IMPLEMENTATION -------------------------------
+
+static int tc_new_mode = 0;  // See tc_set_new_mode().
+
+// Routines such as free() and realloc() catch some erroneous pointers
+// passed to them, and invoke the below when they do.  (An erroneous pointer
+// won't be caught if it's within a valid span or a stale span for which
+// the pagemap cache has a non-zero sizeclass.) This is a cheap (source-editing
+// required) kind of exception handling for these routines.
+namespace {
+void InvalidFree(void* ptr) {
+  Log(kCrash, __FILE__, __LINE__, "Attempt to free invalid pointer", ptr);
+}
+
+size_t InvalidGetSizeForRealloc(const void* old_ptr) {
+  Log(kCrash, __FILE__, __LINE__,
+      "Attempt to realloc invalid pointer", old_ptr);
+  return 0;
+}
+
+size_t InvalidGetAllocatedSize(const void* ptr) {
+  Log(kCrash, __FILE__, __LINE__,
+      "Attempt to get the size of an invalid pointer", ptr);
+  return 0;
+}
+}  // unnamed namespace
+
+// Extract interesting stats
+struct TCMallocStats {
+  uint64_t thread_bytes;      // Bytes in thread caches
+  uint64_t central_bytes;     // Bytes in central cache
+  uint64_t transfer_bytes;    // Bytes in central transfer cache
+  uint64_t metadata_bytes;    // Bytes alloced for metadata
+  PageHeap::Stats pageheap;   // Stats from page heap
+};
+
+// Get stats into "r".  Also, if class_count != NULL, class_count[k]
+// will be set to the total number of objects of size class k in the
+// central cache, transfer cache, and per-thread caches. If small_spans
+// is non-NULL, it is filled.  Same for large_spans.
+static void ExtractStats(TCMallocStats* r, uint64_t* class_count,
+                         PageHeap::SmallSpanStats* small_spans,
+                         PageHeap::LargeSpanStats* large_spans) {
+  r->central_bytes = 0;
+  r->transfer_bytes = 0;
+  for (int cl = 0; cl < kNumClasses; ++cl) {
+    const int length = Static::central_cache()[cl].length();
+    const int tc_length = Static::central_cache()[cl].tc_length();
+    const size_t cache_overhead = Static::central_cache()[cl].OverheadBytes();
+    const size_t size = static_cast<uint64_t>(
+        Static::sizemap()->ByteSizeForClass(cl));
+    r->central_bytes += (size * length) + cache_overhead;
+    r->transfer_bytes += (size * tc_length);
+    if (class_count) {
+      // Sum the lengths of all per-class freelists, except the per-thread
+      // freelists, which get counted when we call GetThreadStats(), below.
+      class_count[cl] = length + tc_length;
+    }
+
+  }
+
+  // Add stats from per-thread heaps
+  r->thread_bytes = 0;
+  { // scope
+    SpinLockHolder h(Static::pageheap_lock());
+    ThreadCache::GetThreadStats(&r->thread_bytes, class_count);
+    r->metadata_bytes = tcmalloc::metadata_system_bytes();
+    r->pageheap = Static::pageheap()->stats();
+    if (small_spans != NULL) {
+      Static::pageheap()->GetSmallSpanStats(small_spans);
+    }
+    if (large_spans != NULL) {
+      Static::pageheap()->GetLargeSpanStats(large_spans);
+    }
+  }
+}
+
+static double PagesToMiB(uint64_t pages) {
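+  // (For example, if kPageShift is 13 -- i.e. 8 KiB tcmalloc pages -- then
+  // 128 pages convert to exactly 1.0 MiB.)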
+  return (pages << kPageShift) / 1048576.0;
+}
+
+// WRITE stats to "out"
+static void DumpStats(TCMalloc_Printer* out, int level) {
+  TCMallocStats stats;
+  uint64_t class_count[kNumClasses];
+  PageHeap::SmallSpanStats small;
+  PageHeap::LargeSpanStats large;
+  if (level >= 2) {
+    ExtractStats(&stats, class_count, &small, &large);
+  } else {
+    ExtractStats(&stats, NULL, NULL, NULL);
+  }
+
+  static const double MiB = 1048576.0;
+
+  const uint64_t virtual_memory_used = (stats.pageheap.system_bytes
+                                        + stats.metadata_bytes);
+  const uint64_t physical_memory_used = (virtual_memory_used
+                                         - stats.pageheap.unmapped_bytes);
+  const uint64_t bytes_in_use_by_app = (physical_memory_used
+                                        - stats.metadata_bytes
+                                        - stats.pageheap.free_bytes
+                                        - stats.central_bytes
+                                        - stats.transfer_bytes
+                                        - stats.thread_bytes);
+
+#ifdef TCMALLOC_SMALL_BUT_SLOW
+  out->printf(
+      "NOTE:  SMALL MEMORY MODEL IS IN USE, PERFORMANCE MAY SUFFER.\n");
+#endif
+  out->printf(
+      "------------------------------------------------\n"
+      "MALLOC:   %12" PRIu64 " (%7.1f MiB) Bytes in use by application\n"
+      "MALLOC: + %12" PRIu64 " (%7.1f MiB) Bytes in page heap freelist\n"
+      "MALLOC: + %12" PRIu64 " (%7.1f MiB) Bytes in central cache freelist\n"
+      "MALLOC: + %12" PRIu64 " (%7.1f MiB) Bytes in transfer cache freelist\n"
+      "MALLOC: + %12" PRIu64 " (%7.1f MiB) Bytes in thread cache freelists\n"
+      "MALLOC: + %12" PRIu64 " (%7.1f MiB) Bytes in malloc metadata\n"
+      "MALLOC:   ------------\n"
+      "MALLOC: = %12" PRIu64 " (%7.1f MiB) Actual memory used (physical + swap)\n"
+      "MALLOC: + %12" PRIu64 " (%7.1f MiB) Bytes released to OS (aka unmapped)\n"
+      "MALLOC:   ------------\n"
+      "MALLOC: = %12" PRIu64 " (%7.1f MiB) Virtual address space used\n"
+      "MALLOC:\n"
+      "MALLOC:   %12" PRIu64 "              Spans in use\n"
+      "MALLOC:   %12" PRIu64 "              Thread heaps in use\n"
+      "MALLOC:   %12" PRIu64 "              Tcmalloc page size\n"
+      "------------------------------------------------\n"
+      "Call ReleaseFreeMemory() to release freelist memory to the OS"
+      " (via madvise()).\n"
+      "Bytes released to the OS take up virtual address space"
+      " but no physical memory.\n",
+      bytes_in_use_by_app, bytes_in_use_by_app / MiB,
+      stats.pageheap.free_bytes, stats.pageheap.free_bytes / MiB,
+      stats.central_bytes, stats.central_bytes / MiB,
+      stats.transfer_bytes, stats.transfer_bytes / MiB,
+      stats.thread_bytes, stats.thread_bytes / MiB,
+      stats.metadata_bytes, stats.metadata_bytes / MiB,
+      physical_memory_used, physical_memory_used / MiB,
+      stats.pageheap.unmapped_bytes, stats.pageheap.unmapped_bytes / MiB,
+      virtual_memory_used, virtual_memory_used / MiB,
+      uint64_t(Static::span_allocator()->inuse()),
+      uint64_t(ThreadCache::HeapsInUse()),
+      uint64_t(kPageSize));
+
+  if (level >= 2) {
+    out->printf("------------------------------------------------\n");
+    out->printf("Total size of freelists for per-thread caches,\n");
+    out->printf("transfer cache, and central cache, by size class\n");
+    out->printf("------------------------------------------------\n");
+    uint64_t cumulative = 0;
+    for (int cl = 0; cl < kNumClasses; ++cl) {
+      if (class_count[cl] > 0) {
+        uint64_t class_bytes =
+            class_count[cl] * Static::sizemap()->ByteSizeForClass(cl);
+        cumulative += class_bytes;
+        out->printf("class %3d [ %8" PRIuS " bytes ] : "
+                "%8" PRIu64 " objs; %5.1f MiB; %5.1f cum MiB\n",
+                cl, Static::sizemap()->ByteSizeForClass(cl),
+                class_count[cl],
+                class_bytes / MiB,
+                cumulative / MiB);
+      }
+    }
+
+    // append page heap info
+    int nonempty_sizes = 0;
+    for (int s = 0; s < kMaxPages; s++) {
+      if (small.normal_length[s] + small.returned_length[s] > 0) {
+        nonempty_sizes++;
+      }
+    }
+    out->printf("------------------------------------------------\n");
+    out->printf("PageHeap: %d sizes; %6.1f MiB free; %6.1f MiB unmapped\n",
+                nonempty_sizes, stats.pageheap.free_bytes / MiB,
+                stats.pageheap.unmapped_bytes / MiB);
+    out->printf("------------------------------------------------\n");
+    uint64_t total_normal = 0;
+    uint64_t total_returned = 0;
+    for (int s = 0; s < kMaxPages; s++) {
+      const int n_length = small.normal_length[s];
+      const int r_length = small.returned_length[s];
+      if (n_length + r_length > 0) {
+        uint64_t n_pages = s * n_length;
+        uint64_t r_pages = s * r_length;
+        total_normal += n_pages;
+        total_returned += r_pages;
+        out->printf("%6u pages * %6u spans ~ %6.1f MiB; %6.1f MiB cum"
+                    "; unmapped: %6.1f MiB; %6.1f MiB cum\n",
+                    s,
+                    (n_length + r_length),
+                    PagesToMiB(n_pages + r_pages),
+                    PagesToMiB(total_normal + total_returned),
+                    PagesToMiB(r_pages),
+                    PagesToMiB(total_returned));
+      }
+    }
+
+    total_normal += large.normal_pages;
+    total_returned += large.returned_pages;
+    out->printf(">255   large * %6u spans ~ %6.1f MiB; %6.1f MiB cum"
+                "; unmapped: %6.1f MiB; %6.1f MiB cum\n",
+                static_cast<unsigned int>(large.spans),
+                PagesToMiB(large.normal_pages + large.returned_pages),
+                PagesToMiB(total_normal + total_returned),
+                PagesToMiB(large.returned_pages),
+                PagesToMiB(total_returned));
+  }
+}
+
+static void PrintStats(int level) {
+  const int kBufferSize = 16 << 10;
+  char* buffer = new char[kBufferSize];
+  TCMalloc_Printer printer(buffer, kBufferSize);
+  DumpStats(&printer, level);
+  write(STDERR_FILENO, buffer, strlen(buffer));
+  delete[] buffer;
+}
+
+static void** DumpHeapGrowthStackTraces() {
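+  // The returned buffer is a flat array of machine words.  Each growth trace
+  // is encoded as a count word (always 1 here), the allocation size, the
+  // stack depth, and then "depth" program counters; a single 0 word
+  // terminates the list.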
+  // Count how much space we need
+  int needed_slots = 0;
+  {
+    SpinLockHolder h(Static::pageheap_lock());
+    for (StackTrace* t = Static::growth_stacks();
+         t != NULL;
+         t = reinterpret_cast<StackTrace*>(
+             t->stack[tcmalloc::kMaxStackDepth-1])) {
+      needed_slots += 3 + t->depth;
+    }
+    needed_slots += 100;            // Slop in case list grows
+    needed_slots += needed_slots/8; // An extra 12.5% slop
+  }
+
+  void** result = new void*[needed_slots];
+  if (result == NULL) {
+    Log(kLog, __FILE__, __LINE__,
+        "tcmalloc: allocation failed for stack trace slots",
+        needed_slots * sizeof(*result));
+    return NULL;
+  }
+
+  SpinLockHolder h(Static::pageheap_lock());
+  int used_slots = 0;
+  for (StackTrace* t = Static::growth_stacks();
+       t != NULL;
+       t = reinterpret_cast<StackTrace*>(
+           t->stack[tcmalloc::kMaxStackDepth-1])) {
+    ASSERT(used_slots < needed_slots);  // Need to leave room for terminator
+    if (used_slots + 3 + t->depth >= needed_slots) {
+      // No more room
+      break;
+    }
+
+    result[used_slots+0] = reinterpret_cast<void*>(static_cast<uintptr_t>(1));
+    result[used_slots+1] = reinterpret_cast<void*>(t->size);
+    result[used_slots+2] = reinterpret_cast<void*>(t->depth);
+    for (int d = 0; d < t->depth; d++) {
+      result[used_slots+3+d] = t->stack[d];
+    }
+    used_slots += 3 + t->depth;
+  }
+  result[used_slots] = reinterpret_cast<void*>(static_cast<uintptr_t>(0));
+  return result;
+}
+
+static void IterateOverRanges(void* arg, MallocExtension::RangeFunction func) {
+  PageID page = 1;  // Some code may assume that page==0 is never used
+  bool done = false;
+  while (!done) {
+    // Accumulate a small number of ranges in a local buffer
+    static const int kNumRanges = 16;
+    static base::MallocRange ranges[kNumRanges];
+    int n = 0;
+    {
+      SpinLockHolder h(Static::pageheap_lock());
+      while (n < kNumRanges) {
+        if (!Static::pageheap()->GetNextRange(page, &ranges[n])) {
+          done = true;
+          break;
+        } else {
+          uintptr_t limit = ranges[n].address + ranges[n].length;
+          page = (limit + kPageSize - 1) >> kPageShift;
+          n++;
+        }
+      }
+    }
+
+    for (int i = 0; i < n; i++) {
+      (*func)(arg, &ranges[i]);
+    }
+  }
+}
+
+// TCMalloc's support for extra malloc interfaces
+class TCMallocImplementation : public MallocExtension {
+ private:
+  // ReleaseToSystem() might release more than the requested bytes because
+  // the page heap releases at the span granularity, and spans are of wildly
+  // different sizes.  This member keeps track of the extra bytes
+  // released so that the app can periodically call ReleaseToSystem() to
+  // release memory at a constant rate.
+  // NOTE: Protected by Static::pageheap_lock().
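+  // (For example, if a caller asks to release 256 KiB but freeing a whole
+  // span returns 1 MiB to the OS, the surplus 768 KiB is recorded here and
+  // subtracted from subsequent ReleaseToSystem() requests.)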
+  size_t extra_bytes_released_;
+
+ public:
+  TCMallocImplementation()
+      : extra_bytes_released_(0) {
+  }
+
+  virtual void GetStats(char* buffer, int buffer_length) {
+    ASSERT(buffer_length > 0);
+    TCMalloc_Printer printer(buffer, buffer_length);
+
+    // Print level one stats unless lots of space is available
+    if (buffer_length < 10000) {
+      DumpStats(&printer, 1);
+    } else {
+      DumpStats(&printer, 2);
+    }
+  }
+
+  // We may print an extra, tcmalloc-specific warning message here.
+  virtual void GetHeapSample(MallocExtensionWriter* writer) {
+    if (FLAGS_tcmalloc_sample_parameter == 0) {
+      const char* const kWarningMsg =
+          "%warn\n"
+          "%warn This heap profile does not have any data in it, because\n"
+          "%warn the application was run with heap sampling turned off.\n"
+          "%warn To get useful data from GetHeapSample(), you must\n"
+          "%warn set the environment variable TCMALLOC_SAMPLE_PARAMETER to\n"
+          "%warn a positive sampling period, such as 524288.\n"
+          "%warn\n";
+      writer->append(kWarningMsg, strlen(kWarningMsg));
+    }
+    MallocExtension::GetHeapSample(writer);
+  }
+
+  virtual void** ReadStackTraces(int* sample_period) {
+    tcmalloc::StackTraceTable table;
+    {
+      SpinLockHolder h(Static::pageheap_lock());
+      Span* sampled = Static::sampled_objects();
+      for (Span* s = sampled->next; s != sampled; s = s->next) {
+        table.AddTrace(*reinterpret_cast<StackTrace*>(s->objects));
+      }
+    }
+    *sample_period = ThreadCache::GetCache()->GetSamplePeriod();
+    return table.ReadStackTracesAndClear(); // grabs and releases pageheap_lock
+  }
+
+  virtual void** ReadHeapGrowthStackTraces() {
+    return DumpHeapGrowthStackTraces();
+  }
+
+  virtual void Ranges(void* arg, RangeFunction func) {
+    IterateOverRanges(arg, func);
+  }
+
+  virtual bool GetNumericProperty(const char* name, size_t* value) {
+    ASSERT(name != NULL);
+
+    if (strcmp(name, "generic.current_allocated_bytes") == 0) {
+      TCMallocStats stats;
+      ExtractStats(&stats, NULL, NULL, NULL);
+      *value = stats.pageheap.system_bytes
+               - stats.thread_bytes
+               - stats.central_bytes
+               - stats.transfer_bytes
+               - stats.pageheap.free_bytes
+               - stats.pageheap.unmapped_bytes;
+      return true;
+    }
+
+    if (strcmp(name, "generic.heap_size") == 0) {
+      TCMallocStats stats;
+      ExtractStats(&stats, NULL, NULL, NULL);
+      *value = stats.pageheap.system_bytes;
+      return true;
+    }
+
+    if (strcmp(name, "tcmalloc.slack_bytes") == 0) {
+      // Kept for backwards compatibility.  Now defined externally as:
+      //    pageheap_free_bytes + pageheap_unmapped_bytes.
+      SpinLockHolder l(Static::pageheap_lock());
+      PageHeap::Stats stats = Static::pageheap()->stats();
+      *value = stats.free_bytes + stats.unmapped_bytes;
+      return true;
+    }
+
+    if (strcmp(name, "tcmalloc.central_cache_free_bytes") == 0) {
+      TCMallocStats stats;
+      ExtractStats(&stats, NULL, NULL, NULL);
+      *value = stats.central_bytes;
+      return true;
+    }
+
+    if (strcmp(name, "tcmalloc.transfer_cache_free_bytes") == 0) {
+      TCMallocStats stats;
+      ExtractStats(&stats, NULL, NULL, NULL);
+      *value = stats.transfer_bytes;
+      return true;
+    }
+
+    if (strcmp(name, "tcmalloc.thread_cache_free_bytes") == 0) {
+      TCMallocStats stats;
+      ExtractStats(&stats, NULL, NULL, NULL);
+      *value = stats.thread_bytes;
+      return true;
+    }
+
+    if (strcmp(name, "tcmalloc.pageheap_free_bytes") == 0) {
+      SpinLockHolder l(Static::pageheap_lock());
+      *value = Static::pageheap()->stats().free_bytes;
+      return true;
+    }
+
+    if (strcmp(name, "tcmalloc.pageheap_unmapped_bytes") == 0) {
+      SpinLockHolder l(Static::pageheap_lock());
+      *value = Static::pageheap()->stats().unmapped_bytes;
+      return true;
+    }
+
+    if (strcmp(name, "tcmalloc.max_total_thread_cache_bytes") == 0) {
+      SpinLockHolder l(Static::pageheap_lock());
+      *value = ThreadCache::overall_thread_cache_size();
+      return true;
+    }
+
+    if (strcmp(name, "tcmalloc.current_total_thread_cache_bytes") == 0) {
+      TCMallocStats stats;
+      ExtractStats(&stats, NULL, NULL, NULL);
+      *value = stats.thread_bytes;
+      return true;
+    }
+
+    if (strcmp(name, "tcmalloc.aggressive_memory_decommit") == 0) {
+      *value = size_t(Static::pageheap()->GetAggressiveDecommit());
+      return true;
+    }
+
+    return false;
+  }
+
+  virtual bool SetNumericProperty(const char* name, size_t value) {
+    ASSERT(name != NULL);
+
+    if (strcmp(name, "tcmalloc.max_total_thread_cache_bytes") == 0) {
+      SpinLockHolder l(Static::pageheap_lock());
+      ThreadCache::set_overall_thread_cache_size(value);
+      return true;
+    }
+
+    if (strcmp(name, "tcmalloc.aggressive_memory_decommit") == 0) {
+      Static::pageheap()->SetAggressiveDecommit(value != 0);
+      return true;
+    }
+
+    return false;
+  }
+
+  virtual void MarkThreadIdle() {
+    ThreadCache::BecomeIdle();
+  }
+
+  virtual void MarkThreadBusy();  // Implemented below
+
+  virtual SysAllocator* GetSystemAllocator() {
+    SpinLockHolder h(Static::pageheap_lock());
+    return sys_alloc;
+  }
+
+  virtual void SetSystemAllocator(SysAllocator* alloc) {
+    SpinLockHolder h(Static::pageheap_lock());
+    sys_alloc = alloc;
+  }
+
+  virtual void ReleaseToSystem(size_t num_bytes) {
+    SpinLockHolder h(Static::pageheap_lock());
+    if (num_bytes <= extra_bytes_released_) {
+      // We released too much on a prior call, so don't release any
+      // more this time.
+      extra_bytes_released_ = extra_bytes_released_ - num_bytes;
+      return;
+    }
+    num_bytes = num_bytes - extra_bytes_released_;
+    // num_bytes might be less than one page.  If we pass zero to
+    // ReleaseAtLeastNPages, it won't do anything, so we release a whole
+    // page now and let extra_bytes_released_ smooth it out over time.
+    Length num_pages = max<Length>(num_bytes >> kPageShift, 1);
+    size_t bytes_released = Static::pageheap()->ReleaseAtLeastNPages(
+        num_pages) << kPageShift;
+    if (bytes_released > num_bytes) {
+      extra_bytes_released_ = bytes_released - num_bytes;
+    } else {
+      // The PageHeap wasn't able to release num_bytes.  Don't try to
+      // compensate with a big release next time.  Specifically,
+      // ReleaseFreeMemory() calls ReleaseToSystem(LONG_MAX).
+      extra_bytes_released_ = 0;
+    }
+  }
+
+  virtual void SetMemoryReleaseRate(double rate) {
+    FLAGS_tcmalloc_release_rate = rate;
+  }
+
+  virtual double GetMemoryReleaseRate() {
+    return FLAGS_tcmalloc_release_rate;
+  }
+  virtual size_t GetEstimatedAllocatedSize(size_t size) {
+    if (size <= kMaxSize) {
+      const size_t cl = Static::sizemap()->SizeClass(size);
+      const size_t alloc_size = Static::sizemap()->ByteSizeForClass(cl);
+      return alloc_size;
+    } else {
+      return tcmalloc::pages(size) << kPageShift;
+    }
+  }
+
+  // This just calls GetSizeWithCallback, but because that's in an
+  // unnamed namespace, we need to move the definition below it in the
+  // file.
+  virtual size_t GetAllocatedSize(const void* ptr);
+
+  // This duplicates some of the logic in GetSizeWithCallback, but is
+  // faster.  This is important on OS X, where this function is called
+  // on every allocation operation.
+  virtual Ownership GetOwnership(const void* ptr) {
+    const PageID p = reinterpret_cast<uintptr_t>(ptr) >> kPageShift;
+    // The rest of tcmalloc assumes that all allocated pointers use at
+    // most kAddressBits bits.  If ptr doesn't, then it definitely
+    // wasn't allocated by tcmalloc.
+    if ((p >> (kAddressBits - kPageShift)) > 0) {
+      return kNotOwned;
+    }
+    size_t cl = Static::pageheap()->GetSizeClassIfCached(p);
+    if (cl != 0) {
+      return kOwned;
+    }
+    const Span *span = Static::pageheap()->GetDescriptor(p);
+    return span ? kOwned : kNotOwned;
+  }
+
+  virtual void GetFreeListSizes(vector<MallocExtension::FreeListInfo>* v) {
+    static const char* kCentralCacheType = "tcmalloc.central";
+    static const char* kTransferCacheType = "tcmalloc.transfer";
+    static const char* kThreadCacheType = "tcmalloc.thread";
+    static const char* kPageHeapType = "tcmalloc.page";
+    static const char* kPageHeapUnmappedType = "tcmalloc.page_unmapped";
+    static const char* kLargeSpanType = "tcmalloc.large";
+    static const char* kLargeUnmappedSpanType = "tcmalloc.large_unmapped";
+
+    v->clear();
+
+    // central class information
+    int64 prev_class_size = 0;
+    for (int cl = 1; cl < kNumClasses; ++cl) {
+      size_t class_size = Static::sizemap()->ByteSizeForClass(cl);
+      MallocExtension::FreeListInfo i;
+      i.min_object_size = prev_class_size + 1;
+      i.max_object_size = class_size;
+      i.total_bytes_free =
+          Static::central_cache()[cl].length() * class_size;
+      i.type = kCentralCacheType;
+      v->push_back(i);
+
+      // transfer cache
+      i.total_bytes_free =
+          Static::central_cache()[cl].tc_length() * class_size;
+      i.type = kTransferCacheType;
+      v->push_back(i);
+
+      prev_class_size = Static::sizemap()->ByteSizeForClass(cl);
+    }
+
+    // Add stats from per-thread heaps
+    uint64_t class_count[kNumClasses];
+    memset(class_count, 0, sizeof(class_count));
+    {
+      SpinLockHolder h(Static::pageheap_lock());
+      uint64_t thread_bytes = 0;
+      ThreadCache::GetThreadStats(&thread_bytes, class_count);
+    }
+
+    prev_class_size = 0;
+    for (int cl = 1; cl < kNumClasses; ++cl) {
+      MallocExtension::FreeListInfo i;
+      i.min_object_size = prev_class_size + 1;
+      i.max_object_size = Static::sizemap()->ByteSizeForClass(cl);
+      i.total_bytes_free =
+          class_count[cl] * Static::sizemap()->ByteSizeForClass(cl);
+      i.type = kThreadCacheType;
+      v->push_back(i);
+    }
+
+    // append page heap info
+    PageHeap::SmallSpanStats small;
+    PageHeap::LargeSpanStats large;
+    {
+      SpinLockHolder h(Static::pageheap_lock());
+      Static::pageheap()->GetSmallSpanStats(&small);
+      Static::pageheap()->GetLargeSpanStats(&large);
+    }
+
+    // large spans: mapped
+    MallocExtension::FreeListInfo span_info;
+    span_info.type = kLargeSpanType;
+    span_info.max_object_size = (numeric_limits<size_t>::max)();
+    span_info.min_object_size = kMaxPages << kPageShift;
+    span_info.total_bytes_free = large.normal_pages << kPageShift;
+    v->push_back(span_info);
+
+    // large spans: unmapped
+    span_info.type = kLargeUnmappedSpanType;
+    span_info.total_bytes_free = large.returned_pages << kPageShift;
+    v->push_back(span_info);
+
+    // small spans
+    for (int s = 1; s < kMaxPages; s++) {
+      MallocExtension::FreeListInfo i;
+      i.max_object_size = (s << kPageShift);
+      i.min_object_size = ((s - 1) << kPageShift);
+
+      i.type = kPageHeapType;
+      i.total_bytes_free = (s << kPageShift) * small.normal_length[s];
+      v->push_back(i);
+
+      i.type = kPageHeapUnmappedType;
+      i.total_bytes_free = (s << kPageShift) * small.returned_length[s];
+      v->push_back(i);
+    }
+  }
+};
+
+// The constructor allocates an object to ensure that initialization
+// runs before main(), and therefore we do not have a chance to become
+// multi-threaded before initialization.  We also create the TSD key
+// here.  Presumably by the time this constructor runs, glibc is in
+// good enough shape to handle pthread_key_create().
+//
+// The constructor also takes the opportunity to tell STL to use
+// tcmalloc.  We want to do this early, before construct time, so
+// all user STL allocations go through tcmalloc (which works really
+// well for STL).
+//
+// The destructor prints stats when the program exits.
+static int tcmallocguard_refcount = 0;  // no lock needed: runs before main()
+TCMallocGuard::TCMallocGuard() {
+  if (tcmallocguard_refcount++ == 0) {
+    ReplaceSystemAlloc();    // defined in libc_override_*.h
+    tc_free(tc_malloc(1));
+    ThreadCache::InitTSD();
+    tc_free(tc_malloc(1));
+    // Either we, or debugallocation.cc, or valgrind will control memory
+    // management.  We register our extension if we're the winner.
+#ifdef TCMALLOC_USING_DEBUGALLOCATION
+    // Let debugallocation register its extension.
+#else
+    if (RunningOnValgrind()) {
+      // Let Valgrind use its own malloc (so don't register our extension).
+    } else {
+      MallocExtension::Register(new TCMallocImplementation);
+    }
+#endif
+  }
+}
+
+TCMallocGuard::~TCMallocGuard() {
+  if (--tcmallocguard_refcount == 0) {
+    const char* env = NULL;
+    if (!RunningOnValgrind()) {
+      // Valgrind uses its own malloc, so we cannot do MALLOCSTATS.
+      env = getenv("MALLOCSTATS");
+    }
+    if (env != NULL) {
+      int level = atoi(env);
+      if (level < 1) level = 1;
+      PrintStats(level);
+    }
+  }
+}
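+
+// Note on MALLOCSTATS (an illustrative aside, not upstream documentation):
+// given the destructor above, running a tcmalloc-linked binary with, e.g.,
+//   MALLOCSTATS=2 ./my_program
+// prints level-2 allocator statistics at exit; values below 1 are clamped
+// to 1, and the variable is ignored when running under Valgrind.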
+#ifndef WIN32_OVERRIDE_ALLOCATORS
+static TCMallocGuard module_enter_exit_hook;
+#endif
+
+//-------------------------------------------------------------------
+// Helpers for the exported routines below
+//-------------------------------------------------------------------
+
+static inline bool CheckCachedSizeClass(void *ptr) {
+  PageID p = reinterpret_cast<uintptr_t>(ptr) >> kPageShift;
+  size_t cached_value = Static::pageheap()->GetSizeClassIfCached(p);
+  return cached_value == 0 ||
+      cached_value == Static::pageheap()->GetDescriptor(p)->sizeclass;
+}
+
+static inline void* CheckedMallocResult(void *result) {
+  ASSERT(result == NULL || CheckCachedSizeClass(result));
+  return result;
+}
+
+static inline void* SpanToMallocResult(Span *span) {
+  Static::pageheap()->CacheSizeClass(span->start, 0);
+  return
+      CheckedMallocResult(reinterpret_cast<void*>(span->start << kPageShift));
+}
+
+static void* DoSampledAllocation(size_t size) {
+  // Grab the stack trace outside the heap lock
+  StackTrace tmp;
+  tmp.depth = GetStackTrace(tmp.stack, tcmalloc::kMaxStackDepth, 1);
+  tmp.size = size;
+
+  SpinLockHolder h(Static::pageheap_lock());
+  // Allocate span
+  Span *span = Static::pageheap()->New(tcmalloc::pages(size == 0 ? 1 : size));
+  if (UNLIKELY(span == NULL)) {
+    return NULL;
+  }
+
+  // Allocate stack trace
+  StackTrace *stack = Static::stacktrace_allocator()->New();
+  if (UNLIKELY(stack == NULL)) {
+    // Sampling failed because of lack of memory
+    return SpanToMallocResult(span);
+  }
+  *stack = tmp;
+  span->sample = 1;
+  span->objects = stack;
+  tcmalloc::DLL_Prepend(Static::sampled_objects(), span);
+
+  return SpanToMallocResult(span);
+}
+
+namespace {
+
+typedef void* (*malloc_fn)(void *arg);
+
+SpinLock set_new_handler_lock(SpinLock::LINKER_INITIALIZED);
+
+void* handle_oom(malloc_fn retry_fn,
+                 void* retry_arg,
+                 bool from_operator,
+                 bool nothrow) {
+  if (!from_operator && !tc_new_mode) {
+    // We're out of memory in a C library function (malloc etc.) and no
+    // "new mode" is forced on us.  Just return NULL.
+    return NULL;
+  }
+  // We're OOM in operator new, or "new mode" is set.  We might have to
+  // call new_handler and maybe retry the allocation.
+
+  for (;;) {
+    // Get the current new handler.  NB: this function is not
+    // thread-safe.  We make a feeble stab at making it so here, but
+    // this lock only protects against tcmalloc interfering with
+    // itself, not with other libraries calling set_new_handler.
+    std::new_handler nh;
+    {
+      SpinLockHolder h(&set_new_handler_lock);
+      nh = std::set_new_handler(0);
+      (void) std::set_new_handler(nh);
+    }
+#if (defined(__GNUC__) && !defined(__EXCEPTIONS)) || (defined(_HAS_EXCEPTIONS) && !_HAS_EXCEPTIONS)
+    if (!nh) {
+      return NULL;
+    }
+    // Since exceptions are disabled, we don't really know if new_handler
+    // failed.  Assume it will abort if it fails.
+    (*nh)();
+#else
+    // If no new_handler is established, the allocation failed.
+    if (!nh) {
+      if (nothrow) {
+        return NULL;
+      }
+      throw std::bad_alloc();
+    }
+    // Otherwise, try the new_handler.  If it returns, retry the
+    // allocation.  If it throws std::bad_alloc, fail the allocation.
+    // If it throws something else, don't interfere.
+    try {
+      (*nh)();
+    } catch (const std::bad_alloc&) {
+      if (!nothrow) throw;
+      return NULL;
+    }
+#endif  // (defined(__GNUC__) && !defined(__EXCEPTIONS)) || (defined(_HAS_EXCEPTIONS) && !_HAS_EXCEPTIONS)
+
+    // We get here if new_handler returned successfully, so we retry the
+    // allocation.
+    void* rv = retry_fn(retry_arg);
+    if (rv != NULL) {
+      return rv;
+    }
+
+    // If the allocation failed again, go to the next loop iteration.
+  }
+}
+
+// Copy of FLAGS_tcmalloc_large_alloc_report_threshold with
+// automatic increases factored in.
+static int64_t large_alloc_threshold =
+  (kPageSize > FLAGS_tcmalloc_large_alloc_report_threshold
+   ? kPageSize : FLAGS_tcmalloc_large_alloc_report_threshold);
+
+static void ReportLargeAlloc(Length num_pages, void* result) {
+  StackTrace stack;
+  stack.depth = GetStackTrace(stack.stack, tcmalloc::kMaxStackDepth, 1);
+
+  static const int N = 1000;
+  char buffer[N];
+  TCMalloc_Printer printer(buffer, N);
+  printer.printf("tcmalloc: large alloc %" PRIu64 " bytes == %p @ ",
+                 static_cast<uint64>(num_pages) << kPageShift,
+                 result);
+  for (int i = 0; i < stack.depth; i++) {
+    printer.printf(" %p", stack.stack[i]);
+  }
+  printer.printf("\n");
+  write(STDERR_FILENO, buffer, strlen(buffer));
+}
+
+void* do_memalign(size_t align, size_t size);
+
+struct retry_memalign_data {
+  size_t align;
+  size_t size;
+};
+
+static void *retry_do_memalign(void *arg) {
+  retry_memalign_data *data = static_cast<retry_memalign_data *>(arg);
+  return do_memalign(data->align, data->size);
+}
+
+static void *maybe_do_cpp_memalign_slow(size_t align, size_t size) {
+  retry_memalign_data data;
+  data.align = align;
+  data.size = size;
+  return handle_oom(retry_do_memalign, &data,
+                    false, true);
+}
+
+inline void* do_memalign_or_cpp_memalign(size_t align, size_t size) {
+  void *rv = do_memalign(align, size);
+  if (LIKELY(rv != NULL)) {
+    return rv;
+  }
+  return maybe_do_cpp_memalign_slow(align, size);
+}
+
+// Must be called with the page lock held.
+inline bool should_report_large(Length num_pages) {
+  const int64 threshold = large_alloc_threshold;
+  if (threshold > 0 && num_pages >= (threshold >> kPageShift)) {
+    // Increase the threshold by 1/8 every time we generate a report.
+    // We cap the threshold at 8GiB to avoid overflow problems.
+    large_alloc_threshold = (threshold + threshold/8 < 8ll<<30
+                             ? threshold + threshold/8 : 8ll<<30);
+    return true;
+  }
+  return false;
+}
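+// Illustration (not part of the original source): if the threshold is
+// currently 1 GiB, the first report raises it to 1.125 GiB, the next to
+// roughly 1.27 GiB, and so on, until it saturates at the 8 GiB cap above.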
+
+// Helper for do_malloc().
+inline void* do_malloc_pages(ThreadCache* heap, size_t size) {
+  void* result;
+  bool report_large;
+
+  Length num_pages = tcmalloc::pages(size);
+  size = num_pages << kPageShift;
+
+  if ((FLAGS_tcmalloc_sample_parameter > 0) && heap->SampleAllocation(size)) {
+    result = DoSampledAllocation(size);
+
+    SpinLockHolder h(Static::pageheap_lock());
+    report_large = should_report_large(num_pages);
+  } else {
+    SpinLockHolder h(Static::pageheap_lock());
+    Span* span = Static::pageheap()->New(num_pages);
+    result = (UNLIKELY(span == NULL) ? NULL : SpanToMallocResult(span));
+    report_large = should_report_large(num_pages);
+  }
+
+  if (report_large) {
+    ReportLargeAlloc(num_pages, result);
+  }
+  return result;
+}
+
+ALWAYS_INLINE void* do_malloc_small(ThreadCache* heap, size_t size) {
+  ASSERT(Static::IsInited());
+  ASSERT(heap != NULL);
+  size_t cl = Static::sizemap()->SizeClass(size);
+  size = Static::sizemap()->class_to_size(cl);
+
+  if (UNLIKELY(FLAGS_tcmalloc_sample_parameter > 0) && heap->SampleAllocation(size)) {
+    return DoSampledAllocation(size);
+  } else {
+    // The common case, and also the simplest.  This just pops the
+    // size-appropriate freelist, after replenishing it if it's empty.
+    return CheckedMallocResult(heap->Allocate(size, cl));
+  }
+}
+
+ALWAYS_INLINE void* do_malloc(size_t size) {
+  if (ThreadCache::have_tls &&
+      LIKELY(size < ThreadCache::MinSizeForSlowPath())) {
+    return do_malloc_small(ThreadCache::GetCacheWhichMustBePresent(), size);
+  } else if (size <= kMaxSize) {
+    return do_malloc_small(ThreadCache::GetCache(), size);
+  } else {
+    return do_malloc_pages(ThreadCache::GetCache(), size);
+  }
+}
+
+static void *retry_malloc(void* size) {
+  return do_malloc(reinterpret_cast<size_t>(size));
+}
+
+ALWAYS_INLINE void* do_malloc_or_cpp_alloc(size_t size) {
+  void *rv = do_malloc(size);
+  if (LIKELY(rv != NULL)) {
+    return rv;
+  }
+  return handle_oom(retry_malloc, reinterpret_cast<void *>(size),
+                    false, true);
+}
+
+ALWAYS_INLINE void* do_calloc(size_t n, size_t elem_size) {
+  // Overflow check
+  const size_t size = n * elem_size;
+  if (elem_size != 0 && size / elem_size != n) return NULL;
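+  // For example (illustrative): on a 32-bit build, calloc(0x40000000, 8)
+  // wraps n * elem_size to 0; since 0 / 8 != 0x40000000, the check above
+  // rejects the request instead of returning an undersized buffer.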
+
+  void* result = do_malloc_or_cpp_alloc(size);
+  if (result != NULL) {
+    memset(result, 0, size);
+  }
+  return result;
+}
+
+// If ptr is NULL, do nothing.  Otherwise invoke the given function.
+inline void free_null_or_invalid(void* ptr, void (*invalid_free_fn)(void*)) {
+  if (ptr != NULL) {
+    (*invalid_free_fn)(ptr);
+  }
+}
+
+// Helper for do_free_with_callback(), below.  Inputs:
+//   ptr is object to be freed
+//   invalid_free_fn is a function that gets invoked on certain "bad frees"
+//   heap is the ThreadCache for this thread, or NULL if it isn't known
+//   heap_must_be_valid is whether heap is known to be non-NULL
+//
+// This function may only be used after Static::IsInited() is true.
+//
+// We can usually detect the case where ptr is not pointing to a page that
+// tcmalloc is using, and in those cases we invoke invalid_free_fn.
+//
+// To maximize speed in the common case, we usually get here with
+// heap_must_be_valid being a manifest constant equal to true.
+ALWAYS_INLINE void do_free_helper(void* ptr,
+                                  void (*invalid_free_fn)(void*),
+                                  ThreadCache* heap,
+                                  bool heap_must_be_valid) {
+  ASSERT((Static::IsInited() && heap != NULL) || !heap_must_be_valid);
+  if (!heap_must_be_valid && !Static::IsInited()) {
+    // We called free() before malloc().  This can occur if the
+    // (system) malloc() is called before tcmalloc is loaded, and then
+    // free() is called after tcmalloc is loaded (and tc_free has
+    // replaced free), but before the global constructor has run that
+    // sets up the tcmalloc data structures.
+    free_null_or_invalid(ptr, invalid_free_fn);
+    return;
+  }
+  Span* span = NULL;
+  const PageID p = reinterpret_cast<uintptr_t>(ptr) >> kPageShift;
+  size_t cl = Static::pageheap()->GetSizeClassIfCached(p);
+  if (UNLIKELY(cl == 0)) {
+    span = Static::pageheap()->GetDescriptor(p);
+    if (UNLIKELY(!span)) {
+      // span can be NULL because the pointer passed in is NULL or invalid
+      // (not something returned by malloc or friends), or because the
+      // pointer was allocated with some other allocator besides
+      // tcmalloc.  The latter can happen if tcmalloc is linked in via
+      // a dynamic library, but is not listed last on the link line.
+      // In that case, libraries after it on the link line will
+      // allocate with libc malloc, but free with tcmalloc's free.
+      free_null_or_invalid(ptr, invalid_free_fn);
+      return;
+    }
+    cl = span->sizeclass;
+    Static::pageheap()->CacheSizeClass(p, cl);
+  }
+  ASSERT(ptr != NULL);
+  if (LIKELY(cl != 0)) {
+    ASSERT(!Static::pageheap()->GetDescriptor(p)->sample);
+    if (heap_must_be_valid || heap != NULL) {
+      heap->Deallocate(ptr, cl);
+    } else {
+      // Delete directly into central cache
+      tcmalloc::SLL_SetNext(ptr, NULL);
+      Static::central_cache()[cl].InsertRange(ptr, ptr, 1);
+    }
+  } else {
+    SpinLockHolder h(Static::pageheap_lock());
+    ASSERT(reinterpret_cast<uintptr_t>(ptr) % kPageSize == 0);
+    ASSERT(span != NULL && span->start == p);
+    if (span->sample) {
+      StackTrace* st = reinterpret_cast<StackTrace*>(span->objects);
+      tcmalloc::DLL_Remove(span);
+      Static::stacktrace_allocator()->Delete(st);
+      span->objects = NULL;
+    }
+    Static::pageheap()->Delete(span);
+  }
+}
+
+// Helper for the object deletion (free, delete, etc.).  Inputs:
+//   ptr is object to be freed
+//   invalid_free_fn is a function that gets invoked on certain "bad frees"
+//
+// We can usually detect the case where ptr is not pointing to a page that
+// tcmalloc is using, and in those cases we invoke invalid_free_fn.
+ALWAYS_INLINE void do_free_with_callback(void* ptr,
+                                         void (*invalid_free_fn)(void*)) {
+  ThreadCache* heap = NULL;
+  if (LIKELY(ThreadCache::IsFastPathAllowed())) {
+    heap = ThreadCache::GetCacheWhichMustBePresent();
+    do_free_helper(ptr, invalid_free_fn, heap, true);
+  } else {
+    heap = ThreadCache::GetCacheIfPresent();
+    do_free_helper(ptr, invalid_free_fn, heap, false);
+  }
+}
+
+// The default "do_free" that uses the default callback.
+ALWAYS_INLINE void do_free(void* ptr) {
+  return do_free_with_callback(ptr, &InvalidFree);
+}
+
+// NOTE: some logic here is duplicated in GetOwnership (above), for
+// speed.  If you change this function, look at that one too.
+inline size_t GetSizeWithCallback(const void* ptr,
+                                  size_t (*invalid_getsize_fn)(const void*)) {
+  if (ptr == NULL)
+    return 0;
+  const PageID p = reinterpret_cast<uintptr_t>(ptr) >> kPageShift;
+  size_t cl = Static::pageheap()->GetSizeClassIfCached(p);
+  if (cl != 0) {
+    return Static::sizemap()->ByteSizeForClass(cl);
+  } else {
+    const Span *span = Static::pageheap()->GetDescriptor(p);
+    if (UNLIKELY(span == NULL)) {  // means we do not own this memory
+      return (*invalid_getsize_fn)(ptr);
+    } else if (span->sizeclass != 0) {
+      Static::pageheap()->CacheSizeClass(p, span->sizeclass);
+      return Static::sizemap()->ByteSizeForClass(span->sizeclass);
+    } else {
+      return span->length << kPageShift;
+    }
+  }
+}
+
+// This lets you call back to a given function pointer if ptr is invalid.
+// It is used primarily by windows code which wants a specialized callback.
+ALWAYS_INLINE void* do_realloc_with_callback(
+    void* old_ptr, size_t new_size,
+    void (*invalid_free_fn)(void*),
+    size_t (*invalid_get_size_fn)(const void*)) {
+  // Get the size of the old entry
+  const size_t old_size = GetSizeWithCallback(old_ptr, invalid_get_size_fn);
+
+  // Reallocate if the new size is larger than the old size,
+  // or if the new size is significantly smaller than the old size.
+  // We do hysteresis to avoid resizing ping-pongs:
+  //    . If we need to grow, grow to max(new_size, old_size * 1.X)
+  //    . Don't shrink unless new_size < old_size * 0.Y
+  // X and Y trade-off time for wasted space.  For now we do 1.25 and 0.5.
+  const size_t lower_bound_to_grow = old_size + old_size / 4ul;
+  const size_t upper_bound_to_shrink = old_size / 2ul;
+  if ((new_size > old_size) || (new_size < upper_bound_to_shrink)) {
+    // Need to reallocate.
+    void* new_ptr = NULL;
+
+    if (new_size > old_size && new_size < lower_bound_to_grow) {
+      new_ptr = do_malloc_or_cpp_alloc(lower_bound_to_grow);
+    }
+    if (new_ptr == NULL) {
+      // Either new_size is not a tiny increment, or last do_malloc failed.
+      new_ptr = do_malloc_or_cpp_alloc(new_size);
+    }
+    if (UNLIKELY(new_ptr == NULL)) {
+      return NULL;
+    }
+    MallocHook::InvokeNewHook(new_ptr, new_size);
+    memcpy(new_ptr, old_ptr, ((old_size < new_size) ? old_size : new_size));
+    MallocHook::InvokeDeleteHook(old_ptr);
+    // We could use a variant of do_free() that leverages the fact
+    // that we already know the sizeclass of old_ptr.  The benefit
+    // would be small, so don't bother.
+    do_free_with_callback(old_ptr, invalid_free_fn);
+    return new_ptr;
+  } else {
+    // We still need to call hooks to report the updated size:
+    MallocHook::InvokeDeleteHook(old_ptr);
+    MallocHook::InvokeNewHook(old_ptr, new_size);
+    return old_ptr;
+  }
+}
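+
+// Worked example of the hysteresis above (illustrative, not upstream
+// documentation): with old_size = 1000, lower_bound_to_grow is 1250 and
+// upper_bound_to_shrink is 500.  A realloc to 1100 reallocates (first
+// trying 1250 so that small follow-up growth is free), while a realloc to
+// 600 keeps the existing block, since 600 is neither larger than 1000 nor
+// smaller than 500.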
+
+ALWAYS_INLINE void* do_realloc(void* old_ptr, size_t new_size) {
+  return do_realloc_with_callback(old_ptr, new_size,
+                                  &InvalidFree, &InvalidGetSizeForRealloc);
+}
+
+// For use by exported routines below that want specific alignments
+//
+// Note: this code can be slow for alignments > 16, and can
+// significantly fragment memory.  The expectation is that
+// memalign/posix_memalign/valloc/pvalloc will not be invoked very
+// often.  This requirement simplifies our implementation and allows
+// us to tune for expected allocation patterns.
+void* do_memalign(size_t align, size_t size) {
+  ASSERT((align & (align - 1)) == 0);
+  ASSERT(align > 0);
+  if (size + align < size) return NULL;         // Overflow
+
+  // Fall back to malloc if we would already align this memory access properly.
+  if (align <= AlignmentForSize(size)) {
+    void* p = do_malloc(size);
+    ASSERT((reinterpret_cast<uintptr_t>(p) % align) == 0);
+    return p;
+  }
+
+  if (UNLIKELY(Static::pageheap() == NULL)) ThreadCache::InitModule();
+
+  // Allocate at least one byte to avoid boundary conditions below
+  if (size == 0) size = 1;
+
+  if (size <= kMaxSize && align < kPageSize) {
+    // Search through acceptable size classes looking for one with
+    // enough alignment.  This depends on the fact that
+    // InitSizeClasses() currently produces several size classes that
+    // are aligned at powers of two.  We will waste time and space if
+    // we miss in the size class array, but that is deemed acceptable
+    // since memalign() should be used rarely.
+    int cl = Static::sizemap()->SizeClass(size);
+    while (cl < kNumClasses &&
+           ((Static::sizemap()->class_to_size(cl) & (align - 1)) != 0)) {
+      cl++;
+    }
+    if (cl < kNumClasses) {
+      ThreadCache* heap = ThreadCache::GetCache();
+      size = Static::sizemap()->class_to_size(cl);
+      return CheckedMallocResult(heap->Allocate(size, cl));
+    }
+  }
+
+  // We will allocate directly from the page heap
+  SpinLockHolder h(Static::pageheap_lock());
+
+  if (align <= kPageSize) {
+    // Any page-level allocation will be fine
+    // TODO: We could put the rest of this page in the appropriate
+    // TODO: cache but it does not seem worth it.
+    Span* span = Static::pageheap()->New(tcmalloc::pages(size));
+    return UNLIKELY(span == NULL) ? NULL : SpanToMallocResult(span);
+  }
+
+  // Allocate extra pages and carve off an aligned portion
+  const Length alloc = tcmalloc::pages(size + align);
+  Span* span = Static::pageheap()->New(alloc);
+  if (UNLIKELY(span == NULL)) return NULL;
+
+  // Skip starting portion so that we end up aligned
+  Length skip = 0;
+  while ((((span->start+skip) << kPageShift) & (align - 1)) != 0) {
+    skip++;
+  }
+  ASSERT(skip < alloc);
+  if (skip > 0) {
+    Span* rest = Static::pageheap()->Split(span, skip);
+    Static::pageheap()->Delete(span);
+    span = rest;
+  }
+
+  // Skip trailing portion that we do not need to return
+  const Length needed = tcmalloc::pages(size);
+  ASSERT(span->length >= needed);
+  if (span->length > needed) {
+    Span* trailer = Static::pageheap()->Split(span, needed);
+    Static::pageheap()->Delete(trailer);
+  }
+  return SpanToMallocResult(span);
+}
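+
+// Illustration of the carving above (not part of the original source),
+// assuming the default 8 KiB tcmalloc pages: do_memalign(64*1024, 100*1024)
+// allocates pages(100 KiB + 64 KiB) = 21 pages, skips up to 7 leading pages
+// until the span start is 64 KiB-aligned (returning them to the page heap),
+// and then trims everything past pages(100 KiB) = 13 pages off the tail.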
+
+// Helpers for use by exported routines below:
+
+inline void do_malloc_stats() {
+  PrintStats(1);
+}
+
+inline int do_mallopt(int cmd, int value) {
+  return 1;     // Indicates error
+}
+
+#ifdef HAVE_STRUCT_MALLINFO
+inline struct mallinfo do_mallinfo() {
+  TCMallocStats stats;
+  ExtractStats(&stats, NULL, NULL, NULL);
+
+  // Just some of the fields are filled in.
+  struct mallinfo info;
+  memset(&info, 0, sizeof(info));
+
+  // Unfortunately, the struct contains "int" fields, so some of the
+  // size values will be truncated.
+  info.arena     = static_cast<int>(stats.pageheap.system_bytes);
+  info.fsmblks   = static_cast<int>(stats.thread_bytes
+                                    + stats.central_bytes
+                                    + stats.transfer_bytes);
+  info.fordblks  = static_cast<int>(stats.pageheap.free_bytes +
+                                    stats.pageheap.unmapped_bytes);
+  info.uordblks  = static_cast<int>(stats.pageheap.system_bytes
+                                    - stats.thread_bytes
+                                    - stats.central_bytes
+                                    - stats.transfer_bytes
+                                    - stats.pageheap.free_bytes
+                                    - stats.pageheap.unmapped_bytes);
+
+  return info;
+}
+#endif  // HAVE_STRUCT_MALLINFO
+
+inline void* cpp_alloc(size_t size, bool nothrow) {
+  void* p = do_malloc(size);
+  if (LIKELY(p)) {
+    return p;
+  }
+  return handle_oom(retry_malloc, reinterpret_cast<void *>(size),
+                    true, nothrow);
+}
+
+}  // end unnamed namespace
+
+// As promised, the definition of this function, declared above.
+size_t TCMallocImplementation::GetAllocatedSize(const void* ptr) {
+  if (ptr == NULL)
+    return 0;
+  ASSERT(TCMallocImplementation::GetOwnership(ptr)
+         != TCMallocImplementation::kNotOwned);
+  return GetSizeWithCallback(ptr, &InvalidGetAllocatedSize);
+}
+
+void TCMallocImplementation::MarkThreadBusy() {
+  // Allocate to force the creation of a thread cache, but avoid
+  // invoking any hooks.
+  do_free(do_malloc(0));
+}
+
+//-------------------------------------------------------------------
+// Exported routines
+//-------------------------------------------------------------------
+
+extern "C" PERFTOOLS_DLL_DECL const char* tc_version(
+    int* major, int* minor, const char** patch) __THROW {
+  if (major) *major = TC_VERSION_MAJOR;
+  if (minor) *minor = TC_VERSION_MINOR;
+  if (patch) *patch = TC_VERSION_PATCH;
+  return TC_VERSION_STRING;
+}
+
+// This function behaves similarly to MSVC's _set_new_mode.
+// If flag is 0 (default), calls to malloc will behave normally.
+// If flag is 1, calls to malloc will behave like calls to new,
+// and the std::new_handler will be invoked on failure.
+// Returns the previous mode.
+extern "C" PERFTOOLS_DLL_DECL int tc_set_new_mode(int flag) __THROW {
+  int old_mode = tc_new_mode;
+  tc_new_mode = flag;
+  return old_mode;
+}
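+
+// Illustrative usage (a sketch, not part of the original source):
+//
+//   tc_set_new_mode(1);
+//   void* p = malloc(huge_size);  // on failure, the installed
+//                                 // std::new_handler (if any) runs before
+//                                 // malloc finally returns NULL
+//   tc_set_new_mode(0);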
+
+#ifndef TCMALLOC_USING_DEBUGALLOCATION  // debugallocation.cc defines its own
+
+// CAVEAT: The code structure below ensures that MallocHook methods are always
+//         called from the stack frame of the invoked allocation function.
+//         heap-checker.cc depends on this to start a stack trace from
+//         the call to the (de)allocation function.
+
+extern "C" PERFTOOLS_DLL_DECL void* tc_malloc(size_t size) __THROW {
+  void* result = do_malloc_or_cpp_alloc(size);
+  MallocHook::InvokeNewHook(result, size);
+  return result;
+}
+
+extern "C" PERFTOOLS_DLL_DECL void tc_free(void* ptr) __THROW {
+  MallocHook::InvokeDeleteHook(ptr);
+  do_free(ptr);
+}
+
+extern "C" PERFTOOLS_DLL_DECL void* tc_calloc(size_t n,
+                                              size_t elem_size) __THROW {
+  void* result = do_calloc(n, elem_size);
+  MallocHook::InvokeNewHook(result, n * elem_size);
+  return result;
+}
+
+extern "C" PERFTOOLS_DLL_DECL void tc_cfree(void* ptr) __THROW {
+  MallocHook::InvokeDeleteHook(ptr);
+  do_free(ptr);
+}
+
+extern "C" PERFTOOLS_DLL_DECL void* tc_realloc(void* old_ptr,
+                                               size_t new_size) __THROW {
+  if (old_ptr == NULL) {
+    void* result = do_malloc_or_cpp_alloc(new_size);
+    MallocHook::InvokeNewHook(result, new_size);
+    return result;
+  }
+  if (new_size == 0) {
+    MallocHook::InvokeDeleteHook(old_ptr);
+    do_free(old_ptr);
+    return NULL;
+  }
+  return do_realloc(old_ptr, new_size);
+}
+
+extern "C" PERFTOOLS_DLL_DECL void* tc_new(size_t size) {
+  void* p = cpp_alloc(size, false);
+  // We keep this next instruction out of cpp_alloc for a reason: when
+  // it's in, and new just calls cpp_alloc, the optimizer may fold the
+  // new call into cpp_alloc, which messes up our whole section-based
+  // stacktracing (see ATTRIBUTE_SECTION, above).  This ensures cpp_alloc
+  // isn't the last thing this fn calls, and prevents the folding.
+  MallocHook::InvokeNewHook(p, size);
+  return p;
+}
+
+extern "C" PERFTOOLS_DLL_DECL void* tc_new_nothrow(size_t size, const std::nothrow_t&) __THROW {
+  void* p = cpp_alloc(size, true);
+  MallocHook::InvokeNewHook(p, size);
+  return p;
+}
+
+extern "C" PERFTOOLS_DLL_DECL void tc_delete(void* p) __THROW {
+  MallocHook::InvokeDeleteHook(p);
+  do_free(p);
+}
+
+// Standard C++ library implementations define and use this
+// (via ::operator delete(ptr, nothrow)).
+// But it's really the same as normal delete, so we just do the same thing.
+extern "C" PERFTOOLS_DLL_DECL void tc_delete_nothrow(void* p, const std::nothrow_t&) __THROW {
+  MallocHook::InvokeDeleteHook(p);
+  do_free(p);
+}
+
+extern "C" PERFTOOLS_DLL_DECL void* tc_newarray(size_t size) {
+  void* p = cpp_alloc(size, false);
+  // We keep this next instruction out of cpp_alloc for a reason: when
+  // it's in, and new just calls cpp_alloc, the optimizer may fold the
+  // new call into cpp_alloc, which messes up our whole section-based
+  // stacktracing (see ATTRIBUTE_SECTION, above).  This ensures cpp_alloc
+  // isn't the last thing this fn calls, and prevents the folding.
+  MallocHook::InvokeNewHook(p, size);
+  return p;
+}
+
+extern "C" PERFTOOLS_DLL_DECL void* tc_newarray_nothrow(size_t size, const std::nothrow_t&)
+    __THROW {
+  void* p = cpp_alloc(size, true);
+  MallocHook::InvokeNewHook(p, size);
+  return p;
+}
+
+extern "C" PERFTOOLS_DLL_DECL void tc_deletearray(void* p) __THROW {
+  MallocHook::InvokeDeleteHook(p);
+  do_free(p);
+}
+
+extern "C" PERFTOOLS_DLL_DECL void tc_deletearray_nothrow(void* p, const std::nothrow_t&) __THROW {
+  MallocHook::InvokeDeleteHook(p);
+  do_free(p);
+}
+
+extern "C" PERFTOOLS_DLL_DECL void* tc_memalign(size_t align,
+                                                size_t size) __THROW {
+  void* result = do_memalign_or_cpp_memalign(align, size);
+  MallocHook::InvokeNewHook(result, size);
+  return result;
+}
+
+extern "C" PERFTOOLS_DLL_DECL int tc_posix_memalign(
+    void** result_ptr, size_t align, size_t size) __THROW {
+  if (((align % sizeof(void*)) != 0) ||
+      ((align & (align - 1)) != 0) ||
+      (align == 0)) {
+    return EINVAL;
+  }
+
+  void* result = do_memalign_or_cpp_memalign(align, size);
+  MallocHook::InvokeNewHook(result, size);
+  if (UNLIKELY(result == NULL)) {
+    return ENOMEM;
+  } else {
+    *result_ptr = result;
+    return 0;
+  }
+}
+
+static size_t pagesize = 0;
+
+extern "C" PERFTOOLS_DLL_DECL void* tc_valloc(size_t size) __THROW {
+  // Allocate page-aligned object of length >= size bytes
+  if (pagesize == 0) pagesize = getpagesize();
+  void* result = do_memalign_or_cpp_memalign(pagesize, size);
+  MallocHook::InvokeNewHook(result, size);
+  return result;
+}
+
+extern "C" PERFTOOLS_DLL_DECL void* tc_pvalloc(size_t size) __THROW {
+  // Round up size to a multiple of pagesize
+  if (pagesize == 0) pagesize = getpagesize();
+  if (size == 0) {     // pvalloc(0) should allocate one page, according to
+    size = pagesize;   // http://man.free4web.biz/man3/libmpatrol.3.html
+  }
+  size = (size + pagesize - 1) & ~(pagesize - 1);
+  void* result = do_memalign_or_cpp_memalign(pagesize, size);
+  MallocHook::InvokeNewHook(result, size);
+  return result;
+}
+
+extern "C" PERFTOOLS_DLL_DECL void tc_malloc_stats(void) __THROW {
+  do_malloc_stats();
+}
+
+extern "C" PERFTOOLS_DLL_DECL int tc_mallopt(int cmd, int value) __THROW {
+  return do_mallopt(cmd, value);
+}
+
+#ifdef HAVE_STRUCT_MALLINFO
+extern "C" PERFTOOLS_DLL_DECL struct mallinfo tc_mallinfo(void) __THROW {
+  return do_mallinfo();
+}
+#endif
+
+extern "C" PERFTOOLS_DLL_DECL size_t tc_malloc_size(void* ptr) __THROW {
+  return MallocExtension::instance()->GetAllocatedSize(ptr);
+}
+
+extern "C" PERFTOOLS_DLL_DECL void* tc_malloc_skip_new_handler(size_t size)  __THROW {
+  void* result = do_malloc(size);
+  MallocHook::InvokeNewHook(result, size);
+  return result;
+}
+
+#endif  // TCMALLOC_USING_DEBUGALLOCATION
diff --git a/src/tcmalloc.h b/src/tcmalloc.h
new file mode 100644
index 0000000..2d64f4e
--- /dev/null
+++ b/src/tcmalloc.h
@@ -0,0 +1,70 @@
+// -*- Mode: C++; c-basic-offset: 2; indent-tabs-mode: nil -*-
+// Copyright (c) 2007, Google Inc.
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+//     * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+//     * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+// ---
+// Author: Craig Silverstein <opensource@google.com>
+//
+// Some obscure memory-allocation routines may not be declared on all
+// systems.  In those cases, we'll just declare them ourselves.
+// This file is meant to be used only internally, for unittests.
+
+#include <config.h>
+
+#ifndef _XOPEN_SOURCE
+# define _XOPEN_SOURCE 600  // for posix_memalign
+#endif
+#include <stdlib.h>         // for posix_memalign
+// FreeBSD has malloc.h, but complains if you use it
+#if defined(HAVE_MALLOC_H) && !defined(__FreeBSD__)
+#include <malloc.h>         // for memalign, valloc, pvalloc
+#endif
+
+// __THROW is defined on glibc systems.  It means, counter-intuitively,
+// "This function will never throw an exception."  It's an optional
+// optimization tool, but we may need to use it to match glibc prototypes.
+#ifndef __THROW    // I guess we're not on a glibc system
+# define __THROW   // __THROW is just an optimization, so ok to make it ""
+#endif
+
+#if !HAVE_CFREE_SYMBOL
+extern "C" void cfree(void* ptr) __THROW;
+#endif
+#if !HAVE_POSIX_MEMALIGN_SYMBOL
+extern "C" int posix_memalign(void** ptr, size_t align, size_t size) __THROW;
+#endif
+#if !HAVE_MEMALIGN_SYMBOL
+extern "C" void* memalign(size_t __alignment, size_t __size) __THROW;
+#endif
+#if !HAVE_VALLOC_SYMBOL
+extern "C" void* valloc(size_t __size) __THROW;
+#endif
+#if !HAVE_PVALLOC_SYMBOL
+extern "C" void* pvalloc(size_t __size) __THROW;
+#endif
diff --git a/src/tcmalloc_guard.h b/src/tcmalloc_guard.h
new file mode 100644
index 0000000..84952ba
--- /dev/null
+++ b/src/tcmalloc_guard.h
@@ -0,0 +1,49 @@
+// -*- Mode: C++; c-basic-offset: 2; indent-tabs-mode: nil -*-
+// Copyright (c) 2005, Google Inc.
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+//     * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+//     * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+// ---
+// Author: Craig Silverstein
+//
+// We expose the TCMallocGuard class -- which initializes the tcmalloc
+// allocator -- so that code that must be sure tcmalloc is loaded
+// before it runs (notably the heap-profiler) can guarantee that.  To
+// use it, create a static TCMallocGuard instance at the top of a file
+// where tcmalloc must be initialized before global constructors run.
+
+#ifndef TCMALLOC_TCMALLOC_GUARD_H_
+#define TCMALLOC_TCMALLOC_GUARD_H_
+
+class TCMallocGuard {
+ public:
+  TCMallocGuard();
+  ~TCMallocGuard();
+};
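+
+// For example (an illustrative sketch, not part of the original header),
+// a file that needs tcmalloc initialized before its own global
+// constructors run can define:
+//
+//   static TCMallocGuard tcmalloc_early_init;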
+
+#endif  // TCMALLOC_TCMALLOC_GUARD_H_
diff --git a/src/tests/addressmap_unittest.cc b/src/tests/addressmap_unittest.cc
new file mode 100644
index 0000000..a847dd6
--- /dev/null
+++ b/src/tests/addressmap_unittest.cc
@@ -0,0 +1,171 @@
+// -*- Mode: C++; c-basic-offset: 2; indent-tabs-mode: nil -*-
+// Copyright (c) 2005, Google Inc.
+// All rights reserved.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+// 
+//     * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+//     * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+// 
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+// ---
+// Author: Sanjay Ghemawat
+
+#include <stdlib.h>   // for rand()
+#include <vector>
+#include <set>
+#include <algorithm>
+#include <utility>
+#include "addressmap-inl.h"
+#include "base/logging.h"
+#include "base/commandlineflags.h"
+
+DEFINE_int32(iters, 20, "Number of test iterations");
+DEFINE_int32(N, 100000,  "Number of elements to test per iteration");
+
+using std::pair;
+using std::make_pair;
+using std::vector;
+using std::set;
+using std::random_shuffle;
+
+struct UniformRandomNumberGenerator {
+  size_t Uniform(size_t max_size) {
+    if (max_size == 0)
+      return 0;
+    return rand() % max_size;   // not a great random-number fn, but portable
+  }
+};
+static UniformRandomNumberGenerator rnd;
+
+
+// pair of associated value and object size
+typedef pair<int, size_t> ValueT;
+
+struct PtrAndSize {
+  char* ptr;
+  size_t size;
+  PtrAndSize(char* p, size_t s) : ptr(p), size(s) {}
+};
+
+size_t SizeFunc(const ValueT& v) { return v.second; }
+
+static void SetCheckCallback(const void* ptr, ValueT* val,
+                             set<pair<const void*, int> >* check_set) {
+  check_set->insert(make_pair(ptr, val->first));
+}
+
+int main(int argc, char** argv) {
+  // Get a bunch of pointers
+  const int N = FLAGS_N;
+  static const int kMaxRealSize = 49;
+  // 100 MB to stress not finding the previous object (AddressMap's cluster is 1 MB):
+  static const size_t kMaxSize = 100*1000*1000;
+  vector<PtrAndSize> ptrs_and_sizes;
+  for (int i = 0; i < N; ++i) {
+    size_t s = rnd.Uniform(kMaxRealSize);
+    ptrs_and_sizes.push_back(PtrAndSize(new char[s], s));
+  }
+
+  for (int x = 0; x < FLAGS_iters; ++x) {
+    RAW_LOG(INFO, "Iteration %d/%d...\n", x, FLAGS_iters);
+
+    // Permute pointers to get rid of allocation order issues
+    random_shuffle(ptrs_and_sizes.begin(), ptrs_and_sizes.end());
+
+    AddressMap<ValueT> map(malloc, free);
+    const ValueT* result;
+    const void* res_p;
+
+    // Insert a bunch of entries
+    for (int i = 0; i < N; ++i) {
+      char* p = ptrs_and_sizes[i].ptr;
+      CHECK(!map.Find(p));
+      int offs = rnd.Uniform(ptrs_and_sizes[i].size);
+      CHECK(!map.FindInside(&SizeFunc, kMaxSize, p + offs, &res_p));
+      map.Insert(p, make_pair(i, ptrs_and_sizes[i].size));
+      CHECK(result = map.Find(p));
+      CHECK_EQ(result->first, i);
+      CHECK(result = map.FindInside(&SizeFunc, kMaxRealSize, p + offs, &res_p));
+      CHECK_EQ(res_p, p);
+      CHECK_EQ(result->first, i);
+      map.Insert(p, make_pair(i + N, ptrs_and_sizes[i].size));
+      CHECK(result = map.Find(p));
+      CHECK_EQ(result->first, i + N);
+    }
+
+    // Delete the even entries
+    for (int i = 0; i < N; i += 2) {
+      void* p = ptrs_and_sizes[i].ptr;
+      ValueT removed;
+      CHECK(map.FindAndRemove(p, &removed));
+      CHECK_EQ(removed.first, i + N);
+    }
+
+    // Lookup the odd entries and adjust them
+    for (int i = 1; i < N; i += 2) {
+      char* p = ptrs_and_sizes[i].ptr;
+      CHECK(result = map.Find(p));
+      CHECK_EQ(result->first, i + N);
+      int offs = rnd.Uniform(ptrs_and_sizes[i].size);
+      CHECK(result = map.FindInside(&SizeFunc, kMaxRealSize, p + offs, &res_p));
+      CHECK_EQ(res_p, p);
+      CHECK_EQ(result->first, i + N);
+      map.Insert(p, make_pair(i + 2*N, ptrs_and_sizes[i].size));
+      CHECK(result = map.Find(p));
+      CHECK_EQ(result->first, i + 2*N);
+    }
+
+    // Insert even entries back
+    for (int i = 0; i < N; i += 2) {
+      char* p = ptrs_and_sizes[i].ptr;
+      int offs = rnd.Uniform(ptrs_and_sizes[i].size);
+      CHECK(!map.FindInside(&SizeFunc, kMaxSize, p + offs, &res_p));
+      map.Insert(p, make_pair(i + 2*N, ptrs_and_sizes[i].size));
+      CHECK(result = map.Find(p));
+      CHECK_EQ(result->first, i + 2*N);
+      CHECK(result = map.FindInside(&SizeFunc, kMaxRealSize, p + offs, &res_p));
+      CHECK_EQ(res_p, p);
+      CHECK_EQ(result->first, i + 2*N);
+    }
+
+    // Check all entries
+    set<pair<const void*, int> > check_set;
+    map.Iterate(SetCheckCallback, &check_set);
+    CHECK_EQ(check_set.size(), N);
+    for (int i = 0; i < N; ++i) {
+      void* p = ptrs_and_sizes[i].ptr;
+      check_set.erase(make_pair(p, i + 2*N));
+      CHECK(result = map.Find(p));
+      CHECK_EQ(result->first, i + 2*N);
+    }
+    CHECK_EQ(check_set.size(), 0);
+  }
+
+  for (int i = 0; i < N; ++i) {
+    delete[] ptrs_and_sizes[i].ptr;
+  }
+
+  printf("PASS\n");
+  return 0;
+}
diff --git a/src/tests/atomicops_unittest.cc b/src/tests/atomicops_unittest.cc
new file mode 100644
index 0000000..aa82a6b
--- /dev/null
+++ b/src/tests/atomicops_unittest.cc
@@ -0,0 +1,162 @@
+// -*- Mode: C++; c-basic-offset: 2; indent-tabs-mode: nil -*-
+/* Copyright (c) 2006, Google Inc.
+ * All rights reserved.
+ * 
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are
+ * met:
+ * 
+ *     * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following disclaimer
+ * in the documentation and/or other materials provided with the
+ * distribution.
+ *     * Neither the name of Google Inc. nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ * 
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * ---
+ * Author: Sanjay Ghemawat
+ */
+
+#include <stdio.h>
+#include "base/logging.h"
+#include "base/atomicops.h"
+
+#define GG_ULONGLONG(x)  static_cast<uint64>(x)
+
+
+#define NUM_BITS(T) (sizeof(T) * 8)
+
+
+template <class AtomicType>
+static void TestCompareAndSwap(AtomicType (*compare_and_swap_func)
+                               (volatile AtomicType*, AtomicType, AtomicType)) {
+  AtomicType value = 0;
+  AtomicType prev = (*compare_and_swap_func)(&value, 0, 1);
+  ASSERT_EQ(1, value);
+  ASSERT_EQ(0, prev);
+
+  // Use test value that has non-zero bits in both halves, more for testing
+  // 64-bit implementation on 32-bit platforms.
+  const AtomicType k_test_val = (GG_ULONGLONG(1) <<
+                                 (NUM_BITS(AtomicType) - 2)) + 11;
+  value = k_test_val;
+  prev = (*compare_and_swap_func)(&value, 0, 5);
+  ASSERT_EQ(k_test_val, value);
+  ASSERT_EQ(k_test_val, prev);
+
+  value = k_test_val;
+  prev = (*compare_and_swap_func)(&value, k_test_val, 5);
+  ASSERT_EQ(5, value);
+  ASSERT_EQ(k_test_val, prev);
+}
+
+
+template <class AtomicType>
+static void TestAtomicExchange(AtomicType (*atomic_exchange_func)
+                               (volatile AtomicType*, AtomicType)) {
+  AtomicType value = 0;
+  AtomicType new_value = (*atomic_exchange_func)(&value, 1);
+  ASSERT_EQ(1, value);
+  ASSERT_EQ(0, new_value);
+
+  // Use test value that has non-zero bits in both halves, more for testing
+  // 64-bit implementation on 32-bit platforms.
+  const AtomicType k_test_val = (GG_ULONGLONG(1) <<
+                                 (NUM_BITS(AtomicType) - 2)) + 11;
+  value = k_test_val;
+  new_value = (*atomic_exchange_func)(&value, k_test_val);
+  ASSERT_EQ(k_test_val, value);
+  ASSERT_EQ(k_test_val, new_value);
+
+  value = k_test_val;
+  new_value = (*atomic_exchange_func)(&value, 5);
+  ASSERT_EQ(5, value);
+  ASSERT_EQ(k_test_val, new_value);
+}
+
+
+// This is a simple sanity check that values are correct. Not testing
+// atomicity
+template <class AtomicType>
+static void TestStore() {
+  const AtomicType kVal1 = static_cast<AtomicType>(0xa5a5a5a5a5a5a5a5LL);
+  const AtomicType kVal2 = static_cast<AtomicType>(-1);
+
+  AtomicType value;
+
+  base::subtle::NoBarrier_Store(&value, kVal1);
+  ASSERT_EQ(kVal1, value);
+  base::subtle::NoBarrier_Store(&value, kVal2);
+  ASSERT_EQ(kVal2, value);
+
+  base::subtle::Acquire_Store(&value, kVal1);
+  ASSERT_EQ(kVal1, value);
+  base::subtle::Acquire_Store(&value, kVal2);
+  ASSERT_EQ(kVal2, value);
+
+  base::subtle::Release_Store(&value, kVal1);
+  ASSERT_EQ(kVal1, value);
+  base::subtle::Release_Store(&value, kVal2);
+  ASSERT_EQ(kVal2, value);
+}
+
+// This is a simple sanity check that values are correct. Not testing
+// atomicity
+template <class AtomicType>
+static void TestLoad() {
+  const AtomicType kVal1 = static_cast<AtomicType>(0xa5a5a5a5a5a5a5a5LL);
+  const AtomicType kVal2 = static_cast<AtomicType>(-1);
+
+  AtomicType value;
+
+  value = kVal1;
+  ASSERT_EQ(kVal1, base::subtle::NoBarrier_Load(&value));
+  value = kVal2;
+  ASSERT_EQ(kVal2, base::subtle::NoBarrier_Load(&value));
+
+  value = kVal1;
+  ASSERT_EQ(kVal1, base::subtle::Acquire_Load(&value));
+  value = kVal2;
+  ASSERT_EQ(kVal2, base::subtle::Acquire_Load(&value));
+
+  value = kVal1;
+  ASSERT_EQ(kVal1, base::subtle::Release_Load(&value));
+  value = kVal2;
+  ASSERT_EQ(kVal2, base::subtle::Release_Load(&value));
+}
+
+template <class AtomicType>
+static void TestAtomicOps() {
+  TestCompareAndSwap<AtomicType>(base::subtle::NoBarrier_CompareAndSwap);
+  TestCompareAndSwap<AtomicType>(base::subtle::Acquire_CompareAndSwap);
+  TestCompareAndSwap<AtomicType>(base::subtle::Release_CompareAndSwap);
+
+  TestAtomicExchange<AtomicType>(base::subtle::NoBarrier_AtomicExchange);
+  TestAtomicExchange<AtomicType>(base::subtle::Acquire_AtomicExchange);
+  TestAtomicExchange<AtomicType>(base::subtle::Release_AtomicExchange);
+
+  TestStore<AtomicType>();
+  TestLoad<AtomicType>();
+}
+
+int main(int argc, char** argv) {
+  TestAtomicOps<AtomicWord>();
+  TestAtomicOps<Atomic32>();
+  printf("PASS\n");
+  return 0;
+}
diff --git a/src/tests/current_allocated_bytes_test.cc b/src/tests/current_allocated_bytes_test.cc
new file mode 100644
index 0000000..eaa6a7b
--- /dev/null
+++ b/src/tests/current_allocated_bytes_test.cc
@@ -0,0 +1,64 @@
+// -*- Mode: C++; c-basic-offset: 2; indent-tabs-mode: nil -*-
+// Copyright (c) 2011, Google Inc.
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+//     * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+//     * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+// ---
+//
+// Author: Craig Silverstein
+
+// This tests the accounting done by tcmalloc.  When we allocate and
+// free a small buffer, the number of bytes used by the application
+// before the alloc+free should match the number of bytes used after.
+// However, the internal data structures used by tcmalloc will be
+// quite different -- new spans will have been allocated, etc.  This
+// is, thus, a simple test that we account properly for the internal
+// data structures, so that we report the actual application-used
+// bytes properly.
+
+#include "config_for_unittests.h"
+#include <stdlib.h>
+#include <stdio.h>
+#include <gperftools/malloc_extension.h>
+#include "base/logging.h"
+
+const char kCurrent[] = "generic.current_allocated_bytes";
+
+int main() {
+  // We don't do accounting right when using debugallocation.cc, so
+  // turn off the test then.  TODO(csilvers): get this working too.
+#ifdef NDEBUG
+  size_t before_bytes, after_bytes;
+  MallocExtension::instance()->GetNumericProperty(kCurrent, &before_bytes);
+  free(malloc(200));
+  MallocExtension::instance()->GetNumericProperty(kCurrent, &after_bytes);
+
+  CHECK_EQ(before_bytes, after_bytes);
+#endif
+  printf("PASS\n");
+  return 0;
+}
diff --git a/src/tests/debugallocation_test.cc b/src/tests/debugallocation_test.cc
new file mode 100644
index 0000000..d935dbb
--- /dev/null
+++ b/src/tests/debugallocation_test.cc
@@ -0,0 +1,332 @@
+// -*- Mode: C++; c-basic-offset: 2; indent-tabs-mode: nil -*-
+// Copyright (c) 2007, Google Inc.
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+//     * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+//     * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+// ---
+// Author: Fred Akalin
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h> // for memcmp
+#include <vector>
+#include "gperftools/malloc_extension.h"
+#include "gperftools/tcmalloc.h"
+#include "base/logging.h"
+
+using std::vector;
+
+vector<void (*)()> g_testlist;  // the tests to run
+
+#define TEST(a, b)                                      \
+  struct Test_##a##_##b {                               \
+    Test_##a##_##b() { g_testlist.push_back(&Run); }    \
+    static void Run();                                  \
+  };                                                    \
+  static Test_##a##_##b g_test_##a##_##b;               \
+  void Test_##a##_##b::Run()
+
+
+static int RUN_ALL_TESTS() {
+  vector<void (*)()>::const_iterator it;
+  for (it = g_testlist.begin(); it != g_testlist.end(); ++it) {
+    (*it)();   // The test will error-exit if there's a problem.
+  }
+  fprintf(stderr, "\nPassed %d tests\n\nPASS\n",
+          static_cast<int>(g_testlist.size()));
+  return 0;
+}
+
+// The death tests are meant to be run from a shell-script driver, which
+// passes in an integer saying which death test to run.  We store that
+// test-to-run here, and in the macro use a counter to see when we get
+// to that test, so we can run it.
+static int test_to_run = 0;     // set in main() based on argv
+static int test_counter = 0;    // incremented every time the macro is called
+#define IF_DEBUG_EXPECT_DEATH(statement, regex) do {    \
+  if (test_counter++ == test_to_run) {                  \
+    fprintf(stderr, "Expected regex:%s\n", regex);      \
+    statement;                                          \
+  }                                                     \
+} while (false)
+
+// This flag won't be compiled in when building in opt mode.
+DECLARE_int32(max_free_queue_size);
+
+// Test match as well as mismatch rules.  But do not test on OS X; on
+// OS X the OS converts new/new[] to malloc before it gets to us, so
+// we are unable to catch these mismatch errors.
+#ifndef __APPLE__
+TEST(DebugAllocationTest, DeallocMismatch) {
+  // malloc can be matched only by free
+  // new can be matched only by delete and delete(nothrow)
+  // new[] can be matched only by delete[] and delete[](nothrow)
+  // new(nothrow) can be matched only by delete and delete(nothrow)
+  // new(nothrow)[] can be matched only by delete[] and delete[](nothrow)
+
+  // Allocate with malloc.
+  {
+    int* x = static_cast<int*>(malloc(sizeof(*x)));
+    IF_DEBUG_EXPECT_DEATH(delete x, "mismatch.*being dealloc.*delete");
+    IF_DEBUG_EXPECT_DEATH(delete [] x, "mismatch.*being dealloc.*delete *[[]");
+    // Should work fine.
+    free(x);
+  }
+
+  // Allocate with new.
+  {
+    int* x = new int;
+    int* y = new int;
+    IF_DEBUG_EXPECT_DEATH(free(x), "mismatch.*being dealloc.*free");
+    IF_DEBUG_EXPECT_DEATH(delete [] x, "mismatch.*being dealloc.*delete *[[]");
+    delete x;
+    ::operator delete(y, std::nothrow);
+  }
+
+  // Allocate with new[].
+  {
+    int* x = new int[1];
+    int* y = new int[1];
+    IF_DEBUG_EXPECT_DEATH(free(x), "mismatch.*being dealloc.*free");
+    IF_DEBUG_EXPECT_DEATH(delete x, "mismatch.*being dealloc.*delete");
+    delete [] x;
+    ::operator delete[](y, std::nothrow);
+  }
+
+  // Allocate with new(nothrow).
+  {
+    int* x = new(std::nothrow) int;
+    int* y = new(std::nothrow) int;
+    IF_DEBUG_EXPECT_DEATH(free(x), "mismatch.*being dealloc.*free");
+    IF_DEBUG_EXPECT_DEATH(delete [] x, "mismatch.*being dealloc.*delete *[[]");
+    delete x;
+    ::operator delete(y, std::nothrow);
+  }
+
+  // Allocate with new(nothrow)[].
+  {
+    int* x = new(std::nothrow) int[1];
+    int* y = new(std::nothrow) int[1];
+    IF_DEBUG_EXPECT_DEATH(free(x), "mismatch.*being dealloc.*free");
+    IF_DEBUG_EXPECT_DEATH(delete x, "mismatch.*being dealloc.*delete");
+    delete [] x;
+    ::operator delete[](y, std::nothrow);
+  }
+}
+#endif  // #ifndef __APPLE__
+
+TEST(DebugAllocationTest, DoubleFree) {
+  int* pint = new int;
+  delete pint;
+  IF_DEBUG_EXPECT_DEATH(delete pint, "has been already deallocated");
+}
+
+TEST(DebugAllocationTest, StompBefore) {
+  int* pint = new int;
+#ifndef NDEBUG   // don't stomp memory if we're not in a position to detect it
+  pint[-1] = 5;
+  IF_DEBUG_EXPECT_DEATH(delete pint, "a word before object");
+#endif
+}
+
+TEST(DebugAllocationTest, StompAfter) {
+  int* pint = new int;
+#ifndef NDEBUG   // don't stomp memory if we're not in a position to detect it
+  pint[1] = 5;
+  IF_DEBUG_EXPECT_DEATH(delete pint, "a word after object");
+#endif
+}
+
+TEST(DebugAllocationTest, FreeQueueTest) {
+  // Verify that the allocator doesn't return blocks that were recently freed.
+  int* x = new int;
+  int* old_x = x;
+  delete x;
+  x = new int;
+  #if 1
+    // This check should not be read as a universal guarantee of behavior.  If
+    // other threads are executing, it would be theoretically possible for this
+    // check to fail despite the efforts of debugallocation.cc to the contrary.
+    // It should always hold under the controlled conditions of this unittest,
+    // however.
+    EXPECT_NE(x, old_x);  // Allocator shouldn't return recently freed blocks
+  #else
+    // The below check passes, but since it isn't *required* to pass, I've left
+    // it commented out.
+    // EXPECT_EQ(x, old_x);
+  #endif
+  old_x = NULL;  // avoid breaking opt build with an unused variable warning.
+  delete x;
+}
+
+TEST(DebugAllocationTest, DanglingPointerWriteTest) {
+  // This test can only be run if debugging.
+  //
+  // If not debugging, the 'new' following the dangling write might not be
+  // safe.  When debugging, we expect the (trashed) deleted block to be on the
+  // list of recently-freed blocks, so the following 'new' will be safe.
+#if 1
+  int* x = new int;
+  delete x;
+  int poisoned_x_value = *x;
+  *x = 1;  // a dangling write.
+
+  char* s = new char[FLAGS_max_free_queue_size];
+  // When we delete s, we push the storage that was previously allocated to x
+  // off the end of the free queue.  At that point, the write to that memory
+  // will be detected.
+  IF_DEBUG_EXPECT_DEATH(delete [] s, "Memory was written to after being freed.");
+
+  // restore the poisoned value of x so that we can delete s without causing a
+  // crash.
+  *x = poisoned_x_value;
+  delete [] s;
+#endif
+}
+
+TEST(DebugAllocationTest, DanglingWriteAtExitTest) {
+  int *x = new int;
+  delete x;
+  int old_x_value = *x;
+  *x = 1;
+  // verify that dangling writes are caught at program termination if the
+  // corrupted block never got pushed off of the end of the free queue.
+  IF_DEBUG_EXPECT_DEATH(exit(0), "Memory was written to after being freed.");
+  *x = old_x_value;  // restore x so that the test can exit successfully.
+}
+
+TEST(DebugAllocationTest, StackTraceWithDanglingWriteAtExitTest) {
+  int *x = new int;
+  delete x;
+  int old_x_value = *x;
+  *x = 1;
+  // verify that we also get a stack trace when we have a dangling write.
+  // The " @ " is part of the stack trace output.
+  IF_DEBUG_EXPECT_DEATH(exit(0), " @ .*main");
+  *x = old_x_value;  // restore x so that the test can exit successfully.
+}
+
+static size_t CurrentlyAllocatedBytes() {
+  size_t value;
+  CHECK(MallocExtension::instance()->GetNumericProperty(
+            "generic.current_allocated_bytes", &value));
+  return value;
+}
+
+TEST(DebugAllocationTest, CurrentlyAllocated) {
+  // Clear the free queue
+#if 1
+  FLAGS_max_free_queue_size = 0;
+  // Force a round-trip through the queue management code so that the
+  // new size is seen and the queue of recently-freed blocks is flushed.
+  free(malloc(1));
+  FLAGS_max_free_queue_size = 1048576;
+#endif
+
+  // Free something and check that it disappears from allocated bytes
+  // immediately.
+  char* p = new char[1000];
+  size_t after_malloc = CurrentlyAllocatedBytes();
+  delete[] p;
+  size_t after_free = CurrentlyAllocatedBytes();
+  EXPECT_LE(after_free, after_malloc - 1000);
+}
+
+TEST(DebugAllocationTest, GetAllocatedSizeTest) {
+#if 1
+  // When debug_allocation is in effect, GetAllocatedSize should return
+  // exactly requested size, since debug_allocation doesn't allow users
+  // to write more than that.
+  for (int i = 0; i < 10; ++i) {
+    void *p = malloc(i);
+    EXPECT_EQ(i, MallocExtension::instance()->GetAllocatedSize(p));
+    free(p);
+  }
+#endif
+  void* a = malloc(1000);
+  EXPECT_GE(MallocExtension::instance()->GetAllocatedSize(a), 1000);
+  // This is just a sanity check.  If we allocated too much, alloc is broken.
+  EXPECT_LE(MallocExtension::instance()->GetAllocatedSize(a), 5000);
+  EXPECT_GE(MallocExtension::instance()->GetEstimatedAllocatedSize(1000), 1000);
+  free(a);
+}
+
+TEST(DebugAllocationTest, HugeAlloc) {
+  // This must not be a const variable so it doesn't form an
+  // integral-constant-expression which can be *statically* rejected by the
+  // compiler as too large for the allocation.
+  size_t kTooBig = ~static_cast<size_t>(0);
+  void* a = NULL;
+
+#ifndef NDEBUG
+
+  a = malloc(kTooBig);
+  EXPECT_EQ(NULL, a);
+
+  // kAlsoTooBig is small enough not to get caught by debugallocation's check,
+  // but will still fall through to tcmalloc's check. This must also be
+  // a non-const variable. See kTooBig for more details.
+  size_t kAlsoTooBig = kTooBig - 1024;
+
+  a = malloc(kAlsoTooBig);
+  EXPECT_EQ(NULL, a);
+#endif
+}
+
+// based on test program contributed by mikesart@gmail.com aka
+// mikesart@valvesoftware.com. See issue-464.
+TEST(DebugAllocationTest, ReallocAfterMemalign) {
+  char stuff[50];
+  memset(stuff, 0x11, sizeof(stuff));
+  void *p = tc_memalign(16, sizeof(stuff));
+  EXPECT_NE(p, NULL);
+  memcpy(p, stuff, sizeof(stuff));   // fill the block with a known pattern
+
+  p = realloc(p, sizeof(stuff) + 10);
+  EXPECT_NE(p, NULL);
+
+  int rv = memcmp(stuff, p, sizeof(stuff));
+  EXPECT_EQ(rv, 0);
+}
+
+int main(int argc, char** argv) {
+  // If you run without args, we run the non-death parts of the test.
+  // Otherwise, argv[1] should be a number saying which death-test
+  // to run.  We will output a regexp we expect the death-message
+  // to include, and then run the given death test (which hopefully
+  // will produce that error message).  If argv[1] > the number of
+  // death tests, we will run only the non-death parts.  One way to
+  // tell when you are done with all tests is when no 'expected
+  // regexp' message is printed for a given argv[1].
+  if (argc < 2) {
+    test_to_run = -1;   // will never match
+  } else {
+    test_to_run = atoi(argv[1]);
+  }
+  return RUN_ALL_TESTS();
+}
diff --git a/src/tests/debugallocation_test.sh b/src/tests/debugallocation_test.sh
new file mode 100755
index 0000000..faa6c79
--- /dev/null
+++ b/src/tests/debugallocation_test.sh
@@ -0,0 +1,95 @@
+#!/bin/sh
+
+# Copyright (c) 2009, Google Inc.
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are
+# met:
+#
+#     * Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+#     * Redistributions in binary form must reproduce the above
+# copyright notice, this list of conditions and the following disclaimer
+# in the documentation and/or other materials provided with the
+# distribution.
+#     * Neither the name of Google Inc. nor the names of its
+# contributors may be used to endorse or promote products derived from
+# this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#
+# ---
+# Author: Craig Silverstein
+
+BINDIR="${BINDIR:-.}"
+
+if [ "x$1" = "x-h" -o "x$1" = "x--help" ]; then
+  echo "USAGE: $0 [unittest dir]"
+  echo "       By default, unittest_dir=$BINDIR"
+  exit 1
+fi
+
+DEBUGALLOCATION_TEST="${1:-$BINDIR/debugallocation_test}"
+
+num_failures=0
+
+# Run the $1-th death test and check that its output matches the
+# expected regexp.  We can depend on one line of the output being
+#    Expected regex:<regex>
+# Prints "done" if we are not actually a death test (i.e. $1 is too
+# big a number, and we can stop).  Otherwise prints the matching output
+# line, or nothing if the output does not match the regexp; the caller
+# counts an empty result as a failure.
+OneDeathTest() {
+  "$DEBUGALLOCATION_TEST" "$1" 2>&1 | {
+    regex_line='dummy'
+    # Normally the regex_line is the first line of output, but not
+    # always (if tcmalloc itself does any logging to stderr).
+    while test -n "$regex_line"; do
+      read regex_line
+      regex=`expr "$regex_line" : "Expected regex:\(.*\)"`
+      test -n "$regex" && break   # found the regex line
+    done
+    test -z "$regex" && echo "done" || grep "$regex" 2>&1
+  }
+}
+
+death_test_num=0   # which death test to run
+while :; do        # same as 'while true', but more portable
+  echo -n "Running death test $death_test_num..."
+  output="`OneDeathTest $death_test_num`"
+  case $output in
+     # Empty string means grep didn't find anything.
+     "")      echo "FAILED"; num_failures=`expr $num_failures + 1`;;
+     "done"*) echo "done with death tests"; break;;
+     # Any other string means grep found something, like it ought to.
+     *)       echo "OK";;
+  esac
+  death_test_num=`expr $death_test_num + 1`
+done
+
+# Test the non-death parts of the test too
+echo -n "Running non-death tests..."
+if "$DEBUGALLOCATION_TEST"; then
+  echo "OK"
+else
+  echo "FAILED"
+  num_failures=`expr $num_failures + 1`
+fi
+
+if [ "$num_failures" = 0 ]; then
+  echo "PASS"
+else
+  echo "Failed with $num_failures failures"
+fi
+exit $num_failures
diff --git a/src/tests/frag_unittest.cc b/src/tests/frag_unittest.cc
new file mode 100644
index 0000000..c4016f9
--- /dev/null
+++ b/src/tests/frag_unittest.cc
@@ -0,0 +1,133 @@
+// -*- Mode: C++; c-basic-offset: 2; indent-tabs-mode: nil -*-
+// Copyright (c) 2003, Google Inc.
+// All rights reserved.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+// 
+//     * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+//     * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+// 
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+// ---
+// Author: Sanjay Ghemawat
+//
+// Test speed of handling fragmented heap
+
+#include "config_for_unittests.h"
+#include <stdlib.h>
+#include <stdio.h>
+#ifdef HAVE_SYS_RESOURCE_H
+#include <sys/time.h>           // for struct timeval
+#include <sys/resource.h>       // for getrusage
+#endif
+#ifdef _WIN32
+#include <windows.h>            // for GetTickCount()
+#endif
+#include <vector>
+#include "base/logging.h"
+#include "common.h"
+#include <gperftools/malloc_extension.h>
+
+using std::vector;
+
+int main(int argc, char** argv) {
+  // Make kAllocSize one page larger than the maximum small object size.
+  static const int kAllocSize = kMaxSize + kPageSize;
+  // Allocate 400MB in total.
+  static const int kTotalAlloc = 400 << 20;
+  static const int kAllocIterations = kTotalAlloc / kAllocSize;
+
+  // Allocate lots of objects
+  vector<char*> saved(kAllocIterations);
+  for (int i = 0; i < kAllocIterations; i++) {
+    saved[i] = new char[kAllocSize];
+  }
+
+  // Check the current "slack".
+  size_t slack_before;
+  MallocExtension::instance()->GetNumericProperty("tcmalloc.slack_bytes",
+                                                  &slack_before);
+
+  // Free alternating ones to fragment heap
+  size_t free_bytes = 0;
+  for (int i = 0; i < saved.size(); i += 2) {
+    delete[] saved[i];
+    free_bytes += kAllocSize;
+  }
+
+  // Check that slack delta is within 10% of expected.
+  size_t slack_after;
+  MallocExtension::instance()->GetNumericProperty("tcmalloc.slack_bytes",
+                                                  &slack_after);
+  CHECK_GE(slack_after, slack_before);
+  size_t slack = slack_after - slack_before;
+
+  CHECK_GT(double(slack), 0.9*free_bytes);
+  CHECK_LT(double(slack), 1.1*free_bytes);
+
+  // Dump malloc stats
+  static const int kBufSize = 1<<20;
+  char* buffer = new char[kBufSize];
+  MallocExtension::instance()->GetStats(buffer, kBufSize);
+  VLOG(1, "%s", buffer);
+  delete[] buffer;
+
+  // Now do timing tests
+  for (int i = 0; i < 5; i++) {
+    static const int kIterations = 100000;
+#ifdef HAVE_SYS_RESOURCE_H
+    struct rusage r;
+    getrusage(RUSAGE_SELF, &r);    // figure out user-time spent on this
+    struct timeval tv_start = r.ru_utime;
+#elif defined(_WIN32)
+    long long int tv_start = GetTickCount();
+#else
+# error No way to calculate time on your system
+#endif
+
+    for (int i = 0; i < kIterations; i++) {
+      size_t s;
+      MallocExtension::instance()->GetNumericProperty("tcmalloc.slack_bytes",
+                                                      &s);
+    }
+
+#ifdef HAVE_SYS_RESOURCE_H
+    getrusage(RUSAGE_SELF, &r);
+    struct timeval tv_end = r.ru_utime;
+    int64 sumsec = static_cast<int64>(tv_end.tv_sec) - tv_start.tv_sec;
+    int64 sumusec = static_cast<int64>(tv_end.tv_usec) - tv_start.tv_usec;
+#elif defined(_WIN32)
+    long long int tv_end = GetTickCount();
+    int64 sumsec = (tv_end - tv_start) / 1000;
+    // Resolution in windows is only to the millisecond, alas
+    int64 sumusec = ((tv_end - tv_start) % 1000) * 1000;
+#else
+# error No way to calculate time on your system
+#endif
+    fprintf(stderr, "getproperty: %6.1f ns/call\n",
+            (sumsec * 1e9 + sumusec * 1e3) / kIterations);
+  }
+
+  printf("PASS\n");
+  return 0;
+}
diff --git a/src/tests/getpc_test.cc b/src/tests/getpc_test.cc
new file mode 100644
index 0000000..d75e40b
--- /dev/null
+++ b/src/tests/getpc_test.cc
@@ -0,0 +1,123 @@
+// -*- Mode: C++; c-basic-offset: 2; indent-tabs-mode: nil -*-
+// Copyright (c) 2005, Google Inc.
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+//     * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+//     * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+// ---
+// Author: Craig Silverstein
+//
+// This verifies that GetPC works correctly.  This test uses a minimum
+// of Google infrastructure, to make it very easy to port to various
+// O/Ses and CPUs and test that GetPC is working.
+
+#include "config.h"
+#include "getpc.h"        // should be first to get the _GNU_SOURCE dfn
+#include <stdio.h>
+#include <stdlib.h>
+#include <signal.h>
+#include <sys/time.h>     // for setitimer
+
+// Needs to be volatile so compiler doesn't try to optimize it away
+static volatile void* getpc_retval = NULL;    // what GetPC returns
+static volatile bool prof_handler_called = false;
+
+static void prof_handler(int sig, siginfo_t*, void* signal_ucontext) {
+  if (!prof_handler_called)
+    getpc_retval = GetPC(*reinterpret_cast<ucontext_t*>(signal_ucontext));
+  prof_handler_called = true;  // only store the retval once
+}
+
+static void RoutineCallingTheSignal() {
+  struct sigaction sa;
+  sa.sa_sigaction = prof_handler;
+  sa.sa_flags = SA_RESTART | SA_SIGINFO;
+  sigemptyset(&sa.sa_mask);
+  if (sigaction(SIGPROF, &sa, NULL) != 0) {
+    perror("sigaction");
+    exit(1);
+  }
+
+  struct itimerval timer;
+  timer.it_interval.tv_sec = 0;
+  timer.it_interval.tv_usec = 1000;
+  timer.it_value = timer.it_interval;
+  setitimer(ITIMER_PROF, &timer, 0);
+
+  // Now we need to do some work for a while, that doesn't call any
+  // other functions, so we can be guaranteed that when the SIGPROF
+  // fires, we're the routine executing.
+  int r = 0;
+  for (int i = 0; !prof_handler_called; ++i) {
+    for (int j = 0; j < i; j++) {
+      r ^= i;
+      r <<= 1;
+      r ^= j;
+      r >>= 1;
+    }
+  }
+
+  // Now make sure the above loop doesn't get optimized out
+  srand(r);
+}
+
+// This is an upper bound of how many bytes the instructions for
+// RoutineCallingTheSignal might be.  There's probably a more
+// principled way to do this, but I don't know how portable it would be.
+// (The function is 372 bytes when compiled with -g on Mac OS X 10.4.
+// I can imagine it would be even bigger in 64-bit architectures.)
+const int kRoutineSize = 512 * sizeof(void*)/4;    // allow 1024 for 64-bit
+
+int main(int argc, char** argv) {
+  RoutineCallingTheSignal();
+
+  // Annoyingly, C++ disallows casting pointer-to-function to
+  // pointer-to-object, so we use a C-style cast instead.
+  char* expected = (char*)&RoutineCallingTheSignal;
+  char* actual = (char*)getpc_retval;
+
+  // For ia64, ppc64v1, and parisc64, the function pointer is actually
+  // a struct.  For instance, ia64's dl-fptr.h:
+  //   struct fdesc {          /* An FDESC is a function descriptor.  */
+  //      ElfW(Addr) ip;      /* code entry point */
+  //      ElfW(Addr) gp;      /* global pointer */
+  //   };
+  // We want the code entry point.
+  // NOTE: ppc64 ELFv2 (Little Endian) does not use function descriptors
+#if defined(__ia64) || \
+    (defined(__powerpc64__) && _CALL_ELF != 2)
+  expected = ((char**)expected)[0];         // this is "ip"
+#endif
+
+  if (actual < expected || actual > expected + kRoutineSize) {
+    printf("Test FAILED: actual PC: %p, expected PC: %p\n", actual, expected);
+    return 1;
+  } else {
+    printf("PASS\n");
+    return 0;
+  }
+}
diff --git a/src/tests/heap-checker-death_unittest.sh b/src/tests/heap-checker-death_unittest.sh
new file mode 100755
index 0000000..752a7ad
--- /dev/null
+++ b/src/tests/heap-checker-death_unittest.sh
@@ -0,0 +1,176 @@
+#!/bin/sh
+# Copyright (c) 2005, Google Inc.
+# All rights reserved.
+# 
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are
+# met:
+# 
+#     * Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+#     * Redistributions in binary form must reproduce the above
+# copyright notice, this list of conditions and the following disclaimer
+# in the documentation and/or other materials provided with the
+# distribution.
+#     * Neither the name of Google Inc. nor the names of its
+# contributors may be used to endorse or promote products derived from
+# this software without specific prior written permission.
+# 
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+# ---
+# Author: Maxim Lifantsev
+#
+# Run the heap checker unittest in a mode where it is supposed to crash and
+# return an error if it doesn't.
+
+# We expect BINDIR to be set in the environment.
+# If not, we set it to some reasonable value.
+BINDIR="${BINDIR:-.}"
+
+if [ "x$1" = "x-h" -o "x$1" = "x--help" ]; then
+  echo "USAGE: $0 [unittest dir]"
+  echo "       By default, unittest_dir=$BINDIR"
+  exit 1
+fi
+
+EXE="${1:-$BINDIR/heap-checker_unittest}"
+TMPDIR="/tmp/heap_check_death_info"
+
+ALARM() {
+  # You need perl to run pprof, so I assume it's installed
+  perl -e '
+    $timeout=$ARGV[0]; shift;
+    $retval = 255;   # the default retval, for the case where we timed out
+    eval {           # need to run in an eval-block to trigger during system()
+      local $SIG{ALRM} = sub { die "alarm\n" };  # \n is required!
+      alarm $timeout;
+      $retval = system(@ARGV);
+      # Make retval bash-style: exit status, or 128+n if terminated by signal n
+      $retval = ($retval & 127) ? (128 + $retval) : ($retval >> 8);
+      alarm 0;
+    };
+    exit $retval;  # return system()-retval, or 255 if system() never returned
+' "$@"
+}
+
+# $1: timeout for alarm;
+# $2: regexp of expected exit code(s);
+# $3: regexp to match a line in the output;
+# $4: regexp to not match a line in the output;
+# $5+ args to pass to $EXE
+Test() {
+  # Note: make sure these varnames don't conflict with any vars outside Test()!
+  timeout="$1"
+  shift
+  expected_ec="$1"
+  shift
+  expected_regexp="$1"
+  shift
+  unexpected_regexp="$1"
+  shift
+
+  echo -n "Testing $EXE with $@ ... "
+  output="$TMPDIR/output"
+  ALARM $timeout env "$@" $EXE > "$output" 2>&1
+  actual_ec=$?
+  ec_ok=`expr "$actual_ec" : "$expected_ec$" >/dev/null || echo false`
+  matches_ok=`test -z "$expected_regexp" || \
+              grep "$expected_regexp" "$output" >/dev/null 2>&1 || echo false`
+  negmatches_ok=`test -z "$unexpected_regexp" || \
+                 ! grep "$unexpected_regexp" "$output" >/dev/null 2>&1 || echo false`
+  if $ec_ok && $matches_ok && $negmatches_ok; then
+    echo "PASS"
+    return 0  # 0: success
+  fi
+  # If we get here, we failed.  Now we just need to report why
+  echo "FAIL"
+  if [ $actual_ec -eq 255 ]; then  # 255 == SIGTERM due to $ALARM
+    echo "Test was taking unexpectedly long time to run and so we aborted it."
+    echo "Try the test case manually or raise the timeout from $timeout"
+    echo "to distinguish test slowness from a real problem."
+  else
+    $ec_ok || \
+      echo "Wrong exit code: expected: '$expected_ec'; actual: $actual_ec"
+    $matches_ok || \
+      echo "Output did not match '$expected_regexp'"
+    $negmatches_ok || \
+      echo "Output unexpectedly matched '$unexpected_regexp'"
+  fi
+  echo "Output from failed run:"
+  echo "---"
+  cat "$output"
+  echo "---"
+  return 1  # 1: failure
+}
+
+TMPDIR=/tmp/heap_check_death_info
+rm -rf $TMPDIR || exit 1
+mkdir $TMPDIR || exit 2
+
+export HEAPCHECK=strict       # default mode
+
+# These invocations should pass (0 == PASS):
+
+# This tests that turning leak-checker off dynamically works fine
+Test 120 0 "^PASS$" "" HEAPCHECK="" || exit 1
+
+# This disables threads so we can cause leaks reliably and test finding them
+Test 120 0 "^PASS$" "" HEAP_CHECKER_TEST_NO_THREADS=1 || exit 2
+
+# Test that --test_cancel_global_check works
+Test 20 0 "Canceling .* whole-program .* leak check$" "" \
+  HEAP_CHECKER_TEST_TEST_LEAK=1 HEAP_CHECKER_TEST_TEST_CANCEL_GLOBAL_CHECK=1 || exit 3
+Test 20 0 "Canceling .* whole-program .* leak check$" "" \
+  HEAP_CHECKER_TEST_TEST_LOOP_LEAK=1 HEAP_CHECKER_TEST_TEST_CANCEL_GLOBAL_CHECK=1 || exit 4
+
+# Test that very early log messages are present and controllable:
+EARLY_MSG="Starting tracking the heap$"
+
+Test 60 0 "$EARLY_MSG" "" \
+  HEAPCHECK="" HEAP_CHECKER_TEST_TEST_LEAK=1 HEAP_CHECKER_TEST_NO_THREADS=1 \
+  PERFTOOLS_VERBOSE=10 || exit 5
+Test 60 0 "MemoryRegionMap Init$" "" \
+  HEAPCHECK="" HEAP_CHECKER_TEST_TEST_LEAK=1 HEAP_CHECKER_TEST_NO_THREADS=1 \
+  PERFTOOLS_VERBOSE=11 || exit 6
+Test 60 0 "" "$EARLY_MSG" \
+  HEAPCHECK="" HEAP_CHECKER_TEST_TEST_LEAK=1 HEAP_CHECKER_TEST_NO_THREADS=1 \
+  PERFTOOLS_VERBOSE=-11 || exit 7
+
+# These invocations should fail with very high probability,
+# rather than return 0 or hang (1 == exit(1), 134 == abort(), 139 == SIGSEGV):
+
+Test 60 1 "Exiting .* because of .* leaks$" "" \
+  HEAP_CHECKER_TEST_TEST_LEAK=1 HEAP_CHECKER_TEST_NO_THREADS=1 || exit 8
+Test 60 1 "Exiting .* because of .* leaks$" "" \
+  HEAP_CHECKER_TEST_TEST_LOOP_LEAK=1 HEAP_CHECKER_TEST_NO_THREADS=1 || exit 9
+
+# Test that we produce a reasonable textual leak report.
+Test 60 1 "MakeALeak" "" \
+          HEAP_CHECKER_TEST_TEST_LEAK=1 HEAP_CHECKER_TEST_NO_THREADS=1 \
+  || exit 10
+
+# Test that very early log messages are present and controllable:
+Test 60 1 "Starting tracking the heap$" "" \
+  HEAP_CHECKER_TEST_TEST_LEAK=1 HEAP_CHECKER_TEST_NO_THREADS=1 PERFTOOLS_VERBOSE=10 \
+  || exit 11
+Test 60 1 "" "Starting tracking the heap" \
+  HEAP_CHECKER_TEST_TEST_LEAK=1 HEAP_CHECKER_TEST_NO_THREADS=1 PERFTOOLS_VERBOSE=-10 \
+  || exit 12
+
+cd /    # so we're not in TMPDIR when we delete it
+rm -rf $TMPDIR
+
+echo "PASS"
+
+exit 0
diff --git a/src/tests/heap-checker_unittest.cc b/src/tests/heap-checker_unittest.cc
new file mode 100644
index 0000000..8c8f865
--- /dev/null
+++ b/src/tests/heap-checker_unittest.cc
@@ -0,0 +1,1538 @@
+// -*- Mode: C++; c-basic-offset: 2; indent-tabs-mode: nil -*-
+// Copyright (c) 2005, Google Inc.
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+//     * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+//     * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+// ---
+// Author: Maxim Lifantsev
+//
+// Running:
+// ./heap-checker_unittest
+//
+// If the unittest crashes because it can't find pprof, try:
+// PPROF_PATH=/usr/local/someplace/bin/pprof ./heap-checker_unittest
+//
+// To test that the whole-program heap checker will actually cause a leak, try:
+// HEAPCHECK_TEST_LEAK= ./heap-checker_unittest
+// HEAPCHECK_TEST_LOOP_LEAK= ./heap-checker_unittest
+//
+// Note: Both of the above commands *should* abort with an error message.
+
+// CAVEAT: Do not use vector<> and string on-heap objects in this test,
+// otherwise the test can sometimes fail for tricky leak checks
+// when we want some allocated object not to be found live by the heap checker.
+// This can happen with memory allocators like tcmalloc that can allocate
+// heap objects back to back without any book-keeping data in between.
+// What happens is that end-of-storage pointers of a live vector
+// (or a string depending on the STL implementation used)
+// can happen to point to that other heap-allocated
+// object that is not reachable otherwise and that
+// we don't want to be reachable.
+//
+// The implication of this for real leak checking
+// is just one more chance for the liveness flood to be inexact
+// (see the comment in our .h file).
+
+#include "config_for_unittests.h"
+#ifdef HAVE_POLL_H
+#include <poll.h>
+#endif
+#if defined HAVE_STDINT_H
+#include <stdint.h>             // to get uint16_t (ISO naming madness)
+#elif defined HAVE_INTTYPES_H
+#include <inttypes.h>           // another place uint16_t might be defined
+#endif
+#include <sys/types.h>
+#include <stdlib.h>
+#include <errno.h>              // errno
+#ifdef HAVE_UNISTD_H
+#include <unistd.h>             // for sleep(), geteuid()
+#endif
+#ifdef HAVE_MMAP
+#include <sys/mman.h>
+#endif
+#include <fcntl.h>              // for open(), close()
+#ifdef HAVE_EXECINFO_H
+#include <execinfo.h>           // backtrace
+#endif
+#ifdef HAVE_GRP_H
+#include <grp.h>                // getgrent, getgrnam
+#endif
+#ifdef HAVE_PWD_H
+#include <pwd.h>
+#endif
+
+#include <algorithm>
+#include <iostream>             // for cout
+#include <iomanip>              // for hex
+#include <list>
+#include <map>
+#include <memory>
+#include <set>
+#include <string>
+#include <vector>
+
+#include "base/commandlineflags.h"
+#include "base/googleinit.h"
+#include "base/logging.h"
+#include "base/commandlineflags.h"
+#include "base/thread_lister.h"
+#include <gperftools/heap-checker.h>
+#include "memory_region_map.h"
+#include <gperftools/malloc_extension.h>
+#include <gperftools/stacktrace.h>
+
+// On systems (like freebsd) that don't define MAP_ANONYMOUS, use the old
+// form of the name instead.
+#ifndef MAP_ANONYMOUS
+# define MAP_ANONYMOUS MAP_ANON
+#endif
+
+using namespace std;
+
+// ========================================================================= //
+
+// TODO(maxim): write a shell script to test that these indeed crash us
+//              (i.e. we do detect leaks)
+//              Maybe add more such crash tests.
+
+DEFINE_bool(test_leak,
+            EnvToBool("HEAP_CHECKER_TEST_TEST_LEAK", false),
+            "If should cause a leak crash");
+DEFINE_bool(test_loop_leak,
+            EnvToBool("HEAP_CHECKER_TEST_TEST_LOOP_LEAK", false),
+            "If should cause a looped leak crash");
+DEFINE_bool(test_register_leak,
+            EnvToBool("HEAP_CHECKER_TEST_TEST_REGISTER_LEAK", false),
+            "If should cause a leak crash by hiding a pointer "
+            "that is only in a register");
+DEFINE_bool(test_cancel_global_check,
+            EnvToBool("HEAP_CHECKER_TEST_TEST_CANCEL_GLOBAL_CHECK", false),
+            "If should test HeapLeakChecker::CancelGlobalCheck "
+            "when --test_leak or --test_loop_leak are given; "
+            "the test should not fail then");
+DEFINE_bool(maybe_stripped,
+            EnvToBool("HEAP_CHECKER_TEST_MAYBE_STRIPPED", true),
+            "If we think we can be a stripped binary");
+DEFINE_bool(interfering_threads,
+            EnvToBool("HEAP_CHECKER_TEST_INTERFERING_THREADS", true),
+            "If we should use threads trying "
+            "to interfere with leak checking");
+DEFINE_bool(hoarding_threads,
+            EnvToBool("HEAP_CHECKER_TEST_HOARDING_THREADS", true),
+            "If threads (usually the manager thread) are known "
+            "to retain some old state in their global buffers, "
+            "so that it's hard to force leaks when threads are around");
+            // TODO(maxim): Change the default to false
+            // when the standard environment uses NPTL threads:
+            // they do not seem to have this problem.
+DEFINE_bool(no_threads,
+            EnvToBool("HEAP_CHECKER_TEST_NO_THREADS", false),
+            "If we should not use any threads");
+            // This is used so we can make can_create_leaks_reliably true
+            // for any pthread implementation and test with that.
+
+DECLARE_int64(heap_check_max_pointer_offset);   // heap-checker.cc
+DECLARE_string(heap_check);  // in heap-checker.cc
+
+#define WARN_IF(cond, msg)   LOG_IF(WARNING, cond, msg)
+
+// This is an evil macro!  Be very careful using it...
+#undef VLOG          // and we start by evilly overriding the logging.h VLOG
+#define VLOG(lvl)    if (FLAGS_verbose >= (lvl))  cout << "\n"
+// This is, likewise, evil
+#define LOGF         VLOG(INFO)
+
+static void RunHeapBusyThreads();  // below
+
+
+class Closure {
+ public:
+  virtual ~Closure() { }
+  virtual void Run() = 0;
+};
+
+class Callback0 : public Closure {
+ public:
+  typedef void (*FunctionSignature)();
+
+  inline Callback0(FunctionSignature f) : f_(f) {}
+  virtual void Run() { (*f_)(); delete this; }
+
+ private:
+  FunctionSignature f_;
+};
+
+template <class P1> class Callback1 : public Closure {
+ public:
+  typedef void (*FunctionSignature)(P1);
+
+  inline Callback1<P1>(FunctionSignature f, P1 p1) : f_(f), p1_(p1) {}
+  virtual void Run() { (*f_)(p1_); delete this; }
+
+ private:
+  FunctionSignature f_;
+  P1 p1_;
+};
+
+template <class P1, class P2> class Callback2 : public Closure {
+ public:
+  typedef void (*FunctionSignature)(P1,P2);
+
+  inline Callback2<P1,P2>(FunctionSignature f, P1 p1, P2 p2) : f_(f), p1_(p1), p2_(p2) {}
+  virtual void Run() { (*f_)(p1_, p2_); delete this; }
+
+ private:
+  FunctionSignature f_;
+  P1 p1_;
+  P2 p2_;
+};
+
+inline Callback0* NewCallback(void (*function)()) {
+  return new Callback0(function);
+}
+
+template <class P1>
+inline Callback1<P1>* NewCallback(void (*function)(P1), P1 p1) {
+  return new Callback1<P1>(function, p1);
+}
+
+template <class P1, class P2>
+inline Callback2<P1,P2>* NewCallback(void (*function)(P1,P2), P1 p1, P2 p2) {
+  return new Callback2<P1,P2>(function, p1, p2);
+}
+
+
+// Set to true at the end of main(), so threads know.  Not entirely
+// thread-safe, but probably good enough.
+static bool g_have_exited_main = false;
+
+// If we can reliably create leaks (i.e. make leaked object
+// really unreachable from any global data).
+static bool can_create_leaks_reliably = false;
+
+// We use a simple allocation wrapper
+// to make sure we wipe out the newly allocated objects
+// in case they still happened to contain some pointer data
+// accidentally left by the memory allocator.
+struct Initialized { };
+static Initialized initialized;
+void* operator new(size_t size, const Initialized&) {
+  // Below we use "p = new(initialized) Foo[1];" and  "delete[] p;"
+  // instead of "p = new(initialized) Foo;"
+  // when we need to delete an allocated object.
+  void* p = malloc(size);
+  memset(p, 0, size);
+  return p;
+}
+void* operator new[](size_t size, const Initialized&) {
+  char* p = new char[size];
+  memset(p, 0, size);
+  return p;
+}
+
+static void DoWipeStack(int n);  // defined below
+static void WipeStack() { DoWipeStack(20); }
+
+static void Pause() {
+  poll(NULL, 0, 77);  // time for thread activity in HeapBusyThreadBody
+
+  // Indirectly test malloc_extension.*:
+  CHECK(MallocExtension::instance()->VerifyAllMemory());
+  int blocks;
+  size_t total;
+  int histogram[kMallocHistogramSize];
+  if (MallocExtension::instance()
+       ->MallocMemoryStats(&blocks, &total, histogram)  &&  total != 0) {
+    VLOG(3) << "Malloc stats: " << blocks << " blocks of "
+            << total << " bytes";
+    for (int i = 0; i < kMallocHistogramSize; ++i) {
+      if (histogram[i]) {
+        VLOG(3) << "  Malloc histogram at " << i << " : " << histogram[i];
+      }
+    }
+  }
+  WipeStack();  // e.g. MallocExtension::VerifyAllMemory
+                // can leave pointers to heap objects on stack
+}
+
+// Make gcc think a pointer is "used"
+template <class T>
+static void Use(T** foo) {
+  VLOG(2) << "Dummy-using " << static_cast<void*>(*foo) << " at " << foo;
+}
+
+// Arbitrary value, but not such that xor'ing with it is likely
+// to map one valid pointer to another valid pointer:
+static const uintptr_t kHideMask =
+  static_cast<uintptr_t>(0xF03A5F7BF03A5F7BLL);
+
+// Helpers to hide a pointer from live data traversal.
+// We just xor the pointer so that (with high probability)
+// it's not a valid address of a heap object anymore.
+// Both Hide and UnHide must be executed within RunHidden() below
+// to prevent leaving stale data on active stack that can be a pointer
+// to a heap object that is not actually reachable via live variables.
+// (UnHide might leave heap pointer value for an object
+//  that will be deallocated but later another object
+//  can be allocated at the same heap address.)
+template <class T>
+static void Hide(T** ptr) {
+  // we cast values, not dereferenced pointers, so no aliasing issues:
+  *ptr = reinterpret_cast<T*>(reinterpret_cast<uintptr_t>(*ptr) ^ kHideMask);
+  VLOG(2) << "hid: " << static_cast<void*>(*ptr);
+}
+
+template <class T>
+static void UnHide(T** ptr) {
+  VLOG(2) << "unhiding: " << static_cast<void*>(*ptr);
+  // we cast values, not dereferenced pointers, so no aliasing issues:
+  *ptr = reinterpret_cast<T*>(reinterpret_cast<uintptr_t>(*ptr) ^ kHideMask);
+}
+
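+// Illustrative pairing (this is what DoAllocHidden/DoDeAllocHidden below do):
+//   void* p = new(initialized) char[size];
+//   Hide(&p);     // p no longer looks like a heap address, so the
+//                 // liveness flood cannot reach the object through it
+//   ...
+//   UnHide(&p);   // restore the real address before deallocating
+//   delete [] reinterpret_cast<char*>(p);
+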
+static void LogHidden(const char* message, const void* ptr) {
+  LOGF << message << " : "
+       << ptr << " ^ " << reinterpret_cast<void*>(kHideMask) << endl;
+}
+
+// volatile to fool the compiler against inlining the calls to these
+void (*volatile run_hidden_ptr)(Closure* c, int n);
+void (*volatile wipe_stack_ptr)(int n);
+
+static void DoRunHidden(Closure* c, int n) {
+  if (n) {
+    VLOG(10) << "Level " << n << " at " << &n;
+    (*run_hidden_ptr)(c, n-1);
+    (*wipe_stack_ptr)(n);
+    sleep(0);  // undo -foptimize-sibling-calls
+  } else {
+    c->Run();
+  }
+}
+
+/*static*/ void DoWipeStack(int n) {
+  VLOG(10) << "Wipe level " << n << " at " << &n;
+  if (n) {
+    const int sz = 30;
+    volatile int arr[sz] ATTRIBUTE_UNUSED;
+    for (int i = 0; i < sz; ++i) arr[i] = 0;
+    (*wipe_stack_ptr)(n-1);
+    sleep(0);  // undo -foptimize-sibling-calls
+  }
+}
+
+// This executes closure c several stack frames down from the current one
+// and then makes an effort to also wipe out the stack data that was used by
+// the closure.
+// This way we prevent leak checker from finding any temporary pointers
+// of the closure execution on the stack and deciding that
+// these pointers (and the pointed objects) are still live.
+static void RunHidden(Closure* c) {
+  DoRunHidden(c, 15);
+  DoWipeStack(20);
+}
+
+static void DoAllocHidden(size_t size, void** ptr) {
+  void* p = new(initialized) char[size];
+  Hide(&p);
+  Use(&p);  // use only hidden versions
+  VLOG(2) << "Allocated hidden " << p << " at " << &p;
+  *ptr = p;  // assign the hidden versions
+}
+
+static void* AllocHidden(size_t size) {
+  void* r;
+  RunHidden(NewCallback(DoAllocHidden, size, &r));
+  return r;
+}
+
+static void DoDeAllocHidden(void** ptr) {
+  Use(ptr);  // use only hidden versions
+  void* p = *ptr;
+  VLOG(2) << "Deallocating hidden " << p;
+  UnHide(&p);
+  delete [] reinterpret_cast<char*>(p);
+}
+
+static void DeAllocHidden(void** ptr) {
+  RunHidden(NewCallback(DoDeAllocHidden, ptr));
+  *ptr = NULL;
+  Use(ptr);
+}
+
+void PreventHeapReclaiming(size_t size) {
+#ifdef NDEBUG
+  if (true) {
+    static void** no_reclaim_list = NULL;
+    CHECK(size >= sizeof(void*));
+    // We can't use malloc_reclaim_memory flag in opt mode as debugallocation.cc
+    // is not used. Instead we allocate a bunch of heap objects that are
+    // of the same size as what we are going to leak to ensure that the object
+    // we are about to leak is not at the same address as some old allocated
+    // and freed object that might still have pointers leading to it.
+    for (int i = 0; i < 100; ++i) {
+      void** p = reinterpret_cast<void**>(new(initialized) char[size]);
+      p[0] = no_reclaim_list;
+      no_reclaim_list = p;
+    }
+  }
+#endif
+}
+
+static bool RunSilent(HeapLeakChecker* check,
+                      bool (HeapLeakChecker::* func)()) {
+  // By default, don't print the 'we detected a leak' message in the
+  // cases we're expecting a leak (we still print when --v is >= 1).
+  // This way, the logging output is less confusing: we only print
+  // "we detected a leak", and how to diagnose it, for *unexpected* leaks.
+  int32 old_FLAGS_verbose = FLAGS_verbose;
+  if (!VLOG_IS_ON(1))             // not on a verbose setting
+    FLAGS_verbose = FATAL;        // only log fatal errors
+  const bool retval = (check->*func)();
+  FLAGS_verbose = old_FLAGS_verbose;
+  return retval;
+}
+
+#define RUN_SILENT(check, func)  RunSilent(&(check), &HeapLeakChecker::func)
+
+enum CheckType { SAME_HEAP, NO_LEAKS };
+
+static void VerifyLeaks(HeapLeakChecker* check, CheckType type,
+                        int leaked_bytes, int leaked_objects) {
+  WipeStack();  // to help with can_create_leaks_reliably
+  const bool no_leaks =
+    type == NO_LEAKS ? RUN_SILENT(*check, BriefNoLeaks)
+                     : RUN_SILENT(*check, BriefSameHeap);
+  if (can_create_leaks_reliably) {
+    // these might still fail occasionally, but it should be very rare
+    CHECK_EQ(no_leaks, false);
+    CHECK_EQ(check->BytesLeaked(), leaked_bytes);
+    CHECK_EQ(check->ObjectsLeaked(), leaked_objects);
+  } else {
+    WARN_IF(no_leaks != false,
+            "Expected leaks not found: "
+            "Some liveness flood must be too optimistic");
+  }
+}
+
+// allocates but does not deallocate
+static void TestHeapLeakCheckerDeathSimple() {
+  HeapLeakChecker check("death_simple");
+  void* foo = AllocHidden(100 * sizeof(int));
+  Use(&foo);
+  void* bar = AllocHidden(300);
+  Use(&bar);
+  LogHidden("Leaking", foo);
+  LogHidden("Leaking", bar);
+  Pause();
+  VerifyLeaks(&check, NO_LEAKS, 300 + 100 * sizeof(int), 2);
+  DeAllocHidden(&foo);
+  DeAllocHidden(&bar);
+}
+
+static void MakeDeathLoop(void** arr1, void** arr2) {
+  PreventHeapReclaiming(2 * sizeof(void*));
+  void** a1 = new(initialized) void*[2];
+  void** a2 = new(initialized) void*[2];
+  a1[1] = reinterpret_cast<void*>(a2);
+  a2[1] = reinterpret_cast<void*>(a1);
+  Hide(&a1);
+  Hide(&a2);
+  Use(&a1);
+  Use(&a2);
+  VLOG(2) << "Made hidden loop at " << &a1 << " to " << arr1;
+  *arr1 = a1;
+  *arr2 = a2;
+}
+
+// allocates two objects linked together and does not deallocate them
+static void TestHeapLeakCheckerDeathLoop() {
+  HeapLeakChecker check("death_loop");
+  void* arr1;
+  void* arr2;
+  RunHidden(NewCallback(MakeDeathLoop, &arr1, &arr2));
+  Use(&arr1);
+  Use(&arr2);
+  LogHidden("Leaking", arr1);
+  LogHidden("Leaking", arr2);
+  Pause();
+  VerifyLeaks(&check, NO_LEAKS, 4 * sizeof(void*), 2);
+  DeAllocHidden(&arr1);
+  DeAllocHidden(&arr2);
+}
+
+// deallocates more than it allocates
+static void TestHeapLeakCheckerDeathInverse() {
+  void* bar = AllocHidden(250 * sizeof(int));
+  Use(&bar);
+  LogHidden("Pre leaking", bar);
+  Pause();
+  HeapLeakChecker check("death_inverse");
+  void* foo = AllocHidden(100 * sizeof(int));
+  Use(&foo);
+  LogHidden("Leaking", foo);
+  DeAllocHidden(&bar);
+  Pause();
+  VerifyLeaks(&check, SAME_HEAP,
+              100 * static_cast<int64>(sizeof(int)),
+              1);
+  DeAllocHidden(&foo);
+}
+
+// deallocates more than it allocates
+static void TestHeapLeakCheckerDeathNoLeaks() {
+  void* foo = AllocHidden(100 * sizeof(int));
+  Use(&foo);
+  void* bar = AllocHidden(250 * sizeof(int));
+  Use(&bar);
+  HeapLeakChecker check("death_noleaks");
+  DeAllocHidden(&bar);
+  CHECK_EQ(check.BriefNoLeaks(), true);
+  DeAllocHidden(&foo);
+}
+
+// have fewer objects
+static void TestHeapLeakCheckerDeathCountLess() {
+  void* bar1 = AllocHidden(50 * sizeof(int));
+  Use(&bar1);
+  void* bar2 = AllocHidden(50 * sizeof(int));
+  Use(&bar2);
+  LogHidden("Pre leaking", bar1);
+  LogHidden("Pre leaking", bar2);
+  Pause();
+  HeapLeakChecker check("death_count_less");
+  void* foo = AllocHidden(100 * sizeof(int));
+  Use(&foo);
+  LogHidden("Leaking", foo);
+  DeAllocHidden(&bar1);
+  DeAllocHidden(&bar2);
+  Pause();
+  VerifyLeaks(&check, SAME_HEAP,
+              100 * sizeof(int),
+              1);
+  DeAllocHidden(&foo);
+}
+
+// have more objects
+static void TestHeapLeakCheckerDeathCountMore() {
+  void* foo = AllocHidden(100 * sizeof(int));
+  Use(&foo);
+  LogHidden("Pre leaking", foo);
+  Pause();
+  HeapLeakChecker check("death_count_more");
+  void* bar1 = AllocHidden(50 * sizeof(int));
+  Use(&bar1);
+  void* bar2 = AllocHidden(50 * sizeof(int));
+  Use(&bar2);
+  LogHidden("Leaking", bar1);
+  LogHidden("Leaking", bar2);
+  DeAllocHidden(&foo);
+  Pause();
+  VerifyLeaks(&check, SAME_HEAP,
+              100 * sizeof(int),
+              2);
+  DeAllocHidden(&bar1);
+  DeAllocHidden(&bar2);
+}
+
+static void TestHiddenPointer() {
+  int i;
+  void* foo = &i;
+  HiddenPointer<void> p(foo);
+  CHECK_EQ(foo, p.get());
+
+  // Confirm pointer doesn't appear to contain a byte sequence
+  // that == the pointer.  We don't really need to test that
+  // the xor trick itself works, as without it nothing in this
+  // test suite would work.  See the Hide/Unhide/*Hidden* set
+  // of helper methods.
+  void **pvoid = reinterpret_cast<void**>(&p);
+  CHECK_NE(foo, *pvoid);
+}
+
+// simple tests that deallocate what they allocated
+static void TestHeapLeakChecker() {
+  { HeapLeakChecker check("trivial");
+    int foo = 5;
+    int* p = &foo;
+    Use(&p);
+    Pause();
+    CHECK(check.BriefSameHeap());
+  }
+  Pause();
+  { HeapLeakChecker check("simple");
+    void* foo = AllocHidden(100 * sizeof(int));
+    Use(&foo);
+    void* bar = AllocHidden(200 * sizeof(int));
+    Use(&bar);
+    DeAllocHidden(&foo);
+    DeAllocHidden(&bar);
+    Pause();
+    CHECK(check.BriefSameHeap());
+  }
+}
+
+// no false positives
+static void TestHeapLeakCheckerNoFalsePositives() {
+  { HeapLeakChecker check("trivial_p");
+    int foo = 5;
+    int* p = &foo;
+    Use(&p);
+    Pause();
+    CHECK(check.BriefSameHeap());
+  }
+  Pause();
+  { HeapLeakChecker check("simple_p");
+    void* foo = AllocHidden(100 * sizeof(int));
+    Use(&foo);
+    void* bar = AllocHidden(200 * sizeof(int));
+    Use(&bar);
+    DeAllocHidden(&foo);
+    DeAllocHidden(&bar);
+    Pause();
+    CHECK(check.SameHeap());
+  }
+}
+
+// test that we detect leaks when we have same total # of bytes and
+// objects, but different individual object sizes
+static void TestLeakButTotalsMatch() {
+  void* bar1 = AllocHidden(240 * sizeof(int));
+  Use(&bar1);
+  void* bar2 = AllocHidden(160 * sizeof(int));
+  Use(&bar2);
+  LogHidden("Pre leaking", bar1);
+  LogHidden("Pre leaking", bar2);
+  Pause();
+  HeapLeakChecker check("trick");
+  void* foo1 = AllocHidden(280 * sizeof(int));
+  Use(&foo1);
+  void* foo2 = AllocHidden(120 * sizeof(int));
+  Use(&foo2);
+  LogHidden("Leaking", foo1);
+  LogHidden("Leaking", foo2);
+  DeAllocHidden(&bar1);
+  DeAllocHidden(&bar2);
+  Pause();
+
+  // foo1 and foo2 leaked
+  VerifyLeaks(&check, NO_LEAKS, (280+120)*sizeof(int), 2);
+
+  DeAllocHidden(&foo1);
+  DeAllocHidden(&foo2);
+}
+
+// no false negatives from pprof
+static void TestHeapLeakCheckerDeathTrick() {
+  void* bar1 = AllocHidden(240 * sizeof(int));
+  Use(&bar1);
+  void* bar2 = AllocHidden(160 * sizeof(int));
+  Use(&bar2);
+  HeapLeakChecker check("death_trick");
+  DeAllocHidden(&bar1);
+  DeAllocHidden(&bar2);
+  void* foo1 = AllocHidden(280 * sizeof(int));
+  Use(&foo1);
+  void* foo2 = AllocHidden(120 * sizeof(int));
+  Use(&foo2);
+  // TODO(maxim): use the above if we make pprof work in automated test runs
+  if (!FLAGS_maybe_stripped) {
+    CHECK_EQ(RUN_SILENT(check, SameHeap), false);
+      // pprof checking should catch the leak
+  } else {
+    WARN_IF(RUN_SILENT(check, SameHeap) != false,
+            "death_trick leak is not caught; "
+            "we must be using a stripped binary");
+  }
+  DeAllocHidden(&foo1);
+  DeAllocHidden(&foo2);
+}
+
+// simple leak
+static void TransLeaks() {
+  AllocHidden(1 * sizeof(char));
+}
+
+// range-based disabling using Disabler
+static void ScopedDisabledLeaks() {
+  HeapLeakChecker::Disabler disabler;
+  AllocHidden(3 * sizeof(int));
+  TransLeaks();
+  (void)malloc(10);  // Direct leak
+}
+
+// have different disabled leaks
+static void* RunDisabledLeaks(void* a) {
+  ScopedDisabledLeaks();
+  return a;
+}
+
+// have different disabled leaks inside of a thread
+static void ThreadDisabledLeaks() {
+  if (FLAGS_no_threads)  return;
+  pthread_t tid;
+  pthread_attr_t attr;
+  CHECK_EQ(pthread_attr_init(&attr), 0);
+  CHECK_EQ(pthread_create(&tid, &attr, RunDisabledLeaks, NULL), 0);
+  void* res;
+  CHECK_EQ(pthread_join(tid, &res), 0);
+}
+
+// different disabled leaks (some in threads)
+static void TestHeapLeakCheckerDisabling() {
+  HeapLeakChecker check("disabling");
+
+  RunDisabledLeaks(NULL);
+  RunDisabledLeaks(NULL);
+  ThreadDisabledLeaks();
+  RunDisabledLeaks(NULL);
+  ThreadDisabledLeaks();
+  ThreadDisabledLeaks();
+
+  Pause();
+
+  CHECK(check.SameHeap());
+}
+
+typedef set<int> IntSet;
+
+static int some_ints[] = { 1, 2, 3, 21, 22, 23, 24, 25 };
+
+static void DoTestSTLAlloc() {
+  IntSet* x = new(initialized) IntSet[1];
+  *x = IntSet(some_ints, some_ints + 6);
+  for (int i = 0; i < 1000; i++) {
+    x->insert(i*3);
+  }
+  delete [] x;
+}
+
+// Check that normal STL usage does not result in a leak report.
+// (In particular we test that there's no complex STL's own allocator
+// running on top of our allocator with hooks to heap profiler
+// that can result in false leak report in this case.)
+static void TestSTLAlloc() {
+  HeapLeakChecker check("stl");
+  RunHidden(NewCallback(DoTestSTLAlloc));
+  CHECK_EQ(check.BriefSameHeap(), true);
+}
+
+static void DoTestSTLAllocInverse(IntSet** setx) {
+  IntSet* x = new(initialized) IntSet[1];
+  *x = IntSet(some_ints, some_ints + 3);
+  for (int i = 0; i < 100; i++) {
+    x->insert(i*2);
+  }
+  Hide(&x);
+  *setx = x;
+}
+
+static void FreeTestSTLAllocInverse(IntSet** setx) {
+  IntSet* x = *setx;
+  UnHide(&x);
+  delete [] x;
+}
+
+// Check that normal leaked STL usage *does* result in a leak report.
+// (In particular we test that there's no complex STL's own allocator
+// running on top of our allocator with hooks to heap profiler
+// that can result in false absence of leak report in this case.)
+static void TestSTLAllocInverse() {
+  HeapLeakChecker check("death_inverse_stl");
+  IntSet* x;
+  RunHidden(NewCallback(DoTestSTLAllocInverse, &x));
+  LogHidden("Leaking", x);
+  if (can_create_leaks_reliably) {
+    WipeStack();  // to help with can_create_leaks_reliably
+    // these might still fail occasionally, but it should be very rare
+    CHECK_EQ(RUN_SILENT(check, BriefNoLeaks), false);
+    CHECK_GE(check.BytesLeaked(), 100 * sizeof(int));
+    CHECK_GE(check.ObjectsLeaked(), 100);
+      // assumes set<>s are represented by some kind of binary tree
+      // or something else allocating >=1 heap object per set object
+  } else {
+    WARN_IF(RUN_SILENT(check, BriefNoLeaks) != false,
+            "Expected leaks not found: "
+            "Some liveness flood must be too optimistic");
+  }
+  RunHidden(NewCallback(FreeTestSTLAllocInverse, &x));
+}
+
+template<class Alloc>
+static void DirectTestSTLAlloc(Alloc allocator, const char* name) {
+  HeapLeakChecker check((string("direct_stl-") + name).c_str());
+  static const int kSize = 1000;
+  typename Alloc::pointer ptrs[kSize];
+  for (int i = 0; i < kSize; ++i) {
+    typename Alloc::pointer p = allocator.allocate(i*3+1);
+    HeapLeakChecker::IgnoreObject(p);
+    // This will crash if p is not known to heap profiler:
+    // (i.e. STL's "allocator" does not have a direct hook to heap profiler)
+    HeapLeakChecker::UnIgnoreObject(p);
+    ptrs[i] = p;
+  }
+  for (int i = 0; i < kSize; ++i) {
+    allocator.deallocate(ptrs[i], i*3+1);
+    ptrs[i] = NULL;
+  }
+  CHECK(check.BriefSameHeap());  // just in case
+}
+
+static struct group* grp = NULL;
+static const int kKeys = 50;
+static pthread_key_t key[kKeys];
+
+static void KeyFree(void* ptr) {
+  delete [] reinterpret_cast<char*>(ptr);
+}
+
+static bool key_init_has_run = false;
+
+static void KeyInit() {
+  for (int i = 0; i < kKeys; ++i) {
+    CHECK_EQ(pthread_key_create(&key[i], KeyFree), 0);
+    VLOG(2) << "pthread key " << i << " : " << key[i];
+  }
+  key_init_has_run = true;   // needed for a sanity-check
+}
+
+// force various C library static and thread-specific allocations
+static void TestLibCAllocate() {
+  CHECK(key_init_has_run);
+  for (int i = 0; i < kKeys; ++i) {
+    void* p = pthread_getspecific(key[i]);
+    if (NULL == p) {
+      if (i == 0) {
+        // Test-logging inside threads which (potentially) creates and uses
+        // thread-local data inside standard C++ library:
+        VLOG(0) << "Adding pthread-specifics for thread " << pthread_self()
+                << " pid " << getpid();
+      }
+      p = new(initialized) char[77 + i];
+      VLOG(2) << "pthread specific " << i << " : " << p;
+      pthread_setspecific(key[i], p);
+    }
+  }
+
+  strerror(errno);
+  const time_t now = time(NULL);
+  ctime(&now);
+#ifdef HAVE_EXECINFO_H
+  void *stack[1];
+  backtrace(stack, 1);
+#endif
+#ifdef HAVE_GRP_H
+  gid_t gid = getgid();
+  getgrgid(gid);
+  if (grp == NULL)  grp = getgrent();  // a race condition here is okay
+  getgrnam(grp->gr_name);
+#endif
+#ifdef HAVE_PWD_H
+  getpwuid(geteuid());
+#endif
+}
+
+// Continuous random heap memory activity to try to disrupt heap checking.
+static void* HeapBusyThreadBody(void* a) {
+  const int thread_num = reinterpret_cast<intptr_t>(a);
+  VLOG(0) << "A new HeapBusyThread " << thread_num;
+  TestLibCAllocate();
+
+  int user = 0;
+  // Try to hide ptr from heap checker in a CPU register:
+  // Here we are just making a best effort to put the only pointer
+  // to a heap object into a thread register to test
+  // the thread-register finding machinery in the heap checker.
+#if defined(__i386__) && defined(__GNUC__)
+  register int** ptr asm("esi");
+#elif defined(__x86_64__) && defined(__GNUC__)
+  register int** ptr asm("r15");
+#else
+  register int** ptr;
+#endif
+  ptr = NULL;
+  typedef set<int> Set;
+  Set s1;
+  while (1) {
+    // TestLibCAllocate() calls libc functions that don't work so well
+    // after main() has exited.  So we just don't do the test then.
+    if (!g_have_exited_main)
+      TestLibCAllocate();
+
+    if (ptr == NULL) {
+      ptr = new(initialized) int*[1];
+      *ptr = new(initialized) int[1];
+    }
+    set<int>* s2 = new(initialized) set<int>[1];
+    s1.insert(random());
+    s2->insert(*s1.begin());
+    user += *s2->begin();
+    **ptr += user;
+    if (random() % 51 == 0) {
+      s1.clear();
+      if (random() % 2 == 0) {
+        s1.~Set();
+        new(&s1) Set;
+      }
+    }
+    VLOG(3) << pthread_self() << " (" << getpid() << "): in wait: "
+            << ptr << ", " << *ptr << "; " << s1.size();
+    VLOG(2) << pthread_self() << " (" << getpid() << "): in wait, ptr = "
+            << reinterpret_cast<void*>(
+                 reinterpret_cast<uintptr_t>(ptr) ^ kHideMask)
+            << "^" << reinterpret_cast<void*>(kHideMask);
+    if (FLAGS_test_register_leak  &&  thread_num % 5 == 0) {
+      // Hide the register "ptr" value with an xor mask.
+      // If one provides --test_register_leak flag, the test should
+      // (with very high probability) crash on some leak check
+      // with a leak report (of some x * sizeof(int) + y * sizeof(int*) bytes)
+      // pointing at the two lines above in this function
+      // with "new(initialized) int" in them as the allocators
+      // of the leaked objects.
+      // CAVEAT: We can't really prevent the compiler from saving some
+      // temporary values of "ptr" on the stack and thus letting us find
+      // the heap objects not via the register.
+      // Hence it's normal if for certain compilers or optimization modes
+      // --test_register_leak does not cause a leak crash of the above form
+      // (this happens e.g. for gcc 4.0.1 in opt mode).
+      ptr = reinterpret_cast<int **>(
+          reinterpret_cast<uintptr_t>(ptr) ^ kHideMask);
+      // busy loop to get the thread interrupted at:
+      for (int i = 1; i < 10000000; ++i)  user += (1 + user * user * 5) / i;
+      ptr = reinterpret_cast<int **>(
+          reinterpret_cast<uintptr_t>(ptr) ^ kHideMask);
+    } else {
+      poll(NULL, 0, random() % 100);
+    }
+    VLOG(2) << pthread_self() << ": continuing";
+    if (random() % 3 == 0) {
+      delete [] *ptr;
+      delete [] ptr;
+      ptr = NULL;
+    }
+    delete [] s2;
+  }
+  return a;
+}
+
+static void RunHeapBusyThreads() {
+  KeyInit();
+  if (!FLAGS_interfering_threads || FLAGS_no_threads)  return;
+
+  const int n = 17;  // make many threads
+
+  pthread_t tid;
+  pthread_attr_t attr;
+  CHECK_EQ(pthread_attr_init(&attr), 0);
+  // make them and let them run
+  for (int i = 0; i < n; ++i) {
+    VLOG(0) << "Creating extra thread " << i + 1;
+    CHECK(pthread_create(&tid, &attr, HeapBusyThreadBody,
+                         reinterpret_cast<void*>(i)) == 0);
+  }
+
+  Pause();
+  Pause();
+}
+
+// ========================================================================= //
+
+// This code section tests that objects reachable from global
+// variables are not reported as leaks,
+// and that (Un)IgnoreObject work correctly for such objects.
+
+// Object-making functions:
+// each returns a "weird" pointer to a new object for which
+// it's worth checking that the object is reachable via that pointer.
+typedef void* (*ObjMakerFunc)();
+static list<ObjMakerFunc> obj_makers;  // list of registered object makers
+
+// Helper macro to register an object-making function.
+// 'name' is an identifier of this object maker,
+// 'body' is its function body, which must declare
+//        a pointer 'p' to the new object to return.
+// Usage example:
+//   REGISTER_OBJ_MAKER(trivial, int* p = new(initialized) int;)
+#define REGISTER_OBJ_MAKER(name, body) \
+  void* ObjMaker_##name##_() { \
+    VLOG(1) << "Obj making " << #name; \
+    body; \
+    return p; \
+  } \
+  static ObjMakerRegistrar maker_reg_##name##__(&ObjMaker_##name##_);
+// helper class for REGISTER_OBJ_MAKER
+struct ObjMakerRegistrar {
+  ObjMakerRegistrar(ObjMakerFunc obj_maker) { obj_makers.push_back(obj_maker); }
+};
+
+// List of the objects/pointers made with all the obj_makers
+// to test reachability via global data pointers during leak checks.
+static list<void*>* live_objects = new list<void*>;
+  // pointer so that it does not get destructed on exit
+
+// Exerciser for one ObjMakerFunc.
+static void TestPointerReach(ObjMakerFunc obj_maker) {
+  HeapLeakChecker::IgnoreObject(obj_maker());  // test IgnoreObject
+
+  void* obj = obj_maker();
+  HeapLeakChecker::IgnoreObject(obj);
+  HeapLeakChecker::UnIgnoreObject(obj);  // test UnIgnoreObject
+  HeapLeakChecker::IgnoreObject(obj);  // so we don't need to delete obj
+
+  live_objects->push_back(obj_maker());  // test reachability at leak check
+}
+
+// Test all ObjMakerFuncs registered via REGISTER_OBJ_MAKER.
+static void TestObjMakers() {
+  for (list<ObjMakerFunc>::const_iterator i = obj_makers.begin();
+       i != obj_makers.end(); ++i) {
+    TestPointerReach(*i);
+    TestPointerReach(*i);  // a couple more times would not hurt
+    TestPointerReach(*i);
+  }
+}
+
+// A dummy class to mimic the allocation behavior of strings.
+template<class T>
+struct Array {
+  Array() {
+    size = 3 + random() % 30;
+    ptr = new(initialized) T[size];
+  }
+  ~Array() { delete [] ptr; }
+  Array(const Array& x) {
+    size = x.size;
+    ptr = new(initialized) T[size];
+    for (size_t i = 0; i < size; ++i) {
+      ptr[i] = x.ptr[i];
+    }
+  }
+  void operator=(const Array& x) {
+    delete [] ptr;
+    size = x.size;
+    ptr = new(initialized) T[size];
+    for (size_t i = 0; i < size; ++i) {
+      ptr[i] = x.ptr[i];
+    }
+  }
+  void append(const Array& x) {
+    T* p = new(initialized) T[size + x.size];
+    for (size_t i = 0; i < size; ++i) {
+      p[i] = ptr[i];
+    }
+    for (size_t i = 0; i < x.size; ++i) {
+      p[size+i] = x.ptr[i];
+    }
+    size += x.size;
+    delete [] ptr;
+    ptr = p;
+  }
+ private:
+  size_t size;
+  T* ptr;
+};
+
+// to test pointers to objects, built-in arrays, string, etc:
+REGISTER_OBJ_MAKER(plain, int* p = new(initialized) int;)
+REGISTER_OBJ_MAKER(int_array_1, int* p = new(initialized) int[1];)
+REGISTER_OBJ_MAKER(int_array, int* p = new(initialized) int[10];)
+REGISTER_OBJ_MAKER(string, Array<char>* p = new(initialized) Array<char>();)
+REGISTER_OBJ_MAKER(string_array,
+                   Array<char>* p = new(initialized) Array<char>[5];)
+REGISTER_OBJ_MAKER(char_array, char* p = new(initialized) char[5];)
+REGISTER_OBJ_MAKER(appended_string,
+  Array<char>* p = new Array<char>();
+  p->append(Array<char>());
+)
+REGISTER_OBJ_MAKER(plain_ptr, int** p = new(initialized) int*;)
+REGISTER_OBJ_MAKER(linking_ptr,
+  int** p = new(initialized) int*;
+  *p = new(initialized) int;
+)
+
+// small objects:
+REGISTER_OBJ_MAKER(0_sized, void* p = malloc(0);)  // 0-sized object (important)
+REGISTER_OBJ_MAKER(1_sized, void* p = malloc(1);)
+REGISTER_OBJ_MAKER(2_sized, void* p = malloc(2);)
+REGISTER_OBJ_MAKER(3_sized, void* p = malloc(3);)
+REGISTER_OBJ_MAKER(4_sized, void* p = malloc(4);)
+
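+// Global (and const) STL sets whose heap-allocated contents must be treated
+// as live (reachable from global data) rather than reported as leaks: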
+static int set_data[] = { 1, 2, 3, 4, 5, 6, 7, 21, 22, 23, 24, 25, 26, 27 };
+static set<int> live_leak_set(set_data, set_data+7);
+static const set<int> live_leak_const_set(set_data, set_data+14);
+
+REGISTER_OBJ_MAKER(set,
+  set<int>* p = new(initialized) set<int>(set_data, set_data + 13);
+)
+
+class ClassA {
+ public:
+  explicit ClassA(int a) : ptr(NULL) { }
+  mutable char* ptr;
+};
+static const ClassA live_leak_mutable(1);
+
+template<class C>
+class TClass {
+ public:
+  explicit TClass(int a) : ptr(NULL) { }
+  mutable C val;
+  mutable C* ptr;
+};
+static const TClass<Array<char> > live_leak_templ_mutable(1);
+
+class ClassB {
+ public:
+  ClassB() { }
+  char b[7];
+  virtual void f() { }
+  virtual ~ClassB() { }
+};
+
+class ClassB2 {
+ public:
+  ClassB2() { }
+  char b2[11];
+  virtual void f2() { }
+  virtual ~ClassB2() { }
+};
+
+class ClassD1 : public ClassB {
+  char d1[15];
+  virtual void f() { }
+};
+
+class ClassD2 : public ClassB2 {
+  char d2[19];
+  virtual void f2() { }
+};
+
+class ClassD : public ClassD1, public ClassD2 {
+  char d[3];
+  virtual void f() { }
+  virtual void f2() { }
+};
+
+// to test base-class pointers to objects of derived classes:
+
+REGISTER_OBJ_MAKER(B,  ClassB*  p = new(initialized) ClassB;)
+REGISTER_OBJ_MAKER(D1, ClassD1* p = new(initialized) ClassD1;)
+REGISTER_OBJ_MAKER(D2, ClassD2* p = new(initialized) ClassD2;)
+REGISTER_OBJ_MAKER(D,  ClassD*  p = new(initialized) ClassD;)
+
+REGISTER_OBJ_MAKER(D1_as_B,  ClassB*  p = new(initialized) ClassD1;)
+REGISTER_OBJ_MAKER(D2_as_B2, ClassB2* p = new(initialized) ClassD2;)
+REGISTER_OBJ_MAKER(D_as_B,   ClassB*  p = new(initialized)  ClassD;)
+REGISTER_OBJ_MAKER(D_as_D1,  ClassD1* p = new(initialized) ClassD;)
+// inside-object pointers:
+REGISTER_OBJ_MAKER(D_as_B2,  ClassB2* p = new(initialized) ClassD;)
+REGISTER_OBJ_MAKER(D_as_D2,  ClassD2* p = new(initialized) ClassD;)
+
+class InterfaceA {
+ public:
+  virtual void A() = 0;
+  virtual ~InterfaceA() { }
+ protected:
+  InterfaceA() { }
+};
+
+class InterfaceB {
+ public:
+  virtual void B() = 0;
+  virtual ~InterfaceB() { }
+ protected:
+  InterfaceB() { }
+};
+
+class InterfaceC : public InterfaceA {
+ public:
+  virtual void C() = 0;
+  virtual ~InterfaceC() { }
+ protected:
+  InterfaceC() { }
+};
+
+class ClassMltD1 : public ClassB, public InterfaceB, public InterfaceC {
+ public:
+  char d1[11];
+  virtual void f() { }
+  virtual void A() { }
+  virtual void B() { }
+  virtual void C() { }
+};
+
+class ClassMltD2 : public InterfaceA, public InterfaceB, public ClassB {
+ public:
+  char d2[15];
+  virtual void f() { }
+  virtual void A() { }
+  virtual void B() { }
+};
+
+// to specifically test heap reachability under
+// interface-only multiple inheritance (some use inside-object pointers):
+REGISTER_OBJ_MAKER(MltD1,       ClassMltD1* p = new(initialized) ClassMltD1;)
+REGISTER_OBJ_MAKER(MltD1_as_B,  ClassB*     p = new(initialized) ClassMltD1;)
+REGISTER_OBJ_MAKER(MltD1_as_IA, InterfaceA* p = new(initialized) ClassMltD1;)
+REGISTER_OBJ_MAKER(MltD1_as_IB, InterfaceB* p = new(initialized) ClassMltD1;)
+REGISTER_OBJ_MAKER(MltD1_as_IC, InterfaceC* p = new(initialized) ClassMltD1;)
+
+REGISTER_OBJ_MAKER(MltD2,       ClassMltD2* p = new(initialized) ClassMltD2;)
+REGISTER_OBJ_MAKER(MltD2_as_B,  ClassB*     p = new(initialized) ClassMltD2;)
+REGISTER_OBJ_MAKER(MltD2_as_IA, InterfaceA* p = new(initialized) ClassMltD2;)
+REGISTER_OBJ_MAKER(MltD2_as_IB, InterfaceB* p = new(initialized) ClassMltD2;)
+
+// to mimic UnicodeString defined in third_party/icu,
+// which stores a platform-independent-sized refcount in the first
+// few bytes and keeps a pointer pointing just past the refcount.
+REGISTER_OBJ_MAKER(unicode_string,
+  char* p = new char[sizeof(uint32) * 10];
+  p += sizeof(uint32);
+)
+// similar, but for platform-dependent-sized refcount
+REGISTER_OBJ_MAKER(ref_counted,
+  char* p = new char[sizeof(int) * 20];
+  p += sizeof(int);
+)
+
+struct Nesting {
+  struct Inner {
+    Nesting* parent;
+    Inner(Nesting* p) : parent(p) {}
+  };
+  Inner i0;
+  char n1[5];
+  Inner i1;
+  char n2[11];
+  Inner i2;
+  char n3[27];
+  Inner i3;
+  Nesting() : i0(this), i1(this), i2(this), i3(this) {}
+};
+
+// to test inside-object pointers pointing at objects nested into heap objects:
+REGISTER_OBJ_MAKER(nesting_i0, Nesting::Inner* p = &((new Nesting())->i0);)
+REGISTER_OBJ_MAKER(nesting_i1, Nesting::Inner* p = &((new Nesting())->i1);)
+REGISTER_OBJ_MAKER(nesting_i2, Nesting::Inner* p = &((new Nesting())->i2);)
+REGISTER_OBJ_MAKER(nesting_i3, Nesting::Inner* p = &((new Nesting())->i3);)
+
+void (* volatile init_forcer)(...);
+
+// allocate many objects reachable from global data
+static void TestHeapLeakCheckerLiveness() {
+  live_leak_mutable.ptr = new(initialized) char[77];
+  live_leak_templ_mutable.ptr = new(initialized) Array<char>();
+  live_leak_templ_mutable.val = Array<char>();
+
+  // A smart compiler may see that live_leak_mutable is not used
+  // anywhere and conclude that the .ptr assignment above is dead.
+  //
+  // We force the compiler to assume that it is used by calling through a
+  // function-pointer variable (left NULL, which hopefully isn't known to
+  // the compiler) that takes the addresses of those objects.  So the
+  // compiler has to assume that .ptr is used.
+  if (init_forcer) {
+    init_forcer(&live_leak_mutable, &live_leak_templ_mutable);
+  }
+  TestObjMakers();
+}
+
+// ========================================================================= //
+
+// Get address (PC value) following the mmap call into addr_after_mmap_call
+static void* Mmapper(uintptr_t* addr_after_mmap_call) {
+  void* r = mmap(NULL, 100, PROT_READ|PROT_WRITE,
+                 MAP_PRIVATE|MAP_ANONYMOUS, -1, 0);
+  // Get current PC value into addr_after_mmap_call
+  void* stack[1];
+  CHECK_EQ(GetStackTrace(stack, 1, 0), 1);
+  *addr_after_mmap_call = reinterpret_cast<uintptr_t>(stack[0]);
+  sleep(0);  // undo -foptimize-sibling-calls
+  return r;
+}
+
+// On PPC64 the stacktrace returned by GetStackTrace contains the function
+// address from the .text segment, while function pointers point to ODP
+// entries.  The following code decodes the ODP entry to get the actual
+// symbol address.
+#if defined(__linux) && defined(__PPC64__) && (_CALL_ELF != 2)
+static inline uintptr_t GetFunctionAddress (void* (*func)(uintptr_t*))
+{
+  struct odp_entry_t {
+    unsigned long int symbol;
+    unsigned long int toc;
+    unsigned long int env;
+  } *odp_entry = reinterpret_cast<odp_entry_t*>(func);
+
+  return static_cast<uintptr_t>(odp_entry->symbol);
+}
+#else
+static inline uintptr_t GetFunctionAddress (void* (*func)(uintptr_t*))
+{
+  return reinterpret_cast<uintptr_t>(func);
+}
+#endif
+
+// to trick the compiler into not inlining Mmapper
+static void* (*mmapper_addr)(uintptr_t* addr) = &Mmapper;
+
+// TODO(maxim): copy/move this to memory_region_map_unittest
+// TODO(maxim): expand this test to include mmap64, mremap and sbrk calls.
+static void VerifyMemoryRegionMapStackGet() {
+  uintptr_t caller_addr_limit;
+  void* addr = (*mmapper_addr)(&caller_addr_limit);
+  uintptr_t caller = 0;
+  { MemoryRegionMap::LockHolder l;
+    for (MemoryRegionMap::RegionIterator
+           i = MemoryRegionMap::BeginRegionLocked();
+           i != MemoryRegionMap::EndRegionLocked(); ++i) {
+      if (i->start_addr == reinterpret_cast<uintptr_t>(addr)) {
+        CHECK_EQ(caller, 0);
+        caller = i->caller();
+      }
+    }
+  }
+  // caller must point into Mmapper function:
+  if (!(GetFunctionAddress(mmapper_addr) <= caller  &&
+        caller < caller_addr_limit)) {
+    LOGF << std::hex << "0x" << caller
+         << " does not seem to point into code of function Mmapper at "
+         << "0x" << reinterpret_cast<uintptr_t>(mmapper_addr)
+         << "! Stack frame collection must be off in MemoryRegionMap!";
+    LOG(FATAL, "\n");
+  }
+  munmap(addr, 100);
+}
+
+static void* Mallocer(uintptr_t* addr_after_malloc_call) {
+  void* r = malloc(100);
+  sleep(0);  // undo -foptimize-sibling-calls
+  // Get current PC value into addr_after_malloc_call
+  void* stack[1];
+  CHECK_EQ(GetStackTrace(stack, 1, 0), 1);
+  *addr_after_malloc_call = reinterpret_cast<uintptr_t>(stack[0]);
+  return r;
+}
+
+// to trick the compiler into not inlining Mallocer
+static void* (*mallocer_addr)(uintptr_t* addr) = &Mallocer;
+
+// non-static for friendship with HeapProfiler
+// TODO(maxim): expand this test to include
+// realloc, calloc, memalign, valloc, pvalloc, new, and new[].
+extern void VerifyHeapProfileTableStackGet() {
+  uintptr_t caller_addr_limit;
+  void* addr = (*mallocer_addr)(&caller_addr_limit);
+  uintptr_t caller =
+    reinterpret_cast<uintptr_t>(HeapLeakChecker::GetAllocCaller(addr));
+  // caller must point into Mallocer function:
+  if (!(GetFunctionAddress(mallocer_addr) <= caller  &&
+        caller < caller_addr_limit)) {
+    LOGF << std::hex << "0x" << caller
+         << " does not seem to point into code of function Mallocer at "
+         << "0x" << reinterpret_cast<uintptr_t>(mallocer_addr)
+         << "! Stack frame collection must be off in heap profiler!";
+    LOG(FATAL, "\n");
+  }
+  free(addr);
+}
+
+// ========================================================================= //
+
+static void MakeALeak(void** arr) {
+  PreventHeapReclaiming(10 * sizeof(int));
+  void* a = new(initialized) int[10];
+  Hide(&a);
+  *arr = a;
+}
+
+// Helper to do 'return 0;' inside main(): instead we do 'return Pass();'
+static int Pass() {
+  fprintf(stdout, "PASS\n");
+  g_have_exited_main = true;
+  return 0;
+}
+
+int main(int argc, char** argv) {
+  run_hidden_ptr = DoRunHidden;
+  wipe_stack_ptr = DoWipeStack;
+  if (!HeapLeakChecker::IsActive()) {
+    CHECK_EQ(FLAGS_heap_check, "");
+    LOG(WARNING, "HeapLeakChecker got turned off; we won't test much...");
+  } else {
+    VerifyMemoryRegionMapStackGet();
+    VerifyHeapProfileTableStackGet();
+  }
+
+  KeyInit();
+
+  // glibc 2.4, on x86_64 at least, has a lock-ordering bug, which
+  // means deadlock is possible when one thread calls dl_open at the
+  // same time another thread is calling dl_iterate_phdr.  libunwind
+  // calls dl_iterate_phdr, and TestLibCAllocate calls dl_open (or the
+  // various syscalls in it do), at least the first time it's run.
+  // To avoid the deadlock, we run TestLibCAllocate once before getting
+  // multi-threaded.
+  // TODO(csilvers): once libc is fixed, or libunwind can work around it,
+  //                 get rid of this early call.  We *want* our test to
+  //                 find potential problems like this one!
+  TestLibCAllocate();
+
+  if (FLAGS_interfering_threads) {
+    RunHeapBusyThreads();  // add interference early
+  }
+  TestLibCAllocate();
+
+  LOGF << "In main(): heap_check=" << FLAGS_heap_check << endl;
+
+  CHECK(HeapLeakChecker::NoGlobalLeaks());  // so far, so good
+
+  if (FLAGS_test_leak) {
+    void* arr;
+    RunHidden(NewCallback(MakeALeak, &arr));
+    Use(&arr);
+    LogHidden("Leaking", arr);
+    if (FLAGS_test_cancel_global_check) {
+      HeapLeakChecker::CancelGlobalCheck();
+    } else {
+      // Verify we can call NoGlobalLeaks repeatedly without deadlocking
+      HeapLeakChecker::NoGlobalLeaks();
+      HeapLeakChecker::NoGlobalLeaks();
+    }
+    return Pass();
+      // whole-program leak-check should (with very high probability)
+      // catch the leak of arr (10 * sizeof(int) bytes)
+      // (when !FLAGS_test_cancel_global_check)
+  }
+
+  if (FLAGS_test_loop_leak) {
+    void* arr1;
+    void* arr2;
+    RunHidden(NewCallback(MakeDeathLoop, &arr1, &arr2));
+    Use(&arr1);
+    Use(&arr2);
+    LogHidden("Loop leaking", arr1);
+    LogHidden("Loop leaking", arr2);
+    if (FLAGS_test_cancel_global_check) {
+      HeapLeakChecker::CancelGlobalCheck();
+    } else {
+      // Verify we can call NoGlobalLeaks repeatedly without deadlocking
+      HeapLeakChecker::NoGlobalLeaks();
+      HeapLeakChecker::NoGlobalLeaks();
+    }
+    return Pass();
+      // whole-program leak-check should (with very high probability)
+      // catch the leak of arr1 and arr2 (4 * sizeof(void*) bytes)
+      // (when !FLAGS_test_cancel_global_check)
+  }
+
+  if (FLAGS_test_register_leak) {
+    // make us fail only where the .sh test expects:
+    Pause();
+    for (int i = 0; i < 100; ++i) {  // give it some time to crash
+      CHECK(HeapLeakChecker::NoGlobalLeaks());
+      Pause();
+    }
+    return Pass();
+  }
+
+  TestHeapLeakCheckerLiveness();
+
+  HeapLeakChecker heap_check("all");
+
+  TestHiddenPointer();
+
+  TestHeapLeakChecker();
+  Pause();
+  TestLeakButTotalsMatch();
+  Pause();
+
+  TestHeapLeakCheckerDeathSimple();
+  Pause();
+  TestHeapLeakCheckerDeathLoop();
+  Pause();
+  TestHeapLeakCheckerDeathInverse();
+  Pause();
+  TestHeapLeakCheckerDeathNoLeaks();
+  Pause();
+  TestHeapLeakCheckerDeathCountLess();
+  Pause();
+  TestHeapLeakCheckerDeathCountMore();
+  Pause();
+
+  TestHeapLeakCheckerDeathTrick();
+  Pause();
+
+  CHECK(HeapLeakChecker::NoGlobalLeaks());  // so far, so good
+
+  TestHeapLeakCheckerNoFalsePositives();
+  Pause();
+
+  TestHeapLeakCheckerDisabling();
+  Pause();
+
+  TestSTLAlloc();
+  Pause();
+  TestSTLAllocInverse();
+  Pause();
+
+  // Test that various STL allocators work.  Some of these are redundant, but
+  // we don't know how STL might change in the future.  For example,
+  // http://wiki/Main/StringNeStdString.
+#define DTSL(a) { DirectTestSTLAlloc(a, #a); \
+                  Pause(); }
+  DTSL(std::allocator<char>());
+  DTSL(std::allocator<int>());
+  DTSL(std::string().get_allocator());
+  DTSL(string().get_allocator());
+  DTSL(vector<int>().get_allocator());
+  DTSL(vector<double>().get_allocator());
+  DTSL(vector<vector<int> >().get_allocator());
+  DTSL(vector<string>().get_allocator());
+  DTSL((map<string, string>().get_allocator()));
+  DTSL((map<string, int>().get_allocator()));
+  DTSL(set<char>().get_allocator());
+#undef DTSL
+
+  TestLibCAllocate();
+  Pause();
+
+  CHECK(HeapLeakChecker::NoGlobalLeaks());  // so far, so good
+
+  Pause();
+
+  if (!FLAGS_maybe_stripped) {
+    CHECK(heap_check.SameHeap());
+  } else {
+    WARN_IF(heap_check.SameHeap() != true,
+            "overall leaks are caught; we must be using a stripped binary");
+  }
+
+  CHECK(HeapLeakChecker::NoGlobalLeaks());  // so far, so good
+
+  return Pass();
+}
diff --git a/src/tests/heap-checker_unittest.sh b/src/tests/heap-checker_unittest.sh
new file mode 100755
index 0000000..3c9c0e9
--- /dev/null
+++ b/src/tests/heap-checker_unittest.sh
@@ -0,0 +1,89 @@
+#!/bin/sh
+
+# Copyright (c) 2005, Google Inc.
+# All rights reserved.
+# 
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are
+# met:
+# 
+#     * Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+#     * Redistributions in binary form must reproduce the above
+# copyright notice, this list of conditions and the following disclaimer
+# in the documentation and/or other materials provided with the
+# distribution.
+#     * Neither the name of Google Inc. nor the names of its
+# contributors may be used to endorse or promote products derived from
+# this software without specific prior written permission.
+# 
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+# ---
+# Author: Craig Silverstein
+#
+# Runs the heap-checker unittest with various environment variables.
+# This is necessary because we turn on features like the heap profiler
+# and heap checker via environment variables.  This test makes sure
+# they all play well together.
+
+# We expect BINDIR and PPROF_PATH to be set in the environment.
+# If not, we set them to some reasonable values
+BINDIR="${BINDIR:-.}"
+PPROF_PATH="${PPROF_PATH:-$BINDIR/src/pprof}"
+
+if [ "x$1" = "x-h" -o "$1" = "x--help" ]; then
+  echo "USAGE: $0 [unittest dir] [path to pprof]"
+  echo "       By default, unittest_dir=$BINDIR, pprof_path=$PPROF_PATH"
+  exit 1
+fi
+
+HEAP_CHECKER="${1:-$BINDIR/heap-checker_unittest}"
+PPROF_PATH="${2:-$PPROF_PATH}"
+
+TMPDIR=/tmp/heap_check_info
+rm -rf $TMPDIR || exit 2
+mkdir $TMPDIR || exit 3
+
+# $1: value of heap-check env. var.
+run_check() {
+    export PPROF_PATH="$PPROF_PATH"
+    [ -n "$1" ] && export HEAPCHECK="$1" || unset HEAPPROFILE
+
+    echo -n "Testing $HEAP_CHECKER with HEAPCHECK=$1 ... "
+    if $HEAP_CHECKER > $TMPDIR/output 2>&1; then
+      echo "OK"
+    else
+      echo "FAILED"
+      echo "Output from the failed run:"
+      echo "----"
+      cat $TMPDIR/output
+      echo "----"      
+      exit 4
+    fi
+
+    # If we set HEAPPROFILE, then we expect it to actually have emitted
+    # a profile.  Check that it did.
+    if [ -n "$HEAPPROFILE" ]; then
+      [ -e "$HEAPPROFILE.0001.heap" ] || exit 5
+    fi
+}
+
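+# Exercise the main heap-checking modes: HEAPCHECK unset, "local", "normal",
+# and "strict".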
+run_check ""
+run_check "local"
+run_check "normal"
+run_check "strict"
+
+rm -rf $TMPDIR      # clean up
+
+echo "PASS"
diff --git a/src/tests/heap-profiler_unittest.cc b/src/tests/heap-profiler_unittest.cc
new file mode 100644
index 0000000..3317813
--- /dev/null
+++ b/src/tests/heap-profiler_unittest.cc
@@ -0,0 +1,168 @@
+// -*- Mode: C++; c-basic-offset: 2; indent-tabs-mode: nil -*-
+// Copyright (c) 2005, Google Inc.
+// All rights reserved.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+// 
+//     * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+//     * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+// 
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+// ---
+// Author: Craig Silverstein
+//
+// A small program that just exercises our heap profiler by allocating
+// memory and letting the heap-profiler emit a profile.  We don't test
+// threads (TODO).  By itself, this unittest tests that the heap-profiler
+// doesn't crash on simple programs, but its output can be analyzed by
+// another testing script to actually verify correctness.  See, eg,
+// heap-profiler_unittest.sh.
+
+#include "config_for_unittests.h"
+#include <stdlib.h>
+#include <stdio.h>
+#include <fcntl.h>                  // for mkdir()
+#include <sys/stat.h>               // for mkdir() on freebsd and os x
+#ifdef HAVE_UNISTD_H
+#include <unistd.h>                 // for fork()
+#endif
+#include <sys/wait.h>               // for wait()
+#include <string>
+#include "base/basictypes.h"
+#include "base/logging.h"
+#include <gperftools/heap-profiler.h>
+
+using std::string;
+
+static const int kMaxCount = 100000;
+int* g_array[kMaxCount];              // an array of int arrays
+
+static ATTRIBUTE_NOINLINE void Allocate(int start, int end, int size) {
+  // NOTE: we're using this to prevent gcc 5 from merging otherwise
+  // identical Allocate & Allocate2 functions.
+  VLOG(10, "Allocate");
+  for (int i = start; i < end; ++i) {
+    if (i < kMaxCount)
+      g_array[i] = new int[size];
+  }
+}
+
+static ATTRIBUTE_NOINLINE void Allocate2(int start, int end, int size) {
+  VLOG(10, "Allocate2");
+  for (int i = start; i < end; ++i) {
+    if (i < kMaxCount)
+      g_array[i] = new int[size];
+  }
+}
+
+static void Deallocate(int start, int end) {
+  for (int i = start; i < end; ++i) {
+    delete[] g_array[i];
+    g_array[i] = 0;
+  }
+}
+
+static void TestHeapProfilerStartStopIsRunning() {
+  // If you run this with whole-program heap-profiling on, then
+  // IsHeapProfilerRunning should return true.
+  if (!IsHeapProfilerRunning()) {
+    const char* tmpdir = getenv("TMPDIR");
+    if (tmpdir == NULL)
+      tmpdir = "/tmp";
+    mkdir(tmpdir, 0755);     // if necessary
+    HeapProfilerStart((string(tmpdir) + "/start_stop").c_str());
+    CHECK(IsHeapProfilerRunning());
+
+    Allocate(0, 40, 100);
+    Deallocate(0, 40);
+
+    HeapProfilerStop();
+    CHECK(!IsHeapProfilerRunning());
+  }
+}
+
+static void TestDumpHeapProfiler() {
+  // If you run this with whole-program heap-profiling on, then
+  // IsHeapProfilerRunning should return true.
+  if (!IsHeapProfilerRunning()) {
+    const char* tmpdir = getenv("TMPDIR");
+    if (tmpdir == NULL)
+      tmpdir = "/tmp";
+    mkdir(tmpdir, 0755);     // if necessary
+    HeapProfilerStart((string(tmpdir) + "/dump").c_str());
+    CHECK(IsHeapProfilerRunning());
+
+    Allocate(0, 40, 100);
+    Deallocate(0, 40);
+
+    char* output = GetHeapProfile();
+    free(output);
+    HeapProfilerStop();
+  }
+}
+
+
+int main(int argc, char** argv) {
+  if (argc > 2 || (argc == 2 && argv[1][0] == '-')) {
+    printf("USAGE: %s [number of children to fork]\n", argv[0]);
+    exit(0);
+  }
+  int num_forks = 0;
+  if (argc == 2) {
+    num_forks = atoi(argv[1]);
+  }
+
+  TestHeapProfilerStartStopIsRunning();
+  TestDumpHeapProfiler();
+
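+  // Allocate and free in several phases so successive profile dumps show
+  // both in-use growth and shrinkage (see heap-profiler_unittest.sh).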
+  Allocate(0, 40, 100);
+  Deallocate(0, 40);
+
+  Allocate(0, 40, 100);
+  Allocate(0, 40, 100);
+  Allocate2(40, 400, 1000);
+  Allocate2(400, 1000, 10000);
+  Deallocate(0, 1000);
+
+  Allocate(0, 100, 100000);
+  Deallocate(0, 10);
+  Deallocate(10, 20);
+  Deallocate(90, 100);
+  Deallocate(20, 90);
+
+  while (num_forks-- > 0) {
+    switch (fork()) {
+      case -1:
+        printf("FORK failed!\n");
+        return 1;
+      case 0:             // child
+        return execl(argv[0], argv[0], NULL);   // run child with no args
+      default:
+        wait(NULL);       // we'll let the kids run one at a time
+    }
+  }
+
+  printf("DONE.\n");
+
+  return 0;
+}
diff --git a/src/tests/heap-profiler_unittest.sh b/src/tests/heap-profiler_unittest.sh
new file mode 100755
index 0000000..b4c2e9f
--- /dev/null
+++ b/src/tests/heap-profiler_unittest.sh
@@ -0,0 +1,150 @@
+#!/bin/sh
+
+# Copyright (c) 2005, Google Inc.
+# All rights reserved.
+# 
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are
+# met:
+# 
+#     * Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+#     * Redistributions in binary form must reproduce the above
+# copyright notice, this list of conditions and the following disclaimer
+# in the documentation and/or other materials provided with the
+# distribution.
+#     * Neither the name of Google Inc. nor the names of its
+# contributors may be used to endorse or promote products derived from
+# this software without specific prior written permission.
+# 
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+# ---
+# Author: Craig Silverstein
+#
+# Runs the heap-profiler unittest and makes sure the profile looks appropriate.
+#
+# We run under the assumption that if $HEAP_PROFILER is run with --help,
+# it prints a usage line of the form
+#   USAGE: <actual executable being run> [...]
+#
+# This is because libtool sometimes turns the 'executable' into a
+# shell script which runs an actual binary somewhere else.
+
+# We expect BINDIR and PPROF_PATH to be set in the environment.
+# If not, we set them to some reasonable values
+BINDIR="${BINDIR:-.}"
+PPROF_PATH="${PPROF_PATH:-$BINDIR/src/pprof}"
+
+if [ "x$1" = "x-h" -o "x$1" = "x--help" ]; then
+  echo "USAGE: $0 [unittest dir] [path to pprof]"
+  echo "       By default, unittest_dir=$BINDIR, pprof_path=$PPROF_PATH"
+  exit 1
+fi
+
+HEAP_PROFILER="${1:-$BINDIR/heap-profiler_unittest}"
+PPROF="${2:-$PPROF_PATH}"
+TEST_TMPDIR=/tmp/heap_profile_info
+
+# HEAPPROFILE is meaningful to the profiler, so make sure we know its state
+unset HEAPPROFILE
+
+rm -rf "$TEST_TMPDIR"
+mkdir "$TEST_TMPDIR" || exit 2
+
+num_failures=0
+
+# Given one profile (to check the contents of that profile) or two
+# profiles (to check the diff between the profiles), and a function
+# name, verify that the function name takes up at least 90% of the
+# allocated memory.  The function name is actually specified first.
+VerifyMemFunction() {
+  function="$1"
+  shift
+
+  # get program name.  Note we have to unset HEAPPROFILE so running
+  # help doesn't overwrite existing profiles.
+  exec=`unset HEAPPROFILE; $HEAP_PROFILER --help | awk '{print $2; exit;}'`
+
+  if [ $# = 2 ]; then
+    [ -f "$1" ] || { echo "Profile not found: $1"; exit 1; }
+    [ -f "$2" ] || { echo "Profile not found: $2"; exit 1; }
+    $PPROF --base="$1" $exec "$2" >"$TEST_TMPDIR/output.pprof" 2>&1
+  else
+    [ -f "$1" ] || { echo "Profile not found: $1"; exit 1; }
+    $PPROF $exec "$1" >"$TEST_TMPDIR/output.pprof" 2>&1
+  fi
+
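+  # The awk script exits 1 (which we treat as success) iff some line's 6th
+  # field matches $function exactly and its 2nd field (a percentage) is > 90.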
+  cat "$TEST_TMPDIR/output.pprof" \
+      | tr -d % | awk '$6 ~ /^'$function'$/ && $2 > 90 {exit 1;}'
+  if [ $? != 1 ]; then
+    echo
+    echo "--- Test failed for $function: didn't account for 90% of executable memory"
+    echo "--- Program output:"
+    cat "$TEST_TMPDIR/output"
+    echo "--- pprof output:"
+    cat "$TEST_TMPDIR/output.pprof"
+    echo "---"
+    num_failures=`expr $num_failures + 1`
+  fi
+}
+
+VerifyOutputContains() {
+  text="$1"
+
+  if ! grep "$text" "$TEST_TMPDIR/output" >/dev/null 2>&1; then
+    echo "--- Test failed: output does not contain '$text'"
+    echo "--- Program output:"
+    cat "$TEST_TMPDIR/output"
+    echo "---"
+    num_failures=`expr $num_failures + 1`
+  fi
+}
+
+HEAPPROFILE="$TEST_TMPDIR/test"
+HEAP_PROFILE_INUSE_INTERVAL="10240"   # need this to be 10Kb
+HEAP_PROFILE_ALLOCATION_INTERVAL="$HEAP_PROFILE_INUSE_INTERVAL"
+HEAP_PROFILE_DEALLOCATION_INTERVAL="$HEAP_PROFILE_INUSE_INTERVAL"
+export HEAPPROFILE
+export HEAP_PROFILE_INUSE_INTERVAL
+export HEAP_PROFILE_ALLOCATION_INTERVAL
+export HEAP_PROFILE_DEALLOCATION_INTERVAL
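+# With such small intervals the profiler dumps numbered .heap files very
+# often; the specific dump numbers checked below depend on the unittest's
+# fixed allocation pattern.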
+
+# We make the unittest run a child process, to test that the child
+# process doesn't try to write a heap profile as well and step on the
+# parent's toes.  If it does, we expect the parent-test to fail.
+$HEAP_PROFILER 1 >$TEST_TMPDIR/output 2>&1     # run program, with 1 child proc
+
+VerifyMemFunction Allocate2 "$HEAPPROFILE.1329.heap"
+VerifyMemFunction Allocate "$HEAPPROFILE.1448.heap" "$HEAPPROFILE.1548.heap"
+
+# Check the child process got to emit its own profile as well.
+VerifyMemFunction Allocate2 "$HEAPPROFILE"_*.1329.heap
+VerifyMemFunction Allocate "$HEAPPROFILE"_*.1448.heap "$HEAPPROFILE"_*.1548.heap
+
+# Make sure we logged both about allocating and deallocating memory
+VerifyOutputContains "62 MB allocated"
+VerifyOutputContains "62 MB freed"
+
+# Now try running without --heap_profile specified, to allow
+# testing of the HeapProfilerStart/Stop functionality.
+$HEAP_PROFILER >"$TEST_TMPDIR/output2" 2>&1
+
+rm -rf "$TEST_TMPDIR"      # clean up
+
+if [ $num_failures = 0 ]; then
+  echo "PASS"
+else
+  echo "Tests finished with $num_failures failures"
+fi
+exit $num_failures
diff --git a/src/tests/large_heap_fragmentation_unittest.cc b/src/tests/large_heap_fragmentation_unittest.cc
new file mode 100644
index 0000000..0886599
--- /dev/null
+++ b/src/tests/large_heap_fragmentation_unittest.cc
@@ -0,0 +1,62 @@
+// -*- Mode: C++; c-basic-offset: 2; indent-tabs-mode: nil -*-
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+//     * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+//     * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+// This is a unit test for exercising fragmentation of large (over 1
+// meg) page spans. It makes sure that allocations/releases of
+// increasing memory chunks do not blow up memory
+// usage. See also https://code.google.com/p/gperftools/issues/detail?id=368
+
+
+#include <stddef.h>
+#include <stdlib.h>
+#include <stdio.h>
+
+#include "base/logging.h"
+#include "common.h"
+#include <gperftools/malloc_extension.h>
+
+
+int main (int argc, char** argv) {
+  for (int pass = 1; pass <= 3; pass++) {
+    size_t size = 100*1024*1024;
+    while (size < 500*1024*1024) {
+      void *ptr = malloc(size);
+      free(ptr);
+      size += 20000;
+
+      size_t heap_size = static_cast<size_t>(-1);
+      MallocExtension::instance()->GetNumericProperty("generic.heap_size",
+                                                      &heap_size);
+
+
+      CHECK_LT(heap_size, 1*1024*1024*1024);
+    }
+  }
+
+  printf("PASS\n");
+  return 0;
+}
diff --git a/src/tests/low_level_alloc_unittest.cc b/src/tests/low_level_alloc_unittest.cc
new file mode 100644
index 0000000..e3cb555
--- /dev/null
+++ b/src/tests/low_level_alloc_unittest.cc
@@ -0,0 +1,197 @@
+// -*- Mode: C++; c-basic-offset: 2; indent-tabs-mode: nil -*-
+/* Copyright (c) 2006, Google Inc.
+ * All rights reserved.
+ * 
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are
+ * met:
+ * 
+ *     * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following disclaimer
+ * in the documentation and/or other materials provided with the
+ * distribution.
+ *     * Neither the name of Google Inc. nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ * 
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+// A test for low_level_alloc.cc
+
+#include <stdio.h>
+#include <map>
+#include "base/low_level_alloc.h"
+#include "base/logging.h"
+#include <gperftools/malloc_hook.h>
+
+using std::map;
+
+// a block of memory obtained from the allocator
+struct BlockDesc {
+  char *ptr;      // pointer to memory
+  int len;        // number of bytes
+  int fill;       // filled with data starting with this
+};
+
+// Check that the pattern placed in the block d
+// by RandomizeBlockDesc is still there.
+static void CheckBlockDesc(const BlockDesc &d) {
+  for (int i = 0; i != d.len; i++) {
+    CHECK((d.ptr[i] & 0xff) == ((d.fill + i) & 0xff));
+  }
+}
+
+// Fill the block "*d" with a pattern
+// starting with a random byte.
+static void RandomizeBlockDesc(BlockDesc *d) {
+  d->fill = rand() & 0xff;
+  for (int i = 0; i != d->len; i++) {
+    d->ptr[i] = (d->fill + i) & 0xff;
+  }
+}
+
+// Used to indicate to the malloc hooks that
+// this call is from LowLevelAlloc.
+static bool using_low_level_alloc = false;
+
+// n times, toss a coin, and based on the outcome
+// either allocate a new block or deallocate an old block.
+// New blocks are placed in a map with a random key
+// and initialized with RandomizeBlockDesc().
+// If keys conflict, the older block is freed.
+// Old blocks are always checked with CheckBlockDesc()
+// before being freed.  At the end of the run,
+// all remaining allocated blocks are freed.
+// If use_new_arena is true, use a fresh arena, and then delete it.
+// If call_malloc_hook is true and use_new_arena is true,
+// allocations and deallocations are reported via the MallocHook
+// interface.
+static void Test(bool use_new_arena, bool call_malloc_hook, int n) {
+  typedef map<int, BlockDesc> AllocMap;
+  AllocMap allocated;
+  AllocMap::iterator it;
+  BlockDesc block_desc;
+  int rnd;
+  LowLevelAlloc::Arena *arena = 0;
+  if (use_new_arena) {
+    int32 flags = call_malloc_hook?  LowLevelAlloc::kCallMallocHook :  0;
+    arena = LowLevelAlloc::NewArena(flags, LowLevelAlloc::DefaultArena());
+  }
+  for (int i = 0; i != n; i++) {
+    if (i != 0 && i % 10000 == 0) {
+      printf(".");
+      fflush(stdout);
+    }
+
+    switch(rand() & 1) {      // toss a coin
+    case 0:     // coin came up heads: add a block
+      using_low_level_alloc = true;
+      block_desc.len = rand() & 0x3fff;
+      block_desc.ptr =
+        reinterpret_cast<char *>(
+                        arena == 0
+                        ? LowLevelAlloc::Alloc(block_desc.len)
+                        : LowLevelAlloc::AllocWithArena(block_desc.len, arena));
+      using_low_level_alloc = false;
+      RandomizeBlockDesc(&block_desc);
+      rnd = rand();
+      it = allocated.find(rnd);
+      if (it != allocated.end()) {
+        CheckBlockDesc(it->second);
+        using_low_level_alloc = true;
+        LowLevelAlloc::Free(it->second.ptr);
+        using_low_level_alloc = false;
+        it->second = block_desc;
+      } else {
+        allocated[rnd] = block_desc;
+      }
+      break;
+    case 1:     // coin came up tails: remove a block
+      it = allocated.begin();
+      if (it != allocated.end()) {
+        CheckBlockDesc(it->second);
+        using_low_level_alloc = true;
+        LowLevelAlloc::Free(it->second.ptr);
+        using_low_level_alloc = false;
+        allocated.erase(it);
+      }
+      break;
+    }
+  }
+  // remove all remaining blocks
+  while ((it = allocated.begin()) != allocated.end()) {
+    CheckBlockDesc(it->second);
+    using_low_level_alloc = true;
+    LowLevelAlloc::Free(it->second.ptr);
+    using_low_level_alloc = false;
+    allocated.erase(it);
+  }
+  if (use_new_arena) {
+    CHECK(LowLevelAlloc::DeleteArena(arena));
+  }
+}
+
+// used for counting allocates and frees
+static int32 allocates;
+static int32 frees;
+
+// called on each alloc if kCallMallocHook specified
+static void AllocHook(const void *p, size_t size) {
+  if (using_low_level_alloc) {
+    allocates++;
+  }
+}
+
+// called on each free if kCallMallocHook specified
+static void FreeHook(const void *p) {
+  if (using_low_level_alloc) {
+    frees++;
+  }
+}
+
+int main(int argc, char *argv[]) {
+  // This is needed by maybe_threads_unittest.sh, which parses argv[0]
+  // to figure out what directory low_level_alloc_unittest is in.
+  if (argc != 1) {
+    fprintf(stderr, "USAGE: %s\n", argv[0]);
+    return 1;
+  }
+
+  CHECK(MallocHook::AddNewHook(&AllocHook));
+  CHECK(MallocHook::AddDeleteHook(&FreeHook));
+  CHECK_EQ(allocates, 0);
+  CHECK_EQ(frees, 0);
+  Test(false, false, 50000);
+  CHECK_NE(allocates, 0);   // default arena calls hooks
+  CHECK_NE(frees, 0);
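+  // For fresh arenas, alternate between arenas that call the malloc hooks
+  // and arenas that don't, and verify the hook counters accordingly.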
+  for (int i = 0; i != 16; i++) {
+    bool call_hooks = ((i & 1) == 1);
+    allocates = 0;
+    frees = 0;
+    Test(true, call_hooks, 15000);
+    if (call_hooks) {
+      CHECK_GT(allocates, 5000); // arena calls hooks
+      CHECK_GT(frees, 5000);
+    } else {
+      CHECK_EQ(allocates, 0);    // arena doesn't call hooks
+      CHECK_EQ(frees, 0);
+    }
+  }
+  printf("\nPASS\n");
+  CHECK(MallocHook::RemoveNewHook(&AllocHook));
+  CHECK(MallocHook::RemoveDeleteHook(&FreeHook));
+  return 0;
+}
diff --git a/src/tests/malloc_extension_c_test.c b/src/tests/malloc_extension_c_test.c
new file mode 100644
index 0000000..278fdb7
--- /dev/null
+++ b/src/tests/malloc_extension_c_test.c
@@ -0,0 +1,182 @@
+/* -*- Mode: C; c-basic-offset: 2; indent-tabs-mode: nil -*- */
+/* Copyright (c) 2009, Google Inc.
+ * All rights reserved.
+ * 
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are
+ * met:
+ * 
+ *     * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following disclaimer
+ * in the documentation and/or other materials provided with the
+ * distribution.
+ *     * Neither the name of Google Inc. nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ * 
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * ---
+ * Author: Craig Silverstein
+ *
+ * This tests the c shims: malloc_extension_c.h and malloc_hook_c.h.
+ * Mostly, we just care that these shims compile under gcc
+ * (*not* g++!)
+ *
+ * NOTE: this is C code, not C++ code!
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <stddef.h>   /* for size_t */
+#include <gperftools/malloc_extension_c.h>
+#include <gperftools/malloc_hook_c.h>
+
+#define FAIL(msg) do {                          \
+  fprintf(stderr, "FATAL ERROR: %s\n", msg);    \
+  exit(1);                                      \
+} while (0)
+
+static int g_new_hook_calls = 0;
+static int g_delete_hook_calls = 0;
+
+void TestNewHook(const void* ptr, size_t size) {
+  g_new_hook_calls++;
+}
+
+void TestDeleteHook(const void* ptr) {
+  g_delete_hook_calls++;
+}
+
+static
+void *forced_malloc(size_t size)
+{
+  extern void *tc_malloc(size_t);
+  void *rv = tc_malloc(size);
+  if (!rv) {
+    FAIL("malloc is not supposed to fail here");
+  }
+  return rv;
+}
+
+void TestMallocHook(void) {
+  /* TODO(csilvers): figure out why we get:
+   * E0100 00:00:00.000000  7383 malloc_hook.cc:244] RAW: google_malloc section is missing, thus InHookCaller is broken!
+   */
+#if 0
+  void* result[5];
+
+  if (MallocHook_GetCallerStackTrace(result, sizeof(result)/sizeof(*result),
+                                     0) < 2) {  /* should have this and main */
+    FAIL("GetCallerStackTrace failed");
+  }
+#endif
+
+  if (!MallocHook_AddNewHook(&TestNewHook)) {
+    FAIL("Failed to add new hook");
+  }
+  if (!MallocHook_AddDeleteHook(&TestDeleteHook)) {
+    FAIL("Failed to add delete hook");
+  }
+
+  free(forced_malloc(10));
+  free(forced_malloc(20));
+  if (g_new_hook_calls != 2) {
+    FAIL("Wrong number of calls to the new hook");
+  }
+  if (g_delete_hook_calls != 2) {
+    FAIL("Wrong number of calls to the delete hook");
+  }
+  if (!MallocHook_RemoveNewHook(&TestNewHook)) {
+    FAIL("Failed to remove new hook");
+  }
+  if (!MallocHook_RemoveDeleteHook(&TestDeleteHook)) {
+    FAIL("Failed to remove delete hook");
+  }
+
+  free(forced_malloc(10));
+  free(forced_malloc(20));
+  if (g_new_hook_calls != 2) {
+    FAIL("Wrong number of calls to the new hook");
+  }
+
+  MallocHook_SetNewHook(&TestNewHook);
+  MallocHook_SetDeleteHook(&TestDeleteHook);
+
+  free(forced_malloc(10));
+  free(forced_malloc(20));
+  if (g_new_hook_calls != 4) {
+    FAIL("Wrong number of calls to the singular new hook");
+  }
+
+  if (MallocHook_SetNewHook(NULL) == NULL) {
+    FAIL("Failed to set new hook");
+  }
+  if (MallocHook_SetDeleteHook(NULL) == NULL) {
+    FAIL("Failed to set delete hook");
+  }
+}
+
+void TestMallocExtension(void) {
+  int blocks;
+  size_t total;
+  int hist[64];
+  char buffer[200];
+  char* x = (char*)malloc(10);
+
+  MallocExtension_VerifyAllMemory();
+  MallocExtension_VerifyMallocMemory(x);
+  MallocExtension_MallocMemoryStats(&blocks, &total, hist);
+  MallocExtension_GetStats(buffer, sizeof(buffer));
+  if (!MallocExtension_GetNumericProperty("generic.current_allocated_bytes",
+                                          &total)) {
+    FAIL("GetNumericProperty failed for generic.current_allocated_bytes");
+  }
+  if (total < 10) {
+    FAIL("GetNumericProperty had bad return for generic.current_allocated_bytes");
+  }
+  if (!MallocExtension_GetNumericProperty("generic.current_allocated_bytes",
+                                          &total)) {
+    FAIL("GetNumericProperty failed for generic.current_allocated_bytes");
+  }
+  MallocExtension_MarkThreadIdle();
+  MallocExtension_MarkThreadBusy();
+  MallocExtension_ReleaseToSystem(1);
+  MallocExtension_ReleaseFreeMemory();
+  if (MallocExtension_GetEstimatedAllocatedSize(10) < 10) {
+    FAIL("GetEstimatedAllocatedSize returned a bad value (too small)");
+  }
+  if (MallocExtension_GetAllocatedSize(x) < 10) {
+    FAIL("GetEstimatedAllocatedSize returned a bad value (too small)");
+  }
+  if (MallocExtension_GetOwnership(x) != MallocExtension_kOwned) {
+    FAIL("DidAllocatePtr returned a bad value (kNotOwned)");
+  }
+  /* TODO(csilvers): this relies on undocumented behavior that
+     GetOwnership works on stack-allocated variables.  Use a better test. */
+  if (MallocExtension_GetOwnership(hist) != MallocExtension_kNotOwned) {
+    FAIL("DidAllocatePtr returned a bad value (kOwned)");
+  }
+
+  free(x);
+}
+
+int main(int argc, char** argv) {
+  TestMallocHook();
+  TestMallocExtension();
+
+  printf("PASS\n");
+  return 0;
+}
diff --git a/src/tests/malloc_extension_test.cc b/src/tests/malloc_extension_test.cc
new file mode 100644
index 0000000..31c4968
--- /dev/null
+++ b/src/tests/malloc_extension_test.cc
@@ -0,0 +1,98 @@
+// -*- Mode: C++; c-basic-offset: 2; indent-tabs-mode: nil -*-
+// Copyright (c) 2008, Google Inc.
+// All rights reserved.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+// 
+//     * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+//     * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+// 
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+// ---
+// Author: Craig Silverstein
+//
+// Simple test of malloc_extension.  Includes test of C shims.
+
+#include "config_for_unittests.h"
+#include <stdio.h>
+#include <sys/types.h>
+#include "base/logging.h"
+#include <gperftools/malloc_extension.h>
+#include <gperftools/malloc_extension_c.h>
+
+int main(int argc, char** argv) {
+  void* a = malloc(1000);
+
+  size_t cxx_bytes_used, c_bytes_used;
+  ASSERT_TRUE(MallocExtension::instance()->GetNumericProperty(
+      "generic.current_allocated_bytes", &cxx_bytes_used));
+  ASSERT_TRUE(MallocExtension_GetNumericProperty(
+      "generic.current_allocated_bytes", &c_bytes_used));
+  ASSERT_GT(cxx_bytes_used, 1000);
+  ASSERT_EQ(cxx_bytes_used, c_bytes_used);
+
+  ASSERT_TRUE(MallocExtension::instance()->VerifyAllMemory());
+  ASSERT_TRUE(MallocExtension_VerifyAllMemory());
+
+  ASSERT_EQ(MallocExtension::kOwned,
+            MallocExtension::instance()->GetOwnership(a));
+  // TODO(csilvers): this relies on undocumented behavior that
+  // GetOwnership works on stack-allocated variables.  Use a better test.
+  ASSERT_EQ(MallocExtension::kNotOwned,
+            MallocExtension::instance()->GetOwnership(&cxx_bytes_used));
+  ASSERT_EQ(MallocExtension::kNotOwned,
+            MallocExtension::instance()->GetOwnership(NULL));
+  ASSERT_GE(MallocExtension::instance()->GetAllocatedSize(a), 1000);
+  // This is just a sanity check.  If we allocated too much, tcmalloc is broken
+  ASSERT_LE(MallocExtension::instance()->GetAllocatedSize(a), 5000);
+  ASSERT_GE(MallocExtension::instance()->GetEstimatedAllocatedSize(1000), 1000);
+
+  for (int i = 0; i < 10; ++i) {
+    void *p = malloc(i);
+    ASSERT_GE(MallocExtension::instance()->GetAllocatedSize(p),
+             MallocExtension::instance()->GetEstimatedAllocatedSize(i));
+    free(p);
+  }
+
+  // Check the c-shim version too.
+  ASSERT_EQ(MallocExtension_kOwned, MallocExtension_GetOwnership(a));
+  ASSERT_EQ(MallocExtension_kNotOwned,
+            MallocExtension_GetOwnership(&cxx_bytes_used));
+  ASSERT_EQ(MallocExtension_kNotOwned, MallocExtension_GetOwnership(NULL));
+  ASSERT_GE(MallocExtension_GetAllocatedSize(a), 1000);
+  ASSERT_LE(MallocExtension_GetAllocatedSize(a), 5000);
+  ASSERT_GE(MallocExtension_GetEstimatedAllocatedSize(1000), 1000);
+
+  free(a);
+
+  // Verify that the .cc file and .h file have the same enum values.
+  ASSERT_EQ(static_cast<int>(MallocExtension::kUnknownOwnership),
+            static_cast<int>(MallocExtension_kUnknownOwnership));
+  ASSERT_EQ(static_cast<int>(MallocExtension::kOwned),
+            static_cast<int>(MallocExtension_kOwned));
+  ASSERT_EQ(static_cast<int>(MallocExtension::kNotOwned),
+            static_cast<int>(MallocExtension_kNotOwned));
+
+  printf("DONE\n");
+  return 0;
+}
diff --git a/src/tests/malloc_hook_test.cc b/src/tests/malloc_hook_test.cc
new file mode 100644
index 0000000..a5cd860
--- /dev/null
+++ b/src/tests/malloc_hook_test.cc
@@ -0,0 +1,367 @@
+// -*- Mode: C++; c-basic-offset: 2; indent-tabs-mode: nil -*-
+// Copyright (c) 2011, Google Inc.
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+//     * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+//     * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+// ----
+// Author: llib@google.com (Bill Clarke)
+
+#include "config_for_unittests.h"
+#include <assert.h>
+#include <stdio.h>
+#ifdef HAVE_MMAP
+#include <sys/mman.h>
+#endif
+#ifdef HAVE_UNISTD_H
+#include <unistd.h>    // for sleep()
+#endif
+#include <algorithm>
+#include <string>
+#include <vector>
+#include <gperftools/malloc_hook.h>
+#include "malloc_hook-inl.h"
+#include "base/logging.h"
+#include "base/simple_mutex.h"
+#include "base/sysinfo.h"
+#include "tests/testutil.h"
+
+// On systems (like freebsd) that don't define MAP_ANONYMOUS, use the old
+// form of the name instead.
+#ifndef MAP_ANONYMOUS
+# define MAP_ANONYMOUS MAP_ANON
+#endif
+
+namespace {
+
+using std::string;
+using std::vector;
+
+vector<void (*)()> g_testlist;  // the tests to run
+
+#define TEST(a, b)                                      \
+  struct Test_##a##_##b {                               \
+    Test_##a##_##b() { g_testlist.push_back(&Run); }    \
+    static void Run();                                  \
+  };                                                    \
+  static Test_##a##_##b g_test_##a##_##b;               \
+  void Test_##a##_##b::Run()
+
+
+static int RUN_ALL_TESTS() {
+  vector<void (*)()>::const_iterator it;
+  for (it = g_testlist.begin(); it != g_testlist.end(); ++it) {
+    (*it)();   // The test will error-exit if there's a problem.
+  }
+  fprintf(stderr, "\nPassed %d tests\n\nPASS\n",
+          static_cast<int>(g_testlist.size()));
+  return 0;
+}
+
+void Sleep(int seconds) {
+#ifdef _MSC_VER
+  _sleep(seconds * 1000);   // Windows's _sleep takes milliseconds argument
+#else
+  sleep(seconds);
+#endif
+}
+
+using std::min;
+using base::internal::kHookListMaxValues;
+
+// Since HookList is a template and is defined in malloc_hook.cc, we can only
+// use an instantiation of it from malloc_hook.cc.  We then reinterpret those
+// values as integers for testing.
+typedef base::internal::HookList<MallocHook::NewHook> TestHookList;
+
+int TestHookList_Traverse(const TestHookList& list, uintptr_t* output_array, int n) {
+  MallocHook::NewHook values_as_hooks[kHookListMaxValues];
+  int result = list.Traverse(values_as_hooks, min(n, kHookListMaxValues));
+  for (int i = 0; i < result; ++i) {
+    output_array[i] = reinterpret_cast<const uintptr_t>(*values_as_hooks[i]);
+  }
+  return result;
+}
+
+bool TestHookList_Add(TestHookList* list, int val) {
+  return list->Add(reinterpret_cast<MallocHook::NewHook>(val));
+}
+
+bool TestHookList_Remove(TestHookList* list, int val) {
+  return list->Remove(reinterpret_cast<MallocHook::NewHook>(val));
+}
+
+// Note that this is almost the same as INIT_HOOK_LIST in malloc_hook.cc without
+// the cast.
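+// The aggregate initializer sets priv_end to 1 and slot 0 to initial_value,
+// i.e. a list containing exactly one hook.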
+#define INIT_HOOK_LIST(initial_value) { 1, { initial_value } }
+
+TEST(HookListTest, InitialValueExists) {
+  TestHookList list = INIT_HOOK_LIST(69);
+  uintptr_t values[2] = { 0, 0 };
+  EXPECT_EQ(1, TestHookList_Traverse(list, values, 2));
+  EXPECT_EQ(69, values[0]);
+  EXPECT_EQ(1, list.priv_end);
+}
+
+TEST(HookListTest, CanRemoveInitialValue) {
+  TestHookList list = INIT_HOOK_LIST(69);
+  ASSERT_TRUE(TestHookList_Remove(&list, 69));
+  EXPECT_EQ(0, list.priv_end);
+
+  uintptr_t values[2] = { 0, 0 };
+  EXPECT_EQ(0, TestHookList_Traverse(list, values, 2));
+}
+
+TEST(HookListTest, AddAppends) {
+  TestHookList list = INIT_HOOK_LIST(69);
+  ASSERT_TRUE(TestHookList_Add(&list, 42));
+  EXPECT_EQ(2, list.priv_end);
+
+  uintptr_t values[2] = { 0, 0 };
+  EXPECT_EQ(2, TestHookList_Traverse(list, values, 2));
+  EXPECT_EQ(69, values[0]);
+  EXPECT_EQ(42, values[1]);
+}
+
+TEST(HookListTest, RemoveWorksAndWillClearSize) {
+  TestHookList list = INIT_HOOK_LIST(69);
+  ASSERT_TRUE(TestHookList_Add(&list, 42));
+
+  ASSERT_TRUE(TestHookList_Remove(&list, 69));
+  EXPECT_EQ(2, list.priv_end);
+
+  uintptr_t values[2] = { 0, 0 };
+  EXPECT_EQ(1, TestHookList_Traverse(list, values, 2));
+  EXPECT_EQ(42, values[0]);
+
+  ASSERT_TRUE(TestHookList_Remove(&list, 42));
+  EXPECT_EQ(0, list.priv_end);
+  EXPECT_EQ(0, TestHookList_Traverse(list, values, 2));
+}
+
+TEST(HookListTest, AddPrependsAfterRemove) {
+  TestHookList list = INIT_HOOK_LIST(69);
+  ASSERT_TRUE(TestHookList_Add(&list, 42));
+
+  ASSERT_TRUE(TestHookList_Remove(&list, 69));
+  EXPECT_EQ(2, list.priv_end);
+
+  ASSERT_TRUE(TestHookList_Add(&list, 7));
+  EXPECT_EQ(2, list.priv_end);
+
+  uintptr_t values[2] = { 0, 0 };
+  EXPECT_EQ(2, TestHookList_Traverse(list, values, 2));
+  EXPECT_EQ(7, values[0]);
+  EXPECT_EQ(42, values[1]);
+}
+
+TEST(HookListTest, InvalidAddRejected) {
+  TestHookList list = INIT_HOOK_LIST(69);
+  EXPECT_FALSE(TestHookList_Add(&list, 0));
+
+  uintptr_t values[2] = { 0, 0 };
+  EXPECT_EQ(1, TestHookList_Traverse(list, values, 2));
+  EXPECT_EQ(69, values[0]);
+  EXPECT_EQ(1, list.priv_end);
+}
+
+TEST(HookListTest, FillUpTheList) {
+  TestHookList list = INIT_HOOK_LIST(69);
+  int num_inserts = 0;
+  while (TestHookList_Add(&list, ++num_inserts))
+    ;
+  EXPECT_EQ(kHookListMaxValues, num_inserts);
+  EXPECT_EQ(kHookListMaxValues, list.priv_end);
+
+  uintptr_t values[kHookListMaxValues + 1];
+  EXPECT_EQ(kHookListMaxValues, TestHookList_Traverse(list, values,
+                                                      kHookListMaxValues));
+  EXPECT_EQ(69, values[0]);
+  for (int i = 1; i < kHookListMaxValues; ++i) {
+    EXPECT_EQ(i, values[i]);
+  }
+}
+
+void MultithreadedTestThread(TestHookList* list, int shift,
+                             int thread_num) {
+  string message;
+  char buf[64];
+  for (int i = 1; i < 1000; ++i) {
+    // In each iteration, we insert a unique value, check that it exists,
+    // remove it, and check that it no longer exists.  We also record some
+    // stats to log at the end of each thread.  The insertion location and the
+    // length of the list are non-deterministic (except for the very first
+    // insertion across all threads, and after the very last removal the list
+    // should be empty).
+    int value = (i << shift) + thread_num;
+    EXPECT_TRUE(TestHookList_Add(list, value));
+    sched_yield();  // Ensure some more interleaving.
+    uintptr_t values[kHookListMaxValues + 1];
+    int num_values = TestHookList_Traverse(*list, values, kHookListMaxValues);
+    EXPECT_LT(0, num_values);
+    int value_index;
+    for (value_index = 0;
+         value_index < num_values && values[value_index] != value;
+         ++value_index)
+      ;
+    EXPECT_LT(value_index, num_values);  // Should have found value.
+    snprintf(buf, sizeof(buf), "[%d/%d; ", value_index, num_values);
+    message += buf;
+    sched_yield();
+    EXPECT_TRUE(TestHookList_Remove(list, value));
+    sched_yield();
+    num_values = TestHookList_Traverse(*list, values, kHookListMaxValues);
+    for (value_index = 0;
+         value_index < num_values && values[value_index] != value;
+         ++value_index)
+      ;
+    EXPECT_EQ(value_index, num_values);  // Should not have found value.
+    snprintf(buf, sizeof(buf), "%d]", num_values);
+    message += buf;
+    sched_yield();
+  }
+  fprintf(stderr, "thread %d: %s\n", thread_num, message.c_str());
+}
+
+static volatile int num_threads_remaining;
+static TestHookList list = INIT_HOOK_LIST(69);
+static Mutex threadcount_lock;
+
+void MultithreadedTestThreadRunner(int thread_num) {
+  // Wait for all threads to start running.
+  {
+    MutexLock ml(&threadcount_lock);
+    assert(num_threads_remaining > 0);
+    --num_threads_remaining;
+
+    // We should use condvars and the like, but for this test, we'll
+    // go simple and busy-wait.
+    while (num_threads_remaining > 0) {
+      threadcount_lock.Unlock();
+      Sleep(1);
+      threadcount_lock.Lock();
+    }
+  }
+
+  // shift is the smallest number such that (1<<shift) > kHookListMaxValues
+  int shift = 0;
+  for (int i = kHookListMaxValues; i > 0; i >>= 1)
+    shift += 1;
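+  // With this shift, the low bits of each inserted value hold thread_num
+  // (which is smaller than 1 << shift), so values inserted by different
+  // threads cannot collide.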
+
+  MultithreadedTestThread(&list, shift, thread_num);
+}
+
+
+TEST(HookListTest, MultithreadedTest) {
+  ASSERT_TRUE(TestHookList_Remove(&list, 69));
+  ASSERT_EQ(0, list.priv_end);
+
+  // Run kHookListMaxValues threads, each running MultithreadedTestThread.
+  // First, we need to set up the rest of the globals.
+  num_threads_remaining = kHookListMaxValues;   // a global var
+  RunManyThreadsWithId(&MultithreadedTestThreadRunner, num_threads_remaining,
+                       1 << 15);
+
+  uintptr_t values[kHookListMaxValues + 1];
+  EXPECT_EQ(0, TestHookList_Traverse(list, values, kHookListMaxValues));
+  EXPECT_EQ(0, list.priv_end);
+}
+
+// We only do mmap-hooking on (some) linux systems.
+#if defined(HAVE_MMAP) && defined(__linux) && \
+    (defined(__i386__) || defined(__x86_64__) || defined(__PPC__))
+
+int mmap_calls = 0;
+int mmap_matching_calls = 0;
+int munmap_calls = 0;
+int munmap_matching_calls = 0;
+const int kMmapMagicFd = 1;
+void* const kMmapMagicPointer = reinterpret_cast<void*>(1);
+
+int MmapReplacement(const void* start,
+                     size_t size,
+                     int protection,
+                     int flags,
+                     int fd,
+                     off_t offset,
+                     void** result) {
+  ++mmap_calls;
+  if (fd == kMmapMagicFd) {
+    ++mmap_matching_calls;
+    *result = kMmapMagicPointer;
+    return true;
+  }
+  return false;
+}
+
+int MunmapReplacement(const void* ptr, size_t size, int* result) {
+  ++munmap_calls;
+  if (ptr == kMmapMagicPointer) {
+    ++munmap_matching_calls;
+    *result = 0;
+    return true;
+  }
+  return false;
+}
+
+TEST(MallocHookTest, MmapReplacements) {
+  mmap_calls = mmap_matching_calls = munmap_calls = munmap_matching_calls = 0;
+  MallocHook::SetMmapReplacement(&MmapReplacement);
+  MallocHook::SetMunmapReplacement(&MunmapReplacement);
+  EXPECT_EQ(kMmapMagicPointer, mmap(NULL, 1, PROT_READ, MAP_PRIVATE,
+                                    kMmapMagicFd, 0));
+  EXPECT_EQ(1, mmap_matching_calls);
+
+  char* ptr = reinterpret_cast<char*>(
+      mmap(NULL, 1, PROT_READ | PROT_WRITE,
+           MAP_PRIVATE | MAP_ANONYMOUS, -1, 0));
+  EXPECT_EQ(2, mmap_calls);
+  EXPECT_EQ(1, mmap_matching_calls);
+  ASSERT_NE(MAP_FAILED, ptr);
+  *ptr = 'a';
+
+  EXPECT_EQ(0, munmap(kMmapMagicPointer, 1));
+  EXPECT_EQ(1, munmap_calls);
+  EXPECT_EQ(1, munmap_matching_calls);
+
+  EXPECT_EQ(0, munmap(ptr, 1));
+  EXPECT_EQ(2, munmap_calls);
+  EXPECT_EQ(1, munmap_matching_calls);
+
+  // The DEATH test below is flaky, because we've just munmapped the memory,
+  // making it available for mmap()ing again. There is no guarantee that it
+  // will stay unmapped, and in fact it gets reused ~10% of the time.
+// If the area is reused, then not only do we not die, but we also corrupt
+  // whoever owns that memory now.
+  // EXPECT_DEATH(*ptr = 'a', "SIGSEGV");
+}
+#endif  // #ifdef HAVE_MMAP && linux && ...
+
+}  // namespace
+
+int main(int argc, char** argv) {
+  return RUN_ALL_TESTS();
+}
diff --git a/src/tests/markidle_unittest.cc b/src/tests/markidle_unittest.cc
new file mode 100644
index 0000000..827609f
--- /dev/null
+++ b/src/tests/markidle_unittest.cc
@@ -0,0 +1,109 @@
+// -*- Mode: C++; c-basic-offset: 2; indent-tabs-mode: nil -*-
+// Copyright (c) 2003, Google Inc.
+// All rights reserved.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+// 
+//     * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+//     * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+// 
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+// ---
+// Author: Sanjay Ghemawat
+//
+// MallocExtension::MarkThreadIdle() testing
+#include <stdio.h>
+
+#include "config_for_unittests.h"
+#include "base/logging.h"
+#include <gperftools/malloc_extension.h>
+#include "tests/testutil.h"   // for RunThread()
+
+// Helper routine to do lots of allocations
+static void TestAllocation() {
+  static const int kNum = 100;
+  void* ptr[kNum];
+  for (int size = 8; size <= 65536; size*=2) {
+    for (int i = 0; i < kNum; i++) {
+      ptr[i] = malloc(size);
+    }
+    for (int i = 0; i < kNum; i++) {
+      free(ptr[i]);
+    }
+  }
+}
+
+// Routine that does a bunch of MarkThreadIdle() calls in sequence
+// without any intervening allocations
+static void MultipleIdleCalls() {
+  for (int i = 0; i < 4; i++) {
+    MallocExtension::instance()->MarkThreadIdle();
+  }
+}
+
+// Routine that does a bunch of MarkThreadIdle() calls in sequence
+// with intervening allocations
+static void MultipleIdleNonIdlePhases() {
+  for (int i = 0; i < 4; i++) {
+    TestAllocation();
+    MallocExtension::instance()->MarkThreadIdle();
+  }
+}
+
+// Get current thread cache usage
+static size_t GetTotalThreadCacheSize() {
+  size_t result;
+  CHECK(MallocExtension::instance()->GetNumericProperty(
+            "tcmalloc.current_total_thread_cache_bytes",
+            &result));
+  return result;
+}
+
+// Check that MarkThreadIdle() actually reduces the amount
+// of per-thread memory.
+static void TestIdleUsage() {
+  const size_t original = GetTotalThreadCacheSize();
+
+  TestAllocation();
+  const size_t post_allocation = GetTotalThreadCacheSize();
+  CHECK_GT(post_allocation, original);
+
+  MallocExtension::instance()->MarkThreadIdle();
+  const size_t post_idle = GetTotalThreadCacheSize();
+  CHECK_LE(post_idle, original);
+
+  // Log after testing because logging can allocate heap memory.
+  VLOG(0, "Original usage: %" PRIuS "\n", original);
+  VLOG(0, "Post allocation: %" PRIuS "\n", post_allocation);
+  VLOG(0, "Post idle: %" PRIuS "\n", post_idle);
+}
+
+int main(int argc, char** argv) {
+  RunThread(&TestIdleUsage);
+  RunThread(&TestAllocation);
+  RunThread(&MultipleIdleCalls);
+  RunThread(&MultipleIdleNonIdlePhases);
+
+  printf("PASS\n");
+  return 0;
+}
diff --git a/src/tests/maybe_threads_unittest.sh b/src/tests/maybe_threads_unittest.sh
new file mode 100755
index 0000000..77b3b78
--- /dev/null
+++ b/src/tests/maybe_threads_unittest.sh
@@ -0,0 +1,79 @@
+#!/bin/sh
+
+# Copyright (c) 2007, Google Inc.
+# All rights reserved.
+# 
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are
+# met:
+# 
+#     * Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+#     * Redistributions in binary form must reproduce the above
+# copyright notice, this list of conditions and the following disclaimer
+# in the documentation and/or other materials provided with the
+# distribution.
+#     * Neither the name of Google Inc. nor the names of its
+# contributors may be used to endorse or promote products derived from
+# this software without specific prior written permission.
+# 
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+# ---
+# Author: Craig Silverstein
+#
+# maybe_threads.cc was written to allow LD_PRELOAD=libtcmalloc.so to
+# work even on binaries that were not linked with pthreads.  This
+# unittest tests that, by running low_level_alloc_unittest with an
+# LD_PRELOAD.  (low_level_alloc_unittest was chosen because it doesn't
+# link in tcmalloc.)
+#
+# We assume all the .so files are in the same directory as the unittest
+# binaries.  To figure out that directory, we run low_level_alloc_unittest
+# with --help and take the path it prints for itself.  In practice this will
+# either be BINDIR or, when using libtool, BINDIR/.libs.
+
+# We expect BINDIR to be set in the environment.
+# If not, we set it to a reasonable default.
+BINDIR="${BINDIR:-.}"
+
+if [ "x$1" = "x-h" -o "x$1" = "x--help" ]; then
+  echo "USAGE: $0 [unittest dir]"
+  echo "       By default, unittest_dir=$BINDIR"
+  exit 1
+fi
+
+UNITTEST_DIR=${1:-$BINDIR}
+
+# Figure out the "real" unittest directory.  Also holds the .so files.
+UNITTEST_DIR=`$UNITTEST_DIR/low_level_alloc_unittest --help 2>&1 \
+              | awk '{print $2; exit;}' \
+              | xargs dirname`
+
+# Figure out where libtcmalloc lives.   It should be in UNITTEST_DIR,
+# but with libtool it might be in a subdir.
+if [ -r "$UNITTEST_DIR/libtcmalloc_minimal.so" ]; then
+  LIB_PATH="$UNITTEST_DIR/libtcmalloc_minimal.so"
+elif [ -r "$UNITTEST_DIR/.libs/libtcmalloc_minimal.so" ]; then
+  LIB_PATH="$UNITTEST_DIR/.libs/libtcmalloc_minimal.so"
+elif [ -r "$UNITTEST_DIR/libtcmalloc_minimal.dylib" ]; then   # for os x
+  LIB_PATH="$UNITTEST_DIR/libtcmalloc_minimal.dylib"
+elif [ -r "$UNITTEST_DIR/.libs/libtcmalloc_minimal.dylib" ]; then
+  LIB_PATH="$UNITTEST_DIR/.libs/libtcmalloc_minimal.dylib"
+else
+  echo "Cannot run $0: cannot find libtcmalloc_minimal.so"
+  exit 2
+fi
+
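+# Finally, preload tcmalloc into a binary that was not linked against pthreads.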
+LD_PRELOAD="$LIB_PATH" $UNITTEST_DIR/low_level_alloc_unittest
diff --git a/src/tests/memalign_unittest.cc b/src/tests/memalign_unittest.cc
new file mode 100644
index 0000000..309a3df
--- /dev/null
+++ b/src/tests/memalign_unittest.cc
@@ -0,0 +1,221 @@
+// -*- Mode: C++; c-basic-offset: 2; indent-tabs-mode: nil -*-
+// Copyright (c) 2004, Google Inc.
+// All rights reserved.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+// 
+//     * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+//     * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+// 
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+// ---
+// Author: Sanjay Ghemawat
+//
+// Check memalign related routines.
+//
+// We can't really do a huge amount of checking, but at the very
+// least, the following code checks that return values are properly
+// aligned, and that writing into the objects works.
+
+#include "config_for_unittests.h"
+
+// Complicated ordering requirements.  tcmalloc.h defines (indirectly)
+// _POSIX_C_SOURCE, which it needs so stdlib.h defines posix_memalign.
+// unistd.h, on the other hand, requires _POSIX_C_SOURCE to be unset,
+// at least on Mac OS X, in order to define getpagesize.  The solution
+// is to #include unistd.h first.  This is safe because unistd.h
+// doesn't sub-include stdlib.h, so we'll still get posix_memalign
+// when we #include stdlib.h.  Blah.
+#ifdef HAVE_UNISTD_H
+#include <unistd.h>        // for getpagesize()
+#endif
+#include "tcmalloc.h"      // must come early, to pick up posix_memalign
+#include <assert.h>
+#include <stdlib.h>        // defines posix_memalign
+#include <stdio.h>         // for the printf at the end
+#ifdef HAVE_STDINT_H
+#include <stdint.h>        // for uintptr_t
+#endif
+#ifdef HAVE_UNISTD_H
+#include <unistd.h>        // for getpagesize()
+#endif
+// Malloc can be in several places on older versions of OS X.
+#if defined(HAVE_MALLOC_H)
+#include <malloc.h>        // for memalign() and valloc()
+#elif defined(HAVE_MALLOC_MALLOC_H)
+#include <malloc/malloc.h>
+#elif defined(HAVE_SYS_MALLOC_H)
+#include <sys/malloc.h>
+#endif
+#include "base/basictypes.h"
+#include "base/logging.h"
+#include "tests/testutil.h"
+
+
+// Return the next interesting size/delta to check.  Returns -1 if no more.
+static int NextSize(int size) {
+  if (size < 100) {
+    return size+1;
+  } else if (size < 1048576) {
+    // Find next power of two
+    int power = 1;
+    while (power < size) {
+      power <<= 1;
+    }
+
+    // Yield (power-1, power, power+1)
+    if (size < power-1) {
+      return power-1;
+    } else if (size == power-1) {
+      return power;
+    } else {
+      assert(size == power);
+      return power+1;
+    }
+  } else {
+    return -1;
+  }
+}
+
+// Shortform for cast
+static uintptr_t Number(void* p) {
+  return reinterpret_cast<uintptr_t>(p);
+}
+
+// Check alignment
+static void CheckAlignment(void* p, int align) {
+  if ((Number(p) & (align-1)) != 0)
+    LOG(FATAL, "wrong alignment; wanted 0x%x; got %p\n", align, p);
+}
+
+// Fill a buffer of the specified size with a predetermined pattern
+static void Fill(void* p, int n, char seed) {
+  unsigned char* buffer = reinterpret_cast<unsigned char*>(p);
+  for (int i = 0; i < n; i++) {
+    buffer[i] = ((seed + i) & 0xff);
+  }
+}
+
+// Check that the specified buffer has the predetermined pattern
+// generated by Fill()
+static bool Valid(const void* p, int n, char seed) {
+  const unsigned char* buffer = reinterpret_cast<const unsigned char*>(p);
+  for (int i = 0; i < n; i++) {
+    if (buffer[i] != ((seed + i) & 0xff)) {
+      return false;
+    }
+  }
+  return true;
+}
+
+int main(int argc, char** argv) {
+  SetTestResourceLimit();
+
+  // Try allocating data with a bunch of alignments and sizes
+  for (int a = 1; a < 1048576; a *= 2) {
+    for (int s = 0; s != -1; s = NextSize(s)) {
+      void* ptr = memalign(a, s);
+      CheckAlignment(ptr, a);
+      Fill(ptr, s, 'x');
+      CHECK(Valid(ptr, s, 'x'));
+      free(ptr);
+
+      if ((a >= sizeof(void*)) && ((a & (a-1)) == 0)) {
+        CHECK(posix_memalign(&ptr, a, s) == 0);
+        CheckAlignment(ptr, a);
+        Fill(ptr, s, 'y');
+        CHECK(Valid(ptr, s, 'y'));
+        free(ptr);
+      }
+    }
+  }
+
+  {
+    // Check various corner cases
+    void* p1 = memalign(1<<20, 1<<19);
+    void* p2 = memalign(1<<19, 1<<19);
+    void* p3 = memalign(1<<21, 1<<19);
+    CheckAlignment(p1, 1<<20);
+    CheckAlignment(p2, 1<<19);
+    CheckAlignment(p3, 1<<21);
+    Fill(p1, 1<<19, 'a');
+    Fill(p2, 1<<19, 'b');
+    Fill(p3, 1<<19, 'c');
+    CHECK(Valid(p1, 1<<19, 'a'));
+    CHECK(Valid(p2, 1<<19, 'b'));
+    CHECK(Valid(p3, 1<<19, 'c'));
+    free(p1);
+    free(p2);
+    free(p3);
+  }
+
+  {
+    // posix_memalign
+    void* ptr;
+    CHECK(posix_memalign(&ptr, 0, 1) == EINVAL);
+    CHECK(posix_memalign(&ptr, sizeof(void*)/2, 1) == EINVAL);
+    CHECK(posix_memalign(&ptr, sizeof(void*)+1, 1) == EINVAL);
+    CHECK(posix_memalign(&ptr, 4097, 1) == EINVAL);
+
+    // Grab some memory so that the big allocation below will definitely fail.
+    void* p_small = malloc(4*1048576);
+    CHECK(p_small != NULL);
+
+    // Make sure overflow is returned as ENOMEM
+    const size_t zero = 0;
+    static const size_t kMinusNTimes = 10;
+    for ( size_t i = 1; i < kMinusNTimes; ++i ) {
+      int r = posix_memalign(&ptr, 1024, zero - i);
+      CHECK(r == ENOMEM);
+    }
+
+    free(p_small);
+  }
+
+  const int pagesize = getpagesize();
+  {
+    // valloc
+    for (int s = 0; s != -1; s = NextSize(s)) {
+      void* p = valloc(s);
+      CheckAlignment(p, pagesize);
+      Fill(p, s, 'v');
+      CHECK(Valid(p, s, 'v'));
+      free(p);
+    }
+  }
+
+  {
+    // pvalloc
+    for (int s = 0; s != -1; s = NextSize(s)) {
+      void* p = pvalloc(s);
+      CheckAlignment(p, pagesize);
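+      // pvalloc rounds the request up to a whole number of pages, so the
+      // entire rounded-up region (e.g. one full page for s == 1) should be
+      // usable.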
+      int alloc_needed = ((s + pagesize - 1) / pagesize) * pagesize;
+      Fill(p, alloc_needed, 'x');
+      CHECK(Valid(p, alloc_needed, 'x'));
+      free(p);
+    }
+  }
+
+  printf("PASS\n");
+  return 0;
+}
diff --git a/src/tests/packed-cache_test.cc b/src/tests/packed-cache_test.cc
new file mode 100644
index 0000000..befbd77
--- /dev/null
+++ b/src/tests/packed-cache_test.cc
@@ -0,0 +1,63 @@
+// -*- Mode: C++; c-basic-offset: 2; indent-tabs-mode: nil -*-
+// Copyright (c) 2007, Google Inc.
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+//     * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+//     * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+// ---
+// Author: Geoff Pike
+
+#include <stdio.h>
+#include "base/logging.h"
+#include "packed-cache-inl.h"
+
+static const int kHashbits = PackedCache<64, uint64>::kHashbits;
+
+// A basic sanity test.
+void PackedCacheTest_basic() {
+  PackedCache<32, uint32> cache(0);
+  CHECK_EQ(cache.GetOrDefault(0, 1), 0);
+  cache.Put(0, 17);
+  CHECK(cache.Has(0));
+  CHECK_EQ(cache.GetOrDefault(0, 1), 17);
+  cache.Put(19, 99);
+  CHECK(cache.Has(0) && cache.Has(19));
+  CHECK_EQ(cache.GetOrDefault(0, 1), 17);
+  CHECK_EQ(cache.GetOrDefault(19, 1), 99);
+  // Knock <0, 17> out by using a conflicting key.
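+  // (Keys 0 and 1 << kHashbits have identical low kHashbits bits, so they
+  // map to the same cache slot.)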
+  cache.Put(1 << kHashbits, 22);
+  CHECK(!cache.Has(0));
+  CHECK_EQ(cache.GetOrDefault(0, 1), 1);
+  CHECK_EQ(cache.GetOrDefault(1 << kHashbits, 1), 22);
+}
+
+int main(int argc, char **argv) {
+  PackedCacheTest_basic();
+
+  printf("PASS\n");
+  return 0;
+}
diff --git a/src/tests/page_heap_test.cc b/src/tests/page_heap_test.cc
new file mode 100644
index 0000000..e82a1da
--- /dev/null
+++ b/src/tests/page_heap_test.cc
@@ -0,0 +1,169 @@
+// -*- Mode: C++; c-basic-offset: 2; indent-tabs-mode: nil -*-
+// Copyright 2009 Google Inc. All Rights Reserved.
+// Author: fikes@google.com (Andrew Fikes)
+//
+// Use of this source code is governed by a BSD-style license that can
+// be found in the LICENSE file.
+
+#include "config_for_unittests.h"
+#include "page_heap.h"
+#include "system-alloc.h"
+#include <stdio.h>
+#include "base/logging.h"
+#include "common.h"
+
+DECLARE_int64(tcmalloc_heap_limit_mb);
+
+namespace {
+
+// The system will only release memory if the block size is at least as large
+// as the system page size.
+static bool HaveSystemRelease =
+    TCMalloc_SystemRelease(
+      TCMalloc_SystemAlloc(getpagesize(), NULL, 0), getpagesize());
+
+static void CheckStats(const tcmalloc::PageHeap* ph,
+                       uint64_t system_pages,
+                       uint64_t free_pages,
+                       uint64_t unmapped_pages) {
+  tcmalloc::PageHeap::Stats stats = ph->stats();
+
+  if (!HaveSystemRelease) {
+    free_pages += unmapped_pages;
+    unmapped_pages = 0;
+  }
+
+  EXPECT_EQ(system_pages, stats.system_bytes >> kPageShift);
+  EXPECT_EQ(free_pages, stats.free_bytes >> kPageShift);
+  EXPECT_EQ(unmapped_pages, stats.unmapped_bytes >> kPageShift);
+}
+
+static void TestPageHeap_Stats() {
+  tcmalloc::PageHeap* ph = new tcmalloc::PageHeap();
+
+  // Empty page heap
+  CheckStats(ph, 0, 0, 0);
+
+  // Allocate a span 's1'
+  tcmalloc::Span* s1 = ph->New(256);
+  CheckStats(ph, 256, 0, 0);
+
+  // Split span 's1' into 's1', 's2'.  Delete 's2'
+  tcmalloc::Span* s2 = ph->Split(s1, 128);
+  ph->Delete(s2);
+  CheckStats(ph, 256, 128, 0);
+
+  // Unmap deleted span 's2'
+  ph->ReleaseAtLeastNPages(1);
+  CheckStats(ph, 256, 0, 128);
+
+  // Delete span 's1'
+  ph->Delete(s1);
+  CheckStats(ph, 256, 128, 128);
+
+  delete ph;
+}
+
+static void TestPageHeap_Limit() {
+  tcmalloc::PageHeap* ph = new tcmalloc::PageHeap();
+
+  CHECK_EQ(kMaxPages, 1 << (20 - kPageShift));
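+  // (i.e. a span of kMaxPages pages is exactly 1 MiB, the unit used by the
+  // allocations below.)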
+
+  // We do not know how much is taken from the system for other purposes,
+  // so we detect the proper limit:
+  {
+    FLAGS_tcmalloc_heap_limit_mb = 1;
+    tcmalloc::Span* s = NULL;
+    while((s = ph->New(kMaxPages)) == NULL) {
+      FLAGS_tcmalloc_heap_limit_mb++;
+    }
+    FLAGS_tcmalloc_heap_limit_mb += 9;
+    ph->Delete(s);
+    // We are [10, 11) mb from the limit now.
+  }
+
+  // Test AllocLarge and GrowHeap first:
+  {
+    tcmalloc::Span * spans[10];
+    for (int i=0; i<10; ++i) {
+      spans[i] = ph->New(kMaxPages);
+      EXPECT_NE(spans[i], NULL);
+    }
+    EXPECT_EQ(ph->New(kMaxPages), NULL);
+
+    for (int i=0; i<10; i += 2) {
+      ph->Delete(spans[i]);
+    }
+
+    tcmalloc::Span *defragmented = ph->New(5 * kMaxPages);
+
+    if (HaveSystemRelease) {
+      // EnsureLimit should release deleted normal spans
+      EXPECT_NE(defragmented, NULL);
+      EXPECT_TRUE(ph->CheckExpensive());
+      ph->Delete(defragmented);
+    }
+    else
+    {
+      EXPECT_EQ(defragmented, NULL);
+      EXPECT_TRUE(ph->CheckExpensive());
+    }
+
+    for (int i=1; i<10; i += 2) {
+      ph->Delete(spans[i]);
+    }
+  }
+
+  // Once again, this time testing the small-span lists (half-size spans):
+  {
+    tcmalloc::Span * spans[20];
+    for (int i=0; i<20; ++i) {
+      spans[i] = ph->New(kMaxPages >> 1);
+      EXPECT_NE(spans[i], NULL);
+    }
+    // one more half size allocation may be possible:
+    tcmalloc::Span * lastHalf = ph->New(kMaxPages >> 1);
+    EXPECT_EQ(ph->New(kMaxPages >> 1), NULL);
+
+    for (int i=0; i<20; i += 2) {
+      ph->Delete(spans[i]);
+    }
+
+    for(Length len = kMaxPages >> 2; len < 5 * kMaxPages; len = len << 1)
+    {
+      if(len <= kMaxPages >> 1 || HaveSystemRelease) {
+        tcmalloc::Span *s = ph->New(len);
+        EXPECT_NE(s, NULL);
+        ph->Delete(s);
+      }
+    }
+
+    EXPECT_TRUE(ph->CheckExpensive());
+
+    for (int i=1; i<20; i += 2) {
+      ph->Delete(spans[i]);
+    }
+
+    if (lastHalf != NULL) {
+      ph->Delete(lastHalf);
+    }
+  }
+
+  delete ph;
+}
+
+}  // namespace
+
+int main(int argc, char **argv) {
+  TestPageHeap_Stats();
+  TestPageHeap_Limit();
+  printf("PASS\n");
+  // On Windows, library destructors call getenv, which calls malloc, which
+  // fails due to our exhausted heap limit.  That in turn causes a stack
+  // overflow, because the log message printed for the failed allocation
+  // triggers further malloc calls.
+  //
+  // To keep us out of trouble we just drop the malloc limit.
+  FLAGS_tcmalloc_heap_limit_mb = 0;
+  return 0;
+}
diff --git a/src/tests/pagemap_unittest.cc b/src/tests/pagemap_unittest.cc
new file mode 100644
index 0000000..88d46e7
--- /dev/null
+++ b/src/tests/pagemap_unittest.cc
@@ -0,0 +1,178 @@
+// -*- Mode: C++; c-basic-offset: 2; indent-tabs-mode: nil -*-
+// Copyright (c) 2003, Google Inc.
+// All rights reserved.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+// 
+//     * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+//     * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+// 
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+// ---
+// Author: Sanjay Ghemawat
+
+#include "config_for_unittests.h"
+#include <stdio.h>
+#include <stdlib.h>
+#if defined HAVE_STDINT_H
+#include <stdint.h>             // to get intptr_t
+#elif defined HAVE_INTTYPES_H
+#include <inttypes.h>           // another place intptr_t might be defined
+#endif
+#include <sys/types.h>
+#include <vector>
+#include "base/logging.h"
+#include "pagemap.h"
+
+using std::vector;
+
+static void Permute(vector<intptr_t>* elements) {
+  if (elements->empty())
+    return;
+  const size_t num_elements = elements->size();
+  for (size_t i = num_elements - 1; i > 0; --i) {
+    const size_t newpos = rand() % (i + 1);
+    const intptr_t tmp = (*elements)[i];   // swap
+    (*elements)[i] = (*elements)[newpos];
+    (*elements)[newpos] = tmp;
+  }
+}
+
+// Note: we leak memory every time a map is constructed, so do not
+// create too many maps.
+
+// Test specified map type
+template <class Type>
+void TestMap(int limit, bool limit_is_below_the_overflow_boundary) {
+  RAW_LOG(INFO, "Running test with %d iterations...\n", limit);
+
+  { // Test sequential ensure/assignment
+    Type map(malloc);
+    for (intptr_t i = 0; i < static_cast<intptr_t>(limit); i++) {
+      map.Ensure(i, 1);
+      map.set(i, (void*)(i+1));
+      CHECK_EQ(map.get(i), (void*)(i+1));
+    }
+    for (intptr_t i = 0; i < static_cast<intptr_t>(limit); i++) {
+      CHECK_EQ(map.get(i), (void*)(i+1));
+    }
+  }
+
+  { // Test bulk Ensure
+    Type map(malloc);
+    map.Ensure(0, limit);
+    for (intptr_t i = 0; i < static_cast<intptr_t>(limit); i++) {
+      map.set(i, (void*)(i+1));
+      CHECK_EQ(map.get(i), (void*)(i+1));
+    }
+    for (intptr_t i = 0; i < static_cast<intptr_t>(limit); i++) {
+      CHECK_EQ(map.get(i), (void*)(i+1));
+    }
+  }
+
+  // Test that we correctly notice overflow
+  {
+    Type map(malloc);
+    CHECK_EQ(map.Ensure(limit, limit+1), limit_is_below_the_overflow_boundary);
+  }
+
+  { // Test randomized accesses
+    srand(301);   // srand isn't great, but it's portable
+    vector<intptr_t> elements;
+    for (intptr_t i = 0; i < static_cast<intptr_t>(limit); i++) elements.push_back(i);
+    Permute(&elements);
+
+    Type map(malloc);
+    for (intptr_t i = 0; i < static_cast<intptr_t>(limit); i++) {
+      map.Ensure(elements[i], 1);
+      map.set(elements[i], (void*)(elements[i]+1));
+      CHECK_EQ(map.get(elements[i]), (void*)(elements[i]+1));
+    }
+    for (intptr_t i = 0; i < static_cast<intptr_t>(limit); i++) {
+      CHECK_EQ(map.get(i), (void*)(i+1));
+    }
+  }
+}
+
+// REQUIRES: BITS==10, i.e., valid range is [0,1023].
+// Representations for different types will end up being:
+//    PageMap1: array[1024]
+//    PageMap2: array[32][32]
+//    PageMap3: array[16][16][4]
+template <class Type>
+void TestNext(const char* name) {
+  RAW_LOG(ERROR, "Running NextTest %s\n", name);
+  Type map(malloc);
+  char a, b, c, d, e;
+
+  // When map is empty
+  CHECK(map.Next(0) == NULL);
+  CHECK(map.Next(5) == NULL);
+  CHECK(map.Next(1<<30) == NULL);
+
+  // Add a single value
+  map.Ensure(40, 1);
+  map.set(40, &a);
+  CHECK(map.Next(0) == &a);
+  CHECK(map.Next(39) == &a);
+  CHECK(map.Next(40) == &a);
+  CHECK(map.Next(41) == NULL);
+  CHECK(map.Next(1<<30) == NULL);
+
+  // Add a few values
+  map.Ensure(41, 1);
+  map.Ensure(100, 3);
+  map.set(41, &b);
+  map.set(100, &c);
+  map.set(101, &d);
+  map.set(102, &e);
+  CHECK(map.Next(0) == &a);
+  CHECK(map.Next(39) == &a);
+  CHECK(map.Next(40) == &a);
+  CHECK(map.Next(41) == &b);
+  CHECK(map.Next(42) == &c);
+  CHECK(map.Next(63) == &c);
+  CHECK(map.Next(64) == &c);
+  CHECK(map.Next(65) == &c);
+  CHECK(map.Next(99) == &c);
+  CHECK(map.Next(100) == &c);
+  CHECK(map.Next(101) == &d);
+  CHECK(map.Next(102) == &e);
+  CHECK(map.Next(103) == NULL);
+}
+
+int main(int argc, char** argv) {
+  TestMap< TCMalloc_PageMap1<10> > (100, true);
+  TestMap< TCMalloc_PageMap1<10> > (1 << 10, false);
+  TestMap< TCMalloc_PageMap2<20> > (100, true);
+  TestMap< TCMalloc_PageMap2<20> > (1 << 20, false);
+  TestMap< TCMalloc_PageMap3<20> > (100, true);
+  TestMap< TCMalloc_PageMap3<20> > (1 << 20, false);
+
+  TestNext< TCMalloc_PageMap1<10> >("PageMap1");
+  TestNext< TCMalloc_PageMap2<10> >("PageMap2");
+  TestNext< TCMalloc_PageMap3<10> >("PageMap3");
+
+  printf("PASS\n");
+  return 0;
+}
diff --git a/src/tests/profile-handler_unittest.cc b/src/tests/profile-handler_unittest.cc
new file mode 100644
index 0000000..2984d0d
--- /dev/null
+++ b/src/tests/profile-handler_unittest.cc
@@ -0,0 +1,525 @@
+// -*- Mode: C++; c-basic-offset: 2; indent-tabs-mode: nil -*-
+// Copyright 2009 Google Inc. All Rights Reserved.
+// Author: Nabeel Mian (nabeelmian@google.com)
+//         Chris Demetriou (cgd@google.com)
+//
+// Use of this source code is governed by a BSD-style license that can
+// be found in the LICENSE file.
+//
+//
+// This file contains the unit tests for profile-handler.h interface.
+//
+// It is linked into three separate unit tests:
+//     profile-handler_unittest tests basic functionality
+//     profile-handler_disable_test tests that the profiler
+//         is disabled with --install_signal_handlers=false
+//     profile-handler_conflict_test tests that the profiler
+//         is disabled when a SIGPROF handler is registered before InitGoogle.
+
+#include "config.h"
+#include "profile-handler.h"
+
+#include <assert.h>
+#include <pthread.h>
+#include <sys/time.h>
+#include <time.h>
+#include "base/logging.h"
+#include "base/simple_mutex.h"
+
+// Some helpful macros for the test class
+#define TEST_F(cls, fn)    void cls :: fn()
+
+// Do we expect the profiler to be enabled?
+DEFINE_bool(test_profiler_enabled, true,
+            "expect profiler to be enabled during tests");
+
+// Should we look at the kernel signal handler settings during the test?
+// Not if we're in conflict_test, because we can't distinguish its nop
+// handler from the real one.
+DEFINE_bool(test_profiler_signal_handler, true,
+            "check profiler signal handler during tests");
+
+namespace {
+
+// TODO(csilvers): error-checking on the pthreads routines
+class Thread {
+ public:
+  Thread() : joinable_(false) { }
+  virtual ~Thread() { }
+  void SetJoinable(bool value) { joinable_ = value; }
+  void Start() {
+    pthread_attr_t attr;
+    pthread_attr_init(&attr);
+    pthread_attr_setdetachstate(&attr, joinable_ ? PTHREAD_CREATE_JOINABLE
+                                                 : PTHREAD_CREATE_DETACHED);
+    pthread_create(&thread_, &attr, &DoRun, this);
+    pthread_attr_destroy(&attr);
+  }
+  void Join()  {
+    assert(joinable_);
+    pthread_join(thread_, NULL);
+  }
+  virtual void Run() = 0;
+ private:
+  static void* DoRun(void* cls) {
+    ProfileHandlerRegisterThread();
+    reinterpret_cast<Thread*>(cls)->Run();
+    return NULL;
+  }
+  pthread_t thread_;
+  bool joinable_;
+};
+
+// Sleep interval in nanoseconds.  ITIMER_PROF goes off only after the
+// specified CPU time is consumed.  Under heavy load this process may not get
+// scheduled in a timely fashion.  Therefore, give enough time (20x the
+// ProfileHandler timer interval of 10ms (100Hz)) for this process to
+// accumulate enough CPU time to get a profile tick.
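+// (20 ticks * 10 ms/tick = 200 ms = 200,000,000 ns.)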
+int kSleepInterval = 200000000;
+
+// Sleep interval in nanoseconds, long enough to ensure that an expired timer
+// gets reset.
+int kTimerResetInterval = 5000000;
+
+// Whether each thread has separate timers.
+static bool linux_per_thread_timers_mode_ = false;
+static bool timer_separate_ = false;
+static int timer_type_ = ITIMER_PROF;
+static int signal_number_ = SIGPROF;
+
+// Delays processing by the specified number of nanoseconds.  'delay_ns'
+// must be less than the number of nanoseconds in a second (1000000000).
+void Delay(int delay_ns) {
+  static const int kNumNSecInSecond = 1000000000;
+  EXPECT_LT(delay_ns, kNumNSecInSecond);
+  struct timespec delay = { 0, delay_ns };
+  nanosleep(&delay, 0);
+}
+
+// Checks whether the profile timer is enabled for the current thread.
+bool IsTimerEnabled() {
+  itimerval current_timer;
+  EXPECT_EQ(0, getitimer(timer_type_, &current_timer));
+  if ((current_timer.it_value.tv_sec == 0) &&
+      (current_timer.it_value.tv_usec != 0)) {
+    // Maybe the timer has expired.  Sleep for a bit and check again.
+    Delay(kTimerResetInterval);
+    EXPECT_EQ(0, getitimer(timer_type_, &current_timer));
+  }
+  return (current_timer.it_value.tv_sec != 0 ||
+          current_timer.it_value.tv_usec != 0);
+}
+
+class VirtualTimerGetterThread : public Thread {
+ public:
+  VirtualTimerGetterThread() {
+    memset(&virtual_timer_, 0, sizeof virtual_timer_);
+  }
+  struct itimerval virtual_timer_;
+
+ private:
+  void Run() {
+    CHECK_EQ(0, getitimer(ITIMER_VIRTUAL, &virtual_timer_));
+  }
+};
+
+// This function checks whether the timers are shared between threads.  This
+// function spawns a thread, so use it carefully when testing thread-dependent
+// behaviour.
+static bool threads_have_separate_timers() {
+  struct itimerval new_timer_val;
+
+  // Enable the virtual timer in the current thread.
+  memset(&new_timer_val, 0, sizeof new_timer_val);
+  new_timer_val.it_value.tv_sec = 1000000;  // seconds
+  CHECK_EQ(0, setitimer(ITIMER_VIRTUAL, &new_timer_val, NULL));
+
+  // Spawn a thread, get the virtual timer's value there.
+  VirtualTimerGetterThread thread;
+  thread.SetJoinable(true);
+  thread.Start();
+  thread.Join();
+
+  // Disable timer here.
+  memset(&new_timer_val, 0, sizeof new_timer_val);
+  CHECK_EQ(0, setitimer(ITIMER_VIRTUAL, &new_timer_val, NULL));
+
+  bool target_timer_enabled = (thread.virtual_timer_.it_value.tv_sec != 0 ||
+                               thread.virtual_timer_.it_value.tv_usec != 0);
+  if (!target_timer_enabled) {
+    LOG(INFO, "threads have separate timers");
+    return true;
+  } else {
+    LOG(INFO, "threads have shared timers");
+    return false;
+  }
+}
+
+// Dummy worker thread to accumulate cpu time.
+class BusyThread : public Thread {
+ public:
+  BusyThread() : stop_work_(false) {
+  }
+
+  // Setter/Getters
+  bool stop_work() {
+    MutexLock lock(&mu_);
+    return stop_work_;
+  }
+  void set_stop_work(bool stop_work) {
+    MutexLock lock(&mu_);
+    stop_work_ = stop_work;
+  }
+
+ private:
+  // Protects stop_work_ below.
+  Mutex mu_;
+  // Whether to stop work?
+  bool stop_work_;
+
+  // Do work until asked to stop.
+  void Run() {
+    while (!stop_work()) {
+    }
+    // If timers are separate, check that timer is enabled for this thread.
+    EXPECT_TRUE(linux_per_thread_timers_mode_ || !timer_separate_ || IsTimerEnabled());
+  }
+};
+
+class NullThread : public Thread {
+ private:
+  void Run() {
+    // If timers are separate, check that timer is enabled for this thread.
+    EXPECT_TRUE(linux_per_thread_timers_mode_ || !timer_separate_ || IsTimerEnabled());
+  }
+};
+
+// Signal handler which tracks the profile timer ticks.
+static void TickCounter(int sig, siginfo_t* sig_info, void *vuc,
+                        void* tick_counter) {
+  int* counter = static_cast<int*>(tick_counter);
+  ++(*counter);
+}
+
+// This class tests the profile-handler.h interface.
+class ProfileHandlerTest {
+ protected:
+
+  // Determines whether threads have separate timers.
+  static void SetUpTestCase() {
+    timer_type_ = (getenv("CPUPROFILE_REALTIME") ? ITIMER_REAL : ITIMER_PROF);
+    signal_number_ = (getenv("CPUPROFILE_REALTIME") ? SIGALRM : SIGPROF);
+
+    timer_separate_ = threads_have_separate_timers();
+#if HAVE_LINUX_SIGEV_THREAD_ID
+    linux_per_thread_timers_mode_ = (getenv("CPUPROFILE_PER_THREAD_TIMERS") != NULL);
+    const char *signal_number = getenv("CPUPROFILE_TIMER_SIGNAL");
+    if (signal_number) {
+      signal_number_ = strtol(signal_number, NULL, 0);
+      linux_per_thread_timers_mode_ = true;
+    }
+#endif
+    Delay(kTimerResetInterval);
+  }
+
+  // Sets up the profile timers and SIGPROF/SIGALRM handler in a known state.
+  // It does the following:
+  // 1. Unregisters all the callbacks, stops the timer (if shared) and
+  //    clears out timer_sharing state in the ProfileHandler. This clears
+  //    out any state left behind by the previous test or during module
+  //    initialization when the test program was started.
+  // 2. Spawns two threads which will be registered with the ProfileHandler.
+  //    At this time ProfileHandler knows if the timers are shared.
+  // 3. Starts a busy worker thread to accumulate CPU usage.
+  virtual void SetUp() {
+    // Reset the state of ProfileHandler between each test. This unregisters
+    // all callbacks, stops timer (if shared) and clears timer sharing state.
+    ProfileHandlerReset();
+    EXPECT_EQ(0, GetCallbackCount());
+    VerifyDisabled();
+    // ProfileHandler requires at least two threads to be registered to determine
+    // whether timers are shared.
+    RegisterThread();
+    RegisterThread();
+    // Now that two threads are started, verify that the signal handler is
+    // disabled and the timers are correctly enabled/disabled.
+    VerifyDisabled();
+    // Start worker to accumulate cpu usage.
+    StartWorker();
+  }
+
+  virtual void TearDown() {
+    ProfileHandlerReset();
+    // Stops the worker thread.
+    StopWorker();
+  }
+
+  // Starts a no-op thread that gets registered with the ProfileHandler. Waits
+  // for the thread to stop.
+  void RegisterThread() {
+    NullThread t;
+    t.SetJoinable(true);
+    t.Start();
+    t.Join();
+  }
+
+  // Starts a busy worker thread to accumulate cpu time. There should be only
+  // one busy worker running. This is required for the case where there are
+  // separate timers for each thread.
+  void StartWorker() {
+    busy_worker_ = new BusyThread();
+    busy_worker_->SetJoinable(true);
+    busy_worker_->Start();
+    // Wait for worker to start up and register with the ProfileHandler.
+    // TODO(nabeelmian) This may not work under very heavy load.
+    Delay(kSleepInterval);
+  }
+
+  // Stops the worker thread.
+  void StopWorker() {
+    busy_worker_->set_stop_work(true);
+    busy_worker_->Join();
+    delete busy_worker_;
+  }
+
+  // Checks whether SIGPROF/SIGALRM signal handler is enabled.
+  bool IsSignalEnabled() {
+    struct sigaction sa;
+    CHECK_EQ(sigaction(signal_number_, NULL, &sa), 0);
+    return ((sa.sa_handler == SIG_IGN) || (sa.sa_handler == SIG_DFL)) ?
+        false : true;
+  }
+
+  // Gets the number of callbacks registered with the ProfileHandler.
+  uint32 GetCallbackCount() {
+    ProfileHandlerState state;
+    ProfileHandlerGetState(&state);
+    return state.callback_count;
+  }
+
+  // Gets the current ProfileHandler interrupt count.
+  uint64 GetInterruptCount() {
+    ProfileHandlerState state;
+    ProfileHandlerGetState(&state);
+    return state.interrupts;
+  }
+
+  // Verifies that a callback is correctly registered and receiving
+  // profile ticks.
+  void VerifyRegistration(const int& tick_counter) {
+    // Check the callback count.
+    EXPECT_GT(GetCallbackCount(), 0);
+    // Check that the profile timer is enabled.
+    EXPECT_EQ(FLAGS_test_profiler_enabled, linux_per_thread_timers_mode_ || IsTimerEnabled());
+    // Check that the signal handler is enabled.
+    if (FLAGS_test_profiler_signal_handler) {
+      EXPECT_EQ(FLAGS_test_profiler_enabled, IsSignalEnabled());
+    }
+    uint64 interrupts_before = GetInterruptCount();
+    // Sleep for a bit and check that tick counter is making progress.
+    int old_tick_count = tick_counter;
+    Delay(kSleepInterval);
+    int new_tick_count = tick_counter;
+    uint64 interrupts_after = GetInterruptCount();
+    if (FLAGS_test_profiler_enabled) {
+      EXPECT_GT(new_tick_count, old_tick_count);
+      EXPECT_GT(interrupts_after, interrupts_before);
+    } else {
+      EXPECT_EQ(new_tick_count, old_tick_count);
+      EXPECT_EQ(interrupts_after, interrupts_before);
+    }
+  }
+
+  // Verifies that a callback is not receiving profile ticks.
+  void VerifyUnregistration(const int& tick_counter) {
+    // Sleep for a bit and check that tick counter is not making progress.
+    int old_tick_count = tick_counter;
+    Delay(kSleepInterval);
+    int new_tick_count = tick_counter;
+    EXPECT_EQ(old_tick_count, new_tick_count);
+    // If no callbacks, signal handler and shared timer should be disabled.
+    if (GetCallbackCount() == 0) {
+      if (FLAGS_test_profiler_signal_handler) {
+        EXPECT_FALSE(IsSignalEnabled());
+      }
+      if (!linux_per_thread_timers_mode_) {
+        if (timer_separate_) {
+          EXPECT_TRUE(IsTimerEnabled());
+        } else {
+          EXPECT_FALSE(IsTimerEnabled());
+        }
+      }
+    }
+  }
+
+  // Verifies that the SIGPROF/SIGALRM interrupt handler is disabled and the
+  // timer, if shared, is disabled. Expects the worker to be running.
+  void VerifyDisabled() {
+    // Check that the signal handler is disabled.
+    if (FLAGS_test_profiler_signal_handler) {
+      EXPECT_FALSE(IsSignalEnabled());
+    }
+    // Check that the callback count is 0.
+    EXPECT_EQ(0, GetCallbackCount());
+    // Check that the timer is disabled if shared, enabled otherwise.
+    if (!linux_per_thread_timers_mode_) {
+      if (timer_separate_) {
+        EXPECT_TRUE(IsTimerEnabled());
+      } else {
+        EXPECT_FALSE(IsTimerEnabled());
+      }
+    }
+    // Verify that the ProfileHandler is not accumulating profile ticks.
+    uint64 interrupts_before = GetInterruptCount();
+    Delay(kSleepInterval);
+    uint64 interrupts_after = GetInterruptCount();
+    EXPECT_EQ(interrupts_before, interrupts_after);
+  }
+
+  // Registers a callback and waits for kTimerResetInterval for timers to get
+  // reset.
+  ProfileHandlerToken* RegisterCallback(void* callback_arg) {
+    ProfileHandlerToken* token = ProfileHandlerRegisterCallback(
+        TickCounter, callback_arg);
+    Delay(kTimerResetInterval);
+    return token;
+  }
+
+  // Unregisters a callback and waits for kTimerResetInterval for timers to get
+  // reset.
+  void UnregisterCallback(ProfileHandlerToken* token) {
+    ProfileHandlerUnregisterCallback(token);
+    Delay(kTimerResetInterval);
+  }
+
+  // Busy worker thread to accumulate cpu usage.
+  BusyThread* busy_worker_;
+
+ private:
+  // The tests to run
+  void RegisterUnregisterCallback();
+  void MultipleCallbacks();
+  void Reset();
+  void RegisterCallbackBeforeThread();
+
+ public:
+#define RUN(test)  do {                         \
+    printf("Running %s\n", #test);              \
+    ProfileHandlerTest pht;                     \
+    pht.SetUp();                                \
+    pht.test();                                 \
+    pht.TearDown();                             \
+} while (0)
+
+  static int RUN_ALL_TESTS() {
+    SetUpTestCase();
+    RUN(RegisterUnregisterCallback);
+    RUN(MultipleCallbacks);
+    RUN(Reset);
+    RUN(RegisterCallbackBeforeThread);
+    printf("Done\n");
+    return 0;
+  }
+};
+
+// Verifies ProfileHandlerRegisterCallback and
+// ProfileHandlerUnregisterCallback.
+TEST_F(ProfileHandlerTest, RegisterUnregisterCallback) {
+  int tick_count = 0;
+  ProfileHandlerToken* token = RegisterCallback(&tick_count);
+  VerifyRegistration(tick_count);
+  UnregisterCallback(token);
+  VerifyUnregistration(tick_count);
+}
+
+// Verifies that multiple callbacks can be registered.
+TEST_F(ProfileHandlerTest, MultipleCallbacks) {
+  // Register first callback.
+  int first_tick_count;
+  ProfileHandlerToken* token1 = RegisterCallback(&first_tick_count);
+  // Check that callback was registered correctly.
+  VerifyRegistration(first_tick_count);
+  EXPECT_EQ(1, GetCallbackCount());
+
+  // Register second callback.
+  int second_tick_count;
+  ProfileHandlerToken* token2 = RegisterCallback(&second_tick_count);
+  // Check that callback was registered correctly.
+  VerifyRegistration(second_tick_count);
+  EXPECT_EQ(2, GetCallbackCount());
+
+  // Unregister first callback.
+  UnregisterCallback(token1);
+  VerifyUnregistration(first_tick_count);
+  EXPECT_EQ(1, GetCallbackCount());
+  // Verify that second callback is still registered.
+  VerifyRegistration(second_tick_count);
+
+  // Unregister second callback.
+  UnregisterCallback(token2);
+  VerifyUnregistration(second_tick_count);
+  EXPECT_EQ(0, GetCallbackCount());
+
+  // Verify that the signal handler and timers are correctly disabled.
+  VerifyDisabled();
+}
+
+// Verifies ProfileHandlerReset
+TEST_F(ProfileHandlerTest, Reset) {
+  // Verify that the profile timer interrupt is disabled.
+  VerifyDisabled();
+  int first_tick_count = 0;
+  RegisterCallback(&first_tick_count);
+  VerifyRegistration(first_tick_count);
+  EXPECT_EQ(1, GetCallbackCount());
+
+  // Register second callback.
+  int second_tick_count = 0;
+  RegisterCallback(&second_tick_count);
+  VerifyRegistration(second_tick_count);
+  EXPECT_EQ(2, GetCallbackCount());
+
+  // Reset the profile handler and verify that the callbacks were correctly
+  // unregistered and the timer/signal are disabled.
+  ProfileHandlerReset();
+  VerifyUnregistration(first_tick_count);
+  VerifyUnregistration(second_tick_count);
+  VerifyDisabled();
+}
+
+// Verifies that ProfileHandler correctly handles a case where a callback was
+// registered before the second thread started.
+TEST_F(ProfileHandlerTest, RegisterCallbackBeforeThread) {
+  // Stop the worker.
+  StopWorker();
+  // Unregister all existing callbacks, stop the timer (if shared), disable
+  // the signal handler and reset the timer sharing state in the Profile
+  // Handler.
+  ProfileHandlerReset();
+  EXPECT_EQ(0, GetCallbackCount());
+  VerifyDisabled();
+
+  // Start the worker. At this time ProfileHandler doesn't know if timers are
+  // shared as only one thread has registered so far.
+  StartWorker();
+  // Register a callback and check that profile ticks are being delivered.
+  int tick_count = 0;
+  RegisterCallback(&tick_count);
+  EXPECT_EQ(1, GetCallbackCount());
+  VerifyRegistration(tick_count);
+
+  // Register a second thread and verify that timer and signal handler are
+  // correctly enabled.
+  RegisterThread();
+  EXPECT_EQ(1, GetCallbackCount());
+  EXPECT_EQ(FLAGS_test_profiler_enabled, linux_per_thread_timers_mode_ || IsTimerEnabled());
+  if (FLAGS_test_profiler_signal_handler) {
+    EXPECT_EQ(FLAGS_test_profiler_enabled, IsSignalEnabled());
+  }
+}
+
+}  // namespace
+
+int main(int argc, char** argv) {
+  return ProfileHandlerTest::RUN_ALL_TESTS();
+}
diff --git a/src/tests/profiledata_unittest.cc b/src/tests/profiledata_unittest.cc
new file mode 100644
index 0000000..972c1b0
--- /dev/null
+++ b/src/tests/profiledata_unittest.cc
@@ -0,0 +1,611 @@
+// -*- Mode: C++; c-basic-offset: 2; indent-tabs-mode: nil -*-
+// Copyright (c) 2007, Google Inc.
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+//     * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+//     * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// ---
+// Author: Chris Demetriou
+//
+// This file contains the unit tests for the ProfileData class.
+
+#if defined HAVE_STDINT_H
+#include <stdint.h>             // to get uintptr_t
+#elif defined HAVE_INTTYPES_H
+#include <inttypes.h>           // another place uintptr_t might be defined
+#endif
+#include <sys/stat.h>
+#include <sys/types.h>
+#include <fcntl.h>
+#include <string.h>
+#include <string>
+
+#include "profiledata.h"
+
+#include "base/commandlineflags.h"
+#include "base/logging.h"
+
+using std::string;
+
+// Some helpful macros for the test class
+#define TEST_F(cls, fn)    void cls :: fn()
+
+namespace {
+
+template<typename T> class scoped_array {
+ public:
+  scoped_array(T* data) : data_(data) { }
+  ~scoped_array() { delete[] data_; }
+  T* get() { return data_; }
+  T& operator[](int i) { return data_[i]; }
+ private:
+  T* const data_;
+};
+
+// Re-runs fn until it doesn't cause EINTR.
+#define NO_INTR(fn)   do {} while ((fn) < 0 && errno == EINTR)
+
+// Read up to "count" bytes from file descriptor "fd" into the buffer
+// starting at "buf" while handling short reads and EINTR.  On
+// success, return the number of bytes read.  Otherwise, return -1.
+static ssize_t ReadPersistent(const int fd, void *buf, const size_t count) {
+  CHECK_GE(fd, 0);
+  char *buf0 = reinterpret_cast<char *>(buf);
+  ssize_t num_bytes = 0;
+  while (num_bytes < count) {
+    ssize_t len;
+    NO_INTR(len = read(fd, buf0 + num_bytes, count - num_bytes));
+    if (len < 0) {  // There was an error other than EINTR.
+      return -1;
+    }
+    if (len == 0) {  // Reached EOF.
+      break;
+    }
+    num_bytes += len;
+  }
+  CHECK(num_bytes <= count);
+  return num_bytes;
+}
+
+// Thin wrapper around a file descriptor so that the file descriptor
+// gets closed for sure.
+struct FileDescriptor {
+  const int fd_;
+  explicit FileDescriptor(int fd) : fd_(fd) {}
+  ~FileDescriptor() {
+    if (fd_ >= 0) {
+      NO_INTR(close(fd_));
+    }
+  }
+  int get() { return fd_; }
+};
+
+// Must be the same type as ProfileData::Slot.
+typedef uintptr_t ProfileDataSlot;
+
+// Quick and dirty function to make a number into a void* for use in a
+// sample.
+inline void* V(intptr_t x) { return reinterpret_cast<void*>(x); }
+
+// String returned by ProfileDataChecker helper functions to indicate success.
+const char kNoError[] = "";
+
+class ProfileDataChecker {
+ public:
+  ProfileDataChecker() {
+    const char* tmpdir = getenv("TMPDIR");
+    if (tmpdir == NULL)
+      tmpdir = "/tmp";
+    mkdir(tmpdir, 0755);     // if necessary
+    filename_ = string(tmpdir) + "/profiledata_unittest.tmp";
+  }
+
+  string filename() const { return filename_; }
+
+  // Checks the first 'num_slots' profile data slots in the file
+  // against the data pointed to by 'slots'.  Returns kNoError if the
+  // data matched, otherwise returns an indication of the cause of the
+  // mismatch.
+  string Check(const ProfileDataSlot* slots, int num_slots) {
+    return CheckWithSkips(slots, num_slots, NULL, 0);
+  }
+
+  // Checks the first 'num_slots' profile data slots in the file
+  // against the data pointed to by 'slots', skipping over entries
+  // described by 'skips' and 'num_skips'.
+  //
+  // 'skips' must be a sorted list of (0-based) slot numbers to be
+  // skipped, of length 'num_skips'.  Note that 'num_slots' includes
+  // any skipped slots, i.e., the first 'num_slots' profile data slots
+  // will be considered, but some may be skipped.
+  //
+  // Returns kNoError if the data matched, otherwise returns an
+  // indication of the cause of the mismatch.
+  string CheckWithSkips(const ProfileDataSlot* slots, int num_slots,
+                        const int* skips, int num_skips);
+
+  // Validate that a profile is correctly formed.  The profile is
+  // assumed to have been created by the same kind of binary (e.g.,
+  // same slot size, same endian, etc.) as is validating the profile.
+  //
+  // Returns kNoError if the profile appears valid, otherwise returns
+  // an indication of the problem with the profile.
+  string ValidateProfile();
+
+ private:
+  string filename_;
+};
+
+string ProfileDataChecker::CheckWithSkips(const ProfileDataSlot* slots,
+                                          int num_slots, const int* skips,
+                                          int num_skips) {
+  FileDescriptor fd(open(filename_.c_str(), O_RDONLY));
+  if (fd.get() < 0)
+    return "file open error";
+
+  scoped_array<ProfileDataSlot> filedata(new ProfileDataSlot[num_slots]);
+  size_t expected_bytes = num_slots * sizeof filedata[0];
+  ssize_t bytes_read = ReadPersistent(fd.get(), filedata.get(), expected_bytes);
+  if (expected_bytes != bytes_read)
+    return "file too small";
+
+  for (int i = 0; i < num_slots; i++) {
+    if (num_skips > 0 && *skips == i) {
+      num_skips--;
+      skips++;
+      continue;
+    }
+    if (slots[i] != filedata[i])
+      return "data mismatch";
+  }
+  return kNoError;
+}
+
+string ProfileDataChecker::ValidateProfile() {
+  FileDescriptor fd(open(filename_.c_str(), O_RDONLY));
+  if (fd.get() < 0)
+    return "file open error";
+
+  struct stat statbuf;
+  if (fstat(fd.get(), &statbuf) != 0)
+    return "fstat error";
+  if (statbuf.st_size != static_cast<ssize_t>(statbuf.st_size))
+    return "file impossibly large";
+  ssize_t filesize = statbuf.st_size;
+
+  scoped_array<char> filedata(new char[filesize]);
+  if (ReadPersistent(fd.get(), filedata.get(), filesize) != filesize)
+    return "read of whole file failed";
+
+  // Must have enough data for the header and the trailer.
+  if (filesize < (5 + 3) * sizeof(ProfileDataSlot))
+    return "not enough data in profile for header + trailer";
+
+  // Check the header
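+  // Header layout (one ProfileDataSlot per field), as verified by the
+  // checks below:
+  //   [0] count (0)  [1] num_slots (3)  [2] format version (0)
+  //   [3] sampling period  [4] padding (0)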
+  if (reinterpret_cast<ProfileDataSlot*>(filedata.get())[0] != 0)
+    return "error in header: non-zero count";
+  if (reinterpret_cast<ProfileDataSlot*>(filedata.get())[1] != 3)
+    return "error in header: num_slots != 3";
+  if (reinterpret_cast<ProfileDataSlot*>(filedata.get())[2] != 0)
+    return "error in header: non-zero format version";
+  // Period (slot 3) can have any value.
+  if (reinterpret_cast<ProfileDataSlot*>(filedata.get())[4] != 0)
+    return "error in header: non-zero padding value";
+  ssize_t cur_offset = 5 * sizeof(ProfileDataSlot);
+
+  // While there are samples, skip them.  Each sample consists of
+  // at least three slots.
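+  // Each sample record is [count, num_pcs, pc_1, ..., pc_num_pcs]; the
+  // trailer record [0, 1, 0] marks the end of the samples.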
+  bool seen_trailer = false;
+  while (!seen_trailer) {
+    if (cur_offset > filesize - 3 * sizeof(ProfileDataSlot))
+      return "truncated sample header";
+    ProfileDataSlot* sample =
+        reinterpret_cast<ProfileDataSlot*>(filedata.get() + cur_offset);
+    ProfileDataSlot slots_this_sample = 2 + sample[1];
+    ssize_t size_this_sample = slots_this_sample * sizeof(ProfileDataSlot);
+    if (cur_offset > filesize - size_this_sample)
+      return "truncated sample";
+
+    if (sample[0] == 0 && sample[1] == 1 && sample[2] == 0) {
+      seen_trailer = true;
+    } else {
+      if (sample[0] < 1)
+        return "error in sample: sample count < 1";
+      if (sample[1] < 1)
+        return "error in sample: num_pcs < 1";
+      for (int i = 2; i < slots_this_sample; i++) {
+        if (sample[i] == 0)
+          return "error in sample: NULL PC";
+      }
+    }
+    cur_offset += size_this_sample;
+  }
+
+  // There must be at least one line in the (text) list of mapped objects,
+  // and it must be terminated by a newline.  Note that the use of newline
+  // here and below might not be reasonable on non-UNIX systems.
+  if (cur_offset >= filesize)
+    return "no list of mapped objects";
+  if (filedata[filesize - 1] != '\n')
+    return "profile did not end with a complete line";
+
+  while (cur_offset < filesize) {
+    char* line_start = filedata.get() + cur_offset;
+
+    // Find the end of the line, and replace it with a NUL for easier
+    // scanning.
+    char* line_end = strchr(line_start, '\n');
+    *line_end = '\0';
+
+    // Advance past any leading space.  It's allowed in some lines,
+    // but not in others.
+    bool has_leading_space = false;
+    char* line_cur = line_start;
+    while (*line_cur == ' ') {
+      has_leading_space = true;
+      line_cur++;
+    }
+
+    bool found_match = false;
+
+    // Check for build lines.
+    if (!found_match) {
+      found_match = (strncmp(line_cur, "build=", 6) == 0);
+      // Anything may follow "build=", and leading space is allowed.
+    }
+
+    // A line from ProcMapsIterator::FormatLine, of the form:
+    //
+    // 40000000-40015000 r-xp 00000000 03:01 12845071   /lib/ld-2.3.2.so
+    //
+    // Leading space is not allowed.  The filename may be omitted or
+    // may consist of multiple words, so we scan only up to the
+    // space before the filename.
+    if (!found_match) {
+      int chars_scanned = -1;
+      sscanf(line_cur, "%*x-%*x %*c%*c%*c%*c %*x %*x:%*x %*d %n",
+             &chars_scanned);
+      found_match = (chars_scanned > 0 && !has_leading_space);
+    }
+
+    // A line from DumpAddressMap, of the form:
+    //
+    // 40000000-40015000: /lib/ld-2.3.2.so
+    //
+    // Leading space is allowed.  The filename may be omitted or may
+    // consist of multiple words, so we scan only up to the space
+    // before the filename.
+    if (!found_match) {
+      int chars_scanned = -1;
+      sscanf(line_cur, "%*x-%*x: %n", &chars_scanned);
+      found_match = (chars_scanned > 0);
+    }
+
+    if (!found_match)
+      return "unrecognized line in text section";
+
+    cur_offset += (line_end - line_start) + 1;
+  }
+
+  return kNoError;
+}
+
+class ProfileDataTest {
+ protected:
+  void ExpectStopped() {
+    EXPECT_FALSE(collector_.enabled());
+  }
+
+  void ExpectRunningSamples(int samples) {
+    ProfileData::State state;
+    collector_.GetCurrentState(&state);
+    EXPECT_TRUE(state.enabled);
+    EXPECT_EQ(samples, state.samples_gathered);
+  }
+
+  void ExpectSameState(const ProfileData::State& before,
+                       const ProfileData::State& after) {
+    EXPECT_EQ(before.enabled, after.enabled);
+    EXPECT_EQ(before.samples_gathered, after.samples_gathered);
+    EXPECT_EQ(before.start_time, after.start_time);
+    EXPECT_STREQ(before.profile_name, after.profile_name);
+  }
+
+  ProfileData        collector_;
+  ProfileDataChecker checker_;
+
+ private:
+  // The tests to run
+  void OpsWhenStopped();
+  void StartStopEmpty();
+  void StartStopNoOptionsEmpty();
+  void StartWhenStarted();
+  void StartStopEmpty2();
+  void CollectOne();
+  void CollectTwoMatching();
+  void CollectTwoFlush();
+  void StartResetRestart();
+
+ public:
+#define RUN(test)  do {                         \
+    printf("Running %s\n", #test);              \
+    ProfileDataTest pdt;                        \
+    pdt.test();                                 \
+} while (0)
+
+  static int RUN_ALL_TESTS() {
+    RUN(OpsWhenStopped);
+    RUN(StartStopEmpty);
+    RUN(StartStopNoOptionsEmpty);
+    RUN(StartWhenStarted);
+    RUN(StartStopEmpty2);
+    RUN(CollectOne);
+    RUN(CollectTwoMatching);
+    RUN(CollectTwoFlush);
+    RUN(StartResetRestart);
+    return 0;
+  }
+};
+
+// Check that various operations are safe when stopped.
+TEST_F(ProfileDataTest, OpsWhenStopped) {
+  ExpectStopped();
+  EXPECT_FALSE(collector_.enabled());
+
+  // Verify that state is disabled, all-empty/all-0
+  ProfileData::State state_before;
+  collector_.GetCurrentState(&state_before);
+  EXPECT_FALSE(state_before.enabled);
+  EXPECT_EQ(0, state_before.samples_gathered);
+  EXPECT_EQ(0, state_before.start_time);
+  EXPECT_STREQ("", state_before.profile_name);
+
+  // Safe to call stop again.
+  collector_.Stop();
+
+  // Safe to call FlushTable.
+  collector_.FlushTable();
+
+  // Safe to call Add.
+  const void *trace[] = { V(100), V(101), V(102), V(103), V(104) };
+  collector_.Add(arraysize(trace), trace);
+
+  ProfileData::State state_after;
+  collector_.GetCurrentState(&state_after);
+
+  ExpectSameState(state_before, state_after);
+}
+
+// Start and Stop, collecting no samples.  Verify output contents.
+TEST_F(ProfileDataTest, StartStopEmpty) {
+  const int frequency = 1;
+  ProfileDataSlot slots[] = {
+    0, 3, 0, 1000000 / frequency, 0,    // binary header
+    0, 1, 0                             // binary trailer
+  };
+
+  ExpectStopped();
+  ProfileData::Options options;
+  options.set_frequency(frequency);
+  EXPECT_TRUE(collector_.Start(checker_.filename().c_str(), options));
+  ExpectRunningSamples(0);
+  collector_.Stop();
+  ExpectStopped();
+  EXPECT_EQ(kNoError, checker_.ValidateProfile());
+  EXPECT_EQ(kNoError, checker_.Check(slots, arraysize(slots)));
+}
+
+// Start and Stop with no options, collecting no samples.  Verify
+// output contents.
+TEST_F(ProfileDataTest, StartStopNoOptionsEmpty) {
+  // We're not requesting a specific period, implementation can do
+  // whatever it likes.
+  ProfileDataSlot slots[] = {
+    0, 3, 0, 0 /* skipped */, 0,        // binary header
+    0, 1, 0                             // binary trailer
+  };
+  int slots_to_skip[] = { 3 };
+
+  ExpectStopped();
+  EXPECT_TRUE(collector_.Start(checker_.filename().c_str(),
+                               ProfileData::Options()));
+  ExpectRunningSamples(0);
+  collector_.Stop();
+  ExpectStopped();
+  EXPECT_EQ(kNoError, checker_.ValidateProfile());
+  EXPECT_EQ(kNoError, checker_.CheckWithSkips(slots, arraysize(slots),
+                                              slots_to_skip,
+                                              arraysize(slots_to_skip)));
+}
+
+// Start after already started.  Should return false and not impact
+// collected data or state.
+TEST_F(ProfileDataTest, StartWhenStarted) {
+  const int frequency = 1;
+  ProfileDataSlot slots[] = {
+    0, 3, 0, 1000000 / frequency, 0,    // binary header
+    0, 1, 0                             // binary trailer
+  };
+
+  ProfileData::Options options;
+  options.set_frequency(frequency);
+  EXPECT_TRUE(collector_.Start(checker_.filename().c_str(), options));
+
+  ProfileData::State state_before;
+  collector_.GetCurrentState(&state_before);
+
+  options.set_frequency(frequency * 2);
+  CHECK(!collector_.Start("foobar", options));
+
+  ProfileData::State state_after;
+  collector_.GetCurrentState(&state_after);
+  ExpectSameState(state_before, state_after);
+
+  collector_.Stop();
+  ExpectStopped();
+  EXPECT_EQ(kNoError, checker_.ValidateProfile());
+  EXPECT_EQ(kNoError, checker_.Check(slots, arraysize(slots)));
+}
+
+// Like StartStopEmpty, but uses a different file name and frequency.
+TEST_F(ProfileDataTest, StartStopEmpty2) {
+  const int frequency = 2;
+  ProfileDataSlot slots[] = {
+    0, 3, 0, 1000000 / frequency, 0,    // binary header
+    0, 1, 0                             // binary trailer
+  };
+
+  ExpectStopped();
+  ProfileData::Options options;
+  options.set_frequency(frequency);
+  EXPECT_TRUE(collector_.Start(checker_.filename().c_str(), options));
+  ExpectRunningSamples(0);
+  collector_.Stop();
+  ExpectStopped();
+  EXPECT_EQ(kNoError, checker_.ValidateProfile());
+  EXPECT_EQ(kNoError, checker_.Check(slots, arraysize(slots)));
+}
+
+TEST_F(ProfileDataTest, CollectOne) {
+  const int frequency = 2;
+  ProfileDataSlot slots[] = {
+    0, 3, 0, 1000000 / frequency, 0,    // binary header
+    1, 5, 100, 101, 102, 103, 104,      // our sample
+    0, 1, 0                             // binary trailer
+  };
+
+  ExpectStopped();
+  ProfileData::Options options;
+  options.set_frequency(frequency);
+  EXPECT_TRUE(collector_.Start(checker_.filename().c_str(), options));
+  ExpectRunningSamples(0);
+
+  const void *trace[] = { V(100), V(101), V(102), V(103), V(104) };
+  collector_.Add(arraysize(trace), trace);
+  ExpectRunningSamples(1);
+
+  collector_.Stop();
+  ExpectStopped();
+  EXPECT_EQ(kNoError, checker_.ValidateProfile());
+  EXPECT_EQ(kNoError, checker_.Check(slots, arraysize(slots)));
+}
+
+TEST_F(ProfileDataTest, CollectTwoMatching) {
+  const int frequency = 2;
+  ProfileDataSlot slots[] = {
+    0, 3, 0, 1000000 / frequency, 0,    // binary header
+    2, 5, 100, 201, 302, 403, 504,      // our two samples
+    0, 1, 0                             // binary trailer
+  };
+
+  ExpectStopped();
+  ProfileData::Options options;
+  options.set_frequency(frequency);
+  EXPECT_TRUE(collector_.Start(checker_.filename().c_str(), options));
+  ExpectRunningSamples(0);
+
+  for (int i = 0; i < 2; ++i) {
+    const void *trace[] = { V(100), V(201), V(302), V(403), V(504) };
+    collector_.Add(arraysize(trace), trace);
+    ExpectRunningSamples(i + 1);
+  }
+
+  collector_.Stop();
+  ExpectStopped();
+  EXPECT_EQ(kNoError, checker_.ValidateProfile());
+  EXPECT_EQ(kNoError, checker_.Check(slots, arraysize(slots)));
+}
+
+TEST_F(ProfileDataTest, CollectTwoFlush) {
+  const int frequency = 2;
+  ProfileDataSlot slots[] = {
+    0, 3, 0, 1000000 / frequency, 0,    // binary header
+    1, 5, 100, 201, 302, 403, 504,      // first sample (flushed)
+    1, 5, 100, 201, 302, 403, 504,      // second identical sample
+    0, 1, 0                             // binary trailer
+  };
+
+  ExpectStopped();
+  ProfileData::Options options;
+  options.set_frequency(frequency);
+  EXPECT_TRUE(collector_.Start(checker_.filename().c_str(), options));
+  ExpectRunningSamples(0);
+
+  const void *trace[] = { V(100), V(201), V(302), V(403), V(504) };
+
+  collector_.Add(arraysize(trace), trace);
+  ExpectRunningSamples(1);
+  collector_.FlushTable();
+
+  collector_.Add(arraysize(trace), trace);
+  ExpectRunningSamples(2);
+
+  collector_.Stop();
+  ExpectStopped();
+  EXPECT_EQ(kNoError, checker_.ValidateProfile());
+  EXPECT_EQ(kNoError, checker_.Check(slots, arraysize(slots)));
+}
+
+// Start then reset, verify that the result is *not* a valid profile.
+// Then start again and make sure the result is OK.
+TEST_F(ProfileDataTest, StartResetRestart) {
+  ExpectStopped();
+  ProfileData::Options options;
+  options.set_frequency(1);
+  EXPECT_TRUE(collector_.Start(checker_.filename().c_str(), options));
+  ExpectRunningSamples(0);
+  collector_.Reset();
+  ExpectStopped();
+  // We expect the resulting file to be empty.  This is a minimal test
+  // of ValidateProfile.
+  EXPECT_NE(kNoError, checker_.ValidateProfile());
+
+  struct stat statbuf;
+  EXPECT_EQ(0, stat(checker_.filename().c_str(), &statbuf));
+  EXPECT_EQ(0, statbuf.st_size);
+
+  const int frequency = 2;  // Different frequency than used above.
+  ProfileDataSlot slots[] = {
+    0, 3, 0, 1000000 / frequency, 0,    // binary header
+    0, 1, 0                             // binary trailer
+  };
+
+  options.set_frequency(frequency);
+  EXPECT_TRUE(collector_.Start(checker_.filename().c_str(), options));
+  ExpectRunningSamples(0);
+  collector_.Stop();
+  ExpectStopped();
+  EXPECT_EQ(kNoError, checker_.ValidateProfile());
+  EXPECT_EQ(kNoError, checker_.Check(slots, arraysize(slots)));
+}
+
+}  // namespace
+
+int main(int argc, char** argv) {
+  int rc = ProfileDataTest::RUN_ALL_TESTS();
+  printf("%s\n", rc == 0 ? "PASS" : "FAIL");
+  return rc;
+}
diff --git a/src/tests/profiler_unittest.cc b/src/tests/profiler_unittest.cc
new file mode 100644
index 0000000..321f848
--- /dev/null
+++ b/src/tests/profiler_unittest.cc
@@ -0,0 +1,147 @@
+// -*- Mode: C++; c-basic-offset: 2; indent-tabs-mode: nil -*-
+// Copyright (c) 2005, Google Inc.
+// All rights reserved.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+// 
+//     * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+//     * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+// 
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+// ---
+// Author: Craig Silverstein
+//
+// Does some simple arithmetic and calls a few libc routines, so we can
+// profile it.  Define WITH_THREADS to add pthread functionality as well
+// (otherwise, btw, the num_threads argument to this program is ignored).
+
+#include "config_for_unittests.h"
+#include <stdio.h>
+#include <stdlib.h>
+#ifdef HAVE_UNISTD_H
+#include <unistd.h>                 // for fork()
+#endif
+#include <sys/wait.h>               // for wait()
+#include "gperftools/profiler.h"
+#include "base/simple_mutex.h"
+#include "tests/testutil.h"
+
+static int result = 0;
+static int g_iters = 0;   // argv[1]
+
+Mutex mutex(Mutex::LINKER_INITIALIZED);
+
+static void test_other_thread() {
+#ifndef NO_THREADS
+  ProfilerRegisterThread();
+
+  int i, m;
+  char b[128];
+  MutexLock ml(&mutex);
+  for (m = 0; m < 1000000; ++m) {          // run millions of times
+    for (i = 0; i < g_iters; ++i ) {
+      result ^= i;
+    }
+    snprintf(b, sizeof(b), "other: %d", result);  // get some libc action
+  }
+#endif
+}
+
+static void test_main_thread() {
+  int i, m;
+  char b[128];
+  MutexLock ml(&mutex);
+  for (m = 0; m < 1000000; ++m) {          // run millions of times
+    for (i = 0; i < g_iters; ++i ) {
+      result ^= i;
+    }
+    snprintf(b, sizeof(b), "same: %d", result);  // get some libc action
+  }
+}
+
+int main(int argc, char** argv) {
+  if ( argc <= 1 ) {
+    fprintf(stderr, "USAGE: %s <iters> [num_threads] [filename]\n", argv[0]);
+    fprintf(stderr, "   iters: How many million times to run the XOR test.\n");
+    fprintf(stderr, "   num_threads: how many concurrent threads.\n");
+    fprintf(stderr, "                0 or 1 for single-threaded mode,\n");
+    fprintf(stderr, "                -# to fork instead of thread.\n");
+    fprintf(stderr, "   filename: The name of the output profile.\n");
+    fprintf(stderr, ("             If you don't specify, set CPUPROFILE "
+                     "in the environment instead!\n"));
+    return 1;
+  }
+
+  g_iters = atoi(argv[1]);
+  int num_threads = 1;
+  const char* filename = NULL;
+  if (argc > 2) {
+    num_threads = atoi(argv[2]);
+  }
+  if (argc > 3) {
+    filename = argv[3];
+  }
+
+  if (filename) {
+    ProfilerStart(filename);
+  }
+
+  test_main_thread();
+
+  ProfilerFlush();                           // just because we can
+
+  // The other threads, if any, will run only half as long as the main thread
+  if (num_threads > 0) {
+    RunManyThreads(test_other_thread, num_threads);
+  } else {
+    // Or maybe they asked to fork.  The fork test is only interesting
+    // when we use CPUPROFILE to name the output file, so check for that.
+#ifdef HAVE_UNISTD_H
+    for (; num_threads < 0; ++num_threads) {   // -<num_threads> to fork
+      if (filename) {
+        printf("FORK test only makes sense when no filename is specified.\n");
+        return 2;
+      }
+      switch (fork()) {
+        case -1:
+          printf("FORK failed!\n");
+          return 1;
+        case 0:             // child
+          return execl(argv[0], argv[0], argv[1], (char*)NULL);
+        default:
+          wait(NULL);       // we'll let the kids run one at a time
+      }
+    }
+#else
+    fprintf(stderr, "%s was compiled without support for fork() and exec()\n", argv[0]);
+#endif
+  }
+
+  test_main_thread();
+
+  if (filename) {
+    ProfilerStop();
+  }
+
+  return 0;
+}
diff --git a/src/tests/profiler_unittest.sh b/src/tests/profiler_unittest.sh
new file mode 100755
index 0000000..4085f2c
--- /dev/null
+++ b/src/tests/profiler_unittest.sh
@@ -0,0 +1,269 @@
+#!/bin/sh
+
+# Copyright (c) 2005, Google Inc.
+# All rights reserved.
+# 
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are
+# met:
+# 
+#     * Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+#     * Redistributions in binary form must reproduce the above
+# copyright notice, this list of conditions and the following disclaimer
+# in the documentation and/or other materials provided with the
+# distribution.
+#     * Neither the name of Google Inc. nor the names of its
+# contributors may be used to endorse or promote products derived from
+# this software without specific prior written permission.
+# 
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+# ---
+# Author: Craig Silverstein
+#
+# Runs the 4 profiler unittests and makes sure their profiles look
+# appropriate.  We expect two commandline args, as described below.
+#
+# We run under the assumption that if $PROFILER1 is run with no
+# arguments, it prints a usage line of the form
+#   USAGE: <actual executable being run> [...]
+#
+# This is because libtool sometimes turns the 'executable' into a
+# shell script which runs an actual binary somewhere else.
+
+# We expect BINDIR and PPROF_PATH to be set in the environment.
+# If not, we set them to some reasonable values
+BINDIR="${BINDIR:-.}"
+PPROF_PATH="${PPROF_PATH:-$BINDIR/src/pprof}"
+
+if [ "x$1" = "x-h" -o "x$1" = "x--help" ]; then
+  echo "USAGE: $0 [unittest dir] [path to pprof]"
+  echo "       By default, unittest_dir=$BINDIR, pprof_path=$PPROF_PATH"
+  exit 1
+fi
+
+TMPDIR=/tmp/profile_info
+
+UNITTEST_DIR=${1:-$BINDIR}
+PPROF=${2:-$PPROF_PATH}
+
+# We test the sliding-window functionality of the cpu-profile reader
+# by using a small stride, forcing lots of reads.
+PPROF_FLAGS="--test_stride=128"
+
+PROFILER1="$UNITTEST_DIR/profiler1_unittest"
+PROFILER2="$UNITTEST_DIR/profiler2_unittest"
+PROFILER3="$UNITTEST_DIR/profiler3_unittest"
+PROFILER4="$UNITTEST_DIR/profiler4_unittest"
+
+# Unfortunately, for us, libtool can replace executables with a shell
+# script that does some work before calling the 'real' executable
+# under a different name.  We need the 'real' executable name to run
+# pprof on it.  We've constructed all the binaries used in this
+# unittest so when they are called with no arguments, they report
+# their argv[0], which is the real binary name.
+Realname() {
+  "$1" 2>&1 | awk '{print $2; exit;}'
+}
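+# For example, if running "$PROFILER1" with no arguments prints
+#   USAGE: /path/to/profiler1_unittest <iters> [num_threads] [filename]
+# then Realname extracts "/path/to/profiler1_unittest" (the second field).
+# (The path shown is only illustrative.)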
+
+PROFILER1_REALNAME=`Realname "$PROFILER1"`
+PROFILER2_REALNAME=`Realname "$PROFILER2"`
+PROFILER3_REALNAME=`Realname "$PROFILER3"`
+PROFILER4_REALNAME=`Realname "$PROFILER4"`
+
+# It's meaningful to the profiler, so make sure we know its state
+unset CPUPROFILE
+
+# Some output/logging in the profiler can cause issues when running the unit
+# tests. For example, logging a warning when the profiler is detected as being
+# present but no CPUPROFILE is specified in the environment. Especially when
+# we are checking for a silent run or specific timing constraints are being
+# checked. So set the env variable signifying that we are running in a unit
+# test environment.
+export PERFTOOLS_UNITTEST=1
+
+rm -rf "$TMPDIR"
+mkdir "$TMPDIR" || exit 2
+
+num_failures=0
+
+RegisterFailure() {
+  num_failures=`expr $num_failures + 1`
+}
+
+# Takes two filenames representing profiles, with their executable scripts,
+# and a multiplier, and verifies that the 'contentful' functions in each
+# profile take the same time (possibly scaled by the given multiplier).  It
+# used to be that "same" meant within 50%, after adding a small noise-reducing
+# constant to each value.  But even that would often spuriously fail, so now
+# it's "both non-zero".  We're pretty forgiving.
+VerifySimilar() {
+  prof1="$TMPDIR/$1"
+  exec1="$2"
+  prof2="$TMPDIR/$3"
+  exec2="$4"
+  mult="$5"
+
+  # We are careful not to put exec1 and exec2 in quotes, because if
+  # they are the empty string, it means we want to use the 1-arg
+  # version of pprof.
+  mthread1=`"$PPROF" $PPROF_FLAGS $exec1 "$prof1" | grep test_main_thread | awk '{print $1}'`
+  mthread2=`"$PPROF" $PPROF_FLAGS $exec2 "$prof2" | grep test_main_thread | awk '{print $1}'`
+  mthread1_plus=`expr $mthread1 + 5`
+  mthread2_plus=`expr $mthread2 + 5`
+  if [ -z "$mthread1" ] || [ -z "$mthread2" ] || \
+     [ "$mthread1" -le 0 -o "$mthread2" -le 0 ]
+#    || [ `expr $mthread1_plus \* $mult` -gt `expr $mthread2_plus \* 2` -o \
+#         `expr $mthread1_plus \* $mult \* 2` -lt `expr $mthread2_plus` ]
+  then
+    echo
+    echo ">>> profile on $exec1 vs $exec2 with multiplier $mult failed:"
+    echo "Actual times (in profiling units) were '$mthread1' vs. '$mthread2'"
+    echo
+    RegisterFailure
+  fi
+}
+
+# Takes two filenames representing profiles, and optionally their
+# executable scripts (these may be empty if the profiles include
+# symbols), and verifies that the two profiles are identical.
+VerifyIdentical() {
+  prof1="$TMPDIR/$1"
+  exec1="$2"
+  prof2="$TMPDIR/$3"
+  exec2="$4"
+
+  # We are careful not to put exec1 and exec2 in quotes, because if
+  # they are the empty string, it means we want to use the 1-arg
+  # version of pprof.
+  "$PPROF" $PPROF_FLAGS $exec1 "$prof1" > "$TMPDIR/out1"
+  "$PPROF" $PPROF_FLAGS $exec2 "$prof2" > "$TMPDIR/out2"
+  diff=`diff "$TMPDIR/out1" "$TMPDIR/out2"`
+
+  if [ ! -z "$diff" ]; then
+    echo
+    echo ">>> profile doesn't match, args: $exec1 $prof1 vs. $exec2 $prof2"
+    echo ">>> Diff:"
+    echo "$diff"
+    echo
+    RegisterFailure
+  fi
+}
+
+# Takes a filename representing a profile, with its executable,
+# and a multiplier, and verifies that the main-thread function takes
+# the same amount of time as the other-threads function (possibly scaled
+# by the given multiplier).  Figuring out the multiplier can be tricky,
+# since by design the main thread runs twice as long as each of the
+# 'other' threads!  It used to be that "same" meant within 50%, after adding
+# a small noise-reducing constant to each value.  But even that would often
+# spuriously fail, so now it's "both non-zero".  We're pretty forgiving.
+VerifyAcrossThreads() {
+  prof1="$TMPDIR/$1"
+  # We need to run the script with no args to get the actual exe name
+  exec1="$2"
+  mult="$3"
+
+  # We are careful not to put exec1 in quotes, because if it is the
+  # empty string, it means we want to use the 1-arg version of pprof.
+  mthread=`$PPROF $PPROF_FLAGS $exec1 "$prof1" | grep test_main_thread | awk '{print $1}'`
+  othread=`$PPROF $PPROF_FLAGS $exec1 "$prof1" | grep test_other_thread | awk '{print $1}'`
+  if [ -z "$mthread" ] || [ -z "$othread" ] || \
+     [ "$mthread" -le 0 -o "$othread" -le 0 ]
+#    || [ `expr $mthread \* $mult \* 3` -gt `expr $othread \* 10` -o \
+#         `expr $mthread \* $mult \* 10` -lt `expr $othread \* 3` ]
+  then
+    echo
+    echo ">>> profile on $exec1 (main vs thread) with multiplier $mult failed:"
+    echo "Actual times (in profiling units) were '$mthread' vs. '$othread'"
+    echo
+    RegisterFailure
+  fi
+}
+
+echo
+echo ">>> WARNING <<<"
+echo "This test looks at timing information to determine correctness."
+echo "If your system is loaded, the test may spuriously fail."
+echo "If the test does fail with an 'Actual times' error, try running again."
+echo
+
+# profiler1 is a non-threaded version
+"$PROFILER1" 50 1 "$TMPDIR/p1" || RegisterFailure
+"$PROFILER1" 100 1 "$TMPDIR/p2" || RegisterFailure
+VerifySimilar p1 "$PROFILER1_REALNAME" p2 "$PROFILER1_REALNAME" 2
+
+# Verify the same thing works if we statically link
+"$PROFILER2" 50 1 "$TMPDIR/p3" || RegisterFailure
+"$PROFILER2" 100 1 "$TMPDIR/p4" || RegisterFailure
+VerifySimilar p3 "$PROFILER2_REALNAME" p4 "$PROFILER2_REALNAME" 2
+
+# Verify the same thing works if we specify via CPUPROFILE
+CPUPROFILE="$TMPDIR/p5" "$PROFILER2" 50 || RegisterFailure
+CPUPROFILE="$TMPDIR/p6" "$PROFILER2" 100 || RegisterFailure
+VerifySimilar p5 "$PROFILER2_REALNAME" p6 "$PROFILER2_REALNAME" 2
+
+CPUPROFILE="$TMPDIR/p5b" "$PROFILER3" 30 || RegisterFailure
+CPUPROFILE="$TMPDIR/p5c" "$PROFILER3" 60 || RegisterFailure
+VerifySimilar p5b "$PROFILER3_REALNAME" p5c "$PROFILER3_REALNAME" 2
+
+# Now try what happens when we use threads
+"$PROFILER3" 30 2 "$TMPDIR/p7" || RegisterFailure
+"$PROFILER3" 60 2 "$TMPDIR/p8" || RegisterFailure
+VerifySimilar p7 "$PROFILER3_REALNAME" p8 "$PROFILER3_REALNAME" 2
+
+"$PROFILER4" 30 2 "$TMPDIR/p9" || RegisterFailure
+"$PROFILER4" 60 2 "$TMPDIR/p10" || RegisterFailure
+VerifySimilar p9 "$PROFILER4_REALNAME" p10 "$PROFILER4_REALNAME" 2
+
+# More threads!
+"$PROFILER4" 25 3 "$TMPDIR/p9" || RegisterFailure
+"$PROFILER4" 50 3 "$TMPDIR/p10" || RegisterFailure
+VerifySimilar p9 "$PROFILER4_REALNAME" p10 "$PROFILER4_REALNAME" 2
+
+# Compare how much time the main thread takes compared to the other threads
+# Recall the main thread runs twice as long as the other threads, by design.
+"$PROFILER4" 20 4 "$TMPDIR/p11" || RegisterFailure
+VerifyAcrossThreads p11 "$PROFILER4_REALNAME" 2
+
+# Test symbol save and restore
+"$PROFILER1" 50 1 "$TMPDIR/p12" || RegisterFailure
+"$PPROF" $PPROF_FLAGS "$PROFILER1_REALNAME" "$TMPDIR/p12" --raw \
+    >"$TMPDIR/p13" 2>/dev/null || RegisterFailure
+VerifyIdentical p12 "$PROFILER1_REALNAME" p13 "" || RegisterFailure
+
+"$PROFILER3" 30 2 "$TMPDIR/p14" || RegisterFailure
+"$PPROF" $PPROF_FLAGS "$PROFILER3_REALNAME" "$TMPDIR/p14" --raw \
+    >"$TMPDIR/p15" 2>/dev/null || RegisterFailure
+VerifyIdentical p14 "$PROFILER3_REALNAME" p15 "" || RegisterFailure
+
+# Test using ITIMER_REAL instead of ITIMER_PROF.
+env CPUPROFILE_REALTIME=1 "$PROFILER3" 30 2 "$TMPDIR/p16" || RegisterFailure
+env CPUPROFILE_REALTIME=1 "$PROFILER3" 60 2 "$TMPDIR/p17" || RegisterFailure
+VerifySimilar p16 "$PROFILER3_REALNAME" p17 "$PROFILER3_REALNAME" 2
+
+
+# Make sure that when we have a process with a fork, the profiles don't
+# clobber each other
+CPUPROFILE="$TMPDIR/pfork" "$PROFILER1" 1 -2 || RegisterFailure
+n=`ls $TMPDIR/pfork* | wc -l`
+if [ $n != 3 ]; then
+  echo "FORK test FAILED: expected 3 profiles (for main + 2 children), found $n"
+  num_failures=`expr $num_failures + 1`
+fi
+
+rm -rf "$TMPDIR"      # clean up
+
+echo "Tests finished with $num_failures failures"
+exit $num_failures
diff --git a/src/tests/raw_printer_test.cc b/src/tests/raw_printer_test.cc
new file mode 100644
index 0000000..2c7be6a
--- /dev/null
+++ b/src/tests/raw_printer_test.cc
@@ -0,0 +1,64 @@
+// -*- Mode: C++; c-basic-offset: 2; indent-tabs-mode: nil -*-
+// Copyright 2009 Google Inc. All Rights Reserved.
+// Author: sanjay@google.com (Sanjay Ghemawat)
+//
+// Use of this source code is governed by a BSD-style license that can
+// be found in the LICENSE file.
+
+#include "raw_printer.h"
+#include <stdio.h>
+#include <string>
+#include "base/logging.h"
+
+using std::string;
+
+#define TEST(a, b)  void TEST_##a##_##b()
+#define RUN_TEST(a, b)  TEST_##a##_##b()
+
+TEST(RawPrinter, Empty) {
+  char buffer[1];
+  base::RawPrinter printer(buffer, arraysize(buffer));
+  CHECK_EQ(0, printer.length());
+  CHECK_EQ(string(""), buffer);
+  CHECK_EQ(0, printer.space_left());
+  printer.Printf("foo");
+  CHECK_EQ(string(""), string(buffer));
+  CHECK_EQ(0, printer.length());
+  CHECK_EQ(0, printer.space_left());
+}
+
+TEST(RawPrinter, PartiallyFilled) {
+  char buffer[100];
+  base::RawPrinter printer(buffer, arraysize(buffer));
+  printer.Printf("%s %s", "hello", "world");
+  CHECK_EQ(string("hello world"), string(buffer));
+  CHECK_EQ(11, printer.length());
+  CHECK_LT(0, printer.space_left());
+}
+
+TEST(RawPrinter, Truncated) {
+  char buffer[3];
+  base::RawPrinter printer(buffer, arraysize(buffer));
+  printer.Printf("%d", 12345678);
+  CHECK_EQ(string("12"), string(buffer));
+  CHECK_EQ(2, printer.length());
+  CHECK_EQ(0, printer.space_left());
+}
+
+TEST(RawPrinter, ExactlyFilled) {
+  char buffer[12];
+  base::RawPrinter printer(buffer, arraysize(buffer));
+  printer.Printf("%s %s", "hello", "world");
+  CHECK_EQ(string("hello world"), string(buffer));
+  CHECK_EQ(11, printer.length());
+  CHECK_EQ(0, printer.space_left());
+}
+
+int main(int argc, char **argv) {
+  RUN_TEST(RawPrinter, Empty);
+  RUN_TEST(RawPrinter, PartiallyFilled);
+  RUN_TEST(RawPrinter, Truncated);
+  RUN_TEST(RawPrinter, ExactlyFilled);
+  printf("PASS\n");
+  return 0;   // 0 means success
+}
diff --git a/src/tests/realloc_unittest.cc b/src/tests/realloc_unittest.cc
new file mode 100644
index 0000000..e3d7b59
--- /dev/null
+++ b/src/tests/realloc_unittest.cc
@@ -0,0 +1,125 @@
+// -*- Mode: C++; c-basic-offset: 2; indent-tabs-mode: nil -*-
+// Copyright (c) 2004, Google Inc.
+// All rights reserved.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+// 
+//     * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+//     * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+// 
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+// ---
+// Author: Sanjay Ghemawat
+//
+// Test realloc() functionality
+
+#include "config_for_unittests.h"
+#include <assert.h>                     // for assert
+#include <stdio.h>
+#include <stddef.h>                     // for size_t, NULL
+#include <stdlib.h>                     // for free, malloc, realloc
+#include <algorithm>                    // for min
+#include "base/logging.h"
+
+using std::min;
+
+
+// Fill a buffer of the specified size with a predetermined pattern
+static void Fill(unsigned char* buffer, int n) {
+  for (int i = 0; i < n; i++) {
+    buffer[i] = (i & 0xff);
+  }
+}
+
+// Check that the specified buffer has the predetermined pattern
+// generated by Fill()
+static bool Valid(unsigned char* buffer, int n) {
+  for (int i = 0; i < n; i++) {
+    if (buffer[i] != (i & 0xff)) {
+      return false;
+    }
+  }
+  return true;
+}
+
+// Return the next interesting size/delta to check.  Returns -1 if no more.
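+// For example, after 100 the sequence continues 127, 128, 129, 255, 256,
+// 257, and so on around successive powers of two; it returns -1 once the
+// size reaches 100000 or more.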
+static int NextSize(int size) {
+  if (size < 100) {
+    return size+1;
+  } else if (size < 100000) {
+    // Find next power of two
+    int power = 1;
+    while (power < size) {
+      power <<= 1;
+    }
+
+    // Yield (power-1, power, power+1)
+    if (size < power-1) {
+      return power-1;
+    } else if (size == power-1) {
+      return power;
+    } else {
+      assert(size == power);
+      return power+1;
+    }
+  } else {
+    return -1;
+  }
+}
+
+int main(int argc, char** argv) {
+  for (int src_size = 0; src_size >= 0; src_size = NextSize(src_size)) {
+    for (int dst_size = 0; dst_size >= 0; dst_size = NextSize(dst_size)) {
+      unsigned char* src = (unsigned char*) malloc(src_size);
+      Fill(src, src_size);
+      unsigned char* dst = (unsigned char*) realloc(src, dst_size);
+      CHECK(Valid(dst, min(src_size, dst_size)));
+      Fill(dst, dst_size);
+      CHECK(Valid(dst, dst_size));
+      if (dst != NULL) free(dst);
+    }
+  }
+
+  // Now make sure realloc works correctly even when we overflow the
+  // packed cache, so some entries are evicted from the cache.
+  // The cache has 2^12 entries, keyed by page number.
+  const int kNumEntries = 1 << 14;
+  int** p = (int**)malloc(sizeof(*p) * kNumEntries);
+  int sum = 0;
+  for (int i = 0; i < kNumEntries; i++) {
+    p[i] = (int*)malloc(8192);   // no page size is likely to be bigger
+    p[i][1000] = i;              // use memory deep in the heart of p
+  }
+  for (int i = 0; i < kNumEntries; i++) {
+    p[i] = (int*)realloc(p[i], 9000);
+  }
+  for (int i = 0; i < kNumEntries; i++) {
+    sum += p[i][1000];
+    free(p[i]);
+  }
+  CHECK_EQ(kNumEntries/2 * (kNumEntries - 1), sum);  // assume kNE is even
+  free(p);
+
+  printf("PASS\n");
+  return 0;
+}
diff --git a/src/tests/sampler_test.cc b/src/tests/sampler_test.cc
new file mode 100755
index 0000000..cd64b0f
--- /dev/null
+++ b/src/tests/sampler_test.cc
@@ -0,0 +1,658 @@
+// -*- Mode: C++; c-basic-offset: 2; indent-tabs-mode: nil -*-
+// Copyright (c) 2008, Google Inc.
+// All rights reserved.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+// 
+//     * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+//     * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+// 
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+// ---
+// All Rights Reserved.
+//
+// Author: Daniel Ford
+//
+// Checks basic properties of the sampler
+
+#include "config_for_unittests.h"
+#include <stdlib.h>        // defines posix_memalign
+#include <stdio.h>         // for the printf at the end
+#include <stdarg.h>        // for va_list, va_start, va_end
+#if defined HAVE_STDINT_H
+#include <stdint.h>             // to get uintptr_t
+#elif defined HAVE_INTTYPES_H
+#include <inttypes.h>           // another place uintptr_t might be defined
+#endif
+#include <sys/types.h>
+#include <iostream>
+#include <algorithm>
+#include <vector>
+#include <string>
+#include <cmath>
+#include "base/logging.h"
+#include "base/commandlineflags.h"
+#include "sampler.h"       // The Sampler class being tested
+
+using std::sort;
+using std::min;
+using std::max;
+using std::vector;
+using std::abs;
+
+vector<void (*)()> g_testlist;  // the tests to run
+
+#define TEST(a, b)                                      \
+  struct Test_##a##_##b {                               \
+    Test_##a##_##b() { g_testlist.push_back(&Run); }    \
+    static void Run();                                  \
+  };                                                    \
+  static Test_##a##_##b g_test_##a##_##b;               \
+  void Test_##a##_##b::Run()
+
+
+static int RUN_ALL_TESTS() {
+  vector<void (*)()>::const_iterator it;
+  for (it = g_testlist.begin(); it != g_testlist.end(); ++it) {
+    (*it)();   // The test will error-exit if there's a problem.
+  }
+  fprintf(stderr, "\nPassed %d tests\n\nPASS\n", (int)g_testlist.size());
+  return 0;
+}
+
+#undef LOG   // defined in base/logging.h
+// Ideally, we'd put the newline at the end, but this hack puts the
+// newline at the end of the previous log message, which is good enough :-)
+#define LOG(level)  std::cerr << "\n"
+
+static std::string StringPrintf(const char* format, ...) {
+  char buf[256];   // should be big enough for all logging
+  va_list ap;
+  va_start(ap, format);
+  perftools_vsnprintf(buf, sizeof(buf), format, ap);
+  va_end(ap);
+  return buf;
+}
+
+namespace {
+template<typename T> class scoped_array {
+ public:
+  scoped_array(T* p) : p_(p) { }
+  ~scoped_array() { delete[] p_; }
+  const T* get() const { return p_; }
+  T* get() { return p_; }
+  T& operator[](int i) { return p_[i]; }
+ private:
+  T* p_;
+};
+}
+
+// Note that these tests are stochastic.
+// This means that the chance of correct code passing the test is,
+// in the case of 5 standard deviations:
+// kSigmas=5:    ~99.99994267%
+// in the case of 4 standard deviations:
+// kSigmas=4:    ~99.993666%
+static const double kSigmas = 4;
+static const size_t kSamplingInterval = 512*1024;
+
+// Tests that GetSamplePeriod returns a positive value
+// (the default sample period is 1<<19).
+TEST(Sampler, TestGetSamplePeriod) {
+  tcmalloc::Sampler sampler;
+  sampler.Init(1);
+  uint64_t sample_period;
+  sample_period = sampler.GetSamplePeriod();
+  CHECK_GT(sample_period, 0);
+}
+
+// Tests of the quality of the random numbers generated
+// This uses the Anderson Darling test for uniformity.
+// See "Evaluating the Anderson-Darling Distribution" by Marsaglia
+// for details.
+
+// Shortcut version of ADinf(z), z>0 (from Marsaglia).
+// This returns the p-value for the Anderson-Darling statistic in
+// the limit as n -> infinity.  For finite n, apply the error fix below.
+double AndersonDarlingInf(double z) {
+  if (z < 2) {
+    return exp(-1.2337141 / z) / sqrt(z) * (2.00012 + (0.247105 -
+                (0.0649821 - (0.0347962 - (0.011672 - 0.00168691
+                * z) * z) * z) * z) * z);
+  }
+  return exp( - exp(1.0776 - (2.30695 - (0.43424 - (0.082433 -
+                    (0.008056 - 0.0003146 * z) * z) * z) * z) * z));
+}
+
+// Corrects the approximation error in AndersonDarlingInf for small values of n
+// Add this to AndersonDarlingInf to get a better approximation
+// (from Marsaglia)
+double AndersonDarlingErrFix(int n, double x) {
+  if (x > 0.8) {
+    return (-130.2137 + (745.2337 - (1705.091 - (1950.646 -
+            (1116.360 - 255.7844 * x) * x) * x) * x) * x) / n;
+  }
+  double cutoff = 0.01265 + 0.1757 / n;
+  double t;
+  if (x < cutoff) {
+    t = x / cutoff;
+    t = sqrt(t) * (1 - t) * (49 * t - 102);
+    return t * (0.0037 / (n * n) + 0.00078 / n + 0.00006) / n;
+  } else {
+    t = (x - cutoff) / (0.8 - cutoff);
+    t = -0.00022633 + (6.54034 - (14.6538 - (14.458 - (8.259 - 1.91864
+          * t) * t) * t) * t) * t;
+    return t * (0.04213 + 0.01365 / n) / n;
+  }
+}
+
+// Returns the AndersonDarling p-value given n and the value of the statistic
+double AndersonDarlingPValue(int n, double z) {
+  double ad = AndersonDarlingInf(z);
+  double errfix = AndersonDarlingErrFix(n, ad);
+  return ad + errfix;
+}
+
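+// Computes the Anderson-Darling statistic for a sorted sample
+// u_0 <= ... <= u_{n-1} drawn from [0,1]:
+//   A^2 = -n - (1/n) * sum_{i=0}^{n-1} (2i+1) * ln(u_i * (1 - u_{n-1-i}))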
+double AndersonDarlingStatistic(int n, double* random_sample) {
+  double ad_sum = 0;
+  for (int i = 0; i < n; i++) {
+    ad_sum += (2*i + 1) * log(random_sample[i] * (1 - random_sample[n-1-i]));
+  }
+  double ad_statistic = - n - 1/static_cast<double>(n) * ad_sum;
+  return ad_statistic;
+}
+
+// Tests if the array of doubles is uniformly distributed.
+// Returns the p-value of the Anderson Darling Statistic
+// for the given set of sorted random doubles
+// See "Evaluating the Anderson-Darling Distribution" by
+// Marsaglia and Marsaglia for details.
+double AndersonDarlingTest(int n, double* random_sample) {
+  double ad_statistic = AndersonDarlingStatistic(n, random_sample);
+  LOG(INFO) << StringPrintf("AD stat = %f, n=%d\n", ad_statistic, n);
+  double p = AndersonDarlingPValue(n, ad_statistic);
+  return p;
+}
+
+// Test the AD Test. The value of the statistic should go to zero as n->infty
+// Not run as part of regular tests
+void ADTestTest(int n) {
+  scoped_array<double> random_sample(new double[n]);
+  for (int i = 0; i < n; i++) {
+    random_sample[i] = (i+0.01)/n;
+  }
+  sort(random_sample.get(), random_sample.get() + n);
+  double ad_stat = AndersonDarlingStatistic(n, random_sample.get());
+  LOG(INFO) << StringPrintf("Testing the AD test. n=%d, ad_stat = %f",
+                            n, ad_stat);
+}
+
+// Print the CDF of the distribution of the Anderson-Darling Statistic
+// Used for checking the Anderson-Darling Test
+// Not run as part of regular tests
+void ADCDF() {
+  for (int i = 1; i < 40; i++) {
+    double x = i/10.0;
+    LOG(INFO) << "x= " << x << "  adpv= "
+              << AndersonDarlingPValue(100, x) << ", "
+              << AndersonDarlingPValue(1000, x);
+  }
+}
+
+// Testing that NextRandom generates uniform
+// random numbers.
+// Applies the Anderson-Darling test for uniformity
+void TestNextRandom(int n) {
+  tcmalloc::Sampler sampler;
+  sampler.Init(1);
+  uint64_t x = 1;
+  // This assumes that the prng returns 48 bit numbers
+  uint64_t max_prng_value = static_cast<uint64_t>(1)<<48;
+  // Initialize
+  for (int i = 1; i <= 20; i++) {  // 20 mimics sampler.Init()
+    x = sampler.NextRandom(x);
+  }
+  scoped_array<uint64_t> int_random_sample(new uint64_t[n]);
+  // Collect samples
+  for (int i = 0; i < n; i++) {
+    int_random_sample[i] = x;
+    x = sampler.NextRandom(x);
+  }
+  // First sort them...
+  sort(int_random_sample.get(), int_random_sample.get() + n);
+  scoped_array<double> random_sample(new double[n]);
+  // Convert them to uniform randoms (in the range [0,1])
+  for (int i = 0; i < n; i++) {
+    random_sample[i] = static_cast<double>(int_random_sample[i])/max_prng_value;
+  }
+  // Now compute the Anderson-Darling statistic
+  double ad_pvalue = AndersonDarlingTest(n, random_sample.get());
+  LOG(INFO) << StringPrintf("pvalue for AndersonDarlingTest "
+                            "with n= %d is p= %f\n", n, ad_pvalue);
+  CHECK_GT(min(ad_pvalue, 1 - ad_pvalue), 0.0001);
+  //           << StringPrintf("prng is not uniform, %d\n", n);
+}
+
+
+TEST(Sampler, TestNextRandom_MultipleValues) {
+  TestNextRandom(10);  // Check short-range correlation
+  TestNextRandom(100);
+  TestNextRandom(1000);
+  TestNextRandom(10000);  // Make sure there's no systematic error
+}
+
+// Tests that PickNextSamplingPoint generates
+// geometrically distributed random numbers.
+// First converts them to uniforms, then applies the
+// Anderson-Darling test for uniformity.
+void TestPickNextSample(int n) {
+  tcmalloc::Sampler sampler;
+  sampler.Init(1);
+  scoped_array<uint64_t> int_random_sample(new uint64_t[n]);
+  int sample_period = sampler.GetSamplePeriod();
+  int ones_count = 0;
+  for (int i = 0; i < n; i++) {
+    int_random_sample[i] = sampler.PickNextSamplingPoint();
+    CHECK_GE(int_random_sample[i], 1);
+    if (int_random_sample[i] == 1) {
+      ones_count += 1;
+    }
+    CHECK_LT(ones_count, 4); // << " out of " << i << " samples.";
+  }
+  // First sort them...
+  sort(int_random_sample.get(), int_random_sample.get() + n);
+  scoped_array<double> random_sample(new double[n]);
+  // Convert them to uniform random numbers
+  // by applying the geometric CDF
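+  // (Probability integral transform: if X is exponentially distributed with
+  // mean sample_period, then 1 - exp(-X/sample_period) is uniform on [0,1].)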
+  for (int i = 0; i < n; i++) {
+    random_sample[i] = 1 - exp(-static_cast<double>(int_random_sample[i])
+                           / sample_period);
+  }
+  // Now compute the Anderson-Darling statistic
+  double geom_ad_pvalue = AndersonDarlingTest(n, random_sample.get());
+  LOG(INFO) << StringPrintf("pvalue for geometric AndersonDarlingTest "
+                             "with n= %d is p= %f\n", n, geom_ad_pvalue);
+  CHECK_GT(min(geom_ad_pvalue, 1 - geom_ad_pvalue), 0.0001);
+      //          << "PickNextSamplingPoint does not produce good "
+      //             "geometric/exponential random numbers\n";
+}
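+
+// Worked example of the geometric-CDF transform above (illustrative only):
+// with a sampling point x equal to one sample_period, u = 1 - exp(-1) ~= 0.632;
+// with x = sample_period * ln(2) ~= 0.693 * sample_period, u = 0.5 (the median).
+// If PickNextSamplingPoint really is geometric/exponential with that mean,
+// these u values are uniform on [0,1] and the AD test should not reject.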
+
+TEST(Sampler, TestPickNextSample_MultipleValues) {
+  TestPickNextSample(10);  // Make sure the first few are good (enough)
+  TestPickNextSample(100);
+  TestPickNextSample(1000);
+  TestPickNextSample(10000);  // Make sure there's no systematic error
+}
+
+
+// This is superseded by the Anderson-Darling test
+// and is not run now.
+// Tests how quickly nearby seed values are spread out by LRand64.
+// The purpose of this code is to determine how many
+// steps to apply to the seed during initialization
+void TestLRand64Spread() {
+  tcmalloc::Sampler sampler;
+  sampler.Init(1);
+  uint64_t current_value;
+  printf("Testing LRand64 Spread\n");
+  for (int i = 1; i < 10; i++) {
+    printf("%d ", i);
+    current_value = i;
+    for (int j = 1; j < 100; j++) {
+      current_value = sampler.NextRandom(current_value);
+    }
+    LOG(INFO) << current_value;
+  }
+}
+
+
+// Test for Fastlog2 code
+// We care about the percentage error because we're using this
+// for choosing step sizes, so "close" is relative to the size of
+// the step we would get if we used the built-in log function
+TEST(Sampler, FastLog2) {
+  tcmalloc::Sampler sampler;
+  sampler.Init(1);
+  double max_ratio_error = 0;
+  for (double d = -1021.9; d < 1; d+= 0.13124235) {
+    double e = pow(2.0, d);
+    double truelog = log(e) / log(2.0);  // log_2(e)
+    double fastlog = sampler.FastLog2(e);
+    max_ratio_error = max(max_ratio_error,
+                          max(truelog/fastlog-1, fastlog/truelog-1));
+    CHECK_LE(max_ratio_error, 0.01);
+        //        << StringPrintf("d = %f, e=%f, truelog = %f, fastlog= %f\n",
+        //                        d, e, truelog, fastlog);
+  }
+  LOG(INFO) << StringPrintf("Fastlog2: max_ratio_error = %f\n",
+                            max_ratio_error);
+}
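+
+// Worked example of the ratio-error metric above (illustrative only): if
+// truelog = 10 and fastlog = 10.05, then
+// max(truelog/fastlog - 1, fastlog/truelog - 1) = 0.005, i.e. a 0.5%
+// relative error, comfortably under the 1% bound checked in the loop.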
+
+// Further tests
+
+bool CheckMean(size_t mean, int num_samples) {
+  tcmalloc::Sampler sampler;
+  sampler.Init(1);
+  size_t total = 0;
+  for (int i = 0; i < num_samples; i++) {
+    total += sampler.PickNextSamplingPoint();
+  }
+  double empirical_mean = total / static_cast<double>(num_samples);
+  double expected_sd = mean / pow(num_samples * 1.0, 0.5);
+  return(fabs(mean-empirical_mean) < expected_sd * kSigmas);
+}
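+
+// Worked example for CheckMean (illustrative numbers only): with
+// mean = 524288 and num_samples = 1000 -- roughly what the IsMeanRight test
+// below does if kSamplingInterval, defined earlier in this test, is 2^19 --
+// expected_sd ~= 524288/sqrt(1000) ~= 16.6K, so the empirical mean must land
+// within kSigmas * 16.6K of the true mean for the check to pass.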
+
+// Prints a sequence so you can look at the distribution
+void OutputSequence(int sequence_length) {
+  tcmalloc::Sampler sampler;
+  sampler.Init(1);
+  size_t next_step;
+  for (int i = 0; i< sequence_length; i++) {
+    next_step = sampler.PickNextSamplingPoint();
+    LOG(INFO) << next_step;
+  }
+}
+
+
+double StandardDeviationsErrorInSample(
+              int total_samples, int picked_samples,
+              int alloc_size, int sampling_interval) {
+  double p = 1 - exp(-(static_cast<double>(alloc_size) / sampling_interval));
+  double expected_samples = total_samples * p;
+  double sd = pow(p*(1-p)*total_samples, 0.5);
+  return((picked_samples - expected_samples) / sd);
+}
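+
+// Worked example (illustrative numbers only): with alloc_size = 8192 and
+// sampling_interval = 524288, p = 1 - exp(-8192/524288) ~= 0.0155.  Over
+// total_samples = 100000 allocations the expected pick count is ~1550 with
+// sd = sqrt(p*(1-p)*100000) ~= 39, so observing 1600 picks would be about
+// +1.3 standard deviations.  The CHECK_LE calls below require |z| <= kSigmas.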
+
+TEST(Sampler, LargeAndSmallAllocs_CombinedTest) {
+  tcmalloc::Sampler sampler;
+  sampler.Init(1);
+  int counter_big = 0;
+  int counter_small = 0;
+  int size_big = 129*8*1024+1;
+  int size_small = 1024*8;
+  int num_iters = 128*4*8;
+  // Allocate in mixed chunks
+  for (int i = 0; i < num_iters; i++) {
+    if (sampler.SampleAllocation(size_big)) {
+      counter_big += 1;
+    }
+    for (int i = 0; i < 129; i++) {
+      if (sampler.SampleAllocation(size_small)) {
+        counter_small += 1;
+      }
+    }
+  }
+  // Now test that there are the right number of each
+  double large_allocs_sds =
+     StandardDeviationsErrorInSample(num_iters, counter_big,
+                                     size_big, kSamplingInterval);
+  double small_allocs_sds =
+     StandardDeviationsErrorInSample(num_iters*129, counter_small,
+                                     size_small, kSamplingInterval);
+  LOG(INFO) << StringPrintf("large_allocs_sds = %f\n", large_allocs_sds);
+  LOG(INFO) << StringPrintf("small_allocs_sds = %f\n", small_allocs_sds);
+  CHECK_LE(fabs(large_allocs_sds), kSigmas);
+  CHECK_LE(fabs(small_allocs_sds), kSigmas);
+}
+
+// Tests whether the mean is about right over 1000 samples
+TEST(Sampler, IsMeanRight) {
+  CHECK(CheckMean(kSamplingInterval, 1000));
+}
+
+// This flag is for the OldSampler class to use
+const int64 FLAGS_mock_tcmalloc_sample_parameter = 1<<19;
+
+// A cut-down and slightly refactored version of the old Sampler
+class OldSampler {
+ public:
+  void Init(uint32_t seed);
+  void Cleanup() {}
+
+  // Record allocation of "k" bytes.  Return true iff allocation
+  // should be sampled
+  bool SampleAllocation(size_t k);
+
+  // Generate a geometric with mean 1M (or FLAG value)
+  void PickNextSample(size_t k);
+
+  // Initialize the statics for the OldSampler class
+  static void InitStatics() {
+    sample_period = 1048583;
+  }
+  size_t bytes_until_sample_;
+
+ private:
+  uint32_t rnd_;                   // Cheap random number generator
+  static uint64_t sample_period;
+  // Should be a prime just above a power of 2:
+  // 2, 5, 11, 17, 37, 67, 131, 257,
+  // 521, 1031, 2053, 4099, 8209, 16411,
+  // 32771, 65537, 131101, 262147, 524309, 1048583,
+  // 2097169, 4194319, 8388617, 16777259, 33554467
+};
+
+// Statics for OldSampler
+uint64_t OldSampler::sample_period;
+
+void OldSampler::Init(uint32_t seed) {
+  // Initialize PRNG -- run it for a bit to get to good values
+  if (seed != 0) {
+    rnd_ = seed;
+  } else {
+    rnd_ = 12345;
+  }
+  bytes_until_sample_ = 0;
+  for (int i = 0; i < 100; i++) {
+    PickNextSample(sample_period * 2);
+  }
+}
+
+// A cut-down version of the old PickNextSample routine
+void OldSampler::PickNextSample(size_t k) {
+  // Make next "random" number
+  // x^32+x^22+x^2+x^1+1 is a primitive polynomial for random numbers
+  static const uint32_t kPoly = (1 << 22) | (1 << 2) | (1 << 1) | (1 << 0);
+  uint32_t r = rnd_;
+  rnd_ = (r << 1) ^ ((static_cast<int32_t>(r) >> 31) & kPoly);
+
+  // Next point is "rnd_ % (sample_period)".  I.e., average
+  // increment is "sample_period/2".
+  const int flag_value = FLAGS_mock_tcmalloc_sample_parameter;
+  static int last_flag_value = -1;
+
+  if (flag_value != last_flag_value) {
+    // There should be a spinlock here, but this code is
+    // for benchmarking only.
+    sample_period = 1048583;
+    last_flag_value = flag_value;
+  }
+
+  bytes_until_sample_ += rnd_ % sample_period;
+
+  if (k > (static_cast<size_t>(-1) >> 2)) {
+    // If the user has asked for a huge allocation then it is possible
+    // for the code below to loop infinitely.  Just return (note that
+    // this throws off the sampling accuracy somewhat, but a user who
+    // is allocating more than 1G of memory at a time can live with a
+    // minor inaccuracy in profiling of small allocations, and also
+    // would rather not wait for the loop below to terminate).
+    return;
+  }
+
+  while (bytes_until_sample_ < k) {
+    // Increase bytes_until_sample_ by enough average sampling periods
+    // (sample_period >> 1) to allow us to sample past the current
+    // allocation.
+    bytes_until_sample_ += (sample_period >> 1);
+  }
+
+  bytes_until_sample_ -= k;
+}
+
+inline bool OldSampler::SampleAllocation(size_t k) {
+  if (bytes_until_sample_ < k) {
+    PickNextSample(k);
+    return true;
+  } else {
+    bytes_until_sample_ -= k;
+    return false;
+  }
+}
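+
+// A minimal usage sketch of the OldSampler above (commented out, not run as a
+// test; the request size and iteration count are arbitrary illustration
+// values):
+//
+//   OldSampler old_sampler;
+//   old_sampler.Init(1);
+//   int sampled = 0;
+//   for (int i = 0; i < 1000000; i++) {
+//     if (old_sampler.SampleAllocation(4096)) sampled++;
+//   }
+//   // With sample_period = 1048583 and an average increment of
+//   // sample_period/2, roughly (1000000*4096)/(1048583/2) ~= 7800 of the
+//   // million allocations should be sampled.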
+
+// This checks that the stated maximum value for the
+// tcmalloc_sample_parameter flag never overflows bytes_until_sample_
+TEST(Sampler, bytes_until_sample_Overflow_Underflow) {
+  tcmalloc::Sampler sampler;
+  sampler.Init(1);
+  uint64_t one = 1;
+  // sample_parameter = 0;  // To test the edge case
+  uint64_t sample_parameter_array[4] = {0, 1, one<<19, one<<58};
+  for (int i = 0; i < 4; i++) {
+    uint64_t sample_parameter = sample_parameter_array[i];
+    LOG(INFO) << "sample_parameter = " << sample_parameter;
+    double sample_scaling = - log(2.0) * sample_parameter;
+    // Take the top 26 bits as the random number
+    // (This plus the 1<<26 sampling bound gives a maximum possible step of
+    // 1209424308 bytes.)
+    const uint64_t prng_mod_power = 48;  // Number of bits in prng
+
+    // First, check the largest_prng value
+    uint64_t largest_prng_value = (static_cast<uint64_t>(1)<<48) - 1;
+    double q = (largest_prng_value >> (prng_mod_power - 26)) + 1.0;
+    LOG(INFO) << StringPrintf("q = %f\n", q);
+    LOG(INFO) << StringPrintf("FastLog2(q) = %f\n", sampler.FastLog2(q));
+    LOG(INFO) << StringPrintf("log2(q) = %f\n", log(q)/log(2.0));
+    // Replace min(sampler.FastLog2(q) - 26, 0.0) with
+    // (sampler.FastLog2(q) - 26.000705) when using that optimization
+    uint64_t smallest_sample_step
+        = static_cast<uint64_t>(min(sampler.FastLog2(q) - 26, 0.0)
+                                * sample_scaling + 1);
+    LOG(INFO) << "Smallest sample step is " << smallest_sample_step;
+    uint64_t cutoff = static_cast<uint64_t>(10)
+                      * (sample_parameter/(one<<24) + 1);
+    LOG(INFO) << "Acceptable value is < " << cutoff;
+    // This checks that the answer is "small" and positive
+    CHECK_LE(smallest_sample_step, cutoff);
+
+    // Next, check with the smallest prng value
+    uint64_t smallest_prng_value = 0;
+    q = (smallest_prng_value >> (prng_mod_power - 26)) + 1.0;
+    LOG(INFO) << StringPrintf("q = %f\n", q);
+    // Replace min(sampler.FastLog2(q) - 26, 0.0) with
+    // (sampler.FastLog2(q) - 26.000705) when using that optimization
+    uint64_t largest_sample_step
+        = static_cast<uint64_t>(min(sampler.FastLog2(q) - 26, 0.0)
+                                * sample_scaling + 1);
+    LOG(INFO) << "Largest sample step is " << largest_sample_step;
+    CHECK_LE(largest_sample_step, one<<63);
+    CHECK_GE(largest_sample_step, smallest_sample_step);
+  }
+}
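+
+// Rough numbers for the sample_parameter = 1<<19 case above (illustrative
+// only): sample_scaling = -ln(2) * 2^19 ~= -363409.  With the largest prng
+// value, q ~= 2^26, so FastLog2(q) - 26 ~= 0 and the smallest step is ~1;
+// with the smallest prng value, q = 1, log2(q) = 0 and the largest step is
+// about 26 * 363409 + 1 ~= 9.4 million bytes -- far below the one<<63 bound.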
+
+
+// Test that NextRandom stays in the right range.  Unfortunately, this is a
+// stochastic test which could miss problems.
+TEST(Sampler, NextRand_range) {
+  tcmalloc::Sampler sampler;
+  sampler.Init(1);
+  uint64_t one = 1;
+  // NextRandom should never return a value larger than (one << 48) - 1
+  uint64_t max_value = (one << 48) - 1;
+  uint64_t x = (one << 55);
+  int n = 22;  // 27;
+  LOG(INFO) << "Running sampler.NextRandom 1<<" << n << " times";
+  for (int i = 1; i <= (1<<n); i++) {
+    x = sampler.NextRandom(x);
+    CHECK_LE(x, max_value);
+  }
+}
+
+// Tests certain arithmetic operations to make sure they compute what we
+// expect them to (for testing across different platforms).
+TEST(Sampler, arithmetic_1) {
+  tcmalloc::Sampler sampler;
+  sampler.Init(1);
+  uint64_t rnd;  // our 48 bit random number, which we don't trust
+  const uint64_t prng_mod_power = 48;
+  uint64_t one = 1;
+  rnd = one;
+  uint64_t max_value = (one << 48) - 1;
+  for (int i = 1; i <= (1<<27); i++) {
+    rnd = sampler.NextRandom(rnd);
+    CHECK_LE(rnd, max_value);
+    double q = (rnd >> (prng_mod_power - 26)) + 1.0;
+    CHECK_GE(q, 0); // << rnd << "  " << prng_mod_power;
+  }
+  // Test some potentially out of bounds value for rnd
+  for (int i = 1; i <= 66; i++) {
+    rnd = one << i;
+    double q = (rnd >> (prng_mod_power - 26)) + 1.0;
+    LOG(INFO) << "rnd = " << rnd << " i=" << i << " q=" << q;
+    CHECK_GE(q, 0);
+    //        << " rnd=" << rnd << "  i=" << i << " prng_mod_power" << prng_mod_power;
+  }
+}
+
+void test_arithmetic(uint64_t rnd) {
+  const uint64_t prng_mod_power = 48;  // Number of bits in prng
+  uint64_t shifted_rnd = rnd >> (prng_mod_power - 26);
+  CHECK_GE(shifted_rnd, 0);
+  CHECK_LT(shifted_rnd, (1<<26));
+  LOG(INFO) << shifted_rnd;
+  LOG(INFO) << static_cast<double>(shifted_rnd);
+  CHECK_GE(static_cast<double>(static_cast<uint32_t>(shifted_rnd)), 0);
+      //      << " rnd=" << rnd << "  srnd=" << shifted_rnd;
+  CHECK_GE(static_cast<double>(shifted_rnd), 0);
+      //      << " rnd=" << rnd << "  srnd=" << shifted_rnd;
+  double q = static_cast<double>(shifted_rnd) + 1.0;
+  CHECK_GT(q, 0);
+}
+
+// Tests certain arithmetic operations to make sure they compute what we
+// expect them to (for testing across different platforms).
+// Known bad values with -c dbg --cpu piii for _some_ binaries:
+// rnd=227453640600554
+// shifted_rnd=54229173
+// (hard to reproduce)
+TEST(Sampler, arithmetic_2) {
+  uint64_t rnd = 227453640600554LL;
+  test_arithmetic(rnd);
+}
+
+
+// It's not really a test, but it's good to know
+TEST(Sample, size_of_class) {
+  tcmalloc::Sampler sampler;
+  sampler.Init(1);
+  LOG(INFO) << "Size of Sampler class is: " << sizeof(tcmalloc::Sampler);
+  LOG(INFO) << "Size of Sampler object is: " << sizeof(sampler);
+}
+
+// Make sure sampling is enabled, or the tests won't work right.
+DECLARE_int64(tcmalloc_sample_parameter);
+
+int main(int argc, char **argv) {
+  if (FLAGS_tcmalloc_sample_parameter == 0)
+    FLAGS_tcmalloc_sample_parameter = 524288;
+  return RUN_ALL_TESTS();
+}
diff --git a/src/tests/sampling_test.cc b/src/tests/sampling_test.cc
new file mode 100644
index 0000000..729aba8
--- /dev/null
+++ b/src/tests/sampling_test.cc
@@ -0,0 +1,83 @@
+// -*- Mode: C++; c-basic-offset: 2; indent-tabs-mode: nil -*-
+// Copyright (c) 2008, Google Inc.
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+//     * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+//     * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+// ---
+// Author: Craig Silverstein
+//
+// This tests ReadStackTraces and ReadGrowthStackTraces.  It does this
+// by doing a bunch of allocations and then calling those functions.
+// A driver shell-script can call this, and then call pprof, and
+// verify the expected output.  The output is written to
+// argv[1].heap and argv[1].growth
+
+#include "config_for_unittests.h"
+#include <stdio.h>
+#include <stdlib.h>
+#include <string>
+#include "base/logging.h"
+#include <gperftools/malloc_extension.h>
+
+using std::string;
+
+extern "C" void* AllocateAllocate() ATTRIBUTE_NOINLINE;
+
+extern "C" void* AllocateAllocate() {
+  // The VLOG's are mostly to discourage inlining
+  VLOG(1, "Allocating some more");
+  void* p = malloc(10000);
+  VLOG(1, "Done allocating");
+  return p;
+}
+
+static void WriteStringToFile(const string& s, const string& filename) {
+  FILE* fp = fopen(filename.c_str(), "w");
+  fwrite(s.data(), 1, s.length(), fp);
+  fclose(fp);
+}
+
+int main(int argc, char** argv) {
+  if (argc < 2) {
+    fprintf(stderr, "USAGE: %s <base of output files>\n", argv[0]);
+    exit(1);
+  }
+  for (int i = 0; i < 8000; i++) {
+    AllocateAllocate();
+  }
+
+  string s;
+  MallocExtension::instance()->GetHeapSample(&s);
+  WriteStringToFile(s, string(argv[1]) + ".heap");
+
+  s.clear();
+  MallocExtension::instance()->GetHeapGrowthStacks(&s);
+  WriteStringToFile(s, string(argv[1]) + ".growth");
+
+  return 0;
+}
diff --git a/src/tests/sampling_test.sh b/src/tests/sampling_test.sh
new file mode 100755
index 0000000..2a58426
--- /dev/null
+++ b/src/tests/sampling_test.sh
@@ -0,0 +1,94 @@
+#!/bin/sh
+
+# Copyright (c) 2008, Google Inc.
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are
+# met:
+#
+#     * Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+#     * Redistributions in binary form must reproduce the above
+# copyright notice, this list of conditions and the following disclaimer
+# in the documentation and/or other materials provided with the
+# distribution.
+#     * Neither the name of Google Inc. nor the names of its
+# contributors may be used to endorse or promote products derived from
+# this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#
+# ---
+# Author: Craig Silverstein
+#
+# This is a test that tcmalloc creates, and pprof reads, sampling data
+# correctly: both for the heap profile (ReadStackTraces) and for
+# growth in the heap size (ReadGrowthStackTraces).
+
+BINDIR="${BINDIR:-.}"
+PPROF_PATH="${PPROF_PATH:-$BINDIR/src/pprof}"
+
+if [ "x$1" = "x-h" -o "x$1" = "x--help" ]; then
+  echo "USAGE: $0 [unittest dir] [path to pprof]"
+  echo "       By default, unittest_dir=$BINDIR, pprof_path=$PPROF_PATH"
+  exit 1
+fi
+
+SAMPLING_TEST="${1:-$BINDIR/sampling_test}"
+PPROF="${2:-$PPROF_PATH}"
+OUTDIR="/tmp/sampling_test_dir"
+
+# libtool is annoying, and puts the actual executable in a different
+# directory, replacing the seeming-executable with a shell script.
+# We use the error output of sampling_test to indicate its real location
+SAMPLING_TEST_BINARY=`"$SAMPLING_TEST" 2>&1 | awk '/USAGE/ {print $2; exit;}'`
+
+# A kludge for cygwin.  Unfortunately, 'test -f' says that 'foo' exists
+# even when it doesn't, and only foo.exe exists.  Other unix utilities
+# (like nm) need you to say 'foo.exe'.  We use one such utility, cat, to
+# see what the *real* binary name is.
+if ! cat "$SAMPLING_TEST_BINARY" >/dev/null 2>&1; then
+  SAMPLING_TEST_BINARY="$SAMPLING_TEST_BINARY".exe
+fi
+
+die() {    # runs the command given as arguments, and then dies.
+    echo "FAILED.  Output from $@"
+    echo "----"
+    "$@"
+    echo "----"
+    exit 1
+}
+
+rm -rf "$OUTDIR" || die "Unable to delete $OUTDIR"
+mkdir "$OUTDIR" || die "Unable to create $OUTDIR"
+
+# This puts the output into out.heap and out.growth.  It allocates
+# 8*10^7 bytes of memory, which is 76M.  Because we sample, the
+# estimate may be a bit high or a bit low: we accept anything from
+# 50M to 99M.
+"$SAMPLING_TEST" "$OUTDIR/out"
+
+echo "Testing heap output..."
+"$PPROF" --text "$SAMPLING_TEST_BINARY" "$OUTDIR/out.heap" \
+   | grep '[5-9][0-9]\.[0-9][ 0-9.%]*_*AllocateAllocate' >/dev/null \
+   || die "$PPROF" --text "$SAMPLING_TEST_BINARY" "$OUTDIR/out.heap"
+echo "OK"
+
+echo "Testing growth output..."
+"$PPROF" --text "$SAMPLING_TEST_BINARY" "$OUTDIR/out.growth" \
+   | grep '[5-9][0-9]\.[0-9][ 0-9.%]*_*AllocateAllocate' >/dev/null \
+   || die "$PPROF" --text "$SAMPLING_TEST_BINARY" "$OUTDIR/out.growth"
+echo "OK"
+
+echo "PASS"
diff --git a/src/tests/simple_compat_test.cc b/src/tests/simple_compat_test.cc
new file mode 100644
index 0000000..5dbfd7a
--- /dev/null
+++ b/src/tests/simple_compat_test.cc
@@ -0,0 +1,68 @@
+// -*- Mode: C++; c-basic-offset: 2; indent-tabs-mode: nil -*-
+// Copyright (c) 2012, Google Inc.
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+//     * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+//     * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+// ---
+// Author: Craig Silverstein
+//
+// This just verifies that we can compile code that #includes stuff
+// via the backwards-compatibility 'google/' #include-dir.  It does
+// not include config.h on purpose, to better simulate a perftools
+// client.
+
+#include <stddef.h>
+#include <stdio.h>
+#include <google/heap-checker.h>
+#include <google/heap-profiler.h>
+#include <google/malloc_extension.h>
+#include <google/malloc_extension_c.h>
+#include <google/malloc_hook.h>
+#include <google/malloc_hook_c.h>
+#include <google/profiler.h>
+#include <google/stacktrace.h>
+#include <google/tcmalloc.h>
+
+// We don't link in -lprofiler for this test, so be sure not to make
+// any function calls that require the cpu-profiler code.  The
+// heap-profiler is ok.
+
+HeapLeakChecker::Disabler* heap_checker_h;
+void (*heap_profiler_h)(const char*) = &HeapProfilerStart;
+MallocExtension::Ownership malloc_extension_h;
+MallocExtension_Ownership malloc_extension_c_h;
+MallocHook::NewHook* malloc_hook_h;
+MallocHook_NewHook* malloc_hook_c_h;
+ProfilerOptions* profiler_h;
+int (*stacktrace_h)(void**, int, int) = &GetStackTrace;
+void* (*tcmalloc_h)(size_t) = &tc_new;
+
+int main(int argc, char** argv) {
+  printf("PASS\n");
+  return 0;
+}
diff --git a/src/tests/stack_trace_table_test.cc b/src/tests/stack_trace_table_test.cc
new file mode 100644
index 0000000..3cacd2d
--- /dev/null
+++ b/src/tests/stack_trace_table_test.cc
@@ -0,0 +1,102 @@
+// -*- Mode: C++; c-basic-offset: 2; indent-tabs-mode: nil -*-
+// Copyright 2009 Google Inc. All Rights Reserved.
+// Author: fikes@google.com (Andrew Fikes)
+//
+// Use of this source code is governed by a BSD-style license that can
+// be found in the LICENSE file.
+
+
+#include "config_for_unittests.h"
+#include <stdio.h>   // for puts()
+#include "stack_trace_table.h"
+#include "base/logging.h"
+#include "base/spinlock.h"
+#include "static_vars.h"
+
+#undef ARRAYSIZE   // may be defined on, eg, windows
+#define ARRAYSIZE(a)  ( sizeof(a) / sizeof(*(a)) )
+
+static void CheckTracesAndReset(tcmalloc::StackTraceTable* table,
+                        const uintptr_t* expected, int len) {
+  void** entries = table->ReadStackTracesAndClear();
+  for (int i = 0; i < len; ++i) {
+    CHECK_EQ(reinterpret_cast<uintptr_t>(entries[i]), expected[i]);
+  }
+  delete[] entries;
+}
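+
+// As the expected arrays below suggest, ReadStackTracesAndClear() appears to
+// flatten each bucket as [count, total size, depth, PC_1 .. PC_depth] and to
+// terminate the whole array with a 0.  For example, {1, 1024, 2, 1, 2, 0}
+// describes one bucket: one trace totalling 1024 bytes with a depth-2 stack
+// of {0x1, 0x2}.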
+
+static void AddTrace(tcmalloc::StackTraceTable* table,
+                     const tcmalloc::StackTrace& t) {
+  // Normally we'd need this lock, but since the test is single-threaded
+  // we don't.  I comment it out on windows because the DLL-decl thing
+  // is really annoying in this case.
+#ifndef _MSC_VER
+  SpinLockHolder h(tcmalloc::Static::pageheap_lock());
+#endif
+  table->AddTrace(t);
+}
+
+int main(int argc, char **argv) {
+  tcmalloc::StackTraceTable table;
+
+  // Empty table
+  CHECK_EQ(table.depth_total(), 0);
+  CHECK_EQ(table.bucket_total(), 0);
+  static const uintptr_t k1[] = {0};
+  CheckTracesAndReset(&table, k1, ARRAYSIZE(k1));
+
+  tcmalloc::StackTrace t1;
+  t1.size = static_cast<uintptr_t>(1024);
+  t1.depth = static_cast<uintptr_t>(2);
+  t1.stack[0] = reinterpret_cast<void*>(1);
+  t1.stack[1] = reinterpret_cast<void*>(2);
+
+
+  tcmalloc::StackTrace t2;
+  t2.size = static_cast<uintptr_t>(512);
+  t2.depth = static_cast<uintptr_t>(2);
+  t2.stack[0] = reinterpret_cast<void*>(2);
+  t2.stack[1] = reinterpret_cast<void*>(1);
+
+  // Table w/ just t1
+  AddTrace(&table, t1);
+  CHECK_EQ(table.depth_total(), 2);
+  CHECK_EQ(table.bucket_total(), 1);
+  static const uintptr_t k2[] = {1, 1024, 2, 1, 2, 0};
+  CheckTracesAndReset(&table, k2, ARRAYSIZE(k2));
+
+  // Table w/ t1, t2
+  AddTrace(&table, t1);
+  AddTrace(&table, t2);
+  CHECK_EQ(table.depth_total(), 4);
+  CHECK_EQ(table.bucket_total(), 2);
+  static const uintptr_t k3[] = {1, 1024, 2, 1, 2, 1,  512, 2, 2, 1, 0};
+  CheckTracesAndReset(&table, k3, ARRAYSIZE(k3));
+
+  // Table w/ 2 x t1, 1 x t2
+  AddTrace(&table, t1);
+  AddTrace(&table, t2);
+  AddTrace(&table, t1);
+  CHECK_EQ(table.depth_total(), 4);
+  CHECK_EQ(table.bucket_total(), 2);
+  static const uintptr_t k4[] = {2, 2048, 2, 1, 2, 1,  512, 2, 2, 1, 0};
+  CheckTracesAndReset(&table, k4, ARRAYSIZE(k4));
+
+  // Same stack as t1, but w/ different size
+  tcmalloc::StackTrace t3;
+  t3.size = static_cast<uintptr_t>(2);
+  t3.depth = static_cast<uintptr_t>(2);
+  t3.stack[0] = reinterpret_cast<void*>(1);
+  t3.stack[1] = reinterpret_cast<void*>(2);
+
+  // Table w/ t1, t3
+  AddTrace(&table, t1);
+  AddTrace(&table, t3);
+  CHECK_EQ(table.depth_total(), 2);
+  CHECK_EQ(table.bucket_total(), 1);
+  static const uintptr_t k5[] = {2, 1026, 2, 1, 2, 0};
+  CheckTracesAndReset(&table, k5, ARRAYSIZE(k5));
+
+  puts("PASS");
+  return 0;
+}
diff --git a/src/tests/stacktrace_unittest.cc b/src/tests/stacktrace_unittest.cc
new file mode 100644
index 0000000..3c9f735
--- /dev/null
+++ b/src/tests/stacktrace_unittest.cc
@@ -0,0 +1,194 @@
+// Copyright (c) 2005, Google Inc.
+// All rights reserved.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+// 
+//     * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+//     * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+// 
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+#include "config_for_unittests.h"
+#ifdef HAVE_EXECINFO_H
+#include <execinfo.h>
+#endif
+#include <stdio.h>
+#include <stdlib.h>
+#include "base/commandlineflags.h"
+#include "base/logging.h"
+#include <gperftools/stacktrace.h>
+
+namespace {
+
+// Obtain a backtrace, verify that the expected callers are present in the
+// backtrace, and maybe print the backtrace to stdout.
+
+// The sequence of functions whose return addresses we expect to see in the
+// backtrace.
+const int BACKTRACE_STEPS = 6;
+
+struct AddressRange {
+  const void *start, *end;
+};
+
+// Expected function [start,end] range.
+AddressRange expected_range[BACKTRACE_STEPS];
+
+#if __GNUC__
+// Using GCC extension: address of a label can be taken with '&&label'.
+// Start should be a label somewhere before recursive call, end somewhere
+// after it.
+#define INIT_ADDRESS_RANGE(fn, start_label, end_label, prange)           \
+  do {                                                                   \
+    (prange)->start = &&start_label;                                     \
+    (prange)->end = &&end_label;                                         \
+    CHECK_LT((prange)->start, (prange)->end);                            \
+  } while (0)
+// This macro expands into "unmovable" code (opaque to GCC), and that
+// prevents GCC from moving a_label up or down in the code.
+// Without it, there is no code following the 'end' label, and GCC
+// (4.3.1, 4.4.0) thinks it safe to assign &&end an address that is before
+// the recursive call.
+#define DECLARE_ADDRESS_LABEL(a_label)                                   \
+  a_label: do { __asm__ __volatile__(""); } while (0)
+// Gcc 4.4.0 may split a function into multiple chunks, and the chunk
+// performing the recursive call may end up later in the code than the return
+// instruction (this actually happens with FDO).
+// Adjust function range from __builtin_return_address.
+#define ADJUST_ADDRESS_RANGE_FROM_RA(prange)                             \
+  do {                                                                   \
+    void *ra = __builtin_return_address(0);                              \
+    CHECK_LT((prange)->start, ra);                                       \
+    if (ra > (prange)->end) {                                            \
+      printf("Adjusting range from %p..%p to %p..%p\n",                  \
+             (prange)->start, (prange)->end,                             \
+             (prange)->start, ra);                                       \
+      (prange)->end = ra;                                                \
+    }                                                                    \
+  } while (0)
+#else
+// Assume the Check* functions below are not longer than 256 bytes.
+#define INIT_ADDRESS_RANGE(fn, start_label, end_label, prange)           \
+  do {                                                                   \
+    (prange)->start = reinterpret_cast<const void *>(&fn);               \
+    (prange)->end = reinterpret_cast<const char *>(&fn) + 256;           \
+  } while (0)
+#define DECLARE_ADDRESS_LABEL(a_label) do { } while (0)
+#define ADJUST_ADDRESS_RANGE_FROM_RA(prange) do { } while (0)
+#endif  // __GNUC__
+
+//-----------------------------------------------------------------------//
+
+void CheckRetAddrIsInFunction(void *ret_addr, const AddressRange &range)
+{
+  CHECK_GE(ret_addr, range.start);
+  CHECK_LE(ret_addr, range.end);
+}
+
+//-----------------------------------------------------------------------//
+
+void ATTRIBUTE_NOINLINE CheckStackTrace(int);
+void ATTRIBUTE_NOINLINE CheckStackTraceLeaf(void) {
+  const int STACK_LEN = 10;
+  void *stack[STACK_LEN];
+  int size;
+
+  ADJUST_ADDRESS_RANGE_FROM_RA(&expected_range[1]);
+  INIT_ADDRESS_RANGE(CheckStackTraceLeaf, start, end, &expected_range[0]);
+  DECLARE_ADDRESS_LABEL(start);
+  size = GetStackTrace(stack, STACK_LEN, 0);
+  printf("Obtained %d stack frames.\n", size);
+  CHECK_GE(size, 1);
+  CHECK_LE(size, STACK_LEN);
+
+#ifdef HAVE_EXECINFO_H
+  {
+    char **strings = backtrace_symbols(stack, size);
+    printf("Obtained %d stack frames.\n", size);
+    for (int i = 0; i < size; i++)
+      printf("%s %p\n", strings[i], stack[i]);
+    printf("CheckStackTrace() addr: %p\n", &CheckStackTrace);
+    free(strings);
+  }
+#endif
+
+  for (int i = 0; i < BACKTRACE_STEPS; i++) {
+    printf("Backtrace %d: expected: %p..%p  actual: %p ... ",
+           i, expected_range[i].start, expected_range[i].end, stack[i]);
+    fflush(stdout);
+    CheckRetAddrIsInFunction(stack[i], expected_range[i]);
+    printf("OK\n");
+  }
+  DECLARE_ADDRESS_LABEL(end);
+}
+
+//-----------------------------------------------------------------------//
+
+/* Dummy functions to make the backtrace more interesting. */
+void ATTRIBUTE_NOINLINE CheckStackTrace4(int i) {
+  ADJUST_ADDRESS_RANGE_FROM_RA(&expected_range[2]);
+  INIT_ADDRESS_RANGE(CheckStackTrace4, start, end, &expected_range[1]);
+  DECLARE_ADDRESS_LABEL(start);
+  for (int j = i; j >= 0; j--)
+    CheckStackTraceLeaf();
+  DECLARE_ADDRESS_LABEL(end);
+}
+void ATTRIBUTE_NOINLINE CheckStackTrace3(int i) {
+  ADJUST_ADDRESS_RANGE_FROM_RA(&expected_range[3]);
+  INIT_ADDRESS_RANGE(CheckStackTrace3, start, end, &expected_range[2]);
+  DECLARE_ADDRESS_LABEL(start);
+  for (int j = i; j >= 0; j--)
+    CheckStackTrace4(j);
+  DECLARE_ADDRESS_LABEL(end);
+}
+void ATTRIBUTE_NOINLINE CheckStackTrace2(int i) {
+  ADJUST_ADDRESS_RANGE_FROM_RA(&expected_range[4]);
+  INIT_ADDRESS_RANGE(CheckStackTrace2, start, end, &expected_range[3]);
+  DECLARE_ADDRESS_LABEL(start);
+  for (int j = i; j >= 0; j--)
+    CheckStackTrace3(j);
+  DECLARE_ADDRESS_LABEL(end);
+}
+void ATTRIBUTE_NOINLINE CheckStackTrace1(int i) {
+  ADJUST_ADDRESS_RANGE_FROM_RA(&expected_range[5]);
+  INIT_ADDRESS_RANGE(CheckStackTrace1, start, end, &expected_range[4]);
+  DECLARE_ADDRESS_LABEL(start);
+  for (int j = i; j >= 0; j--)
+    CheckStackTrace2(j);
+  DECLARE_ADDRESS_LABEL(end);
+}
+void ATTRIBUTE_NOINLINE CheckStackTrace(int i) {
+  INIT_ADDRESS_RANGE(CheckStackTrace, start, end, &expected_range[5]);
+  DECLARE_ADDRESS_LABEL(start);
+  for (int j = i; j >= 0; j--)
+    CheckStackTrace1(j);
+  DECLARE_ADDRESS_LABEL(end);
+}
+
+}  // namespace
+//-----------------------------------------------------------------------//
+
+int main(int argc, char ** argv) {
+  CheckStackTrace(0);
+  printf("PASS\n");
+  return 0;
+}
diff --git a/src/tests/system-alloc_unittest.cc b/src/tests/system-alloc_unittest.cc
new file mode 100644
index 0000000..4a5f7c0
--- /dev/null
+++ b/src/tests/system-alloc_unittest.cc
@@ -0,0 +1,155 @@
+// -*- Mode: C++; c-basic-offset: 2; indent-tabs-mode: nil -*-
+// Copyright (c) 2007, Google Inc.
+// All rights reserved.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+// 
+//     * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+//     * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+// 
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+// ---
+// Author: Arun Sharma
+
+#include "config_for_unittests.h"
+#include "system-alloc.h"
+#include <stdio.h>
+#if defined HAVE_STDINT_H
+#include <stdint.h>             // to get uintptr_t
+#elif defined HAVE_INTTYPES_H
+#include <inttypes.h>           // another place uintptr_t might be defined
+#endif
+#include <sys/types.h>
+#include <algorithm>
+#include <limits>
+#include "base/logging.h"               // for Check_GEImpl, Check_LTImpl, etc
+#include <gperftools/malloc_extension.h>    // for MallocExtension::instance
+#include "common.h"                     // for kAddressBits
+
+class ArraySysAllocator : public SysAllocator {
+public:
+  // Was this allocator invoked at least once?
+  bool invoked_;
+
+  ArraySysAllocator() : SysAllocator() {
+    ptr_ = 0;
+    invoked_ = false;
+  }
+
+  void* Alloc(size_t size, size_t *actual_size, size_t alignment) {
+    invoked_ = true;
+
+    if (size > kArraySize) {
+      return NULL;
+    }
+
+    void *result = &array_[ptr_];
+    uintptr_t ptr = reinterpret_cast<uintptr_t>(result);
+
+    if (actual_size) {
+      *actual_size = size;
+    }
+
+    // Try to get more memory for alignment
+    size_t extra = alignment - (ptr & (alignment-1));
+    size += extra;
+    CHECK_LT(ptr_ + size, kArraySize);
+
+    if ((ptr & (alignment-1)) != 0) {
+      ptr += alignment - (ptr & (alignment-1));
+    }
+
+    ptr_ += size;
+    return reinterpret_cast<void *>(ptr);
+  }
+
+  void DumpStats() {
+  }
+
+private:
+  static const int kArraySize = 8 * 1024 * 1024;
+  char array_[kArraySize];
+  // We allocate the next chunk from here
+  int ptr_;
+
+};
+const int ArraySysAllocator::kArraySize;
+ArraySysAllocator a;
+
+static void TestBasicInvoked() {
+  MallocExtension::instance()->SetSystemAllocator(&a);
+
+  // An allocation size that is likely to trigger the system allocator.
+  // XXX: this is implementation specific.
+  char *p = new char[1024 * 1024];
+  delete [] p;
+
+  // Make sure that our allocator was invoked.
+  CHECK(a.invoked_);
+}
+
+#if 0  // could port this to various OSs, but won't bother for now
+TEST(AddressBits, CpuVirtualBits) {
+  // Check that kAddressBits is as least as large as either the number of bits
+  // in a pointer or as the number of virtual bits handled by the processor.
+  // To be effective this test must be run on each processor model.
+  const int kPointerBits = 8 * sizeof(void*);
+  const int kImplementedVirtualBits = NumImplementedVirtualBits();
+
+  CHECK_GE(kAddressBits, std::min(kImplementedVirtualBits, kPointerBits));
+}
+#endif
+
+static void TestBasicRetryFailTest() {
+  // Check that the allocator still works after a failed allocation.
+  //
+  // There is no way to call malloc and guarantee it will fail.  malloc takes a
+  // size_t parameter and the C++ standard does not constrain the size of
+  // size_t.  For example, consider an implementation where size_t is 32 bits
+  // and pointers are 64 bits.
+  //
+  // It is likely, though, that sizeof(size_t) == sizeof(void*).  In that case,
+  // the first allocation here might succeed but the second allocation must
+  // fail.
+  //
+  // If the second allocation succeeds, you will have to rewrite or
+  // disable this test.
+  // The weird parens are to avoid macro-expansion of 'max' on windows.
+  const size_t kHugeSize = (std::numeric_limits<size_t>::max)() / 2;
+  void* p1 = malloc(kHugeSize);
+  void* p2 = malloc(kHugeSize);
+  CHECK(p2 == NULL);
+  if (p1 != NULL) free(p1);
+
+  char* q = new char[1024];
+  CHECK(q != NULL);
+  delete [] q;
+}
+
+int main(int argc, char** argv) {
+  TestBasicInvoked();
+  TestBasicRetryFailTest();
+
+  printf("PASS\n");
+  return 0;
+}
diff --git a/src/tests/tcmalloc_large_unittest.cc b/src/tests/tcmalloc_large_unittest.cc
new file mode 100644
index 0000000..ff22007
--- /dev/null
+++ b/src/tests/tcmalloc_large_unittest.cc
@@ -0,0 +1,138 @@
+// -*- Mode: C++; c-basic-offset: 2; indent-tabs-mode: nil -*-
+// Copyright (c) 2005, Google Inc.
+// All rights reserved.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+// 
+//     * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+//     * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+// 
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+// ---
+// Author: Michael Chastain
+//
+// This is a unit test for large allocations in malloc and friends.
+// "Large" means "so large that they overflow the address space".
+// For 32 bits, this means allocations near 2^32 bytes and 2^31 bytes.
+// For 64 bits, this means allocations near 2^64 bytes and 2^63 bytes.
+
+#include <stddef.h>                     // for size_t, NULL
+#include <stdlib.h>                     // for malloc, free, realloc
+#include <stdio.h>
+#include <set>                          // for set, etc
+
+#include "base/logging.h"               // for operator<<, CHECK, etc
+
+using std::set;
+
+// Alloc a size that should always fail.
+
+void TryAllocExpectFail(size_t size) {
+  void* p1 = malloc(size);
+  CHECK(p1 == NULL);
+
+  void* p2 = malloc(1);
+  CHECK(p2 != NULL);
+
+  void* p3 = realloc(p2, size);
+  CHECK(p3 == NULL);
+
+  free(p2);
+}
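+
+// A note on the sizes passed in from main() below (illustrative only): with
+// size_t arithmetic, "zero - i" wraps around, so on a 64-bit system
+// zero - 1 == 0xFFFFFFFFFFFFFFFF.  No allocator can satisfy such a request,
+// which is why both the malloc() and the realloc() above must return NULL.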
+
+// Alloc a size that might work and might fail.
+// If it does work, touch some pages.
+
+void TryAllocMightFail(size_t size) {
+  unsigned char* p = static_cast<unsigned char*>(malloc(size));
+  if ( p != NULL ) {
+    unsigned char volatile* vp = p;  // prevent optimizations
+    static const size_t kPoints = 1024;
+
+    for ( size_t i = 0; i < kPoints; ++i ) {
+      vp[i * (size / kPoints)] = static_cast<unsigned char>(i);
+    }
+
+    for ( size_t i = 0; i < kPoints; ++i ) {
+      CHECK(vp[i * (size / kPoints)] == static_cast<unsigned char>(i));
+    }
+
+    vp[size-1] = 'M';
+    CHECK(vp[size-1] == 'M');
+  }
+
+  free(p);
+}
+
+int main (int argc, char** argv) {
+  // Allocate some 0-byte objects.  They better be unique.
+  // 0 bytes is not large but it exercises some paths related to
+  // large-allocation code.
+  {
+    static const int kZeroTimes = 1024;
+    printf("Test malloc(0) x %d\n", kZeroTimes);
+    set<char*> p_set;
+    for ( int i = 0; i < kZeroTimes; ++i ) {
+      char* p = new char;
+      CHECK(p != NULL);
+      CHECK(p_set.find(p) == p_set.end());
+      p_set.insert(p_set.end(), p);
+    }
+    // Just leak the memory.
+  }
+
+  // Grab some memory so that some later allocations are guaranteed to fail.
+  printf("Test small malloc\n");
+  void* p_small = malloc(4*1048576);
+  CHECK(p_small != NULL);
+
+  // Test sizes up near the maximum size_t.
+  // These allocations test the wrap-around code.
+  printf("Test malloc(0 - N)\n");
+  const size_t zero = 0;
+  static const size_t kMinusNTimes = 16384;
+  for ( size_t i = 1; i < kMinusNTimes; ++i ) {
+    TryAllocExpectFail(zero - i);
+  }
+
+  // Test sizes a bit smaller.
+  // The small malloc above guarantees that all these return NULL.
+  printf("Test malloc(0 - 1048576 - N)\n");
+  static const size_t kMinusMBMinusNTimes = 16384;
+  for ( size_t i = 0; i < kMinusMBMinusNTimes; ++i) {
+    TryAllocExpectFail(zero - 1048576 - i);
+  }
+
+  // Test sizes at half of size_t.
+  // These might or might not fail to allocate.
+  printf("Test malloc(max/2 +- N)\n");
+  static const size_t kHalfPlusMinusTimes = 64;
+  const size_t half = (zero - 2) / 2 + 1;
+  for ( size_t i = 0; i < kHalfPlusMinusTimes; ++i) {
+    TryAllocMightFail(half - i);
+    TryAllocMightFail(half + i);
+  }
+
+  printf("PASS\n");
+  return 0;
+}
diff --git a/src/tests/tcmalloc_unittest.cc b/src/tests/tcmalloc_unittest.cc
new file mode 100644
index 0000000..69698bc
--- /dev/null
+++ b/src/tests/tcmalloc_unittest.cc
@@ -0,0 +1,1427 @@
+// -*- Mode: C++; c-basic-offset: 2; indent-tabs-mode: nil -*-
+// Copyright (c) 2005, Google Inc.
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+//     * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+//     * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+// ---
+// Unittest for the TCMalloc implementation.
+//
+// * The test consists of a set of threads.
+// * Each thread maintains a set of allocated objects, with
+//   a bound on the total amount of data in the set.
+// * Each allocated object's contents are generated by
+//   hashing the object pointer, and a generation count
+//   in the object.  This allows us to easily check for
+//   data corruption.
+// * At any given step, the thread can do any of the following:
+//     a. Allocate an object
+//     b. Increment an object's generation count and update
+//        its contents.
+//     c. Pass the object to another thread
+//     d. Free an object
+//   Also, at the end of every step, object(s) are freed to maintain
+//   the memory upper-bound.
+//
+// If this test is compiled with -DDEBUGALLOCATION, then we don't
+// run some tests that test the inner workings of tcmalloc and
+// break on debugallocation: that certain allocations are aligned
+// in a certain way (even though no standard requires it), and that
+// realloc() tries to minimize copying (which debug allocators don't
+// care about).
+
+#include "config_for_unittests.h"
+// Complicated ordering requirements.  tcmalloc.h defines (indirectly)
+// _POSIX_C_SOURCE, which it needs so stdlib.h defines posix_memalign.
+// unistd.h, on the other hand, requires _POSIX_C_SOURCE to be unset,
+// at least on FreeBSD, in order to define sbrk.  The solution
+// is to #include unistd.h first.  This is safe because unistd.h
+// doesn't sub-include stdlib.h, so we'll still get posix_memalign
+// when we #include stdlib.h.  Blah.
+#ifdef HAVE_UNISTD_H
+#include <unistd.h>        // for testing sbrk hooks
+#endif
+#include "tcmalloc.h"      // must come early, to pick up posix_memalign
+#include <stdlib.h>
+#include <string.h>
+#include <stdio.h>
+#if defined HAVE_STDINT_H
+#include <stdint.h>        // for intptr_t
+#endif
+#include <sys/types.h>     // for size_t
+#ifdef HAVE_FCNTL_H
+#include <fcntl.h>         // for open; used with mmap-hook test
+#endif
+#ifdef HAVE_MMAP
+#include <sys/mman.h>      // for testing mmap hooks
+#endif
+#ifdef HAVE_MALLOC_H
+#include <malloc.h>        // defines pvalloc/etc on cygwin
+#endif
+#include <assert.h>
+#include <vector>
+#include <algorithm>
+#include <string>
+#include <new>
+#include "base/logging.h"
+#include "base/simple_mutex.h"
+#include "gperftools/malloc_hook.h"
+#include "gperftools/malloc_extension.h"
+#include "gperftools/tcmalloc.h"
+#include "thread_cache.h"
+#include "system-alloc.h"
+#include "tests/testutil.h"
+
+// Windows doesn't define pvalloc and a few other obsolete unix
+// functions; nor does it define posix_memalign (which is not obsolete).
+#if defined(_WIN32)
+# define cfree free         // don't bother to try to test these obsolete fns
+# define valloc malloc
+# define pvalloc malloc
+// I'd like to map posix_memalign to _aligned_malloc, but _aligned_malloc
+// must be paired with _aligned_free (not normal free), which is too
+// invasive a change to how we allocate memory here.  So just bail
+static bool kOSSupportsMemalign = false;
+static inline void* Memalign(size_t align, size_t size) {
+  //LOG(FATAL) << "memalign not supported on windows";
+  exit(1);
+  return NULL;
+}
+static inline int PosixMemalign(void** ptr, size_t align, size_t size) {
+  //LOG(FATAL) << "posix_memalign not supported on windows";
+  exit(1);
+  return -1;
+}
+
+// OS X defines posix_memalign in some OS versions but not others;
+// it's confusing enough to check that it's easiest to just not test it.
+#elif defined(__APPLE__)
+static bool kOSSupportsMemalign = false;
+static inline void* Memalign(size_t align, size_t size) {
+  //LOG(FATAL) << "memalign not supported on OS X";
+  exit(1);
+  return NULL;
+}
+static inline int PosixMemalign(void** ptr, size_t align, size_t size) {
+  //LOG(FATAL) << "posix_memalign not supported on OS X";
+  exit(1);
+  return -1;
+}
+
+#else
+static bool kOSSupportsMemalign = true;
+static inline void* Memalign(size_t align, size_t size) {
+  return memalign(align, size);
+}
+static inline int PosixMemalign(void** ptr, size_t align, size_t size) {
+  return posix_memalign(ptr, align, size);
+}
+
+#endif
+
+// On systems (like freebsd) that don't define MAP_ANONYMOUS, use the old
+// form of the name instead.
+#ifndef MAP_ANONYMOUS
+# define MAP_ANONYMOUS MAP_ANON
+#endif
+
+#define LOGSTREAM   stdout
+
+using std::vector;
+using std::string;
+
+DECLARE_double(tcmalloc_release_rate);
+DECLARE_int32(max_free_queue_size);     // in debugallocation.cc
+DECLARE_int64(tcmalloc_sample_parameter);
+
+namespace testing {
+
+static const int FLAGS_numtests = 50000;
+static const int FLAGS_log_every_n_tests = 50000; // log exactly once
+
+// Testing parameters
+static const int FLAGS_lgmaxsize = 16;   // lg() of the max size object to alloc
+static const int FLAGS_numthreads = 10;  // Number of threads
+static const int FLAGS_threadmb = 4;     // Max memory size allocated by thread
+static const int FLAGS_lg_max_memalign = 18; // lg of max alignment for memalign
+
+static const double FLAGS_memalign_min_fraction = 0;    // min expected%
+static const double FLAGS_memalign_max_fraction = 0.4;  // max expected%
+static const double FLAGS_memalign_max_alignment_ratio = 6;  // alignment/size
+
+// Weights of different operations
+static const int FLAGS_allocweight = 50;    // Weight for picking allocation
+static const int FLAGS_freeweight = 50;     // Weight for picking free
+static const int FLAGS_updateweight = 10;   // Weight for picking update
+static const int FLAGS_passweight = 1;      // Weight for passing object
+
+static const int kSizeBits = 8 * sizeof(size_t);
+static const size_t kMaxSize = ~static_cast<size_t>(0);
+static const size_t kMaxSignedSize = ((size_t(1) << (kSizeBits-1)) - 1);
+
+static const size_t kNotTooBig = 100000;
+// We want an allocation that is definitely more than main memory.  OS
+// X has special logic to discard very big allocs before even passing
+// the request along to the user-defined memory allocator; we're not
+// interested in testing their logic, so we have to make sure we're
+// not *too* big.
+static const size_t kTooBig = kMaxSize - 100000;
+
+static int news_handled = 0;
+
+// Global array of threads
+class TesterThread;
+static TesterThread** threads;
+
+// To help with generating random numbers
+class TestHarness {
+ private:
+  // Information kept per type
+  struct Type {
+    string      name;
+    int         type;
+    int         weight;
+  };
+
+ public:
+  TestHarness(int seed)
+      : types_(new vector<Type>), total_weight_(0), num_tests_(0) {
+    srandom(seed);
+  }
+  ~TestHarness() {
+    delete types_;
+  }
+
+  // Add operation type with specified weight.  When starting a new
+  // iteration, an operation type is picked with probability
+  // proportional to its weight.
+  //
+  // "type" must be non-negative.
+  // "weight" must be non-negative.
+  void AddType(int type, int weight, const char* name);
+
+  // Call this to get the type of operation for the next iteration.
+  // It returns a random operation type from the set of registered
+  // operations.  Returns -1 if tests should finish.
+  int PickType();
+
+  // If n == 0, returns the next pseudo-random number in the range [0 .. 0]
+  // If n != 0, returns the next pseudo-random number in the range [0 .. n)
+  int Uniform(int n) {
+    if (n == 0) {
+      return random() * 0;
+    } else {
+      return random() % n;
+    }
+  }
+  // Pick "base" uniformly from range [0,max_log] and then return
+  // "base" random bits.  The effect is to pick a number in the range
+  // [0,2^max_log-1] with bias towards smaller numbers.
+  int Skewed(int max_log) {
+    const int base = random() % (max_log+1);
+    return random() % (1 << base);
+  }
+
+ private:
+  vector<Type>*         types_;         // Registered types
+  int                   total_weight_;  // Total weight of all types
+  int                   num_tests_;     // Num tests run so far
+};
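+
+// Worked example of Skewed() above (illustrative only): Skewed(10) first
+// picks base uniformly from {0,...,10} and then returns a value in
+// [0, 2^base).  About 1 call in 11 (base == 0) always returns 0, and only
+// about 1 call in 11 (base == 10) can reach values of 512 or more, which is
+// the bias toward small numbers described in the comment.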
+
+void TestHarness::AddType(int type, int weight, const char* name) {
+  Type t;
+  t.name = name;
+  t.type = type;
+  t.weight = weight;
+  types_->push_back(t);
+  total_weight_ += weight;
+}
+
+int TestHarness::PickType() {
+  if (num_tests_ >= FLAGS_numtests) return -1;
+  num_tests_++;
+
+  assert(total_weight_ > 0);
+  // This is a little skewed if total_weight_ doesn't divide 2^31, but it's close
+  int v = Uniform(total_weight_);
+  int i;
+  for (i = 0; i < types_->size(); i++) {
+    v -= (*types_)[i].weight;
+    if (v < 0) {
+      break;
+    }
+  }
+
+  assert(i < types_->size());
+  if ((num_tests_ % FLAGS_log_every_n_tests) == 0) {
+    fprintf(LOGSTREAM, "  Test %d out of %d: %s\n",
+            num_tests_, FLAGS_numtests, (*types_)[i].name.c_str());
+  }
+  return (*types_)[i].type;
+}
+
+class AllocatorState : public TestHarness {
+ public:
+  explicit AllocatorState(int seed) : TestHarness(seed), memalign_fraction_(0) {
+    if (kOSSupportsMemalign) {
+      CHECK_GE(FLAGS_memalign_max_fraction, 0);
+      CHECK_LE(FLAGS_memalign_max_fraction, 1);
+      CHECK_GE(FLAGS_memalign_min_fraction, 0);
+      CHECK_LE(FLAGS_memalign_min_fraction, 1);
+      double delta = FLAGS_memalign_max_fraction - FLAGS_memalign_min_fraction;
+      CHECK_GE(delta, 0);
+      memalign_fraction_ = (Uniform(10000)/10000.0 * delta +
+                            FLAGS_memalign_min_fraction);
+      //fprintf(LOGSTREAM, "memalign fraction: %f\n", memalign_fraction_);
+    }
+  }
+  virtual ~AllocatorState() {}
+
+  // Allocate memory.  Randomly choose between malloc() or posix_memalign().
+  void* alloc(size_t size) {
+    if (Uniform(100) < memalign_fraction_ * 100) {
+      // Try a few times to find a reasonable alignment, or fall back on malloc.
+      for (int i = 0; i < 5; i++) {
+        size_t alignment = 1 << Uniform(FLAGS_lg_max_memalign);
+        if (alignment >= sizeof(intptr_t) &&
+            (size < sizeof(intptr_t) ||
+             alignment < FLAGS_memalign_max_alignment_ratio * size)) {
+          void *result = reinterpret_cast<void*>(static_cast<intptr_t>(0x1234));
+          int err = PosixMemalign(&result, alignment, size);
+          if (err != 0) {
+            CHECK_EQ(err, ENOMEM);
+          }
+          return err == 0 ? result : NULL;
+        }
+      }
+    }
+    return malloc(size);
+  }
+
+ private:
+  double memalign_fraction_;
+};
+
+
+// Info kept per thread
+class TesterThread {
+ private:
+  // Info kept per allocated object
+  struct Object {
+    char*       ptr;                    // Allocated pointer
+    int         size;                   // Allocated size
+    int         generation;             // Generation counter of object contents
+  };
+
+  Mutex                 lock_;          // For passing in another thread's obj
+  int                   id_;            // My thread id
+  AllocatorState        rnd_;           // For generating random numbers
+  vector<Object>        heap_;          // This thread's heap
+  vector<Object>        passed_;        // Pending objects passed from others
+  size_t                heap_size_;     // Current heap size
+  int                   locks_ok_;      // Number of OK TryLock() ops
+  int                   locks_failed_;  // Number of failed TryLock() ops
+
+  // Type of operations
+  enum Type { ALLOC, FREE, UPDATE, PASS };
+
+  // ACM minimal standard random number generator.  (re-entrant.)
+  class ACMRandom {
+    int32 seed_;
+   public:
+    explicit ACMRandom(int32 seed) { seed_ = seed; }
+    int32 Next() {
+      const int32 M = 2147483647L;   // 2^31-1
+      const int32 A = 16807;
+      // In effect, we are computing seed_ = (seed_ * A) % M, where M = 2^31-1
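+      // Splitting seed_ into 16-bit halves keeps each partial product
+      // below 2^31, so the arithmetic stays within 32-bit unsigned range.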
+      uint32 lo = A * (int32)(seed_ & 0xFFFF);
+      uint32 hi = A * (int32)((uint32)seed_ >> 16);
+      lo += (hi & 0x7FFF) << 16;
+      if (lo > M) {
+        lo &= M;
+        ++lo;
+      }
+      lo += hi >> 15;
+      if (lo > M) {
+        lo &= M;
+        ++lo;
+      }
+      return (seed_ = (int32) lo);
+    }
+  };
+
+ public:
+  TesterThread(int id)
+    : id_(id),
+      rnd_(id+1),
+      heap_size_(0),
+      locks_ok_(0),
+      locks_failed_(0) {
+  }
+
+  virtual ~TesterThread() {
+    if (FLAGS_verbose)
+      fprintf(LOGSTREAM, "Thread %2d: locks %6d ok; %6d trylocks failed\n",
+              id_, locks_ok_, locks_failed_);
+    if (locks_ok_ + locks_failed_ >= 1000) {
+      CHECK_LE(locks_failed_, locks_ok_ / 2);
+    }
+  }
+
+  virtual void Run() {
+    rnd_.AddType(ALLOC,  FLAGS_allocweight,   "allocate");
+    rnd_.AddType(FREE,   FLAGS_freeweight,    "free");
+    rnd_.AddType(UPDATE, FLAGS_updateweight,  "update");
+    rnd_.AddType(PASS,   FLAGS_passweight,    "pass");
+
+    while (true) {
+      AcquirePassedObjects();
+
+      switch (rnd_.PickType()) {
+        case ALLOC:   AllocateObject(); break;
+        case FREE:    FreeObject();     break;
+        case UPDATE:  UpdateObject();   break;
+        case PASS:    PassObject();     break;
+        case -1:      goto done;
+        default:      assert(NULL == "Unknown type");
+      }
+
+      ShrinkHeap();
+    }
+
+ done:
+    DeleteHeap();
+  }
+
+  // Allocate a new object
+  void AllocateObject() {
+    Object object;
+    object.size = rnd_.Skewed(FLAGS_lgmaxsize);
+    object.ptr = static_cast<char*>(rnd_.alloc(object.size));
+    CHECK(object.ptr);
+    object.generation = 0;
+    FillContents(&object);
+    heap_.push_back(object);
+    heap_size_ += object.size;
+  }
+
+  // Mutate a random object
+  void UpdateObject() {
+    if (heap_.empty()) return;
+    const int index = rnd_.Uniform(heap_.size());
+    CheckContents(heap_[index]);
+    heap_[index].generation++;
+    FillContents(&heap_[index]);
+  }
+
+  // Free a random object
+  void FreeObject() {
+    if (heap_.empty()) return;
+    const int index = rnd_.Uniform(heap_.size());
+    Object object = heap_[index];
+    CheckContents(object);
+    free(object.ptr);
+    heap_size_ -= object.size;
+    heap_[index] = heap_[heap_.size()-1];
+    heap_.pop_back();
+  }
+
+  // Delete all objects in the heap
+  void DeleteHeap() {
+    while (!heap_.empty()) {
+      FreeObject();
+    }
+  }
+
+  // Free objects until our heap is small enough
+  void ShrinkHeap() {
+    while (heap_size_ > FLAGS_threadmb << 20) {
+      assert(!heap_.empty());
+      FreeObject();
+    }
+  }
+
+  // Pass a random object to another thread
+  void PassObject() {
+    // Pick object to pass
+    if (heap_.empty()) return;
+    const int index = rnd_.Uniform(heap_.size());
+    Object object = heap_[index];
+    CheckContents(object);
+
+    // Pick thread to pass
+    const int tid = rnd_.Uniform(FLAGS_numthreads);
+    TesterThread* thread = threads[tid];
+
+    if (thread->lock_.TryLock()) {
+      // Pass the object
+      locks_ok_++;
+      thread->passed_.push_back(object);
+      thread->lock_.Unlock();
+      heap_size_ -= object.size;
+      heap_[index] = heap_[heap_.size()-1];
+      heap_.pop_back();
+    } else {
+      locks_failed_++;
+    }
+  }
+
+  // Grab any objects passed to this thread by another thread
+  void AcquirePassedObjects() {
+    // To avoid creating unnecessary contention we only ever use
+    // TryLock() here, and we unlock immediately after swapping the
+    // passed objects into a local vector.
+    vector<Object> copy;
+    { // Locking scope
+      if (!lock_.TryLock()) {
+        locks_failed_++;
+        return;
+      }
+      locks_ok_++;
+      swap(copy, passed_);
+      lock_.Unlock();
+    }
+
+    for (int i = 0; i < copy.size(); ++i) {
+      const Object& object = copy[i];
+      CheckContents(object);
+      heap_.push_back(object);
+      heap_size_ += object.size;
+    }
+  }
+
+  // Fill object contents according to ptr/generation
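+  // The bytes written are a deterministic function of the object's
+  // address and generation, so CheckContents() below can verify that
+  // nothing else has scribbled over the object in the meantime.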
+  void FillContents(Object* object) {
+    ACMRandom r(reinterpret_cast<intptr_t>(object->ptr) & 0x7fffffff);
+    for (int i = 0; i < object->generation; ++i) {
+      r.Next();
+    }
+    const char c = static_cast<char>(r.Next());
+    memset(object->ptr, c, object->size);
+  }
+
+  // Check object contents
+  void CheckContents(const Object& object) {
+    ACMRandom r(reinterpret_cast<intptr_t>(object.ptr) & 0x7fffffff);
+    for (int i = 0; i < object.generation; ++i) {
+      r.Next();
+    }
+
+    // For large objects, we just check a prefix/suffix
+    const char expected = static_cast<char>(r.Next());
+    const int limit1 = object.size < 32 ? object.size : 32;
+    const int start2 = limit1 > object.size - 32 ? limit1 : object.size - 32;
+    for (int i = 0; i < limit1; ++i) {
+      CHECK_EQ(object.ptr[i], expected);
+    }
+    for (int i = start2; i < object.size; ++i) {
+      CHECK_EQ(object.ptr[i], expected);
+    }
+  }
+};
+
+static void RunThread(int thread_id) {
+  threads[thread_id]->Run();
+}
+
+static void TryHugeAllocation(size_t s, AllocatorState* rnd) {
+  void* p = rnd->alloc(s);
+  CHECK(p == NULL);   // a huge allocation of size s should fail!
+}
+
+static void TestHugeAllocations(AllocatorState* rnd) {
+  // Check that asking for something a tiny bit smaller than the largest
+  // possible size returns NULL.
+  for (size_t i = 0; i < 70000; i += rnd->Uniform(20)) {
+    TryHugeAllocation(kMaxSize - i, rnd);
+  }
+  // Asking for memory sizes near signed/unsigned boundary (kMaxSignedSize)
+  // might work or not, depending on the amount of virtual memory.
+#ifndef DEBUGALLOCATION    // debug allocation takes forever for huge allocs
+  for (size_t i = 0; i < 100; i++) {
+    void* p = NULL;
+    p = rnd->alloc(kMaxSignedSize + i);
+    if (p) free(p);    // if: free(NULL) is not necessarily defined
+    p = rnd->alloc(kMaxSignedSize - i);
+    if (p) free(p);
+  }
+#endif
+
+  // Check that ReleaseFreeMemory has no visible effect (aka, does not
+  // crash the test):
+  MallocExtension* inst = MallocExtension::instance();
+  CHECK(inst);
+  inst->ReleaseFreeMemory();
+}
+
+static void TestCalloc(size_t n, size_t s, bool ok) {
+  char* p = reinterpret_cast<char*>(calloc(n, s));
+  if (FLAGS_verbose)
+    fprintf(LOGSTREAM, "calloc(%" PRIxS ", %" PRIxS "): %p\n", n, s, p);
+  if (!ok) {
+    CHECK(p == NULL);  // calloc(n, s) should not succeed
+  } else {
+    CHECK(p != NULL);  // calloc(n, s) should succeed
+    for (int i = 0; i < n*s; i++) {
+      CHECK(p[i] == '\0');
+    }
+    free(p);
+  }
+}
+
+// This makes sure that reallocing a small number of bytes in either
+// direction doesn't cause us to allocate new memory.
+static void TestRealloc() {
+#ifndef DEBUGALLOCATION  // debug alloc doesn't try to minimize reallocs
+  // When sampling, we always allocate in units of page-size, which
+  // makes reallocs of small sizes do extra work (thus, failing these
+  // checks).  Since sampling is random, we turn off sampling to make
+  // sure that doesn't happen to us here.
+  const int64 old_sample_parameter = FLAGS_tcmalloc_sample_parameter;
+  FLAGS_tcmalloc_sample_parameter = 0;   // turn off sampling
+
+  int start_sizes[] = { 100, 1000, 10000, 100000 };
+  int deltas[] = { 1, -2, 4, -8, 16, -32, 64, -128 };
+
+  for (int s = 0; s < sizeof(start_sizes)/sizeof(*start_sizes); ++s) {
+    void* p = malloc(start_sizes[s]);
+    CHECK(p);
+    // The larger the start-size, the larger the non-reallocing delta.
+    for (int d = 0; d < (s+1) * 2; ++d) {
+      void* new_p = realloc(p, start_sizes[s] + deltas[d]);
+      CHECK(p == new_p);  // realloc should not allocate new memory
+    }
+    // Test again, but this time reallocing smaller first.
+    for (int d = 0; d < s*2; ++d) {
+      void* new_p = realloc(p, start_sizes[s] - deltas[d]);
+      CHECK(p == new_p);  // realloc should not allocate new memory
+    }
+    free(p);
+  }
+  FLAGS_tcmalloc_sample_parameter = old_sample_parameter;
+#endif
+}
+
+static void TestNewHandler() throw (std::bad_alloc) {
+  ++news_handled;
+  throw std::bad_alloc();
+}
+
+static void TestOneNew(void* (*func)(size_t)) {
+  // success test
+  try {
+    void* ptr = (*func)(kNotTooBig);
+    if (0 == ptr) {
+      fprintf(LOGSTREAM, "allocation should not have failed.\n");
+      abort();
+    }
+  } catch (...) {
+    fprintf(LOGSTREAM, "allocation threw unexpected exception.\n");
+    abort();
+  }
+
+  // failure test
+  // we should always receive a bad_alloc exception
+  try {
+    (*func)(kTooBig);
+    fprintf(LOGSTREAM, "allocation should have failed.\n");
+    abort();
+  } catch (const std::bad_alloc&) {
+    // correct
+  } catch (...) {
+    fprintf(LOGSTREAM, "allocation threw unexpected exception.\n");
+    abort();
+  }
+}
+
+static void TestNew(void* (*func)(size_t)) {
+  news_handled = 0;
+
+  // test without new_handler:
+  std::new_handler saved_handler = std::set_new_handler(0);
+  TestOneNew(func);
+
+  // test with new_handler:
+  std::set_new_handler(TestNewHandler);
+  TestOneNew(func);
+  if (news_handled != 1) {
+    fprintf(LOGSTREAM, "new_handler was not called.\n");
+    abort();
+  }
+  std::set_new_handler(saved_handler);
+}
+
+static void TestOneNothrowNew(void* (*func)(size_t, const std::nothrow_t&)) {
+  // success test
+  try {
+    void* ptr = (*func)(kNotTooBig, std::nothrow);
+    if (0 == ptr) {
+      fprintf(LOGSTREAM, "allocation should not have failed.\n");
+      abort();
+    }
+  } catch (...) {
+    fprintf(LOGSTREAM, "allocation threw unexpected exception.\n");
+    abort();
+  }
+
+  // failure test
+  // the allocation should fail by returning NULL (nothrow new must not throw)
+  try {
+    if ((*func)(kTooBig, std::nothrow) != 0) {
+      fprintf(LOGSTREAM, "allocation should have failed.\n");
+      abort();
+    }
+  } catch (...) {
+    fprintf(LOGSTREAM, "nothrow allocation threw unexpected exception.\n");
+    abort();
+  }
+}
+
+static void TestNothrowNew(void* (*func)(size_t, const std::nothrow_t&)) {
+  news_handled = 0;
+
+  // test without new_handler:
+  std::new_handler saved_handler = std::set_new_handler(0);
+  TestOneNothrowNew(func);
+
+  // test with new_handler:
+  std::set_new_handler(TestNewHandler);
+  TestOneNothrowNew(func);
+  if (news_handled != 1) {
+    fprintf(LOGSTREAM, "nothrow new_handler was not called.\n");
+    abort();
+  }
+  std::set_new_handler(saved_handler);
+}
+
+
+// These are used as callbacks by the sanity-check.  Set* and Reset*
+// register the hook that counts how many times the associated memory
+// function is called.  After each such call, call Verify* to verify
+// that we used the tcmalloc version of the call, and not the libc.
+// Note the ... in the hook signature: we don't care what arguments
+// the hook takes.
+#define MAKE_HOOK_CALLBACK(hook_type)                                   \
+  static volatile int g_##hook_type##_calls = 0;                                 \
+  static void IncrementCallsTo##hook_type(...) {                        \
+    g_##hook_type##_calls++;                                            \
+  }                                                                     \
+  static void Verify##hook_type##WasCalled() {                          \
+    CHECK_GT(g_##hook_type##_calls, 0);                                 \
+    g_##hook_type##_calls = 0;  /* reset for next call */               \
+  }                                                                     \
+  static void Set##hook_type() {                                        \
+    CHECK(MallocHook::Add##hook_type(                                   \
+        (MallocHook::hook_type)&IncrementCallsTo##hook_type));          \
+  }                                                                     \
+  static void Reset##hook_type() {                                      \
+    CHECK(MallocHook::Remove##hook_type(                                \
+        (MallocHook::hook_type)&IncrementCallsTo##hook_type));          \
+  }
+
+// We do one for each hook typedef in malloc_hook.h
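+// For instance, MAKE_HOOK_CALLBACK(NewHook) defines g_NewHook_calls,
+// IncrementCallsToNewHook(), VerifyNewHookWasCalled(), SetNewHook() and
+// ResetNewHook().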
+MAKE_HOOK_CALLBACK(NewHook);
+MAKE_HOOK_CALLBACK(DeleteHook);
+MAKE_HOOK_CALLBACK(MmapHook);
+MAKE_HOOK_CALLBACK(MremapHook);
+MAKE_HOOK_CALLBACK(MunmapHook);
+MAKE_HOOK_CALLBACK(SbrkHook);
+
+static void TestAlignmentForSize(int size) {
+  fprintf(LOGSTREAM, "Testing alignment of malloc(%d)\n", size);
+  static const int kNum = 100;
+  void* ptrs[kNum];
+  for (int i = 0; i < kNum; i++) {
+    ptrs[i] = malloc(size);
+    uintptr_t p = reinterpret_cast<uintptr_t>(ptrs[i]);
+    CHECK((p % sizeof(void*)) == 0);
+    CHECK((p % sizeof(double)) == 0);
+
+    // Must have 16-byte (or 8-byte in case of -DTCMALLOC_ALIGN_8BYTES)
+    // alignment for large enough objects
+    if (size >= kMinAlign) {
+      CHECK((p % kMinAlign) == 0);
+    }
+  }
+  for (int i = 0; i < kNum; i++) {
+    free(ptrs[i]);
+  }
+}
+
+static void TestMallocAlignment() {
+  for (int lg = 0; lg < 16; lg++) {
+    TestAlignmentForSize((1<<lg) - 1);
+    TestAlignmentForSize((1<<lg) + 0);
+    TestAlignmentForSize((1<<lg) + 1);
+  }
+}
+
+static void TestHugeThreadCache() {
+  fprintf(LOGSTREAM, "==== Testing huge thread cache\n");
+  // More than 2^16 to cause integer overflow of 16 bit counters.
+  static const int kNum = 70000;
+  char** array = new char*[kNum];
+  for (int i = 0; i < kNum; ++i) {
+    array[i] = new char[10];
+  }
+  for (int i = 0; i < kNum; ++i) {
+    delete[] array[i];
+  }
+  delete[] array;
+}
+
+namespace {
+
+struct RangeCallbackState {
+  uintptr_t ptr;
+  base::MallocRange::Type expected_type;
+  size_t min_size;
+  bool matched;
+};
+
+static void RangeCallback(void* arg, const base::MallocRange* r) {
+  RangeCallbackState* state = reinterpret_cast<RangeCallbackState*>(arg);
+  if (state->ptr >= r->address &&
+      state->ptr < r->address + r->length) {
+    if (state->expected_type == base::MallocRange::FREE) {
+      // We are expecting r->type == FREE, but ReleaseMemory
+      // may have already moved us to UNMAPPED state instead (this happens in
+      // approximately 0.1% of executions). Accept either state.
+      CHECK(r->type == base::MallocRange::FREE ||
+            r->type == base::MallocRange::UNMAPPED);
+    } else {
+      CHECK_EQ(r->type, state->expected_type);
+    }
+    CHECK_GE(r->length, state->min_size);
+    state->matched = true;
+  }
+}
+
+// Check that at least one of the callbacks from Ranges() contains
+// the specified address with the specified type, and has size
+// >= min_size.
+static void CheckRangeCallback(void* ptr, base::MallocRange::Type type,
+                               size_t min_size) {
+  RangeCallbackState state;
+  state.ptr = reinterpret_cast<uintptr_t>(ptr);
+  state.expected_type = type;
+  state.min_size = min_size;
+  state.matched = false;
+  MallocExtension::instance()->Ranges(&state, RangeCallback);
+  CHECK(state.matched);
+}
+
+}  // namespace
+
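+// Whether the underlying system allocator can return memory to the OS,
+// determined once by allocating and then releasing a single page.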
+static bool HaveSystemRelease =
+    TCMalloc_SystemRelease(TCMalloc_SystemAlloc(kPageSize, NULL, 0), kPageSize);
+
+static void TestRanges() {
+  static const int MB = 1048576;
+  void* a = malloc(MB);
+  void* b = malloc(MB);
+  base::MallocRange::Type releasedType =
+      HaveSystemRelease ? base::MallocRange::UNMAPPED : base::MallocRange::FREE;
+
+  CheckRangeCallback(a, base::MallocRange::INUSE, MB);
+  CheckRangeCallback(b, base::MallocRange::INUSE, MB);
+  free(a);
+  CheckRangeCallback(a, base::MallocRange::FREE, MB);
+  CheckRangeCallback(b, base::MallocRange::INUSE, MB);
+  MallocExtension::instance()->ReleaseFreeMemory();
+  CheckRangeCallback(a, releasedType, MB);
+  CheckRangeCallback(b, base::MallocRange::INUSE, MB);
+  free(b);
+  CheckRangeCallback(a, releasedType, MB);
+  CheckRangeCallback(b, base::MallocRange::FREE, MB);
+}
+
+#ifndef DEBUGALLOCATION
+static size_t GetUnmappedBytes() {
+  size_t bytes;
+  CHECK(MallocExtension::instance()->GetNumericProperty(
+      "tcmalloc.pageheap_unmapped_bytes", &bytes));
+  return bytes;
+}
+#endif
+
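+// RAII helper: sets tcmalloc.aggressive_memory_decommit to the given
+// value for the lifetime of the object and restores the old value when
+// the object is destroyed.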
+class AggressiveDecommitChanger {
+  size_t old_value_;
+public:
+  AggressiveDecommitChanger(size_t new_value) {
+    MallocExtension *inst = MallocExtension::instance();
+    bool rv = inst->GetNumericProperty("tcmalloc.aggressive_memory_decommit", &old_value_);
+    CHECK_CONDITION(rv);
+    rv = inst->SetNumericProperty("tcmalloc.aggressive_memory_decommit", new_value);
+    CHECK_CONDITION(rv);
+  }
+  ~AggressiveDecommitChanger() {
+    MallocExtension *inst = MallocExtension::instance();
+    bool rv = inst->SetNumericProperty("tcmalloc.aggressive_memory_decommit", old_value_);
+    CHECK_CONDITION(rv);
+  }
+};
+
+static void TestReleaseToSystem() {
+  // Debug allocation mode adds overhead to each allocation which
+  // messes up all the equality tests here.  I just disable the
+  // test in this mode.  TODO(csilvers): get it to work for debugalloc?
+#ifndef DEBUGALLOCATION
+
+  if(!HaveSystemRelease) return;
+
+  const double old_tcmalloc_release_rate = FLAGS_tcmalloc_release_rate;
+  FLAGS_tcmalloc_release_rate = 0;
+
+  AggressiveDecommitChanger disabler(0);
+
+  static const int MB = 1048576;
+  void* a = malloc(MB);
+  void* b = malloc(MB);
+  MallocExtension::instance()->ReleaseFreeMemory();
+  size_t starting_bytes = GetUnmappedBytes();
+
+  // Calling ReleaseFreeMemory() a second time shouldn't do anything.
+  MallocExtension::instance()->ReleaseFreeMemory();
+  EXPECT_EQ(starting_bytes, GetUnmappedBytes());
+
+  // ReleaseToSystem shouldn't do anything either.
+  MallocExtension::instance()->ReleaseToSystem(MB);
+  EXPECT_EQ(starting_bytes, GetUnmappedBytes());
+
+  free(a);
+
+  // The span to release should be 1MB.
+  MallocExtension::instance()->ReleaseToSystem(MB/2);
+  EXPECT_EQ(starting_bytes + MB, GetUnmappedBytes());
+
+  // Should do nothing since the previous call released too much.
+  MallocExtension::instance()->ReleaseToSystem(MB/4);
+  EXPECT_EQ(starting_bytes + MB, GetUnmappedBytes());
+
+  free(b);
+
+  // Use up the extra MB/4 bytes from 'a' and also release 'b'.
+  MallocExtension::instance()->ReleaseToSystem(MB/2);
+  EXPECT_EQ(starting_bytes + 2*MB, GetUnmappedBytes());
+
+  // Should do nothing since the previous call released too much.
+  MallocExtension::instance()->ReleaseToSystem(MB/2);
+  EXPECT_EQ(starting_bytes + 2*MB, GetUnmappedBytes());
+
+  // Nothing else to release.
+  MallocExtension::instance()->ReleaseFreeMemory();
+  EXPECT_EQ(starting_bytes + 2*MB, GetUnmappedBytes());
+
+  a = malloc(MB);
+  free(a);
+  EXPECT_EQ(starting_bytes + MB, GetUnmappedBytes());
+
+  // Releasing less than a page should still trigger a release.
+  MallocExtension::instance()->ReleaseToSystem(1);
+  EXPECT_EQ(starting_bytes + 2*MB, GetUnmappedBytes());
+
+  FLAGS_tcmalloc_release_rate = old_tcmalloc_release_rate;
+#endif   // #ifndef DEBUGALLOCATION
+}
+
+static void TestAggressiveDecommit() {
+  // Debug allocation mode adds overhead to each allocation which
+  // messes up all the equality tests here.  I just disable the
+  // test in this mode.
+#ifndef DEBUGALLOCATION
+
+  if(!HaveSystemRelease) return;
+
+  fprintf(LOGSTREAM, "Testing aggressive de-commit\n");
+
+  AggressiveDecommitChanger enabler(1);
+
+  static const int MB = 1048576;
+  void* a = malloc(MB);
+  void* b = malloc(MB);
+
+  size_t starting_bytes = GetUnmappedBytes();
+
+  // ReleaseToSystem shouldn't do anything here.
+  MallocExtension::instance()->ReleaseToSystem(MB);
+  EXPECT_EQ(starting_bytes, GetUnmappedBytes());
+
+  free(a);
+
+  // The span to release should be 1MB.
+  EXPECT_EQ(starting_bytes + MB, GetUnmappedBytes());
+
+  free(b);
+
+  EXPECT_EQ(starting_bytes + 2*MB, GetUnmappedBytes());
+
+  // Nothing else to release.
+  MallocExtension::instance()->ReleaseFreeMemory();
+  EXPECT_EQ(starting_bytes + 2*MB, GetUnmappedBytes());
+
+  a = malloc(MB);
+  free(a);
+
+  EXPECT_EQ(starting_bytes + 2*MB, GetUnmappedBytes());
+
+  fprintf(LOGSTREAM, "Done testing aggressive de-commit\n");
+
+#endif   // #ifndef DEBUGALLOCATION
+}
+
+// On MSVC10, in release mode, the optimizer convinces itself
+// g_no_memory is never changed (I guess it doesn't realize OnNoMemory
+// might be called).  Work around this by making the variable volatile.
+volatile bool g_no_memory = false;
+std::new_handler g_old_handler = NULL;
+static void OnNoMemory() {
+  g_no_memory = true;
+  std::set_new_handler(g_old_handler);
+}
+
+static void TestSetNewMode() {
+  int old_mode = tc_set_new_mode(1);
+
+  g_old_handler = std::set_new_handler(&OnNoMemory);
+  g_no_memory = false;
+  void* ret = malloc(kTooBig);
+  EXPECT_EQ(NULL, ret);
+  EXPECT_TRUE(g_no_memory);
+
+  g_old_handler = std::set_new_handler(&OnNoMemory);
+  g_no_memory = false;
+  ret = calloc(1, kTooBig);
+  EXPECT_EQ(NULL, ret);
+  EXPECT_TRUE(g_no_memory);
+
+  g_old_handler = std::set_new_handler(&OnNoMemory);
+  g_no_memory = false;
+  ret = realloc(NULL, kTooBig);
+  EXPECT_EQ(NULL, ret);
+  EXPECT_TRUE(g_no_memory);
+
+  if (kOSSupportsMemalign) {
+    // Not really important, but must be small enough such that
+    // kAlignment + kTooBig does not overflow.
+    const int kAlignment = 1 << 5;
+
+    g_old_handler = std::set_new_handler(&OnNoMemory);
+    g_no_memory = false;
+    ret = Memalign(kAlignment, kTooBig);
+    EXPECT_EQ(NULL, ret);
+    EXPECT_TRUE(g_no_memory);
+
+    g_old_handler = std::set_new_handler(&OnNoMemory);
+    g_no_memory = false;
+    EXPECT_EQ(ENOMEM,
+              PosixMemalign(&ret, kAlignment, kTooBig));
+    EXPECT_EQ(NULL, ret);
+    EXPECT_TRUE(g_no_memory);
+  }
+
+  tc_set_new_mode(old_mode);
+}
+
+static void TestErrno(void) {
+  errno = 0;
+  void* ret = memalign(128, kTooBig);
+  EXPECT_EQ(NULL, ret);
+  EXPECT_EQ(ENOMEM, errno);
+
+  errno = 0;
+  ret = malloc(kTooBig);
+  EXPECT_EQ(NULL, ret);
+  EXPECT_EQ(ENOMEM, errno);
+
+  errno = 0;
+  ret = tc_malloc_skip_new_handler(kTooBig);
+  EXPECT_EQ(NULL, ret);
+  EXPECT_EQ(ENOMEM, errno);
+}
+
+static int RunAllTests(int argc, char** argv) {
+  // Optional argv[1] is the seed
+  AllocatorState rnd(argc > 1 ? atoi(argv[1]) : 100);
+
+  SetTestResourceLimit();
+
+  // TODO(odo):  This test has been disabled because it is only by luck that it
+  // does not result in fragmentation.  When tcmalloc makes an allocation which
+  // spans previously unused leaves of the pagemap it will allocate and fill in
+  // the leaves to cover the new allocation.  The leaves happen to be 256MiB in
+  // the 64-bit build, and with the sbrk allocator these allocations just
+  // happen to fit in one leaf by luck.  With other allocators (mmap,
+  // memfs_malloc when used with small pages) the allocations generally span
+  // two leaves and this results in a very bad fragmentation pattern with this
+  // code.  The same failure can be forced with the sbrk allocator just by
+  // allocating something on the order of 128MiB prior to starting this test so
+  // that the test allocations straddle a 256MiB boundary.
+
+  // TODO(csilvers): port MemoryUsage() over so the test can use that
+#if 0
+# include <unistd.h>      // for getpid()
+  // Allocate and deallocate blocks of increasing sizes to check if the alloc
+  // metadata fragments the memory.  (Do not put other allocations/deallocations
+  // before this test; it may break.)
+  {
+    size_t memory_usage = MemoryUsage(getpid());
+    fprintf(LOGSTREAM, "Testing fragmentation\n");
+    for ( int i = 200; i < 240; ++i ) {
+      int size = i << 20;
+      void *test1 = rnd.alloc(size);
+      CHECK(test1);
+      for ( int j = 0; j < size; j += (1 << 12) ) {
+        static_cast<char*>(test1)[j] = 1;
+      }
+      free(test1);
+    }
+    // There may still be a bit of fragmentation at the beginning, until we
+    // reach kPageMapBigAllocationThreshold bytes so we check for
+    // 200 + 240 + margin.
+    CHECK_LT(MemoryUsage(getpid()), memory_usage + (450 << 20) );
+  }
+#endif
+
+  // Check that empty allocation works
+  fprintf(LOGSTREAM, "Testing empty allocation\n");
+  {
+    void* p1 = rnd.alloc(0);
+    CHECK(p1 != NULL);
+    void* p2 = rnd.alloc(0);
+    CHECK(p2 != NULL);
+    CHECK(p1 != p2);
+    free(p1);
+    free(p2);
+  }
+
+  // This code stresses some of the memory allocation via STL.
+  // It may call operator delete(void*, nothrow_t).
+  fprintf(LOGSTREAM, "Testing STL use\n");
+  {
+    std::vector<int> v;
+    v.push_back(1);
+    v.push_back(2);
+    v.push_back(3);
+    v.push_back(0);
+    std::stable_sort(v.begin(), v.end());
+  }
+
+  // Test each of the memory-allocation functions once, just as a sanity-check
+  fprintf(LOGSTREAM, "Sanity-testing all the memory allocation functions\n");
+  {
+    // We use new-hook and delete-hook to verify we actually called the
+    // tcmalloc version of these routines, and not the libc version.
+    SetNewHook();      // defined as part of MAKE_HOOK_CALLBACK, above
+    SetDeleteHook();   // ditto
+
+    void* p1 = malloc(10);
+    CHECK(p1 != NULL);    // force use of this variable
+    VerifyNewHookWasCalled();
+    // Also test the non-standard tc_malloc_size
+    size_t actual_p1_size = tc_malloc_size(p1);
+    CHECK_GE(actual_p1_size, 10);
+    CHECK_LT(actual_p1_size, 100000);   // a reasonable upper-bound, I think
+    free(p1);
+    VerifyDeleteHookWasCalled();
+
+    p1 = tc_malloc_skip_new_handler(10);
+    CHECK(p1 != NULL);
+    VerifyNewHookWasCalled();
+    free(p1);
+    VerifyDeleteHookWasCalled();
+
+    p1 = calloc(10, 2);
+    CHECK(p1 != NULL);
+    VerifyNewHookWasCalled();
+    // We make sure we realloc to a big size, since some systems (OS
+    // X) will notice if the realloced size continues to fit into the
+    // malloc-block and make this a noop if so.
+    p1 = realloc(p1, 30000);
+    CHECK(p1 != NULL);
+    VerifyNewHookWasCalled();
+    VerifyDeleteHookWasCalled();
+    cfree(p1);  // synonym for free
+    VerifyDeleteHookWasCalled();
+
+    if (kOSSupportsMemalign) {
+      CHECK_EQ(PosixMemalign(&p1, sizeof(p1), 40), 0);
+      CHECK(p1 != NULL);
+      VerifyNewHookWasCalled();
+      free(p1);
+      VerifyDeleteHookWasCalled();
+
+      p1 = Memalign(sizeof(p1) * 2, 50);
+      CHECK(p1 != NULL);
+      VerifyNewHookWasCalled();
+      free(p1);
+      VerifyDeleteHookWasCalled();
+    }
+
+    // Windows has _aligned_malloc.  Let's test that that's captured too.
+#if (defined(_MSC_VER) || defined(__MINGW32__)) && !defined(PERFTOOLS_NO_ALIGNED_MALLOC)
+    p1 = _aligned_malloc(sizeof(p1) * 2, 64);
+    CHECK(p1 != NULL);
+    VerifyNewHookWasCalled();
+    _aligned_free(p1);
+    VerifyDeleteHookWasCalled();
+#endif
+
+    p1 = valloc(60);
+    CHECK(p1 != NULL);
+    VerifyNewHookWasCalled();
+    free(p1);
+    VerifyDeleteHookWasCalled();
+
+    p1 = pvalloc(70);
+    CHECK(p1 != NULL);
+    VerifyNewHookWasCalled();
+    free(p1);
+    VerifyDeleteHookWasCalled();
+
+    char* p2 = new char;
+    CHECK(p2 != NULL);
+    VerifyNewHookWasCalled();
+    delete p2;
+    VerifyDeleteHookWasCalled();
+
+    p2 = new char[100];
+    CHECK(p2 != NULL);
+    VerifyNewHookWasCalled();
+    delete[] p2;
+    VerifyDeleteHookWasCalled();
+
+    p2 = new(std::nothrow) char;
+    CHECK(p2 != NULL);
+    VerifyNewHookWasCalled();
+    delete p2;
+    VerifyDeleteHookWasCalled();
+
+    p2 = new(std::nothrow) char[100];
+    CHECK(p2 != NULL);
+    VerifyNewHookWasCalled();
+    delete[] p2;
+    VerifyDeleteHookWasCalled();
+
+    // Another way of calling operator new
+    p2 = static_cast<char*>(::operator new(100));
+    CHECK(p2 != NULL);
+    VerifyNewHookWasCalled();
+    ::operator delete(p2);
+    VerifyDeleteHookWasCalled();
+
+    // Try to call nothrow's delete too.  Compilers use this.
+    p2 = static_cast<char*>(::operator new(100, std::nothrow));
+    CHECK(p2 != NULL);
+    VerifyNewHookWasCalled();
+    ::operator delete(p2, std::nothrow);
+    VerifyDeleteHookWasCalled();
+
+    // Try strdup(), which the system allocates but we must free.  If
+    // all goes well, libc will use our malloc!
+    p2 = strdup("test");
+    CHECK(p2 != NULL);
+    VerifyNewHookWasCalled();
+    free(p2);
+    VerifyDeleteHookWasCalled();
+
+
+    // Test mmap too: both anonymous mmap and mmap of a file
+    // Note that for right now we only override mmap on linux
+    // systems, so those are the only ones for which we check.
+    SetMmapHook();
+    SetMremapHook();
+    SetMunmapHook();
+#if defined(HAVE_MMAP) && defined(__linux) && \
+       (defined(__i386__) || defined(__x86_64__))
+    int size = 8192*2;
+    p1 = mmap(NULL, size, PROT_WRITE|PROT_READ, MAP_ANONYMOUS|MAP_PRIVATE,
+              -1, 0);
+    CHECK(p1 != NULL);
+    VerifyMmapHookWasCalled();
+    p1 = mremap(p1, size, size/2, 0);
+    CHECK(p1 != NULL);
+    VerifyMremapHookWasCalled();
+    size /= 2;
+    munmap(p1, size);
+    VerifyMunmapHookWasCalled();
+
+    int fd = open("/dev/zero", O_RDONLY);
+    CHECK_GE(fd, 0);   // make sure the open succeeded
+    p1 = mmap(NULL, 8192, PROT_READ, MAP_SHARED, fd, 0);
+    CHECK(p1 != NULL);
+    VerifyMmapHookWasCalled();
+    munmap(p1, 8192);
+    VerifyMunmapHookWasCalled();
+    close(fd);
+#else   // this is just to quiet the compiler: make sure all fns are called
+    IncrementCallsToMmapHook();
+    IncrementCallsToMunmapHook();
+    IncrementCallsToMremapHook();
+    VerifyMmapHookWasCalled();
+    VerifyMremapHookWasCalled();
+    VerifyMunmapHookWasCalled();
+#endif
+
+    // Test sbrk
+    SetSbrkHook();
+#if defined(HAVE_SBRK) && defined(__linux) && \
+       (defined(__i386__) || defined(__x86_64__))
+    p1 = sbrk(8192);
+    CHECK(p1 != NULL);
+    VerifySbrkHookWasCalled();
+    p1 = sbrk(-8192);
+    CHECK(p1 != NULL);
+    VerifySbrkHookWasCalled();
+    // However, sbrk hook should *not* be called with sbrk(0)
+    p1 = sbrk(0);
+    CHECK(p1 != NULL);
+    CHECK_EQ(g_SbrkHook_calls, 0);
+#else   // this is just to quiet the compiler: make sure all fns are called
+    IncrementCallsToSbrkHook();
+    VerifySbrkHookWasCalled();
+#endif
+
+    // Reset the hooks to what they used to be.  These are all
+    // defined as part of MAKE_HOOK_CALLBACK, above.
+    ResetNewHook();
+    ResetDeleteHook();
+    ResetMmapHook();
+    ResetMremapHook();
+    ResetMunmapHook();
+    ResetSbrkHook();
+  }
+
+  // Check that "lots" of memory can be allocated
+  fprintf(LOGSTREAM, "Testing large allocation\n");
+  {
+    const int mb_to_allocate = 100;
+    void* p = rnd.alloc(mb_to_allocate << 20);
+    CHECK(p != NULL);  // could not allocate
+    free(p);
+  }
+
+  TestMallocAlignment();
+
+  // Check calloc() with various arguments
+  fprintf(LOGSTREAM, "Testing calloc\n");
+  TestCalloc(0, 0, true);
+  TestCalloc(0, 1, true);
+  TestCalloc(1, 1, true);
+  TestCalloc(1<<10, 0, true);
+  TestCalloc(1<<20, 0, true);
+  TestCalloc(0, 1<<10, true);
+  TestCalloc(0, 1<<20, true);
+  TestCalloc(1<<20, 2, true);
+  TestCalloc(2, 1<<20, true);
+  TestCalloc(1000, 1000, true);
+
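+  // In each of the following cases n*s overflows size_t, so calloc()
+  // must fail rather than return an undersized block.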
+  TestCalloc(kMaxSize, 2, false);
+  TestCalloc(2, kMaxSize, false);
+  TestCalloc(kMaxSize, kMaxSize, false);
+
+  TestCalloc(kMaxSignedSize, 3, false);
+  TestCalloc(3, kMaxSignedSize, false);
+  TestCalloc(kMaxSignedSize, kMaxSignedSize, false);
+
+  // Test that realloc doesn't always reallocate and copy memory.
+  fprintf(LOGSTREAM, "Testing realloc\n");
+  TestRealloc();
+
+  fprintf(LOGSTREAM, "Testing operator new(nothrow).\n");
+  TestNothrowNew(&::operator new);
+  fprintf(LOGSTREAM, "Testing operator new[](nothrow).\n");
+  TestNothrowNew(&::operator new[]);
+  fprintf(LOGSTREAM, "Testing operator new.\n");
+  TestNew(&::operator new);
+  fprintf(LOGSTREAM, "Testing operator new[].\n");
+  TestNew(&::operator new[]);
+
+  // Create threads
+  fprintf(LOGSTREAM, "Testing threaded allocation/deallocation (%d threads)\n",
+          FLAGS_numthreads);
+  threads = new TesterThread*[FLAGS_numthreads];
+  for (int i = 0; i < FLAGS_numthreads; ++i) {
+    threads[i] = new TesterThread(i);
+  }
+
+  // This runs all the tests at the same time, with a 1M stack size each
+  RunManyThreadsWithId(RunThread, FLAGS_numthreads, 1<<20);
+
+  for (int i = 0; i < FLAGS_numthreads; ++i) delete threads[i];    // Cleanup
+
+  // Do the memory intensive tests after threads are done, since exhausting
+  // the available address space can cause pthread_create to fail.
+
+  // Check that huge allocations fail with NULL instead of crashing
+  fprintf(LOGSTREAM, "Testing huge allocations\n");
+  TestHugeAllocations(&rnd);
+
+  // Check that large allocations fail with NULL instead of crashing
+#ifndef DEBUGALLOCATION    // debug allocation takes forever for huge allocs
+  fprintf(LOGSTREAM, "Testing out of memory\n");
+  for (int s = 0; ; s += (10<<20)) {
+    void* large_object = rnd.alloc(s);
+    if (large_object == NULL) break;
+    free(large_object);
+  }
+#endif
+
+  TestHugeThreadCache();
+  TestRanges();
+  TestReleaseToSystem();
+  TestAggressiveDecommit();
+  TestSetNewMode();
+  TestErrno();
+
+  return 0;
+}
+
+}
+
+using testing::RunAllTests;
+
+int main(int argc, char** argv) {
+#ifdef DEBUGALLOCATION    // debug allocation takes forever for huge allocs
+  FLAGS_max_free_queue_size = 0;  // return freed blocks to tcmalloc immediately
+#endif
+
+  RunAllTests(argc, argv);
+
+  // Test tc_version()
+  fprintf(LOGSTREAM, "Testing tc_version()\n");
+  int major;
+  int minor;
+  const char* patch;
+  char mmp[64];
+  const char* human_version = tc_version(&major, &minor, &patch);
+  snprintf(mmp, sizeof(mmp), "%d.%d%s", major, minor, patch);
+  CHECK(!strcmp(PACKAGE_STRING, human_version));
+  CHECK(!strcmp(PACKAGE_VERSION, mmp));
+
+  fprintf(LOGSTREAM, "PASS\n");
+}
diff --git a/src/tests/tcmalloc_unittest.sh b/src/tests/tcmalloc_unittest.sh
new file mode 100755
index 0000000..755241e
--- /dev/null
+++ b/src/tests/tcmalloc_unittest.sh
@@ -0,0 +1,80 @@
+#!/bin/sh
+
+# Copyright (c) 2013, Google Inc.
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are
+# met:
+#
+#     * Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+#     * Redistributions in binary form must reproduce the above
+# copyright notice, this list of conditions and the following disclaimer
+# in the documentation and/or other materials provided with the
+# distribution.
+#     * Neither the name of Google Inc. nor the names of its
+# contributors may be used to endorse or promote products derived from
+# this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+# ---
+# Author: Adhemerval Zanella
+#
+# Runs the tcmalloc_unittest with various environment variables.
+# This is necessary because tuning some environment variables
+# (TCMALLOC_TRANSFER_NUM_OBJ for instance) should not change program
+# behavior, just performance.
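+#
+# Typical invocation (paths below are illustrative):
+#   BINDIR=. ./tcmalloc_unittest.sh [path/to/tcmalloc_unittest]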
+
+BINDIR="${BINDIR:-.}"
+TCMALLOC_UNITTEST="${1:-$BINDIR/tcmalloc_unittest}"
+
+TMPDIR=/tmp/tcmalloc_unittest
+rm -rf $TMPDIR || exit 2
+mkdir $TMPDIR || exit 3
+
+run_unittest() {
+    if $TCMALLOC_UNITTEST > $TMPDIR/output 2>&1; then
+      echo "OK"
+    else
+      echo "FAILED"
+      echo "Output from the failed run:"
+      echo "----"
+      cat $TMPDIR/output
+      echo "----"
+      exit 4
+    fi
+}
+
+# $1: value of tcmalloc_unittest env. var.
+run_check_transfer_num_obj() {
+    [ -n "$1" ] && export TCMALLOC_TRANSFER_NUM_OBJ="$1"
+
+    echo -n "Testing $TCMALLOC_UNITTEST with TCMALLOC_TRANSFER_NUM_OBJ=$1 ... "
+    run_unittest
+}
+
+run_check_transfer_num_obj ""
+run_check_transfer_num_obj "40"
+run_check_transfer_num_obj "4096"
+
+echo -n "Testing $TCMALLOC_UNITTEST with TCMALLOC_AGGRESSIVE_DECOMMIT=f ... "
+
+TCMALLOC_AGGRESSIVE_DECOMMIT=f run_unittest
+
+echo -n "Testing $TCMALLOC_UNITTEST with TCMALLOC_HEAP_LIMIT_MB=512 ... "
+
+TCMALLOC_HEAP_LIMIT_MB=512 run_unittest
+
+echo "PASS"
diff --git a/src/tests/testutil.cc b/src/tests/testutil.cc
new file mode 100644
index 0000000..c2c71cb
--- /dev/null
+++ b/src/tests/testutil.cc
@@ -0,0 +1,224 @@
+// -*- Mode: C++; c-basic-offset: 2; indent-tabs-mode: nil -*-
+// Copyright (c) 2007, Google Inc.
+// All rights reserved.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+// 
+//     * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+//     * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+// 
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+// ---
+// Author: Craig Silverstein
+//
+// A few routines that are useful for multiple tests in this directory.
+
+#include "config_for_unittests.h"
+#include <stdlib.h>           // for NULL, abort()
+// On FreeBSD, if you #include <sys/resource.h>, you have to get stdint first.
+#ifdef HAVE_STDINT_H
+#include <stdint.h>
+#endif
+#ifdef HAVE_SYS_RESOURCE_H
+#include <sys/resource.h>
+#endif
+#include "tests/testutil.h"
+
+
+// When compiled 64-bit and run on systems with swap, several unittests will end
+// up trying to consume all of RAM+swap, and that can take quite some time.  By
+// limiting the address-space size we get sufficient coverage without blowing
+// out job limits.
+void SetTestResourceLimit() {
+#ifdef HAVE_SYS_RESOURCE_H
+  // The actual resource we need to set varies depending on which flavour of
+  // unix.  On Linux we need RLIMIT_AS because that covers the use of mmap.
+  // Otherwise hopefully RLIMIT_RSS is good enough.  (Unfortunately 64-bit
+  // and 32-bit headers disagree on the type of these constants!)
+#ifdef RLIMIT_AS
+#define USE_RESOURCE RLIMIT_AS
+#else
+#define USE_RESOURCE RLIMIT_RSS
+#endif
+
+  // Restrict the test to 1GiB, which should fit comfortably on both
+  // 32-bit and 64-bit hosts and lets the test execute in ~1s.
+  const rlim_t kMaxMem = 1<<30;
+
+  struct rlimit rlim;
+  if (getrlimit(USE_RESOURCE, &rlim) == 0) {
+    if (rlim.rlim_cur == RLIM_INFINITY || rlim.rlim_cur > kMaxMem) {
+      rlim.rlim_cur = kMaxMem;
+      setrlimit(USE_RESOURCE, &rlim); // ignore result
+    }
+  }
+#endif  /* HAVE_SYS_RESOURCE_H */
+}
+
+
+struct FunctionAndId {
+  void (*ptr_to_function)(int);
+  int id;
+};
+
+#if defined(NO_THREADS) || !(defined(HAVE_PTHREAD) || defined(_WIN32))
+
+extern "C" void RunThread(void (*fn)()) {
+  (*fn)();
+}
+
+extern "C" void RunManyThreads(void (*fn)(), int count) {
+  // I guess the best we can do is run fn sequentially, 'count' times
+  for (int i = 0; i < count; i++)
+    (*fn)();
+}
+
+extern "C" void RunManyThreadsWithId(void (*fn)(int), int count, int) {
+  for (int i = 0; i < count; i++)
+    (*fn)(i);    // stacksize doesn't make sense in a non-threaded context
+}
+
+#elif defined(_WIN32)
+
+#ifndef WIN32_LEAN_AND_MEAN
+#define WIN32_LEAN_AND_MEAN  /* We always want minimal includes */
+#endif
+#include <windows.h>
+
+extern "C" {
+  // This helper function has the signature that CreateThread wants.
+  DWORD WINAPI RunFunctionInThread(LPVOID ptr_to_ptr_to_fn) {
+    (**static_cast<void (**)()>(ptr_to_ptr_to_fn))();    // runs fn
+    return 0;
+  }
+
+  DWORD WINAPI RunFunctionInThreadWithId(LPVOID ptr_to_fnid) {
+    FunctionAndId* fn_and_id = static_cast<FunctionAndId*>(ptr_to_fnid);
+    (*fn_and_id->ptr_to_function)(fn_and_id->id);   // runs fn
+    return 0;
+  }
+
+  void RunManyThreads(void (*fn)(), int count) {
+    DWORD dummy;
+    HANDLE* hThread = new HANDLE[count];
+    for (int i = 0; i < count; i++) {
+      hThread[i] = CreateThread(NULL, 0, RunFunctionInThread, &fn, 0, &dummy);
+      if (hThread[i] == NULL)  ExitProcess(i);
+    }
+    WaitForMultipleObjects(count, hThread, TRUE, INFINITE);
+    for (int i = 0; i < count; i++) {
+      CloseHandle(hThread[i]);
+    }
+    delete[] hThread;
+  }
+
+  void RunThread(void (*fn)()) {
+    RunManyThreads(fn, 1);
+  }
+
+  void RunManyThreadsWithId(void (*fn)(int), int count, int stacksize) {
+    DWORD dummy;
+    HANDLE* hThread = new HANDLE[count];
+    FunctionAndId* fn_and_ids = new FunctionAndId[count];
+    for (int i = 0; i < count; i++) {
+      fn_and_ids[i].ptr_to_function = fn;
+      fn_and_ids[i].id = i;
+      hThread[i] = CreateThread(NULL, stacksize, RunFunctionInThreadWithId,
+                                &fn_and_ids[i], 0, &dummy);
+      if (hThread[i] == NULL)  ExitProcess(i);
+    }
+    WaitForMultipleObjects(count, hThread, TRUE, INFINITE);
+    for (int i = 0; i < count; i++) {
+      CloseHandle(hThread[i]);
+    }
+    delete[] fn_and_ids;
+    delete[] hThread;
+  }
+}
+
+#else  // not NO_THREADS, not !HAVE_PTHREAD, not _WIN32
+
+#include <pthread.h>
+
+#define SAFE_PTHREAD(fncall)  do { if ((fncall) != 0) abort(); } while (0)
+
+extern "C" {
+  // This helper function has the signature that pthread_create wants.
+  static void* RunFunctionInThread(void *ptr_to_ptr_to_fn) {
+    (**static_cast<void (**)()>(ptr_to_ptr_to_fn))();    // runs fn
+    return NULL;
+  }
+
+  static void* RunFunctionInThreadWithId(void *ptr_to_fnid) {
+    FunctionAndId* fn_and_id = static_cast<FunctionAndId*>(ptr_to_fnid);
+    (*fn_and_id->ptr_to_function)(fn_and_id->id);   // runs fn
+    return NULL;
+  }
+
+  // Run a function in a thread of its own and wait for it to finish.
+  // This is useful for tcmalloc testing, because each thread is
+  // handled separately in tcmalloc, so there's interesting stuff to
+  // test even if the threads are not running concurrently.
+  void RunThread(void (*fn)()) {
+    pthread_t thr;
+    // Even though fn is on the stack, it's safe to pass a pointer to it,
+    // because we pthread_join immediately (ie, before RunThread returns).
+    SAFE_PTHREAD(pthread_create(&thr, NULL, RunFunctionInThread, &fn));
+    SAFE_PTHREAD(pthread_join(thr, NULL));
+  }
+
+  void RunManyThreads(void (*fn)(), int count) {
+    pthread_t* thr = new pthread_t[count];
+    for (int i = 0; i < count; i++) {
+      SAFE_PTHREAD(pthread_create(&thr[i], NULL, RunFunctionInThread, &fn));
+    }
+    for (int i = 0; i < count; i++) {
+      SAFE_PTHREAD(pthread_join(thr[i], NULL));
+    }
+    delete[] thr;
+  }
+
+  void RunManyThreadsWithId(void (*fn)(int), int count, int stacksize) {
+    pthread_attr_t attr;
+    pthread_attr_init(&attr);
+    pthread_attr_setstacksize(&attr, stacksize);
+
+    pthread_t* thr = new pthread_t[count];
+    FunctionAndId* fn_and_ids = new FunctionAndId[count];
+    for (int i = 0; i < count; i++) {
+      fn_and_ids[i].ptr_to_function = fn;
+      fn_and_ids[i].id = i;
+      SAFE_PTHREAD(pthread_create(&thr[i], &attr,
+                                  RunFunctionInThreadWithId, &fn_and_ids[i]));
+    }
+    for (int i = 0; i < count; i++) {
+      SAFE_PTHREAD(pthread_join(thr[i], NULL));
+    }
+    delete[] fn_and_ids;
+    delete[] thr;
+
+    pthread_attr_destroy(&attr);
+  }
+}
+
+#endif
diff --git a/src/tests/testutil.h b/src/tests/testutil.h
new file mode 100644
index 0000000..071a209
--- /dev/null
+++ b/src/tests/testutil.h
@@ -0,0 +1,62 @@
+// -*- Mode: C++; c-basic-offset: 2; indent-tabs-mode: nil -*-
+// Copyright (c) 2007, Google Inc.
+// All rights reserved.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+// 
+//     * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+//     * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+// 
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+// ---
+// Author: Craig Silverstein
+
+#ifndef TCMALLOC_TOOLS_TESTUTIL_H_
+#define TCMALLOC_TOOLS_TESTUTIL_H_
+
+// Run a function in a thread of its own and wait for it to finish.
+// The function you pass in must have the signature
+//    void MyFunction();
+extern "C" void RunThread(void (*fn)());
+
+// Run a function X times, in X threads, and wait for them all to finish.
+// The function you pass in must have the signature
+//    void MyFunction();
+extern "C" void RunManyThreads(void (*fn)(), int count);
+
+// The 'advanced' version: run a function X times, in X threads, and
+// wait for them all to finish.  Give them all the specified stack-size.
+// (If you're curious why this takes a stacksize and the others don't,
+// it's because the one client of this fn wanted to specify stacksize. :-) )
+// The function you pass in must have the signature
+//    void MyFunction(int idx);
+// where idx is the index of the thread (which of the X threads this is).
+extern "C" void RunManyThreadsWithId(void (*fn)(int), int count, int stacksize);
+
+// When compiled 64-bit and run on systems with swap, several unittests will end
+// up trying to consume all of RAM+swap, and that can take quite some time.  By
+// limiting the address-space size we get sufficient coverage without blowing
+// out job limits.
+void SetTestResourceLimit();
+
+#endif  // TCMALLOC_TOOLS_TESTUTIL_H_
diff --git a/src/tests/thread_dealloc_unittest.cc b/src/tests/thread_dealloc_unittest.cc
new file mode 100644
index 0000000..97615cd
--- /dev/null
+++ b/src/tests/thread_dealloc_unittest.cc
@@ -0,0 +1,84 @@
+// -*- Mode: C++; c-basic-offset: 2; indent-tabs-mode: nil -*-
+// Copyright (c) 2004, Google Inc.
+// All rights reserved.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+// 
+//     * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+//     * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+// 
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+// ---
+// Author: Sanjay Ghemawat
+//
+// Check that we do not leak memory when cycling through lots of threads.
+
+#include "config_for_unittests.h"
+#include <stdio.h>
+#ifdef HAVE_UNISTD_H
+#include <unistd.h>    // for sleep()
+#endif
+#include "base/logging.h"
+#include <gperftools/malloc_extension.h>
+#include "tests/testutil.h"   // for RunThread()
+
+// Size/number of objects to allocate per thread (1 MB per thread)
+static const int kObjectSize = 1024;
+static const int kNumObjects = 1024;
+
+// Number of threads to create and destroy
+static const int kNumThreads = 1000;
+
+// Allocate lots of stuff
+static void AllocStuff() {
+  void** objects = new void*[kNumObjects];
+  for (int i = 0; i < kNumObjects; i++) {
+    objects[i] = malloc(kObjectSize);
+  }
+  for (int i = 0; i < kNumObjects; i++) {
+    free(objects[i]);
+  }
+  delete[] objects;
+}
+
+int main(int argc, char** argv) {
+  static const int kDisplaySize = 1048576;
+  char* display = new char[kDisplaySize];
+
+  for (int i = 0; i < kNumThreads; i++) {
+    RunThread(&AllocStuff);
+
+    if (((i+1) % 200) == 0) {
+      fprintf(stderr, "Iteration: %d of %d\n", (i+1), kNumThreads);
+      MallocExtension::instance()->GetStats(display, kDisplaySize);
+      fprintf(stderr, "%s\n", display);
+    }
+  }
+  delete[] display;
+
+  printf("PASS\n");
+#ifdef HAVE_UNISTD_H
+  sleep(1);     // Prevent exit race problem with glibc
+#endif
+  return 0;
+}
diff --git a/src/third_party/valgrind.h b/src/third_party/valgrind.h
new file mode 100644
index 0000000..577c59a
--- /dev/null
+++ b/src/third_party/valgrind.h
@@ -0,0 +1,3924 @@
+/* -*- c -*-
+   ----------------------------------------------------------------
+
+   Notice that the following BSD-style license applies to this one
+   file (valgrind.h) only.  The rest of Valgrind is licensed under the
+   terms of the GNU General Public License, version 2, unless
+   otherwise indicated.  See the COPYING file in the source
+   distribution for details.
+
+   ----------------------------------------------------------------
+
+   This file is part of Valgrind, a dynamic binary instrumentation
+   framework.
+
+   Copyright (C) 2000-2008 Julian Seward.  All rights reserved.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions
+   are met:
+
+   1. Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+
+   2. The origin of this software must not be misrepresented; you must 
+      not claim that you wrote the original software.  If you use this 
+      software in a product, an acknowledgment in the product 
+      documentation would be appreciated but is not required.
+
+   3. Altered source versions must be plainly marked as such, and must
+      not be misrepresented as being the original software.
+
+   4. The name of the author may not be used to endorse or promote 
+      products derived from this software without specific prior written 
+      permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS
+   OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+   WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+   ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
+   DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+   DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE
+   GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+   INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
+   WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+   ----------------------------------------------------------------
+
+   Notice that the above BSD-style license applies to this one file
+   (valgrind.h) only.  The entire rest of Valgrind is licensed under
+   the terms of the GNU General Public License, version 2.  See the
+   COPYING file in the source distribution for details.
+
+   ---------------------------------------------------------------- 
+*/
+
+
+/* This file is for inclusion into client (your!) code.
+
+   You can use these macros to manipulate and query Valgrind's 
+   execution inside your own programs.
+
+   The resulting executables will still run without Valgrind, just a
+   little bit more slowly than they otherwise would, but otherwise
+   unchanged.  When not running on valgrind, each client request
+   consumes very few (e.g. 7) instructions, so the resulting performance
+   loss is negligible unless you plan to execute client requests
+   millions of times per second.  Nevertheless, if that is still a
+   problem, you can compile with the NVALGRIND symbol defined (gcc
+   -DNVALGRIND) so that client requests are not even compiled in.  */
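+
+/* A minimal usage sketch, not part of the macro machinery itself: a
+   client can test whether it is running under Valgrind and act
+   accordingly.  RUNNING_ON_VALGRIND is defined later in this header;
+   the function name below is purely illustrative.
+
+      #include <stdio.h>
+      #include "valgrind.h"
+
+      void report_environment(void) {
+         if (RUNNING_ON_VALGRIND)
+            puts("running under Valgrind; client requests are live");
+         else
+            puts("running natively; requests cost a few instructions,"
+                 " or nothing if built with -DNVALGRIND");
+      }
+*/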
+
+#ifndef __VALGRIND_H
+#define __VALGRIND_H
+
+#include <stdarg.h>
+
+/* Nb: this file might be included in a file compiled with -ansi.  So
+   we can't use C++ style "//" comments nor the "asm" keyword (instead
+   use "__asm__"). */
+
+/* Derive some tags indicating what the target platform is.  Note
+   that in this file we're using the compiler's CPP symbols for
+   identifying architectures, which are different to the ones we use
+   within the rest of Valgrind.  Note, __powerpc__ is active for both
+   32 and 64-bit PPC, whereas __powerpc64__ is only active for the
+   latter (on Linux, that is). */
+#undef PLAT_x86_linux
+#undef PLAT_amd64_linux
+#undef PLAT_ppc32_linux
+#undef PLAT_ppc64_linux
+#undef PLAT_ppc32_aix5
+#undef PLAT_ppc64_aix5
+
+#if !defined(_AIX) && defined(__i386__)
+#  define PLAT_x86_linux 1
+#elif !defined(_AIX) && defined(__x86_64__)
+#  define PLAT_amd64_linux 1
+#elif !defined(_AIX) && defined(__powerpc__) && !defined(__powerpc64__)
+#  define PLAT_ppc32_linux 1
+#elif !defined(_AIX) && defined(__powerpc__) && defined(__powerpc64__)
+#  define PLAT_ppc64_linux 1
+#elif defined(_AIX) && defined(__64BIT__)
+#  define PLAT_ppc64_aix5 1
+#elif defined(_AIX) && !defined(__64BIT__)
+#  define PLAT_ppc32_aix5 1
+#endif
+
+
+/* If we're not compiling for our target platform, don't generate
+   any inline asms.  */
+#if !defined(PLAT_x86_linux) && !defined(PLAT_amd64_linux) \
+    && !defined(PLAT_ppc32_linux) && !defined(PLAT_ppc64_linux) \
+    && !defined(PLAT_ppc32_aix5) && !defined(PLAT_ppc64_aix5)
+#  if !defined(NVALGRIND)
+#    define NVALGRIND 1
+#  endif
+#endif
+
+
+/* ------------------------------------------------------------------ */
+/* ARCHITECTURE SPECIFICS for SPECIAL INSTRUCTIONS.  There is nothing */
+/* in here of use to end-users -- skip to the next section.           */
+/* ------------------------------------------------------------------ */
+
+#if defined(NVALGRIND)
+
+/* Define NVALGRIND to completely remove the Valgrind magic sequence
+   from the compiled code (analogous to NDEBUG's effects on
+   assert()) */
+#define VALGRIND_DO_CLIENT_REQUEST(                               \
+        _zzq_rlval, _zzq_default, _zzq_request,                   \
+        _zzq_arg1, _zzq_arg2, _zzq_arg3, _zzq_arg4, _zzq_arg5)    \
+   {                                                              \
+      (_zzq_rlval) = (_zzq_default);                              \
+   }
+
+#else  /* ! NVALGRIND */
+
+/* The following defines the magic code sequences which the JITter
+   spots and handles magically.  Don't look too closely at them as
+   they will rot your brain.
+
+   The assembly code sequences for all architectures are in this one
+   file, because this file must be stand-alone; we don't want to have
+   multiple files.
+
+   For VALGRIND_DO_CLIENT_REQUEST, we must ensure that the default
+   value gets put in the return slot, so that everything works when
+   this is executed not under Valgrind.  Args are passed in a memory
+   block, and so there's no intrinsic limit to the number that could
+   be passed, but it's currently five.
+   
+   The macro args are: 
+      _zzq_rlval    result lvalue
+      _zzq_default  default value (result returned when running on real CPU)
+      _zzq_request  request code
+      _zzq_arg1..5  request params
+
+   The other two macros are used to support function wrapping, and are
+   a lot simpler.  VALGRIND_GET_NR_CONTEXT returns the value of the
+   guest's NRADDR pseudo-register and whatever other information is
+   needed to safely run the original call from the wrapper: on
+   ppc64-linux, the R2 value at the divert point is also needed.  This
+   information is abstracted into a user-visible type, OrigFn.
+
+   VALGRIND_CALL_NOREDIR_* behaves the same as the following on the
+   guest, but guarantees that the branch instruction will not be
+   redirected: x86: call *%eax, amd64: call *%rax, ppc32/ppc64:
+   branch-and-link-to-r11.  VALGRIND_CALL_NOREDIR is just text, not a
+   complete inline asm, since it needs to be combined with more magic
+   inline asm stuff to be useful.
+*/
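+
+/* A sketch of how a tool-side helper macro typically invokes
+   VALGRIND_DO_CLIENT_REQUEST.  VG_USERREQ__EXAMPLE is a hypothetical
+   request code standing in for the real codes defined by individual
+   tools; addr and len are whatever parameters that request takes.
+
+      unsigned int _res;
+      VALGRIND_DO_CLIENT_REQUEST(_res, 0,
+                                 VG_USERREQ__EXAMPLE,
+                                 addr, len, 0, 0, 0);
+
+   When the program runs on the real CPU, the magic preamble is a
+   no-op and _res simply receives the default value 0. */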
+
+/* ------------------------- x86-linux ------------------------- */
+
+#if defined(PLAT_x86_linux)
+
+typedef
+   struct { 
+      unsigned int nraddr; /* where's the code? */
+   }
+   OrigFn;
+
+#define __SPECIAL_INSTRUCTION_PREAMBLE                            \
+                     "roll $3,  %%edi ; roll $13, %%edi\n\t"      \
+                     "roll $29, %%edi ; roll $19, %%edi\n\t"
+
+#define VALGRIND_DO_CLIENT_REQUEST(                               \
+        _zzq_rlval, _zzq_default, _zzq_request,                   \
+        _zzq_arg1, _zzq_arg2, _zzq_arg3, _zzq_arg4, _zzq_arg5)    \
+  { volatile unsigned int _zzq_args[6];                           \
+    volatile unsigned int _zzq_result;                            \
+    _zzq_args[0] = (unsigned int)(_zzq_request);                  \
+    _zzq_args[1] = (unsigned int)(_zzq_arg1);                     \
+    _zzq_args[2] = (unsigned int)(_zzq_arg2);                     \
+    _zzq_args[3] = (unsigned int)(_zzq_arg3);                     \
+    _zzq_args[4] = (unsigned int)(_zzq_arg4);                     \
+    _zzq_args[5] = (unsigned int)(_zzq_arg5);                     \
+    __asm__ volatile(__SPECIAL_INSTRUCTION_PREAMBLE               \
+                     /* %EDX = client_request ( %EAX ) */         \
+                     "xchgl %%ebx,%%ebx"                          \
+                     : "=d" (_zzq_result)                         \
+                     : "a" (&_zzq_args[0]), "0" (_zzq_default)    \
+                     : "cc", "memory"                             \
+                    );                                            \
+    _zzq_rlval = _zzq_result;                                     \
+  }
+
+#define VALGRIND_GET_NR_CONTEXT(_zzq_rlval)                       \
+  { volatile OrigFn* _zzq_orig = &(_zzq_rlval);                   \
+    volatile unsigned int __addr;                                 \
+    __asm__ volatile(__SPECIAL_INSTRUCTION_PREAMBLE               \
+                     /* %EAX = guest_NRADDR */                    \
+                     "xchgl %%ecx,%%ecx"                          \
+                     : "=a" (__addr)                              \
+                     :                                            \
+                     : "cc", "memory"                             \
+                    );                                            \
+    _zzq_orig->nraddr = __addr;                                   \
+  }
+
+#define VALGRIND_CALL_NOREDIR_EAX                                 \
+                     __SPECIAL_INSTRUCTION_PREAMBLE               \
+                     /* call-noredir *%EAX */                     \
+                     "xchgl %%edx,%%edx\n\t"
+#endif /* PLAT_x86_linux */
+
+/* ------------------------ amd64-linux ------------------------ */
+
+#if defined(PLAT_amd64_linux)
+
+typedef
+   struct { 
+      unsigned long long int nraddr; /* where's the code? */
+   }
+   OrigFn;
+
+#define __SPECIAL_INSTRUCTION_PREAMBLE                            \
+                     "rolq $3,  %%rdi ; rolq $13, %%rdi\n\t"      \
+                     "rolq $61, %%rdi ; rolq $51, %%rdi\n\t"
+
+#define VALGRIND_DO_CLIENT_REQUEST(                               \
+        _zzq_rlval, _zzq_default, _zzq_request,                   \
+        _zzq_arg1, _zzq_arg2, _zzq_arg3, _zzq_arg4, _zzq_arg5)    \
+  { volatile unsigned long long int _zzq_args[6];                 \
+    volatile unsigned long long int _zzq_result;                  \
+    _zzq_args[0] = (unsigned long long int)(_zzq_request);        \
+    _zzq_args[1] = (unsigned long long int)(_zzq_arg1);           \
+    _zzq_args[2] = (unsigned long long int)(_zzq_arg2);           \
+    _zzq_args[3] = (unsigned long long int)(_zzq_arg3);           \
+    _zzq_args[4] = (unsigned long long int)(_zzq_arg4);           \
+    _zzq_args[5] = (unsigned long long int)(_zzq_arg5);           \
+    __asm__ volatile(__SPECIAL_INSTRUCTION_PREAMBLE               \
+                     /* %RDX = client_request ( %RAX ) */         \
+                     "xchgq %%rbx,%%rbx"                          \
+                     : "=d" (_zzq_result)                         \
+                     : "a" (&_zzq_args[0]), "0" (_zzq_default)    \
+                     : "cc", "memory"                             \
+                    );                                            \
+    _zzq_rlval = _zzq_result;                                     \
+  }
+
+#define VALGRIND_GET_NR_CONTEXT(_zzq_rlval)                       \
+  { volatile OrigFn* _zzq_orig = &(_zzq_rlval);                   \
+    volatile unsigned long long int __addr;                       \
+    __asm__ volatile(__SPECIAL_INSTRUCTION_PREAMBLE               \
+                     /* %RAX = guest_NRADDR */                    \
+                     "xchgq %%rcx,%%rcx"                          \
+                     : "=a" (__addr)                              \
+                     :                                            \
+                     : "cc", "memory"                             \
+                    );                                            \
+    _zzq_orig->nraddr = __addr;                                   \
+  }
+
+#define VALGRIND_CALL_NOREDIR_RAX                                 \
+                     __SPECIAL_INSTRUCTION_PREAMBLE               \
+                     /* call-noredir *%RAX */                     \
+                     "xchgq %%rdx,%%rdx\n\t"
+#endif /* PLAT_amd64_linux */
+
+/* ------------------------ ppc32-linux ------------------------ */
+
+#if defined(PLAT_ppc32_linux)
+
+typedef
+   struct { 
+      unsigned int nraddr; /* where's the code? */
+   }
+   OrigFn;
+
+#define __SPECIAL_INSTRUCTION_PREAMBLE                            \
+                     "rlwinm 0,0,3,0,0  ; rlwinm 0,0,13,0,0\n\t"  \
+                     "rlwinm 0,0,29,0,0 ; rlwinm 0,0,19,0,0\n\t"
+
+#define VALGRIND_DO_CLIENT_REQUEST(                               \
+        _zzq_rlval, _zzq_default, _zzq_request,                   \
+        _zzq_arg1, _zzq_arg2, _zzq_arg3, _zzq_arg4, _zzq_arg5)    \
+                                                                  \
+  {          unsigned int  _zzq_args[6];                          \
+             unsigned int  _zzq_result;                           \
+             unsigned int* _zzq_ptr;                              \
+    _zzq_args[0] = (unsigned int)(_zzq_request);                  \
+    _zzq_args[1] = (unsigned int)(_zzq_arg1);                     \
+    _zzq_args[2] = (unsigned int)(_zzq_arg2);                     \
+    _zzq_args[3] = (unsigned int)(_zzq_arg3);                     \
+    _zzq_args[4] = (unsigned int)(_zzq_arg4);                     \
+    _zzq_args[5] = (unsigned int)(_zzq_arg5);                     \
+    _zzq_ptr = _zzq_args;                                         \
+    __asm__ volatile("mr 3,%1\n\t" /*default*/                    \
+                     "mr 4,%2\n\t" /*ptr*/                        \
+                     __SPECIAL_INSTRUCTION_PREAMBLE               \
+                     /* %R3 = client_request ( %R4 ) */           \
+                     "or 1,1,1\n\t"                               \
+                     "mr %0,3"     /*result*/                     \
+                     : "=b" (_zzq_result)                         \
+                     : "b" (_zzq_default), "b" (_zzq_ptr)         \
+                     : "cc", "memory", "r3", "r4");               \
+    _zzq_rlval = _zzq_result;                                     \
+  }
+
+#define VALGRIND_GET_NR_CONTEXT(_zzq_rlval)                       \
+  { volatile OrigFn* _zzq_orig = &(_zzq_rlval);                   \
+    unsigned int __addr;                                          \
+    __asm__ volatile(__SPECIAL_INSTRUCTION_PREAMBLE               \
+                     /* %R3 = guest_NRADDR */                     \
+                     "or 2,2,2\n\t"                               \
+                     "mr %0,3"                                    \
+                     : "=b" (__addr)                              \
+                     :                                            \
+                     : "cc", "memory", "r3"                       \
+                    );                                            \
+    _zzq_orig->nraddr = __addr;                                   \
+  }
+
+#define VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R11                   \
+                     __SPECIAL_INSTRUCTION_PREAMBLE               \
+                     /* branch-and-link-to-noredir *%R11 */       \
+                     "or 3,3,3\n\t"
+#endif /* PLAT_ppc32_linux */
+
+/* ------------------------ ppc64-linux ------------------------ */
+
+#if defined(PLAT_ppc64_linux)
+
+typedef
+   struct { 
+      unsigned long long int nraddr; /* where's the code? */
+      unsigned long long int r2;  /* what tocptr do we need? */
+   }
+   OrigFn;
+
+#define __SPECIAL_INSTRUCTION_PREAMBLE                            \
+                     "rotldi 0,0,3  ; rotldi 0,0,13\n\t"          \
+                     "rotldi 0,0,61 ; rotldi 0,0,51\n\t"
+
+#define VALGRIND_DO_CLIENT_REQUEST(                               \
+        _zzq_rlval, _zzq_default, _zzq_request,                   \
+        _zzq_arg1, _zzq_arg2, _zzq_arg3, _zzq_arg4, _zzq_arg5)    \
+                                                                  \
+  {          unsigned long long int  _zzq_args[6];                \
+    register unsigned long long int  _zzq_result __asm__("r3");   \
+    register unsigned long long int* _zzq_ptr __asm__("r4");      \
+    _zzq_args[0] = (unsigned long long int)(_zzq_request);        \
+    _zzq_args[1] = (unsigned long long int)(_zzq_arg1);           \
+    _zzq_args[2] = (unsigned long long int)(_zzq_arg2);           \
+    _zzq_args[3] = (unsigned long long int)(_zzq_arg3);           \
+    _zzq_args[4] = (unsigned long long int)(_zzq_arg4);           \
+    _zzq_args[5] = (unsigned long long int)(_zzq_arg5);           \
+    _zzq_ptr = _zzq_args;                                         \
+    __asm__ volatile(__SPECIAL_INSTRUCTION_PREAMBLE               \
+                     /* %R3 = client_request ( %R4 ) */           \
+                     "or 1,1,1"                                   \
+                     : "=r" (_zzq_result)                         \
+                     : "0" (_zzq_default), "r" (_zzq_ptr)         \
+                     : "cc", "memory");                           \
+    _zzq_rlval = _zzq_result;                                     \
+  }
+
+#define VALGRIND_GET_NR_CONTEXT(_zzq_rlval)                       \
+  { volatile OrigFn* _zzq_orig = &(_zzq_rlval);                   \
+    register unsigned long long int __addr __asm__("r3");         \
+    __asm__ volatile(__SPECIAL_INSTRUCTION_PREAMBLE               \
+                     /* %R3 = guest_NRADDR */                     \
+                     "or 2,2,2"                                   \
+                     : "=r" (__addr)                              \
+                     :                                            \
+                     : "cc", "memory"                             \
+                    );                                            \
+    _zzq_orig->nraddr = __addr;                                   \
+    __asm__ volatile(__SPECIAL_INSTRUCTION_PREAMBLE               \
+                     /* %R3 = guest_NRADDR_GPR2 */                \
+                     "or 4,4,4"                                   \
+                     : "=r" (__addr)                              \
+                     :                                            \
+                     : "cc", "memory"                             \
+                    );                                            \
+    _zzq_orig->r2 = __addr;                                       \
+  }
+
+#define VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R11                   \
+                     __SPECIAL_INSTRUCTION_PREAMBLE               \
+                     /* branch-and-link-to-noredir *%R11 */       \
+                     "or 3,3,3\n\t"
+
+#endif /* PLAT_ppc64_linux */
+
+/* ------------------------ ppc32-aix5 ------------------------- */
+
+#if defined(PLAT_ppc32_aix5)
+
+typedef
+   struct { 
+      unsigned int nraddr; /* where's the code? */
+      unsigned int r2;  /* what tocptr do we need? */
+   }
+   OrigFn;
+
+#define __SPECIAL_INSTRUCTION_PREAMBLE                            \
+                     "rlwinm 0,0,3,0,0  ; rlwinm 0,0,13,0,0\n\t"  \
+                     "rlwinm 0,0,29,0,0 ; rlwinm 0,0,19,0,0\n\t"
+
+#define VALGRIND_DO_CLIENT_REQUEST(                               \
+        _zzq_rlval, _zzq_default, _zzq_request,                   \
+        _zzq_arg1, _zzq_arg2, _zzq_arg3, _zzq_arg4, _zzq_arg5)    \
+                                                                  \
+  {          unsigned int  _zzq_args[7];                          \
+    register unsigned int  _zzq_result;                           \
+    register unsigned int* _zzq_ptr;                              \
+    _zzq_args[0] = (unsigned int)(_zzq_request);                  \
+    _zzq_args[1] = (unsigned int)(_zzq_arg1);                     \
+    _zzq_args[2] = (unsigned int)(_zzq_arg2);                     \
+    _zzq_args[3] = (unsigned int)(_zzq_arg3);                     \
+    _zzq_args[4] = (unsigned int)(_zzq_arg4);                     \
+    _zzq_args[5] = (unsigned int)(_zzq_arg5);                     \
+    _zzq_args[6] = (unsigned int)(_zzq_default);                  \
+    _zzq_ptr = _zzq_args;                                         \
+    __asm__ volatile("mr 4,%1\n\t"                                \
+                     "lwz 3, 24(4)\n\t"                           \
+                     __SPECIAL_INSTRUCTION_PREAMBLE               \
+                     /* %R3 = client_request ( %R4 ) */           \
+                     "or 1,1,1\n\t"                               \
+                     "mr %0,3"                                    \
+                     : "=b" (_zzq_result)                         \
+                     : "b" (_zzq_ptr)                             \
+                     : "r3", "r4", "cc", "memory");               \
+    _zzq_rlval = _zzq_result;                                     \
+  }
+
+#define VALGRIND_GET_NR_CONTEXT(_zzq_rlval)                       \
+  { volatile OrigFn* _zzq_orig = &(_zzq_rlval);                   \
+    register unsigned int __addr;                                 \
+    __asm__ volatile(__SPECIAL_INSTRUCTION_PREAMBLE               \
+                     /* %R3 = guest_NRADDR */                     \
+                     "or 2,2,2\n\t"                               \
+                     "mr %0,3"                                    \
+                     : "=b" (__addr)                              \
+                     :                                            \
+                     : "r3", "cc", "memory"                       \
+                    );                                            \
+    _zzq_orig->nraddr = __addr;                                   \
+    __asm__ volatile(__SPECIAL_INSTRUCTION_PREAMBLE               \
+                     /* %R3 = guest_NRADDR_GPR2 */                \
+                     "or 4,4,4\n\t"                               \
+                     "mr %0,3"                                    \
+                     : "=b" (__addr)                              \
+                     :                                            \
+                     : "r3", "cc", "memory"                       \
+                    );                                            \
+    _zzq_orig->r2 = __addr;                                       \
+  }
+
+#define VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R11                   \
+                     __SPECIAL_INSTRUCTION_PREAMBLE               \
+                     /* branch-and-link-to-noredir *%R11 */       \
+                     "or 3,3,3\n\t"
+
+#endif /* PLAT_ppc32_aix5 */
+
+/* ------------------------ ppc64-aix5 ------------------------- */
+
+#if defined(PLAT_ppc64_aix5)
+
+typedef
+   struct { 
+      unsigned long long int nraddr; /* where's the code? */
+      unsigned long long int r2;  /* what tocptr do we need? */
+   }
+   OrigFn;
+
+#define __SPECIAL_INSTRUCTION_PREAMBLE                            \
+                     "rotldi 0,0,3  ; rotldi 0,0,13\n\t"          \
+                     "rotldi 0,0,61 ; rotldi 0,0,51\n\t"
+
+#define VALGRIND_DO_CLIENT_REQUEST(                               \
+        _zzq_rlval, _zzq_default, _zzq_request,                   \
+        _zzq_arg1, _zzq_arg2, _zzq_arg3, _zzq_arg4, _zzq_arg5)    \
+                                                                  \
+  {          unsigned long long int  _zzq_args[7];                \
+    register unsigned long long int  _zzq_result;                 \
+    register unsigned long long int* _zzq_ptr;                    \
+    _zzq_args[0] = (unsigned long long int)(_zzq_request);        \
+    _zzq_args[1] = (unsigned long long int)(_zzq_arg1);           \
+    _zzq_args[2] = (unsigned long long int)(_zzq_arg2);           \
+    _zzq_args[3] = (unsigned long long int)(_zzq_arg3);           \
+    _zzq_args[4] = (unsigned long long int)(_zzq_arg4);           \
+    _zzq_args[5] = (unsigned long long int)(_zzq_arg5);           \
+    _zzq_args[6] = (unsigned long long int)(_zzq_default);        \
+    _zzq_ptr = _zzq_args;                                         \
+    __asm__ volatile("mr 4,%1\n\t"                                \
+                     "ld 3, 48(4)\n\t"                            \
+                     __SPECIAL_INSTRUCTION_PREAMBLE               \
+                     /* %R3 = client_request ( %R4 ) */           \
+                     "or 1,1,1\n\t"                               \
+                     "mr %0,3"                                    \
+                     : "=b" (_zzq_result)                         \
+                     : "b" (_zzq_ptr)                             \
+                     : "r3", "r4", "cc", "memory");               \
+    _zzq_rlval = _zzq_result;                                     \
+  }
+
+#define VALGRIND_GET_NR_CONTEXT(_zzq_rlval)                       \
+  { volatile OrigFn* _zzq_orig = &(_zzq_rlval);                   \
+    register unsigned long long int __addr;                       \
+    __asm__ volatile(__SPECIAL_INSTRUCTION_PREAMBLE               \
+                     /* %R3 = guest_NRADDR */                     \
+                     "or 2,2,2\n\t"                               \
+                     "mr %0,3"                                    \
+                     : "=b" (__addr)                              \
+                     :                                            \
+                     : "r3", "cc", "memory"                       \
+                    );                                            \
+    _zzq_orig->nraddr = __addr;                                   \
+    __asm__ volatile(__SPECIAL_INSTRUCTION_PREAMBLE               \
+                     /* %R3 = guest_NRADDR_GPR2 */                \
+                     "or 4,4,4\n\t"                               \
+                     "mr %0,3"                                    \
+                     : "=b" (__addr)                              \
+                     :                                            \
+                     : "r3", "cc", "memory"                       \
+                    );                                            \
+    _zzq_orig->r2 = __addr;                                       \
+  }
+
+#define VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R11                   \
+                     __SPECIAL_INSTRUCTION_PREAMBLE               \
+                     /* branch-and-link-to-noredir *%R11 */       \
+                     "or 3,3,3\n\t"
+
+#endif /* PLAT_ppc64_aix5 */
+
+/* Insert assembly code for other platforms here... */
+
+#endif /* NVALGRIND */
+
+
+/* ------------------------------------------------------------------ */
+/* PLATFORM SPECIFICS for FUNCTION WRAPPING.  This is all very        */
+/* ugly.  It's the least-worst tradeoff I can think of.               */
+/* ------------------------------------------------------------------ */
+
+/* This section defines magic (a.k.a. appalling-hack) macros for making
+   guaranteed-no-redirection calls, so as to get from function
+   wrappers to the functions they are wrapping.  The whole point is to
+   construct standard call sequences, but to do the call itself with a
+   special no-redirect call pseudo-instruction that the JIT
+   understands and handles specially.  This section is long and
+   repetitious, and I can't see a way to make it shorter.
+
+   The naming scheme is as follows:
+
+      CALL_FN_{W,v}_{v,W,WW,WWW,WWWW,5W,6W,7W,etc}
+
+   'W' stands for "word" and 'v' for "void".  Hence there are
+   different macros for calling arity 0, 1, 2, 3, 4, etc, functions,
+   and for each, the possibility of returning a word-typed result, or
+   no result.
+*/
+
+/* Use these to write the name of your wrapper.  NOTE: duplicates
+   VG_WRAP_FUNCTION_Z{U,Z} in pub_tool_redir.h. */
+
+#define I_WRAP_SONAME_FNNAME_ZU(soname,fnname)                    \
+   _vgwZU_##soname##_##fnname
+
+#define I_WRAP_SONAME_FNNAME_ZZ(soname,fnname)                    \
+   _vgwZZ_##soname##_##fnname
+
+/* Use this macro from within a wrapper function to collect the
+   context (address and possibly other info) of the original function.
+   Once you have that you can then use it in one of the CALL_FN_
+   macros.  The type of the argument _lval is OrigFn. */
+#define VALGRIND_GET_ORIG_FN(_lval)  VALGRIND_GET_NR_CONTEXT(_lval)
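+
+/* A sketch of how these pieces combine into a wrapper, along the
+   lines of the example in the Valgrind manual.  foo is a hypothetical
+   two-argument function exported by an object with no soname (hence
+   the "NONE" soname tag); adjust the soname and arity macros to suit.
+
+      int I_WRAP_SONAME_FNNAME_ZU(NONE, foo)(int x, int y)
+      {
+         int    result;
+         OrigFn fn;
+         VALGRIND_GET_ORIG_FN(fn);
+         CALL_FN_W_WW(result, fn, x, y);
+         return result;
+      }
+*/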
+
+/* Derivatives of the main macros below, for calling functions
+   returning void. */
+
+#define CALL_FN_v_v(fnptr)                                        \
+   do { volatile unsigned long _junk;                             \
+        CALL_FN_W_v(_junk,fnptr); } while (0)
+
+#define CALL_FN_v_W(fnptr, arg1)                                  \
+   do { volatile unsigned long _junk;                             \
+        CALL_FN_W_W(_junk,fnptr,arg1); } while (0)
+
+#define CALL_FN_v_WW(fnptr, arg1,arg2)                            \
+   do { volatile unsigned long _junk;                             \
+        CALL_FN_W_WW(_junk,fnptr,arg1,arg2); } while (0)
+
+#define CALL_FN_v_WWW(fnptr, arg1,arg2,arg3)                      \
+   do { volatile unsigned long _junk;                             \
+        CALL_FN_W_WWW(_junk,fnptr,arg1,arg2,arg3); } while (0)
+
+/* ------------------------- x86-linux ------------------------- */
+
+#if defined(PLAT_x86_linux)
+
+/* These regs are trashed by the hidden call.  No need to mention eax,
+   as gcc can already see that; mentioning it also causes gcc to bomb. */
+#define __CALLER_SAVED_REGS /*"eax"*/ "ecx", "edx"
+
+/* These CALL_FN_ macros assume that on x86-linux, sizeof(unsigned
+   long) == 4. */
+
+#define CALL_FN_W_v(lval, orig)                                   \
+   do {                                                           \
+      volatile OrigFn        _orig = (orig);                      \
+      volatile unsigned long _argvec[1];                          \
+      volatile unsigned long _res;                                \
+      _argvec[0] = (unsigned long)_orig.nraddr;                   \
+      __asm__ volatile(                                           \
+         "movl (%%eax), %%eax\n\t"  /* target->%eax */            \
+         VALGRIND_CALL_NOREDIR_EAX                                \
+         : /*out*/   "=a" (_res)                                  \
+         : /*in*/    "a" (&_argvec[0])                            \
+         : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS          \
+      );                                                          \
+      lval = (__typeof__(lval)) _res;                             \
+   } while (0)
+
+#define CALL_FN_W_W(lval, orig, arg1)                             \
+   do {                                                           \
+      volatile OrigFn        _orig = (orig);                      \
+      volatile unsigned long _argvec[2];                          \
+      volatile unsigned long _res;                                \
+      _argvec[0] = (unsigned long)_orig.nraddr;                   \
+      _argvec[1] = (unsigned long)(arg1);                         \
+      __asm__ volatile(                                           \
+         "pushl 4(%%eax)\n\t"                                     \
+         "movl (%%eax), %%eax\n\t"  /* target->%eax */            \
+         VALGRIND_CALL_NOREDIR_EAX                                \
+         "addl $4, %%esp\n"                                       \
+         : /*out*/   "=a" (_res)                                  \
+         : /*in*/    "a" (&_argvec[0])                            \
+         : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS          \
+      );                                                          \
+      lval = (__typeof__(lval)) _res;                             \
+   } while (0)
+
+#define CALL_FN_W_WW(lval, orig, arg1,arg2)                       \
+   do {                                                           \
+      volatile OrigFn        _orig = (orig);                      \
+      volatile unsigned long _argvec[3];                          \
+      volatile unsigned long _res;                                \
+      _argvec[0] = (unsigned long)_orig.nraddr;                   \
+      _argvec[1] = (unsigned long)(arg1);                         \
+      _argvec[2] = (unsigned long)(arg2);                         \
+      __asm__ volatile(                                           \
+         "pushl 8(%%eax)\n\t"                                     \
+         "pushl 4(%%eax)\n\t"                                     \
+         "movl (%%eax), %%eax\n\t"  /* target->%eax */            \
+         VALGRIND_CALL_NOREDIR_EAX                                \
+         "addl $8, %%esp\n"                                       \
+         : /*out*/   "=a" (_res)                                  \
+         : /*in*/    "a" (&_argvec[0])                            \
+         : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS          \
+      );                                                          \
+      lval = (__typeof__(lval)) _res;                             \
+   } while (0)
+
+#define CALL_FN_W_WWW(lval, orig, arg1,arg2,arg3)                 \
+   do {                                                           \
+      volatile OrigFn        _orig = (orig);                      \
+      volatile unsigned long _argvec[4];                          \
+      volatile unsigned long _res;                                \
+      _argvec[0] = (unsigned long)_orig.nraddr;                   \
+      _argvec[1] = (unsigned long)(arg1);                         \
+      _argvec[2] = (unsigned long)(arg2);                         \
+      _argvec[3] = (unsigned long)(arg3);                         \
+      __asm__ volatile(                                           \
+         "pushl 12(%%eax)\n\t"                                    \
+         "pushl 8(%%eax)\n\t"                                     \
+         "pushl 4(%%eax)\n\t"                                     \
+         "movl (%%eax), %%eax\n\t"  /* target->%eax */            \
+         VALGRIND_CALL_NOREDIR_EAX                                \
+         "addl $12, %%esp\n"                                      \
+         : /*out*/   "=a" (_res)                                  \
+         : /*in*/    "a" (&_argvec[0])                            \
+         : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS          \
+      );                                                          \
+      lval = (__typeof__(lval)) _res;                             \
+   } while (0)
+
+#define CALL_FN_W_WWWW(lval, orig, arg1,arg2,arg3,arg4)           \
+   do {                                                           \
+      volatile OrigFn        _orig = (orig);                      \
+      volatile unsigned long _argvec[5];                          \
+      volatile unsigned long _res;                                \
+      _argvec[0] = (unsigned long)_orig.nraddr;                   \
+      _argvec[1] = (unsigned long)(arg1);                         \
+      _argvec[2] = (unsigned long)(arg2);                         \
+      _argvec[3] = (unsigned long)(arg3);                         \
+      _argvec[4] = (unsigned long)(arg4);                         \
+      __asm__ volatile(                                           \
+         "pushl 16(%%eax)\n\t"                                    \
+         "pushl 12(%%eax)\n\t"                                    \
+         "pushl 8(%%eax)\n\t"                                     \
+         "pushl 4(%%eax)\n\t"                                     \
+         "movl (%%eax), %%eax\n\t"  /* target->%eax */            \
+         VALGRIND_CALL_NOREDIR_EAX                                \
+         "addl $16, %%esp\n"                                      \
+         : /*out*/   "=a" (_res)                                  \
+         : /*in*/    "a" (&_argvec[0])                            \
+         : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS          \
+      );                                                          \
+      lval = (__typeof__(lval)) _res;                             \
+   } while (0)
+
+#define CALL_FN_W_5W(lval, orig, arg1,arg2,arg3,arg4,arg5)        \
+   do {                                                           \
+      volatile OrigFn        _orig = (orig);                      \
+      volatile unsigned long _argvec[6];                          \
+      volatile unsigned long _res;                                \
+      _argvec[0] = (unsigned long)_orig.nraddr;                   \
+      _argvec[1] = (unsigned long)(arg1);                         \
+      _argvec[2] = (unsigned long)(arg2);                         \
+      _argvec[3] = (unsigned long)(arg3);                         \
+      _argvec[4] = (unsigned long)(arg4);                         \
+      _argvec[5] = (unsigned long)(arg5);                         \
+      __asm__ volatile(                                           \
+         "pushl 20(%%eax)\n\t"                                    \
+         "pushl 16(%%eax)\n\t"                                    \
+         "pushl 12(%%eax)\n\t"                                    \
+         "pushl 8(%%eax)\n\t"                                     \
+         "pushl 4(%%eax)\n\t"                                     \
+         "movl (%%eax), %%eax\n\t"  /* target->%eax */            \
+         VALGRIND_CALL_NOREDIR_EAX                                \
+         "addl $20, %%esp\n"                                      \
+         : /*out*/   "=a" (_res)                                  \
+         : /*in*/    "a" (&_argvec[0])                            \
+         : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS          \
+      );                                                          \
+      lval = (__typeof__(lval)) _res;                             \
+   } while (0)
+
+#define CALL_FN_W_6W(lval, orig, arg1,arg2,arg3,arg4,arg5,arg6)   \
+   do {                                                           \
+      volatile OrigFn        _orig = (orig);                      \
+      volatile unsigned long _argvec[7];                          \
+      volatile unsigned long _res;                                \
+      _argvec[0] = (unsigned long)_orig.nraddr;                   \
+      _argvec[1] = (unsigned long)(arg1);                         \
+      _argvec[2] = (unsigned long)(arg2);                         \
+      _argvec[3] = (unsigned long)(arg3);                         \
+      _argvec[4] = (unsigned long)(arg4);                         \
+      _argvec[5] = (unsigned long)(arg5);                         \
+      _argvec[6] = (unsigned long)(arg6);                         \
+      __asm__ volatile(                                           \
+         "pushl 24(%%eax)\n\t"                                    \
+         "pushl 20(%%eax)\n\t"                                    \
+         "pushl 16(%%eax)\n\t"                                    \
+         "pushl 12(%%eax)\n\t"                                    \
+         "pushl 8(%%eax)\n\t"                                     \
+         "pushl 4(%%eax)\n\t"                                     \
+         "movl (%%eax), %%eax\n\t"  /* target->%eax */            \
+         VALGRIND_CALL_NOREDIR_EAX                                \
+         "addl $24, %%esp\n"                                      \
+         : /*out*/   "=a" (_res)                                  \
+         : /*in*/    "a" (&_argvec[0])                            \
+         : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS          \
+      );                                                          \
+      lval = (__typeof__(lval)) _res;                             \
+   } while (0)
+
+#define CALL_FN_W_7W(lval, orig, arg1,arg2,arg3,arg4,arg5,arg6,   \
+                                 arg7)                            \
+   do {                                                           \
+      volatile OrigFn        _orig = (orig);                      \
+      volatile unsigned long _argvec[8];                          \
+      volatile unsigned long _res;                                \
+      _argvec[0] = (unsigned long)_orig.nraddr;                   \
+      _argvec[1] = (unsigned long)(arg1);                         \
+      _argvec[2] = (unsigned long)(arg2);                         \
+      _argvec[3] = (unsigned long)(arg3);                         \
+      _argvec[4] = (unsigned long)(arg4);                         \
+      _argvec[5] = (unsigned long)(arg5);                         \
+      _argvec[6] = (unsigned long)(arg6);                         \
+      _argvec[7] = (unsigned long)(arg7);                         \
+      __asm__ volatile(                                           \
+         "pushl 28(%%eax)\n\t"                                    \
+         "pushl 24(%%eax)\n\t"                                    \
+         "pushl 20(%%eax)\n\t"                                    \
+         "pushl 16(%%eax)\n\t"                                    \
+         "pushl 12(%%eax)\n\t"                                    \
+         "pushl 8(%%eax)\n\t"                                     \
+         "pushl 4(%%eax)\n\t"                                     \
+         "movl (%%eax), %%eax\n\t"  /* target->%eax */            \
+         VALGRIND_CALL_NOREDIR_EAX                                \
+         "addl $28, %%esp\n"                                      \
+         : /*out*/   "=a" (_res)                                  \
+         : /*in*/    "a" (&_argvec[0])                            \
+         : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS          \
+      );                                                          \
+      lval = (__typeof__(lval)) _res;                             \
+   } while (0)
+
+#define CALL_FN_W_8W(lval, orig, arg1,arg2,arg3,arg4,arg5,arg6,   \
+                                 arg7,arg8)                       \
+   do {                                                           \
+      volatile OrigFn        _orig = (orig);                      \
+      volatile unsigned long _argvec[9];                          \
+      volatile unsigned long _res;                                \
+      _argvec[0] = (unsigned long)_orig.nraddr;                   \
+      _argvec[1] = (unsigned long)(arg1);                         \
+      _argvec[2] = (unsigned long)(arg2);                         \
+      _argvec[3] = (unsigned long)(arg3);                         \
+      _argvec[4] = (unsigned long)(arg4);                         \
+      _argvec[5] = (unsigned long)(arg5);                         \
+      _argvec[6] = (unsigned long)(arg6);                         \
+      _argvec[7] = (unsigned long)(arg7);                         \
+      _argvec[8] = (unsigned long)(arg8);                         \
+      __asm__ volatile(                                           \
+         "pushl 32(%%eax)\n\t"                                    \
+         "pushl 28(%%eax)\n\t"                                    \
+         "pushl 24(%%eax)\n\t"                                    \
+         "pushl 20(%%eax)\n\t"                                    \
+         "pushl 16(%%eax)\n\t"                                    \
+         "pushl 12(%%eax)\n\t"                                    \
+         "pushl 8(%%eax)\n\t"                                     \
+         "pushl 4(%%eax)\n\t"                                     \
+         "movl (%%eax), %%eax\n\t"  /* target->%eax */            \
+         VALGRIND_CALL_NOREDIR_EAX                                \
+         "addl $32, %%esp\n"                                      \
+         : /*out*/   "=a" (_res)                                  \
+         : /*in*/    "a" (&_argvec[0])                            \
+         : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS          \
+      );                                                          \
+      lval = (__typeof__(lval)) _res;                             \
+   } while (0)
+
+#define CALL_FN_W_9W(lval, orig, arg1,arg2,arg3,arg4,arg5,arg6,   \
+                                 arg7,arg8,arg9)                  \
+   do {                                                           \
+      volatile OrigFn        _orig = (orig);                      \
+      volatile unsigned long _argvec[10];                         \
+      volatile unsigned long _res;                                \
+      _argvec[0] = (unsigned long)_orig.nraddr;                   \
+      _argvec[1] = (unsigned long)(arg1);                         \
+      _argvec[2] = (unsigned long)(arg2);                         \
+      _argvec[3] = (unsigned long)(arg3);                         \
+      _argvec[4] = (unsigned long)(arg4);                         \
+      _argvec[5] = (unsigned long)(arg5);                         \
+      _argvec[6] = (unsigned long)(arg6);                         \
+      _argvec[7] = (unsigned long)(arg7);                         \
+      _argvec[8] = (unsigned long)(arg8);                         \
+      _argvec[9] = (unsigned long)(arg9);                         \
+      __asm__ volatile(                                           \
+         "pushl 36(%%eax)\n\t"                                    \
+         "pushl 32(%%eax)\n\t"                                    \
+         "pushl 28(%%eax)\n\t"                                    \
+         "pushl 24(%%eax)\n\t"                                    \
+         "pushl 20(%%eax)\n\t"                                    \
+         "pushl 16(%%eax)\n\t"                                    \
+         "pushl 12(%%eax)\n\t"                                    \
+         "pushl 8(%%eax)\n\t"                                     \
+         "pushl 4(%%eax)\n\t"                                     \
+         "movl (%%eax), %%eax\n\t"  /* target->%eax */            \
+         VALGRIND_CALL_NOREDIR_EAX                                \
+         "addl $36, %%esp\n"                                      \
+         : /*out*/   "=a" (_res)                                  \
+         : /*in*/    "a" (&_argvec[0])                            \
+         : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS          \
+      );                                                          \
+      lval = (__typeof__(lval)) _res;                             \
+   } while (0)
+
+#define CALL_FN_W_10W(lval, orig, arg1,arg2,arg3,arg4,arg5,arg6,  \
+                                  arg7,arg8,arg9,arg10)           \
+   do {                                                           \
+      volatile OrigFn        _orig = (orig);                      \
+      volatile unsigned long _argvec[11];                         \
+      volatile unsigned long _res;                                \
+      _argvec[0] = (unsigned long)_orig.nraddr;                   \
+      _argvec[1] = (unsigned long)(arg1);                         \
+      _argvec[2] = (unsigned long)(arg2);                         \
+      _argvec[3] = (unsigned long)(arg3);                         \
+      _argvec[4] = (unsigned long)(arg4);                         \
+      _argvec[5] = (unsigned long)(arg5);                         \
+      _argvec[6] = (unsigned long)(arg6);                         \
+      _argvec[7] = (unsigned long)(arg7);                         \
+      _argvec[8] = (unsigned long)(arg8);                         \
+      _argvec[9] = (unsigned long)(arg9);                         \
+      _argvec[10] = (unsigned long)(arg10);                       \
+      __asm__ volatile(                                           \
+         "pushl 40(%%eax)\n\t"                                    \
+         "pushl 36(%%eax)\n\t"                                    \
+         "pushl 32(%%eax)\n\t"                                    \
+         "pushl 28(%%eax)\n\t"                                    \
+         "pushl 24(%%eax)\n\t"                                    \
+         "pushl 20(%%eax)\n\t"                                    \
+         "pushl 16(%%eax)\n\t"                                    \
+         "pushl 12(%%eax)\n\t"                                    \
+         "pushl 8(%%eax)\n\t"                                     \
+         "pushl 4(%%eax)\n\t"                                     \
+         "movl (%%eax), %%eax\n\t"  /* target->%eax */            \
+         VALGRIND_CALL_NOREDIR_EAX                                \
+         "addl $40, %%esp\n"                                      \
+         : /*out*/   "=a" (_res)                                  \
+         : /*in*/    "a" (&_argvec[0])                            \
+         : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS          \
+      );                                                          \
+      lval = (__typeof__(lval)) _res;                             \
+   } while (0)
+
+#define CALL_FN_W_11W(lval, orig, arg1,arg2,arg3,arg4,arg5,       \
+                                  arg6,arg7,arg8,arg9,arg10,      \
+                                  arg11)                          \
+   do {                                                           \
+      volatile OrigFn        _orig = (orig);                      \
+      volatile unsigned long _argvec[12];                         \
+      volatile unsigned long _res;                                \
+      _argvec[0] = (unsigned long)_orig.nraddr;                   \
+      _argvec[1] = (unsigned long)(arg1);                         \
+      _argvec[2] = (unsigned long)(arg2);                         \
+      _argvec[3] = (unsigned long)(arg3);                         \
+      _argvec[4] = (unsigned long)(arg4);                         \
+      _argvec[5] = (unsigned long)(arg5);                         \
+      _argvec[6] = (unsigned long)(arg6);                         \
+      _argvec[7] = (unsigned long)(arg7);                         \
+      _argvec[8] = (unsigned long)(arg8);                         \
+      _argvec[9] = (unsigned long)(arg9);                         \
+      _argvec[10] = (unsigned long)(arg10);                       \
+      _argvec[11] = (unsigned long)(arg11);                       \
+      __asm__ volatile(                                           \
+         "pushl 44(%%eax)\n\t"                                    \
+         "pushl 40(%%eax)\n\t"                                    \
+         "pushl 36(%%eax)\n\t"                                    \
+         "pushl 32(%%eax)\n\t"                                    \
+         "pushl 28(%%eax)\n\t"                                    \
+         "pushl 24(%%eax)\n\t"                                    \
+         "pushl 20(%%eax)\n\t"                                    \
+         "pushl 16(%%eax)\n\t"                                    \
+         "pushl 12(%%eax)\n\t"                                    \
+         "pushl 8(%%eax)\n\t"                                     \
+         "pushl 4(%%eax)\n\t"                                     \
+         "movl (%%eax), %%eax\n\t"  /* target->%eax */            \
+         VALGRIND_CALL_NOREDIR_EAX                                \
+         "addl $44, %%esp\n"                                      \
+         : /*out*/   "=a" (_res)                                  \
+         : /*in*/    "a" (&_argvec[0])                            \
+         : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS          \
+      );                                                          \
+      lval = (__typeof__(lval)) _res;                             \
+   } while (0)
+
+#define CALL_FN_W_12W(lval, orig, arg1,arg2,arg3,arg4,arg5,       \
+                                  arg6,arg7,arg8,arg9,arg10,      \
+                                  arg11,arg12)                    \
+   do {                                                           \
+      volatile OrigFn        _orig = (orig);                      \
+      volatile unsigned long _argvec[13];                         \
+      volatile unsigned long _res;                                \
+      _argvec[0] = (unsigned long)_orig.nraddr;                   \
+      _argvec[1] = (unsigned long)(arg1);                         \
+      _argvec[2] = (unsigned long)(arg2);                         \
+      _argvec[3] = (unsigned long)(arg3);                         \
+      _argvec[4] = (unsigned long)(arg4);                         \
+      _argvec[5] = (unsigned long)(arg5);                         \
+      _argvec[6] = (unsigned long)(arg6);                         \
+      _argvec[7] = (unsigned long)(arg7);                         \
+      _argvec[8] = (unsigned long)(arg8);                         \
+      _argvec[9] = (unsigned long)(arg9);                         \
+      _argvec[10] = (unsigned long)(arg10);                       \
+      _argvec[11] = (unsigned long)(arg11);                       \
+      _argvec[12] = (unsigned long)(arg12);                       \
+      __asm__ volatile(                                           \
+         "pushl 48(%%eax)\n\t"                                    \
+         "pushl 44(%%eax)\n\t"                                    \
+         "pushl 40(%%eax)\n\t"                                    \
+         "pushl 36(%%eax)\n\t"                                    \
+         "pushl 32(%%eax)\n\t"                                    \
+         "pushl 28(%%eax)\n\t"                                    \
+         "pushl 24(%%eax)\n\t"                                    \
+         "pushl 20(%%eax)\n\t"                                    \
+         "pushl 16(%%eax)\n\t"                                    \
+         "pushl 12(%%eax)\n\t"                                    \
+         "pushl 8(%%eax)\n\t"                                     \
+         "pushl 4(%%eax)\n\t"                                     \
+         "movl (%%eax), %%eax\n\t"  /* target->%eax */            \
+         VALGRIND_CALL_NOREDIR_EAX                                \
+         "addl $48, %%esp\n"                                      \
+         : /*out*/   "=a" (_res)                                  \
+         : /*in*/    "a" (&_argvec[0])                            \
+         : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS          \
+      );                                                          \
+      lval = (__typeof__(lval)) _res;                             \
+   } while (0)
+
+#endif /* PLAT_x86_linux */
+
+/* ------------------------ amd64-linux ------------------------ */
+
+#if defined(PLAT_amd64_linux)
+
+/* ARGREGS: rdi rsi rdx rcx r8 r9 (the rest on stack in R-to-L order) */
+
+/* These regs are trashed by the hidden call. */
+#define __CALLER_SAVED_REGS /*"rax",*/ "rcx", "rdx", "rsi",       \
+                            "rdi", "r8", "r9", "r10", "r11"
+
+/* These CALL_FN_ macros assume that on amd64-linux, sizeof(unsigned
+   long) == 8. */
+
+/* NB 9 Sept 07.  There is a nasty kludge here in all these CALL_FN_
+   macros.  In order not to trash the stack redzone, we need to drop
+   %rsp by 128 before the hidden call, and restore afterwards.  The
+   nastiness is that it is only by luck that the stack still appears
+   to be unwindable during the hidden call, since the behaviour of
+   any routine using this macro then does not match what the CFI data
+   says.  Sigh.
+
+   Why is this important?  Imagine that a wrapper has a stack
+   allocated local, and passes to the hidden call, a pointer to it.
+   Because gcc does not know about the hidden call, it may allocate
+   that local in the redzone.  Unfortunately the hidden call may then
+   trash it before it comes to use it.  So we must step clear of the
+   redzone, for the duration of the hidden call, to make it safe.
+
+   Probably the same problem afflicts the other redzone-style ABIs too
+   (ppc64-linux, ppc32-aix5, ppc64-aix5); but for those, the stack is
+   self-describing (none of this CFI nonsense), so at least messing
+   with the stack pointer doesn't risk making the stack non-unwindable. */
+
+#define CALL_FN_W_v(lval, orig)                                   \
+   do {                                                           \
+      volatile OrigFn        _orig = (orig);                      \
+      volatile unsigned long _argvec[1];                          \
+      volatile unsigned long _res;                                \
+      _argvec[0] = (unsigned long)_orig.nraddr;                   \
+      __asm__ volatile(                                           \
+         "subq $128,%%rsp\n\t"                                    \
+         "movq (%%rax), %%rax\n\t"  /* target->%rax */            \
+         VALGRIND_CALL_NOREDIR_RAX                                \
+         "addq $128,%%rsp\n\t"                                    \
+         : /*out*/   "=a" (_res)                                  \
+         : /*in*/    "a" (&_argvec[0])                            \
+         : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS          \
+      );                                                          \
+      lval = (__typeof__(lval)) _res;                             \
+   } while (0)
+
+#define CALL_FN_W_W(lval, orig, arg1)                             \
+   do {                                                           \
+      volatile OrigFn        _orig = (orig);                      \
+      volatile unsigned long _argvec[2];                          \
+      volatile unsigned long _res;                                \
+      _argvec[0] = (unsigned long)_orig.nraddr;                   \
+      _argvec[1] = (unsigned long)(arg1);                         \
+      __asm__ volatile(                                           \
+         "subq $128,%%rsp\n\t"                                    \
+         "movq 8(%%rax), %%rdi\n\t"                               \
+         "movq (%%rax), %%rax\n\t"  /* target->%rax */            \
+         VALGRIND_CALL_NOREDIR_RAX                                \
+         "addq $128,%%rsp\n\t"                                    \
+         : /*out*/   "=a" (_res)                                  \
+         : /*in*/    "a" (&_argvec[0])                            \
+         : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS          \
+      );                                                          \
+      lval = (__typeof__(lval)) _res;                             \
+   } while (0)
+
+#define CALL_FN_W_WW(lval, orig, arg1,arg2)                       \
+   do {                                                           \
+      volatile OrigFn        _orig = (orig);                      \
+      volatile unsigned long _argvec[3];                          \
+      volatile unsigned long _res;                                \
+      _argvec[0] = (unsigned long)_orig.nraddr;                   \
+      _argvec[1] = (unsigned long)(arg1);                         \
+      _argvec[2] = (unsigned long)(arg2);                         \
+      __asm__ volatile(                                           \
+         "subq $128,%%rsp\n\t"                                    \
+         "movq 16(%%rax), %%rsi\n\t"                              \
+         "movq 8(%%rax), %%rdi\n\t"                               \
+         "movq (%%rax), %%rax\n\t"  /* target->%rax */            \
+         VALGRIND_CALL_NOREDIR_RAX                                \
+         "addq $128,%%rsp\n\t"                                    \
+         : /*out*/   "=a" (_res)                                  \
+         : /*in*/    "a" (&_argvec[0])                            \
+         : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS          \
+      );                                                          \
+      lval = (__typeof__(lval)) _res;                             \
+   } while (0)
+
+#define CALL_FN_W_WWW(lval, orig, arg1,arg2,arg3)                 \
+   do {                                                           \
+      volatile OrigFn        _orig = (orig);                      \
+      volatile unsigned long _argvec[4];                          \
+      volatile unsigned long _res;                                \
+      _argvec[0] = (unsigned long)_orig.nraddr;                   \
+      _argvec[1] = (unsigned long)(arg1);                         \
+      _argvec[2] = (unsigned long)(arg2);                         \
+      _argvec[3] = (unsigned long)(arg3);                         \
+      __asm__ volatile(                                           \
+         "subq $128,%%rsp\n\t"                                    \
+         "movq 24(%%rax), %%rdx\n\t"                              \
+         "movq 16(%%rax), %%rsi\n\t"                              \
+         "movq 8(%%rax), %%rdi\n\t"                               \
+         "movq (%%rax), %%rax\n\t"  /* target->%rax */            \
+         VALGRIND_CALL_NOREDIR_RAX                                \
+         "addq $128,%%rsp\n\t"                                    \
+         : /*out*/   "=a" (_res)                                  \
+         : /*in*/    "a" (&_argvec[0])                            \
+         : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS          \
+      );                                                          \
+      lval = (__typeof__(lval)) _res;                             \
+   } while (0)
+
+#define CALL_FN_W_WWWW(lval, orig, arg1,arg2,arg3,arg4)           \
+   do {                                                           \
+      volatile OrigFn        _orig = (orig);                      \
+      volatile unsigned long _argvec[5];                          \
+      volatile unsigned long _res;                                \
+      _argvec[0] = (unsigned long)_orig.nraddr;                   \
+      _argvec[1] = (unsigned long)(arg1);                         \
+      _argvec[2] = (unsigned long)(arg2);                         \
+      _argvec[3] = (unsigned long)(arg3);                         \
+      _argvec[4] = (unsigned long)(arg4);                         \
+      __asm__ volatile(                                           \
+         "subq $128,%%rsp\n\t"                                    \
+         "movq 32(%%rax), %%rcx\n\t"                              \
+         "movq 24(%%rax), %%rdx\n\t"                              \
+         "movq 16(%%rax), %%rsi\n\t"                              \
+         "movq 8(%%rax), %%rdi\n\t"                               \
+         "movq (%%rax), %%rax\n\t"  /* target->%rax */            \
+         VALGRIND_CALL_NOREDIR_RAX                                \
+         "addq $128,%%rsp\n\t"                                    \
+         : /*out*/   "=a" (_res)                                  \
+         : /*in*/    "a" (&_argvec[0])                            \
+         : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS          \
+      );                                                          \
+      lval = (__typeof__(lval)) _res;                             \
+   } while (0)
+
+#define CALL_FN_W_5W(lval, orig, arg1,arg2,arg3,arg4,arg5)        \
+   do {                                                           \
+      volatile OrigFn        _orig = (orig);                      \
+      volatile unsigned long _argvec[6];                          \
+      volatile unsigned long _res;                                \
+      _argvec[0] = (unsigned long)_orig.nraddr;                   \
+      _argvec[1] = (unsigned long)(arg1);                         \
+      _argvec[2] = (unsigned long)(arg2);                         \
+      _argvec[3] = (unsigned long)(arg3);                         \
+      _argvec[4] = (unsigned long)(arg4);                         \
+      _argvec[5] = (unsigned long)(arg5);                         \
+      __asm__ volatile(                                           \
+         "subq $128,%%rsp\n\t"                                    \
+         "movq 40(%%rax), %%r8\n\t"                               \
+         "movq 32(%%rax), %%rcx\n\t"                              \
+         "movq 24(%%rax), %%rdx\n\t"                              \
+         "movq 16(%%rax), %%rsi\n\t"                              \
+         "movq 8(%%rax), %%rdi\n\t"                               \
+         "movq (%%rax), %%rax\n\t"  /* target->%rax */            \
+         VALGRIND_CALL_NOREDIR_RAX                                \
+         "addq $128,%%rsp\n\t"                                    \
+         : /*out*/   "=a" (_res)                                  \
+         : /*in*/    "a" (&_argvec[0])                            \
+         : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS          \
+      );                                                          \
+      lval = (__typeof__(lval)) _res;                             \
+   } while (0)
+
+#define CALL_FN_W_6W(lval, orig, arg1,arg2,arg3,arg4,arg5,arg6)   \
+   do {                                                           \
+      volatile OrigFn        _orig = (orig);                      \
+      volatile unsigned long _argvec[7];                          \
+      volatile unsigned long _res;                                \
+      _argvec[0] = (unsigned long)_orig.nraddr;                   \
+      _argvec[1] = (unsigned long)(arg1);                         \
+      _argvec[2] = (unsigned long)(arg2);                         \
+      _argvec[3] = (unsigned long)(arg3);                         \
+      _argvec[4] = (unsigned long)(arg4);                         \
+      _argvec[5] = (unsigned long)(arg5);                         \
+      _argvec[6] = (unsigned long)(arg6);                         \
+      __asm__ volatile(                                           \
+         "subq $128,%%rsp\n\t"                                    \
+         "movq 48(%%rax), %%r9\n\t"                               \
+         "movq 40(%%rax), %%r8\n\t"                               \
+         "movq 32(%%rax), %%rcx\n\t"                              \
+         "movq 24(%%rax), %%rdx\n\t"                              \
+         "movq 16(%%rax), %%rsi\n\t"                              \
+         "movq 8(%%rax), %%rdi\n\t"                               \
+         "movq (%%rax), %%rax\n\t"  /* target->%rax */            \
+         "addq $128,%%rsp\n\t"                                    \
+         VALGRIND_CALL_NOREDIR_RAX                                \
+         : /*out*/   "=a" (_res)                                  \
+         : /*in*/    "a" (&_argvec[0])                            \
+         : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS          \
+      );                                                          \
+      lval = (__typeof__(lval)) _res;                             \
+   } while (0)
+
+#define CALL_FN_W_7W(lval, orig, arg1,arg2,arg3,arg4,arg5,arg6,   \
+                                 arg7)                            \
+   do {                                                           \
+      volatile OrigFn        _orig = (orig);                      \
+      volatile unsigned long _argvec[8];                          \
+      volatile unsigned long _res;                                \
+      _argvec[0] = (unsigned long)_orig.nraddr;                   \
+      _argvec[1] = (unsigned long)(arg1);                         \
+      _argvec[2] = (unsigned long)(arg2);                         \
+      _argvec[3] = (unsigned long)(arg3);                         \
+      _argvec[4] = (unsigned long)(arg4);                         \
+      _argvec[5] = (unsigned long)(arg5);                         \
+      _argvec[6] = (unsigned long)(arg6);                         \
+      _argvec[7] = (unsigned long)(arg7);                         \
+      __asm__ volatile(                                           \
+         "subq $128,%%rsp\n\t"                                    \
+         "pushq 56(%%rax)\n\t"                                    \
+         "movq 48(%%rax), %%r9\n\t"                               \
+         "movq 40(%%rax), %%r8\n\t"                               \
+         "movq 32(%%rax), %%rcx\n\t"                              \
+         "movq 24(%%rax), %%rdx\n\t"                              \
+         "movq 16(%%rax), %%rsi\n\t"                              \
+         "movq 8(%%rax), %%rdi\n\t"                               \
+         "movq (%%rax), %%rax\n\t"  /* target->%rax */            \
+         VALGRIND_CALL_NOREDIR_RAX                                \
+         "addq $8, %%rsp\n"                                       \
+         "addq $128,%%rsp\n\t"                                    \
+         : /*out*/   "=a" (_res)                                  \
+         : /*in*/    "a" (&_argvec[0])                            \
+         : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS          \
+      );                                                          \
+      lval = (__typeof__(lval)) _res;                             \
+   } while (0)
+
+#define CALL_FN_W_8W(lval, orig, arg1,arg2,arg3,arg4,arg5,arg6,   \
+                                 arg7,arg8)                       \
+   do {                                                           \
+      volatile OrigFn        _orig = (orig);                      \
+      volatile unsigned long _argvec[9];                          \
+      volatile unsigned long _res;                                \
+      _argvec[0] = (unsigned long)_orig.nraddr;                   \
+      _argvec[1] = (unsigned long)(arg1);                         \
+      _argvec[2] = (unsigned long)(arg2);                         \
+      _argvec[3] = (unsigned long)(arg3);                         \
+      _argvec[4] = (unsigned long)(arg4);                         \
+      _argvec[5] = (unsigned long)(arg5);                         \
+      _argvec[6] = (unsigned long)(arg6);                         \
+      _argvec[7] = (unsigned long)(arg7);                         \
+      _argvec[8] = (unsigned long)(arg8);                         \
+      __asm__ volatile(                                           \
+         "subq $128,%%rsp\n\t"                                    \
+         "pushq 64(%%rax)\n\t"                                    \
+         "pushq 56(%%rax)\n\t"                                    \
+         "movq 48(%%rax), %%r9\n\t"                               \
+         "movq 40(%%rax), %%r8\n\t"                               \
+         "movq 32(%%rax), %%rcx\n\t"                              \
+         "movq 24(%%rax), %%rdx\n\t"                              \
+         "movq 16(%%rax), %%rsi\n\t"                              \
+         "movq 8(%%rax), %%rdi\n\t"                               \
+         "movq (%%rax), %%rax\n\t"  /* target->%rax */            \
+         VALGRIND_CALL_NOREDIR_RAX                                \
+         "addq $16, %%rsp\n"                                      \
+         "addq $128,%%rsp\n\t"                                    \
+         : /*out*/   "=a" (_res)                                  \
+         : /*in*/    "a" (&_argvec[0])                            \
+         : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS          \
+      );                                                          \
+      lval = (__typeof__(lval)) _res;                             \
+   } while (0)
+
+#define CALL_FN_W_9W(lval, orig, arg1,arg2,arg3,arg4,arg5,arg6,   \
+                                 arg7,arg8,arg9)                  \
+   do {                                                           \
+      volatile OrigFn        _orig = (orig);                      \
+      volatile unsigned long _argvec[10];                         \
+      volatile unsigned long _res;                                \
+      _argvec[0] = (unsigned long)_orig.nraddr;                   \
+      _argvec[1] = (unsigned long)(arg1);                         \
+      _argvec[2] = (unsigned long)(arg2);                         \
+      _argvec[3] = (unsigned long)(arg3);                         \
+      _argvec[4] = (unsigned long)(arg4);                         \
+      _argvec[5] = (unsigned long)(arg5);                         \
+      _argvec[6] = (unsigned long)(arg6);                         \
+      _argvec[7] = (unsigned long)(arg7);                         \
+      _argvec[8] = (unsigned long)(arg8);                         \
+      _argvec[9] = (unsigned long)(arg9);                         \
+      __asm__ volatile(                                           \
+         "subq $128,%%rsp\n\t"                                    \
+         "pushq 72(%%rax)\n\t"                                    \
+         "pushq 64(%%rax)\n\t"                                    \
+         "pushq 56(%%rax)\n\t"                                    \
+         "movq 48(%%rax), %%r9\n\t"                               \
+         "movq 40(%%rax), %%r8\n\t"                               \
+         "movq 32(%%rax), %%rcx\n\t"                              \
+         "movq 24(%%rax), %%rdx\n\t"                              \
+         "movq 16(%%rax), %%rsi\n\t"                              \
+         "movq 8(%%rax), %%rdi\n\t"                               \
+         "movq (%%rax), %%rax\n\t"  /* target->%rax */            \
+         VALGRIND_CALL_NOREDIR_RAX                                \
+         "addq $24, %%rsp\n"                                      \
+         "addq $128,%%rsp\n\t"                                    \
+         : /*out*/   "=a" (_res)                                  \
+         : /*in*/    "a" (&_argvec[0])                            \
+         : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS          \
+      );                                                          \
+      lval = (__typeof__(lval)) _res;                             \
+   } while (0)
+
+#define CALL_FN_W_10W(lval, orig, arg1,arg2,arg3,arg4,arg5,arg6,  \
+                                  arg7,arg8,arg9,arg10)           \
+   do {                                                           \
+      volatile OrigFn        _orig = (orig);                      \
+      volatile unsigned long _argvec[11];                         \
+      volatile unsigned long _res;                                \
+      _argvec[0] = (unsigned long)_orig.nraddr;                   \
+      _argvec[1] = (unsigned long)(arg1);                         \
+      _argvec[2] = (unsigned long)(arg2);                         \
+      _argvec[3] = (unsigned long)(arg3);                         \
+      _argvec[4] = (unsigned long)(arg4);                         \
+      _argvec[5] = (unsigned long)(arg5);                         \
+      _argvec[6] = (unsigned long)(arg6);                         \
+      _argvec[7] = (unsigned long)(arg7);                         \
+      _argvec[8] = (unsigned long)(arg8);                         \
+      _argvec[9] = (unsigned long)(arg9);                         \
+      _argvec[10] = (unsigned long)(arg10);                       \
+      __asm__ volatile(                                           \
+         "subq $128,%%rsp\n\t"                                    \
+         "pushq 80(%%rax)\n\t"                                    \
+         "pushq 72(%%rax)\n\t"                                    \
+         "pushq 64(%%rax)\n\t"                                    \
+         "pushq 56(%%rax)\n\t"                                    \
+         "movq 48(%%rax), %%r9\n\t"                               \
+         "movq 40(%%rax), %%r8\n\t"                               \
+         "movq 32(%%rax), %%rcx\n\t"                              \
+         "movq 24(%%rax), %%rdx\n\t"                              \
+         "movq 16(%%rax), %%rsi\n\t"                              \
+         "movq 8(%%rax), %%rdi\n\t"                               \
+         "movq (%%rax), %%rax\n\t"  /* target->%rax */            \
+         VALGRIND_CALL_NOREDIR_RAX                                \
+         "addq $32, %%rsp\n"                                      \
+         "addq $128,%%rsp\n\t"                                    \
+         : /*out*/   "=a" (_res)                                  \
+         : /*in*/    "a" (&_argvec[0])                            \
+         : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS          \
+      );                                                          \
+      lval = (__typeof__(lval)) _res;                             \
+   } while (0)
+
+#define CALL_FN_W_11W(lval, orig, arg1,arg2,arg3,arg4,arg5,arg6,  \
+                                  arg7,arg8,arg9,arg10,arg11)     \
+   do {                                                           \
+      volatile OrigFn        _orig = (orig);                      \
+      volatile unsigned long _argvec[12];                         \
+      volatile unsigned long _res;                                \
+      _argvec[0] = (unsigned long)_orig.nraddr;                   \
+      _argvec[1] = (unsigned long)(arg1);                         \
+      _argvec[2] = (unsigned long)(arg2);                         \
+      _argvec[3] = (unsigned long)(arg3);                         \
+      _argvec[4] = (unsigned long)(arg4);                         \
+      _argvec[5] = (unsigned long)(arg5);                         \
+      _argvec[6] = (unsigned long)(arg6);                         \
+      _argvec[7] = (unsigned long)(arg7);                         \
+      _argvec[8] = (unsigned long)(arg8);                         \
+      _argvec[9] = (unsigned long)(arg9);                         \
+      _argvec[10] = (unsigned long)(arg10);                       \
+      _argvec[11] = (unsigned long)(arg11);                       \
+      __asm__ volatile(                                           \
+         "subq $128,%%rsp\n\t"                                    \
+         "pushq 88(%%rax)\n\t"                                    \
+         "pushq 80(%%rax)\n\t"                                    \
+         "pushq 72(%%rax)\n\t"                                    \
+         "pushq 64(%%rax)\n\t"                                    \
+         "pushq 56(%%rax)\n\t"                                    \
+         "movq 48(%%rax), %%r9\n\t"                               \
+         "movq 40(%%rax), %%r8\n\t"                               \
+         "movq 32(%%rax), %%rcx\n\t"                              \
+         "movq 24(%%rax), %%rdx\n\t"                              \
+         "movq 16(%%rax), %%rsi\n\t"                              \
+         "movq 8(%%rax), %%rdi\n\t"                               \
+         "movq (%%rax), %%rax\n\t"  /* target->%rax */            \
+         VALGRIND_CALL_NOREDIR_RAX                                \
+         "addq $40, %%rsp\n"                                      \
+         "addq $128,%%rsp\n\t"                                    \
+         : /*out*/   "=a" (_res)                                  \
+         : /*in*/    "a" (&_argvec[0])                            \
+         : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS          \
+      );                                                          \
+      lval = (__typeof__(lval)) _res;                             \
+   } while (0)
+
+#define CALL_FN_W_12W(lval, orig, arg1,arg2,arg3,arg4,arg5,arg6,  \
+                                arg7,arg8,arg9,arg10,arg11,arg12) \
+   do {                                                           \
+      volatile OrigFn        _orig = (orig);                      \
+      volatile unsigned long _argvec[13];                         \
+      volatile unsigned long _res;                                \
+      _argvec[0] = (unsigned long)_orig.nraddr;                   \
+      _argvec[1] = (unsigned long)(arg1);                         \
+      _argvec[2] = (unsigned long)(arg2);                         \
+      _argvec[3] = (unsigned long)(arg3);                         \
+      _argvec[4] = (unsigned long)(arg4);                         \
+      _argvec[5] = (unsigned long)(arg5);                         \
+      _argvec[6] = (unsigned long)(arg6);                         \
+      _argvec[7] = (unsigned long)(arg7);                         \
+      _argvec[8] = (unsigned long)(arg8);                         \
+      _argvec[9] = (unsigned long)(arg9);                         \
+      _argvec[10] = (unsigned long)(arg10);                       \
+      _argvec[11] = (unsigned long)(arg11);                       \
+      _argvec[12] = (unsigned long)(arg12);                       \
+      __asm__ volatile(                                           \
+         "subq $128,%%rsp\n\t"                                    \
+         "pushq 96(%%rax)\n\t"                                    \
+         "pushq 88(%%rax)\n\t"                                    \
+         "pushq 80(%%rax)\n\t"                                    \
+         "pushq 72(%%rax)\n\t"                                    \
+         "pushq 64(%%rax)\n\t"                                    \
+         "pushq 56(%%rax)\n\t"                                    \
+         "movq 48(%%rax), %%r9\n\t"                               \
+         "movq 40(%%rax), %%r8\n\t"                               \
+         "movq 32(%%rax), %%rcx\n\t"                              \
+         "movq 24(%%rax), %%rdx\n\t"                              \
+         "movq 16(%%rax), %%rsi\n\t"                              \
+         "movq 8(%%rax), %%rdi\n\t"                               \
+         "movq (%%rax), %%rax\n\t"  /* target->%rax */            \
+         VALGRIND_CALL_NOREDIR_RAX                                \
+         "addq $48, %%rsp\n"                                      \
+         "addq $128,%%rsp\n\t"                                    \
+         : /*out*/   "=a" (_res)                                  \
+         : /*in*/    "a" (&_argvec[0])                            \
+         : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS          \
+      );                                                          \
+      lval = (__typeof__(lval)) _res;                             \
+   } while (0)
+
+#endif /* PLAT_amd64_linux */
+
+/* ------------------------ ppc32-linux ------------------------ */
+
+#if defined(PLAT_ppc32_linux)
+
+/* This is useful for finding out about the on-stack stuff:
+
+   extern int f9  ( int,int,int,int,int,int,int,int,int );
+   extern int f10 ( int,int,int,int,int,int,int,int,int,int );
+   extern int f11 ( int,int,int,int,int,int,int,int,int,int,int );
+   extern int f12 ( int,int,int,int,int,int,int,int,int,int,int,int );
+
+   int g9 ( void ) {
+      return f9(11,22,33,44,55,66,77,88,99);
+   }
+   int g10 ( void ) {
+      return f10(11,22,33,44,55,66,77,88,99,110);
+   }
+   int g11 ( void ) {
+      return f11(11,22,33,44,55,66,77,88,99,110,121);
+   }
+   int g12 ( void ) {
+      return f12(11,22,33,44,55,66,77,88,99,110,121,132);
+   }
+*/
+
+/* ARGREGS: r3 r4 r5 r6 r7 r8 r9 r10 (the rest on stack somewhere) */
+
+/* These regs are trashed by the hidden call. */
+#define __CALLER_SAVED_REGS                                       \
+   "lr", "ctr", "xer",                                            \
+   "cr0", "cr1", "cr2", "cr3", "cr4", "cr5", "cr6", "cr7",        \
+   "r0", "r2", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10",   \
+   "r11", "r12", "r13"
+
+/* These CALL_FN_ macros assume that on ppc32-linux, 
+   sizeof(unsigned long) == 4. */
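+
+/* A reading aid for the >8-argument macros below (the asm itself is
+   authoritative): extra word arguments are passed on the stack.  The
+   macros drop r1 by 16 bytes (9/10 args) or 32 bytes (11/12 args),
+   store the extra arguments into the new frame before the hidden
+   call, and restore r1 afterwards.  Assuming the usual ppc32 SVR4
+   frame layout, where the parameter save area starts at 8(r1), the
+   9-argument case proceeds roughly as follows:
+
+      addi 1,1,-16         drop the stack pointer
+      lwz  3,36(11)        fetch arg9 from _argvec[9]
+      stw  3,8(1)          store arg9 in the first stack arg slot
+      ... load args 1..8 into r3..r10 from _argvec[1..8] ...
+      (hidden call via r11)
+      addi 1,1,16          restore the stack pointer
+*/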
+
+#define CALL_FN_W_v(lval, orig)                                   \
+   do {                                                           \
+      volatile OrigFn        _orig = (orig);                      \
+      volatile unsigned long _argvec[1];                          \
+      volatile unsigned long _res;                                \
+      _argvec[0] = (unsigned long)_orig.nraddr;                   \
+      __asm__ volatile(                                           \
+         "mr 11,%1\n\t"                                           \
+         "lwz 11,0(11)\n\t"  /* target->r11 */                    \
+         VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R11                  \
+         "mr %0,3"                                                \
+         : /*out*/   "=r" (_res)                                  \
+         : /*in*/    "r" (&_argvec[0])                            \
+         : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS          \
+      );                                                          \
+      lval = (__typeof__(lval)) _res;                             \
+   } while (0)
+
+#define CALL_FN_W_W(lval, orig, arg1)                             \
+   do {                                                           \
+      volatile OrigFn        _orig = (orig);                      \
+      volatile unsigned long _argvec[2];                          \
+      volatile unsigned long _res;                                \
+      _argvec[0] = (unsigned long)_orig.nraddr;                   \
+      _argvec[1] = (unsigned long)arg1;                           \
+      __asm__ volatile(                                           \
+         "mr 11,%1\n\t"                                           \
+         "lwz 3,4(11)\n\t"   /* arg1->r3 */                       \
+         "lwz 11,0(11)\n\t"  /* target->r11 */                    \
+         VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R11                  \
+         "mr %0,3"                                                \
+         : /*out*/   "=r" (_res)                                  \
+         : /*in*/    "r" (&_argvec[0])                            \
+         : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS          \
+      );                                                          \
+      lval = (__typeof__(lval)) _res;                             \
+   } while (0)
+
+#define CALL_FN_W_WW(lval, orig, arg1,arg2)                       \
+   do {                                                           \
+      volatile OrigFn        _orig = (orig);                      \
+      volatile unsigned long _argvec[3];                          \
+      volatile unsigned long _res;                                \
+      _argvec[0] = (unsigned long)_orig.nraddr;                   \
+      _argvec[1] = (unsigned long)arg1;                           \
+      _argvec[2] = (unsigned long)arg2;                           \
+      __asm__ volatile(                                           \
+         "mr 11,%1\n\t"                                           \
+         "lwz 3,4(11)\n\t"   /* arg1->r3 */                       \
+         "lwz 4,8(11)\n\t"                                        \
+         "lwz 11,0(11)\n\t"  /* target->r11 */                    \
+         VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R11                  \
+         "mr %0,3"                                                \
+         : /*out*/   "=r" (_res)                                  \
+         : /*in*/    "r" (&_argvec[0])                            \
+         : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS          \
+      );                                                          \
+      lval = (__typeof__(lval)) _res;                             \
+   } while (0)
+
+#define CALL_FN_W_WWW(lval, orig, arg1,arg2,arg3)                 \
+   do {                                                           \
+      volatile OrigFn        _orig = (orig);                      \
+      volatile unsigned long _argvec[4];                          \
+      volatile unsigned long _res;                                \
+      _argvec[0] = (unsigned long)_orig.nraddr;                   \
+      _argvec[1] = (unsigned long)arg1;                           \
+      _argvec[2] = (unsigned long)arg2;                           \
+      _argvec[3] = (unsigned long)arg3;                           \
+      __asm__ volatile(                                           \
+         "mr 11,%1\n\t"                                           \
+         "lwz 3,4(11)\n\t"   /* arg1->r3 */                       \
+         "lwz 4,8(11)\n\t"                                        \
+         "lwz 5,12(11)\n\t"                                       \
+         "lwz 11,0(11)\n\t"  /* target->r11 */                    \
+         VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R11                  \
+         "mr %0,3"                                                \
+         : /*out*/   "=r" (_res)                                  \
+         : /*in*/    "r" (&_argvec[0])                            \
+         : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS          \
+      );                                                          \
+      lval = (__typeof__(lval)) _res;                             \
+   } while (0)
+
+#define CALL_FN_W_WWWW(lval, orig, arg1,arg2,arg3,arg4)           \
+   do {                                                           \
+      volatile OrigFn        _orig = (orig);                      \
+      volatile unsigned long _argvec[5];                          \
+      volatile unsigned long _res;                                \
+      _argvec[0] = (unsigned long)_orig.nraddr;                   \
+      _argvec[1] = (unsigned long)arg1;                           \
+      _argvec[2] = (unsigned long)arg2;                           \
+      _argvec[3] = (unsigned long)arg3;                           \
+      _argvec[4] = (unsigned long)arg4;                           \
+      __asm__ volatile(                                           \
+         "mr 11,%1\n\t"                                           \
+         "lwz 3,4(11)\n\t"   /* arg1->r3 */                       \
+         "lwz 4,8(11)\n\t"                                        \
+         "lwz 5,12(11)\n\t"                                       \
+         "lwz 6,16(11)\n\t"  /* arg4->r6 */                       \
+         "lwz 11,0(11)\n\t"  /* target->r11 */                    \
+         VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R11                  \
+         "mr %0,3"                                                \
+         : /*out*/   "=r" (_res)                                  \
+         : /*in*/    "r" (&_argvec[0])                            \
+         : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS          \
+      );                                                          \
+      lval = (__typeof__(lval)) _res;                             \
+   } while (0)
+
+#define CALL_FN_W_5W(lval, orig, arg1,arg2,arg3,arg4,arg5)        \
+   do {                                                           \
+      volatile OrigFn        _orig = (orig);                      \
+      volatile unsigned long _argvec[6];                          \
+      volatile unsigned long _res;                                \
+      _argvec[0] = (unsigned long)_orig.nraddr;                   \
+      _argvec[1] = (unsigned long)arg1;                           \
+      _argvec[2] = (unsigned long)arg2;                           \
+      _argvec[3] = (unsigned long)arg3;                           \
+      _argvec[4] = (unsigned long)arg4;                           \
+      _argvec[5] = (unsigned long)arg5;                           \
+      __asm__ volatile(                                           \
+         "mr 11,%1\n\t"                                           \
+         "lwz 3,4(11)\n\t"   /* arg1->r3 */                       \
+         "lwz 4,8(11)\n\t"                                        \
+         "lwz 5,12(11)\n\t"                                       \
+         "lwz 6,16(11)\n\t"  /* arg4->r6 */                       \
+         "lwz 7,20(11)\n\t"                                       \
+         "lwz 11,0(11)\n\t"  /* target->r11 */                    \
+         VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R11                  \
+         "mr %0,3"                                                \
+         : /*out*/   "=r" (_res)                                  \
+         : /*in*/    "r" (&_argvec[0])                            \
+         : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS          \
+      );                                                          \
+      lval = (__typeof__(lval)) _res;                             \
+   } while (0)
+
+#define CALL_FN_W_6W(lval, orig, arg1,arg2,arg3,arg4,arg5,arg6)   \
+   do {                                                           \
+      volatile OrigFn        _orig = (orig);                      \
+      volatile unsigned long _argvec[7];                          \
+      volatile unsigned long _res;                                \
+      _argvec[0] = (unsigned long)_orig.nraddr;                   \
+      _argvec[1] = (unsigned long)arg1;                           \
+      _argvec[2] = (unsigned long)arg2;                           \
+      _argvec[3] = (unsigned long)arg3;                           \
+      _argvec[4] = (unsigned long)arg4;                           \
+      _argvec[5] = (unsigned long)arg5;                           \
+      _argvec[6] = (unsigned long)arg6;                           \
+      __asm__ volatile(                                           \
+         "mr 11,%1\n\t"                                           \
+         "lwz 3,4(11)\n\t"   /* arg1->r3 */                       \
+         "lwz 4,8(11)\n\t"                                        \
+         "lwz 5,12(11)\n\t"                                       \
+         "lwz 6,16(11)\n\t"  /* arg4->r6 */                       \
+         "lwz 7,20(11)\n\t"                                       \
+         "lwz 8,24(11)\n\t"                                       \
+         "lwz 11,0(11)\n\t"  /* target->r11 */                    \
+         VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R11                  \
+         "mr %0,3"                                                \
+         : /*out*/   "=r" (_res)                                  \
+         : /*in*/    "r" (&_argvec[0])                            \
+         : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS          \
+      );                                                          \
+      lval = (__typeof__(lval)) _res;                             \
+   } while (0)
+
+#define CALL_FN_W_7W(lval, orig, arg1,arg2,arg3,arg4,arg5,arg6,   \
+                                 arg7)                            \
+   do {                                                           \
+      volatile OrigFn        _orig = (orig);                      \
+      volatile unsigned long _argvec[8];                          \
+      volatile unsigned long _res;                                \
+      _argvec[0] = (unsigned long)_orig.nraddr;                   \
+      _argvec[1] = (unsigned long)arg1;                           \
+      _argvec[2] = (unsigned long)arg2;                           \
+      _argvec[3] = (unsigned long)arg3;                           \
+      _argvec[4] = (unsigned long)arg4;                           \
+      _argvec[5] = (unsigned long)arg5;                           \
+      _argvec[6] = (unsigned long)arg6;                           \
+      _argvec[7] = (unsigned long)arg7;                           \
+      __asm__ volatile(                                           \
+         "mr 11,%1\n\t"                                           \
+         "lwz 3,4(11)\n\t"   /* arg1->r3 */                       \
+         "lwz 4,8(11)\n\t"                                        \
+         "lwz 5,12(11)\n\t"                                       \
+         "lwz 6,16(11)\n\t"  /* arg4->r6 */                       \
+         "lwz 7,20(11)\n\t"                                       \
+         "lwz 8,24(11)\n\t"                                       \
+         "lwz 9,28(11)\n\t"                                       \
+         "lwz 11,0(11)\n\t"  /* target->r11 */                    \
+         VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R11                  \
+         "mr %0,3"                                                \
+         : /*out*/   "=r" (_res)                                  \
+         : /*in*/    "r" (&_argvec[0])                            \
+         : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS          \
+      );                                                          \
+      lval = (__typeof__(lval)) _res;                             \
+   } while (0)
+
+#define CALL_FN_W_8W(lval, orig, arg1,arg2,arg3,arg4,arg5,arg6,   \
+                                 arg7,arg8)                       \
+   do {                                                           \
+      volatile OrigFn        _orig = (orig);                      \
+      volatile unsigned long _argvec[9];                          \
+      volatile unsigned long _res;                                \
+      _argvec[0] = (unsigned long)_orig.nraddr;                   \
+      _argvec[1] = (unsigned long)arg1;                           \
+      _argvec[2] = (unsigned long)arg2;                           \
+      _argvec[3] = (unsigned long)arg3;                           \
+      _argvec[4] = (unsigned long)arg4;                           \
+      _argvec[5] = (unsigned long)arg5;                           \
+      _argvec[6] = (unsigned long)arg6;                           \
+      _argvec[7] = (unsigned long)arg7;                           \
+      _argvec[8] = (unsigned long)arg8;                           \
+      __asm__ volatile(                                           \
+         "mr 11,%1\n\t"                                           \
+         "lwz 3,4(11)\n\t"   /* arg1->r3 */                       \
+         "lwz 4,8(11)\n\t"                                        \
+         "lwz 5,12(11)\n\t"                                       \
+         "lwz 6,16(11)\n\t"  /* arg4->r6 */                       \
+         "lwz 7,20(11)\n\t"                                       \
+         "lwz 8,24(11)\n\t"                                       \
+         "lwz 9,28(11)\n\t"                                       \
+         "lwz 10,32(11)\n\t" /* arg8->r10 */                      \
+         "lwz 11,0(11)\n\t"  /* target->r11 */                    \
+         VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R11                  \
+         "mr %0,3"                                                \
+         : /*out*/   "=r" (_res)                                  \
+         : /*in*/    "r" (&_argvec[0])                            \
+         : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS          \
+      );                                                          \
+      lval = (__typeof__(lval)) _res;                             \
+   } while (0)
+
+#define CALL_FN_W_9W(lval, orig, arg1,arg2,arg3,arg4,arg5,arg6,   \
+                                 arg7,arg8,arg9)                  \
+   do {                                                           \
+      volatile OrigFn        _orig = (orig);                      \
+      volatile unsigned long _argvec[10];                         \
+      volatile unsigned long _res;                                \
+      _argvec[0] = (unsigned long)_orig.nraddr;                   \
+      _argvec[1] = (unsigned long)arg1;                           \
+      _argvec[2] = (unsigned long)arg2;                           \
+      _argvec[3] = (unsigned long)arg3;                           \
+      _argvec[4] = (unsigned long)arg4;                           \
+      _argvec[5] = (unsigned long)arg5;                           \
+      _argvec[6] = (unsigned long)arg6;                           \
+      _argvec[7] = (unsigned long)arg7;                           \
+      _argvec[8] = (unsigned long)arg8;                           \
+      _argvec[9] = (unsigned long)arg9;                           \
+      __asm__ volatile(                                           \
+         "mr 11,%1\n\t"                                           \
+         "addi 1,1,-16\n\t"                                       \
+         /* arg9 */                                               \
+         "lwz 3,36(11)\n\t"                                       \
+         "stw 3,8(1)\n\t"                                         \
+         /* args1-8 */                                            \
+         "lwz 3,4(11)\n\t"   /* arg1->r3 */                       \
+         "lwz 4,8(11)\n\t"                                        \
+         "lwz 5,12(11)\n\t"                                       \
+         "lwz 6,16(11)\n\t"  /* arg4->r6 */                       \
+         "lwz 7,20(11)\n\t"                                       \
+         "lwz 8,24(11)\n\t"                                       \
+         "lwz 9,28(11)\n\t"                                       \
+         "lwz 10,32(11)\n\t" /* arg8->r10 */                      \
+         "lwz 11,0(11)\n\t"  /* target->r11 */                    \
+         VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R11                  \
+         "addi 1,1,16\n\t"                                        \
+         "mr %0,3"                                                \
+         : /*out*/   "=r" (_res)                                  \
+         : /*in*/    "r" (&_argvec[0])                            \
+         : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS          \
+      );                                                          \
+      lval = (__typeof__(lval)) _res;                             \
+   } while (0)
+
+#define CALL_FN_W_10W(lval, orig, arg1,arg2,arg3,arg4,arg5,arg6,  \
+                                  arg7,arg8,arg9,arg10)           \
+   do {                                                           \
+      volatile OrigFn        _orig = (orig);                      \
+      volatile unsigned long _argvec[11];                         \
+      volatile unsigned long _res;                                \
+      _argvec[0] = (unsigned long)_orig.nraddr;                   \
+      _argvec[1] = (unsigned long)arg1;                           \
+      _argvec[2] = (unsigned long)arg2;                           \
+      _argvec[3] = (unsigned long)arg3;                           \
+      _argvec[4] = (unsigned long)arg4;                           \
+      _argvec[5] = (unsigned long)arg5;                           \
+      _argvec[6] = (unsigned long)arg6;                           \
+      _argvec[7] = (unsigned long)arg7;                           \
+      _argvec[8] = (unsigned long)arg8;                           \
+      _argvec[9] = (unsigned long)arg9;                           \
+      _argvec[10] = (unsigned long)arg10;                         \
+      __asm__ volatile(                                           \
+         "mr 11,%1\n\t"                                           \
+         "addi 1,1,-16\n\t"                                       \
+         /* arg10 */                                              \
+         "lwz 3,40(11)\n\t"                                       \
+         "stw 3,12(1)\n\t"                                        \
+         /* arg9 */                                               \
+         "lwz 3,36(11)\n\t"                                       \
+         "stw 3,8(1)\n\t"                                         \
+         /* args1-8 */                                            \
+         "lwz 3,4(11)\n\t"   /* arg1->r3 */                       \
+         "lwz 4,8(11)\n\t"                                        \
+         "lwz 5,12(11)\n\t"                                       \
+         "lwz 6,16(11)\n\t"  /* arg4->r6 */                       \
+         "lwz 7,20(11)\n\t"                                       \
+         "lwz 8,24(11)\n\t"                                       \
+         "lwz 9,28(11)\n\t"                                       \
+         "lwz 10,32(11)\n\t" /* arg8->r10 */                      \
+         "lwz 11,0(11)\n\t"  /* target->r11 */                    \
+         VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R11                  \
+         "addi 1,1,16\n\t"                                        \
+         "mr %0,3"                                                \
+         : /*out*/   "=r" (_res)                                  \
+         : /*in*/    "r" (&_argvec[0])                            \
+         : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS          \
+      );                                                          \
+      lval = (__typeof__(lval)) _res;                             \
+   } while (0)
+
+#define CALL_FN_W_11W(lval, orig, arg1,arg2,arg3,arg4,arg5,arg6,  \
+                                  arg7,arg8,arg9,arg10,arg11)     \
+   do {                                                           \
+      volatile OrigFn        _orig = (orig);                      \
+      volatile unsigned long _argvec[12];                         \
+      volatile unsigned long _res;                                \
+      _argvec[0] = (unsigned long)_orig.nraddr;                   \
+      _argvec[1] = (unsigned long)arg1;                           \
+      _argvec[2] = (unsigned long)arg2;                           \
+      _argvec[3] = (unsigned long)arg3;                           \
+      _argvec[4] = (unsigned long)arg4;                           \
+      _argvec[5] = (unsigned long)arg5;                           \
+      _argvec[6] = (unsigned long)arg6;                           \
+      _argvec[7] = (unsigned long)arg7;                           \
+      _argvec[8] = (unsigned long)arg8;                           \
+      _argvec[9] = (unsigned long)arg9;                           \
+      _argvec[10] = (unsigned long)arg10;                         \
+      _argvec[11] = (unsigned long)arg11;                         \
+      __asm__ volatile(                                           \
+         "mr 11,%1\n\t"                                           \
+         "addi 1,1,-32\n\t"                                       \
+         /* arg11 */                                              \
+         "lwz 3,44(11)\n\t"                                       \
+         "stw 3,16(1)\n\t"                                        \
+         /* arg10 */                                              \
+         "lwz 3,40(11)\n\t"                                       \
+         "stw 3,12(1)\n\t"                                        \
+         /* arg9 */                                               \
+         "lwz 3,36(11)\n\t"                                       \
+         "stw 3,8(1)\n\t"                                         \
+         /* args1-8 */                                            \
+         "lwz 3,4(11)\n\t"   /* arg1->r3 */                       \
+         "lwz 4,8(11)\n\t"                                        \
+         "lwz 5,12(11)\n\t"                                       \
+         "lwz 6,16(11)\n\t"  /* arg4->r6 */                       \
+         "lwz 7,20(11)\n\t"                                       \
+         "lwz 8,24(11)\n\t"                                       \
+         "lwz 9,28(11)\n\t"                                       \
+         "lwz 10,32(11)\n\t" /* arg8->r10 */                      \
+         "lwz 11,0(11)\n\t"  /* target->r11 */                    \
+         VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R11                  \
+         "addi 1,1,32\n\t"                                        \
+         "mr %0,3"                                                \
+         : /*out*/   "=r" (_res)                                  \
+         : /*in*/    "r" (&_argvec[0])                            \
+         : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS          \
+      );                                                          \
+      lval = (__typeof__(lval)) _res;                             \
+   } while (0)
+
+#define CALL_FN_W_12W(lval, orig, arg1,arg2,arg3,arg4,arg5,arg6,  \
+                                arg7,arg8,arg9,arg10,arg11,arg12) \
+   do {                                                           \
+      volatile OrigFn        _orig = (orig);                      \
+      volatile unsigned long _argvec[13];                         \
+      volatile unsigned long _res;                                \
+      _argvec[0] = (unsigned long)_orig.nraddr;                   \
+      _argvec[1] = (unsigned long)arg1;                           \
+      _argvec[2] = (unsigned long)arg2;                           \
+      _argvec[3] = (unsigned long)arg3;                           \
+      _argvec[4] = (unsigned long)arg4;                           \
+      _argvec[5] = (unsigned long)arg5;                           \
+      _argvec[6] = (unsigned long)arg6;                           \
+      _argvec[7] = (unsigned long)arg7;                           \
+      _argvec[8] = (unsigned long)arg8;                           \
+      _argvec[9] = (unsigned long)arg9;                           \
+      _argvec[10] = (unsigned long)arg10;                         \
+      _argvec[11] = (unsigned long)arg11;                         \
+      _argvec[12] = (unsigned long)arg12;                         \
+      __asm__ volatile(                                           \
+         "mr 11,%1\n\t"                                           \
+         "addi 1,1,-32\n\t"                                       \
+         /* arg12 */                                              \
+         "lwz 3,48(11)\n\t"                                       \
+         "stw 3,20(1)\n\t"                                        \
+         /* arg11 */                                              \
+         "lwz 3,44(11)\n\t"                                       \
+         "stw 3,16(1)\n\t"                                        \
+         /* arg10 */                                              \
+         "lwz 3,40(11)\n\t"                                       \
+         "stw 3,12(1)\n\t"                                        \
+         /* arg9 */                                               \
+         "lwz 3,36(11)\n\t"                                       \
+         "stw 3,8(1)\n\t"                                         \
+         /* args1-8 */                                            \
+         "lwz 3,4(11)\n\t"   /* arg1->r3 */                       \
+         "lwz 4,8(11)\n\t"                                        \
+         "lwz 5,12(11)\n\t"                                       \
+         "lwz 6,16(11)\n\t"  /* arg4->r6 */                       \
+         "lwz 7,20(11)\n\t"                                       \
+         "lwz 8,24(11)\n\t"                                       \
+         "lwz 9,28(11)\n\t"                                       \
+         "lwz 10,32(11)\n\t" /* arg8->r10 */                      \
+         "lwz 11,0(11)\n\t"  /* target->r11 */                    \
+         VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R11                  \
+         "addi 1,1,32\n\t"                                        \
+         "mr %0,3"                                                \
+         : /*out*/   "=r" (_res)                                  \
+         : /*in*/    "r" (&_argvec[0])                            \
+         : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS          \
+      );                                                          \
+      lval = (__typeof__(lval)) _res;                             \
+   } while (0)
+
+#endif /* PLAT_ppc32_linux */
+
+/* ------------------------ ppc64-linux ------------------------ */
+
+#if defined(PLAT_ppc64_linux)
+
+/* ARGREGS: r3 r4 r5 r6 r7 r8 r9 r10 (the rest on stack somewhere) */
+
+/* These regs are trashed by the hidden call. */
+#define __CALLER_SAVED_REGS                                       \
+   "lr", "ctr", "xer",                                            \
+   "cr0", "cr1", "cr2", "cr3", "cr4", "cr5", "cr6", "cr7",        \
+   "r0", "r2", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10",   \
+   "r11", "r12", "r13"
+
+/* These CALL_FN_ macros assume that on ppc64-linux, sizeof(unsigned
+   long) == 8. */
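+
+/* A reading aid for the _argvec layout used by the ppc64 macros
+   below (the asm itself is authoritative).  r11 is pointed at
+   &_argvec[2], so:
+
+      _argvec[0]   (-16(11))  scratch: the caller's r2/TOC pointer is
+                              saved here across the hidden call
+      _argvec[1]   ( -8(11))  the callee's TOC pointer, from _orig.r2
+      _argvec[2]   (  0(11))  the callee's entry address (_orig.nraddr)
+      _argvec[3..] (  8(11), 16(11), ...)  the word arguments, in order
+
+   After the call, r2 is restored from _argvec[0]. */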
+
+#define CALL_FN_W_v(lval, orig)                                   \
+   do {                                                           \
+      volatile OrigFn        _orig = (orig);                      \
+      volatile unsigned long _argvec[3+0];                        \
+      volatile unsigned long _res;                                \
+      /* _argvec[0] holds current r2 across the call */           \
+      _argvec[1] = (unsigned long)_orig.r2;                       \
+      _argvec[2] = (unsigned long)_orig.nraddr;                   \
+      __asm__ volatile(                                           \
+         "mr 11,%1\n\t"                                           \
+         "std 2,-16(11)\n\t"  /* save tocptr */                   \
+         "ld   2,-8(11)\n\t"  /* use nraddr's tocptr */           \
+         "ld  11, 0(11)\n\t"  /* target->r11 */                   \
+         VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R11                  \
+         "mr 11,%1\n\t"                                           \
+         "mr %0,3\n\t"                                            \
+         "ld 2,-16(11)" /* restore tocptr */                      \
+         : /*out*/   "=r" (_res)                                  \
+         : /*in*/    "r" (&_argvec[2])                            \
+         : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS          \
+      );                                                          \
+      lval = (__typeof__(lval)) _res;                             \
+   } while (0)
+
+#define CALL_FN_W_W(lval, orig, arg1)                             \
+   do {                                                           \
+      volatile OrigFn        _orig = (orig);                      \
+      volatile unsigned long _argvec[3+1];                        \
+      volatile unsigned long _res;                                \
+      /* _argvec[0] holds current r2 across the call */           \
+      _argvec[1]   = (unsigned long)_orig.r2;                     \
+      _argvec[2]   = (unsigned long)_orig.nraddr;                 \
+      _argvec[2+1] = (unsigned long)arg1;                         \
+      __asm__ volatile(                                           \
+         "mr 11,%1\n\t"                                           \
+         "std 2,-16(11)\n\t"  /* save tocptr */                   \
+         "ld   2,-8(11)\n\t"  /* use nraddr's tocptr */           \
+         "ld   3, 8(11)\n\t"  /* arg1->r3 */                      \
+         "ld  11, 0(11)\n\t"  /* target->r11 */                   \
+         VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R11                  \
+         "mr 11,%1\n\t"                                           \
+         "mr %0,3\n\t"                                            \
+         "ld 2,-16(11)" /* restore tocptr */                      \
+         : /*out*/   "=r" (_res)                                  \
+         : /*in*/    "r" (&_argvec[2])                            \
+         : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS          \
+      );                                                          \
+      lval = (__typeof__(lval)) _res;                             \
+   } while (0)
+
+#define CALL_FN_W_WW(lval, orig, arg1,arg2)                       \
+   do {                                                           \
+      volatile OrigFn        _orig = (orig);                      \
+      volatile unsigned long _argvec[3+2];                        \
+      volatile unsigned long _res;                                \
+      /* _argvec[0] holds current r2 across the call */           \
+      _argvec[1]   = (unsigned long)_orig.r2;                     \
+      _argvec[2]   = (unsigned long)_orig.nraddr;                 \
+      _argvec[2+1] = (unsigned long)arg1;                         \
+      _argvec[2+2] = (unsigned long)arg2;                         \
+      __asm__ volatile(                                           \
+         "mr 11,%1\n\t"                                           \
+         "std 2,-16(11)\n\t"  /* save tocptr */                   \
+         "ld   2,-8(11)\n\t"  /* use nraddr's tocptr */           \
+         "ld   3, 8(11)\n\t"  /* arg1->r3 */                      \
+         "ld   4, 16(11)\n\t" /* arg2->r4 */                      \
+         "ld  11, 0(11)\n\t"  /* target->r11 */                   \
+         VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R11                  \
+         "mr 11,%1\n\t"                                           \
+         "mr %0,3\n\t"                                            \
+         "ld 2,-16(11)" /* restore tocptr */                      \
+         : /*out*/   "=r" (_res)                                  \
+         : /*in*/    "r" (&_argvec[2])                            \
+         : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS          \
+      );                                                          \
+      lval = (__typeof__(lval)) _res;                             \
+   } while (0)
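+
+/* Illustrative sketch (not part of the original header): how these
+   CALL_FN_ macros are typically used from a Valgrind function
+   wrapper.  It relies only on machinery declared elsewhere in this
+   header (OrigFn, VALGRIND_GET_ORIG_FN, the wrapper-name encoding);
+   the library `libfoo.so' and the function `foo' are hypothetical.
+
+      int I_WRAP_SONAME_FN_ZU(libfooZdsoZa, foo) ( int x, int y )
+      {
+         OrigFn fn;
+         int    result;
+         VALGRIND_GET_ORIG_FN(fn);        /* grab foo's real entry point  */
+         CALL_FN_W_WW(result, fn, x, y);  /* call it, bypassing redirects */
+         return result;
+      }
+*/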
+
+#define CALL_FN_W_WWW(lval, orig, arg1,arg2,arg3)                 \
+   do {                                                           \
+      volatile OrigFn        _orig = (orig);                      \
+      volatile unsigned long _argvec[3+3];                        \
+      volatile unsigned long _res;                                \
+      /* _argvec[0] holds current r2 across the call */           \
+      _argvec[1]   = (unsigned long)_orig.r2;                     \
+      _argvec[2]   = (unsigned long)_orig.nraddr;                 \
+      _argvec[2+1] = (unsigned long)arg1;                         \
+      _argvec[2+2] = (unsigned long)arg2;                         \
+      _argvec[2+3] = (unsigned long)arg3;                         \
+      __asm__ volatile(                                           \
+         "mr 11,%1\n\t"                                           \
+         "std 2,-16(11)\n\t"  /* save tocptr */                   \
+         "ld   2,-8(11)\n\t"  /* use nraddr's tocptr */           \
+         "ld   3, 8(11)\n\t"  /* arg1->r3 */                      \
+         "ld   4, 16(11)\n\t" /* arg2->r4 */                      \
+         "ld   5, 24(11)\n\t" /* arg3->r5 */                      \
+         "ld  11, 0(11)\n\t"  /* target->r11 */                   \
+         VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R11                  \
+         "mr 11,%1\n\t"                                           \
+         "mr %0,3\n\t"                                            \
+         "ld 2,-16(11)" /* restore tocptr */                      \
+         : /*out*/   "=r" (_res)                                  \
+         : /*in*/    "r" (&_argvec[2])                            \
+         : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS          \
+      );                                                          \
+      lval = (__typeof__(lval)) _res;                             \
+   } while (0)
+
+#define CALL_FN_W_WWWW(lval, orig, arg1,arg2,arg3,arg4)           \
+   do {                                                           \
+      volatile OrigFn        _orig = (orig);                      \
+      volatile unsigned long _argvec[3+4];                        \
+      volatile unsigned long _res;                                \
+      /* _argvec[0] holds current r2 across the call */           \
+      _argvec[1]   = (unsigned long)_orig.r2;                     \
+      _argvec[2]   = (unsigned long)_orig.nraddr;                 \
+      _argvec[2+1] = (unsigned long)arg1;                         \
+      _argvec[2+2] = (unsigned long)arg2;                         \
+      _argvec[2+3] = (unsigned long)arg3;                         \
+      _argvec[2+4] = (unsigned long)arg4;                         \
+      __asm__ volatile(                                           \
+         "mr 11,%1\n\t"                                           \
+         "std 2,-16(11)\n\t"  /* save tocptr */                   \
+         "ld   2,-8(11)\n\t"  /* use nraddr's tocptr */           \
+         "ld   3, 8(11)\n\t"  /* arg1->r3 */                      \
+         "ld   4, 16(11)\n\t" /* arg2->r4 */                      \
+         "ld   5, 24(11)\n\t" /* arg3->r5 */                      \
+         "ld   6, 32(11)\n\t" /* arg4->r6 */                      \
+         "ld  11, 0(11)\n\t"  /* target->r11 */                   \
+         VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R11                  \
+         "mr 11,%1\n\t"                                           \
+         "mr %0,3\n\t"                                            \
+         "ld 2,-16(11)" /* restore tocptr */                      \
+         : /*out*/   "=r" (_res)                                  \
+         : /*in*/    "r" (&_argvec[2])                            \
+         : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS          \
+      );                                                          \
+      lval = (__typeof__(lval)) _res;                             \
+   } while (0)
+
+#define CALL_FN_W_5W(lval, orig, arg1,arg2,arg3,arg4,arg5)        \
+   do {                                                           \
+      volatile OrigFn        _orig = (orig);                      \
+      volatile unsigned long _argvec[3+5];                        \
+      volatile unsigned long _res;                                \
+      /* _argvec[0] holds current r2 across the call */           \
+      _argvec[1]   = (unsigned long)_orig.r2;                     \
+      _argvec[2]   = (unsigned long)_orig.nraddr;                 \
+      _argvec[2+1] = (unsigned long)arg1;                         \
+      _argvec[2+2] = (unsigned long)arg2;                         \
+      _argvec[2+3] = (unsigned long)arg3;                         \
+      _argvec[2+4] = (unsigned long)arg4;                         \
+      _argvec[2+5] = (unsigned long)arg5;                         \
+      __asm__ volatile(                                           \
+         "mr 11,%1\n\t"                                           \
+         "std 2,-16(11)\n\t"  /* save tocptr */                   \
+         "ld   2,-8(11)\n\t"  /* use nraddr's tocptr */           \
+         "ld   3, 8(11)\n\t"  /* arg1->r3 */                      \
+         "ld   4, 16(11)\n\t" /* arg2->r4 */                      \
+         "ld   5, 24(11)\n\t" /* arg3->r5 */                      \
+         "ld   6, 32(11)\n\t" /* arg4->r6 */                      \
+         "ld   7, 40(11)\n\t" /* arg5->r7 */                      \
+         "ld  11, 0(11)\n\t"  /* target->r11 */                   \
+         VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R11                  \
+         "mr 11,%1\n\t"                                           \
+         "mr %0,3\n\t"                                            \
+         "ld 2,-16(11)" /* restore tocptr */                      \
+         : /*out*/   "=r" (_res)                                  \
+         : /*in*/    "r" (&_argvec[2])                            \
+         : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS          \
+      );                                                          \
+      lval = (__typeof__(lval)) _res;                             \
+   } while (0)
+
+#define CALL_FN_W_6W(lval, orig, arg1,arg2,arg3,arg4,arg5,arg6)   \
+   do {                                                           \
+      volatile OrigFn        _orig = (orig);                      \
+      volatile unsigned long _argvec[3+6];                        \
+      volatile unsigned long _res;                                \
+      /* _argvec[0] holds current r2 across the call */           \
+      _argvec[1]   = (unsigned long)_orig.r2;                     \
+      _argvec[2]   = (unsigned long)_orig.nraddr;                 \
+      _argvec[2+1] = (unsigned long)arg1;                         \
+      _argvec[2+2] = (unsigned long)arg2;                         \
+      _argvec[2+3] = (unsigned long)arg3;                         \
+      _argvec[2+4] = (unsigned long)arg4;                         \
+      _argvec[2+5] = (unsigned long)arg5;                         \
+      _argvec[2+6] = (unsigned long)arg6;                         \
+      __asm__ volatile(                                           \
+         "mr 11,%1\n\t"                                           \
+         "std 2,-16(11)\n\t"  /* save tocptr */                   \
+         "ld   2,-8(11)\n\t"  /* use nraddr's tocptr */           \
+         "ld   3, 8(11)\n\t"  /* arg1->r3 */                      \
+         "ld   4, 16(11)\n\t" /* arg2->r4 */                      \
+         "ld   5, 24(11)\n\t" /* arg3->r5 */                      \
+         "ld   6, 32(11)\n\t" /* arg4->r6 */                      \
+         "ld   7, 40(11)\n\t" /* arg5->r7 */                      \
+         "ld   8, 48(11)\n\t" /* arg6->r8 */                      \
+         "ld  11, 0(11)\n\t"  /* target->r11 */                   \
+         VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R11                  \
+         "mr 11,%1\n\t"                                           \
+         "mr %0,3\n\t"                                            \
+         "ld 2,-16(11)" /* restore tocptr */                      \
+         : /*out*/   "=r" (_res)                                  \
+         : /*in*/    "r" (&_argvec[2])                            \
+         : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS          \
+      );                                                          \
+      lval = (__typeof__(lval)) _res;                             \
+   } while (0)
+
+#define CALL_FN_W_7W(lval, orig, arg1,arg2,arg3,arg4,arg5,arg6,   \
+                                 arg7)                            \
+   do {                                                           \
+      volatile OrigFn        _orig = (orig);                      \
+      volatile unsigned long _argvec[3+7];                        \
+      volatile unsigned long _res;                                \
+      /* _argvec[0] holds current r2 across the call */           \
+      _argvec[1]   = (unsigned long)_orig.r2;                     \
+      _argvec[2]   = (unsigned long)_orig.nraddr;                 \
+      _argvec[2+1] = (unsigned long)arg1;                         \
+      _argvec[2+2] = (unsigned long)arg2;                         \
+      _argvec[2+3] = (unsigned long)arg3;                         \
+      _argvec[2+4] = (unsigned long)arg4;                         \
+      _argvec[2+5] = (unsigned long)arg5;                         \
+      _argvec[2+6] = (unsigned long)arg6;                         \
+      _argvec[2+7] = (unsigned long)arg7;                         \
+      __asm__ volatile(                                           \
+         "mr 11,%1\n\t"                                           \
+         "std 2,-16(11)\n\t"  /* save tocptr */                   \
+         "ld   2,-8(11)\n\t"  /* use nraddr's tocptr */           \
+         "ld   3, 8(11)\n\t"  /* arg1->r3 */                      \
+         "ld   4, 16(11)\n\t" /* arg2->r4 */                      \
+         "ld   5, 24(11)\n\t" /* arg3->r5 */                      \
+         "ld   6, 32(11)\n\t" /* arg4->r6 */                      \
+         "ld   7, 40(11)\n\t" /* arg5->r7 */                      \
+         "ld   8, 48(11)\n\t" /* arg6->r8 */                      \
+         "ld   9, 56(11)\n\t" /* arg7->r9 */                      \
+         "ld  11, 0(11)\n\t"  /* target->r11 */                   \
+         VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R11                  \
+         "mr 11,%1\n\t"                                           \
+         "mr %0,3\n\t"                                            \
+         "ld 2,-16(11)" /* restore tocptr */                      \
+         : /*out*/   "=r" (_res)                                  \
+         : /*in*/    "r" (&_argvec[2])                            \
+         : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS          \
+      );                                                          \
+      lval = (__typeof__(lval)) _res;                             \
+   } while (0)
+
+#define CALL_FN_W_8W(lval, orig, arg1,arg2,arg3,arg4,arg5,arg6,   \
+                                 arg7,arg8)                       \
+   do {                                                           \
+      volatile OrigFn        _orig = (orig);                      \
+      volatile unsigned long _argvec[3+8];                        \
+      volatile unsigned long _res;                                \
+      /* _argvec[0] holds current r2 across the call */           \
+      _argvec[1]   = (unsigned long)_orig.r2;                     \
+      _argvec[2]   = (unsigned long)_orig.nraddr;                 \
+      _argvec[2+1] = (unsigned long)arg1;                         \
+      _argvec[2+2] = (unsigned long)arg2;                         \
+      _argvec[2+3] = (unsigned long)arg3;                         \
+      _argvec[2+4] = (unsigned long)arg4;                         \
+      _argvec[2+5] = (unsigned long)arg5;                         \
+      _argvec[2+6] = (unsigned long)arg6;                         \
+      _argvec[2+7] = (unsigned long)arg7;                         \
+      _argvec[2+8] = (unsigned long)arg8;                         \
+      __asm__ volatile(                                           \
+         "mr 11,%1\n\t"                                           \
+         "std 2,-16(11)\n\t"  /* save tocptr */                   \
+         "ld   2,-8(11)\n\t"  /* use nraddr's tocptr */           \
+         "ld   3, 8(11)\n\t"  /* arg1->r3 */                      \
+         "ld   4, 16(11)\n\t" /* arg2->r4 */                      \
+         "ld   5, 24(11)\n\t" /* arg3->r5 */                      \
+         "ld   6, 32(11)\n\t" /* arg4->r6 */                      \
+         "ld   7, 40(11)\n\t" /* arg5->r7 */                      \
+         "ld   8, 48(11)\n\t" /* arg6->r8 */                      \
+         "ld   9, 56(11)\n\t" /* arg7->r9 */                      \
+         "ld  10, 64(11)\n\t" /* arg8->r10 */                     \
+         "ld  11, 0(11)\n\t"  /* target->r11 */                   \
+         VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R11                  \
+         "mr 11,%1\n\t"                                           \
+         "mr %0,3\n\t"                                            \
+         "ld 2,-16(11)" /* restore tocptr */                      \
+         : /*out*/   "=r" (_res)                                  \
+         : /*in*/    "r" (&_argvec[2])                            \
+         : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS          \
+      );                                                          \
+      lval = (__typeof__(lval)) _res;                             \
+   } while (0)
+
+#define CALL_FN_W_9W(lval, orig, arg1,arg2,arg3,arg4,arg5,arg6,   \
+                                 arg7,arg8,arg9)                  \
+   do {                                                           \
+      volatile OrigFn        _orig = (orig);                      \
+      volatile unsigned long _argvec[3+9];                        \
+      volatile unsigned long _res;                                \
+      /* _argvec[0] holds current r2 across the call */           \
+      _argvec[1]   = (unsigned long)_orig.r2;                     \
+      _argvec[2]   = (unsigned long)_orig.nraddr;                 \
+      _argvec[2+1] = (unsigned long)arg1;                         \
+      _argvec[2+2] = (unsigned long)arg2;                         \
+      _argvec[2+3] = (unsigned long)arg3;                         \
+      _argvec[2+4] = (unsigned long)arg4;                         \
+      _argvec[2+5] = (unsigned long)arg5;                         \
+      _argvec[2+6] = (unsigned long)arg6;                         \
+      _argvec[2+7] = (unsigned long)arg7;                         \
+      _argvec[2+8] = (unsigned long)arg8;                         \
+      _argvec[2+9] = (unsigned long)arg9;                         \
+      __asm__ volatile(                                           \
+         "mr 11,%1\n\t"                                           \
+         "std 2,-16(11)\n\t"  /* save tocptr */                   \
+         "ld   2,-8(11)\n\t"  /* use nraddr's tocptr */           \
+         "addi 1,1,-128\n\t"  /* expand stack frame */            \
+         /* arg9 */                                               \
+         "ld  3,72(11)\n\t"                                       \
+         "std 3,112(1)\n\t"                                       \
+         /* args1-8 */                                            \
+         "ld   3, 8(11)\n\t"  /* arg1->r3 */                      \
+         "ld   4, 16(11)\n\t" /* arg2->r4 */                      \
+         "ld   5, 24(11)\n\t" /* arg3->r5 */                      \
+         "ld   6, 32(11)\n\t" /* arg4->r6 */                      \
+         "ld   7, 40(11)\n\t" /* arg5->r7 */                      \
+         "ld   8, 48(11)\n\t" /* arg6->r8 */                      \
+         "ld   9, 56(11)\n\t" /* arg7->r9 */                      \
+         "ld  10, 64(11)\n\t" /* arg8->r10 */                     \
+         "ld  11, 0(11)\n\t"  /* target->r11 */                   \
+         VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R11                  \
+         "mr 11,%1\n\t"                                           \
+         "mr %0,3\n\t"                                            \
+         "ld 2,-16(11)\n\t" /* restore tocptr */                  \
+         "addi 1,1,128"     /* restore frame */                   \
+         : /*out*/   "=r" (_res)                                  \
+         : /*in*/    "r" (&_argvec[2])                            \
+         : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS          \
+      );                                                          \
+      lval = (__typeof__(lval)) _res;                             \
+   } while (0)
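+
+/* Note (not part of the original header): only eight integer argument
+   registers are available, so arg9 must be passed in memory.  The
+   macro above grows the frame by 128 bytes and stores arg9 at 112(1);
+   assuming the standard 64-bit PowerPC ELF layout, the parameter save
+   area begins 48 bytes above the stack pointer, so the ninth
+   doubleword parameter belongs at 48 + 8*8 = 112. */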
+
+#define CALL_FN_W_10W(lval, orig, arg1,arg2,arg3,arg4,arg5,arg6,  \
+                                  arg7,arg8,arg9,arg10)           \
+   do {                                                           \
+      volatile OrigFn        _orig = (orig);                      \
+      volatile unsigned long _argvec[3+10];                       \
+      volatile unsigned long _res;                                \
+      /* _argvec[0] holds current r2 across the call */           \
+      _argvec[1]   = (unsigned long)_orig.r2;                     \
+      _argvec[2]   = (unsigned long)_orig.nraddr;                 \
+      _argvec[2+1] = (unsigned long)arg1;                         \
+      _argvec[2+2] = (unsigned long)arg2;                         \
+      _argvec[2+3] = (unsigned long)arg3;                         \
+      _argvec[2+4] = (unsigned long)arg4;                         \
+      _argvec[2+5] = (unsigned long)arg5;                         \
+      _argvec[2+6] = (unsigned long)arg6;                         \
+      _argvec[2+7] = (unsigned long)arg7;                         \
+      _argvec[2+8] = (unsigned long)arg8;                         \
+      _argvec[2+9] = (unsigned long)arg9;                         \
+      _argvec[2+10] = (unsigned long)arg10;                       \
+      __asm__ volatile(                                           \
+         "mr 11,%1\n\t"                                           \
+         "std 2,-16(11)\n\t"  /* save tocptr */                   \
+         "ld   2,-8(11)\n\t"  /* use nraddr's tocptr */           \
+         "addi 1,1,-128\n\t"  /* expand stack frame */            \
+         /* arg10 */                                              \
+         "ld  3,80(11)\n\t"                                       \
+         "std 3,120(1)\n\t"                                       \
+         /* arg9 */                                               \
+         "ld  3,72(11)\n\t"                                       \
+         "std 3,112(1)\n\t"                                       \
+         /* args1-8 */                                            \
+         "ld   3, 8(11)\n\t"  /* arg1->r3 */                      \
+         "ld   4, 16(11)\n\t" /* arg2->r4 */                      \
+         "ld   5, 24(11)\n\t" /* arg3->r5 */                      \
+         "ld   6, 32(11)\n\t" /* arg4->r6 */                      \
+         "ld   7, 40(11)\n\t" /* arg5->r7 */                      \
+         "ld   8, 48(11)\n\t" /* arg6->r8 */                      \
+         "ld   9, 56(11)\n\t" /* arg7->r9 */                      \
+         "ld  10, 64(11)\n\t" /* arg8->r10 */                     \
+         "ld  11, 0(11)\n\t"  /* target->r11 */                   \
+         VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R11                  \
+         "mr 11,%1\n\t"                                           \
+         "mr %0,3\n\t"                                            \
+         "ld 2,-16(11)\n\t" /* restore tocptr */                  \
+         "addi 1,1,128"     /* restore frame */                   \
+         : /*out*/   "=r" (_res)                                  \
+         : /*in*/    "r" (&_argvec[2])                            \
+         : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS          \
+      );                                                          \
+      lval = (__typeof__(lval)) _res;                             \
+   } while (0)
+
+#define CALL_FN_W_11W(lval, orig, arg1,arg2,arg3,arg4,arg5,arg6,  \
+                                  arg7,arg8,arg9,arg10,arg11)     \
+   do {                                                           \
+      volatile OrigFn        _orig = (orig);                      \
+      volatile unsigned long _argvec[3+11];                       \
+      volatile unsigned long _res;                                \
+      /* _argvec[0] holds current r2 across the call */           \
+      _argvec[1]   = (unsigned long)_orig.r2;                     \
+      _argvec[2]   = (unsigned long)_orig.nraddr;                 \
+      _argvec[2+1] = (unsigned long)arg1;                         \
+      _argvec[2+2] = (unsigned long)arg2;                         \
+      _argvec[2+3] = (unsigned long)arg3;                         \
+      _argvec[2+4] = (unsigned long)arg4;                         \
+      _argvec[2+5] = (unsigned long)arg5;                         \
+      _argvec[2+6] = (unsigned long)arg6;                         \
+      _argvec[2+7] = (unsigned long)arg7;                         \
+      _argvec[2+8] = (unsigned long)arg8;                         \
+      _argvec[2+9] = (unsigned long)arg9;                         \
+      _argvec[2+10] = (unsigned long)arg10;                       \
+      _argvec[2+11] = (unsigned long)arg11;                       \
+      __asm__ volatile(                                           \
+         "mr 11,%1\n\t"                                           \
+         "std 2,-16(11)\n\t"  /* save tocptr */                   \
+         "ld   2,-8(11)\n\t"  /* use nraddr's tocptr */           \
+         "addi 1,1,-144\n\t"  /* expand stack frame */            \
+         /* arg11 */                                              \
+         "ld  3,88(11)\n\t"                                       \
+         "std 3,128(1)\n\t"                                       \
+         /* arg10 */                                              \
+         "ld  3,80(11)\n\t"                                       \
+         "std 3,120(1)\n\t"                                       \
+         /* arg9 */                                               \
+         "ld  3,72(11)\n\t"                                       \
+         "std 3,112(1)\n\t"                                       \
+         /* args1-8 */                                            \
+         "ld   3, 8(11)\n\t"  /* arg1->r3 */                      \
+         "ld   4, 16(11)\n\t" /* arg2->r4 */                      \
+         "ld   5, 24(11)\n\t" /* arg3->r5 */                      \
+         "ld   6, 32(11)\n\t" /* arg4->r6 */                      \
+         "ld   7, 40(11)\n\t" /* arg5->r7 */                      \
+         "ld   8, 48(11)\n\t" /* arg6->r8 */                      \
+         "ld   9, 56(11)\n\t" /* arg7->r9 */                      \
+         "ld  10, 64(11)\n\t" /* arg8->r10 */                     \
+         "ld  11, 0(11)\n\t"  /* target->r11 */                   \
+         VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R11                  \
+         "mr 11,%1\n\t"                                           \
+         "mr %0,3\n\t"                                            \
+         "ld 2,-16(11)\n\t" /* restore tocptr */                  \
+         "addi 1,1,144"     /* restore frame */                   \
+         : /*out*/   "=r" (_res)                                  \
+         : /*in*/    "r" (&_argvec[2])                            \
+         : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS          \
+      );                                                          \
+      lval = (__typeof__(lval)) _res;                             \
+   } while (0)
+
+#define CALL_FN_W_12W(lval, orig, arg1,arg2,arg3,arg4,arg5,arg6,  \
+                                arg7,arg8,arg9,arg10,arg11,arg12) \
+   do {                                                           \
+      volatile OrigFn        _orig = (orig);                      \
+      volatile unsigned long _argvec[3+12];                       \
+      volatile unsigned long _res;                                \
+      /* _argvec[0] holds current r2 across the call */           \
+      _argvec[1]   = (unsigned long)_orig.r2;                     \
+      _argvec[2]   = (unsigned long)_orig.nraddr;                 \
+      _argvec[2+1] = (unsigned long)arg1;                         \
+      _argvec[2+2] = (unsigned long)arg2;                         \
+      _argvec[2+3] = (unsigned long)arg3;                         \
+      _argvec[2+4] = (unsigned long)arg4;                         \
+      _argvec[2+5] = (unsigned long)arg5;                         \
+      _argvec[2+6] = (unsigned long)arg6;                         \
+      _argvec[2+7] = (unsigned long)arg7;                         \
+      _argvec[2+8] = (unsigned long)arg8;                         \
+      _argvec[2+9] = (unsigned long)arg9;                         \
+      _argvec[2+10] = (unsigned long)arg10;                       \
+      _argvec[2+11] = (unsigned long)arg11;                       \
+      _argvec[2+12] = (unsigned long)arg12;                       \
+      __asm__ volatile(                                           \
+         "mr 11,%1\n\t"                                           \
+         "std 2,-16(11)\n\t"  /* save tocptr */                   \
+         "ld   2,-8(11)\n\t"  /* use nraddr's tocptr */           \
+         "addi 1,1,-144\n\t"  /* expand stack frame */            \
+         /* arg12 */                                              \
+         "ld  3,96(11)\n\t"                                       \
+         "std 3,136(1)\n\t"                                       \
+         /* arg11 */                                              \
+         "ld  3,88(11)\n\t"                                       \
+         "std 3,128(1)\n\t"                                       \
+         /* arg10 */                                              \
+         "ld  3,80(11)\n\t"                                       \
+         "std 3,120(1)\n\t"                                       \
+         /* arg9 */                                               \
+         "ld  3,72(11)\n\t"                                       \
+         "std 3,112(1)\n\t"                                       \
+         /* args1-8 */                                            \
+         "ld   3, 8(11)\n\t"  /* arg1->r3 */                      \
+         "ld   4, 16(11)\n\t" /* arg2->r4 */                      \
+         "ld   5, 24(11)\n\t" /* arg3->r5 */                      \
+         "ld   6, 32(11)\n\t" /* arg4->r6 */                      \
+         "ld   7, 40(11)\n\t" /* arg5->r7 */                      \
+         "ld   8, 48(11)\n\t" /* arg6->r8 */                      \
+         "ld   9, 56(11)\n\t" /* arg7->r9 */                      \
+         "ld  10, 64(11)\n\t" /* arg8->r10 */                     \
+         "ld  11, 0(11)\n\t"  /* target->r11 */                   \
+         VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R11                  \
+         "mr 11,%1\n\t"                                           \
+         "mr %0,3\n\t"                                            \
+         "ld 2,-16(11)\n\t" /* restore tocptr */                  \
+         "addi 1,1,144"     /* restore frame */                   \
+         : /*out*/   "=r" (_res)                                  \
+         : /*in*/    "r" (&_argvec[2])                            \
+         : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS          \
+      );                                                          \
+      lval = (__typeof__(lval)) _res;                             \
+   } while (0)
+
+#endif /* PLAT_ppc64_linux */
+
+/* ------------------------ ppc32-aix5 ------------------------- */
+
+#if defined(PLAT_ppc32_aix5)
+
+/* ARGREGS: r3 r4 r5 r6 r7 r8 r9 r10 (the rest on stack somewhere) */
+
+/* These regs are trashed by the hidden call. */
+#define __CALLER_SAVED_REGS                                       \
+   "lr", "ctr", "xer",                                            \
+   "cr0", "cr1", "cr2", "cr3", "cr4", "cr5", "cr6", "cr7",        \
+   "r0", "r2", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10",   \
+   "r11", "r12", "r13"
+
+/* Expand the stack frame, copying enough info that unwinding
+   still works.  Trashes r3. */
+
+#define VG_EXPAND_FRAME_BY_trashes_r3(_n_fr)                      \
+         "addi 1,1,-" #_n_fr "\n\t"                               \
+         "lwz  3," #_n_fr "(1)\n\t"                               \
+         "stw  3,0(1)\n\t"
+
+#define VG_CONTRACT_FRAME_BY(_n_fr)                               \
+         "addi 1,1," #_n_fr "\n\t"
+
+/* These CALL_FN_ macros assume that on ppc32-aix5, sizeof(unsigned
+   long) == 4. */
+
+#define CALL_FN_W_v(lval, orig)                                   \
+   do {                                                           \
+      volatile OrigFn        _orig = (orig);                      \
+      volatile unsigned long _argvec[3+0];                        \
+      volatile unsigned long _res;                                \
+      /* _argvec[0] holds current r2 across the call */           \
+      _argvec[1] = (unsigned long)_orig.r2;                       \
+      _argvec[2] = (unsigned long)_orig.nraddr;                   \
+      __asm__ volatile(                                           \
+         "mr 11,%1\n\t"                                           \
+         VG_EXPAND_FRAME_BY_trashes_r3(512)                       \
+         "stw  2,-8(11)\n\t"  /* save tocptr */                   \
+         "lwz  2,-4(11)\n\t"  /* use nraddr's tocptr */           \
+         "lwz 11, 0(11)\n\t"  /* target->r11 */                   \
+         VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R11                  \
+         "mr 11,%1\n\t"                                           \
+         "mr %0,3\n\t"                                            \
+         "lwz 2,-8(11)\n\t" /* restore tocptr */                  \
+         VG_CONTRACT_FRAME_BY(512)                                \
+         : /*out*/   "=r" (_res)                                  \
+         : /*in*/    "r" (&_argvec[2])                            \
+         : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS          \
+      );                                                          \
+      lval = (__typeof__(lval)) _res;                             \
+   } while (0)
+
+#define CALL_FN_W_W(lval, orig, arg1)                             \
+   do {                                                           \
+      volatile OrigFn        _orig = (orig);                      \
+      volatile unsigned long _argvec[3+1];                        \
+      volatile unsigned long _res;                                \
+      /* _argvec[0] holds current r2 across the call */           \
+      _argvec[1]   = (unsigned long)_orig.r2;                     \
+      _argvec[2]   = (unsigned long)_orig.nraddr;                 \
+      _argvec[2+1] = (unsigned long)arg1;                         \
+      __asm__ volatile(                                           \
+         "mr 11,%1\n\t"                                           \
+         VG_EXPAND_FRAME_BY_trashes_r3(512)                       \
+         "stw  2,-8(11)\n\t"  /* save tocptr */                   \
+         "lwz  2,-4(11)\n\t"  /* use nraddr's tocptr */           \
+         "lwz  3, 4(11)\n\t"  /* arg1->r3 */                      \
+         "lwz 11, 0(11)\n\t"  /* target->r11 */                   \
+         VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R11                  \
+         "mr 11,%1\n\t"                                           \
+         "mr %0,3\n\t"                                            \
+         "lwz 2,-8(11)\n\t" /* restore tocptr */                  \
+         VG_CONTRACT_FRAME_BY(512)                                \
+         : /*out*/   "=r" (_res)                                  \
+         : /*in*/    "r" (&_argvec[2])                            \
+         : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS          \
+      );                                                          \
+      lval = (__typeof__(lval)) _res;                             \
+   } while (0)
+
+#define CALL_FN_W_WW(lval, orig, arg1,arg2)                       \
+   do {                                                           \
+      volatile OrigFn        _orig = (orig);                      \
+      volatile unsigned long _argvec[3+2];                        \
+      volatile unsigned long _res;                                \
+      /* _argvec[0] holds current r2 across the call */           \
+      _argvec[1]   = (unsigned long)_orig.r2;                     \
+      _argvec[2]   = (unsigned long)_orig.nraddr;                 \
+      _argvec[2+1] = (unsigned long)arg1;                         \
+      _argvec[2+2] = (unsigned long)arg2;                         \
+      __asm__ volatile(                                           \
+         "mr 11,%1\n\t"                                           \
+         VG_EXPAND_FRAME_BY_trashes_r3(512)                       \
+         "stw  2,-8(11)\n\t"  /* save tocptr */                   \
+         "lwz  2,-4(11)\n\t"  /* use nraddr's tocptr */           \
+         "lwz  3, 4(11)\n\t"  /* arg1->r3 */                      \
+         "lwz  4, 8(11)\n\t"  /* arg2->r4 */                      \
+         "lwz 11, 0(11)\n\t"  /* target->r11 */                   \
+         VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R11                  \
+         "mr 11,%1\n\t"                                           \
+         "mr %0,3\n\t"                                            \
+         "lwz 2,-8(11)\n\t" /* restore tocptr */                  \
+         VG_CONTRACT_FRAME_BY(512)                                \
+         : /*out*/   "=r" (_res)                                  \
+         : /*in*/    "r" (&_argvec[2])                            \
+         : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS          \
+      );                                                          \
+      lval = (__typeof__(lval)) _res;                             \
+   } while (0)
+
+#define CALL_FN_W_WWW(lval, orig, arg1,arg2,arg3)                 \
+   do {                                                           \
+      volatile OrigFn        _orig = (orig);                      \
+      volatile unsigned long _argvec[3+3];                        \
+      volatile unsigned long _res;                                \
+      /* _argvec[0] holds current r2 across the call */           \
+      _argvec[1]   = (unsigned long)_orig.r2;                     \
+      _argvec[2]   = (unsigned long)_orig.nraddr;                 \
+      _argvec[2+1] = (unsigned long)arg1;                         \
+      _argvec[2+2] = (unsigned long)arg2;                         \
+      _argvec[2+3] = (unsigned long)arg3;                         \
+      __asm__ volatile(                                           \
+         "mr 11,%1\n\t"                                           \
+         VG_EXPAND_FRAME_BY_trashes_r3(512)                       \
+         "stw  2,-8(11)\n\t"  /* save tocptr */                   \
+         "lwz  2,-4(11)\n\t"  /* use nraddr's tocptr */           \
+         "lwz  3, 4(11)\n\t"  /* arg1->r3 */                      \
+         "lwz  4, 8(11)\n\t"  /* arg2->r4 */                      \
+         "lwz  5, 12(11)\n\t" /* arg3->r5 */                      \
+         "lwz 11, 0(11)\n\t"  /* target->r11 */                   \
+         VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R11                  \
+         "mr 11,%1\n\t"                                           \
+         "mr %0,3\n\t"                                            \
+         "lwz 2,-8(11)\n\t" /* restore tocptr */                  \
+         VG_CONTRACT_FRAME_BY(512)                                \
+         : /*out*/   "=r" (_res)                                  \
+         : /*in*/    "r" (&_argvec[2])                            \
+         : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS          \
+      );                                                          \
+      lval = (__typeof__(lval)) _res;                             \
+   } while (0)
+
+#define CALL_FN_W_WWWW(lval, orig, arg1,arg2,arg3,arg4)           \
+   do {                                                           \
+      volatile OrigFn        _orig = (orig);                      \
+      volatile unsigned long _argvec[3+4];                        \
+      volatile unsigned long _res;                                \
+      /* _argvec[0] holds current r2 across the call */           \
+      _argvec[1]   = (unsigned long)_orig.r2;                     \
+      _argvec[2]   = (unsigned long)_orig.nraddr;                 \
+      _argvec[2+1] = (unsigned long)arg1;                         \
+      _argvec[2+2] = (unsigned long)arg2;                         \
+      _argvec[2+3] = (unsigned long)arg3;                         \
+      _argvec[2+4] = (unsigned long)arg4;                         \
+      __asm__ volatile(                                           \
+         "mr 11,%1\n\t"                                           \
+         VG_EXPAND_FRAME_BY_trashes_r3(512)                       \
+         "stw  2,-8(11)\n\t"  /* save tocptr */                   \
+         "lwz  2,-4(11)\n\t"  /* use nraddr's tocptr */           \
+         "lwz  3, 4(11)\n\t"  /* arg1->r3 */                      \
+         "lwz  4, 8(11)\n\t"  /* arg2->r4 */                      \
+         "lwz  5, 12(11)\n\t" /* arg3->r5 */                      \
+         "lwz  6, 16(11)\n\t" /* arg4->r6 */                      \
+         "lwz 11, 0(11)\n\t"  /* target->r11 */                   \
+         VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R11                  \
+         "mr 11,%1\n\t"                                           \
+         "mr %0,3\n\t"                                            \
+         "lwz 2,-8(11)\n\t" /* restore tocptr */                  \
+         VG_CONTRACT_FRAME_BY(512)                                \
+         : /*out*/   "=r" (_res)                                  \
+         : /*in*/    "r" (&_argvec[2])                            \
+         : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS          \
+      );                                                          \
+      lval = (__typeof__(lval)) _res;                             \
+   } while (0)
+
+#define CALL_FN_W_5W(lval, orig, arg1,arg2,arg3,arg4,arg5)        \
+   do {                                                           \
+      volatile OrigFn        _orig = (orig);                      \
+      volatile unsigned long _argvec[3+5];                        \
+      volatile unsigned long _res;                                \
+      /* _argvec[0] holds current r2 across the call */           \
+      _argvec[1]   = (unsigned long)_orig.r2;                     \
+      _argvec[2]   = (unsigned long)_orig.nraddr;                 \
+      _argvec[2+1] = (unsigned long)arg1;                         \
+      _argvec[2+2] = (unsigned long)arg2;                         \
+      _argvec[2+3] = (unsigned long)arg3;                         \
+      _argvec[2+4] = (unsigned long)arg4;                         \
+      _argvec[2+5] = (unsigned long)arg5;                         \
+      __asm__ volatile(                                           \
+         "mr 11,%1\n\t"                                           \
+         VG_EXPAND_FRAME_BY_trashes_r3(512)                       \
+         "stw  2,-8(11)\n\t"  /* save tocptr */                   \
+         "lwz  2,-4(11)\n\t"  /* use nraddr's tocptr */           \
+         "lwz  3, 4(11)\n\t"  /* arg1->r3 */                      \
+         "lwz  4, 8(11)\n\t" /* arg2->r4 */                       \
+         "lwz  5, 12(11)\n\t" /* arg3->r5 */                      \
+         "lwz  6, 16(11)\n\t" /* arg4->r6 */                      \
+         "lwz  7, 20(11)\n\t" /* arg5->r7 */                      \
+         "lwz 11, 0(11)\n\t"  /* target->r11 */                   \
+         VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R11                  \
+         "mr 11,%1\n\t"                                           \
+         "mr %0,3\n\t"                                            \
+         "lwz 2,-8(11)\n\t" /* restore tocptr */                  \
+         VG_CONTRACT_FRAME_BY(512)                                \
+         : /*out*/   "=r" (_res)                                  \
+         : /*in*/    "r" (&_argvec[2])                            \
+         : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS          \
+      );                                                          \
+      lval = (__typeof__(lval)) _res;                             \
+   } while (0)
+
+#define CALL_FN_W_6W(lval, orig, arg1,arg2,arg3,arg4,arg5,arg6)   \
+   do {                                                           \
+      volatile OrigFn        _orig = (orig);                      \
+      volatile unsigned long _argvec[3+6];                        \
+      volatile unsigned long _res;                                \
+      /* _argvec[0] holds current r2 across the call */           \
+      _argvec[1]   = (unsigned long)_orig.r2;                     \
+      _argvec[2]   = (unsigned long)_orig.nraddr;                 \
+      _argvec[2+1] = (unsigned long)arg1;                         \
+      _argvec[2+2] = (unsigned long)arg2;                         \
+      _argvec[2+3] = (unsigned long)arg3;                         \
+      _argvec[2+4] = (unsigned long)arg4;                         \
+      _argvec[2+5] = (unsigned long)arg5;                         \
+      _argvec[2+6] = (unsigned long)arg6;                         \
+      __asm__ volatile(                                           \
+         "mr 11,%1\n\t"                                           \
+         VG_EXPAND_FRAME_BY_trashes_r3(512)                       \
+         "stw  2,-8(11)\n\t"  /* save tocptr */                   \
+         "lwz  2,-4(11)\n\t"  /* use nraddr's tocptr */           \
+         "lwz  3, 4(11)\n\t"  /* arg1->r3 */                      \
+         "lwz  4, 8(11)\n\t"  /* arg2->r4 */                      \
+         "lwz  5, 12(11)\n\t" /* arg3->r5 */                      \
+         "lwz  6, 16(11)\n\t" /* arg4->r6 */                      \
+         "lwz  7, 20(11)\n\t" /* arg5->r7 */                      \
+         "lwz  8, 24(11)\n\t" /* arg6->r8 */                      \
+         "lwz 11, 0(11)\n\t"  /* target->r11 */                   \
+         VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R11                  \
+         "mr 11,%1\n\t"                                           \
+         "mr %0,3\n\t"                                            \
+         "lwz 2,-8(11)\n\t" /* restore tocptr */                  \
+         VG_CONTRACT_FRAME_BY(512)                                \
+         : /*out*/   "=r" (_res)                                  \
+         : /*in*/    "r" (&_argvec[2])                            \
+         : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS          \
+      );                                                          \
+      lval = (__typeof__(lval)) _res;                             \
+   } while (0)
+
+#define CALL_FN_W_7W(lval, orig, arg1,arg2,arg3,arg4,arg5,arg6,   \
+                                 arg7)                            \
+   do {                                                           \
+      volatile OrigFn        _orig = (orig);                      \
+      volatile unsigned long _argvec[3+7];                        \
+      volatile unsigned long _res;                                \
+      /* _argvec[0] holds current r2 across the call */           \
+      _argvec[1]   = (unsigned long)_orig.r2;                     \
+      _argvec[2]   = (unsigned long)_orig.nraddr;                 \
+      _argvec[2+1] = (unsigned long)arg1;                         \
+      _argvec[2+2] = (unsigned long)arg2;                         \
+      _argvec[2+3] = (unsigned long)arg3;                         \
+      _argvec[2+4] = (unsigned long)arg4;                         \
+      _argvec[2+5] = (unsigned long)arg5;                         \
+      _argvec[2+6] = (unsigned long)arg6;                         \
+      _argvec[2+7] = (unsigned long)arg7;                         \
+      __asm__ volatile(                                           \
+         "mr 11,%1\n\t"                                           \
+         VG_EXPAND_FRAME_BY_trashes_r3(512)                       \
+         "stw  2,-8(11)\n\t"  /* save tocptr */                   \
+         "lwz  2,-4(11)\n\t"  /* use nraddr's tocptr */           \
+         "lwz  3, 4(11)\n\t"  /* arg1->r3 */                      \
+         "lwz  4, 8(11)\n\t"  /* arg2->r4 */                      \
+         "lwz  5, 12(11)\n\t" /* arg3->r5 */                      \
+         "lwz  6, 16(11)\n\t" /* arg4->r6 */                      \
+         "lwz  7, 20(11)\n\t" /* arg5->r7 */                      \
+         "lwz  8, 24(11)\n\t" /* arg6->r8 */                      \
+         "lwz  9, 28(11)\n\t" /* arg7->r9 */                      \
+         "lwz 11, 0(11)\n\t"  /* target->r11 */                   \
+         VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R11                  \
+         "mr 11,%1\n\t"                                           \
+         "mr %0,3\n\t"                                            \
+         "lwz 2,-8(11)\n\t" /* restore tocptr */                  \
+         VG_CONTRACT_FRAME_BY(512)                                \
+         : /*out*/   "=r" (_res)                                  \
+         : /*in*/    "r" (&_argvec[2])                            \
+         : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS          \
+      );                                                          \
+      lval = (__typeof__(lval)) _res;                             \
+   } while (0)
+
+#define CALL_FN_W_8W(lval, orig, arg1,arg2,arg3,arg4,arg5,arg6,   \
+                                 arg7,arg8)                       \
+   do {                                                           \
+      volatile OrigFn        _orig = (orig);                      \
+      volatile unsigned long _argvec[3+8];                        \
+      volatile unsigned long _res;                                \
+      /* _argvec[0] holds current r2 across the call */           \
+      _argvec[1]   = (unsigned long)_orig.r2;                     \
+      _argvec[2]   = (unsigned long)_orig.nraddr;                 \
+      _argvec[2+1] = (unsigned long)arg1;                         \
+      _argvec[2+2] = (unsigned long)arg2;                         \
+      _argvec[2+3] = (unsigned long)arg3;                         \
+      _argvec[2+4] = (unsigned long)arg4;                         \
+      _argvec[2+5] = (unsigned long)arg5;                         \
+      _argvec[2+6] = (unsigned long)arg6;                         \
+      _argvec[2+7] = (unsigned long)arg7;                         \
+      _argvec[2+8] = (unsigned long)arg8;                         \
+      __asm__ volatile(                                           \
+         "mr 11,%1\n\t"                                           \
+         VG_EXPAND_FRAME_BY_trashes_r3(512)                       \
+         "stw  2,-8(11)\n\t"  /* save tocptr */                   \
+         "lwz  2,-4(11)\n\t"  /* use nraddr's tocptr */           \
+         "lwz  3, 4(11)\n\t"  /* arg1->r3 */                      \
+         "lwz  4, 8(11)\n\t"  /* arg2->r4 */                      \
+         "lwz  5, 12(11)\n\t" /* arg3->r5 */                      \
+         "lwz  6, 16(11)\n\t" /* arg4->r6 */                      \
+         "lwz  7, 20(11)\n\t" /* arg5->r7 */                      \
+         "lwz  8, 24(11)\n\t" /* arg6->r8 */                      \
+         "lwz  9, 28(11)\n\t" /* arg7->r9 */                      \
+         "lwz 10, 32(11)\n\t" /* arg8->r10 */                     \
+         "lwz 11, 0(11)\n\t"  /* target->r11 */                   \
+         VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R11                  \
+         "mr 11,%1\n\t"                                           \
+         "mr %0,3\n\t"                                            \
+         "lwz 2,-8(11)\n\t" /* restore tocptr */                  \
+         VG_CONTRACT_FRAME_BY(512)                                \
+         : /*out*/   "=r" (_res)                                  \
+         : /*in*/    "r" (&_argvec[2])                            \
+         : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS          \
+      );                                                          \
+      lval = (__typeof__(lval)) _res;                             \
+   } while (0)
+
+#define CALL_FN_W_9W(lval, orig, arg1,arg2,arg3,arg4,arg5,arg6,   \
+                                 arg7,arg8,arg9)                  \
+   do {                                                           \
+      volatile OrigFn        _orig = (orig);                      \
+      volatile unsigned long _argvec[3+9];                        \
+      volatile unsigned long _res;                                \
+      /* _argvec[0] holds current r2 across the call */           \
+      _argvec[1]   = (unsigned long)_orig.r2;                     \
+      _argvec[2]   = (unsigned long)_orig.nraddr;                 \
+      _argvec[2+1] = (unsigned long)arg1;                         \
+      _argvec[2+2] = (unsigned long)arg2;                         \
+      _argvec[2+3] = (unsigned long)arg3;                         \
+      _argvec[2+4] = (unsigned long)arg4;                         \
+      _argvec[2+5] = (unsigned long)arg5;                         \
+      _argvec[2+6] = (unsigned long)arg6;                         \
+      _argvec[2+7] = (unsigned long)arg7;                         \
+      _argvec[2+8] = (unsigned long)arg8;                         \
+      _argvec[2+9] = (unsigned long)arg9;                         \
+      __asm__ volatile(                                           \
+         "mr 11,%1\n\t"                                           \
+         VG_EXPAND_FRAME_BY_trashes_r3(512)                       \
+         "stw  2,-8(11)\n\t"  /* save tocptr */                   \
+         "lwz  2,-4(11)\n\t"  /* use nraddr's tocptr */           \
+         VG_EXPAND_FRAME_BY_trashes_r3(64)                        \
+         /* arg9 */                                               \
+         "lwz 3,36(11)\n\t"                                       \
+         "stw 3,56(1)\n\t"                                        \
+         /* args1-8 */                                            \
+         "lwz  3, 4(11)\n\t"  /* arg1->r3 */                      \
+         "lwz  4, 8(11)\n\t"  /* arg2->r4 */                      \
+         "lwz  5, 12(11)\n\t" /* arg3->r5 */                      \
+         "lwz  6, 16(11)\n\t" /* arg4->r6 */                      \
+         "lwz  7, 20(11)\n\t" /* arg5->r7 */                      \
+         "lwz  8, 24(11)\n\t" /* arg6->r8 */                      \
+         "lwz  9, 28(11)\n\t" /* arg7->r9 */                      \
+         "lwz 10, 32(11)\n\t" /* arg8->r10 */                     \
+         "lwz 11, 0(11)\n\t"  /* target->r11 */                   \
+         VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R11                  \
+         "mr 11,%1\n\t"                                           \
+         "mr %0,3\n\t"                                            \
+         "lwz 2,-8(11)\n\t" /* restore tocptr */                  \
+         VG_CONTRACT_FRAME_BY(64)                                 \
+         VG_CONTRACT_FRAME_BY(512)                                \
+         : /*out*/   "=r" (_res)                                  \
+         : /*in*/    "r" (&_argvec[2])                            \
+         : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS          \
+      );                                                          \
+      lval = (__typeof__(lval)) _res;                             \
+   } while (0)
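+
+/* Note (not part of the original header): as on the other PPC targets,
+   arg9 cannot go in a register.  Assuming the 32-bit AIX layout, the
+   caller's linkage area occupies the first 24 bytes above the stack
+   pointer and the parameter words follow, so the ninth word-sized
+   argument is stored at 24 + 8*4 = 56(1) above. */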
+
+#define CALL_FN_W_10W(lval, orig, arg1,arg2,arg3,arg4,arg5,arg6,  \
+                                  arg7,arg8,arg9,arg10)           \
+   do {                                                           \
+      volatile OrigFn        _orig = (orig);                      \
+      volatile unsigned long _argvec[3+10];                       \
+      volatile unsigned long _res;                                \
+      /* _argvec[0] holds current r2 across the call */           \
+      _argvec[1]   = (unsigned long)_orig.r2;                     \
+      _argvec[2]   = (unsigned long)_orig.nraddr;                 \
+      _argvec[2+1] = (unsigned long)arg1;                         \
+      _argvec[2+2] = (unsigned long)arg2;                         \
+      _argvec[2+3] = (unsigned long)arg3;                         \
+      _argvec[2+4] = (unsigned long)arg4;                         \
+      _argvec[2+5] = (unsigned long)arg5;                         \
+      _argvec[2+6] = (unsigned long)arg6;                         \
+      _argvec[2+7] = (unsigned long)arg7;                         \
+      _argvec[2+8] = (unsigned long)arg8;                         \
+      _argvec[2+9] = (unsigned long)arg9;                         \
+      _argvec[2+10] = (unsigned long)arg10;                       \
+      __asm__ volatile(                                           \
+         "mr 11,%1\n\t"                                           \
+         VG_EXPAND_FRAME_BY_trashes_r3(512)                       \
+         "stw  2,-8(11)\n\t"  /* save tocptr */                   \
+         "lwz  2,-4(11)\n\t"  /* use nraddr's tocptr */           \
+         VG_EXPAND_FRAME_BY_trashes_r3(64)                        \
+         /* arg10 */                                              \
+         "lwz 3,40(11)\n\t"                                       \
+         "stw 3,60(1)\n\t"                                        \
+         /* arg9 */                                               \
+         "lwz 3,36(11)\n\t"                                       \
+         "stw 3,56(1)\n\t"                                        \
+         /* args1-8 */                                            \
+         "lwz  3, 4(11)\n\t"  /* arg1->r3 */                      \
+         "lwz  4, 8(11)\n\t"  /* arg2->r4 */                      \
+         "lwz  5, 12(11)\n\t" /* arg3->r5 */                      \
+         "lwz  6, 16(11)\n\t" /* arg4->r6 */                      \
+         "lwz  7, 20(11)\n\t" /* arg5->r7 */                      \
+         "lwz  8, 24(11)\n\t" /* arg6->r8 */                      \
+         "lwz  9, 28(11)\n\t" /* arg7->r9 */                      \
+         "lwz 10, 32(11)\n\t" /* arg8->r10 */                     \
+         "lwz 11, 0(11)\n\t"  /* target->r11 */                   \
+         VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R11                  \
+         "mr 11,%1\n\t"                                           \
+         "mr %0,3\n\t"                                            \
+         "lwz 2,-8(11)\n\t" /* restore tocptr */                  \
+         VG_CONTRACT_FRAME_BY(64)                                 \
+         VG_CONTRACT_FRAME_BY(512)                                \
+         : /*out*/   "=r" (_res)                                  \
+         : /*in*/    "r" (&_argvec[2])                            \
+         : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS          \
+      );                                                          \
+      lval = (__typeof__(lval)) _res;                             \
+   } while (0)
+
+#define CALL_FN_W_11W(lval, orig, arg1,arg2,arg3,arg4,arg5,arg6,  \
+                                  arg7,arg8,arg9,arg10,arg11)     \
+   do {                                                           \
+      volatile OrigFn        _orig = (orig);                      \
+      volatile unsigned long _argvec[3+11];                       \
+      volatile unsigned long _res;                                \
+      /* _argvec[0] holds current r2 across the call */           \
+      _argvec[1]   = (unsigned long)_orig.r2;                     \
+      _argvec[2]   = (unsigned long)_orig.nraddr;                 \
+      _argvec[2+1] = (unsigned long)arg1;                         \
+      _argvec[2+2] = (unsigned long)arg2;                         \
+      _argvec[2+3] = (unsigned long)arg3;                         \
+      _argvec[2+4] = (unsigned long)arg4;                         \
+      _argvec[2+5] = (unsigned long)arg5;                         \
+      _argvec[2+6] = (unsigned long)arg6;                         \
+      _argvec[2+7] = (unsigned long)arg7;                         \
+      _argvec[2+8] = (unsigned long)arg8;                         \
+      _argvec[2+9] = (unsigned long)arg9;                         \
+      _argvec[2+10] = (unsigned long)arg10;                       \
+      _argvec[2+11] = (unsigned long)arg11;                       \
+      __asm__ volatile(                                           \
+         "mr 11,%1\n\t"                                           \
+         VG_EXPAND_FRAME_BY_trashes_r3(512)                       \
+         "stw  2,-8(11)\n\t"  /* save tocptr */                   \
+         "lwz  2,-4(11)\n\t"  /* use nraddr's tocptr */           \
+         VG_EXPAND_FRAME_BY_trashes_r3(72)                        \
+         /* arg11 */                                              \
+         "lwz 3,44(11)\n\t"                                       \
+         "stw 3,64(1)\n\t"                                        \
+         /* arg10 */                                              \
+         "lwz 3,40(11)\n\t"                                       \
+         "stw 3,60(1)\n\t"                                        \
+         /* arg9 */                                               \
+         "lwz 3,36(11)\n\t"                                       \
+         "stw 3,56(1)\n\t"                                        \
+         /* args1-8 */                                            \
+         "lwz  3, 4(11)\n\t"  /* arg1->r3 */                      \
+         "lwz  4, 8(11)\n\t"  /* arg2->r4 */                      \
+         "lwz  5, 12(11)\n\t" /* arg3->r5 */                      \
+         "lwz  6, 16(11)\n\t" /* arg4->r6 */                      \
+         "lwz  7, 20(11)\n\t" /* arg5->r7 */                      \
+         "lwz  8, 24(11)\n\t" /* arg6->r8 */                      \
+         "lwz  9, 28(11)\n\t" /* arg7->r9 */                      \
+         "lwz 10, 32(11)\n\t" /* arg8->r10 */                     \
+         "lwz 11, 0(11)\n\t"  /* target->r11 */                   \
+         VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R11                  \
+         "mr 11,%1\n\t"                                           \
+         "mr %0,3\n\t"                                            \
+         "lwz 2,-8(11)\n\t" /* restore tocptr */                  \
+         VG_CONTRACT_FRAME_BY(72)                                 \
+         VG_CONTRACT_FRAME_BY(512)                                \
+         : /*out*/   "=r" (_res)                                  \
+         : /*in*/    "r" (&_argvec[2])                            \
+         : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS          \
+      );                                                          \
+      lval = (__typeof__(lval)) _res;                             \
+   } while (0)
+
+#define CALL_FN_W_12W(lval, orig, arg1,arg2,arg3,arg4,arg5,arg6,  \
+                                arg7,arg8,arg9,arg10,arg11,arg12) \
+   do {                                                           \
+      volatile OrigFn        _orig = (orig);                      \
+      volatile unsigned long _argvec[3+12];                       \
+      volatile unsigned long _res;                                \
+      /* _argvec[0] holds current r2 across the call */           \
+      _argvec[1]   = (unsigned long)_orig.r2;                     \
+      _argvec[2]   = (unsigned long)_orig.nraddr;                 \
+      _argvec[2+1] = (unsigned long)arg1;                         \
+      _argvec[2+2] = (unsigned long)arg2;                         \
+      _argvec[2+3] = (unsigned long)arg3;                         \
+      _argvec[2+4] = (unsigned long)arg4;                         \
+      _argvec[2+5] = (unsigned long)arg5;                         \
+      _argvec[2+6] = (unsigned long)arg6;                         \
+      _argvec[2+7] = (unsigned long)arg7;                         \
+      _argvec[2+8] = (unsigned long)arg8;                         \
+      _argvec[2+9] = (unsigned long)arg9;                         \
+      _argvec[2+10] = (unsigned long)arg10;                       \
+      _argvec[2+11] = (unsigned long)arg11;                       \
+      _argvec[2+12] = (unsigned long)arg12;                       \
+      __asm__ volatile(                                           \
+         "mr 11,%1\n\t"                                           \
+         VG_EXPAND_FRAME_BY_trashes_r3(512)                       \
+         "stw  2,-8(11)\n\t"  /* save tocptr */                   \
+         "lwz  2,-4(11)\n\t"  /* use nraddr's tocptr */           \
+         VG_EXPAND_FRAME_BY_trashes_r3(72)                        \
+         /* arg12 */                                              \
+         "lwz 3,48(11)\n\t"                                       \
+         "stw 3,68(1)\n\t"                                        \
+         /* arg11 */                                              \
+         "lwz 3,44(11)\n\t"                                       \
+         "stw 3,64(1)\n\t"                                        \
+         /* arg10 */                                              \
+         "lwz 3,40(11)\n\t"                                       \
+         "stw 3,60(1)\n\t"                                        \
+         /* arg9 */                                               \
+         "lwz 3,36(11)\n\t"                                       \
+         "stw 3,56(1)\n\t"                                        \
+         /* args1-8 */                                            \
+         "lwz  3, 4(11)\n\t"  /* arg1->r3 */                      \
+         "lwz  4, 8(11)\n\t"  /* arg2->r4 */                      \
+         "lwz  5, 12(11)\n\t" /* arg3->r5 */                      \
+         "lwz  6, 16(11)\n\t" /* arg4->r6 */                      \
+         "lwz  7, 20(11)\n\t" /* arg5->r7 */                      \
+         "lwz  8, 24(11)\n\t" /* arg6->r8 */                      \
+         "lwz  9, 28(11)\n\t" /* arg7->r9 */                      \
+         "lwz 10, 32(11)\n\t" /* arg8->r10 */                     \
+         "lwz 11, 0(11)\n\t"  /* target->r11 */                   \
+         VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R11                  \
+         "mr 11,%1\n\t"                                           \
+         "mr %0,3\n\t"                                            \
+         "lwz 2,-8(11)\n\t" /* restore tocptr */                  \
+         VG_CONTRACT_FRAME_BY(72)                                 \
+         VG_CONTRACT_FRAME_BY(512)                                \
+         : /*out*/   "=r" (_res)                                  \
+         : /*in*/    "r" (&_argvec[2])                            \
+         : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS          \
+      );                                                          \
+      lval = (__typeof__(lval)) _res;                             \
+   } while (0)
+
+#endif /* PLAT_ppc32_aix5 */
+
+/* ------------------------ ppc64-aix5 ------------------------- */
+
+#if defined(PLAT_ppc64_aix5)
+
+/* ARGREGS: r3 r4 r5 r6 r7 r8 r9 r10 (the rest on stack somewhere) */
+
+/* These regs are trashed by the hidden call. */
+#define __CALLER_SAVED_REGS                                       \
+   "lr", "ctr", "xer",                                            \
+   "cr0", "cr1", "cr2", "cr3", "cr4", "cr5", "cr6", "cr7",        \
+   "r0", "r2", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10",   \
+   "r11", "r12", "r13"
+
+/* Expand the stack frame, copying enough info that unwinding
+   still works.  Trashes r3. */
+
+#define VG_EXPAND_FRAME_BY_trashes_r3(_n_fr)                      \
+         "addi 1,1,-" #_n_fr "\n\t"                               \
+         "ld   3," #_n_fr "(1)\n\t"                               \
+         "std  3,0(1)\n\t"
+
+#define VG_CONTRACT_FRAME_BY(_n_fr)                               \
+         "addi 1,1," #_n_fr "\n\t"
+
+/* These CALL_FN_ macros assume that on ppc64-aix5, sizeof(unsigned
+   long) == 8. */
+
+#define CALL_FN_W_v(lval, orig)                                   \
+   do {                                                           \
+      volatile OrigFn        _orig = (orig);                      \
+      volatile unsigned long _argvec[3+0];                        \
+      volatile unsigned long _res;                                \
+      /* _argvec[0] holds current r2 across the call */           \
+      _argvec[1] = (unsigned long)_orig.r2;                       \
+      _argvec[2] = (unsigned long)_orig.nraddr;                   \
+      __asm__ volatile(                                           \
+         "mr 11,%1\n\t"                                           \
+         VG_EXPAND_FRAME_BY_trashes_r3(512)                       \
+         "std  2,-16(11)\n\t" /* save tocptr */                   \
+         "ld   2,-8(11)\n\t"  /* use nraddr's tocptr */           \
+         "ld  11, 0(11)\n\t"  /* target->r11 */                   \
+         VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R11                  \
+         "mr 11,%1\n\t"                                           \
+         "mr %0,3\n\t"                                            \
+         "ld 2,-16(11)\n\t" /* restore tocptr */                  \
+         VG_CONTRACT_FRAME_BY(512)                                \
+         : /*out*/   "=r" (_res)                                  \
+         : /*in*/    "r" (&_argvec[2])                            \
+         : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS          \
+      );                                                          \
+      lval = (__typeof__(lval)) _res;                             \
+   } while (0)
+
+#define CALL_FN_W_W(lval, orig, arg1)                             \
+   do {                                                           \
+      volatile OrigFn        _orig = (orig);                      \
+      volatile unsigned long _argvec[3+1];                        \
+      volatile unsigned long _res;                                \
+      /* _argvec[0] holds current r2 across the call */           \
+      _argvec[1]   = (unsigned long)_orig.r2;                     \
+      _argvec[2]   = (unsigned long)_orig.nraddr;                 \
+      _argvec[2+1] = (unsigned long)arg1;                         \
+      __asm__ volatile(                                           \
+         "mr 11,%1\n\t"                                           \
+         VG_EXPAND_FRAME_BY_trashes_r3(512)                       \
+         "std  2,-16(11)\n\t" /* save tocptr */                   \
+         "ld   2,-8(11)\n\t"  /* use nraddr's tocptr */           \
+         "ld   3, 8(11)\n\t"  /* arg1->r3 */                      \
+         "ld  11, 0(11)\n\t"  /* target->r11 */                   \
+         VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R11                  \
+         "mr 11,%1\n\t"                                           \
+         "mr %0,3\n\t"                                            \
+         "ld 2,-16(11)\n\t" /* restore tocptr */                  \
+         VG_CONTRACT_FRAME_BY(512)                                \
+         : /*out*/   "=r" (_res)                                  \
+         : /*in*/    "r" (&_argvec[2])                            \
+         : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS          \
+      );                                                          \
+      lval = (__typeof__(lval)) _res;                             \
+   } while (0)
+
+#define CALL_FN_W_WW(lval, orig, arg1,arg2)                       \
+   do {                                                           \
+      volatile OrigFn        _orig = (orig);                      \
+      volatile unsigned long _argvec[3+2];                        \
+      volatile unsigned long _res;                                \
+      /* _argvec[0] holds current r2 across the call */           \
+      _argvec[1]   = (unsigned long)_orig.r2;                     \
+      _argvec[2]   = (unsigned long)_orig.nraddr;                 \
+      _argvec[2+1] = (unsigned long)arg1;                         \
+      _argvec[2+2] = (unsigned long)arg2;                         \
+      __asm__ volatile(                                           \
+         "mr 11,%1\n\t"                                           \
+         VG_EXPAND_FRAME_BY_trashes_r3(512)                       \
+         "std  2,-16(11)\n\t" /* save tocptr */                   \
+         "ld   2,-8(11)\n\t"  /* use nraddr's tocptr */           \
+         "ld   3, 8(11)\n\t"  /* arg1->r3 */                      \
+         "ld   4, 16(11)\n\t" /* arg2->r4 */                      \
+         "ld  11, 0(11)\n\t"  /* target->r11 */                   \
+         VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R11                  \
+         "mr 11,%1\n\t"                                           \
+         "mr %0,3\n\t"                                            \
+         "ld  2,-16(11)\n\t" /* restore tocptr */                 \
+         VG_CONTRACT_FRAME_BY(512)                                \
+         : /*out*/   "=r" (_res)                                  \
+         : /*in*/    "r" (&_argvec[2])                            \
+         : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS          \
+      );                                                          \
+      lval = (__typeof__(lval)) _res;                             \
+   } while (0)
+
+#define CALL_FN_W_WWW(lval, orig, arg1,arg2,arg3)                 \
+   do {                                                           \
+      volatile OrigFn        _orig = (orig);                      \
+      volatile unsigned long _argvec[3+3];                        \
+      volatile unsigned long _res;                                \
+      /* _argvec[0] holds current r2 across the call */           \
+      _argvec[1]   = (unsigned long)_orig.r2;                     \
+      _argvec[2]   = (unsigned long)_orig.nraddr;                 \
+      _argvec[2+1] = (unsigned long)arg1;                         \
+      _argvec[2+2] = (unsigned long)arg2;                         \
+      _argvec[2+3] = (unsigned long)arg3;                         \
+      __asm__ volatile(                                           \
+         "mr 11,%1\n\t"                                           \
+         VG_EXPAND_FRAME_BY_trashes_r3(512)                       \
+         "std  2,-16(11)\n\t" /* save tocptr */                   \
+         "ld   2,-8(11)\n\t"  /* use nraddr's tocptr */           \
+         "ld   3, 8(11)\n\t"  /* arg1->r3 */                      \
+         "ld   4, 16(11)\n\t" /* arg2->r4 */                      \
+         "ld   5, 24(11)\n\t" /* arg3->r5 */                      \
+         "ld  11, 0(11)\n\t"  /* target->r11 */                   \
+         VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R11                  \
+         "mr 11,%1\n\t"                                           \
+         "mr %0,3\n\t"                                            \
+         "ld  2,-16(11)\n\t" /* restore tocptr */                 \
+         VG_CONTRACT_FRAME_BY(512)                                \
+         : /*out*/   "=r" (_res)                                  \
+         : /*in*/    "r" (&_argvec[2])                            \
+         : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS          \
+      );                                                          \
+      lval = (__typeof__(lval)) _res;                             \
+   } while (0)
+
+#define CALL_FN_W_WWWW(lval, orig, arg1,arg2,arg3,arg4)           \
+   do {                                                           \
+      volatile OrigFn        _orig = (orig);                      \
+      volatile unsigned long _argvec[3+4];                        \
+      volatile unsigned long _res;                                \
+      /* _argvec[0] holds current r2 across the call */           \
+      _argvec[1]   = (unsigned long)_orig.r2;                     \
+      _argvec[2]   = (unsigned long)_orig.nraddr;                 \
+      _argvec[2+1] = (unsigned long)arg1;                         \
+      _argvec[2+2] = (unsigned long)arg2;                         \
+      _argvec[2+3] = (unsigned long)arg3;                         \
+      _argvec[2+4] = (unsigned long)arg4;                         \
+      __asm__ volatile(                                           \
+         "mr 11,%1\n\t"                                           \
+         VG_EXPAND_FRAME_BY_trashes_r3(512)                       \
+         "std  2,-16(11)\n\t" /* save tocptr */                   \
+         "ld   2,-8(11)\n\t"  /* use nraddr's tocptr */           \
+         "ld   3, 8(11)\n\t"  /* arg1->r3 */                      \
+         "ld   4, 16(11)\n\t" /* arg2->r4 */                      \
+         "ld   5, 24(11)\n\t" /* arg3->r5 */                      \
+         "ld   6, 32(11)\n\t" /* arg4->r6 */                      \
+         "ld  11, 0(11)\n\t"  /* target->r11 */                   \
+         VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R11                  \
+         "mr 11,%1\n\t"                                           \
+         "mr %0,3\n\t"                                            \
+         "ld  2,-16(11)\n\t" /* restore tocptr */                 \
+         VG_CONTRACT_FRAME_BY(512)                                \
+         : /*out*/   "=r" (_res)                                  \
+         : /*in*/    "r" (&_argvec[2])                            \
+         : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS          \
+      );                                                          \
+      lval = (__typeof__(lval)) _res;                             \
+   } while (0)
+
+#define CALL_FN_W_5W(lval, orig, arg1,arg2,arg3,arg4,arg5)        \
+   do {                                                           \
+      volatile OrigFn        _orig = (orig);                      \
+      volatile unsigned long _argvec[3+5];                        \
+      volatile unsigned long _res;                                \
+      /* _argvec[0] holds current r2 across the call */           \
+      _argvec[1]   = (unsigned long)_orig.r2;                     \
+      _argvec[2]   = (unsigned long)_orig.nraddr;                 \
+      _argvec[2+1] = (unsigned long)arg1;                         \
+      _argvec[2+2] = (unsigned long)arg2;                         \
+      _argvec[2+3] = (unsigned long)arg3;                         \
+      _argvec[2+4] = (unsigned long)arg4;                         \
+      _argvec[2+5] = (unsigned long)arg5;                         \
+      __asm__ volatile(                                           \
+         "mr 11,%1\n\t"                                           \
+         VG_EXPAND_FRAME_BY_trashes_r3(512)                       \
+         "std  2,-16(11)\n\t" /* save tocptr */                   \
+         "ld   2,-8(11)\n\t"  /* use nraddr's tocptr */           \
+         "ld   3, 8(11)\n\t"  /* arg1->r3 */                      \
+         "ld   4, 16(11)\n\t" /* arg2->r4 */                      \
+         "ld   5, 24(11)\n\t" /* arg3->r5 */                      \
+         "ld   6, 32(11)\n\t" /* arg4->r6 */                      \
+         "ld   7, 40(11)\n\t" /* arg5->r7 */                      \
+         "ld  11, 0(11)\n\t"  /* target->r11 */                   \
+         VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R11                  \
+         "mr 11,%1\n\t"                                           \
+         "mr %0,3\n\t"                                            \
+         "ld  2,-16(11)\n\t" /* restore tocptr */                 \
+         VG_CONTRACT_FRAME_BY(512)                                \
+         : /*out*/   "=r" (_res)                                  \
+         : /*in*/    "r" (&_argvec[2])                            \
+         : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS          \
+      );                                                          \
+      lval = (__typeof__(lval)) _res;                             \
+   } while (0)
+
+#define CALL_FN_W_6W(lval, orig, arg1,arg2,arg3,arg4,arg5,arg6)   \
+   do {                                                           \
+      volatile OrigFn        _orig = (orig);                      \
+      volatile unsigned long _argvec[3+6];                        \
+      volatile unsigned long _res;                                \
+      /* _argvec[0] holds current r2 across the call */           \
+      _argvec[1]   = (unsigned long)_orig.r2;                     \
+      _argvec[2]   = (unsigned long)_orig.nraddr;                 \
+      _argvec[2+1] = (unsigned long)arg1;                         \
+      _argvec[2+2] = (unsigned long)arg2;                         \
+      _argvec[2+3] = (unsigned long)arg3;                         \
+      _argvec[2+4] = (unsigned long)arg4;                         \
+      _argvec[2+5] = (unsigned long)arg5;                         \
+      _argvec[2+6] = (unsigned long)arg6;                         \
+      __asm__ volatile(                                           \
+         "mr 11,%1\n\t"                                           \
+         VG_EXPAND_FRAME_BY_trashes_r3(512)                       \
+         "std  2,-16(11)\n\t" /* save tocptr */                   \
+         "ld   2,-8(11)\n\t"  /* use nraddr's tocptr */           \
+         "ld   3, 8(11)\n\t"  /* arg1->r3 */                      \
+         "ld   4, 16(11)\n\t" /* arg2->r4 */                      \
+         "ld   5, 24(11)\n\t" /* arg3->r5 */                      \
+         "ld   6, 32(11)\n\t" /* arg4->r6 */                      \
+         "ld   7, 40(11)\n\t" /* arg5->r7 */                      \
+         "ld   8, 48(11)\n\t" /* arg6->r8 */                      \
+         "ld  11, 0(11)\n\t"  /* target->r11 */                   \
+         VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R11                  \
+         "mr 11,%1\n\t"                                           \
+         "mr %0,3\n\t"                                            \
+         "ld  2,-16(11)\n\t" /* restore tocptr */                 \
+         VG_CONTRACT_FRAME_BY(512)                                \
+         : /*out*/   "=r" (_res)                                  \
+         : /*in*/    "r" (&_argvec[2])                            \
+         : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS          \
+      );                                                          \
+      lval = (__typeof__(lval)) _res;                             \
+   } while (0)
+
+#define CALL_FN_W_7W(lval, orig, arg1,arg2,arg3,arg4,arg5,arg6,   \
+                                 arg7)                            \
+   do {                                                           \
+      volatile OrigFn        _orig = (orig);                      \
+      volatile unsigned long _argvec[3+7];                        \
+      volatile unsigned long _res;                                \
+      /* _argvec[0] holds current r2 across the call */           \
+      _argvec[1]   = (unsigned long)_orig.r2;                     \
+      _argvec[2]   = (unsigned long)_orig.nraddr;                 \
+      _argvec[2+1] = (unsigned long)arg1;                         \
+      _argvec[2+2] = (unsigned long)arg2;                         \
+      _argvec[2+3] = (unsigned long)arg3;                         \
+      _argvec[2+4] = (unsigned long)arg4;                         \
+      _argvec[2+5] = (unsigned long)arg5;                         \
+      _argvec[2+6] = (unsigned long)arg6;                         \
+      _argvec[2+7] = (unsigned long)arg7;                         \
+      __asm__ volatile(                                           \
+         "mr 11,%1\n\t"                                           \
+         VG_EXPAND_FRAME_BY_trashes_r3(512)                       \
+         "std  2,-16(11)\n\t" /* save tocptr */                   \
+         "ld   2,-8(11)\n\t"  /* use nraddr's tocptr */           \
+         "ld   3, 8(11)\n\t"  /* arg1->r3 */                      \
+         "ld   4, 16(11)\n\t" /* arg2->r4 */                      \
+         "ld   5, 24(11)\n\t" /* arg3->r5 */                      \
+         "ld   6, 32(11)\n\t" /* arg4->r6 */                      \
+         "ld   7, 40(11)\n\t" /* arg5->r7 */                      \
+         "ld   8, 48(11)\n\t" /* arg6->r8 */                      \
+         "ld   9, 56(11)\n\t" /* arg7->r9 */                      \
+         "ld  11, 0(11)\n\t"  /* target->r11 */                   \
+         VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R11                  \
+         "mr 11,%1\n\t"                                           \
+         "mr %0,3\n\t"                                            \
+         "ld  2,-16(11)\n\t" /* restore tocptr */                 \
+         VG_CONTRACT_FRAME_BY(512)                                \
+         : /*out*/   "=r" (_res)                                  \
+         : /*in*/    "r" (&_argvec[2])                            \
+         : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS          \
+      );                                                          \
+      lval = (__typeof__(lval)) _res;                             \
+   } while (0)
+
+#define CALL_FN_W_8W(lval, orig, arg1,arg2,arg3,arg4,arg5,arg6,   \
+                                 arg7,arg8)                       \
+   do {                                                           \
+      volatile OrigFn        _orig = (orig);                      \
+      volatile unsigned long _argvec[3+8];                        \
+      volatile unsigned long _res;                                \
+      /* _argvec[0] holds current r2 across the call */           \
+      _argvec[1]   = (unsigned long)_orig.r2;                     \
+      _argvec[2]   = (unsigned long)_orig.nraddr;                 \
+      _argvec[2+1] = (unsigned long)arg1;                         \
+      _argvec[2+2] = (unsigned long)arg2;                         \
+      _argvec[2+3] = (unsigned long)arg3;                         \
+      _argvec[2+4] = (unsigned long)arg4;                         \
+      _argvec[2+5] = (unsigned long)arg5;                         \
+      _argvec[2+6] = (unsigned long)arg6;                         \
+      _argvec[2+7] = (unsigned long)arg7;                         \
+      _argvec[2+8] = (unsigned long)arg8;                         \
+      __asm__ volatile(                                           \
+         "mr 11,%1\n\t"                                           \
+         VG_EXPAND_FRAME_BY_trashes_r3(512)                       \
+         "std  2,-16(11)\n\t" /* save tocptr */                   \
+         "ld   2,-8(11)\n\t"  /* use nraddr's tocptr */           \
+         "ld   3, 8(11)\n\t"  /* arg1->r3 */                      \
+         "ld   4, 16(11)\n\t" /* arg2->r4 */                      \
+         "ld   5, 24(11)\n\t" /* arg3->r5 */                      \
+         "ld   6, 32(11)\n\t" /* arg4->r6 */                      \
+         "ld   7, 40(11)\n\t" /* arg5->r7 */                      \
+         "ld   8, 48(11)\n\t" /* arg6->r8 */                      \
+         "ld   9, 56(11)\n\t" /* arg7->r9 */                      \
+         "ld  10, 64(11)\n\t" /* arg8->r10 */                     \
+         "ld  11, 0(11)\n\t"  /* target->r11 */                   \
+         VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R11                  \
+         "mr 11,%1\n\t"                                           \
+         "mr %0,3\n\t"                                            \
+         "ld  2,-16(11)\n\t" /* restore tocptr */                 \
+         VG_CONTRACT_FRAME_BY(512)                                \
+         : /*out*/   "=r" (_res)                                  \
+         : /*in*/    "r" (&_argvec[2])                            \
+         : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS          \
+      );                                                          \
+      lval = (__typeof__(lval)) _res;                             \
+   } while (0)
+
+#define CALL_FN_W_9W(lval, orig, arg1,arg2,arg3,arg4,arg5,arg6,   \
+                                 arg7,arg8,arg9)                  \
+   do {                                                           \
+      volatile OrigFn        _orig = (orig);                      \
+      volatile unsigned long _argvec[3+9];                        \
+      volatile unsigned long _res;                                \
+      /* _argvec[0] holds current r2 across the call */           \
+      _argvec[1]   = (unsigned long)_orig.r2;                     \
+      _argvec[2]   = (unsigned long)_orig.nraddr;                 \
+      _argvec[2+1] = (unsigned long)arg1;                         \
+      _argvec[2+2] = (unsigned long)arg2;                         \
+      _argvec[2+3] = (unsigned long)arg3;                         \
+      _argvec[2+4] = (unsigned long)arg4;                         \
+      _argvec[2+5] = (unsigned long)arg5;                         \
+      _argvec[2+6] = (unsigned long)arg6;                         \
+      _argvec[2+7] = (unsigned long)arg7;                         \
+      _argvec[2+8] = (unsigned long)arg8;                         \
+      _argvec[2+9] = (unsigned long)arg9;                         \
+      __asm__ volatile(                                           \
+         "mr 11,%1\n\t"                                           \
+         VG_EXPAND_FRAME_BY_trashes_r3(512)                       \
+         "std  2,-16(11)\n\t" /* save tocptr */                   \
+         "ld   2,-8(11)\n\t"  /* use nraddr's tocptr */           \
+         VG_EXPAND_FRAME_BY_trashes_r3(128)                       \
+         /* arg9 */                                               \
+         "ld  3,72(11)\n\t"                                       \
+         "std 3,112(1)\n\t"                                       \
+         /* args1-8 */                                            \
+         "ld   3, 8(11)\n\t"  /* arg1->r3 */                      \
+         "ld   4, 16(11)\n\t" /* arg2->r4 */                      \
+         "ld   5, 24(11)\n\t" /* arg3->r5 */                      \
+         "ld   6, 32(11)\n\t" /* arg4->r6 */                      \
+         "ld   7, 40(11)\n\t" /* arg5->r7 */                      \
+         "ld   8, 48(11)\n\t" /* arg6->r8 */                      \
+         "ld   9, 56(11)\n\t" /* arg7->r9 */                      \
+         "ld  10, 64(11)\n\t" /* arg8->r10 */                     \
+         "ld  11, 0(11)\n\t"  /* target->r11 */                   \
+         VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R11                  \
+         "mr 11,%1\n\t"                                           \
+         "mr %0,3\n\t"                                            \
+         "ld  2,-16(11)\n\t" /* restore tocptr */                 \
+         VG_CONTRACT_FRAME_BY(128)                                \
+         VG_CONTRACT_FRAME_BY(512)                                \
+         : /*out*/   "=r" (_res)                                  \
+         : /*in*/    "r" (&_argvec[2])                            \
+         : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS          \
+      );                                                          \
+      lval = (__typeof__(lval)) _res;                             \
+   } while (0)
+
+#define CALL_FN_W_10W(lval, orig, arg1,arg2,arg3,arg4,arg5,arg6,  \
+                                  arg7,arg8,arg9,arg10)           \
+   do {                                                           \
+      volatile OrigFn        _orig = (orig);                      \
+      volatile unsigned long _argvec[3+10];                       \
+      volatile unsigned long _res;                                \
+      /* _argvec[0] holds current r2 across the call */           \
+      _argvec[1]   = (unsigned long)_orig.r2;                     \
+      _argvec[2]   = (unsigned long)_orig.nraddr;                 \
+      _argvec[2+1] = (unsigned long)arg1;                         \
+      _argvec[2+2] = (unsigned long)arg2;                         \
+      _argvec[2+3] = (unsigned long)arg3;                         \
+      _argvec[2+4] = (unsigned long)arg4;                         \
+      _argvec[2+5] = (unsigned long)arg5;                         \
+      _argvec[2+6] = (unsigned long)arg6;                         \
+      _argvec[2+7] = (unsigned long)arg7;                         \
+      _argvec[2+8] = (unsigned long)arg8;                         \
+      _argvec[2+9] = (unsigned long)arg9;                         \
+      _argvec[2+10] = (unsigned long)arg10;                       \
+      __asm__ volatile(                                           \
+         "mr 11,%1\n\t"                                           \
+         VG_EXPAND_FRAME_BY_trashes_r3(512)                       \
+         "std  2,-16(11)\n\t" /* save tocptr */                   \
+         "ld   2,-8(11)\n\t"  /* use nraddr's tocptr */           \
+         VG_EXPAND_FRAME_BY_trashes_r3(128)                       \
+         /* arg10 */                                              \
+         "ld  3,80(11)\n\t"                                       \
+         "std 3,120(1)\n\t"                                       \
+         /* arg9 */                                               \
+         "ld  3,72(11)\n\t"                                       \
+         "std 3,112(1)\n\t"                                       \
+         /* args1-8 */                                            \
+         "ld   3, 8(11)\n\t"  /* arg1->r3 */                      \
+         "ld   4, 16(11)\n\t" /* arg2->r4 */                      \
+         "ld   5, 24(11)\n\t" /* arg3->r5 */                      \
+         "ld   6, 32(11)\n\t" /* arg4->r6 */                      \
+         "ld   7, 40(11)\n\t" /* arg5->r7 */                      \
+         "ld   8, 48(11)\n\t" /* arg6->r8 */                      \
+         "ld   9, 56(11)\n\t" /* arg7->r9 */                      \
+         "ld  10, 64(11)\n\t" /* arg8->r10 */                     \
+         "ld  11, 0(11)\n\t"  /* target->r11 */                   \
+         VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R11                  \
+         "mr 11,%1\n\t"                                           \
+         "mr %0,3\n\t"                                            \
+         "ld  2,-16(11)\n\t" /* restore tocptr */                 \
+         VG_CONTRACT_FRAME_BY(128)                                \
+         VG_CONTRACT_FRAME_BY(512)                                \
+         : /*out*/   "=r" (_res)                                  \
+         : /*in*/    "r" (&_argvec[2])                            \
+         : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS          \
+      );                                                          \
+      lval = (__typeof__(lval)) _res;                             \
+   } while (0)
+
+#define CALL_FN_W_11W(lval, orig, arg1,arg2,arg3,arg4,arg5,arg6,  \
+                                  arg7,arg8,arg9,arg10,arg11)     \
+   do {                                                           \
+      volatile OrigFn        _orig = (orig);                      \
+      volatile unsigned long _argvec[3+11];                       \
+      volatile unsigned long _res;                                \
+      /* _argvec[0] holds current r2 across the call */           \
+      _argvec[1]   = (unsigned long)_orig.r2;                     \
+      _argvec[2]   = (unsigned long)_orig.nraddr;                 \
+      _argvec[2+1] = (unsigned long)arg1;                         \
+      _argvec[2+2] = (unsigned long)arg2;                         \
+      _argvec[2+3] = (unsigned long)arg3;                         \
+      _argvec[2+4] = (unsigned long)arg4;                         \
+      _argvec[2+5] = (unsigned long)arg5;                         \
+      _argvec[2+6] = (unsigned long)arg6;                         \
+      _argvec[2+7] = (unsigned long)arg7;                         \
+      _argvec[2+8] = (unsigned long)arg8;                         \
+      _argvec[2+9] = (unsigned long)arg9;                         \
+      _argvec[2+10] = (unsigned long)arg10;                       \
+      _argvec[2+11] = (unsigned long)arg11;                       \
+      __asm__ volatile(                                           \
+         "mr 11,%1\n\t"                                           \
+         VG_EXPAND_FRAME_BY_trashes_r3(512)                       \
+         "std  2,-16(11)\n\t" /* save tocptr */                   \
+         "ld   2,-8(11)\n\t"  /* use nraddr's tocptr */           \
+         VG_EXPAND_FRAME_BY_trashes_r3(144)                       \
+         /* arg11 */                                              \
+         "ld  3,88(11)\n\t"                                       \
+         "std 3,128(1)\n\t"                                       \
+         /* arg10 */                                              \
+         "ld  3,80(11)\n\t"                                       \
+         "std 3,120(1)\n\t"                                       \
+         /* arg9 */                                               \
+         "ld  3,72(11)\n\t"                                       \
+         "std 3,112(1)\n\t"                                       \
+         /* args1-8 */                                            \
+         "ld   3, 8(11)\n\t"  /* arg1->r3 */                      \
+         "ld   4, 16(11)\n\t" /* arg2->r4 */                      \
+         "ld   5, 24(11)\n\t" /* arg3->r5 */                      \
+         "ld   6, 32(11)\n\t" /* arg4->r6 */                      \
+         "ld   7, 40(11)\n\t" /* arg5->r7 */                      \
+         "ld   8, 48(11)\n\t" /* arg6->r8 */                      \
+         "ld   9, 56(11)\n\t" /* arg7->r9 */                      \
+         "ld  10, 64(11)\n\t" /* arg8->r10 */                     \
+         "ld  11, 0(11)\n\t"  /* target->r11 */                   \
+         VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R11                  \
+         "mr 11,%1\n\t"                                           \
+         "mr %0,3\n\t"                                            \
+         "ld  2,-16(11)\n\t" /* restore tocptr */                 \
+         VG_CONTRACT_FRAME_BY(144)                                \
+         VG_CONTRACT_FRAME_BY(512)                                \
+         : /*out*/   "=r" (_res)                                  \
+         : /*in*/    "r" (&_argvec[2])                            \
+         : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS          \
+      );                                                          \
+      lval = (__typeof__(lval)) _res;                             \
+   } while (0)
+
+#define CALL_FN_W_12W(lval, orig, arg1,arg2,arg3,arg4,arg5,arg6,  \
+                                arg7,arg8,arg9,arg10,arg11,arg12) \
+   do {                                                           \
+      volatile OrigFn        _orig = (orig);                      \
+      volatile unsigned long _argvec[3+12];                       \
+      volatile unsigned long _res;                                \
+      /* _argvec[0] holds current r2 across the call */           \
+      _argvec[1]   = (unsigned long)_orig.r2;                     \
+      _argvec[2]   = (unsigned long)_orig.nraddr;                 \
+      _argvec[2+1] = (unsigned long)arg1;                         \
+      _argvec[2+2] = (unsigned long)arg2;                         \
+      _argvec[2+3] = (unsigned long)arg3;                         \
+      _argvec[2+4] = (unsigned long)arg4;                         \
+      _argvec[2+5] = (unsigned long)arg5;                         \
+      _argvec[2+6] = (unsigned long)arg6;                         \
+      _argvec[2+7] = (unsigned long)arg7;                         \
+      _argvec[2+8] = (unsigned long)arg8;                         \
+      _argvec[2+9] = (unsigned long)arg9;                         \
+      _argvec[2+10] = (unsigned long)arg10;                       \
+      _argvec[2+11] = (unsigned long)arg11;                       \
+      _argvec[2+12] = (unsigned long)arg12;                       \
+      __asm__ volatile(                                           \
+         "mr 11,%1\n\t"                                           \
+         VG_EXPAND_FRAME_BY_trashes_r3(512)                       \
+         "std  2,-16(11)\n\t" /* save tocptr */                   \
+         "ld   2,-8(11)\n\t"  /* use nraddr's tocptr */           \
+         VG_EXPAND_FRAME_BY_trashes_r3(144)                       \
+         /* arg12 */                                              \
+         "ld  3,96(11)\n\t"                                       \
+         "std 3,136(1)\n\t"                                       \
+         /* arg11 */                                              \
+         "ld  3,88(11)\n\t"                                       \
+         "std 3,128(1)\n\t"                                       \
+         /* arg10 */                                              \
+         "ld  3,80(11)\n\t"                                       \
+         "std 3,120(1)\n\t"                                       \
+         /* arg9 */                                               \
+         "ld  3,72(11)\n\t"                                       \
+         "std 3,112(1)\n\t"                                       \
+         /* args1-8 */                                            \
+         "ld   3, 8(11)\n\t"  /* arg1->r3 */                      \
+         "ld   4, 16(11)\n\t" /* arg2->r4 */                      \
+         "ld   5, 24(11)\n\t" /* arg3->r5 */                      \
+         "ld   6, 32(11)\n\t" /* arg4->r6 */                      \
+         "ld   7, 40(11)\n\t" /* arg5->r7 */                      \
+         "ld   8, 48(11)\n\t" /* arg6->r8 */                      \
+         "ld   9, 56(11)\n\t" /* arg7->r9 */                      \
+         "ld  10, 64(11)\n\t" /* arg8->r10 */                     \
+         "ld  11, 0(11)\n\t"  /* target->r11 */                   \
+         VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R11                  \
+         "mr 11,%1\n\t"                                           \
+         "mr %0,3\n\t"                                            \
+         "ld  2,-16(11)\n\t" /* restore tocptr */                 \
+         VG_CONTRACT_FRAME_BY(144)                                \
+         VG_CONTRACT_FRAME_BY(512)                                \
+         : /*out*/   "=r" (_res)                                  \
+         : /*in*/    "r" (&_argvec[2])                            \
+         : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS          \
+      );                                                          \
+      lval = (__typeof__(lval)) _res;                             \
+   } while (0)
+
+#endif /* PLAT_ppc64_aix5 */
+
+
+/* ------------------------------------------------------------------ */
+/* ARCHITECTURE INDEPENDENT MACROS for CLIENT REQUESTS.               */
+/*                                                                    */
+/* ------------------------------------------------------------------ */
+
+/* Some request codes.  There are many more of these, but most are not
+   exposed to end-user view.  These are the public ones, all of the
+   form 0x1000 + small_number.
+
+   Core ones are in the range 0x00000000--0x0000ffff.  The non-public
+   ones start at 0x2000.
+*/
+
+/* These macros are used by tools -- they must be public, but don't
+   embed them into other programs. */
+#define VG_USERREQ_TOOL_BASE(a,b) \
+   ((unsigned int)(((a)&0xff) << 24 | ((b)&0xff) << 16))
+#define VG_IS_TOOL_USERREQ(a, b, v) \
+   (VG_USERREQ_TOOL_BASE(a,b) == ((v) & 0xffff0000))
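+
+/* Illustrative sketch (the request names and the handler are hypothetical):
+   a tool identified by the two characters ('M','C') -- Memcheck derives its
+   request codes this way -- can build its own requests on top of the base
+   and recognize them with VG_IS_TOOL_USERREQ:
+
+      enum {
+         MY_TOOL_REQ_FIRST = VG_USERREQ_TOOL_BASE('M','C'),
+         MY_TOOL_REQ_SECOND
+      };
+
+      if (VG_IS_TOOL_USERREQ('M','C', request_code))
+         handle_my_tool_request(request_code);
+*/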
+
+/* !! ABIWARNING !! ABIWARNING !! ABIWARNING !! ABIWARNING !! 
+   This enum comprises an ABI exported by Valgrind to programs
+   which use client requests.  DO NOT CHANGE THE ORDER OF THESE
+   ENTRIES, NOR DELETE ANY -- add new ones at the end. */
+typedef
+   enum { VG_USERREQ__RUNNING_ON_VALGRIND  = 0x1001,
+          VG_USERREQ__DISCARD_TRANSLATIONS = 0x1002,
+
+          /* These allow any function to be called from the simulated
+             CPU but run on the real CPU.  Nb: the first arg passed to
+             the function is always the ThreadId of the running
+             thread!  So CLIENT_CALL0 actually requires a 1 arg
+             function, etc. */
+          VG_USERREQ__CLIENT_CALL0 = 0x1101,
+          VG_USERREQ__CLIENT_CALL1 = 0x1102,
+          VG_USERREQ__CLIENT_CALL2 = 0x1103,
+          VG_USERREQ__CLIENT_CALL3 = 0x1104,
+
+          /* Can be useful in regression testing suites -- eg. can
+             send Valgrind's output to /dev/null and still count
+             errors. */
+          VG_USERREQ__COUNT_ERRORS = 0x1201,
+
+          /* These are useful and can be interpreted by any tool that
+             tracks malloc() et al, by using vg_replace_malloc.c. */
+          VG_USERREQ__MALLOCLIKE_BLOCK = 0x1301,
+          VG_USERREQ__FREELIKE_BLOCK   = 0x1302,
+          /* Memory pool support. */
+          VG_USERREQ__CREATE_MEMPOOL   = 0x1303,
+          VG_USERREQ__DESTROY_MEMPOOL  = 0x1304,
+          VG_USERREQ__MEMPOOL_ALLOC    = 0x1305,
+          VG_USERREQ__MEMPOOL_FREE     = 0x1306,
+          VG_USERREQ__MEMPOOL_TRIM     = 0x1307,
+          VG_USERREQ__MOVE_MEMPOOL     = 0x1308,
+          VG_USERREQ__MEMPOOL_CHANGE   = 0x1309,
+          VG_USERREQ__MEMPOOL_EXISTS   = 0x130a,
+
+          /* Allow printfs to valgrind log. */
+          VG_USERREQ__PRINTF           = 0x1401,
+          VG_USERREQ__PRINTF_BACKTRACE = 0x1402,
+
+          /* Stack support. */
+          VG_USERREQ__STACK_REGISTER   = 0x1501,
+          VG_USERREQ__STACK_DEREGISTER = 0x1502,
+          VG_USERREQ__STACK_CHANGE     = 0x1503
+   } Vg_ClientRequest;
+
+#if !defined(__GNUC__)
+#  define __extension__ /* */
+#endif
+
+/* Returns the number of Valgrinds this code is running under.  That
+   is, 0 if running natively, 1 if running under Valgrind, 2 if
+   running under Valgrind which is running under another Valgrind,
+   etc. */
+#define RUNNING_ON_VALGRIND  __extension__                        \
+   ({unsigned int _qzz_res;                                       \
+    VALGRIND_DO_CLIENT_REQUEST(_qzz_res, 0 /* if not */,          \
+                               VG_USERREQ__RUNNING_ON_VALGRIND,   \
+                               0, 0, 0, 0, 0);                    \
+    _qzz_res;                                                     \
+   })
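+
+/* Typical use, as a minimal sketch (use_small_workload() is a hypothetical
+   application helper): scale down a timing-sensitive workload when the
+   program detects it is running under Valgrind.
+
+      if (RUNNING_ON_VALGRIND)
+         use_small_workload();
+*/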
+
+
+/* Discard translation of code in the range [_qzz_addr .. _qzz_addr +
+   _qzz_len - 1].  Useful if you are debugging a JITter or some such,
+   since it provides a way to make sure valgrind will retranslate the
+   invalidated area.  Returns no value. */
+#define VALGRIND_DISCARD_TRANSLATIONS(_qzz_addr,_qzz_len)         \
+   {unsigned int _qzz_res;                                        \
+    VALGRIND_DO_CLIENT_REQUEST(_qzz_res, 0,                       \
+                               VG_USERREQ__DISCARD_TRANSLATIONS,  \
+                               _qzz_addr, _qzz_len, 0, 0, 0);     \
+   }
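+
+/* Minimal sketch for a JIT (emit_code_into() is hypothetical): after
+   overwriting code that may already have been executed, tell Valgrind to
+   drop its old translations of that range so the new code takes effect.
+
+      emit_code_into(buf, buf_len);
+      VALGRIND_DISCARD_TRANSLATIONS(buf, buf_len);
+*/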
+
+
+/* These requests are for getting Valgrind itself to print something.
+   Possibly with a backtrace.  This is a really ugly hack. */
+
+#if defined(NVALGRIND)
+
+#  define VALGRIND_PRINTF(...)
+#  define VALGRIND_PRINTF_BACKTRACE(...)
+
+#else /* NVALGRIND */
+
+/* Modern GCC will optimize the static routine out if unused,
+   and the unused attribute suppresses warnings about it.  */
+static int VALGRIND_PRINTF(const char *format, ...)
+   __attribute__((format(__printf__, 1, 2), __unused__));
+static int
+VALGRIND_PRINTF(const char *format, ...)
+{
+   unsigned long _qzz_res;
+   va_list vargs;
+   va_start(vargs, format);
+   VALGRIND_DO_CLIENT_REQUEST(_qzz_res, 0, VG_USERREQ__PRINTF,
+                              (unsigned long)format, (unsigned long)vargs, 
+                              0, 0, 0);
+   va_end(vargs);
+   return (int)_qzz_res;
+}
+
+static int VALGRIND_PRINTF_BACKTRACE(const char *format, ...)
+   __attribute__((format(__printf__, 1, 2), __unused__));
+static int
+VALGRIND_PRINTF_BACKTRACE(const char *format, ...)
+{
+   unsigned long _qzz_res;
+   va_list vargs;
+   va_start(vargs, format);
+   VALGRIND_DO_CLIENT_REQUEST(_qzz_res, 0, VG_USERREQ__PRINTF_BACKTRACE,
+                              (unsigned long)format, (unsigned long)vargs, 
+                              0, 0, 0);
+   va_end(vargs);
+   return (int)_qzz_res;
+}
+
+#endif /* NVALGRIND */
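+
+/* Illustrative use (n and state are hypothetical variables): these behave
+   like printf but write to Valgrind's log, and expand to nothing when
+   NVALGRIND is defined.
+
+      VALGRIND_PRINTF("reached checkpoint %d\n", n);
+      VALGRIND_PRINTF_BACKTRACE("unexpected state %d\n", state);
+*/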
+
+
+/* These requests allow control to move from the simulated CPU to the
+   real CPU, calling an arbitrary function.
+   
+   Note that the current ThreadId is inserted as the first argument.
+   So this call:
+
+     VALGRIND_NON_SIMD_CALL2(f, arg1, arg2)
+
+   requires f to have this signature:
+
+     Word f(Word tid, Word arg1, Word arg2)
+
+   where "Word" is a word-sized type.
+
+   Note that these client requests are not entirely reliable.  For example,
+   if you call a function with them that subsequently calls printf(),
+   there's a high chance Valgrind will crash.  Generally, your prospects of
+   these working are better if the called function does not refer to
+   any global variables, and does not call any libc or other functions
+   (printf et al).  Any kind of entanglement with libc or dynamic linking is
+   likely to have a bad outcome, for tricky reasons which we've grappled
+   with a lot in the past.
+*/
+#define VALGRIND_NON_SIMD_CALL0(_qyy_fn)                          \
+   __extension__                                                  \
+   ({unsigned long _qyy_res;                                      \
+    VALGRIND_DO_CLIENT_REQUEST(_qyy_res, 0 /* default return */,  \
+                               VG_USERREQ__CLIENT_CALL0,          \
+                               _qyy_fn,                           \
+                               0, 0, 0, 0);                       \
+    _qyy_res;                                                     \
+   })
+
+#define VALGRIND_NON_SIMD_CALL1(_qyy_fn, _qyy_arg1)               \
+   __extension__                                                  \
+   ({unsigned long _qyy_res;                                      \
+    VALGRIND_DO_CLIENT_REQUEST(_qyy_res, 0 /* default return */,  \
+                               VG_USERREQ__CLIENT_CALL1,          \
+                               _qyy_fn,                           \
+                               _qyy_arg1, 0, 0, 0);               \
+    _qyy_res;                                                     \
+   })
+
+#define VALGRIND_NON_SIMD_CALL2(_qyy_fn, _qyy_arg1, _qyy_arg2)    \
+   __extension__                                                  \
+   ({unsigned long _qyy_res;                                      \
+    VALGRIND_DO_CLIENT_REQUEST(_qyy_res, 0 /* default return */,  \
+                               VG_USERREQ__CLIENT_CALL2,          \
+                               _qyy_fn,                           \
+                               _qyy_arg1, _qyy_arg2, 0, 0);       \
+    _qyy_res;                                                     \
+   })
+
+#define VALGRIND_NON_SIMD_CALL3(_qyy_fn, _qyy_arg1, _qyy_arg2, _qyy_arg3) \
+   __extension__                                                  \
+   ({unsigned long _qyy_res;                                      \
+    VALGRIND_DO_CLIENT_REQUEST(_qyy_res, 0 /* default return */,  \
+                               VG_USERREQ__CLIENT_CALL3,          \
+                               _qyy_fn,                           \
+                               _qyy_arg1, _qyy_arg2,              \
+                               _qyy_arg3, 0);                     \
+    _qyy_res;                                                     \
+   })
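+
+/* Following the example above, a minimal sketch (long stands in for the
+   word-sized "Word" type): the wrapped function receives the running
+   ThreadId as its first argument, so a 2-argument call needs a 3-parameter
+   function.
+
+      static long add_on_real_cpu(long tid, long a, long b)
+      {
+         return a + b;
+      }
+
+      unsigned long sum = VALGRIND_NON_SIMD_CALL2(add_on_real_cpu, 40, 2);
+*/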
+
+
+/* Counts the number of errors that have been recorded by a tool.  Nb:
+   the tool must record the errors with VG_(maybe_record_error)() or
+   VG_(unique_error)() for them to be counted. */
+#define VALGRIND_COUNT_ERRORS                                     \
+   __extension__                                                  \
+   ({unsigned int _qyy_res;                                       \
+    VALGRIND_DO_CLIENT_REQUEST(_qyy_res, 0 /* default return */,  \
+                               VG_USERREQ__COUNT_ERRORS,          \
+                               0, 0, 0, 0, 0);                    \
+    _qyy_res;                                                     \
+   })
+
+/* Mark a block of memory as having been allocated by a malloc()-like
+   function.  `addr' is the start of the usable block (i.e. after any
+   redzone).  `rzB' is the redzone size if the allocator can apply redzones;
+   use '0' if not.  Adding redzones makes it more likely Valgrind will spot
+   block overruns.  `is_zeroed' indicates if the memory is zeroed, as it is
+   for calloc().  Put it immediately after the point where a block is
+   allocated. 
+   
+   If you're using Memcheck, and your allocator obtains superblocks and
+   then hands out small chunks of each superblock, and those small chunks
+   have no redzones, it's worth marking each superblock with
+   VALGRIND_MAKE_MEM_NOACCESS when it's created, so that block overruns are
+   detected.  But if you can put redzones on the small blocks, it's probably
+   better not to do this, so that messages for small overruns are described
+   in terms of the small block rather than the superblock (though if you
+   have a big overrun that skips over a redzone, you could miss an error
+   this way).  See memcheck/tests/custom_alloc.c for an example.
+
+   WARNING: if your allocator uses malloc() or 'new' to allocate
+   superblocks, rather than mmap() or brk(), this will not work properly --
+   you'll likely get assertion failures during leak detection.  This is
+   because Valgrind doesn't like seeing overlapping heap blocks.  Sorry.
+
+   Nb: block must be freed via a free()-like function specified
+   with VALGRIND_FREELIKE_BLOCK or mismatch errors will occur. */
+#define VALGRIND_MALLOCLIKE_BLOCK(addr, sizeB, rzB, is_zeroed)    \
+   {unsigned int _qzz_res;                                        \
+    VALGRIND_DO_CLIENT_REQUEST(_qzz_res, 0,                       \
+                               VG_USERREQ__MALLOCLIKE_BLOCK,      \
+                               addr, sizeB, rzB, is_zeroed, 0);   \
+   }
+
+/* Mark a block of memory as having been freed by a free()-like function.
+   `rzB' is redzone size;  it must match that given to
+   VALGRIND_MALLOCLIKE_BLOCK.  Memory not freed will be detected by the leak
+   checker.  Put it immediately after the point where the block is freed. */
+#define VALGRIND_FREELIKE_BLOCK(addr, rzB)                        \
+   {unsigned int _qzz_res;                                        \
+    VALGRIND_DO_CLIENT_REQUEST(_qzz_res, 0,                       \
+                               VG_USERREQ__FREELIKE_BLOCK,        \
+                               addr, rzB, 0, 0, 0);               \
+   }
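+
+/* Illustrative sketch (not part of the original header): a custom
+   allocator built on mmap'd superblocks might annotate its blocks
+   roughly as follows.  The helpers my_carve_from_superblock() and
+   my_return_to_superblock() are hypothetical placeholders.
+
+     void* my_alloc(size_t n) {
+       void* p = my_carve_from_superblock(n);
+       VALGRIND_MALLOCLIKE_BLOCK(p, n, 0, 0);   // no redzones, not zeroed
+       return p;
+     }
+
+     void my_free(void* p) {
+       VALGRIND_FREELIKE_BLOCK(p, 0);           // rzB must match the above
+       my_return_to_superblock(p);
+     }
+*/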
+
+/* Create a memory pool. */
+#define VALGRIND_CREATE_MEMPOOL(pool, rzB, is_zeroed)             \
+   {unsigned int _qzz_res;                                        \
+    VALGRIND_DO_CLIENT_REQUEST(_qzz_res, 0,                       \
+                               VG_USERREQ__CREATE_MEMPOOL,        \
+                               pool, rzB, is_zeroed, 0, 0);       \
+   }
+
+/* Destroy a memory pool. */
+#define VALGRIND_DESTROY_MEMPOOL(pool)                            \
+   {unsigned int _qzz_res;                                        \
+    VALGRIND_DO_CLIENT_REQUEST(_qzz_res, 0,                       \
+                               VG_USERREQ__DESTROY_MEMPOOL,       \
+                               pool, 0, 0, 0, 0);                 \
+   }
+
+/* Associate a piece of memory with a memory pool. */
+#define VALGRIND_MEMPOOL_ALLOC(pool, addr, size)                  \
+   {unsigned int _qzz_res;                                        \
+    VALGRIND_DO_CLIENT_REQUEST(_qzz_res, 0,                       \
+                               VG_USERREQ__MEMPOOL_ALLOC,         \
+                               pool, addr, size, 0, 0);           \
+   }
+
+/* Disassociate a piece of memory from a memory pool. */
+#define VALGRIND_MEMPOOL_FREE(pool, addr)                         \
+   {unsigned int _qzz_res;                                        \
+    VALGRIND_DO_CLIENT_REQUEST(_qzz_res, 0,                       \
+                               VG_USERREQ__MEMPOOL_FREE,          \
+                               pool, addr, 0, 0, 0);              \
+   }
+
+/* Disassociate any pieces outside a particular range. */
+#define VALGRIND_MEMPOOL_TRIM(pool, addr, size)                   \
+   {unsigned int _qzz_res;                                        \
+    VALGRIND_DO_CLIENT_REQUEST(_qzz_res, 0,                       \
+                               VG_USERREQ__MEMPOOL_TRIM,          \
+                               pool, addr, size, 0, 0);           \
+   }
+
+/* Tell Valgrind that the pool previously anchored at poolA has moved
+   and is now anchored at poolB. */
+#define VALGRIND_MOVE_MEMPOOL(poolA, poolB)                       \
+   {unsigned int _qzz_res;                                        \
+    VALGRIND_DO_CLIENT_REQUEST(_qzz_res, 0,                       \
+                               VG_USERREQ__MOVE_MEMPOOL,          \
+                               poolA, poolB, 0, 0, 0);            \
+   }
+
+/* Resize and/or move a piece associated with a memory pool. */
+#define VALGRIND_MEMPOOL_CHANGE(pool, addrA, addrB, size)         \
+   {unsigned int _qzz_res;                                        \
+    VALGRIND_DO_CLIENT_REQUEST(_qzz_res, 0,                       \
+                               VG_USERREQ__MEMPOOL_CHANGE,        \
+                               pool, addrA, addrB, size, 0);      \
+   }
+
+/* Return 1 if a mempool exists, else 0. */
+#define VALGRIND_MEMPOOL_EXISTS(pool)                             \
+   ({unsigned int _qzz_res;                                       \
+    VALGRIND_DO_CLIENT_REQUEST(_qzz_res, 0,                       \
+                               VG_USERREQ__MEMPOOL_EXISTS,        \
+                               pool, 0, 0, 0, 0);                 \
+    _qzz_res;                                                     \
+   })
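+
+/* Illustrative sketch (not part of the original header): a pool-based
+   allocator would typically pair these requests as follows; my_pool and
+   my_pool_carve() are hypothetical placeholders.
+
+     VALGRIND_CREATE_MEMPOOL(my_pool, 0, 0);     // at pool creation
+     ...
+     void* p = my_pool_carve(my_pool, size);
+     VALGRIND_MEMPOOL_ALLOC(my_pool, p, size);   // after each sub-allocation
+     ...
+     VALGRIND_MEMPOOL_FREE(my_pool, p);          // when a piece is released
+     ...
+     VALGRIND_DESTROY_MEMPOOL(my_pool);          // at pool teardown
+*/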
+
+/* Mark a piece of memory as being a stack. Returns a stack id. */
+#define VALGRIND_STACK_REGISTER(start, end)                       \
+   ({unsigned int _qzz_res;                                       \
+    VALGRIND_DO_CLIENT_REQUEST(_qzz_res, 0,                       \
+                               VG_USERREQ__STACK_REGISTER,        \
+                               start, end, 0, 0, 0);              \
+    _qzz_res;                                                     \
+   })
+
+/* Unmark the piece of memory associated with a stack id as being a
+   stack. */
+#define VALGRIND_STACK_DEREGISTER(id)                             \
+   {unsigned int _qzz_res;                                        \
+    VALGRIND_DO_CLIENT_REQUEST(_qzz_res, 0,                       \
+                               VG_USERREQ__STACK_DEREGISTER,      \
+                               id, 0, 0, 0, 0);                   \
+   }
+
+/* Change the start and end address of the stack id. */
+#define VALGRIND_STACK_CHANGE(id, start, end)                     \
+   {unsigned int _qzz_res;                                        \
+    VALGRIND_DO_CLIENT_REQUEST(_qzz_res, 0,                       \
+                               VG_USERREQ__STACK_CHANGE,          \
+                               id, start, end, 0, 0);             \
+   }
+
+
+#undef PLAT_x86_linux
+#undef PLAT_amd64_linux
+#undef PLAT_ppc32_linux
+#undef PLAT_ppc64_linux
+#undef PLAT_ppc32_aix5
+#undef PLAT_ppc64_aix5
+
+#endif   /* __VALGRIND_H */
diff --git a/src/thread_cache.cc b/src/thread_cache.cc
new file mode 100644
index 0000000..444a09f
--- /dev/null
+++ b/src/thread_cache.cc
@@ -0,0 +1,474 @@
+// -*- Mode: C++; c-basic-offset: 2; indent-tabs-mode: nil -*-
+// Copyright (c) 2008, Google Inc.
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+//     * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+//     * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+// ---
+// Author: Ken Ashcraft <opensource@google.com>
+
+#include <config.h>
+#include "thread_cache.h"
+#include <errno.h>
+#include <string.h>                     // for memcpy
+#include <algorithm>                    // for max, min
+#include "base/commandlineflags.h"      // for SpinLockHolder
+#include "base/spinlock.h"              // for SpinLockHolder
+#include "getenv_safe.h"                // for TCMallocGetenvSafe
+#include "central_freelist.h"           // for CentralFreeListPadded
+#include "maybe_threads.h"
+
+using std::min;
+using std::max;
+
+// Note: this is initialized manually in InitModule to ensure that
+// it's configured at the right time.
+//
+// DEFINE_int64(tcmalloc_max_total_thread_cache_bytes,
+//              EnvToInt64("TCMALLOC_MAX_TOTAL_THREAD_CACHE_BYTES",
+//                         kDefaultOverallThreadCacheSize),
+//              "Bound on the total amount of bytes allocated to "
+//              "thread caches. This bound is not strict, so it is possible "
+//              "for the cache to go over this bound in certain circumstances. "
+//              "Maximum value of this flag is capped to 1 GB.");
+
+
+namespace tcmalloc {
+
+static bool phinited = false;
+
+volatile size_t ThreadCache::per_thread_cache_size_ = kMaxThreadCacheSize;
+size_t ThreadCache::overall_thread_cache_size_ = kDefaultOverallThreadCacheSize;
+ssize_t ThreadCache::unclaimed_cache_space_ = kDefaultOverallThreadCacheSize;
+PageHeapAllocator<ThreadCache> threadcache_allocator;
+ThreadCache* ThreadCache::thread_heaps_ = NULL;
+int ThreadCache::thread_heap_count_ = 0;
+ThreadCache* ThreadCache::next_memory_steal_ = NULL;
+#ifdef HAVE_TLS
+__thread ThreadCache::ThreadLocalData ThreadCache::threadlocal_data_
+    ATTR_INITIAL_EXEC
+    = {0, 0};
+#endif
+bool ThreadCache::tsd_inited_ = false;
+pthread_key_t ThreadCache::heap_key_;
+
+void ThreadCache::Init(pthread_t tid) {
+  size_ = 0;
+
+  max_size_ = 0;
+  IncreaseCacheLimitLocked();
+  if (max_size_ == 0) {
+    // There isn't enough memory to go around.  Just give the minimum to
+    // this thread.
+    max_size_ = kMinThreadCacheSize;
+
+    // Take unclaimed_cache_space_ negative.
+    unclaimed_cache_space_ -= kMinThreadCacheSize;
+    ASSERT(unclaimed_cache_space_ < 0);
+  }
+
+  next_ = NULL;
+  prev_ = NULL;
+  tid_  = tid;
+  in_setspecific_ = false;
+  for (size_t cl = 0; cl < kNumClasses; ++cl) {
+    list_[cl].Init();
+  }
+
+  uint32_t sampler_seed;
+  memcpy(&sampler_seed, &tid, sizeof(sampler_seed));
+  sampler_.Init(sampler_seed);
+}
+
+void ThreadCache::Cleanup() {
+  // Put unused memory back into central cache
+  for (int cl = 0; cl < kNumClasses; ++cl) {
+    if (list_[cl].length() > 0) {
+      ReleaseToCentralCache(&list_[cl], cl, list_[cl].length());
+    }
+  }
+}
+
+// Remove some objects of class "cl" from central cache and add to thread heap.
+// On success, return the first object for immediate use; otherwise return NULL.
+void* ThreadCache::FetchFromCentralCache(size_t cl, size_t byte_size) {
+  FreeList* list = &list_[cl];
+  ASSERT(list->empty());
+  const int batch_size = Static::sizemap()->num_objects_to_move(cl);
+
+  const int num_to_move = min<int>(list->max_length(), batch_size);
+  void *start, *end;
+  int fetch_count = Static::central_cache()[cl].RemoveRange(
+      &start, &end, num_to_move);
+
+  ASSERT((start == NULL) == (fetch_count == 0));
+  if (--fetch_count >= 0) {
+    size_ += byte_size * fetch_count;
+    list->PushRange(fetch_count, SLL_Next(start), end);
+  }
+
+  // Increase max length slowly up to batch_size.  After that,
+  // increase by batch_size in one shot so that the length is a
+  // multiple of batch_size.
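+  //
+  // Worked example (not from the original source): with batch_size == 32,
+  // max_length grows 1, 2, 3, ..., 32 on successive fetches (the slow
+  // start) and afterwards jumps a whole batch at a time -- 64, 96, 128,
+  // ... -- staying a multiple of 32 and capped at kMaxDynamicFreeListLength
+  // rounded down to a multiple of 32.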
+  if (list->max_length() < batch_size) {
+    list->set_max_length(list->max_length() + 1);
+  } else {
+    // Don't let the list get too long.  In 32 bit builds, the length
+    // is represented by a 16 bit int, so we need to watch out for
+    // integer overflow.
+    int new_length = min<int>(list->max_length() + batch_size,
+                              kMaxDynamicFreeListLength);
+    // The list's max_length must always be a multiple of batch_size,
+    // and kMaxDynamicFreeListLength is not necessarily a multiple
+    // of batch_size.
+    new_length -= new_length % batch_size;
+    ASSERT(new_length % batch_size == 0);
+    list->set_max_length(new_length);
+  }
+  return start;
+}
+
+void ThreadCache::ListTooLong(FreeList* list, size_t cl) {
+  const int batch_size = Static::sizemap()->num_objects_to_move(cl);
+  ReleaseToCentralCache(list, cl, batch_size);
+
+  // If the list is too long, we need to transfer some number of
+  // objects to the central cache.  Ideally, we would transfer
+  // num_objects_to_move, so the code below tries to make max_length
+  // converge on num_objects_to_move.
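+  //
+  // Worked example (not from the original source): with batch_size == 32
+  // and max_length == 96, each overflow bumps length_overages(); once that
+  // count exceeds kMaxOverages, max_length shrinks to 64 and the overage
+  // count is reset to zero.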
+
+  if (list->max_length() < batch_size) {
+    // Slow start the max_length so we don't overreserve.
+    list->set_max_length(list->max_length() + 1);
+  } else if (list->max_length() > batch_size) {
+    // If we consistently go over max_length, shrink max_length.  If we don't
+    // shrink it, some amount of memory will always stay in this freelist.
+    list->set_length_overages(list->length_overages() + 1);
+    if (list->length_overages() > kMaxOverages) {
+      ASSERT(list->max_length() > batch_size);
+      list->set_max_length(list->max_length() - batch_size);
+      list->set_length_overages(0);
+    }
+  }
+}
+
+// Remove some objects of class "cl" from thread heap and add to central cache
+void ThreadCache::ReleaseToCentralCache(FreeList* src, size_t cl, int N) {
+  ASSERT(src == &list_[cl]);
+  if (N > src->length()) N = src->length();
+  size_t delta_bytes = N * Static::sizemap()->ByteSizeForClass(cl);
+
+  // We return prepackaged chains of the correct size to the central cache.
+  // TODO: Use the same format internally in the thread caches?
+  int batch_size = Static::sizemap()->num_objects_to_move(cl);
+  while (N > batch_size) {
+    void *tail, *head;
+    src->PopRange(batch_size, &head, &tail);
+    Static::central_cache()[cl].InsertRange(head, tail, batch_size);
+    N -= batch_size;
+  }
+  void *tail, *head;
+  src->PopRange(N, &head, &tail);
+  Static::central_cache()[cl].InsertRange(head, tail, N);
+  size_ -= delta_bytes;
+}
+
+// Release idle memory to the central cache
+void ThreadCache::Scavenge() {
+  // If the low-water mark for the free list is L, it means we would
+  // not have had to allocate anything from the central cache even if
+  // we had reduced the free list size by L.  We aim to get closer to
+  // that situation by dropping L/2 nodes from the free list.  This
+  // may not release much memory, but if so we will call scavenge again
+  // pretty soon and the low-water marks will be high on that call.
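+  //
+  // Worked example (not from the original source): if a list never dropped
+  // below 6 objects since the last scavenge (lowwatermark() == 6), we
+  // release 6/2 == 3 objects from it to the central cache, possibly shrink
+  // its max_length by one batch, and then reset the low-water mark to the
+  // list's current length.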
+  //int64 start = CycleClock::Now();
+  for (int cl = 0; cl < kNumClasses; cl++) {
+    FreeList* list = &list_[cl];
+    const int lowmark = list->lowwatermark();
+    if (lowmark > 0) {
+      const int drop = (lowmark > 1) ? lowmark/2 : 1;
+      ReleaseToCentralCache(list, cl, drop);
+
+      // Shrink the max length if it isn't used.  Only shrink down to
+      // batch_size -- if the thread was active enough to get the max_length
+      // above batch_size, it will likely be that active again.  If
+      // max_length shrinks below batch_size, the thread will have to
+      // go through the slow-start behavior again.  The slow-start is useful
+      // mainly for threads that stay relatively idle for their entire
+      // lifetime.
+      const int batch_size = Static::sizemap()->num_objects_to_move(cl);
+      if (list->max_length() > batch_size) {
+        list->set_max_length(
+            max<int>(list->max_length() - batch_size, batch_size));
+      }
+    }
+    list->clear_lowwatermark();
+  }
+
+  IncreaseCacheLimit();
+}
+
+void ThreadCache::IncreaseCacheLimit() {
+  SpinLockHolder h(Static::pageheap_lock());
+  IncreaseCacheLimitLocked();
+}
+
+void ThreadCache::IncreaseCacheLimitLocked() {
+  if (unclaimed_cache_space_ > 0) {
+    // Possibly make unclaimed_cache_space_ negative.
+    unclaimed_cache_space_ -= kStealAmount;
+    max_size_ += kStealAmount;
+    return;
+  }
+  // Don't hold pageheap_lock too long.  Try to steal from 10 other
+  // threads before giving up.  The i < 10 condition also prevents an
+  // infinite loop in case none of the existing thread heaps are
+  // suitable places to steal from.
+  for (int i = 0; i < 10;
+       ++i, next_memory_steal_ = next_memory_steal_->next_) {
+    // Reached the end of the linked list.  Start at the beginning.
+    if (next_memory_steal_ == NULL) {
+      ASSERT(thread_heaps_ != NULL);
+      next_memory_steal_ = thread_heaps_;
+    }
+    if (next_memory_steal_ == this ||
+        next_memory_steal_->max_size_ <= kMinThreadCacheSize) {
+      continue;
+    }
+    next_memory_steal_->max_size_ -= kStealAmount;
+    max_size_ += kStealAmount;
+
+    next_memory_steal_ = next_memory_steal_->next_;
+    return;
+  }
+}
+
+int ThreadCache::GetSamplePeriod() {
+  return sampler_.GetSamplePeriod();
+}
+
+void ThreadCache::InitModule() {
+  SpinLockHolder h(Static::pageheap_lock());
+  if (!phinited) {
+    const char *tcb = TCMallocGetenvSafe("TCMALLOC_MAX_TOTAL_THREAD_CACHE_BYTES");
+    if (tcb) {
+      set_overall_thread_cache_size(strtoll(tcb, NULL, 10));
+    }
+    Static::InitStaticVars();
+    threadcache_allocator.Init();
+    phinited = true;
+  }
+}
+
+void ThreadCache::InitTSD() {
+  ASSERT(!tsd_inited_);
+  perftools_pthread_key_create(&heap_key_, DestroyThreadCache);
+  tsd_inited_ = true;
+
+#ifdef PTHREADS_CRASHES_IF_RUN_TOO_EARLY
+  // We may have used a fake pthread_t for the main thread.  Fix it.
+  pthread_t zero;
+  memset(&zero, 0, sizeof(zero));
+  SpinLockHolder h(Static::pageheap_lock());
+  for (ThreadCache* h = thread_heaps_; h != NULL; h = h->next_) {
+    if (h->tid_ == zero) {
+      h->tid_ = pthread_self();
+    }
+  }
+#endif
+}
+
+ThreadCache* ThreadCache::CreateCacheIfNecessary() {
+  // Initialize per-thread data if necessary
+  ThreadCache* heap = NULL;
+  {
+    SpinLockHolder h(Static::pageheap_lock());
+    // On some old glibc's, and on freebsd's libc (as of freebsd 8.1),
+    // calling pthread routines (even pthread_self) too early could
+    // cause a segfault.  Since we can call pthreads quite early, we
+    // have to protect against that in such situations by making a
+    // 'fake' pthread.  This is not ideal since it doesn't work well
+    // when linking tcmalloc statically with apps that create threads
+    // before main, so we only do it if we have to.
+#ifdef PTHREADS_CRASHES_IF_RUN_TOO_EARLY
+    pthread_t me;
+    if (!tsd_inited_) {
+      memset(&me, 0, sizeof(me));
+    } else {
+      me = pthread_self();
+    }
+#else
+    const pthread_t me = pthread_self();
+#endif
+
+    // This may be a recursive malloc call from pthread_setspecific()
+    // In that case, the heap for this thread has already been created
+    // and added to the linked list.  So we search for that first.
+    for (ThreadCache* h = thread_heaps_; h != NULL; h = h->next_) {
+      if (h->tid_ == me) {
+        heap = h;
+        break;
+      }
+    }
+
+    if (heap == NULL) heap = NewHeap(me);
+  }
+
+  // We call pthread_setspecific() outside the lock because it may
+  // call malloc() recursively.  We check for the recursive call using
+  // the "in_setspecific_" flag so that we can avoid calling
+  // pthread_setspecific() if we are already inside pthread_setspecific().
+  if (!heap->in_setspecific_ && tsd_inited_) {
+    heap->in_setspecific_ = true;
+    perftools_pthread_setspecific(heap_key_, heap);
+#ifdef HAVE_TLS
+    // Also keep a copy in __thread for faster retrieval
+    threadlocal_data_.heap = heap;
+    SetMinSizeForSlowPath(kMaxSize + 1);
+#endif
+    heap->in_setspecific_ = false;
+  }
+  return heap;
+}
+
+ThreadCache* ThreadCache::NewHeap(pthread_t tid) {
+  // Create the heap and add it to the linked list
+  ThreadCache *heap = threadcache_allocator.New();
+  heap->Init(tid);
+  heap->next_ = thread_heaps_;
+  heap->prev_ = NULL;
+  if (thread_heaps_ != NULL) {
+    thread_heaps_->prev_ = heap;
+  } else {
+    // This is the only thread heap at the moment.
+    ASSERT(next_memory_steal_ == NULL);
+    next_memory_steal_ = heap;
+  }
+  thread_heaps_ = heap;
+  thread_heap_count_++;
+  return heap;
+}
+
+void ThreadCache::BecomeIdle() {
+  if (!tsd_inited_) return;              // No caches yet
+  ThreadCache* heap = GetThreadHeap();
+  if (heap == NULL) return;             // No thread cache to remove
+  if (heap->in_setspecific_) return;    // Do not disturb the active caller
+
+  heap->in_setspecific_ = true;
+  perftools_pthread_setspecific(heap_key_, NULL);
+#ifdef HAVE_TLS
+  // Also update the copy in __thread
+  threadlocal_data_.heap = NULL;
+  SetMinSizeForSlowPath(0);
+#endif
+  heap->in_setspecific_ = false;
+  if (GetThreadHeap() == heap) {
+    // Somehow heap got reinstated by a recursive call to malloc
+    // from pthread_setspecific.  We give up in this case.
+    return;
+  }
+
+  // We can now get rid of the heap
+  DeleteCache(heap);
+}
+
+void ThreadCache::DestroyThreadCache(void* ptr) {
+  // Note that "ptr" cannot be NULL since pthread promises not
+  // to invoke the destructor on NULL values, but for safety,
+  // we check anyway.
+  if (ptr == NULL) return;
+#ifdef HAVE_TLS
+  // Prevent fast path of GetThreadHeap() from returning heap.
+  threadlocal_data_.heap = NULL;
+  SetMinSizeForSlowPath(0);
+#endif
+  DeleteCache(reinterpret_cast<ThreadCache*>(ptr));
+}
+
+void ThreadCache::DeleteCache(ThreadCache* heap) {
+  // Remove all memory from heap
+  heap->Cleanup();
+
+  // Remove from linked list
+  SpinLockHolder h(Static::pageheap_lock());
+  if (heap->next_ != NULL) heap->next_->prev_ = heap->prev_;
+  if (heap->prev_ != NULL) heap->prev_->next_ = heap->next_;
+  if (thread_heaps_ == heap) thread_heaps_ = heap->next_;
+  thread_heap_count_--;
+
+  if (next_memory_steal_ == heap) next_memory_steal_ = heap->next_;
+  if (next_memory_steal_ == NULL) next_memory_steal_ = thread_heaps_;
+  unclaimed_cache_space_ += heap->max_size_;
+
+  threadcache_allocator.Delete(heap);
+}
+
+void ThreadCache::RecomputePerThreadCacheSize() {
+  // Divide available space across threads
+  int n = thread_heap_count_ > 0 ? thread_heap_count_ : 1;
+  size_t space = overall_thread_cache_size_ / n;
+
+  // Limit to allowed range
+  if (space < kMinThreadCacheSize) space = kMinThreadCacheSize;
+  if (space > kMaxThreadCacheSize) space = kMaxThreadCacheSize;
+
+  double ratio = space / max<double>(1, per_thread_cache_size_);
+  size_t claimed = 0;
+  for (ThreadCache* h = thread_heaps_; h != NULL; h = h->next_) {
+    // Increasing the total cache size should not circumvent the
+    // slow-start growth of max_size_.
+    if (ratio < 1.0) {
+      h->max_size_ = static_cast<size_t>(h->max_size_ * ratio);
+    }
+    claimed += h->max_size_;
+  }
+  unclaimed_cache_space_ = overall_thread_cache_size_ - claimed;
+  per_thread_cache_size_ = space;
+}
+
+void ThreadCache::GetThreadStats(uint64_t* total_bytes, uint64_t* class_count) {
+  for (ThreadCache* h = thread_heaps_; h != NULL; h = h->next_) {
+    *total_bytes += h->Size();
+    if (class_count) {
+      for (int cl = 0; cl < kNumClasses; ++cl) {
+        class_count[cl] += h->freelist_length(cl);
+      }
+    }
+  }
+}
+
+void ThreadCache::set_overall_thread_cache_size(size_t new_size) {
+  // Clip the value to a reasonable range
+  if (new_size < kMinThreadCacheSize) new_size = kMinThreadCacheSize;
+  if (new_size > (1<<30)) new_size = (1<<30);     // Limit to 1GB
+  overall_thread_cache_size_ = new_size;
+
+  RecomputePerThreadCacheSize();
+}
+
+}  // namespace tcmalloc
diff --git a/src/thread_cache.h b/src/thread_cache.h
new file mode 100644
index 0000000..81a020e
--- /dev/null
+++ b/src/thread_cache.h
@@ -0,0 +1,440 @@
+// -*- Mode: C++; c-basic-offset: 2; indent-tabs-mode: nil -*-
+// Copyright (c) 2008, Google Inc.
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+//     * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+//     * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+// ---
+// Author: Sanjay Ghemawat <opensource@google.com>
+
+#ifndef TCMALLOC_THREAD_CACHE_H_
+#define TCMALLOC_THREAD_CACHE_H_
+
+#include <config.h>
+#ifdef HAVE_PTHREAD
+#include <pthread.h>                    // for pthread_t, pthread_key_t
+#endif
+#include <stddef.h>                     // for size_t, NULL
+#ifdef HAVE_STDINT_H
+#include <stdint.h>                     // for uint32_t, uint64_t
+#endif
+#include <sys/types.h>                  // for ssize_t
+#include "common.h"
+#include "linked_list.h"
+#include "maybe_threads.h"
+#include "page_heap_allocator.h"
+#include "sampler.h"
+#include "static_vars.h"
+
+#include "common.h"            // for SizeMap, kMaxSize, etc
+#include "internal_logging.h"  // for ASSERT, etc
+#include "linked_list.h"       // for SLL_Pop, SLL_PopRange, etc
+#include "page_heap_allocator.h"  // for PageHeapAllocator
+#include "sampler.h"           // for Sampler
+#include "static_vars.h"       // for Static
+
+namespace tcmalloc {
+
+//-------------------------------------------------------------------
+// Data kept per thread
+//-------------------------------------------------------------------
+
+class ThreadCache {
+ public:
+#ifdef HAVE_TLS
+  enum { have_tls = true };
+#else
+  enum { have_tls = false };
+#endif
+
+  // All ThreadCache objects are kept in a linked list (for stats collection)
+  ThreadCache* next_;
+  ThreadCache* prev_;
+
+  void Init(pthread_t tid);
+  void Cleanup();
+
+  // Accessors (mostly just for printing stats)
+  int freelist_length(size_t cl) const { return list_[cl].length(); }
+
+  // Total byte size in cache
+  size_t Size() const { return size_; }
+
+  // Allocate an object of the given size and class. The size given
+  // must be the same as the size of the class in the size map.
+  void* Allocate(size_t size, size_t cl);
+  void Deallocate(void* ptr, size_t size_class);
+
+  void Scavenge();
+
+  int GetSamplePeriod();
+
+  // Record allocation of "k" bytes.  Return true iff allocation
+  // should be sampled
+  bool SampleAllocation(size_t k);
+
+  static void         InitModule();
+  static void         InitTSD();
+  static ThreadCache* GetThreadHeap();
+  static ThreadCache* GetCache();
+  static ThreadCache* GetCacheIfPresent();
+  static ThreadCache* GetCacheWhichMustBePresent();
+  static ThreadCache* CreateCacheIfNecessary();
+  static void         BecomeIdle();
+  static size_t       MinSizeForSlowPath();
+  static void         SetMinSizeForSlowPath(size_t size);
+
+  static bool IsFastPathAllowed() { return MinSizeForSlowPath() != 0; }
+
+  // Return the number of thread heaps in use.
+  static inline int HeapsInUse();
+
+  // Adds to *total_bytes the total number of bytes used by all thread heaps.
+  // Also, if class_count is not NULL, it must be an array of size kNumClasses,
+  // and this function will increment each element of class_count by the number
+  // of items in all thread-local freelists of the corresponding size class.
+  // REQUIRES: Static::pageheap_lock is held.
+  static void GetThreadStats(uint64_t* total_bytes, uint64_t* class_count);
+
+  // Sets the total thread cache size to new_size, recomputing the
+  // individual thread cache sizes as necessary.
+  // REQUIRES: Static::pageheap lock is held.
+  static void set_overall_thread_cache_size(size_t new_size);
+  static size_t overall_thread_cache_size() {
+    return overall_thread_cache_size_;
+  }
+
+ private:
+  class FreeList {
+   private:
+    void*    list_;       // Linked list of nodes
+
+#ifdef _LP64
+    // On 64-bit hardware, manipulating 16-bit values may be slightly slow.
+    uint32_t length_;      // Current length.
+    uint32_t lowater_;     // Low water mark for list length.
+    uint32_t max_length_;  // Dynamic max list length based on usage.
+    // Tracks the number of times a deallocation has caused
+    // length_ > max_length_.  After the kMaxOverages'th time, max_length_
+    // shrinks and length_overages_ is reset to zero.
+    uint32_t length_overages_;
+#else
+    // If we aren't using 64-bit pointers then pack these into less space.
+    uint16_t length_;
+    uint16_t lowater_;
+    uint16_t max_length_;
+    uint16_t length_overages_;
+#endif
+
+   public:
+    void Init() {
+      list_ = NULL;
+      length_ = 0;
+      lowater_ = 0;
+      max_length_ = 1;
+      length_overages_ = 0;
+    }
+
+    // Return current length of list
+    size_t length() const {
+      return length_;
+    }
+
+    // Return the maximum length of the list.
+    size_t max_length() const {
+      return max_length_;
+    }
+
+    // Set the maximum length of the list.  If 'new_max' < length(), the
+    // client is responsible for removing objects from the list.
+    void set_max_length(size_t new_max) {
+      max_length_ = new_max;
+    }
+
+    // Return the number of times that length() has gone over max_length().
+    size_t length_overages() const {
+      return length_overages_;
+    }
+
+    void set_length_overages(size_t new_count) {
+      length_overages_ = new_count;
+    }
+
+    // Is list empty?
+    bool empty() const {
+      return list_ == NULL;
+    }
+
+    // Low-water mark management
+    int lowwatermark() const { return lowater_; }
+    void clear_lowwatermark() { lowater_ = length_; }
+
+    void Push(void* ptr) {
+      SLL_Push(&list_, ptr);
+      length_++;
+    }
+
+    void* Pop() {
+      ASSERT(list_ != NULL);
+      length_--;
+      if (length_ < lowater_) lowater_ = length_;
+      return SLL_Pop(&list_);
+    }
+
+    void* Next() {
+      return SLL_Next(&list_);
+    }
+
+    void PushRange(int N, void *start, void *end) {
+      SLL_PushRange(&list_, start, end);
+      length_ += N;
+    }
+
+    void PopRange(int N, void **start, void **end) {
+      SLL_PopRange(&list_, N, start, end);
+      ASSERT(length_ >= N);
+      length_ -= N;
+      if (length_ < lowater_) lowater_ = length_;
+    }
+  };
+
+  // Gets and returns an object from the central cache, and, if possible,
+  // also adds some objects of that size class to this thread cache.
+  void* FetchFromCentralCache(size_t cl, size_t byte_size);
+
+  // Releases some number of items from src.  Adjusts the list's max_length
+  // to eventually converge on num_objects_to_move(cl).
+  void ListTooLong(FreeList* src, size_t cl);
+
+  // Releases N items from this thread cache.
+  void ReleaseToCentralCache(FreeList* src, size_t cl, int N);
+
+  // Increase max_size_ by reducing unclaimed_cache_space_ or by
+  // reducing the max_size_ of some other thread.  In both cases,
+  // the delta is kStealAmount.
+  void IncreaseCacheLimit();
+  // Same as above but requires Static::pageheap_lock() is held.
+  void IncreaseCacheLimitLocked();
+
+  // If TLS is available, we also store a copy of the per-thread object
+  // in a __thread variable since __thread variables are faster to read
+  // than pthread_getspecific().  We still need pthread_setspecific()
+  // because __thread variables provide no way to run cleanup code when
+  // a thread is destroyed.
+  // We also give a hint to the compiler to use the "initial exec" TLS
+  // model.  This is faster than the default TLS model, at the cost that
+  // you cannot dlopen this library.  (To see the difference, look at
+  // the CPU use of __tls_get_addr with and without this attribute.)
+  // Since we don't really use dlopen in google code -- and using dlopen
+  // on a malloc replacement is asking for trouble in any case -- that's
+  // a good tradeoff for us.
+#ifdef HAVE___ATTRIBUTE__
+#define ATTR_INITIAL_EXEC __attribute__ ((tls_model ("initial-exec")))
+#else
+#define ATTR_INITIAL_EXEC
+#endif
+
+#ifdef HAVE_TLS
+  struct ThreadLocalData {
+    ThreadCache* heap;
+    // min_size_for_slow_path is 0 if heap is NULL or kMaxSize + 1 otherwise.
+    // The latter is the common case and allows allocation to be faster
+    // than it would be otherwise: typically a single branch will
+    // determine that the requested allocation is no more than kMaxSize
+    // and we can then proceed, knowing that global and thread-local tcmalloc
+    // state is initialized.
+    size_t min_size_for_slow_path;
+  };
+  static __thread ThreadLocalData threadlocal_data_ ATTR_INITIAL_EXEC;
+#endif
+
+  // Thread-specific key.  Initialization here is somewhat tricky
+  // because some Linux startup code invokes malloc() before it
+  // is in a good enough state to handle pthread_key_create().
+  // Therefore, we use TSD keys only after tsd_inited_ is set to true.
+  // Until then, we use a slow path to get the heap object.
+  static bool tsd_inited_;
+  static pthread_key_t heap_key_;
+
+  // Linked list of heap objects.  Protected by Static::pageheap_lock.
+  static ThreadCache* thread_heaps_;
+  static int thread_heap_count_;
+
+  // A pointer to one of the objects in thread_heaps_: the next
+  // ThreadCache from which a thread that has exceeded its max_size_
+  // should steal cache space.  We round-robin through all of the
+  // objects in thread_heaps_.  Protected by Static::pageheap_lock.
+  static ThreadCache* next_memory_steal_;
+
+  // Overall thread cache size.  Protected by Static::pageheap_lock.
+  static size_t overall_thread_cache_size_;
+
+  // Global per-thread cache size.  Writes are protected by
+  // Static::pageheap_lock.  Reads are done without any locking, which should be
+  // fine as long as size_t can be written atomically and we don't place
+  // invariants between this variable and other pieces of state.
+  static volatile size_t per_thread_cache_size_;
+
+  // Represents overall_thread_cache_size_ minus the sum of max_size_
+  // across all ThreadCaches.  Protected by Static::pageheap_lock.
+  static ssize_t unclaimed_cache_space_;
+
+  // This class is laid out with the most frequently used fields
+  // first so that hot elements are placed on the same cache line.
+
+  size_t        size_;                  // Combined size of data
+  size_t        max_size_;              // size_ > max_size_ --> Scavenge()
+
+  // We sample allocations, biased by the size of the allocation
+  Sampler       sampler_;               // A sampler
+
+  FreeList      list_[kNumClasses];     // Array indexed by size-class
+
+  pthread_t     tid_;                   // Which thread owns it
+  bool          in_setspecific_;        // In call to pthread_setspecific?
+
+  // Allocate a new heap. REQUIRES: Static::pageheap_lock is held.
+  static ThreadCache* NewHeap(pthread_t tid);
+
+  // Use only as pthread thread-specific destructor function.
+  static void DestroyThreadCache(void* ptr);
+
+  static void DeleteCache(ThreadCache* heap);
+  static void RecomputePerThreadCacheSize();
+
+  // Ensure that this class is cacheline-aligned. This is critical for
+  // performance, as false sharing would negate many of the benefits
+  // of a per-thread cache.
+} CACHELINE_ALIGNED;
+
+// Allocator for thread heaps
+// This is logically part of the ThreadCache class, but MSVC, at
+// least, does not like using ThreadCache as a template argument
+// before the class is fully defined.  So we put it outside the class.
+extern PageHeapAllocator<ThreadCache> threadcache_allocator;
+
+inline int ThreadCache::HeapsInUse() {
+  return threadcache_allocator.inuse();
+}
+
+inline bool ThreadCache::SampleAllocation(size_t k) {
+  return sampler_.SampleAllocation(k);
+}
+
+inline void* ThreadCache::Allocate(size_t size, size_t cl) {
+  ASSERT(size <= kMaxSize);
+  ASSERT(size == Static::sizemap()->ByteSizeForClass(cl));
+
+  FreeList* list = &list_[cl];
+  if (UNLIKELY(list->empty())) {
+    return FetchFromCentralCache(cl, size);
+  }
+  size_ -= size;
+  return list->Pop();
+}
+
+inline void ThreadCache::Deallocate(void* ptr, size_t cl) {
+  FreeList* list = &list_[cl];
+  size_ += Static::sizemap()->ByteSizeForClass(cl);
+  ssize_t size_headroom = max_size_ - size_ - 1;
+
+  // This catches back-to-back frees of allocs in the same size
+  // class. A more comprehensive (and expensive) test would be to walk
+  // the entire freelist. But this might be enough to find some bugs.
+  ASSERT(ptr != list->Next());
+
+  list->Push(ptr);
+  ssize_t list_headroom =
+      static_cast<ssize_t>(list->max_length()) - list->length();
+
+  // There are two relatively uncommon things that require further work.
+  // In the common case we're done, and in that case we need a single branch
+  // because of the bitwise-or trick that follows.
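+  // Both headrooms are signed, so OR-ing them yields a value with the sign
+  // bit set -- i.e. a negative value -- exactly when at least one of them
+  // is negative; a single comparison therefore covers both rare cases.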
+  if (UNLIKELY((list_headroom | size_headroom) < 0)) {
+    if (list_headroom < 0) {
+      ListTooLong(list, cl);
+    }
+    if (size_ >= max_size_) Scavenge();
+  }
+}
+
+inline ThreadCache* ThreadCache::GetThreadHeap() {
+#ifdef HAVE_TLS
+  return threadlocal_data_.heap;
+#else
+  return reinterpret_cast<ThreadCache *>(
+      perftools_pthread_getspecific(heap_key_));
+#endif
+}
+
+inline ThreadCache* ThreadCache::GetCacheWhichMustBePresent() {
+#ifdef HAVE_TLS
+  ASSERT(threadlocal_data_.heap);
+  return threadlocal_data_.heap;
+#else
+  ASSERT(perftools_pthread_getspecific(heap_key_));
+  return reinterpret_cast<ThreadCache *>(
+      perftools_pthread_getspecific(heap_key_));
+#endif
+}
+
+inline ThreadCache* ThreadCache::GetCache() {
+  ThreadCache* ptr = NULL;
+  if (!tsd_inited_) {
+    InitModule();
+  } else {
+    ptr = GetThreadHeap();
+  }
+  if (ptr == NULL) ptr = CreateCacheIfNecessary();
+  return ptr;
+}
+
+// In deletion paths, we do not try to create a thread-cache.  This is
+// because we may be in the thread destruction code and may have
+// already cleaned up the cache for this thread.
+inline ThreadCache* ThreadCache::GetCacheIfPresent() {
+  if (!tsd_inited_) return NULL;
+  return GetThreadHeap();
+}
+
+inline size_t ThreadCache::MinSizeForSlowPath() {
+#ifdef HAVE_TLS
+  return threadlocal_data_.min_size_for_slow_path;
+#else
+  return 0;
+#endif
+}
+
+inline void ThreadCache::SetMinSizeForSlowPath(size_t size) {
+#ifdef HAVE_TLS
+  threadlocal_data_.min_size_for_slow_path = size;
+#endif
+}
+
+}  // namespace tcmalloc
+
+#endif  // TCMALLOC_THREAD_CACHE_H_
diff --git a/src/windows/TODO b/src/windows/TODO
new file mode 100644
index 0000000..708ec23
--- /dev/null
+++ b/src/windows/TODO
@@ -0,0 +1,86 @@
+* Get heap-profile-table.cc using DeleteMatchingFiles
+* Get heap-profile-table.cc using FillProcSelfMaps, DumpProcSelfMaps
+* Play around with ExperimentalGetStackTrace
+* Support the windows-level memory-allocation functions?  See
+    /home/build/googleclient/earth/client/tools/memorytracking/client/memorytrace/src/memorytrace.cpp
+    /home/build/googleclient/total_recall/common/sitestep/*
+    http://www.internals.com/articles/apispy/apispy.htm
+    http://www.wheaty.net/APISPY32.zip
+* Verify /proc/xxx/maps:
+    http://www.geocities.com/wah_java_dotnet/procmap/index.html
+* Figure out how to edit the executable IAT so tcmalloc.dll is loaded first
+* Use QueryPerformanceCounter instead of GetTickCount() (also for sparsehash)
+
+----
+More info on windows-level memory-allocation functions:
+   C runtime malloc
+   LocalAlloc
+   GlobalAlloc
+   HeapAlloc
+   VirtualAlloc
+   mmap stuff
+
+malloc, LocalAlloc and GlobalAlloc call HeapAlloc, which calls
+VirtualAlloc when needed, which calls VirtualAllocEx (the __sbrk equiv?)
+
+siggi sez: If you want to do a generic job, you probably need to
+preserve the semantics of all of these Win32 calls:
+   Heap32First
+   Heap32ListFirst
+   Heap32ListNext
+   Heap32Next
+   HeapAlloc
+   HeapCompact
+   HeapCreate
+   HeapCreateTagsW
+   HeapDestroy
+   HeapExtend
+   HeapFree
+   HeapLock
+   HeapQueryInformation
+   HeapQueryTagW
+   HeapReAlloc
+   HeapSetInformation
+   HeapSize
+   HeapSummary
+   HeapUnlock
+   HeapUsage
+   HeapValidate
+   HeapWalk
+
+kernel32.dll export functions and nt.dll export functions:
+   http://www.shorthike.com/svn/trunk/tools_win32/dm/lib/kernel32.def
+   http://undocumented.ntinternals.net/
+
+You can edit the executable IAT to have the patching DLL be the
+first one loaded.
+
+The most complete way to intercept system calls is to patch the
+functions (not the IAT).
+
+Microsoft has some built-in routines for heap-checking:
+   http://support.microsoft.com/kb/268343
+
+----
+Itimer replacement:
+   http://msdn2.microsoft.com/en-us/library/ms712713.aspx
+
+----
+Changes I've had to make to the project file:
+
+0) When creating the project file, click on "no autogenerated files"
+
+--- For each project:
+1) Alt-F7 -> General -> [pulldown "all configurations" ] -> Output Directory -> $(SolutionDir)$(ConfigurationName)
+2) Alt-F7 -> General -> [pulldown "all configurations" ] -> Intermediate Directory -> $(ConfigurationName)
+
+--- For each .cc file:
+1) Alt-F7 -> C/C++ -> General -> [pulldown "all configurations"] -> Additional Include Directories --> src/windows + src/
+2) Alt-F7 -> C/C++ -> Code Generation -> Runtime Library -> Multi-threaded, debug/release, DLL or not
+
+--- For DLL:
+3) Alt-F7 -> Linker -> Input -> [pulldown "all configurations" ] -> Module Definition File -> src\windows\vc7and8.def
+--- For binaries depending on a DLL:
+3) Right-click on project -> Project Dependencies -> [add dll]
+--- For static binaries (not depending on a DLL)
+3) Alt-F7 -> C/C++ -> Command Line -> [pulldown "all configurations"] -> /D PERFTOOLS_DLL_DECL=
diff --git a/src/windows/addr2line-pdb.c b/src/windows/addr2line-pdb.c
new file mode 100644
index 0000000..5c65a03
--- /dev/null
+++ b/src/windows/addr2line-pdb.c
@@ -0,0 +1,163 @@
+/* Copyright (c) 2008, Google Inc.
+ * All rights reserved.
+ * 
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are
+ * met:
+ * 
+ *     * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following disclaimer
+ * in the documentation and/or other materials provided with the
+ * distribution.
+ *     * Neither the name of Google Inc. nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ * 
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * ---
+ * Author: David Vitek
+ *
+ * Dump function addresses using Microsoft debug symbols.  This works
+ * on PDB files.  Note that this program will download symbols to
+ * c:\websymbols without asking.
+ */
+
+#define WIN32_LEAN_AND_MEAN
+#define _CRT_SECURE_NO_WARNINGS
+#define _CRT_SECURE_NO_DEPRECATE
+
+#include <stdio.h>
+#include <stdlib.h>
+
+#include <windows.h>
+#include <dbghelp.h>
+
+#define SEARCH_CAP (1024*1024)
+#define WEBSYM "SRV*c:\\websymbols*http://msdl.microsoft.com/download/symbols"
+
+void usage() {
+  fprintf(stderr, "usage: "
+          "addr2line-pdb [-f|--functions] [-C|--demangle] [-e filename]\n");
+  fprintf(stderr, "(Then list the hex addresses on stdin, one per line)\n");
+}
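+
+/* Example invocation (illustrative; my_program.exe and the address are
+ * placeholders):
+ *
+ *   echo 0041d7a2 | addr2line-pdb -f -C -e my_program.exe
+ *
+ * This prints the (demangled, because of -C) function name followed by the
+ * file:line for that address, using symbols for my_program.exe found via
+ * the search path set up below.
+ */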
+
+int main(int argc, char *argv[]) {
+  DWORD  error;
+  HANDLE process;
+  ULONG64 module_base;
+  int i;
+  char* search;
+  char buf[256];   /* Enough to hold one hex address, I trust! */
+  int rv = 0;
+  /* We may add SYMOPT_UNDNAME if --demangle is specified: */
+  DWORD symopts = SYMOPT_DEFERRED_LOADS | SYMOPT_DEBUG | SYMOPT_LOAD_LINES;
+  char* filename = "a.out";         /* The default if -e isn't specified */
+  int print_function_name = 0;      /* Set to 1 if -f is specified */
+
+  for (i = 1; i < argc; i++) {
+    if (strcmp(argv[i], "--functions") == 0 || strcmp(argv[i], "-f") == 0) {
+      print_function_name = 1;
+    } else if (strcmp(argv[i], "--demangle") == 0 ||
+               strcmp(argv[i], "-C") == 0) {
+      symopts |= SYMOPT_UNDNAME;
+    } else if (strcmp(argv[i], "-e") == 0) {
+      if (i + 1 >= argc) {
+        fprintf(stderr, "FATAL ERROR: -e must be followed by a filename\n");
+        return 1;
+      }
+      filename = argv[i+1];
+      i++;     /* to skip over filename too */
+    } else if (strcmp(argv[i], "--help") == 0) {
+      usage();
+      exit(0);
+    } else {
+      usage();
+      exit(1);
+    }
+  }
+
+  process = GetCurrentProcess();
+
+  if (!SymInitialize(process, NULL, FALSE)) {
+    error = GetLastError();
+    fprintf(stderr, "SymInitialize returned error : %d\n", error);
+    return 1;
+  }
+
+  search = malloc(SEARCH_CAP);
+  if (SymGetSearchPath(process, search, SEARCH_CAP)) {
+    if (strlen(search) + sizeof(";" WEBSYM) > SEARCH_CAP) {
+      fprintf(stderr, "Search path too long\n");
+      SymCleanup(process);
+      return 1;
+    }
+    strcat(search, ";" WEBSYM);
+  } else {
+    error = GetLastError();
+    fprintf(stderr, "SymGetSearchPath returned error : %d\n", error);
+    rv = 1;                   /* An error, but not a fatal one */
+    strcpy(search, WEBSYM);   /* Use a default value */
+  }
+  if (!SymSetSearchPath(process, search)) {
+    error = GetLastError();
+    fprintf(stderr, "SymSetSearchPath returned error : %d\n", error);
+    rv = 1;                   /* An error, but not a fatal one */
+  }
+
+  SymSetOptions(symopts);
+  module_base = SymLoadModuleEx(process, NULL, filename, NULL, 0, 0, NULL, 0);
+  if (!module_base) {
+    /* SymLoadModuleEx failed */
+    error = GetLastError();
+    fprintf(stderr, "SymLoadModuleEx returned error : %d for %s\n",
+            error, filename);
+    SymCleanup(process);
+    return 1;
+  }
+
+  buf[sizeof(buf)-1] = '\0';  /* Just to be safe */
+  while (fgets(buf, sizeof(buf)-1, stdin)) {
+    /* GNU addr2line seems to just do a strtol and ignore any
+     * weird characters it gets, so we will too.
+     */
+    unsigned __int64 addr = _strtoui64(buf, NULL, 16);
+    ULONG64 buffer[(sizeof(SYMBOL_INFO) +
+                    MAX_SYM_NAME*sizeof(TCHAR) +
+                    sizeof(ULONG64) - 1)
+                   / sizeof(ULONG64)];
+    PSYMBOL_INFO pSymbol = (PSYMBOL_INFO)buffer;
+    IMAGEHLP_LINE64 line;
+    DWORD dummy;
+    pSymbol->SizeOfStruct = sizeof(SYMBOL_INFO);
+    pSymbol->MaxNameLen = MAX_SYM_NAME;
+    if (print_function_name) {
+      if (SymFromAddr(process, (DWORD64)addr, NULL, pSymbol)) {
+        printf("%s\n", pSymbol->Name);
+      } else {
+        printf("??\n");
+      }
+    }
+    line.SizeOfStruct = sizeof(IMAGEHLP_LINE64);
+    if (SymGetLineFromAddr64(process, (DWORD64)addr, &dummy, &line)) {
+      printf("%s:%d\n", line.FileName, (int)line.LineNumber);
+    } else {
+      printf("??:0\n");
+    }
+  }
+  SymUnloadModule64(process, module_base);
+  SymCleanup(process);
+  return rv;
+}
diff --git a/src/windows/auto_testing_hook.h b/src/windows/auto_testing_hook.h
new file mode 100644
index 0000000..fc2b710
--- /dev/null
+++ b/src/windows/auto_testing_hook.h
@@ -0,0 +1,156 @@
+// -*- Mode: C++; c-basic-offset: 2; indent-tabs-mode: nil -*-
+// Copyright (c) 2010 The Chromium Authors. All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+//    * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//    * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+//    * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Utility for using SideStep with unit tests.
+
+#ifndef CEEE_TESTING_SIDESTEP_AUTO_TESTING_HOOK_H_
+#define CEEE_TESTING_SIDESTEP_AUTO_TESTING_HOOK_H_
+
+#include "base/basictypes.h"
+#include "base/logging.h"
+#include "preamble_patcher.h"
+
+#define SIDESTEP_CHK(x)  CHECK(x)
+#define SIDESTEP_EXPECT_TRUE(x)  SIDESTEP_CHK(x)
+
+namespace sidestep {
+
+// Same trick as common/scope_cleanup.h ScopeGuardImplBase
+class AutoTestingHookBase {
+ public:
+  virtual ~AutoTestingHookBase() {}
+};
+
+// This is the typedef you normally use for the class, e.g.
+//
+// AutoTestingHook hook = MakeTestingHook(TargetFunc, HookTargetFunc);
+//
+// The 'hook' variable will then be destroyed when it goes out of scope.
+//
+// NOTE: You must not hold this type as a member of another class.  Its
+// destructor will not get called.
+typedef const AutoTestingHookBase& AutoTestingHook;
+
+// This is the class you must use when holding a hook as a member of another
+// class, e.g.
+//
+// public:
+//  AutoTestingHookHolder hook_holder_;
+//  MyClass() : hook_holder_(MakeTestingHookHolder(Target, Hook)) {}
+class AutoTestingHookHolder {
+ public:
+  explicit AutoTestingHookHolder(AutoTestingHookBase* hook) : hook_(hook) {}
+  ~AutoTestingHookHolder() { delete hook_; }
+ private:
+  AutoTestingHookHolder() {}  // disallow
+  AutoTestingHookBase* hook_;
+};
+
+// This class helps patch a function, then unpatch it when the object exits
+// scope, and also maintains the pointer to the original function stub.
+//
+// To enable use of the class without having to explicitly provide the
+// type of the function pointers (and instead only providing it
+// implicitly) we use the same trick as ScopeGuard (see
+// common/scope_cleanup.h) uses, so to create a hook you use the
+// MakeTestingHook function rather than a constructor.
+//
+// NOTE:  This function is only safe for e.g. unit tests and _not_ for
+// production code.  See PreamblePatcher class for details.
+template <typename T>
+class AutoTestingHookImpl : public AutoTestingHookBase {
+ public:
+  static AutoTestingHookImpl<T> MakeTestingHook(T target_function,
+                                                T replacement_function,
+                                                bool do_it) {
+    return AutoTestingHookImpl<T>(target_function, replacement_function, do_it);
+  }
+
+  static AutoTestingHookImpl<T>* MakeTestingHookHolder(T target_function,
+                                                       T replacement_function,
+                                                       bool do_it) {
+    return new AutoTestingHookImpl<T>(target_function,
+                                      replacement_function, do_it);
+  }
+
+  ~AutoTestingHookImpl() {
+    if (did_it_) {
+      SIDESTEP_CHK(SIDESTEP_SUCCESS == PreamblePatcher::Unpatch(
+          (void*)target_function_, (void*)replacement_function_,
+          (void*)original_function_));
+    }
+  }
+
+  // Returns a pointer to the original function.  To use this method you will
+  // have to explicitly create an AutoTestingHookImpl of the specific
+  // function pointer type (i.e. not use the AutoTestingHook typedef).
+  T original_function() {
+    return original_function_;
+  }
+
+ private:
+  AutoTestingHookImpl(T target_function, T replacement_function, bool do_it)
+      : target_function_(target_function),
+        original_function_(NULL),
+        replacement_function_(replacement_function),
+        did_it_(do_it) {
+    if (do_it) {
+      SIDESTEP_CHK(SIDESTEP_SUCCESS == PreamblePatcher::Patch(target_function,
+                                                     replacement_function,
+                                                     &original_function_));
+    }
+  }
+
+  T target_function_;  // always valid
+  T original_function_;  // always valid
+  T replacement_function_;  // always valid
+  bool did_it_;  // Remember if we did it or not...
+};
+
+template <typename T>
+inline AutoTestingHookImpl<T> MakeTestingHook(T target,
+                                              T replacement,
+                                              bool do_it) {
+  return AutoTestingHookImpl<T>::MakeTestingHook(target, replacement, do_it);
+}
+
+template <typename T>
+inline AutoTestingHookImpl<T> MakeTestingHook(T target, T replacement) {
+  return AutoTestingHookImpl<T>::MakeTestingHook(target, replacement, true);
+}
+
+template <typename T>
+inline AutoTestingHookImpl<T>* MakeTestingHookHolder(T target, T replacement) {
+  return AutoTestingHookImpl<T>::MakeTestingHookHolder(target, replacement,
+                                                       true);
+}
+
+};  // namespace sidestep
+
+#endif  // CEEE_TESTING_SIDESTEP_AUTO_TESTING_HOOK_H_
diff --git a/src/windows/config.h b/src/windows/config.h
new file mode 100644
index 0000000..9976457
--- /dev/null
+++ b/src/windows/config.h
@@ -0,0 +1,310 @@
+/* A manual version of config.h fit for windows machines.
+ *
+ * Use of this source code is governed by a BSD-style license that can
+ * be found in the LICENSE file.
+ */
+
+/* Sometimes we accidentally #include this config.h instead of the one
+   in .. -- this is particularly true for msys/mingw, which uses the
+   unix config.h but also runs code in the windows directory.
+   */
+#ifdef __MINGW32__
+#include "../config.h"
+#define GOOGLE_PERFTOOLS_WINDOWS_CONFIG_H_
+#endif
+
+#ifndef GOOGLE_PERFTOOLS_WINDOWS_CONFIG_H_
+#define GOOGLE_PERFTOOLS_WINDOWS_CONFIG_H_
+
+/* define this if you are linking tcmalloc statically and overriding the
+ * default allocators.
+ * For instructions on how to use this mode, see
+ * http://groups.google.com/group/google-perftools/browse_thread/thread/41cd3710af85e57b
+ */
+#undef WIN32_OVERRIDE_ALLOCATORS
+
+/* Define to 1 if your libc has a snprintf implementation */
+#undef HAVE_SNPRINTF
+
+/* Define to 1 if compiler supports __builtin_stack_pointer */
+#undef HAVE_BUILTIN_STACK_POINTER
+
+/* Define to 1 if you have the <conflict-signal.h> header file. */
+#undef HAVE_CONFLICT_SIGNAL_H
+
+/* Define to 1 if you have the <cygwin/signal.h> header file. */
+#undef HAVE_CYGWIN_SIGNAL_H
+
+/* Define to 1 if you have the declaration of `cfree', and to 0 if you don't.
+   */
+#undef HAVE_DECL_CFREE
+
+/* Define to 1 if you have the declaration of `memalign', and to 0 if you
+   don't. */
+#undef HAVE_DECL_MEMALIGN
+
+/* Define to 1 if you have the declaration of `posix_memalign', and to 0 if
+   you don't. */
+#undef HAVE_DECL_POSIX_MEMALIGN
+
+/* Define to 1 if you have the declaration of `pvalloc', and to 0 if you
+   don't. */
+#undef HAVE_DECL_PVALLOC
+
+/* Define to 1 if you have the declaration of `uname', and to 0 if you don't.
+   */
+#undef HAVE_DECL_UNAME
+
+/* Define to 1 if you have the declaration of `valloc', and to 0 if you don't.
+   */
+#undef HAVE_DECL_VALLOC
+
+/* Define to 1 if you have the <dlfcn.h> header file. */
+#undef HAVE_DLFCN_H
+
+/* Define to 1 if the system has the type `Elf32_Versym'. */
+#undef HAVE_ELF32_VERSYM
+
+/* Define to 1 if you have the <execinfo.h> header file. */
+#undef HAVE_EXECINFO_H
+
+/* Define to 1 if you have the <fcntl.h> header file. */
+#undef HAVE_FCNTL_H
+
+/* Define to 1 if you have the <features.h> header file. */
+#undef HAVE_FEATURES_H
+
+/* Define to 1 if you have the `geteuid' function. */
+#undef HAVE_GETEUID
+
+/* Define to 1 if you have the `getpagesize' function. */
+#define HAVE_GETPAGESIZE 1   /* we define it in windows/port.cc */
+
+/* Define to 1 if you have the <glob.h> header file. */
+#undef HAVE_GLOB_H
+
+/* Define to 1 if you have the <grp.h> header file. */
+#undef HAVE_GRP_H
+
+/* Define to 1 if you have the <inttypes.h> header file. */
+#undef HAVE_INTTYPES_H
+
+/* Define to 1 if you have the <libunwind.h> header file. */
+#undef HAVE_LIBUNWIND_H
+
+/* Define to 1 if you have the <linux/ptrace.h> header file. */
+#undef HAVE_LINUX_PTRACE_H
+
+/* Define to 1 if you have the <malloc.h> header file. */
+#define HAVE_MALLOC_H 1
+
+/* Define to 1 if you have the <malloc/malloc.h> header file. */
+#undef HAVE_MALLOC_MALLOC_H
+
+/* Define to 1 if you have the <memory.h> header file. */
+#undef HAVE_MEMORY_H
+
+/* Define to 1 if you have a working `mmap' system call. */
+#undef HAVE_MMAP
+
+/* define if the compiler implements namespaces */
+#define HAVE_NAMESPACES 1
+
+/* Define to 1 if you have the <poll.h> header file. */
+#undef HAVE_POLL_H
+
+/* define if libc has program_invocation_name */
+#undef HAVE_PROGRAM_INVOCATION_NAME
+
+/* Define if you have POSIX threads libraries and header files. */
+#undef HAVE_PTHREAD
+
+/* Define to 1 if you have the <pwd.h> header file. */
+#undef HAVE_PWD_H
+
+/* Define to 1 if you have the `sbrk' function. */
+#undef HAVE_SBRK
+
+/* Define to 1 if you have the <sched.h> header file. */
+#undef HAVE_SCHED_H
+
+/* Define to 1 if you have the <stdint.h> header file. */
+#undef HAVE_STDINT_H
+
+/* Define to 1 if you have the <stdlib.h> header file. */
+#define HAVE_STDLIB_H 1
+
+/* Define to 1 if you have the <strings.h> header file. */
+#undef HAVE_STRINGS_H
+
+/* Define to 1 if you have the <string.h> header file. */
+#define HAVE_STRING_H 1
+
+/* Define to 1 if the system has the type `struct mallinfo'. */
+#undef HAVE_STRUCT_MALLINFO
+
+/* Define to 1 if you have the <sys/cdefs.h> header file. */
+#undef HAVE_SYS_CDEFS_H
+
+/* Define to 1 if you have the <sys/malloc.h> header file. */
+#undef HAVE_SYS_MALLOC_H
+
+/* Define to 1 if you have the <sys/param.h> header file. */
+#undef HAVE_SYS_PARAM_H
+
+/* Define to 1 if you have the <sys/prctl.h> header file. */
+#undef HAVE_SYS_PRCTL_H
+
+/* Define to 1 if you have the <sys/resource.h> header file. */
+#undef HAVE_SYS_RESOURCE_H
+
+/* Define to 1 if you have the <sys/socket.h> header file. */
+#undef HAVE_SYS_SOCKET_H
+
+/* Define to 1 if you have the <sys/stat.h> header file. */
+#define HAVE_SYS_STAT_H 1
+
+/* Define to 1 if you have the <sys/syscall.h> header file. */
+#undef HAVE_SYS_SYSCALL_H
+
+/* Define to 1 if you have the <sys/types.h> header file. */
+#define HAVE_SYS_TYPES_H 1
+
+/* <sys/ucontext.h> is broken on redhat 7 */
+#undef HAVE_SYS_UCONTEXT_H
+
+/* Define to 1 if you have the <sys/wait.h> header file. */
+#undef HAVE_SYS_WAIT_H
+
+/* Define to 1 if compiler supports __thread */
+#define HAVE_TLS 1
+
+/* Define to 1 if you have the <ucontext.h> header file. */
+#undef HAVE_UCONTEXT_H
+
+/* Define to 1 if you have the <unistd.h> header file. */
+#undef HAVE_UNISTD_H
+
+/* Define to 1 if you have the <unwind.h> header file. */
+#undef HAVE_UNWIND_H
+
+/* Define to 1 if you have the <valgrind.h> header file. */
+#undef HAVE_VALGRIND_H
+
+/* define if your compiler has __attribute__ */
+#undef HAVE___ATTRIBUTE__
+
+/* Define to 1 if compiler supports __environ */
+#undef HAVE___ENVIRON
+
+/* Define to 1 if the system has the type `__int64'. */
+#define HAVE___INT64 1
+
+/* prefix where we look for installed files */
+#undef INSTALL_PREFIX
+
+/* Define to 1 if int32_t is equivalent to intptr_t */
+#undef INT32_EQUALS_INTPTR
+
+/* Define to the sub-directory in which libtool stores uninstalled libraries.
+   */
+#undef LT_OBJDIR
+
+/* Define to 'volatile' if __malloc_hook is declared volatile */
+#undef MALLOC_HOOK_MAYBE_VOLATILE
+
+/* Define to 1 if your C compiler doesn't accept -c and -o together. */
+#undef NO_MINUS_C_MINUS_O
+
+/* Name of package */
+#define PACKAGE "gperftools"
+
+/* Define to the address where bug reports for this package should be sent. */
+#define PACKAGE_BUGREPORT "opensource@google.com"
+
+/* Define to the full name of this package. */
+#define PACKAGE_NAME "gperftools"
+
+/* Define to the full name and version of this package. */
+#define PACKAGE_STRING "gperftools 2.4"
+
+/* Define to the one symbol short name of this package. */
+#define PACKAGE_TARNAME "gperftools"
+
+/* Define to the home page for this package. */
+#undef PACKAGE_URL
+
+/* Define to the version of this package. */
+#define PACKAGE_VERSION "2.4"
+
+/* How to access the PC from a struct ucontext */
+#undef PC_FROM_UCONTEXT
+
+/* Always the empty-string on non-windows systems. On windows, should be
+   "__declspec(dllexport)". This way, when we compile the dll, we export our
+   functions/classes. It's safe to define this here because config.h is only
+   used internally, to compile the DLL, and every DLL source file #includes
+   "config.h" before anything else. */
+#ifndef PERFTOOLS_DLL_DECL
+# define PERFTOOLS_IS_A_DLL  1   /* not set if you're statically linking */
+# define PERFTOOLS_DLL_DECL  __declspec(dllexport)
+# define PERFTOOLS_DLL_DECL_FOR_UNITTESTS  __declspec(dllimport)
+#endif
+
+/* printf format code for printing a size_t and ssize_t */
+#define PRIdS  "Id"
+
+/* printf format code for printing a size_t and ssize_t */
+#define PRIuS  "Iu"
+
+/* printf format code for printing a size_t and ssize_t */
+#define PRIxS  "Ix"
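+
+/* Usage sketch (illustrative): these macros splice into a format string via
+   string-literal concatenation, e.g.
+     printf("allocated %" PRIuS " bytes\n", nbytes);
+   which expands to "%Iu" for the MSVC runtime. */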
+
+/* Mark the systems where we know it's bad if pthreads runs too
+   early before main (before threads are initialized, presumably).  */
+#ifdef __FreeBSD__
+#define PTHREADS_CRASHES_IF_RUN_TOO_EARLY 1
+#endif
+
+/* Define to necessary symbol if this constant uses a non-standard name on
+   your system. */
+#undef PTHREAD_CREATE_JOINABLE
+
+/* Define to 1 if you have the ANSI C header files. */
+#define STDC_HEADERS 1
+
+/* the namespace where STL code like vector<> is defined */
+#define STL_NAMESPACE  std
+
+/* Version number of package */
+#undef VERSION
+
+/* C99 says: define this to get the PRI... macros from stdint.h */
+#ifndef __STDC_FORMAT_MACROS
+# define __STDC_FORMAT_MACROS 1
+#endif
+
+/* Define to `__inline__' or `__inline' if that's what the C compiler
+   calls it, or to nothing if 'inline' is not supported under any name.  */
+#ifndef __cplusplus
+#undef inline
+#endif
+
+// ---------------------------------------------------------------------
+// Extra stuff not found in config.h.in
+
+// This must be defined before the windows.h is included.  We need at
+// least 0x0400 for mutex.h to have access to TryLock, and at least
+// 0x0501 for patch_functions.cc to have access to GetModuleHandleEx.
+// (This latter is an optimization we could take out if need be.)
+#ifndef _WIN32_WINNT
+# define _WIN32_WINNT 0x0501
+#endif
+
+// We want to make sure not to ever try to #include heap-checker.h
+#define NO_HEAP_CHECK 1
+
+// TODO(csilvers): include windows/port.h in every relevant source file instead?
+#include "windows/port.h"
+
+#endif  /* GOOGLE_PERFTOOLS_WINDOWS_CONFIG_H_ */
diff --git a/src/windows/get_mangled_names.cc b/src/windows/get_mangled_names.cc
new file mode 100644
index 0000000..08bd03b
--- /dev/null
+++ b/src/windows/get_mangled_names.cc
@@ -0,0 +1,65 @@
+// -*- Mode: C++; c-basic-offset: 2; indent-tabs-mode: nil -*-
+// Copyright (c) 2008, Google Inc.
+// All rights reserved.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+// 
+//     * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+//     * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+// 
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+// 
+// ---
+// Author: Craig Silverstein (opensource@google.com)
+
+// When you are porting perftools to a new compiler or architecture
+// (win64 vs win32) for instance, you'll need to change the mangled
+// symbol names for operator new and friends at the top of
+// patch_functions.cc.  This file helps you do that.
+//
+// It does this by defining these functions with the proper signature.
+// All you need to do is compile this file and then run dumpbin on it.
+// (See http://msdn.microsoft.com/en-us/library/5x49w699.aspx for more
+// on dumpbin).  To do this in MSVC, use the MSVC commandline shell:
+//    http://msdn.microsoft.com/en-us/library/ms235639(VS.80).aspx)
+//
+// Then run:
+//    cl /c get_mangled_names.cc
+//    dumpbin /symbols get_mangled_names.obj
+//
+// It will print out the mangled (and associated unmangled) names of
+// the 8 symbols you need to put at the top of patch_functions.cc
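+//
+// For illustration only -- the exact strings depend on the compiler and
+// architecture -- a 32-bit MSVC build typically reports manglings such as
+// "??2@YAPAXI@Z" for operator new(size_t) and "??3@YAXPAX@Z" for
+// operator delete(void*); it is strings like these that patch_functions.cc
+// lists at its top.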
+
+#include <sys/types.h>   // for size_t
+#include <new>           // for nothrow_t
+
+static char m;   // some dummy memory so new doesn't return NULL.
+
+void* operator new(size_t size) { return &m; }
+void operator delete(void* p) throw() { }
+void* operator new[](size_t size) { return &m; }
+void operator delete[](void* p) throw() { }
+
+void* operator new(size_t size, const std::nothrow_t&) throw() { return &m; }
+void operator delete(void* p, const std::nothrow_t&) throw() { }
+void* operator new[](size_t size, const std::nothrow_t&) throw() { return &m; }
+void operator delete[](void* p, const std::nothrow_t&) throw() { }
diff --git a/src/windows/google/tcmalloc.h b/src/windows/google/tcmalloc.h
new file mode 100644
index 0000000..c7db631
--- /dev/null
+++ b/src/windows/google/tcmalloc.h
@@ -0,0 +1,34 @@
+/* Copyright (c) 2003, Google Inc.
+ * All rights reserved.
+ * 
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are
+ * met:
+ * 
+ *     * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following disclaimer
+ * in the documentation and/or other materials provided with the
+ * distribution.
+ *     * Neither the name of Google Inc. nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ * 
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/* The code has moved to gperftools/.  Use that include-directory for
+ * new code.
+ */
+#include <gperftools/tcmalloc.h>
diff --git a/src/windows/gperftools/tcmalloc.h b/src/windows/gperftools/tcmalloc.h
new file mode 100644
index 0000000..9ba79a9
--- /dev/null
+++ b/src/windows/gperftools/tcmalloc.h
@@ -0,0 +1,125 @@
+// -*- Mode: C++; c-basic-offset: 2; indent-tabs-mode: nil -*-
+/* Copyright (c) 2003, Google Inc.
+ * All rights reserved.
+ * 
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are
+ * met:
+ * 
+ *     * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following disclaimer
+ * in the documentation and/or other materials provided with the
+ * distribution.
+ *     * Neither the name of Google Inc. nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ * 
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * ---
+ * Author: Sanjay Ghemawat <opensource@google.com>
+ *         .h.in file by Craig Silverstein <opensource@google.com>
+ */
+
+#ifndef TCMALLOC_TCMALLOC_H_
+#define TCMALLOC_TCMALLOC_H_
+
+#include <stddef.h>                     // for size_t
+#ifdef HAVE_SYS_CDEFS_H
+#include <sys/cdefs.h>   // where glibc defines __THROW
+#endif
+
+// __THROW is defined in glibc systems.  It means, counter-intuitively,
+// "This function will never throw an exception."  It's an optional
+// optimization tool, but we may need to use it to match glibc prototypes.
+#ifndef __THROW    /* I guess we're not on a glibc system */
+# define __THROW   /* __THROW is just an optimization, so ok to make it "" */
+#endif
+
+// Define the version number so folks can check against it
+#define TC_VERSION_MAJOR  2
+#define TC_VERSION_MINOR  4
+#define TC_VERSION_PATCH  ""
+#define TC_VERSION_STRING "gperftools 2.4"
+
+#include <stdlib.h>   // for struct mallinfo, if it's defined
+
+// Annoying stuff for windows -- makes sure clients can import these functions
+#ifndef PERFTOOLS_DLL_DECL
+# ifdef _WIN32
+#   define PERFTOOLS_DLL_DECL  __declspec(dllimport)
+# else
+#   define PERFTOOLS_DLL_DECL
+# endif
+#endif
+
+#ifdef __cplusplus
+namespace std {
+struct nothrow_t;
+}
+
+extern "C" {
+#endif
+  // Returns a human-readable version string.  If major, minor,
+  // and/or patch are not NULL, they are set to the major version,
+  // minor version, and patch-code (a string, usually "").
+  PERFTOOLS_DLL_DECL const char* tc_version(int* major, int* minor,
+                                            const char** patch) __THROW;
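+  //
+  // Minimal usage sketch (illustrative):
+  //   int major, minor;
+  //   const char* patch;
+  //   const char* full = tc_version(&major, &minor, &patch);
+  //   // full is e.g. "gperftools 2.4"; major == 2, minor == 4, patch == "".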
+
+  PERFTOOLS_DLL_DECL void* tc_malloc(size_t size) __THROW;
+  PERFTOOLS_DLL_DECL void* tc_malloc_skip_new_handler(size_t size) __THROW;
+  PERFTOOLS_DLL_DECL void tc_free(void* ptr) __THROW;
+  PERFTOOLS_DLL_DECL void* tc_realloc(void* ptr, size_t size) __THROW;
+  PERFTOOLS_DLL_DECL void* tc_calloc(size_t nmemb, size_t size) __THROW;
+  PERFTOOLS_DLL_DECL void tc_cfree(void* ptr) __THROW;
+
+  PERFTOOLS_DLL_DECL void* tc_memalign(size_t __alignment,
+                                       size_t __size) __THROW;
+  PERFTOOLS_DLL_DECL int tc_posix_memalign(void** ptr,
+                                           size_t align, size_t size) __THROW;
+  PERFTOOLS_DLL_DECL void* tc_valloc(size_t __size) __THROW;
+  PERFTOOLS_DLL_DECL void* tc_pvalloc(size_t __size) __THROW;
+
+  PERFTOOLS_DLL_DECL void tc_malloc_stats(void) __THROW;
+  PERFTOOLS_DLL_DECL int tc_mallopt(int cmd, int value) __THROW;
+#if 0
+  PERFTOOLS_DLL_DECL struct mallinfo tc_mallinfo(void) __THROW;
+#endif
+
+  // This is an alias for MallocExtension::instance()->GetAllocatedSize().
+  // It is equivalent to
+  //    OS X: malloc_size()
+  //    glibc: malloc_usable_size()
+  //    Windows: _msize()
+  PERFTOOLS_DLL_DECL size_t tc_malloc_size(void* ptr) __THROW;
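+  //
+  // For example (illustrative): for p = tc_malloc(100), tc_malloc_size(p)
+  // reports the usable size of the block actually handed out, which is at
+  // least 100 bytes.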
+
+#ifdef __cplusplus
+  PERFTOOLS_DLL_DECL int tc_set_new_mode(int flag) __THROW;
+  PERFTOOLS_DLL_DECL void* tc_new(size_t size);
+  PERFTOOLS_DLL_DECL void* tc_new_nothrow(size_t size,
+                                          const std::nothrow_t&) __THROW;
+  PERFTOOLS_DLL_DECL void tc_delete(void* p) __THROW;
+  PERFTOOLS_DLL_DECL void tc_delete_nothrow(void* p,
+                                            const std::nothrow_t&) __THROW;
+  PERFTOOLS_DLL_DECL void* tc_newarray(size_t size);
+  PERFTOOLS_DLL_DECL void* tc_newarray_nothrow(size_t size,
+                                               const std::nothrow_t&) __THROW;
+  PERFTOOLS_DLL_DECL void tc_deletearray(void* p) __THROW;
+  PERFTOOLS_DLL_DECL void tc_deletearray_nothrow(void* p,
+                                                 const std::nothrow_t&) __THROW;
+}
+#endif
+
+#endif  // #ifndef TCMALLOC_TCMALLOC_H_
diff --git a/src/windows/gperftools/tcmalloc.h.in b/src/windows/gperftools/tcmalloc.h.in
new file mode 100644
index 0000000..7458de1
--- /dev/null
+++ b/src/windows/gperftools/tcmalloc.h.in
@@ -0,0 +1,125 @@
+// -*- Mode: C++; c-basic-offset: 2; indent-tabs-mode: nil -*-
+/* Copyright (c) 2003, Google Inc.
+ * All rights reserved.
+ * 
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are
+ * met:
+ * 
+ *     * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following disclaimer
+ * in the documentation and/or other materials provided with the
+ * distribution.
+ *     * Neither the name of Google Inc. nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ * 
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * ---
+ * Author: Sanjay Ghemawat <opensource@google.com>
+ *         .h.in file by Craig Silverstein <opensource@google.com>
+ */
+
+#ifndef TCMALLOC_TCMALLOC_H_
+#define TCMALLOC_TCMALLOC_H_
+
+#include <stddef.h>                     // for size_t
+#ifdef HAVE_SYS_CDEFS_H
+#include <sys/cdefs.h>   // where glibc defines __THROW
+#endif
+
+// __THROW is defined in glibc systems.  It means, counter-intuitively,
+// "This function will never throw an exception."  It's an optional
+// optimization tool, but we may need to use it to match glibc prototypes.
+#ifndef __THROW    /* I guess we're not on a glibc system */
+# define __THROW   /* __THROW is just an optimization, so ok to make it "" */
+#endif
+
+// Define the version number so folks can check against it
+#define TC_VERSION_MAJOR  @TC_VERSION_MAJOR@
+#define TC_VERSION_MINOR  @TC_VERSION_MINOR@
+#define TC_VERSION_PATCH  "@TC_VERSION_PATCH@"
+#define TC_VERSION_STRING "gperftools @TC_VERSION_MAJOR@.@TC_VERSION_MINOR@@TC_VERSION_PATCH@"
+
+#include <stdlib.h>   // for struct mallinfo, if it's defined
+
+// Annoying stuff for windows -- makes sure clients can import these functions
+#ifndef PERFTOOLS_DLL_DECL
+# ifdef _WIN32
+#   define PERFTOOLS_DLL_DECL  __declspec(dllimport)
+# else
+#   define PERFTOOLS_DLL_DECL
+# endif
+#endif
+
+#ifdef __cplusplus
+namespace std {
+struct nothrow_t;
+}
+
+extern "C" {
+#endif
+  // Returns a human-readable version string.  If major, minor,
+  // and/or patch are not NULL, they are set to the major version,
+  // minor version, and patch-code (a string, usually "").
+  PERFTOOLS_DLL_DECL const char* tc_version(int* major, int* minor,
+                                            const char** patch) __THROW;
+
+  PERFTOOLS_DLL_DECL void* tc_malloc(size_t size) __THROW;
+  PERFTOOLS_DLL_DECL void* tc_malloc_skip_new_handler(size_t size) __THROW;
+  PERFTOOLS_DLL_DECL void tc_free(void* ptr) __THROW;
+  PERFTOOLS_DLL_DECL void* tc_realloc(void* ptr, size_t size) __THROW;
+  PERFTOOLS_DLL_DECL void* tc_calloc(size_t nmemb, size_t size) __THROW;
+  PERFTOOLS_DLL_DECL void tc_cfree(void* ptr) __THROW;
+
+  PERFTOOLS_DLL_DECL void* tc_memalign(size_t __alignment,
+                                       size_t __size) __THROW;
+  PERFTOOLS_DLL_DECL int tc_posix_memalign(void** ptr,
+                                           size_t align, size_t size) __THROW;
+  PERFTOOLS_DLL_DECL void* tc_valloc(size_t __size) __THROW;
+  PERFTOOLS_DLL_DECL void* tc_pvalloc(size_t __size) __THROW;
+
+  PERFTOOLS_DLL_DECL void tc_malloc_stats(void) __THROW;
+  PERFTOOLS_DLL_DECL int tc_mallopt(int cmd, int value) __THROW;
+#if 0
+  PERFTOOLS_DLL_DECL struct mallinfo tc_mallinfo(void) __THROW;
+#endif
+
+  // This is an alias for MallocExtension::instance()->GetAllocatedSize().
+  // It is equivalent to
+  //    OS X: malloc_size()
+  //    glibc: malloc_usable_size()
+  //    Windows: _msize()
+  PERFTOOLS_DLL_DECL size_t tc_malloc_size(void* ptr) __THROW;
+
+#ifdef __cplusplus
+  PERFTOOLS_DLL_DECL int tc_set_new_mode(int flag) __THROW;
+  PERFTOOLS_DLL_DECL void* tc_new(size_t size);
+  PERFTOOLS_DLL_DECL void* tc_new_nothrow(size_t size,
+                                          const std::nothrow_t&) __THROW;
+  PERFTOOLS_DLL_DECL void tc_delete(void* p) __THROW;
+  PERFTOOLS_DLL_DECL void tc_delete_nothrow(void* p,
+                                            const std::nothrow_t&) __THROW;
+  PERFTOOLS_DLL_DECL void* tc_newarray(size_t size);
+  PERFTOOLS_DLL_DECL void* tc_newarray_nothrow(size_t size,
+                                               const std::nothrow_t&) __THROW;
+  PERFTOOLS_DLL_DECL void tc_deletearray(void* p) __THROW;
+  PERFTOOLS_DLL_DECL void tc_deletearray_nothrow(void* p,
+                                                 const std::nothrow_t&) __THROW;
+}
+#endif
+
+#endif  // #ifndef TCMALLOC_TCMALLOC_H_
diff --git a/src/windows/ia32_modrm_map.cc b/src/windows/ia32_modrm_map.cc
new file mode 100644
index 0000000..f1f1906
--- /dev/null
+++ b/src/windows/ia32_modrm_map.cc
@@ -0,0 +1,121 @@
+/* Copyright (c) 2007, Google Inc.
+ * All rights reserved.
+ * 
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are
+ * met:
+ * 
+ *     * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following disclaimer
+ * in the documentation and/or other materials provided with the
+ * distribution.
+ *     * Neither the name of Google Inc. nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ * 
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * ---
+ * Author: Joi Sigurdsson
+ *
+ * Table of relevant information about how to decode the ModR/M byte.
+ * Based on information in the IA-32 Intel® Architecture
+ * Software Developer's Manual Volume 2: Instruction Set Reference.
+ */
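+
+/* As the groupings below show, each map has 32 entries indexed by
+ * (mod << 3) | r/m from the ModR/M byte; each entry appears to record, in
+ * order, whether a displacement follows the ModR/M (or SIB) byte, whether a
+ * SIB byte is present, and the size of that displacement -- e.g. in the
+ * 32-bit map, mod == 00 / r/m == 101 is the lone disp32-only form and
+ * mod == 00 / r/m == 100 is the SIB-without-displacement form. */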
+
+#include "mini_disassembler.h"
+#include "mini_disassembler_types.h"
+
+namespace sidestep {
+
+const ModrmEntry MiniDisassembler::s_ia16_modrm_map_[] = {
+// mod == 00
+  /* r/m == 000 */ { false, false, OS_ZERO },
+  /* r/m == 001 */ { false, false, OS_ZERO },
+  /* r/m == 010 */ { false, false, OS_ZERO },
+  /* r/m == 011 */ { false, false, OS_ZERO },
+  /* r/m == 100 */ { false, false, OS_ZERO },
+  /* r/m == 101 */ { false, false, OS_ZERO },
+  /* r/m == 110 */ { true, false, OS_WORD },
+  /* r/m == 111 */ { false, false, OS_ZERO }, 
+// mod == 01
+  /* r/m == 000 */ { true, false, OS_BYTE },
+  /* r/m == 001 */ { true, false, OS_BYTE },
+  /* r/m == 010 */ { true, false, OS_BYTE },
+  /* r/m == 011 */ { true, false, OS_BYTE },
+  /* r/m == 100 */ { true, false, OS_BYTE },
+  /* r/m == 101 */ { true, false, OS_BYTE },
+  /* r/m == 110 */ { true, false, OS_BYTE },
+  /* r/m == 111 */ { true, false, OS_BYTE }, 
+// mod == 10
+  /* r/m == 000 */ { true, false, OS_WORD },
+  /* r/m == 001 */ { true, false, OS_WORD },
+  /* r/m == 010 */ { true, false, OS_WORD },
+  /* r/m == 011 */ { true, false, OS_WORD },
+  /* r/m == 100 */ { true, false, OS_WORD },
+  /* r/m == 101 */ { true, false, OS_WORD },
+  /* r/m == 110 */ { true, false, OS_WORD },
+  /* r/m == 111 */ { true, false, OS_WORD }, 
+// mod == 11
+  /* r/m == 000 */ { false, false, OS_ZERO },
+  /* r/m == 001 */ { false, false, OS_ZERO },
+  /* r/m == 010 */ { false, false, OS_ZERO },
+  /* r/m == 011 */ { false, false, OS_ZERO },
+  /* r/m == 100 */ { false, false, OS_ZERO },
+  /* r/m == 101 */ { false, false, OS_ZERO },
+  /* r/m == 110 */ { false, false, OS_ZERO },
+  /* r/m == 111 */ { false, false, OS_ZERO }
+};
+
+const ModrmEntry MiniDisassembler::s_ia32_modrm_map_[] = {
+// mod == 00
+  /* r/m == 000 */ { false, false, OS_ZERO },
+  /* r/m == 001 */ { false, false, OS_ZERO },
+  /* r/m == 010 */ { false, false, OS_ZERO },
+  /* r/m == 011 */ { false, false, OS_ZERO },
+  /* r/m == 100 */ { false, true, OS_ZERO },
+  /* r/m == 101 */ { true, false, OS_DOUBLE_WORD },
+  /* r/m == 110 */ { false, false, OS_ZERO },
+  /* r/m == 111 */ { false, false, OS_ZERO }, 
+// mod == 01
+  /* r/m == 000 */ { true, false, OS_BYTE },
+  /* r/m == 001 */ { true, false, OS_BYTE },
+  /* r/m == 010 */ { true, false, OS_BYTE },
+  /* r/m == 011 */ { true, false, OS_BYTE },
+  /* r/m == 100 */ { true, true, OS_BYTE },
+  /* r/m == 101 */ { true, false, OS_BYTE },
+  /* r/m == 110 */ { true, false, OS_BYTE },
+  /* r/m == 111 */ { true, false, OS_BYTE }, 
+// mod == 10
+  /* r/m == 000 */ { true, false, OS_DOUBLE_WORD },
+  /* r/m == 001 */ { true, false, OS_DOUBLE_WORD },
+  /* r/m == 010 */ { true, false, OS_DOUBLE_WORD },
+  /* r/m == 011 */ { true, false, OS_DOUBLE_WORD },
+  /* r/m == 100 */ { true, true, OS_DOUBLE_WORD },
+  /* r/m == 101 */ { true, false, OS_DOUBLE_WORD },
+  /* r/m == 110 */ { true, false, OS_DOUBLE_WORD },
+  /* r/m == 111 */ { true, false, OS_DOUBLE_WORD }, 
+// mod == 11
+  /* r/m == 000 */ { false, false, OS_ZERO },
+  /* r/m == 001 */ { false, false, OS_ZERO },
+  /* r/m == 010 */ { false, false, OS_ZERO },
+  /* r/m == 011 */ { false, false, OS_ZERO },
+  /* r/m == 100 */ { false, false, OS_ZERO },
+  /* r/m == 101 */ { false, false, OS_ZERO },
+  /* r/m == 110 */ { false, false, OS_ZERO },
+  /* r/m == 111 */ { false, false, OS_ZERO },
+};
+
+};  // namespace sidestep
diff --git a/src/windows/ia32_opcode_map.cc b/src/windows/ia32_opcode_map.cc
new file mode 100644
index 0000000..ba6a79e
--- /dev/null
+++ b/src/windows/ia32_opcode_map.cc
@@ -0,0 +1,1219 @@
+/* Copyright (c) 2007, Google Inc.
+ * All rights reserved.
+ * 
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are
+ * met:
+ * 
+ *     * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following disclaimer
+ * in the documentation and/or other materials provided with the
+ * distribution.
+ *     * Neither the name of Google Inc. nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ * 
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * ---
+ * Author: Joi Sigurdsson
+ *
+ * Opcode decoding maps.  Based on the IA-32 Intel® Architecture
+ * Software Developer's Manual Volume 2: Instruction Set Reference.  Idea
+ * for how to lay out the tables in memory taken from the implementation
+ * in the Bastard disassembly environment.
+ */
+
+#include "mini_disassembler.h"
+
+namespace sidestep {
+
+/*
+ * This is the first table to be searched; the first field of each
+ * Opcode in the table is either 0, to indicate you're already in the
+ * right table, or an index into the correct table in the global
+ * map g_pentiumOpcodeMap.
+ */
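+/* For example, entry 0x0F below carries table index 1 (IT_REFERENCE), so
+ * decoding of the 0F-prefixed two-byte opcodes continues in that table of
+ * g_pentiumOpcodeMap, whereas entries whose index is 0 (IT_GENERIC, IT_JUMP,
+ * IT_PREFIX, ...) are decoded directly from this table. */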
+const Opcode s_first_opcode_byte[] = {
+  /* 0x0 */ { 0, IT_GENERIC, AM_E | OT_B, AM_G | OT_B, AM_NOT_USED, "add", false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0x1 */ { 0, IT_GENERIC, AM_E | OT_V, AM_G | OT_V, AM_NOT_USED, "add", false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0x2 */ { 0, IT_GENERIC, AM_G | OT_B, AM_E | OT_B, AM_NOT_USED, "add", false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0x3 */ { 0, IT_GENERIC, AM_G | OT_V, AM_E | OT_V, AM_NOT_USED, "add", false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0x4 */ { 0, IT_GENERIC, AM_REGISTER | OT_B, AM_I | OT_B, AM_NOT_USED, "add", false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0x5 */ { 0, IT_GENERIC, AM_REGISTER | OT_V, AM_I | OT_V, AM_NOT_USED, "add", false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0x6 */ { 0, IT_GENERIC, AM_REGISTER | OT_W, AM_NOT_USED, AM_NOT_USED, "push", false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0x7 */ { 0, IT_GENERIC, AM_REGISTER | OT_W, AM_NOT_USED, AM_NOT_USED, "pop", false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0x8 */ { 0, IT_GENERIC, AM_E | OT_B, AM_G | OT_B, AM_NOT_USED, "or", false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0x9 */ { 0, IT_GENERIC, AM_E | OT_V, AM_G | OT_V, AM_NOT_USED, "or", false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0xA */ { 0, IT_GENERIC, AM_G | OT_B, AM_E | OT_B, AM_NOT_USED, "or", false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0xB */ { 0, IT_GENERIC, AM_G | OT_V, AM_E | OT_V, AM_NOT_USED, "or", false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0xC */ { 0, IT_GENERIC, AM_REGISTER | OT_B, AM_I | OT_B, AM_NOT_USED, "or", false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0xD */ { 0, IT_GENERIC, AM_REGISTER | OT_V, AM_I | OT_V, AM_NOT_USED, "or", false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0xE */ { 0, IT_GENERIC, AM_REGISTER | OT_W, AM_NOT_USED, AM_NOT_USED, "push", false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0xF */ { 1, IT_REFERENCE, AM_NOT_USED, AM_NOT_USED, AM_NOT_USED, 0, false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0x10 */ { 0, IT_GENERIC, AM_E | OT_B, AM_G | OT_B, AM_NOT_USED, "adc", false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0x11 */ { 0, IT_GENERIC, AM_E | OT_V, AM_G | OT_V, AM_NOT_USED, "adc", false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0x12 */ { 0, IT_GENERIC, AM_G | OT_B, AM_E | OT_B, AM_NOT_USED, "adc", false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0x13 */ { 0, IT_GENERIC, AM_G | OT_V, AM_E | OT_V, AM_NOT_USED, "adc", false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0x14 */ { 0, IT_GENERIC, AM_REGISTER | OT_B, AM_I | OT_B, AM_NOT_USED, "adc", false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0x15 */ { 0, IT_GENERIC, AM_REGISTER | OT_V, AM_I | OT_V, AM_NOT_USED, "adc", false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0x16 */ { 0, IT_GENERIC, AM_REGISTER | OT_W, AM_NOT_USED, AM_NOT_USED, "push", false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0x17 */ { 0, IT_GENERIC, AM_REGISTER | OT_W, AM_NOT_USED, AM_NOT_USED, "pop", false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0x18 */ { 0, IT_GENERIC, AM_E | OT_B, AM_G | OT_B, AM_NOT_USED, "sbb", false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0x19 */ { 0, IT_GENERIC, AM_E | OT_V, AM_G | OT_V, AM_NOT_USED, "sbb", false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0x1A */ { 0, IT_GENERIC, AM_G | OT_B, AM_E | OT_B, AM_NOT_USED, "sbb", false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0x1B */ { 0, IT_GENERIC, AM_G | OT_V, AM_E | OT_V, AM_NOT_USED, "sbb", false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0x1C */ { 0, IT_GENERIC, AM_REGISTER | OT_B, AM_I | OT_B, AM_NOT_USED, "sbb", false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0x1D */ { 0, IT_GENERIC, AM_REGISTER | OT_V, AM_I | OT_V, AM_NOT_USED, "sbb", false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0x1E */ { 0, IT_GENERIC, AM_REGISTER | OT_W, AM_NOT_USED, AM_NOT_USED, "push", false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0x1F */ { 0, IT_GENERIC, AM_REGISTER | OT_W, AM_NOT_USED, AM_NOT_USED, "pop", false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0x20 */ { 0, IT_GENERIC, AM_E | OT_B, AM_G | OT_B, AM_NOT_USED, "and", false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0x21 */ { 0, IT_GENERIC, AM_E | OT_V, AM_G | OT_V, AM_NOT_USED, "and", false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0x22 */ { 0, IT_GENERIC, AM_G | OT_B, AM_E | OT_B, AM_NOT_USED, "and", false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0x23 */ { 0, IT_GENERIC, AM_G | OT_V, AM_E | OT_V, AM_NOT_USED, "and", false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0x24 */ { 0, IT_GENERIC, AM_REGISTER | OT_B, AM_I | OT_B, AM_NOT_USED, "and", false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0x25 */ { 0, IT_GENERIC, AM_REGISTER | OT_V, AM_I | OT_V, AM_NOT_USED, "and", false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0x26 */ { 0, IT_PREFIX, AM_NOT_USED, AM_NOT_USED, AM_NOT_USED, 0, false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0x27 */ { 0, IT_GENERIC, AM_NOT_USED, AM_NOT_USED, AM_NOT_USED, "daa", false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0x28 */ { 0, IT_GENERIC, AM_E | OT_B, AM_G | OT_B, AM_NOT_USED, "sub", false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0x29 */ { 0, IT_GENERIC, AM_E | OT_V, AM_G | OT_V, AM_NOT_USED, "sub", false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0x2A */ { 0, IT_GENERIC, AM_G | OT_B, AM_E | OT_B, AM_NOT_USED, "sub", false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0x2B */ { 0, IT_GENERIC, AM_G | OT_V, AM_E | OT_V, AM_NOT_USED, "sub", false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0x2C */ { 0, IT_GENERIC, AM_REGISTER | OT_B, AM_I | OT_B, AM_NOT_USED, "sub", false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0x2D */ { 0, IT_GENERIC, AM_REGISTER | OT_V, AM_I | OT_V, AM_NOT_USED, "sub", false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0x2E */ { 0, IT_PREFIX, AM_NOT_USED, AM_NOT_USED, AM_NOT_USED, 0, false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0x2F */ { 0, IT_GENERIC, AM_NOT_USED, AM_NOT_USED, AM_NOT_USED, "das", false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0x30 */ { 0, IT_GENERIC, AM_E | OT_B, AM_G | OT_B, AM_NOT_USED, "xor", false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0x31 */ { 0, IT_GENERIC, AM_E | OT_V, AM_G | OT_V, AM_NOT_USED, "xor", false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0x32 */ { 0, IT_GENERIC, AM_G | OT_B, AM_E | OT_B, AM_NOT_USED, "xor", false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0x33 */ { 0, IT_GENERIC, AM_G | OT_V, AM_E | OT_V, AM_NOT_USED, "xor", false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0x34 */ { 0, IT_GENERIC, AM_REGISTER | OT_B, AM_I | OT_B, AM_NOT_USED, "xor", false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0x35 */ { 0, IT_GENERIC, AM_REGISTER | OT_V, AM_I | OT_V, AM_NOT_USED, "xor", false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0x36 */ { 0, IT_PREFIX, AM_NOT_USED, AM_NOT_USED, AM_NOT_USED, 0, false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0x37 */ { 0, IT_GENERIC, AM_NOT_USED, AM_NOT_USED, AM_NOT_USED, "aaa", false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0x38 */ { 0, IT_GENERIC, AM_E | OT_B, AM_G | OT_B, AM_NOT_USED, "cmp", false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0x39 */ { 0, IT_GENERIC, AM_E | OT_V, AM_G | OT_V, AM_NOT_USED, "cmp", false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0x3A */ { 0, IT_GENERIC, AM_G | OT_B, AM_E | OT_B, AM_NOT_USED, "cmp", false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0x3B */ { 0, IT_GENERIC, AM_G | OT_V, AM_E | OT_V, AM_NOT_USED, "cmp", false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0x3C */ { 0, IT_GENERIC, AM_REGISTER | OT_B, AM_I | OT_B, AM_NOT_USED, "cmp", false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0x3D */ { 0, IT_GENERIC, AM_REGISTER | OT_V, AM_I | OT_V, AM_NOT_USED, "cmp", false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0x3E */ { 0, IT_PREFIX, AM_NOT_USED, AM_NOT_USED, AM_NOT_USED, 0, false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0x3F */ { 0, IT_GENERIC, AM_NOT_USED, AM_NOT_USED, AM_NOT_USED, "aas", false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+#ifdef _M_X64
+  /* REX Prefixes in 64-bit mode. */
+  /* 0x40 */ { 0, IT_PREFIX, AM_NOT_USED, AM_NOT_USED, AM_NOT_USED, 0, false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0x41 */ { 0, IT_PREFIX, AM_NOT_USED, AM_NOT_USED, AM_NOT_USED, 0, false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0x42 */ { 0, IT_PREFIX, AM_NOT_USED, AM_NOT_USED, AM_NOT_USED, 0, false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0x43 */ { 0, IT_PREFIX, AM_NOT_USED, AM_NOT_USED, AM_NOT_USED, 0, false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0x44 */ { 0, IT_PREFIX, AM_NOT_USED, AM_NOT_USED, AM_NOT_USED, 0, false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0x45 */ { 0, IT_PREFIX, AM_NOT_USED, AM_NOT_USED, AM_NOT_USED, 0, false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0x46 */ { 0, IT_PREFIX, AM_NOT_USED, AM_NOT_USED, AM_NOT_USED, 0, false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0x47 */ { 0, IT_PREFIX, AM_NOT_USED, AM_NOT_USED, AM_NOT_USED, 0, false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0x48 */ { 0, IT_PREFIX, AM_NOT_USED, AM_NOT_USED, AM_NOT_USED, 0, false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0x49 */ { 0, IT_PREFIX, AM_NOT_USED, AM_NOT_USED, AM_NOT_USED, 0, false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0x4A */ { 0, IT_PREFIX, AM_NOT_USED, AM_NOT_USED, AM_NOT_USED, 0, false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0x4B */ { 0, IT_PREFIX, AM_NOT_USED, AM_NOT_USED, AM_NOT_USED, 0, false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0x4C */ { 0, IT_PREFIX, AM_NOT_USED, AM_NOT_USED, AM_NOT_USED, 0, false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0x4D */ { 0, IT_PREFIX, AM_NOT_USED, AM_NOT_USED, AM_NOT_USED, 0, false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0x4E */ { 0, IT_PREFIX, AM_NOT_USED, AM_NOT_USED, AM_NOT_USED, 0, false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0x4F */ { 0, IT_PREFIX, AM_NOT_USED, AM_NOT_USED, AM_NOT_USED, 0, false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+#else
+  /* 0x40 */ { 0, IT_GENERIC, AM_REGISTER | OT_V, AM_NOT_USED, AM_NOT_USED, "inc", false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0x41 */ { 0, IT_GENERIC, AM_REGISTER | OT_V, AM_NOT_USED, AM_NOT_USED, "inc", false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0x42 */ { 0, IT_GENERIC, AM_REGISTER | OT_V, AM_NOT_USED, AM_NOT_USED, "inc", false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0x43 */ { 0, IT_GENERIC, AM_REGISTER | OT_V, AM_NOT_USED, AM_NOT_USED, "inc", false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0x44 */ { 0, IT_GENERIC, AM_REGISTER | OT_V, AM_NOT_USED, AM_NOT_USED, "inc", false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0x45 */ { 0, IT_GENERIC, AM_REGISTER | OT_V, AM_NOT_USED, AM_NOT_USED, "inc", false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0x46 */ { 0, IT_GENERIC, AM_REGISTER | OT_V, AM_NOT_USED, AM_NOT_USED, "inc", false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0x47 */ { 0, IT_GENERIC, AM_REGISTER | OT_V, AM_NOT_USED, AM_NOT_USED, "inc", false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0x48 */ { 0, IT_GENERIC, AM_REGISTER | OT_V, AM_NOT_USED, AM_NOT_USED, "dec", false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0x49 */ { 0, IT_GENERIC, AM_REGISTER | OT_V, AM_NOT_USED, AM_NOT_USED, "dec", false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0x4A */ { 0, IT_GENERIC, AM_REGISTER | OT_V, AM_NOT_USED, AM_NOT_USED, "dec", false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0x4B */ { 0, IT_GENERIC, AM_REGISTER | OT_V, AM_NOT_USED, AM_NOT_USED, "dec", false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0x4C */ { 0, IT_GENERIC, AM_REGISTER | OT_V, AM_NOT_USED, AM_NOT_USED, "dec", false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0x4D */ { 0, IT_GENERIC, AM_REGISTER | OT_V, AM_NOT_USED, AM_NOT_USED, "dec", false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0x4E */ { 0, IT_GENERIC, AM_REGISTER | OT_V, AM_NOT_USED, AM_NOT_USED, "dec", false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0x4F */ { 0, IT_GENERIC, AM_REGISTER | OT_V, AM_NOT_USED, AM_NOT_USED, "dec", false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+#endif
+  /* 0x50 */ { 0, IT_GENERIC, AM_REGISTER | OT_V, AM_NOT_USED, AM_NOT_USED, "push", false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0x51 */ { 0, IT_GENERIC, AM_REGISTER | OT_V, AM_NOT_USED, AM_NOT_USED, "push", false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0x52 */ { 0, IT_GENERIC, AM_REGISTER | OT_V, AM_NOT_USED, AM_NOT_USED, "push", false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0x53 */ { 0, IT_GENERIC, AM_REGISTER | OT_V, AM_NOT_USED, AM_NOT_USED, "push", false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0x54 */ { 0, IT_GENERIC, AM_REGISTER | OT_V, AM_NOT_USED, AM_NOT_USED, "push", false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0x55 */ { 0, IT_GENERIC, AM_REGISTER | OT_V, AM_NOT_USED, AM_NOT_USED, "push", false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0x56 */ { 0, IT_GENERIC, AM_REGISTER | OT_V, AM_NOT_USED, AM_NOT_USED, "push", false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0x57 */ { 0, IT_GENERIC, AM_REGISTER | OT_V, AM_NOT_USED, AM_NOT_USED, "push", false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0x58 */ { 0, IT_GENERIC, AM_REGISTER | OT_V, AM_NOT_USED, AM_NOT_USED, "pop", false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0x59 */ { 0, IT_GENERIC, AM_REGISTER | OT_V, AM_NOT_USED, AM_NOT_USED, "pop", false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0x5A */ { 0, IT_GENERIC, AM_REGISTER | OT_V, AM_NOT_USED, AM_NOT_USED, "pop", false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0x5B */ { 0, IT_GENERIC, AM_REGISTER | OT_V, AM_NOT_USED, AM_NOT_USED, "pop", false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0x5C */ { 0, IT_GENERIC, AM_REGISTER | OT_V, AM_NOT_USED, AM_NOT_USED, "pop", false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0x5D */ { 0, IT_GENERIC, AM_REGISTER | OT_V, AM_NOT_USED, AM_NOT_USED, "pop", false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0x5E */ { 0, IT_GENERIC, AM_REGISTER | OT_V, AM_NOT_USED, AM_NOT_USED, "pop", false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0x5F */ { 0, IT_GENERIC, AM_REGISTER | OT_V, AM_NOT_USED, AM_NOT_USED, "pop", false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0x60 */ { 0, IT_GENERIC, AM_NOT_USED, AM_NOT_USED, AM_NOT_USED, "pushad", false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0x61 */ { 0, IT_GENERIC, AM_NOT_USED, AM_NOT_USED, AM_NOT_USED, "popad", false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0x62 */ { 0, IT_GENERIC, AM_G | OT_V, AM_M | OT_A, AM_NOT_USED, "bound", false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0x63 */ { 0, IT_GENERIC, AM_E | OT_W, AM_G | OT_W, AM_NOT_USED, "arpl", false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0x64 */ { 0, IT_PREFIX, AM_NOT_USED, AM_NOT_USED, AM_NOT_USED, 0, false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0x65 */ { 0, IT_PREFIX, AM_NOT_USED, AM_NOT_USED, AM_NOT_USED, 0, false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0x66 */ { 0, IT_PREFIX_OPERAND, AM_NOT_USED, AM_NOT_USED, AM_NOT_USED, 0, false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0x67 */ { 0, IT_PREFIX_ADDRESS, AM_NOT_USED, AM_NOT_USED, AM_NOT_USED, 0, false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0x68 */ { 0, IT_GENERIC, AM_I | OT_V, AM_NOT_USED, AM_NOT_USED, "push", false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0x69 */ { 0, IT_GENERIC, AM_G | OT_V, AM_E | OT_V, AM_I | OT_V, "imul", false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0x6A */ { 0, IT_GENERIC, AM_I | OT_B, AM_NOT_USED, AM_NOT_USED, "push", false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0x6B */ { 0, IT_GENERIC, AM_G | OT_V, AM_E | OT_V, AM_I |  OT_B, "imul", false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0x6C */ { 0, IT_GENERIC, AM_Y | OT_B, AM_REGISTER | OT_B, AM_NOT_USED, "insb", false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0x6D */ { 0, IT_GENERIC, AM_Y | OT_V, AM_REGISTER | OT_V, AM_NOT_USED, "insd", false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0x6E */ { 0, IT_GENERIC, AM_REGISTER | OT_B, AM_X | OT_B, AM_NOT_USED, "outsb", false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0x6F */ { 0, IT_GENERIC, AM_REGISTER | OT_V, AM_X | OT_V, AM_NOT_USED, "outsd", false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0x70 */ { 0, IT_JUMP, AM_J | OT_B, AM_NOT_USED, AM_NOT_USED, "jo", false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0x71 */ { 0, IT_JUMP, AM_J | OT_B, AM_NOT_USED, AM_NOT_USED, "jno", false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0x72 */ { 0, IT_JUMP, AM_J | OT_B, AM_NOT_USED, AM_NOT_USED, "jc", false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0x73 */ { 0, IT_JUMP, AM_J | OT_B, AM_NOT_USED, AM_NOT_USED, "jnc", false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0x74 */ { 0, IT_JUMP, AM_J | OT_B, AM_NOT_USED, AM_NOT_USED, "jz", false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0x75 */ { 0, IT_JUMP, AM_J | OT_B, AM_NOT_USED, AM_NOT_USED, "jnz", false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0x76 */ { 0, IT_JUMP, AM_J | OT_B, AM_NOT_USED, AM_NOT_USED, "jbe", false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0x77 */ { 0, IT_JUMP, AM_J | OT_B, AM_NOT_USED, AM_NOT_USED, "ja", false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0x78 */ { 0, IT_JUMP, AM_J | OT_B, AM_NOT_USED, AM_NOT_USED, "js", false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0x79 */ { 0, IT_JUMP, AM_J | OT_B, AM_NOT_USED, AM_NOT_USED, "jns", false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0x7A */ { 0, IT_JUMP, AM_J | OT_B, AM_NOT_USED, AM_NOT_USED, "jpe", false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0x7B */ { 0, IT_JUMP, AM_J | OT_B, AM_NOT_USED, AM_NOT_USED, "jpo", false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0x7C */ { 0, IT_JUMP, AM_J | OT_B, AM_NOT_USED, AM_NOT_USED, "jl", false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0x7D */ { 0, IT_JUMP, AM_J | OT_B, AM_NOT_USED, AM_NOT_USED, "jge", false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0x7E */ { 0, IT_JUMP, AM_J | OT_B, AM_NOT_USED, AM_NOT_USED, "jle", false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0x7F */ { 0, IT_JUMP, AM_J | OT_B, AM_NOT_USED, AM_NOT_USED, "jg", false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0x80 */ { 2, IT_REFERENCE, AM_E | OT_B, AM_I | OT_B, AM_NOT_USED, 0, false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0x81 */ { 3, IT_REFERENCE, AM_E | OT_V, AM_I | OT_V, AM_NOT_USED, 0, false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0x82 */ { 4, IT_REFERENCE, AM_E | OT_V, AM_I | OT_B, AM_NOT_USED, 0, false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0x83 */ { 5, IT_REFERENCE, AM_E | OT_V, AM_I | OT_B, AM_NOT_USED, 0, false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0x84 */ { 0, IT_GENERIC, AM_E | OT_B, AM_G | OT_B, AM_NOT_USED, "test", false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0x85 */ { 0, IT_GENERIC, AM_E | OT_V, AM_G | OT_V, AM_NOT_USED, "test", false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0x86 */ { 0, IT_GENERIC, AM_E | OT_B, AM_G | OT_B, AM_NOT_USED, "xchg", false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0x87 */ { 0, IT_GENERIC, AM_E | OT_V, AM_G | OT_V, AM_NOT_USED, "xchg", false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0x88 */ { 0, IT_GENERIC, AM_E | OT_B, AM_G | OT_B, AM_NOT_USED, "mov", false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0x89 */ { 0, IT_GENERIC, AM_E | OT_V, AM_G | OT_V, AM_NOT_USED, "mov", false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0x8A */ { 0, IT_GENERIC, AM_G | OT_B, AM_E | OT_B, AM_NOT_USED, "mov", false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0x8B */ { 0, IT_GENERIC, AM_G | OT_V, AM_E | OT_V, AM_NOT_USED, "mov", false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0x8C */ { 0, IT_GENERIC, AM_E | OT_W, AM_S | OT_W, AM_NOT_USED, "mov", false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0x8D */ { 0, IT_GENERIC, AM_G | OT_V, AM_M | OT_ADDRESS_MODE_M, AM_NOT_USED, "lea", false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0x8E */ { 0, IT_GENERIC, AM_S | OT_W, AM_E | OT_W, AM_NOT_USED, "mov", false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0x8F */ { 0, IT_GENERIC, AM_E | OT_V, AM_NOT_USED, AM_NOT_USED, "pop", false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0x90 */ { 0, IT_GENERIC, AM_NOT_USED, AM_NOT_USED, AM_NOT_USED, "nop", false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0x91 */ { 0, IT_GENERIC, AM_REGISTER | OT_V, AM_REGISTER | OT_V, AM_NOT_USED, "xchg", false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0x92 */ { 0, IT_GENERIC, AM_REGISTER | OT_V, AM_REGISTER | OT_V, AM_NOT_USED, "xchg", false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0x93 */ { 0, IT_GENERIC, AM_REGISTER | OT_V, AM_REGISTER | OT_V, AM_NOT_USED, "xchg", false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0x94 */ { 0, IT_GENERIC, AM_REGISTER | OT_V, AM_REGISTER | OT_V, AM_NOT_USED, "xchg", false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0x95 */ { 0, IT_GENERIC, AM_REGISTER | OT_V, AM_REGISTER | OT_V, AM_NOT_USED, "xchg", false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0x96 */ { 0, IT_GENERIC, AM_REGISTER | OT_V, AM_REGISTER | OT_V, AM_NOT_USED, "xchg", false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0x97 */ { 0, IT_GENERIC, AM_REGISTER | OT_V, AM_REGISTER | OT_V, AM_NOT_USED, "xchg", false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0x98 */ { 0, IT_GENERIC, AM_NOT_USED, AM_NOT_USED, AM_NOT_USED, "cwde", false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0x99 */ { 0, IT_GENERIC, AM_NOT_USED, AM_NOT_USED, AM_NOT_USED, "cdq", false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0x9A */ { 0, IT_JUMP, AM_A | OT_P, AM_NOT_USED, AM_NOT_USED, "callf", false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0x9B */ { 0, IT_GENERIC, AM_NOT_USED, AM_NOT_USED, AM_NOT_USED, "wait", false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0x9C */ { 0, IT_GENERIC, AM_NOT_USED, AM_NOT_USED, AM_NOT_USED, "pushfd", false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0x9D */ { 0, IT_GENERIC, AM_NOT_USED, AM_NOT_USED, AM_NOT_USED, "popfd", false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0x9E */ { 0, IT_GENERIC, AM_NOT_USED, AM_NOT_USED, AM_NOT_USED, "sahf", false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0x9F */ { 0, IT_GENERIC, AM_NOT_USED, AM_NOT_USED, AM_NOT_USED, "lahf", false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0xA0 */ { 0, IT_GENERIC, AM_REGISTER | OT_B, AM_O | OT_B, AM_NOT_USED, "mov", false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0xA1 */ { 0, IT_GENERIC, AM_REGISTER | OT_V, AM_O | OT_V, AM_NOT_USED, "mov", false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0xA2 */ { 0, IT_GENERIC, AM_O | OT_B, AM_REGISTER | OT_B, AM_NOT_USED, "mov", false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0xA3 */ { 0, IT_GENERIC, AM_O | OT_V, AM_REGISTER | OT_V, AM_NOT_USED, "mov", false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0xA4 */ { 0, IT_GENERIC, AM_X | OT_B, AM_Y | OT_B, AM_NOT_USED, "movsb", false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0xA5 */ { 0, IT_GENERIC, AM_X | OT_V, AM_Y | OT_V, AM_NOT_USED, "movsd", false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0xA6 */ { 0, IT_GENERIC, AM_X | OT_B, AM_Y | OT_B, AM_NOT_USED, "cmpsb", false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0xA7 */ { 0, IT_GENERIC, AM_X | OT_V, AM_Y | OT_V, AM_NOT_USED, "cmpsd", false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0xA8 */ { 0, IT_GENERIC, AM_REGISTER | OT_B, AM_I | OT_B, AM_NOT_USED, "test", false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0xA9 */ { 0, IT_GENERIC, AM_REGISTER | OT_V, AM_I | OT_V, AM_NOT_USED, "test", false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0xAA */ { 0, IT_GENERIC, AM_Y | OT_B, AM_REGISTER | OT_B, AM_NOT_USED, "stosb", false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0xAB */ { 0, IT_GENERIC, AM_Y | OT_V, AM_REGISTER | OT_V, AM_NOT_USED, "stosd", false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0xAC */ { 0, IT_GENERIC, AM_REGISTER | OT_B, AM_X | OT_B, AM_NOT_USED, "lodsb", false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0xAD */ { 0, IT_GENERIC, AM_REGISTER | OT_V, AM_X | OT_V, AM_NOT_USED, "lodsd", false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0xAE */ { 0, IT_GENERIC, AM_REGISTER | OT_B, AM_Y | OT_B, AM_NOT_USED, "scasb", false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0xAF */ { 0, IT_GENERIC, AM_REGISTER | OT_V, AM_Y | OT_V, AM_NOT_USED, "scasd", false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0xB0 */ { 0, IT_GENERIC, AM_REGISTER | OT_B, AM_I | OT_B, AM_NOT_USED, "mov", false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0xB1 */ { 0, IT_GENERIC, AM_REGISTER | OT_B, AM_I | OT_B, AM_NOT_USED, "mov", false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0xB2 */ { 0, IT_GENERIC, AM_REGISTER | OT_B, AM_I | OT_B, AM_NOT_USED, "mov", false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0xB3 */ { 0, IT_GENERIC, AM_REGISTER | OT_B, AM_I | OT_B, AM_NOT_USED, "mov", false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0xB4 */ { 0, IT_GENERIC, AM_REGISTER | OT_B, AM_I | OT_B, AM_NOT_USED, "mov", false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0xB5 */ { 0, IT_GENERIC, AM_REGISTER | OT_B, AM_I | OT_B, AM_NOT_USED, "mov", false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0xB6 */ { 0, IT_GENERIC, AM_REGISTER | OT_B, AM_I | OT_B, AM_NOT_USED, "mov", false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0xB7 */ { 0, IT_GENERIC, AM_REGISTER | OT_B, AM_I | OT_B, AM_NOT_USED, "mov", false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+#ifdef _M_X64
+  /* 0xB8 */ { 0, IT_GENERIC, AM_REGISTER | OT_V, AM_I | OT_V | IOS_64, AM_NOT_USED, "mov", false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0xB9 */ { 0, IT_GENERIC, AM_REGISTER | OT_V, AM_I | OT_V | IOS_64, AM_NOT_USED, "mov", false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0xBA */ { 0, IT_GENERIC, AM_REGISTER | OT_V, AM_I | OT_V | IOS_64, AM_NOT_USED, "mov", false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0xBB */ { 0, IT_GENERIC, AM_REGISTER | OT_V, AM_I | OT_V | IOS_64, AM_NOT_USED, "mov", false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0xBC */ { 0, IT_GENERIC, AM_REGISTER | OT_V, AM_I | OT_V | IOS_64, AM_NOT_USED, "mov", false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0xBD */ { 0, IT_GENERIC, AM_REGISTER | OT_V, AM_I | OT_V | IOS_64, AM_NOT_USED, "mov", false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0xBE */ { 0, IT_GENERIC, AM_REGISTER | OT_V, AM_I | OT_V | IOS_64, AM_NOT_USED, "mov", false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0xBF */ { 0, IT_GENERIC, AM_REGISTER | OT_V, AM_I | OT_V | IOS_64, AM_NOT_USED, "mov", false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+#else
+  /* 0xB8 */ { 0, IT_GENERIC, AM_REGISTER | OT_V, AM_I | OT_V, AM_NOT_USED, "mov", false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0xB9 */ { 0, IT_GENERIC, AM_REGISTER | OT_V, AM_I | OT_V, AM_NOT_USED, "mov", false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0xBA */ { 0, IT_GENERIC, AM_REGISTER | OT_V, AM_I | OT_V, AM_NOT_USED, "mov", false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0xBB */ { 0, IT_GENERIC, AM_REGISTER | OT_V, AM_I | OT_V, AM_NOT_USED, "mov", false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0xBC */ { 0, IT_GENERIC, AM_REGISTER | OT_V, AM_I | OT_V, AM_NOT_USED, "mov", false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0xBD */ { 0, IT_GENERIC, AM_REGISTER | OT_V, AM_I | OT_V, AM_NOT_USED, "mov", false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0xBE */ { 0, IT_GENERIC, AM_REGISTER | OT_V, AM_I | OT_V, AM_NOT_USED, "mov", false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0xBF */ { 0, IT_GENERIC, AM_REGISTER | OT_V, AM_I | OT_V, AM_NOT_USED, "mov", false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+#endif
+  /* 0xC0 */ { 6, IT_REFERENCE, AM_E | OT_B, AM_I | OT_B, AM_NOT_USED, 0, false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0xC1 */ { 7, IT_REFERENCE, AM_E | OT_V, AM_I | OT_B, AM_NOT_USED, 0, false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0xC2 */ { 0, IT_RETURN, AM_I | OT_W, AM_NOT_USED, AM_NOT_USED, "ret", false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0xC3 */ { 0, IT_RETURN, AM_NOT_USED, AM_NOT_USED, AM_NOT_USED, "ret", false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0xC4 */ { 0, IT_GENERIC, AM_G | OT_V, AM_M | OT_P, AM_NOT_USED, "les", false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0xC5 */ { 0, IT_GENERIC, AM_G | OT_V, AM_M | OT_P, AM_NOT_USED, "lds", false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0xC6 */ { 0, IT_GENERIC, AM_E | OT_B, AM_I | OT_B, AM_NOT_USED, "mov", false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0xC7 */ { 0, IT_GENERIC, AM_E | OT_V, AM_I | OT_V, AM_NOT_USED, "mov", false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0xC8 */ { 0, IT_GENERIC, AM_I | OT_W, AM_I | OT_B, AM_NOT_USED, "enter", false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0xC9 */ { 0, IT_GENERIC, AM_NOT_USED, AM_NOT_USED, AM_NOT_USED, "leave", false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0xCA */ { 0, IT_RETURN, AM_I | OT_W, AM_NOT_USED, AM_NOT_USED, "retf", false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0xCB */ { 0, IT_RETURN, AM_NOT_USED, AM_NOT_USED, AM_NOT_USED, "retf", false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0xCC */ { 0, IT_GENERIC, AM_NOT_USED, AM_NOT_USED, AM_NOT_USED, "int3", false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0xCD */ { 0, IT_GENERIC, AM_I | OT_B, AM_NOT_USED, AM_NOT_USED, "int", false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0xCE */ { 0, IT_GENERIC, AM_NOT_USED, AM_NOT_USED, AM_NOT_USED, "into", false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0xCF */ { 0, IT_GENERIC, AM_NOT_USED, AM_NOT_USED, AM_NOT_USED, "iret", false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0xD0 */ { 8, IT_REFERENCE, AM_E | OT_B, AM_NOT_USED, AM_NOT_USED, 0, false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0xD1 */ { 9, IT_REFERENCE, AM_E | OT_V, AM_NOT_USED, AM_NOT_USED, 0, false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0xD2 */ { 10, IT_REFERENCE, AM_E | OT_B, AM_NOT_USED, AM_NOT_USED, 0, false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0xD3 */ { 11, IT_REFERENCE, AM_E | OT_V, AM_NOT_USED, AM_NOT_USED, 0, false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0xD4 */ { 0, IT_GENERIC, AM_I | OT_B, AM_NOT_USED, AM_NOT_USED, "aam", false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0xD5 */ { 0, IT_GENERIC, AM_I | OT_B, AM_NOT_USED, AM_NOT_USED, "aad", false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0xD6 */ { 0, IT_UNUSED, AM_NOT_USED, AM_NOT_USED, AM_NOT_USED, 0, false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0xD7 */ { 0, IT_GENERIC, AM_NOT_USED, AM_NOT_USED, AM_NOT_USED, "xlat", false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+
+  // The following eight entries would be references to the FPU tables, but we
+  // currently do not support FPU instructions in this disassembler.
+
+  /* 0xD8 */ { 0, IT_UNKNOWN, AM_NOT_USED, AM_NOT_USED, AM_NOT_USED, 0, false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0xD9 */ { 0, IT_UNKNOWN, AM_NOT_USED, AM_NOT_USED, AM_NOT_USED, 0, false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0xDA */ { 0, IT_UNKNOWN, AM_NOT_USED, AM_NOT_USED, AM_NOT_USED, 0, false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0xDB */ { 0, IT_UNKNOWN, AM_NOT_USED, AM_NOT_USED, AM_NOT_USED, 0, false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0xDC */ { 0, IT_UNKNOWN, AM_NOT_USED, AM_NOT_USED, AM_NOT_USED, 0, false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0xDD */ { 0, IT_UNKNOWN, AM_NOT_USED, AM_NOT_USED, AM_NOT_USED, 0, false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0xDE */ { 0, IT_UNKNOWN, AM_NOT_USED, AM_NOT_USED, AM_NOT_USED, 0, false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0xDF */ { 0, IT_UNKNOWN, AM_NOT_USED, AM_NOT_USED, AM_NOT_USED, 0, false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+
+
+  /* 0xE0 */ { 0, IT_JUMP, AM_J | OT_B, AM_NOT_USED, AM_NOT_USED, "loopnz", false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0xE1 */ { 0, IT_JUMP, AM_J | OT_B, AM_NOT_USED, AM_NOT_USED, "loopz", false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0xE2 */ { 0, IT_JUMP, AM_J | OT_B, AM_NOT_USED, AM_NOT_USED, "loop", false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0xE3 */ { 0, IT_JUMP, AM_J | OT_B, AM_NOT_USED, AM_NOT_USED, "jcxz", false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0xE4 */ { 0, IT_GENERIC, AM_REGISTER | OT_B, AM_I | OT_B, AM_NOT_USED, "in", false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0xE5 */ { 0, IT_GENERIC, AM_REGISTER | OT_B, AM_I | OT_B, AM_NOT_USED, "in", false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0xE6 */ { 0, IT_GENERIC, AM_I | OT_B, AM_REGISTER | OT_B, AM_NOT_USED, "out", false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0xE7 */ { 0, IT_GENERIC, AM_I | OT_B, AM_REGISTER | OT_B, AM_NOT_USED, "out", false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0xE8 */ { 0, IT_JUMP, AM_J | OT_V, AM_NOT_USED, AM_NOT_USED, "call", false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0xE9 */ { 0, IT_JUMP, AM_J | OT_V, AM_NOT_USED, AM_NOT_USED, "jmp", false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0xEA */ { 0, IT_JUMP, AM_A | OT_P, AM_NOT_USED, AM_NOT_USED, "jmp", false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0xEB */ { 0, IT_JUMP, AM_J | OT_B, AM_NOT_USED, AM_NOT_USED, "jmp", false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0xEC */ { 0, IT_GENERIC, AM_REGISTER | OT_B, AM_REGISTER | OT_W, AM_NOT_USED, "in", false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0xED */ { 0, IT_GENERIC, AM_REGISTER | OT_V, AM_REGISTER | OT_W, AM_NOT_USED, "in", false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0xEE */ { 0, IT_GENERIC, AM_REGISTER | OT_W, AM_REGISTER | OT_B, AM_NOT_USED, "out", false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0xEF */ { 0, IT_GENERIC, AM_REGISTER | OT_W, AM_REGISTER | OT_V, AM_NOT_USED, "out", false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0xF0 */ { 0, IT_PREFIX, AM_NOT_USED, AM_NOT_USED, AM_NOT_USED, "lock:", false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0xF1 */ { 0, IT_UNUSED, AM_NOT_USED, AM_NOT_USED, AM_NOT_USED, 0, false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0xF2 */ { 0, IT_PREFIX, AM_NOT_USED, AM_NOT_USED, AM_NOT_USED, "repne:", false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0xF3 */ { 0, IT_PREFIX, AM_NOT_USED, AM_NOT_USED, AM_NOT_USED, "rep:", false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0xF4 */ { 0, IT_GENERIC, AM_NOT_USED, AM_NOT_USED, AM_NOT_USED, "hlt", false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0xF5 */ { 0, IT_GENERIC, AM_NOT_USED, AM_NOT_USED, AM_NOT_USED, "cmc", false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0xF6 */ { 12, IT_REFERENCE, AM_E | OT_B, AM_NOT_USED, AM_NOT_USED, 0, false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0xF7 */ { 13, IT_REFERENCE, AM_E | OT_V, AM_NOT_USED, AM_NOT_USED, 0, false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0xF8 */ { 0, IT_GENERIC, AM_NOT_USED, AM_NOT_USED, AM_NOT_USED, "clc", false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0xF9 */ { 0, IT_GENERIC, AM_NOT_USED, AM_NOT_USED, AM_NOT_USED, "stc", false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0xFA */ { 0, IT_GENERIC, AM_NOT_USED, AM_NOT_USED, AM_NOT_USED, "cli", false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0xFB */ { 0, IT_GENERIC, AM_NOT_USED, AM_NOT_USED, AM_NOT_USED, "sti", false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0xFC */ { 0, IT_GENERIC, AM_NOT_USED, AM_NOT_USED, AM_NOT_USED, "cld", false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0xFD */ { 0, IT_GENERIC, AM_NOT_USED, AM_NOT_USED, AM_NOT_USED, "std", false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0xFE */ { 14, IT_REFERENCE, AM_NOT_USED, AM_NOT_USED, AM_NOT_USED, 0, false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0xFF */ { 15, IT_REFERENCE, AM_NOT_USED, AM_NOT_USED, AM_NOT_USED, 0, false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } }
+};
+
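+// Opcode map for the second byte of two-byte (0Fh-escaped) instructions,
+// indexed by that byte.  Entries follow the same layout as the table above:
+// when the prefix-dependent flag is true, the F2h/F3h/66h sub-entries are
+// presumably selected in place of the base entry when the corresponding
+// mandatory prefix precedes the opcode, and IT_REFERENCE entries with a
+// non-zero first field appear to index further tables keyed on the ModR/M
+// reg field.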
+const Opcode s_opcode_byte_after_0f[] = {
+  /* 0x0 */ { 16, IT_REFERENCE, AM_NOT_USED, AM_NOT_USED, AM_NOT_USED, 0, false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0x1 */ { 17, IT_REFERENCE, AM_NOT_USED, AM_NOT_USED, AM_NOT_USED, 0, false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0x2 */ { 0, IT_GENERIC, AM_G | OT_V, AM_E | OT_W, AM_NOT_USED, "lar", false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0x3 */ { 0, IT_GENERIC, AM_G | OT_V, AM_E | OT_W, AM_NOT_USED, "lsl", false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0x4 */ { 0, IT_UNUSED, AM_NOT_USED, AM_NOT_USED, AM_NOT_USED, 0, false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0x5 */ { 0, IT_UNUSED, AM_NOT_USED, AM_NOT_USED, AM_NOT_USED, 0, false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0x6 */ { 0, IT_GENERIC, AM_NOT_USED, AM_NOT_USED, AM_NOT_USED, "clts", false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0x7 */ { 0, IT_UNUSED, AM_NOT_USED, AM_NOT_USED, AM_NOT_USED, 0, false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0x8 */ { 0, IT_GENERIC, AM_NOT_USED, AM_NOT_USED, AM_NOT_USED, "invd", false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0x9 */ { 0, IT_GENERIC, AM_NOT_USED, AM_NOT_USED, AM_NOT_USED, "wbinvd", false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0xA */ { 0, IT_UNUSED, AM_NOT_USED, AM_NOT_USED, AM_NOT_USED, 0, false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0xB */ { 0, IT_UNUSED, AM_NOT_USED, AM_NOT_USED, AM_NOT_USED, "ud2", false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0xC */ { 0, IT_UNUSED, AM_NOT_USED, AM_NOT_USED, AM_NOT_USED, 0, false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0xD */ { 0, IT_UNUSED, AM_NOT_USED, AM_NOT_USED, AM_NOT_USED, 0, false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0xE */ { 0, IT_UNUSED, AM_NOT_USED, AM_NOT_USED, AM_NOT_USED, 0, false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0xF */ { 0, IT_UNUSED, AM_NOT_USED, AM_NOT_USED, AM_NOT_USED, 0, false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0x10 */ { 0, IT_GENERIC, AM_V | OT_PS, AM_W | OT_PS, AM_NOT_USED, "movups", true,
+    /* F2h */ { 0, IT_GENERIC, AM_V | OT_SD, AM_W | OT_SD, AM_NOT_USED, "movsd" },
+    /* F3h */ { 0, IT_GENERIC, AM_V | OT_SS, AM_W | OT_SS, AM_NOT_USED, "movss" },
+    /* 66h */ { 0, IT_GENERIC, AM_V | OT_PD, AM_W | OT_PD, AM_NOT_USED, "movupd" } },
+  /* 0x11 */ { 0, IT_GENERIC, AM_W | OT_PS, AM_V | OT_PS, AM_NOT_USED, "movups", true,
+    /* F2h */ { 0, IT_GENERIC, AM_W | OT_SD, AM_V | OT_SD, AM_NOT_USED, "movsd" },
+    /* F3h */ { 0, IT_GENERIC, AM_W | OT_SS, AM_V | OT_SS, AM_NOT_USED, "movss" },
+    /* 66h */ { 0, IT_GENERIC, AM_W | OT_PD, AM_V | OT_PD, AM_NOT_USED, "movupd" } },
+  /* 0x12 */ { 0, IT_GENERIC, AM_W | OT_Q, AM_V | OT_Q, AM_NOT_USED, "movlps", true,
+    /* F2h */ { 0, IT_GENERIC, AM_V | OT_Q, AM_V | OT_Q, AM_NOT_USED, "movhlps" },  // only one of ...
+    /* F3h */ { 0, IT_GENERIC, AM_V | OT_Q, AM_V | OT_Q, AM_NOT_USED, "movhlps" },  // ...these two is correct, Intel doesn't specify which
+    /* 66h */ { 0, IT_GENERIC, AM_V | OT_Q, AM_W | OT_Q, AM_NOT_USED, "movlpd" } },
+  /* 0x13 */ { 0, IT_GENERIC, AM_V | OT_Q, AM_W | OT_Q, AM_NOT_USED, "movlps", true,
+    /* F2h */ { 0 },
+    /* F3h */ { 0 },
+    /* 66h */ { 0, IT_GENERIC, AM_V | OT_Q, AM_W | OT_Q, AM_NOT_USED, "movlpd" } },
+  /* 0x14 */ { 0, IT_GENERIC, AM_V | OT_PS, AM_W | OT_Q, AM_NOT_USED, "unpcklps", true,
+    /* F2h */ { 0 },
+    /* F3h */ { 0 },
+    /* 66h */ { 0, IT_GENERIC, AM_V | OT_PD, AM_W | OT_Q, AM_NOT_USED, "unpcklpd" } },
+  /* 0x15 */ { 0, IT_GENERIC, AM_V | OT_PS, AM_W | OT_Q, AM_NOT_USED, "unpckhps", true,
+    /* F2h */ { 0 },
+    /* F3h */ { 0 },
+    /* 66h */ { 0, IT_GENERIC, AM_V | OT_PD, AM_W | OT_Q, AM_NOT_USED, "unpckhpd" } },
+  /* 0x16 */ { 0, IT_GENERIC, AM_V | OT_Q, AM_W | OT_Q, AM_NOT_USED, "movhps", true,
+    /* F2h */ { 0, IT_GENERIC, AM_V | OT_Q, AM_V | OT_Q, AM_NOT_USED, "movlhps" },  // only one of...
+    /* F3h */ { 0, IT_GENERIC, AM_V | OT_Q, AM_V | OT_Q, AM_NOT_USED, "movlhps" },  // ...these two is correct, Intel doesn't specify which
+    /* 66h */ { 0, IT_GENERIC, AM_V | OT_Q, AM_W | OT_Q, AM_NOT_USED, "movhpd" } },
+  /* 0x17 */ { 0, IT_GENERIC, AM_W | OT_Q, AM_V | OT_Q, AM_NOT_USED, "movhps", true,
+    /* F2h */ { 0 },
+    /* F3h */ { 0 },
+    /* 66h */ { 0, IT_GENERIC, AM_W | OT_Q, AM_V | OT_Q, AM_NOT_USED, "movhpd" } },
+  /* 0x18 */ { 18, IT_REFERENCE, AM_NOT_USED, AM_NOT_USED, AM_NOT_USED, 0, false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0x19 */ { 0, IT_UNUSED, AM_NOT_USED, AM_NOT_USED, AM_NOT_USED, 0, false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0x1A */ { 0, IT_UNUSED, AM_NOT_USED, AM_NOT_USED, AM_NOT_USED, 0, false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0x1B */ { 0, IT_UNUSED, AM_NOT_USED, AM_NOT_USED, AM_NOT_USED, 0, false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0x1C */ { 0, IT_UNUSED, AM_NOT_USED, AM_NOT_USED, AM_NOT_USED, 0, false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0x1D */ { 0, IT_UNUSED, AM_NOT_USED, AM_NOT_USED, AM_NOT_USED, 0, false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0x1E */ { 0, IT_UNUSED, AM_NOT_USED, AM_NOT_USED, AM_NOT_USED, 0, false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0x1F */ { 0, IT_UNUSED, AM_NOT_USED, AM_NOT_USED, AM_NOT_USED, 0, false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0x20 */ { 0, IT_GENERIC, AM_R | OT_D, AM_C | OT_D, AM_NOT_USED, "mov", false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0x21 */ { 0, IT_GENERIC, AM_R | OT_D, AM_D | OT_D, AM_NOT_USED, "mov", false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0x22 */ { 0, IT_GENERIC, AM_C | OT_D, AM_R | OT_D, AM_NOT_USED, "mov", false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0x23 */ { 0, IT_GENERIC, AM_D | OT_D, AM_R | OT_D, AM_NOT_USED, "mov", false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0x24 */ { 0, IT_UNUSED, AM_NOT_USED, AM_NOT_USED, AM_NOT_USED, 0, false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0x25 */ { 0, IT_UNUSED, AM_NOT_USED, AM_NOT_USED, AM_NOT_USED, 0, false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0x26 */ { 0, IT_UNUSED, AM_NOT_USED, AM_NOT_USED, AM_NOT_USED, 0, false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0x27 */ { 0, IT_UNUSED, AM_NOT_USED, AM_NOT_USED, AM_NOT_USED, 0, false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0x28 */ { 0, IT_GENERIC, AM_V | OT_PS, AM_W | OT_PS, AM_NOT_USED, "movaps", true,
+    /* F2h */ { 0 },
+    /* F3h */ { 0 },
+    /* 66h */ { 0, IT_GENERIC, AM_V | OT_PD, AM_W | OT_PD, AM_NOT_USED, "movapd" } },
+  /* 0x29 */ { 0, IT_GENERIC, AM_W | OT_PS, AM_V | OT_PS, AM_NOT_USED, "movaps", true,
+    /* F2h */ { 0 },
+    /* F3h */ { 0 },
+    /* 66h */ { 0, IT_GENERIC, AM_W | OT_PD, AM_V | OT_PD, AM_NOT_USED, "movapd" } },
+  /* 0x2A */ { 0, IT_GENERIC, AM_V | OT_PS, AM_Q | OT_Q, AM_NOT_USED, "cvtpi2ps", true,
+    /* F2h */ { 0, IT_GENERIC, AM_V | OT_SD, AM_E | OT_D, AM_NOT_USED, "cvtsi2sd" },
+    /* F3h */ { 0, IT_GENERIC, AM_V | OT_SS, AM_E | OT_D, AM_NOT_USED, "cvtsi2ss" },
+    /* 66h */ { 0, IT_GENERIC, AM_V | OT_PD, AM_Q | OT_DQ, AM_NOT_USED, "cvtpi2pd" } },
+  /* 0x2B */ { 0, IT_GENERIC, AM_W | OT_PS, AM_V | OT_PS, AM_NOT_USED, "movntps", true,
+    /* F2h */ { 0 },
+    /* F3h */ { 0 },
+    /* 66h */ { 0, IT_GENERIC, AM_W | OT_PD, AM_V | OT_PD, AM_NOT_USED, "movntpd" } },
+  /* 0x2C */ { 0, IT_GENERIC, AM_Q | OT_Q, AM_W | OT_PS, AM_NOT_USED, "cvttps2pi", true,
+    /* F2h */ { 0, IT_GENERIC, AM_G | OT_D, AM_W | OT_SD, AM_NOT_USED, "cvttsd2si" },
+    /* F3h */ { 0, IT_GENERIC, AM_G | OT_D, AM_W | OT_SS, AM_NOT_USED, "cvttss2si" },
+    /* 66h */ { 0, IT_GENERIC, AM_Q | OT_DQ, AM_W | OT_PD, AM_NOT_USED, "cvttpd2pi" } },
+  /* 0x2D */ { 0, IT_GENERIC, AM_Q | OT_Q, AM_W | OT_PS, AM_NOT_USED, "cvtps2pi", true,
+    /* F2h */ { 0, IT_GENERIC, AM_G | OT_D, AM_W | OT_SD, AM_NOT_USED, "cvtsd2si" },
+    /* F3h */ { 0, IT_GENERIC, AM_G | OT_D, AM_W | OT_SS, AM_NOT_USED, "cvtss2si" },
+    /* 66h */ { 0, IT_GENERIC, AM_Q | OT_DQ, AM_W | OT_PD, AM_NOT_USED, "cvtpd2pi" } },
+  /* 0x2E */ { 0, IT_GENERIC, AM_V | OT_SS, AM_W | OT_SS, AM_NOT_USED, "ucomiss", true,
+    /* F2h */ { 0 },
+    /* F3h */ { 0 },
+    /* 66h */ { 0, IT_GENERIC, AM_V | OT_SD, AM_W | OT_SD, AM_NOT_USED, "ucomisd" } },
+  /* 0x2F */ { 0, IT_GENERIC, AM_V | OT_SS, AM_W | OT_SS, AM_NOT_USED, "comiss", true,
+    /* F2h */ { 0 },
+    /* F3h */ { 0 },
+    /* 66h */ { 0, IT_GENERIC, AM_V | OT_SD, AM_W | OT_SD, AM_NOT_USED, "comisd" } },
+  /* 0x30 */ { 0, IT_GENERIC, AM_NOT_USED, AM_NOT_USED, AM_NOT_USED, "wrmsr", false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0x31 */ { 0, IT_GENERIC, AM_NOT_USED, AM_NOT_USED, AM_NOT_USED, "rdtsc", false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0x32 */ { 0, IT_GENERIC, AM_NOT_USED, AM_NOT_USED, AM_NOT_USED, "rdmsr", false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0x33 */ { 0, IT_GENERIC, AM_NOT_USED, AM_NOT_USED, AM_NOT_USED, "rdpmc", false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0x34 */ { 0, IT_GENERIC, AM_NOT_USED, AM_NOT_USED, AM_NOT_USED, "sysenter", false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0x35 */ { 0, IT_GENERIC, AM_NOT_USED, AM_NOT_USED, AM_NOT_USED, "sysexit", false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0x36 */ { 0, IT_UNUSED, AM_NOT_USED, AM_NOT_USED, AM_NOT_USED, 0, false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0x37 */ { 0, IT_UNUSED, AM_NOT_USED, AM_NOT_USED, AM_NOT_USED, 0, false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0x38 */ { 0, IT_UNUSED, AM_NOT_USED, AM_NOT_USED, AM_NOT_USED, 0, false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0x39 */ { 0, IT_UNUSED, AM_NOT_USED, AM_NOT_USED, AM_NOT_USED, 0, false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0x3A */ { 0, IT_UNUSED, AM_NOT_USED, AM_NOT_USED, AM_NOT_USED, 0, false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0x3B */ { 0, IT_UNUSED, AM_NOT_USED, AM_NOT_USED, AM_NOT_USED, 0, false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0x3C */ { 0, IT_GENERIC, AM_G | OT_V, AM_E | OT_V, AM_NOT_USED, "movnti", false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0x3D */ { 0, IT_UNUSED, AM_NOT_USED, AM_NOT_USED, AM_NOT_USED, 0, false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0x3E */ { 0, IT_UNUSED, AM_NOT_USED, AM_NOT_USED, AM_NOT_USED, 0, false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0x3F */ { 0, IT_UNUSED, AM_NOT_USED, AM_NOT_USED, AM_NOT_USED, 0, false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0x40 */ { 0, IT_GENERIC, AM_G | OT_V, AM_E | OT_V, AM_NOT_USED, "cmovo", false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0x41 */ { 0, IT_GENERIC, AM_G | OT_V, AM_E | OT_V, AM_NOT_USED, "cmovno", false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0x42 */ { 0, IT_GENERIC, AM_G | OT_V, AM_E | OT_V, AM_NOT_USED, "cmovc", false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0x43 */ { 0, IT_GENERIC, AM_G | OT_V, AM_E | OT_V, AM_NOT_USED, "cmovnc", false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0x44 */ { 0, IT_GENERIC, AM_G | OT_V, AM_E | OT_V, AM_NOT_USED, "cmovz", false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0x45 */ { 0, IT_GENERIC, AM_G | OT_V, AM_E | OT_V, AM_NOT_USED, "cmovnz", false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0x46 */ { 0, IT_GENERIC, AM_G | OT_V, AM_E | OT_V, AM_NOT_USED, "cmovbe", false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0x47 */ { 0, IT_GENERIC, AM_G | OT_V, AM_E | OT_V, AM_NOT_USED, "cmova", false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0x48 */ { 0, IT_GENERIC, AM_G | OT_V, AM_E | OT_V, AM_NOT_USED, "cmovs", false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0x49 */ { 0, IT_GENERIC, AM_G | OT_V, AM_E | OT_V, AM_NOT_USED, "cmovns", false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0x4A */ { 0, IT_GENERIC, AM_G | OT_V, AM_E | OT_V, AM_NOT_USED, "cmovpe", false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0x4B */ { 0, IT_GENERIC, AM_G | OT_V, AM_E | OT_V, AM_NOT_USED, "cmovpo", false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0x4C */ { 0, IT_GENERIC, AM_G | OT_V, AM_E | OT_V, AM_NOT_USED, "cmovl", false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0x4D */ { 0, IT_GENERIC, AM_G | OT_V, AM_E | OT_V, AM_NOT_USED, "cmovge", false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0x4E */ { 0, IT_GENERIC, AM_G | OT_V, AM_E | OT_V, AM_NOT_USED, "cmovle", false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0x4F */ { 0, IT_GENERIC, AM_G | OT_V, AM_E | OT_V, AM_NOT_USED, "cmovg", false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0x50 */ { 0, IT_GENERIC, AM_E | OT_D, AM_V | OT_PS, AM_NOT_USED, "movmskps", true,
+    /* F2h */ { 0 },
+    /* F3h */ { 0 },
+    /* 66h */ { 0, IT_GENERIC, AM_E | OT_D, AM_V | OT_PD, AM_NOT_USED, "movmskpd" } },
+  /* 0x51 */ { 0, IT_GENERIC, AM_V | OT_PS, AM_W | OT_PS, AM_NOT_USED, "sqrtps", true,
+    /* F2h */ { 0, IT_GENERIC, AM_V | OT_SD, AM_W | OT_SD, AM_NOT_USED, "sqrtsd" },
+    /* F3h */ { 0, IT_GENERIC, AM_V | OT_SS, AM_W | OT_SS, AM_NOT_USED, "sqrtss" },
+    /* 66h */ { 0, IT_GENERIC, AM_V | OT_PD, AM_W | OT_PD, AM_NOT_USED, "sqrtpd" } },
+  /* 0x52 */ { 0, IT_GENERIC, AM_V | OT_PS, AM_W | OT_PS, AM_NOT_USED, "rsqrtps", true,
+    /* F2h */ { 0 },
+    /* F3h */ { 0, IT_GENERIC, AM_V | OT_SS, AM_W | OT_SS, AM_NOT_USED, "rsqrtss" },
+    /* 66h */ { 0 } },
+  /* 0x53 */ { 0, IT_GENERIC, AM_V | OT_PS, AM_W | OT_PS, AM_NOT_USED, "rcpps", true,
+    /* F2h */ { 0 },
+    /* F3h */ { 0, IT_GENERIC, AM_V | OT_SS, AM_W | OT_SS, AM_NOT_USED, "rcpss" },
+    /* 66h */ { 0 } },
+  /* 0x54 */ { 0, IT_GENERIC, AM_V | OT_PS, AM_W | OT_PS, AM_NOT_USED, "andps", true,
+    /* F2h */ { 0 },
+    /* F3h */ { 0 },
+    /* 66h */ { 0, IT_GENERIC, AM_V | OT_PD, AM_W | OT_PD, AM_NOT_USED, "andpd" } },
+  /* 0x55 */ { 0, IT_GENERIC, AM_V | OT_PS, AM_W | OT_PS, AM_NOT_USED, "andnps", true,
+    /* F2h */ { 0 },
+    /* F3h */ { 0 },
+    /* 66h */ { 0, IT_GENERIC, AM_V | OT_PD, AM_W | OT_PD, AM_NOT_USED, "andnpd" } },
+  /* 0x56 */ { 0, IT_GENERIC, AM_V | OT_PS, AM_W | OT_PS, AM_NOT_USED, "orps", true,
+    /* F2h */ { 0 },
+    /* F3h */ { 0 },
+    /* 66h */ { 0, IT_GENERIC, AM_V | OT_PD, AM_W | OT_PD, AM_NOT_USED, "orpd" } },
+  /* 0x57 */ { 0, IT_GENERIC, AM_V | OT_PS, AM_W | OT_PS, AM_NOT_USED, "xorps", true,
+    /* F2h */ { 0 },
+    /* F3h */ { 0 },
+    /* 66h */ { 0, IT_GENERIC, AM_V | OT_PD, AM_W | OT_PD, AM_NOT_USED, "xorpd" } },
+  /* 0x58 */ { 0, IT_GENERIC, AM_V | OT_PS, AM_W | OT_PS, AM_NOT_USED, "addps", true,
+    /* F2h */ { 0, IT_GENERIC, AM_V | OT_SD, AM_W | OT_SD, AM_NOT_USED, "addsd" },
+    /* F3h */ { 0, IT_GENERIC, AM_V | OT_SS, AM_W | OT_SS, AM_NOT_USED, "addss" },
+    /* 66h */ { 0, IT_GENERIC, AM_V | OT_PD, AM_W | OT_PD, AM_NOT_USED, "addpd" } },
+  /* 0x59 */ { 0, IT_GENERIC, AM_V | OT_PS, AM_W | OT_PS, AM_NOT_USED, "mulps", true,
+    /* F2h */ { 0, IT_GENERIC, AM_V | OT_SD, AM_W | OT_SD, AM_NOT_USED, "mulsd" },
+    /* F3h */ { 0, IT_GENERIC, AM_V | OT_SS, AM_W | OT_SS, AM_NOT_USED, "mulss" },
+    /* 66h */ { 0, IT_GENERIC, AM_V | OT_PD, AM_W | OT_PD, AM_NOT_USED, "mulpd" } },
+  /* 0x5A */ { 0, IT_GENERIC, AM_V | OT_PD, AM_W | OT_PS, AM_NOT_USED, "cvtps2pd", true,
+    /* F2h */ { 0, IT_GENERIC, AM_V | OT_SD, AM_W | OT_SD, AM_NOT_USED, "cvtsd2ss" },
+    /* F3h */ { 0, IT_GENERIC, AM_V | OT_SS, AM_W | OT_SS, AM_NOT_USED, "cvtss2sd" },
+    /* 66h */ { 0, IT_GENERIC, AM_V | OT_PS, AM_W | OT_PD, AM_NOT_USED, "cvtpd2ps" } },
+  /* 0x5B */ { 0, IT_GENERIC, AM_V | OT_PS, AM_W | OT_DQ, AM_NOT_USED, "cvtdq2ps", true,
+    /* F2h */ { 0 },
+    /* F3h */ { 0, IT_GENERIC, AM_V | OT_DQ, AM_W | OT_PS, AM_NOT_USED, "cvttps2dq" },
+    /* 66h */ { 0, IT_GENERIC, AM_V | OT_DQ, AM_W | OT_PS, AM_NOT_USED, "cvtps2dq" } },
+  /* 0x5C */ { 0, IT_GENERIC, AM_V | OT_PS, AM_W | OT_PS, AM_NOT_USED, "subps", true,
+    /* F2h */ { 0, IT_GENERIC, AM_V | OT_SD, AM_W | OT_SD, AM_NOT_USED, "subsd" },
+    /* F3h */ { 0, IT_GENERIC, AM_V | OT_SS, AM_W | OT_SS, AM_NOT_USED, "subss" },
+    /* 66h */ { 0, IT_GENERIC, AM_V | OT_PD, AM_W | OT_PD, AM_NOT_USED, "subpd" } },
+  /* 0x5D */ { 0, IT_GENERIC, AM_V | OT_PS, AM_W | OT_PS, AM_NOT_USED, "minps", true,
+    /* F2h */ { 0, IT_GENERIC, AM_V | OT_SD, AM_W | OT_SD, AM_NOT_USED, "minsd" },
+    /* F3h */ { 0, IT_GENERIC, AM_V | OT_SS, AM_W | OT_SS, AM_NOT_USED, "minss" },
+    /* 66h */ { 0, IT_GENERIC, AM_V | OT_PD, AM_W | OT_PD, AM_NOT_USED, "minpd" } },
+  /* 0x5E */ { 0, IT_GENERIC, AM_V | OT_PS, AM_W | OT_PS, AM_NOT_USED, "divps", true,
+    /* F2h */ { 0, IT_GENERIC, AM_V | OT_SD, AM_W | OT_SD, AM_NOT_USED, "divsd" },
+    /* F3h */ { 0, IT_GENERIC, AM_V | OT_SS, AM_W | OT_SS, AM_NOT_USED, "divss" },
+    /* 66h */ { 0, IT_GENERIC, AM_V | OT_PD, AM_W | OT_PD, AM_NOT_USED, "divpd" } },
+  /* 0x5F */ { 0, IT_GENERIC, AM_V | OT_PS, AM_W | OT_PS, AM_NOT_USED, "maxps", true,
+    /* F2h */ { 0, IT_GENERIC, AM_V | OT_SD, AM_W | OT_SD, AM_NOT_USED, "maxsd" },
+    /* F3h */ { 0, IT_GENERIC, AM_V | OT_SS, AM_W | OT_SS, AM_NOT_USED, "maxss" },
+    /* 66h */ { 0, IT_GENERIC, AM_V | OT_PD, AM_W | OT_PD, AM_NOT_USED, "maxpd" } },
+  /* 0x60 */ { 0, IT_GENERIC, AM_P | OT_Q, AM_Q | OT_D, AM_NOT_USED, "punpcklbw", true,
+    /* F2h */ { 0 },
+    /* F3h */ { 0 },
+    /* 66h */ { 0, IT_GENERIC, AM_V | OT_DQ, AM_W | OT_DQ, AM_NOT_USED, "punpcklbw" } },
+  /* 0x61 */ { 0, IT_GENERIC, AM_P | OT_Q, AM_Q | OT_D, AM_NOT_USED, "punpcklwd", true,
+    /* F2h */ { 0 },
+    /* F3h */ { 0 },
+    /* 66h */ { 0, IT_GENERIC, AM_V | OT_DQ, AM_W | OT_DQ, AM_NOT_USED, "punpcklwd" } },
+  /* 0x62 */ { 0, IT_GENERIC, AM_P | OT_Q, AM_Q | OT_D, AM_NOT_USED, "punpckldq", true,
+    /* F2h */ { 0 },
+    /* F3h */ { 0 },
+    /* 66h */ { 0, IT_GENERIC, AM_V | OT_DQ, AM_W | OT_DQ, AM_NOT_USED, "punpckldq" } },
+  /* 0x63 */ { 0, IT_GENERIC, AM_P | OT_Q, AM_Q | OT_D, AM_NOT_USED, "packsswb", true,
+    /* F2h */ { 0 },
+    /* F3h */ { 0 },
+    /* 66h */ { 0, IT_GENERIC, AM_V | OT_DQ, AM_W | OT_DQ, AM_NOT_USED, "packsswb" } },
+  /* 0x64 */ { 0, IT_GENERIC, AM_P | OT_Q, AM_Q | OT_D, AM_NOT_USED, "pcmpgtb", true,
+    /* F2h */ { 0 },
+    /* F3h */ { 0 },
+    /* 66h */ { 0, IT_GENERIC, AM_V | OT_DQ, AM_W | OT_DQ, AM_NOT_USED, "pcmpgtb" } },
+  /* 0x65 */ { 0, IT_GENERIC, AM_P | OT_Q, AM_Q | OT_D, AM_NOT_USED, "pcmpgtw", true,
+    /* F2h */ { 0 },
+    /* F3h */ { 0 },
+    /* 66h */ { 0, IT_GENERIC, AM_V | OT_DQ, AM_W | OT_DQ, AM_NOT_USED, "pcmpgtw" } },
+  /* 0x66 */ { 0, IT_GENERIC, AM_P | OT_Q, AM_Q | OT_D, AM_NOT_USED, "pcmpgtd", true,
+    /* F2h */ { 0 },
+    /* F3h */ { 0 },
+    /* 66h */ { 0, IT_GENERIC, AM_V | OT_DQ, AM_W | OT_DQ, AM_NOT_USED, "pcmpgtd" } },
+  /* 0x67 */ { 0, IT_GENERIC, AM_P | OT_Q, AM_Q | OT_D, AM_NOT_USED, "packuswb", true,
+    /* F2h */ { 0 },
+    /* F3h */ { 0 },
+    /* 66h */ { 0, IT_GENERIC, AM_V | OT_DQ, AM_W | OT_DQ, AM_NOT_USED, "packuswb" } },
+  /* 0x68 */ { 0, IT_GENERIC, AM_P | OT_Q, AM_Q | OT_D, AM_NOT_USED, "punpckhbw", true,
+    /* F2h */ { 0 },
+    /* F3h */ { 0 },
+    /* 66h */ { 0, IT_GENERIC, AM_P | OT_DQ, AM_Q | OT_DQ, AM_NOT_USED, "punpckhbw" } },
+  /* 0x69 */ { 0, IT_GENERIC, AM_P | OT_Q, AM_Q | OT_D, AM_NOT_USED, "punpckhwd", true,
+    /* F2h */ { 0 },
+    /* F3h */ { 0 },
+    /* 66h */ { 0, IT_GENERIC, AM_P | OT_DQ, AM_Q | OT_DQ, AM_NOT_USED, "punpckhwd" } },
+  /* 0x6A */ { 0, IT_GENERIC, AM_P | OT_Q, AM_Q | OT_D, AM_NOT_USED, "punpckhdq", true,
+    /* F2h */ { 0 },
+    /* F3h */ { 0 },
+    /* 66h */ { 0, IT_GENERIC, AM_P | OT_DQ, AM_Q | OT_DQ, AM_NOT_USED, "punpckhdq" } },
+  /* 0x6B */ { 0, IT_GENERIC, AM_P | OT_Q, AM_Q | OT_D, AM_NOT_USED, "packssdw", true,
+    /* F2h */ { 0 },
+    /* F3h */ { 0 },
+    /* 66h */ { 0, IT_GENERIC, AM_P | OT_DQ, AM_Q | OT_DQ, AM_NOT_USED, "packssdw" } },
+  /* 0x6C */ { 0, IT_GENERIC, AM_NOT_USED, AM_NOT_USED, AM_NOT_USED, "not used without prefix", true,
+    /* F2h */ { 0 },
+    /* F3h */ { 0 },
+    /* 66h */ { 0, IT_GENERIC, AM_V | OT_DQ, AM_W | OT_DQ, AM_NOT_USED, "punpcklqdq" } },
+  /* 0x6D */ { 0, IT_GENERIC, AM_NOT_USED, AM_NOT_USED, AM_NOT_USED, "not used without prefix", true,
+    /* F2h */ { 0 },
+    /* F3h */ { 0 },
+    /* 66h */ { 0, IT_GENERIC, AM_V | OT_DQ, AM_W | OT_DQ, AM_NOT_USED, "punpcklqdq" } },
+  /* 0x6E */ { 0, IT_GENERIC, AM_P | OT_D, AM_E | OT_D, AM_NOT_USED, "movd", true,
+    /* F2h */ { 0 },
+    /* F3h */ { 0 },
+    /* 66h */ { 0, IT_GENERIC, AM_V | OT_DQ, AM_E | OT_D, AM_NOT_USED, "movd" } },
+  /* 0x6F */ { 0, IT_GENERIC, AM_P | OT_Q, AM_Q | OT_D, AM_NOT_USED, "movq", true,
+    /* F2h */ { 0 },
+    /* F3h */ { 0, IT_GENERIC, AM_V | OT_DQ, AM_W | OT_DQ, AM_NOT_USED, "movdqu" },
+    /* 66h */ { 0, IT_GENERIC, AM_V | OT_DQ, AM_W | OT_DQ, AM_NOT_USED, "movdqa" } },
+  /* 0x70 */ { 0, IT_GENERIC, AM_P | OT_Q, AM_Q | OT_Q, AM_I | OT_B, "pshufw", true,
+    /* F2h */ { 0, IT_GENERIC, AM_V | OT_DQ, AM_W | OT_DQ, AM_I | OT_B, "pshuflw" },
+    /* F3h */ { 0, IT_GENERIC, AM_V | OT_DQ, AM_W | OT_DQ, AM_I | OT_B, "pshufhw" },
+    /* 66h */ { 0, IT_GENERIC, AM_V | OT_DQ, AM_W | OT_DQ, AM_I | OT_B, "pshufd" } },
+  /* 0x71 */ { 19, IT_REFERENCE, AM_NOT_USED, AM_NOT_USED, AM_NOT_USED, 0, false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0x72 */ { 20, IT_REFERENCE, AM_NOT_USED, AM_NOT_USED, AM_NOT_USED, 0, false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0x73 */ { 21, IT_REFERENCE, AM_NOT_USED, AM_NOT_USED, AM_NOT_USED, 0, false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0x74 */ { 0, IT_GENERIC, AM_P | OT_Q, AM_Q | OT_Q, AM_NOT_USED, "pcmpeqb", true,
+    /* F2h */ { 0 },
+    /* F3h */ { 0 },
+    /* 66h */ { 0, IT_GENERIC, AM_V | OT_DQ, AM_W | OT_DQ, AM_NOT_USED, "pcmpeqb" } },
+  /* 0x75 */ { 0, IT_GENERIC, AM_P | OT_Q, AM_Q | OT_Q, AM_NOT_USED, "pcmpeqw", true,
+    /* F2h */ { 0 },
+    /* F3h */ { 0 },
+    /* 66h */ { 0, IT_GENERIC, AM_V | OT_DQ, AM_W | OT_DQ, AM_NOT_USED, "pcmpeqw" } },
+  /* 0x76 */ { 0, IT_GENERIC, AM_P | OT_Q, AM_Q | OT_Q, AM_NOT_USED, "pcmpeqd", true,
+    /* F2h */ { 0 },
+    /* F3h */ { 0 },
+    /* 66h */ { 0, IT_GENERIC, AM_V | OT_DQ, AM_W | OT_DQ, AM_NOT_USED, "pcmpeqd" } },
+  /* 0x77 */ { 0, IT_GENERIC, AM_NOT_USED, AM_NOT_USED, AM_NOT_USED, "emms", false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+
+  // The following six opcodes are escapes into MMX instructions, which this
+  // disassembler does not support.
+  /* 0x78 */ { 0, IT_UNKNOWN, AM_NOT_USED, AM_NOT_USED, AM_NOT_USED, 0, false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0x79 */ { 0, IT_UNKNOWN, AM_NOT_USED, AM_NOT_USED, AM_NOT_USED, 0, false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0x7A */ { 0, IT_UNKNOWN, AM_NOT_USED, AM_NOT_USED, AM_NOT_USED, 0, false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0x7B */ { 0, IT_UNKNOWN, AM_NOT_USED, AM_NOT_USED, AM_NOT_USED, 0, false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0x7C */ { 0, IT_UNKNOWN, AM_NOT_USED, AM_NOT_USED, AM_NOT_USED, 0, false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0x7D */ { 0, IT_UNKNOWN, AM_NOT_USED, AM_NOT_USED, AM_NOT_USED, 0, false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+
+  /* 0x7E */ { 0, IT_GENERIC, AM_E | OT_D, AM_P | OT_D, AM_NOT_USED, "movd", true,
+    /* F2h */ { 0 },
+    /* F3h */ { 0, IT_GENERIC, AM_V | OT_Q, AM_W | OT_Q, AM_NOT_USED, "movq" },
+    /* 66h */ { 0, IT_GENERIC, AM_E | OT_D, AM_V | OT_DQ, AM_NOT_USED, "movd" } },
+  /* 0x7F */ { 0, IT_GENERIC, AM_Q | OT_Q, AM_P | OT_Q, AM_NOT_USED, "movq", true,
+    /* F2h */ { 0 },
+    /* F3h */ { 0, IT_GENERIC, AM_W | OT_DQ, AM_V | OT_DQ, AM_NOT_USED, "movdqu" },
+    /* 66h */ { 0, IT_GENERIC, AM_W | OT_DQ, AM_V | OT_DQ, AM_NOT_USED, "movdqa" } },
+  /* 0x80 */ { 0, IT_JUMP, AM_J | OT_V, AM_NOT_USED, AM_NOT_USED, "jo", false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0x81 */ { 0, IT_JUMP, AM_J | OT_V, AM_NOT_USED, AM_NOT_USED, "jno", false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0x82 */ { 0, IT_JUMP, AM_J | OT_V, AM_NOT_USED, AM_NOT_USED, "jc", false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0x83 */ { 0, IT_JUMP, AM_J | OT_V, AM_NOT_USED, AM_NOT_USED, "jnc", false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0x84 */ { 0, IT_JUMP, AM_J | OT_V, AM_NOT_USED, AM_NOT_USED, "jz", false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0x85 */ { 0, IT_JUMP, AM_J | OT_V, AM_NOT_USED, AM_NOT_USED, "jnz", false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0x86 */ { 0, IT_JUMP, AM_J | OT_V, AM_NOT_USED, AM_NOT_USED, "jbe", false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0x87 */ { 0, IT_JUMP, AM_J | OT_V, AM_NOT_USED, AM_NOT_USED, "ja", false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0x88 */ { 0, IT_JUMP, AM_J | OT_V, AM_NOT_USED, AM_NOT_USED, "js", false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0x89 */ { 0, IT_JUMP, AM_J | OT_V, AM_NOT_USED, AM_NOT_USED, "jns", false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0x8A */ { 0, IT_JUMP, AM_J | OT_V, AM_NOT_USED, AM_NOT_USED, "jpe", false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0x8B */ { 0, IT_JUMP, AM_J | OT_V, AM_NOT_USED, AM_NOT_USED, "jpo", false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0x8C */ { 0, IT_JUMP, AM_J | OT_V, AM_NOT_USED, AM_NOT_USED, "jl", false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0x8D */ { 0, IT_JUMP, AM_J | OT_V, AM_NOT_USED, AM_NOT_USED, "jge", false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0x8E */ { 0, IT_JUMP, AM_J | OT_V, AM_NOT_USED, AM_NOT_USED, "jle", false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0x8F */ { 0, IT_JUMP, AM_J | OT_V, AM_NOT_USED, AM_NOT_USED, "jg", false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0x90 */ { 0, IT_GENERIC, AM_E | OT_B, AM_NOT_USED, AM_NOT_USED, "seto", false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0x91 */ { 0, IT_GENERIC, AM_E | OT_B, AM_NOT_USED, AM_NOT_USED, "setno", false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0x92 */ { 0, IT_GENERIC, AM_E | OT_B, AM_NOT_USED, AM_NOT_USED, "setc", false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0x93 */ { 0, IT_GENERIC, AM_E | OT_B, AM_NOT_USED, AM_NOT_USED, "setnc", false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0x94 */ { 0, IT_GENERIC, AM_E | OT_B, AM_NOT_USED, AM_NOT_USED, "setz", false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0x95 */ { 0, IT_GENERIC, AM_E | OT_B, AM_NOT_USED, AM_NOT_USED, "setnz", false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0x96 */ { 0, IT_GENERIC, AM_E | OT_B, AM_NOT_USED, AM_NOT_USED, "setbe", false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0x97 */ { 0, IT_GENERIC, AM_E | OT_B, AM_NOT_USED, AM_NOT_USED, "seta", false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0x98 */ { 0, IT_GENERIC, AM_E | OT_B, AM_NOT_USED, AM_NOT_USED, "sets", false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0x99 */ { 0, IT_GENERIC, AM_E | OT_B, AM_NOT_USED, AM_NOT_USED, "setns", false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0x9A */ { 0, IT_GENERIC, AM_E | OT_B, AM_NOT_USED, AM_NOT_USED, "setpe", false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0x9B */ { 0, IT_GENERIC, AM_E | OT_B, AM_NOT_USED, AM_NOT_USED, "setpo", false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0x9C */ { 0, IT_GENERIC, AM_E | OT_B, AM_NOT_USED, AM_NOT_USED, "setl", false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0x9D */ { 0, IT_GENERIC, AM_E | OT_B, AM_NOT_USED, AM_NOT_USED, "setge", false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0x9E */ { 0, IT_GENERIC, AM_E | OT_B, AM_NOT_USED, AM_NOT_USED, "setle", false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0x9F */ { 0, IT_GENERIC, AM_E | OT_B, AM_NOT_USED, AM_NOT_USED, "setg", false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0xA0 */ { 0, IT_GENERIC, AM_REGISTER | OT_W, AM_NOT_USED, AM_NOT_USED, "push", false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0xA1 */ { 0, IT_GENERIC, AM_REGISTER | OT_W, AM_NOT_USED, AM_NOT_USED, "pop", false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0xA2 */ { 0, IT_GENERIC, AM_NOT_USED, AM_NOT_USED, AM_NOT_USED, "cpuid", false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0xA3 */ { 0, IT_GENERIC, AM_E | OT_V, AM_G | OT_V, AM_NOT_USED, "bt", false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0xA4 */ { 0, IT_GENERIC, AM_E | OT_V, AM_G | OT_V, AM_I | OT_B, "shld", false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0xA5 */ { 0, IT_GENERIC, AM_E | OT_V, AM_G | OT_V, AM_I | OT_B | AM_REGISTER, "shld", false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0xA6 */ { 0, IT_UNKNOWN, AM_NOT_USED, AM_NOT_USED, AM_NOT_USED, 0, false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0xA7 */ { 0, IT_UNKNOWN, AM_NOT_USED, AM_NOT_USED, AM_NOT_USED, 0, false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0xA8 */ { 0, IT_GENERIC, AM_REGISTER | OT_W, AM_NOT_USED, AM_NOT_USED, "push", false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0xA9 */ { 0, IT_GENERIC, AM_REGISTER | OT_W, AM_NOT_USED, AM_NOT_USED, "pop", false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0xAA */ { 0, IT_GENERIC, AM_NOT_USED, AM_NOT_USED, AM_NOT_USED, "rsm", false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0xAB */ { 0, IT_GENERIC, AM_E | OT_V, AM_G | OT_V, AM_NOT_USED, "bts", false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0xAC */ { 0, IT_GENERIC, AM_E | OT_V, AM_G | OT_V, AM_I | OT_B, "shrd", false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0xAD */ { 0, IT_GENERIC, AM_E | OT_V, AM_G | OT_V, AM_I | OT_B | AM_REGISTER, "shrd", false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0xAE */ { 22, IT_REFERENCE, AM_NOT_USED, AM_NOT_USED, AM_NOT_USED, 0, false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0xAF */ { 0, IT_GENERIC, AM_G | OT_V, AM_E | OT_V, AM_NOT_USED, "imul", false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0xB0 */ { 0, IT_GENERIC, AM_E | OT_B, AM_G | OT_B, AM_NOT_USED, "cmpxchg", false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0xB1 */ { 0, IT_GENERIC, AM_E | OT_V, AM_G | OT_V, AM_NOT_USED, "cmpxchg", false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0xB2 */ { 0, IT_GENERIC, AM_M | OT_P, AM_NOT_USED, AM_NOT_USED, "lss", false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0xB3 */ { 0, IT_GENERIC, AM_E | OT_V, AM_G | OT_V, AM_NOT_USED, "btr", false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0xB4 */ { 0, IT_GENERIC, AM_M | OT_P, AM_NOT_USED, AM_NOT_USED, "lfs", false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0xB5 */ { 0, IT_GENERIC, AM_M | OT_P, AM_NOT_USED, AM_NOT_USED, "lgs", false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0xB6 */ { 0, IT_GENERIC, AM_G | OT_V, AM_E | OT_B, AM_NOT_USED, "movzx", false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0xB7 */ { 0, IT_GENERIC, AM_G | OT_V, AM_E | OT_W, AM_NOT_USED, "movzx", false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0xB8 */ { 0, IT_UNKNOWN, AM_NOT_USED, AM_NOT_USED, AM_NOT_USED, 0, false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0xB9 */ { 0, IT_UNKNOWN, AM_NOT_USED, AM_NOT_USED, AM_NOT_USED, "ud1", false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0xBA */ { 23, IT_REFERENCE, AM_NOT_USED, AM_NOT_USED, AM_NOT_USED, 0, false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0xBB */ { 0, IT_GENERIC, AM_E | OT_V, AM_G | OT_V, AM_NOT_USED, "btc", false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0xBC */ { 0, IT_GENERIC, AM_G | OT_V, AM_E | OT_V, AM_NOT_USED, "bsf", false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0xBD */ { 0, IT_GENERIC, AM_G | OT_V, AM_E | OT_V, AM_NOT_USED, "bsr", false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0xBE */ { 0, IT_GENERIC, AM_G | OT_V, AM_E | OT_B, AM_NOT_USED, "movsx", false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0xBF */ { 0, IT_GENERIC, AM_G | OT_V, AM_E | OT_W, AM_NOT_USED, "movsx", false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0xC0 */ { 0, IT_GENERIC, AM_E | OT_B, AM_G | OT_B, AM_NOT_USED, "xadd", false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0xC1 */ { 0, IT_GENERIC, AM_E | OT_V, AM_G | OT_V, AM_NOT_USED, "xadd", false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0xC2 */ { 0, IT_GENERIC, AM_V | OT_PS, AM_W | OT_PS, AM_I | OT_B, "cmpps", true,
+    /* F2h */ { 0, IT_GENERIC, AM_V | OT_SD, AM_W | OT_SD, AM_I | OT_B, "cmpsd" },
+    /* F3h */ { 0, IT_GENERIC, AM_V | OT_SS, AM_W | OT_SS, AM_I | OT_B, "cmpss" },
+    /* 66h */ { 0, IT_GENERIC, AM_V | OT_PD, AM_W | OT_PD, AM_I | OT_B, "cmppd" } },
+  /* 0xC3 */ { 0, IT_GENERIC, AM_E | OT_D, AM_G | OT_D, AM_NOT_USED, "movnti", false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0xC4 */ { 0, IT_GENERIC, AM_P | OT_Q, AM_E | OT_D, AM_I | OT_B, "pinsrw", true,
+    /* F2h */ { 0 },
+    /* F3h */ { 0 },
+    /* 66h */ { 0, IT_GENERIC, AM_V | OT_DQ, AM_E | OT_D, AM_I | OT_B, "pinsrw" } },
+  /* 0xC5 */ { 0, IT_GENERIC, AM_G | OT_D, AM_P | OT_Q, AM_I | OT_B, "pextrw", true,
+    /* F2h */ { 0 },
+    /* F3h */ { 0 },
+    /* 66h */ { 0, IT_GENERIC, AM_G | OT_D, AM_V | OT_DQ, AM_I | OT_B, "pextrw" } },
+  /* 0xC6 */ { 0, IT_GENERIC, AM_V | OT_PS, AM_W | OT_PS, AM_I | OT_B, "shufps", true,
+    /* F2h */ { 0 },
+    /* F3h */ { 0 },
+    /* 66h */ { 0, IT_GENERIC, AM_V | OT_PD, AM_W | OT_PD, AM_I | OT_B, "shufpd" } },
+  /* 0xC7 */ { 24, IT_REFERENCE, AM_NOT_USED, AM_NOT_USED, AM_NOT_USED, 0, false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0xC8 */ { 0, IT_GENERIC, AM_REGISTER | OT_D, AM_NOT_USED, AM_NOT_USED, "bswap", false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0xC9 */ { 0, IT_GENERIC, AM_REGISTER | OT_D, AM_NOT_USED, AM_NOT_USED, "bswap", false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0xCA */ { 0, IT_GENERIC, AM_REGISTER | OT_D, AM_NOT_USED, AM_NOT_USED, "bswap", false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0xCB */ { 0, IT_GENERIC, AM_REGISTER | OT_D, AM_NOT_USED, AM_NOT_USED, "bswap", false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0xCC */ { 0, IT_GENERIC, AM_REGISTER | OT_D, AM_NOT_USED, AM_NOT_USED, "bswap", false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0xCD */ { 0, IT_GENERIC, AM_REGISTER | OT_D, AM_NOT_USED, AM_NOT_USED, "bswap", false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0xCE */ { 0, IT_GENERIC, AM_REGISTER | OT_D, AM_NOT_USED, AM_NOT_USED, "bswap", false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0xCF */ { 0, IT_GENERIC, AM_REGISTER | OT_D, AM_NOT_USED, AM_NOT_USED, "bswap", false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0xD0 */ { 0, IT_UNUSED, AM_NOT_USED, AM_NOT_USED, AM_NOT_USED, 0, false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0xD1 */ { 0, IT_GENERIC, AM_P | OT_Q, AM_Q | OT_Q, AM_NOT_USED, "psrlw", true,
+    /* F2h */ { 0 },
+    /* F3h */ { 0 },
+    /* 66h */ { 0, IT_GENERIC, AM_V | OT_DQ, AM_W | OT_DQ, AM_NOT_USED, "psrlw" } },
+  /* 0xD2 */ { 0, IT_GENERIC, AM_P | OT_Q, AM_Q | OT_Q, AM_NOT_USED, "psrld", true,
+    /* F2h */ { 0 },
+    /* F3h */ { 0 },
+    /* 66h */ { 0, IT_GENERIC, AM_V | OT_DQ, AM_W | OT_DQ, AM_NOT_USED, "psrld" } },
+  /* 0xD3 */ { 0, IT_GENERIC, AM_P | OT_Q, AM_Q | OT_Q, AM_NOT_USED, "psrlq", true,
+    /* F2h */ { 0 },
+    /* F3h */ { 0 },
+    /* 66h */ { 0, IT_GENERIC, AM_V | OT_DQ, AM_W | OT_DQ, AM_NOT_USED, "psrlq" } },
+  /* 0xD4 */ { 0, IT_GENERIC, AM_P | OT_Q, AM_Q | OT_Q, AM_NOT_USED, "paddq", true,
+    /* F2h */ { 0 },
+    /* F3h */ { 0 },
+    /* 66h */ { 0, IT_GENERIC, AM_V | OT_DQ, AM_W | OT_DQ, AM_NOT_USED, "paddq" } },
+  /* 0xD5 */ { 0, IT_GENERIC, AM_P | OT_Q, AM_Q | OT_Q, AM_NOT_USED, "pmullw", true,
+    /* F2h */ { 0 },
+    /* F3h */ { 0 },
+    /* 66h */ { 0, IT_GENERIC, AM_V | OT_DQ, AM_W | OT_DQ, AM_NOT_USED, "pmullw" } },
+  /* 0xD6 */ { 0, IT_GENERIC, AM_NOT_USED, AM_NOT_USED, AM_NOT_USED, "not used without prefix", true,
+    /* F2h */ { 0, IT_GENERIC, AM_P | OT_Q, AM_W | OT_Q, AM_NOT_USED, "movdq2q" },
+    /* F3h */ { 0, IT_GENERIC, AM_V | OT_DQ, AM_Q | OT_Q, AM_NOT_USED, "movq2dq" },
+    /* 66h */ { 0, IT_GENERIC, AM_W | OT_Q, AM_V | OT_Q, AM_NOT_USED, "movq" } },
+  /* 0xD7 */ { 0, IT_GENERIC, AM_G | OT_D, AM_P | OT_Q, AM_NOT_USED, "pmovmskb", true,
+    /* F2h */ { 0 },
+    /* F3h */ { 0 },
+    /* 66h */ { 0, IT_GENERIC, AM_G | OT_D, AM_V | OT_DQ, AM_NOT_USED, "pmovmskb" } },
+  /* 0xD8 */ { 0, IT_GENERIC, AM_P | OT_Q, AM_Q | OT_Q, AM_NOT_USED, "psubusb", true,
+    /* F2h */ { 0 },
+    /* F3h */ { 0 },
+    /* 66h */ { 0, IT_GENERIC, AM_V | OT_DQ, AM_W | OT_DQ, AM_NOT_USED, "psubusb" } },
+  /* 0xD9 */ { 0, IT_GENERIC, AM_P | OT_Q, AM_Q | OT_Q, AM_NOT_USED, "psubusw", true,
+    /* F2h */ { 0 },
+    /* F3h */ { 0 },
+    /* 66h */ { 0, IT_GENERIC, AM_V | OT_DQ, AM_W | OT_DQ, AM_NOT_USED, "psubusw" } },
+  /* 0xDA */ { 0, IT_GENERIC, AM_P | OT_Q, AM_Q | OT_Q, AM_NOT_USED, "pminub", true,
+    /* F2h */ { 0 },
+    /* F3h */ { 0 },
+    /* 66h */ { 0, IT_GENERIC, AM_V | OT_DQ, AM_W | OT_DQ, AM_NOT_USED, "pminub" } },
+  /* 0xDB */ { 0, IT_GENERIC, AM_P | OT_Q, AM_Q | OT_Q, AM_NOT_USED, "pand", true,
+    /* F2h */ { 0 },
+    /* F3h */ { 0 },
+    /* 66h */ { 0, IT_GENERIC, AM_V | OT_DQ, AM_W | OT_DQ, AM_NOT_USED, "pand" } },
+  /* 0xDC */ { 0, IT_GENERIC, AM_P | OT_Q, AM_Q | OT_Q, AM_NOT_USED, "paddusb", true,
+    /* F2h */ { 0 },
+    /* F3h */ { 0 },
+    /* 66h */ { 0, IT_GENERIC, AM_V | OT_DQ, AM_W | OT_DQ, AM_NOT_USED, "paddusb" } },
+  /* 0xDD */ { 0, IT_GENERIC, AM_P | OT_Q, AM_Q | OT_Q, AM_NOT_USED, "paddusw", true,
+    /* F2h */ { 0 },
+    /* F3h */ { 0 },
+    /* 66h */ { 0, IT_GENERIC, AM_V | OT_DQ, AM_W | OT_DQ, AM_NOT_USED, "paddusw" } },
+  /* 0xDE */ { 0, IT_GENERIC, AM_P | OT_Q, AM_Q | OT_Q, AM_NOT_USED, "pmaxub", true,
+    /* F2h */ { 0 },
+    /* F3h */ { 0 },
+    /* 66h */ { 0, IT_GENERIC, AM_V | OT_DQ, AM_W | OT_DQ, AM_NOT_USED, "pmaxub" } },
+  /* 0xDF */ { 0, IT_GENERIC, AM_P | OT_Q, AM_Q | OT_Q, AM_NOT_USED, "pandn", true,
+    /* F2h */ { 0 },
+    /* F3h */ { 0 },
+    /* 66h */ { 0, IT_GENERIC, AM_V | OT_DQ, AM_W | OT_DQ, AM_NOT_USED, "pandn" } },
+  /* 0xE0 */ { 0, IT_GENERIC, AM_P | OT_Q, AM_Q | OT_Q, AM_NOT_USED, "pavgb", true,
+    /* F2h */ { 0 },
+    /* F3h */ { 0 },
+    /* 66h */ { 0, IT_GENERIC, AM_V | OT_DQ, AM_W | OT_DQ, AM_NOT_USED, "pavgb" } },
+  /* 0xE1 */ { 0, IT_GENERIC, AM_P | OT_Q, AM_Q | OT_Q, AM_NOT_USED, "psraw", true,
+    /* F2h */ { 0 },
+    /* F3h */ { 0 },
+    /* 66h */ { 0, IT_GENERIC, AM_V | OT_DQ, AM_W | OT_DQ, AM_NOT_USED, "psrqw" } },
+  /* 0xE2 */ { 0, IT_GENERIC, AM_P | OT_Q, AM_Q | OT_Q, AM_NOT_USED, "psrad", true,
+    /* F2h */ { 0 },
+    /* F3h */ { 0 },
+    /* 66h */ { 0, IT_GENERIC, AM_V | OT_DQ, AM_W | OT_DQ, AM_NOT_USED, "psrad" } },
+  /* 0xE3 */ { 0, IT_GENERIC, AM_P | OT_Q, AM_Q | OT_Q, AM_NOT_USED, "pavgw", true,
+    /* F2h */ { 0 },
+    /* F3h */ { 0 },
+    /* 66h */ { 0, IT_GENERIC, AM_V | OT_DQ, AM_W | OT_DQ, AM_NOT_USED, "pavgw" } },
+  /* 0xE4 */ { 0, IT_GENERIC, AM_P | OT_Q, AM_Q | OT_Q, AM_NOT_USED, "pmulhuw", true,
+    /* F2h */ { 0 },
+    /* F3h */ { 0 },
+    /* 66h */ { 0, IT_GENERIC, AM_V | OT_DQ, AM_W | OT_DQ, AM_NOT_USED, "pmulhuw" } },
+  /* 0xE5 */ { 0, IT_GENERIC, AM_P | OT_Q, AM_Q | OT_Q, AM_NOT_USED, "pmulhw", true,
+    /* F2h */ { 0 },
+    /* F3h */ { 0 },
+    /* 66h */ { 0, IT_GENERIC, AM_V | OT_DQ, AM_W | OT_DQ, AM_NOT_USED, "pmulhw" } },
+  /* 0xE6 */ { 0, IT_GENERIC, AM_NOT_USED, AM_NOT_USED, AM_NOT_USED, "not used without prefix", true,
+    /* F2h */ { 0, IT_GENERIC, AM_V | OT_DQ, AM_W | OT_PD, AM_NOT_USED, "cvtpd2dq" },
+    /* F3h */ { 0, IT_GENERIC, AM_V | OT_PD, AM_W | OT_DQ, AM_NOT_USED, "cvtdq2pd" },
+    /* 66h */ { 0, IT_GENERIC, AM_V | OT_DQ, AM_W | OT_PD, AM_NOT_USED, "cvttpd2dq" } },
+  /* 0xE7 */ { 0, IT_GENERIC, AM_W | OT_Q, AM_V | OT_Q, AM_NOT_USED, "movntq", true,
+    /* F2h */ { 0 },
+    /* F3h */ { 0 },
+    /* 66h */ { 0, IT_GENERIC, AM_W | OT_DQ, AM_V | OT_DQ, AM_NOT_USED, "movntdq" } },
+  /* 0xE8 */ { 0, IT_GENERIC, AM_P | OT_Q, AM_Q | OT_Q, AM_NOT_USED, "psubsb", true,
+    /* F2h */ { 0 },
+    /* F3h */ { 0 },
+    /* 66h */ { 0, IT_GENERIC, AM_V | OT_DQ, AM_W | OT_DQ, AM_NOT_USED, "psubsb" } },
+  /* 0xE9 */ { 0, IT_GENERIC, AM_P | OT_Q, AM_Q | OT_Q, AM_NOT_USED, "psubsw", true,
+    /* F2h */ { 0 },
+    /* F3h */ { 0 },
+    /* 66h */ { 0, IT_GENERIC, AM_V | OT_DQ, AM_W | OT_DQ, AM_NOT_USED, "psubsw" } },
+  /* 0xEA */ { 0, IT_GENERIC, AM_P | OT_Q, AM_Q | OT_Q, AM_NOT_USED, "pminsw", true,
+    /* F2h */ { 0 },
+    /* F3h */ { 0 },
+    /* 66h */ { 0, IT_GENERIC, AM_V | OT_DQ, AM_W | OT_DQ, AM_NOT_USED, "pminsw" } },
+  /* 0xEB */ { 0, IT_GENERIC, AM_P | OT_Q, AM_Q | OT_Q, AM_NOT_USED, "por", true,
+    /* F2h */ { 0 },
+    /* F3h */ { 0 },
+    /* 66h */ { 0, IT_GENERIC, AM_V | OT_DQ, AM_W | OT_DQ, AM_NOT_USED, "por" } },
+  /* 0xEC */ { 0, IT_GENERIC, AM_P | OT_Q, AM_Q | OT_Q, AM_NOT_USED, "paddsb", true,
+    /* F2h */ { 0 },
+    /* F3h */ { 0 },
+    /* 66h */ { 0, IT_GENERIC, AM_V | OT_DQ, AM_W | OT_DQ, AM_NOT_USED, "paddsb" } },
+  /* 0xED */ { 0, IT_GENERIC, AM_P | OT_Q, AM_Q | OT_Q, AM_NOT_USED, "paddsw", true,
+    /* F2h */ { 0 },
+    /* F3h */ { 0 },
+    /* 66h */ { 0, IT_GENERIC, AM_V | OT_DQ, AM_W | OT_DQ, AM_NOT_USED, "paddsw" } },
+  /* 0xEE */ { 0, IT_GENERIC, AM_P | OT_Q, AM_Q | OT_Q, AM_NOT_USED, "pmaxsw", true,
+    /* F2h */ { 0 },
+    /* F3h */ { 0 },
+    /* 66h */ { 0, IT_GENERIC, AM_V | OT_DQ, AM_W | OT_DQ, AM_NOT_USED, "pmaxsw" } },
+  /* 0xEF */ { 0, IT_GENERIC, AM_P | OT_Q, AM_Q | OT_Q, AM_NOT_USED, "pxor", true,
+    /* F2h */ { 0 },
+    /* F3h */ { 0 },
+    /* 66h */ { 0, IT_GENERIC, AM_V | OT_DQ, AM_W | OT_DQ, AM_NOT_USED, "pxor" } },
+  /* 0xF0 */ { 0, IT_UNUSED, AM_NOT_USED, AM_NOT_USED, AM_NOT_USED, 0, false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0xF1 */ { 0, IT_GENERIC, AM_P | OT_Q, AM_Q | OT_Q, AM_NOT_USED, "psllw", true,
+    /* F2h */ { 0 },
+    /* F3h */ { 0 },
+    /* 66h */ { 0, IT_GENERIC, AM_V | OT_DQ, AM_W | OT_DQ, AM_NOT_USED, "psllw" } },
+  /* 0xF2 */ { 0, IT_GENERIC, AM_P | OT_Q, AM_Q | OT_Q, AM_NOT_USED, "pslld", true,
+    /* F2h */ { 0 },
+    /* F3h */ { 0 },
+    /* 66h */ { 0, IT_GENERIC, AM_V | OT_DQ, AM_W | OT_DQ, AM_NOT_USED, "pslld" } },
+  /* 0xF3 */ { 0, IT_GENERIC, AM_P | OT_Q, AM_Q | OT_Q, AM_NOT_USED, "psllq", true,
+    /* F2h */ { 0 },
+    /* F3h */ { 0 },
+    /* 66h */ { 0, IT_GENERIC, AM_V | OT_DQ, AM_W | OT_DQ, AM_NOT_USED, "psllq" } },
+  /* 0xF4 */ { 0, IT_GENERIC, AM_P | OT_Q, AM_Q | OT_Q, AM_NOT_USED, "pmuludq", true,
+    /* F2h */ { 0 },
+    /* F3h */ { 0 },
+    /* 66h */ { 0, IT_GENERIC, AM_V | OT_DQ, AM_W | OT_DQ, AM_NOT_USED, "pmuludq" } },
+  /* 0xF5 */ { 0, IT_GENERIC, AM_P | OT_Q, AM_Q | OT_Q, AM_NOT_USED, "pmaddwd", true,
+    /* F2h */ { 0 },
+    /* F3h */ { 0 },
+    /* 66h */ { 0, IT_GENERIC, AM_V | OT_DQ, AM_W | OT_DQ, AM_NOT_USED, "pmaddwd" } },
+  /* 0xF6 */ { 0, IT_GENERIC, AM_P | OT_Q, AM_Q | OT_Q, AM_NOT_USED, "psadbw", true,
+    /* F2h */ { 0 },
+    /* F3h */ { 0 },
+    /* 66h */ { 0, IT_GENERIC, AM_V | OT_DQ, AM_W | OT_DQ, AM_NOT_USED, "psadbw" } },
+  /* 0xF7 */ { 0, IT_GENERIC, AM_P | OT_PI, AM_Q | OT_PI, AM_NOT_USED, "maskmovq", true,
+    /* F2h */ { 0 },
+    /* F3h */ { 0 },
+    /* 66h */ { 0, IT_GENERIC, AM_V | OT_DQ, AM_W | OT_DQ, AM_NOT_USED, "maskmovdqu" } },
+  /* 0xF8 */ { 0, IT_GENERIC, AM_P | OT_Q, AM_Q | OT_Q, AM_NOT_USED, "psubb", true,
+    /* F2h */ { 0 },
+    /* F3h */ { 0 },
+    /* 66h */ { 0, IT_GENERIC, AM_V | OT_DQ, AM_W | OT_DQ, AM_NOT_USED, "psubb" } },
+  /* 0xF9 */ { 0, IT_GENERIC, AM_P | OT_Q, AM_Q | OT_Q, AM_NOT_USED, "psubw", true,
+    /* F2h */ { 0 },
+    /* F3h */ { 0 },
+    /* 66h */ { 0, IT_GENERIC, AM_V | OT_DQ, AM_W | OT_DQ, AM_NOT_USED, "psubw" } },
+  /* 0xFA */ { 0, IT_GENERIC, AM_P | OT_Q, AM_Q | OT_Q, AM_NOT_USED, "psubd", true,
+    /* F2h */ { 0 },
+    /* F3h */ { 0 },
+    /* 66h */ { 0, IT_GENERIC, AM_V | OT_DQ, AM_W | OT_DQ, AM_NOT_USED, "psubd" } },
+  /* 0xFB */ { 0, IT_GENERIC, AM_P | OT_Q, AM_Q | OT_Q, AM_NOT_USED, "psubq", true,
+    /* F2h */ { 0 },
+    /* F3h */ { 0 },
+    /* 66h */ { 0, IT_GENERIC, AM_V | OT_DQ, AM_W | OT_DQ, AM_NOT_USED, "psubq" } },
+  /* 0xFC */ { 0, IT_GENERIC, AM_P | OT_Q, AM_Q | OT_Q, AM_NOT_USED, "paddb", true,
+    /* F2h */ { 0 },
+    /* F3h */ { 0 },
+    /* 66h */ { 0, IT_GENERIC, AM_V | OT_DQ, AM_W | OT_DQ, AM_NOT_USED, "paddb" } },
+  /* 0xFD */ { 0, IT_GENERIC, AM_P | OT_Q, AM_Q | OT_Q, AM_NOT_USED, "paddw", true,
+    /* F2h */ { 0 },
+    /* F3h */ { 0 },
+    /* 66h */ { 0, IT_GENERIC, AM_V | OT_DQ, AM_W | OT_DQ, AM_NOT_USED, "paddw" } },
+  /* 0xFE */ { 0, IT_GENERIC, AM_P | OT_Q, AM_Q | OT_Q, AM_NOT_USED, "paddd", true,
+    /* F2h */ { 0 },
+    /* F3h */ { 0 },
+    /* 66h */ { 0, IT_GENERIC, AM_V | OT_DQ, AM_W | OT_DQ, AM_NOT_USED, "paddd" } },
+  /* 0xFF */ { 0, IT_UNUSED, AM_NOT_USED, AM_NOT_USED, AM_NOT_USED, 0, false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } }
+};
+
+const Opcode s_opcode_byte_after_0f00[] = {
+  /* 0x0 */ { 0, IT_GENERIC, AM_E | OT_W, AM_NOT_USED, AM_NOT_USED, "sldt", false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0x1 */ { 0, IT_GENERIC, AM_E | OT_W, AM_NOT_USED, AM_NOT_USED, "str", false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0x2 */ { 0, IT_GENERIC, AM_E | OT_W, AM_NOT_USED, AM_NOT_USED, "lldt", false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0x3 */ { 0, IT_GENERIC, AM_E | OT_W, AM_NOT_USED, AM_NOT_USED, "ltr", false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0x4 */ { 0, IT_GENERIC, AM_E | OT_W, AM_NOT_USED, AM_NOT_USED, "verr", false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0x5 */ { 0, IT_GENERIC, AM_E | OT_W, AM_NOT_USED, AM_NOT_USED, "verw", false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0x6 */ { 0, IT_UNUSED, AM_NOT_USED, AM_NOT_USED, AM_NOT_USED, 0, false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0x7 */ { 0, IT_UNUSED, AM_NOT_USED, AM_NOT_USED, AM_NOT_USED, 0, false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } }
+};
+
+const Opcode s_opcode_byte_after_0f01[] = {
+  /* 0x0 */ { 0, IT_GENERIC, AM_M | OT_S, AM_NOT_USED, AM_NOT_USED, "sgdt", false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0x1 */ { 0, IT_GENERIC, AM_M | OT_S, AM_NOT_USED, AM_NOT_USED, "sidt", false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0x2 */ { 0, IT_GENERIC, AM_M | OT_S, AM_NOT_USED, AM_NOT_USED, "lgdt", false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0x3 */ { 0, IT_GENERIC, AM_M | OT_S, AM_NOT_USED, AM_NOT_USED, "lidt", false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0x4 */ { 0, IT_GENERIC, AM_E | OT_W, AM_NOT_USED, AM_NOT_USED, "smsw", false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0x5 */ { 0, IT_UNUSED, AM_NOT_USED, AM_NOT_USED, AM_NOT_USED, 0, false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0x6 */ { 0, IT_GENERIC, AM_E | OT_W, AM_NOT_USED, AM_NOT_USED, "lmsw", false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0x7 */ { 0, IT_GENERIC, AM_M | OT_B, AM_NOT_USED, AM_NOT_USED, "invlpg", false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } }
+};
+
+const Opcode s_opcode_byte_after_0f18[] = {
+  /* 0x0 */ { 0, IT_GENERIC, AM_M | OT_ADDRESS_MODE_M, AM_NOT_USED, AM_NOT_USED, "prefetch", false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0x1 */ { 0, IT_GENERIC, AM_REGISTER | OT_D, AM_NOT_USED, AM_NOT_USED, "prefetch", false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0x2 */ { 0, IT_GENERIC, AM_REGISTER | OT_D, AM_NOT_USED, AM_NOT_USED, "prefetch", false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0x3 */ { 0, IT_GENERIC, AM_REGISTER | OT_D, AM_NOT_USED, AM_NOT_USED, "prefetch", false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0x4 */ { 0, IT_UNUSED, AM_NOT_USED, AM_NOT_USED, AM_NOT_USED, 0, false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0x5 */ { 0, IT_UNUSED, AM_NOT_USED, AM_NOT_USED, AM_NOT_USED, 0, false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0x6 */ { 0, IT_UNUSED, AM_NOT_USED, AM_NOT_USED, AM_NOT_USED, 0, false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0x7 */ { 0, IT_UNUSED, AM_NOT_USED, AM_NOT_USED, AM_NOT_USED, 0, false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } }
+};
+
+const Opcode s_opcode_byte_after_0f71[] = {
+  /* 0x0 */ { 0, IT_UNUSED, AM_NOT_USED, AM_NOT_USED, AM_NOT_USED, 0, false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0x1 */ { 0, IT_UNUSED, AM_NOT_USED, AM_NOT_USED, AM_NOT_USED, 0, false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0x2 */ { 0, IT_GENERIC, AM_P | OT_Q, AM_I | OT_B, AM_NOT_USED, "psrlw", true,
+    /* F2h */ { 0 },
+    /* F3h */ { 0 },
+    /* 66h */ { 0, IT_GENERIC, AM_P | OT_DQ, AM_I | OT_B, AM_NOT_USED, "psrlw" } },
+  /* 0x3 */ { 0, IT_UNUSED, AM_NOT_USED, AM_NOT_USED, AM_NOT_USED, 0, false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0x4 */ { 0, IT_GENERIC, AM_P | OT_Q, AM_I | OT_B, AM_NOT_USED, "psraw", true,
+    /* F2h */ { 0 },
+    /* F3h */ { 0 },
+    /* 66h */ { 0, IT_GENERIC, AM_P | OT_DQ, AM_I | OT_B, AM_NOT_USED, "psraw" } },
+  /* 0x5 */ { 0, IT_UNUSED, AM_NOT_USED, AM_NOT_USED, AM_NOT_USED, 0, false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0x6 */ { 0, IT_GENERIC, AM_P | OT_Q, AM_I | OT_B, AM_NOT_USED, "psllw", true,
+    /* F2h */ { 0 },
+    /* F3h */ { 0 },
+    /* 66h */ { 0, IT_GENERIC, AM_P | OT_DQ, AM_I | OT_B, AM_NOT_USED, "psllw" } },
+  /* 0x7 */ { 0, IT_UNUSED, AM_NOT_USED, AM_NOT_USED, AM_NOT_USED, 0, false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } }
+};
+
+const Opcode s_opcode_byte_after_0f72[] = {
+  /* 0x0 */ { 0, IT_UNUSED, AM_NOT_USED, AM_NOT_USED, AM_NOT_USED, 0, false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0x1 */ { 0, IT_UNUSED, AM_NOT_USED, AM_NOT_USED, AM_NOT_USED, 0, false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0x2 */ { 0, IT_GENERIC, AM_P | OT_Q, AM_I | OT_B, AM_NOT_USED, "psrld", true,
+    /* F2h */ { 0 },
+    /* F3h */ { 0 },
+    /* 66h */ { 0, IT_GENERIC, AM_W | OT_DQ, AM_I | OT_B, AM_NOT_USED, "psrld" } },
+  /* 0x3 */ { 0, IT_UNUSED, AM_NOT_USED, AM_NOT_USED, AM_NOT_USED, 0, false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0x4 */ { 0, IT_GENERIC, AM_P | OT_Q, AM_I | OT_B, AM_NOT_USED, "psrad", true,
+    /* F2h */ { 0 },
+    /* F3h */ { 0 },
+    /* 66h */ { 0, IT_GENERIC, AM_W | OT_DQ, AM_I | OT_B, AM_NOT_USED, "psrad" } },
+  /* 0x5 */ { 0, IT_UNUSED, AM_NOT_USED, AM_NOT_USED, AM_NOT_USED, 0, false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0x6 */ { 0, IT_GENERIC, AM_P | OT_Q, AM_I | OT_B, AM_NOT_USED, "pslld", true,
+    /* F2h */ { 0 },
+    /* F3h */ { 0 },
+    /* 66h */ { 0, IT_GENERIC, AM_W | OT_DQ, AM_I | OT_B, AM_NOT_USED, "pslld" } },
+  /* 0x7 */ { 0, IT_UNUSED, AM_NOT_USED, AM_NOT_USED, AM_NOT_USED, 0, false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } }
+};
+
+const Opcode s_opcode_byte_after_0f73[] = {
+  /* 0x0 */ { 0, IT_UNUSED, AM_NOT_USED, AM_NOT_USED, AM_NOT_USED, 0, false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0x1 */ { 0, IT_UNUSED, AM_NOT_USED, AM_NOT_USED, AM_NOT_USED, 0, false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0x2 */ { 0, IT_GENERIC, AM_P | OT_Q, AM_I | OT_B, AM_NOT_USED, "psrlq", true,
+    /* F2h */ { 0 },
+    /* F3h */ { 0 },
+    /* 66h */ { 0, IT_GENERIC, AM_W | OT_DQ, AM_I | OT_B, AM_NOT_USED, "psrlq" } },
+  /* 0x3 */ { 0, IT_UNUSED, AM_NOT_USED, AM_NOT_USED, AM_NOT_USED, 0, false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0x4 */ { 0, IT_UNUSED, AM_NOT_USED, AM_NOT_USED, AM_NOT_USED, 0, false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0x5 */ { 0, IT_UNUSED, AM_NOT_USED, AM_NOT_USED, AM_NOT_USED, 0, false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0x6 */ { 0, IT_GENERIC, AM_P | OT_Q, AM_I | OT_B, AM_NOT_USED, "psllq", true,
+    /* F2h */ { 0 },
+    /* F3h */ { 0 },
+    /* 66h */ { 0, IT_GENERIC, AM_W | OT_DQ, AM_I | OT_B, AM_NOT_USED, "psllq" } },
+  /* 0x7 */ { 0, IT_GENERIC, AM_W | OT_DQ, AM_I | OT_B, AM_NOT_USED, "pslldq", true,
+    /* F2h */ { 0 },
+    /* F3h */ { 0 },
+    /* 66h */ { 0, IT_GENERIC, AM_W | OT_DQ, AM_I | OT_B, AM_NOT_USED, "pslldq" } },
+};
+
+const Opcode s_opcode_byte_after_0fae[] = {
+  /* 0x0 */ { 0, IT_GENERIC, AM_NOT_USED, AM_NOT_USED, AM_NOT_USED, "fxsave", false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0x1 */ { 0, IT_GENERIC, AM_NOT_USED, AM_NOT_USED, AM_NOT_USED, "fxrstor", false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0x2 */ { 0, IT_GENERIC, AM_NOT_USED, AM_NOT_USED, AM_NOT_USED, "ldmxcsr", false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0x3 */ { 0, IT_GENERIC, AM_NOT_USED, AM_NOT_USED, AM_NOT_USED, "stmxcsr", false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0x4 */ { 0, IT_UNUSED, AM_NOT_USED, AM_NOT_USED, AM_NOT_USED, 0, false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0x5 */ { 0, IT_GENERIC, AM_NOT_USED, AM_NOT_USED, AM_NOT_USED, "lfence", false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0x6 */ { 0, IT_GENERIC, AM_NOT_USED, AM_NOT_USED, AM_NOT_USED, "mfence", false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0x7 */ { 0, IT_GENERIC, AM_NOT_USED, AM_NOT_USED, AM_NOT_USED, "clflush/sfence", false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+};
+
+const Opcode s_opcode_byte_after_0fba[] = {
+  /* 0x0 */ { 0, IT_UNUSED, AM_NOT_USED, AM_NOT_USED, AM_NOT_USED, 0, false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0x1 */ { 0, IT_UNUSED, AM_NOT_USED, AM_NOT_USED, AM_NOT_USED, 0, false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0x2 */ { 0, IT_UNUSED, AM_NOT_USED, AM_NOT_USED, AM_NOT_USED, 0, false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0x3 */ { 0, IT_UNUSED, AM_NOT_USED, AM_NOT_USED, AM_NOT_USED, 0, false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0x4 */ { 0, IT_GENERIC, AM_E | OT_V, AM_I | OT_B, AM_NOT_USED, "bt", false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0x5 */ { 0, IT_GENERIC, AM_E | OT_V, AM_I | OT_B, AM_NOT_USED, "bts", false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0x6 */ { 0, IT_GENERIC, AM_E | OT_V, AM_I | OT_B, AM_NOT_USED, "btr", false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0x7 */ { 0, IT_GENERIC, AM_E | OT_V, AM_I | OT_B, AM_NOT_USED, "btc", false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } }
+};
+
+const Opcode s_opcode_byte_after_0fc7[] = {
+  /* 0x0 */ { 0, IT_UNUSED, AM_NOT_USED, AM_NOT_USED, AM_NOT_USED, 0, false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0x1 */ { 0, IT_GENERIC, AM_M | OT_Q, AM_NOT_USED, AM_NOT_USED, "cmpxch8b", false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } }
+};
+
+const Opcode s_opcode_byte_after_80[] = {
+  /* 0x0 */ { 0, IT_GENERIC, AM_E | OT_B, AM_I | OT_B, AM_NOT_USED, "add", false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0x1 */ { 0, IT_GENERIC, AM_E | OT_B, AM_I | OT_B, AM_NOT_USED, "or", false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0x2 */ { 0, IT_GENERIC, AM_E | OT_B, AM_I | OT_B, AM_NOT_USED, "adc", false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0x3 */ { 0, IT_GENERIC, AM_E | OT_B, AM_I | OT_B, AM_NOT_USED, "sbb", false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0x4 */ { 0, IT_GENERIC, AM_E | OT_B, AM_I | OT_B, AM_NOT_USED, "and", false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0x5 */ { 0, IT_GENERIC, AM_E | OT_B, AM_I | OT_B, AM_NOT_USED, "sub", false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0x6 */ { 0, IT_GENERIC, AM_E | OT_B, AM_I | OT_B, AM_NOT_USED, "xor", false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0x7 */ { 0, IT_GENERIC, AM_E | OT_B, AM_I | OT_B, AM_NOT_USED, "cmp", false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } }
+};
+
+const Opcode s_opcode_byte_after_81[] = {
+  /* 0x0 */ { 0, IT_GENERIC, AM_E | OT_V, AM_I | OT_V, AM_NOT_USED, "add", false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0x1 */ { 0, IT_GENERIC, AM_E | OT_V, AM_I | OT_V, AM_NOT_USED, "or", false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0x2 */ { 0, IT_GENERIC, AM_E | OT_V, AM_I | OT_V, AM_NOT_USED, "adc", false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0x3 */ { 0, IT_GENERIC, AM_E | OT_V, AM_I | OT_V, AM_NOT_USED, "sbb", false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0x4 */ { 0, IT_GENERIC, AM_E | OT_V, AM_I | OT_V, AM_NOT_USED, "and", false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0x5 */ { 0, IT_GENERIC, AM_E | OT_V, AM_I | OT_V, AM_NOT_USED, "sub", false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0x6 */ { 0, IT_GENERIC, AM_E | OT_V, AM_I | OT_V, AM_NOT_USED, "xor", false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0x7 */ { 0, IT_GENERIC, AM_E | OT_V, AM_I | OT_V, AM_NOT_USED, "cmp", false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } }
+};
+
+const Opcode s_opcode_byte_after_82[] = {
+  /* 0x0 */ { 0, IT_GENERIC, AM_E | OT_V, AM_I | OT_B, AM_NOT_USED, "add", false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0x1 */ { 0, IT_GENERIC, AM_E | OT_V, AM_I | OT_B, AM_NOT_USED, "or", false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0x2 */ { 0, IT_GENERIC, AM_E | OT_V, AM_I | OT_B, AM_NOT_USED, "adc", false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0x3 */ { 0, IT_GENERIC, AM_E | OT_V, AM_I | OT_B, AM_NOT_USED, "sbb", false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0x4 */ { 0, IT_GENERIC, AM_E | OT_V, AM_I | OT_B, AM_NOT_USED, "and", false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0x5 */ { 0, IT_GENERIC, AM_E | OT_V, AM_I | OT_B, AM_NOT_USED, "sub", false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0x6 */ { 0, IT_GENERIC, AM_E | OT_V, AM_I | OT_B, AM_NOT_USED, "xor", false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0x7 */ { 0, IT_GENERIC, AM_E | OT_V, AM_I | OT_B, AM_NOT_USED, "cmp", false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } }
+};
+
+const Opcode s_opcode_byte_after_83[] = {
+  /* 0x0 */ { 0, IT_GENERIC, AM_E | OT_V, AM_I | OT_B, AM_NOT_USED, "add", false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0x1 */ { 0, IT_GENERIC, AM_E | OT_V, AM_I | OT_B, AM_NOT_USED, "or", false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0x2 */ { 0, IT_GENERIC, AM_E | OT_V, AM_I | OT_B, AM_NOT_USED, "adc", false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0x3 */ { 0, IT_GENERIC, AM_E | OT_V, AM_I | OT_B, AM_NOT_USED, "sbb", false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0x4 */ { 0, IT_GENERIC, AM_E | OT_V, AM_I | OT_B, AM_NOT_USED, "and", false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0x5 */ { 0, IT_GENERIC, AM_E | OT_V, AM_I | OT_B, AM_NOT_USED, "sub", false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0x6 */ { 0, IT_GENERIC, AM_E | OT_V, AM_I | OT_B, AM_NOT_USED, "xor", false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0x7 */ { 0, IT_GENERIC, AM_E | OT_V, AM_I | OT_B, AM_NOT_USED, "cmp", false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } }
+};
+
+const Opcode s_opcode_byte_after_c0[] = {
+  /* 0x0 */ { 0, IT_GENERIC, AM_E | OT_B, AM_I | OT_B, AM_NOT_USED, "rol", false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0x1 */ { 0, IT_GENERIC, AM_E | OT_B, AM_I | OT_B, AM_NOT_USED, "ror", false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0x2 */ { 0, IT_GENERIC, AM_E | OT_B, AM_I | OT_B, AM_NOT_USED, "rcl", false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0x3 */ { 0, IT_GENERIC, AM_E | OT_B, AM_I | OT_B, AM_NOT_USED, "rcr", false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0x4 */ { 0, IT_GENERIC, AM_E | OT_B, AM_I | OT_B, AM_NOT_USED, "shl", false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0x5 */ { 0, IT_GENERIC, AM_E | OT_B, AM_I | OT_B, AM_NOT_USED, "shr", false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0x6 */ { 0, IT_GENERIC, AM_E | OT_B, AM_I | OT_B, AM_NOT_USED, "sal", false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0x7 */ { 0, IT_GENERIC, AM_E | OT_B, AM_I | OT_B, AM_NOT_USED, "sar", false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } }
+};
+
+const Opcode s_opcode_byte_after_c1[] = {
+  /* 0x0 */ { 0, IT_GENERIC, AM_E | OT_V, AM_I | OT_B, AM_NOT_USED, "rol", false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0x1 */ { 0, IT_GENERIC, AM_E | OT_V, AM_I | OT_B, AM_NOT_USED, "ror", false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0x2 */ { 0, IT_GENERIC, AM_E | OT_V, AM_I | OT_B, AM_NOT_USED, "rcl", false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0x3 */ { 0, IT_GENERIC, AM_E | OT_V, AM_I | OT_B, AM_NOT_USED, "rcr", false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0x4 */ { 0, IT_GENERIC, AM_E | OT_V, AM_I | OT_B, AM_NOT_USED, "shl", false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0x5 */ { 0, IT_GENERIC, AM_E | OT_V, AM_I | OT_B, AM_NOT_USED, "shr", false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0x6 */ { 0, IT_GENERIC, AM_E | OT_V, AM_I | OT_B, AM_NOT_USED, "sal", false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0x7 */ { 0, IT_GENERIC, AM_E | OT_V, AM_I | OT_B, AM_NOT_USED, "sar", false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } }
+};
+
+const Opcode s_opcode_byte_after_d0[] = {
+  /* 0x0 */ { 0, IT_GENERIC, AM_E | OT_B, AM_IMPLICIT, AM_NOT_USED, "rol", false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0x1 */ { 0, IT_GENERIC, AM_E | OT_B, AM_IMPLICIT, AM_NOT_USED, "ror", false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0x2 */ { 0, IT_GENERIC, AM_E | OT_B, AM_IMPLICIT, AM_NOT_USED, "rcl", false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0x3 */ { 0, IT_GENERIC, AM_E | OT_B, AM_IMPLICIT, AM_NOT_USED, "rcr", false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0x4 */ { 0, IT_GENERIC, AM_E | OT_B, AM_IMPLICIT, AM_NOT_USED, "shl", false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0x5 */ { 0, IT_GENERIC, AM_E | OT_B, AM_IMPLICIT, AM_NOT_USED, "shr", false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0x6 */ { 0, IT_GENERIC, AM_E | OT_B, AM_IMPLICIT, AM_NOT_USED, "sal", false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0x7 */ { 0, IT_GENERIC, AM_E | OT_B, AM_IMPLICIT, AM_NOT_USED, "sar", false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } }
+};
+
+const Opcode s_opcode_byte_after_d1[] = {
+  /* 0x0 */ { 0, IT_GENERIC, AM_E | OT_V, AM_IMPLICIT, AM_NOT_USED, "rol", false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0x1 */ { 0, IT_GENERIC, AM_E | OT_V, AM_IMPLICIT, AM_NOT_USED, "ror", false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0x2 */ { 0, IT_GENERIC, AM_E | OT_V, AM_IMPLICIT, AM_NOT_USED, "rcl", false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0x3 */ { 0, IT_GENERIC, AM_E | OT_V, AM_IMPLICIT, AM_NOT_USED, "rcr", false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0x4 */ { 0, IT_GENERIC, AM_E | OT_V, AM_IMPLICIT, AM_NOT_USED, "shl", false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0x5 */ { 0, IT_GENERIC, AM_E | OT_V, AM_IMPLICIT, AM_NOT_USED, "shr", false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0x6 */ { 0, IT_GENERIC, AM_E | OT_V, AM_IMPLICIT, AM_NOT_USED, "sal", false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0x7 */ { 0, IT_GENERIC, AM_E | OT_V, AM_IMPLICIT, AM_NOT_USED, "sar", false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } }
+};
+
+const Opcode s_opcode_byte_after_d2[] = {
+  /* 0x0 */ { 0, IT_GENERIC, AM_E | OT_B, AM_REGISTER | OT_B, AM_NOT_USED, "rol", false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0x1 */ { 0, IT_GENERIC, AM_E | OT_B, AM_REGISTER | OT_B, AM_NOT_USED, "ror", false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0x2 */ { 0, IT_GENERIC, AM_E | OT_B, AM_REGISTER | OT_B, AM_NOT_USED, "rcl", false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0x3 */ { 0, IT_GENERIC, AM_E | OT_B, AM_REGISTER | OT_B, AM_NOT_USED, "rcr", false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0x4 */ { 0, IT_GENERIC, AM_E | OT_B, AM_REGISTER | OT_B, AM_NOT_USED, "shl", false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0x5 */ { 0, IT_GENERIC, AM_E | OT_B, AM_REGISTER | OT_B, AM_NOT_USED, "shr", false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0x6 */ { 0, IT_GENERIC, AM_E | OT_B, AM_REGISTER | OT_B, AM_NOT_USED, "sal", false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0x7 */ { 0, IT_GENERIC, AM_E | OT_B, AM_REGISTER | OT_B, AM_NOT_USED, "sar", false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } }
+};
+
+const Opcode s_opcode_byte_after_d3[] = {
+  /* 0x0 */ { 0, IT_GENERIC, AM_E | OT_V, AM_REGISTER | OT_B, AM_NOT_USED, "rol", false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0x1 */ { 0, IT_GENERIC, AM_E | OT_V, AM_REGISTER | OT_B, AM_NOT_USED, "ror", false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0x2 */ { 0, IT_GENERIC, AM_E | OT_V, AM_REGISTER | OT_B, AM_NOT_USED, "rcl", false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0x3 */ { 0, IT_GENERIC, AM_E | OT_V, AM_REGISTER | OT_B, AM_NOT_USED, "rcr", false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0x4 */ { 0, IT_GENERIC, AM_E | OT_V, AM_REGISTER | OT_B, AM_NOT_USED, "shl", false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0x5 */ { 0, IT_GENERIC, AM_E | OT_V, AM_REGISTER | OT_B, AM_NOT_USED, "shr", false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0x6 */ { 0, IT_GENERIC, AM_E | OT_V, AM_REGISTER | OT_B, AM_NOT_USED, "sal", false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0x7 */ { 0, IT_GENERIC, AM_E | OT_V, AM_REGISTER | OT_B, AM_NOT_USED, "sar", false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } }
+};
+
+const Opcode s_opcode_byte_after_f6[] = {
+  /* 0x0 */ { 0, IT_GENERIC, AM_E | OT_B, AM_I | OT_B, AM_NOT_USED, "test", false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0x1 */ { 0, IT_GENERIC, AM_E | OT_B, AM_I | OT_B, AM_NOT_USED, "test", false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0x2 */ { 0, IT_GENERIC, AM_E | OT_B, AM_NOT_USED, AM_NOT_USED, "not", false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0x3 */ { 0, IT_GENERIC, AM_E | OT_B, AM_NOT_USED, AM_NOT_USED, "neg", false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0x4 */ { 0, IT_GENERIC, OT_B | AM_REGISTER, AM_E | OT_B, AM_NOT_USED, "mul", false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0x5 */ { 0, IT_GENERIC, OT_B | AM_REGISTER, AM_E | OT_B, AM_NOT_USED, "imul", false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0x6 */ { 0, IT_GENERIC, AM_REGISTER | OT_B, AM_E | OT_B, AM_NOT_USED, "div", false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0x7 */ { 0, IT_GENERIC, AM_REGISTER | OT_B, AM_E | OT_B, AM_NOT_USED, "idiv", false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } }
+};
+
+const Opcode s_opcode_byte_after_f7[] = {
+  /* 0x0 */ { 0, IT_GENERIC, AM_E | OT_V, AM_I | OT_V, AM_NOT_USED, "test", false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0x1 */ { 0, IT_GENERIC, AM_E | OT_V, AM_I | OT_V, AM_NOT_USED, "test", false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0x2 */ { 0, IT_GENERIC, AM_E | OT_V, AM_NOT_USED, AM_NOT_USED, "not", false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0x3 */ { 0, IT_GENERIC, AM_E | OT_V, AM_NOT_USED, AM_NOT_USED, "neg", false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0x4 */ { 0, IT_GENERIC, AM_REGISTER | OT_V, AM_E | OT_V, AM_NOT_USED, "mul", false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0x5 */ { 0, IT_GENERIC, AM_REGISTER | OT_V, AM_E | OT_V, AM_NOT_USED, "imul", false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0x6 */ { 0, IT_GENERIC, AM_REGISTER | OT_V, AM_E | OT_V, AM_NOT_USED, "div", false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0x7 */ { 0, IT_GENERIC, AM_REGISTER | OT_V, AM_E | OT_V, AM_NOT_USED, "idiv", false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } }
+};
+
+const Opcode s_opcode_byte_after_fe[] = {
+  /* 0x0 */ { 0, IT_GENERIC, AM_E | OT_B, AM_NOT_USED, AM_NOT_USED, "inc", false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0x1 */ { 0, IT_GENERIC, AM_E | OT_B, AM_NOT_USED, AM_NOT_USED, "dec", false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } }
+};
+
+const Opcode s_opcode_byte_after_ff[] = {
+  /* 0x0 */ { 0, IT_GENERIC, AM_E | OT_V, AM_NOT_USED, AM_NOT_USED, "inc", false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0x1 */ { 0, IT_GENERIC, AM_E | OT_V, AM_NOT_USED, AM_NOT_USED, "dec", false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0x2 */ { 0, IT_JUMP, AM_E | OT_V, AM_NOT_USED, AM_NOT_USED, "call", false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0x3 */ { 0, IT_JUMP, AM_E | OT_P, AM_NOT_USED, AM_NOT_USED, "call", false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0x4 */ { 0, IT_JUMP, AM_E | OT_V, AM_NOT_USED, AM_NOT_USED, "jmp", false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0x5 */ { 0, IT_JUMP, AM_E | OT_P, AM_NOT_USED, AM_NOT_USED, "jmp", false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0x6 */ { 0, IT_GENERIC, AM_E | OT_V, AM_NOT_USED, AM_NOT_USED, "push", false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } },
+  /* 0x7 */ { 0, IT_UNUSED, AM_NOT_USED, AM_NOT_USED, AM_NOT_USED, 0, false, /* F2h */ { 0 }, /* F3h */ { 0 }, /* 66h */ { 0 } }
+};
+
+/*
+* A table of all the other tables, containing some extra information, e.g.
+* how to mask out the byte we're looking at.
+*/
+const OpcodeTable MiniDisassembler::s_ia32_opcode_map_[]={
+  // One-byte opcodes and jumps to larger
+  /*  0 */ {s_first_opcode_byte, 0, 0xff, 0, 0xff},
+  // Two-byte opcodes (second byte)
+  /*  1 */ {s_opcode_byte_after_0f, 0, 0xff, 0, 0xff},
+  // Start of tables for opcodes using ModR/M bits as extension
+  /*  2 */ {s_opcode_byte_after_80, 3, 0x07, 0, 0x07},
+  /*  3 */ {s_opcode_byte_after_81, 3, 0x07, 0, 0x07}, 
+  /*  4 */ {s_opcode_byte_after_82, 3, 0x07, 0, 0x07}, 
+  /*  5 */ {s_opcode_byte_after_83, 3, 0x07, 0, 0x07}, 
+  /*  6 */ {s_opcode_byte_after_c0, 3, 0x07, 0, 0x07}, 
+  /*  7 */ {s_opcode_byte_after_c1, 3, 0x07, 0, 0x07}, 
+  /*  8 */ {s_opcode_byte_after_d0, 3, 0x07, 0, 0x07}, 
+  /*  9 */ {s_opcode_byte_after_d1, 3, 0x07, 0, 0x07}, 
+  /* 10 */ {s_opcode_byte_after_d2, 3, 0x07, 0, 0x07}, 
+  /* 11 */ {s_opcode_byte_after_d3, 3, 0x07, 0, 0x07}, 
+  /* 12 */ {s_opcode_byte_after_f6, 3, 0x07, 0, 0x07}, 
+  /* 13 */ {s_opcode_byte_after_f7, 3, 0x07, 0, 0x07}, 
+  /* 14 */ {s_opcode_byte_after_fe, 3, 0x07, 0, 0x01}, 
+  /* 15 */ {s_opcode_byte_after_ff, 3, 0x07, 0, 0x07}, 
+  /* 16 */ {s_opcode_byte_after_0f00, 3, 0x07, 0, 0x07}, 
+  /* 17 */ {s_opcode_byte_after_0f01, 3, 0x07, 0, 0x07}, 
+  /* 18 */ {s_opcode_byte_after_0f18, 3, 0x07, 0, 0x07}, 
+  /* 19 */ {s_opcode_byte_after_0f71, 3, 0x07, 0, 0x07}, 
+  /* 20 */ {s_opcode_byte_after_0f72, 3, 0x07, 0, 0x07}, 
+  /* 21 */ {s_opcode_byte_after_0f73, 3, 0x07, 0, 0x07}, 
+  /* 22 */ {s_opcode_byte_after_0fae, 3, 0x07, 0, 0x07}, 
+  /* 23 */ {s_opcode_byte_after_0fba, 3, 0x07, 0, 0x07}, 
+  /* 24 */ {s_opcode_byte_after_0fc7, 3, 0x07, 0, 0x01}
+};
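+
+// Worked example of how the shift/mask columns are used (illustrative,
+// assuming the 0xFF entry of s_first_opcode_byte refers to table 15):
+// for "inc dword ptr [eax]" (FF 00) or "push dword ptr [eax]" (FF 30),
+// ProcessOpcode shifts and masks the ModR/M byte, e.g.
+// (0x30 >> 3) & 0x07 == 6, selecting the "push" entry of
+// s_opcode_byte_after_ff above.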
+
+};  // namespace sidestep
diff --git a/src/windows/mingw.h b/src/windows/mingw.h
new file mode 100644
index 0000000..0586e62
--- /dev/null
+++ b/src/windows/mingw.h
@@ -0,0 +1,72 @@
+/* -*- Mode: C; c-basic-offset: 2; indent-tabs-mode: nil -*- */
+/* Copyright (c) 2007, Google Inc.
+ * All rights reserved.
+ * 
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are
+ * met:
+ * 
+ *     * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following disclaimer
+ * in the documentation and/or other materials provided with the
+ * distribution.
+ *     * Neither the name of Google Inc. nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ * 
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * ---
+ * Author: Craig Silverstein
+ *
+ * MinGW is an interesting mix of unix and windows.  We use a normal
+ * configure script, but still need the windows port.h to define some
+ * stuff that MinGW doesn't support, like pthreads.
+ */
+
+#ifndef GOOGLE_PERFTOOLS_WINDOWS_MINGW_H_
+#define GOOGLE_PERFTOOLS_WINDOWS_MINGW_H_
+
+#ifdef __MINGW32__
+
+// Older versions of the mingw msvcrt don't define _aligned_malloc
+#if __MSVCRT_VERSION__ < 0x0700
+# define PERFTOOLS_NO_ALIGNED_MALLOC 1
+#endif
+
+// This must be defined before the windows.h is included.  We need at
+// least 0x0400 for mutex.h to have access to TryLock, and at least
+// 0x0501 for patch_functions.cc to have access to GetModuleHandleEx.
+// (This latter is an optimization we could take out if need be.)
+#ifndef _WIN32_WINNT
+# define _WIN32_WINNT 0x0501
+#endif
+
+#define HAVE_SNPRINTF 1
+
+// Some mingw distributions have a pthreads wrapper, but it doesn't
+// work as well as native windows spinlocks (at least for us).  So
+// pretend the pthreads wrapper doesn't exist, even when it does.
+#ifndef HAVE_PTHREAD_DESPITE_ASKING_FOR
+#undef HAVE_PTHREAD
+#endif
+
+#define HAVE_PID_T
+
+#include "windows/port.h"
+
+#endif  /* __MINGW32__ */
+
+#endif  /* GOOGLE_PERFTOOLS_WINDOWS_MINGW_H_ */
diff --git a/src/windows/mini_disassembler.cc b/src/windows/mini_disassembler.cc
new file mode 100644
index 0000000..0c62004
--- /dev/null
+++ b/src/windows/mini_disassembler.cc
@@ -0,0 +1,432 @@
+// -*- Mode: C++; c-basic-offset: 2; indent-tabs-mode: nil -*-
+/* Copyright (c) 2007, Google Inc.
+ * All rights reserved.
+ * 
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are
+ * met:
+ * 
+ *     * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following disclaimer
+ * in the documentation and/or other materials provided with the
+ * distribution.
+ *     * Neither the name of Google Inc. nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ * 
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * ---
+ * Author: Joi Sigurdsson
+ *
+ * Implementation of MiniDisassembler.
+ */
+
+#include "mini_disassembler.h"
+
+namespace sidestep {
+
+MiniDisassembler::MiniDisassembler(bool operand_default_is_32_bits,
+                                   bool address_default_is_32_bits)
+    : operand_default_is_32_bits_(operand_default_is_32_bits),
+      address_default_is_32_bits_(address_default_is_32_bits) {
+  Initialize();
+}
+
+MiniDisassembler::MiniDisassembler()
+    : operand_default_is_32_bits_(true),
+      address_default_is_32_bits_(true) {
+  Initialize();
+}
+
+InstructionType MiniDisassembler::Disassemble(
+    unsigned char* start_byte,
+    unsigned int& instruction_bytes) {
+  // Clean up any state from previous invocations.
+  Initialize();
+
+  // Start by processing any prefixes.
+  unsigned char* current_byte = start_byte;
+  unsigned int size = 0;
+  InstructionType instruction_type = ProcessPrefixes(current_byte, size);
+
+  if (IT_UNKNOWN == instruction_type)
+    return instruction_type;
+
+  current_byte += size;
+  size = 0;
+
+  // Invariant: We have stripped all prefixes, and the operand_is_32_bits_
+  // and address_is_32_bits_ flags are correctly set.
+
+  instruction_type = ProcessOpcode(current_byte, 0, size);
+
+  // Check for error processing instruction
+  if ((IT_UNKNOWN == instruction_type_) || (IT_UNUSED == instruction_type_)) {
+    return IT_UNKNOWN;
+  }
+
+  current_byte += size;
+
+  // Invariant: operand_bytes_ indicates the total size of operands
+  // specified by the opcode and/or ModR/M byte and/or SIB byte.
+  // current_byte points to the first byte after the ModR/M byte, or after
+  // the SIB byte if it is present (i.e. the first byte of any operands
+  // encoded in the instruction).
+
+  // We get the total length of any prefixes, the opcode, and the ModR/M and
+  // SIB bytes if present, by taking the difference of the original starting
+  // address and the current byte (which points to the first byte of the
+  // operands if present, or to the first byte of the next instruction if
+  // they are not).  Adding the count of bytes in the operands encoded in
+  // the instruction gives us the full length of the instruction in bytes.
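+  //
+  // Illustrative example (assuming the standard IA-32 encoding where a
+  // ModR/M byte with mod == 00 and r/m == 111 means [edi] with no
+  // displacement): for "movntq [edi], mm0", encoded as 0F E7 07, there
+  // are no prefixes, current_byte - start_byte is 3 (two opcode bytes
+  // plus the ModR/M byte) and operand_bytes_ is 0, so instruction_bytes
+  // grows by 3.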
+  instruction_bytes += operand_bytes_ + (current_byte - start_byte);
+
+  // Return the instruction type, which was set by ProcessOpcode().
+  return instruction_type_;
+}
+
+void MiniDisassembler::Initialize() {
+  operand_is_32_bits_ = operand_default_is_32_bits_;
+  address_is_32_bits_ = address_default_is_32_bits_;
+#ifdef _M_X64
+  operand_default_support_64_bits_ = true;
+#else
+  operand_default_support_64_bits_ = false;
+#endif
+  operand_is_64_bits_ = false;
+  operand_bytes_ = 0;
+  have_modrm_ = false;
+  should_decode_modrm_ = false;
+  instruction_type_ = IT_UNKNOWN;
+  got_f2_prefix_ = false;
+  got_f3_prefix_ = false;
+  got_66_prefix_ = false;
+}
+
+InstructionType MiniDisassembler::ProcessPrefixes(unsigned char* start_byte,
+                                                  unsigned int& size) {
+  InstructionType instruction_type = IT_GENERIC;
+  const Opcode& opcode = s_ia32_opcode_map_[0].table_[*start_byte];
+
+  switch (opcode.type_) {
+    case IT_PREFIX_ADDRESS:
+      address_is_32_bits_ = !address_default_is_32_bits_;
+      goto nochangeoperand;
+    case IT_PREFIX_OPERAND:
+      operand_is_32_bits_ = !operand_default_is_32_bits_;
+      nochangeoperand:
+    case IT_PREFIX:
+
+      if (0xF2 == (*start_byte))
+        got_f2_prefix_ = true;
+      else if (0xF3 == (*start_byte))
+        got_f3_prefix_ = true;
+      else if (0x66 == (*start_byte))
+        got_66_prefix_ = true;
+      else if (operand_default_support_64_bits_ && (*start_byte) & 0x48)
+        operand_is_64_bits_ = true;
+
+      instruction_type = opcode.type_;
+      size++;
+      // we got a prefix, so add one and check next byte
+      ProcessPrefixes(start_byte + 1, size);
+    default:
+      break;   // not a prefix byte
+  }
+
+  return instruction_type;
+}
+
+InstructionType MiniDisassembler::ProcessOpcode(unsigned char* start_byte,
+                                                unsigned int table_index,
+                                                unsigned int& size) {
+  const OpcodeTable& table = s_ia32_opcode_map_[table_index];   // Get our table
+  unsigned char current_byte = (*start_byte) >> table.shift_;
+  current_byte = current_byte & table.mask_;  // Mask out the bits we will use
+
+  // Check whether the byte we have is inside the table we have.
+  if (current_byte < table.min_lim_ || current_byte > table.max_lim_) {
+    instruction_type_ = IT_UNKNOWN;
+    return instruction_type_;
+  }
+
+  const Opcode& opcode = table.table_[current_byte];
+  if (IT_UNUSED == opcode.type_) {
+    // This instruction is not used by the IA-32 ISA, so we indicate
+    // this to the user.  Probably means that we were pointed to
+    // a byte in memory that was not the start of an instruction.
+    instruction_type_ = IT_UNUSED;
+    return instruction_type_;
+  } else if (IT_REFERENCE == opcode.type_) {
+    // We are looking at an opcode that has more bytes (or is continued
+    // in the ModR/M byte).  Recursively find the opcode definition in
+    // the table for the opcode's next byte.
+    size++;
+    ProcessOpcode(start_byte + 1, opcode.table_index_, size);
+    return instruction_type_;
+  }
+
+  const SpecificOpcode* specific_opcode = (SpecificOpcode*)&opcode;
+  if (opcode.is_prefix_dependent_) {
+    if (got_f2_prefix_ && opcode.opcode_if_f2_prefix_.mnemonic_ != 0) {
+      specific_opcode = &opcode.opcode_if_f2_prefix_;
+    } else if (got_f3_prefix_ && opcode.opcode_if_f3_prefix_.mnemonic_ != 0) {
+      specific_opcode = &opcode.opcode_if_f3_prefix_;
+    } else if (got_66_prefix_ && opcode.opcode_if_66_prefix_.mnemonic_ != 0) {
+      specific_opcode = &opcode.opcode_if_66_prefix_;
+    }
+  }
+
+  // Inv: The opcode type is known.
+  instruction_type_ = specific_opcode->type_;
+
+  // Let's process the operand types to see if we have any immediate
+  // operands, and/or a ModR/M byte.
+
+  ProcessOperand(specific_opcode->flag_dest_);
+  ProcessOperand(specific_opcode->flag_source_);
+  ProcessOperand(specific_opcode->flag_aux_);
+
+  // Inv: We have processed the opcode and incremented operand_bytes_
+  // by the number of bytes of any operands specified by the opcode
+  // that are stored in the instruction (not registers etc.).  Now
+  // we need to return the total number of bytes for the opcode and
+  // for the ModR/M or SIB bytes if they are present.
+
+  if (table.mask_ != 0xff) {
+    if (have_modrm_) {
+      // we're looking at a ModR/M byte so we're not going to
+      // count that into the opcode size
+      ProcessModrm(start_byte, size);
+      return IT_GENERIC;
+    } else {
+      // need to count the ModR/M byte even if it's just being
+      // used for opcode extension
+      size++;
+      return IT_GENERIC;
+    }
+  } else {
+    if (have_modrm_) {
+      // The ModR/M byte is the next byte.
+      size++;
+      ProcessModrm(start_byte + 1, size);
+      return IT_GENERIC;
+    } else {
+      size++;
+      return IT_GENERIC;
+    }
+  }
+}
+
+bool MiniDisassembler::ProcessOperand(int flag_operand) {
+  bool succeeded = true;
+  if (AM_NOT_USED == flag_operand)
+    return succeeded;
+
+  // Decide what to do based on the addressing mode.
+  switch (flag_operand & AM_MASK) {
+    // No ModR/M byte indicated by these addressing modes, and no
+    // additional (e.g. immediate) parameters.
+    case AM_A: // Direct address
+    case AM_F: // EFLAGS register
+    case AM_X: // Memory addressed by the DS:SI register pair
+    case AM_Y: // Memory addressed by the ES:DI register pair
+    case AM_IMPLICIT: // Parameter is implicit, occupies no space in
+                       // instruction
+      break;
+
+    // There is a ModR/M byte but it does not necessarily need
+    // to be decoded.
+    case AM_C: // reg field of ModR/M selects a control register
+    case AM_D: // reg field of ModR/M selects a debug register
+    case AM_G: // reg field of ModR/M selects a general register
+    case AM_P: // reg field of ModR/M selects an MMX register
+    case AM_R: // mod field of ModR/M may refer only to a general register
+    case AM_S: // reg field of ModR/M selects a segment register
+    case AM_T: // reg field of ModR/M selects a test register
+    case AM_V: // reg field of ModR/M selects a 128-bit XMM register
+      have_modrm_ = true;
+      break;
+
+    // In these addressing modes, there is a ModR/M byte and it needs to be
+    // decoded. No other (e.g. immediate) params than indicated in ModR/M.
+    case AM_E: // Operand is either a general-purpose register or memory,
+                 // specified by ModR/M byte
+    case AM_M: // ModR/M byte will refer only to memory
+    case AM_Q: // Operand is either an MMX register or memory (complex
+                 // evaluation), specified by ModR/M byte
+    case AM_W: // Operand is either a 128-bit XMM register or memory (complex
+                 // eval), specified by ModR/M byte
+      have_modrm_ = true;
+      should_decode_modrm_ = true;
+      break;
+
+    // These addressing modes specify an immediate or an offset value
+    // directly, so we need to look at the operand type to see how many
+    // bytes.
+    case AM_I: // Immediate data.
+    case AM_J: // Jump to offset.
+    case AM_O: // Operand is at offset.
+      switch (flag_operand & OT_MASK) {
+        case OT_B: // Byte regardless of operand-size attribute.
+          operand_bytes_ += OS_BYTE;
+          break;
+        case OT_C: // Byte or word, depending on operand-size attribute.
+          if (operand_is_32_bits_)
+            operand_bytes_ += OS_WORD;
+          else
+            operand_bytes_ += OS_BYTE;
+          break;
+        case OT_D: // Doubleword, regardless of operand-size attribute.
+          operand_bytes_ += OS_DOUBLE_WORD;
+          break;
+        case OT_DQ: // Double-quadword, regardless of operand-size attribute.
+          operand_bytes_ += OS_DOUBLE_QUAD_WORD;
+          break;
+        case OT_P: // 32-bit or 48-bit pointer, depending on operand-size
+                     // attribute.
+          if (operand_is_32_bits_)
+            operand_bytes_ += OS_48_BIT_POINTER;
+          else
+            operand_bytes_ += OS_32_BIT_POINTER;
+          break;
+        case OT_PS: // 128-bit packed single-precision floating-point data.
+          operand_bytes_ += OS_128_BIT_PACKED_SINGLE_PRECISION_FLOATING;
+          break;
+        case OT_Q: // Quadword, regardless of operand-size attribute.
+          operand_bytes_ += OS_QUAD_WORD;
+          break;
+        case OT_S: // 6-byte pseudo-descriptor.
+          operand_bytes_ += OS_PSEUDO_DESCRIPTOR;
+          break;
+        case OT_SD: // Scalar Double-Precision Floating-Point Value
+        case OT_PD: // Unaligned packed double-precision floating point value
+          operand_bytes_ += OS_DOUBLE_PRECISION_FLOATING;
+          break;
+        case OT_SS:
+          // Scalar element of a 128-bit packed single-precision
+          // floating data.
+          // We simply return enItUnknown since we don't have to support
+          // floating point
+          succeeded = false;
+          break;
+        case OT_V: // Word, doubleword or quadword, depending on operand-size 
+                   // attribute.
+          if (operand_is_64_bits_ && flag_operand & AM_I &&
+              flag_operand & IOS_64)
+            operand_bytes_ += OS_QUAD_WORD;
+          else if (operand_is_32_bits_)
+            operand_bytes_ += OS_DOUBLE_WORD;
+          else
+            operand_bytes_ += OS_WORD;
+          break;
+        case OT_W: // Word, regardless of operand-size attribute.
+          operand_bytes_ += OS_WORD;
+          break;
+
+        // Can safely ignore these.
+        case OT_A: // Two one-word operands in memory or two double-word
+                     // operands in memory
+        case OT_PI: // Quadword MMX technology register (e.g. mm0)
+        case OT_SI: // Doubleword integer register (e.g., eax)
+          break;
+
+        default:
+          break;
+      }
+      break;
+
+    default:
+      break;
+  }
+
+  return succeeded;
+}
+
+bool MiniDisassembler::ProcessModrm(unsigned char* start_byte,
+                                    unsigned int& size) {
+  // If we don't need to decode, we just return the size of the ModR/M
+  // byte (there is never a SIB byte in this case).
+  if (!should_decode_modrm_) {
+    size++;
+    return true;
+  }
+
+  // We never care about the reg field, only the combination of the mod
+  // and r/m fields, so let's start by packing those fields together into
+  // 5 bits.
+  unsigned char modrm = (*start_byte);
+  unsigned char mod = modrm & 0xC0; // keep only the top two bits (the mod field)
+  modrm = modrm & 0x07; // keep only the bottom three bits (the r/m field)
+  mod = mod >> 3; // shift the mod field down to bits 4..3
+  modrm = mod | modrm; // combine the r/m and mod fields as discussed
+  mod = mod >> 3; // shift the mod field to bits 2..0
+
+  // Invariant: modrm contains the mod field in bits 4..3 and the r/m field
+  // in bits 2..0, and mod contains the mod field in bits 2..0
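+  //
+  // Illustrative example: for ModR/M byte 0x45 (mod=01, reg=000,
+  // r/m=101), the code above yields modrm == 0x0D (mod in bits 4..3,
+  // r/m in bits 2..0) and mod == 0x01.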
+
+  const ModrmEntry* modrm_entry = 0;
+  if (address_is_32_bits_)
+    modrm_entry = &s_ia32_modrm_map_[modrm];
+  else
+    modrm_entry = &s_ia16_modrm_map_[modrm];
+
+  // Invariant: modrm_entry points to information that we need to decode
+  // the ModR/M byte.
+
+  // Add to the count of operand bytes, if the ModR/M byte indicates
+  // that some operands are encoded in the instruction.
+  if (modrm_entry->is_encoded_in_instruction_)
+    operand_bytes_ += modrm_entry->operand_size_;
+
+  // Process the SIB byte if necessary, and return the count
+  // of ModR/M and SIB bytes.
+  if (modrm_entry->use_sib_byte_) {
+    size++;
+    return ProcessSib(start_byte + 1, mod, size);
+  } else {
+    size++;
+    return true;
+  }
+}
+
+bool MiniDisassembler::ProcessSib(unsigned char* start_byte,
+                                  unsigned char mod,
+                                  unsigned int& size) {
+  // get the base field from bits 2..0 of the SIB byte
+  unsigned char sib_base = (*start_byte) & 0x07;
+  if (0x05 == sib_base) {
+    switch (mod) {
+    case 0x00: // mod == 00
+    case 0x02: // mod == 10
+      operand_bytes_ += OS_DOUBLE_WORD;
+      break;
+    case 0x01: // mod == 01
+      operand_bytes_ += OS_BYTE;
+      break;
+    case 0x03: // mod == 11
+      // According to the IA-32 docs, there does not seem to be a disp
+      // value for this value of mod
+    default:
+      break;
+    }
+  }
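+  // Illustrative example: for SIB byte 0x25 (scale=00, index=100,
+  // base=101) with mod == 00, a 32-bit displacement follows, so
+  // operand_bytes_ grows by OS_DOUBLE_WORD above.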
+
+  size++;
+  return true;
+}
+
+};  // namespace sidestep
diff --git a/src/windows/mini_disassembler.h b/src/windows/mini_disassembler.h
new file mode 100644
index 0000000..93bdc06
--- /dev/null
+++ b/src/windows/mini_disassembler.h
@@ -0,0 +1,198 @@
+// -*- Mode: C++; c-basic-offset: 2; indent-tabs-mode: nil -*-
+/* Copyright (c) 2007, Google Inc.
+ * All rights reserved.
+ * 
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are
+ * met:
+ * 
+ *     * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following disclaimer
+ * in the documentation and/or other materials provided with the
+ * distribution.
+ *     * Neither the name of Google Inc. nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ * 
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * ---
+ * Author: Joi Sigurdsson
+ *
+ * Definition of MiniDisassembler.
+ */
+
+#ifndef GOOGLE_PERFTOOLS_MINI_DISASSEMBLER_H_
+#define GOOGLE_PERFTOOLS_MINI_DISASSEMBLER_H_
+
+#include "config.h"
+#include <windows.h>
+#include "mini_disassembler_types.h"
+
+// compatibility shim
+#include "base/logging.h"
+#define SIDESTEP_ASSERT(cond)  RAW_DCHECK(cond, #cond)
+#define SIDESTEP_LOG(msg)      RAW_VLOG(1, msg)
+
+namespace sidestep {
+
+// This small disassembler is very limited
+// in its functionality, and in fact does only the bare minimum required by the
+// preamble patching utility.  It may be useful for other purposes, however.
+//
+// The limitations include at least the following:
+//  -# No support for coprocessor opcodes, MMX, etc.
+//  -# No machine-readable identification of opcodes or decoding of
+//     assembly parameters. The name of the opcode (as a string) is given,
+//     however, to aid debugging.
+//
+// You may ask what this little disassembler actually does, then?  The answer is
+// that it does the following, which is exactly what the patching utility needs:
+//  -# Indicates if opcode is a jump (any kind) or a return (any kind)
+//     because this is important for the patching utility to determine if
+//     a function is too short or there are jumps too early in it for it
+//     to be preamble patched.
+//  -# The opcode length is always calculated, so that the patching utility
+//     can figure out where the next instruction starts, and whether it
+//     already has enough instructions to replace with the absolute jump
+//     to the patching code.
+//
+// The usage is quite simple; just create a MiniDisassembler and use its
+// Disassemble() method.
+//
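+// A minimal usage sketch (illustrative; 'target' is a hypothetical
+// pointer to the first byte of an instruction):
+//
+//   sidestep::MiniDisassembler disassembler;
+//   unsigned int instruction_bytes = 0;
+//   InstructionType type = disassembler.Disassemble(target, instruction_bytes);
+//   // On success, instruction_bytes has been incremented by the length
+//   // of the instruction at target.
+//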
+// If you would like to extend this disassembler, please refer to the
+// IA-32 Intel® Architecture Software Developer's Manual Volume 2:
+// Instruction Set Reference for information about operand decoding
+// etc.
+class PERFTOOLS_DLL_DECL MiniDisassembler {
+ public:
+
+  // Creates a new instance and sets defaults.
+  //
+  // @param operand_default_32_bits If true, the default operand size is
+  // set to 32 bits, which is the default under Win32. Otherwise it is 16 bits.
+  // @param address_default_32_bits If true, the default address size is
+  // set to 32 bits, which is the default under Win32. Otherwise it is 16 bits.
+  MiniDisassembler(bool operand_default_32_bits,
+                   bool address_default_32_bits);
+
+  // Equivalent to MiniDisassembler(true, true);
+  MiniDisassembler();
+
+  // Attempts to disassemble a single instruction starting from the
+  // address in memory it is pointed to.
+  //
+  // @param start Address where disassembly should start.
+  // @param instruction_bytes Variable that will be <b>incremented</b> by
+  // the length in bytes of the instruction.
+  // @return IT_JUMP, IT_RETURN or IT_GENERIC on success.  IT_UNKNOWN
+  // if unable to disassemble, IT_UNUSED if this seems to be an unused
+  // opcode. In the last two (error) cases, instruction_bytes is left
+  // unchanged.
+  //
+  // @post This instance of the disassembler is ready to be used again,
+  // with unchanged defaults from creation time.
+  InstructionType Disassemble(unsigned char* start, unsigned int& instruction_bytes);
+
+ private:
+
+  // Makes the disassembler ready for reuse.
+  void Initialize();
+
+  // Sets the flags for address and operand sizes.
+  // @return The instruction type determined from the prefixes; size is
+  // incremented by the number of prefix bytes consumed.
+  InstructionType ProcessPrefixes(unsigned char* start, unsigned int& size);
+
+  // Sets the flag for whether we have a ModR/M byte, and increments
+  // operand_bytes_ if any operands are specified by the opcode directly.
+  // @return The instruction type of the opcode; size is incremented by
+  // the number of opcode bytes.
+  InstructionType ProcessOpcode(unsigned char* start,
+                                unsigned int table,
+                                unsigned int& size);
+
+  // Checks the type of the supplied operand.  Increments
+  // operand_bytes_ if it directly indicates an immediate etc.
+  // operand.  Asserts have_modrm_ if the operand specifies
+  // a ModR/M byte.
+  bool ProcessOperand(int flag_operand);
+
+  // Increments operand_bytes_ by size specified by ModR/M and
+  // by SIB if present.
+  // @return false in case of error, true on success (whether there is
+  // just a ModR/M byte or a ModR/M byte plus a SIB byte).
+  bool ProcessModrm(unsigned char* start, unsigned int& size);
+
+  // Processes the SIB byte that start points to.
+  // @param start Pointer to the SIB byte.
+  // @param mod The mod field from the ModR/M byte.
+  // @return true to indicate success (one SIB byte processed)
+  bool ProcessSib(unsigned char* start, unsigned char mod, unsigned int& size);
+
+  // The instruction type we have decoded from the opcode.
+  InstructionType instruction_type_;
+
+  // Counts the number of bytes that are occupied by operands in
+  // the current instruction (note: we don't care about how large
+  // operands stored in registers etc. are).
+  unsigned int operand_bytes_;
+
+  // True iff there is a ModR/M byte in this instruction.
+  bool have_modrm_;
+
+  // True iff we need to decode the ModR/M byte (sometimes it just
+  // points to a register, we can tell by the addressing mode).
+  bool should_decode_modrm_;
+
+  // Current operand size is 32 bits if true, 16 bits if false.
+  bool operand_is_32_bits_;
+
+  // Default operand size is 32 bits if true, 16 bits if false.
+  bool operand_default_is_32_bits_;
+
+  // Current address size is 32 bits if true, 16 bits if false.
+  bool address_is_32_bits_;
+
+  // Default address size is 32 bits if true, 16 bits if false.
+  bool address_default_is_32_bits_;
+
+  // Determines if 64 bit operands are supported (x64).
+  bool operand_default_support_64_bits_;
+
+  // Current operand size is 64 bits if true, 32 bits if false.
+  bool operand_is_64_bits_;
+
+  // Huge big opcode table based on the IA-32 manual, defined
+  // in Ia32OpcodeMap.cc
+  static const OpcodeTable s_ia32_opcode_map_[];
+
+  // Somewhat smaller table to help with decoding ModR/M bytes
+  // when 16-bit addressing mode is being used.  Defined in
+  // Ia32ModrmMap.cc
+  static const ModrmEntry s_ia16_modrm_map_[];
+
+  // Somewhat smaller table to help with decoding ModR/M bytes
+  // when 32-bit addressing mode is being used.  Defined in
+  // Ia32ModrmMap.cc
+  static const ModrmEntry s_ia32_modrm_map_[];
+
+  // Indicators of whether we got certain prefixes that certain
+  // silly Intel instructions depend on in nonstandard ways for
+  // their behaviors.
+  bool got_f2_prefix_, got_f3_prefix_, got_66_prefix_;
+};
+
+};  // namespace sidestep
+
+#endif  // GOOGLE_PERFTOOLS_MINI_DISASSEMBLER_H_
diff --git a/src/windows/mini_disassembler_types.h b/src/windows/mini_disassembler_types.h
new file mode 100644
index 0000000..06d4755
--- /dev/null
+++ b/src/windows/mini_disassembler_types.h
@@ -0,0 +1,237 @@
+// -*- Mode: C++; c-basic-offset: 2; indent-tabs-mode: nil -*-
+/* Copyright (c) 2007, Google Inc.
+ * All rights reserved.
+ * 
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are
+ * met:
+ * 
+ *     * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following disclaimer
+ * in the documentation and/or other materials provided with the
+ * distribution.
+ *     * Neither the name of Google Inc. nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ * 
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * ---
+ * Author: Joi Sigurdsson
+ *
+ * Several simple types used by the disassembler and some of the patching
+ * mechanisms.
+ */
+
+#ifndef GOOGLE_PERFTOOLS_MINI_DISASSEMBLER_TYPES_H_
+#define GOOGLE_PERFTOOLS_MINI_DISASSEMBLER_TYPES_H_
+
+namespace sidestep {
+
+// Categories of instructions that we care about
+enum InstructionType {
+  // This opcode is not used
+  IT_UNUSED,
+  // This disassembler does not recognize this opcode (error)
+  IT_UNKNOWN,
+  // This is not an instruction but a reference to another table
+  IT_REFERENCE,
+  // This byte is a prefix byte that we can ignore
+  IT_PREFIX,
+  // This is a prefix byte that switches to the nondefault address size
+  IT_PREFIX_ADDRESS,
+  // This is a prefix byte that switches to the nondefault operand size
+  IT_PREFIX_OPERAND,
+  // A jump or call instruction
+  IT_JUMP,
+  // A return instruction
+  IT_RETURN,
+  // Any other type of instruction (in this case we don't care what it is)
+  IT_GENERIC,
+};
+
+// Lists IA-32 operand sizes in multiples of 8 bits
+enum OperandSize {
+  OS_ZERO = 0,
+  OS_BYTE = 1,
+  OS_WORD = 2,
+  OS_DOUBLE_WORD = 4,
+  OS_QUAD_WORD = 8,
+  OS_DOUBLE_QUAD_WORD = 16,
+  OS_32_BIT_POINTER = 32/8,
+  OS_48_BIT_POINTER = 48/8,
+  OS_SINGLE_PRECISION_FLOATING = 32/8,
+  OS_DOUBLE_PRECISION_FLOATING = 64/8,
+  OS_DOUBLE_EXTENDED_PRECISION_FLOATING = 80/8,
+  OS_128_BIT_PACKED_SINGLE_PRECISION_FLOATING = 128/8,
+  OS_PSEUDO_DESCRIPTOR = 6
+};
+
+// Operand addressing methods from the IA-32 manual.  The AM_MASK value
+// is a mask for the rest.  The other enumeration values are named for the
+// names given to the addressing methods in the manual, e.g. AM_D is for
+// the D addressing method.
+//
+// The reason we use a full 4 bytes and a mask is that we need to combine
+// these flags with the OperandType flags to store the details about the
+// operand in a single integer (see the illustrative sketch after the
+// OperandType enum below).
+enum AddressingMethod {
+  AM_NOT_USED = 0,        // This operand is not used for this instruction
+  AM_MASK = 0x00FF0000,  // Mask for the rest of the values in this enumeration
+  AM_A = 0x00010000,    // A addressing type
+  AM_C = 0x00020000,    // C addressing type
+  AM_D = 0x00030000,    // D addressing type
+  AM_E = 0x00040000,    // E addressing type
+  AM_F = 0x00050000,    // F addressing type
+  AM_G = 0x00060000,    // G addressing type
+  AM_I = 0x00070000,    // I addressing type
+  AM_J = 0x00080000,    // J addressing type
+  AM_M = 0x00090000,    // M addressing type
+  AM_O = 0x000A0000,    // O addressing type
+  AM_P = 0x000B0000,    // P addressing type
+  AM_Q = 0x000C0000,    // Q addressing type
+  AM_R = 0x000D0000,    // R addressing type
+  AM_S = 0x000E0000,    // S addressing type
+  AM_T = 0x000F0000,    // T addressing type
+  AM_V = 0x00100000,    // V addressing type
+  AM_W = 0x00110000,    // W addressing type
+  AM_X = 0x00120000,    // X addressing type
+  AM_Y = 0x00130000,    // Y addressing type
+  AM_REGISTER = 0x00140000,  // Specific register is always used as this op
+  AM_IMPLICIT = 0x00150000,  // An implicit, fixed value is used
+};
+
+// Operand types from the IA-32 manual. The OT_MASK value is
+// a mask for the rest. The rest of the values are named for the
+// names given to these operand types in the manual, e.g. OT_PS
+// is for the ps operand type in the manual.
+//
+// The reason we use a full 4 bytes and a mask is that we need
+// to combine these flags with the AddressingMethod flags to store the
+// details about the operand in a single integer.
+enum OperandType {
+  OT_MASK = 0xFF000000,
+  OT_A = 0x01000000,
+  OT_B = 0x02000000,
+  OT_C = 0x03000000,
+  OT_D = 0x04000000,
+  OT_DQ = 0x05000000,
+  OT_P = 0x06000000,
+  OT_PI = 0x07000000,
+  OT_PS = 0x08000000,  // actually unsupported (as we don't know its size)
+  OT_Q = 0x09000000,
+  OT_S = 0x0A000000,
+  OT_SS = 0x0B000000,
+  OT_SI = 0x0C000000,
+  OT_V = 0x0D000000,
+  OT_W = 0x0E000000,
+  OT_SD = 0x0F000000,  // scalar double-precision floating-point value
+  OT_PD = 0x10000000,  // double-precision floating point
+  // dummy "operand type" for address mode M - which doesn't specify
+  // operand type
+  OT_ADDRESS_MODE_M = 0x80000000
+};
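+
+// Illustrative sketch (not part of the original header): a single operand
+// flag word packs one AddressingMethod value and one OperandType value,
+// and the masks pull them back apart:
+//
+//   int flag = AM_I | OT_V;  // an immediate operand of operand-size width
+//   AddressingMethod am = static_cast<AddressingMethod>(flag & AM_MASK);
+//   OperandType ot = static_cast<OperandType>(flag & OT_MASK);
+//   // am == AM_I, ot == OT_V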
+
+// Flag that indicates if an immediate operand is 64-bits.
+//
+// The Intel 64 and IA-32 Architecture Software Developer's Manual currently
+// defines MOV as the only instruction supporting a 64-bit immediate operand.
+enum ImmediateOperandSize {
+  IOS_MASK = 0x0000F000,
+  IOS_DEFAULT = 0x0,
+  IOS_64 = 0x00001000
+};
+
+// Everything that's in an Opcode (see below) except the three
+// alternative opcode structs for different prefixes.
+struct SpecificOpcode {
+  // Index to continuation table, or 0 if this is the last
+  // byte in the opcode.
+  int table_index_;
+
+  // The opcode type
+  InstructionType type_;
+
+  // Description of the type of the dest, src and aux operands,
+  // put together from OperandType, AddressingMethod and
+  // ImmediateOperandSize flags.
+  int flag_dest_;
+  int flag_source_;
+  int flag_aux_;
+
+  // We indicate the mnemonic for debugging purposes
+  const char* mnemonic_;
+};
+
+// The information we keep in our tables about each of the different
+// valid instructions recognized by the IA-32 architecture.
+struct Opcode {
+  // Index to continuation table, or 0 if this is the last
+  // byte in the opcode.
+  int table_index_;
+
+  // The opcode type
+  InstructionType type_;
+
+  // Description of the type of the dest, src and aux operands,
+  // put together from an OperandType flag and an AddressingMethod
+  // flag.
+  int flag_dest_;
+  int flag_source_;
+  int flag_aux_;
+
+  // We indicate the mnemonic for debugging purposes
+  const char* mnemonic_;
+
+  // Alternative opcode info if certain prefixes are specified.
+  // In most cases, all of these are zeroed-out.  Only used if
+  // is_prefix_dependent_ is true.
+  bool is_prefix_dependent_;
+  SpecificOpcode opcode_if_f2_prefix_;
+  SpecificOpcode opcode_if_f3_prefix_;
+  SpecificOpcode opcode_if_66_prefix_;
+};
+
+// Information about each table entry.
+struct OpcodeTable {
+  // Table of instruction entries
+  const Opcode* table_;
+  // How many bits to right-shift the byte being looked at <b>before</b>
+  // applying the mask (see the sketch after this struct)
+  unsigned char shift_;
+  // Mask to apply to byte being looked at before comparing to table
+  unsigned char mask_;
+  // Minimum/maximum indexes in table.
+  unsigned char min_lim_;
+  unsigned char max_lim_;
+};
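+
+// Illustrative sketch (inferred from the comments above, not part of the
+// original header): looking up an opcode entry in an OpcodeTable would go
+// roughly like this:
+//
+//   unsigned char index = (current_byte >> table.shift_) & table.mask_;
+//   if (table.min_lim_ <= index && index <= table.max_lim_) {
+//     const Opcode& entry = table.table_[index];
+//     // ... decode using entry ...
+//   }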
+
+// Information about each entry in table used to decode ModR/M byte.
+struct ModrmEntry {
+  // Is the operand encoded as bytes in the instruction (as opposed to
+  // being e.g. a register, in which case it's just encoded in the
+  // ModR/M byte)?
+  bool is_encoded_in_instruction_;
+
+  // Is there a SIB byte?  In this case we always need to decode it.
+  bool use_sib_byte_;
+
+  // What is the size of the operand (only important if it's encoded
+  // in the instruction)?
+  OperandSize operand_size_;
+};
+
+};  // namespace sidestep
+
+#endif  // GOOGLE_PERFTOOLS_MINI_DISASSEMBLER_TYPES_H_
diff --git a/src/windows/nm-pdb.c b/src/windows/nm-pdb.c
new file mode 100644
index 0000000..95a080d
--- /dev/null
+++ b/src/windows/nm-pdb.c
@@ -0,0 +1,273 @@
+/* Copyright (c) 2008, Google Inc.
+ * All rights reserved.
+ * 
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are
+ * met:
+ *
+ *     * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following disclaimer
+ * in the documentation and/or other materials provided with the
+ * distribution.
+ *     * Neither the name of Google Inc. nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * ---
+ * Author: David Vitek
+ *
+ * Dump function addresses using Microsoft debug symbols.  This works
+ * on PDB files.  Note that this program will download symbols to
+ * c:\websymbols without asking.
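+ *
+ * Example (illustrative; foo.dll is a hypothetical module name):
+ *
+ *   nm-pdb -C foo.dll
+ *
+ * prints one line per symbol, sorted by address, in the form
+ * "0000000000401000 X main".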
+ */
+
+#define WIN32_LEAN_AND_MEAN
+#define _CRT_SECURE_NO_WARNINGS
+#define _CRT_SECURE_NO_DEPRECATE
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>   // for _strdup
+
+#include <windows.h>
+#include <dbghelp.h>
+
+// Unfortunately, there is no versioning info in dbghelp.h, so I can't
+// tell whether it has an old-style (circa VC7.1) IMAGEHLP_MODULE64
+// struct, with only a few fields, or a new-style (circa VC8)
+// IMAGEHLP_MODULE64, with lots of fields.  These fields are just used
+// for debugging, so it's fine to just assume the smaller struct, but
+// for most people, using a modern MSVC, the full struct is available.
+// If you are one of those people and would like this extra debugging
+// info, you can uncomment the line below.
+//#define VC8_OR_ABOVE
+
+#define SEARCH_CAP (1024*1024)
+#define WEBSYM "SRV*c:\\websymbols*http://msdl.microsoft.com/download/symbols"
+
+typedef struct {
+  char *name;
+  ULONG64 addr;
+  ULONG flags;
+} SYM;
+
+typedef struct {
+  ULONG64 module_base;
+  SYM *syms;
+  DWORD syms_len;
+  DWORD syms_cap;
+} SYM_CONTEXT;
+
+static int sym_cmp(const void *_s1, const void *_s2) {
+  const SYM *s1 = (const SYM *)_s1;
+  const SYM *s2 = (const SYM *)_s2;
+
+  if (s1->addr < s2->addr)
+    return -1;
+  if (s1->addr > s2->addr)
+    return 1;
+  return 0;
+}
+
+static BOOL CALLBACK EnumSymProc(PSYMBOL_INFO symbol_info,
+                                 ULONG symbol_size,
+                                 PVOID user_context) {
+  SYM_CONTEXT *ctx = (SYM_CONTEXT*)user_context;
+  if (symbol_info->Address < ctx->module_base ||
+      (symbol_info->Flags & SYMFLAG_TLSREL)) {
+    return TRUE;
+  }
+  if (ctx->syms_len == ctx->syms_cap) {
+    if (!ctx->syms_cap)
+      ctx->syms_cap++;
+    ctx->syms_cap *= 2;
+    ctx->syms = realloc(ctx->syms, sizeof(ctx->syms[0]) * ctx->syms_cap);
+  }
+  ctx->syms[ctx->syms_len].name = _strdup(symbol_info->Name);
+  ctx->syms[ctx->syms_len].addr = symbol_info->Address;
+  ctx->syms[ctx->syms_len].flags = symbol_info->Flags;
+  ctx->syms_len++;
+  return TRUE;
+}
+
+static void MaybePrint(const char* var, const char* description) {
+  if (var[0])
+    printf("%s: %s\n", description, var);
+}
+
+static void PrintAvailability(BOOL var, const char *description) {
+  printf("%s: %s\n", description, (var ? "Available" : "Not available"));
+}
+
+static void ShowSymbolInfo(HANDLE process, ULONG64 module_base) {
+  /* Get module information. */
+  IMAGEHLP_MODULE64 module_info;
+  BOOL getmoduleinfo_rv;
+  printf("Load Address: %I64x\n", module_base);
+  memset(&module_info, 0, sizeof(module_info));
+  module_info.SizeOfStruct = sizeof(module_info);
+  getmoduleinfo_rv = SymGetModuleInfo64(process, module_base, &module_info);
+  if (!getmoduleinfo_rv)  {
+    printf("Error: SymGetModuleInfo64() failed. Error code: %u\n",
+           GetLastError());
+    return;
+  }
+  /* Display information about symbols, based on kind of symbol. */
+  switch (module_info.SymType)  {
+    case SymNone:
+      printf(("No symbols available for the module.\n"));
+      break;
+    case SymExport:
+      printf(("Loaded symbols: Exports\n"));
+      break;
+    case SymCoff:
+      printf(("Loaded symbols: COFF\n"));
+      break;
+    case SymCv:
+      printf(("Loaded symbols: CodeView\n"));
+      break;
+    case SymSym:
+      printf(("Loaded symbols: SYM\n"));
+      break;
+    case SymVirtual:
+      printf(("Loaded symbols: Virtual\n"));
+      break;
+    case SymPdb:
+      printf(("Loaded symbols: PDB\n"));
+      break;
+    case SymDia:
+      printf(("Loaded symbols: DIA\n"));
+      break;
+    case SymDeferred:
+      printf(("Loaded symbols: Deferred\n"));  /* not actually loaded */
+      break;
+    default:
+      printf(("Loaded symbols: Unknown format.\n"));
+      break;
+  }
+
+  MaybePrint("Image name", module_info.ImageName);
+  MaybePrint("Loaded image name", module_info.LoadedImageName);
+#ifdef VC8_OR_ABOVE   /* TODO(csilvers): figure out how to tell */
+  MaybePrint("PDB file name", module_info.LoadedPdbName);
+  if (module_info.PdbUnmatched || module_info.DbgUnmatched)  {
+    /* This can only happen if the debug information is contained in a
+     * separate file (.DBG or .PDB)
+     */
+    printf(("Warning: Unmatched symbols.\n"));
+  }
+#endif
+
+  /* Contents */
+#ifdef VC8_OR_ABOVE   /* TODO(csilvers): figure out how to tell */
+  PrintAvailability("Line numbers", module_info.LineNumbers);
+  PrintAvailability("Global symbols", module_info.GlobalSymbols);
+  PrintAvailability("Type information", module_info.TypeInfo);
+#endif
+}
+
+void usage() {
+  fprintf(stderr, "usage: nm-pdb [-C|--demangle] <module or filename>\n");
+}
+
+int main(int argc, char *argv[]) {
+  DWORD  error;
+  HANDLE process;
+  ULONG64 module_base;
+  SYM_CONTEXT ctx;
+  int i;
+  char* search;
+  char* filename = NULL;
+  int rv = 0;
+  /* We may add SYMOPT_UNDNAME if --demangle is specified: */
+  DWORD symopts = SYMOPT_DEFERRED_LOADS | SYMOPT_DEBUG;
+
+  for (i = 1; i < argc; i++) {
+    if (strcmp(argv[i], "--demangle") == 0 || strcmp(argv[i], "-C") == 0) {
+      symopts |= SYMOPT_UNDNAME;
+    } else if (strcmp(argv[i], "--help") == 0) {
+      usage();
+      exit(0);
+    } else {
+      break;
+    }
+  }
+  if (i != argc - 1) {
+    usage();
+    exit(1);
+  }
+  filename = argv[i];
+
+  process = GetCurrentProcess();
+
+  if (!SymInitialize(process, NULL, FALSE)) {
+    error = GetLastError();
+    fprintf(stderr, "SymInitialize returned error : %d\n", error);
+    return 1;
+  }
+
+  search = malloc(SEARCH_CAP);
+  if (SymGetSearchPath(process, search, SEARCH_CAP)) {
+    if (strlen(search) + sizeof(";" WEBSYM) > SEARCH_CAP) {
+      fprintf(stderr, "Search path too long\n");
+      SymCleanup(process);
+      return 1;
+    }
+    strcat(search, ";" WEBSYM);
+  } else {
+    error = GetLastError();
+    fprintf(stderr, "SymGetSearchPath returned error : %d\n", error);
+    rv = 1;                   /* An error, but not a fatal one */
+    strcpy(search, WEBSYM);   /* Use a default value */
+  }
+  if (!SymSetSearchPath(process, search)) {
+    error = GetLastError();
+    fprintf(stderr, "SymSetSearchPath returned error : %d\n", error);
+    rv = 1;                   /* An error, but not a fatal one */
+  }
+
+  SymSetOptions(symopts);
+  module_base = SymLoadModuleEx(process, NULL, filename, NULL, 0, 0, NULL, 0);
+  if (!module_base) {
+    /* SymLoadModuleEx failed */
+    error = GetLastError();
+    fprintf(stderr, "SymLoadModuleEx returned error : %d for %s\n",
+            error, filename);
+    SymCleanup(process);
+    return 1;
+  }
+
+  ShowSymbolInfo(process, module_base);
+
+  memset(&ctx, 0, sizeof(ctx));
+  ctx.module_base = module_base;
+  if (!SymEnumSymbols(process, module_base, NULL, EnumSymProc, &ctx)) {
+    error = GetLastError();
+    fprintf(stderr, "SymEnumSymbols returned error: %d\n", error);
+    rv = 1;
+  } else {
+    DWORD j;
+    qsort(ctx.syms, ctx.syms_len, sizeof(ctx.syms[0]), sym_cmp);
+    for (j = 0; j < ctx.syms_len; j++) {
+      printf("%016I64x X %s\n", ctx.syms[j].addr, ctx.syms[j].name);
+    }
+    /* In a perfect world, maybe we'd clean up ctx's memory? */
+  }
+  SymUnloadModule64(process, module_base);
+  SymCleanup(process);
+  return rv;
+}
diff --git a/src/windows/override_functions.cc b/src/windows/override_functions.cc
new file mode 100644
index 0000000..e7917d3
--- /dev/null
+++ b/src/windows/override_functions.cc
@@ -0,0 +1,123 @@
+// -*- Mode: C++; c-basic-offset: 2; indent-tabs-mode: nil -*-
+// Copyright (c) 2007, Google Inc.
+// All rights reserved.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+// 
+//     * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+//     * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+// 
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// ---
+// Author: Mike Belshe
+// 
+// To link tcmalloc into an EXE or DLL statically without using the patching
+// facility, we can take a stock libcmt and remove all the allocator functions.
+// When we relink the EXE/DLL with the modified libcmt and tcmalloc, a few
+// functions are missing.  This file contains the additional overrides which
+// are required in the VS2005 libcmt in order to link the modified libcmt.
+//
+// See also
+// http://groups.google.com/group/google-perftools/browse_thread/thread/41cd3710af85e57b
+
+#include <config.h>
+
+#ifndef _WIN32
+# error You should only be including this file in a windows environment!
+#endif
+
+#ifndef WIN32_OVERRIDE_ALLOCATORS
+# error This file is intended for use when overriding allocators
+#endif
+
+#include "tcmalloc.cc"
+
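+// Note: this shim zeroes the entire reallocated block (including any bytes
+// that realloc preserved from the old block) and does not check realloc's
+// result for NULL before the memset.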
+extern "C" void* _recalloc(void* p, size_t n, size_t size) {
+  void* result = realloc(p, n * size);
+  memset(result, 0, n * size);
+  return result;
+}
+
+extern "C" void* _calloc_impl(size_t n, size_t size) {
+  return calloc(n, size);
+}
+
+extern "C" size_t _msize(void* p) {
+  return MallocExtension::instance()->GetAllocatedSize(p);
+}
+
+extern "C" intptr_t _get_heap_handle() {
+  return 0;
+}
+
+// The CRT heap initialization stub.
+extern "C" int _heap_init() {
+  // We intentionally leak this object.  It lasts for the process
+  // lifetime.  Trying to teardown at _heap_term() is so late that
+  // you can't do anything useful anyway.
+  new TCMallocGuard();
+  return 1;
+}
+
+// The CRT heap cleanup stub.
+extern "C" void _heap_term() {
+}
+
+extern "C" int _set_new_mode(int flag) {
+  return tc_set_new_mode(flag);
+}
+
+#ifndef NDEBUG
+#undef malloc
+#undef free
+#undef calloc
+int _CrtDbgReport(int, const char*, int, const char*, const char*, ...) {
+  return 0;
+}
+
+int _CrtDbgReportW(int, const wchar_t*, int, const wchar_t*, const wchar_t*, ...) {
+  return 0;
+}
+
+int _CrtSetReportMode(int, int) {
+  return 0;
+}
+
+extern "C" void* _malloc_dbg(size_t size, int , const char*, int) {
+  return malloc(size);
+}
+
+extern "C" void _free_dbg(void* ptr, int) {
+  free(ptr);
+}
+
+extern "C" void* _calloc_dbg(size_t n, size_t size, int, const char*, int) {
+  return calloc(n, size);
+}
+#endif  // NDEBUG
+
+// We set this to 1 because part of the CRT uses a check of _crtheap != 0
+// to test whether the CRT has been initialized.  Once we've ripped out
+// the allocators from libcmt, we need to provide this definition so that
+// the rest of the CRT is still usable.
+extern "C" void* _crtheap = reinterpret_cast<void*>(1);
diff --git a/src/windows/patch_functions.cc b/src/windows/patch_functions.cc
new file mode 100644
index 0000000..ff1bec7
--- /dev/null
+++ b/src/windows/patch_functions.cc
@@ -0,0 +1,1077 @@
+// Copyright (c) 2007, Google Inc.
+// All rights reserved.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+// 
+//     * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+//     * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+// 
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// ---
+// Author: Craig Silverstein
+//
+// The main purpose of this file is to patch the libc allocation
+// routines (malloc and friends, but also _msize and other
+// windows-specific libc-style routines).  However, we also patch
+// windows routines to do accounting.  We do better at the former than
+// the latter.  Here are some comments from Paul Pluzhnikov about what
+// it might take to do a really good job patching windows routines to
+// keep track of memory usage:
+//
+// "You should intercept at least the following:
+//     HeapCreate HeapDestroy HeapAlloc HeapReAlloc HeapFree
+//     RtlCreateHeap RtlDestroyHeap RtlAllocateHeap RtlFreeHeap
+//     malloc calloc realloc free
+//     malloc_dbg calloc_dbg realloc_dbg free_dbg
+// Some of these call the other ones (but not always), sometimes
+// recursively (i.e. HeapCreate may call HeapAlloc on a different
+// heap, IIRC)."
+//
+// Since Paul didn't mention VirtualAllocEx, he may not have even been
+// considering all the mmap-like functions that windows has (or he may
+// just be ignoring it because he's seen we already patch it).  Of the
+// above, we do not patch the *_dbg functions, and of the windows
+// functions, we only patch HeapAlloc and HeapFree.
+//
+// The *_dbg functions come into play with /MDd, /MTd, and /MLd,
+// probably.  It may be ok to just turn off tcmalloc in those cases --
+// if the user wants the windows debug malloc, they probably don't
+// want tcmalloc!  We should also test with all of /MD, /MT, and /ML,
+// which we're not currently doing.
+
+// TODO(csilvers): try to do better here?  Paul does conclude:
+//                 "Keeping track of all of this was a nightmare."
+
+#ifndef _WIN32
+# error You should only be including windows/patch_functions.cc in a windows environment!
+#endif
+
+#include <config.h>
+
+#ifdef WIN32_OVERRIDE_ALLOCATORS
+#error This file is intended for patching allocators - use override_functions.cc instead.
+#endif
+
+// We use psapi.  Non-MSVC systems will have to link this in themselves.
+#ifdef _MSC_VER
+#pragma comment(lib, "Psapi.lib")
+#endif
+
+// Make sure we always use the 'old' names of the psapi functions.
+#ifndef PSAPI_VERSION
+#define PSAPI_VERSION 1
+#endif
+
+#include <windows.h>
+#include <stdio.h>
+#include <malloc.h>       // for _msize and _expand
+#include <psapi.h>        // for EnumProcessModules, GetModuleInformation, etc.
+#include <set>
+#include <map>
+#include <vector>
+#include <base/logging.h>
+#include "base/spinlock.h"
+#include "gperftools/malloc_hook.h"
+#include "malloc_hook-inl.h"
+#include "preamble_patcher.h"
+
+// The maximum number of modules we allow to be in one executable
+const int kMaxModules = 8182;
+
+// These are hard-coded, unfortunately. :-( They are also probably
+// compiler specific.  See get_mangled_names.cc, in this directory,
+// for instructions on how to update these names for your compiler.
+const char kMangledNew[] = "??2@YAPAXI@Z";
+const char kMangledNewArray[] = "??_U@YAPAXI@Z";
+const char kMangledDelete[] = "??3@YAXPAX@Z";
+const char kMangledDeleteArray[] = "??_V@YAXPAX@Z";
+const char kMangledNewNothrow[] = "??2@YAPAXIABUnothrow_t@std@@@Z";
+const char kMangledNewArrayNothrow[] = "??_U@YAPAXIABUnothrow_t@std@@@Z";
+const char kMangledDeleteNothrow[] = "??3@YAXPAXABUnothrow_t@std@@@Z";
+const char kMangledDeleteArrayNothrow[] = "??_V@YAXPAXABUnothrow_t@std@@@Z";
+
+// This is an unused but exported symbol that we can use to tell the
+// MSVC linker to bring in libtcmalloc, via the /INCLUDE linker flag.
+// Without this, the linker will likely decide that libtcmalloc.dll
+// doesn't add anything to the executable (since it does all its work
+// through patching, which the linker can't see), and ignore it
+// entirely.  (The name 'tcmalloc' is already reserved for a
+// namespace.  I'd rather export a variable named "_tcmalloc", but I
+// couldn't figure out how to get that to work.  This function exports
+// the symbol "__tcmalloc".)
+extern "C" PERFTOOLS_DLL_DECL void _tcmalloc();
+void _tcmalloc() { }
+
+// This is the version needed for windows x64, which has a different
+// decoration scheme which doesn't auto-add a leading underscore.
+extern "C" PERFTOOLS_DLL_DECL void __tcmalloc();
+void __tcmalloc() { }
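+
+// For example (illustrative; the exact decorated name depends on the
+// compiler and target architecture), a project forcing the DLL to be kept
+// would typically pass a linker option along the lines of
+// /INCLUDE:"__tcmalloc".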
+
+namespace {    // most everything here is in an unnamed namespace
+
+typedef void (*GenericFnPtr)();
+
+using sidestep::PreamblePatcher;
+
+struct ModuleEntryCopy;   // defined below
+
+// These functions are how we override the memory allocation
+// functions, just like tcmalloc.cc and malloc_hook.cc do.
+
+// This is information about the routines we're patching, for a given
+// module that implements libc memory routines.  A single executable
+// can have several libc implementations running about (in different
+// .dll's), and we need to patch/unpatch them all.  This defines
+// everything except the new functions we're patching in, which
+// are defined in LibcFunctions, below.
+class LibcInfo {
+ public:
+  LibcInfo() {
+    memset(this, 0, sizeof(*this));  // easiest way to initialize the array
+  }
+
+  bool patched() const { return is_valid(); }
+  void set_is_valid(bool b) { is_valid_ = b; }
+  // According to http://msdn.microsoft.com/en-us/library/ms684229(VS.85).aspx:
+  // "The load address of a module (lpBaseOfDll) is the same as the HMODULE
+  // value."
+  HMODULE hmodule() const {
+    return reinterpret_cast<HMODULE>(const_cast<void*>(module_base_address_));
+  }
+
+  // Populates all the windows_fn_[] vars based on our module info.
+  // Returns false if windows_fn_ is all NULL's, because there's
+  // nothing to patch.  Also populates the rest of the module_entry
+  // info, such as the module's name.
+  bool PopulateWindowsFn(const ModuleEntryCopy& module_entry);
+
+ protected:
+  void CopyFrom(const LibcInfo& that) {
+    if (this == &that)
+      return;
+    this->is_valid_ = that.is_valid_;
+    memcpy(this->windows_fn_, that.windows_fn_, sizeof(windows_fn_));
+    this->module_base_address_ = that.module_base_address_;
+    this->module_base_size_ = that.module_base_size_;
+  }
+
+  enum {
+    kMalloc, kFree, kRealloc, kCalloc,
+    kNew, kNewArray, kDelete, kDeleteArray,
+    kNewNothrow, kNewArrayNothrow, kDeleteNothrow, kDeleteArrayNothrow,
+    // These are windows-only functions from malloc.h
+    k_Msize, k_Expand,
+    // A MS CRT "internal" function, implemented using _calloc_impl
+    k_CallocCrt,
+    kNumFunctions
+  };
+
+  // I'd like to put these together in a struct (perhaps in the
+  // subclass, so we can put in perftools_fn_ as well), but vc8 seems
+  // to have a bug where it doesn't initialize the struct properly if
+  // we try to take the address of a function that's not yet loaded
+  // from a dll, as is the common case for static_fn_.  So we need
+  // each to be in its own array. :-(
+  static const char* const function_name_[kNumFunctions];
+
+  // This function is only used when statically linking the binary.
+  // In that case, loading malloc/etc from the dll (via
+  // PatchOneModule) won't work, since there are no dlls.  Instead,
+  // you just want to be taking the address of malloc/etc directly.
+  // In the common, non-static-link case, these pointers will all be
+  // NULL, since this initializer runs before msvcrt.dll is loaded.
+  static const GenericFnPtr static_fn_[kNumFunctions];
+
+  // This is the address of the function we are going to patch
+  // (malloc, etc).  Other info about the function is in the
+  // patch-specific subclasses, below.
+  GenericFnPtr windows_fn_[kNumFunctions];
+
+  // This is set to true when this structure is initialized (because
+  // we're patching a new library) and set to false when it's
+  // uninitialized (because we've freed that library).
+  bool is_valid_;
+
+  const void *module_base_address_;
+  size_t module_base_size_;
+
+ public:
+  // These shouldn't have to be public, since only subclasses of
+  // LibcInfo need them, but they do.  Maybe something to do with
+  // templates.  Shrug.  I hide them down here so users won't see
+  // them. :-)  (OK, I also need to define ctrgProcAddress late.)
+  bool is_valid() const { return is_valid_; }
+  GenericFnPtr windows_fn(int ifunction) const {
+    return windows_fn_[ifunction];
+  }
+  // These three are needed by ModuleEntryCopy.
+  static const int ctrgProcAddress = kNumFunctions;
+  static GenericFnPtr static_fn(int ifunction) {
+    return static_fn_[ifunction];
+  }
+  static const char* const function_name(int ifunction) {
+    return function_name_[ifunction];
+  }
+};
+
+// Template trickiness: logically, a LibcInfo would include
+// Windows_malloc_, origstub_malloc_, and Perftools_malloc_: for a
+// given module, these three go together.  And in fact,
+// Perftools_malloc_ may need to call origstub_malloc_, which means we
+// either need to change Perftools_malloc_ to take origstub_malloc_ as
+// an argument -- unfortunately impossible since it needs to keep the
+// same API as normal malloc -- or we need to write a different
+// version of Perftools_malloc_ for each LibcInfo instance we create.
+// We choose the second route, and use templates to implement it (we
+// could have also used macros).  So to get multiple versions
+// of the struct, we say "struct<1> var1; struct<2> var2;".  The price
+// we pay is some code duplication, and more annoying, each instance
+// of this var is a separate type.
+template<int> class LibcInfoWithPatchFunctions : public LibcInfo {
+ public:
+  // me_info should have had PopulateWindowsFn() called on it, so the
+  // module_* vars and windows_fn_ are set up.
+  bool Patch(const LibcInfo& me_info);
+  void Unpatch();
+
+ private:
+  // This holds the original function contents after we patch the function.
+  // This has to be defined static in the subclass, because the perftools_fns
+  // reference origstub_fn_.
+  static GenericFnPtr origstub_fn_[kNumFunctions];
+
+  // This is the function we want to patch in
+  static const GenericFnPtr perftools_fn_[kNumFunctions];
+
+  static void* Perftools_malloc(size_t size) __THROW;
+  static void Perftools_free(void* ptr) __THROW;
+  static void* Perftools_realloc(void* ptr, size_t size) __THROW;
+  static void* Perftools_calloc(size_t nmemb, size_t size) __THROW;
+  static void* Perftools_new(size_t size);
+  static void* Perftools_newarray(size_t size);
+  static void Perftools_delete(void *ptr);
+  static void Perftools_deletearray(void *ptr);
+  static void* Perftools_new_nothrow(size_t size,
+                                     const std::nothrow_t&) __THROW;
+  static void* Perftools_newarray_nothrow(size_t size,
+                                          const std::nothrow_t&) __THROW;
+  static void Perftools_delete_nothrow(void *ptr,
+                                       const std::nothrow_t&) __THROW;
+  static void Perftools_deletearray_nothrow(void *ptr,
+                                            const std::nothrow_t&) __THROW;
+  static size_t Perftools__msize(void *ptr) __THROW;
+  static void* Perftools__expand(void *ptr, size_t size) __THROW;
+  // malloc.h also defines these functions:
+  //   _aligned_malloc, _aligned_free,
+  //   _recalloc, _aligned_offset_malloc, _aligned_realloc, _aligned_recalloc
+  //   _aligned_offset_realloc, _aligned_offset_recalloc, _malloca, _freea
+  // But they seem pretty obscure, and I'm fine not overriding them for now.
+  // It may be they all call into malloc/free anyway.
+};
+
+// This is a subset of MODULEENTRY32 that we need for patching.
+struct ModuleEntryCopy {
+  LPVOID  modBaseAddr;     // the same as hmodule
+  DWORD   modBaseSize;
+  // This is not part of MODULEENTRY32, but is needed to avoid making
+  // windows syscalls while we're holding patch_all_modules_lock (see
+  // lock-inversion comments at patch_all_modules_lock definition, below).
+  GenericFnPtr rgProcAddresses[LibcInfo::ctrgProcAddress];
+
+  ModuleEntryCopy() {
+    modBaseAddr = NULL;
+    modBaseSize = 0;
+    for (int i = 0; i < sizeof(rgProcAddresses)/sizeof(*rgProcAddresses); i++)
+      rgProcAddresses[i] = LibcInfo::static_fn(i);
+  }
+  ModuleEntryCopy(const MODULEINFO& mi) {
+    this->modBaseAddr = mi.lpBaseOfDll;
+    this->modBaseSize = mi.SizeOfImage;
+    LPVOID modEndAddr = (char*)mi.lpBaseOfDll + mi.SizeOfImage;
+    for (int i = 0; i < sizeof(rgProcAddresses)/sizeof(*rgProcAddresses); i++) {
+      FARPROC target = ::GetProcAddress(
+          reinterpret_cast<const HMODULE>(mi.lpBaseOfDll),
+          LibcInfo::function_name(i));
+      // Sometimes a DLL forwards a function to a function in another
+      // DLL.  We don't want to patch those forwarded functions --
+      // they'll get patched when the other DLL is processed.
+      if (target >= modBaseAddr && target < modEndAddr)
+        rgProcAddresses[i] = (GenericFnPtr)target;
+      else
+        rgProcAddresses[i] = (GenericFnPtr)NULL;
+    }
+  }
+};
+
+// This class is easier because there's only one of them.
+class WindowsInfo {
+ public:
+  void Patch();
+  void Unpatch();
+
+ private:
+  // TODO(csilvers): should we be patching GlobalAlloc/LocalAlloc instead,
+  //                 for pre-XP systems?
+  enum {
+    kHeapAlloc, kHeapFree, kVirtualAllocEx, kVirtualFreeEx,
+    kMapViewOfFileEx, kUnmapViewOfFile, kLoadLibraryExW, kFreeLibrary,
+    kNumFunctions
+  };
+
+  struct FunctionInfo {
+    const char* const name;          // name of fn in a module (eg "malloc")
+    GenericFnPtr windows_fn;         // the fn whose name we call (&malloc)
+    GenericFnPtr origstub_fn;        // original fn contents after we patch
+    const GenericFnPtr perftools_fn; // fn we want to patch in
+  };
+
+  static FunctionInfo function_info_[kNumFunctions];
+
+  // A Windows-API equivalent of malloc and free
+  static LPVOID WINAPI Perftools_HeapAlloc(HANDLE hHeap, DWORD dwFlags,
+                                           DWORD_PTR dwBytes);
+  static BOOL WINAPI Perftools_HeapFree(HANDLE hHeap, DWORD dwFlags,
+                                        LPVOID lpMem);
+  // A Windows-API equivalent of mmap and munmap, for "anonymous regions"
+  static LPVOID WINAPI Perftools_VirtualAllocEx(HANDLE process, LPVOID address,
+                                                SIZE_T size, DWORD type,
+                                                DWORD protect);
+  static BOOL WINAPI Perftools_VirtualFreeEx(HANDLE process, LPVOID address,
+                                             SIZE_T size, DWORD type);
+  // A Windows-API equivalent of mmap and munmap, for actual files
+  static LPVOID WINAPI Perftools_MapViewOfFileEx(HANDLE hFileMappingObject,
+                                                 DWORD dwDesiredAccess,
+                                                 DWORD dwFileOffsetHigh,
+                                                 DWORD dwFileOffsetLow,
+                                                 SIZE_T dwNumberOfBytesToMap,
+                                                 LPVOID lpBaseAddress);
+  static BOOL WINAPI Perftools_UnmapViewOfFile(LPCVOID lpBaseAddress);
+  // We don't need the other 3 variants because they all call this one.
+  static HMODULE WINAPI Perftools_LoadLibraryExW(LPCWSTR lpFileName,
+                                                 HANDLE hFile,
+                                                 DWORD dwFlags);
+  static BOOL WINAPI Perftools_FreeLibrary(HMODULE hLibModule);
+};
+
+// If you run out, just add a few more to the array.  You'll also need
+// to update the switch statement in PatchOneModule(), and the list in
+// UnpatchWindowsFunctions().
+// main_executable and main_executable_windows are two windows into
+// the same executable.  One is responsible for patching the libc
+// routines that live in the main executable (if any) to use tcmalloc;
+// the other is responsible for patching the windows routines like
+// HeapAlloc/etc to use tcmalloc.
+static LibcInfoWithPatchFunctions<0> main_executable;
+static LibcInfoWithPatchFunctions<1> libc1;
+static LibcInfoWithPatchFunctions<2> libc2;
+static LibcInfoWithPatchFunctions<3> libc3;
+static LibcInfoWithPatchFunctions<4> libc4;
+static LibcInfoWithPatchFunctions<5> libc5;
+static LibcInfoWithPatchFunctions<6> libc6;
+static LibcInfoWithPatchFunctions<7> libc7;
+static LibcInfoWithPatchFunctions<8> libc8;
+static LibcInfo* g_module_libcs[] = {
+  &libc1, &libc2, &libc3, &libc4, &libc5, &libc6, &libc7, &libc8
+};
+static WindowsInfo main_executable_windows;
+
+const char* const LibcInfo::function_name_[] = {
+  "malloc", "free", "realloc", "calloc",
+  kMangledNew, kMangledNewArray, kMangledDelete, kMangledDeleteArray,
+  // Ideally we should patch the nothrow versions of new/delete, but
+  // at least in msvcrt, nothrow-new machine-code is of a type we
+  // can't patch.  Since these are relatively rare, I'm hoping it's ok
+  // not to patch them.  (NULL name turns off patching.)
+  NULL,  // kMangledNewNothrow,
+  NULL,  // kMangledNewArrayNothrow,
+  NULL,  // kMangledDeleteNothrow,
+  NULL,  // kMangledDeleteArrayNothrow,
+  "_msize", "_expand", "_calloc_crt",
+};
+
+// For mingw, I can't patch the new/delete here, because the
+// instructions are too small to patch.  Luckily, they're so small
+// because all they do is call into malloc/free, so they still end up
+// calling tcmalloc routines, and we don't actually lose anything
+// (except maybe some stacktrace goodness) by not patching.
+const GenericFnPtr LibcInfo::static_fn_[] = {
+  (GenericFnPtr)&::malloc,
+  (GenericFnPtr)&::free,
+  (GenericFnPtr)&::realloc,
+  (GenericFnPtr)&::calloc,
+#ifdef __MINGW32__
+  NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
+#else
+  (GenericFnPtr)(void*(*)(size_t))&::operator new,
+  (GenericFnPtr)(void*(*)(size_t))&::operator new[],
+  (GenericFnPtr)(void(*)(void*))&::operator delete,
+  (GenericFnPtr)(void(*)(void*))&::operator delete[],
+  (GenericFnPtr)
+  (void*(*)(size_t, struct std::nothrow_t const &))&::operator new,
+  (GenericFnPtr)
+  (void*(*)(size_t, struct std::nothrow_t const &))&::operator new[],
+  (GenericFnPtr)
+  (void(*)(void*, struct std::nothrow_t const &))&::operator delete,
+  (GenericFnPtr)
+  (void(*)(void*, struct std::nothrow_t const &))&::operator delete[],
+#endif
+  (GenericFnPtr)&::_msize,
+  (GenericFnPtr)&::_expand,
+  (GenericFnPtr)&::calloc,
+};
+
+template<int T> GenericFnPtr LibcInfoWithPatchFunctions<T>::origstub_fn_[] = {
+  // This will get filled in at run-time, as patching is done.
+};
+
+template<int T>
+const GenericFnPtr LibcInfoWithPatchFunctions<T>::perftools_fn_[] = {
+  (GenericFnPtr)&Perftools_malloc,
+  (GenericFnPtr)&Perftools_free,
+  (GenericFnPtr)&Perftools_realloc,
+  (GenericFnPtr)&Perftools_calloc,
+  (GenericFnPtr)&Perftools_new,
+  (GenericFnPtr)&Perftools_newarray,
+  (GenericFnPtr)&Perftools_delete,
+  (GenericFnPtr)&Perftools_deletearray,
+  (GenericFnPtr)&Perftools_new_nothrow,
+  (GenericFnPtr)&Perftools_newarray_nothrow,
+  (GenericFnPtr)&Perftools_delete_nothrow,
+  (GenericFnPtr)&Perftools_deletearray_nothrow,
+  (GenericFnPtr)&Perftools__msize,
+  (GenericFnPtr)&Perftools__expand,
+  (GenericFnPtr)&Perftools_calloc,
+};
+
+/*static*/ WindowsInfo::FunctionInfo WindowsInfo::function_info_[] = {
+  { "HeapAlloc", NULL, NULL, (GenericFnPtr)&Perftools_HeapAlloc },
+  { "HeapFree", NULL, NULL, (GenericFnPtr)&Perftools_HeapFree },
+  { "VirtualAllocEx", NULL, NULL, (GenericFnPtr)&Perftools_VirtualAllocEx },
+  { "VirtualFreeEx", NULL, NULL, (GenericFnPtr)&Perftools_VirtualFreeEx },
+  { "MapViewOfFileEx", NULL, NULL, (GenericFnPtr)&Perftools_MapViewOfFileEx },
+  { "UnmapViewOfFile", NULL, NULL, (GenericFnPtr)&Perftools_UnmapViewOfFile },
+  { "LoadLibraryExW", NULL, NULL, (GenericFnPtr)&Perftools_LoadLibraryExW },
+  { "FreeLibrary", NULL, NULL, (GenericFnPtr)&Perftools_FreeLibrary },
+};
+
+bool LibcInfo::PopulateWindowsFn(const ModuleEntryCopy& module_entry) {
+  // First, store the location of the function to patch before
+  // patching it.  If none of these functions are found in the module,
+  // then this module has no libc in it, and we just return false.
+  for (int i = 0; i < kNumFunctions; i++) {
+    if (!function_name_[i])     // we can turn off patching by unsetting name
+      continue;
+    // The ::GetProcAddress calls were done in the ModuleEntryCopy
+    // constructor, so we don't have to make any windows calls here.
+    const GenericFnPtr fn = module_entry.rgProcAddresses[i];
+    if (fn) {
+      windows_fn_[i] = PreamblePatcher::ResolveTarget(fn);
+    }
+  }
+
+  // Some modules use the same function pointer for new and new[].  If
+  // we find that, set one of the pointers to NULL so we don't double-
+  // patch.  Same may happen with new and nothrow-new, or even new[]
+  // and nothrow-new.  It's easiest just to check each fn-ptr against
+  // every other.
+  for (int i = 0; i < kNumFunctions; i++) {
+    for (int j = i+1; j < kNumFunctions; j++) {
+      if (windows_fn_[i] == windows_fn_[j]) {
+        // We NULL the later one (j), so as to minimize the chances we
+        // NULL kFree and kRealloc.  See comments below.  This is fragile!
+        windows_fn_[j] = NULL;
+      }
+    }
+  }
+
+  // There's always a chance that our module uses the same function
+  // as another module that we've already loaded.  In that case, we
+  // need to set our windows_fn to NULL, to avoid double-patching.
+  for (int ifn = 0; ifn < kNumFunctions; ifn++) {
+    for (int imod = 0;
+         imod < sizeof(g_module_libcs)/sizeof(*g_module_libcs);  imod++) {
+      if (g_module_libcs[imod]->is_valid() &&
+          this->windows_fn(ifn) == g_module_libcs[imod]->windows_fn(ifn)) {
+        windows_fn_[ifn] = NULL;
+      }
+    }
+  }
+
+  bool found_non_null = false;
+  for (int i = 0; i < kNumFunctions; i++) {
+    if (windows_fn_[i])
+      found_non_null = true;
+  }
+  if (!found_non_null)
+    return false;
+
+  // It's important we didn't NULL out windows_fn_[kFree] or [kRealloc].
+  // The reason is, if those are NULL-ed out, we'll never patch them
+  // and thus never get an origstub_fn_ value for them, and when we
+  // try to call origstub_fn_[kFree/kRealloc] in Perftools_free and
+  // Perftools_realloc, below, it will fail.  We could work around
+  // that by adding a pointer from one patch-unit to the other, but we
+  // haven't needed to yet.
+  CHECK(windows_fn_[kFree]);
+  CHECK(windows_fn_[kRealloc]);
+
+  // OK, we successfully populated.  Let's store our member information.
+  module_base_address_ = module_entry.modBaseAddr;
+  module_base_size_ = module_entry.modBaseSize;
+  return true;
+}
+
+template<int T>
+bool LibcInfoWithPatchFunctions<T>::Patch(const LibcInfo& me_info) {
+  CopyFrom(me_info);   // copies the module_entry and the windows_fn_ array
+  for (int i = 0; i < kNumFunctions; i++) {
+    if (windows_fn_[i] && windows_fn_[i] != perftools_fn_[i]) {
+      // if origstub_fn_ is not NULL, it's left around from a previous
+      // patch.  We need to set it to NULL for the new Patch call.
+      //
+      // Note that origstub_fn_ was logically freed by
+      // PreamblePatcher::Unpatch, so we don't have to do anything
+      // about it.
+      origstub_fn_[i] = NULL;   // Patch() will fill this in
+      CHECK_EQ(sidestep::SIDESTEP_SUCCESS,
+               PreamblePatcher::Patch(windows_fn_[i], perftools_fn_[i],
+                                      &origstub_fn_[i]));
+    }
+  }
+  set_is_valid(true);
+  return true;
+}
+
+template<int T>
+void LibcInfoWithPatchFunctions<T>::Unpatch() {
+  // We have to cast our GenericFnPtrs to void* for unpatch.  This is
+  // contra the C++ spec; we use C-style casts to emphasize that.
+  for (int i = 0; i < kNumFunctions; i++) {
+    if (windows_fn_[i])
+      CHECK_EQ(sidestep::SIDESTEP_SUCCESS,
+               PreamblePatcher::Unpatch((void*)windows_fn_[i],
+                                        (void*)perftools_fn_[i],
+                                        (void*)origstub_fn_[i]));
+  }
+  set_is_valid(false);
+}
+
+void WindowsInfo::Patch() {
+  HMODULE hkernel32 = ::GetModuleHandleA("kernel32");
+  CHECK_NE(hkernel32, NULL);
+
+  // Unlike for libc, we know these exist in our module, so we can get
+  // and patch at the same time.
+  for (int i = 0; i < kNumFunctions; i++) {
+    function_info_[i].windows_fn = (GenericFnPtr)
+        ::GetProcAddress(hkernel32, function_info_[i].name);
+    // If origstub_fn is not NULL, it's left around from a previous
+    // patch.  We need to set it to NULL for the new Patch call.
+    // Since we've patched Unpatch() not to delete origstub_fn_ (it
+    // causes problems in some contexts, though obviously not this
+    // one), we should delete it now, before setting it to NULL.
+    // NOTE: casting from a function to a pointer is contra the C++
+    //       spec.  It's not safe on IA64, but is on i386.  We use
+    //       a C-style cast here to emphasize this is not legal C++.
+    delete[] (char*)(function_info_[i].origstub_fn);
+    function_info_[i].origstub_fn = NULL;  // Patch() will fill this in
+    CHECK_EQ(sidestep::SIDESTEP_SUCCESS,
+             PreamblePatcher::Patch(function_info_[i].windows_fn,
+                                    function_info_[i].perftools_fn,
+                                    &function_info_[i].origstub_fn));
+  }
+}
+
+void WindowsInfo::Unpatch() {
+  // We have to cast our GenericFnPtrs to void* for unpatch.  This is
+  // contra the C++ spec; we use C-style casts to emphasize that.
+  for (int i = 0; i < kNumFunctions; i++) {
+    CHECK_EQ(sidestep::SIDESTEP_SUCCESS,
+             PreamblePatcher::Unpatch((void*)function_info_[i].windows_fn,
+                                      (void*)function_info_[i].perftools_fn,
+                                      (void*)function_info_[i].origstub_fn));
+  }
+}
+
+// You should hold the patch_all_modules_lock when calling this.
+void PatchOneModuleLocked(const LibcInfo& me_info) {
+  // If we don't already have info on this module, let's add it.  This
+  // is where we're sad that each libcX has a different type, so we
+  // can't use an array; instead, we have to use a switch statement.
+  // Patch() returns false if there were no libc functions in the module.
+  for (int i = 0; i < sizeof(g_module_libcs)/sizeof(*g_module_libcs); i++) {
+    if (!g_module_libcs[i]->is_valid()) {   // found an empty spot to add!
+      switch (i) {
+        case 0: libc1.Patch(me_info); return;
+        case 1: libc2.Patch(me_info); return;
+        case 2: libc3.Patch(me_info); return;
+        case 3: libc4.Patch(me_info); return;
+        case 4: libc5.Patch(me_info); return;
+        case 5: libc6.Patch(me_info); return;
+        case 6: libc7.Patch(me_info); return;
+        case 7: libc8.Patch(me_info); return;
+      }
+    }
+  }
+  printf("PERFTOOLS ERROR: Too many modules containing libc in this executable\n");
+}
+
+void PatchMainExecutableLocked() {
+  if (main_executable.patched())
+    return;    // main executable has already been patched
+  ModuleEntryCopy fake_module_entry;   // make a fake one to pass into Patch()
+  // No need to call PopulateModuleEntryProcAddresses on the main executable.
+  main_executable.PopulateWindowsFn(fake_module_entry);
+  main_executable.Patch(main_executable);
+}
+
+// This lock is subject to a subtle and annoying lock inversion
+// problem: it may interact badly with unknown internal windows locks.
+// In particular, windows may be holding a lock when it calls
+// LoadLibraryExW and FreeLibrary, which we've patched.  We have those
+// routines call PatchAllModules, which acquires this lock.  If we
+// make windows system calls while holding this lock, those system
+// calls may need the internal windows locks that are being held in
+// the call to LoadLibraryExW, resulting in deadlock.  The solution is
+// to be very careful not to call *any* windows routines while holding
+// patch_all_modules_lock, inside PatchAllModules().
+static SpinLock patch_all_modules_lock(SpinLock::LINKER_INITIALIZED);
+
+// last_loaded: The set of modules that were loaded the last time
+// PatchAllModules was called.  This is an optimization for only
+// looking at modules that were added or removed from the last call.
+static std::set<HMODULE> *g_last_loaded;
+
+// Iterates over all the modules currently loaded by the executable,
+// according to windows, and makes sure they're all patched.  Most
+// modules will already be in g_last_loaded, meaning we have already
+// loaded and either patched them or determined they did not need to
+// be patched.  Others will not, which means we need to patch them
+// (if necessary).  Finally, we have to go through the existing
+// g_module_libcs and see if any of those are *not* in the modules
+// currently loaded by the executable.  If so, we need to invalidate
+// them.  Returns true if we did any work (patching or invalidating),
+// false if we were a noop.  May update g_last_loaded as well.
+// NOTE: you must hold the patch_all_modules_lock to access g_last_loaded.
+bool PatchAllModules() {
+  std::vector<ModuleEntryCopy> modules;
+  bool made_changes = false;
+
+  const HANDLE hCurrentProcess = GetCurrentProcess();
+  DWORD num_modules = 0;
+  HMODULE hModules[kMaxModules];  // max # of modules we support in one process
+  if (!::EnumProcessModules(hCurrentProcess, hModules, sizeof(hModules),
+                            &num_modules)) {
+    num_modules = 0;
+  }
+  // EnumProcessModules actually sets num_modules to the number of bytes
+  // written into hModules, so we need to divide to get a module count.
+  num_modules /= sizeof(*hModules);
+  if (num_modules >= kMaxModules) {
+    printf("PERFTOOLS ERROR: Too many modules in this executable to try"
+           " to patch them all (if you need to, raise kMaxModules in"
+           " patch_functions.cc).\n");
+    num_modules = kMaxModules;
+  }
+
+  // Now we handle the unpatching of modules we have in g_module_libcs
+  // but that were not found in EnumProcessModules.  We need to
+  // invalidate them.  To speed that up, we store the EnumProcessModules
+  // output in a set.
+  // At the same time, we prepare for the adding of new modules, by
+  // removing from hModules all the modules we know we've already
+  // patched (or decided don't need to be patched).  At the end,
+  // hModules will hold only the modules that we need to consider patching.
+  std::set<HMODULE> currently_loaded_modules;
+  {
+    SpinLockHolder h(&patch_all_modules_lock);
+    if (!g_last_loaded)  g_last_loaded = new std::set<HMODULE>;
+    // At the end of this loop, currently_loaded_modules contains the
+    // full list of EnumProcessModules, and hModules just the ones we
+    // haven't handled yet.
+    for (int i = 0; i < num_modules; ) {
+      currently_loaded_modules.insert(hModules[i]);
+      if (g_last_loaded->count(hModules[i]) > 0) {
+        hModules[i] = hModules[--num_modules];  // replace element i with tail
+      } else {
+        i++;                                    // keep element i
+      }
+    }
+    // Now we do the unpatching/invalidation.
+    for (int i = 0; i < sizeof(g_module_libcs)/sizeof(*g_module_libcs); i++) {
+      if (g_module_libcs[i]->patched() &&
+          currently_loaded_modules.count(g_module_libcs[i]->hmodule()) == 0) {
+        // Means g_module_libcs[i] is no longer loaded (no enumerated
+        // module matched its hmodule).
+        // We could call Unpatch() here, but why bother?  The module
+        // has gone away, so nobody is going to call into it anyway.
+        g_module_libcs[i]->set_is_valid(false);
+        made_changes = true;
+      }
+    }
+    // Update the loaded module cache.
+    g_last_loaded->swap(currently_loaded_modules);
+  }
+
+  // Now that we know what modules are new, let's get the info we'll
+  // need to patch them.  Note this *cannot* be done while holding the
+  // lock, since it needs to make windows calls (see the lock-inversion
+  // comments before the definition of patch_all_modules_lock).
+  MODULEINFO mi;
+  for (int i = 0; i < num_modules; i++) {
+    if (::GetModuleInformation(hCurrentProcess, hModules[i], &mi, sizeof(mi)))
+      modules.push_back(ModuleEntryCopy(mi));
+  }
+
+  // Now we can do the patching of new modules.
+  {
+    SpinLockHolder h(&patch_all_modules_lock);
+    for (std::vector<ModuleEntryCopy>::iterator it = modules.begin();
+         it != modules.end(); ++it) {
+      LibcInfo libc_info;
+      if (libc_info.PopulateWindowsFn(*it)) { // true==module has libc routines
+        PatchOneModuleLocked(libc_info);
+        made_changes = true;
+      }
+    }
+
+    // Now that we've dealt with the modules (dlls), update the main
+    // executable.  We do this last because PatchMainExecutableLocked
+    // wants to look at how other modules were patched.
+    if (!main_executable.patched()) {
+      PatchMainExecutableLocked();
+      made_changes = true;
+    }
+  }
+  // TODO(csilvers): for this to be reliable, we need to also take
+  // into account if we *would* have patched any modules had they not
+  // already been loaded.  (That is, made_changes should ignore
+  // g_last_loaded.)
+  return made_changes;
+}
+
+
+}  // end unnamed namespace
+
+// ---------------------------------------------------------------------
+// Now that we've done all the patching machinery, let's actually
+// define the functions we're patching in.  Mostly these are
+// simple wrappers around the do_* routines in tcmalloc.cc.
+//
+// In fact, we #include tcmalloc.cc to get at the tcmalloc internal
+// do_* functions, the better to write our own hook functions.
+// U-G-L-Y, I know.  But the alternatives are, perhaps, worse.  This
+// also lets us define _msize(), _expand(), and other windows-specific
+// functions here, using tcmalloc internals, without polluting
+// tcmalloc.cc.
+// -------------------------------------------------------------------
+
+// TODO(csilvers): refactor tcmalloc.cc into two files, so I can link
+// against the file with do_malloc, and ignore the one with malloc.
+#include "tcmalloc.cc"
+
+template<int T>
+void* LibcInfoWithPatchFunctions<T>::Perftools_malloc(size_t size) __THROW {
+  void* result = do_malloc_or_cpp_alloc(size);
+  MallocHook::InvokeNewHook(result, size);
+  return result;
+}
+
+template<int T>
+void LibcInfoWithPatchFunctions<T>::Perftools_free(void* ptr) __THROW {
+  MallocHook::InvokeDeleteHook(ptr);
+  // This calls the windows free if do_free decides ptr was not
+  // allocated by tcmalloc.  Note it calls the origstub_free from
+  // *this* templatized instance of LibcInfo.  See "template
+  // trickiness" above.
+  do_free_with_callback(ptr, (void (*)(void*))origstub_fn_[kFree]);
+}
+
+template<int T>
+void* LibcInfoWithPatchFunctions<T>::Perftools_realloc(
+    void* old_ptr, size_t new_size) __THROW {
+  if (old_ptr == NULL) {
+    void* result = do_malloc_or_cpp_alloc(new_size);
+    MallocHook::InvokeNewHook(result, new_size);
+    return result;
+  }
+  if (new_size == 0) {
+    MallocHook::InvokeDeleteHook(old_ptr);
+    do_free_with_callback(old_ptr,
+                          (void (*)(void*))origstub_fn_[kFree]);
+    return NULL;
+  }
+  return do_realloc_with_callback(
+      old_ptr, new_size,
+      (void (*)(void*))origstub_fn_[kFree],
+      (size_t (*)(const void*))origstub_fn_[k_Msize]);
+}
+
+template<int T>
+void* LibcInfoWithPatchFunctions<T>::Perftools_calloc(
+    size_t n, size_t elem_size) __THROW {
+  void* result = do_calloc(n, elem_size);
+  MallocHook::InvokeNewHook(result, n * elem_size);
+  return result;
+}
+
+template<int T>
+void* LibcInfoWithPatchFunctions<T>::Perftools_new(size_t size) {
+  void* p = cpp_alloc(size, false);
+  MallocHook::InvokeNewHook(p, size);
+  return p;
+}
+
+template<int T>
+void* LibcInfoWithPatchFunctions<T>::Perftools_newarray(size_t size) {
+  void* p = cpp_alloc(size, false);
+  MallocHook::InvokeNewHook(p, size);
+  return p;
+}
+
+template<int T>
+void LibcInfoWithPatchFunctions<T>::Perftools_delete(void *p) {
+  MallocHook::InvokeDeleteHook(p);
+  do_free_with_callback(p, (void (*)(void*))origstub_fn_[kFree]);
+}
+
+template<int T>
+void LibcInfoWithPatchFunctions<T>::Perftools_deletearray(void *p) {
+  MallocHook::InvokeDeleteHook(p);
+  do_free_with_callback(p, (void (*)(void*))origstub_fn_[kFree]);
+}
+
+template<int T>
+void* LibcInfoWithPatchFunctions<T>::Perftools_new_nothrow(
+    size_t size, const std::nothrow_t&) __THROW {
+  void* p = cpp_alloc(size, true);
+  MallocHook::InvokeNewHook(p, size);
+  return p;
+}
+
+template<int T>
+void* LibcInfoWithPatchFunctions<T>::Perftools_newarray_nothrow(
+    size_t size, const std::nothrow_t&) __THROW {
+  void* p = cpp_alloc(size, true);
+  MallocHook::InvokeNewHook(p, size);
+  return p;
+}
+
+template<int T>
+void LibcInfoWithPatchFunctions<T>::Perftools_delete_nothrow(
+    void *p, const std::nothrow_t&) __THROW {
+  MallocHook::InvokeDeleteHook(p);
+  do_free_with_callback(p, (void (*)(void*))origstub_fn_[kFree]);
+}
+
+template<int T>
+void LibcInfoWithPatchFunctions<T>::Perftools_deletearray_nothrow(
+    void *p, const std::nothrow_t&) __THROW {
+  MallocHook::InvokeDeleteHook(p);
+  do_free_with_callback(p, (void (*)(void*))origstub_fn_[kFree]);
+}
+
+
+// _msize() lets you figure out how much space is reserved for a
+// pointer, in Windows.  Even if applications don't call it, any DLL
+// with global constructors will call (transitively) something called
+// __dllonexit_lk in order to make sure the destructors get called
+// when the dll unloads.  And that will call msize -- horrible things
+// can ensue if this is not hooked.  Other parts of libc may also call
+// this internally.
+
+template<int T>
+size_t LibcInfoWithPatchFunctions<T>::Perftools__msize(void* ptr) __THROW {
+  return GetSizeWithCallback(ptr, (size_t (*)(const void*))origstub_fn_[k_Msize]);
+}
+
+// We need to define this because internal windows functions like to
+// call into it(?).  _expand() is like realloc but doesn't move the
+// pointer.  We punt, which will cause callers to fall back on realloc.
+template<int T>
+void* LibcInfoWithPatchFunctions<T>::Perftools__expand(void *ptr,
+                                                       size_t size) __THROW {
+  return NULL;
+}
+
+LPVOID WINAPI WindowsInfo::Perftools_HeapAlloc(HANDLE hHeap, DWORD dwFlags,
+                                               DWORD_PTR dwBytes) {
+  LPVOID result = ((LPVOID (WINAPI *)(HANDLE, DWORD, DWORD_PTR))
+                   function_info_[kHeapAlloc].origstub_fn)(
+                       hHeap, dwFlags, dwBytes);
+  MallocHook::InvokeNewHook(result, dwBytes);
+  return result;
+}
+
+BOOL WINAPI WindowsInfo::Perftools_HeapFree(HANDLE hHeap, DWORD dwFlags,
+                                            LPVOID lpMem) {
+  MallocHook::InvokeDeleteHook(lpMem);
+  return ((BOOL (WINAPI *)(HANDLE, DWORD, LPVOID))
+          function_info_[kHeapFree].origstub_fn)(
+              hHeap, dwFlags, lpMem);
+}
+
+LPVOID WINAPI WindowsInfo::Perftools_VirtualAllocEx(HANDLE process,
+                                                    LPVOID address,
+                                                    SIZE_T size, DWORD type,
+                                                    DWORD protect) {
+  LPVOID result = ((LPVOID (WINAPI *)(HANDLE, LPVOID, SIZE_T, DWORD, DWORD))
+                   function_info_[kVirtualAllocEx].origstub_fn)(
+                       process, address, size, type, protect);
+  // VirtualAllocEx() seems to be the Windows equivalent of mmap()
+  MallocHook::InvokeMmapHook(result, address, size, protect, type, -1, 0);
+  return result;
+}
+
+BOOL WINAPI WindowsInfo::Perftools_VirtualFreeEx(HANDLE process, LPVOID address,
+                                                 SIZE_T size, DWORD type) {
+  MallocHook::InvokeMunmapHook(address, size);
+  return ((BOOL (WINAPI *)(HANDLE, LPVOID, SIZE_T, DWORD))
+          function_info_[kVirtualFreeEx].origstub_fn)(
+              process, address, size, type);
+}
+
+LPVOID WINAPI WindowsInfo::Perftools_MapViewOfFileEx(
+    HANDLE hFileMappingObject, DWORD dwDesiredAccess, DWORD dwFileOffsetHigh,
+    DWORD dwFileOffsetLow, SIZE_T dwNumberOfBytesToMap, LPVOID lpBaseAddress) {
+  // For this function pair, you always deallocate the full block of
+  // data that you allocate, so NewHook/DeleteHook is the right API.
+  LPVOID result = ((LPVOID (WINAPI *)(HANDLE, DWORD, DWORD, DWORD,
+                                      SIZE_T, LPVOID))
+                   function_info_[kMapViewOfFileEx].origstub_fn)(
+                       hFileMappingObject, dwDesiredAccess, dwFileOffsetHigh,
+                       dwFileOffsetLow, dwNumberOfBytesToMap, lpBaseAddress);
+  MallocHook::InvokeNewHook(result, dwNumberOfBytesToMap);
+  return result;
+}
+
+BOOL WINAPI WindowsInfo::Perftools_UnmapViewOfFile(LPCVOID lpBaseAddress) {
+  MallocHook::InvokeDeleteHook(lpBaseAddress);
+  return ((BOOL (WINAPI *)(LPCVOID))
+          function_info_[kUnmapViewOfFile].origstub_fn)(
+              lpBaseAddress);
+}
+
+// g_load_map holds a copy of windows' refcount for how many times
+// each currently loaded module has been loaded and unloaded.  We use
+// it as an optimization when the same module is loaded more than
+// once: as long as the refcount stays above 1, we don't need to worry
+// about patching because it's already patched.  Likewise, we don't
+// need to unpatch until the refcount drops to 0.  load_map is
+// maintained in LoadLibraryExW and FreeLibrary, and only covers
+// modules explicitly loaded/freed via those interfaces.
+static std::map<HMODULE, int>* g_load_map = NULL;
+
+HMODULE WINAPI WindowsInfo::Perftools_LoadLibraryExW(LPCWSTR lpFileName,
+                                                     HANDLE hFile,
+                                                     DWORD dwFlags) {
+  HMODULE rv;
+  // Check to see if the module is already loaded; calling
+  // GetModuleHandleExW with flags of 0 takes a reference on it if so.
+  // If it was already loaded there is no need to call PatchAllModules;
+  // taking the extra reference matches what LoadLibraryExW would have
+  // done internally inside windows.
+  if (::GetModuleHandleExW(0, lpFileName, &rv)) {
+    return rv;
+  } else {
+    // Not already loaded, so load it.
+    rv = ((HMODULE (WINAPI *)(LPCWSTR, HANDLE, DWORD))
+                  function_info_[kLoadLibraryExW].origstub_fn)(
+                      lpFileName, hFile, dwFlags);
+    // This will patch any newly loaded libraries, if patching needs
+    // to be done.
+    PatchAllModules();
+
+    return rv;
+  }
+}
+
+BOOL WINAPI WindowsInfo::Perftools_FreeLibrary(HMODULE hLibModule) {
+  BOOL rv = ((BOOL (WINAPI *)(HMODULE))
+             function_info_[kFreeLibrary].origstub_fn)(hLibModule);
+
+  // Check to see if the module is still loaded by passing the base
+  // address and seeing if it comes back with the same address.  If it
+  // is the same address it's still loaded, so the FreeLibrary() call
+  // was a noop, and there's no need to redo the patching.
+  HMODULE owner = NULL;
+  BOOL result = ::GetModuleHandleExW(
+      (GET_MODULE_HANDLE_EX_FLAG_FROM_ADDRESS |
+       GET_MODULE_HANDLE_EX_FLAG_UNCHANGED_REFCOUNT),
+      (LPCWSTR)hLibModule,
+      &owner);
+  if (result && owner == hLibModule)
+    return rv;
+
+  PatchAllModules();    // this will fix up the list of patched libraries
+  return rv;
+}
+
+
+// ---------------------------------------------------------------------
+// PatchWindowsFunctions()
+//    This is the function that is exposed to the outside world.
+//    It should be called before the program becomes multi-threaded,
+//    since main_executable_windows.Patch() is not thread-safe.
+// ---------------------------------------------------------------------
+
+void PatchWindowsFunctions() {
+  // This does the libc patching in every module, and the main executable.
+  PatchAllModules();
+  main_executable_windows.Patch();
+}
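+
+// Illustrative sketch (not part of the original source): tcmalloc.cc is what
+// actually calls PatchWindowsFunctions, but a hypothetical standalone user of
+// the patcher would do the moral equivalent of
+//   int main() {
+//     PatchWindowsFunctions();   // before any threads are spawned
+//     // ... rest of the program, now running against the patched allocators
+//   }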
+
+#if 0
+// It's possible to unpatch all the functions when we are exiting.
+
+// The idea is to handle properly windows-internal data that is
+// allocated before PatchWindowsFunctions is called.  If all
+// destruction happened in reverse order from construction, then we
+// could call UnpatchWindowsFunctions at just the right time, so that
+// that early-allocated data would be freed using the windows
+// allocation functions rather than tcmalloc.  The problem is that
+// windows allocates some structures lazily, so it would allocate them
+// late (using tcmalloc) and then try to deallocate them late as well.
+// So instead of unpatching, we just modify all the tcmalloc routines
+// so they call through to the libc routines if the memory in
+// question doesn't seem to have been allocated with tcmalloc.  I keep
+// this unpatch code around for reference.
+
+void UnpatchWindowsFunctions() {
+  // We need to go back to the system malloc/etc at global destruct time,
+  // so objects that were constructed before tcmalloc, using the system
+  // malloc, can destroy themselves using the system free.  This depends
+  // on DLLs unloading in the reverse order in which they load!
+  //
+  // We also go back to the default HeapAlloc/etc, just for consistency.
+  // Who knows, it may help avoid weird bugs in some situations.
+  main_executable_windows.Unpatch();
+  main_executable.Unpatch();
+  if (libc1.is_valid()) libc1.Unpatch();
+  if (libc2.is_valid()) libc2.Unpatch();
+  if (libc3.is_valid()) libc3.Unpatch();
+  if (libc4.is_valid()) libc4.Unpatch();
+  if (libc5.is_valid()) libc5.Unpatch();
+  if (libc6.is_valid()) libc6.Unpatch();
+  if (libc7.is_valid()) libc7.Unpatch();
+  if (libc8.is_valid()) libc8.Unpatch();
+}
+#endif
diff --git a/src/windows/port.cc b/src/windows/port.cc
new file mode 100644
index 0000000..76224a2
--- /dev/null
+++ b/src/windows/port.cc
@@ -0,0 +1,235 @@
+// -*- Mode: C++; c-basic-offset: 2; indent-tabs-mode: nil -*-
+/* Copyright (c) 2007, Google Inc.
+ * All rights reserved.
+ * 
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are
+ * met:
+ * 
+ *     * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following disclaimer
+ * in the documentation and/or other materials provided with the
+ * distribution.
+ *     * Neither the name of Google Inc. nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ * 
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * ---
+ * Author: Craig Silverstein
+ */
+
+#ifndef _WIN32
+# error You should only be including windows/port.cc in a windows environment!
+#endif
+
+#define NOMINMAX       // so std::max, below, compiles correctly
+#include <config.h>
+#include <string.h>    // for strlen(), memset(), memcmp()
+#include <assert.h>
+#include <stdarg.h>    // for va_list, va_start, va_end
+#include <algorithm>   // for std::{min,max}
+#include <windows.h>
+#include "port.h"
+#include "base/logging.h"
+#include "base/spinlock.h"
+#include "internal_logging.h"
+
+// -----------------------------------------------------------------------
+// Basic libraries
+
+PERFTOOLS_DLL_DECL
+int getpagesize() {
+  static int pagesize = 0;
+  if (pagesize == 0) {
+    SYSTEM_INFO system_info;
+    GetSystemInfo(&system_info);
+    pagesize = std::max(system_info.dwPageSize,
+                        system_info.dwAllocationGranularity);
+  }
+  return pagesize;
+}
+
+extern "C" PERFTOOLS_DLL_DECL void* __sbrk(ptrdiff_t increment) {
+  LOG(FATAL, "Windows doesn't implement sbrk!\n");
+  return NULL;
+}
+
+// We need to write to 'stderr' without having windows allocate memory.
+// The safest way is via a low-level call like WriteConsoleA().  But
+// even then we need to be sure to print in small bursts so as to not
+// require memory allocation.
+extern "C" PERFTOOLS_DLL_DECL void WriteToStderr(const char* buf, int len) {
+  // Looks like windows allocates for writes of >80 bytes
+  for (int i = 0; i < len; i += 80) {
+    write(STDERR_FILENO, buf + i, std::min(80, len - i));
+  }
+}
+
+
+// -----------------------------------------------------------------------
+// Threads code
+
+// Windows doesn't support pthread_key_create's destr_function, and in
+// fact it's a bit tricky to get code to run when a thread exits.  This
+// is cargo-cult magic from http://www.codeproject.com/threads/tls.asp.
+// This code is for VC++ 7.1 and later; VC++ 6.0 support is possible
+// but more busy-work -- see the webpage for how to do it.  If all
+// this fails, we could use DllMain instead.  The big problem with
+// DllMain is it doesn't run if this code is statically linked into a
+// binary (it also doesn't run if the thread is terminated via
+// TerminateThread, which if we're lucky this routine does).
+
+// Force a reference to _tls_used to make the linker create the TLS directory
+// if it's not already there (that is, even if __declspec(thread) is not used).
+// Force a reference to p_thread_callback_tcmalloc and p_process_term_tcmalloc
+// to prevent whole program optimization from discarding the variables.
+#ifdef _MSC_VER
+#if defined(_M_IX86)
+#pragma comment(linker, "/INCLUDE:__tls_used")
+#pragma comment(linker, "/INCLUDE:_p_thread_callback_tcmalloc")
+#pragma comment(linker, "/INCLUDE:_p_process_term_tcmalloc")
+#elif defined(_M_X64)
+#pragma comment(linker, "/INCLUDE:_tls_used")
+#pragma comment(linker, "/INCLUDE:p_thread_callback_tcmalloc")
+#pragma comment(linker, "/INCLUDE:p_process_term_tcmalloc")
+#endif
+#endif
+
+// When destr_fn eventually runs, it's supposed to take as its
+// argument the tls-value associated with key that pthread_key_create
+// creates.  (Yeah, it sounds confusing but it's really not.)  We
+// store the destr_fn/key pair in this data structure.  Because we
+// store this in a single var, this implies we can only have one
+// destr_fn in a program!  That's enough in practice.  If asserts
+// trigger because we end up needing more, we'll have to turn this
+// into an array.
+struct DestrFnClosure {
+  void (*destr_fn)(void*);
+  pthread_key_t key_for_destr_fn_arg;
+};
+
+static DestrFnClosure destr_fn_info;   // initted to all NULL/0.
+
+static int on_process_term(void) {
+  if (destr_fn_info.destr_fn) {
+    void *ptr = TlsGetValue(destr_fn_info.key_for_destr_fn_arg);
+    // This shouldn't be necessary, but in Release mode, Windows
+    // sometimes trashes the pointer in the TLS slot, so we need to
+    // remove the pointer from the TLS slot before the thread dies.
+    TlsSetValue(destr_fn_info.key_for_destr_fn_arg, NULL);
+    if (ptr)  // pthread semantics say not to call if ptr is NULL
+      (*destr_fn_info.destr_fn)(ptr);
+  }
+  return 0;
+}
+
+static void NTAPI on_tls_callback(HINSTANCE h, DWORD dwReason, PVOID pv) {
+  if (dwReason == DLL_THREAD_DETACH) {   // thread is being destroyed!
+    on_process_term();
+  }
+}
+
+#ifdef _MSC_VER
+
+// extern "C" suppresses C++ name mangling so we know the symbol names
+// for the linker /INCLUDE:symbol pragmas above.
+extern "C" {
+// This tells the linker to run these functions.
+#pragma data_seg(push, old_seg)
+#pragma data_seg(".CRT$XLB")
+void (NTAPI *p_thread_callback_tcmalloc)(
+    HINSTANCE h, DWORD dwReason, PVOID pv) = on_tls_callback;
+#pragma data_seg(".CRT$XTU")
+int (*p_process_term_tcmalloc)(void) = on_process_term;
+#pragma data_seg(pop, old_seg)
+}  // extern "C"
+
+#else  // #ifdef _MSC_VER  [probably msys/mingw]
+
+// We have to try the DllMain solution here, because we can't use the
+// msvc-specific pragmas.
+BOOL WINAPI DllMain(HINSTANCE h, DWORD dwReason, PVOID pv) {
+  if (dwReason == DLL_THREAD_DETACH)
+    on_tls_callback(h, dwReason, pv);
+  else if (dwReason == DLL_PROCESS_DETACH)
+    on_process_term();
+  return TRUE;
+}
+
+#endif  // #ifdef _MSC_VER
+
+extern "C" pthread_key_t PthreadKeyCreate(void (*destr_fn)(void*)) {
+  // Semantics are: we create a new key, and then promise to call
+  // destr_fn with TlsGetValue(key) when the thread is destroyed
+  // (as long as TlsGetValue(key) is not NULL).
+  pthread_key_t key = TlsAlloc();
+  if (destr_fn) {   // register it
+    // If this assert fails, we'll need to support an array of destr_fn_infos
+    assert(destr_fn_info.destr_fn == NULL);
+    destr_fn_info.destr_fn = destr_fn;
+    destr_fn_info.key_for_destr_fn_arg = key;
+  }
+  return key;
+}
+
+// NOTE: this is Win2K and later.  For Win98 we could use a CRITICAL_SECTION...
+extern "C" int perftools_pthread_once(pthread_once_t *once_control,
+                                      void (*init_routine)(void)) {
+  // Try for a fast path first. Note: this should be an acquire semantics read.
+  // It is on x86 and x64, where Windows runs.
+  if (*once_control != 1) {
+    while (true) {
+      switch (InterlockedCompareExchange(once_control, 2, 0)) {
+        case 0:
+          init_routine();
+          InterlockedExchange(once_control, 1);
+          return 0;
+        case 1:
+          // The initializer has already been executed
+          return 0;
+        default:
+          // The initializer is being processed by another thread
+          SwitchToThread();
+      }
+    }
+  }
+  return 0;
+}
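+
+// Illustrative usage (hypothetical caller, not part of this file): callers
+// treat this as a stand-in for pthread_once(3), e.g.
+//   static pthread_once_t g_init_once = PTHREAD_ONCE_INIT;
+//   perftools_pthread_once(&g_init_once, &InitSomething);  // runs InitSomething once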
+
+
+// -----------------------------------------------------------------------
+// These functions rework existing functions of the same name in the
+// Google codebase.
+
+// A replacement for HeapProfiler::CleanupOldProfiles.
+void DeleteMatchingFiles(const char* prefix, const char* full_glob) {
+  WIN32_FIND_DATAA found;  // that final A is for Ansi (as opposed to Unicode)
+  HANDLE hFind = FindFirstFileA(full_glob, &found);   // A is for Ansi
+  if (hFind != INVALID_HANDLE_VALUE) {
+    const int prefix_length = strlen(prefix);
+    do {
+      const char *fname = found.cFileName;
+      if ((strlen(fname) >= prefix_length) &&
+          (memcmp(fname, prefix, prefix_length) == 0)) {
+        RAW_VLOG(0, "Removing old heap profile %s\n", fname);
+        // TODO(csilvers): we really need to unlink dirname + fname
+        _unlink(fname);
+      }
+    } while (FindNextFileA(hFind, &found) != FALSE);  // A is for Ansi
+    FindClose(hFind);
+  }
+}
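+
+// Illustrative call (hypothetical prefix/glob, not from the original source):
+// removing stale heap profiles for a program whose profile prefix is
+// "myprog.heap":
+//   DeleteMatchingFiles("myprog.heap", "myprog.heap*");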
diff --git a/src/windows/port.h b/src/windows/port.h
new file mode 100644
index 0000000..0350f45
--- /dev/null
+++ b/src/windows/port.h
@@ -0,0 +1,497 @@
+// -*- Mode: C++; c-basic-offset: 2; indent-tabs-mode: nil -*-
+/* Copyright (c) 2007, Google Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are
+ * met:
+ *
+ *     * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following disclaimer
+ * in the documentation and/or other materials provided with the
+ * distribution.
+ *     * Neither the name of Google Inc. nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * ---
+ * Author: Craig Silverstein
+ *
+ * These are some portability typedefs and defines to make it a bit
+ * easier to compile this code under VC++.
+ *
+ * Several of these are taken from glib:
+ *    http://developer.gnome.org/doc/API/glib/glib-windows-compatability-functions.html
+ */
+
+#ifndef GOOGLE_BASE_WINDOWS_H_
+#define GOOGLE_BASE_WINDOWS_H_
+
+/* You should never include this file directly, but always include it
+   from either config.h (MSVC) or mingw.h (MinGW/msys). */
+#if !defined(GOOGLE_PERFTOOLS_WINDOWS_CONFIG_H_) && \
+    !defined(GOOGLE_PERFTOOLS_WINDOWS_MINGW_H_)
+# error "port.h should only be included from config.h or mingw.h"
+#endif
+
+#ifdef _WIN32
+
+#ifndef WIN32_LEAN_AND_MEAN
+#define WIN32_LEAN_AND_MEAN  /* We always want minimal includes */
+#endif
+#include <windows.h>
+#include <io.h>              /* because we so often use open/close/etc */
+#include <direct.h>          /* for _getcwd */
+#include <process.h>         /* for _getpid */
+#include <limits.h>          /* for PATH_MAX */
+#include <stdarg.h>          /* for va_list */
+#include <stdio.h>           /* need this to override stdio's (v)snprintf */
+#include <sys/types.h>       /* for _off_t */
+#include <assert.h>
+#include <stdlib.h>          /* for rand, srand, _strtoxxx */
+
+#if _MSC_VER >= 1900
+#define _TIMESPEC_DEFINED
+#include <time.h>
+#endif
+
+/*
+ * 4018: signed/unsigned mismatch is common (and ok for signed_i < unsigned_i)
+ * 4244: otherwise we get problems when subtracting two size_t's to an int
+ * 4288: VC++7 gets confused when a var is defined in a loop and then after it
+ * 4267: too many false positives for "conversion gives possible data loss"
+ * 4290: it's ok windows ignores the "throw" directive
+ * 4996: Yes, we're ok using "unsafe" functions like vsnprintf and getenv()
+ * 4146: internal_logging.cc intentionally negates an unsigned value
+ */
+#ifdef _MSC_VER
+#pragma warning(disable:4018 4244 4288 4267 4290 4996 4146)
+#endif
+
+#ifndef __cplusplus
+/* MSVC does not support C99 */
+# if !defined(__STDC_VERSION__) || __STDC_VERSION__ < 199901L
+#  ifdef _MSC_VER
+#    define inline __inline
+#  else
+#    define inline static
+#  endif
+# endif
+#endif
+
+#ifdef __cplusplus
+# define EXTERN_C  extern "C"
+#else
+# define EXTERN_C  extern
+#endif
+
+/* ----------------------------------- BASIC TYPES */
+
+#ifndef HAVE_STDINT_H
+#ifndef HAVE___INT64    /* we need to have all the __intX names */
+# error  Do not know how to set up type aliases.  Edit port.h for your system.
+#endif
+
+typedef __int8 int8_t;
+typedef __int16 int16_t;
+typedef __int32 int32_t;
+typedef __int64 int64_t;
+typedef unsigned __int8 uint8_t;
+typedef unsigned __int16 uint16_t;
+typedef unsigned __int32 uint32_t;
+typedef unsigned __int64 uint64_t;
+#endif  /* #ifndef HAVE_STDINT_H */
+
+/* I guess MSVC's <types.h> doesn't include ssize_t by default? */
+#ifdef _MSC_VER
+typedef intptr_t ssize_t;
+#endif
+
+/* ----------------------------------- THREADS */
+
+#ifndef HAVE_PTHREAD   /* not true for MSVC, but may be true for MSYS */
+typedef DWORD pthread_t;
+typedef DWORD pthread_key_t;
+typedef LONG pthread_once_t;
+enum { PTHREAD_ONCE_INIT = 0 };   /* important that this be 0! for SpinLock */
+
+inline pthread_t pthread_self(void) {
+  return GetCurrentThreadId();
+}
+
+#ifdef __cplusplus
+inline bool pthread_equal(pthread_t left, pthread_t right) {
+  return left == right;
+}
+
+/*
+ * windows/port.h defines compatibility APIs for several .h files, which
+ * we therefore shouldn't be #including directly.  This hack keeps us from
+ * doing so.  TODO(csilvers): do something more principled.
+ */
+#define GOOGLE_MAYBE_THREADS_H_ 1
+/* This replaces maybe_threads.{h,cc} */
+
+EXTERN_C pthread_key_t PthreadKeyCreate(void (*destr_fn)(void*));  /* port.cc */
+
+inline int perftools_pthread_key_create(pthread_key_t *pkey,
+                                        void (*destructor)(void*)) {
+  pthread_key_t key = PthreadKeyCreate(destructor);
+  if (key != TLS_OUT_OF_INDEXES) {
+    *(pkey) = key;
+    return 0;
+  } else {
+    return GetLastError();
+  }
+}
+
+inline void* perftools_pthread_getspecific(DWORD key) {
+  DWORD err = GetLastError();
+  void* rv = TlsGetValue(key);
+  if (err) SetLastError(err);
+  return rv;
+}
+
+inline int perftools_pthread_setspecific(pthread_key_t key, const void *value) {
+  if (TlsSetValue(key, (LPVOID)value))
+    return 0;
+  else
+    return GetLastError();
+}
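+
+/*
+ * Illustrative usage (hypothetical caller): these shims stand in for the
+ * pthread TLS calls, e.g.
+ *   pthread_key_t cache_key;
+ *   perftools_pthread_key_create(&cache_key, &DestroyThreadCache);
+ *   perftools_pthread_setspecific(cache_key, my_cache);
+ *   void* cache = perftools_pthread_getspecific(cache_key);
+ */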
+
+EXTERN_C int perftools_pthread_once(pthread_once_t *once_control,
+                                    void (*init_routine)(void));
+
+#endif  /* __cplusplus */
+
+inline void sched_yield(void) {
+  Sleep(0);
+}
+
+#endif  /* HAVE_PTHREAD */
+
+/*
+ * __declspec(thread) isn't usable in a dll opened via LoadLibrary().
+ * But it doesn't work to LoadLibrary() us anyway, because of all the
+ * things we need to do before main()!  So this kind of TLS is safe for us.
+ */
+#define __thread __declspec(thread)
+
+/*
+ * This code is obsolete, but I keep it around in case we are ever in
+ * an environment where we can't or don't want to use google spinlocks
+ * (from base/spinlock.{h,cc}).  In that case, uncommenting this,
+ * and removing spinlock.cc from the build, should be enough to revert
+ * to using native spinlocks.
+ */
+#if 0
+// Windows uses a spinlock internally for its mutexes, making our life easy!
+// However, the Windows spinlock must always be initialized, making life hard,
+// since we want LINKER_INITIALIZED.  We work around this by having the
+// linker initialize a bool to 0, and check that before accessing the mutex.
+// This replaces spinlock.{h,cc}, and all the stuff it depends on (atomicops)
+#ifdef __cplusplus
+class SpinLock {
+ public:
+  SpinLock() : initialize_token_(PTHREAD_ONCE_INIT) {}
+  // Used for global SpinLock vars (see base/spinlock.h for more details).
+  enum StaticInitializer { LINKER_INITIALIZED };
+  explicit SpinLock(StaticInitializer) : initialize_token_(PTHREAD_ONCE_INIT) {
+    perftools_pthread_once(&initialize_token_, InitializeMutex);
+  }
+
+  // It's important SpinLock not have a destructor: otherwise we run
+  // into problems when the main thread has exited, but other threads
+  // are still running and try to access a main-thread spinlock.  This
+  // means we leak mutex_ (we should call DeleteCriticalSection()
+  // here).  However, I've verified that all SpinLocks used in
+  // perftools have program-long scope anyway, so the leak is
+  // perfectly fine.  But be aware of this for the future!
+
+  void Lock() {
+    // You'd think this would be unnecessary, since we call
+    // InitializeMutex() in our constructor.  But sometimes Lock() can
+    // be called before our constructor is!  This can only happen in
+    // global constructors, when this is a global.  If we live in
+    // bar.cc, and some global constructor in foo.cc calls a routine
+    // in bar.cc that calls this->Lock(), then Lock() may well run
+    // before our global constructor does.  To protect against that,
+    // we do this check.  For SpinLock objects created after main()
+    // has started, this pthread_once call will always be a noop.
+    perftools_pthread_once(&initialize_token_, InitializeMutex);
+    EnterCriticalSection(&mutex_);
+  }
+  void Unlock() {
+    LeaveCriticalSection(&mutex_);
+  }
+
+  // Used in assertion checks: assert(lock.IsHeld()) (see base/spinlock.h).
+  inline bool IsHeld() const {
+    // This works, but probes undocumented internals, so I've commented it out.
+    // c.f. http://msdn.microsoft.com/msdnmag/issues/03/12/CriticalSections/
+    //return mutex_.LockCount>=0 && mutex_.OwningThread==GetCurrentThreadId();
+    return true;
+  }
+ private:
+  void InitializeMutex() { InitializeCriticalSection(&mutex_); }
+
+  pthread_once_t initialize_token_;
+  CRITICAL_SECTION mutex_;
+};
+
+class SpinLockHolder {  // Acquires a spinlock for as long as the scope lasts
+ private:
+  SpinLock* lock_;
+ public:
+  inline explicit SpinLockHolder(SpinLock* l) : lock_(l) { l->Lock(); }
+  inline ~SpinLockHolder() { lock_->Unlock(); }
+};
+#endif  // #ifdef __cplusplus
+
+// This keeps us from using base/spinlock.h's implementation of SpinLock.
+#define BASE_SPINLOCK_H_ 1
+
+#endif  /* #if 0 */
+
+/* ----------------------------------- MMAP and other memory allocation */
+
+#ifndef HAVE_MMAP   /* not true for MSVC, but may be true for msys */
+#define MAP_FAILED  0
+#define MREMAP_FIXED  2  /* the value in linux, though it doesn't really matter */
+/* These, when combined with the mmap invariants below, yield the proper action */
+#define PROT_READ      PAGE_READWRITE
+#define PROT_WRITE     PAGE_READWRITE
+#define MAP_ANONYMOUS  MEM_RESERVE
+#define MAP_PRIVATE    MEM_COMMIT
+#define MAP_SHARED     MEM_RESERVE   /* value of this #define is 100% arbitrary */
+
+#if __STDC__ && !defined(__MINGW32__)
+typedef _off_t off_t;
+#endif
+
+/* VirtualAlloc only replaces for mmap when certain invariants are kept. */
+inline void *mmap(void *addr, size_t length, int prot, int flags,
+                  int fd, off_t offset) {
+  if (addr == NULL && fd == -1 && offset == 0 &&
+      prot == (PROT_READ|PROT_WRITE) && flags == (MAP_PRIVATE|MAP_ANONYMOUS)) {
+    return VirtualAlloc(0, length, MEM_RESERVE | MEM_COMMIT, PAGE_READWRITE);
+  } else {
+    return NULL;
+  }
+}
+
+inline int munmap(void *addr, size_t length) {
+  return VirtualFree(addr, 0, MEM_RELEASE) ? 0 : -1;
+}
+#endif  /* HAVE_MMAP */
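+
+/*
+ * Illustrative: the only call pattern the mmap() shim above accepts (anything
+ * else returns NULL, which is MAP_FAILED here) is the anonymous mapping an
+ * allocator makes, e.g.
+ *   void* p = mmap(NULL, len, PROT_READ|PROT_WRITE,
+ *                  MAP_PRIVATE|MAP_ANONYMOUS, -1, 0);
+ *   ...
+ *   munmap(p, len);
+ */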
+
+/* We could maybe use VirtualAlloc for sbrk as well, but no need */
+inline void *sbrk(intptr_t increment) {
+  // sbrk returns -1 on failure
+  return (void*)-1;
+}
+
+
+/* ----------------------------------- STRING ROUTINES */
+
+/*
+ * We can't just use _vsnprintf and _snprintf as drop-in-replacements,
+ * because they don't always NUL-terminate. :-(  We also can't use the
+ * name vsnprintf, since windows defines that (but not snprintf (!)).
+ */
+#if defined(_MSC_VER) && _MSC_VER >= 1400
+/* We can use the safe CRT functions, which have the required functionality */
+inline int perftools_vsnprintf(char *str, size_t size, const char *format,
+                               va_list ap) {
+  return vsnprintf_s(str, size, _TRUNCATE, format, ap);
+}
+#else
+inline int perftools_vsnprintf(char *str, size_t size, const char *format,
+                               va_list ap) {
+  if (size == 0)        /* not even room for a \0? */
+    return -1;        /* not what C99 says to do, but what windows does */
+  str[size-1] = '\0';
+  return _vsnprintf(str, size-1, format, ap);
+}
+#endif
+
+#ifndef HAVE_SNPRINTF
+inline int snprintf(char *str, size_t size, const char *format, ...) {
+  va_list ap;
+  int r;
+  va_start(ap, format);
+  r = perftools_vsnprintf(str, size, format, ap);
+  va_end(ap);
+  return r;
+}
+#endif
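+
+/*
+ * Illustrative: unlike a raw _snprintf, the wrappers above always leave the
+ * buffer NUL-terminated, e.g.
+ *   char buf[32];
+ *   snprintf(buf, sizeof(buf), "%d pages", n);   // buf is always terminated
+ */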
+
+#define PRIx64  "I64x"
+#define SCNx64  "I64x"
+#define PRId64  "I64d"
+#define SCNd64  "I64d"
+#define PRIu64  "I64u"
+#ifdef _WIN64
+# define PRIuPTR "llu"
+# define PRIxPTR "llx"
+#else
+# define PRIuPTR "lu"
+# define PRIxPTR "lx"
+#endif
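+
+/*
+ * Illustrative: these macros plug into printf-style format strings, e.g.
+ *   fprintf(stderr, "ptr=0x%" PRIxPTR " bytes=%" PRIu64 "\n",
+ *           (uintptr_t)ptr, (uint64_t)bytes);
+ */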
+
+/* ----------------------------------- FILE IO */
+
+#ifndef PATH_MAX
+#define PATH_MAX 1024
+#endif
+#ifndef __MINGW32__
+enum { STDIN_FILENO = 0, STDOUT_FILENO = 1, STDERR_FILENO = 2 };
+#endif
+#ifndef O_RDONLY
+#define O_RDONLY  _O_RDONLY
+#endif
+
+#if __STDC__ && !defined(__MINGW32__)
+/* These functions are considered non-standard */
+inline int access(const char *pathname, int mode) {
+  return _access(pathname, mode);
+}
+inline int open(const char *pathname, int flags, int mode = 0) {
+  return _open(pathname, flags, mode);
+}
+inline int close(int fd) {
+  return _close(fd);
+}
+inline ssize_t read(int fd, void *buf, size_t count) {
+  return _read(fd, buf, count);
+}
+inline ssize_t write(int fd, const void *buf, size_t count) {
+  return _write(fd, buf, count);
+}
+inline off_t lseek(int fd, off_t offset, int whence) {
+  return _lseek(fd, offset, whence);
+}
+inline char *getcwd(char *buf, size_t size) {
+  return _getcwd(buf, size);
+}
+inline int mkdir(const char *pathname, int) {
+  return _mkdir(pathname);
+}
+
+inline FILE *popen(const char *command, const char *type) {
+  return _popen(command, type);
+}
+inline int pclose(FILE *stream) {
+  return _pclose(stream);
+}
+#endif
+
+EXTERN_C PERFTOOLS_DLL_DECL void WriteToStderr(const char* buf, int len);
+
+/* ----------------------------------- SYSTEM/PROCESS */
+
+#ifndef HAVE_PID_T
+typedef int pid_t;
+#endif
+
+#if __STDC__ && !defined(__MINGW32__)
+inline pid_t getpid(void) { return _getpid(); }
+#endif
+inline pid_t getppid(void) { return 0; }
+
+/* Handle case when poll is used to simulate sleep. */
+inline int poll(struct pollfd* fds, int nfds, int timeout) {
+  assert(fds == NULL);
+  assert(nfds == 0);
+  Sleep(timeout);
+  return 0;
+}
+
+EXTERN_C PERFTOOLS_DLL_DECL int getpagesize();   /* in port.cc */
+
+/* ----------------------------------- OTHER */
+
+inline void srandom(unsigned int seed) { srand(seed); }
+inline long random(void) { return rand(); }
+
+#ifndef HAVE_DECL_SLEEP
+#define HAVE_DECL_SLEEP 0
+#endif
+
+#if !HAVE_DECL_SLEEP
+inline unsigned int sleep(unsigned int seconds) {
+  Sleep(seconds * 1000);
+  return 0;
+}
+#endif
+
+// mingw64 seems to define timespec (though mingw.org mingw doesn't),
+// protected by the _TIMESPEC_DEFINED macro.
+#ifndef _TIMESPEC_DEFINED
+struct timespec {
+  int tv_sec;
+  int tv_nsec;
+};
+#endif
+
+#ifndef HAVE_DECL_NANOSLEEP
+#define HAVE_DECL_NANOSLEEP 0
+#endif
+
+// latest mingw64 has nanosleep. Earlier mingw and MSVC do not
+#if !HAVE_DECL_NANOSLEEP
+inline int nanosleep(const struct timespec *req, struct timespec *rem) {
+  Sleep(req->tv_sec * 1000 + req->tv_nsec / 1000000);
+  return 0;
+}
+#endif
+
+#ifndef __MINGW32__
+#if _MSC_VER < 1800
+inline long long int strtoll(const char *nptr, char **endptr, int base) {
+    return _strtoi64(nptr, endptr, base);
+}
+inline unsigned long long int strtoull(const char *nptr, char **endptr,
+                                       int base) {
+    return _strtoui64(nptr, endptr, base);
+}
+inline long long int strtoq(const char *nptr, char **endptr, int base) {
+    return _strtoi64(nptr, endptr, base);
+}
+#endif
+inline unsigned long long int strtouq(const char *nptr, char **endptr,
+                                      int base) {
+    return _strtoui64(nptr, endptr, base);
+}
+inline long long atoll(const char *nptr) {
+  return _atoi64(nptr);
+}
+#endif
+
+#define __THROW throw()
+
+/* ----------------------------------- TCMALLOC-SPECIFIC */
+
+/* tcmalloc.cc calls this so we can patch VirtualAlloc() et al. */
+extern void PatchWindowsFunctions();
+
+#endif  /* _WIN32 */
+
+#undef inline
+#undef EXTERN_C
+
+#endif  /* GOOGLE_BASE_WINDOWS_H_ */
diff --git a/src/windows/preamble_patcher.cc b/src/windows/preamble_patcher.cc
new file mode 100644
index 0000000..ec05537
--- /dev/null
+++ b/src/windows/preamble_patcher.cc
@@ -0,0 +1,736 @@
+// -*- Mode: C++; c-basic-offset: 2; indent-tabs-mode: nil -*-
+/* Copyright (c) 2007, Google Inc.
+ * All rights reserved.
+ * 
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are
+ * met:
+ * 
+ *     * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following disclaimer
+ * in the documentation and/or other materials provided with the
+ * distribution.
+ *     * Neither the name of Google Inc. nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ * 
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * ---
+ * Author: Joi Sigurdsson
+ * Author: Scott Francis
+ *
+ * Implementation of PreamblePatcher
+ */
+
+#include "preamble_patcher.h"
+
+#include "mini_disassembler.h"
+
+// compatibility shims
+#include "base/logging.h"
+
+// Definitions of assembly statements we need
+#define ASM_JMP32REL 0xE9
+#define ASM_INT3 0xCC
+#define ASM_JMP32ABS_0 0xFF
+#define ASM_JMP32ABS_1 0x25
+#define ASM_JMP8REL 0xEB
+#define ASM_JCC32REL_0 0x0F
+#define ASM_JCC32REL_1_MASK 0x80
+#define ASM_NOP 0x90
+// X64 opcodes
+#define ASM_REXW 0x48
+#define ASM_MOVRAX_IMM 0xB8
+#define ASM_JMP 0xFF
+#define ASM_JMP_RAX 0xE0
+
+namespace sidestep {
+
+PreamblePatcher::PreamblePage* PreamblePatcher::preamble_pages_ = NULL;
+long PreamblePatcher::granularity_ = 0;
+long PreamblePatcher::pagesize_ = 0;
+bool PreamblePatcher::initialized_ = false;
+
+static const unsigned int kPreamblePageMagic = 0x4347414D; // "MAGC"
+
+// Handle a special case that we see with functions that point into an
+// IAT table (including functions linked statically into the
+// application): these functions already start with ASM_JMP32*.  For
+// instance, malloc() might be implemented as a JMP to __malloc().
+// This function follows the initial JMPs for us, until we get to the
+// place where the actual code is defined.  If we get to STOP_BEFORE,
+// we return the address before stop_before.  The stop_before_trampoline
+// flag is used in 64-bit mode.  If true, we will return the address
+// before a trampoline is detected.  Trampolines are defined as:
+//
+//    nop
+//    mov rax, <replacement_function>
+//    jmp rax
+//
+// See PreamblePatcher::RawPatchWithStub for more information.
+void* PreamblePatcher::ResolveTargetImpl(unsigned char* target,
+                                         unsigned char* stop_before,
+                                         bool stop_before_trampoline) {
+  if (target == NULL)
+    return NULL;
+  while (1) {
+    unsigned char* new_target;
+    if (target[0] == ASM_JMP32REL) {
+      // target[1-4] holds the place the jmp goes to, but it's
+      // relative to the next instruction.
+      int relative_offset;   // Windows guarantees int is 4 bytes
+      SIDESTEP_ASSERT(sizeof(relative_offset) == 4);
+      memcpy(reinterpret_cast<void*>(&relative_offset),
+             reinterpret_cast<void*>(target + 1), 4);
+      new_target = target + 5 + relative_offset;
+    } else if (target[0] == ASM_JMP8REL) {
+      // Visual Studio 7.1 implements new[] as an 8 bit jump to new
+      signed char relative_offset;
+      memcpy(reinterpret_cast<void*>(&relative_offset),
+             reinterpret_cast<void*>(target + 1), 1);
+      new_target = target + 2 + relative_offset;
+    } else if (target[0] == ASM_JMP32ABS_0 &&
+               target[1] == ASM_JMP32ABS_1) {
+    jmp32rel:
+      // Visual studio seems to sometimes do it this way instead of the
+      // previous way.  Not sure what the rules are, but it was happening
+      // with operator new in some binaries.
+      void** new_target_v;
+      if (kIs64BitBinary) {
+        // In 64-bit mode JMPs are RIP-relative, not absolute
+        int target_offset;
+        memcpy(reinterpret_cast<void*>(&target_offset),
+               reinterpret_cast<void*>(target + 2), 4);
+        new_target_v = reinterpret_cast<void**>(target + target_offset + 6);
+      } else {
+        SIDESTEP_ASSERT(sizeof(new_target) == 4);
+        memcpy(&new_target_v, reinterpret_cast<void*>(target + 2), 4);
+      }
+      new_target = reinterpret_cast<unsigned char*>(*new_target_v);
+    } else if (kIs64BitBinary && target[0] == ASM_REXW
+               && target[1] == ASM_JMP32ABS_0
+               && target[2] == ASM_JMP32ABS_1) {
+      // in Visual Studio 2012 we're seeing jumps like this:
+      //   rex.W jmpq *0x11d019(%rip)
+      //
+      // according to the docs I have, the rex prefix is actually unneeded
+      // and can be ignored, i.e. the operand of jumps like this already
+      // defaults to 64-bit.  But it clearly breaks the absolute-jump
+      // detection above, so we just skip the rex byte.
+      target++;
+      goto jmp32rel;
+    } else {
+      break;
+    }
+    if (new_target == stop_before)
+      break;
+    if (stop_before_trampoline && *new_target == ASM_NOP
+        && new_target[1] == ASM_REXW && new_target[2] == ASM_MOVRAX_IMM)
+      break;
+    target = new_target;
+  }
+  return target;
+}
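+
+// Illustrative (hypothetical layout): if malloc is linked as a thunk,
+//   malloc:    E9 xx xx xx xx      ; jmp __malloc
+//   __malloc:  ...                 ; the real implementation
+// then resolving malloc with this routine follows the jmp and yields the
+// address of __malloc, which is what the patcher actually rewrites.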
+
+// Special case scoped_ptr to avoid dependency on scoped_ptr below.
+class DeleteUnsignedCharArray {
+ public:
+  DeleteUnsignedCharArray(unsigned char* array) : array_(array) {
+  }
+
+  ~DeleteUnsignedCharArray() {
+    if (array_) {
+      PreamblePatcher::FreePreambleBlock(array_);
+    }
+  }
+
+  unsigned char* Release() {
+    unsigned char* temp = array_;
+    array_ = NULL;
+    return temp;
+  }
+
+ private:
+  unsigned char* array_;
+};
+
+SideStepError PreamblePatcher::RawPatchWithStubAndProtections(
+    void* target_function, void *replacement_function,
+    unsigned char* preamble_stub, unsigned long stub_size,
+    unsigned long* bytes_needed) {
+  // We need to be able to write to a process-local copy of the first
+  // MAX_PREAMBLE_STUB_SIZE bytes of target_function
+  DWORD old_target_function_protect = 0;
+  BOOL succeeded = ::VirtualProtect(reinterpret_cast<void*>(target_function),
+                                    MAX_PREAMBLE_STUB_SIZE,
+                                    PAGE_EXECUTE_READWRITE,
+                                    &old_target_function_protect);
+  if (!succeeded) {
+    SIDESTEP_ASSERT(false && "Failed to make page containing target function "
+                    "copy-on-write.");
+    return SIDESTEP_ACCESS_DENIED;
+  }
+
+  SideStepError error_code = RawPatchWithStub(target_function,
+                                              replacement_function,
+                                              preamble_stub,
+                                              stub_size,
+                                              bytes_needed);
+
+  // Restore the protection of the first MAX_PREAMBLE_STUB_SIZE bytes of
+  // target_function to what they were before we started goofing around.
+  // We do this regardless of whether the patch succeeded or not.
+  succeeded = ::VirtualProtect(reinterpret_cast<void*>(target_function),
+                               MAX_PREAMBLE_STUB_SIZE,
+                               old_target_function_protect,
+                               &old_target_function_protect);
+  if (!succeeded) {
+    SIDESTEP_ASSERT(false &&
+                    "Failed to restore protection to target function.");
+    // We must not return an error here because the function has
+    // likely actually been patched, and returning an error might
+    // cause our client code not to unpatch it.  So we just keep
+    // going.
+  }
+
+  if (SIDESTEP_SUCCESS != error_code) {  // Testing RawPatchWithStub, above
+    SIDESTEP_ASSERT(false);
+    return error_code;
+  }
+
+  // Flush the instruction cache to make sure the processor doesn't execute the
+  // old version of the instructions (before our patch).
+  //
+  // FlushInstructionCache is actually a no-op at least on
+  // single-processor XP machines.  I'm not sure why this is so, but
+  // it is, yet I want to keep the call to the API here for
+  // correctness in case there is a difference in some variants of
+  // Windows/hardware.
+  succeeded = ::FlushInstructionCache(::GetCurrentProcess(),
+                                      target_function,
+                                      MAX_PREAMBLE_STUB_SIZE);
+  if (!succeeded) {
+    SIDESTEP_ASSERT(false && "Failed to flush instruction cache.");
+    // We must not return an error here because the function has actually
+    // been patched, and returning an error would likely cause our client
+    // code not to unpatch it.  So we just keep going.
+  }
+
+  return SIDESTEP_SUCCESS;
+}
+
+SideStepError PreamblePatcher::RawPatch(void* target_function,
+                                        void* replacement_function,
+                                        void** original_function_stub) {
+  if (!target_function || !replacement_function || !original_function_stub ||
+      (*original_function_stub) || target_function == replacement_function) {
+    SIDESTEP_ASSERT(false && "Preconditions not met");
+    return SIDESTEP_INVALID_PARAMETER;
+  }
+
+  BOOL succeeded = FALSE;
+
+  // First, deal with a special case that we see with functions that
+  // point into an IAT table (including functions linked statically
+  // into the application): these functions already start with
+  // ASM_JMP32REL.  For instance, malloc() might be implemented as a
+  // JMP to __malloc().  In that case, we replace the destination of
+  // the JMP (__malloc), rather than the JMP itself (malloc).  This
+  // way we get the correct behavior no matter how malloc gets called.
+  void* new_target = ResolveTarget(target_function);
+  if (new_target != target_function) {
+    target_function = new_target;
+  }
+
+  // In 64-bit mode, preamble_stub must be within 2GB of target function
+  // so that if target contains a jump, we can translate it.
+  unsigned char* preamble_stub = AllocPreambleBlockNear(target_function);
+  if (!preamble_stub) {
+    SIDESTEP_ASSERT(false && "Unable to allocate preamble-stub.");
+    return SIDESTEP_INSUFFICIENT_BUFFER;
+  }
+
+  // Frees the array at end of scope.
+  DeleteUnsignedCharArray guard_preamble_stub(preamble_stub);
+
+  SideStepError error_code = RawPatchWithStubAndProtections(
+      target_function, replacement_function, preamble_stub,
+      MAX_PREAMBLE_STUB_SIZE, NULL);
+
+  if (SIDESTEP_SUCCESS != error_code) {
+    SIDESTEP_ASSERT(false);
+    return error_code;
+  }
+
+  // Flush the instruction cache to make sure the processor doesn't execute the
+  // old version of the instructions (before our patch).
+  //
+  // FlushInstructionCache is actually a no-op at least on
+  // single-processor XP machines.  I'm not sure why this is so, but
+  // it is, yet I want to keep the call to the API here for
+  // correctness in case there is a difference in some variants of
+  // Windows/hardware.
+  succeeded = ::FlushInstructionCache(::GetCurrentProcess(),
+                                      target_function,
+                                      MAX_PREAMBLE_STUB_SIZE);
+  if (!succeeded) {
+    SIDESTEP_ASSERT(false && "Failed to flush instruction cache.");
+    // We must not return an error here because the function has actually
+    // been patched, and returning an error would likely cause our client
+    // code not to unpatch it.  So we just keep going.
+  }
+
+  SIDESTEP_LOG("PreamblePatcher::RawPatch successfully patched.");
+
+  // detach the scoped pointer so the memory is not freed
+  *original_function_stub =
+      reinterpret_cast<void*>(guard_preamble_stub.Release());
+  return SIDESTEP_SUCCESS;
+}
+
+SideStepError PreamblePatcher::Unpatch(void* target_function,
+                                       void* replacement_function,
+                                       void* original_function_stub) {
+  SIDESTEP_ASSERT(target_function && replacement_function &&
+                  original_function_stub);
+  if (!target_function || !replacement_function ||
+      !original_function_stub) {
+    return SIDESTEP_INVALID_PARAMETER;
+  }
+
+  // Before unpatching, target_function should be a JMP to
+  // replacement_function.  If it's not, then either it's an error, or
+  // we're falling into the case where the original instruction was a
+  // JMP, and we patched the jumped_to address rather than the JMP
+  // itself.  (For instance, if malloc() is just a JMP to __malloc(),
+  // we patched __malloc() and not malloc().)
+  unsigned char* target = reinterpret_cast<unsigned char*>(target_function);
+  target = reinterpret_cast<unsigned char*>(
+      ResolveTargetImpl(
+          target, reinterpret_cast<unsigned char*>(replacement_function),
+          true));
+  // We should end at the function we patched.  When we patch, we insert
+  // an ASM_JMP32REL instruction, so look for that as a sanity check.
+  if (target[0] != ASM_JMP32REL) {
+    SIDESTEP_ASSERT(false &&
+                    "target_function does not look like it was patched.");
+    return SIDESTEP_INVALID_PARAMETER;
+  }
+
+  const unsigned int kRequiredTargetPatchBytes = 5;
+
+  // We need to be able to write to a process-local copy of the first
+  // kRequiredTargetPatchBytes bytes of target_function
+  DWORD old_target_function_protect = 0;
+  BOOL succeeded = ::VirtualProtect(reinterpret_cast<void*>(target),
+                                    kRequiredTargetPatchBytes,
+                                    PAGE_EXECUTE_READWRITE,
+                                    &old_target_function_protect);
+  if (!succeeded) {
+    SIDESTEP_ASSERT(false && "Failed to make page containing target function "
+                    "copy-on-write.");
+    return SIDESTEP_ACCESS_DENIED;
+  }
+
+  unsigned char* preamble_stub = reinterpret_cast<unsigned char*>(
+                                   original_function_stub);
+
+  // Disassemble the preamble stub and copy its bytes back to the target.
+  // If we've done any conditional jumps in the preamble we need to convert
+  // them back to the original REL8 jumps in the target.
+  MiniDisassembler disassembler;
+  unsigned int preamble_bytes = 0;
+  unsigned int target_bytes = 0;
+  while (target_bytes < kRequiredTargetPatchBytes) {
+    unsigned int cur_bytes = 0;
+    InstructionType instruction_type =
+        disassembler.Disassemble(preamble_stub + preamble_bytes, cur_bytes);
+    if (IT_JUMP == instruction_type) {
+      unsigned int jump_bytes = 0;
+      SideStepError jump_ret = SIDESTEP_JUMP_INSTRUCTION;
+      if (IsNearConditionalJump(preamble_stub + preamble_bytes, cur_bytes) ||
+          IsNearRelativeJump(preamble_stub + preamble_bytes, cur_bytes) ||
+          IsNearAbsoluteCall(preamble_stub + preamble_bytes, cur_bytes) ||
+          IsNearRelativeCall(preamble_stub + preamble_bytes, cur_bytes)) {
+        jump_ret = PatchNearJumpOrCall(preamble_stub + preamble_bytes, 
+                                       cur_bytes, target + target_bytes, 
+                                       &jump_bytes, MAX_PREAMBLE_STUB_SIZE);
+      }
+      if (jump_ret == SIDESTEP_JUMP_INSTRUCTION) {
+        SIDESTEP_ASSERT(false &&
+                        "Found unsupported jump instruction in stub!!");
+        return SIDESTEP_UNSUPPORTED_INSTRUCTION;
+      }
+      target_bytes += jump_bytes;
+    } else if (IT_GENERIC == instruction_type) {
+      if (IsMovWithDisplacement(preamble_stub + preamble_bytes, cur_bytes)) {
+        unsigned int mov_bytes = 0;
+        if (PatchMovWithDisplacement(preamble_stub + preamble_bytes, cur_bytes,
+                                     target + target_bytes, &mov_bytes,
+                                     MAX_PREAMBLE_STUB_SIZE)
+                                     != SIDESTEP_SUCCESS) {
+          SIDESTEP_ASSERT(false &&
+                          "Found unsupported generic instruction in stub!!");
+          return SIDESTEP_UNSUPPORTED_INSTRUCTION;
+        }
+      } else {
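+        // Any other generic instruction is copied back to the target
+        // verbatim.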
+        memcpy(reinterpret_cast<void*>(target + target_bytes),
+               reinterpret_cast<void*>(reinterpret_cast<unsigned char*>(
+                   original_function_stub) + preamble_bytes), cur_bytes);
+        target_bytes += cur_bytes;
+      }
+    } else {
+      SIDESTEP_ASSERT(false &&
+                      "Found unsupported instruction in stub!!");
+      return SIDESTEP_UNSUPPORTED_INSTRUCTION;
+    }
+    preamble_bytes += cur_bytes;
+  }
+
+  FreePreambleBlock(reinterpret_cast<unsigned char*>(original_function_stub));
+
+  // Restore the protection of the first kRequiredTargetPatchBytes bytes of
+  // target to what they were before we started goofing around.
+  succeeded = ::VirtualProtect(reinterpret_cast<void*>(target),
+                               kRequiredTargetPatchBytes,
+                               old_target_function_protect,
+                               &old_target_function_protect);
+
+  // Flush the instruction cache to make sure the processor doesn't execute the
+  // old version of the instructions (before our patch).
+  //
+  // See comment on FlushInstructionCache elsewhere in this file.
+  succeeded = ::FlushInstructionCache(::GetCurrentProcess(),
+                                      target,
+                                      MAX_PREAMBLE_STUB_SIZE);
+  if (!succeeded) {
+    SIDESTEP_ASSERT(false && "Failed to flush instruction cache.");
+    return SIDESTEP_UNEXPECTED;
+  }
+
+  SIDESTEP_LOG("PreamblePatcher::Unpatch successfully unpatched.");
+  return SIDESTEP_SUCCESS;
+}
+
+void PreamblePatcher::Initialize() {
+  if (!initialized_) {
+    SYSTEM_INFO si = { 0 };
+    ::GetSystemInfo(&si);
+    granularity_ = si.dwAllocationGranularity;
+    pagesize_ = si.dwPageSize;
+    initialized_ = true;
+  }
+}
+
+unsigned char* PreamblePatcher::AllocPreambleBlockNear(void* target) {
+  PreamblePage* preamble_page = preamble_pages_;
+  while (preamble_page != NULL) {
+    if (preamble_page->free_ != NULL) {
+      __int64 val = reinterpret_cast<__int64>(preamble_page) -
+          reinterpret_cast<__int64>(target);
+      if ((val > 0 && val + pagesize_ <= INT_MAX) ||
+          (val < 0 && val >= INT_MIN)) {
+        break;
+      }
+    }
+    preamble_page = preamble_page->next_;
+  }
+
+  // The free_ member of the page is used to store the next available block
+  // of memory to use or NULL if there are no chunks available, in which case
+  // we'll allocate a new page.
+  if (preamble_page == NULL || preamble_page->free_ == NULL) {
+    // Create a new preamble page and initialize the free list
+    preamble_page = reinterpret_cast<PreamblePage*>(AllocPageNear(target));
+    SIDESTEP_ASSERT(preamble_page != NULL && "Could not allocate page!");
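+    // Carve the rest of the page (the first MAX_PREAMBLE_STUB_SIZE bytes are
+    // used for the PreamblePage header) into MAX_PREAMBLE_STUB_SIZE-byte
+    // blocks and thread an intrusive free list through them: each free
+    // block's first pointer-sized word holds the address of the next free
+    // block, and the last one holds NULL.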
+    void** pp = &preamble_page->free_;
+    unsigned char* ptr = reinterpret_cast<unsigned char*>(preamble_page) +
+        MAX_PREAMBLE_STUB_SIZE;
+    unsigned char* limit = reinterpret_cast<unsigned char*>(preamble_page) +
+        pagesize_;
+    while (ptr < limit) {
+      *pp = ptr;
+      pp = reinterpret_cast<void**>(ptr);
+      ptr += MAX_PREAMBLE_STUB_SIZE;
+    }
+    *pp = NULL;
+    // Insert the new page into the list
+    preamble_page->magic_ = kPreamblePageMagic;
+    preamble_page->next_ = preamble_pages_;
+    preamble_pages_ = preamble_page;
+  }
+  unsigned char* ret = reinterpret_cast<unsigned char*>(preamble_page->free_);
+  preamble_page->free_ = *(reinterpret_cast<void**>(preamble_page->free_));
+  return ret;
+}
+
+void PreamblePatcher::FreePreambleBlock(unsigned char* block) {
+  SIDESTEP_ASSERT(block != NULL);
+  SIDESTEP_ASSERT(granularity_ != 0);
+  uintptr_t ptr = reinterpret_cast<uintptr_t>(block);
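+  // Round the block address down to the allocation granularity to recover
+  // the PreamblePage header at the start of the page this block belongs to.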
+  ptr -= ptr & (granularity_ - 1);
+  PreamblePage* preamble_page = reinterpret_cast<PreamblePage*>(ptr);
+  SIDESTEP_ASSERT(preamble_page->magic_ == kPreamblePageMagic);
+  *(reinterpret_cast<void**>(block)) = preamble_page->free_;
+  preamble_page->free_ = block;
+}
+
+void* PreamblePatcher::AllocPageNear(void* target) {
+  MEMORY_BASIC_INFORMATION mbi = { 0 };
+  if (!::VirtualQuery(target, &mbi, sizeof(mbi))) {
+    SIDESTEP_ASSERT(false && "VirtualQuery failed on target address");
+    return 0;
+  }
+  if (initialized_ == false) {
+    PreamblePatcher::Initialize();
+    SIDESTEP_ASSERT(initialized_);
+  }
+  void* pv = NULL;
+  unsigned char* allocation_base = reinterpret_cast<unsigned char*>(
+      mbi.AllocationBase);
+  __int64 i = 1;
+  bool high_target = reinterpret_cast<__int64>(target) > UINT_MAX;
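+  // Search downward from the target's allocation base in
+  // allocation-granularity steps, stopping when an allocation succeeds, the
+  // candidate falls more than 2GB below the target, or we run off the bottom
+  // of the address space.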
+  while (pv == NULL) {
+    __int64 val = reinterpret_cast<__int64>(allocation_base) -
+        (i * granularity_);
+    if (high_target &&
+        reinterpret_cast<__int64>(target) - val > INT_MAX) {
+        // We're further than 2GB from the target
+      break;
+    } else if (val <= 0) {
+      // We've run off the bottom of the address space
+      break;
+    }
+    pv = ::VirtualAlloc(reinterpret_cast<void*>(allocation_base -
+                            (i++ * granularity_)),
+                        pagesize_, MEM_COMMIT | MEM_RESERVE,
+                        PAGE_EXECUTE_READWRITE);
+  }
+
+  // We couldn't allocate low, try to allocate high
+  if (pv == NULL) {
+    i = 1;
+    // Round up to the next multiple of page granularity
+    allocation_base = reinterpret_cast<unsigned char*>(
+        (reinterpret_cast<__int64>(target) &
+        (~(granularity_ - 1))) + granularity_);
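+    // Search upward in allocation-granularity steps from the granularity
+    // boundary just above the target, stopping when an allocation succeeds
+    // or the candidate is more than 2GB above the target.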
+    while (pv == NULL) {
+      __int64 val = reinterpret_cast<__int64>(allocation_base) +
+          (i * granularity_) - reinterpret_cast<__int64>(target);
+      if (val > INT_MAX || val < 0) {
+        // We're too far or we overflowed
+        break;
+      }
+      pv = ::VirtualAlloc(reinterpret_cast<void*>(allocation_base +
+                              (i++ * granularity_)),
+                          pagesize_, MEM_COMMIT | MEM_RESERVE,
+                          PAGE_EXECUTE_READWRITE);
+    }
+  }
+  return pv;
+}
+
+bool PreamblePatcher::IsShortConditionalJump(
+    unsigned char* target,
+    unsigned int instruction_size) {
+  return (*(target) & 0x70) == 0x70 && instruction_size == 2;
+}
+
+bool PreamblePatcher::IsShortJump(
+    unsigned char* target,
+    unsigned int instruction_size) {
+  return target[0] == 0xeb && instruction_size == 2;
+}
+
+bool PreamblePatcher::IsNearConditionalJump(
+    unsigned char* target,
+    unsigned int instruction_size) {
+  return *(target) == 0xf && (*(target + 1) & 0x80) == 0x80 &&
+      instruction_size == 6;
+}
+
+bool PreamblePatcher::IsNearRelativeJump(
+    unsigned char* target,
+    unsigned int instruction_size) {
+  return *(target) == 0xe9 && instruction_size == 5;
+}
+
+bool PreamblePatcher::IsNearAbsoluteCall(
+    unsigned char* target,
+    unsigned int instruction_size) {
+  return *(target) == 0xff && (*(target + 1) & 0x10) == 0x10 &&
+      instruction_size == 6;
+}
+
+bool PreamblePatcher::IsNearRelativeCall(
+    unsigned char* target,
+    unsigned int instruction_size) {
+  return *(target) == 0xe8 && instruction_size == 5;
+}
+
+bool PreamblePatcher::IsMovWithDisplacement(
+    unsigned char* target,
+    unsigned int instruction_size) {
+  // In this case, the ModRM byte's mod field will be 0 and r/m will be 101b (5)
+  return instruction_size == 7 && *target == 0x48 && *(target + 1) == 0x8b &&
+      (*(target + 2) >> 6) == 0 && (*(target + 2) & 0x7) == 5;
+}
+
+SideStepError PreamblePatcher::PatchShortConditionalJump(
+    unsigned char* source,
+    unsigned int instruction_size,
+    unsigned char* target,
+    unsigned int* target_bytes,
+    unsigned int target_size) {
+  // note: rel8 offset is signed.  Thus we need to cast through signed char
+  // to handle negative offsets right.
+  unsigned char* original_jump_dest = (source + 2) + static_cast<signed char>(source[1]);
+  unsigned char* stub_jump_from = target + 6;
+  __int64 fixup_jump_offset = original_jump_dest - stub_jump_from;
+  if (fixup_jump_offset > INT_MAX || fixup_jump_offset < INT_MIN) {
+    SIDESTEP_ASSERT(false &&
+                    "Unable to fix up short jump because target"
+                    " is too far away.");
+    return SIDESTEP_JUMP_INSTRUCTION;
+  }
+
+  *target_bytes = 6;
+  if (target_size > *target_bytes) {
+    // Convert the short jump to a near jump.
+    //
+    // 0f 8x xx xx xx xx = Jcc rel32off
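+    // Assemble the two opcode bytes 0F 8x into a little-endian 16-bit value
+    // so that the memcpy below writes 0F first, then 8x (x is the condition
+    // code taken from the short Jcc).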
+    unsigned short jmpcode = ((0x80 | (source[0] & 0xf)) << 8) | 0x0f;
+    memcpy(reinterpret_cast<void*>(target),
+           reinterpret_cast<void*>(&jmpcode), 2);
+    memcpy(reinterpret_cast<void*>(target + 2),
+           reinterpret_cast<void*>(&fixup_jump_offset), 4);
+  }
+
+  return SIDESTEP_SUCCESS;
+}
+
+SideStepError PreamblePatcher::PatchShortJump(
+    unsigned char* source,
+    unsigned int instruction_size,
+    unsigned char* target,
+    unsigned int* target_bytes,
+    unsigned int target_size) {
+  // note: rel8 offset is _signed_. Thus we need signed char here.
+  unsigned char* original_jump_dest = (source + 2) + static_cast<signed char>(source[1]);
+  unsigned char* stub_jump_from = target + 5;
+  __int64 fixup_jump_offset = original_jump_dest - stub_jump_from;
+  if (fixup_jump_offset > INT_MAX || fixup_jump_offset < INT_MIN) {
+    SIDESTEP_ASSERT(false &&
+                    "Unable to fix up short jump because target"
+                    " is too far away.");
+    return SIDESTEP_JUMP_INSTRUCTION;
+  }
+
+  *target_bytes = 5;
+  if (target_size > *target_bytes) {
+    // Convert the short jump to a near jump.
+    //
+    // e9 xx xx xx xx = jmp rel32off
+    target[0] = 0xe9;
+    memcpy(reinterpret_cast<void*>(target + 1),
+           reinterpret_cast<void*>(&fixup_jump_offset), 4);
+  }
+
+  return SIDESTEP_SUCCESS;
+}
+
+SideStepError PreamblePatcher::PatchNearJumpOrCall(
+    unsigned char* source,
+    unsigned int instruction_size,
+    unsigned char* target,
+    unsigned int* target_bytes,
+    unsigned int target_size) {
+  SIDESTEP_ASSERT(instruction_size == 5 || instruction_size == 6);
+  unsigned int jmp_offset_in_instruction = instruction_size == 5 ? 1 : 2;
+  unsigned char* original_jump_dest = reinterpret_cast<unsigned char *>(
+      reinterpret_cast<__int64>(source + instruction_size) +
+      *(reinterpret_cast<int*>(source + jmp_offset_in_instruction)));
+  unsigned char* stub_jump_from = target + instruction_size;
+  __int64 fixup_jump_offset = original_jump_dest - stub_jump_from;
+  if (fixup_jump_offset > INT_MAX || fixup_jump_offset < INT_MIN) {
+    SIDESTEP_ASSERT(false &&
+                    "Unable to fix up near jump because target"
+                    " is too far away.");
+    return SIDESTEP_JUMP_INSTRUCTION;
+  }
+
+  if ((fixup_jump_offset < SCHAR_MAX && fixup_jump_offset > SCHAR_MIN)) {
+    *target_bytes = 2;
+    if (target_size > *target_bytes) {
+      // If the new offset is in range, use a short jump instead of a near jump.
+      if (source[0] == ASM_JCC32REL_0 &&
+          (source[1] & ASM_JCC32REL_1_MASK) == ASM_JCC32REL_1_MASK) {
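+        // Rebuild the near Jcc (0F 8x rel32) as its short form (7x rel8); in
+        // the little-endian 16-bit value below the opcode byte 7x is written
+        // first, followed by the 8-bit offset.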
+        unsigned short jmpcode = (static_cast<unsigned char>(
+            fixup_jump_offset) << 8) | (0x70 | (source[1] & 0xf));
+        memcpy(reinterpret_cast<void*>(target),
+               reinterpret_cast<void*>(&jmpcode),
+               2);
+      } else {
+        target[0] = ASM_JMP8REL;
+        target[1] = static_cast<unsigned char>(fixup_jump_offset);
+      }
+    }
+  } else {
+    *target_bytes = instruction_size;
+    if (target_size > *target_bytes) {
+      memcpy(reinterpret_cast<void*>(target),
+             reinterpret_cast<void*>(source),
+             jmp_offset_in_instruction);
+      memcpy(reinterpret_cast<void*>(target + jmp_offset_in_instruction),
+             reinterpret_cast<void*>(&fixup_jump_offset),
+             4);
+    }
+  }
+
+  return SIDESTEP_SUCCESS;
+}
+
+SideStepError PreamblePatcher::PatchMovWithDisplacement(
+     unsigned char* source,
+     unsigned int instruction_size,
+     unsigned char* target,
+     unsigned int* target_bytes,
+     unsigned int target_size) {
+  SIDESTEP_ASSERT(instruction_size == 7);
+  const int mov_offset_in_instruction = 3; // 0x48 0x8b 0x0d <offset>
+  unsigned char* original_mov_dest = reinterpret_cast<unsigned char*>(
+      reinterpret_cast<__int64>(source + instruction_size) +
+      *(reinterpret_cast<int*>(source + mov_offset_in_instruction)));
+  unsigned char* stub_mov_from = target + instruction_size;
+  __int64 fixup_mov_offset = original_mov_dest - stub_mov_from;
+  if (fixup_mov_offset > INT_MAX || fixup_mov_offset < INT_MIN) {
+    SIDESTEP_ASSERT(false &&
+        "Unable to fix up near MOV because target is too far away.");
+    return SIDESTEP_UNEXPECTED;
+  }
+  *target_bytes = instruction_size;
+  if (target_size > *target_bytes) {
+    memcpy(reinterpret_cast<void*>(target),
+           reinterpret_cast<void*>(source),
+           mov_offset_in_instruction);
+    memcpy(reinterpret_cast<void*>(target + mov_offset_in_instruction),
+           reinterpret_cast<void*>(&fixup_mov_offset),
+           4);
+  }
+  return SIDESTEP_SUCCESS;
+}
+
+};  // namespace sidestep
diff --git a/src/windows/preamble_patcher.h b/src/windows/preamble_patcher.h
new file mode 100644
index 0000000..76f158a
--- /dev/null
+++ b/src/windows/preamble_patcher.h
@@ -0,0 +1,620 @@
+// -*- Mode: C++; c-basic-offset: 2; indent-tabs-mode: nil -*-
+/* Copyright (c) 2007, Google Inc.
+ * All rights reserved.
+ * 
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are
+ * met:
+ * 
+ *     * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following disclaimer
+ * in the documentation and/or other materials provided with the
+ * distribution.
+ *     * Neither the name of Google Inc. nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ * 
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * ---
+ * Author: Joi Sigurdsson
+ * Author: Scott Francis
+ *
+ * Definition of PreamblePatcher
+ */
+
+#ifndef GOOGLE_PERFTOOLS_PREAMBLE_PATCHER_H_
+#define GOOGLE_PERFTOOLS_PREAMBLE_PATCHER_H_
+
+#include "config.h"
+#include <windows.h>
+
+// compatibility shim
+#include "base/logging.h"
+#define SIDESTEP_ASSERT(cond)  RAW_DCHECK(cond, #cond)
+#define SIDESTEP_LOG(msg)      RAW_VLOG(1, msg)
+
+// Maximum size of the preamble stub. We overwrite at least the first 5
+// bytes of the function. Considering the worst case scenario, we need 4
+// bytes + the max instruction size + 5 more bytes for our jump back to
+// the original code. With that in mind, 32 is a good number :)
+#ifdef _M_X64
+// In 64-bit mode we may need more room.  In 64-bit mode all jumps must be
+// within +/-2GB of RIP.  Because of this limitation we may need to use a
+// trampoline to jump to the replacement function if it is further than 2GB
+// away from the target. The trampoline is 14 bytes.
+//
+// So 4 bytes + max instruction size (17 bytes) + 5 bytes to jump back to the
+// original code + trampoline size.  64 bytes is a nice number :-)
+#define MAX_PREAMBLE_STUB_SIZE    (64)
+#else
+#define MAX_PREAMBLE_STUB_SIZE    (32)
+#endif
+
+// Determines if this is a 64-bit binary.
+#ifdef _M_X64
+static const bool kIs64BitBinary = true;
+#else
+static const bool kIs64BitBinary = false;
+#endif
+
+namespace sidestep {
+
+// Possible results of patching/unpatching
+enum SideStepError {
+  SIDESTEP_SUCCESS = 0,
+  SIDESTEP_INVALID_PARAMETER,
+  SIDESTEP_INSUFFICIENT_BUFFER,
+  SIDESTEP_JUMP_INSTRUCTION,
+  SIDESTEP_FUNCTION_TOO_SMALL,
+  SIDESTEP_UNSUPPORTED_INSTRUCTION,
+  SIDESTEP_NO_SUCH_MODULE,
+  SIDESTEP_NO_SUCH_FUNCTION,
+  SIDESTEP_ACCESS_DENIED,
+  SIDESTEP_UNEXPECTED,
+};
+
+#define SIDESTEP_TO_HRESULT(error)                      \
+  MAKE_HRESULT(SEVERITY_ERROR, FACILITY_NULL, error)
+
+class DeleteUnsignedCharArray;
+
+// Implements a patching mechanism that overwrites the first few bytes of
+// a function preamble with a jump to our hook function, which is then
+// able to call the original function via a specially-made preamble-stub
+// that imitates the action of the original preamble.
+//
+// NOTE:  This patching mechanism should currently only be used for
+// non-production code, e.g. unit tests, because it is not threadsafe.
+// See the TODO in preamble_patcher_with_stub.cc for instructions on what
+// we need to do before using it in production code; it's fairly simple
+// but unnecessary for now since we only intend to use it in unit tests.
+//
+// To patch a function, use either of the typesafe Patch() methods.  You
+// can unpatch a function using Unpatch().
+//
+// Typical usage goes something like this:
+// @code
+// typedef int (*MyTypesafeFuncPtr)(int x);
+// MyTypesafeFuncPtr original_func_stub;
+// int MyTypesafeFunc(int x) { return x + 1; }
+// int HookMyTypesafeFunc(int x) { return 1 + original_func_stub(x); }
+// 
+// void MyPatchInitializingFunction() {
+//   SideStepError err = PreamblePatcher::Patch(
+//       MyTypesafeFunc, HookMyTypesafeFunc, &original_func_stub);
+//   if (err != SIDESTEP_SUCCESS) {
+//     // ... error handling ...
+//   }
+//
+//   // ... continue - you have patched the function successfully ...
+// }
+// @endcode
+//
+// Note that there are a number of ways that this method of patching can
+// fail.  The most common are:
+//    - If there is a jump (jxx) instruction in the first 5 bytes of
+//    the function being patched, we cannot patch it because in the
+//    current implementation we do not know how to rewrite relative
+//    jumps after relocating them to the preamble-stub.  Note that
+//    if you really really need to patch a function like this, it
+//    would be possible to add this functionality (but at some cost).
+//    - If there is a return (ret) instruction in the first 5 bytes
+//    we cannot patch the function because it may not be long enough
+//    for the jmp instruction we use to inject our patch.
+//    - If there is another thread currently executing within the bytes
+//    that are copied to the preamble stub, it will crash in an undefined
+//    way.
+//
+// If you get any other error than the above, you're either pointing the
+// patcher at an invalid instruction (e.g. into the middle of a multi-
+// byte instruction, or not at memory containing executable instructions)
+// or, there may be a bug in the disassembler we use to find
+// instruction boundaries.
+//
+// NOTE:  In optimized builds, when you have very trivial functions that
+// the compiler can reason do not have side effects, the compiler may
+// reuse the result of calling the function with a given parameter, which
+// may mean that if you patch the function in between calls, your patch will
+// never get invoked.  See preamble_patcher_test.cc for an example.
+class PERFTOOLS_DLL_DECL PreamblePatcher {
+ public:
+
+  // This is a typesafe version of RawPatch(), identical in all other
+  // ways except that it takes a template parameter indicating the type of the
+  // function being patched.
+  //
+  // @param T The type of the function you are patching. Usually
+  // you will establish this type using a typedef, as in the following
+  // example:
+  // @code
+  // typedef BOOL (WINAPI *MessageBoxPtr)(HWND, LPCTSTR, LPCTSTR, UINT);
+  // MessageBoxPtr original = NULL;
+  // PreamblePatcher::Patch(MessageBox, Hook_MessageBox, &original);
+  // @endcode
+  template <class T>
+  static SideStepError Patch(T target_function,
+                             T replacement_function,
+                             T* original_function_stub) {
+    // NOTE: casting from a function to a pointer is contra the C++
+    //       spec.  It's not safe on IA64, but is on i386.  We use
+    //       a C-style cast here to emphasize this is not legal C++.
+    return RawPatch((void*)(target_function),
+                    (void*)(replacement_function),
+                    (void**)(original_function_stub));
+  }
+
+  // Patches a named function imported from the named module using
+  // preamble patching.  Uses RawPatch() to do the actual patching
+  // work.
+  //
+  // @param T The type of the function you are patching.  Must
+  // exactly match the function you specify using module_name and
+  // function_name.
+  //
+  // @param module_name The name of the module from which the function
+  // is being imported.  Note that the patch will fail if this module
+  // has not already been loaded into the current process.
+  //
+  // @param function_name The name of the function you wish to patch.
+  //
+  // @param replacement_function Your replacement function which
+  // will be called whenever code tries to call the original function.
+  //
+  // @param original_function_stub Pointer to memory that should receive a
+  // pointer that can be used (e.g. in the replacement function) to call the
+  // original function, or NULL to indicate failure.
+  //
+  // @return One of the SideStepError error codes; only SIDESTEP_SUCCESS
+  // indicates success.
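+  //
+  // For illustration, a minimal usage sketch (the typedef, hook, and variable
+  // names below are made up for this example; user32.dll must already be
+  // loaded in the process):
+  // @code
+  // typedef int (WINAPI *MessageBoxWPtr)(HWND, LPCWSTR, LPCWSTR, UINT);
+  // MessageBoxWPtr original_message_box_w = NULL;
+  //
+  // int WINAPI Hook_MessageBoxW(HWND hwnd, LPCWSTR text, LPCWSTR caption,
+  //                             UINT type) {
+  //   return original_message_box_w(hwnd, L"patched", caption, type);
+  // }
+  //
+  // void PatchMessageBox() {
+  //   SideStepError err = PreamblePatcher::Patch(
+  //       _T("user32.dll"), "MessageBoxW",
+  //       Hook_MessageBoxW, &original_message_box_w);
+  //   if (err != SIDESTEP_SUCCESS) {
+  //     // ... error handling ...
+  //   }
+  // }
+  // @endcode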
+  template <class T>
+  static SideStepError Patch(LPCTSTR module_name,
+                             LPCSTR function_name,
+                             T replacement_function,
+                             T* original_function_stub) {
+    SIDESTEP_ASSERT(module_name && function_name);
+    if (!module_name || !function_name) {
+      SIDESTEP_ASSERT(false &&
+                      "You must specify a module name and function name.");
+      return SIDESTEP_INVALID_PARAMETER;
+    }
+    HMODULE module = ::GetModuleHandle(module_name);
+    SIDESTEP_ASSERT(module != NULL);
+    if (!module) {
+      SIDESTEP_ASSERT(false && "Invalid module name.");
+      return SIDESTEP_NO_SUCH_MODULE;
+    }
+    FARPROC existing_function = ::GetProcAddress(module, function_name);
+    if (!existing_function) {
+      SIDESTEP_ASSERT(
+          false && "Did not find any function with that name in the module.");
+      return SIDESTEP_NO_SUCH_FUNCTION;
+    }
+    // NOTE: casting from a function to a pointer is contra the C++
+    //       spec.  It's not safe on IA64, but is on i386.  We use
+    //       a C-style cast here to emphasize this is not legal C++.
+    return RawPatch((void*)existing_function, (void*)replacement_function,
+                    (void**)(original_function_stub));
+  }
+
+  // Patches a function by overwriting its first few bytes with
+  // a jump to a different function.  This is the "worker" function
+  // for each of the typesafe Patch() functions.  In most cases,
+  // it is preferable to use the Patch() functions rather than
+  // this one as they do more checking at compile time.
+  //
+  // @param target_function A pointer to the function that should be
+  // patched.
+  //
+  // @param replacement_function A pointer to the function that should
+  // replace the target function.  The replacement function must have
+  // exactly the same calling convention and parameters as the original
+  // function.
+  //
+  // @param original_function_stub Pointer to memory that should receive a
+  // pointer that can be used (e.g. in the replacement function) to call the
+  // original function, or NULL to indicate failure.
+  //
+  // @return One of the SideStepError error codes; only SIDESTEP_SUCCESS
+  // indicates success.
+  //
+  // @note The preamble-stub (the memory pointed to by
+  // *original_function_stub) is allocated on the heap, and (in
+  // production binaries) never destroyed, resulting in a memory leak.  This
+  // will be the case until we implement safe unpatching of a method.
+  // However, it is quite difficult to unpatch a method (because other
+  // threads in the process may be using it) so we are leaving it for now.
+  // See however UnsafeUnpatch, which can be used for binaries where you
+  // know only one thread is running, e.g. unit tests.
+  static SideStepError RawPatch(void* target_function,
+                                void* replacement_function,
+                                void** original_function_stub);
+
+  // Unpatches target_function and deletes the stub that previously could be
+  // used to call the original version of the function.
+  //
+  // DELETES the stub that is passed to the function.
+  //
+  // @param target_function Pointer to the target function which was
+  // previously patched, i.e. a pointer which value should match the value
+  // of the symbol prior to patching it.
+  //
+  // @param replacement_function Pointer to the function target_function
+  // was patched to.
+  //
+  // @param original_function_stub Pointer to the stub returned when
+  // patching, that could be used to call the original version of the
+  // patched function.  This function will also delete the stub, which after
+  // unpatching is useless.
+  //
+  // If your original call was
+  //    Patch(VirtualAlloc, MyVirtualAlloc, &origptr)
+  // then to undo it you would call
+  //    Unpatch(VirtualAlloc, MyVirtualAlloc, origptr);
+  //
+  // @return One of the SideStepError error codes; only SIDESTEP_SUCCESS
+  // indicates success.
+  static SideStepError Unpatch(void* target_function,
+                               void* replacement_function,
+                               void* original_function_stub);
+
+  // A helper routine when patching, which follows jmp instructions at
+  // function addresses, to get to the "actual" function contents.
+  // This allows us to identify two functions that are at different
+  // addresses but actually resolve to the same code.
+  //
+  // @param target_function Pointer to a function.
+  //
+  // @return Either target_function (the input parameter), or if
+  // target_function's body consists entirely of a JMP instruction,
+  // the address it JMPs to (or more precisely, the address at the end
+  // of a chain of JMPs).
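+  //
+  // For illustration, a minimal sketch reusing MyTypesafeFunc and
+  // MyTypesafeFuncPtr from the class-level example above:
+  // @code
+  // MyTypesafeFuncPtr actual_func =
+  //     PreamblePatcher::ResolveTarget(MyTypesafeFunc);
+  // @endcode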
+  template <class T>
+  static T ResolveTarget(T target_function) {
+    return (T)ResolveTargetImpl((unsigned char*)target_function, NULL);
+  }
+
+  // Allocates a block of memory of size MAX_PREAMBLE_STUB_SIZE that is as
+  // close (within 2GB) as possible to target.  This is done to ensure that 
+  // we can perform a relative jump from target to a trampoline if the 
+  // replacement function is > +-2GB from target.  This means that we only need 
+  // to patch 5 bytes in the target function.
+  //
+  // @param target    Pointer to target function.
+  //
+  // @return  Returns a block of memory of size MAX_PREAMBLE_STUB_SIZE that can
+  //          be used to store a function preamble block.
+  static unsigned char* AllocPreambleBlockNear(void* target);
+
+  // Frees a block allocated by AllocPreambleBlockNear.
+  //
+  // @param block     Block that was returned by AllocPreambleBlockNear.
+  static void FreePreambleBlock(unsigned char* block);
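+
+  // A minimal usage sketch of the allocate/free pair above (target is any
+  // address the block must stay within 2GB of):
+  // @code
+  // unsigned char* block = PreamblePatcher::AllocPreambleBlockNear(target);
+  // if (block != NULL) {
+  //   // ... use the MAX_PREAMBLE_STUB_SIZE-byte block ...
+  //   PreamblePatcher::FreePreambleBlock(block);
+  // }
+  // @endcode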
+
+ private:
+  friend class DeleteUnsignedCharArray;
+
+   // Used to store data allocated for preamble stubs
+  struct PreamblePage {
+    unsigned int magic_;
+    PreamblePage* next_;
+    // This member points to a linked list of free blocks within the page
+    // or NULL if at the end
+    void* free_;
+  };
+
+  // In 64-bit mode, the replacement function must be within 2GB of the original
+  // target in order to only require 5 bytes for the function patch.  To meet
+  // this requirement we're creating an allocator within this class to
+  // allocate blocks that are within 2GB of a given target. This member is the
+  // head of a linked list of pages used to allocate blocks that are within
+  // 2GB of the target.
+  static PreamblePage* preamble_pages_;
+  
+  // Page granularity
+  static long granularity_;
+
+  // Page size
+  static long pagesize_;
+
+  // Determines if the patcher has been initialized.
+  static bool initialized_;
+
+  // Used to initialize static members.
+  static void Initialize();
+
+  // Patches a function by overwriting its first few bytes with
+  // a jump to a different function.  This is similar to the RawPatch
+  // function except that it uses the stub allocated by the caller
+  // instead of allocating it.
+  //
+  // We call VirtualProtect to make the
+  // target function writable at least for the duration of the call.
+  //
+  // @param target_function A pointer to the function that should be
+  // patched.
+  //
+  // @param replacement_function A pointer to the function that should
+  // replace the target function.  The replacement function must have
+  // exactly the same calling convention and parameters as the original
+  // function.
+  //
+  // @param preamble_stub A pointer to a buffer where the preamble stub
+  // should be copied. The size of the buffer should be sufficient to
+  // hold the preamble bytes.
+  //
+  // @param stub_size Size in bytes of the buffer allocated for the
+  // preamble_stub
+  //
+  // @param bytes_needed Pointer to a variable that receives the minimum
+  // number of bytes required for the stub.  Can be set to NULL if you're
+  // not interested.
+  //
+  // @return An error code indicating the result of patching.
+  static SideStepError RawPatchWithStubAndProtections(
+      void* target_function,
+      void* replacement_function,
+      unsigned char* preamble_stub,
+      unsigned long stub_size,
+      unsigned long* bytes_needed);
+
+  // A helper function used by RawPatchWithStubAndProtections -- it
+  // does everything but the VirtualProtect work.  Defined in
+  // preamble_patcher_with_stub.cc.
+  //
+  // @param target_function A pointer to the function that should be
+  // patched.
+  //
+  // @param replacement_function A pointer to the function that should
+  // replace the target function.  The replacement function must have
+  // exactly the same calling convention and parameters as the original
+  // function.
+  //
+  // @param preamble_stub A pointer to a buffer where the preamble stub
+  // should be copied. The size of the buffer should be sufficient to
+  // hold the preamble bytes.
+  //
+  // @param stub_size Size in bytes of the buffer allocated for the
+  // preamble_stub
+  //
+  // @param bytes_needed Pointer to a variable that receives the minimum
+  // number of bytes required for the stub.  Can be set to NULL if you're
+  // not interested.
+  //
+  // @return An error code indicating the result of patching.
+  static SideStepError RawPatchWithStub(void* target_function,
+                                        void* replacement_function,
+                                        unsigned char* preamble_stub,
+                                        unsigned long stub_size,
+                                        unsigned long* bytes_needed);
+
+
+  // A helper routine when patching, which follows jmp instructions at
+  // function addresses, to get to the "actual" function contents.
+  // This allows us to identify two functions that are at different
+  // addresses but actually resolve to the same code.
+  //
+  // @param target_function Pointer to a function.
+  //
+  // @param stop_before If, when following JMP instructions from
+  // target_function, we reach the address stop_before, we return
+  // immediately with the address that jumps to stop_before.
+  //
+  // @param stop_before_trampoline  When following JMP instructions from 
+  // target_function, stop before a trampoline is detected.  See comment in
+  // PreamblePatcher::RawPatchWithStub for more information.  This parameter 
+  // has no effect in 32-bit mode.
+  //
+  // @return Either target_function (the input parameter), or if
+  // target_function's body consists entirely of a JMP instruction,
+  // the address it JMPs to (or more precisely, the address at the end
+  // of a chain of JMPs).
+  static void* ResolveTargetImpl(unsigned char* target_function,
+                                 unsigned char* stop_before,
+                                 bool stop_before_trampoline = false);
+
+  // Helper routine that attempts to allocate a page as close (within 2GB)
+  // as possible to target.
+  //
+  // @param target    Pointer to target function.
+  //
+  // @return   Returns an address that is within 2GB of target.
+  static void* AllocPageNear(void* target);
+
+  // Helper routine that determines if a target instruction is a short
+  // conditional jump.
+  //
+  // @param target            Pointer to instruction.
+  //
+  // @param instruction_size  Size of the instruction in bytes.
+  //
+  // @return  Returns true if the instruction is a short conditional jump.
+  static bool IsShortConditionalJump(unsigned char* target,
+                                     unsigned int instruction_size);
+
+  static bool IsShortJump(unsigned char *target, unsigned int instruction_size);
+
+  // Helper routine that determines if a target instruction is a near
+  // conditional jump.
+  //
+  // @param target            Pointer to instruction.
+  //
+  // @param instruction_size  Size of the instruction in bytes.
+  //
+  // @return  Returns true if the instruction is a near conditional jump.
+  static bool IsNearConditionalJump(unsigned char* target,
+                                    unsigned int instruction_size);
+
+  // Helper routine that determines if a target instruction is a near
+  // relative jump.
+  //
+  // @param target            Pointer to instruction.
+  //
+  // @param instruction_size  Size of the instruction in bytes.
+  //
+  // @return  Returns true if the instruction is a near relative jump.
+  static bool IsNearRelativeJump(unsigned char* target,
+                                 unsigned int instruction_size);
+
+  // Helper routine that determines if a target instruction is a near 
+  // absolute call.
+  //
+  // @param target            Pointer to instruction.
+  //
+  // @param instruction_size  Size of the instruction in bytes.
+  //
+  // @return  Returns true if the instruction is a near absolute call.
+  static bool IsNearAbsoluteCall(unsigned char* target,
+                                 unsigned int instruction_size);
+
+  // Helper routine that determines if a target instruction is a near
+  // relative call.
+  //
+  // @param target            Pointer to instruction.
+  //
+  // @param instruction_size  Size of the instruction in bytes.
+  //
+  // @return  Returns true if the instruction is a near relative call.
+  static bool IsNearRelativeCall(unsigned char* target,
+                                 unsigned int instruction_size);
+
+  // Helper routine that determines if a target instruction is a 64-bit MOV
+  // that uses a RIP-relative displacement.
+  //
+  // @param target            Pointer to instruction.
+  //
+  // @param instruction_size  Size of the instruction in bytes.
+  //
+  // @return  Returns true if the instruction is a MOV with displacement.
+  static bool IsMovWithDisplacement(unsigned char* target,
+                                    unsigned int instruction_size);
+
+  // Helper routine that converts a short conditional jump instruction
+  // to a near conditional jump in a target buffer.  Note that the target
+  // buffer must be within 2GB of the source for the near jump to work.
+  //
+  // A short conditional jump instruction is in the format:
+  // 7x xx = Jcc rel8off
+  //
+  // @param source              Pointer to instruction.
+  //
+  // @param instruction_size    Size of the instruction.
+  //
+  // @param target              Target buffer to write the new instruction.
+  //
+  // @param target_bytes        Pointer to a buffer that contains the size
+  //                            of the target instruction, in bytes.
+  //
+  // @param target_size         Size of the target buffer.
+  //
+  // @return  Returns SIDESTEP_SUCCESS if successful, otherwise an error.
+  static SideStepError PatchShortConditionalJump(unsigned char* source,
+                                                 unsigned int instruction_size,
+                                                 unsigned char* target,
+                                                 unsigned int* target_bytes,
+                                                 unsigned int target_size);
+
+  static SideStepError PatchShortJump(unsigned char* source,
+                                      unsigned int instruction_size,
+                                      unsigned char* target,
+                                      unsigned int* target_bytes,
+                                      unsigned int target_size);
+
+  // Helper routine that converts various jump-like instructions in the
+  // source to corresponding instructions in the target buffer.
+  // What this routine does is fix up the relative offsets contained in jump
+  // instructions to point back to the original target routine.  Like with
+  // PatchShortConditionalJump, the target buffer must be within 2GB of the
+  // source.
+  //
+  // We currently handle the following instructions:
+  //
+  // E9 xx xx xx xx     = JMP rel32off
+  // 0F 8x xx xx xx xx  = Jcc rel32off
+  // FF /2 xx xx xx xx  = CALL reg/mem32/mem64
+  // E8 xx xx xx xx     = CALL rel32off
+  //
+  // It should not be hard to update this function to support other
+  // instructions that jump to relative targets.
+  //
+  // @param source              Pointer to instruction.
+  //
+  // @param instruction_size    Size of the instruction.
+  //
+  // @param target              Target buffer to write the new instruction.
+  //
+  // @param target_bytes        Pointer to a buffer that contains the size
+  //                            of the target instruction, in bytes.
+  //
+  // @param target_size         Size of the target buffer.
+  //
+  // @return  Returns SIDESTEP_SUCCESS if successful, otherwise an error.
+  static SideStepError PatchNearJumpOrCall(unsigned char* source,
+                                           unsigned int instruction_size,
+                                           unsigned char* target,
+                                           unsigned int* target_bytes,
+                                           unsigned int target_size);
+  
+  // Helper routine that patches a 64-bit MOV instruction with a RIP-relative
+  // displacement.  The target buffer must be within 2GB of the source.
+  //
+  // 48 8B 0D XX XX XX XX = MOV rel32off
+  //
+  // @param source              Pointer to instruction.
+  //
+  // @param instruction_size    Size of the instruction.
+  //
+  // @param target              Target buffer to write the new instruction.
+  //
+  // @param target_bytes        Pointer to a buffer that contains the size
+  //                            of the target instruction, in bytes.
+  //
+  // @param target_size         Size of the target buffer.
+  //
+  // @return  Returns SIDESTEP_SUCCESS if successful, otherwise an error.
+  static SideStepError PatchMovWithDisplacement(unsigned char* source,
+                                                unsigned int instruction_size,
+                                                unsigned char* target,
+                                                unsigned int* target_bytes,
+                                                unsigned int target_size);
+};
+
+};  // namespace sidestep
+
+#endif  // GOOGLE_PERFTOOLS_PREAMBLE_PATCHER_H_
diff --git a/src/windows/preamble_patcher_test.cc b/src/windows/preamble_patcher_test.cc
new file mode 100644
index 0000000..e4605c6
--- /dev/null
+++ b/src/windows/preamble_patcher_test.cc
@@ -0,0 +1,368 @@
+// -*- Mode: C++; c-basic-offset: 2; indent-tabs-mode: nil -*-
+/* Copyright (c) 2011, Google Inc.
+ * All rights reserved.
+ * 
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are
+ * met:
+ * 
+ *     * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following disclaimer
+ * in the documentation and/or other materials provided with the
+ * distribution.
+ *     * Neither the name of Google Inc. nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ * 
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * ---
+ * Author: Joi Sigurdsson
+ * Author: Scott Francis
+ *
+ * Unit tests for PreamblePatcher
+ */
+
+#include "config_for_unittests.h"
+#include "preamble_patcher.h"
+#include "mini_disassembler.h"
+#pragma warning(push)
+#pragma warning(disable:4553)
+#include "auto_testing_hook.h"
+#pragma warning(pop)
+
+#define WIN32_LEAN_AND_MEAN
+#include <windows.h>
+#include <tchar.h>
+
+// Turning off all optimizations for this file, since the official build's
+// "Whole program optimization" seems to cause the TestPatchUsingDynamicStub
+// test to crash with an access violation.  We debugged this and found
+// that the optimized code accesses a register that is changed by a call to
+// the hook function.
+#pragma optimize("", off)
+
+// A convenience macro to avoid a lot of casting in the tests.
+// I tried to make this a templated function, but windows complained:
+//     error C2782: 'sidestep::SideStepError `anonymous-namespace'::Unpatch(T,T,T *)' : template parameter 'T' is ambiguous
+//        could be 'int (int)'
+//        or       'int (__cdecl *)(int)'
+// My life isn't long enough to try to figure out how to fix this.
+#define UNPATCH(target_function, replacement_function, original_function_stub) \
+  sidestep::PreamblePatcher::Unpatch((void*)(target_function),          \
+                                     (void*)(replacement_function),     \
+                                     (void*)(original_function_stub))
+
+namespace {
+
+// Function for testing - this is what we patch
+//
+// NOTE:  Because of the way the compiler optimizes this function in
+// release builds, we need to use a different input value every time we
+// call it within a function, otherwise the compiler will just reuse the
+// last calculated incremented value.
+int __declspec(noinline) IncrementNumber(int i) {
+#ifdef _M_X64
+  __int64 i2 = i + 1;
+  return (int) i2;
+#else
+   return i + 1;
+#endif
+}
+
+extern "C" int TooShortFunction(int);
+
+extern "C" int JumpShortCondFunction(int);
+
+extern "C" int JumpNearCondFunction(int);
+
+extern "C" int JumpAbsoluteFunction(int);
+
+extern "C" int CallNearRelativeFunction(int);
+
+typedef int (*IncrementingFunc)(int);
+IncrementingFunc original_function = NULL;
+
+int HookIncrementNumber(int i) {
+  SIDESTEP_ASSERT(original_function != NULL);
+  int incremented_once = original_function(i);
+  return incremented_once + 1;
+}
+
+// For the AutoTestingHook test, we can't use original_function, because
+// all that is encapsulated.
+// This function "increments" by 10, just to set it apart from the other
+// functions.
+int __declspec(noinline) AutoHookIncrementNumber(int i) {
+  return i + 10;
+}
+
+};  // namespace
+
+namespace sidestep {
+
+bool TestDisassembler() {
+  unsigned int instruction_size = 0;
+  sidestep::MiniDisassembler disassembler;
+  void* target = reinterpret_cast<unsigned char*>(IncrementNumber);
+  void* new_target = PreamblePatcher::ResolveTarget(target);
+  if (target != new_target)
+    target = new_target;
+
+  while (1) {
+    sidestep::InstructionType instructionType = disassembler.Disassemble(
+        reinterpret_cast<unsigned char*>(target) + instruction_size,
+        instruction_size);
+    if (sidestep::IT_RETURN == instructionType) {
+      return true;
+    }
+  }
+}
+
+bool TestPatchWithLongJump() {
+  original_function = NULL;
+  void *p = ::VirtualAlloc(reinterpret_cast<void *>(0x0000020000000000), 4096,
+                           MEM_RESERVE | MEM_COMMIT, PAGE_EXECUTE_READWRITE);
+  SIDESTEP_EXPECT_TRUE(p != NULL);
+  memset(p, 0xcc, 4096);
+  SIDESTEP_EXPECT_TRUE(sidestep::SIDESTEP_SUCCESS ==
+                       sidestep::PreamblePatcher::Patch(IncrementNumber,
+                                                        (IncrementingFunc) p,
+                                                        &original_function));
+  SIDESTEP_ASSERT((*original_function)(1) == 2);
+  SIDESTEP_EXPECT_TRUE(sidestep::SIDESTEP_SUCCESS ==
+                       UNPATCH(IncrementNumber,
+                               (IncrementingFunc)p,
+                               original_function));
+  ::VirtualFree(p, 0, MEM_RELEASE);
+  return true;
+}
+
+bool TestPatchWithPreambleShortCondJump() {
+  original_function = NULL;
+  SIDESTEP_EXPECT_TRUE(sidestep::SIDESTEP_SUCCESS ==
+                       sidestep::PreamblePatcher::Patch(JumpShortCondFunction,
+                                                        HookIncrementNumber,
+                                                        &original_function));
+  (*original_function)(1);
+  SIDESTEP_EXPECT_TRUE(sidestep::SIDESTEP_SUCCESS ==
+                       UNPATCH(JumpShortCondFunction,
+                               (void*)HookIncrementNumber,
+                               original_function));
+  return true;
+}
+
+bool TestPatchWithPreambleNearRelativeCondJump() {
+  original_function = NULL;
+  SIDESTEP_EXPECT_TRUE(sidestep::SIDESTEP_SUCCESS ==
+                       sidestep::PreamblePatcher::Patch(JumpNearCondFunction,
+                                                        HookIncrementNumber,
+                                                        &original_function));
+  (*original_function)(0);
+  (*original_function)(1);
+  SIDESTEP_EXPECT_TRUE(sidestep::SIDESTEP_SUCCESS ==
+                       UNPATCH(JumpNearCondFunction,
+                               HookIncrementNumber,
+                               original_function));
+  return true;
+}
+
+bool TestPatchWithPreambleAbsoluteJump() {
+  original_function = NULL;
+  SIDESTEP_EXPECT_TRUE(sidestep::SIDESTEP_SUCCESS ==
+                       sidestep::PreamblePatcher::Patch(JumpAbsoluteFunction,
+                                                        HookIncrementNumber,
+                                                        &original_function));
+  (*original_function)(0);
+  (*original_function)(1);
+  SIDESTEP_EXPECT_TRUE(sidestep::SIDESTEP_SUCCESS ==
+                       UNPATCH(JumpAbsoluteFunction,
+                               HookIncrementNumber,
+                               original_function));
+  return true;
+}
+
+bool TestPatchWithPreambleNearRelativeCall() {
+  original_function = NULL;
+  SIDESTEP_EXPECT_TRUE(sidestep::SIDESTEP_SUCCESS ==
+                       sidestep::PreamblePatcher::Patch(
+                                                    CallNearRelativeFunction,
+                                                    HookIncrementNumber,
+                                                    &original_function));
+  (*original_function)(0);
+  (*original_function)(1);
+  SIDESTEP_EXPECT_TRUE(sidestep::SIDESTEP_SUCCESS ==
+                       UNPATCH(CallNearRelativeFunction,
+                               HookIncrementNumber,
+                               original_function));
+  return true;
+}
+
+bool TestPatchUsingDynamicStub() {
+  original_function = NULL;
+  SIDESTEP_EXPECT_TRUE(IncrementNumber(1) == 2);
+  SIDESTEP_EXPECT_TRUE(sidestep::SIDESTEP_SUCCESS ==
+                       sidestep::PreamblePatcher::Patch(IncrementNumber,
+                                                        HookIncrementNumber,
+                                                        &original_function));
+  SIDESTEP_EXPECT_TRUE(original_function);
+  SIDESTEP_EXPECT_TRUE(IncrementNumber(2) == 4);
+  SIDESTEP_EXPECT_TRUE(original_function(3) == 4);
+
+  // Clearbox test to see that the function has been patched.
+  sidestep::MiniDisassembler disassembler;
+  unsigned int instruction_size = 0;
+  SIDESTEP_EXPECT_TRUE(sidestep::IT_JUMP == disassembler.Disassemble(
+                           reinterpret_cast<unsigned char*>(IncrementNumber),
+                           instruction_size));
+
+  // Since we patched IncrementNumber, its first statement is a
+  // jmp to the hook function.  So verify that we now can not patch
+  // IncrementNumber because it starts with a jump.
+#if 0
+  IncrementingFunc dummy = NULL;
+  // TODO(joi@chromium.org): restore this test once flag is added to
+  // disable JMP following
+  SIDESTEP_EXPECT_TRUE(sidestep::SIDESTEP_JUMP_INSTRUCTION ==
+                       sidestep::PreamblePatcher::Patch(IncrementNumber,
+                                                        HookIncrementNumber,
+                                                        &dummy));
+
+  // This test disabled because code in preamble_patcher_with_stub.cc
+  // asserts before returning the error code -- so there is no way
+  // to get an error code here, in debug build.
+  dummy = NULL;
+  SIDESTEP_EXPECT_TRUE(sidestep::SIDESTEP_FUNCTION_TOO_SMALL ==
+                       sidestep::PreamblePatcher::Patch(TooShortFunction,
+                                                        HookIncrementNumber,
+                                                        &dummy));
+#endif
+
+  SIDESTEP_EXPECT_TRUE(sidestep::SIDESTEP_SUCCESS ==
+                       UNPATCH(IncrementNumber,
+                               HookIncrementNumber,
+                               original_function));
+  return true;
+}
+
+bool PatchThenUnpatch() {
+  original_function = NULL;
+  SIDESTEP_EXPECT_TRUE(sidestep::SIDESTEP_SUCCESS ==
+                       sidestep::PreamblePatcher::Patch(IncrementNumber,
+                                                        HookIncrementNumber,
+                                                        &original_function));
+  SIDESTEP_EXPECT_TRUE(original_function);
+  SIDESTEP_EXPECT_TRUE(IncrementNumber(1) == 3);
+  SIDESTEP_EXPECT_TRUE(original_function(2) == 3);
+
+  SIDESTEP_EXPECT_TRUE(sidestep::SIDESTEP_SUCCESS ==
+                       UNPATCH(IncrementNumber,
+                               HookIncrementNumber,
+                               original_function));
+  original_function = NULL;
+  SIDESTEP_EXPECT_TRUE(IncrementNumber(3) == 4);
+
+  return true;
+}
+
+bool AutoTestingHookTest() {
+  SIDESTEP_EXPECT_TRUE(IncrementNumber(1) == 2);
+
+  // Inner scope, so we can test what happens when the AutoTestingHook
+  // goes out of scope
+  {
+    AutoTestingHook hook = MakeTestingHook(IncrementNumber,
+                                           AutoHookIncrementNumber);
+    (void) hook;
+    SIDESTEP_EXPECT_TRUE(IncrementNumber(2) == 12);
+  }
+  SIDESTEP_EXPECT_TRUE(IncrementNumber(3) == 4);
+
+  return true;
+}
+
+bool AutoTestingHookInContainerTest() {
+  SIDESTEP_EXPECT_TRUE(IncrementNumber(1) == 2);
+
+  // Inner scope, so we can test what happens when the AutoTestingHook
+  // goes out of scope
+  {
+    AutoTestingHookHolder hook(MakeTestingHookHolder(IncrementNumber,
+                                                     AutoHookIncrementNumber));
+    (void) hook;
+    SIDESTEP_EXPECT_TRUE(IncrementNumber(2) == 12);
+  }
+  SIDESTEP_EXPECT_TRUE(IncrementNumber(3) == 4);
+
+  return true;
+}
+
+bool TestPreambleAllocation() {
+  __int64 diff = 0;
+  void* p1 = reinterpret_cast<void*>(0x110000000);
+  void* p2 = reinterpret_cast<void*>(0x810000000);
+  unsigned char* b1 = PreamblePatcher::AllocPreambleBlockNear(p1);
+  SIDESTEP_EXPECT_TRUE(b1 != NULL);
+  diff = reinterpret_cast<__int64>(p1) - reinterpret_cast<__int64>(b1);
+  // Ensure blocks are within 2GB
+  SIDESTEP_EXPECT_TRUE(diff <= INT_MAX && diff >= INT_MIN);
+  unsigned char* b2 = PreamblePatcher::AllocPreambleBlockNear(p2);
+  SIDESTEP_EXPECT_TRUE(b2 != NULL);
+  diff = reinterpret_cast<__int64>(p2) - reinterpret_cast<__int64>(b2);
+  SIDESTEP_EXPECT_TRUE(diff <= INT_MAX && diff >= INT_MIN);
+
+  // Ensure we're reusing free blocks
+  unsigned char* b3 = b1;
+  unsigned char* b4 = b2;
+  PreamblePatcher::FreePreambleBlock(b1);
+  PreamblePatcher::FreePreambleBlock(b2);
+  b1 = PreamblePatcher::AllocPreambleBlockNear(p1);
+  SIDESTEP_EXPECT_TRUE(b1 == b3);
+  b2 = PreamblePatcher::AllocPreambleBlockNear(p2);
+  SIDESTEP_EXPECT_TRUE(b2 == b4);
+  PreamblePatcher::FreePreambleBlock(b1);
+  PreamblePatcher::FreePreambleBlock(b2);
+
+  return true;
+}
+
+bool UnitTests() {
+  return TestPatchWithPreambleNearRelativeCall() &&
+      TestPatchWithPreambleAbsoluteJump() &&
+      TestPatchWithPreambleNearRelativeCondJump() && 
+      TestPatchWithPreambleShortCondJump() &&
+      TestDisassembler() && TestPatchWithLongJump() &&
+      TestPatchUsingDynamicStub() && PatchThenUnpatch() &&
+      AutoTestingHookTest() && AutoTestingHookInContainerTest() &&
+      TestPreambleAllocation();
+}
+
+}  // namespace sidestep
+
+int safe_vsnprintf(char *str, size_t size, const char *format, va_list ap) {
+  if (size == 0)        // not even room for a \0?
+    return -1;          // not what C99 says to do, but what windows does
+  str[size-1] = '\0';
+  return _vsnprintf(str, size-1, format, ap);
+}
+
+int _tmain(int argc, _TCHAR* argv[])
+{
+  bool ret = sidestep::UnitTests();
+  printf("%s\n", ret ? "PASS" : "FAIL");
+  return ret ? 0 : -1;
+}
+
+#pragma optimize("", on)
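TestPreambleAllocation above reduces to one arithmetic property: a preamble block is usable only if a signed 32-bit (rel32) displacement can span the distance between it and the target address. A minimal sketch of that range check, written as a hypothetical standalone helper (the tests inline the same arithmetic):

#include <climits>
#include <cstdint>

// Hypothetical helper: true when `block` can be reached from `target` with a
// signed 32-bit (rel32) displacement, i.e. the two addresses are within 2GB.
static bool WithinRel32Range(const void* target, const void* block) {
  int64_t diff = reinterpret_cast<int64_t>(target) -
                 reinterpret_cast<int64_t>(block);
  return diff <= INT_MAX && diff >= INT_MIN;
}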
diff --git a/src/windows/preamble_patcher_with_stub.cc b/src/windows/preamble_patcher_with_stub.cc
new file mode 100644
index 0000000..23f9d3a
--- /dev/null
+++ b/src/windows/preamble_patcher_with_stub.cc
@@ -0,0 +1,302 @@
+// -*- Mode: C++; c-basic-offset: 2; indent-tabs-mode: nil -*-
+/* Copyright (c) 2007, Google Inc.
+ * All rights reserved.
+ * 
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are
+ * met:
+ * 
+ *     * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following disclaimer
+ * in the documentation and/or other materials provided with the
+ * distribution.
+ *     * Neither the name of Google Inc. nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ * 
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * ---
+ * Author: Joi Sigurdsson
+ * Author: Scott Francis
+ *
+ * Implementation of PreamblePatcher
+ */
+
+#include "preamble_patcher.h"
+
+#include "mini_disassembler.h"
+
+// Definitions of assembly statements we need
+#define ASM_JMP32REL 0xE9
+#define ASM_INT3 0xCC
+#define ASM_NOP 0x90
+// X64 opcodes
+#define ASM_MOVRAX_IMM 0xB8
+#define ASM_REXW 0x48
+#define ASM_JMP 0xFF
+#define ASM_JMP_RAX 0xE0
+#define ASM_PUSH 0x68
+#define ASM_RET 0xC3
+
+namespace sidestep {
+
+SideStepError PreamblePatcher::RawPatchWithStub(
+    void* target_function,
+    void* replacement_function,
+    unsigned char* preamble_stub,
+    unsigned long stub_size,
+    unsigned long* bytes_needed) {
+  if ((NULL == target_function) ||
+      (NULL == replacement_function) ||
+      (NULL == preamble_stub)) {
+    SIDESTEP_ASSERT(false &&
+                    "Invalid parameters - either pTargetFunction or "
+                    "pReplacementFunction or pPreambleStub were NULL.");
+    return SIDESTEP_INVALID_PARAMETER;
+  }
+
+  // TODO(V7:joi) Siggi and I just had a discussion and decided that both
+  // patching and unpatching are actually unsafe.  We also discussed a
+  // method of making it safe, which is to freeze all other threads in the
+  // process, check their thread context to see if their eip is currently
+  // inside the block of instructions we need to copy to the stub, and if so
+  // wait a bit and try again, then unfreeze all threads once we've patched.
+  // Not implementing this for now since we're only using SideStep for unit
+  // testing, but if we ever use it for production code this is what we
+  // should do.
+  //
+  // NOTE: Stoyan suggests we can write 8 or even 10 bytes atomically using
+  // FPU instructions, and on newer processors we could use cmpxchg8b or
+  // cmpxchg16b. So it might be possible to do the patching/unpatching
+  // atomically and avoid having to freeze other threads.  Note though, that
+  // doing it atomically does not help if one of the other threads happens
+  // to have its eip in the middle of the bytes you change while you change
+  // them.
+  unsigned char* target = reinterpret_cast<unsigned char*>(target_function);
+  unsigned int required_trampoline_bytes = 0;
+  const unsigned int kRequiredStubJumpBytes = 5;
+  const unsigned int kRequiredTargetPatchBytes = 5;
+
+  // Initialize the stub with INT3's just in case.
+  if (stub_size) {
+    memset(preamble_stub, 0xcc, stub_size);
+  }
+  if (kIs64BitBinary) {
+    // In 64-bit mode JMP instructions are always relative to RIP.  If the
+    // replacement - target offset is > 2GB, we can't JMP to the replacement
+    // function.  In this case, we're going to use a trampoline - that is,
+    // we're going to do a relative jump to a small chunk of code in the stub
+    // that will then do the absolute jump to the replacement function.  By
+    // doing this, we only need to patch 5 bytes in the target function, as
+    // opposed to patching 12 bytes if we were to do an absolute jump.
+    //
+    // Note that the first byte of the trampoline is a NOP instruction.  This
+    // is used as a trampoline signature that will be detected when unpatching
+    // the function.
+    //
+    // jmp <trampoline>
+    //
+    // trampoline:
+    //    nop
+    //    mov rax, <replacement_function>
+    //    jmp rax
+    //
+    __int64 replacement_target_offset = reinterpret_cast<__int64>(
+        replacement_function) - reinterpret_cast<__int64>(target) - 5;
+    if (replacement_target_offset > INT_MAX
+        || replacement_target_offset < INT_MIN) {
+      // The stub needs to be within 2GB of the target for the trampoline to
+      // work!
+      __int64 trampoline_offset = reinterpret_cast<__int64>(preamble_stub)
+          - reinterpret_cast<__int64>(target) - 5;
+      if (trampoline_offset > INT_MAX || trampoline_offset < INT_MIN) {
+        // We're screwed.
+        SIDESTEP_ASSERT(false 
+                       && "Preamble stub is too far from target to patch.");
+        return SIDESTEP_UNEXPECTED;
+      }
+      required_trampoline_bytes = 13;
+    }
+  }
+
+  // Let's disassemble the preamble of the target function to see if we can
+  // patch, and to see how much of the preamble we need to take.  We need 5
+  // bytes for our jmp instruction, so let's find the minimum number of
+  // instructions to get 5 bytes.
+  MiniDisassembler disassembler;
+  unsigned int preamble_bytes = 0;
+  unsigned int stub_bytes = 0;
+  while (preamble_bytes < kRequiredTargetPatchBytes) {
+    unsigned int cur_bytes = 0;
+    InstructionType instruction_type =
+        disassembler.Disassemble(target + preamble_bytes, cur_bytes);
+    if (IT_JUMP == instruction_type) {
+      unsigned int jump_bytes = 0;
+      SideStepError jump_ret = SIDESTEP_JUMP_INSTRUCTION;
+      if (IsShortConditionalJump(target + preamble_bytes, cur_bytes)) {
+        jump_ret = PatchShortConditionalJump(target + preamble_bytes, cur_bytes,
+                                             preamble_stub + stub_bytes,
+                                             &jump_bytes,
+                                             stub_size - stub_bytes);
+      } else if (IsShortJump(target + preamble_bytes, cur_bytes)) {
+        jump_ret = PatchShortJump(target + preamble_bytes, cur_bytes,
+                                  preamble_stub + stub_bytes,
+                                  &jump_bytes,
+                                  stub_size - stub_bytes);
+      } else if (IsNearConditionalJump(target + preamble_bytes, cur_bytes) ||
+                 IsNearRelativeJump(target + preamble_bytes, cur_bytes) ||
+                 IsNearAbsoluteCall(target + preamble_bytes, cur_bytes) ||
+                 IsNearRelativeCall(target + preamble_bytes, cur_bytes)) {
+         jump_ret = PatchNearJumpOrCall(target + preamble_bytes, cur_bytes,
+                                        preamble_stub + stub_bytes, &jump_bytes,
+                                        stub_size - stub_bytes);
+      }
+      if (jump_ret != SIDESTEP_SUCCESS) {
+        SIDESTEP_ASSERT(false &&
+                        "Unable to patch because there is an unhandled branch "
+                        "instruction in the initial preamble bytes.");
+        return SIDESTEP_JUMP_INSTRUCTION;
+      }
+      stub_bytes += jump_bytes;
+    } else if (IT_RETURN == instruction_type) {
+      SIDESTEP_ASSERT(false &&
+                      "Unable to patch because function is too short");
+      return SIDESTEP_FUNCTION_TOO_SMALL;
+    } else if (IT_GENERIC == instruction_type) {
+      if (IsMovWithDisplacement(target + preamble_bytes, cur_bytes)) {
+        unsigned int mov_bytes = 0;
+        if (PatchMovWithDisplacement(target + preamble_bytes, cur_bytes,
+                                     preamble_stub + stub_bytes, &mov_bytes,
+                                     stub_size - stub_bytes)
+            != SIDESTEP_SUCCESS) {
+          return SIDESTEP_UNSUPPORTED_INSTRUCTION;
+        }
+        stub_bytes += mov_bytes;
+      } else {
+        memcpy(reinterpret_cast<void*>(preamble_stub + stub_bytes),
+               reinterpret_cast<void*>(target + preamble_bytes), cur_bytes);
+        stub_bytes += cur_bytes;
+      }
+    } else {
+      SIDESTEP_ASSERT(false &&
+                      "Disassembler encountered unsupported instruction "
+                      "(either unused or unknown");
+      return SIDESTEP_UNSUPPORTED_INSTRUCTION;
+    }
+    preamble_bytes += cur_bytes;
+  }
+
+  if (NULL != bytes_needed)
+    *bytes_needed = stub_bytes + kRequiredStubJumpBytes
+        + required_trampoline_bytes;
+
+  // Inv: preamble_bytes is the number of bytes (at least 5) of whole
+  // instructions that we need to take from the preamble.  The stub space
+  // required is stub_bytes (the copied, possibly rewritten, preamble
+  // instructions) + kRequiredStubJumpBytes (5) + required_trampoline_bytes
+  // (0 or 13).
+  if (stub_bytes + kRequiredStubJumpBytes + required_trampoline_bytes
+      > stub_size) {
+    SIDESTEP_ASSERT(false);
+    return SIDESTEP_INSUFFICIENT_BUFFER;
+  }
+
+  // Now, make a jmp instruction to the rest of the target function (minus the
+  // preamble bytes we moved into the stub) and copy it into our preamble-stub.
+  // find address to jump to, relative to next address after jmp instruction
+#ifdef _MSC_VER
+#pragma warning(push)
+#pragma warning(disable:4244)
+#endif
+  int relative_offset_to_target_rest
+      = ((reinterpret_cast<unsigned char*>(target) + preamble_bytes) -
+         (preamble_stub + stub_bytes + kRequiredStubJumpBytes));
+#ifdef _MSC_VER
+#pragma warning(pop)
+#endif
+  // jmp (Jump near, relative, displacement relative to next instruction)
+  preamble_stub[stub_bytes] = ASM_JMP32REL;
+  // copy the address
+  memcpy(reinterpret_cast<void*>(preamble_stub + stub_bytes + 1),
+         reinterpret_cast<void*>(&relative_offset_to_target_rest), 4);
+
+  if (kIs64BitBinary && required_trampoline_bytes != 0) {
+    // Construct the trampoline
+    unsigned int trampoline_pos = stub_bytes + kRequiredStubJumpBytes;
+    preamble_stub[trampoline_pos] = ASM_NOP;
+    preamble_stub[trampoline_pos + 1] = ASM_REXW;
+    preamble_stub[trampoline_pos + 2] = ASM_MOVRAX_IMM;
+    memcpy(reinterpret_cast<void*>(preamble_stub + trampoline_pos + 3),
+           reinterpret_cast<void*>(&replacement_function),
+           sizeof(void *));
+    preamble_stub[trampoline_pos + 11] = ASM_JMP;
+    preamble_stub[trampoline_pos + 12] = ASM_JMP_RAX;
+
+    // Now update replacement_function to point to the trampoline
+    replacement_function = preamble_stub + trampoline_pos;
+  }
+
+  // Inv: preamble_stub points to assembly code that will execute the
+  // original function by first executing the first preamble_bytes bytes of
+  // the preamble, then jumping to the rest of the function.
+
+  // Overwrite the first 5 bytes of the target function with a jump to our
+  // replacement function.
+  // (Jump near, relative, displacement relative to next instruction)
+  target[0] = ASM_JMP32REL;
+
+  // Find offset from instruction after jmp, to the replacement function.
+#ifdef _MSC_VER
+#pragma warning(push)
+#pragma warning(disable:4244)
+#endif
+  int offset_to_replacement_function =
+      reinterpret_cast<unsigned char*>(replacement_function) -
+      reinterpret_cast<unsigned char*>(target) - 5;
+#ifdef _MSC_VER
+#pragma warning(pop)
+#endif
+  // complete the jmp instruction
+  memcpy(reinterpret_cast<void*>(target + 1),
+         reinterpret_cast<void*>(&offset_to_replacement_function), 4);
+
+  // Set any remaining bytes that were moved to the preamble-stub to INT3 so
+  // as not to cause confusion (otherwise you might see some strange
+  // instructions if you look at the disassembly, or even invalid
+  // instructions). Also, by doing this, we will break into the debugger if
+  // some code calls into this portion of the code.  If this happens, it
+  // means that this function cannot be patched using this patcher without
+  // further thought.
+  if (preamble_bytes > kRequiredTargetPatchBytes) {
+    memset(reinterpret_cast<void*>(target + kRequiredTargetPatchBytes),
+           ASM_INT3, preamble_bytes - kRequiredTargetPatchBytes);
+  }
+
+  // Inv: The memory pointed to by target_function now starts with a relative
+  // jump instruction that jumps over to the preamble_stub.  The preamble
+  // stub contains a copy of the first preamble_bytes bytes of the original
+  // target function's preamble code, followed by a relative jump back to the
+  // next instruction after those preamble bytes.
+  //
+  // In 64-bit mode the memory pointed to by target_function *may* point to a
+  // relative jump instruction that jumps to a trampoline which will then
+  // perform an absolute jump to the replacement function.  The preamble stub
+  // still contains the original target function's preamble code, followed by a
+  // jump back to the instructions after the first preamble bytes.
+  //
+  return SIDESTEP_SUCCESS;
+}
+
+}  // namespace sidestep
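Both the jump appended to the stub and the jump written over the target's first five bytes use the same encoding: the ASM_JMP32REL opcode followed by a 4-byte displacement measured from the end of the instruction, which is why the offset computations above subtract 5. A minimal sketch of that encoding, as an assumed helper rather than the patcher's own code:

#include <cstdint>
#include <cstring>

// Assumed helper: emit `jmp rel32` at `at`, targeting `dest`.  The
// displacement is relative to the byte after the 5-byte instruction,
// which is why RawPatchWithStub subtracts 5 when computing offsets.
static void WriteJmp32Rel(unsigned char* at, const void* dest) {
  const unsigned char kJmp32Rel = 0xE9;  // same value as ASM_JMP32REL
  int32_t rel = static_cast<int32_t>(
      reinterpret_cast<intptr_t>(dest) -
      (reinterpret_cast<intptr_t>(at) + 5));
  at[0] = kJmp32Rel;
  std::memcpy(at + 1, &rel, sizeof(rel));  // 4-byte displacement
}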
diff --git a/src/windows/shortproc.asm b/src/windows/shortproc.asm
new file mode 100644
index 0000000..7e8e3d7
--- /dev/null
+++ b/src/windows/shortproc.asm
@@ -0,0 +1,169 @@
+; Copyright (c) 2011, Google Inc.
+; All rights reserved.
+; 
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions are
+; met:
+; 
+;     * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+;     * Redistributions in binary form must reproduce the above
+; copyright notice, this list of conditions and the following disclaimer
+; in the documentation and/or other materials provided with the
+; distribution.
+;     * Neither the name of Google Inc. nor the names of its
+; contributors may be used to endorse or promote products derived from
+; this software without specific prior written permission.
+; 
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;
+; ---
+; Author: Scott Francis
+;
+; Unit tests for PreamblePatcher
+ 
+.MODEL small
+ 
+.CODE
+
+TooShortFunction PROC
+	ret
+TooShortFunction ENDP
+
+JumpShortCondFunction PROC
+	test cl, 1
+	jnz jumpspot
+	int 3
+	int 3
+	int 3
+	int 3
+	int 3
+	int 3
+	int 3
+	int 3
+	int 3
+	int 3
+	int 3
+	int 3
+	int 3
+	int 3
+	int 3
+	int 3
+	int 3
+	int 3
+	int 3
+	int 3
+	int 3
+	int 3
+	int 3
+	int 3
+	int 3
+	int 3
+	int 3
+	int 3
+jumpspot:
+	nop
+	nop
+	nop
+	nop
+	mov rax, 1
+	ret
+JumpShortCondFunction ENDP
+
+JumpNearCondFunction PROC
+	test cl, 1
+	jnz jumpspot
+	mov rdx, 0ffff1111H
+	mov rdx, 0ffff1111H
+	mov rdx, 0ffff1111H
+	mov rdx, 0ffff1111H
+	mov rdx, 0ffff1111H
+	mov rdx, 0ffff1111H
+	mov rdx, 0ffff1111H
+	mov rdx, 0ffff1111H
+	mov rdx, 0ffff1111H
+	mov rdx, 0ffff1111H
+	mov rdx, 0ffff1111H
+	mov rdx, 0ffff1111H
+	mov rdx, 0ffff1111H
+	mov rdx, 0ffff1111H
+	mov rdx, 0ffff1111H
+	mov rdx, 0ffff1111H
+	mov rdx, 0ffff1111H
+	mov rdx, 0ffff1111H
+	mov rdx, 0ffff1111H
+	mov rdx, 0ffff1111H
+jumpspot:
+	nop
+	nop
+	mov rax, 1
+	ret
+JumpNearCondFunction ENDP
+
+JumpAbsoluteFunction PROC
+	test cl, 1
+	jmp jumpspot
+	mov rdx, 0ffff1111H
+	mov rdx, 0ffff1111H
+	mov rdx, 0ffff1111H
+	mov rdx, 0ffff1111H
+	mov rdx, 0ffff1111H
+	mov rdx, 0ffff1111H
+	mov rdx, 0ffff1111H
+	mov rdx, 0ffff1111H
+	mov rdx, 0ffff1111H
+	mov rdx, 0ffff1111H
+	mov rdx, 0ffff1111H
+	mov rdx, 0ffff1111H
+	mov rdx, 0ffff1111H
+	mov rdx, 0ffff1111H
+	mov rdx, 0ffff1111H
+	mov rdx, 0ffff1111H
+	mov rdx, 0ffff1111H
+	mov rdx, 0ffff1111H
+	mov rdx, 0ffff1111H
+	mov rdx, 0ffff1111H
+	mov rdx, 0ffff1111H
+	mov rdx, 0ffff1111H
+	mov rdx, 0ffff1111H
+	mov rdx, 0ffff1111H
+	mov rdx, 0ffff1111H
+	mov rdx, 0ffff1111H
+	mov rdx, 0ffff1111H
+jumpspot:
+	nop
+	nop
+	mov rax, 1
+	ret
+JumpAbsoluteFunction ENDP
+
+CallNearRelativeFunction PROC
+	test cl, 1
+	call TooShortFunction
+	mov rdx, 0ffff1111H
+	mov rdx, 0ffff1111H
+	mov rdx, 0ffff1111H
+	mov rdx, 0ffff1111H
+	mov rdx, 0ffff1111H
+	mov rdx, 0ffff1111H
+	mov rdx, 0ffff1111H
+	mov rdx, 0ffff1111H
+	mov rdx, 0ffff1111H
+	mov rdx, 0ffff1111H
+	nop
+	nop
+	nop
+	ret
+CallNearRelativeFunction ENDP
+
+END
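JumpShortCondFunction places a two-byte `jnz` (rel8) inside the first five bytes on purpose: once those bytes are copied into a stub elsewhere in memory, an 8-bit displacement can no longer reach `jumpspot`, so the patcher's PatchShortConditionalJump has to rewrite it. An illustrative sketch of that rewrite, expanding `jcc rel8` into the six-byte `jcc rel32` form (a hypothetical helper, not the library's routine):

#include <cstdint>
#include <cstring>

// Illustrative only: rewrite a short conditional jump (0x70..0x7F, rel8)
// found at `src` into the near form (0x0F 0x80..0x8F, rel32) at `dst`, so
// the copy in the stub still reaches the original jump target.
static void ExpandShortCondJump(const unsigned char* src, unsigned char* dst) {
  unsigned char cc = src[0] & 0x0F;  // condition code nibble (0x5 for jnz)
  // Absolute target of the original 2-byte instruction.
  intptr_t target = reinterpret_cast<intptr_t>(src) + 2 +
                    static_cast<signed char>(src[1]);
  // Displacement measured from the end of the new 6-byte instruction.
  int32_t rel = static_cast<int32_t>(
      target - (reinterpret_cast<intptr_t>(dst) + 6));
  dst[0] = 0x0F;
  dst[1] = static_cast<unsigned char>(0x80 | cc);  // jcc rel32
  std::memcpy(dst + 2, &rel, sizeof(rel));
}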

diff --git a/src/windows/system-alloc.cc b/src/windows/system-alloc.cc
new file mode 100644
index 0000000..9537745
--- /dev/null
+++ b/src/windows/system-alloc.cc
@@ -0,0 +1,204 @@
+// Copyright (c) 2013, Google Inc.
+// All rights reserved.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+// 
+//     * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+//     * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+// 
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+// ---
+// Author: Petr Hosek
+
+#ifndef _WIN32
+# error You should only be including windows/system-alloc.cc in a windows environment!
+#endif
+
+#include <config.h>
+#include <windows.h>
+#include <algorithm> // std::min
+#include <gperftools/malloc_extension.h>
+#include "base/logging.h"
+#include "base/spinlock.h"
+#include "internal_logging.h"
+#include "system-alloc.h"
+
+static SpinLock spinlock(SpinLock::LINKER_INITIALIZED);
+
+// The current system allocator declaration
+SysAllocator* sys_alloc = NULL;
+// Number of bytes taken from system.
+size_t TCMalloc_SystemTaken = 0;
+
+class VirtualSysAllocator : public SysAllocator {
+public:
+  VirtualSysAllocator() : SysAllocator() {
+  }
+  void* Alloc(size_t size, size_t *actual_size, size_t alignment);
+};
+static char virtual_space[sizeof(VirtualSysAllocator)];
+
+// This is mostly like MmapSysAllocator::Alloc, except that MmapSysAllocator
+// munmap()s in the middle of a mapping, which is forbidden on Windows.
+void* VirtualSysAllocator::Alloc(size_t size, size_t *actual_size,
+                                 size_t alignment) {
+  // Align on the pagesize boundary
+  const int pagesize = getpagesize();
+  if (alignment < pagesize) alignment = pagesize;
+  size = ((size + alignment - 1) / alignment) * alignment;
+
+  // Report the total number of bytes the OS actually delivered.  This might be
+  // greater than |size| because of alignment concerns.  The full size is
+  // necessary so that adjacent spans can be coalesced.
+  // TODO(antonm): proper processing of alignments
+  // in actual_size and decommitting.
+  if (actual_size) {
+    *actual_size = size;
+  }
+
+  // We currently do not support alignments larger than the pagesize or
+  // alignments that are not multiples of the pagesize after being floored.
+  // If this ability is needed it can be done by the caller (assuming it knows
+  // the page size).
+  assert(alignment <= pagesize);
+
+  void* result = VirtualAlloc(0, size,
+                              MEM_COMMIT|MEM_RESERVE, PAGE_READWRITE);
+  if (result == NULL)
+    return NULL;
+
+  // If the result is not aligned memory fragmentation will result which can
+  // lead to pathological memory use.
+  assert((reinterpret_cast<uintptr_t>(result) & (alignment - 1)) == 0);
+
+  return result;
+}
+
+#ifdef _MSC_VER
+
+extern "C" SysAllocator* tc_get_sysalloc_override(SysAllocator *def);
+extern "C" SysAllocator* tc_get_sysalloc_default(SysAllocator *def)
+{
+  return def;
+}
+
+#if defined(_M_IX86)
+#pragma comment(linker, "/alternatename:_tc_get_sysalloc_override=_tc_get_sysalloc_default")
+#elif defined(_M_X64)
+#pragma comment(linker, "/alternatename:tc_get_sysalloc_override=tc_get_sysalloc_default")
+#endif
+
+#else // !_MSC_VER
+
+extern "C" ATTRIBUTE_NOINLINE
+SysAllocator* tc_get_sysalloc_override(SysAllocator *def)
+{
+  return def;
+}
+
+#endif
+
+static bool system_alloc_inited = false;
+void InitSystemAllocators(void) {
+  VirtualSysAllocator *alloc = new (virtual_space) VirtualSysAllocator();
+  sys_alloc = tc_get_sysalloc_override(alloc);
+}
+
+extern PERFTOOLS_DLL_DECL
+void* TCMalloc_SystemAlloc(size_t size, size_t *actual_size,
+			   size_t alignment) {
+  SpinLockHolder lock_holder(&spinlock);
+
+  if (!system_alloc_inited) {
+    InitSystemAllocators();
+    system_alloc_inited = true;
+  }
+
+  void* result = sys_alloc->Alloc(size, actual_size, alignment);
+  if (result != NULL) {
+    if (actual_size) {
+      TCMalloc_SystemTaken += *actual_size;
+    } else {
+      TCMalloc_SystemTaken += size;
+    }
+  }
+  return result;
+}
+
+extern PERFTOOLS_DLL_DECL
+bool TCMalloc_SystemRelease(void* start, size_t length) {
+  if (VirtualFree(start, length, MEM_DECOMMIT))
+    return true;
+
+  // The decommit may fail if the memory region consists of allocations
+  // from more than one call to VirtualAlloc.  In this case, fall back to
+  // using VirtualQuery to retrieve the allocation boundaries and decommit
+  // them each individually.
+
+  char* ptr = static_cast<char*>(start);
+  char* end = ptr + length;
+  MEMORY_BASIC_INFORMATION info;
+  while (ptr < end) {
+    size_t resultSize = VirtualQuery(ptr, &info, sizeof(info));
+    assert(resultSize == sizeof(info));
+    size_t decommitSize = std::min<size_t>(info.RegionSize, end - ptr);
+    BOOL success = VirtualFree(ptr, decommitSize, MEM_DECOMMIT);
+    assert(success == TRUE);
+    ptr += decommitSize;
+  }
+
+  return true;
+}
+
+extern PERFTOOLS_DLL_DECL
+void TCMalloc_SystemCommit(void* start, size_t length) {
+  if (VirtualAlloc(start, length, MEM_COMMIT, PAGE_READWRITE) == start)
+    return;
+
+  // The commit may fail if the memory region consists of allocations
+  // from more than one call to VirtualAlloc.  In this case, fall back to
+  // using VirtualQuery to retrieve the allocation boundaries and commit them
+  // each individually.
+
+  char* ptr = static_cast<char*>(start);
+  char* end = ptr + length;
+  MEMORY_BASIC_INFORMATION info;
+  while (ptr < end) {
+    size_t resultSize = VirtualQuery(ptr, &info, sizeof(info));
+    assert(resultSize == sizeof(info));
+
+    size_t commitSize = std::min<size_t>(info.RegionSize, end - ptr);
+    void* newAddress = VirtualAlloc(ptr, commitSize, MEM_COMMIT,
+                                    PAGE_READWRITE);
+    assert(newAddress == ptr);
+    ptr += commitSize;
+  }
+}
+
+bool RegisterSystemAllocator(SysAllocator *allocator, int priority) {
+  return false;   // we don't allow registration on windows, right now
+}
+
+void DumpSystemAllocatorStats(TCMalloc_Printer* printer) {
+  // We don't dump stats on windows, right now
+}
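The three entry points above follow the usual tcmalloc system-allocator contract: sizes are rounded up to page multiples, the rounded size is reported through actual_size, and decommit/commit cope with ranges that span several VirtualAlloc regions. A hypothetical caller-side sketch, assuming the declarations from system-alloc.h are in scope:

#include <cstddef>

// Hypothetical usage (illustration only): grab one megabyte from the system
// allocator, then decommit the pages again.
void GrabAndReleaseOneMb() {
  size_t actual = 0;
  // An alignment of 0 is rounded up to the page size inside the allocator.
  void* p = TCMalloc_SystemAlloc(1 << 20, &actual, 0);
  if (p != NULL) {
    // MEM_DECOMMIT under the hood; the VirtualQuery fallback copes with
    // ranges that span multiple VirtualAlloc regions.
    TCMalloc_SystemRelease(p, actual);
  }
}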