mitsuba/include/mitsuba/render/gkdtree.h

/*
    This file is part of Mitsuba, a physically based rendering system.

    Copyright (c) 2007-2010 by Wenzel Jakob and others.

    Mitsuba is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License Version 3
    as published by the Free Software Foundation.

    Mitsuba is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License
    along with this program. If not, see <http://www.gnu.org/licenses/>.
*/

#if !defined(__KDTREE_GENERIC_H)
#define __KDTREE_GENERIC_H

#include <mitsuba/core/timer.h>
#include <boost/static_assert.hpp>
#include <boost/tuple/tuple.hpp>

/// Activate lots of extra checks
#define MTS_KD_DEBUG 1

/// Compile-time KD-tree depth limit
#define MTS_KD_MAX_DEPTH 48

/// Collect statistics during building/traversal
#define MTS_KD_STATISTICS 1

/// Min-max bin count
#define MTS_KD_MINMAX_BINS 128

/// OrderedChunkAllocator: don't create chunks smaller than 512 KiB
#define MTS_KD_MIN_ALLOC 512*1024

/// Allocate nodes & index lists in blocks of 512 KiB
#define MTS_KD_BLOCKSIZE_KD  (512*1024/sizeof(KDNode))
#define MTS_KD_BLOCKSIZE_IDX (512*1024/sizeof(uint32_t))

#if defined(MTS_KD_DEBUG)
#define KDAssert(expr) Assert(expr)
#define KDAssertEx(expr, text) AssertEx(expr, text)
#else
#define KDAssert(expr)
#define KDAssertEx(expr, text)
#endif

MTS_NAMESPACE_BEGIN

/**
 * \brief Special "ordered" memory allocator
 *
 * During kd-tree construction, large amounts of memory are required
 * to temporarily hold index and edge event lists. When not implemented
 * properly, these allocations can become a critical bottleneck.
 * The class \ref OrderedChunkAllocator provides a specialized
 * memory allocator, which reserves memory in chunks of at least
 * 128KiB. An important assumption made by the allocator is that
 * memory will be released in the same order, in which it is
 * allocated. This makes it possible to create an implementation
 * with a very low memory overhead. Note that no locking is done,
 * hence each thread will need its own allocator.
 */
class OrderedChunkAllocator {
public:
	inline OrderedChunkAllocator(size_t minAllocation = MTS_KD_MIN_ALLOC)
			: m_minAllocation(minAllocation) {
		m_chunks.reserve(16);
	}

	~OrderedChunkAllocator() {
		cleanup();
	}

	/**
	 * \brief Release all memory used by the allocator
	 */
	void cleanup() {
		for (std::vector<Chunk>::iterator it = m_chunks.begin();
				it != m_chunks.end(); ++it)
			freeAligned((*it).start);
		m_chunks.clear();
	}

	/**
	 * \brief Merge the chunks of another allocator into this one
	 */
	void merge(const OrderedChunkAllocator &other) {
		m_chunks.reserve(m_chunks.size() + other.m_chunks.size());
		m_chunks.insert(m_chunks.end(), other.m_chunks.begin(),
				other.m_chunks.end());
	}

	/**
	 * \brief Forget about all chunks without actually freeing them.
	 * This is useful when the chunks have been merged into another
	 * allocator.
	 */
	void forget() {
		m_chunks.clear();
	}

	/**
	 * \brief Request a block of memory from the allocator
	 *
	 * Walks through the list of chunks to find one with enough
	 * free memory. If no chunk could be found, a new one is created.
	 */
	template <typename T> T * __restrict__ allocate(size_t size) {
		size *= sizeof(T);
		for (std::vector<Chunk>::iterator it = m_chunks.begin();
				it != m_chunks.end(); ++it) {
			Chunk &chunk = *it;
			if (chunk.remainder() >= size) {
				T* result = reinterpret_cast<T *>(chunk.cur);
				chunk.cur += size;
				return result;
			}
		}

		/* No chunk had enough free memory */
		size_t allocSize = std::max(size,
			m_minAllocation);

		Chunk chunk;
		chunk.start = (uint8_t *) allocAligned(allocSize);
		chunk.cur = chunk.start + size;
		chunk.size = allocSize;
		m_chunks.push_back(chunk);

		return reinterpret_cast<T *>(chunk.start);
	}

	template <typename T> void release(T *ptr) {
		for (std::vector<Chunk>::iterator it = m_chunks.begin();
				it != m_chunks.end(); ++it) {
			Chunk &chunk = *it;
			if ((uint8_t *) ptr >= chunk.start &&
				(uint8_t *) ptr < chunk.start + chunk.size) {
				chunk.cur = (uint8_t *) ptr;
				return;
			}
		}
#if defined(MTS_KD_DEBUG)
		/* Uh oh, allocation could not be found. Check if it has size==0 */
		for (std::vector<Chunk>::iterator it = m_chunks.begin();
				it != m_chunks.end(); ++it) {
			const Chunk &chunk = *it;
			if ((uint8_t *) ptr == chunk.start + chunk.size)
				return;
		}
		SLog(EError, "OrderedChunkAllocator: Internal error while"
			" releasing memory");
#endif
	}

	/**
	 * \brief Shrink the size of the last allocated chunk
	 */
	template <typename T> void shrinkAllocation(T *ptr, size_t newSize) {
		newSize *= sizeof(T);
		for (std::vector<Chunk>::iterator it = m_chunks.begin();
				it != m_chunks.end(); ++it) {
			Chunk &chunk = *it;
			if ((uint8_t *) ptr >= chunk.start &&
				(uint8_t *) ptr < chunk.start + chunk.size) {
				chunk.cur = (uint8_t *) ptr + newSize;
				return;
			}
		}
#if defined(MTS_KD_DEBUG)
		/* Uh oh, allocation could not be found. Check if it has size==0 */
		if (newSize == 0) {
			for (std::vector<Chunk>::iterator it = m_chunks.begin();
					it != m_chunks.end(); ++it) {
				const Chunk &chunk = *it;
				if ((uint8_t *) ptr == chunk.start + chunk.size)
					return;
			}
		}
		SLog(EError, "OrderedChunkAllocator: Internal error while"
			" releasing memory");
#endif
	}

	inline size_t getChunkCount() const { return m_chunks.size(); }

	/**
	 * \brief Return the total amount of chunk memory in bytes
	 */
	size_t size() const {
		size_t result = 0;
		for (std::vector<Chunk>::const_iterator it = m_chunks.begin();
				it != m_chunks.end(); ++it)
			result += (*it).size;
		return result;
	}

	/**
	 * \brief Return the total amount of used memory in bytes
	 */
	size_t used() const {
		size_t result = 0;
		for (std::vector<Chunk>::const_iterator it = m_chunks.begin();
				it != m_chunks.end(); ++it)
			result += (*it).used();
		return result;
	}

	/**
	 * \brief Return a string representation of the chunks
	 */
	std::string toString() const {
		std::ostringstream oss;
		oss << "OrderedChunkAllocator[" << endl;
		for (size_t i=0; i<m_chunks.size(); ++i)
			oss << "    Chunk " << i << ": " << m_chunks[i].toString() << endl;
		oss << "]";
		return oss.str();
	}

private:
	struct Chunk {
		size_t size;
		uint8_t *start, *cur;

		inline size_t used() const {
			return cur - start;
		}

		inline size_t remainder() const {
			return size - used();
		}

		std::string toString() const {
			return formatString("0x%llx-0x%llx (size=" SIZE_T_FMT
				", used=" SIZE_T_FMT ")", start, start+size,
				size, used());
		}
	};

	size_t m_minAllocation;
	std::vector<Chunk> m_chunks;
};

/**
 * \brief Basic vector implementation, which stores all data
 * in a list of fixed-sized blocks.
 *
 * This leads to a more conservative memory usage when the
 * final size of a (possibly very large) growing vector is
 * unknown. Also, frequent reallocations & copies are avoided.
 */
template <typename T, size_t BlockSize> class BlockedVector {
public:
	BlockedVector() : m_pos(0) {}

	~BlockedVector() {
		clear();
	}

	/**
	 * \brief Append an element to the end
	 */
	inline void push_back(const T &value) {
		size_t blockIdx = m_pos / BlockSize;
		size_t offset = m_pos % BlockSize;
		if (blockIdx == m_blocks.size())
			m_blocks.push_back(new T[BlockSize]);
		m_blocks[blockIdx][offset] = value;
		m_pos++;
	}

	/**
	 * \brief Allocate a certain number of elements and
	 * return a pointer to the first one.
	 *
	 * The implementation will ensure that they lie
	 * contiguous in memory -- note that this can potentially
	 * create unused elements in the previous block if a new
	 * one has to be allocated.
	 */
	inline T * __restrict__ allocate(size_t size) {
#if defined(MTS_KD_DEBUG)
		SAssert(size <= BlockSize);
#endif
		size_t blockIdx = m_pos / BlockSize;
		size_t offset = m_pos % BlockSize;
		T *result;
		if (EXPECT_TAKEN(offset + size <= BlockSize)) {
			if (blockIdx == m_blocks.size())
				m_blocks.push_back(new T[BlockSize]);
			result = m_blocks[blockIdx] + offset;
			m_pos += size;
		} else {
			++blockIdx;
			if (blockIdx == m_blocks.size())
				m_blocks.push_back(new T[BlockSize]);
			result = m_blocks[blockIdx];
			m_pos += BlockSize - offset + size;
		}
		return result;
	}

	inline T &operator[](size_t index) {
		return *(m_blocks[index / BlockSize] +
			(index % BlockSize));
	}

	inline const T &operator[](size_t index) const {
		return *(m_blocks[index / BlockSize] +
			(index % BlockSize));
	}


	/**
	 * \brief Return the currently used number of items
	 */
	inline size_t size() const {
		return m_pos;
	}

	/**
	 * \brief Return the number of allocated blocks
	 */
	inline size_t blockCount() const {
		return m_blocks.size();
	}

	/**
	 * \brief Return the total capacity
	 */
	inline size_t capacity() const {
		return m_blocks.size() * BlockSize;
	}

	/**
	 * \brief Resize the vector to the given size.
	 *
	 * Note: this implementation doesn't support
	 * enlarging the vector and simply changes the
	 * last item pointer.
	 */
	inline void resize(size_t pos) {
#if defined(MTS_KD_DEBUG)
		SAssert(pos <= capacity());
#endif
		m_pos = pos;
	}

	/**
	 * \brief Release all memory
	 */
	void clear() {
		for (typename std::vector<T *>::iterator it = m_blocks.begin();
				it != m_blocks.end(); ++it)
			delete[] *it;
		m_blocks.clear();
		m_pos = 0;
	}
private:
	std::vector<T *> m_blocks;
	size_t m_pos;
};

/**
 * \brief Compact storage for primitive classifcation
 *
 * When classifying primitives with respect to a split plane,
 * a data structure is needed to hold the tertiary result of
 * this operation. This class implements a compact storage
 * (2 bits per entry) in the spirit of the std::vector<bool>
 * specialization.
 */
class ClassificationStorage {
public:
	ClassificationStorage(size_t size = 0) : m_buffer(NULL), m_bufferSize(0) { }

	~ClassificationStorage() {
		if (m_buffer)
			delete[] m_buffer;
	}

	void setPrimitiveCount(size_t size) {
		if (m_buffer)
			delete[] m_buffer;
		if (size > 0) {
			m_bufferSize = size/4 + ((size % 4) > 0 ? 1 : 0);
			m_buffer = new uint8_t[m_bufferSize];
		} else {
			m_buffer = NULL;
		}
	}

	inline void set(uint32_t index, uint8_t value) {
		uint8_t *ptr = m_buffer + (index >> 2);
		uint8_t shift = (index & 3) << 1;
		*ptr = (*ptr & ~(3 << shift)) | (value << shift);
	}

	inline uint8_t get(uint32_t index) const {
		uint8_t *ptr = m_buffer + (index >> 2);
		uint8_t shift = (index & 3) << 1;
		return (*ptr >> shift) & 3;
	}

	inline size_t size() const {
		return m_bufferSize;
	}
private:
	uint8_t *m_buffer;
	size_t m_bufferSize;
};

/**
 * \brief SAH KD-tree acceleration data structure for fast ray-object
 * intersection computations.
 *
 * The code in this class is fully generic and can theoretically
 * support any kind of shape. Subclasses need to provide the following
 * signatures for a functional implementation:
 *
 * /// Return the total number of primitives
 * inline size_type getPrimitiveCount() const;
 *
 * /// Return an axis-aligned bounding box of a certain primitive
 * inline AABB getAABB(index_type primIdx) const;
 *
 * /// Return an AABB of a primitive when clipped to another AABB
 * inline AABB getClippedAABB(index_type primIdx, const AABB &aabb) const;
 *
 * This class follows the "Curiously recurring template" design pattern so that
 * the above functions can be inlined (no virtual calls will be necessary!).
 *
 * The kd-tree construction algorithm creates 'perfect split' trees as
 * outlined in the paper "On Building fast kd-Trees for Ray Tracing, and on
 * doing that in O(N log N)" by Ingo Wald and Vlastimil Havran.
 *
 * Because the O(N log N) construction algorithm tends to cause many
 * incoherent memory accesses, a fast approximate technique (Min-max binning)
 * is used near the top of the tree, which significantly reduces cache misses.
 * Once the input data has been narrowed down to a reasonable amount, the
 * implementation switches over to the O(N log N) builder. When multiple
 * processors are available, the build process runs in parallel.
*
 */
template <typename Derived> class GenericKDTree : public Object {
protected:
	struct KDNode;
	struct EdgeEvent;
	struct EdgeEventOrdering;

public:
	/// Index number format (max 2^32 prims)
	typedef uint32_t index_type;

	/// Size number format
	typedef uint32_t size_type;

	/**
	 * \brief Create a new kd-tree instance initialized with
	 * the default parameters.
	 */
	GenericKDTree() : m_nodes(NULL), m_indices(NULL) {
		m_traversalCost = 15;
		m_intersectionCost = 20;
		m_emptySpaceBonus = 0.9f;
		m_clip = true;
		m_stopPrims = 4;
		m_maxBadRefines = 3;
		m_exactPrimThreshold = 65536;
		m_maxDepth = 0;
		m_retract = true;
		m_parallel = true;
	}

	virtual ~GenericKDTree() {
		if (m_indices)
			delete[] m_indices;
		if (m_nodes)
			delete[] m_nodes;
	}

	/**
	 * \brief Build a KD-tree over supplied geometry
	 */
	void build() {
		if (m_nodes != NULL)
			Log(EError, "The kd-tree has already been built!");

		size_type primCount = cast()->getPrimitiveCount();
		BuildContext ctx(primCount);

		/* Establish an ad-hoc depth cutoff value (Formula from PBRT) */
		if (m_maxDepth == 0)
			m_maxDepth = (int) (8 + 1.3f * log2i(primCount));
		m_maxDepth = std::min(m_maxDepth, (size_type) MTS_KD_MAX_DEPTH);

		Log(EDebug, "Creating a preliminary index list (%s)",
			memString(primCount * sizeof(index_type)).c_str());

		OrderedChunkAllocator &leftAlloc = ctx.leftAlloc;
		index_type *indices = leftAlloc.allocate<index_type>(primCount);

		ref<Timer> timer = new Timer();
		m_aabb.reset();
		for (index_type i=0; i<primCount; ++i) {
			m_aabb.expandBy(cast()->getAABB(i));
			indices[i] = i;
		}

		Log(EDebug, "Computed scene bounds in %i ms", timer->getMilliseconds());
		Log(EDebug, "");

		Log(EDebug, "kd-tree configuration:");
		Log(EDebug, "   Traversal cost           : %.2f", m_traversalCost);
		Log(EDebug, "   Intersection cost        : %.2f", m_intersectionCost);
		Log(EDebug, "   Empty space bonus        : %.2f", m_emptySpaceBonus);
		Log(EDebug, "   Max. tree depth          : %i", m_maxDepth);
		Log(EDebug, "   Scene bounding box (min) : %s", m_aabb.min.toString().c_str());
		Log(EDebug, "   Scene bounding box (max) : %s", m_aabb.max.toString().c_str());
		Log(EDebug, "   Min-max bins             : %i", MTS_KD_MINMAX_BINS);
		Log(EDebug, "   Greedy SAH optimization  : use for <= %i primitives", m_exactPrimThreshold);
		Log(EDebug, "   Perfect splits           : %s", m_clip ? "yes" : "no");
		Log(EDebug, "   Retract bad splits       : %s", m_retract ? "yes" : "no");
		Log(EDebug, "   Stopping primitive count : %i", m_stopPrims);
		Log(EDebug, "");

		size_type procCount = getProcessorCount();
		if (procCount == 1)
			m_parallel = false;

		if (m_parallel) {
			m_builders.resize(procCount);
			for (size_type i=0; i<procCount; ++i) {
				m_builders[i] = new SAHTreeBuilder(i, this);
				m_builders[i]->incRef();
				m_builders[i]->start();
			}
		}

		Log(EInfo, "Constructing a SAH kd-tree (%i primitives) ..", primCount);

		m_indirectionLock = new Mutex();
		KDNode *prelimRoot = ctx.nodes.allocate(1);
		buildTreeMinMax(ctx, 1, prelimRoot, m_aabb, m_aabb,
				indices, primCount, true, 0);
		ctx.leftAlloc.release(indices);

		KDAssert(ctx.leftAlloc.used() == 0);
		KDAssert(ctx.rightAlloc.used() == 0);

		if (m_parallel) {
			m_interface.done = true;
			m_interface.cond->broadcast();
			for (size_type i=0; i<m_builders.size(); ++i)
				m_builders[i]->join();
		}

		Log(EInfo, "Finished -- took %i ms.", timer->getMilliseconds());
		Log(EDebug, "");

		Log(EDebug, "Temporary memory statistics:");
		Log(EDebug, "   Classification storage : %s",
				memString((ctx.classStorage.size() * (1+procCount))).c_str());
		Log(EDebug, "   Indirection entries    : " SIZE_T_FMT " (%s)",
				m_indirections.size(), memString(m_indirections.capacity()
				* sizeof(KDNode *)).c_str());

		Log(EDebug, "   Main thread:");
		ctx.printStats();
		size_t totalUsage = m_indirections.capacity()
			* sizeof(KDNode *) + ctx.size();

		/// Clean up event lists and print statistics
		ctx.leftAlloc.cleanup();
		ctx.rightAlloc.cleanup();
		for (size_type i=0; i<m_builders.size(); ++i) {
			Log(EDebug, "   Worker thread %i:", i+1);
			BuildContext &subCtx = m_builders[i]->getContext();
			subCtx.printStats();
			totalUsage += subCtx.size();
			subCtx.leftAlloc.cleanup();
			subCtx.rightAlloc.cleanup();
			ctx.accumulateStatisticsFrom(subCtx);
		}
		Log(EDebug, "   Total: %s", memString(totalUsage).c_str());

		Log(EDebug, "");
		timer->reset();
		Log(EDebug, "Optimizing memory layout ..");

		std::stack<boost::tuple<const KDNode *, KDNode *,
			const BuildContext *, AABB> > stack;

		Float expTraversalSteps = 0;
		Float expLeavesVisited = 0;
		Float expPrimitivesIntersected = 0;
		Float sahCost = 0;
		size_type nodePtr = 0, indexPtr = 0;

		m_nodes = new KDNode[ctx.innerNodeCount + ctx.leafNodeCount];
		m_indices = new index_type[ctx.primIndexCount];

		stack.push(boost::make_tuple(prelimRoot, &m_nodes[nodePtr++], &ctx, m_aabb));
		while (!stack.empty()) {
			const KDNode *node = boost::get<0>(stack.top());
			KDNode *target = boost::get<1>(stack.top());
			const BuildContext *context = boost::get<2>(stack.top());
			AABB aabb = boost::get<3>(stack.top());
			stack.pop();

			if (node->isLeaf()) {
				size_type primStart = node->getPrimStart(),
						  primEnd = node->getPrimEnd(),
						  primCount = primEnd-primStart;
				target->initLeafNode(indexPtr, primCount);

				Float sa = aabb.getSurfaceArea(), weightedSA = sa * primCount;
				expLeavesVisited += aabb.getSurfaceArea();
				expPrimitivesIntersected += weightedSA;
				sahCost += weightedSA * m_intersectionCost;

				const BlockedVector<index_type, MTS_KD_BLOCKSIZE_IDX> &indices
					= context->indices;
				for (size_type idx = primStart; idx<primEnd; ++idx)
					m_indices[indexPtr++] = indices[idx];
			} else {
				typename std::map<const KDNode *, index_type>::const_iterator it
					= m_interface.threadMap.find(node);
				// Check if we're switching to a subtree built by a worker thread
				if (it != m_interface.threadMap.end())
					context = &m_builders[(*it).second]->getContext();

				Float sa = aabb.getSurfaceArea();
				expTraversalSteps += sa;
				sahCost += sa * m_traversalCost;
				const KDNode *left;
				if (EXPECT_TAKEN(!node->isIndirection()))
					left = node->getLeft();
				else
					left = m_indirections[node->getIndirectionIndex()];

				KDNode *children = &m_nodes[nodePtr];
				nodePtr += 2;
				int axis = node->getAxis();
				float split = node->getSplit();
				bool result = target->initInnerNode(axis, split, children - target);
				if (!result)
					Log(EError, "Cannot represent relative pointer -- too many primitives?");

				Float tmp = aabb.min[axis];
				aabb.min[axis] = split;
				stack.push(boost::make_tuple(left+1, children+1, context, aabb));
				aabb.min[axis] = tmp;
				aabb.max[axis] = split;
				stack.push(boost::make_tuple(left, children, context, aabb));
			}
		}

		KDAssert(nodePtr == ctx.innerNodeCount + ctx.leafNodeCount);
		KDAssert(indexPtr == ctx.primIndexCount);

		Log(EDebug, "Finished -- took %i ms.", timer->getMilliseconds());

		/* Free some more memory */
		ctx.nodes.clear();
		ctx.indices.clear();
		for (size_type i=0; i<m_builders.size(); ++i) {
			BuildContext &subCtx = m_builders[i]->getContext();
			subCtx.nodes.clear();
			subCtx.indices.clear();
		}
		m_indirectionLock = NULL;
		std::vector<KDNode *>().swap(m_indirections);

		if (m_builders.size() > 0) {
			for (size_type i=0; i<m_builders.size(); ++i)
				m_builders[i]->decRef();
			m_builders.clear();
		}

		Log(EDebug, "");

		Float rootSA = m_aabb.getSurfaceArea();
		expTraversalSteps /= rootSA;
		expLeavesVisited /= rootSA;
		expPrimitivesIntersected /= rootSA;
		sahCost /= rootSA;

		Log(EDebug, "Detailed kd-tree statistics:");
		Log(EDebug, "   Inner nodes               : %i", ctx.innerNodeCount);
		Log(EDebug, "   Leaf nodes                : %i", ctx.leafNodeCount);
		Log(EDebug, "   Nonempty leaf nodes       : %i", ctx.nonemptyLeafNodeCount);
		Log(EDebug, "   Node storage cost         : %s",
				memString(nodePtr * sizeof(KDNode)).c_str());
		Log(EDebug, "   Index storage cost        : %s",
				memString(indexPtr * sizeof(index_type)).c_str());
		Log(EDebug, "   Parallel work units       : " SIZE_T_FMT,
				m_interface.threadMap.size());
		Log(EDebug, "   Retracted splits          : %i", ctx.retractedSplits);
		Log(EDebug, "   Pruned primitives         : %i", ctx.pruned);
		Log(EDebug, "   Avg. prims/nonempty leaf  : %.2f",
				ctx.primIndexCount / (Float) ctx.nonemptyLeafNodeCount);
		Log(EDebug, "   Expected traversals/ray   : %.2f", expTraversalSteps);
		Log(EDebug, "   Expected leaf visits/ray  : %.2f", expLeavesVisited);
		Log(EDebug, "   Expected prim. visits/ray : %.2f", expPrimitivesIntersected);
		Log(EDebug, "   Final SAH cost            : %.2f", sahCost);
		Log(EDebug, "");


	}
protected:
	/// Primitive classification during tree-construction
	enum EClassificationResult {
		///< Straddling primitive
		EBothSides = 0,
		///< Primitive is entirely on the left side of the split
		ELeftSide = 1,
		///< Primitive is entirely on the right side of the split
		ERightSide = 2,
		//< Edge events have been generated for the straddling primitive
		EBothSidesProcessed = 3
	};

	/**
	 * \brief Describes the beginning or end of a primitive
	 * when projected onto a certain dimension.
	 */
	struct EdgeEvent {
		/// Possible event types
		enum EEventType {
			EEdgeEnd = 0,
			EEdgePlanar = 1,
			EEdgeStart = 2
		};

		/// Dummy constructor
		inline EdgeEvent() { }

		/// Create a new edge event
		inline EdgeEvent(uint16_t type, int axis, float pos, index_type index)
		 : pos(pos), index(index), type(type), axis(axis) { }

		/// Return a string representation
		std::string toString() const {
			std::ostringstream oss;
			oss << "EdgeEvent[" << endl
				<< "  pos = " << pos << "," << endl
				<< "  index = " << index << "," << endl
				<< "  type = ";
			if (type == EEdgeEnd)
				oss << "end";
			else if (type == EEdgePlanar)
				oss << "planar";
			else if (type == EEdgeStart)
				oss << "start";
			else
				oss << "unknown!";
			oss << "," << endl
				<< "  axis = " << axis << endl
				<<"]";
			return oss.str();
		}

		/// Plane position
		float pos;
		/// Primitive index
		index_type index;
		/// Event type: end/planar/start
		unsigned int type:2;
		/// Event axis
		unsigned int axis:2;
	};

	BOOST_STATIC_ASSERT(sizeof(EdgeEvent) == 12);

	/// Edge event comparison functor
	struct EdgeEventOrdering : public std::binary_function<EdgeEvent, EdgeEvent, bool> {
		inline bool operator()(const EdgeEvent &a, const EdgeEvent &b) const {
			if (a.axis != b.axis)
				return a.axis < b.axis;
			if (a.pos != b.pos)
				return a.pos < b.pos;
			return a.type < b.type;
		}
	};

	/**
	 * \brief Data type for split candidates computed by
	 * the SAH optimization routines.
	 * */
	struct SplitCandidate {
		Float sahCost;
		float pos;
		int axis;
		size_type numLeft, numRight;
		bool planarLeft;

		inline SplitCandidate() :
			sahCost(std::numeric_limits<Float>::infinity()),
			pos(0), axis(0), numLeft(0), numRight(0), planarLeft(false) {
		}

		std::string toString() const {
			std::ostringstream oss;
			oss << "SplitCandidate[" << endl
				<< "  sahCost=" << sahCost << "," << endl
				<< "  pos=" << pos << "," << endl
				<< "  axis=" << axis << "," << endl
				<< "  numLeft=" << numLeft << "," << endl
				<< "  numRight=" << numRight << "," << endl
				<< "  planarLeft=" << (planarLeft ? "yes" : "no") << endl
				<< "]";
			return oss.str();
		}
	};

	/**
	 * \brief Per-thread context used to manage memory allocations,
	 * also records some useful statistics.
	 */
	struct BuildContext {
		OrderedChunkAllocator leftAlloc, rightAlloc;
		BlockedVector<KDNode, MTS_KD_BLOCKSIZE_KD> nodes;
		BlockedVector<index_type, MTS_KD_BLOCKSIZE_IDX> indices;
		ClassificationStorage classStorage;

		size_type leafNodeCount;
		size_type nonemptyLeafNodeCount;
		size_type innerNodeCount;
		size_type primIndexCount;
		size_type retractedSplits;
		size_type pruned;

		BuildContext(size_type primCount) : classStorage(primCount) {
			classStorage.setPrimitiveCount(primCount);
			leafNodeCount = 0;
			nonemptyLeafNodeCount = 0;
			innerNodeCount = 0;
			primIndexCount = 0;
			retractedSplits = 0;
			pruned = 0;
		}

		size_t size() {
			return leftAlloc.size() + rightAlloc.size()
				+ nodes.capacity() * sizeof(KDNode)
				+ indices.capacity() * sizeof(index_type)
				+ classStorage.size();
		}

		void printStats() {
			Log(EDebug, "      Left events   : " SIZE_T_FMT " chunks (%s)",
					leftAlloc.getChunkCount(), memString(leftAlloc.size()).c_str());
			Log(EDebug, "      Right events  : " SIZE_T_FMT " chunks (%s)",
					rightAlloc.getChunkCount(), memString(rightAlloc.size()).c_str());
			Log(EDebug, "      kd-tree nodes : " SIZE_T_FMT " entries, " SIZE_T_FMT " blocks (%s)",
					nodes.size(), nodes.blockCount(), memString(nodes.capacity() * sizeof(KDNode)).c_str());
			Log(EDebug, "      Indices       : " SIZE_T_FMT " entries, " SIZE_T_FMT " blocks (%s)",
					indices.size(), indices.blockCount(), memString(indices.capacity() * sizeof(index_type)).c_str());
		}

		void accumulateStatisticsFrom(const BuildContext &ctx) {
			leafNodeCount += ctx.leafNodeCount;
			nonemptyLeafNodeCount += ctx.nonemptyLeafNodeCount;
			innerNodeCount += ctx.innerNodeCount;
			primIndexCount += ctx.primIndexCount;
			retractedSplits += ctx.retractedSplits;
			pruned += ctx.pruned;
		}
	};

	/**
	 * \brief Communication data structure used to pass jobs to
	 * SAH kd-tree builder threads
	 */
	struct BuildInterface {
		/* Communcation */
		ref<Mutex> mutex;
		ref<ConditionVariable> cond, condJobTaken;
		std::map<const KDNode *, index_type> threadMap;
		bool done;

		/* Job description for building a subtree */
		int depth;
		KDNode *node;
		AABB nodeAABB;
		EdgeEvent *eventStart, *eventEnd;
		size_type primCount;
		int badRefines;

		inline BuildInterface() {
			mutex = new Mutex();
			cond = new ConditionVariable(mutex);
			condJobTaken = new ConditionVariable(mutex);
			node = NULL;
			done = false;
		}
	};

	/**
	 * \brief KD-tree node in 8 bytes.
	 */
	struct KDNode {
		union {
			/* Inner node */
			struct {
				/* Bit layout:
				   31   : False (inner node)
				   30   : Indirection node flag
				   29-3 : Offset to the left child
				   2-0  : Split axis
				*/
				uint32_t combined;

				/// Split plane coordinate
				float split;
			} inner;

			/* Leaf node */
			struct {
				/* Bit layout:
				   31   : True (leaf node)
				   30-0 : Offset to the node's primitive list
				*/
				uint32_t combined;

				/// End offset of the primitive list
				uint32_t end;
			} leaf;
		};

		enum EMask {
			ETypeMask = 1 << 31,
			EIndirectionMask = 1 << 30,
			ELeafOffsetMask = ~ETypeMask,
			EInnerAxisMask = 0x3,
			EInnerOffsetMask = ~(EInnerAxisMask + EIndirectionMask),
			ERelOffsetLimit = (1<<28) - 1
		};

		/// Initialize a leaf kd-Tree node
		inline void initLeafNode(unsigned int offset, unsigned int numPrims) {
			leaf.combined = ETypeMask | offset;
			leaf.end = offset + numPrims;
		}

		/**
		 * Initialize an interior kd-Tree node. Reports a failure if the
		 * relative offset to the left child node is too large.
		 */
		inline bool initInnerNode(int axis, float split, ptrdiff_t relOffset) {
			if (relOffset < 0 || relOffset > ERelOffsetLimit)
				return false;
			inner.combined = axis | ((uint32_t) relOffset << 2);
			inner.split = split;
			return true;
		}

		/**
		 * \brief Initialize an interior indirection node.
		 *
		 * Indirections are necessary whenever the children cannot be
		 * referenced using a relative pointer, which can happen when
		 * they lie in different memory chunks. In this case, the node
		 * stores an index into a globally shared pointer list.
		 */
		inline void initIndirectionNode(int axis, float split, uint32_t indirectionEntry) {
			inner.combined = EIndirectionMask | axis | ((uint32_t) indirectionEntry << 2);
			inner.split = split;
		}

		/// Is this a leaf node?
		FINLINE bool isLeaf() const {
			return leaf.combined & ETypeMask;
		}

		/// Is this an indirection node?
		FINLINE bool isIndirection() const {
			return leaf.combined & EIndirectionMask;
		}

		/// Assuming this is a leaf node, return the first primitive index
		FINLINE index_type getPrimStart() const {
			return leaf.combined & ELeafOffsetMask;
		}

		/// Assuming this is a leaf node, return the last primitive index
		FINLINE index_type getPrimEnd() const {
			return leaf.end;
		}

		/// Return the index of an indirection node
		FINLINE index_type getIndirectionIndex() const {
			return(inner.combined & EInnerOffsetMask) >> 2;
		}

		/// Return the left child (assuming that this is an interior node)
		FINLINE const KDNode * __restrict getLeft() const {
			return this +
				((inner.combined & EInnerOffsetMask) >> 2);
		}

		/// Return the left child (assuming that this is an interior node)
		FINLINE KDNode * __restrict getLeft() {
			return this +
				((inner.combined & EInnerOffsetMask) >> 2);
		}

		/// Return the left child (assuming that this is an interior node)
		FINLINE const KDNode * __restrict getRight() const {
			return getLeft() + 1;
		}

		/// Return the split plane location (assuming that this is an interior node)
		FINLINE float getSplit() const {
			return inner.split;
		}

		/// Return the split axis (assuming that this is an interior node)
		FINLINE int getAxis() const {
			return inner.combined & EInnerAxisMask;
		}
	};

	BOOST_STATIC_ASSERT(sizeof(KDNode) == 8);

	/**
	 * \brief SAH kd-tree builder thread
	 */
	class SAHTreeBuilder : public Thread {
	public:
		SAHTreeBuilder(index_type id, GenericKDTree *parent)
			: Thread(formatString("bld%i", id)),
			m_id(id),
			m_parent(parent),
			m_context(parent->cast()->getPrimitiveCount()),
			m_interface(parent->m_interface) {
			setCritical(true);
		}

		~SAHTreeBuilder() {
			KDAssert(m_context.leftAlloc.used() == 0);
			KDAssert(m_context.rightAlloc.used() == 0);
		}

		void run() {
			OrderedChunkAllocator &leftAlloc = m_context.leftAlloc;
			while (true) {
				m_interface.mutex->lock();
				while (!m_interface.done && !m_interface.node)
					m_interface.cond->wait();
				if (m_interface.done) {
					m_interface.mutex->unlock();
					break;
				}
				int depth = m_interface.depth;
				KDNode *node = m_interface.node;
				AABB nodeAABB = m_interface.nodeAABB;
				size_t eventCount = m_interface.eventEnd - m_interface.eventStart;
				size_type primCount = m_interface.primCount;
				int badRefines = m_interface.badRefines;
				EdgeEvent *eventStart = leftAlloc.allocate<EdgeEvent>(eventCount),
						  *eventEnd = eventStart + eventCount;
				memcpy(eventStart, m_interface.eventStart, eventCount * sizeof(EdgeEvent));
				m_interface.threadMap[node] = m_id;
				m_interface.node = NULL;
				m_interface.condJobTaken->signal();
				m_interface.mutex->unlock();

				std::sort(eventStart, eventEnd, EdgeEventOrdering());
				m_parent->buildTreeSAH(m_context, depth, node,
					nodeAABB, eventStart, eventEnd, primCount, true, badRefines);
				leftAlloc.release(eventStart);
			}
		}

		inline BuildContext &getContext() {
			return m_context;
		}

	private:
		index_type m_id;
		GenericKDTree *m_parent;
		BuildContext m_context;
		BuildInterface &m_interface;
	};

	/// Cast to the derived class
	inline Derived *cast() {
		return static_cast<Derived *>(this);
	}

	/// Cast to the derived class (const version)
	inline const Derived *cast() const {
		return static_cast<Derived *>(this);
	}

	/**
	 * \brief Create an edge event list for a given list of primitives.
	 *
	 * This is necessary when passing from Min-Max binning to the more
	 * accurate SAH-based optimizier.
	 */
	boost::tuple<EdgeEvent *, EdgeEvent *, size_type> createEventList(
			OrderedChunkAllocator &alloc, const AABB &nodeAABB, index_type *prims, size_type primCount) {
		size_type initialSize = primCount * 6, actualPrimCount = 0;
		EdgeEvent *eventStart = alloc.allocate<EdgeEvent>(initialSize);
		EdgeEvent *eventEnd = eventStart;

		for (size_type i=0; i<primCount; ++i) {
			index_type index = prims[i];
			AABB aabb;
			if (m_clip) {
				aabb = cast()->getClippedAABB(index, nodeAABB);
				if (!aabb.isValid() || aabb.getSurfaceArea() == 0)
					continue;
			} else {
				aabb = cast()->getAABB(index);
			}

			for (int axis=0; axis<3; ++axis) {
				float min = (float) aabb.min[axis], max = (float) aabb.max[axis];

				if (min == max) {
					*eventEnd++ = EdgeEvent(EdgeEvent::EEdgePlanar, axis, min, index);
				} else {
					*eventEnd++ = EdgeEvent(EdgeEvent::EEdgeStart, axis, min, index);
					*eventEnd++ = EdgeEvent(EdgeEvent::EEdgeEnd, axis, max, index);
				}
			}
			++actualPrimCount;
		}

		size_type newSize = eventEnd - eventStart;
		if (newSize != initialSize)
			alloc.shrinkAllocation<EdgeEvent>(eventStart, newSize);

		return boost::make_tuple(eventStart, eventEnd, actualPrimCount);
	}

	/**
	 * \brief Leaf node creation helper function
	 *
	 * \param ctx
	 *     Thread-specific build context containing allocators etc.
	 * \param node
	 *     KD-tree node entry to be filled
	 * \param eventStart
	 *     Start pointer of an edge event list
	 * \param eventEnd
	 *     End pointer of an edge event list
	 * \param primCount
	 *     Total primitive count for the current node
	 */
	void createLeaf(BuildContext &ctx, KDNode *node, EdgeEvent *eventStart,
			EdgeEvent *eventEnd, size_type primCount) {
		node->initLeafNode(ctx.indices.size(), primCount);
		if (primCount > 0) {
			size_type seenPrims = 0;
			ctx.nonemptyLeafNodeCount++;
			for (EdgeEvent *event = eventStart; event != eventEnd
					&& event->axis == 0; ++event) {
				if (event->type == EdgeEvent::EEdgeStart ||
					event->type == EdgeEvent::EEdgePlanar) {
					ctx.indices.push_back(event->index);
					seenPrims++;
				}
			}
			KDAssert(seenPrims == primCount);
			ctx.primIndexCount += primCount;
		}
		ctx.leafNodeCount++;
	}

	/**
	 * \brief Leaf node creation helper function
	 *
	 * \param ctx
	 *     Thread-specific build context containing allocators etc.
	 * \param node
	 *     KD-tree node entry to be filled
	 * \param indices
	 *     Start pointer of an index list
	 * \param primCount
	 *     Total primitive count for the current node
	 */
	void createLeaf(BuildContext &ctx, KDNode *node, size_type *indices,
			size_type primCount) {
		node->initLeafNode(ctx.indices.size(), primCount);
		if (primCount > 0) {
			ctx.nonemptyLeafNodeCount++;
			for (size_type i=0; i<primCount; ++i)
				ctx.indices.push_back(indices[i]);
			ctx.primIndexCount += primCount;
		}
		ctx.leafNodeCount++;
	}

	/**
	 * \brief Leaf node creation helper function.
	 *
	 * Creates a unique index list by collapsing
	 * a subtree with a bad SAH cost.
	 *
	 * \param ctx
	 *     Thread-specific build context containing allocators etc.
	 * \param node
	 *     KD-tree node entry to be filled
	 * \param start
	 *     Start pointer of the subtree indices
	 * \param primCount
	 *     Total primitive count for the current node
	 */
	void createLeafAfterRetraction(BuildContext &ctx, KDNode *node,
			size_type start, size_type primCount) {
		node->initLeafNode(start, primCount);
		size_t actualCount = ctx.indices.size() - start;

		if (primCount > 0)
			ctx.nonemptyLeafNodeCount++;

		if (actualCount != primCount) {
			KDAssert(primCount > 0);
			OrderedChunkAllocator &alloc = ctx.leftAlloc;

			/* A temporary list is allocated to do the sorting (the indices
			   are not guaranteed to be contiguous in memory) */
			index_type *tempStart = alloc.allocate<index_type>(actualCount);
			index_type *tempEnd = tempStart, *ptr = tempStart;

			for (size_type i=start, end = start + actualCount; i<end; ++i)
				*tempEnd++ = ctx.indices[i];

			std::sort(tempStart, tempEnd, std::less<index_type>());

			for (size_type i=start, end = start + primCount; i<end; ++i) {
				ctx.indices[i] = *ptr++;
				while (ptr < tempEnd && *ptr == ctx.indices[i])
					++ptr;
			}

			ctx.indices.resize(start + primCount);
			ctx.primIndexCount = ctx.primIndexCount - actualCount + primCount;
			alloc.release(tempStart);
		}

		ctx.leafNodeCount++;
	}

	/**
	 * \brief Build helper function (min-max binning)
	 *
	 * \param ctx
	 *     Thread-specific build context containing allocators etc.
	 * \param depth
	 *     Current tree depth (1 == root node)
	 * \param node
	 *     KD-tree node entry to be filled
	 * \param nodeAABB
	 *     Axis-aligned bounding box of the current node
	 * \param tightAABB
	 *     Tight bounding box of the contained geometry (for min-max binning)
	 * \param indices
	 *     Index list of all triangles in the current node (for min-max binning)
	 * \param primCount
	 *     Total primitive count for the current node
	 * \param isLeftChild
	 *     Is this node the left child of its parent? This is important for
	 *     memory management using the \ref OrderedChunkAllocator.
	 * \param badRefines
	 *     Number of "probable bad refines" further up the tree. This makes
	 *     it possible to split along an initially bad-looking candidate in
	 *     the hope that the SAH cost was significantly overestimated. The
	 *     counter makes sure that only a limited number of such splits can
	 *     happen in succession.
	 * \returns
	 *     Final SAH cost of the node
	 */
	Float buildTreeMinMax(BuildContext &ctx, unsigned int depth, KDNode *node,
			const AABB &nodeAABB, const AABB &tightAABB, index_type *indices,
			size_type primCount, bool isLeftChild, size_type badRefines) {
		KDAssert(nodeAABB.contains(tightAABB));

		Float leafCost = primCount * m_intersectionCost;
		if (primCount <= m_stopPrims || depth >= m_maxDepth) {
			createLeaf(ctx, node, indices, primCount);
			return leafCost;
		}

		if (primCount <= m_exactPrimThreshold) {
			OrderedChunkAllocator &alloc = isLeftChild ? ctx.leftAlloc : ctx.rightAlloc;
			boost::tuple<EdgeEvent *, EdgeEvent *, size_type> events
					= createEventList(alloc, nodeAABB, indices, primCount);
			Float sahCost;
			if (m_parallel) {
				m_interface.mutex->lock();
				m_interface.depth = depth;
				m_interface.node = node;
				m_interface.nodeAABB = nodeAABB;
				m_interface.eventStart = boost::get<0>(events);
				m_interface.eventEnd = boost::get<1>(events);
				m_interface.primCount = boost::get<2>(events);
				badRefines = badRefines;
				m_interface.cond->signal();

				/* Wait for a worker thread to take this job */
				while (m_interface.node)
					m_interface.condJobTaken->wait();
				m_interface.mutex->unlock();


				// Never tear down this subtree (return a SAH cost of -infinity)
				sahCost = -std::numeric_limits<Float>::infinity();
			} else {
				std::sort(boost::get<0>(events), boost::get<1>(events), EdgeEventOrdering());

				sahCost = buildTreeSAH(ctx, depth, node, nodeAABB,
					boost::get<0>(events), boost::get<1>(events), boost::get<2>(events),
					isLeftChild, badRefines);
			}
			alloc.release(boost::get<0>(events));
			return sahCost;
		}

		/* ==================================================================== */
	    /*                              Binning                                 */
	    /* ==================================================================== */

		MinMaxBins<MTS_KD_MINMAX_BINS> bins(tightAABB);
		bins.bin(cast(), indices, primCount);

		/* ==================================================================== */
	    /*                        Split candidate search                        */
    	/* ==================================================================== */
		SplitCandidate bestSplit = bins.maximizeSAH(m_traversalCost,
			m_intersectionCost);

		/* "Bad refines" heuristic from PBRT */
		if (bestSplit.sahCost >= leafCost) {
			if ((bestSplit.sahCost > 4 * leafCost && primCount < 16)
				|| bestSplit.sahCost == std::numeric_limits<Float>::infinity()
				|| badRefines >= m_maxBadRefines) {
				createLeaf(ctx, node, indices, primCount);
				return leafCost;
			}
			++badRefines;
		}

		/* ==================================================================== */
	    /*                            Partitioning                              */
	    /* ==================================================================== */

		boost::tuple<AABB, index_type *, AABB, index_type *> partition =
			bins.partition(ctx, cast(), indices, bestSplit, isLeftChild,
			m_traversalCost, m_intersectionCost);

		/* ==================================================================== */
	    /*                              Recursion                               */
	    /* ==================================================================== */

		KDNode *children = ctx.nodes.allocate(2);

		size_type nodePosBeforeSplit = ctx.nodes.size();
		size_type indexPosBeforeSplit = ctx.indices.size();
		size_type leafNodeCountBeforeSplit = ctx.leafNodeCount;
		size_type nonemptyLeafNodeCountBeforeSplit = ctx.nonemptyLeafNodeCount;
		size_type innerNodeCountBeforeSplit = ctx.innerNodeCount;

		if (!node->initInnerNode(bestSplit.axis, bestSplit.pos, children-node)) {
			m_indirectionLock->lock();
			size_t indirectionIdx = m_indirections.size();
			m_indirections.push_back(children);
			/* Unable to store relative offset -- create an indirection
			   table entry */
			node->initIndirectionNode(bestSplit.axis, bestSplit.pos, indirectionIdx);
			m_indirectionLock->unlock();
		}
		ctx.innerNodeCount++;

		AABB childAABB(nodeAABB);
		childAABB.max[bestSplit.axis] = bestSplit.pos;
		Float saLeft = childAABB.getSurfaceArea();

		Float leftSAHCost = buildTreeMinMax(ctx, depth+1, children,
				childAABB, boost::get<0>(partition), boost::get<1>(partition),
				bestSplit.numLeft, true, badRefines);

		childAABB.min[bestSplit.axis] = bestSplit.pos;
		childAABB.max[bestSplit.axis] = nodeAABB.max[bestSplit.axis];
		Float saRight = childAABB.getSurfaceArea();

		Float rightSAHCost = buildTreeMinMax(ctx, depth+1, children + 1,
				childAABB, boost::get<2>(partition), boost::get<3>(partition),
				bestSplit.numRight, false, badRefines);

		/* Compute the final SAH cost given the updated cost
		   values received from the children */
		Float finalSAHCost = m_traversalCost +
			(saLeft * leftSAHCost + saRight * rightSAHCost)
			/ nodeAABB.getSurfaceArea();

		/* Release the index lists not needed by the children anymore */
		if (isLeftChild)
			ctx.rightAlloc.release(boost::get<3>(partition));
		else
			ctx.leftAlloc.release(boost::get<1>(partition));

		/* ==================================================================== */
	    /*                           Final decision                             */
	    /* ==================================================================== */

		if (!m_retract || finalSAHCost < primCount * m_intersectionCost) {
			return finalSAHCost;
		} else {
			/* In the end, splitting didn't help to reduce the SAH cost.
			   Tear up everything below this node and create a leaf */
			ctx.nodes.resize(nodePosBeforeSplit);
			ctx.retractedSplits++;
			ctx.leafNodeCount = leafNodeCountBeforeSplit;
			ctx.nonemptyLeafNodeCount = nonemptyLeafNodeCountBeforeSplit;
			ctx.innerNodeCount = innerNodeCountBeforeSplit;
			createLeafAfterRetraction(ctx, node, indexPosBeforeSplit, primCount);
			return leafCost;
		}
	}

	/*
	 * \brief Build helper function (greedy SAH-based optimization)
	 *
	 * \param ctx
	 *     Thread-specific build context containing allocators etc.
	 * \param depth
	 *     Current tree depth (1 == root node)
	 * \param node
	 *     KD-tree node entry to be filled
	 * \param nodeAABB
	 *     Axis-aligned bounding box of the current node
	 * \param eventStart
	 *     Pointer to the beginning of a sorted edge event list
	 * \param eventEnd
	 *     Pointer to the end of a sorted edge event list
	 * \param primCount
	 *     Total primitive count for the current node
	 * \param isLeftChild
	 *     Is this node the left child of its parent? This is important for
	 *     memory management using the \ref OrderedChunkAllocator.
	 * \param badRefines
	 *     Number of "probable bad refines" further up the tree. This makes
	 *     it possible to split along an initially bad-looking candidate in
	 *     the hope that the SAH cost was significantly overestimated. The
	 *     counter makes sure that only a limited number of such splits can
	 *     happen in succession.
	 * \returns
	 *     Final SAH cost of the node
	 */
	Float buildTreeSAH(BuildContext &ctx, unsigned int depth, KDNode *node,
		const AABB &nodeAABB, EdgeEvent *eventStart, EdgeEvent *eventEnd,
		size_type primCount, bool isLeftChild, size_type badRefines) {

		Float leafCost = primCount * m_intersectionCost;
		if (primCount <= m_stopPrims || depth >= m_maxDepth) {
			createLeaf(ctx, node, eventStart, eventEnd, primCount);
			return leafCost;
		}

		SplitCandidate bestSplit;

		/* ==================================================================== */
	    /*                        Split candidate search                        */
    	/* ==================================================================== */

		/* First, find the optimal splitting plane according to the
		   surface area heuristic. To do this in O(n), the search is
		   implemented as a sweep over the edge events */

		/* Initially, the split plane is placed left of the scene
		   and thus all geometry is on its right side */
		size_type numLeft[3], numRight[3];
		for (int i=0; i<3; ++i) {
			numLeft[i] = 0;
			numRight[i] = primCount;
		}
		EdgeEvent *eventsByAxis[3];
		int eventsByAxisCtr = 1;
		eventsByAxis[0] = eventStart;

		const Vector extents(nodeAABB.getExtents());
		const Float invSA = 0.5f / (extents.x * extents.y
				+ extents.y*extents.z + extents.x*extents.z);
		const Vector temp0 = Vector(
			(extents[1] * extents[2]),
			(extents[0] * extents[2]),
			(extents[0] * extents[1])) * 2 * invSA;

		const Vector temp1 = Vector(
			(extents[1] + extents[2]),
			(extents[0] + extents[2]),
			(extents[0] + extents[1])) * 2 * invSA;

		/* Iterate over all events on the current axis */
		for (EdgeEvent *event = eventStart; event < eventEnd; ) {
			/* Record the current position and count the number
			   and type of remaining events, which are also here.
			   Due to the sort ordering, there is no need to worry
			   about an axis change in the loops below */
			int axis = event->axis;
			float pos = event->pos;
			size_type numStart = 0, numEnd = 0, numPlanar = 0;

			/* Count "end" events */
			while (event < eventEnd && event->pos == pos
				&& event->type == EdgeEvent::EEdgeEnd) {
				++numEnd; ++event;
			}

			/* Count "planar" events */
			while (event < eventEnd && event->pos == pos
				&& event->type == EdgeEvent::EEdgePlanar) {
				++numPlanar; ++event;
			}

			/* Count "start" events */
			while (event < eventEnd && event->pos == pos
				&& event->type == EdgeEvent::EEdgeStart) {
				++numStart; ++event;
			}

			/* Keep track of the beginning of dimensions */
			if (event < eventEnd && event->axis != axis) {
				KDAssert(eventsByAxisCtr < 3);
				eventsByAxis[eventsByAxisCtr++] = event;
			}

			/* The split plane can now be moved onto 't'. Accordingly, all planar
			   and ending primitives are removed from the right side */
			numRight[axis] -= numPlanar + numEnd;

			/* Calculate a score using the surface area heuristic */
			if (EXPECT_TAKEN(pos >= nodeAABB.min[axis] && pos <= nodeAABB.max[axis])) {
				const size_type nL = numLeft[axis], nR = numRight[axis];
				const Float nLF = (Float) nL, nRF = (Float) nR;

				Float pLeft  = temp0[axis] + temp1[axis] * (pos - nodeAABB.min[axis]);
				Float pRight = temp0[axis] + temp1[axis] * (nodeAABB.max[axis] - pos);

				if (numPlanar == 0) {
					Float sahCost = m_intersectionCost + m_traversalCost
						* (pLeft * nLF + pRight * nRF);
					if (nL == 0 || nR == 0)
						sahCost *= m_emptySpaceBonus;
					if (sahCost < bestSplit.sahCost) {
						bestSplit.pos = pos;
						bestSplit.axis = axis;
						bestSplit.sahCost = sahCost;
						bestSplit.numLeft = nL;
						bestSplit.numRight = nR;
					}
				} else {
					Float sahCostPlanarLeft  = m_intersectionCost + m_traversalCost
						* (pLeft * (nL+numPlanar) + pRight * nRF);
					Float sahCostPlanarRight = m_intersectionCost + m_traversalCost
						* (pLeft * nLF + pRight * (nR+numPlanar));
					if (nL + numPlanar == 0 || nR == 0)
						sahCostPlanarLeft *= m_emptySpaceBonus;
					if (nL == 0 || nR + numPlanar == 0)
						sahCostPlanarRight *= m_emptySpaceBonus;
					if (sahCostPlanarLeft < bestSplit.sahCost || sahCostPlanarRight < bestSplit.sahCost) {
						bestSplit.pos = pos;
						bestSplit.axis = axis;
						if (sahCostPlanarLeft < sahCostPlanarRight) {
							bestSplit.sahCost = sahCostPlanarLeft;
							bestSplit.numLeft = nL + numPlanar;
							bestSplit.numRight = nR;
							bestSplit.planarLeft = true;
						} else {
							bestSplit.sahCost = sahCostPlanarRight;
							bestSplit.numLeft = nL;
							bestSplit.numRight = nR + numPlanar;
							bestSplit.planarLeft = false;
						}
					}
				}
			} else {
				/* When primitive clipping is active, this should
					never happen! */
				KDAssertEx(!m_clip, "Internal error: edge event is out of bounds");
			}

			/* The split plane is moved past 't'. All prims,
				which were planar on 't', are moved to the left
				side. Also, starting prims are now also left of
				the split plane. */
			numLeft[axis] += numStart + numPlanar;
		}

		/* Sanity checks. Everything should now be left of the split plane */
		KDAssert(numRight[0] == 0 && numLeft[0] == primCount &&
			   numRight[1] == 0 && numLeft[1] == primCount &&
			   numRight[2] == 0 && numLeft[2] == primCount);

		KDAssert(eventsByAxis[1]->axis == 1 && (eventsByAxis[1]-1)->axis == 0);
		KDAssert(eventsByAxis[2]->axis == 2 && (eventsByAxis[2]-1)->axis == 1);

		/* "Bad refines" heuristic from PBRT */
		if (bestSplit.sahCost >= leafCost) {
			if ((bestSplit.sahCost > 4 * leafCost && primCount < 16)
				|| badRefines >= m_maxBadRefines
				|| bestSplit.sahCost == std::numeric_limits<Float>::infinity()) {
				createLeaf(ctx, node, eventStart, eventEnd, primCount);
				return leafCost;
			}
			++badRefines;
		}

		/* ==================================================================== */
		/*                      Primitive Classification                        */
		/* ==================================================================== */

		ClassificationStorage &storage = ctx.classStorage;

		/* Initially mark all prims as being located on both sides */
		for (EdgeEvent *event = eventsByAxis[bestSplit.axis];
			 event < eventEnd && event->axis == bestSplit.axis; ++event)
			storage.set(event->index, EBothSides);

		size_type primsLeft = 0, primsRight = 0, primsBoth = primCount;
		/* Sweep over all edge events and classify the primitives wrt. the split */
		for (EdgeEvent *event = eventsByAxis[bestSplit.axis];
			 event < eventEnd && event->axis == bestSplit.axis; ++event) {
			if (event->type == EdgeEvent::EEdgeEnd && event->pos <= bestSplit.pos) {
				/* The primitive's interval ends before or on the split plane
				   -> classify to the left side */
				KDAssert(storage.get(event->index) == EBothSides);
				storage.set(event->index, ELeftSide);
				primsBoth--;
				primsLeft++;
			} else if (event->type == EdgeEvent::EEdgeStart
					&& event->pos >= bestSplit.pos) {
				/* The primitive's interval starts after or on the split plane
				   -> classify to the right side */
				KDAssert(storage.get(event->index) == EBothSides);
				storage.set(event->index, ERightSide);
				primsBoth--;
				primsRight++;
			} else if (event->type == EdgeEvent::EEdgePlanar) {
				/* If the planar primitive is not on the split plane, the
				   classification is easy. Otherwise, place it on the side with
				   the better SAH score */
				KDAssert(storage.get(event->index) == EBothSides);
				if (event->pos < bestSplit.pos || (event->pos == bestSplit.pos
						&& bestSplit.planarLeft)) {
					storage.set(event->index, ELeftSide);
					primsBoth--;
					primsLeft++;
				} else if (event->pos > bestSplit.pos || (event->pos == bestSplit.pos &&
					!bestSplit.planarLeft)) {
					storage.set(event->index, ERightSide);
					primsBoth--;
					primsRight++;
				} else {
					KDAssertEx(false, "Internal error!");
				}
			}
		}

		/* Some sanity checks */
		KDAssert(primsLeft + primsRight + primsBoth == primCount);
		KDAssert(primsLeft + primsBoth == bestSplit.numLeft);
		KDAssert(primsRight + primsBoth == bestSplit.numRight);

		OrderedChunkAllocator &leftAlloc = ctx.leftAlloc,
			&rightAlloc = ctx.rightAlloc;

		EdgeEvent *leftEventsStart, *rightEventsStart;
		if (isLeftChild) {
			leftEventsStart = eventStart;
			rightEventsStart = rightAlloc.allocate<EdgeEvent>(bestSplit.numRight * 6);
		} else {
			leftEventsStart = leftAlloc.allocate<EdgeEvent>(bestSplit.numLeft * 6);
			rightEventsStart = eventStart;
		}

		EdgeEvent *leftEventsEnd = leftEventsStart, *rightEventsEnd = rightEventsStart;

		AABB leftNodeAABB = nodeAABB, rightNodeAABB = nodeAABB;
		leftNodeAABB.max[bestSplit.axis] = bestSplit.pos;
		rightNodeAABB.min[bestSplit.axis] = bestSplit.pos;

		size_type prunedLeft = 0, prunedRight = 0;

		/* ==================================================================== */
		/*                            Partitioning                              */
		/* ==================================================================== */

		if (m_clip) {
			EdgeEvent *leftEventsTempStart = leftAlloc.allocate<EdgeEvent>(primsLeft * 6),
					  *rightEventsTempStart = rightAlloc.allocate<EdgeEvent>(primsRight * 6),
					  *newEventsLeftStart = leftAlloc.allocate<EdgeEvent>(primsBoth * 6),
					  *newEventsRightStart = rightAlloc.allocate<EdgeEvent>(primsBoth * 6);

			EdgeEvent *leftEventsTempEnd = leftEventsTempStart,
					*rightEventsTempEnd = rightEventsTempStart,
					*newEventsLeftEnd = newEventsLeftStart,
					*newEventsRightEnd = newEventsRightStart;

			for (EdgeEvent *event = eventStart; event<eventEnd; ++event) {
				uint8_t classification = storage.get(event->index);
				if (classification == ELeftSide) {
					/* Left-only primitive. Move to the left list and advance */
					*leftEventsTempEnd++ = *event;
				} else if (classification == ERightSide) {
					/* Right-only primitive. Move to the right list and advance */
					*rightEventsTempEnd++ = *event;
				} else if (classification == EBothSides) {
					/* The primitive overlaps the split plane. Re-clip and
					   generate new events for each side */
					const index_type index = event->index;

					AABB clippedLeft = cast()->getClippedAABB(index, leftNodeAABB);
					AABB clippedRight = cast()->getClippedAABB(index, rightNodeAABB);

					KDAssert(leftNodeAABB.contains(clippedLeft));
					KDAssert(rightNodeAABB.contains(clippedRight));

					if (clippedLeft.isValid() && clippedLeft.getSurfaceArea() > 0) {
						for (int axis=0; axis<3; ++axis) {
							float min = (float) clippedLeft.min[axis], max = (float) clippedLeft.max[axis];

							if (min == max) {
								*newEventsLeftEnd++ = EdgeEvent(EdgeEvent::EEdgePlanar, axis, min, index);
							} else {
								*newEventsLeftEnd++ = EdgeEvent(EdgeEvent::EEdgeStart, axis, min, index);
								*newEventsLeftEnd++ = EdgeEvent(EdgeEvent::EEdgeEnd, axis, max, index);
							}
						}
					} else {
						prunedLeft++;
					}

					if (clippedRight.isValid() && clippedRight.getSurfaceArea() > 0) {
						for (int axis=0; axis<3; ++axis) {
							float min = (float) clippedRight.min[axis], max = (float) clippedRight.max[axis];

							if (min == max) {
								*newEventsRightEnd++ = EdgeEvent(EdgeEvent::EEdgePlanar, axis, min, index);
							} else {
								*newEventsRightEnd++ = EdgeEvent(EdgeEvent::EEdgeStart, axis, min, index);
								*newEventsRightEnd++ = EdgeEvent(EdgeEvent::EEdgeEnd, axis, max, index);
							}
						}
					} else {
						prunedRight++;
					}

					/* Mark this primitive as processed so that clipping
						is only done once */
					storage.set(index, EBothSidesProcessed);
				}
			}

			KDAssert(leftEventsTempEnd - leftEventsTempStart <= primsLeft * 6);
			KDAssert(rightEventsTempEnd - rightEventsTempStart <= primsRight * 6);
			KDAssert(newEventsLeftEnd - newEventsLeftStart <= primsBoth * 6);
			KDAssert(newEventsRightEnd - newEventsRightStart <= primsBoth * 6);
			ctx.pruned += prunedLeft + prunedRight;

			/* Sort the events from overlapping prims */
			std::sort(newEventsLeftStart, newEventsLeftEnd, EdgeEventOrdering());
			std::sort(newEventsRightStart, newEventsRightEnd, EdgeEventOrdering());

			/* Merge the left list */
			leftEventsEnd = std::merge(leftEventsTempStart,
				leftEventsTempEnd, newEventsLeftStart, newEventsLeftEnd,
				leftEventsStart, EdgeEventOrdering());

			/* Merge the right list */
			rightEventsEnd = std::merge(rightEventsTempStart,
				rightEventsTempEnd, newEventsRightStart, newEventsRightEnd,
				rightEventsStart, EdgeEventOrdering());

			/* Release temporary memory */
			leftAlloc.release(newEventsLeftStart);
			leftAlloc.release(leftEventsTempStart);
			rightAlloc.release(newEventsRightStart);
			rightAlloc.release(rightEventsTempStart);
		} else {
			for (EdgeEvent *event = eventStart; event < eventEnd; ++event) {
				uint8_t classification = storage.get(event->index);
				if (classification == ELeftSide) {
					/* Left-only primitive. Move to the left list and advance */
					*leftEventsEnd++ = *event;
				} else if (classification == ERightSide) {
					/* Right-only primitive. Move to the right list and advance */
					*rightEventsEnd++ = *event;
				} else if (classification == EBothSides) {
					/* The primitive overlaps the split plane. Its edge events
					   must be added to both lists. */
					*leftEventsEnd++ = *event;
					*rightEventsEnd++ = *event;
				}
			}
			KDAssert(leftEventsEnd - leftEventsStart <= bestSplit.numLeft * 6);
			KDAssert(rightEventsEnd - rightEventsStart <= bestSplit.numRight * 6);
		}

		/* Shrink the edge event storage now that we know exactly how
		   many are on each side */
		ctx.leftAlloc.shrinkAllocation(leftEventsStart,
				leftEventsEnd - leftEventsStart);

		ctx.rightAlloc.shrinkAllocation(rightEventsStart,
				rightEventsEnd - rightEventsStart);

		/* ==================================================================== */
		/*                              Recursion                               */
		/* ==================================================================== */

		KDNode *children = ctx.nodes.allocate(2);

		size_type nodePosBeforeSplit = ctx.nodes.size();
		size_type indexPosBeforeSplit = ctx.indices.size();
		size_type leafNodeCountBeforeSplit = ctx.leafNodeCount;
		size_type nonemptyLeafNodeCountBeforeSplit = ctx.nonemptyLeafNodeCount;
		size_type innerNodeCountBeforeSplit = ctx.innerNodeCount;

		if (!node->initInnerNode(bestSplit.axis, bestSplit.pos, children-node)) {
			m_indirectionLock->lock();
			size_t indirectionIdx = m_indirections.size();
			m_indirections.push_back(children);
			/* Unable to store relative offset -- create an indirection
			   table entry */
			node->initIndirectionNode(bestSplit.axis, bestSplit.pos, indirectionIdx);
			m_indirectionLock->unlock();
		}
		ctx.innerNodeCount++;

		Float leftSAHCost = buildTreeSAH(ctx, depth+1, children,
				leftNodeAABB, leftEventsStart, leftEventsEnd,
				bestSplit.numLeft - prunedLeft, true, badRefines);

		Float rightSAHCost = buildTreeSAH(ctx, depth+1, children+1,
				rightNodeAABB, rightEventsStart, rightEventsEnd,
				bestSplit.numRight - prunedRight, false, badRefines);

		Float saLeft = leftNodeAABB.getSurfaceArea();
		Float saRight = rightNodeAABB.getSurfaceArea();

		/* Compute the final SAH cost given the updated cost
		   values received from the children */
		Float finalSAHCost = m_traversalCost +
			(saLeft * leftSAHCost + saRight * rightSAHCost) * invSA;

		/* Release the index lists not needed by the children anymore */
		if (isLeftChild)
			ctx.rightAlloc.release(rightEventsStart);
		else
			ctx.leftAlloc.release(leftEventsStart);

		/* ==================================================================== */
	    /*                           Final decision                             */
	    /* ==================================================================== */

		if (!m_retract || finalSAHCost < primCount * m_intersectionCost) {
			return finalSAHCost;
		} else {
			/* In the end, splitting didn't help to reduce the SAH cost.
			   Tear up everything below this node and create a leaf */
			ctx.nodes.resize(nodePosBeforeSplit);
			ctx.retractedSplits++;
			ctx.leafNodeCount = leafNodeCountBeforeSplit;
			ctx.nonemptyLeafNodeCount = nonemptyLeafNodeCountBeforeSplit;
			ctx.innerNodeCount = innerNodeCountBeforeSplit;
			createLeafAfterRetraction(ctx, node, indexPosBeforeSplit, primCount);
			return leafCost;
		}

		return bestSplit.sahCost;
	}

	/**
	 * \brief Min-max binning as described in
	 * "Highly Parallel Fast KD-tree Construction for Interactive
	 *  Ray Tracing of Dynamic Scenes"
	 * by M. Shevtsov, A. Soupikov and A. Kapustin
	 *
	 * \tparam BinCount Number of bins to be allocated
	 */
	template <int BinCount> struct MinMaxBins {
		MinMaxBins(const AABB &aabb) : m_aabb(aabb) {
			m_binSize = m_aabb.getExtents() / BinCount;
		}

		/**
		 * \brief Run min-max binning
		 *
		 * \param derived Derived class to be used to determine the AABB for
		 *     a given list of primitives
		 * \param indices Primitive indirection list
		 * \param primCount Specifies the length of \a indices
		 */
		void bin(const Derived *derived, index_type *indices, size_type primCount) {
			m_primCount = primCount;
			memset(m_minBins, 0, sizeof(size_type) * 3 * BinCount);
			memset(m_maxBins, 0, sizeof(size_type) * 3 * BinCount);
			Vector invBinSize;

			for (int axis=0; axis<3; ++axis)
				invBinSize[axis] = 1/m_binSize[axis];

			for (size_type i=0; i<m_primCount; ++i) {
				const AABB aabb = derived->getAABB(indices[i]);
				for (int axis=0; axis<3; ++axis) {
					int minIdx = (int) ((aabb.min[axis] - m_aabb.min[axis])
							* invBinSize[axis]);
					int maxIdx = (int) ((aabb.max[axis] - m_aabb.min[axis])
							* invBinSize[axis]);
					m_maxBins[axis * BinCount + std::max(0, std::min(maxIdx, BinCount-1))]++;
					m_minBins[axis * BinCount + std::max(0, std::min(minIdx, BinCount-1))]++;
				}
			}
		}

		/**
		 * \brief Evaluate the surface area heuristic at each bin boundary
		 * and return the maximizer for the given cost constants. Min-max
		 * binning uses no "empty space bonus" since it cannot create such
		 * splits.
		 */
		SplitCandidate maximizeSAH(Float traversalCost, Float intersectionCost) {
			SplitCandidate candidate;
			Float normalization = 2.0f / m_aabb.getSurfaceArea();
			int binIdx = 0, leftBin = 0;

			for (int axis=0; axis<3; ++axis) {
				Vector extents = m_aabb.getExtents();
				size_type numLeft = 0, numRight = m_primCount;
				Float leftWidth = 0, rightWidth = extents[axis];
				const Float binSize = m_binSize[axis];

				for (int i=0; i<BinCount-1; ++i) {
					numLeft  += m_minBins[binIdx];
					numRight -= m_maxBins[binIdx];
					leftWidth += binSize;
					rightWidth -= binSize;

					extents[axis] = leftWidth;
					Float pLeft = normalization * (extents.x*extents.y
							+ extents.x*extents.z + extents.y*extents.z);

					extents[axis] = rightWidth;
					Float pRight = normalization * (extents.x*extents.y
							+ extents.x*extents.z + extents.y*extents.z);

					Float sahCost = traversalCost + intersectionCost
						* (pLeft * numLeft + pRight * numRight);

					if (sahCost < candidate.sahCost) {
						candidate.sahCost = sahCost;
						candidate.axis = axis;
						candidate.numLeft = numLeft;
						candidate.numRight = numRight;
						leftBin = i;
					}

					binIdx++;
				}
				binIdx++;
			}

			KDAssert(candidate.sahCost != std::numeric_limits<Float>::infinity());

			const int axis = candidate.axis;
			const float min = m_aabb.min[axis];

			/* This part is ensures that the returned split plane is consistent
			 * with the floating point calculations done by the binning code
			 * in \ref bin(). Since reciprocals and various floating point
			 * roundoff errors are involved, simply setting
			 *
			 * candidate.pos = m_aabb.min[axis] + (leftBin+1) * m_binSize[axis];
			 *
			 * will potentially lead to a different number primitives being
			 * classified to the left and right compared to the numbers stored
			 * in candidate.numLeft and candidate.numRight. We can't have that,
			 * however, since the partitioning code assumes that these
			 * numbers are correct. This removes the need for an extra sweep
			 * through the whole primitive list.
			 */
			float invBinSize = 1/m_binSize[axis],
				  split = min + (leftBin + 1) * m_binSize[axis];
			float splitNext = nextafterf(split,
				  std::numeric_limits<float>::max());
			int idx     = (int) ((split - min) * invBinSize);
			int idxNext = (int) ((splitNext - min) * invBinSize);

			/**
			 * The split plane should be along the last discrete floating
			 * floating position, which would still be classified into
			 * the left bin.
			 */
			if (!(idx <= leftBin && idxNext > leftBin)) {
				float direction;

				/* First, determine the search direction */
				if (idx > leftBin)
					direction = -std::numeric_limits<float>::max();
				else
					direction = std::numeric_limits<float>::max();

				int it = 0;
				while (true) {
					split     = nextafterf(split, direction);
					splitNext = nextafterf(split,
							std::numeric_limits<float>::max());
					idx     = (int) ((split - min) * invBinSize);
					idxNext = (int) ((splitNext - min) * invBinSize);
					if (idx == leftBin && idxNext > leftBin)
						break;
					if (idx < leftBin && idxNext > leftBin) {
						/* Insufficient floating point resolution -- a leaf will be created. */
						candidate.sahCost = std::numeric_limits<Float>::infinity();
						break;
					}

					++it;
				}
			}

			if (split <= m_aabb.min[axis] || split > m_aabb.max[axis]) {
				/* Insufficient floating point resolution -- a leaf will be created. */
				candidate.sahCost = std::numeric_limits<Float>::infinity();
			}

			candidate.pos = split;

			return candidate;
		}

		/**
		 * \brief Given a suitable split candiate, compute tight bounding
		 * boxes for the left and right subtrees and return associated
		 * primitive lists.
		 */
		boost::tuple<AABB, index_type *, AABB, index_type *> partition(
				BuildContext &ctx, const Derived *derived, index_type *primIndices,
				SplitCandidate &split, bool isLeftChild, Float traversalCost, Float intersectionCost) {
			const float splitPos = split.pos;
			const int axis = split.axis;
			size_type numLeft = 0, numRight = 0;
			AABB leftBounds, rightBounds;

			index_type *leftIndices, *rightIndices;
			if (isLeftChild) {
				OrderedChunkAllocator &rightAlloc = ctx.rightAlloc;
				leftIndices = primIndices;
				rightIndices = rightAlloc.allocate<index_type>(split.numRight);
			} else {
				OrderedChunkAllocator &leftAlloc = ctx.leftAlloc;
				leftIndices = leftAlloc.allocate<index_type>(split.numLeft);
				rightIndices = primIndices;
			}

			for (size_type i=0; i<m_primCount; ++i) {
				const index_type primIndex = primIndices[i];
				const AABB aabb = derived->getAABB(primIndex);

				if (aabb.max[axis] <= splitPos) {
					KDAssert(numLeft < split.numLeft);
					leftBounds.expandBy(aabb);
					leftIndices[numLeft++] = primIndex;
				} else if (aabb.min[axis] > splitPos) {
					KDAssert(numRight < split.numRight);
					rightBounds.expandBy(aabb);
					rightIndices[numRight++] = primIndex;
				} else {
					leftBounds.expandBy(aabb);
					rightBounds.expandBy(aabb);
					KDAssert(numLeft < split.numLeft);
					KDAssert(numRight < split.numRight);
					leftIndices[numLeft++] = primIndex;
					rightIndices[numRight++] = primIndex;
				}
			}

			leftBounds.clip(m_aabb);
			rightBounds.clip(m_aabb);

			KDAssert(numLeft == split.numLeft);
			KDAssert(numRight == split.numRight);

			/// Release the unused memory regions
			if (isLeftChild)
				ctx.leftAlloc.shrinkAllocation(leftIndices, split.numLeft);
			else
				ctx.rightAlloc.shrinkAllocation(rightIndices, split.numRight);

			leftBounds.max[axis] = std::min(leftBounds.max[axis], (Float) splitPos);
			rightBounds.min[axis] = std::max(rightBounds.min[axis], (Float) splitPos);

			if (leftBounds.max[axis] != rightBounds.min[axis]) {
				/* There is some space between the child nodes -- move
				   the split plane onto one of the AABBs so that the
				   surface area heuristic is minimized */
				Float normalization = 2.0f / m_aabb.getSurfaceArea();
				Vector extents = m_aabb.getExtents();

				extents[axis] = leftBounds.max[axis] - m_aabb.min[axis];
				Float pLeft1 = normalization * (extents.x*extents.y
							+ extents.x*extents.z + extents.y*extents.z);
				extents[axis] = m_aabb.max[axis] - leftBounds.max[axis];
				Float pRight1 = normalization * (extents.x*extents.y
							+ extents.x*extents.z + extents.y*extents.z);
				Float sahCost1 = traversalCost + intersectionCost
					* (pLeft1 * numLeft + pRight1 * numRight);

				extents[axis] = rightBounds.min[axis] - m_aabb.min[axis];
				Float pLeft2 = normalization * (extents.x*extents.y
							+ extents.x*extents.z + extents.y*extents.z);
				extents[axis] = m_aabb.max[axis] - rightBounds.min[axis];
				Float pRight2 = normalization * (extents.x*extents.y
							+ extents.x*extents.z + extents.y*extents.z);
				Float sahCost2 = traversalCost + intersectionCost
					* (pLeft2 * numLeft + pRight2 * numRight);

				if (sahCost1 <= sahCost2) {
					split.sahCost = sahCost1;
					split.pos = leftBounds.max[axis];
				} else {
					split.sahCost = sahCost2;
					split.pos = rightBounds.min[axis];
				}
			}

			return boost::make_tuple(leftBounds, leftIndices,
					rightBounds, rightIndices);
		}
	private:
		size_type m_minBins[3*BinCount], m_maxBins[3*BinCount];
		size_type m_primCount;
		AABB m_aabb;
		Vector m_binSize;
	};

private:
	KDNode *m_nodes;
	index_type *m_indices;
	Float m_traversalCost;
	Float m_intersectionCost;
	Float m_emptySpaceBonus;
	bool m_clip, m_retract, m_parallel;
	AABB m_aabb;
	size_type m_maxDepth;
	size_type m_stopPrims;
	size_type m_maxBadRefines;
	size_type m_exactPrimThreshold;
	std::vector<SAHTreeBuilder *> m_builders;
	std::vector<KDNode *> m_indirections;
	ref<Mutex> m_indirectionLock;
	BuildInterface m_interface;
};

MTS_NAMESPACE_END

#endif /* __KDTREE_GENERIC_H */